libcsv.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545
  1. /*
  2. libcsv - parse and write csv data
  3. Copyright (C) 2008 Robert Gamble
  4. This library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. This library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with this library; if not, write to the Free Software
  14. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  15. */
  16. #include <assert.h>
  17. #if __STDC_VERSION__ >= 199901L
  18. # include <stdint.h>
  19. #else
  20. # define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
  21. #endif
  22. #include "csv.h"
  /* Library version string */
  #define VERSION "3.0.3"

  /*
   * Parser states
   *
   *   ROW_NOT_BEGUN           No fields have been encountered for this row yet
   *   FIELD_NOT_BEGUN         Fields have been seen, but we are not inside one
   *   FIELD_BEGUN             We are inside a field
   *   FIELD_MIGHT_HAVE_ENDED  A quote character was seen inside a quoted field;
   *                           the field has either ended or the quote is literal
   */
  enum {
    ROW_NOT_BEGUN          = 0,
    FIELD_NOT_BEGUN        = 1,
    FIELD_BEGUN            = 2,
    FIELD_MIGHT_HAVE_ENDED = 3
  };

  /* Default number of bytes by which the entry buffer is grown */
  enum { MEM_BLK_SIZE = 128 };
  /*
   * Helper macros shared by csv_parse() and csv_fini().
   *
   * Besides their explicit arguments they read and write these names, which
   * must be in scope at the point of use:
   *   quoted, spaces, entry_pos, pstate  - local copies of the parser state
   *   cb1, cb2, data                     - field callback, row callback and
   *                                        the user pointer handed to both
   *
   * Every use of a macro argument is parenthesized so that any pointer
   * expression (not just a plain identifier) can be passed safely.
   *
   * SUBMIT_FIELD(p)
   *   Finishes the current field: for an unquoted field the counted trailing
   *   spaces are dropped, a terminating '\0' is stored when CSV_APPEND_NULL is
   *   set, and cb1 (if non-NULL) is called with the buffer and its length.
   *   With CSV_EMPTY_IS_NULL an empty unquoted field is reported as NULL.
   *   Afterwards the state becomes FIELD_NOT_BEGUN and the counters are reset.
   *
   * SUBMIT_ROW(p, c)
   *   Calls cb2 (if non-NULL) with the terminating character c, then resets
   *   the state to ROW_NOT_BEGUN. The p argument is accepted but not used.
   *
   * SUBMIT_CHAR(p, c)
   *   Appends c to the entry buffer; the caller must have ensured that there
   *   is room for it.
   */
  #define SUBMIT_FIELD(p) \
    do { \
      if (!quoted) \
        entry_pos -= spaces; \
      if ((p)->options & CSV_APPEND_NULL) \
        ((p)->entry_buf[entry_pos]) = '\0'; \
      if (cb1 && ((p)->options & CSV_EMPTY_IS_NULL) && !quoted && entry_pos == 0) \
        cb1(NULL, entry_pos, data); \
      else if (cb1) \
        cb1((p)->entry_buf, entry_pos, data); \
      pstate = FIELD_NOT_BEGUN; \
      entry_pos = quoted = spaces = 0; \
    } while (0)

  #define SUBMIT_ROW(p, c) \
    do { \
      if (cb2) \
        cb2((c), data); \
      pstate = ROW_NOT_BEGUN; \
      entry_pos = quoted = spaces = 0; \
    } while (0)

  #define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
  /*
   * Human-readable descriptions of the status codes, indexed by code.
   * The final entry is the message used for any unrecognized code.
   */
  static const char *csv_errors[] = {
    "success",
    "error parsing data while strict checking enabled",
    "memory exhausted while increasing buffer size",
    "data size too large",
    "invalid status code"
  };
  64. int
  65. csv_error(const struct csv_parser *p)
  66. {
  67. assert(p && "received null csv_parser");
  68. /* Return the current status of the parser */
  69. return p->status;
  70. }
  71. const char *
  72. csv_strerror(int status)
  73. {
  74. /* Return a textual description of status */
  75. if (status >= CSV_EINVALID || status < 0)
  76. return csv_errors[CSV_EINVALID];
  77. else
  78. return csv_errors[status];
  79. }
  80. int
  81. csv_get_opts(const struct csv_parser *p)
  82. {
  83. /* Return the currently set options of parser */
  84. if (p == NULL)
  85. return -1;
  86. return p->options;
  87. }
  88. int
  89. csv_set_opts(struct csv_parser *p, unsigned char options)
  90. {
  91. /* Set the options */
  92. if (p == NULL)
  93. return -1;
  94. p->options = options;
  95. return 0;
  96. }
  97. int
  98. csv_init(struct csv_parser *p, unsigned char options)
  99. {
  100. /* Initialize a csv_parser object returns 0 on success, -1 on error */
  101. if (p == NULL)
  102. return -1;
  103. p->entry_buf = NULL;
  104. p->pstate = ROW_NOT_BEGUN;
  105. p->quoted = 0;
  106. p->spaces = 0;
  107. p->entry_pos = 0;
  108. p->entry_size = 0;
  109. p->status = 0;
  110. p->options = options;
  111. p->quote_char = CSV_QUOTE;
  112. p->delim_char = CSV_COMMA;
  113. p->is_space = NULL;
  114. p->is_term = NULL;
  115. p->blk_size = MEM_BLK_SIZE;
  116. p->malloc_func = NULL;
  117. p->realloc_func = realloc;
  118. p->free_func = free;
  119. return 0;
  120. }
  121. void
  122. csv_free(struct csv_parser *p)
  123. {
  124. /* Free the entry_buffer of csv_parser object */
  125. if (p == NULL)
  126. return;
  127. if (p->entry_buf && p->free_func)
  128. p->free_func(p->entry_buf);
  129. p->entry_buf = NULL;
  130. p->entry_size = 0;
  131. return;
  132. }
  133. int
  134. csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
  135. {
  136. if (p == NULL)
  137. return -1;
  138. /* Finalize parsing. Needed, for example, when file does not end in a newline */
  139. int quoted = p->quoted;
  140. int pstate = p->pstate;
  141. size_t spaces = p->spaces;
  142. size_t entry_pos = p->entry_pos;
  143. if ((pstate == FIELD_BEGUN) && p->quoted && (p->options & CSV_STRICT) && (p->options & CSV_STRICT_FINI)) {
  144. /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
  145. p->status = CSV_EPARSE;
  146. return -1;
  147. }
  148. switch (pstate) {
  149. case FIELD_MIGHT_HAVE_ENDED:
  150. p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
  151. entry_pos = p->entry_pos;
  152. /*lint -fallthrough */
  153. case FIELD_NOT_BEGUN:
  154. case FIELD_BEGUN:
  155. /* Unnecessary:
  156. quoted = p->quoted, pstate = p->pstate;
  157. spaces = p->spaces, entry_pos = p->entry_pos;
  158. */
  159. SUBMIT_FIELD(p);
  160. SUBMIT_ROW(p, -1);
  161. break;
  162. case ROW_NOT_BEGUN: /* Already ended properly */
  163. ;
  164. }
  165. /* Reset parser */
  166. p->spaces = p->quoted = p->entry_pos = p->status = 0;
  167. p->pstate = ROW_NOT_BEGUN;
  168. return 0;
  169. }
  170. void
  171. csv_set_delim(struct csv_parser *p, unsigned char c)
  172. {
  173. /* Set the delimiter */
  174. if (p) p->delim_char = c;
  175. }
  176. void
  177. csv_set_quote(struct csv_parser *p, unsigned char c)
  178. {
  179. /* Set the quote character */
  180. if (p) p->quote_char = c;
  181. }
  182. unsigned char
  183. csv_get_delim(const struct csv_parser *p)
  184. {
  185. assert(p && "received null csv_parser");
  186. /* Get the delimiter */
  187. return p->delim_char;
  188. }
  189. unsigned char
  190. csv_get_quote(const struct csv_parser *p)
  191. {
  192. assert(p && "received null csv_parser");
  193. /* Get the quote character */
  194. return p->quote_char;
  195. }
  196. void
  197. csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
  198. {
  199. /* Set the space function */
  200. if (p) p->is_space = f;
  201. }
  202. void
  203. csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
  204. {
  205. /* Set the term function */
  206. if (p) p->is_term = f;
  207. }
  208. void
  209. csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
  210. {
  211. /* Set the realloc function used to increase buffer size */
  212. if (p && f) p->realloc_func = f;
  213. }
  214. void
  215. csv_set_free_func(struct csv_parser *p, void (*f)(void *))
  216. {
  217. /* Set the free function used to free the buffer */
  218. if (p && f) p->free_func = f;
  219. }
  220. void
  221. csv_set_blk_size(struct csv_parser *p, size_t size)
  222. {
  223. /* Set the block size used to increment buffer size */
  224. if (p) p->blk_size = size;
  225. }
  226. size_t
  227. csv_get_buffer_size(const struct csv_parser *p)
  228. {
  229. /* Get the size of the entry buffer */
  230. if (p)
  231. return p->entry_size;
  232. return 0;
  233. }
  234. static int
  235. csv_increase_buffer(struct csv_parser *p)
  236. {
  237. if (p == NULL) return 0;
  238. if (p->realloc_func == NULL) return 0;
  239. /* Increase the size of the entry buffer. Attempt to increase size by
  240. * p->blk_size, if this is larger than SIZE_MAX try to increase current
  241. * buffer size to SIZE_MAX. If allocation fails, try to allocate halve
  242. * the size and try again until successful or increment size is zero.
  243. */
  244. size_t to_add = p->blk_size;
  245. void *vp;
  246. if ( p->entry_size >= SIZE_MAX - to_add )
  247. to_add = SIZE_MAX - p->entry_size;
  248. if (!to_add) {
  249. p->status = CSV_ETOOBIG;
  250. return -1;
  251. }
  252. while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
  253. to_add /= 2;
  254. if (!to_add) {
  255. p->status = CSV_ENOMEM;
  256. return -1;
  257. }
  258. }
  259. /* Update entry buffer pointer and entry_size if successful */
  260. p->entry_buf = vp;
  261. p->entry_size += to_add;
  262. return 0;
  263. }
  264. size_t
  265. csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
  266. {
  267. assert(p && "received null csv_parser");
  268. if (s == NULL) return 0;
  269. unsigned const char *us = s; /* Access input data as array of unsigned char */
  270. unsigned char c; /* The character we are currently processing */
  271. size_t pos = 0; /* The number of characters we have processed in this call */
  272. /* Store key fields into local variables for performance */
  273. unsigned char delim = p->delim_char;
  274. unsigned char quote = p->quote_char;
  275. int (*is_space)(unsigned char) = p->is_space;
  276. int (*is_term)(unsigned char) = p->is_term;
  277. int quoted = p->quoted;
  278. int pstate = p->pstate;
  279. size_t spaces = p->spaces;
  280. size_t entry_pos = p->entry_pos;
  281. if (!p->entry_buf && pos < len) {
  282. /* Buffer hasn't been allocated yet and len > 0 */
  283. if (csv_increase_buffer(p) != 0) {
  284. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  285. return pos;
  286. }
  287. }
  288. while (pos < len) {
  289. /* Check memory usage, increase buffer if necessary */
  290. if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
  291. if (csv_increase_buffer(p) != 0) {
  292. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  293. return pos;
  294. }
  295. }
  296. c = us[pos++];
  297. switch (pstate) {
  298. case ROW_NOT_BEGUN:
  299. case FIELD_NOT_BEGUN:
  300. if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
  301. continue;
  302. } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
  303. if (pstate == FIELD_NOT_BEGUN) {
  304. SUBMIT_FIELD(p);
  305. SUBMIT_ROW(p, c);
  306. } else { /* ROW_NOT_BEGUN */
  307. /* Don't submit empty rows by default */
  308. if (p->options & CSV_REPALL_NL) {
  309. SUBMIT_ROW(p, c);
  310. }
  311. }
  312. continue;
  313. } else if (c == delim) { /* Comma */
  314. SUBMIT_FIELD(p);
  315. break;
  316. } else if (c == quote) { /* Quote */
  317. pstate = FIELD_BEGUN;
  318. quoted = 1;
  319. } else { /* Anything else */
  320. pstate = FIELD_BEGUN;
  321. quoted = 0;
  322. SUBMIT_CHAR(p, c);
  323. }
  324. break;
  325. case FIELD_BEGUN:
  326. if (c == quote) { /* Quote */
  327. if (quoted) {
  328. SUBMIT_CHAR(p, c);
  329. pstate = FIELD_MIGHT_HAVE_ENDED;
  330. } else {
  331. /* STRICT ERROR - double quote inside non-quoted field */
  332. if (p->options & CSV_STRICT) {
  333. p->status = CSV_EPARSE;
  334. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  335. return pos-1;
  336. }
  337. SUBMIT_CHAR(p, c);
  338. spaces = 0;
  339. }
  340. } else if (c == delim) { /* Comma */
  341. if (quoted) {
  342. SUBMIT_CHAR(p, c);
  343. } else {
  344. SUBMIT_FIELD(p);
  345. }
  346. } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
  347. if (!quoted) {
  348. SUBMIT_FIELD(p);
  349. SUBMIT_ROW(p, c);
  350. } else {
  351. SUBMIT_CHAR(p, c);
  352. }
  353. } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
  354. SUBMIT_CHAR(p, c);
  355. spaces++;
  356. } else { /* Anything else */
  357. SUBMIT_CHAR(p, c);
  358. spaces = 0;
  359. }
  360. break;
  361. case FIELD_MIGHT_HAVE_ENDED:
  362. /* This only happens when a quote character is encountered in a quoted field */
  363. if (c == delim) { /* Comma */
  364. entry_pos -= spaces + 1; /* get rid of spaces and original quote */
  365. SUBMIT_FIELD(p);
  366. } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
  367. entry_pos -= spaces + 1; /* get rid of spaces and original quote */
  368. SUBMIT_FIELD(p);
  369. SUBMIT_ROW(p, c);
  370. } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
  371. SUBMIT_CHAR(p, c);
  372. spaces++;
  373. } else if (c == quote) { /* Quote */
  374. if (spaces) {
  375. /* STRICT ERROR - unescaped double quote */
  376. if (p->options & CSV_STRICT) {
  377. p->status = CSV_EPARSE;
  378. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  379. return pos-1;
  380. }
  381. spaces = 0;
  382. SUBMIT_CHAR(p, c);
  383. } else {
  384. /* Two quotes in a row */
  385. pstate = FIELD_BEGUN;
  386. }
  387. } else { /* Anything else */
  388. /* STRICT ERROR - unescaped double quote */
  389. if (p->options & CSV_STRICT) {
  390. p->status = CSV_EPARSE;
  391. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  392. return pos-1;
  393. }
  394. pstate = FIELD_BEGUN;
  395. spaces = 0;
  396. SUBMIT_CHAR(p, c);
  397. }
  398. break;
  399. default:
  400. break;
  401. }
  402. }
  403. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  404. return pos;
  405. }
  406. size_t
  407. csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
  408. {
  409. return csv_write2(dest, dest_size, src, src_size, CSV_QUOTE);
  410. }
  411. int
  412. csv_fwrite (FILE *fp, const void *src, size_t src_size)
  413. {
  414. return csv_fwrite2(fp, src, src_size, CSV_QUOTE);
  415. }
  /*
   * Write src_size bytes from src to dest as one quoted CSV field: the data
   * is wrapped in `quote` characters and every `quote` byte inside it is
   * doubled. No terminating NUL is written.
   *
   * At most dest_size bytes are stored; a NULL dest is treated as a buffer
   * of size 0. The return value is the number of bytes the complete field
   * requires (capped at SIZE_MAX), which may be larger than dest_size.
   * Returns 0 when src is NULL.
   */
  size_t
  csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
  {
    const unsigned char *in = src;
    unsigned char *out = dest;
    size_t needed = 0;
    size_t i;

    if (src == NULL)
      return 0;

    if (dest == NULL)
      dest_size = 0;

    /* Opening quote */
    if (dest_size > 0)
      *out++ = quote;
    needed++;

    for (i = 0; i < src_size; i++) {
      if (in[i] == quote) {
        /* An embedded quote is escaped by writing it twice */
        if (needed < dest_size)
          *out++ = quote;
        if (needed < SIZE_MAX)
          needed++;
      }

      if (needed < dest_size)
        *out++ = in[i];
      if (needed < SIZE_MAX)
        needed++;
    }

    /* Closing quote */
    if (needed < dest_size)
      *out = quote;
    if (needed < SIZE_MAX)
      needed++;

    return needed;
  }
  /*
   * Write src_size bytes from src to the stream fp as one quoted CSV field:
   * the data is wrapped in `quote` characters and every `quote` byte inside
   * it is doubled.
   *
   * Returns 0 on success, and also when fp or src is NULL (nothing is
   * written in that case). Returns EOF as soon as a write fails.
   */
  int
  csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
  {
    const unsigned char *cur = src;
    const unsigned char *end;

    if (fp == NULL || src == NULL)
      return 0;

    /* Opening quote */
    if (fputc(quote, fp) == EOF)
      return EOF;

    for (end = cur + src_size; cur != end; cur++) {
      /* An embedded quote is escaped by writing it twice */
      if (*cur == quote && fputc(quote, fp) == EOF)
        return EOF;

      if (fputc(*cur, fp) == EOF)
        return EOF;
    }

    /* Closing quote */
    if (fputc(quote, fp) == EOF)
      return EOF;

    return 0;
  }
  468. }