Leptonica  1.77.0
Image processing and image analysis suite
encoding.c
1 /*====================================================================*
2  - Copyright (C) 2001 Leptonica. All rights reserved.
3  - This software is distributed in the hope that it will be
4  - useful, but with NO WARRANTY OF ANY KIND.
5  - No author or distributor accepts responsibility to anyone for the
6  - consequences of using this software, or for whether it serves any
7  - particular purpose or works at all, unless he or she says so in
8  - writing. Everyone is granted permission to copy, modify and
9  - redistribute this source code, for commercial or non-commercial
10  - purposes, with the following restrictions: (1) the origin of this
11  - source code must not be misrepresented; (2) modified versions must
12  - be plainly marked as such; and (3) this notice may not be removed
13  - or altered from any source or modified source distribution.
14  *====================================================================*/
15 
16 /*
17  * encoding.c
18  *
19  * Base64
20  * char *encodeBase64()
21  * l_uint8 *decodeBase64()
22  * static l_int32 isBase64()
23  * static l_int32 *genReverseTab64()
24  * static void byteConvert3to4()
25  * static void byteConvert4to3()
26  *
27  * Ascii85
28  * char *encodeAscii85()
29  * l_uint8 *decodeAscii85()
30  * static l_int32 convertChunkToAscii85()
31  *
32  * String reformatting for base 64 encoded data
33  * char *reformatPacked64()
34  *
35  * Base64 encoding is useful for encoding binary data in a restricted set of
36  * 64 printable ascii symbols, that includes the 62 alphanumerics and '+'
37  * and '/'. Notably it does not include quotes, so that base64 encoded
38  * strings can be used in situations where quotes are used for formatting.
39  * 64 symbols was chosen because it is the smallest number that can be used
40  * in 4-for-3 byte encoding of binary data:
41  * log2(64) / log2(256) = 0.75 = 3/4
42  *
43  * Ascii85 encoding is used in PostScript and some pdf files for
44  * representing binary data (for example, a compressed image) in printable
45  * ascii symbols. It has a dictionary of 85 symbols; 85 was chosen because
46  * it is the smallest number that can be used in 5-for-4 byte encoding
47  * of binary data (256 possible input values). This can be seen from
48  * the max information content in such a sequence:
49  * log2(84) / log2(256) = 0.799 < 4/5
50  * log2(85) / log2(256) = 0.801 > 4/5
51  */
52 
53 #include <ctype.h>
54 #include "allheaders.h"
55 
56  /* Base64 encoding table in string representation */
57 static const l_int32 MAX_BASE64_LINE = 72; /* max line length base64 */
58 static const char *tablechar64 =
59  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
60  "abcdefghijklmnopqrstuvwxyz"
61  "0123456789+/";
62 
63 static l_int32 isBase64(char);
64 static l_int32 *genReverseTab64(void);
65 static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4);
66 static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3);
67 
68  /* Ascii85 encoding */
69 static const l_int32 MAX_ASCII85_LINE = 64; /* max line length ascii85 */
70 static const l_uint32 power85[5] = {1,
71  85,
72  85 * 85,
73  85 * 85 * 85,
74  85 * 85 * 85 * 85};
75 
76 static l_int32 convertChunkToAscii85(l_uint8 *inarray, l_int32 insize,
77  l_int32 *pindex, char *outbuf,
78  l_int32 *pnbout);
79 
80 
81 /*-------------------------------------------------------------*
82  * Utility for encoding and decoding data with base64 *
83  *-------------------------------------------------------------*/
99 char *
100 encodeBase64(l_uint8 *inarray,
101  l_int32 insize,
102  l_int32 *poutsize)
103 {
104 char *chara;
105 l_uint8 *bytea;
106 l_uint8 array3[3], array4[4];
107 l_int32 outsize, i, j, index, linecount;
108 
109  PROCNAME("encodeBase64");
110 
111  if (!poutsize)
112  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
113  *poutsize = 0;
114  if (!inarray)
115  return (char *)ERROR_PTR("inarray not defined", procName, NULL);
116  if (insize <= 0)
117  return (char *)ERROR_PTR("insize not > 0", procName, NULL);
118 
119  /* The output array is padded to a multiple of 4 bytes, not
120  * counting the newlines. We just need to allocate a large
121  * enough array, and add 4 bytes to make sure it is big enough. */
122  outsize = 4 * ((insize + 2) / 3); /* without newlines */
123  outsize += outsize / MAX_BASE64_LINE + 4; /* with the newlines */
124  if ((chara = (char *)LEPT_CALLOC(outsize, sizeof(char))) == NULL)
125  return (char *)ERROR_PTR("chara not made", procName, NULL);
126 
127  /* Read all the input data, and convert in sets of 3 input
128  * bytes --> 4 output bytes. */
129  i = index = linecount = 0;
130  bytea = inarray;
131  while (insize--) {
132  if (linecount == MAX_BASE64_LINE) {
133  chara[index++] = '\n';
134  linecount = 0;
135  }
136  array3[i++] = *bytea++;
137  if (i == 3) { /* convert 3 to 4 and save */
138  byteConvert3to4(array3, array4);
139  for (j = 0; j < 4; j++)
140  chara[index++] = tablechar64[array4[j]];
141  i = 0;
142  linecount += 4;
143  }
144  }
145 
146  /* Suppose 1 or 2 bytes has been read but not yet processed.
147  * If 1 byte has been read, this will generate 2 bytes of
148  * output, with 6 bits to the first byte and 2 bits to the second.
149  * We will add two bytes of '=' for padding.
150  * If 2 bytes has been read, this will generate 3 bytes of output,
151  * with 6 bits to the first 2 bytes and 4 bits to the third, and
152  * we add a fourth padding byte ('='). */
153  if (i > 0) { /* left-over 1 or 2 input bytes */
154  for (j = i; j < 3; j++)
155  array3[j] = '\0'; /* zero the remaining input bytes */
156  byteConvert3to4(array3, array4);
157  for (j = 0; j <= i; j++)
158  chara[index++] = tablechar64[array4[j]];
159  for (j = i + 1; j < 4; j++)
160  chara[index++] = '=';
161  }
162  *poutsize = index;
163 
164  return chara;
165 }
166 
167 
187 l_uint8 *
188 decodeBase64(const char *inarray,
189  l_int32 insize,
190  l_int32 *poutsize)
191 {
192 char inchar;
193 l_uint8 *bytea;
194 l_uint8 array3[3], array4[4];
195 l_int32 *rtable64;
196 l_int32 i, j, outsize, in_index, out_index;
197 
198  PROCNAME("decodeBase64");
199 
200  if (!poutsize)
201  return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
202  *poutsize = 0;
203  if (!inarray)
204  return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
205  if (insize <= 0)
206  return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
207 
208  /* Validate the input data */
209  for (i = 0; i < insize; i++) {
210  inchar = inarray[i];
211  if (inchar == '\n') continue;
212  if (isBase64(inchar) == 0 && inchar != '=')
213  return (l_uint8 *)ERROR_PTR("invalid char in inarray",
214  procName, NULL);
215  }
216 
217  /* The input array typically is made with a newline every
218  * MAX_BASE64_LINE input bytes. However, as a printed string, the
219  * newlines would be stripped. So when we allocate the output
220  * array, assume the input array is all data, but strip
221  * out the newlines during decoding. This guarantees that
222  * the allocated array is large enough. */
223  outsize = 3 * ((insize + 3) / 4) + 4;
224  if ((bytea = (l_uint8 *)LEPT_CALLOC(outsize, sizeof(l_uint8))) == NULL)
225  return (l_uint8 *)ERROR_PTR("bytea not made", procName, NULL);
226 
227  /* The number of encoded input data bytes is always a multiple of 4.
228  * Read all the data, until you reach either the end or
229  * the first pad character '='. The data is processed in
230  * units of 4 input bytes, generating 3 output decoded bytes
231  * of binary data. Newlines are ignored. If there are no
232  * pad bytes, i == 0 at the end of this section. */
233  rtable64 = genReverseTab64();
234  i = in_index = out_index = 0;
235  for (in_index = 0; in_index < insize; in_index++) {
236  inchar = inarray[in_index];
237  if (inchar == '\n') continue;
238  if (inchar == '=') break;
239  array4[i++] = rtable64[(unsigned char)inchar];
240  if (i < 4) {
241  continue;
242  } else { /* i == 4; convert 4 to 3 and save */
243  byteConvert4to3(array4, array3);
244  for (j = 0; j < 3; j++)
245  bytea[out_index++] = array3[j];
246  i = 0;
247  }
248  }
249 
250  /* If i > 0, we ran into pad bytes ('='). If i == 2, there are
251  * two input pad bytes and one output data byte. If i == 3,
252  * there is one input pad byte and two output data bytes. */
253  if (i > 0) {
254  for (j = i; j < 4; j++)
255  array4[j] = '\0'; /* zero the remaining input bytes */
256  byteConvert4to3(array4, array3);
257  for (j = 0; j < i - 1; j++)
258  bytea[out_index++] = array3[j];
259  }
260  *poutsize = out_index;
261 
262  LEPT_FREE(rtable64);
263  return bytea;
264 }
265 
266 
270 static l_int32
271 isBase64(char c)
272 {
273  return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
274 }
275 
279 static l_int32 *
280 genReverseTab64()
281 {
282 l_int32 i;
283 l_int32 *rtable64;
284 
285  rtable64 = (l_int32 *)LEPT_CALLOC(128, sizeof(l_int32));
286  for (i = 0; i < 64; i++) {
287  rtable64[(unsigned char)tablechar64[i]] = i;
288  }
289  return rtable64;
290 }
291 
295 static void
296 byteConvert3to4(l_uint8 *in3,
297  l_uint8 *out4)
298 {
299  out4[0] = in3[0] >> 2;
300  out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
301  out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
302  out4[3] = in3[2] & 0x3f;
303  return;
304 }
305 
309 static void
310 byteConvert4to3(l_uint8 *in4,
311  l_uint8 *out3)
312 {
313  out3[0] = (in4[0] << 2) | (in4[1] >> 4);
314  out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
315  out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
316  return;
317 }
318 
319 
320 /*-------------------------------------------------------------*
321  * Utility for encoding and decoding data with ascii85 *
322  *-------------------------------------------------------------*/
338 char *
339 encodeAscii85(l_uint8 *inarray,
340  l_int32 insize,
341  l_int32 *poutsize)
342 {
343 char *chara;
344 char outbuf[8];
345 l_int32 maxsize, i, index, outindex, linecount, nbout, eof;
346 
347  PROCNAME("encodeAscii85");
348 
349  if (!poutsize)
350  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
351  *poutsize = 0;
352  if (!inarray)
353  return (char *)ERROR_PTR("inarray not defined", procName, NULL);
354  if (insize <= 0)
355  return (char *)ERROR_PTR("insize not > 0", procName, NULL);
356 
357  /* Accumulate results in char array */
358  maxsize = (l_int32)(80. + (insize * 5. / 4.) *
359  (1. + 2. / MAX_ASCII85_LINE));
360  if ((chara = (char *)LEPT_CALLOC(maxsize, sizeof(char))) == NULL)
361  return (char *)ERROR_PTR("chara not made", procName, NULL);
362 
363  linecount = 0;
364  index = 0;
365  outindex = 0;
366  while (1) {
367  eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
368  for (i = 0; i < nbout; i++) {
369  chara[outindex++] = outbuf[i];
370  linecount++;
371  if (linecount >= MAX_ASCII85_LINE) {
372  chara[outindex++] = '\n';
373  linecount = 0;
374  }
375  }
376  if (eof == TRUE) {
377  if (linecount != 0)
378  chara[outindex++] = '\n';
379  chara[outindex++] = '~';
380  chara[outindex++] = '>';
381  chara[outindex++] = '\n';
382  break;
383  }
384  }
385 
386  *poutsize = outindex;
387  return chara;
388 }
389 
390 
407 static l_int32
408 convertChunkToAscii85(l_uint8 *inarray,
409  l_int32 insize,
410  l_int32 *pindex,
411  char *outbuf,
412  l_int32 *pnbout)
413 {
414 l_uint8 inbyte;
415 l_uint32 inword, val;
416 l_int32 eof, index, nread, nbout, i;
417 
418  eof = FALSE;
419  index = *pindex;
420  nread = L_MIN(4, (insize - index));
421  if (insize == index + nread)
422  eof = TRUE;
423  *pindex += nread; /* save new index */
424 
425  /* Read input data and save in l_uint32 */
426  inword = 0;
427  for (i = 0; i < nread; i++) {
428  inbyte = inarray[index + i];
429  inword += inbyte << (8 * (3 - i));
430  }
431 
432 #if 0
433  fprintf(stderr, "index = %d, nread = %d\n", index, nread);
434  fprintf(stderr, "inword = %x\n", inword);
435  fprintf(stderr, "eof = %d\n", eof);
436 #endif
437 
438  /* Special case: output 1 byte only */
439  if (inword == 0) {
440  outbuf[0] = 'z';
441  nbout = 1;
442  } else { /* output nread + 1 bytes */
443  for (i = 4; i >= 4 - nread; i--) {
444  val = inword / power85[i];
445  outbuf[4 - i] = (l_uint8)(val + '!');
446  inword -= val * power85[i];
447  }
448  nbout = nread + 1;
449  }
450  *pnbout = nbout;
451 
452  return eof;
453 }
454 
455 
472 l_uint8 *
473 decodeAscii85(char *inarray,
474  l_int32 insize,
475  l_int32 *poutsize)
476 {
477 char inc;
478 char *pin;
479 l_uint8 val;
480 l_uint8 *outa;
481 l_int32 maxsize, ocount, bytecount, index;
482 l_uint32 oword;
483 
484  PROCNAME("decodeAscii85");
485 
486  if (!poutsize)
487  return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
488  *poutsize = 0;
489  if (!inarray)
490  return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
491  if (insize <= 0)
492  return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
493 
494  /* Accumulate results in outa */
495  maxsize = (l_int32)(80. + (insize * 4. / 5.)); /* plenty big */
496  if ((outa = (l_uint8 *)LEPT_CALLOC(maxsize, sizeof(l_uint8))) == NULL)
497  return (l_uint8 *)ERROR_PTR("outa not made", procName, NULL);
498 
499  pin = inarray;
500  ocount = 0; /* byte index into outa */
501  oword = 0;
502  for (index = 0, bytecount = 0; index < insize; index++, pin++) {
503  inc = *pin;
504 
505  if (inc == ' ' || inc == '\t' || inc == '\n' ||
506  inc == '\f' || inc == '\r' || inc == '\v') /* ignore white space */
507  continue;
508 
509  val = inc - '!';
510  if (val < 85) {
511  oword = oword * 85 + val;
512  if (bytecount < 4) {
513  bytecount++;
514  } else { /* we have all 5 input chars for the oword */
515  outa[ocount] = (oword >> 24) & 0xff;
516  outa[ocount + 1] = (oword >> 16) & 0xff;
517  outa[ocount + 2] = (oword >> 8) & 0xff;
518  outa[ocount + 3] = oword & 0xff;
519  ocount += 4;
520  bytecount = 0;
521  oword = 0;
522  }
523  } else if (inc == 'z' && bytecount == 0) {
524  outa[ocount] = 0;
525  outa[ocount + 1] = 0;
526  outa[ocount + 2] = 0;
527  outa[ocount + 3] = 0;
528  ocount += 4;
529  } else if (inc == '~') { /* end of data */
530  L_INFO(" %d extra bytes output\n", procName, bytecount - 1);
531  switch (bytecount) {
532  case 0: /* normal eof */
533  case 1: /* error */
534  break;
535  case 2: /* 1 extra byte */
536  oword = oword * power85[3] + 0xffffff;
537  outa[ocount] = (oword >> 24) & 0xff;
538  break;
539  case 3: /* 2 extra bytes */
540  oword = oword * power85[2] + 0xffff;
541  outa[ocount] = (oword >> 24) & 0xff;
542  outa[ocount + 1] = (oword >> 16) & 0xff;
543  break;
544  case 4: /* 3 extra bytes */
545  oword = oword * 85 + 0xff;
546  outa[ocount] = (oword >> 24) & 0xff;
547  outa[ocount + 1] = (oword >> 16) & 0xff;
548  outa[ocount + 2] = (oword >> 8) & 0xff;
549  break;
550  }
551  if (bytecount > 1)
552  ocount += (bytecount - 1);
553  break;
554  }
555  }
556  *poutsize = ocount;
557 
558  return outa;
559 }
560 
561 
562 /*-------------------------------------------------------------*
563  * String reformatting for base 64 encoded data *
564  *-------------------------------------------------------------*/
586 char *
587 reformatPacked64(char *inarray,
588  l_int32 insize,
589  l_int32 leadspace,
590  l_int32 linechars,
591  l_int32 addquotes,
592  l_int32 *poutsize)
593 {
594 char *flata, *outa;
595 l_int32 i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;
596 
597  PROCNAME("reformatPacked64");
598 
599  if (!poutsize)
600  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
601  *poutsize = 0;
602  if (!inarray)
603  return (char *)ERROR_PTR("inarray not defined", procName, NULL);
604  if (insize <= 0)
605  return (char *)ERROR_PTR("insize not > 0", procName, NULL);
606  if (leadspace < 0)
607  return (char *)ERROR_PTR("leadspace must be >= 0", procName, NULL);
608  if (linechars % 4)
609  return (char *)ERROR_PTR("linechars % 4 must be 0", procName, NULL);
610 
611  /* Remove all white space */
612  if ((flata = (char *)LEPT_CALLOC(insize, sizeof(char))) == NULL)
613  return (char *)ERROR_PTR("flata not made", procName, NULL);
614  for (i = 0, flatindex = 0; i < insize; i++) {
615  if (isBase64(inarray[i]) || inarray[i] == '=')
616  flata[flatindex++] = inarray[i];
617  }
618 
619  /* Generate output string */
620  flatsize = flatindex;
621  nlines = (flatsize + linechars - 1) / linechars;
622  linewithpad = leadspace + linechars + 1; /* including newline */
623  if (addquotes) linewithpad += 2;
624  if ((outa = (char *)LEPT_CALLOC((size_t)nlines * linewithpad,
625  sizeof(char))) == NULL) {
626  LEPT_FREE(flata);
627  return (char *)ERROR_PTR("outa not made", procName, NULL);
628  }
629  for (j = 0, outindex = 0; j < leadspace; j++)
630  outa[outindex++] = ' ';
631  if (addquotes) outa[outindex++] = '"';
632  for (i = 0, linecount = 0; i < flatsize; i++) {
633  if (linecount == linechars) {
634  if (addquotes) outa[outindex++] = '"';
635  outa[outindex++] = '\n';
636  for (j = 0; j < leadspace; j++)
637  outa[outindex++] = ' ';
638  if (addquotes) outa[outindex++] = '"';
639  linecount = 0;
640  }
641  outa[outindex++] = flata[i];
642  linecount++;
643  }
644  if (addquotes) outa[outindex++] = '"';
645  *poutsize = outindex;
646 
647  LEPT_FREE(flata);
648  return outa;
649 }
l_int32 nlines
Definition: dewarp.h:169