Leptonica  1.77.0
Image processing and image analysis suite
sarray2.c
Go to the documentation of this file.
1 /*====================================================================*
2  - Copyright (C) 2001 Leptonica. All rights reserved.
3  -
4  - Redistribution and use in source and binary forms, with or without
5  - modification, are permitted provided that the following conditions
6  - are met:
7  - 1. Redistributions of source code must retain the above copyright
8  - notice, this list of conditions and the following disclaimer.
9  - 2. Redistributions in binary form must reproduce the above
10  - copyright notice, this list of conditions and the following
11  - disclaimer in the documentation and/or other materials
12  - provided with the distribution.
13  -
14  - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18  - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
73 #include <string.h>
74 #include "allheaders.h"
75 
76 /*----------------------------------------------------------------------*
77  * Sort *
78  *----------------------------------------------------------------------*/
94 SARRAY *
96  SARRAY *sain,
97  l_int32 sortorder)
98 {
99 char **array;
100 char *tmp;
101 l_int32 n, i, j, gap;
102 
103  PROCNAME("sarraySort");
104 
105  if (!sain)
106  return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
107 
108  /* Make saout if necessary; otherwise do in-place */
109  if (!saout)
110  saout = sarrayCopy(sain);
111  else if (sain != saout)
112  return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
113  array = saout->array; /* operate directly on the array */
114  n = sarrayGetCount(saout);
115 
116  /* Shell sort */
117  for (gap = n/2; gap > 0; gap = gap / 2) {
118  for (i = gap; i < n; i++) {
119  for (j = i - gap; j >= 0; j -= gap) {
120  if ((sortorder == L_SORT_INCREASING &&
121  stringCompareLexical(array[j], array[j + gap])) ||
122  (sortorder == L_SORT_DECREASING &&
123  stringCompareLexical(array[j + gap], array[j])))
124  {
125  tmp = array[j];
126  array[j] = array[j + gap];
127  array[j + gap] = tmp;
128  }
129  }
130  }
131  }
132 
133  return saout;
134 }
135 
136 
144 SARRAY *
146  NUMA *naindex)
147 {
148 char *str;
149 l_int32 i, n, index;
150 SARRAY *saout;
151 
152  PROCNAME("sarraySortByIndex");
153 
154  if (!sain)
155  return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
156  if (!naindex)
157  return (SARRAY *)ERROR_PTR("naindex not defined", procName, NULL);
158 
159  n = sarrayGetCount(sain);
160  saout = sarrayCreate(n);
161  for (i = 0; i < n; i++) {
162  numaGetIValue(naindex, i, &index);
163  str = sarrayGetString(sain, index, L_COPY);
164  sarrayAddString(saout, str, L_INSERT);
165  }
166 
167  return saout;
168 }
169 
170 
184 l_int32
185 stringCompareLexical(const char *str1,
186  const char *str2)
187 {
188 l_int32 i, len1, len2, len;
189 
190  PROCNAME("sarrayCompareLexical");
191 
192  if (!str1)
193  return ERROR_INT("str1 not defined", procName, 1);
194  if (!str2)
195  return ERROR_INT("str2 not defined", procName, 1);
196 
197  len1 = strlen(str1);
198  len2 = strlen(str2);
199  len = L_MIN(len1, len2);
200 
201  for (i = 0; i < len; i++) {
202  if (str1[i] == str2[i])
203  continue;
204  if (str1[i] > str2[i])
205  return 1;
206  else
207  return 0;
208  }
209 
210  if (len1 > len2)
211  return 1;
212  else
213  return 0;
214 }
215 
216 
217 /*----------------------------------------------------------------------*
218  * Set operations using aset (rbtree) *
219  *----------------------------------------------------------------------*/
236 SARRAY *
238  SARRAY *sa2)
239 {
240 SARRAY *sa3, *sad;
241 
242  PROCNAME("sarrayUnionByAset");
243 
244  if (!sa1)
245  return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
246  if (!sa2)
247  return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);
248 
249  /* Join */
250  sa3 = sarrayCopy(sa1);
251  sarrayJoin(sa3, sa2);
252 
253  /* Eliminate duplicates */
254  sad = sarrayRemoveDupsByAset(sa3);
255  sarrayDestroy(&sa3);
256  return sad;
257 }
258 
259 
276 SARRAY *
278 {
279 char *str;
280 l_int32 i, n;
281 l_uint64 hash;
282 L_ASET *set;
283 RB_TYPE key;
284 SARRAY *sad;
285 
286  PROCNAME("sarrayRemoveDupsByAset");
287 
288  if (!sas)
289  return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL);
290 
291  set = l_asetCreate(L_UINT_TYPE);
292  sad = sarrayCreate(0);
293  n = sarrayGetCount(sas);
294  for (i = 0; i < n; i++) {
295  str = sarrayGetString(sas, i, L_NOCOPY);
296  l_hashStringToUint64(str, &hash);
297  key.utype = hash;
298  if (!l_asetFind(set, key)) {
299  sarrayAddString(sad, str, L_COPY);
300  l_asetInsert(set, key);
301  }
302  }
303 
304  l_asetDestroy(&set);
305  return sad;
306 }
307 
308 
327 SARRAY *
329  SARRAY *sa2)
330 {
331 char *str;
332 l_int32 n1, n2, i, n;
333 l_uint64 hash;
334 L_ASET *set1, *set2;
335 RB_TYPE key;
336 SARRAY *sa_small, *sa_big, *sad;
337 
338  PROCNAME("sarrayIntersectionByAset");
339 
340  if (!sa1)
341  return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
342  if (!sa2)
343  return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);
344 
345  /* Put the elements of the biggest array into a set */
346  n1 = sarrayGetCount(sa1);
347  n2 = sarrayGetCount(sa2);
348  sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
349  sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
350  set1 = l_asetCreateFromSarray(sa_big);
351 
352  /* Build up the intersection of strings */
353  sad = sarrayCreate(0);
354  n = sarrayGetCount(sa_small);
355  set2 = l_asetCreate(L_UINT_TYPE);
356  for (i = 0; i < n; i++) {
357  str = sarrayGetString(sa_small, i, L_NOCOPY);
358  l_hashStringToUint64(str, &hash);
359  key.utype = hash;
360  if (l_asetFind(set1, key) && !l_asetFind(set2, key)) {
361  sarrayAddString(sad, str, L_COPY);
362  l_asetInsert(set2, key);
363  }
364  }
365 
366  l_asetDestroy(&set1);
367  l_asetDestroy(&set2);
368  return sad;
369 }
370 
371 
378 L_ASET *
380 {
381 char *str;
382 l_int32 i, n;
383 l_uint64 hash;
384 L_ASET *set;
385 RB_TYPE key;
386 
387  PROCNAME("l_asetCreateFromSarray");
388 
389  if (!sa)
390  return (L_ASET *)ERROR_PTR("sa not defined", procName, NULL);
391 
392  set = l_asetCreate(L_UINT_TYPE);
393  n = sarrayGetCount(sa);
394  for (i = 0; i < n; i++) {
395  str = sarrayGetString(sa, i, L_NOCOPY);
396  l_hashStringToUint64(str, &hash);
397  key.utype = hash;
398  l_asetInsert(set, key);
399  }
400 
401  return set;
402 }
403 
404 
405 /*----------------------------------------------------------------------*
406  * Set operations using hashing (dnahash) *
407  *----------------------------------------------------------------------*/
430 l_ok
432  SARRAY **psad,
433  L_DNAHASH **pdahash)
434 {
435 char *str;
436 l_int32 i, n, index, items;
437 l_uint32 nsize;
438 l_uint64 key;
439 SARRAY *sad;
440 L_DNAHASH *dahash;
441 
442  PROCNAME("sarrayRemoveDupsByHash");
443 
444  if (pdahash) *pdahash = NULL;
445  if (!psad)
446  return ERROR_INT("&sad not defined", procName, 1);
447  *psad = NULL;
448  if (!sas)
449  return ERROR_INT("sas not defined", procName, 1);
450 
451  n = sarrayGetCount(sas);
452  findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
453  dahash = l_dnaHashCreate(nsize, 8);
454  sad = sarrayCreate(n);
455  *psad = sad;
456  for (i = 0, items = 0; i < n; i++) {
457  str = sarrayGetString(sas, i, L_NOCOPY);
458  sarrayFindStringByHash(sad, dahash, str, &index);
459  if (index < 0) { /* not found */
460  l_hashStringToUint64(str, &key);
461  l_dnaHashAdd(dahash, key, (l_float64)items);
462  sarrayAddString(sad, str, L_COPY);
463  items++;
464  }
465  }
466 
467  if (pdahash)
468  *pdahash = dahash;
469  else
470  l_dnaHashDestroy(&dahash);
471  return 0;
472 }
473 
474 
487 SARRAY *
489  SARRAY *sa2)
490 {
491 char *str;
492 l_int32 n1, n2, nsmall, i, index1, index2;
493 l_uint32 nsize2;
494 l_uint64 key;
495 L_DNAHASH *dahash1, *dahash2;
496 SARRAY *sa_small, *sa_big, *sad;
497 
498  PROCNAME("sarrayIntersectionByHash");
499 
500  if (!sa1)
501  return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
502  if (!sa2)
503  return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);
504 
505  /* Put the elements of the biggest sarray into a dnahash */
506  n1 = sarrayGetCount(sa1);
507  n2 = sarrayGetCount(sa2);
508  sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
509  sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
510  dahash1 = l_dnaHashCreateFromSarray(sa_big);
511 
512  /* Build up the intersection of strings. Add to %sad
513  * if the string is in sa_big (using dahash1) but hasn't
514  * yet been seen in the traversal of sa_small (using dahash2). */
515  sad = sarrayCreate(0);
516  nsmall = sarrayGetCount(sa_small);
517  findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */
518  dahash2 = l_dnaHashCreate(nsize2, 0);
519  for (i = 0; i < nsmall; i++) {
520  str = sarrayGetString(sa_small, i, L_NOCOPY);
521  sarrayFindStringByHash(sa_big, dahash1, str, &index1);
522  if (index1 >= 0) {
523  sarrayFindStringByHash(sa_small, dahash2, str, &index2);
524  if (index2 == -1) {
525  sarrayAddString(sad, str, L_COPY);
526  l_hashStringToUint64(str, &key);
527  l_dnaHashAdd(dahash2, key, (l_float64)i);
528  }
529  }
530  }
531 
532  l_dnaHashDestroy(&dahash1);
533  l_dnaHashDestroy(&dahash2);
534  return sad;
535 }
536 
537 
562 l_ok
564  L_DNAHASH *dahash,
565  const char *str,
566  l_int32 *pindex)
567 {
568 char *stri;
569 l_int32 i, nvals, index;
570 l_uint64 key;
571 L_DNA *da;
572 
573  PROCNAME("sarrayFindStringByHash");
574 
575  if (!pindex)
576  return ERROR_INT("&index not defined", procName, 1);
577  *pindex = -1;
578  if (!sa)
579  return ERROR_INT("sa not defined", procName, 1);
580  if (!dahash)
581  return ERROR_INT("dahash not defined", procName, 1);
582 
583  l_hashStringToUint64(str, &key);
584  da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
585  if (!da) return 0;
586 
587  /* Run through the da, looking for this string */
588  nvals = l_dnaGetCount(da);
589  for (i = 0; i < nvals; i++) {
590  l_dnaGetIValue(da, i, &index);
591  stri = sarrayGetString(sa, index, L_NOCOPY);
592  if (!strcmp(str, stri)) { /* duplicate */
593  *pindex = index;
594  return 0;
595  }
596  }
597 
598  return 0;
599 }
600 
601 
608 L_DNAHASH *
610 {
611 char *str;
612 l_int32 i, n;
613 l_uint32 nsize;
614 l_uint64 key;
615 L_DNAHASH *dahash;
616 
617  /* Build up dnaHash of indices, hashed by a 64-bit key that
618  * should randomize the lower bits used in bucket selection.
619  * Having about 20 pts in each bucket is roughly optimal. */
620  n = sarrayGetCount(sa);
621  findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
622 /* fprintf(stderr, "Prime used: %d\n", nsize); */
623 
624  /* Add each string, using the hash as key and the index into %sa
625  * as the value. Storing the index enables operations that check
626  * for duplicates. */
627  dahash = l_dnaHashCreate(nsize, 8);
628  for (i = 0; i < n; i++) {
629  str = sarrayGetString(sa, i, L_NOCOPY);
630  l_hashStringToUint64(str, &key);
631  l_dnaHashAdd(dahash, key, (l_float64)i);
632  }
633 
634  return dahash;
635 }
636 
637 
638 /*----------------------------------------------------------------------*
639  * Miscellaneous operations *
640  *----------------------------------------------------------------------*/
647 SARRAY *
649 {
650 char buf[32];
651 l_int32 i;
652 SARRAY *sa;
653 
654  PROCNAME("sarrayGenerateIntegers");
655 
656  if ((sa = sarrayCreate(n)) == NULL)
657  return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
658  for (i = 0; i < n; i++) {
659  snprintf(buf, sizeof(buf), "%d", i);
660  sarrayAddString(sa, buf, L_COPY);
661  }
662  return sa;
663 }
664 
665 
687 l_ok
689  const char *keystring,
690  char **pvalstring)
691 {
692 char *key, *val, *str;
693 l_int32 i, n;
694 SARRAY *sa1;
695 
696  PROCNAME("sarrayLookupCSKV");
697 
698  if (!pvalstring)
699  return ERROR_INT("&valstring not defined", procName, 1);
700  *pvalstring = NULL;
701  if (!sa)
702  return ERROR_INT("sa not defined", procName, 1);
703  if (!keystring)
704  return ERROR_INT("keystring not defined", procName, 1);
705 
706  n = sarrayGetCount(sa);
707  for (i = 0; i < n; i++) {
708  str = sarrayGetString(sa, i, L_NOCOPY);
709  sa1 = sarrayCreate(2);
710  sarraySplitString(sa1, str, ",");
711  if (sarrayGetCount(sa1) != 2) {
712  sarrayDestroy(&sa1);
713  continue;
714  }
715  key = sarrayGetString(sa1, 0, L_NOCOPY);
716  val = sarrayGetString(sa1, 1, L_NOCOPY);
717  if (!strcmp(key, keystring)) {
718  *pvalstring = stringNew(val);
719  sarrayDestroy(&sa1);
720  return 0;
721  }
722  sarrayDestroy(&sa1);
723  }
724 
725  return 0;
726 }
Definition: pix.h:717
L_DNA * l_dnaHashGetDna(L_DNAHASH *dahash, l_uint64 key, l_int32 copyflag)
l_dnaHashGetDna()
Definition: dnahash.c:232
l_int32 l_dnaGetCount(L_DNA *da)
l_dnaGetCount()
Definition: dnabasic.c:597
SARRAY * sarrayCopy(SARRAY *sa)
sarrayCopy()
Definition: sarray1.c:393
SARRAY * sarrayGenerateIntegers(l_int32 n)
sarrayGenerateIntegers()
Definition: sarray2.c:648
Definition: pix.h:716
void l_dnaHashDestroy(L_DNAHASH **pdahash)
l_dnaHashDestroy()
Definition: dnahash.c:152
char * stringNew(const char *src)
stringNew()
Definition: utils2.c:215
SARRAY * sarrayUnionByAset(SARRAY *sa1, SARRAY *sa2)
sarrayUnionByAset()
Definition: sarray2.c:237
l_ok l_dnaGetIValue(L_DNA *da, l_int32 index, l_int32 *pival)
l_dnaGetIValue()
Definition: dnabasic.c:693
SARRAY * sarrayCreate(l_int32 n)
sarrayCreate()
Definition: sarray1.c:163
Definition: array.h:83
L_ASET * l_asetCreateFromSarray(SARRAY *sa)
l_asetCreateFromSarray()
Definition: sarray2.c:379
l_ok findNextLargerPrime(l_int32 start, l_uint32 *pprime)
findNextLargerPrime()
Definition: utils1.c:689
L_DNAHASH * l_dnaHashCreateFromSarray(SARRAY *sa)
l_dnaHashCreateFromSarray()
Definition: sarray2.c:609
SARRAY * sarrayRemoveDupsByAset(SARRAY *sas)
sarrayRemoveDupsByAset()
Definition: sarray2.c:277
Definition: array.h:116
l_ok l_dnaHashAdd(L_DNAHASH *dahash, l_uint64 key, l_float64 value)
l_dnaHashAdd()
Definition: dnahash.c:267
l_ok numaGetIValue(NUMA *na, l_int32 index, l_int32 *pival)
numaGetIValue()
Definition: numabasic.c:727
Definition: array.h:59
SARRAY * sarraySortByIndex(SARRAY *sain, NUMA *naindex)
sarraySortByIndex()
Definition: sarray2.c:145
l_ok sarrayAddString(SARRAY *sa, const char *string, l_int32 copyflag)
sarrayAddString()
Definition: sarray1.c:446
l_int32 stringCompareLexical(const char *str1, const char *str2)
stringCompareLexical()
Definition: sarray2.c:185
l_ok l_hashStringToUint64(const char *str, l_uint64 *phash)
l_hashStringToUint64()
Definition: utils1.c:579
char * sarrayGetString(SARRAY *sa, l_int32 index, l_int32 copyflag)
sarrayGetString()
Definition: sarray1.c:681
l_ok sarrayRemoveDupsByHash(SARRAY *sas, SARRAY **psad, L_DNAHASH **pdahash)
sarrayRemoveDupsByHash()
Definition: sarray2.c:431
l_int32 sarrayGetCount(SARRAY *sa)
sarrayGetCount()
Definition: sarray1.c:621
l_ok sarrayLookupCSKV(SARRAY *sa, const char *keystring, char **pvalstring)
sarrayLookupCSKV()
Definition: sarray2.c:688
l_ok sarrayJoin(SARRAY *sa1, SARRAY *sa2)
sarrayJoin()
Definition: sarray1.c:880
SARRAY * sarraySort(SARRAY *saout, SARRAY *sain, l_int32 sortorder)
sarraySort()
Definition: sarray2.c:95
Definition: pix.h:718
l_ok sarrayFindStringByHash(SARRAY *sa, L_DNAHASH *dahash, const char *str, l_int32 *pindex)
sarrayFindStringByHash()
Definition: sarray2.c:563
SARRAY * sarrayIntersectionByAset(SARRAY *sa1, SARRAY *sa2)
sarrayIntersectionByAset()
Definition: sarray2.c:328
Definition: rbtree.h:61
L_DNAHASH * l_dnaHashCreate(l_int32 nbuckets, l_int32 initsize)
l_dnaHashCreate()
Definition: dnahash.c:122
char ** array
Definition: array.h:121
SARRAY * sarrayIntersectionByHash(SARRAY *sa1, SARRAY *sa2)
sarrayIntersectionByHash()
Definition: sarray2.c:488
void sarrayDestroy(SARRAY **psa)
sarrayDestroy()
Definition: sarray1.c:355