From: huddles@corp1.retail.com ("Huddleston, Joe") Newsgroups: comp.databases.informix Subject: FW: Phonetic searching algorithms Date: 25 Jan 1995 12:53:31 -0500 granholm@digex.net, if you have a problem with unloading the attached c code file, e-mail me and i'll send it in the body of a message. -joe- (huddles@hasting.com) [[ CFUNC.C : 2398 in CFUNC.C ]] ---------- From: Morgan, Joseph To: Huddleston, Joe Subject: RE: Phonetic searching algorithms Date: Wednesday, January 25, 1995 9:23AM Joe ... I have a rather robust C library. In there is a Soundex function that I have been developing over the years. It is an improved Russellian soundex procedure. I wrote it, though, using other functions in the library. The file is: /u/morganj/4gl/lib/cfunc.c He would need to compile the library with: cc -c cfunc.c And then include that as an additional object library in his program module. Joe. ---------- From: ilist To: informix-list Subject: Phonetic searching algorithms Date: Wednesday, January 25, 1995 12:23AM Does anyone have experience writing phonetic search algorithms for I4GL (or ESQL/C) I am interested in any ideas / source code. Thanks in advance. granholm@digex.net #include #include #include #include /* ======================= */ #define NIL(type) (type *)0 /* Points ANYTHING to NULL */ /* ======================= */ /* ========================= */ /* Local Function Prototypes */ /* ========================= */ #define MAXFNAME 144 typedef struct fglfileTAG { FILE *ptr; char name[MAXFNAME]; int open; struct fglfileTAG *nextfile; } FGLFILE; FGLFILE *FirstFGLFile = NIL(FGLFILE); FGLFILE *get_fglfile(char *); FGLFILE *make_fglfile(char *); int isvowel(int); char *strchgchar(char *, int, int); char *strchgstr(char *, char *, char *); int strnicmp(char *, char *, int); char *strremchar(char *, int, char *); char *strremchars(char *, char *, char *); char *strremcharsi(char *, char *, char *); char *strremstr(char *, char *, char *); char *strremstri(char *, char *, char *); char *strupr(char *); char *unlsetstr(char *); void errputs(char *); void protoerr(char *); /* ========================= */ /* 4GL Function Prototypes */ /* ========================= */ int file_delete(int); int file_exists(int); int fglfclose(int); int fglfeof(int); int fglfgets(int); int fglfopen(int); int fglfputs(int); int get_strtoken(int); int gettempfile(int); int getuname(int); int getwhoami(int); int runnoclear(int); int soundex(int); int string_char(int); int string_in_string(int); int string_ini_string(int); int string_rchar(int); int string_remove_chars(int); int string_remove_charsi(int); int string_set(int); int tempstropen(int); int tempstrwrite(int); int tempstrclose(int); /* ========= */ /* Functions */ /* ========= */ /* ========================================================== */ /* Deletes a system file. Returns 1 if successful, 0 if not. */ /* ========================================================== */ int file_delete(int n) { char fname[MAXFNAME]; int status_flag; if (n != 1) { errputs("\nERROR (file_delete): invalid protocol"); return(0); } popquote(fname, MAXFNAME); unlsetstr(fname); status_flag = unlink(fname); retint(++status_flag); return(1); } /* ================================================================ */ /* Checks to see if a given file exists. Returns 1 if so, 0 if not */ /* or if there is an error. */ /* ================================================================ */ int file_exists(int n) { char fname[MAXFNAME]; int status_flag; FILE *file_ptr; if (n != 1) { errputs("\nERROR (file_exists): invalid protocol"); return(0); } popquote(fname, MAXFNAME); unlsetstr(fname); if ((file_ptr = fopen(fname, "r")) == NULL) status_flag = 0; else { fclose(file_ptr); status_flag = 1; } retint(status_flag); return(1); } /* ======================================== */ /* Closes a 4gl accessible direct read file */ /* ======================================== */ int fglfclose(int n) { char fname[MAXFNAME]; FGLFILE *thisfile; if (n != 1) { puts("\nERROR (fglfclose): Invalid Protocol."); return 0; } popquote(fname, MAXFNAME); unlsetstr(fname); if ((thisfile = get_fglfile(fname)) == NIL(FGLFILE)) return 0; fflush(thisfile->ptr); fclose(thisfile->ptr); thisfile->open = 0; return 0; } /* =============================================================== */ /* This function tests to see if a given file pointer has actually */ /* reached the end of a file for a 4gl direct read file. */ /* =============================================================== */ int fglfeof(int n) { char fname[MAXFNAME]; FGLFILE *thisfile; if (n != 1) { puts("\nERROR (fglfeof): Invalid Protocol>"); return 0; } popquote(fname, MAXFNAME); unlsetstr(fname); if ((thisfile = get_fglfile(fname)) == NIL(FGLFILE)) { retint(0); return(1); } retint(feof(thisfile->ptr)); return(1); } /* ===================================================================== */ /* Gets and returns a string and stats from a 4gl direct accessible file */ /* ===================================================================== */ int fglfgets(int n) { char fname[MAXFNAME], *buffer; int len, status_flag; FGLFILE *thisfile; if (n != 2) { errputs("\nERROR (fglfgets): Invalid Protocol."); return 0; } popquote(fname, MAXFNAME); unlsetstr(fname); popint(&len); if ((thisfile = get_fglfile(fname)) == NIL(FGLFILE)) { retquote(""); retint(0); return(2); } if (!thisfile->open) { retquote(""); retint(0); return(2); } buffer = (char *)malloc(++len); status_flag = (fgets(buffer, len, thisfile->ptr) == NIL(char)) ? 0 : 1; buffer[strlen(buffer) - 1] = '\0'; retquote(buffer); free(buffer); retint(status_flag); return 2; } /* ===================================================== */ /* This function opens a 4gl accessible direct read file */ /* ===================================================== */ int fglfopen(int n) { FGLFILE *thisfile; char fname[MAXFNAME], fmode[5]; if (n != 2) { protoerr("fglfopen"); return 0; } popquote(fmode, 4); unlsetstr(fmode); popquote(fname, MAXFNAME); unlsetstr(fname); if ((thisfile = get_fglfile(fname)) == NIL(FGLFILE)) { if ((thisfile = make_fglfile(fname)) == NIL(FGLFILE)) { retint(2); return 1; } } if (thisfile->open) { retint(1); return 1; } if ((thisfile->ptr = fopen(fname, fmode)) == NIL(FILE)) { retint(0); return(1); } thisfile->open = 1; retint(10); return(1); } /* ======================================================= */ /* This file writes output to a 4gl direct accessible file */ /* ======================================================= */ int fglfputs(int n) { char fname[MAXFNAME], *buffer; int len; FGLFILE *thisfile; if (n != 3) { protoerr("fglfputs"); return(0); } popquote(fname, MAXFNAME); unlsetstr(fname); popint(&len); len += 16; buffer = (char *)malloc(len); popquote(buffer, len); unlsetstr(buffer); strcat(buffer, "\n"); if ((thisfile = get_fglfile(fname)) == NIL(FGLFILE)) return 0; if (thisfile->ptr == NIL(FILE)) { errputs("File Pointer is NULL"); } fputs(buffer, thisfile->ptr); fflush(thisfile->ptr); free(buffer); return 0; } /* ================================================== */ /* This gets an fglfile by name from the current list */ /* ================================================== */ FGLFILE *get_fglfile(char *fname) { FGLFILE *thisfile; if (FirstFGLFile == NIL(FGLFILE)) return(FirstFGLFile); for (thisfile = FirstFGLFile; thisfile != NIL(FGLFILE); thisfile = thisfile->nextfile) { if (!strcmp(thisfile->name, fname)) break; } return(thisfile); } /* ================================= */ /* This function gets a string token */ /* ================================= */ int get_strtoken(int n) { char string[255], delimiter[2], *ptr, *substring; int start_pos; if (n != 3) { protoerr("get_strtoken"); return 0; } popquote(delimiter, 2); popint(&start_pos); popquote(string, 255); unlsetstr(string); substring = ptr = string + start_pos - 1; if (*ptr == '\0') { retquote(""); retint(-1); return(2); } if (*ptr == delimiter[0]) { ++substring; ++ptr; } while ((*ptr != delimiter[0]) && (*ptr != '\0')) ++ptr; if (*ptr == '\0') retquote(substring); else { *ptr = '\0'; retquote(substring); *ptr = delimiter[0]; } retint(ptr - string + 1); return(2); } /* ======================================================= */ /* This function gets a system certain temporary file name */ /* ======================================================= */ int gettempfile(int n) { char *tmpfile, tmpdir[80]; char *ptr; if (n != 1) return 0; popquote(tmpdir, 80); unlsetstr(tmpdir); tmpfile = tempnam(tmpdir, ""); retquote(tmpfile); return(1); } int getwhoami(int n) { char whoami[81]; FILE *whofile; whofile = popen("whoami", "r"); fgets(whoami, 80, whofile); pclose(whofile); whoami[strlen(whoami) - 1] = '\0'; retquote(whoami); return(1); } int getuname(int n) { char uname[16]; FILE *ufile; ufile = popen("uname", "r"); fgets(uname, 16, ufile); pclose(ufile); uname[strlen(uname) - 1] = '\0'; retquote(uname); return(1); } /* ================================================ */ /* This function opens a 4gl accessible system file */ /* ================================================ */ FGLFILE *make_fglfile(char *fname) { FGLFILE *thisfile, *lastfile; if ((thisfile = (FGLFILE *)malloc(sizeof(FGLFILE))) == NIL(FGLFILE)) return(thisfile); if (FirstFGLFile == NIL(FGLFILE)) { FirstFGLFile = thisfile; } else { for (lastfile = FirstFGLFile; lastfile->nextfile != NIL(FGLFILE); lastfile = lastfile->nextfile); lastfile->nextfile = thisfile; } thisfile->nextfile = NIL(FGLFILE); strcpy(thisfile->name, fname); thisfile->open = 0; return(thisfile); } /* =============================================================== */ /* This function runs a system command WITHOUT clearing the screen */ /* =============================================================== */ int runnoclear(int n) { char syscmd[255]; int wait; char *ptr; if (n != 2) { errputs("\nError (runnoclear): Invalid Protocol."); return(0); } popint(&wait); popquote(syscmd, 254); unlsetstr(syscmd); if (!wait) strcat(syscmd, " &"); system(syscmd); return(0); } /* ================================================================== */ /* The function returns a modified Rusellian Soundex of a string */ /* ================================================================== */ int Soundex(int n) { char *ptr, *dptr, source[256], *target; int len; if (n != 1) { errputs("\nERROR (soundex): Invalid Protocol."); return 0; } popquote(source, 254); unlsetstr(source); strupr(source); target = strchgstr(source, "&", "AND"); strremchars(target, "$!%@#^*()-+=_{}[]\\|\"':;<>,.?/", source); len = strlen(source); for (ptr = source; *ptr != '\0'; ptr++) { if (isvowel(*ptr) && ptr != source) { /* Leave Vowel if First Ltr */ if (*(ptr + 1) == '\0') { /* If Vowel is Last Ltr */ if (*ptr == 'Y') /* If Vowel is 'Y' */ *ptr = 'I'; /* Convert to 'I' */ if (*ptr == 'E') /* If it is 'E' */ *ptr = ' '; /* Zap It */ } else { *ptr = ' '; /* Zap it Otherwise */ } continue; } if (*(ptr + 1) == *ptr) { dptr = ptr + 1; while (*dptr == *ptr) { *dptr = ' '; ++dptr; } ptr = dptr - 1; continue; } switch (*ptr) { case 'Q': *ptr = 'K'; break; case 'C': if ((*ptr + 1) == 'K') { *ptr = 'K'; *(++ptr) = ' '; } break; case 'P': if (*(ptr + 1) == 'H') { *ptr = 'F'; *(++ptr) = ' '; break; } case 'D': case 'G': case 'K': case 'M': if (*(ptr + 1) == 'N') { *ptr = 'N'; *(++ptr) = ' '; } break; case 'S': case 'T': case 'Z': if (*(ptr + 1) == 'S') { *ptr = 'S'; *(++ptr) = ' '; break; } if (*(ptr + 1) == 'H') *(++ptr) = ' '; break; default: break; } } strremchar(source, ' ', target); retquote(target); return 1; } int isvowel(int c) { if (c == 'A' || c == 'a' || c == 'E' || c == 'e' || c == 'I' || c == 'i' || c == 'O' || c == 'o' || c == 'U' || c == 'u' || c == 'Y' || c == 'y' ) return 1; return 0; } /* =============================================================== */ /* This function returns the location of the first occurrence of a */ /* given character in a string. It also returns a the substring */ /* beginning at such occurrence, or 0 and "" if the given char is */ /* not found. */ /* =============================================================== */ int string_char(int n) { char *string, character[2], *ptr; int len, pos; if (n != 3) { errputs("\nError (string_char): Invalid Protocol."); return(0); } popquote(character, 2); character[1] = '\0'; popint(&len); string = (char *)malloc(++len); popquote(string, len); unlsetstr(string); if ((ptr = strchr(string, character[0])) == NIL(char)) { retquote(""); pos = 0; } else { retquote(ptr); pos = ptr - string + 1; } free(string); retint(pos); return(2); } /* ============================================================== */ /* This function returns both the substring and offset of a given */ /* substring within a string, or "" and 0 if substring is not */ /* found. */ /* ============================================================== */ int string_in_string(int n) { char *string1, *string2, *ptr; int offset, len2, len1; if (n != 4) { errputs("\nError (string_in_string): Invalid Protocol."); return(0); } popint(&len2); string2 = (char *)malloc(++len2); popquote(string2, len2); unlsetstr(string2); popint(&len1); string1 = (char *)malloc(++len1); popquote(string1, len1); unlsetstr(string1); retquote(ptr = strstr(string1, string2)); retint((*ptr == '\0') ? 0 : (ptr - string1 + 1)); free(string2); free(string1); return(2); } /* ============================================================ */ /* This function is a case insensitive version of the one above */ /* ============================================================ */ int string_ini_string(int n) { char *string1, *string2, *origstr, *ptr; int offset, len1, len2; if (n != 4) { errputs("\nError (string_ini_string): Invalid Protocol."); return(0); } popint(&len2); string2 = (char *)malloc(++len2); popquote(string2, len2); unlsetstr(string2); strupr(string2); popint(&len1); string1 = (char *)malloc(++len1); popquote(string1, len1); unlsetstr(string1); origstr = strdup(string1); strupr(string1); ptr = strstr(string1, string2); retquote((*ptr == '\0') ? "" : ptr = origstr + (ptr - string1)); retint((*ptr == '\0') ? 0 : (ptr - origstr + 1)); free(string2); free(string1); free(origstr); return(2); } /* ================================================================ */ /* This function finds the last occurence of a given char in string */ /* ================================================================ */ int string_rchar(int n) { char *string, character[2], *ptr; int len, pos; if (n != 3) { errputs("\nError (string_char): Invalid Protocol."); return(0); } popquote(character, 2); character[1] = '\0'; popint(&len); string = (char *)malloc(++len); popquote(string, len); unlsetstr(string); if ((ptr = strrchr(string, character[0])) == NIL(char)) { retquote(""); pos = 0; } else { retquote(ptr); pos = ptr - string + 1; } free(string); retint(pos); return(2); } /* ============================================================== */ /* This function removes a list of characters from a given string */ /* ============================================================== */ int string_remove_chars(int n) { char *instring, badchars[256], *outstring; int len; if (n != 3) { errputs("\nError (string_remove_chars): Invalid Protocol."); return(0); } popquote(badchars, 255); popint(&len); instring = (char *)malloc(++len); popquote(instring, len); outstring = (char *)malloc(len); unlsetstr(instring); unlsetstr(badchars); strremchars(instring, badchars, outstring); retquote(outstring); free(instring); free(outstring); return(1); } /* ============================================================== */ /* This function removes a list of characters from a given string */ /* The cases of the characters and the string are unimportant. */ /* ============================================================== */ int string_remove_charsi(int n) { char *instring, badchars[256], *outstring; int len; if (n != 3) { errputs("\nError (string_remove_chars): Invalid Protocol."); return(0); } popquote(badchars, 255); popint(&len); instring = (char *)malloc(++len); popquote(instring, len); outstring = (char *)malloc(len); unlsetstr(instring); unlsetstr(badchars); strremcharsi(instring, badchars, outstring); retquote(outstring); free(instring); free(outstring); return(1); } /* ================================================================== */ /* This function sets a string to a given length to a given character */ /* ================================================================== */ int string_set(int n) { char *string, character[3]; int len, x; if (n != 2) { errputs("\nError (string_set): Invalid Protocol."); return(0); } popquote(character, 2); popint(&len); string = (char *)malloc(len + 1); for (x = 0; x < len; x++) *(string + x) = character[0]; *(string + x) = '\0'; retquote(string); free(string); return(1); } /* ============================================================ */ /* This function removes all of a given character from a string */ /* ============================================================ */ char *strremchar(char *str, int c, char *target) { char *ptr; for (ptr = str; *ptr != '\0'; ptr++) { if (*ptr != c) { *target = *ptr; ++target; } } *target = '\0'; return(target); } /* ========================================================= */ /* This function changes all ocurrences of a given character */ /* in a given string to another character */ /* ========================================================= */ char *strchgchar(char *str, int oldch, int newch) { char *ptr = str; while (*ptr != '\0') { if (*ptr == oldch) *ptr = newch; } return(str); } /* ============================================================= */ /* This function changes all occurrences of a sub-string to that */ /* of another substring. */ /* ============================================================= */ char *strchgstr(char *str, char *oldstr, char *newstr) { char *iptr, *nptr, *outstr; unsigned oldlen, newlen, newsize, count = 0; oldlen = strlen(oldstr); newlen = strlen(newstr); iptr = str; while ((nptr = strstr(iptr, oldstr)) != NIL(char)) { ++count; iptr = ++nptr; } newsize = (strlen(str) - (oldlen * count)) + (newlen * count) + 1; outstr = (char *)malloc(newsize); *outstr = '\0'; iptr = str; while ((nptr = strstr(iptr, oldstr)) != NIL(char)) { *nptr = '\0'; strcat(outstr, iptr); strcat(outstr, newstr); iptr = nptr + oldlen; if (*iptr == '\0') break; } if (*iptr != '\0') strcat(outstr, iptr); return(outstr); } int strnicmp(char *str1, char *str2, int n) { char *ustr1, *ustr2; int result; ustr1 = strdup(str1); strupr(ustr1); ustr2 = strdup(str2); strupr(ustr2); result = strncmp(ustr1, ustr2, n); free(ustr1); free(ustr2); return result; } /* ==================================================================== */ /* This function removes all of a given set of characters from a string */ /* ==================================================================== */ char *strremchars(char *instr, char *badchars, char *target) { char *iptr, *optr, *bptr; int match; for (iptr = instr, optr = target; *iptr != '\0'; iptr++) { for (bptr = badchars, match = 0; *bptr != '\0'; bptr++) { if (*bptr == *iptr) { match = 1; break; } } if (!match) *optr++ = *iptr; } *optr = '\0'; return(target); } /* ============================================================= */ /* This function removes all of a given set of characters from a */ /* string where case if the bad character and the string is not */ /* important. */ /* ============================================================= */ char *strremcharsi(char *instr, char *badchars, char *outstr) { char *iptr, *optr, *bptr; int match, ch; strupr(badchars); for (iptr = instr, optr = outstr; *iptr != '\0'; iptr++) { for (bptr = badchars, match = 0; *bptr != '\0'; *bptr++) { ch = toupper(*iptr); if (*bptr == ch) { match = 1; break; } } if (!match) *optr++ = *iptr; } *optr = '\0'; return(outstr); } /* =================================================== */ /* This function removes all substrings from a string. */ /* =================================================== */ char *strremstr(char *instr, char *badstr, char *outstr) { char *iptr, *optr; unsigned len; len = strlen(badstr); for (iptr = instr, optr = outstr; *iptr != '\0'; iptr++) { if (!strncmp(iptr, badstr, len)) { if (strlen(iptr) >= len) iptr += len - 1; else break; } else *optr++ = *iptr; } *optr = '\0'; return(outstr); } /* =================================================== */ /* This function removes all substrings from a string */ /* regardless of case of either string. */ /* =================================================== */ char *strremstri(char *instr, char *badstr, char *outstr) { char *iptr, *optr; unsigned len; len = strlen(badstr); for (iptr = instr, optr = outstr; *iptr != '\0'; iptr++) { if (!strnicmp(iptr, badstr, len)) { if (strlen(iptr) >= len) iptr += len - 1; else break; } else *optr++ = *iptr; } *optr = '\0'; return(outstr); } /* ============================================= */ /* This function converts a string to upper case */ /* ============================================= */ char *strupr(char *str) { char *ptr; for (ptr = str; *ptr != '\0'; ptr++) if (*ptr > 96 && *ptr < 123) *ptr -= 32; return(str); } /* =========================================================== */ /* This function removes padding on the right side of a string */ /* =========================================================== */ char *unlsetstr(char *str) { int slen; char *ptr; slen = strlen(str) - 1; for (ptr = str + slen; *ptr == ' '; ptr--); ++ptr; *ptr = '\0'; return(str); } /* =============== */ /* Error Functions */ /* =============== */ void errputs(char *str) { puts(str); fflush(stdout); } void protoerr(char *str) { printf("\n***** Error (%s): Invalid Protocol\n"); fflush(stdout); } /* =============================================================== */ /* The purpose of the following functions are to evaluate internal */ /* INFORMIX errors. These are not standard functions for use in */ /* final software, but for development only. */ /* =============================================================== */ #define TABSIZE 10 struct elalloc { char *allocarea; short iallocarea; }; extern struct elalloc alloctab[TABSIZE]; FILE *TmpStrFile; int tempstropen(int n) { int status = 0; if ((TmpStrFile = fopen("TSlist.asc", "w")) != NIL(FILE)) status = 1; retint(status); return(1); } int tempstrwrite(int n) { int len, x; char *str; if (n != 2) return(0); popint(&len); str = (char *)malloc(++len); popquote(str, len); fprintf(TmpStrFile, "\n\n%s:\n\t", str); for (x = 0; x < TABSIZE; x++) if (alloctab[x].allocarea != NULL && alloctab[x].iallocarea != 0) fprintf(TmpStrFile, "B%ld=%3d ", x, alloctab[x].iallocarea); fflush(TmpStrFile); free(str); return(0); } int tempstrclose(int n) { fflush(TmpStrFile); fclose(TmpStrFile); return(0); }