#include #define Corpus "to be$or$not to be$" char *corpus; int N; int *s; int *lcp; int *docid; int *doclink; int suffix_compare(int *a, int *b){ return strcmp(corpus+*a, corpus+*b); } int complcp(int i, int *sarray){ int j; for(j = 0; corpus[sarray[i]+j] == corpus[sarray[i-1]+j]; j++); return j; } int *suffix_array(char *cor){ int *sarray = (int *)malloc(sizeof(int) * N); int i; for(i = 0; i < N; i++) sarray[i] = i; qsort(sarray, N, sizeof(int), suffix_compare); return sarray; } int *lcp_array(int *sarray){ int *lcparray = (int *)malloc(sizeof(int) * N + 1); int i; for(i = 1; i < N; i++) { lcparray[i] = complcp(i, sarray); } lcparray[0] = lcparray[N] = 0; return lcparray; } int *doc_table(char *corpus, int *docnum){ int *doc_id = (int *)malloc(sizeof(int) * N); int i; int id = 0; for(i = 0; i < N; i++){ doc_id[i] = id; if(corpus[i] == '$') id++; } *docnum = id; return doc_id; } int doc(int i){ return docid[s[i]]; } #define STACK_SIZE 100000 #define Top_i (stack[sp-1].i) #define Top_k (stack[sp-1].k) #define Top_df (stack[sp-1].df) struct STACK {int i; int k; int df;} stack[STACK_SIZE]; int sp = 0; /* stack pointer */ void push(int i, int k, int df) { if(sp >= STACK_SIZE){ fprintf(stderr, "stack overflow\n"); exit(2);} stack[sp].i = i; stack[sp].k = k; stack[sp++].df = df;} void pop() {sp--;} void output(int i, int j, int k, int df) { int LBL; if(lcp[i] > lcp[j+1]) LBL = lcp[i]; else LBL = lcp[j+1]; if(i==j) printf("trivial <%d,%d>, tf=1\n", i, j); else if(LBL < lcp[k]) printf("nontrivial <%d, %d>, rep=%d, tf=%d, df=%d\n", i, j, k, j-i+1, df); } /* * Print_LDIs_with_df does not only print tf, but also df. * It takes the corpus size, N, and the number of documents, D. * doc() returns the document number of the suffix array's index. * dec_df() decrease a df-counter in the stack when duplicate * counting occurs. */ void dec_df(int docid) { int beg=0, end=sp, mid=sp/2; while(beg != mid) { if(doclink[docid] >= stack[mid].i) beg = mid; else end = mid; mid = (beg + end) / 2; } stack[mid].df--; } print_LDIs_with_df(int N, int D) { int i, j, df; doclink = (int *)malloc(sizeof(int) * D); for(i = 0; i < D; i++) doclink[i] = -1; push(0,0,1); for(j = 0; j < N; j++) { output(j,j,0,1); if(doclink[doc(j)] != -1) dec_df(doc(j)); doclink[doc(j)] = j; df = 1; while (lcp[j+1] < lcp[Top_k]) { df = Top_df + df; output(Top_i,j,Top_k,df); pop(); } /* original code */ push(Top_k, j+1, df); /* efficient code */ /* if(lcp[Top_k] == lcp[j+1]) { Top_k = j+1; Top_df += df; } else push(Top_k, j+1, df); */ } } main(){ int i, D; corpus = Corpus; N = strlen(corpus); s = suffix_array(corpus); printf(" i = "); for(i = 0; i < N; i++) printf("%2d ", i); printf("\n"); printf("cor[i] = "); for(i = 0; i < N; i++) printf(" %c ", corpus[i]); printf("\n"); printf("s[i] = "); for(i = 0; i < N; i++) printf("%2d ", s[i]); printf("\n"); lcp = lcp_array(s); printf("lcp[i] = "); for(i = 0; i < N+1; i++) printf("%2d ", lcp[i]); printf("\n"); docid = doc_table(corpus, &D); printf("doc[i] = "); for(i = 0; i < N; i++) printf("%2d ", docid[i]); printf("\n"); print_LDIs_with_df(N, D); }