/* * Computing all tf's and df's for all substrings in a text. * * tfdf.c * Dec. 15, 1997. * Mikio Yamamoto * */ #include #include #include "util.h" #define STACK_DEPTH 1000 struct suf s; int *art_table; int art_num; int *art_link; read2(int fd, void *p, int size, int err_id){ if(read(fd, p, size) <= 0) read_error(err_id); } read_error(int err_id){ fprintf(stderr, "Read error: %d\n", err_id); exit(1); } int *read_artbound(char *filename, int *art_num){ int fd; int *artbl; if((fd = open(filename, O_RDONLY, 0)) == -1){ fprintf(stderr, "can't open %s\n", filename); exit(1); } read2(fd, art_num, sizeof(int), 0); artbl = (int *)malloc(sizeof(int) * (*art_num + 1) ); read2(fd, artbl+1, sizeof(int) * *art_num, 1); *artbl = -1; return(artbl); } int get_docnum(int pos){ int beg = 0; int end; int mid; end = art_num; mid = (beg + end) / 2; while(beg != mid){ if(pos > art_table[mid]){ beg = mid; } else { end = mid; } mid = (beg + end) / 2; } return(mid); } /* * stack proc. */ struct stack_ele { int sufi; int suf; short slcp; int df; int dfcancel; }; struct stack_ele *stack; int stackp; int max_depth; create_stack(){ stack = (struct stack_ele *)malloc(sizeof(struct stack_ele) * STACK_DEPTH); stackp = 0; max_depth = 0; } int push(){ return(stackp++); } int pop() { return(--stackp); } push_dummy_ele(){ int p; p = push(); stack[p].slcp = -1; stack[p].sufi = -1; stack[p].df = 0; stack[p].dfcancel = 0; } short stack_top_sclp(){ return(stack[stackp-1].slcp); } /* * stack proc. end */ void set_expire_df(int i, int suf){ int beg = 0; int end; int mid; int docnum; docnum = get_docnum(suf); if(art_link[docnum] == -1){ art_link[docnum] = i; return; } end = stackp; mid = (beg + end) / 2; while(beg != mid){ if(art_link[docnum] >= stack[mid].sufi){ beg = mid; } else { end = mid; } mid = (beg + end) / 2; } /* fprintf(stderr, "frame(suf=%d, doc=%d) = %d", suf, docnum, mid); */ stack[mid].dfcancel++; art_link[docnum] = i; } push_a_suffix(int i, int suf, short slcp){ int p; p = push(); stack[p].sufi = i; stack[p].suf = suf; stack[p].slcp = slcp; stack[p].df = 1; stack[p].dfcancel = 0; } push_a_suffix_with_inherit(int i, int df, int suf, short slcp){ int p; p = push(); stack[p].suf = suf; stack[p].slcp = slcp; stack[p].df = df; stack[p].dfcancel = 0; } int output_substrings(int i, short slcp){ int p; int df; short from; p = pop(); if(slcp > stack_top_sclp()) from = slcp; else from = stack_top_sclp(); df = stack[p].df - stack[p].dfcancel; if(from > stack[p].slcp){ fprintf(stderr, "tfdf ERROR: from=%d, to=%d\n", from, stack[p].slcp); } if(from != stack[p].slcp){ putw(stack[p].suf, stdout); fwrite(&from, sizeof(short), 1, stdout); fwrite(&(stack[p].slcp), sizeof(short), 1, stdout); putw(i - stack[p].sufi + 1, stdout); putw(df, stdout); } return(df); } void add_df_top_frame(int df){ stack[stackp-1].df += df; } print_usage(){ fprintf(stderr, "USAGE: tfdf corpus > tf-df-stream(binary)\n"); exit(1); } /* void readcor(struct suf *s, char *filename){ char buf[256]; int n; s->text = mmapfile(filename, &n); s->N = n / sizeof(int); } */ main(int argc, char **argv){ char filename[512]; int i; int pre_df; FILE *fsuf, *flcp; int suf; short slcp; if(argc != 2) print_usage(); /* readcor(&s, argv[1]); */ sprintf(filename, "%s.suf", argv[1]); if((fsuf = fopen(filename, "r")) == NULL){ fprintf(stderr, "can't open %s\n", filename); exit(1); } sprintf(filename, "%s.slcp", argv[1]); if((flcp = fopen(filename, "r")) == NULL){ fprintf(stderr, "can't open %s\n", filename); exit(1); } sprintf(filename, "%s.ab", argv[1]); art_table = read_artbound(filename, &art_num); art_link = (int *)malloc(sizeof(int) * art_num); for(i = 0; i < art_num; i++) art_link[i] = -1; create_stack(); push_dummy_ele(); i = 0; while(fread(&suf, sizeof(int), 1, fsuf) != 0){ if(fread(&slcp, sizeof(short), 1, flcp) == 0){ fprintf(stderr, "slcp file isn't compatible to suf file\n"); exit(2); } /* fprintf(stderr, "set_expire_df(i=%d,xx): ", i); set_expire_df(i, suf); fprintf(stderr, "\n"); */ set_expire_df(i, suf); if(slcp > stack_top_sclp()) push_a_suffix(i, suf, slcp); else { pre_df = 1; while(stack_top_sclp() >= slcp){ add_df_top_frame(pre_df); pre_df = output_substrings(i, slcp); } push_a_suffix_with_inherit(i, pre_df, suf, slcp); } i++; } }