word_count.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define READ_MODE "rt"    //for windows

typedef struct WordCountTreeNode{
    char *word;
    int count;
    struct WordCountTreeNode *left;    //upper
    struct WordCountTreeNode *right;    //bottom
}WCTNode;

WCTNode* wctnode_add(WCTNode *t, char *w);
void wctnode_print(WCTNode *t);

char* get_token(FILE *readfile);

static int usage(int exit_status){
    fputs("Usage: token_split TEXT_FILE\n", stderr);
    return exit_status;
}

int main(int argc, char *argv[]){
    FILE *readfile;
    char *word;
    WCTNode *root = NULL;
    
    if (argc != 2) return usage(0);
    if ( (readfile = fopen(argv[1], READ_MODE) ) == NULL)
        perror(argv[1]);
    
    //根っこをつくる
    word = get_token(readfile);
    if (word != NULL){
        root = wctnode_add(root, word);
        free(word);
    }
    
    //根っこから継ぎ足していく
    while( (word = get_token(NULL) ) != NULL){
        root = wctnode_add(root, word);
        free(word);
    }
    
    wctnode_print(root);
    
    fclose(readfile);
    return 0;
}



/* - * - * - * - * - * - * - * - * - * - * - * - * - * - */

static int istoken(int c);
static void chomp(char *s);

char* get_token(FILE *readfile){
    static FILE *fp;
    int c, i;
    long bpos, len;
    char *token;
    
    if (readfile != NULL) fp = readfile;    //初回のみ
    
    while( (c = fgetc(fp) ) != EOF){
        if (istoken(c)){
            bpos = ftell(fp);
            while(istoken(c = fgetc(fp)));
            len = ftell(fp) - bpos;
            if ( (token = (char*)malloc(len + 1) ) == NULL)
                return NULL;
            
            fseek(fp, (len + 1) * (-1), SEEK_CUR);      //tokenの1byte前なので
            for(i = 0; i < len; i++) token[i] = fgetc(fp);//tokenからfgetcする
            token[i] = '\0';
            chomp(token);
            return token;
        }
    }
    
    return NULL;    //tokenに出会わずEOF
}

//istoken: false...0, true...1
static int istoken(int c){
    if (isalnum(c)) return 1;
    if (c == '\'' || c == '-')
        return 1;
    return 0;
}

static void chomp(char *s){
    if (*s == '\0') return;
    while(*s != '\0') s++;
    if (*(--s) == '\n') *s = '\0';
}


/* - * - * - * - * - * - * - * - * - * - * - * - * - * - */

static WCTNode* new(void);
static char* dup(char *str);

WCTNode* wctnode_add(WCTNode *t, char *w){
    int cmp;
    
    if (t == NULL){
        t = new();
        t->word = dup(w);
    }else{
        cmp = strcmp(w, t->word);
        if (cmp == 0){
            t->count++;
        }else if (cmp < 0){
            t->left = wctnode_add(t->left, w);
        }else{
            t->right = wctnode_add(t->right, w);
        }
    }
    
    return t;
}

void wctnode_print(WCTNode *t){
    if (t != NULL){
        wctnode_print(t->left);
        printf("%4d\t%s\n", t->count, t->word);
        wctnode_print(t->right);
    }
}

static WCTNode* new(void){
    WCTNode *t;
    t = (WCTNode*)malloc(sizeof(WCTNode));
    if (t != NULL){
        t->count = 1;
        t->word = NULL;
        t->left = t->right = NULL;
    }
    return t;
}


static char* dup(char *str){
    char *s;
    s = (char*)malloc(strlen(str) + 1);
    if (s != NULL) strcpy(s, str);
    return s;
}