| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #ifdef HAVE_CONFIG_H |
| #include <config.h> |
| #endif |
|
|
| #include <string.h> |
| #include <assert.h> |
|
|
| #include <pocketsphinx/err.h> |
| #include <pocketsphinx/logmath.h> |
|
|
| #include "lm/ngram_model.h" |
| #include "util/ckd_alloc.h" |
| #include "util/filename.h" |
| #include "util/pio.h" |
| #include "util/strfuncs.h" |
| #include "util/case.h" |
|
|
| #include "pocketsphinx_internal.h" |
| #include "ngram_model_internal.h" |
| #include "ngram_model_trie.h" |
|
|
| ngram_file_type_t |
| ngram_file_name_to_type(const char *file_name) |
| { |
| const char *ext; |
|
|
| ext = strrchr(file_name, '.'); |
| if (ext == NULL) { |
| return NGRAM_INVALID; |
| } |
| if (0 == strcmp_nocase(ext, ".gz")) { |
| while (--ext >= file_name) { |
| if (*ext == '.') |
| break; |
| } |
| if (ext < file_name) { |
| return NGRAM_INVALID; |
| } |
| } |
| else if (0 == strcmp_nocase(ext, ".bz2")) { |
| while (--ext >= file_name) { |
| if (*ext == '.') |
| break; |
| } |
| if (ext < file_name) { |
| return NGRAM_INVALID; |
| } |
| } |
| |
| if (0 == strncmp_nocase(ext, ".ARPA", 5)) |
| return NGRAM_ARPA; |
| if (0 == strncmp_nocase(ext, ".DMP", 4) |
| || 0 == strncmp_nocase(ext, ".BIN", 4)) |
| return NGRAM_BIN; |
| return NGRAM_INVALID; |
| } |
|
|
| ngram_file_type_t |
| ngram_str_to_type(const char *str_name) |
| { |
| if (0 == strcmp_nocase(str_name, "arpa")) |
| return NGRAM_ARPA; |
| if (0 == strcmp_nocase(str_name, "dmp") |
| || 0 == strcmp_nocase(str_name, "bin")) |
| return NGRAM_BIN; |
| return NGRAM_INVALID; |
| } |
|
|
| char const * |
| ngram_type_to_str(int type) |
| { |
| switch (type) { |
| case NGRAM_ARPA: |
| return "arpa"; |
| case NGRAM_BIN: |
| return "dmp/bin"; |
| default: |
| return NULL; |
| } |
| } |
|
|
|
|
| ngram_model_t * |
| ngram_model_read(cmd_ln_t * config, |
| const char *file_name, |
| ngram_file_type_t file_type, logmath_t * lmath) |
| { |
| ngram_model_t *model = NULL; |
| switch (file_type) { |
| case NGRAM_AUTO:{ |
| if ((model = |
| ngram_model_trie_read_bin(config, file_name, |
| lmath)) != NULL) |
| break; |
| if ((model = |
| ngram_model_trie_read_arpa(config, file_name, |
| lmath)) != NULL) |
| break; |
| if ((model = |
| ngram_model_trie_read_dmp(config, file_name, |
| lmath)) != NULL) |
| break; |
| return NULL; |
| } |
| case NGRAM_ARPA: |
| model = ngram_model_trie_read_arpa(config, file_name, lmath); |
| break; |
| case NGRAM_BIN: |
| if ((model = |
| ngram_model_trie_read_bin(config, file_name, lmath)) != NULL) |
| break; |
| if ((model = |
| ngram_model_trie_read_dmp(config, file_name, lmath)) != NULL) |
| break; |
| return NULL; |
| default: |
| E_ERROR("language model file type not supported\n"); |
| return NULL; |
| } |
|
|
| |
| if (config) { |
| float32 lw = 1.0; |
| float32 wip = 1.0; |
|
|
| lw = ps_config_float(config, "lw"); |
| wip = ps_config_float(config, "wip"); |
|
|
| ngram_model_apply_weights(model, lw, wip); |
| } |
|
|
| return model; |
| } |
|
|
| int |
| ngram_model_write(ngram_model_t * model, const char *file_name, |
| ngram_file_type_t file_type) |
| { |
| switch (file_type) { |
| case NGRAM_AUTO:{ |
| file_type = ngram_file_name_to_type(file_name); |
| |
| if (file_type == NGRAM_INVALID) |
| file_type = NGRAM_ARPA; |
| return ngram_model_write(model, file_name, file_type); |
| } |
| case NGRAM_ARPA: |
| return ngram_model_trie_write_arpa(model, file_name); |
| case NGRAM_BIN: |
| return ngram_model_trie_write_bin(model, file_name); |
| default: |
| E_ERROR("language model file type not supported\n"); |
| return -1; |
| } |
| E_ERROR("language model file type not supported\n"); |
| return -1; |
| } |
|
|
/**
 * Initialize (or re-initialize) the common base fields of a model.
 *
 * This may be called on a model that already holds data: existing count
 * arrays, word strings, and the word-ID hash table are reused or resized
 * rather than leaked.
 *
 * @param base  model to initialize; may be partially populated already.
 * @param funcs implementation vtable.
 * @param lmath log-math context; weights are reset only if it changed.
 * @param n maximum n-gram order.
 * @param n_unigram number of unigrams (word strings) to allocate for.
 * @return 0 (always succeeds).
 */
int32
ngram_model_init(ngram_model_t * base,
                 ngram_funcs_t * funcs,
                 logmath_t * lmath, int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate the count array on first use only. */
    if (base->n_counts == NULL)
        base->n_counts = (uint32 *) ckd_calloc(n, sizeof(*base->n_counts));
    /* If the log-math context changed, reset weights to neutral values. */
    if (base->lmath != lmath) {
        base->lw = 1.0;
        base->log_wip = 0;
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    if (base->word_str) {
        /* Free old word strings only if we own them (writable model). */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str =
            (char **) ckd_realloc(base->word_str,
                                  n_unigram * sizeof(char *));
    }
    else {
        base->word_str = (char **) ckd_calloc(n_unigram, sizeof(char *));
    }
    /* Reuse the word-ID hash table if one exists, else create it. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}
|
|
| ngram_model_t * |
| ngram_model_retain(ngram_model_t * model) |
| { |
| ++model->refcount; |
| return model; |
| } |
|
|
| void |
| ngram_model_flush(ngram_model_t * model) |
| { |
| if (model->funcs && model->funcs->flush) |
| (*model->funcs->flush) (model); |
| } |
|
|
/**
 * Release one reference to a model, destroying it at refcount zero.
 *
 * @return 0 when the model was destroyed (or was NULL), otherwise the
 *         remaining reference count.
 */
int
ngram_model_free(ngram_model_t * model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    /* Let the implementation free its private data first. */
    if (model->funcs && model->funcs->free)
        (*model->funcs->free) (model);
    if (model->writable) {
        /* Writable model: all word strings are owned here. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Read-only (e.g. mmap'd) model: only strings for class words
         * were allocated by us; find them via the class structures. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            /* Words in the contiguous in-class range. */
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            /* Words added later, tracked in the class hash table. */
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
|
|
/**
 * Convert all word strings in the model to upper or lower case.
 *
 * Markers beginning with '<' or '[' (e.g. "<s>", "[NOISE]") are left
 * untouched.  The word-ID hash table is rebuilt against the converted
 * strings.  After this call the model owns all its word strings
 * (writable is forced TRUE).
 *
 * @param kase NGRAM_UPPER or NGRAM_LOWER; other values leave strings as-is.
 * @return 0 (always succeeds; duplicates after conversion only warn).
 */
int
ngram_model_casefold(ngram_model_t * model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already owned (heap-allocated) by this model? */
    writable = model->writable;
    /* Either way, they will be after this (we may copy read-only ones). */
    model->writable = TRUE;

    /* Rebuild the word-ID mapping from scratch, since the string keys
     * themselves change under conversion. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            /* Read-only storage (e.g. mmap): take a private copy. */
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-convert sentence markers or class tags. */
        if (outstr[0] == '<' || outstr[0] == '[') {
        }
        else {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Re-enter into the new hash table; conversion can collapse two
         * distinct words into one key, which we only warn about. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap in the rebuilt table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
|
|
| int |
| ngram_model_apply_weights(ngram_model_t * model, float32 lw, float32 wip) |
| { |
| return (*model->funcs->apply_weights) (model, lw, wip); |
| } |
|
|
| float32 |
| ngram_model_get_weights(ngram_model_t * model, int32 * out_log_wip) |
| { |
| if (out_log_wip) |
| *out_log_wip = model->log_wip; |
| return model->lw; |
| } |
|
|
|
|
/**
 * Weighted n-gram score for word wid given a history.
 *
 * Class word IDs are "declassified": the predicted word is scored via its
 * class tag plus its in-class log probability, and class IDs in the
 * history are rewritten IN PLACE to their tag IDs (the caller's array is
 * mutated).
 *
 * @param history history word IDs, most recent first.
 * @param n_hist number of entries in history.
 * @param n_used output: number of history words actually used.
 * @return log score, or model->log_zero for OOV / non-member class words.
 */
int32
ngram_ng_score(ngram_model_t * model, int32 wid, int32 * history,
               int32 n_hist, int32 * n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary: OOV words score as zero probability. */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* Replace a class word with its tag, remembering the in-class weight. */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1)  /* Sentinel: word not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score) (model, wid, history, n_hist, n_used);

    /* Multiply by the in-class probability (addition in log domain). */
    return score + class_weight;
}
|
|
| int32 |
| ngram_score(ngram_model_t * model, const char *word, ...) |
| { |
| va_list history; |
| const char *hword; |
| int32 *histid; |
| int32 n_hist; |
| int32 n_used; |
| int32 prob; |
|
|
| va_start(history, word); |
| n_hist = 0; |
| while ((hword = va_arg(history, const char *)) != NULL) |
| ++n_hist; |
| va_end(history); |
|
|
| histid = ckd_calloc(n_hist, sizeof(*histid)); |
| va_start(history, word); |
| n_hist = 0; |
| while ((hword = va_arg(history, const char *)) != NULL) { |
| histid[n_hist] = ngram_wid(model, hword); |
| ++n_hist; |
| } |
| va_end(history); |
|
|
| prob = ngram_ng_score(model, ngram_wid(model, word), |
| histid, n_hist, &n_used); |
| ckd_free(histid); |
| return prob; |
| } |
|
|
| int32 |
| ngram_tg_score(ngram_model_t * model, int32 w3, int32 w2, int32 w1, |
| int32 * n_used) |
| { |
| int32 hist[2]; |
| hist[0] = w2; |
| hist[1] = w1; |
| return ngram_ng_score(model, w3, hist, 2, n_used); |
| } |
|
|
| int32 |
| ngram_bg_score(ngram_model_t * model, int32 w2, int32 w1, int32 * n_used) |
| { |
| return ngram_ng_score(model, w2, &w1, 1, n_used); |
| } |
|
|
/**
 * Raw (unweighted) n-gram log probability for wid given a history.
 *
 * Mirrors ngram_ng_score() but uses the implementation's raw_score hook,
 * bypassing the language weight and insertion penalty.  Like the score
 * variant, it rewrites class word IDs in the caller's history array
 * IN PLACE.
 */
int32
ngram_ng_prob(ngram_model_t * model, int32 wid, int32 * history,
              int32 n_hist, int32 * n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary: OOV probability is zero. */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* Replace a class word with its tag, keeping the in-class weight. */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1)  /* Sentinel: word not found in class. */
            /* NOTE(review): this returns the sentinel value 1, whereas
             * ngram_ng_score() returns model->log_zero in the same
             * situation — confirm this asymmetry is intentional. */
            return class_weight;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score) (model, wid, history,
                                       n_hist, n_used);
    /* Multiply by the in-class probability (addition in log domain). */
    return prob + class_weight;
}
|
|
| int32 |
| ngram_probv(ngram_model_t * model, const char *word, ...) |
| { |
| va_list history; |
| const char *hword; |
| int32 *histid; |
| int32 n_hist; |
| int32 n_used; |
| int32 prob; |
|
|
| va_start(history, word); |
| n_hist = 0; |
| while ((hword = va_arg(history, const char *)) != NULL) |
| ++n_hist; |
| va_end(history); |
|
|
| histid = ckd_calloc(n_hist, sizeof(*histid)); |
| va_start(history, word); |
| n_hist = 0; |
| while ((hword = va_arg(history, const char *)) != NULL) { |
| histid[n_hist] = ngram_wid(model, hword); |
| ++n_hist; |
| } |
| va_end(history); |
|
|
| prob = ngram_ng_prob(model, ngram_wid(model, word), |
| histid, n_hist, &n_used); |
| ckd_free(histid); |
| return prob; |
| } |
|
|
| int32 |
| ngram_prob(ngram_model_t * model, const char* const *words, int32 n) |
| { |
| int32 *ctx_id; |
| int32 nused; |
| int32 prob; |
| int32 wid; |
| uint32 i; |
|
|
| ctx_id = (int32 *) ckd_calloc(n - 1, sizeof(*ctx_id)); |
| for (i = 1; i < (uint32) n; ++i) |
| ctx_id[i - 1] = ngram_wid(model, words[i]); |
|
|
| wid = ngram_wid(model, *words); |
| prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused); |
| ckd_free(ctx_id); |
|
|
| return prob; |
| } |
|
|
| int32 |
| ngram_score_to_prob(ngram_model_t * base, int32 score) |
| { |
| int32 prob; |
|
|
| |
| prob = score - base->log_wip; |
| |
| prob = (int32) (prob / base->lw); |
|
|
| return prob; |
| } |
|
|
| int32 |
| ngram_unknown_wid(ngram_model_t * model) |
| { |
| int32 val; |
|
|
| |
| |
| if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1) |
| return NGRAM_INVALID_WID; |
| else |
| return val; |
| } |
|
|
| int32 |
| ngram_zero(ngram_model_t * model) |
| { |
| return model->log_zero; |
| } |
|
|
| int32 |
| ngram_model_get_size(ngram_model_t * model) |
| { |
| if (model != NULL) |
| return model->n; |
| return 0; |
| } |
|
|
| uint32 const * |
| ngram_model_get_counts(ngram_model_t * model) |
| { |
| if (model != NULL) |
| return model->n_counts; |
| return NULL; |
| } |
|
|
| int32 |
| ngram_wid(ngram_model_t * model, const char *word) |
| { |
| int32 val; |
|
|
| if (hash_table_lookup_int32(model->wid, word, &val) == -1) |
| return ngram_unknown_wid(model); |
| else |
| return val; |
| } |
|
|
| const char * |
| ngram_word(ngram_model_t * model, int32 wid) |
| { |
| |
| wid = NGRAM_BASEWID(wid); |
| if (wid >= model->n_words) |
| return NULL; |
| return model->word_str[wid]; |
| } |
|
|
| |
| |
| |
/**
 * Add a word string to the model's vocabulary (internal helper).
 *
 * Grows the word_str array as needed, copies the word, and enters it in
 * the word-ID hash table.  When classid >= 0 the returned ID carries the
 * class bits (see NGRAM_CLASSWID).
 *
 * @param classid class index for class words, or -1 for a plain word.
 * @return the (possibly class-tagged) word ID; an existing word's ID is
 *         returned unchanged with a warning.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{

    /* Check for hash collisions. */
    int32 wid;
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omit duplicate word '%s'\n", word);
        return wid;
    }

    /* Take the next available word ID; tag it with the class if any. */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) *
                                      model->n_1g_alloc);
    }

    /* Store the copy at the base (untagged) index. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* The hash entry maps the string to the tagged ID. */
    if (hash_table_enter_int32
        (model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             model->word_str[model->n_words], (void *) (size_t) (wid));
    }

    ++model->n_words;
    return wid;
}
|
|
| int32 |
| ngram_model_add_word(ngram_model_t * model, |
| const char *word, float32 weight) |
| { |
| int32 wid, prob = model->log_zero; |
|
|
| |
| if (!model->writable) { |
| E_WARN("Can't add word '%s' to read-only language model. " |
| "Disable mmap with '-mmap no' to make it writable\n", word); |
| return -1; |
| } |
|
|
| wid = ngram_add_word_internal(model, word, -1); |
| if (wid == NGRAM_INVALID_WID) |
| return wid; |
|
|
| |
| if (model->funcs && model->funcs->add_ug) |
| prob = |
| (*model->funcs->add_ug) (model, wid, |
| logmath_log(model->lmath, weight)); |
| if (prob == 0) |
| return -1; |
|
|
| return wid; |
| } |
|
|
/**
 * Create a word-class structure from a list of (word, weight) entries.
 *
 * @param tag_wid ID of the class tag word in the model.
 * @param start_wid base ID of the first in-class word; the class's
 *        initial members are assumed contiguous from there.
 * @param classwords glist of float32 weights, one per class word.
 *        NOTE: the weights in this list are normalized IN PLACE when
 *        they do not sum to ~1.0 (the caller's list is mutated).
 * @return a new class (free with ngram_class_free()).
 */
ngram_class_t *
ngram_class_new(ngram_model_t * model, int32 tag_wid, int32 start_wid,
                glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* Initial members are contiguous word IDs starting at start_wid. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    /* Renormalize if the total probability is off by more than ~10%. */
    if (tprob > 1.1 || tprob < 0.9) {
        E_INFO("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    /* Store per-word log probabilities. */
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}
|
|
/**
 * Add a word to a class's hash table of out-of-line members.
 *
 * The table uses open hashing with chaining through the 'next' index.
 * It is lazily created at NGRAM_HASH_SIZE entries and doubled when full.
 * Slots are marked empty by wid == -1 (memset with 0xff).
 *
 * @param wid (class-tagged) word ID to add.
 * @param lweight in-class log probability of the word.
 * @return the slot index the word was stored in.
 */
int32
ngram_class_add_word(ngram_class_t * lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 (empty slot markers). */
        lmclass->nword_hash =
            ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff,
               NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Stupidest possible hash function: mask by table size
     * (n_hash is always a power of two). */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision: walk to the end of this bucket's chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Does we has any more bukkit? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* Table full: double it; new slots start empty (-1). */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 *
                                              sizeof(*lmclass->
                                                     nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash, 0xff,
                   lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* First slot of the new half is guaranteed free. */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Linear scan for any free slot. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* Guaranteed to find one since inuse < n_hash. */
            assert(next != lmclass->n_hash);
        }
        /* Fill the free slot and link it onto the chain. */
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}
|
|
/**
 * Free a word class and its internal storage.
 * (Word strings are owned by the model, not the class.)
 */
void
ngram_class_free(ngram_class_t * lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}
|
|
/**
 * Add a word to an existing class, rescaling the other members.
 *
 * The new word gets probability weight/(n+1) where n is the current
 * member count, and every existing member's probability is scaled by
 * (1 - fprob) to keep the class distribution normalized.
 *
 * @param classname tag word of an existing class (see
 *        ngram_model_add_class()).
 * @param weight relative weight of the new word within the class.
 * @return the new (class-tagged) word ID, or NGRAM_INVALID_WID on error.
 */
int32
ngram_model_add_class_word(ngram_model_t * model,
                           const char *classname,
                           const char *word, float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* The word is in the vocabulary but is not a class tag. */
    if (classid == model->n_classes) {
        E_ERROR
            ("Word %s is not a class tag (call ngram_model_add_class() first)\n",
             classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid,
                                logmath_log(model->lmath, fprob));
}
|
|
/**
 * Create a new word class in the model.
 *
 * @param classname tag word for the class; added as a unigram with
 *        classweight if not already in the vocabulary.
 * @param words class member word strings; all are added to the model.
 * @param weights per-word probabilities (parallel to words).
 * @param n_words number of members.
 * @return the new class ID, or -1 on failure.
 */
int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words, const float32 * weights, int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname is already a word; if not, add it. */
    if ((tag_wid =
         ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    /* Hard class-count cap — presumably set by the number of bits
     * available for the class ID in a class-tagged word ID; TODO confirm. */
    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        /* Class members are added contiguously; start_wid records the
         * base ID of the first one. */
        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    /* glist_add prepends, so reverse to restore caller order. */
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    /* Append the class to the model's class array. */
    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes *
                                     sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
|
|
| int32 |
| ngram_class_prob(ngram_class_t * lmclass, int32 wid) |
| { |
| int32 base_wid = NGRAM_BASEWID(wid); |
|
|
| if (base_wid < lmclass->start_wid |
| || base_wid > lmclass->start_wid + lmclass->n_words) { |
| int32 hash; |
|
|
| |
| hash = wid & (lmclass->n_hash - 1); |
| while (hash != -1 && lmclass->nword_hash[hash].wid != wid) |
| hash = lmclass->nword_hash[hash].next; |
| if (hash == -1) |
| return 1; |
| return lmclass->nword_hash[hash].prob1; |
| } |
| else { |
| return lmclass->prob1[base_wid - lmclass->start_wid]; |
| } |
| } |
|
|
| int32 |
| read_classdef_file(hash_table_t * classes, const char *file_name) |
| { |
| FILE *fp; |
| int32 is_pipe; |
| int inclass; |
| int32 rv = -1; |
| gnode_t *gn; |
| glist_t classwords = NULL; |
| glist_t classprobs = NULL; |
| char *classname = NULL; |
|
|
| if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) { |
| E_ERROR("File %s not found\n", file_name); |
| return -1; |
| } |
|
|
| inclass = FALSE; |
| while (!feof(fp)) { |
| char line[512]; |
| char *wptr[2]; |
| int n_words; |
|
|
| if (fgets(line, sizeof(line), fp) == NULL) |
| break; |
|
|
| n_words = str2words(line, wptr, 2); |
| if (n_words <= 0) |
| continue; |
|
|
| if (inclass) { |
| |
| if (n_words == 2 && 0 == strcmp(wptr[0], "END")) { |
| classdef_t *classdef; |
| gnode_t *word, *weight; |
| int32 i; |
|
|
| if (classname == NULL || 0 != strcmp(wptr[1], classname)) |
| goto error_out; |
| inclass = FALSE; |
|
|
| |
| classdef = ckd_calloc(1, sizeof(*classdef)); |
| classwords = glist_reverse(classwords); |
| classprobs = glist_reverse(classprobs); |
| classdef->n_words = glist_count(classwords); |
| classdef->words = ckd_calloc(classdef->n_words, |
| sizeof(*classdef->words)); |
| classdef->weights = ckd_calloc(classdef->n_words, |
| sizeof(*classdef->weights)); |
| word = classwords; |
| weight = classprobs; |
| for (i = 0; i < classdef->n_words; ++i) { |
| classdef->words[i] = gnode_ptr(word); |
| classdef->weights[i] = gnode_float32(weight); |
| word = gnode_next(word); |
| weight = gnode_next(weight); |
| } |
|
|
| |
| if (hash_table_enter(classes, classname, classdef) != |
| classdef) { |
| classdef_free(classdef); |
| goto error_out; |
| } |
|
|
| |
| glist_free(classwords); |
| glist_free(classprobs); |
| classwords = NULL; |
| classprobs = NULL; |
| classname = NULL; |
| } |
| else { |
| float32 fprob; |
|
|
| if (n_words == 2) |
| fprob = atof_c(wptr[1]); |
| else |
| fprob = 1.0f; |
| |
| classwords = |
| glist_add_ptr(classwords, ckd_salloc(wptr[0])); |
| classprobs = glist_add_float32(classprobs, fprob); |
| } |
| } |
| else { |
| |
| if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) { |
| if (inclass) |
| goto error_out; |
| inclass = TRUE; |
| classname = ckd_salloc(wptr[1]); |
| } |
| |
| } |
| } |
| rv = 0; |
|
|
| error_out: |
| |
| fclose_comp(fp, is_pipe); |
| for (gn = classwords; gn; gn = gnode_next(gn)) |
| ckd_free(gnode_ptr(gn)); |
| glist_free(classwords); |
| glist_free(classprobs); |
| ckd_free(classname); |
|
|
| return rv; |
| } |
|
|
/**
 * Free a classdef_t along with its word strings and weight array.
 */
void
classdef_free(classdef_t * classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}
|
|
|
|
/**
 * Read a class-definition file and add all its classes to a model.
 *
 * Each class is added with a class weight of 1.0.  All temporary
 * classdef storage is freed before returning, whether or not every
 * class was added successfully.
 *
 * @return 0 on success, -1 on failure.
 */
int32
ngram_model_read_classdef(ngram_model_t * model, const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Add each class to the model. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

  error_out:
    /* Free every entry's key and classdef, even those already added
     * (ngram_model_add_class copies what it needs). */
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *) he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
|
|