| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifdef HAVE_CONFIG_H |
| | #include <config.h> |
| | #endif |
| |
|
| | #include <string.h> |
| | #include <assert.h> |
| |
|
| | #include <pocketsphinx/err.h> |
| | #include <pocketsphinx/logmath.h> |
| |
|
| | #include "lm/ngram_model.h" |
| | #include "util/ckd_alloc.h" |
| | #include "util/filename.h" |
| | #include "util/pio.h" |
| | #include "util/strfuncs.h" |
| | #include "util/case.h" |
| |
|
| | #include "pocketsphinx_internal.h" |
| | #include "ngram_model_internal.h" |
| | #include "ngram_model_trie.h" |
| |
|
| | ngram_file_type_t |
| | ngram_file_name_to_type(const char *file_name) |
| | { |
| | const char *ext; |
| |
|
| | ext = strrchr(file_name, '.'); |
| | if (ext == NULL) { |
| | return NGRAM_INVALID; |
| | } |
| | if (0 == strcmp_nocase(ext, ".gz")) { |
| | while (--ext >= file_name) { |
| | if (*ext == '.') |
| | break; |
| | } |
| | if (ext < file_name) { |
| | return NGRAM_INVALID; |
| | } |
| | } |
| | else if (0 == strcmp_nocase(ext, ".bz2")) { |
| | while (--ext >= file_name) { |
| | if (*ext == '.') |
| | break; |
| | } |
| | if (ext < file_name) { |
| | return NGRAM_INVALID; |
| | } |
| | } |
| | |
| | if (0 == strncmp_nocase(ext, ".ARPA", 5)) |
| | return NGRAM_ARPA; |
| | if (0 == strncmp_nocase(ext, ".DMP", 4) |
| | || 0 == strncmp_nocase(ext, ".BIN", 4)) |
| | return NGRAM_BIN; |
| | return NGRAM_INVALID; |
| | } |
| |
|
| | ngram_file_type_t |
| | ngram_str_to_type(const char *str_name) |
| | { |
| | if (0 == strcmp_nocase(str_name, "arpa")) |
| | return NGRAM_ARPA; |
| | if (0 == strcmp_nocase(str_name, "dmp") |
| | || 0 == strcmp_nocase(str_name, "bin")) |
| | return NGRAM_BIN; |
| | return NGRAM_INVALID; |
| | } |
| |
|
| | char const * |
| | ngram_type_to_str(int type) |
| | { |
| | switch (type) { |
| | case NGRAM_ARPA: |
| | return "arpa"; |
| | case NGRAM_BIN: |
| | return "dmp/bin"; |
| | default: |
| | return NULL; |
| | } |
| | } |
| |
|
| |
|
| | ngram_model_t * |
| | ngram_model_read(cmd_ln_t * config, |
| | const char *file_name, |
| | ngram_file_type_t file_type, logmath_t * lmath) |
| | { |
| | ngram_model_t *model = NULL; |
| | switch (file_type) { |
| | case NGRAM_AUTO:{ |
| | if ((model = |
| | ngram_model_trie_read_bin(config, file_name, |
| | lmath)) != NULL) |
| | break; |
| | if ((model = |
| | ngram_model_trie_read_arpa(config, file_name, |
| | lmath)) != NULL) |
| | break; |
| | if ((model = |
| | ngram_model_trie_read_dmp(config, file_name, |
| | lmath)) != NULL) |
| | break; |
| | return NULL; |
| | } |
| | case NGRAM_ARPA: |
| | model = ngram_model_trie_read_arpa(config, file_name, lmath); |
| | break; |
| | case NGRAM_BIN: |
| | if ((model = |
| | ngram_model_trie_read_bin(config, file_name, lmath)) != NULL) |
| | break; |
| | if ((model = |
| | ngram_model_trie_read_dmp(config, file_name, lmath)) != NULL) |
| | break; |
| | return NULL; |
| | default: |
| | E_ERROR("language model file type not supported\n"); |
| | return NULL; |
| | } |
| |
|
| | |
| | if (config) { |
| | float32 lw = 1.0; |
| | float32 wip = 1.0; |
| |
|
| | lw = ps_config_float(config, "lw"); |
| | wip = ps_config_float(config, "wip"); |
| |
|
| | ngram_model_apply_weights(model, lw, wip); |
| | } |
| |
|
| | return model; |
| | } |
| |
|
| | int |
| | ngram_model_write(ngram_model_t * model, const char *file_name, |
| | ngram_file_type_t file_type) |
| | { |
| | switch (file_type) { |
| | case NGRAM_AUTO:{ |
| | file_type = ngram_file_name_to_type(file_name); |
| | |
| | if (file_type == NGRAM_INVALID) |
| | file_type = NGRAM_ARPA; |
| | return ngram_model_write(model, file_name, file_type); |
| | } |
| | case NGRAM_ARPA: |
| | return ngram_model_trie_write_arpa(model, file_name); |
| | case NGRAM_BIN: |
| | return ngram_model_trie_write_bin(model, file_name); |
| | default: |
| | E_ERROR("language model file type not supported\n"); |
| | return -1; |
| | } |
| | E_ERROR("language model file type not supported\n"); |
| | return -1; |
| | } |
| |
|
/**
 * Initialize the common base fields of an n-gram model.
 *
 * May be called on an already-populated base to re-initialize it in
 * place: existing allocations (n_counts, word_str, wid) are reused or
 * resized instead of leaked.
 *
 * @param base Model structure to (re)initialize.
 * @param funcs Implementation vtable.
 * @param lmath Log-math object the model's scores are expressed in.
 * @param n Maximum n-gram order.
 * @param n_unigram Number of unigrams (vocabulary size).
 * @return 0 (always succeeds; allocation failures are handled by ckd_*).
 */
int32
ngram_model_init(ngram_model_t * base,
                 ngram_funcs_t * funcs,
                 logmath_t * lmath, int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate per-order n-gram counts only on first initialization. */
    if (base->n_counts == NULL)
        base->n_counts = (uint32 *) ckd_calloc(n, sizeof(*base->n_counts));
    /* Reset weighting state only when the log-math object changes. */
    if (base->lmath != lmath) {
        base->lw = 1.0;
        base->log_wip = 0;
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Word string table: resize in place, freeing owned strings first. */
    if (base->word_str) {
        /* Strings are owned by the model only when it is writable. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str =
            (char **) ckd_realloc(base->word_str,
                                  n_unigram * sizeof(char *));
    }
    else {
        base->word_str = (char **) ckd_calloc(n_unigram, sizeof(char *));
    }

    /* Word-to-ID hash: empty and reuse if present, otherwise create. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}
| |
|
| | ngram_model_t * |
| | ngram_model_retain(ngram_model_t * model) |
| | { |
| | ++model->refcount; |
| | return model; |
| | } |
| |
|
| | void |
| | ngram_model_flush(ngram_model_t * model) |
| | { |
| | if (model->funcs && model->funcs->flush) |
| | (*model->funcs->flush) (model); |
| | } |
| |
|
/**
 * Release one reference to the model, destroying it when the count
 * reaches zero.
 *
 * Word string ownership is split two ways: a writable model owns every
 * entry of word_str[]; a read-only (e.g. memory-mapped) model owns only
 * the strings added for class words, which are located through each
 * class's contiguous range and its hash table.
 *
 * @return 0 once destroyed, otherwise the remaining reference count.
 */
int
ngram_model_free(ngram_model_t * model)
{
    int i;

    /* Freeing NULL is a harmless no-op. */
    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    /* Let the implementation release its own resources first. */
    if (model->funcs && model->funcs->free)
        (*model->funcs->free) (model);
    if (model->writable) {
        /* Writable model: every word string is owned by us. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Read-only model: only class-word strings were allocated. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            /* Strings in the class's initial contiguous ID range. */
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            /* Strings for words added to the class afterwards. */
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
| |
|
/**
 * Case-fold all word strings in the model (NGRAM_UPPER or NGRAM_LOWER)
 * and rebuild the word-to-ID hash table accordingly.
 *
 * Words beginning with '<' or '[' (markers such as <s>, and class tags)
 * are left untouched.  A previously read-only model becomes writable,
 * since its word strings must be duplicated before mutation.
 *
 * @return 0 (always; duplicate post-fold words only produce a warning).
 */
int
ngram_model_casefold(ngram_model_t * model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    writable = model->writable;
    /* From here on the model owns (possibly fresh) word strings. */
    model->writable = TRUE;

    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            /* Read-only storage: duplicate before modifying in place. */
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Skip markers and class tags like <s>, </s>, [CLASS]. */
        if (outstr[0] == '<' || outstr[0] == '[') {
        }
        else {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Re-enter under the (possibly changed) string; collisions
         * after folding keep the first ID and are only warned about. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap in the rebuilt hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
| |
|
| | int |
| | ngram_model_apply_weights(ngram_model_t * model, float32 lw, float32 wip) |
| | { |
| | return (*model->funcs->apply_weights) (model, lw, wip); |
| | } |
| |
|
| | float32 |
| | ngram_model_get_weights(ngram_model_t * model, int32 * out_log_wip) |
| | { |
| | if (out_log_wip) |
| | *out_log_wip = model->log_wip; |
| | return model->lw; |
| | } |
| |
|
| |
|
/**
 * Weighted score of word @a wid given the history in
 * history[0..n_hist-1].
 *
 * Class word IDs — both the target and any history entries — are
 * replaced by their class tag word before scoring, and the target's
 * within-class log probability is added to the result.
 *
 * NOTE(review): class-ID history entries are rewritten in place, so the
 * caller's history array is modified.
 *
 * @param n_used Output: number of history words actually used.
 * @return The weighted score, or log_zero for an invalid/unknown word.
 */
int32
ngram_ng_score(ngram_model_t * model, int32 wid, int32 * history,
               int32 n_hist, int32 * n_used)
{
    int32 score, class_weight = 0;
    int i;

    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        /* 1 is the sentinel for "word not found in its class". */
        if (class_weight == 1)
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    /* Score against class tags, not individual class-member IDs. */
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score) (model, wid, history, n_hist, n_used);

    /* Fold the within-class probability into the tag's score. */
    return score + class_weight;
}
| |
|
| | int32 |
| | ngram_score(ngram_model_t * model, const char *word, ...) |
| | { |
| | va_list history; |
| | const char *hword; |
| | int32 *histid; |
| | int32 n_hist; |
| | int32 n_used; |
| | int32 prob; |
| |
|
| | va_start(history, word); |
| | n_hist = 0; |
| | while ((hword = va_arg(history, const char *)) != NULL) |
| | ++n_hist; |
| | va_end(history); |
| |
|
| | histid = ckd_calloc(n_hist, sizeof(*histid)); |
| | va_start(history, word); |
| | n_hist = 0; |
| | while ((hword = va_arg(history, const char *)) != NULL) { |
| | histid[n_hist] = ngram_wid(model, hword); |
| | ++n_hist; |
| | } |
| | va_end(history); |
| |
|
| | prob = ngram_ng_score(model, ngram_wid(model, word), |
| | histid, n_hist, &n_used); |
| | ckd_free(histid); |
| | return prob; |
| | } |
| |
|
| | int32 |
| | ngram_tg_score(ngram_model_t * model, int32 w3, int32 w2, int32 w1, |
| | int32 * n_used) |
| | { |
| | int32 hist[2]; |
| | hist[0] = w2; |
| | hist[1] = w1; |
| | return ngram_ng_score(model, w3, hist, 2, n_used); |
| | } |
| |
|
| | int32 |
| | ngram_bg_score(ngram_model_t * model, int32 w2, int32 w1, int32 * n_used) |
| | { |
| | return ngram_ng_score(model, w2, &w1, 1, n_used); |
| | } |
| |
|
/**
 * Raw (unweighted) log probability of word @a wid given the history in
 * history[0..n_hist-1], analogous to ngram_ng_score() but using the
 * implementation's raw_score hook.
 *
 * NOTE(review): when the target word is missing from its class this
 * returns the sentinel 1, whereas ngram_ng_score() returns log_zero in
 * the same situation — confirm the asymmetry is intentional.
 *
 * NOTE(review): class-ID history entries are rewritten in place, so the
 * caller's history array is modified.
 */
int32
ngram_ng_prob(ngram_model_t * model, int32 wid, int32 * history,
              int32 n_hist, int32 * n_used)
{
    int32 prob, class_weight = 0;
    int i;

    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        /* 1 is the "not found in class" sentinel from ngram_class_prob. */
        if (class_weight == 1)
            return class_weight;
        wid = lmclass->tag_wid;
    }
    /* Probabilities are looked up against class tags. */
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score) (model, wid, history,
                                       n_hist, n_used);
    /* Add the within-class log probability to the tag's probability. */
    return prob + class_weight;
}
| |
|
| | int32 |
| | ngram_probv(ngram_model_t * model, const char *word, ...) |
| | { |
| | va_list history; |
| | const char *hword; |
| | int32 *histid; |
| | int32 n_hist; |
| | int32 n_used; |
| | int32 prob; |
| |
|
| | va_start(history, word); |
| | n_hist = 0; |
| | while ((hword = va_arg(history, const char *)) != NULL) |
| | ++n_hist; |
| | va_end(history); |
| |
|
| | histid = ckd_calloc(n_hist, sizeof(*histid)); |
| | va_start(history, word); |
| | n_hist = 0; |
| | while ((hword = va_arg(history, const char *)) != NULL) { |
| | histid[n_hist] = ngram_wid(model, hword); |
| | ++n_hist; |
| | } |
| | va_end(history); |
| |
|
| | prob = ngram_ng_prob(model, ngram_wid(model, word), |
| | histid, n_hist, &n_used); |
| | ckd_free(histid); |
| | return prob; |
| | } |
| |
|
| | int32 |
| | ngram_prob(ngram_model_t * model, const char* const *words, int32 n) |
| | { |
| | int32 *ctx_id; |
| | int32 nused; |
| | int32 prob; |
| | int32 wid; |
| | uint32 i; |
| |
|
| | ctx_id = (int32 *) ckd_calloc(n - 1, sizeof(*ctx_id)); |
| | for (i = 1; i < (uint32) n; ++i) |
| | ctx_id[i - 1] = ngram_wid(model, words[i]); |
| |
|
| | wid = ngram_wid(model, *words); |
| | prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused); |
| | ckd_free(ctx_id); |
| |
|
| | return prob; |
| | } |
| |
|
| | int32 |
| | ngram_score_to_prob(ngram_model_t * base, int32 score) |
| | { |
| | int32 prob; |
| |
|
| | |
| | prob = score - base->log_wip; |
| | |
| | prob = (int32) (prob / base->lw); |
| |
|
| | return prob; |
| | } |
| |
|
| | int32 |
| | ngram_unknown_wid(ngram_model_t * model) |
| | { |
| | int32 val; |
| |
|
| | |
| | |
| | if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1) |
| | return NGRAM_INVALID_WID; |
| | else |
| | return val; |
| | } |
| |
|
| | int32 |
| | ngram_zero(ngram_model_t * model) |
| | { |
| | return model->log_zero; |
| | } |
| |
|
| | int32 |
| | ngram_model_get_size(ngram_model_t * model) |
| | { |
| | if (model != NULL) |
| | return model->n; |
| | return 0; |
| | } |
| |
|
| | uint32 const * |
| | ngram_model_get_counts(ngram_model_t * model) |
| | { |
| | if (model != NULL) |
| | return model->n_counts; |
| | return NULL; |
| | } |
| |
|
| | int32 |
| | ngram_wid(ngram_model_t * model, const char *word) |
| | { |
| | int32 val; |
| |
|
| | if (hash_table_lookup_int32(model->wid, word, &val) == -1) |
| | return ngram_unknown_wid(model); |
| | else |
| | return val; |
| | } |
| |
|
| | const char * |
| | ngram_word(ngram_model_t * model, int32 wid) |
| | { |
| | |
| | wid = NGRAM_BASEWID(wid); |
| | if (wid >= model->n_words) |
| | return NULL; |
| | return model->word_str[wid]; |
| | } |
| |
|
| | |
| | |
| | |
/**
 * Add a word string to the model's vocabulary, optionally tagged as a
 * member of class @a classid (pass -1 for a plain word).
 *
 * Grows the word string array in UG_ALLOC_STEP increments as needed.
 * The model takes ownership of a copy of @a word.
 *
 * @return The (possibly class-tagged) word ID; for a duplicate word,
 *         the existing ID is returned with a warning.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{

    /* Reuse the existing entry rather than adding a duplicate. */
    int32 wid;
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omit duplicate word '%s'\n", word);
        return wid;
    }

    /* Base ID is the next free slot; class membership is encoded in
     * the upper bits of the ID. */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Grow the word string table if it is full. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) *
                                      model->n_1g_alloc);
    }

    /* Store our own copy of the string; the hash keys point at it. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Insertion can only fail on a duplicate key, excluded above. */
    if (hash_table_enter_int32
        (model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             model->word_str[model->n_words], (void *) (size_t) (wid));
    }

    ++model->n_words;
    return wid;
}
| |
|
/**
 * Add a new unigram @a word with probability @a weight to a writable
 * model.
 *
 * @param weight Linear-domain unigram probability for the new word.
 * @return The new word ID, or -1 on failure (read-only model, or the
 *         implementation's add_ug hook rejected the word).
 */
int32
ngram_model_add_word(ngram_model_t * model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    /* Memory-mapped (read-only) models cannot be modified in place. */
    if (!model->writable) {
        E_WARN("Can't add word '%s' to read-only language model. "
               "Disable mmap with '-mmap no' to make it writable\n", word);
        return -1;
    }

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Register the unigram probability with the implementation. */
    if (model->funcs && model->funcs->add_ug)
        prob =
            (*model->funcs->add_ug) (model, wid,
                                     logmath_log(model->lmath, weight));
    /* A zero log-probability from add_ug signals failure. */
    if (prob == 0)
        return -1;

    return wid;
}
| |
|
/**
 * Create a word-class object for class tag @a tag_wid whose member
 * words occupy the contiguous base-ID range starting at @a start_wid.
 *
 * @param classwords List of float32 member probabilities, in member
 *        order.  NOTE(review): if the probabilities do not sum to
 *        roughly 1, the list's values are normalized IN PLACE,
 *        mutating the caller's glist.
 * @return Newly allocated class (free with ngram_class_free()).
 */
ngram_class_t *
ngram_class_new(ngram_model_t * model, int32 tag_wid, int32 start_wid,
                glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* Initial members occupy a contiguous range of base word IDs. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    /* Hash table is created lazily by ngram_class_add_word(). */
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    /* Allow ~10% slack before forcing normalization to sum 1. */
    if (tprob > 1.1 || tprob < 0.9) {
        E_INFO("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    /* Store the member probabilities in log domain. */
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}
| |
|
/**
 * Add a word (by its class-tagged ID) with log probability @a lweight
 * to a class's overflow hash table.
 *
 * The table uses open chaining: each bucket stores a wid/prob pair and
 * a 'next' index forming a collision chain.  It is created lazily at
 * NGRAM_HASH_SIZE entries and doubled when completely full.
 *
 * @return Index of the slot where the word was stored.
 */
int32
ngram_class_add_word(ngram_class_t * lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Lazily create the table; 0xff fill sets wid/next to -1. */
        lmclass->nword_hash =
            ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff,
               NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }

    /* n_hash is a power of two, so masking is the bucket function. */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Home bucket is free: store directly. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Walk to the end of the collision chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Find (or make) a free slot for the new entry. */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* Table full: double it; old chains stay valid, and the
             * first new slot is used for this entry. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 *
                                              sizeof(*lmclass->
                                                     nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash, 0xff,
                   lmclass->n_hash * sizeof(*lmclass->nword_hash));

            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Linear scan for any free slot. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* Guaranteed by the n_hash_inuse < n_hash check above. */
            assert(next != lmclass->n_hash);
        }
        /* Append the new entry to the chain. */
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}
| |
|
| | void |
| | ngram_class_free(ngram_class_t * lmclass) |
| | { |
| | ckd_free(lmclass->nword_hash); |
| | ckd_free(lmclass->prob1); |
| | ckd_free(lmclass); |
| | } |
| |
|
/**
 * Add @a word to an existing class @a classname with relative weight
 * @a weight.
 *
 * All existing member probabilities are rescaled by (1 - fprob) so the
 * class distribution stays normalized after the new member receives
 * probability fprob.
 *
 * @return The new word's (class-tagged) ID, or NGRAM_INVALID_WID on
 *         failure (unknown tag, tag is not a class, or add failed).
 */
int32
ngram_model_add_class_word(ngram_model_t * model,
                           const char *classname,
                           const char *word, float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Resolve the class tag word and find its class object. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* The tag exists as a word but has no class attached to it. */
    if (classid == model->n_classes) {
        E_ERROR
            ("Word %s is not a class tag (call ngram_model_add_class() first)\n",
             classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Register the word in the vocabulary, tagged with this class. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* New member's share, weighted relative to the member count. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Renormalize existing members (contiguous range and hash) by
     * (1 - fprob) in log domain so the class still sums to one. */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Store the new member in the class's overflow hash. */
    return ngram_class_add_word(lmclass, wid,
                                logmath_log(model->lmath, fprob));
}
| |
|
/**
 * Create a new word class named @a classname containing @a n_words
 * member words with the given weights.
 *
 * The class tag itself is added as a unigram (with @a classweight) if
 * it is not already in the model.  At most 128 classes are supported,
 * matching the class-ID bits in a word ID.
 *
 * @return The new class ID, or -1 on failure.
 */
int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words, const float32 * weights, int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Add the class tag word itself if the model does not know it. */
    if ((tag_wid =
         ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    /* Add every member word; they receive consecutive base IDs, so
     * the first one defines the class's start_wid. */
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    /* glist_add prepends, so reverse to restore member order. */
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes *
                                     sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
| |
|
| | int32 |
| | ngram_class_prob(ngram_class_t * lmclass, int32 wid) |
| | { |
| | int32 base_wid = NGRAM_BASEWID(wid); |
| |
|
| | if (base_wid < lmclass->start_wid |
| | || base_wid > lmclass->start_wid + lmclass->n_words) { |
| | int32 hash; |
| |
|
| | |
| | hash = wid & (lmclass->n_hash - 1); |
| | while (hash != -1 && lmclass->nword_hash[hash].wid != wid) |
| | hash = lmclass->nword_hash[hash].next; |
| | if (hash == -1) |
| | return 1; |
| | return lmclass->nword_hash[hash].prob1; |
| | } |
| | else { |
| | return lmclass->prob1[base_wid - lmclass->start_wid]; |
| | } |
| | } |
| |
|
/**
 * Parse a class-definition file into @a classes, a hash table mapping
 * class name -> classdef_t*.
 *
 * File format: "LMCLASS <name>" opens a class; each following line is
 * "<word> [prob]" (probability defaults to 1.0); "END <name>" closes
 * it.  NOTE(review): text outside LMCLASS...END blocks is silently
 * ignored — confirm that is intended.
 *
 * On success the hash table owns both the class name strings (as keys)
 * and the classdef_t values.
 *
 * @return 0 on success, -1 on I/O or format error.
 */
int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        /* Split into at most 2 whitespace-separated tokens. */
        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* "END <name>" terminates the class being collected. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                /* END must name the class that was opened. */
                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Convert the accumulated (reversed) lists into a
                 * classdef with parallel words/weights arrays. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* The hash table takes ownership of classname (key)
                 * and classdef (value); a duplicate name fails. */
                if (hash_table_enter(classes, classname, classdef) !=
                    classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Word strings now live in classdef; free only the
                 * list nodes and reset for the next class. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                /* "<word> [prob]"; probability defaults to 1.0. */
                if (n_words == 2)
                    fprob = atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Accumulate (prepended; reversed at END). */
                classwords =
                    glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* "LMCLASS <name>" opens a new class definition. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Anything else outside a class block is ignored. */
        }
    }
    rv = 0;

  error_out:
    /* Shared cleanup: on error, partial state for the class being
     * parsed is released here (classname is NULL after a success). */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
| |
|
| | void |
| | classdef_free(classdef_t * classdef) |
| | { |
| | int32 i; |
| | for (i = 0; i < classdef->n_words; ++i) |
| | ckd_free(classdef->words[i]); |
| | ckd_free(classdef->words); |
| | ckd_free(classdef->weights); |
| | ckd_free(classdef); |
| | } |
| |
|
| |
|
/**
 * Read a class-definition file and add every class it defines to
 * @a model (each class tag gets weight 1.0).
 *
 * All intermediate structures are freed regardless of outcome; on
 * error some classes may already have been added to the model.
 *
 * @return 0 on success, -1 on failure.
 */
int32
ngram_model_read_classdef(ngram_model_t * model, const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Install each parsed class definition into the model. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

  error_out:
    /* The hash owns the keys and classdefs; free them all here. */
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *) he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
| |
|