/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* ==================================================================== * Copyright (c) 2015 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "util/pio.h" #include "util/strfuncs.h" #include "util/ckd_alloc.h" #include "util/byteorder.h" #include "lm/ngram_model_trie.h" static const char trie_hdr[] = "Trie Language Model"; static const char dmp_hdr[] = "Darpa Trigram LM"; static ngram_funcs_t ngram_model_trie_funcs; /* * Read and return #unigrams, #bigrams, #trigrams as stated in input file. */ static int read_counts_arpa(lineiter_t ** li, uint32 * counts, int *order) { int32 ngram, prev_ngram; uint32 ngram_cnt; /* skip file until past the '\data\' marker */ while (*li) { if (strcmp((*li)->buf, "\\data\\") == 0) break; *li = lineiter_next(*li); } if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) { E_INFO("No \\data\\ mark in LM file\n"); return -1; } prev_ngram = 0; *order = 0; while ((*li = lineiter_next(*li))) { if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2) break; if (ngram != prev_ngram + 1) { E_ERROR ("Ngram counts in LM file is not in order. %d goes after %d\n", ngram, prev_ngram); return -1; } prev_ngram = ngram; counts[*order] = ngram_cnt; (*order)++; } if (*li == NULL) { E_ERROR("EOF while reading ngram counts\n"); return -1; } return 0; } static int read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base, unigram_t * unigrams) { uint32 i; int n; int n_parts; char *wptr[3]; while (*li && strcmp((*li)->buf, "\\1-grams:") != 0) { *li = lineiter_next(*li); } if (*li == NULL) { E_ERROR_SYSTEM("Failed to read \\1-grams: mark"); return -1; } n_parts = 2; for (i = 0; i < count; i++) { unigram_t *unigram; *li = lineiter_next(*li); if (*li == NULL) { E_ERROR ("Unexpected end of ARPA file. Failed to read unigram %d\n", i + 1); return -1; } if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) { E_ERROR("Format error at line %d, Failed to read unigrams\n", (*li)->lineno); return -1; } unigram = &unigrams[i]; unigram->prob = logmath_log10_to_log_float(base->lmath, atof_c(wptr[0])); if (unigram->prob > 0) { E_WARN("Unigram '%s' has positive probability\n", wptr[1]); unigram->prob = 0; } if (n == n_parts + 1) { unigram->bo = logmath_log10_to_log_float(base->lmath, atof_c(wptr[2])); } else { unigram->bo = 0.0f; } /* TODO: classify float with fpclassify and warn if bad value occurred */ base->word_str[i] = ckd_salloc(wptr[1]); } /* fill hash-table that maps unigram names to their word ids */ for (i = 0; i < count; i++) { if ((hash_table_enter (base->wid, base->word_str[i], (void *) (size_t) i)) != (void *) (size_t) i) { E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); } } return 0; } ngram_model_t * ngram_model_trie_read_arpa(ps_config_t * config, const char *path, logmath_t * lmath) { FILE *fp; lineiter_t *li; ngram_model_trie_t *model; ngram_model_t *base; ngram_raw_t **raw_ngrams; int32 is_pipe; uint32 counts[NGRAM_MAX_ORDER]; int order; int i; (void)config; E_INFO("Trying to read LM in arpa format\n"); if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) { E_ERROR("File %s not found\n", path); return NULL; } model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); li = lineiter_start_clean(fp); /* Read n-gram counts from file */ if (read_counts_arpa(&li, counts, &order) == -1) { ckd_free(model); lineiter_free(li); fclose_comp(fp, is_pipe); return NULL; } E_INFO("LM of order %d\n", order); for (i = 0; i < order; i++) { E_INFO("#%d-grams: %d\n", i + 1, counts[i]); } base = &model->base; ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); base->writable = TRUE; model->trie = lm_trie_create(counts[0], order); if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) { ngram_model_free(base); lineiter_free(li); fclose_comp(fp, is_pipe); return NULL; } if (order > 1) { raw_ngrams = ngrams_raw_read_arpa(&li, base->lmath, counts, order, base->wid); if (raw_ngrams == NULL) { ngram_model_free(base); lineiter_free(li); fclose_comp(fp, is_pipe); return NULL; } lm_trie_build(model->trie, raw_ngrams, counts, base->n_counts, order); ngrams_raw_free(raw_ngrams, counts, order); } lineiter_free(li); fclose_comp(fp, is_pipe); return base; } int ngram_model_trie_write_arpa(ngram_model_t * base, const char *path) { int i; uint32 j; ngram_model_trie_t *model = (ngram_model_trie_t *) base; FILE *fp = fopen(path, "w"); if (!fp) { E_ERROR("Unable to open %s to write arpa LM from trie\n", path); return -1; } fprintf(fp, "This is an ARPA-format language model file, generated by CMU Sphinx\n"); /* Write N-gram counts. */ fprintf(fp, "\\data\\\n"); for (i = 0; i < base->n; ++i) { fprintf(fp, "ngram %d=%d\n", i + 1, base->n_counts[i]); } /* Write 1-grams */ fprintf(fp, "\n\\1-grams:\n"); for (j = 0; j < base->n_counts[0]; j++) { unigram_t *unigram = &model->trie->unigrams[j]; fprintf(fp, "%.4f\t%s", logmath_log_float_to_log10(base->lmath, unigram->prob), base->word_str[j]); if (base->n > 1) { fprintf(fp, "\t%.4f", logmath_log_float_to_log10(base->lmath, unigram->bo)); } fprintf(fp, "\n"); } /* Write ngrams */ if (base->n > 1) { for (i = 2; i <= base->n; ++i) { ngram_raw_t *raw_ngrams = (ngram_raw_t *) ckd_calloc((size_t) base->n_counts[i - 1], sizeof(*raw_ngrams)); uint32 raw_ngram_idx; uint32 j; uint32 hist[NGRAM_MAX_ORDER]; node_range_t range; raw_ngram_idx = 0; range.begin = range.end = 0; /* we need to iterate over a trie here. recursion should do the job */ lm_trie_fill_raw_ngram(model->trie, raw_ngrams, &raw_ngram_idx, base->n_counts, range, hist, 0, i, base->n); assert(raw_ngram_idx == base->n_counts[i - 1]); qsort(raw_ngrams, (size_t) base->n_counts[i - 1], sizeof(ngram_raw_t), &ngram_ord_comparator); fprintf(fp, "\n\\%d-grams:\n", i); for (j = 0; j < base->n_counts[i - 1]; j++) { int k; fprintf(fp, "%.4f", logmath_log_float_to_log10(base->lmath, raw_ngrams[j].prob)); for (k = 0; k < i; k++) { fprintf(fp, "\t%s", base->word_str[raw_ngrams[j].words[k]]); } ckd_free(raw_ngrams[j].words); if (i < base->n) { fprintf(fp, "\t%.4f", logmath_log_float_to_log10(base->lmath, raw_ngrams[j].backoff)); } fprintf(fp, "\n"); } ckd_free(raw_ngrams); } } fprintf(fp, "\n\\end\\\n"); return fclose(fp); } static void read_word_str(ngram_model_t * base, FILE * fp, int do_swap) { int32 k; uint32 i, j; char *tmp_word_str; /* read ascii word strings */ base->writable = TRUE; fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); E_INFO("#word_str: %d\n", k); tmp_word_str = (char *) ckd_calloc((size_t) k, 1); fread(tmp_word_str, 1, (size_t) k, fp); /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */ for (i = 0, j = 0; i < (uint32) k; i++) if (tmp_word_str[i] == '\0') j++; if (j != base->n_counts[0]) { E_ERROR ("Error reading word strings (%d doesn't match n_unigrams %d)\n", j, base->n_counts[0]); } /* Break up string just read into words */ j = 0; for (i = 0; i < base->n_counts[0]; i++) { base->word_str[i] = ckd_salloc(tmp_word_str + j); if (hash_table_enter(base->wid, base->word_str[i], (void *) (size_t) i) != (void *) (size_t) i) { E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); } j += strlen(base->word_str[i]) + 1; } free(tmp_word_str); } ngram_model_t * ngram_model_trie_read_bin(ps_config_t * config, const char *path, logmath_t * lmath) { int32 is_pipe; FILE *fp; size_t hdr_size; char *hdr; int cmp_res; uint8 i, order; uint32 counts[NGRAM_MAX_ORDER]; ngram_model_trie_t *model; ngram_model_t *base; (void)config; E_INFO("Trying to read LM in trie binary format\n"); if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) { E_ERROR("File %s not found\n", path); return NULL; } hdr_size = strlen(trie_hdr); hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr)); fread(hdr, sizeof(*hdr), hdr_size, fp); cmp_res = strcmp(hdr, trie_hdr); ckd_free(hdr); if (cmp_res) { E_INFO("Header doesn't match\n"); fclose_comp(fp, is_pipe); return NULL; } model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); base = &model->base; fread(&order, sizeof(order), 1, fp); for (i = 0; i < order; i++) { fread(&counts[i], sizeof(counts[i]), 1, fp); if (SWAP_LM_TRIE) SWAP_INT32(&counts[i]); E_INFO("#%d-grams: %d\n", i + 1, counts[i]); } ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); for (i = 0; i < order; i++) { base->n_counts[i] = counts[i]; } model->trie = lm_trie_read_bin(counts, order, fp); read_word_str(base, fp, SWAP_LM_TRIE); fclose_comp(fp, is_pipe); return base; } static void write_word_str(FILE * fp, ngram_model_t * model, int do_swap) { int32 k; uint32 i; k = 0; for (i = 0; i < model->n_counts[0]; i++) k += strlen(model->word_str[i]) + 1; E_INFO("#word_str: %d\n", k); if (do_swap) SWAP_INT32(&k); fwrite(&k, sizeof(k), 1, fp); for (i = 0; i < model->n_counts[0]; i++) fwrite(model->word_str[i], 1, strlen(model->word_str[i]) + 1, fp); } int ngram_model_trie_write_bin(ngram_model_t * base, const char *path) { int i; int32 is_pipe; ngram_model_trie_t *model = (ngram_model_trie_t *) base; FILE *fp = fopen_comp(path, "wb", &is_pipe); if (!fp) { E_ERROR("Unable to open %s to write binary trie LM\n", path); return -1; } fwrite(trie_hdr, sizeof(*trie_hdr), strlen(trie_hdr), fp); fwrite(&model->base.n, sizeof(model->base.n), 1, fp); for (i = 0; i < model->base.n; i++) { uint32 count = model->base.n_counts[i]; if (SWAP_LM_TRIE) SWAP_INT32(&count); fwrite(&count, sizeof(count), 1, fp); } lm_trie_write_bin(model->trie, base->n_counts[0], fp); write_word_str(fp, base, SWAP_LM_TRIE); fclose_comp(fp, is_pipe); return 0; } ngram_model_t * ngram_model_trie_read_dmp(ps_config_t * config, const char *file_name, logmath_t * lmath) { uint8 do_swap; int32 is_pipe; int32 k; uint32 j; int32 vn, ts; int32 count; uint32 counts[3]; uint32 *unigram_next; int order; char str[1024]; FILE *fp; ngram_model_trie_t *model; ngram_model_t *base; ngram_raw_t **raw_ngrams; (void)config; E_INFO("Trying to read LM in dmp format\n"); if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { E_ERROR("Dump file %s not found\n", file_name); return NULL; } do_swap = FALSE; fread(&k, sizeof(k), 1, fp); if (k != strlen(dmp_hdr) + 1) { SWAP_INT32(&k); if (k != strlen(dmp_hdr) + 1) { E_ERROR ("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); return NULL; } do_swap = 1; } if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read header\n"); return NULL; } if (strncmp(str, dmp_hdr, k) != 0) { E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr); return NULL; } if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read LM filename in header\n"); return NULL; } /* read version#, if present (must be <= 0) */ if (fread(&vn, sizeof(vn), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&vn); if (vn <= 0) { /* read and don't compare timestamps (we don't care) */ if (fread(&ts, sizeof(ts), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&ts); /* read and skip format description */ for (;;) { if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (k == 0) break; if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Failed to read word\n"); return NULL; } } /* read model->ucount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[0] = count; } else { counts[0] = vn; } /* read model->bcount, tcount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[1] = count; if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[2] = count; E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]); model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); base = &model->base; if (counts[2] > 0) order = 3; else if (counts[1] > 0) order = 2; else order = 1; ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); model->trie = lm_trie_create(counts[0], order); unigram_next = (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next)); for (j = 0; j <= counts[0]; j++) { int32 bigrams; int32 mapid; dmp_weight_t weightp; dmp_weight_t weightb; /* Skip over the mapping ID, we don't care about it. */ /* Read the weights from actual unigram structure. */ fread(&mapid, sizeof(int32), 1, fp); fread(&weightp, sizeof(weightp), 1, fp); fread(&weightb, sizeof(weightb), 1, fp); fread(&bigrams, sizeof(int32), 1, fp); if (do_swap) { SWAP_INT32(&weightp.l); SWAP_INT32(&weightb.l); SWAP_INT32(&bigrams); } model->trie->unigrams[j].prob = logmath_log10_to_log_float(lmath, weightp.f); model->trie->unigrams[j].bo = logmath_log10_to_log_float(lmath, weightb.f); model->trie->unigrams[j].next = bigrams; unigram_next[j] = bigrams; } if (order > 1) { raw_ngrams = ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next, do_swap); if (raw_ngrams == NULL) { ngram_model_free(base); ckd_free(unigram_next); fclose_comp(fp, is_pipe); return NULL; } lm_trie_build(model->trie, raw_ngrams, counts, base->n_counts, order); ngrams_raw_free(raw_ngrams, counts, order); } /* Sentinel unigram and bigrams read before */ ckd_free(unigram_next); /* read ascii word strings */ read_word_str(base, fp, do_swap); fclose_comp(fp, is_pipe); return base; } static void ngram_model_trie_free(ngram_model_t * base) { ngram_model_trie_t *model = (ngram_model_trie_t *) base; lm_trie_free(model->trie); } static int trie_apply_weights(ngram_model_t * base, float32 lw, float32 wip) { /* just update weights that are going to be used on score calculation */ base->lw = lw; base->log_wip = logmath_log(base->lmath, wip); return 0; } static int32 weight_score(ngram_model_t * base, int32 score) { return (int32) (score * base->lw + base->log_wip); } static int32 ngram_model_trie_raw_score(ngram_model_t * base, int32 wid, int32 * hist, int32 n_hist, int32 * n_used) { int32 i; ngram_model_trie_t *model = (ngram_model_trie_t *) base; if (n_hist > model->base.n - 1) n_hist = model->base.n - 1; for (i = 0; i < n_hist; i++) { if (hist[i] < 0) { n_hist = i; break; } } return (int32) lm_trie_score(model->trie, model->base.n, wid, hist, n_hist, n_used); } static int32 ngram_model_trie_score(ngram_model_t * base, int32 wid, int32 * hist, int32 n_hist, int32 * n_used) { return weight_score(base, ngram_model_trie_raw_score(base, wid, hist, n_hist, n_used)); } static int32 lm_trie_add_ug(ngram_model_t * base, int32 wid, int32 lweight) { ngram_model_trie_t *model = (ngram_model_trie_t *) base; /* This would be very bad if this happened! */ assert(!NGRAM_IS_CLASSWID(wid)); /* Reallocate unigram array. */ model->trie->unigrams = (unigram_t *) ckd_realloc(model->trie->unigrams, sizeof(*model->trie->unigrams) * (base->n_1g_alloc + 1)); memset(model->trie->unigrams + (base->n_counts[0] + 1), 0, (size_t) (base->n_1g_alloc - base->n_counts[0]) * sizeof(*model->trie->unigrams)); ++base->n_counts[0]; lweight += logmath_log(base->lmath, 1.0 / base->n_counts[0]); model->trie->unigrams[wid + 1].next = model->trie->unigrams[wid].next; model->trie->unigrams[wid].prob = (float) lweight; /* This unigram by definition doesn't participate in any bigrams, * so its backoff weight is undefined and next pointer same as in finish unigram*/ model->trie->unigrams[wid].bo = 0; /* Finally, increase the unigram count */ /* FIXME: Note that this can actually be quite bogus due to the * presence of class words. If wid falls outside the unigram * count, increase it to compensate, at the cost of no longer * really knowing how many unigrams we have :( */ if ((uint32) wid >= base->n_counts[0]) base->n_counts[0] = wid + 1; return (int32) weight_score(base, lweight); } static void lm_trie_flush(ngram_model_t * base) { ngram_model_trie_t *model = (ngram_model_trie_t *) base; lm_trie_t *trie = model->trie; memset(trie->hist_cache, -1, sizeof(trie->hist_cache)); memset(trie->backoff_cache, 0, sizeof(trie->backoff_cache)); return; } static ngram_funcs_t ngram_model_trie_funcs = { ngram_model_trie_free, /* free */ trie_apply_weights, /* apply_weights */ ngram_model_trie_score, /* score */ ngram_model_trie_raw_score, /* raw_score */ lm_trie_add_ug, /* add_ug */ lm_trie_flush /* flush */ };