/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* ==================================================================== * Copyright (c) 2008 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /** * \file sphinx_lm_eval.c * Language model evaluation tool. */ #include #include "lm/ngram_model.h" #include "util/ckd_alloc.h" #include "util/cmd_ln.h" #include "util/ckd_alloc.h" #include "util/pio.h" #include "util/strfuncs.h" #include "pocketsphinx_internal.h" #include #include #include static const ps_arg_t defn[] = { { "help", ARG_BOOLEAN, "no", "Shows the usage of the tool"}, { "logbase", ARG_FLOATING, "1.0001", "Base in which all log-likelihoods calculated" }, { "lm", ARG_STRING, NULL, "Language model file"}, { "probdef", ARG_STRING, NULL, "Probability definition file for classes in LM"}, { "lmctlfn", ARG_STRING, NULL, "Control file listing a set of language models"}, { "lmname", ARG_STRING, NULL, "Name of language model in -lmctlfn to use for all utterances" }, { "lsn", ARG_STRING, NULL, "Transcription file to evaluate"}, { "text", ARG_STRING, NULL, "Text string to evaluate"}, { "mmap", ARG_BOOLEAN, "no", "Use memory-mapped I/O for reading binary LM files"}, { "lw", ARG_FLOATING, "1.0", "Language model weight" }, { "wip", ARG_FLOATING, "1.0", "Word insertion probability" }, { "verbose", ARG_BOOLEAN, "no", "Print details of perplexity calculation" }, /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */ { NULL, 0, NULL, NULL } }; static int verbose; static int calc_entropy(ngram_model_t *lm, char **words, int32 n, int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score) { int32 *wids; int32 startwid; int32 i, ch, nccs, noovs, unk; if (n == 0) return 0; unk = ngram_unknown_wid(lm); /* Reverse this array into an array of word IDs. */ wids = ckd_calloc(n, sizeof(*wids)); for (i = 0; i < n; ++i) wids[n-i-1] = ngram_wid(lm, words[i]); /* Skip as it's a context cue (HACK, this should be configurable). */ startwid = ngram_wid(lm, ""); /* Now evaluate the list of words in reverse using the * remainder of the array as the history. */ ch = noovs = nccs = 0; for (i = 0; i < n; ++i) { int32 n_used; int32 prob; /* Skip as it's a context cue (HACK, this should be configurable). */ if (wids[i] == startwid) { ++nccs; continue; } /* Skip and count OOVs. */ if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) { ++noovs; continue; } /* Sum up information for each N-gram */ prob = ngram_ng_score(lm, wids[i], wids + i + 1, n - i - 1, &n_used); if (verbose) { int m; printf("log P(%s|", ngram_word(lm, wids[i])); m = i + ngram_model_get_size(lm) - 1; if (m >= n) m = n - 1; while (m > i) { printf("%s ", ngram_word(lm, wids[m--])); } printf(") = %d\n", prob); } ch -= prob; } if (out_n_ccs) *out_n_ccs = nccs; if (out_n_oovs) *out_n_oovs = noovs; /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */ n -= (nccs + noovs); if (n <= 0) return 0; if (out_lm_score) *out_lm_score = -ch; return ch / n; } static void evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn) { FILE *fh; lineiter_t *litor; int32 nccs, noovs, nwords, lscr; float64 ch, log_to_log2;; if ((fh = fopen(lsnfn, "r")) == NULL) E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn); /* We have to keep ch in floating-point to avoid overflows, so * we might as well use log2. */ log_to_log2 = log(logmath_get_base(lmath)) / log(2); lscr = nccs = noovs = nwords = 0; ch = 0.0; for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) { char **words; int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr; n = str2words(litor->buf, NULL, 0); if (n < 0) E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n); if (n == 0) /* Do nothing! */ continue; words = ckd_calloc(n, sizeof(*words)); str2words(litor->buf, words, n); /* Remove any utterance ID (FIXME: has to be a single "word") */ if (words[n-1][0] == '(' && words[n-1][strlen(words[n-1])-1] == ')') n = n - 1; tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs, &tmp_lscr); ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2; nccs += tmp_nccs; noovs += tmp_noovs; lscr += tmp_lscr; nwords += n; ckd_free(words); } ch /= (nwords - nccs - noovs); printf("cross-entropy: %f bits\n", ch); /* Calculate perplexity pplx = exp CH */ printf("perplexity: %f\n", pow(2.0, ch)); printf("lm score: %d\n", lscr); /* Report OOVs and CCs */ printf("%d words evaluated\n", nwords); printf("%d OOVs (%.2f%%), %d context cues removed\n", noovs, (double)noovs / nwords * 100, nccs); } static void evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text) { char *textfoo; char **words; int32 n, ch, noovs, nccs, lscr; /* Split it into an array of strings. */ textfoo = ckd_salloc(text); n = str2words(textfoo, NULL, 0); if (n < 0) E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n); if (n == 0) /* Do nothing! */ return; words = ckd_calloc(n, sizeof(*words)); str2words(textfoo, words, n); ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr); printf("input: %s\n", text); printf("cross-entropy: %f bits\n", ch * log(logmath_get_base(lmath)) / log(2)); /* Calculate perplexity pplx = exp CH */ printf("perplexity: %f\n", logmath_exp(lmath, ch)); printf("lm score: %d\n", lscr); /* Report OOVs and CCs */ printf("%d words evaluated\n", n); printf("%d OOVs, %d context cues removed\n", noovs, nccs); ckd_free(textfoo); ckd_free(words); } int main(int argc, char *argv[]) { cmd_ln_t *config; ngram_model_t *lm = NULL; logmath_t *lmath; const char *lmfn, *probdefn, *lsnfn, *text; if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) { /* This probably just means that we got no arguments. */ err_set_loglevel(ERR_INFO); cmd_ln_log_help_r(NULL, defn); return 1; } verbose = ps_config_bool(config, "verbose"); /* Create log math object. */ if ((lmath = logmath_init (ps_config_float(config, "logbase"), 0, 0)) == NULL) { E_FATAL("Failed to initialize log math\n"); } /* Load the language model. */ lmfn = ps_config_str(config, "lm"); if (lmfn == NULL || (lm = ngram_model_read(config, lmfn, NGRAM_AUTO, lmath)) == NULL) { E_FATAL("Failed to load language model from %s\n", ps_config_str(config, "lm")); } if ((probdefn = ps_config_str(config, "probdef")) != NULL) ngram_model_read_classdef(lm, probdefn); ngram_model_apply_weights(lm, ps_config_float(config, "lw"), ps_config_float(config, "wip")); /* Now evaluate some text. */ lsnfn = ps_config_str(config, "lsn"); text = ps_config_str(config, "text"); if (lsnfn) { evaluate_file(lm, lmath, lsnfn); } else if (text) { evaluate_string(lm, lmath, text); } return 0; }