pocketsphinx-20.04 / programs /pocketsphinx_lm_eval.c

pocketsphinx-20.04

37a92a9 about 3 years ago

9.26 kB

	/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
	/* ====================================================================
	* Copyright (c) 2008 Carnegie Mellon University. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* This work was supported in part by funding from the Defense Advanced
	* Research Projects Agency and the National Science Foundation of the
	* United States of America, and the CMU Sphinx Speech Consortium.
	*
	* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
	* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
	* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	* ====================================================================
	*
	*/
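	/**
	* \file pocketsphinx_lm_eval.c
	* Language model evaluation tool: reports the cross-entropy and
	* perplexity of a transcription file or a text string under an
	* N-gram language model.
	*/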
	#include <pocketsphinx.h>

	#include "lm/ngram_model.h"
	#include "util/ckd_alloc.h"
	#include "util/cmd_ln.h"
	#include "util/ckd_alloc.h"
	#include "util/pio.h"
	#include "util/strfuncs.h"
	#include "pocketsphinx_internal.h"

	#include <stdio.h>
	#include <string.h>
	#include <math.h>

	/*
	 * Command-line options accepted by this tool.  Each entry appears to be
	 * { option name, argument type, default value (NULL for none), help text };
	 * the table is terminated by an all-NULL entry.
	 */
	static const ps_arg_t defn[] = {
	    /* Usage and numeric setup. */
	    { "help",    ARG_BOOLEAN,  "no",     "Shows the usage of the tool" },
	    { "logbase", ARG_FLOATING, "1.0001", "Base in which all log-likelihoods calculated" },

	    /* Language model selection. */
	    { "lm",      ARG_STRING,   NULL,     "Language model file" },
	    { "probdef", ARG_STRING,   NULL,     "Probability definition file for classes in LM" },
	    { "lmctlfn", ARG_STRING,   NULL,     "Control file listing a set of language models" },
	    { "lmname",  ARG_STRING,   NULL,     "Name of language model in -lmctlfn to use for all utterances" },

	    /* Input to evaluate: a transcription file or a literal string. */
	    { "lsn",     ARG_STRING,   NULL,     "Transcription file to evaluate" },
	    { "text",    ARG_STRING,   NULL,     "Text string to evaluate" },

	    /* Model loading and weighting. */
	    { "mmap",    ARG_BOOLEAN,  "no",     "Use memory-mapped I/O for reading binary LM files" },
	    { "lw",      ARG_FLOATING, "1.0",    "Language model weight" },
	    { "wip",     ARG_FLOATING, "1.0",    "Word insertion probability" },

	    /* Diagnostics. */
	    { "verbose", ARG_BOOLEAN,  "no",     "Print details of perplexity calculation" },

	    /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
	    { NULL, 0, NULL, NULL }
	};

	/* Non-zero when per-N-gram scores should be printed; set from the
	 * "verbose" option in main() and read by calc_entropy(). */
	static int verbose = 0;

	/**
	 * Score one sentence against an N-gram language model.
	 *
	 * Each word is looked up in the model and scored with the words that
	 * precede it as history.  Words that map to the "<s>" word ID are counted
	 * as context cues and skipped; words that are unknown to the model are
	 * counted as OOVs and skipped.
	 *
	 * @param lm            Language model used for lookup and scoring.
	 * @param words         Array of @p n word strings, in sentence order.
	 * @param n             Number of entries in @p words.
	 * @param out_n_ccs     If non-NULL, receives the number of context cues skipped.
	 * @param out_n_oovs    If non-NULL, receives the number of OOV words skipped.
	 * @param out_lm_score  If non-NULL, receives the summed log score of the
	 *                      words that were evaluated (0 if none were).
	 * @return The negated summed score divided (integer division) by the number
	 *         of evaluated words, or 0 if no word was evaluated.
	 */
	static int
	calc_entropy(ngram_model_t *lm, char **words, int32 n,
	             int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
	{
	    int32 *wids;
	    int32 startwid;
	    int32 i, ch, nccs, noovs, unk;

	    /* Give every output a defined value first, so that callers which
	     * accumulate them never read garbage after an early return. */
	    if (out_n_ccs) *out_n_ccs = 0;
	    if (out_n_oovs) *out_n_oovs = 0;
	    if (out_lm_score) *out_lm_score = 0;

	    if (n <= 0)
	        return 0;

	    unk = ngram_unknown_wid(lm);

	    /* Reverse this array into an array of word IDs. */
	    wids = ckd_calloc(n, sizeof(*wids));
	    for (i = 0; i < n; ++i)
	        wids[n-i-1] = ngram_wid(lm, words[i]);
	    /* Skip <s> as it's a context cue (HACK, this should be configurable). */
	    startwid = ngram_wid(lm, "<s>");

	    /* Now evaluate the list of words in reverse using the
	     * remainder of the array as the history. */
	    ch = noovs = nccs = 0;
	    for (i = 0; i < n; ++i) {
	        int32 n_used;
	        int32 prob;

	        /* Skip <s> as it's a context cue (HACK, this should be configurable). */
	        if (wids[i] == startwid) {
	            ++nccs;
	            continue;
	        }
	        /* Skip and count OOVs. */
	        if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
	            ++noovs;
	            continue;
	        }
	        /* Sum up information for each N-gram */
	        prob = ngram_ng_score(lm,
	                              wids[i], wids + i + 1,
	                              n - i - 1, &n_used);
	        if (verbose) {
	            int m;
	            printf("log P(%s|", ngram_word(lm, wids[i]));
	            /* Print at most (model order - 1) history words, oldest
	             * first; wids is reversed, so walk from index m down to i+1. */
	            m = i + ngram_model_get_size(lm) - 1;
	            if (m >= n)
	                m = n - 1;
	            while (m > i) {
	                printf("%s ", ngram_word(lm, wids[m--]));
	            }
	            printf(") = %d\n", prob);
	        }
	        ch -= prob;
	    }

	    /* The word ID array is only needed for scoring; release it. */
	    ckd_free(wids);

	    if (out_n_ccs) *out_n_ccs = nccs;
	    if (out_n_oovs) *out_n_oovs = noovs;

	    /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
	    n -= (nccs + noovs);
	    if (n <= 0)
	        return 0;
	    if (out_lm_score)
	        *out_lm_score = -ch;
	    return ch / n;
	}

	/**
	 * Evaluate every line of a transcription file and print a summary.
	 *
	 * Each non-empty line is split into words; a trailing word of the form
	 * "(...)" is treated as an utterance ID and dropped.  The per-line results
	 * from calc_entropy() are accumulated, then the cross-entropy (in bits),
	 * perplexity, total LM score, word count, OOV count and context-cue count
	 * are written to stdout.
	 *
	 * @param lm     Language model to score against.
	 * @param lmath  Log-math object; its base is used to convert scores to log2.
	 * @param lsnfn  Path of the transcription file.  Failure to open it is fatal.
	 */
	static void
	evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
	{
	    FILE *fh;
	    lineiter_t *litor;
	    int32 nccs, noovs, nwords, lscr, nscored;
	    float64 ch, log_to_log2;

	    if ((fh = fopen(lsnfn, "r")) == NULL)
	        E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

	    /* We have to keep ch in floating-point to avoid overflows, so
	     * we might as well use log2. */
	    log_to_log2 = log(logmath_get_base(lmath)) / log(2);
	    lscr = nccs = noovs = nwords = 0;
	    ch = 0.0;
	    for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
	        char **words;
	        int32 n, tmp_ch;
	        /* Pre-zeroed: calc_entropy() may return early (for example when
	         * the line holds only an utterance ID) without writing them. */
	        int32 tmp_noovs = 0, tmp_nccs = 0, tmp_lscr = 0;

	        n = str2words(litor->buf, NULL, 0);
	        if (n < 0)
	            E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
	        if (n == 0) /* Do nothing! */
	            continue;
	        words = ckd_calloc(n, sizeof(*words));
	        str2words(litor->buf, words, n);

	        /* Remove any utterance ID (FIXME: has to be a single "word") */
	        if (words[n-1][0] == '('
	            && words[n-1][strlen(words[n-1])-1] == ')')
	            n = n - 1;

	        tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
	                              &tmp_noovs, &tmp_lscr);

	        /* tmp_ch is a per-word average, so weight it by the number of
	         * words actually scored on this line.
	         * NOTE(review): tmp_ch comes from an integer division, so the
	         * truncated remainder is lost here. */
	        ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
	        nccs += tmp_nccs;
	        noovs += tmp_noovs;
	        lscr += tmp_lscr;
	        nwords += n;

	        ckd_free(words);
	    }
	    fclose(fh);

	    /* Guard the average: with nothing scored, report 0 bits rather than
	     * dividing by zero (this matches evaluate_string()). */
	    nscored = nwords - nccs - noovs;
	    if (nscored > 0)
	        ch /= nscored;
	    else
	        ch = 0.0;
	    printf("cross-entropy: %f bits\n", ch);

	    /* Calculate perplexity pplx = exp CH */
	    printf("perplexity: %f\n", pow(2.0, ch));
	    printf("lm score: %d\n", lscr);

	    /* Report OOVs and CCs */
	    printf("%d words evaluated\n", nwords);
	    printf("%d OOVs (%.2f%%), %d context cues removed\n",
	           noovs,
	           nwords > 0 ? (double)noovs / nwords * 100 : 0.0,
	           nccs);
	}

	/**
	 * Evaluate a single text string and print its statistics.
	 *
	 * The string is copied, split into words and scored with calc_entropy().
	 * The input, cross-entropy (in bits), perplexity, LM score, word count,
	 * OOV count and context-cue count are written to stdout.  Nothing is
	 * printed if the string contains no words.
	 *
	 * @param lm     Language model to score against.
	 * @param lmath  Log-math object used to convert scores out of the log domain.
	 * @param text   Text to evaluate; it is not modified (a copy is split).
	 */
	static void
	evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
	{
	    char *textfoo;
	    char **words;
	    int32 n, ch;
	    /* Pre-zeroed: calc_entropy() may return early (for example when every
	     * word is a context cue or an OOV) without writing them. */
	    int32 noovs = 0, nccs = 0, lscr = 0;

	    /* Split it into an array of strings. */
	    textfoo = ckd_salloc(text);
	    n = str2words(textfoo, NULL, 0);
	    if (n < 0)
	        E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
	    if (n == 0) { /* Do nothing! */
	        ckd_free(textfoo);
	        return;
	    }
	    words = ckd_calloc(n, sizeof(*words));
	    str2words(textfoo, words, n);

	    ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);

	    printf("input: %s\n", text);
	    /* ch is in the log base of lmath; rescale it to log2 to get bits. */
	    printf("cross-entropy: %f bits\n",
	           ch * log(logmath_get_base(lmath)) / log(2));

	    /* Calculate perplexity pplx = exp CH */
	    printf("perplexity: %f\n", logmath_exp(lmath, ch));
	    printf("lm score: %d\n", lscr);

	    /* Report OOVs and CCs */
	    printf("%d words evaluated\n", n);
	    printf("%d OOVs, %d context cues removed\n",
	           noovs, nccs);

	    ckd_free(textfoo);
	    ckd_free(words);
	}

	int
	main(int argc, char *argv[])
	{
	cmd_ln_t *config;
	ngram_model_t *lm = NULL;
	logmath_t *lmath;
	const char lmfn, probdefn, lsnfn, text;

	if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) {
	/* This probably just means that we got no arguments. */
	err_set_loglevel(ERR_INFO);
	cmd_ln_log_help_r(NULL, defn);
	return 1;
	}

	verbose = ps_config_bool(config, "verbose");

	/* Create log math object. */
	if ((lmath = logmath_init
	(ps_config_float(config, "logbase"), 0, 0)) == NULL) {
	E_FATAL("Failed to initialize log math\n");
	}

	/* Load the language model. */
	lmfn = ps_config_str(config, "lm");
	if (lmfn == NULL
	\|\| (lm = ngram_model_read(config, lmfn,
	NGRAM_AUTO, lmath)) == NULL) {
	E_FATAL("Failed to load language model from %s\n",
	ps_config_str(config, "lm"));
	}
	if ((probdefn = ps_config_str(config, "probdef")) != NULL)
	ngram_model_read_classdef(lm, probdefn);
	ngram_model_apply_weights(lm,
	ps_config_float(config, "lw"),
	ps_config_float(config, "wip"));

	/* Now evaluate some text. */
	lsnfn = ps_config_str(config, "lsn");
	text = ps_config_str(config, "text");
	if (lsnfn) {
	evaluate_file(lm, lmath, lsnfn);
	}
	else if (text) {
	evaluate_string(lm, lmath, text);
	}

	return 0;
	}