pocketsphinx

5610573 about 3 years ago

15.7 kB

	/* -- c-basic-offset: 4; indent-tabs-mode: nil -- */
	/* ====================================================================
	* Copyright (c) 2008 Carnegie Mellon University. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* This work was supported in part by funding from the Defense Advanced
	* Research Projects Agency and the National Science Foundation of the
	* United States of America, and the CMU Sphinx Speech Consortium.
	*
	* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
	* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
	* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	* ====================================================================
	*
	*/

	/**
	* @file ngram_search.h N-Gram based multi-pass search ("FBS")
	*/

	#ifndef __NGRAM_SEARCH_H__
	#define __NGRAM_SEARCH_H__

	#include <pocketsphinx.h>

	#include "lm/ngram_model.h"
	#include "util/listelem_alloc.h"
	#include "pocketsphinx_internal.h"
	#include "hmm.h"

	#ifdef __cplusplus
	extern "C" {
	#endif
	#if 0
	}
	#endif

	/**
	* Lexical tree node data type.
	*
	* Not the first HMM for words, which multiplex HMMs based on
	* different left contexts. This structure is used both in the
	* dynamic HMM tree structure and in the per-word last-phone right
	* context fanout.
	*/
	typedef struct chan_s {
	hmm_t hmm; /*< Basic HMM structure. This must* be first in
	the structure because chan_t and root_chan_t are
	sometimes used interchangeably */
	struct chan_s next; /*< first descendant of this channel; or, in the
	case of the last phone of a word, the next
	alternative right context channel */
	struct chan_s alt; /< sibling; i.e., next descendant of parent HMM /

	int32 ciphone; /*< ciphone for this node /
	union {
	int32 penult_phn_wid; /**< list of words whose last phone follows this one;
	this field indicates the first of the list; the
	rest must be built up in a separate array. Used
	only within HMM tree. -1 if none */
	int32 rc_id; /*< right-context id for last phone of words /
	} info;
	} chan_t;

	/**
	* Lexical tree node data type for the first phone (root) of each dynamic HMM tree
	* structure.
	*
	* Each state may have a different parent static HMM. Most fields are
	* similar to those in chan_t.
	*/
	typedef struct root_chan_s {
	hmm_t hmm; /*< Basic HMM structure. This must* be first in
	the structure because chan_t and root_chan_t are
	sometimes used interchangeably. */
	chan_t next; /< first descendant of this channel /

	int32 penult_phn_wid;
	int32 this_phn_wid; /**< list of words consisting of this single phone;
	actually the first of the list, like penult_phn_wid;
	-1 if none */
	int16 ciphone; /**< first ciphone of this node; all words rooted at this
	node begin with this ciphone */
	int16 ci2phone; /**< second ciphone of this node; one root HMM for each
	unique right context */
	} root_chan_t;

	/**
	* Back pointer table (forward pass lattice; actually a tree)
	*/
	typedef struct bptbl_s {
	frame_idx_t frame; /*< start or end frame /
	uint8 valid; /*< For absolute pruning /
	uint8 refcnt; /*< Reference count (number of successors) /
	int32 wid; /*< Word index /
	int32 bp; /*< Back Pointer /
	int32 score; /*< Score (best among all right contexts) /
	int32 s_idx; /*< Start of BScoreStack for various right contexts/
	int32 real_wid; /*< wid of this or latest predecessor real word /
	int32 prev_real_wid; /*< wid of second-last real word /
	int16 last_phone; /*< last phone of this word /
	int16 last2_phone; /*< next-to-last phone of this word /
	} bptbl_t;

	/**
	* Segmentation "iterator" for backpointer table results.
	*/
	typedef struct bptbl_seg_s {
	ps_seg_t base; /*< Base structure. /
	int32 bpidx; /< Sequence of backpointer IDs. /
	int16 n_bpidx; /*< Number of backpointer IDs. /
	int16 cur; /*< Current position in bpidx. /
	} bptbl_seg_t;

	/*
	* Candidates words for entering their last phones. Cleared and rebuilt in each
	* frame.
	* NOTE: candidates can only be multi-phone, real dictionary words.
	*/
	typedef struct lastphn_cand_s {
	int32 wid;
	int32 score;
	int32 bp;
	int32 next; /* next candidate starting at the same frame */
	} lastphn_cand_t;

	/*
	* Since the same instance of a word (i.e., <word,start-frame>) reaches its last
	* phone several times, we can compute its best BP and LM transition score info
	* just the first time and cache it for future occurrences. Structure for such
	* a cache.
	*/
	typedef struct last_ltrans_s {
	int32 sf; /* Start frame */
	int32 dscr; /* Delta-score upon entering last phone */
	int32 bp; /* Best BP */
	} last_ltrans_t;

	#define CAND_SF_ALLOCSIZE 32
	typedef struct cand_sf_s {
	int32 bp_ef;
	int32 cand;
	} cand_sf_t;

	/*
	* Structure for reorganizing the BP table entries in the current frame according
	* to distinct right context ci-phones. Each entry contains the best BP entry for
	* a given right context. Each successor word will pick up the correct entry based
	* on its first ci-phone.
	*/
	typedef struct bestbp_rc_s {
	int32 score;
	int32 path; /* BP table index corresponding to this entry */
	int32 lc; /* right most ci-phone of above BP entry word */
	} bestbp_rc_t;

	#define NO_BP -1

	/**
	* Various statistics for profiling.
	*/
	typedef struct ngram_search_stats_s {
	int32 n_phone_eval;
	int32 n_root_chan_eval;
	int32 n_nonroot_chan_eval;
	int32 n_last_chan_eval;
	int32 n_word_lastchan_eval;
	int32 n_lastphn_cand_utt;
	int32 n_fwdflat_chan;
	int32 n_fwdflat_words;
	int32 n_fwdflat_word_transition;
	int32 n_senone_active_utt;
	} ngram_search_stats_t;


	/**
	* N-Gram search module structure.
	*/
	struct ngram_search_s {
	ps_search_t base;
	ngram_model_t lmset; /< Set of language models. /
	hmm_context_t hmmctx; /< HMM context. /

	/* Flags to quickly indicate which passes are enabled. */
	uint8 fwdtree;
	uint8 fwdflat;
	uint8 bestpath;

	/* State of processing. */
	uint8 done;

	/* Allocators */
	listelem_alloc_t chan_alloc; /< For chan_t /
	listelem_alloc_t root_chan_alloc; /< For root_chan_t /
	listelem_alloc_t latnode_alloc; /< For latnode_t /

	/**
	* Search structure of HMM instances.
	*
	* The word triphone sequences (HMM instances) are transformed
	* into tree structures, one tree per unique left triphone in the
	* entire dictionary (actually diphone, since its left context
	* varies dyamically during the search process). The entire set
	* of trees of channels is allocated once and for all during
	* initialization (since dynamic management of active CHANs is
	* time consuming), with one exception: the last phones of words,
	* that need multiple right context modelling, are not maintained
	* in this static structure since there are too many of them and
	* few are active at any time. Instead they are maintained as
	* linked lists of CHANs, one list per word, and each CHAN in this
	* set is allocated only on demand and freed if inactive.
	*/
	root_chan_t root_chan; /< Roots of search tree. /
	int32 n_root_chan_alloc; /*< Number of root_chan allocated /
	int32 n_root_chan; /*< Number of valid root_chan /
	int32 n_nonroot_chan; /*< Number of valid non-root channels /
	int32 max_nonroot_chan; /*< Maximum possible number of non-root channels /
	root_chan_t rhmm_1ph; /< Root HMMs for single-phone words /

	/**
	* Channels associated with a given word (only used for right
	* contexts, single-phone words in fwdtree search, and word HMMs
	* in fwdflat search). WARNING: For single-phone words and
	* fwdflat search, this actually contains pointers to root_chan_t,
	* which are allocated using root_chan_alloc. This is a
	* suboptimal state of affairs.
	*/
	chan_t **word_chan;
	bitvec_t word_active; /< array of active flags for all words. /

	/**
	* Each node in the HMM tree structure may point to a set of words
	* whose last phone would follow that node in the tree structure
	* (but is not included in the tree structure for reasons
	* explained above). The channel node points to one word in this
	* set of words. The remaining words are linked through
	* homophone_set[].
	*
	* Single-phone words are not represented in the HMM tree; they
	* are kept in word_chan.
	*
	* Specifically, homophone_set[w] = wid of next word in the same
	* set as w.
	*/
	int32 *homophone_set;
	int32 single_phone_wid; /< list of single-phone word ids /
	int32 n_1ph_words; /*< Number single phone words in dict (total) /
	int32 n_1ph_LMwords; /**< Number single phone dict words also in LM;
	these come first in single_phone_wid */
	/**
	* Array of active channels for current and next frame.
	*
	* In any frame, only some HMM tree nodes are active.
	* active_chan_list[f mod 2] = list of nonroot channels in the HMM
	* tree active in frame f.
	*/
	chan_t ***active_chan_list;
	int32 n_active_chan[2]; /*< Number entries in active_chan_list /
	/**
	* Array of active multi-phone words for current and next frame.
	*
	* Similarly to active_chan_list, active_word_list[f mod 2] = list
	* of word ids for which active channels exist in word_chan in
	* frame f.
	*
	* Statically allocated single-phone words are always active and
	* should not appear in this list.
	*/
	int32 **active_word_list;
	int32 n_active_word[2]; /*< Number entries in active_word_list /

	/*
	* FIXME: Document all of these bits.
	*/
	lastphn_cand_t *lastphn_cand;
	int32 n_lastphn_cand;
	last_ltrans_t last_ltrans; / one per word */
	int32 cand_sf_alloc;
	cand_sf_t *cand_sf;
	bestbp_rc_t *bestbp_rc;

	bptbl_t bp_table; / Forward pass lattice */
	int32 bpidx; /* First free BPTable entry */
	int32 bp_table_size;
	int32 bscore_stack; / Score stack for all possible right contexts */
	int32 bss_head; /* First free BScoreStack entry */
	int32 bscore_stack_size;

	int32 n_frame_alloc; /*< Number of frames allocated in bp_table_idx and friends. /
	int32 n_frame; /*< Number of frames actually present. /
	int32 bp_table_idx; / First BPTable entry for each frame */
	int32 word_lat_idx; / BPTable index for any word in current frame;
	cleared before each frame */

	/*
	* Flat lexicon (2nd pass) search stuff.
	*/
	ps_latnode_t frm_wordlist; /< List of active words in each frame. */
	int32 fwdflat_wordlist; /< List of active word IDs for utterance. /
	bitvec_t *expand_word_flag;
	int32 *expand_word_list;
	int32 n_expand_words;
	int32 min_ef_width;
	int32 max_sf_win;
	float32 fwdflat_fwdtree_lw_ratio;

	int32 best_score; /*< Best Viterbi path score. /
	int32 last_phone_best_score; /*< Best Viterbi path score for last phone. /
	int32 renormalized;

	/*
	* DAG (3rd pass) search stuff.
	*/
	float32 bestpath_fwdtree_lw_ratio;
	float32 ascale; /*< Acoustic score scale for posterior probabilities. /

	ngram_search_stats_t st; /*< Various statistics for profiling. /
	ptmr_t fwdtree_perf;
	ptmr_t fwdflat_perf;
	ptmr_t bestpath_perf;
	int32 n_tot_frame;

	/* A collection of beam widths. */
	int32 beam;
	int32 dynamic_beam;
	int32 pbeam;
	int32 wbeam;
	int32 lpbeam;
	int32 lponlybeam;
	int32 fwdflatbeam;
	int32 fwdflatwbeam;
	int32 fillpen;
	int32 silpen;
	int32 wip;
	int32 nwpen;
	int32 pip;
	int32 maxwpf;
	int32 maxhmmpf;
	};
	typedef struct ngram_search_s ngram_search_t;

	/**
	* Initialize the N-Gram search module.
	*/
	ps_search_t ngram_search_init(const char name,
	ngram_model_t *lm,
	cmd_ln_t *config,
	acmod_t *acmod,
	dict_t *dict,
	dict2pid_t *d2p);

	/**
	* Finalize the N-Gram search module.
	*/
	void ngram_search_free(ps_search_t *ngs);

	/**
	* Record the current frame's index in the backpointer table.
	*
	* @return the current backpointer index.
	*/
	int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx);

	/**
	* Enter a word in the backpointer table.
	*/
	void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w,
	int32 score, int32 path, int32 rc);

	/**
	* Allocate last phone channels for all possible right contexts for word w.
	*/
	void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w);

	/**
	* Allocate last phone channels for all possible right contexts for word w.
	*/
	void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w);

	/**
	* Find the best word exit for the current frame in the backpointer table.
	*
	* @return the backpointer index of the best word exit.
	*/
	int ngram_search_find_exit(ngram_search_t ngs, int frame_idx, int32 out_best_score);

	/**
	* Backtrace from a given backpointer index to obtain a word hypothesis.
	*
	* @return a <strong>read-only</strong> string with the best hypothesis.
	*/
	char const ngram_search_bp_hyp(ngram_search_t ngs, int bpidx);

	/**
	* Compute language and acoustic scores for backpointer table entries.
	*/
	void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf);

	/**
	* Construct a word lattice from the current hypothesis.
	*/
	ps_lattice_t ngram_search_lattice(ps_search_t search);

	/**
	* Get the exit score for a backpointer entry with a given right context.
	*/
	int32 ngram_search_exit_score(ngram_search_t ngs, bptbl_t pbe, int rcphone);

	/**
	* Sets the global language model.
	*
	* Sets the language model to use if nothing was passed in configuration
	*/
	void ngram_search_set_lm(ngram_model_t *lm);

	#ifdef __cplusplus
	} /* extern "C" */
	#endif

	#endif /* __NGRAM_SEARCH_H__ */