| /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
| /* ==================================================================== | |
| * Copyright (c) 2008 Carnegie Mellon University. All rights | |
| * reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions | |
| * are met: | |
| * | |
| * 1. Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * | |
| * 2. Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in | |
| * the documentation and/or other materials provided with the | |
| * distribution. | |
| * | |
| * This work was supported in part by funding from the Defense Advanced | |
| * Research Projects Agency and the National Science Foundation of the | |
| * United States of America, and the CMU Sphinx Speech Consortium. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND | |
| * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, | |
| * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY | |
| * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| * ==================================================================== | |
| * | |
| */ | |
| /** | |
| * @file ngram_search.h N-Gram based multi-pass search ("FBS") | |
| */ | |
| extern "C" { | |
| } | |
| /** | |
| * Lexical tree node data type. | |
| * | |
| * Not the first HMM for words, which multiplex HMMs based on | |
| * different left contexts. This structure is used both in the | |
| * dynamic HMM tree structure and in the per-word last-phone right | |
| * context fanout. | |
| */ | |
| typedef struct chan_s { | |
| hmm_t hmm; /**< Basic HMM structure. This *must* be first in | |
| the structure because chan_t and root_chan_t are | |
| sometimes used interchangeably */ | |
| struct chan_s *next; /**< first descendant of this channel; or, in the | |
| case of the last phone of a word, the next | |
| alternative right context channel */ | |
| struct chan_s *alt; /**< sibling; i.e., next descendant of parent HMM */ | |
| int32 ciphone; /**< ciphone for this node */ | |
| union { | |
| int32 penult_phn_wid; /**< list of words whose last phone follows this one; | |
| this field indicates the first of the list; the | |
| rest must be built up in a separate array. Used | |
| only within HMM tree. -1 if none */ | |
| int32 rc_id; /**< right-context id for last phone of words */ | |
| } info; | |
| } chan_t; | |
| /** | |
| * Lexical tree node data type for the first phone (root) of each dynamic HMM tree | |
| * structure. | |
| * | |
| * Each state may have a different parent static HMM. Most fields are | |
| * similar to those in chan_t. | |
| */ | |
| typedef struct root_chan_s { | |
| hmm_t hmm; /**< Basic HMM structure. This *must* be first in | |
| the structure because chan_t and root_chan_t are | |
| sometimes used interchangeably. */ | |
| chan_t *next; /**< first descendant of this channel */ | |
| int32 penult_phn_wid; | |
| int32 this_phn_wid; /**< list of words consisting of this single phone; | |
| actually the first of the list, like penult_phn_wid; | |
| -1 if none */ | |
| int16 ciphone; /**< first ciphone of this node; all words rooted at this | |
| node begin with this ciphone */ | |
| int16 ci2phone; /**< second ciphone of this node; one root HMM for each | |
| unique right context */ | |
| } root_chan_t; | |
| /** | |
| * Back pointer table (forward pass lattice; actually a tree) | |
| */ | |
| typedef struct bptbl_s { | |
| frame_idx_t frame; /**< start or end frame */ | |
| uint8 valid; /**< For absolute pruning */ | |
| uint8 refcnt; /**< Reference count (number of successors) */ | |
| int32 wid; /**< Word index */ | |
| int32 bp; /**< Back Pointer */ | |
| int32 score; /**< Score (best among all right contexts) */ | |
| int32 s_idx; /**< Start of BScoreStack for various right contexts*/ | |
| int32 real_wid; /**< wid of this or latest predecessor real word */ | |
| int32 prev_real_wid; /**< wid of second-last real word */ | |
| int16 last_phone; /**< last phone of this word */ | |
| int16 last2_phone; /**< next-to-last phone of this word */ | |
| } bptbl_t; | |
| /** | |
| * Segmentation "iterator" for backpointer table results. | |
| */ | |
| typedef struct bptbl_seg_s { | |
| ps_seg_t base; /**< Base structure. */ | |
| int32 *bpidx; /**< Sequence of backpointer IDs. */ | |
| int16 n_bpidx; /**< Number of backpointer IDs. */ | |
| int16 cur; /**< Current position in bpidx. */ | |
| } bptbl_seg_t; | |
| /* | |
| * Candidates words for entering their last phones. Cleared and rebuilt in each | |
| * frame. | |
| * NOTE: candidates can only be multi-phone, real dictionary words. | |
| */ | |
| typedef struct lastphn_cand_s { | |
| int32 wid; | |
| int32 score; | |
| int32 bp; | |
| int32 next; /* next candidate starting at the same frame */ | |
| } lastphn_cand_t; | |
| /* | |
| * Since the same instance of a word (i.e., <word,start-frame>) reaches its last | |
| * phone several times, we can compute its best BP and LM transition score info | |
| * just the first time and cache it for future occurrences. Structure for such | |
| * a cache. | |
| */ | |
| typedef struct last_ltrans_s { | |
| int32 sf; /* Start frame */ | |
| int32 dscr; /* Delta-score upon entering last phone */ | |
| int32 bp; /* Best BP */ | |
| } last_ltrans_t; | |
| typedef struct cand_sf_s { | |
| int32 bp_ef; | |
| int32 cand; | |
| } cand_sf_t; | |
| /* | |
| * Structure for reorganizing the BP table entries in the current frame according | |
| * to distinct right context ci-phones. Each entry contains the best BP entry for | |
| * a given right context. Each successor word will pick up the correct entry based | |
| * on its first ci-phone. | |
| */ | |
| typedef struct bestbp_rc_s { | |
| int32 score; | |
| int32 path; /* BP table index corresponding to this entry */ | |
| int32 lc; /* right most ci-phone of above BP entry word */ | |
| } bestbp_rc_t; | |
| /** | |
| * Various statistics for profiling. | |
| */ | |
| typedef struct ngram_search_stats_s { | |
| int32 n_phone_eval; | |
| int32 n_root_chan_eval; | |
| int32 n_nonroot_chan_eval; | |
| int32 n_last_chan_eval; | |
| int32 n_word_lastchan_eval; | |
| int32 n_lastphn_cand_utt; | |
| int32 n_fwdflat_chan; | |
| int32 n_fwdflat_words; | |
| int32 n_fwdflat_word_transition; | |
| int32 n_senone_active_utt; | |
| } ngram_search_stats_t; | |
| /** | |
| * N-Gram search module structure. | |
| */ | |
| struct ngram_search_s { | |
| ps_search_t base; | |
| ngram_model_t *lmset; /**< Set of language models. */ | |
| hmm_context_t *hmmctx; /**< HMM context. */ | |
| /* Flags to quickly indicate which passes are enabled. */ | |
| uint8 fwdtree; | |
| uint8 fwdflat; | |
| uint8 bestpath; | |
| /* State of processing. */ | |
| uint8 done; | |
| /* Allocators */ | |
| listelem_alloc_t *chan_alloc; /**< For chan_t */ | |
| listelem_alloc_t *root_chan_alloc; /**< For root_chan_t */ | |
| listelem_alloc_t *latnode_alloc; /**< For latnode_t */ | |
| /** | |
| * Search structure of HMM instances. | |
| * | |
| * The word triphone sequences (HMM instances) are transformed | |
| * into tree structures, one tree per unique left triphone in the | |
| * entire dictionary (actually diphone, since its left context | |
| * varies dyamically during the search process). The entire set | |
| * of trees of channels is allocated once and for all during | |
| * initialization (since dynamic management of active CHANs is | |
| * time consuming), with one exception: the last phones of words, | |
| * that need multiple right context modelling, are not maintained | |
| * in this static structure since there are too many of them and | |
| * few are active at any time. Instead they are maintained as | |
| * linked lists of CHANs, one list per word, and each CHAN in this | |
| * set is allocated only on demand and freed if inactive. | |
| */ | |
| root_chan_t *root_chan; /**< Roots of search tree. */ | |
| int32 n_root_chan_alloc; /**< Number of root_chan allocated */ | |
| int32 n_root_chan; /**< Number of valid root_chan */ | |
| int32 n_nonroot_chan; /**< Number of valid non-root channels */ | |
| int32 max_nonroot_chan; /**< Maximum possible number of non-root channels */ | |
| root_chan_t *rhmm_1ph; /**< Root HMMs for single-phone words */ | |
| /** | |
| * Channels associated with a given word (only used for right | |
| * contexts, single-phone words in fwdtree search, and word HMMs | |
| * in fwdflat search). WARNING: For single-phone words and | |
| * fwdflat search, this actually contains pointers to root_chan_t, | |
| * which are allocated using root_chan_alloc. This is a | |
| * suboptimal state of affairs. | |
| */ | |
| chan_t **word_chan; | |
| bitvec_t *word_active; /**< array of active flags for all words. */ | |
| /** | |
| * Each node in the HMM tree structure may point to a set of words | |
| * whose last phone would follow that node in the tree structure | |
| * (but is not included in the tree structure for reasons | |
| * explained above). The channel node points to one word in this | |
| * set of words. The remaining words are linked through | |
| * homophone_set[]. | |
| * | |
| * Single-phone words are not represented in the HMM tree; they | |
| * are kept in word_chan. | |
| * | |
| * Specifically, homophone_set[w] = wid of next word in the same | |
| * set as w. | |
| */ | |
| int32 *homophone_set; | |
| int32 *single_phone_wid; /**< list of single-phone word ids */ | |
| int32 n_1ph_words; /**< Number single phone words in dict (total) */ | |
| int32 n_1ph_LMwords; /**< Number single phone dict words also in LM; | |
| these come first in single_phone_wid */ | |
| /** | |
| * Array of active channels for current and next frame. | |
| * | |
| * In any frame, only some HMM tree nodes are active. | |
| * active_chan_list[f mod 2] = list of nonroot channels in the HMM | |
| * tree active in frame f. | |
| */ | |
| chan_t ***active_chan_list; | |
| int32 n_active_chan[2]; /**< Number entries in active_chan_list */ | |
| /** | |
| * Array of active multi-phone words for current and next frame. | |
| * | |
| * Similarly to active_chan_list, active_word_list[f mod 2] = list | |
| * of word ids for which active channels exist in word_chan in | |
| * frame f. | |
| * | |
| * Statically allocated single-phone words are always active and | |
| * should not appear in this list. | |
| */ | |
| int32 **active_word_list; | |
| int32 n_active_word[2]; /**< Number entries in active_word_list */ | |
| /* | |
| * FIXME: Document all of these bits. | |
| */ | |
| lastphn_cand_t *lastphn_cand; | |
| int32 n_lastphn_cand; | |
| last_ltrans_t *last_ltrans; /* one per word */ | |
| int32 cand_sf_alloc; | |
| cand_sf_t *cand_sf; | |
| bestbp_rc_t *bestbp_rc; | |
| bptbl_t *bp_table; /* Forward pass lattice */ | |
| int32 bpidx; /* First free BPTable entry */ | |
| int32 bp_table_size; | |
| int32 *bscore_stack; /* Score stack for all possible right contexts */ | |
| int32 bss_head; /* First free BScoreStack entry */ | |
| int32 bscore_stack_size; | |
| int32 n_frame_alloc; /**< Number of frames allocated in bp_table_idx and friends. */ | |
| int32 n_frame; /**< Number of frames actually present. */ | |
| int32 *bp_table_idx; /* First BPTable entry for each frame */ | |
| int32 *word_lat_idx; /* BPTable index for any word in current frame; | |
| cleared before each frame */ | |
| /* | |
| * Flat lexicon (2nd pass) search stuff. | |
| */ | |
| ps_latnode_t **frm_wordlist; /**< List of active words in each frame. */ | |
| int32 *fwdflat_wordlist; /**< List of active word IDs for utterance. */ | |
| bitvec_t *expand_word_flag; | |
| int32 *expand_word_list; | |
| int32 n_expand_words; | |
| int32 min_ef_width; | |
| int32 max_sf_win; | |
| float32 fwdflat_fwdtree_lw_ratio; | |
| int32 best_score; /**< Best Viterbi path score. */ | |
| int32 last_phone_best_score; /**< Best Viterbi path score for last phone. */ | |
| int32 renormalized; | |
| /* | |
| * DAG (3rd pass) search stuff. | |
| */ | |
| float32 bestpath_fwdtree_lw_ratio; | |
| float32 ascale; /**< Acoustic score scale for posterior probabilities. */ | |
| ngram_search_stats_t st; /**< Various statistics for profiling. */ | |
| ptmr_t fwdtree_perf; | |
| ptmr_t fwdflat_perf; | |
| ptmr_t bestpath_perf; | |
| int32 n_tot_frame; | |
| /* A collection of beam widths. */ | |
| int32 beam; | |
| int32 dynamic_beam; | |
| int32 pbeam; | |
| int32 wbeam; | |
| int32 lpbeam; | |
| int32 lponlybeam; | |
| int32 fwdflatbeam; | |
| int32 fwdflatwbeam; | |
| int32 fillpen; | |
| int32 silpen; | |
| int32 wip; | |
| int32 nwpen; | |
| int32 pip; | |
| int32 maxwpf; | |
| int32 maxhmmpf; | |
| }; | |
| typedef struct ngram_search_s ngram_search_t; | |
| /** | |
| * Initialize the N-Gram search module. | |
| */ | |
| ps_search_t *ngram_search_init(const char *name, | |
| ngram_model_t *lm, | |
| cmd_ln_t *config, | |
| acmod_t *acmod, | |
| dict_t *dict, | |
| dict2pid_t *d2p); | |
| /** | |
| * Finalize the N-Gram search module. | |
| */ | |
| void ngram_search_free(ps_search_t *ngs); | |
| /** | |
| * Record the current frame's index in the backpointer table. | |
| * | |
| * @return the current backpointer index. | |
| */ | |
| int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx); | |
| /** | |
| * Enter a word in the backpointer table. | |
| */ | |
| void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, | |
| int32 score, int32 path, int32 rc); | |
| /** | |
| * Allocate last phone channels for all possible right contexts for word w. | |
| */ | |
| void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w); | |
| /** | |
| * Allocate last phone channels for all possible right contexts for word w. | |
| */ | |
| void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w); | |
| /** | |
| * Find the best word exit for the current frame in the backpointer table. | |
| * | |
| * @return the backpointer index of the best word exit. | |
| */ | |
| int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score); | |
| /** | |
| * Backtrace from a given backpointer index to obtain a word hypothesis. | |
| * | |
| * @return a <strong>read-only</strong> string with the best hypothesis. | |
| */ | |
| char const *ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx); | |
| /** | |
| * Compute language and acoustic scores for backpointer table entries. | |
| */ | |
| void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf); | |
| /** | |
| * Construct a word lattice from the current hypothesis. | |
| */ | |
| ps_lattice_t *ngram_search_lattice(ps_search_t *search); | |
| /** | |
| * Get the exit score for a backpointer entry with a given right context. | |
| */ | |
| int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone); | |
| /** | |
| * Sets the global language model. | |
| * | |
| * Sets the language model to use if nothing was passed in configuration | |
| */ | |
| void ngram_search_set_lm(ngram_model_t *lm); | |
| } /* extern "C" */ | |