File size: 9,798 Bytes
5610573 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2013 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/*
* fsg_lextree.h -- The collection of all the lextrees for the entire FSM.
*
*/
#ifndef __S2_FSG_LEXTREE_H__
#define __S2_FSG_LEXTREE_H__
#include <pocketsphinx.h>
#include "lm/fsg_model.h"
#include "hmm.h"
#include "dict.h"
#include "dict2pid.h"
#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif
/*
* Compile-time constant determining the size of the
* bitvector fsg_pnode_t.fsg_pnode_ctxt_t.bv. (See below.)
* But it makes memory allocation simpler and more efficient.
* Make it smaller (2) to save memory if your phoneset has less than
* 64 phones.
*/
#define FSG_PNODE_CTXT_BVSZ 4
typedef struct fsg_pnode_ctxt_s {
uint32 bv[FSG_PNODE_CTXT_BVSZ];
} fsg_pnode_ctxt_t;
/*
* All transitions (words) out of any given FSG state represented are by a
* phonetic prefix lextree (except for epsilon or null transitions; they
* are not part of the lextree). Lextree leaf nodes represent individual
* FSG transitions, so no sharing is allowed at the leaf nodes. The FSG
* transition probs are distributed along the lextree: the prob at a node
* is the max of the probs of all leaf nodes (and, hence, FSG transitions)
* reachable from that node.
*
* To conserve memory, the underlying HMMs with state-level information are
* allocated only as needed. Root and leaf nodes must also account for all
* the possible phonetic contexts, with an independent HMM for each distinct
* context.
*/
typedef struct fsg_pnode_s {
/*
* If this is not a leaf node, the first successor (child) node. Otherwise
* the parent FSG transition for which this is the leaf node (for figuring
* the FSG destination state, and word emitted by the transition). A node
* may have several children. The succ ptr gives just the first; the rest
* are linked via the sibling ptr below.
*/
union {
struct fsg_pnode_s *succ;
fsg_link_t *fsglink;
} next;
/*
* For simplicity of memory management (i.e., freeing the pnodes), all
* pnodes allocated for all transitions out of a state are maintained in a
* linear linked list through the alloc_next pointer.
*/
struct fsg_pnode_s *alloc_next;
/*
* The next node that is also a child of the parent of this node; NULL if
* none.
*/
struct fsg_pnode_s *sibling;
/*
* The transition (log) probability to be incurred upon transitioning to
* this node. (Transition probabilities are really associated with the
* transitions. But a lextree node has exactly one incoming transition.
* Hence, the prob can be associated with the node.)
* This is a logs2(prob) value, and includes the language weight.
*/
int32 logs2prob;
/*
* The root and leaf positions associated with any transition have to deal
* with multiple phonetic contexts. However, different contexts may result
* in the same SSID (senone-seq ID), and can share a single pnode with that
* SSID. But the pnode should track the set of context CI phones that share
* it. Hence the fsg_pnode_ctxt_t bit-vector set-representation. (For
* simplicity of implementation, its size is a compile-time constant for
* now.) Single phone words would need a 2-D array of context, but that's
* too expensive. For now, they simply use SIL as right context, so only
* the left context is properly modelled.
* (For word-internal phones, this field is unused, of course.)
*/
fsg_pnode_ctxt_t ctxt;
uint16 ci_ext; /* This node's CIphone as viewed externally (context) */
uint8 ppos; /* Phoneme position in pronunciation */
uint8 leaf; /* Whether this is a leaf node */
/* HMM-state-level stuff here */
hmm_context_t *ctx;
hmm_t hmm;
} fsg_pnode_t;
/* Access macros */
#define fsg_pnode_leaf(p) ((p)->leaf)
#define fsg_pnode_logs2prob(p) ((p)->logs2prob)
#define fsg_pnode_succ(p) ((p)->next.succ)
#define fsg_pnode_fsglink(p) ((p)->next.fsglink)
#define fsg_pnode_sibling(p) ((p)->sibling)
#define fsg_pnode_hmmptr(p) (&((p)->hmm))
#define fsg_pnode_ci_ext(p) ((p)->ci_ext)
#define fsg_pnode_ppos(p) ((p)->ppos)
#define fsg_pnode_leaf(p) ((p)->leaf)
#define fsg_pnode_ctxt(p) ((p)->ctxt)
#define fsg_pnode_add_ctxt(p,c) ((p)->ctxt.bv[(c)>>5] |= (1 << ((c)&0x001f)))
/*
* The following is macroized because its called very frequently
* ::: uint32 fsg_pnode_ctxt_sub (fsg_pnode_ctxt_t *src, fsg_pnode_ctxt_t *sub);
*/
/*
* Subtract bitvector sub from bitvector src (src updated with the result).
* Return 0 if result is all 0, non-zero otherwise.
*/
#if (FSG_PNODE_CTXT_BVSZ == 1)
#define FSG_PNODE_CTXT_SUB(src,sub) \
((src)->bv[0] = (~((sub)->bv[0]) & (src)->bv[0]))
#elif (FSG_PNODE_CTXT_BVSZ == 2)
#define FSG_PNODE_CTXT_SUB(src,sub) \
(((src)->bv[0] = (~((sub)->bv[0]) & (src)->bv[0])) | \
((src)->bv[1] = (~((sub)->bv[1]) & (src)->bv[1])))
#elif (FSG_PNODE_CTXT_BVSZ == 4)
#define FSG_PNODE_CTXT_SUB(src,sub) \
(((src)->bv[0] = (~((sub)->bv[0]) & (src)->bv[0])) | \
((src)->bv[1] = (~((sub)->bv[1]) & (src)->bv[1])) | \
((src)->bv[2] = (~((sub)->bv[2]) & (src)->bv[2])) | \
((src)->bv[3] = (~((sub)->bv[3]) & (src)->bv[3])))
#else
#define FSG_PNODE_CTXT_SUB(src,sub) fsg_pnode_ctxt_sub_generic((src),(sub))
#endif
/**
* Collection of lextrees for an FSG.
*/
typedef struct fsg_lextree_s {
fsg_model_t *fsg; /**< The fsg for which this lextree is built. */
hmm_context_t *ctx; /**< HMM context structure. */
dict_t *dict; /**< Pronunciation dictionary for this FSG. */
dict2pid_t *d2p; /**< Context-dependent phone mappings for this FSG. */
bin_mdef_t *mdef; /**< Model definition (triphone mappings). */
/*
* Left and right CIphone sets for each state.
* Left context CIphones for a state S: If word W transitions into S, W's
* final CIphone is in S's {lc}. Words transitioning out of S must consider
* these left context CIphones.
* Similarly, right contexts for state S: If word W transitions out of S,
* W's first CIphone is in S's {rc}. Words transitioning into S must consider
* these right contexts.
*
* NOTE: Words may transition into and out of S INDIRECTLY, with intermediate
* null transitions.
* NOTE: Single-phone words are difficult; only SILENCE right context is
* modelled for them.
* NOTE: Non-silence filler phones aren't included in these sets. Filler
* words don't use context, and present the SILENCE phone as context to
* adjacent words.
*/
int16 **lc; /**< Left context triphone mappings for FSG. */
int16 **rc; /**< Right context triphone mappings for FSG. */
fsg_pnode_t **root; /* root[s] = lextree representing all transitions
out of state s. Note that the "tree" for each
state is actually a collection of trees, linked
via fsg_pnode_t.sibling (root[s]->sibling) */
fsg_pnode_t **alloc_head; /* alloc_head[s] = head of linear list of all
pnodes allocated for state s */
int32 n_pnode; /* #HMM nodes in search structure */
int32 wip;
int32 pip;
} fsg_lextree_t;
/* Access macros */
#define fsg_lextree_root(lt,s) ((lt)->root[s])
#define fsg_lextree_n_pnode(lt) ((lt)->n_pnode)
/**
* Create, initialize, and return a new phonetic lextree for the given FSG.
*/
fsg_lextree_t *fsg_lextree_init(fsg_model_t *fsg, dict_t *dict,
dict2pid_t *d2p,
bin_mdef_t *mdef, hmm_context_t *ctx,
int32 wip, int32 pip);
/**
* Free lextrees for an FSG.
*/
void fsg_lextree_free(fsg_lextree_t *fsg);
/**
* Print an FSG lextree to a file for debugging.
*/
void fsg_lextree_dump(fsg_lextree_t *fsg, FILE *fh);
/**
* Mark the given pnode as inactive (for search).
*/
void fsg_psubtree_pnode_deactivate(fsg_pnode_t *pnode);
/**
* Set all flags on in the given context bitvector.
*/
void fsg_pnode_add_all_ctxt(fsg_pnode_ctxt_t *ctxt);
/**
* Generic variant for arbitrary size
*/
uint32 fsg_pnode_ctxt_sub_generic(fsg_pnode_ctxt_t *src, fsg_pnode_ctxt_t *sub);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif
|