pocketsphinx

5610573 about 3 years ago

19.6 kB

	/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
	/* ====================================================================
	* Copyright (c) 1999-2004 Carnegie Mellon University. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* This work was supported in part by funding from the Defense Advanced
	* Research Projects Agency and the National Science Foundation of the
	* United States of America, and the CMU Sphinx Speech Consortium.
	*
	* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
	* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
	* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	* ====================================================================
	*
	*/
	/*
	* feat.h -- Cepstral features computation.
	*/

	#ifndef _S3_FEAT_H_
	#define _S3_FEAT_H_

	#include <stdio.h>

	#include <pocketsphinx/prim_type.h>

	#include "fe/fe.h"
	#include "feat/cmn.h"
	#include "feat/agc.h"

	#ifdef __cplusplus
	extern "C" {
	#endif
	#if 0
	/* Fool Emacs. */
	}
	#endif

	/** \file feat.h
	* \brief compute the dynamic coefficients from the cepstral vector.
	*/
	/** Block size, in feature vectors, used when allocating for the
	 *  live-mode decoder (blocks of 256 vectors). */
	#define LIVEBUFBLOCKSIZE 256
	/** Upper bound on the number of frames.
	 *  RAH: I believe this is still too large, but better than before. */
	#define S3_MAX_FRAMES 15000

	/*
	 * Argument definitions for the cepstra-to-feature conversion options.
	 *
	 * Expands to a comma-separated list of brace-enclosed initializers, one
	 * per option, each holding four items in this order:
	 *   option name, ARG_* type tag, default value string (or NULL), help text.
	 * The list has no trailing comma, so it can be followed by further
	 * entries or by the end of the enclosing initializer.
	 * NOTE(review): presumably spliced into an argument-definition table
	 * elsewhere in the project -- confirm against the users of this macro.
	 */
	#define cepstral_to_feature_command_line_macro()                                    \
	    { "feat", ARG_STRING, "1s_c_d_dd",                                              \
	      "Feature stream type, depends on the acoustic model" },                       \
	    { "ceplen", ARG_INTEGER, "13",                                                  \
	      "Number of components in the input feature vector" },                         \
	    { "cmn", ARG_STRING, "live",                                                    \
	      "Cepstral mean normalization scheme ('live', 'batch', or 'none')" },          \
	    { "cmninit", ARG_STRING, "40,3,-1",                                             \
	      "Initial values (comma-separated) for cepstral mean when 'live' is used" },   \
	    { "varnorm", ARG_BOOLEAN, "no",                                                 \
	      "Variance normalize each utterance (only if CMN == current)" },               \
	    { "agc", ARG_STRING, "none",                                                    \
	      "Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" },        \
	    { "agcthresh", ARG_FLOATING, "2.0",                                             \
	      "Initial threshold for automatic gain control" },                             \
	    { "lda", ARG_STRING, NULL,                                                      \
	      "File containing transformation matrix to be applied to features (single-stream features only)" }, \
	    { "ldadim", ARG_INTEGER, "0",                                                   \
	      "Dimensionality of output of feature transformation (0 to use entire matrix)" }, \
	    { "svspec", ARG_STRING, NULL,                                                   \
	      "Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)" }

	/**
	* \struct feat_t
	* \brief Structure for describing a speech feature type
	* Structure for describing a speech feature type (no. of streams and stream widths),
	* as well as the computation for converting the input speech (e.g., Sphinx-II format
	* MFC cepstra) into this type of feature vectors.
	*/
	typedef struct feat_s {
	int refcount; /*< Reference count. /
	char name; /< Printable name for this feature type /
	int32 cepsize; /*< Size of input speech vector (typically, a cepstrum vector) /
	int32 n_stream; /*< Number of feature streams; e.g., 4 in Sphinx-II /
	uint32 stream_len; /< Vector length of each feature stream /
	int32 window_size; /**< Number of extra frames around given input frame needed to compute
	corresponding output feature (so total = window_size2 + 1) /
	int32 n_sv; /*< Number of subvectors /
	uint32 sv_len; /< Vector length of each subvector /
	int32 subvecs; /< Subvector specification (or NULL for none) */
	mfcc_t sv_buf; /< Temporary copy buffer for subvector projection /
	int32 sv_dim; /*< Total dimensionality of subvector (length of sv_buf) /

	cmn_type_t cmn; /*< Type of CMN to be performed on each utterance /
	int32 varnorm; /**< Whether variance normalization is to be performed on each utt;
	Irrelevant if no CMN is performed */
	agc_type_t agc; /*< Type of AGC to be performed on each utterance /

	/**
	* Feature computation function.
	* @param fcb the feat_t describing this feature type
	* @param input pointer into the input cepstra
	* @param feat a 2-d array of output features (n_stream x stream_len)
	* @return 0 if successful, -ve otherwise.
	*
	* Function for converting window of input speech vector
	* (input[-window_size..window_size]) to output feature vector
	* (feat[stream][]). If NULL, no conversion available, the
	* speech input must be feature vector itself.
	**/
	void (compute_feat)(struct feat_s fcb, mfcc_t input, mfcc_t feat);
	cmn_t cmn_struct; /*< Structure that stores the temporary variables for cepstral
	means normalization*/
	agc_t agc_struct; /*< Structure that stores the temporary variables for acoustic
	gain control*/

	mfcc_t cepbuf; /< Circular buffer of MFCC frames for live feature computation. */
	mfcc_t tmpcepbuf; /< Array of pointers into cepbuf to handle border cases. */
	int32 bufpos; /*< Write index in cepbuf. /
	int32 curpos; /*< Read index in cepbuf. /

	mfcc_t *lda; /< Array of linear transformations (for LDA, MLLT, or whatever) */
	uint32 n_lda; /*< Number of linear transformations in lda. /
	uint32 out_dim; /*< Output dimensionality /
	} feat_t;

	/**
	 * Name of feature type.
	 *
	 * Evaluates to the <code>name</code> member of the feat_t pointed to by f.
	 */
	#define feat_name(f) ((f)->name)
	/**
	 * Input dimensionality of feature.
	 *
	 * Evaluates to the <code>cepsize</code> member of the feat_t pointed to by f.
	 */
	#define feat_cepsize(f) ((f)->cepsize)
	/**
	 * Size of dynamic feature window.
	 *
	 * Evaluates to the <code>window_size</code> member of the feat_t pointed to by f.
	 */
	#define feat_window_size(f) ((f)->window_size)
	/**
	 * Number of feature streams.
	 *
	 * Evaluates to the <code>n_stream</code> member of the feat_t pointed to by f.
	 *
	 * @deprecated Do not use this, use feat_dimension1() instead.
	 */
	#define feat_n_stream(f) ((f)->n_stream)
	/**
	 * Length of feature stream i.
	 *
	 * Evaluates to element i of the <code>stream_len</code> array of the
	 * feat_t pointed to by f.  No bounds check is performed on i.
	 *
	 * @deprecated Do not use this, use feat_dimension2() instead.
	 */
	#define feat_stream_len(f,i) ((f)->stream_len[i])
	/**
	 * Number of streams or subvectors in feature output.
	 *
	 * Evaluates to <code>n_sv</code> when it is non-zero, otherwise to
	 * <code>n_stream</code>.  The argument f is expanded more than once, so
	 * it must not have side effects.  Every use of f is parenthesized so
	 * that expressions such as <code>&amp;obj</code> or <code>p + 1</code>
	 * can be passed safely.
	 */
	#define feat_dimension1(f) ((f)->n_sv ? (f)->n_sv : (f)->n_stream)
	/**
	 * Dimensionality of stream/subvector i in feature output.
	 *
	 * Evaluates to <code>out_dim</code> when <code>lda</code> is non-NULL;
	 * otherwise to <code>sv_len[i]</code> when <code>sv_len</code> is
	 * non-NULL; otherwise to <code>stream_len[i]</code>.  The argument f is
	 * expanded more than once, so it must not have side effects.  Every use
	 * of f is parenthesized so that expressions such as <code>&amp;obj</code>
	 * can be passed safely.
	 */
	#define feat_dimension2(f,i) ((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : (f)->stream_len[i]))
	/**
	 * Total dimensionality of feature output.
	 *
	 * Evaluates to the <code>out_dim</code> member of the feat_t pointed to by f.
	 */
	#define feat_dimension(f) ((f)->out_dim)
	/**
	 * Array with stream/subvector lengths
	 *
	 * Evaluates to a pointer to the lengths: the address of
	 * <code>out_dim</code> (a single element) when <code>lda</code> is
	 * non-NULL; otherwise <code>sv_len</code> when it is non-NULL; otherwise
	 * <code>stream_len</code>.  The argument f is expanded more than once,
	 * so it must not have side effects.  Every use of f is parenthesized so
	 * that expressions such as <code>&amp;obj</code> can be passed safely.
	 */
	#define feat_stream_lengths(f) ((f)->lda ? (&(f)->out_dim) : ((f)->sv_len ? (f)->sv_len : (f)->stream_len))

	/**
	 * Parse subvector specification string.
	 *
	 * Format of specification:
	 *   \li '/' separated list of subvectors
	 *   \li each subvector is a ',' separated list of subranges
	 *   \li each subrange is a single \verbatim <number> \endverbatim or
	 *       \verbatim <number>-<number> \endverbatim (inclusive), where
	 *       \verbatim <number> \endverbatim is a feature vector dimension
	 *       specifier.
	 *
	 * E.g., "24,0-11/25,12-23/26,27-38" has:
	 *   \li 3 subvectors
	 *   \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11.
	 *   \li etc.
	 *
	 * @param str subvector specification string.
	 * @return allocated 2-D array of subvector specs (free with
	 * subvecs_free()).  If there are N subvectors specified, subvec[N] =
	 * NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of
	 * feature dims.
	 */
	int32 **parse_subvecs(char const *str);

	/**
	 * Free array of subvector specs.
	 *
	 * @param subvecs array of subvector specs, as returned by parse_subvecs().
	 */
	void subvecs_free(int32 **subvecs);


	/**
	 * Allocate an array to hold several frames worth of feature vectors.  The returned value
	 * is the mfcc_t ***data array, organized as follows:
	 *
	 * - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ...
	 * - data[1][0] = frame 1 stream 0 vector, data[1][1] = frame 1 stream 1 vector, ...
	 * - data[2][0] = frame 2 stream 0 vector, data[2][1] = frame 2 stream 1 vector, ...
	 * - ...
	 *
	 * NOTE: For I/O convenience, the entire data area is allocated as one contiguous block.
	 * @return pointer to the allocated space if successful, NULL if any error.
	 */
	mfcc_t ***feat_array_alloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used
	                                             to obtain number of streams and stream sizes */
	                           int32 nfr    /**< In: Number of frames for which to allocate */
	    );

	/**
	* Realloate the array of features. Requires us to know the old size
	*/
	mfcc_t **feat_array_realloc(feat_t fcb, /**< In: Descriptor from feat_init(), used
	to obtain number of streams and stream sizes */
	mfcc_t *old_feat, /< Feature array. Freed */
	int32 ofr, /*< In: Previous number of frames /
	int32 nfr /*< In: Number of frames for which to allocate /
	);

	/**
	 * Free a buffer allocated with feat_array_alloc()
	 *
	 * @param feat the feature array to release.
	 */
	void feat_array_free(mfcc_t ***feat);


	/**
	* Initialize feature module to use the selected type of feature stream.
	* One-time only initialization at the beginning of the program. Input type
	* is a string defining the kind of input->feature conversion desired:
	*
	* - "s2_4x": s2mfc->Sphinx-II 4-feature stream,
	* - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream,
	* - "s3_1x39": s2mfc->Sphinx 3.0 single feature stream,
	* - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated
	* feature stream lengths. In this case, the input data is already in the
	* feature format and there is no conversion necessary.
	*
	* @return (feat_t *) descriptor if successful, NULL if error. Caller
	* must not directly modify the contents of the returned value.
	*/
	feat_t feat_init(char const type,/*< In: Type of feature stream /
	cmn_type_t cmn, /**< In: Type of cepstram mean normalization to
	be done before feature computation; can be
	CMN_NONE (for none) */
	int32 varnorm, /**< In: (boolean) Whether variance
	normalization done on each utt; only
	applicable if CMN also done */
	agc_type_t agc, /**< In: Type of automatic gain control to be
	done before feature computation */
	int32 breport, /*< In: Whether to show a report for feat_t /
	int32 cepsize /**< Number of components in the input vector
	(or 0 for the default for this feature type,
	which is usually 13) */
	);

	/**
	 * Add an LDA transformation to the feature module from a file.
	 * @return 0 for success or -1 if reading the LDA file failed.
	 **/
	int32 feat_read_lda(feat_t *feat,        /**< In: Descriptor from feat_init() */
	                    const char *ldafile, /**< In: File to read the LDA matrix from. */
	                    int32 dim            /**< In: Dimensionality of LDA output. */
	    );

	/**
	* Transform a block of features using the feature module's LDA transform.
	**/
	void feat_lda_transform(feat_t fcb, /< In: Descriptor from feat_init() /
	mfcc_t *inout_feat, /< Feature block to transform. */
	uint32 nfr /*< In: Number of frames in inout_feat. /
	);

	/**
	 * Add a subvector specification to the feature module.
	 *
	 * The subvector splitting will be performed after dynamic feature
	 * computation, CMN, AGC, and any LDA transformation.  The number of
	 * streams in the dynamic feature type must be one, as with LDA.
	 *
	 * After adding a subvector specification, the output of feature
	 * computation will be split into multiple subvectors, and
	 * feat_array_alloc() will allocate pointers accordingly.  The number
	 * of <em>streams</em> will remain the same.
	 * NOTE(review): the original sentence was truncated after "remain the";
	 * "same" is the presumed ending -- confirm against the implementation.
	 *
	 * @param fcb the feature descriptor.
	 * @param subvecs subvector specification.  This pointer is retained
	 * by the feat_t and should not be freed manually.
	 * @return 0 for success or -1 if the subvector specification was
	 * invalid.
	 */
	int feat_set_subvecs(feat_t *fcb, int32 **subvecs);

	/**
	* Print the given block of feature vectors to the given FILE.
	*/
	void feat_print(feat_t fcb, /< In: Descriptor from feat_init() /
	mfcc_t *feat, /< In: Feature data to be printed */
	int32 nfr, /*< In: Number of frames of feature data above /
	FILE fp /< In: Output file pointer /
	);


	/**
	 * Read a specified MFC file (or given segment within it), perform
	 * CMN/AGC as indicated by <code>fcb</code>, and compute feature
	 * vectors.  Feature vectors are computed for the entire segment
	 * specified, by including additional surrounding or padding frames to
	 * accommodate the feature windows.
	 *
	 * @return Number of frames of feature vectors computed if successful;
	 * -1 if any error.  If <code>feat</code> is NULL, then no actual
	 * computation will be done, and the number of frames which must be
	 * allocated will be returned.
	 *
	 * A note on how the file path is constructed: If the control file
	 * already specifies extension or absolute path, then these are not
	 * applied.  The default extension is defined by the application.
	 */
	int32 feat_s2mfc2feat(feat_t *fcb,        /**< In: Descriptor from feat_init() */
	                      const char *file,   /**< In: File to be read */
	                      const char *dir,    /**< In: Directory prefix for file,
	                                               if needed; can be NULL */
	                      const char *cepext, /**< In: Extension of the
	                                               cepstrum file. It cannot be
	                                               NULL */
	                      int32 sf, int32 ef, /* Start/End frames
	                                             within file to be read. Use
	                                             0,-1 to process entire
	                                             file */
	                      mfcc_t ***feat,     /**< Out: Computed feature vectors;
	                                               caller must allocate this space */
	                      int32 maxfr         /**< In: Available space (number of frames) in
	                                               above feat array; it must be
	                                               sufficient to hold the result.
	                                               Pass -1 for no limit. */
	    );


	/**
	* Feature computation routine for live mode decoder.
	*
	* This function computes features for blocks of incoming data. It
	* retains an internal buffer for computing deltas, which means that
	* the number of output frames will not necessarily equal the number
	* of input frames.
	*
	* <strong>It is very important</strong> to realize that the number of
	* output frames can be <strong>greater than</strong> the number of
	* input frames, specifically when <code>endutt</code> is true. It is
	* guaranteed to never exceed <code>*inout_ncep +
	* feat_window_size(fcb)</code>. You <strong>MUST</strong> have
	* allocated at least that many frames in <code>ofeat</code>, or you
	* will experience a buffer overflow.
	*
	* If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will
	* be done. Otherwise only CMN_PRIOR and AGC_EMAX will be done.
	*
	* If beginutt is false, endutt is true, and the number of input
	* frames exceeds the input size, then end-of-utterance processing
	* won't actually be done. This condition can easily be checked,
	* because <code>*inout_ncep</code> will equal the return value on
	* exit, and will also be smaller than the value of
	* <code>*inout_ncep</code> on entry.
	*
	* @return The number of output frames actually computed.
	**/
	int32 feat_s2mfc2feat_live(feat_t fcb, /< In: Descriptor from feat_init() /
	mfcc_t uttcep, /< In: Incoming cepstral buffer */
	int32 inout_ncep,/*< In: Size of incoming buffer.
	Out: Number of incoming frames consumed. */
	int32 beginutt, /*< In: Begining of utterance flag /
	int32 endutt, /*< In: End of utterance flag /
	mfcc_t *ofeat /< In: Output feature buffer. See
	<strong>VERY IMPORTANT</strong> note
	about the size of this buffer above. */
	);


	/**
	 * Update the normalization stats, possibly at the end of an utterance.
	 *
	 * @param fcb the feature descriptor whose stats are updated.
	 * NOTE(review): which stats (CMN, AGC, or both) are updated is not
	 * visible from this header -- confirm against the implementation.
	 */
	void feat_update_stats(feat_t *fcb);


	/**
	 * Retain ownership of feat_t.
	 *
	 * @param f the feat_t to retain.
	 * @return pointer to retained feat_t.
	 */
	feat_t *feat_retain(feat_t *f);

	/**
	 * Release resource associated with feat_t
	 *
	 * @return new reference count (0 if freed)
	 */
	int feat_free(feat_t *f /**< In: feat_t */
	    );

	/**
	 * Report the feat_t data structure
	 */
	void feat_report(feat_t *f /**< In: feat_t */
	    );
	#ifdef __cplusplus
	}
	#endif


	#endif