/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2007 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* This work was supported in part by funding from the Defense Advanced
* Research Projects Agency and the National Science Foundation of the
* United States of America, and the CMU Sphinx Speech Consortium.
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/*
* \file ngram_model.c N-Gram language models.
*
* Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <string.h>
#include <assert.h>
#include <pocketsphinx/err.h>
#include <pocketsphinx/logmath.h>
#include "lm/ngram_model.h"
#include "util/ckd_alloc.h"
#include "util/filename.h"
#include "util/pio.h"
#include "util/strfuncs.h"
#include "util/case.h"
#include "pocketsphinx_internal.h"
#include "ngram_model_internal.h"
#include "ngram_model_trie.h"
ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
const char *ext;
ext = strrchr(file_name, '.');
if (ext == NULL) {
return NGRAM_INVALID;
}
if (0 == strcmp_nocase(ext, ".gz")) {
while (--ext >= file_name) {
if (*ext == '.')
break;
}
if (ext < file_name) {
return NGRAM_INVALID;
}
}
else if (0 == strcmp_nocase(ext, ".bz2")) {
while (--ext >= file_name) {
if (*ext == '.')
break;
}
if (ext < file_name) {
return NGRAM_INVALID;
}
}
    /* Use strncmp because there might be a .gz or .bz2 on the end. */
if (0 == strncmp_nocase(ext, ".ARPA", 5))
return NGRAM_ARPA;
if (0 == strncmp_nocase(ext, ".DMP", 4)
|| 0 == strncmp_nocase(ext, ".BIN", 4))
return NGRAM_BIN;
return NGRAM_INVALID;
}
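
/*
 * Illustrative mappings for ngram_file_name_to_type(), a sketch based
 * on the logic above (not an exhaustive list):
 *
 *     "en-us.lm.arpa"  -> NGRAM_ARPA
 *     "en-us.arpa.gz"  -> NGRAM_ARPA    (compression suffix skipped)
 *     "en-us.lm.bin"   -> NGRAM_BIN
 *     "en-us.dmp.bz2"  -> NGRAM_BIN
 *     "en-us.lm"       -> NGRAM_INVALID (unrecognized extension)
 */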
ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
if (0 == strcmp_nocase(str_name, "arpa"))
return NGRAM_ARPA;
if (0 == strcmp_nocase(str_name, "dmp")
|| 0 == strcmp_nocase(str_name, "bin"))
return NGRAM_BIN;
return NGRAM_INVALID;
}
char const *
ngram_type_to_str(int type)
{
switch (type) {
case NGRAM_ARPA:
return "arpa";
case NGRAM_BIN:
return "dmp/bin";
default:
return NULL;
}
}
ngram_model_t *
ngram_model_read(cmd_ln_t * config,
const char *file_name,
ngram_file_type_t file_type, logmath_t * lmath)
{
ngram_model_t *model = NULL;
switch (file_type) {
case NGRAM_AUTO:{
if ((model =
ngram_model_trie_read_bin(config, file_name,
lmath)) != NULL)
break;
if ((model =
ngram_model_trie_read_arpa(config, file_name,
lmath)) != NULL)
break;
if ((model =
ngram_model_trie_read_dmp(config, file_name,
lmath)) != NULL)
break;
return NULL;
}
case NGRAM_ARPA:
model = ngram_model_trie_read_arpa(config, file_name, lmath);
break;
case NGRAM_BIN:
if ((model =
ngram_model_trie_read_bin(config, file_name, lmath)) != NULL)
break;
if ((model =
ngram_model_trie_read_dmp(config, file_name, lmath)) != NULL)
break;
return NULL;
default:
E_ERROR("language model file type not supported\n");
return NULL;
}
/* Now set weights based on config if present. */
if (config) {
        float32 lw = ps_config_float(config, "lw");
        float32 wip = ps_config_float(config, "wip");
ngram_model_apply_weights(model, lw, wip);
}
return model;
}
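
/*
 * Usage sketch for ngram_model_read(), illustrative only; assumes a
 * model file "en-us.lm" exists on disk:
 *
 *     logmath_t *lmath = logmath_init(1.0001, 0, 0);
 *     ngram_model_t *lm = ngram_model_read(NULL, "en-us.lm",
 *                                          NGRAM_AUTO, lmath);
 *     if (lm == NULL)
 *         E_FATAL("Failed to read language model\n");
 *     ...
 *     ngram_model_free(lm);
 *     logmath_free(lmath);
 */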
int
ngram_model_write(ngram_model_t * model, const char *file_name,
ngram_file_type_t file_type)
{
switch (file_type) {
case NGRAM_AUTO:{
file_type = ngram_file_name_to_type(file_name);
/* Default to ARPA (catches .lm and other things) */
if (file_type == NGRAM_INVALID)
file_type = NGRAM_ARPA;
return ngram_model_write(model, file_name, file_type);
}
case NGRAM_ARPA:
return ngram_model_trie_write_arpa(model, file_name);
case NGRAM_BIN:
return ngram_model_trie_write_bin(model, file_name);
default:
E_ERROR("language model file type not supported\n");
return -1;
}
E_ERROR("language model file type not supported\n");
return -1;
}
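
/*
 * Example: converting an ARPA model to the binary format, which loads
 * much faster (a sketch; file names are placeholders):
 *
 *     ngram_model_t *lm = ngram_model_read(NULL, "en-us.lm",
 *                                          NGRAM_ARPA, lmath);
 *     if (lm != NULL)
 *         ngram_model_write(lm, "en-us.lm.bin", NGRAM_BIN);
 */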
int32
ngram_model_init(ngram_model_t * base,
ngram_funcs_t * funcs,
logmath_t * lmath, int32 n, int32 n_unigram)
{
base->refcount = 1;
base->funcs = funcs;
base->n = n;
    /* Allocate the n-gram count array unless this model was
     * previously initialized. */
if (base->n_counts == NULL)
base->n_counts = (uint32 *) ckd_calloc(n, sizeof(*base->n_counts));
/* Don't reset weights if logmath object hasn't changed. */
if (base->lmath != lmath) {
/* Set default values for weights. */
base->lw = 1.0;
base->log_wip = 0; /* i.e. 1.0 */
base->log_zero = logmath_get_zero(lmath);
base->lmath = lmath;
}
/* Allocate or reallocate space for word strings. */
if (base->word_str) {
/* Free all previous word strings if they were allocated. */
if (base->writable) {
int32 i;
for (i = 0; i < base->n_words; ++i) {
ckd_free(base->word_str[i]);
base->word_str[i] = NULL;
}
}
base->word_str =
(char **) ckd_realloc(base->word_str,
n_unigram * sizeof(char *));
}
else {
base->word_str = (char **) ckd_calloc(n_unigram, sizeof(char *));
}
/* NOTE: They are no longer case-insensitive since we are allowing
* other encodings for word strings. Beware. */
if (base->wid)
hash_table_empty(base->wid);
else
base->wid = hash_table_new(n_unigram, FALSE);
base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
return 0;
}
ngram_model_t *
ngram_model_retain(ngram_model_t * model)
{
++model->refcount;
return model;
}
void
ngram_model_flush(ngram_model_t * model)
{
if (model->funcs && model->funcs->flush)
(*model->funcs->flush) (model);
}
int
ngram_model_free(ngram_model_t * model)
{
int i;
if (model == NULL)
return 0;
if (--model->refcount > 0)
return model->refcount;
if (model->funcs && model->funcs->free)
(*model->funcs->free) (model);
if (model->writable) {
/* Free all words. */
for (i = 0; i < model->n_words; ++i) {
ckd_free(model->word_str[i]);
}
}
else {
/* Free all class words. */
for (i = 0; i < model->n_classes; ++i) {
ngram_class_t *lmclass;
int32 j;
lmclass = model->classes[i];
for (j = 0; j < lmclass->n_words; ++j) {
ckd_free(model->word_str[lmclass->start_wid + j]);
}
for (j = 0; j < lmclass->n_hash; ++j) {
if (lmclass->nword_hash[j].wid != -1) {
ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
}
}
}
}
for (i = 0; i < model->n_classes; ++i) {
ngram_class_free(model->classes[i]);
}
ckd_free(model->classes);
hash_table_free(model->wid);
ckd_free(model->word_str);
ckd_free(model->n_counts);
ckd_free(model);
return 0;
}
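
/*
 * Reference-counting sketch: ngram_model_retain() increments the
 * count; ngram_model_free() decrements it and only releases the
 * model once it reaches zero:
 *
 *     ngram_model_t *lm2 = ngram_model_retain(lm);   (count 1 -> 2)
 *     ngram_model_free(lm);                          (count 2 -> 1)
 *     ngram_model_free(lm2);                         (count 1 -> 0, freed)
 */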
int
ngram_model_casefold(ngram_model_t * model, int kase)
{
int writable, i;
hash_table_t *new_wid;
/* Were word strings already allocated? */
writable = model->writable;
/* Either way, we are going to allocate some word strings. */
model->writable = TRUE;
/* And, don't forget, we need to rebuild the word to unigram ID
* mapping. */
new_wid = hash_table_new(model->n_words, FALSE);
for (i = 0; i < model->n_words; ++i) {
char *outstr;
if (writable) {
outstr = model->word_str[i];
}
else {
outstr = ckd_salloc(model->word_str[i]);
}
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                break;
            }
        }
model->word_str[i] = outstr;
/* Now update the hash table. We might have terrible
* collisions here, so warn about them. */
if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
E_WARN("Duplicate word in dictionary after conversion: %s\n",
model->word_str[i]);
}
}
/* Swap out the hash table. */
hash_table_free(model->wid);
model->wid = new_wid;
return 0;
}
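
/*
 * Example: fold every word string to lower case (strings starting
 * with '<' or '[' are left alone, as noted above):
 *
 *     ngram_model_casefold(lm, NGRAM_LOWER);
 */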
int
ngram_model_apply_weights(ngram_model_t * model, float32 lw, float32 wip)
{
return (*model->funcs->apply_weights) (model, lw, wip);
}
float32
ngram_model_get_weights(ngram_model_t * model, int32 * out_log_wip)
{
if (out_log_wip)
*out_log_wip = model->log_wip;
return model->lw;
}
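
/*
 * How the weights enter a score (this is the relation that
 * ngram_score_to_prob() below inverts):
 *
 *     score = lw * log P(word | history) + log(wip)
 *
 * where lw is the language weight, wip the word insertion penalty,
 * and all logs are in the base of the model's logmath_t.
 */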
int32
ngram_ng_score(ngram_model_t * model, int32 wid, int32 * history,
int32 n_hist, int32 * n_used)
{
int32 score, class_weight = 0;
int i;
/* Closed vocabulary, OOV word probability is zero */
if (wid == NGRAM_INVALID_WID)
return model->log_zero;
/* "Declassify" wid and history */
if (NGRAM_IS_CLASSWID(wid)) {
ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
class_weight = ngram_class_prob(lmclass, wid);
if (class_weight == 1) /* Meaning, not found in class. */
return model->log_zero;
wid = lmclass->tag_wid;
}
for (i = 0; i < n_hist; ++i) {
if (history[i] != NGRAM_INVALID_WID
&& NGRAM_IS_CLASSWID(history[i]))
history[i] =
model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
}
score = (*model->funcs->score) (model, wid, history, n_hist, n_used);
/* Multiply by unigram in-class weight. */
return score + class_weight;
}
int32
ngram_score(ngram_model_t * model, const char *word, ...)
{
va_list history;
const char *hword;
int32 *histid;
int32 n_hist;
int32 n_used;
int32 prob;
va_start(history, word);
n_hist = 0;
while ((hword = va_arg(history, const char *)) != NULL)
++n_hist;
va_end(history);
histid = ckd_calloc(n_hist, sizeof(*histid));
va_start(history, word);
n_hist = 0;
while ((hword = va_arg(history, const char *)) != NULL) {
histid[n_hist] = ngram_wid(model, hword);
++n_hist;
}
va_end(history);
prob = ngram_ng_score(model, ngram_wid(model, word),
histid, n_hist, &n_used);
ckd_free(histid);
return prob;
}
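
/*
 * Usage sketch for ngram_score(): pass the word followed by its
 * history, most recent word first, terminated by NULL.  For example
 * (assuming all three words are in the model),
 *
 *     score = ngram_score(model, "joy", "whole", "my", NULL);
 *
 * computes the weighted trigram score of "my whole joy"; use
 * ngram_probv() for the raw log-probability.
 */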
int32
ngram_tg_score(ngram_model_t * model, int32 w3, int32 w2, int32 w1,
int32 * n_used)
{
int32 hist[2];
hist[0] = w2;
hist[1] = w1;
return ngram_ng_score(model, w3, hist, 2, n_used);
}
int32
ngram_bg_score(ngram_model_t * model, int32 w2, int32 w1, int32 * n_used)
{
return ngram_ng_score(model, w2, &w1, 1, n_used);
}
int32
ngram_ng_prob(ngram_model_t * model, int32 wid, int32 * history,
int32 n_hist, int32 * n_used)
{
int32 prob, class_weight = 0;
int i;
/* Closed vocabulary, OOV word probability is zero */
if (wid == NGRAM_INVALID_WID)
return model->log_zero;
/* "Declassify" wid and history */
if (NGRAM_IS_CLASSWID(wid)) {
ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
wid = lmclass->tag_wid;
}
for (i = 0; i < n_hist; ++i) {
if (history[i] != NGRAM_INVALID_WID
&& NGRAM_IS_CLASSWID(history[i]))
history[i] =
model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
}
prob = (*model->funcs->raw_score) (model, wid, history,
n_hist, n_used);
/* Multiply by unigram in-class weight. */
return prob + class_weight;
}
int32
ngram_probv(ngram_model_t * model, const char *word, ...)
{
va_list history;
const char *hword;
int32 *histid;
int32 n_hist;
int32 n_used;
int32 prob;
va_start(history, word);
n_hist = 0;
while ((hword = va_arg(history, const char *)) != NULL)
++n_hist;
va_end(history);
histid = ckd_calloc(n_hist, sizeof(*histid));
va_start(history, word);
n_hist = 0;
while ((hword = va_arg(history, const char *)) != NULL) {
histid[n_hist] = ngram_wid(model, hword);
++n_hist;
}
va_end(history);
prob = ngram_ng_prob(model, ngram_wid(model, word),
histid, n_hist, &n_used);
ckd_free(histid);
return prob;
}
int32
ngram_prob(ngram_model_t * model, const char* const *words, int32 n)
{
int32 *ctx_id;
int32 nused;
int32 prob;
int32 wid;
uint32 i;
ctx_id = (int32 *) ckd_calloc(n - 1, sizeof(*ctx_id));
for (i = 1; i < (uint32) n; ++i)
ctx_id[i - 1] = ngram_wid(model, words[i]);
wid = ngram_wid(model, *words);
prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused);
ckd_free(ctx_id);
return prob;
}
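
/*
 * Array-based equivalent of the varargs example above (a sketch):
 * words[0] is the word itself, followed by the history, most recent
 * word first:
 *
 *     const char *words[] = { "joy", "whole", "my" };
 *     int32 logp = ngram_prob(model, words, 3);
 */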
int32
ngram_score_to_prob(ngram_model_t * base, int32 score)
{
int32 prob;
/* Undo insertion penalty. */
prob = score - base->log_wip;
/* Undo language weight. */
prob = (int32) (prob / base->lw);
return prob;
}
int32
ngram_unknown_wid(ngram_model_t * model)
{
int32 val;
/* FIXME: This could be memoized for speed if necessary. */
/* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
return NGRAM_INVALID_WID;
else
return val;
}
int32
ngram_zero(ngram_model_t * model)
{
return model->log_zero;
}
int32
ngram_model_get_size(ngram_model_t * model)
{
if (model != NULL)
return model->n;
return 0;
}
uint32 const *
ngram_model_get_counts(ngram_model_t * model)
{
if (model != NULL)
return model->n_counts;
return NULL;
}
int32
ngram_wid(ngram_model_t * model, const char *word)
{
int32 val;
if (hash_table_lookup_int32(model->wid, word, &val) == -1)
return ngram_unknown_wid(model);
else
return val;
}
const char *
ngram_word(ngram_model_t * model, int32 wid)
{
/* Remove any class tag */
wid = NGRAM_BASEWID(wid);
if (wid >= model->n_words)
return NULL;
return model->word_str[wid];
}
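
/*
 * Word/ID round trip (illustrative; assumes "hello" is in the model):
 *
 *     int32 wid = ngram_wid(model, "hello");
 *     const char *str = ngram_word(model, wid);   (str == "hello")
 *
 * Unknown words map to the <UNK> ID if the model has one, otherwise
 * to NGRAM_INVALID_WID.
 */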
/**
* Add a word to the word string and ID mapping.
*/
int32
ngram_add_word_internal(ngram_model_t * model,
const char *word, int32 classid)
{
/* Check for hash collisions. */
int32 wid;
if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
E_WARN("Omit duplicate word '%s'\n", word);
return wid;
}
/* Take the next available word ID */
wid = model->n_words;
if (classid >= 0) {
wid = NGRAM_CLASSWID(wid, classid);
}
/* Reallocate word_str if necessary. */
if (model->n_words >= model->n_1g_alloc) {
model->n_1g_alloc += UG_ALLOC_STEP;
model->word_str = ckd_realloc(model->word_str,
sizeof(*model->word_str) *
model->n_1g_alloc);
}
/* Add the word string in the appropriate manner. */
/* Class words are always dynamically allocated. */
model->word_str[model->n_words] = ckd_salloc(word);
/* Now enter it into the hash table. */
if (hash_table_enter_int32
(model->wid, model->word_str[model->n_words], wid) != wid) {
E_ERROR
("Hash insertion failed for word %s => %p (should not happen)\n",
model->word_str[model->n_words], (void *) (size_t) (wid));
}
/* Increment number of words. */
++model->n_words;
return wid;
}
int32
ngram_model_add_word(ngram_model_t * model,
const char *word, float32 weight)
{
int32 wid, prob = model->log_zero;
/* If we add word to unwritable model, we need to make it writable */
if (!model->writable) {
E_WARN("Can't add word '%s' to read-only language model. "
"Disable mmap with '-mmap no' to make it writable\n", word);
return -1;
}
wid = ngram_add_word_internal(model, word, -1);
if (wid == NGRAM_INVALID_WID)
return wid;
/* Do what needs to be done to add the word to the unigram. */
if (model->funcs && model->funcs->add_ug)
prob =
(*model->funcs->add_ug) (model, wid,
logmath_log(model->lmath, weight));
if (prob == 0)
return -1;
return wid;
}
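
/*
 * Sketch: adding a unigram at run time.  The model must be writable
 * (i.e. not memory-mapped), and the weight is a linear value that is
 * converted to log space internally:
 *
 *     int32 wid = ngram_model_add_word(lm, "pocketsphinx", 0.0001f);
 *     if (wid == -1)
 *         E_ERROR("Failed to add word\n");
 */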
ngram_class_t *
ngram_class_new(ngram_model_t * model, int32 tag_wid, int32 start_wid,
glist_t classwords)
{
ngram_class_t *lmclass;
gnode_t *gn;
float32 tprob;
int i;
lmclass = ckd_calloc(1, sizeof(*lmclass));
lmclass->tag_wid = tag_wid;
    /* start_wid is the wid (minus class tag) of the first word in the list. */
lmclass->start_wid = start_wid;
lmclass->n_words = glist_count(classwords);
lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
lmclass->nword_hash = NULL;
lmclass->n_hash = 0;
tprob = 0.0;
for (gn = classwords; gn; gn = gnode_next(gn)) {
tprob += gnode_float32(gn);
}
if (tprob > 1.1 || tprob < 0.9) {
E_INFO("Total class probability is %f, will normalize\n", tprob);
for (gn = classwords; gn; gn = gnode_next(gn)) {
gn->data.fl /= tprob;
}
}
for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
}
return lmclass;
}
int32
ngram_class_add_word(ngram_class_t * lmclass, int32 wid, int32 lweight)
{
int32 hash;
if (lmclass->nword_hash == NULL) {
/* Initialize everything in it to -1 */
lmclass->nword_hash =
ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
memset(lmclass->nword_hash, 0xff,
NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
lmclass->n_hash = NGRAM_HASH_SIZE;
lmclass->n_hash_inuse = 0;
}
    /* Simplest possible hash function: mask the word ID with the
     * table size.  This works well when this function is called
     * repeatedly with contiguous word IDs. */
hash = wid & (lmclass->n_hash - 1);
if (lmclass->nword_hash[hash].wid == -1) {
/* Good, no collision. */
lmclass->nword_hash[hash].wid = wid;
lmclass->nword_hash[hash].prob1 = lweight;
++lmclass->n_hash_inuse;
return hash;
}
else {
int32 next; /**< Next available bucket. */
/* Collision... Find the end of the hash chain. */
while (lmclass->nword_hash[hash].next != -1)
hash = lmclass->nword_hash[hash].next;
assert(hash != -1);
        /* Are any buckets still free? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so double the size of the hash table. */
lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
lmclass->n_hash * 2 *
sizeof(*lmclass->
nword_hash));
memset(lmclass->nword_hash + lmclass->n_hash, 0xff,
lmclass->n_hash * sizeof(*lmclass->nword_hash));
/* Just use the next allocated one (easy) */
next = lmclass->n_hash;
lmclass->n_hash *= 2;
}
else {
/* Look for any available bucket. We hope this doesn't happen. */
for (next = 0; next < lmclass->n_hash; ++next)
if (lmclass->nword_hash[next].wid == -1)
break;
/* This should absolutely not happen. */
assert(next != lmclass->n_hash);
}
lmclass->nword_hash[next].wid = wid;
lmclass->nword_hash[next].prob1 = lweight;
lmclass->nword_hash[hash].next = next;
++lmclass->n_hash_inuse;
return next;
}
}
void
ngram_class_free(ngram_class_t * lmclass)
{
ckd_free(lmclass->nword_hash);
ckd_free(lmclass->prob1);
ckd_free(lmclass);
}
int32
ngram_model_add_class_word(ngram_model_t * model,
const char *classname,
const char *word, float32 weight)
{
ngram_class_t *lmclass;
int32 classid, tag_wid, wid, i, scale;
float32 fprob;
/* Find the class corresponding to classname. Linear search
* probably okay here since there won't be very many classes, and
* this doesn't have to be fast. */
tag_wid = ngram_wid(model, classname);
if (tag_wid == NGRAM_INVALID_WID) {
E_ERROR("No such word or class tag: %s\n", classname);
return tag_wid;
}
for (classid = 0; classid < model->n_classes; ++classid) {
if (model->classes[classid]->tag_wid == tag_wid)
break;
}
    /* No such class.  We deliberately do not create one implicitly. */
if (classid == model->n_classes) {
E_ERROR
("Word %s is not a class tag (call ngram_model_add_class() first)\n",
classname);
return NGRAM_INVALID_WID;
}
lmclass = model->classes[classid];
/* Add this word to the model's set of words. */
wid = ngram_add_word_internal(model, word, classid);
if (wid == NGRAM_INVALID_WID)
return wid;
/* This is the fixed probability of the new word. */
fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
/* Now normalize everything else to fit it in. This is
* accomplished by simply scaling all the other probabilities
* by (1-fprob). */
scale = logmath_log(model->lmath, 1.0 - fprob);
for (i = 0; i < lmclass->n_words; ++i)
lmclass->prob1[i] += scale;
for (i = 0; i < lmclass->n_hash; ++i)
if (lmclass->nword_hash[i].wid != -1)
lmclass->nword_hash[i].prob1 += scale;
/* Now add it to the class hash table. */
return ngram_class_add_word(lmclass, wid,
logmath_log(model->lmath, fprob));
}
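
/*
 * Worked example of the renormalization above: a class holding three
 * words at probability 1/3 each gains a fourth word with weight 1.
 * Then fprob = 1/(3+0+1) = 0.25 and the existing probabilities are
 * scaled by 1 - 0.25 = 0.75, giving 1/3 * 0.75 = 0.25 each, so the
 * class still sums to one.
 */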
int32
ngram_model_add_class(ngram_model_t * model,
const char *classname,
float32 classweight,
char **words, const float32 * weights, int32 n_words)
{
ngram_class_t *lmclass;
glist_t classwords = NULL;
int32 i, start_wid = -1;
int32 classid, tag_wid;
/* Check if classname already exists in model. If not, add it. */
if ((tag_wid =
ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
tag_wid = ngram_model_add_word(model, classname, classweight);
if (tag_wid == NGRAM_INVALID_WID)
return -1;
}
if (model->n_classes == 128) {
E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
return -1;
}
classid = model->n_classes;
for (i = 0; i < n_words; ++i) {
int32 wid;
wid = ngram_add_word_internal(model, words[i], classid);
if (wid == NGRAM_INVALID_WID)
return -1;
if (start_wid == -1)
start_wid = NGRAM_BASEWID(wid);
classwords = glist_add_float32(classwords, weights[i]);
}
classwords = glist_reverse(classwords);
lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
glist_free(classwords);
if (lmclass == NULL)
return -1;
++model->n_classes;
if (model->classes == NULL)
model->classes = ckd_calloc(1, sizeof(*model->classes));
else
model->classes = ckd_realloc(model->classes,
model->n_classes *
sizeof(*model->classes));
model->classes[classid] = lmclass;
return classid;
}
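
/*
 * Usage sketch for ngram_model_add_class(); the class name, words,
 * and weights are placeholders:
 *
 *     char *words[] = { "alice", "bob" };
 *     float32 weights[] = { 0.5f, 0.5f };
 *     int32 classid = ngram_model_add_class(lm, "[NAME]", 1.0f,
 *                                           words, weights, 2);
 *
 * If "[NAME]" is not already in the model it is added as a new
 * unigram, which requires the model to be writable.
 */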
int32
ngram_class_prob(ngram_class_t * lmclass, int32 wid)
{
int32 base_wid = NGRAM_BASEWID(wid);
    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
int32 hash;
/* Look it up in the hash table. */
hash = wid & (lmclass->n_hash - 1);
while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
hash = lmclass->nword_hash[hash].next;
if (hash == -1)
return 1;
return lmclass->nword_hash[hash].prob1;
}
else {
return lmclass->prob1[base_wid - lmclass->start_wid];
}
}
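
/*
 * Sketch of the class definition file format accepted by
 * read_classdef_file(), inferred from the parsing loop below.
 * Weights are optional and default to 1.0; anything outside an
 * LMCLASS...END block is ignored:
 *
 *     LMCLASS [NAME]
 *     alice 0.5
 *     bob 0.5
 *     END [NAME]
 */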
int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
FILE *fp;
int32 is_pipe;
int inclass; /**< Are we currently reading a list of class words? */
int32 rv = -1;
gnode_t *gn;
glist_t classwords = NULL;
glist_t classprobs = NULL;
char *classname = NULL;
if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
E_ERROR("File %s not found\n", file_name);
return -1;
}
inclass = FALSE;
while (!feof(fp)) {
char line[512];
char *wptr[2];
int n_words;
if (fgets(line, sizeof(line), fp) == NULL)
break;
n_words = str2words(line, wptr, 2);
if (n_words <= 0)
continue;
if (inclass) {
/* Look for an end of class marker. */
if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
classdef_t *classdef;
gnode_t *word, *weight;
int32 i;
if (classname == NULL || 0 != strcmp(wptr[1], classname))
goto error_out;
inclass = FALSE;
/* Construct a class from the list of words collected. */
classdef = ckd_calloc(1, sizeof(*classdef));
classwords = glist_reverse(classwords);
classprobs = glist_reverse(classprobs);
classdef->n_words = glist_count(classwords);
classdef->words = ckd_calloc(classdef->n_words,
sizeof(*classdef->words));
classdef->weights = ckd_calloc(classdef->n_words,
sizeof(*classdef->weights));
word = classwords;
weight = classprobs;
for (i = 0; i < classdef->n_words; ++i) {
classdef->words[i] = gnode_ptr(word);
classdef->weights[i] = gnode_float32(weight);
word = gnode_next(word);
weight = gnode_next(weight);
}
/* Add this class to the hash table. */
if (hash_table_enter(classes, classname, classdef) !=
classdef) {
classdef_free(classdef);
goto error_out;
}
/* Reset everything. */
glist_free(classwords);
glist_free(classprobs);
classwords = NULL;
classprobs = NULL;
classname = NULL;
}
else {
float32 fprob;
if (n_words == 2)
fprob = atof_c(wptr[1]);
else
fprob = 1.0f;
/* Add it to the list of words for this class. */
classwords =
glist_add_ptr(classwords, ckd_salloc(wptr[0]));
classprobs = glist_add_float32(classprobs, fprob);
}
}
else {
/* Start a new LM class if the LMCLASS marker is seen */
if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
if (inclass)
goto error_out;
inclass = TRUE;
classname = ckd_salloc(wptr[1]);
}
/* Otherwise, just ignore whatever junk we got */
}
}
rv = 0; /* Success. */
error_out:
/* Free all the stuff we might have allocated. */
fclose_comp(fp, is_pipe);
for (gn = classwords; gn; gn = gnode_next(gn))
ckd_free(gnode_ptr(gn));
glist_free(classwords);
glist_free(classprobs);
ckd_free(classname);
return rv;
}
void
classdef_free(classdef_t * classdef)
{
int32 i;
for (i = 0; i < classdef->n_words; ++i)
ckd_free(classdef->words[i]);
ckd_free(classdef->words);
ckd_free(classdef->weights);
ckd_free(classdef);
}
int32
ngram_model_read_classdef(ngram_model_t * model, const char *file_name)
{
hash_table_t *classes;
glist_t hl = NULL;
gnode_t *gn;
int32 rv = -1;
classes = hash_table_new(0, FALSE);
if (read_classdef_file(classes, file_name) < 0) {
hash_table_free(classes);
return -1;
}
/* Create a new class in the language model for each classdef. */
hl = hash_table_tolist(classes, NULL);
for (gn = hl; gn; gn = gnode_next(gn)) {
hash_entry_t *he = gnode_ptr(gn);
classdef_t *classdef = he->val;
if (ngram_model_add_class(model, he->key, 1.0,
classdef->words,
classdef->weights,
classdef->n_words) < 0)
goto error_out;
}
rv = 0;
error_out:
for (gn = hl; gn; gn = gnode_next(gn)) {
hash_entry_t *he = gnode_ptr(gn);
ckd_free((char *) he->key);
classdef_free(he->val);
}
glist_free(hl);
hash_table_free(classes);
return rv;
}
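
/*
 * Putting it together (a sketch; "names.classdef" is a placeholder
 * file in the format sketched before read_classdef_file() above):
 *
 *     if (ngram_model_read_classdef(lm, "names.classdef") < 0)
 *         E_ERROR("Failed to read class definitions\n");
 */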