File size: 7,352 Bytes
5610573 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2007 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* This work was supported in part by funding from the Defense Advanced
* Research Projects Agency and the National Science Foundation of the
* United States of America, and the CMU Sphinx Speech Consortium.
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/**
 * \file ngram_model_internal.h
 * \brief Internal structures for N-Gram models
 *
 * Author: David Huggins-Daines <dhdaines@gmail.com>
 */
#ifndef __NGRAM_MODEL_INTERNAL_H__
#define __NGRAM_MODEL_INTERNAL_H__
#include <stdint.h>

#include "lm/ngram_model.h"
#include "util/hash_table.h"
/**
 * Common implementation of ngram_model_t.
 *
 * The details of bigram, trigram, and higher-order N-gram storage, if any, can
 * vary somewhat depending on the file format in use; format-specific behavior
 * is supplied through the funcs method table below.
 */
struct ngram_model_s {
    int refcount;       /**< Reference count */
    uint32 *n_counts;   /**< Counts for 1, 2, 3, ... grams */
    int32 n_1g_alloc;   /**< Number of allocated word strings (for new word addition) */
    int32 n_words;      /**< Number of actual word strings (NOT the same as the
                             number of unigrams, due to class words). */
    uint8 n;            /**< This is an n-gram model (1, 2, 3, ...). */
    uint8 n_classes;    /**< Number of classes (maximum 128) */
    uint8 writable;     /**< Are word strings writable? */
    uint8 flags;        /**< Any other flags we might care about
                             (FIXME: Merge this and writable) */
    logmath_t *lmath;   /**< Log-math object; all probabilities below are in
                             its log domain. */
    float32 lw;         /**< Language model scaling factor */
    int32 log_wip;      /**< Log of word insertion penalty */
    int32 log_zero;     /**< Zero probability, cached here for quick lookup */
    char **word_str;    /**< Unigram names, indexed by base word ID */
    hash_table_t *wid;  /**< Mapping of unigram names to word IDs. */
    int32 *tmp_wids;    /**< Temporary array of word IDs for ngram_model_get_ngram() */
    struct ngram_class_s **classes; /**< Word class definitions (n_classes entries). */
    struct ngram_funcs_s *funcs;    /**< Implementation-specific methods. */
};
/**
 * Implementation of ngram_class_t.
 *
 * A class maps a contiguous range of base word IDs
 * [start_wid, start_wid + n_words) to in-class probabilities stored in
 * prob1; words beyond that initial range go into the nword_hash
 * open-hash table (NOTE(review): split between prob1 and nword_hash
 * inferred from the field comments -- confirm against ngram_class_prob()
 * in the implementation file).
 */
struct ngram_class_s {
    int32 tag_wid;  /**< Base word ID for this class tag */
    int32 start_wid; /**< Starting base word ID for this class' words */
    int32 n_words;   /**< Number of base words for this class */
    int32 *prob1;    /**< Probability table for base words */
    /**
     * Custom hash table for additional words.
     *
     * Collisions are resolved by chaining through bucket indices rather
     * than pointers (next is an index into this same array).
     */
    struct ngram_hash_s {
        int32 wid;    /**< Word ID of this bucket */
        int32 prob1;  /**< Probability for this word */
        int32 next;   /**< Index of next bucket (or -1 for no collision) */
    } *nword_hash;
    int32 n_hash;       /**< Number of buckets in nword_hash (power of 2) */
    int32 n_hash_inuse; /**< Number of words in nword_hash */
};
/** Maximum N-Gram order supported by this implementation. */
#define NGRAM_MAX_ORDER 5
/** Initial number of buckets in ngram_class_s::nword_hash (power of 2). */
#define NGRAM_HASH_SIZE 128

/*
 * Class word IDs pack three fields into one 32-bit value:
 *
 *   bit  31     : "this is a class word" flag
 *   bits 30..24 : class ID (0..127, see n_classes limit above)
 *   bits 23..0  : base word ID
 *
 * All bit arithmetic is done on uint32_t: word IDs are normally carried
 * around as int32, and a class word ID has the sign bit set, so shifting
 * or masking it as a signed value would be implementation-defined
 * (right shift of a negative value, C11 6.5.7).
 */
/** Extract the 24-bit base word ID from a (possibly class) word ID. */
#define NGRAM_BASEWID(wid)  ((uint32_t)(wid) & 0xffffff)
/** Extract the 7-bit class ID from a class word ID. */
#define NGRAM_CLASSID(wid)  (((uint32_t)(wid) >> 24) & 0x7f)
/** Build a class word ID from a base word ID and a class ID. */
#define NGRAM_CLASSWID(wid,classid) \
    (((uint32_t)(classid) << 24) | 0x80000000u | (uint32_t)(wid))
/** Non-zero iff the word ID has the class flag bit set. */
#define NGRAM_IS_CLASSWID(wid) ((uint32_t)(wid) & 0x80000000u)

/** Allocation increment (in words) when growing the unigram arrays. */
#define UG_ALLOC_STEP 10
/**
 * Implementation-specific functions for operating on ngram_model_t objects.
 *
 * Each concrete model format supplies one of these method tables via
 * ngram_model_s::funcs.
 */
typedef struct ngram_funcs_s {
    /**
     * Implementation-specific function for freeing an ngram_model_t.
     */
    void (*free) (ngram_model_t * model);
    /**
     * Implementation-specific function for applying language model
     * weights (scaling factor lw and word insertion penalty wip).
     */
    int (*apply_weights) (ngram_model_t * model, float32 lw, float32 wip);
    /**
     * Implementation-specific function for querying language model score
     * (i.e. with lw and wip applied).
     *
     * history holds the n_hist preceding word IDs; n_used is an output
     * (NOTE(review): presumably the number of history words actually
     * consumed after backoff -- confirm against the public ngram_model.h
     * documentation).
     */
    int32(*score) (ngram_model_t * model,
                   int32 wid,
                   int32 * history, int32 n_hist, int32 * n_used);
    /**
     * Implementation-specific function for querying raw language
     * model probability (i.e. without lw and wip applied).
     */
    int32(*raw_score) (ngram_model_t * model,
                       int32 wid,
                       int32 * history, int32 n_hist, int32 * n_used);
    /**
     * Implementation-specific function for adding unigrams.
     *
     * This function updates the internal structures of a language
     * model to add the given unigram with the given weight (defined
     * as a log-factor applied to the uniform distribution).  This
     * includes reallocating or otherwise resizing the set of unigrams.
     *
     * @return The language model score (not raw log-probability) of
     *         the new word, or 0 for failure.
     */
    int32(*add_ug) (ngram_model_t * model, int32 wid, int32 lweight);
    /**
     * Implementation-specific function for purging N-Gram cache.
     */
    void (*flush) (ngram_model_t * model);
} ngram_funcs_t;
/**
 * One class definition from a classdef file.
 */
typedef struct classdef_s {
    char **words;     /**< Array of word strings belonging to this class. */
    float32 *weights; /**< Per-word weights, parallel to words
                           (NOTE(review): presumably in-class probabilities
                           -- confirm against read_classdef_file()). */
    int32 n_words;    /**< Number of entries in words and weights. */
} classdef_t;
/**
 * Initialize the base ngram_model_t structure.
 *
 * @param model     Model structure to initialize (allocated by the caller).
 * @param funcs     Implementation-specific method table for this model.
 * @param lmath     Log-math object used for all probability values.
 * @param n         Order of the model (1 = unigram, 2 = bigram, ...).
 * @param n_unigram Initial number of unigrams to allocate.
 * @return 0 for success (NOTE(review): error convention inferred from the
 *         int32 return type -- confirm in the implementation file).
 */
int32
ngram_model_init(ngram_model_t * model,
                 ngram_funcs_t * funcs,
                 logmath_t * lmath, int32 n, int32 n_unigram);
/**
 * Read a class definition (classdef) file.
 *
 * @param classes       Hash table to populate with classdef_t entries,
 *                      presumably keyed by class name -- TODO confirm
 *                      against the implementation.
 * @param classdef_file Path of the classdef file to read.
 * @return 0 for success, negative on failure (NOTE(review): convention
 *         inferred from the int32 return type -- confirm in the .c file).
 */
int32 read_classdef_file(hash_table_t * classes,
                         const char *classdef_file);
/**
 * Free a class definition previously obtained from read_classdef_file().
 *
 * @param classdef Class definition to deallocate.
 */
void classdef_free(classdef_t * classdef);
/**
 * Allocate and initialize an N-Gram class.
 *
 * @param model      Model the class will belong to.
 * @param tag_wid    Base word ID of the class tag.
 * @param start_wid  First base word ID assigned to words in this class.
 * @param classwords List of words making up the class (NOTE(review):
 *                   element type of this glist is not visible here --
 *                   confirm against the caller).
 * @return Newly allocated class; free with ngram_class_free().
 */
ngram_class_t *ngram_class_new(ngram_model_t * model, int32 tag_wid,
                               int32 start_wid, glist_t classwords);
/**
 * Deallocate an N-Gram class created by ngram_class_new().
 *
 * @param lmclass Class to deallocate.
 */
void ngram_class_free(ngram_class_t * lmclass);
/**
 * Get the in-class log probability for a word in an N-Gram class.
 *
 * @param lmclass Class to look the word up in.
 * @param wid     Word ID to look up.
 * @return This log probability, or 1 if word not found (1 is an
 *         out-of-band sentinel since log probabilities are <= 0).
 */
int32 ngram_class_prob(ngram_class_t * lmclass, int32 wid);
#endif /* __NGRAM_MODEL_INTERNAL_H__ */
|