File size: 12,675 Bytes
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 2015 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced 
 * Research Projects Agency and the National Science Foundation of the 
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */

#include <string.h>

#include <pocketsphinx/err.h>

#include "util/pio.h"
#include "util/strfuncs.h"
#include "util/ckd_alloc.h"
#include "util/byteorder.h"
#include "lm/ngram_model_internal.h"
#include "lm/ngrams_raw.h"

int
ngram_ord_comparator(const void *a_raw, const void *b_raw)
{
    ngram_raw_t *a = (ngram_raw_t *) a_raw;
    ngram_raw_t *b = (ngram_raw_t *) b_raw;
    int a_w_ptr = 0;
    int b_w_ptr = 0;
    while ((uint32)a_w_ptr < a->order && (uint32)b_w_ptr < b->order) {
        if (a->words[a_w_ptr] == b->words[b_w_ptr]) {
            a_w_ptr++;
            b_w_ptr++;
            continue;
        }
        if (a->words[a_w_ptr] < b->words[b_w_ptr])
            return -1;
        else
            return 1;
    }
    return a->order - b->order;
}

static int
ngrams_raw_read_line(lineiter_t *li, hash_table_t *wid,
                    logmath_t *lmath, int order, int order_max,
                    ngram_raw_t *raw_ngram)
{
    int n, i;
    int words_expected;
    char *wptr[NGRAM_MAX_ORDER + 1];
    uint32 *word_out;

    words_expected = order + 1;
    if ((n =
         str2words(li->buf, wptr,
                   NGRAM_MAX_ORDER + 1)) < words_expected) {
        E_ERROR("Format error; %d-gram ignored at line %d\n", order, li->lineno);
        return -1;
    }

    raw_ngram->order = order;

    if (order == order_max) {
        raw_ngram->prob = atof_c(wptr[0]);
        if (raw_ngram->prob > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        raw_ngram->prob =
            logmath_log10_to_log_float(lmath, raw_ngram->prob);
    }
    else {
        float weight, backoff;

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        else {
            raw_ngram->prob =
                logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            raw_ngram->backoff = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->backoff =
                logmath_log10_to_log_float(lmath, backoff);
        }
    }
    raw_ngram->words =
        (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    for (word_out = raw_ngram->words + order - 1, i = 1;
         word_out >= raw_ngram->words; --word_out, i++) {
        hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
    }
    return 0;
}

static int
ngrams_raw_read_section(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
                      hash_table_t * wid, logmath_t * lmath, uint32 *count,
                      int order, int order_max)
{
    char expected_header[20];
    uint32 i, cur;

    sprintf(expected_header, "\\%d-grams:", order);
    while (*li && strcmp((*li)->buf, expected_header) != 0) {
	*li = lineiter_next(*li);
    }
    
    if (*li == NULL) {
	E_ERROR("Failed to find '%s', language model file truncated\n", expected_header);
	return -1;
    }
    
    *raw_ngrams = (ngram_raw_t *) ckd_calloc(*count, sizeof(ngram_raw_t));
    for (i = 0, cur = 0; i < *count && *li != NULL; i++) {
	*li = lineiter_next(*li);
        if (*li == NULL) {
	    E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
    	    return -1;
	}
        if (ngrams_raw_read_line(*li, wid, lmath, order, order_max,
                                *raw_ngrams + cur) == 0) {
            cur++;
        }
    }
    *count = cur;
    qsort(*raw_ngrams, *count, sizeof(ngram_raw_t), &ngram_ord_comparator);
    return 0;
}

ngram_raw_t **
ngrams_raw_read_arpa(lineiter_t ** li, logmath_t * lmath, uint32 * counts,
                     int order, hash_table_t * wid)
{
    ngram_raw_t **raw_ngrams;
    int order_it;

    raw_ngrams =
        (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));

    for (order_it = 2; order_it <= order; order_it++) {
        if (ngrams_raw_read_section(&raw_ngrams[order_it - 2], li, wid, lmath,
                              counts + order_it - 1, order_it, order) < 0)
        break;
    }

    /* Check if we found ARPA end-mark */
    if (*li == NULL) {
        E_ERROR("ARPA file ends without end-mark\n");
	ngrams_raw_free(raw_ngrams, counts, order);
        return NULL;
    } else {
        *li = lineiter_next(*li);
	if (strcmp((*li)->buf, "\\end\\") != 0) {
    	    E_WARN
        	("Finished reading ARPA file. Expecting end mark but found '%s'\n",
        	 (*li)->buf);
        }
    }

    return raw_ngrams;
}

static void
read_dmp_weight_array(FILE * fp, logmath_t * lmath, uint8 do_swap,
                      int32 counts, ngram_raw_t * raw_ngrams,
                      int weight_idx)
{
    int32 i, k;
    dmp_weight_t *tmp_weight_arr;

    fread(&k, sizeof(k), 1, fp);
    if (do_swap)
        SWAP_INT32(&k);
    tmp_weight_arr =
        (dmp_weight_t *) ckd_calloc(k, sizeof(*tmp_weight_arr));
    fread(tmp_weight_arr, sizeof(*tmp_weight_arr), k, fp);
    for (i = 0; i < k; i++) {
        if (do_swap)
            SWAP_INT32(&tmp_weight_arr[i].l);
        /* Convert values to log. */
        tmp_weight_arr[i].f =
            logmath_log10_to_log_float(lmath, tmp_weight_arr[i].f);
    }
    /* replace indexes with real probs in raw bigrams */
    for (i = 0; i < counts; i++) {
	if (weight_idx == 0) {
	    raw_ngrams[i].prob =
                tmp_weight_arr[(int) raw_ngrams[i].prob].f;
        } else {
	    raw_ngrams[i].backoff =
                tmp_weight_arr[(int) raw_ngrams[i].backoff].f;
        }
    }
    ckd_free(tmp_weight_arr);
}

#define BIGRAM_SEGMENT_SIZE 9

ngram_raw_t **
ngrams_raw_read_dmp(FILE * fp, logmath_t * lmath, uint32 * counts,
                    int order, uint32 * unigram_next, uint8 do_swap)
{
    uint32 j, ngram_idx;
    uint16 *bigrams_next;
    ngram_raw_t **raw_ngrams =
        (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));

    /* read bigrams */
    raw_ngrams[0] =
        (ngram_raw_t *) ckd_calloc((size_t) (counts[1] + 1),
                                   sizeof(*raw_ngrams[0]));
    bigrams_next =
        (uint16 *) ckd_calloc((size_t) (counts[1] + 1),
                              sizeof(*bigrams_next));
    ngram_idx = 1;
    for (j = 0; j <= counts[1]; j++) {
        uint16 wid, prob_idx, bo_idx;
        ngram_raw_t *raw_ngram = &raw_ngrams[0][j];

        fread(&wid, sizeof(wid), 1, fp);
        if (do_swap)
            SWAP_INT16(&wid);
        raw_ngram->order = 2;
        while (ngram_idx < counts[0] && j == unigram_next[ngram_idx]) {
            ngram_idx++;
        }
	
	if (j != counts[1]) {
            raw_ngram->words =
    		(uint32 *) ckd_calloc(2, sizeof(*raw_ngram->words));
    	    raw_ngram->words[0] = (uint32) wid;
	    raw_ngram->words[1] = (uint32) ngram_idx - 1;
	}

        fread(&prob_idx, sizeof(prob_idx), 1, fp);
        fread(&bo_idx, sizeof(bo_idx), 1, fp);
        fread(&bigrams_next[j], sizeof(bigrams_next[j]), 1, fp);
        if (do_swap) {
            SWAP_INT16(&prob_idx);
            SWAP_INT16(&bo_idx);
            SWAP_INT16(&bigrams_next[j]);
        }

	if (j != counts[1]) {
            raw_ngram->prob = prob_idx + 0.5f; /* keep index in float. ugly but avoiding using extra memory */
	    raw_ngram->backoff = bo_idx + 0.5f;
	}
    }

    if (ngram_idx < counts[0]) {
        E_ERROR("Corrupted model, not enough unigrams %d %d\n", ngram_idx, counts[0]);
        ckd_free(bigrams_next);
        ngrams_raw_free(raw_ngrams, counts, order);
        return NULL;
    }

    /* read trigrams */
    if (order > 2) {
        raw_ngrams[1] =
            (ngram_raw_t *) ckd_calloc((size_t) counts[2],
                                       sizeof(*raw_ngrams[1]));
        for (j = 0; j < counts[2]; j++) {
            uint16 wid, prob_idx;
            ngram_raw_t *raw_ngram = &raw_ngrams[1][j];

            fread(&wid, sizeof(wid), 1, fp);
            fread(&prob_idx, sizeof(prob_idx), 1, fp);
            if (do_swap) {
                SWAP_INT16(&wid);
                SWAP_INT16(&prob_idx);
            }
            
    	    raw_ngram->order = 3;
            raw_ngram->words =
                (uint32 *) ckd_calloc(3, sizeof(*raw_ngram->words));
            raw_ngram->words[0] = (uint32) wid;
            raw_ngram->prob = prob_idx + 0.5f; /* keep index in float. ugly but avoiding using extra memory */
        }
    }

    /* read prob2 */
    read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
                          raw_ngrams[0], 0);
    /* read bo2 */
    if (order > 2) {
        int32 k;
        int32 *tseg_base;
        read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
                              raw_ngrams[0], 1);
        /* read prob3 */
        read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[2],
                              raw_ngrams[1], 0);
        /* Read tseg_base size and tseg_base to fill trigram's first words */
        fread(&k, sizeof(k), 1, fp);
        if (do_swap)
            SWAP_INT32(&k);
        tseg_base = (int32 *) ckd_calloc(k, sizeof(int32));
        fread(tseg_base, sizeof(int32), k, fp);
        if (do_swap) {
            for (j = 0; j < (uint32) k; j++) {
                SWAP_INT32(&tseg_base[j]);
            }
        }
        ngram_idx = 0;
        for (j = 1; j <= counts[1]; j++) {
            uint32 next_ngram_idx =
                (uint32) (tseg_base[j >> BIGRAM_SEGMENT_SIZE] +
                          bigrams_next[j]);
            while (ngram_idx < next_ngram_idx) {
                raw_ngrams[1][ngram_idx].words[1] =
                    raw_ngrams[0][j - 1].words[0];
                raw_ngrams[1][ngram_idx].words[2] =
                    raw_ngrams[0][j - 1].words[1];
                ngram_idx++;
            }
        }
        ckd_free(tseg_base);
        
        if (ngram_idx < counts[2]) {
      	    E_ERROR("Corrupted model, some trigrams have no corresponding bigram\n");
    	    ckd_free(bigrams_next);
    	    ngrams_raw_free(raw_ngrams, counts, order);
    	    return NULL;
        }
    }
    ckd_free(bigrams_next);

    /* sort raw ngrams for reverse trie */
    qsort(raw_ngrams[0], (size_t) counts[1], sizeof(*raw_ngrams[0]),
          &ngram_ord_comparator);
    if (order > 2) {
        qsort(raw_ngrams[1], (size_t) counts[2], sizeof(*raw_ngrams[1]),
              &ngram_ord_comparator);
    }
    return raw_ngrams;
}

void
ngrams_raw_free(ngram_raw_t ** raw_ngrams, uint32 * counts, int order)
{
    uint32 num;
    int order_it;

    for (order_it = 0; order_it < order - 1; order_it++) {
        for (num = 0; num < counts[order_it + 1]; num++) {
            ckd_free(raw_ngrams[order_it][num].words);
        }
        ckd_free(raw_ngrams[order_it]);
    }
    ckd_free(raw_ngrams);
}