pocketsphinx / programs /pocketsphinx_main.c
camenduru's picture
pocketsphinx
5610573
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2022 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* This work was supported in part by funding from the Defense Advanced
* Research Projects Agency and the National Science Foundation of the
* United States of America, and the CMU Sphinx Speech Consortium.
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/**
* @file pocketsphinx.c
* @brief Simple command-line speech recognition.
*/
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <signal.h>
#include <pocketsphinx.h>
#include "util/ckd_alloc.h"
#include "config_macro.h"
#include "pocketsphinx_internal.h"
#include "ps_alignment_internal.h"
/* Le sigh. Didn't want to have to do this. */
static const ps_arg_t ps_main_args_def[] = {
POCKETSPHINX_OPTIONS,
{ "config",
ARG_STRING,
NULL,
"JSON file with configuration." },
{ "phone_align",
ARG_BOOLEAN,
"no",
"Run a second pass to align phones and print their durations "
"(DOES NOT WORK IN LIVE MODE)." },
{ "state_align",
ARG_BOOLEAN,
"no",
"Run a second pass to align phones and states and print their durations "
"(Implies -phone_align) "
"(DOES NOT WORK IN LIVE MODE)." },
CMDLN_EMPTY_OPTION
};
static int global_done = 0;
static void
catch_sig(int signum)
{
(void)signum;
global_done = 1;
}
#define HYP_FORMAT "{\"b\":%.3f,\"d\":%.3f,\"p\":%.3f,\"t\":\"%s\""
static int
format_hyp(char *outptr, int len, ps_endpointer_t *ep, ps_decoder_t *decoder)
{
logmath_t *lmath;
double prob, st, et;
const char *hyp;
lmath = ps_get_logmath(decoder);
prob = logmath_exp(lmath, ps_get_prob(decoder));
if (ep == NULL) {
st = 0.0;
et = (double)ps_get_n_frames(decoder)
/ ps_config_int(ps_get_config(decoder), "frate");
}
else {
st = ps_endpointer_speech_start(ep);
et = ps_endpointer_speech_end(ep);
}
hyp = ps_get_hyp(decoder, NULL);
if (hyp == NULL)
hyp = "";
return snprintf(outptr, len, HYP_FORMAT, st, et - st, prob, hyp);
}
static int
format_seg(char *outptr, int len, ps_seg_t *seg,
double utt_start, int frate,
logmath_t *lmath)
{
double prob, st, dur;
int sf, ef;
const char *word;
ps_seg_frames(seg, &sf, &ef);
st = utt_start + (double)sf / frate;
dur = (double)(ef + 1 - sf) / frate;
word = ps_seg_word(seg);
if (word == NULL)
word = "";
prob = logmath_exp(lmath, ps_seg_prob(seg, NULL, NULL, NULL));
len = snprintf(outptr, len, HYP_FORMAT, st, dur, prob, word);
if (outptr) {
outptr += len;
*outptr++ = '}';
*outptr = '\0';
}
len++;
return len;
}
static int
format_align_iter(char *outptr, int maxlen,
ps_alignment_iter_t *itor, double utt_start, int frate, logmath_t *lmath)
{
int start, duration, score;
double prob, st, dur;
const char *word;
score = ps_alignment_iter_seg(itor, &start, &duration);
st = utt_start + (double)start / frate;
dur = (double)duration / frate;
prob = logmath_exp(lmath, score);
word = ps_alignment_iter_name(itor);
if (word == NULL)
word = "";
return snprintf(outptr, maxlen, HYP_FORMAT, st, dur, prob, word);
}
static int
format_seg_align(char *outptr, int maxlen,
ps_alignment_iter_t *itor,
double utt_start, int frate,
logmath_t *lmath, int state_align)
{
ps_alignment_iter_t *pitor;
int len = 0, hyplen;
hyplen = format_align_iter(outptr, maxlen,
itor, utt_start, frate, lmath);
len += hyplen;
if (outptr)
outptr += hyplen;
if (maxlen)
maxlen -= hyplen;
len += 6; /* "w":,[ */
if (outptr) {
memcpy(outptr, ",\"w\":[", 6);
outptr += 6;
}
if (maxlen)
maxlen -= 6;
pitor = ps_alignment_iter_children(itor);
while (pitor != NULL) {
hyplen = format_align_iter(outptr, maxlen,
pitor, utt_start, frate, lmath);
len += hyplen;
if (outptr)
outptr += hyplen;
if (maxlen)
maxlen -= hyplen;
/* FIXME: refactor with recursion, someday */
if (state_align) {
ps_alignment_iter_t *sitor = ps_alignment_iter_children(pitor);
len += 6; /* "w":,[ */
if (outptr) {
memcpy(outptr, ",\"w\":[", 6);
outptr += 6;
}
if (maxlen)
maxlen -= 6;
while (sitor != NULL) {
hyplen = format_align_iter(outptr, maxlen,
sitor, utt_start, frate, lmath);
len += hyplen;
if (outptr)
outptr += hyplen;
if (maxlen)
maxlen -= hyplen;
len++; /* } */
if (outptr)
*outptr++ = '}';
if (maxlen)
maxlen--;
sitor = ps_alignment_iter_next(sitor);
if (sitor != NULL) {
len++;
if (outptr)
*outptr++ = ',';
if (maxlen)
maxlen--;
}
}
len++;
if (outptr)
*outptr++ = ']';
if (maxlen)
maxlen--;
}
len++; /* } */
if (outptr)
*outptr++ = '}';
if (maxlen)
maxlen--;
pitor = ps_alignment_iter_next(pitor);
if (pitor != NULL) {
len++;
if (outptr)
*outptr++ = ',';
if (maxlen)
maxlen--;
}
}
len += 2;
if (outptr) {
*outptr++ = ']';
*outptr++ = '}';
*outptr = '\0';
}
if (maxlen)
maxlen--;
return len;
}
static void
output_hyp(ps_endpointer_t *ep, ps_decoder_t *decoder, ps_alignment_t *alignment)
{
logmath_t *lmath;
char *hyp_json, *ptr;
int frate;
int maxlen, len;
double st;
int state_align = ps_config_bool(decoder->config, "state_align");
maxlen = format_hyp(NULL, 0, ep, decoder);
maxlen += 6; /* "w":,[ */
lmath = ps_get_logmath(decoder);
frate = ps_config_int(ps_get_config(decoder), "frate");
if (ep == NULL)
st = 0.0;
else
st = ps_endpointer_speech_start(ep);
if (alignment) {
ps_alignment_iter_t *itor = ps_alignment_words(alignment);
if (itor == NULL)
maxlen++; /* ] at end */
for (; itor; itor = ps_alignment_iter_next(itor)) {
maxlen += format_seg_align(NULL, 0, itor, st, frate,
lmath, state_align);
maxlen++; /* , or ] at end */
}
}
else {
ps_seg_t *itor = ps_seg_iter(decoder);
if (itor == NULL)
maxlen++; /* ] at end */
for (; itor; itor = ps_seg_next(itor)) {
maxlen += format_seg(NULL, 0, itor, st, frate, lmath);
maxlen++; /* , or ] at end */
}
}
maxlen++; /* final } */
maxlen++; /* trailing \0 */
ptr = hyp_json = ckd_calloc(maxlen, 1);
len = maxlen;
len = format_hyp(hyp_json, len, ep, decoder);
ptr += len;
maxlen -= len;
assert(maxlen > 6);
memcpy(ptr, ",\"w\":[", 6);
ptr += 6;
maxlen -= 6;
if (alignment) {
ps_alignment_iter_t *itor;
for (itor = ps_alignment_words(alignment); itor;
itor = ps_alignment_iter_next(itor)) {
assert(maxlen > 0);
len = format_seg_align(ptr, maxlen, itor, st, frate, lmath,
state_align);
ptr += len;
maxlen -= len;
*ptr++ = ',';
maxlen--;
}
}
else {
ps_seg_t *itor = ps_seg_iter(decoder);
if (itor == NULL) {
*ptr++ = ']'; /* Gets overwritten below... */
maxlen--;
}
for (; itor; itor = ps_seg_next(itor)) {
assert(maxlen > 0);
len = format_seg(ptr, maxlen, itor, st, frate, lmath);
ptr += len;
maxlen -= len;
*ptr++ = ',';
maxlen--;
}
}
--ptr;
*ptr++ = ']';
assert(maxlen == 2);
*ptr++ = '}';
--maxlen;
*ptr = '\0';
puts(hyp_json);
ckd_free(hyp_json);
}
static int
live(ps_config_t *config, FILE *infile)
{
ps_decoder_t *decoder = NULL;
ps_endpointer_t *ep = NULL;
short *frame = NULL;
size_t frame_size;
if ((decoder = ps_init(config)) == NULL) {
E_FATAL("PocketSphinx decoder init failed\n");
goto error_out;
}
if ((ep = ps_endpointer_init(0, 0.0,
0, ps_config_int(config, "samprate"),
0)) == NULL) {
E_ERROR("PocketSphinx endpointer init failed\n");
goto error_out;
}
frame_size = ps_endpointer_frame_size(ep);
if ((frame = ckd_calloc(frame_size, sizeof(frame[0]))) == NULL) {
E_ERROR("Failed to allocate frame");
goto error_out;
}
if (signal(SIGINT, catch_sig) == SIG_ERR)
E_FATAL_SYSTEM("Failed to set SIGINT handler");
while (!global_done) {
const int16 *speech;
int prev_in_speech = ps_endpointer_in_speech(ep);
size_t len, end_samples;
if ((len = fread(frame, sizeof(frame[0]),
frame_size, infile)) != frame_size) {
if (len > 0) {
speech = ps_endpointer_end_stream(ep, frame,
frame_size,
&end_samples);
}
else
break;
} else
speech = ps_endpointer_process(ep, frame);
if (speech != NULL) {
if (!prev_in_speech) {
E_INFO("Speech start at %.2f\n",
ps_endpointer_speech_start(ep));
ps_start_utt(decoder);
}
if (ps_process_raw(decoder, speech, frame_size, FALSE, FALSE) < 0) {
E_ERROR("ps_process_raw() failed\n");
goto error_out;
}
if (!ps_endpointer_in_speech(ep)) {
E_INFO("Speech end at %.2f\n",
ps_endpointer_speech_end(ep));
ps_end_utt(decoder);
if (ps_config_bool(decoder->config, "phone_align"))
E_WARN("Subword alignment not yet supported in live mode\n");
output_hyp(ep, decoder, NULL);
}
}
}
ckd_free(frame);
ps_endpointer_free(ep);
ps_free(decoder);
return 0;
error_out:
if (frame)
ckd_free(frame);
if (ep)
ps_endpointer_free(ep);
if (decoder)
ps_free(decoder);
return -1;
}
static int
decode_single(ps_decoder_t *decoder, FILE *infile)
{
ps_alignment_t *alignment = NULL;
size_t data_size, block_size;
short *data, *ptr;
int rv = 0;
data_size = 65536;
block_size = 2048;
ptr = data = ckd_calloc(data_size, sizeof(*data));
if (signal(SIGINT, catch_sig) == SIG_ERR)
E_FATAL_SYSTEM("Failed to set SIGINT handler");
while (!global_done) {
size_t len;
if ((size_t)(ptr + block_size - data) > data_size) {
len = ptr - data;
data_size *= 2;
data = ckd_realloc(data, data_size * sizeof(*data));
ptr = data + len;
}
len = fread(ptr, sizeof(*ptr), block_size, infile);
if (len == 0) {
if (feof(infile))
break;
else {
E_ERROR_SYSTEM("Failed to read %d bytes\n",
sizeof(*ptr) * block_size);
rv = -1;
goto error_out;
}
}
ptr += len;
}
if ((rv = ps_start_utt(decoder)) < 0)
goto error_out;
if ((rv = ps_process_raw(decoder, data, ptr - data, FALSE, TRUE)) < 0) {
E_ERROR("ps_process_raw() failed\n");
goto error_out;
}
if ((rv = ps_end_utt(decoder)) < 0)
goto error_out;
if (ps_config_bool(decoder->config, "phone_align")) {
if (ps_set_alignment(decoder, NULL) < 0)
goto error_out;
if ((rv = ps_start_utt(decoder)) < 0)
goto error_out;
if ((rv = ps_process_raw(decoder, data, ptr - data, FALSE, TRUE)) < 0) {
E_ERROR("ps_process_raw() failed\n");
goto error_out;
}
if ((rv = ps_end_utt(decoder)) < 0)
goto error_out;
if ((alignment = ps_get_alignment(decoder)) == NULL)
goto error_out;
ps_activate_search(decoder, NULL);
}
output_hyp(NULL, decoder, alignment);
/* Fall through intentionally */
error_out:
ckd_free(data);
return rv;
}
static int
single(ps_config_t *config, FILE *infile)
{
ps_decoder_t *decoder;
int rv = 0;
if ((decoder = ps_init(config)) == NULL) {
E_FATAL("PocketSphinx decoder init failed\n");
return -1;
}
rv = decode_single(decoder, infile);
ps_free(decoder);
return rv;
}
static char *
string_array_join(char **strings, int nstrings)
{
char *joined, *ptr;
int i, *len, jlen;
len = ckd_malloc(nstrings * sizeof(*len));
for (jlen = i = 0; i < nstrings; ++i) {
len[i] = strlen(strings[i]);
jlen += len[i] + 1;
}
ptr = joined = ckd_malloc(jlen);
for (i = 0; i < nstrings; ++i) {
memcpy(ptr, strings[i], len[i]);
ptr += len[i];
*ptr++ = ' ';
}
*--ptr = '\0';
ckd_free(len);
return joined;
}
static int
align(ps_config_t *config, char **inputs, int ninputs)
{
int rv = 0, is_stdin = FALSE;
ps_decoder_t *decoder = NULL;
char *text = NULL;
FILE *fh = NULL;
if (ninputs < 2) {
E_ERROR("Usage: pocketsphinx align INFILE TEXT...\n");
return -1;
}
/* Please do not use bestpath for alignment. */
ps_config_set_bool(config, "bestpath", FALSE);
ps_config_set_str(config, "lm", NULL);
if (0 == strcmp(inputs[0], "-")) {
is_stdin = TRUE;
fh = stdin;
}
else if ((fh = fopen(inputs[0], "rb")) == NULL) {
E_ERROR_SYSTEM("Failed to open %s for input", inputs[0]);
goto error_out;
}
if ((rv = ps_config_soundfile(config, fh, inputs[0])) < 0)
goto error_out;
if ((decoder = ps_init(config)) == NULL) {
E_FATAL("PocketSphinx decoder init failed\n");
rv = -1;
goto error_out;
}
text = string_array_join(inputs + 1, ninputs - 1);
if ((rv = ps_set_align_text(decoder, text)) < 0)
goto error_out;
rv = decode_single(decoder, fh);
/* Fall through intentionally. */
error_out:
if (fh && !is_stdin)
fclose(fh);
if (text)
ckd_free(text);
if (decoder)
ps_free(decoder);
return rv;
}
#if 0
static int sample_rates[] = {
8000,
11025,
16000,
22050,
32000,
44100,
48000
};
static const int n_sample_rates = sizeof(sample_rates)/sizeof(sample_rates[0]);
static int
minimum_samprate(ps_config_t *config)
{
double upperf = ps_config_float(config, "upperf");
int nyquist = (int)(upperf * 2);
int i;
for (i = 0; i < n_sample_rates; ++i)
if (sample_rates[i] >= nyquist)
break;
if (i == n_sample_rates)
E_FATAL("Unable to find sampling rate for -upperf %f\n", upperf);
return sample_rates[i];
}
#endif
#define SOX_FORMAT "-r %d -c 1 -b 16 -e signed-integer -t raw -"
static int
soxflags(ps_config_t *config)
{
int maxlen, len;
int samprate;
char *args;
/* Get feature extraction parameters. */
ps_expand_model_config(config);
samprate = ps_config_int(config, "samprate");
maxlen = snprintf(NULL, 0, SOX_FORMAT, samprate);
if (maxlen < 0) {
E_ERROR_SYSTEM("Failed to snprintf()");
return -1;
}
maxlen++;
args = ckd_calloc(maxlen, 1);
len = snprintf(args, maxlen, SOX_FORMAT, samprate);
if (len != maxlen - 1) {
E_ERROR_SYSTEM("Failed to snprintf()");
return -1;
}
puts(args);
fflush(stdout);
ckd_free(args);
return 0;
}
static char *
find_command(int *argc, char **argv)
{
int i;
for (i = 1; i < *argc; i += 2) {
char *arg = argv[i];
if (arg && arg[0] && arg[0] != '-') {
memmove(&argv[i],
&argv[i + 1],
(*argc - i - 1) * sizeof(argv[i]));
--*argc;
return arg;
}
}
return "live";
}
static char **
find_inputs(int *argc, char **argv, int *ninputs)
{
char **inputs = NULL;
int i = 1;
*ninputs = 0;
while (i < *argc) {
char *arg = argv[i];
/* Bubble-bogo-bobo-backward-sort them to the end of argv. */
if (arg && arg[0]
/* "-" on its own is an input, otherwise, - starts args. */
&& (arg[0] != '-' || arg[1] == '\0')) {
memmove(&argv[i],
&argv[i + 1],
(*argc - i - 1) * sizeof(argv[i]));
--*argc;
argv[*argc] = arg;
inputs = &argv[*argc];
++*ninputs;
}
else
i += 2;
}
/* Now reverse them. I won't be passing Google's coding interview
any time soon, not that it matters in this particular case. */
for (i = 0; i < *ninputs / 2; ++i) {
char *tmp = inputs[i];
inputs[i] = inputs[*ninputs - i - 1];
inputs[*ninputs - i - 1] = tmp;
}
return inputs;
}
int
process_inputs(int (*func)(ps_config_t *, FILE *),
ps_config_t *config,
char **inputs, int ninputs)
{
int rv = 0;
if (ninputs == 0)
return func(config, stdin);
else {
int i, rv_one;
for (i = 0; i < ninputs; ++i) {
char *file = inputs[i];
int is_stdin = FALSE;
FILE *fh;
if (0 == strcmp(file, "-")) {
is_stdin = TRUE;
fh = stdin;
}
else if ((fh = fopen(file, "rb")) == NULL) {
E_ERROR_SYSTEM("Failed to open %s for input", file);
rv = -1;
continue;
}
if ((rv_one = ps_config_soundfile(config, fh, file)) < 0) {
fclose(fh);
rv = rv_one;
continue;
}
if ((rv_one = func(config, fh)) < 0) {
rv = rv_one;
E_ERROR("Recognition failed on %s\n", file);
}
if (!is_stdin)
fclose(fh);
}
}
return rv;
}
static int
print_config(ps_config_t *config)
{
if (puts(ps_config_serialize_json(config)) < 0)
return -1;
return 0;
}
void
usage(char *name, int help_config)
{
fprintf(stderr, "Usage: %s [PARAMS] [soxflags | config | help | help-config | live | single | align] INPUTS...\n", name);
fprintf(stderr, "Examples:\n");
fprintf(stderr, "\tsox input.mp3 $(%s soxflags) | %s single -\n", name, name);
fprintf(stderr, "\tsox -qd $(%s soxflags) | %s live -\n", name, name);
fprintf(stderr, "\t%s single INPUT\n", name);
fprintf(stderr, "\t%s align INPUT WORDS...\n", name);
fprintf(stderr, "\nFor detailed PARAMS values, run %s help-config\n", name);
if (help_config) {
err_set_loglevel(ERR_INFO);
cmd_ln_log_help_r(NULL, ps_args());
}
}
int
main(int argc, char *argv[])
{
ps_config_t *config;
const char *conffile;
char *command;
char **inputs;
int rv, ninputs;
command = find_command(&argc, argv);
inputs = find_inputs(&argc, argv, &ninputs);
/* Only soxflags and config take no arguments */
if (ninputs == 0) {
if ((0 != strcmp(command, "soxflags"))
&& 0 != strcmp(command, "config")
&& 0 != strcmp(command, "help-config")) {
usage(argv[0], FALSE);
return 1;
}
}
/* If arg parsing fails */
if ((config = ps_config_parse_args(ps_main_args_def, argc, argv)) == NULL) {
usage(argv[0], FALSE);
return 1;
}
ps_default_search_args(config);
if (ps_config_bool(config, "state_align"))
ps_config_set_bool(config, "phone_align", TRUE);
if ((conffile = ps_config_str(config, "config")) != NULL) {
char *json;
FILE *infh;
size_t len;
if ((infh = fopen(conffile, "rt")) == NULL) {
E_ERROR_SYSTEM("Failed to open config file %s", conffile);
return 1;
}
fseek(infh, 0, SEEK_END);
len = (size_t)ftell(infh);
fseek(infh, 0, SEEK_SET);
json = ckd_malloc(len + 1);
if (fread(json, 1, len, infh) != len) {
E_ERROR_SYSTEM("Failed to read config file %s", conffile);
ckd_free(json);
fclose(infh);
return 1;
}
json[len] = '\0';
fclose(infh);
config = ps_config_parse_json(config, json);
ckd_free(json);
if (config == NULL)
return 1;
ps_config_set_str(config, "config", NULL);
}
if (0 == strcmp(command, "soxflags"))
rv = soxflags(config);
else if (0 == strcmp(command, "config"))
rv = print_config(config);
else if (0 == strcmp(command, "live"))
rv = process_inputs(live, config, inputs, ninputs);
else if (0 == strcmp(command, "single"))
rv = process_inputs(single, config, inputs, ninputs);
else if (0 == strcmp(command, "align"))
rv = align(config, inputs, ninputs);
else if (0 == strcmp(command, "help")) {
rv = 0;
usage(argv[0], FALSE);
}
else if (0 == strcmp(command, "help-config")) {
rv = 0;
usage(argv[0], TRUE);
}
else {
E_ERROR("Unknown command \"%s\"\n", command);
return 1;
}
ps_config_free(config);
return rv;
}