camenduru's picture
pocketsphinx-20.04
37a92a9
/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 2022 David Huggins-Daines. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
/**
* @file vad.h
* @brief Simple voice activity detection
*
* Because doxygen is Bad Software, the actual documentation can only
* exist in \ref ps_vad_t. Sorry about that.
*/
#ifndef __PS_VAD_H__
#define __PS_VAD_H__
#include <pocketsphinx/prim_type.h>
#include <pocketsphinx/export.h>
/* Give every declaration in this header C linkage when it is included
 * from C++. */
#ifdef __cplusplus
extern "C" {
#endif
/* Never compiled.  Presumably present only so that editors and
 * indenters see a balanced brace for the `extern "C" {` above and do
 * not indent the rest of the file -- TODO confirm intent. */
#if 0
}
#endif
/**
 * @struct ps_vad_t pocketsphinx/vad.h
 * @brief Voice activity detector.
 *
 * Opaque handle: this header only forward-declares the structure, so
 * callers can hold pointers to it but must go through the ps_vad_*()
 * functions to use it.  Objects are created with ps_vad_init(),
 * shared with ps_vad_retain() and released with ps_vad_free().
 */
typedef struct ps_vad_s ps_vad_t;
/**
 * @enum ps_vad_mode_e pocketsphinx/vad.h
 * @brief Voice activity detection "aggressiveness" levels.
 *
 * The numeric values are spelled out explicitly and, as the names
 * suggest, run from the least strict setting to the strictest one.
 */
enum ps_vad_mode_e {
    PS_VAD_LOOSE = 0,         /**< Least strict level. */
    PS_VAD_MEDIUM_LOOSE = 1,  /**< Between loose and medium-strict. */
    PS_VAD_MEDIUM_STRICT = 2, /**< Between medium-loose and strict. */
    PS_VAD_STRICT = 3         /**< Strictest level. */
};

/** Type name for values of enum ps_vad_mode_e. */
typedef enum ps_vad_mode_e ps_vad_mode_t;
/**
 * @enum ps_vad_class_e pocketsphinx/vad.h
 * @brief Result of classifying one input frame with ps_vad_classify().
 */
enum ps_vad_class_e {
    PS_VAD_ERROR = -1,     /**< The frame could not be classified. */
    PS_VAD_NOT_SPEECH = 0, /**< The frame was classified as non-speech. */
    PS_VAD_SPEECH = 1      /**< The frame was classified as speech. */
};

/** Type name for values of enum ps_vad_class_e. */
typedef enum ps_vad_class_e ps_vad_class_t;
/**
 * Default sampling rate for voice activity detector, in samples per
 * second.
 *
 * NOTE(review): presumably this is the rate selected when 0 is passed
 * as sample_rate to ps_vad_init() or ps_vad_set_input_params() --
 * confirm against the implementation.
 */
#define PS_VAD_DEFAULT_SAMPLE_RATE 16000
/**
 * Default frame length for voice activity detector, in seconds.
 *
 * NOTE(review): presumably this is the length selected when 0.0 is
 * passed as frame_length to ps_vad_init() or
 * ps_vad_set_input_params() -- confirm against the implementation.
 */
#define PS_VAD_DEFAULT_FRAME_LENGTH 0.03
/**
 * Initialize voice activity detection.
 *
 * The returned detector is released with ps_vad_free().
 *
 * @memberof ps_vad_t
 * @param mode "Aggressiveness" of voice activity detection.  Stricter
 *             values (see ps_vad_mode_t) are less likely to
 *             misclassify non-speech as speech.
 * @param sample_rate Sampling rate of input, or 0 for default (which can
 *                    be obtained with ps_vad_sample_rate()).  Only 8000,
 *                    16000, 32000, 48000 are directly supported.  See
 *                    ps_vad_set_input_params() for more information.
 * @param frame_length Frame length in seconds, or 0.0 for the default.  Only
 *                     0.01, 0.02, 0.03 currently supported.  **Actual** value
 *                     may differ, you must use ps_vad_frame_length() to
 *                     obtain it.
 * @return VAD object or NULL on failure (invalid parameter for instance).
 */
POCKETSPHINX_EXPORT
ps_vad_t *ps_vad_init(ps_vad_mode_t mode, int sample_rate, double frame_length);
/**
 * Retain a pointer to voice activity detector.
 *
 * Increments the detector's reference count.  Each retained pointer
 * is later given back with ps_vad_free().
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector.
 * @return Voice activity detector with incremented reference count.
 */
POCKETSPHINX_EXPORT
ps_vad_t *ps_vad_retain(ps_vad_t *vad);
/**
 * Release a pointer to voice activity detector.
 *
 * Gives up one reference to the detector.  A return value of 0 means
 * the detector itself has been freed, so the pointer must not be used
 * afterwards.
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector.
 * @return New reference count (0 if freed).
 */
POCKETSPHINX_EXPORT
int ps_vad_free(ps_vad_t *vad);
/**
 * Set the input parameters for voice activity detection.
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector.
 * @param sample_rate Sampling rate of input, or 0 for default (which can
 *                    be obtained with ps_vad_sample_rate()).  Only 8000,
 *                    16000, 32000, 48000 are directly supported, others
 *                    will use the closest supported rate (within reason).
 *                    Note that this means that the actual frame length
 *                    may not be exactly the one requested, so you must
 *                    always use the one returned by ps_vad_frame_size()
 *                    (in samples) or ps_vad_frame_length() (in seconds).
 * @param frame_length Requested frame length in seconds, or 0.0 for the
 *                     default.  Only 0.01, 0.02, 0.03 currently supported.
 *                     **Actual frame length may be different, you must
 *                     always use ps_vad_frame_length() to obtain it.**
 * @return 0 for success or -1 on error.
 */
POCKETSPHINX_EXPORT
int ps_vad_set_input_params(ps_vad_t *vad, int sample_rate, double frame_length);
/**
 * Get the sampling rate expected by voice activity detection.
 *
 * This is the rate to consult when 0 (the default) was requested in
 * ps_vad_init() or ps_vad_set_input_params().
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector.
 * @return Expected sampling rate, in samples per second.
 */
POCKETSPHINX_EXPORT
int ps_vad_sample_rate(ps_vad_t *vad);
/**
 * Get the number of samples expected by voice activity detection.
 *
 * You **must** always ensure that the buffers passed to
 * ps_vad_classify() contain this number of samples (zero-pad them if
 * necessary).  Use this value rather than one computed from the
 * requested frame length, since the actual frame length may differ
 * from the request.
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector.
 * @return Size, in samples, of the frames passed to ps_vad_classify().
 */
POCKETSPHINX_EXPORT
size_t ps_vad_frame_size(ps_vad_t *vad);
/**
 * Get the *actual* length of a frame in seconds.
 *
 * This may differ from the value requested in ps_vad_set_input_params().
 * It is computed as ps_vad_frame_size() divided by ps_vad_sample_rate().
 *
 * This is a macro and its argument is expanded twice, so do not pass
 * an expression with side effects.
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector (ps_vad_t pointer).
 * @return Frame length in seconds, as a double.
 */
#define ps_vad_frame_length(vad) ((double)ps_vad_frame_size(vad) / ps_vad_sample_rate(vad))
/**
 * Classify a frame as speech or not speech.
 *
 * No length is passed alongside the buffer, so the caller is
 * responsible for supplying a frame of the expected size.
 *
 * @memberof ps_vad_t
 * @param vad Voice activity detector.
 * @param frame Frame of input samples (int16), **must** contain the
 *              number of samples returned by ps_vad_frame_size().
 * @return PS_VAD_SPEECH, PS_VAD_NOT_SPEECH, or PS_VAD_ERROR (see
 *         ps_vad_class_t).
 */
POCKETSPHINX_EXPORT
ps_vad_class_t ps_vad_classify(ps_vad_t *vad, const int16 *frame);
#ifdef __cplusplus
}
#endif
#endif /* __PS_VAD_H__ */