/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 2022 David Huggins-Daines. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
/**
* @file endpointer.h
* @brief VAD-based endpointer for PocketSphinx
*
* Because doxygen is Bad Software, the actual documentation can only
* exist in \ref ps_endpointer_t. Sorry about that.
*/
#ifndef __PS_ENDPOINTER_H__
#define __PS_ENDPOINTER_H__
#include <pocketsphinx/prim_type.h>
#include <pocketsphinx/export.h>
#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif
#include <pocketsphinx/prim_type.h>
#include <pocketsphinx/export.h>
#include <pocketsphinx/vad.h>
/**
 * @struct ps_endpointer_t pocketsphinx/endpointer.h
 * @brief Simple voice activity detection based endpointing
 *
 * This is an opaque handle: this header only forward-declares the
 * underlying struct ps_endpointer_s.  Create one with
 * ps_endpointer_init(), share it with ps_endpointer_retain() and
 * release it with ps_endpointer_free().
 */
typedef struct ps_endpointer_s ps_endpointer_t;
/**
 * Default window, in seconds of audio, used for the speech start/end
 * decision.
 *
 * This is the value ps_endpointer_init() is documented to use when
 * its window argument is 0.
 */
#define PS_ENDPOINTER_DEFAULT_WINDOW 0.3
/**
 * Default ratio of frames in the window needed to trigger the speech
 * start/end decision.
 *
 * This is the value ps_endpointer_init() is documented to use when
 * its ratio argument is 0.
 */
#define PS_ENDPOINTER_DEFAULT_RATIO 0.9
/**
 * Initialize endpointing.
 *
 * @memberof ps_endpointer_t
 * @param window Seconds of audio to use in speech start/end decision,
 *               or 0 to use the default (PS_ENDPOINTER_DEFAULT_WINDOW).
 * @param ratio Ratio of frames needed to trigger start/end decision,
 *              or 0 for the default (PS_ENDPOINTER_DEFAULT_RATIO).
 * @param mode "Aggressiveness" of voice activity detection.  Stricter
 *             values (see ps_vad_mode_t) are less likely to
 *             misclassify non-speech as speech.
 * @param sample_rate Sampling rate of input, or 0 for default (which
 *                    can be obtained with ps_vad_sample_rate()).  Only
 *                    8000, 16000, 32000, 48000 are directly supported,
 *                    others will use the closest supported rate
 *                    (within reason).  Note that this means that the
 *                    actual frame length may not be exactly the one
 *                    requested, so you must always use the one
 *                    returned by ps_endpointer_frame_size() (in
 *                    samples) or ps_endpointer_frame_length() (in
 *                    seconds).
 * @param frame_length Requested frame length in seconds, or 0.0 for
 *                     the default.  Only 0.01, 0.02, 0.03 currently
 *                     supported.  **Actual frame length may be
 *                     different, you must always use
 *                     ps_endpointer_frame_length() to obtain it.**
 * @return Endpointer object, or NULL on failure (an invalid parameter,
 *         for instance).
 */
POCKETSPHINX_EXPORT
ps_endpointer_t *ps_endpointer_init(double window,
double ratio,
ps_vad_mode_t mode,
int sample_rate, double frame_length);
/**
 * Retain a pointer to endpointer.
 *
 * Use ps_endpointer_free() to release the reference again.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return Endpointer with incremented reference count.
 */
POCKETSPHINX_EXPORT
ps_endpointer_t *ps_endpointer_retain(ps_endpointer_t *ep);
/**
 * Release a pointer to endpointer.
 *
 * The endpointer is reference counted (see ps_endpointer_retain());
 * a return value of 0 means the object has been freed and must not
 * be used any more.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return New reference count (0 if freed).
 */
POCKETSPHINX_EXPORT
int ps_endpointer_free(ps_endpointer_t *ep);
/**
 * Get the voice activity detector used by the endpointer.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return VAD object.  The endpointer retains ownership of this
 *         object, so you must use ps_vad_retain() if you wish to use
 *         it outside of the lifetime of the endpointer.
 */
POCKETSPHINX_EXPORT
ps_vad_t *ps_endpointer_vad(ps_endpointer_t *ep);
/**
 * Get the frame size (in samples) consumed by the endpointer.
 *
 * Multiply this by 2 to get the size of the frame buffer required
 * (frames are passed as arrays of int16 samples).
 *
 * Implemented as a macro that forwards to ps_vad_frame_size() on the
 * detector returned by ps_endpointer_vad().
 *
 * @param ep Endpointer.
 * @return Frame size in samples, as reported by ps_vad_frame_size().
 */
#define ps_endpointer_frame_size(ep) ps_vad_frame_size(ps_endpointer_vad(ep))
/**
 * Get the frame length (in seconds) consumed by the endpointer.
 *
 * Implemented as a macro that forwards to ps_vad_frame_length() on
 * the detector returned by ps_endpointer_vad().
 *
 * @param ep Endpointer.
 * @return Frame length in seconds, as reported by
 *         ps_vad_frame_length().
 */
#define ps_endpointer_frame_length(ep) ps_vad_frame_length(ps_endpointer_vad(ep))
/**
 * Get the sample rate required by the endpointer.
 *
 * Implemented as a macro that forwards to ps_vad_sample_rate() on the
 * detector returned by ps_endpointer_vad().
 *
 * @param ep Endpointer.
 * @return Sampling rate, as reported by ps_vad_sample_rate().
 */
#define ps_endpointer_sample_rate(ep) ps_vad_sample_rate(ps_endpointer_vad(ep))
/**
 * Process a frame of audio, returning a frame if in a speech region.
 *
 * Note that the endpointer is *not* thread-safe.  You must call all
 * endpointer functions from the same thread.
 *
 * NOTE(review): the lifetime of the returned buffer is not stated in
 * this header; presumably it is owned by the endpointer and only
 * valid until the next call -- confirm against the implementation.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @param frame Frame of data, must contain ps_endpointer_frame_size()
 *              samples.
 * @return NULL if no speech available, or pointer to a frame of
 *         ps_endpointer_frame_size() samples (no more and no less).
 */
POCKETSPHINX_EXPORT
const int16 *ps_endpointer_process(ps_endpointer_t *ep,
const int16 *frame);
/**
 * Process remaining samples at end of stream.
 *
 * Note that the endpointer is *not* thread-safe.  You must call all
 * endpointer functions from the same thread.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @param frame Frame of data, must contain ps_endpointer_frame_size()
 *              samples or less.
 * @param nsamp Number of samples in frame.
 * @param out_nsamp Output, number of samples available.
 * @return Pointer to available samples, or NULL if none available.
 */
POCKETSPHINX_EXPORT
const int16 *ps_endpointer_end_stream(ps_endpointer_t *ep,
const int16 *frame,
size_t nsamp,
size_t *out_nsamp);
/**
 * Get the current state (speech/not-speech) of the endpointer.
 *
 * This function can be used to detect speech/non-speech transitions:
 *
 * - If it returns 0, and a subsequent call to ps_endpointer_process()
 *   returns non-NULL, this indicates a transition to speech.
 * - Conversely, if ps_endpointer_process() returns non-NULL and a
 *   subsequent call to this function returns 0, this indicates a
 *   transition to non-speech.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return Non-zero if in a speech segment after processing the last
 *         frame of data.
 */
POCKETSPHINX_EXPORT
int ps_endpointer_in_speech(ps_endpointer_t *ep);
/**
 * Get the start time of the last speech segment.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return Start time of the last speech segment.  NOTE(review): the
 *         unit and time origin are not stated in this header;
 *         presumably seconds -- confirm against the implementation.
 */
POCKETSPHINX_EXPORT
double ps_endpointer_speech_start(ps_endpointer_t *ep);
/**
 * Get the end time of the last speech segment.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return End time of the last speech segment.  NOTE(review): the
 *         unit and time origin are not stated in this header;
 *         presumably seconds -- confirm against the implementation.
 */
POCKETSPHINX_EXPORT
double ps_endpointer_speech_end(ps_endpointer_t *ep);
#ifdef __cplusplus
}
#endif
#endif /* __PS_ENDPOINTER_H__ */
|