File size: 7,954 Bytes
37a92a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/* -*- c-basic-offset:4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 2022 David Huggins-Daines.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */
/**
 * @file endpointer.h
 * @brief VAD-based endpointer for PocketSphinx
 *
 * Because doxygen is Bad Software, the actual documentation can only
 * exist in \ref ps_endpointer_t.  Sorry about that.
 */

#ifndef __PS_ENDPOINTER_H__
#define __PS_ENDPOINTER_H__


#include <pocketsphinx/prim_type.h>
#include <pocketsphinx/export.h>

#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif

#include <pocketsphinx/prim_type.h>
#include <pocketsphinx/export.h>
#include <pocketsphinx/vad.h>

/**
 * @struct ps_endpointer_t pocketsphinx/endpointer.h
 * @brief Simple voice activity detection based endpointing
 *
 * This is an opaque type: only a forward declaration of the
 * underlying structure appears in this header, so an endpointer is
 * created with ps_endpointer_init() and used exclusively through the
 * ps_endpointer_*() functions and macros declared in this file.
 */
typedef struct ps_endpointer_s ps_endpointer_t;

/**
 * Default window in seconds of audio to use for speech start/end decision.
 *
 * This is the value used when 0 is passed as the `window` argument
 * of ps_endpointer_init().
 */
#define PS_ENDPOINTER_DEFAULT_WINDOW 0.3
/**
 * Default ratio of frames in window to trigger start/end decision.
 *
 * This is the value used when 0 is passed as the `ratio` argument
 * of ps_endpointer_init().
 */
#define PS_ENDPOINTER_DEFAULT_RATIO 0.9

/**
 * Initialize endpointing.
 *
 * Any of the numeric parameters may be passed as 0 to select its
 * default value.
 *
 * @memberof ps_endpointer_t
 * @param window Seconds of audio to use in speech start/end decision,
 *               or 0 to use the default (PS_ENDPOINTER_DEFAULT_WINDOW).
 * @param ratio Ratio of frames needed to trigger start/end decision,
 *              or 0 for the default (PS_ENDPOINTER_DEFAULT_RATIO).
 * @param mode "Aggressiveness" of voice activity detection.  Stricter
 *             values (see ps_vad_mode_t) are less likely to
 *             misclassify non-speech as speech.
 * @param sample_rate Sampling rate of input, or 0 for default (which can
 *                    be obtained with ps_vad_sample_rate()).  Only 8000,
 *                    16000, 32000, 48000 are directly supported, others
 *                    will use the closest supported rate (within reason).
 *                    Note that this means that the actual frame length
 *                    may not be exactly the one requested, so you must
 *                    always use the one returned by
 *                    ps_endpointer_frame_size() (in samples) or
 *                    ps_endpointer_frame_length() (in seconds).
 * @param frame_length Requested frame length in seconds, or 0.0 for the
 *                     default.  Only 0.01, 0.02, 0.03 currently supported.
 *                     **Actual frame length may be different, you must
 *                     always use ps_endpointer_frame_length() to obtain it.**
 * @return Endpointer object, or NULL on failure (an invalid parameter,
 *         for instance).  Release it with ps_endpointer_free() when it
 *         is no longer needed.
 */
POCKETSPHINX_EXPORT
ps_endpointer_t *ps_endpointer_init(double window,
                                    double ratio,
                                    ps_vad_mode_t mode,
                                    int sample_rate, double frame_length);

/**
 * Retain a pointer to endpointer.
 *
 * Each call should be balanced by a later call to
 * ps_endpointer_free().
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return Endpointer with incremented reference count.
 */
POCKETSPHINX_EXPORT
ps_endpointer_t *ps_endpointer_retain(ps_endpointer_t *ep);

/**
 * Release a pointer to endpointer.
 *
 * This drops one reference; the endpointer itself is freed once the
 * reference count reaches 0, after which `ep` must not be used.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return New reference count (0 if freed).
 */
POCKETSPHINX_EXPORT
int ps_endpointer_free(ps_endpointer_t *ep);

/**
 * Get the voice activity detector used by the endpointer.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return VAD object. The endpointer retains ownership of this
 * object, so you must use ps_vad_retain() if you wish to use it
 * outside of the lifetime of the endpointer.
 */
POCKETSPHINX_EXPORT
ps_vad_t *ps_endpointer_vad(ps_endpointer_t *ep);

/**
 * Get the frame size (in samples) consumed by the endpointer.
 *
 * Frames are arrays of int16 samples, so multiply this by 2 to get
 * the size in bytes of the frame buffer required.
 *
 * This is a convenience macro which applies ps_vad_frame_size() to
 * the detector returned by ps_endpointer_vad().
 *
 * @param ep Endpointer.
 */
#define ps_endpointer_frame_size(ep) ps_vad_frame_size(ps_endpointer_vad(ep))

/**
 * Get the frame length (in seconds) consumed by the endpointer.
 *
 * This is a convenience macro which applies ps_vad_frame_length() to
 * the detector returned by ps_endpointer_vad().
 *
 * @param ep Endpointer.
 */
#define ps_endpointer_frame_length(ep) ps_vad_frame_length(ps_endpointer_vad(ep))

/**
 * Get the sample rate required by the endpointer.
 *
 * This is a convenience macro which applies ps_vad_sample_rate() to
 * the detector returned by ps_endpointer_vad().
 *
 * @param ep Endpointer.
 */
#define ps_endpointer_sample_rate(ep) ps_vad_sample_rate(ps_endpointer_vad(ep))

/**
 * Process a frame of audio, returning a frame if in a speech region.
 *
 * Note that the endpointer is *not* thread-safe.  You must call all
 * endpointer functions from the same thread.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @param frame Frame of data, must contain exactly
 *              ps_endpointer_frame_size() samples.
 * @return NULL if no speech available, or pointer to a frame of
 *         ps_endpointer_frame_size() samples (no more and no less).
 *         NOTE(review): the lifetime of the returned buffer is not
 *         stated in this header; presumably it is owned by the
 *         endpointer and only valid until the next call -- confirm
 *         against the implementation.
 */
POCKETSPHINX_EXPORT
const int16 *ps_endpointer_process(ps_endpointer_t *ep,
                                   const int16 *frame);

/**
 * Process remaining samples at end of stream.
 *
 * Unlike ps_endpointer_process(), this accepts a final frame which
 * may be shorter than ps_endpointer_frame_size() samples.
 *
 * Note that the endpointer is *not* thread-safe.  You must call all
 * endpointer functions from the same thread.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @param frame Frame of data, must contain ps_endpointer_frame_size()
 *              samples or less.
 * @param nsamp Number of samples in frame.
 * @param out_nsamp Output, number of samples available.
 * @return Pointer to available samples, or NULL if none available.
 */
POCKETSPHINX_EXPORT
const int16 *ps_endpointer_end_stream(ps_endpointer_t *ep,
                                      const int16 *frame,
                                      size_t nsamp,
                                      size_t *out_nsamp);

/**
 * Get the current state (speech/not-speech) of the endpointer.
 *
 * This function can be used to detect speech/non-speech transitions:
 *
 * - If it returns 0, and a subsequent call to ps_endpointer_process()
 *   returns non-NULL, this indicates a transition to speech.
 * - Conversely, if ps_endpointer_process() returns non-NULL and a
 *   subsequent call to this function returns 0, this indicates a
 *   transition to non-speech.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return non-zero if in a speech segment after processing the last
 *         frame of data.
 */
POCKETSPHINX_EXPORT
int ps_endpointer_in_speech(ps_endpointer_t *ep);

/**
 * Get the start time of the last speech segment.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return Start time of the last speech segment.  NOTE(review): the
 *         unit and time origin are not stated in this header;
 *         presumably seconds of audio since the start of the stream
 *         -- confirm against the implementation.
 */
POCKETSPHINX_EXPORT
double ps_endpointer_speech_start(ps_endpointer_t *ep);

/**
 * Get the end time of the last speech segment.
 *
 * @memberof ps_endpointer_t
 * @param ep Endpointer.
 * @return End time of the last speech segment.  NOTE(review): the
 *         unit and time origin are not stated in this header;
 *         presumably seconds of audio since the start of the stream
 *         -- confirm against the implementation.
 */
POCKETSPHINX_EXPORT
double ps_endpointer_speech_end(ps_endpointer_t *ep);

#ifdef __cplusplus
}
#endif

#endif /* __PS_ENDPOINTER_H__ */