camenduru's picture
pocketsphinx
5610573
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
* Copyright (c) 2008 Beyond Access, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY BEYOND ACCESS, INC. ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BEYOND ACCESS, INC. NOR
* ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file yin.c Implementation of pitch extraction.
* @author David Huggins-Daines <dhdaines@gmail.com>
*/
/* This implements part of the YIN algorithm:
*
* "YIN, a fundamental frequency estimator for speech and music".
* Alain de Cheveigné and Hideki Kawahara. Journal of the Acoustical
* Society of America, 111 (4), April 2002.
*/
#include <pocketsphinx.h>
#include "util/ckd_alloc.h"
#include "fe/fixpoint.h"
#include "fe/yin.h"
#include <stdio.h>
#include <string.h>
struct yin_s {
uint16 frame_size; /** Size of analysis frame. */
uint16 search_threshold; /**< Threshold for finding period, in Q15 */
uint16 search_range; /**< Range around best local estimate to search, in Q15 */
uint16 nfr; /**< Number of frames read so far. */
unsigned char wsize; /**< Size of smoothing window. */
unsigned char wstart; /**< First frame in window. */
unsigned char wcur; /**< Current frame of analysis. */
unsigned char endut; /**< Hoch Hech! Are we at the utterance end? */
fixed32 **diff_window; /**< Window of difference function outputs. */
uint16 *period_window; /**< Window of best period estimates. */
};
/**
* The core of YIN: cumulative mean normalized difference function.
*/
static void
cmn_diff(int16 const *signal, int32 *out_diff, int ndiff)
{
uint32 cum, cshift;
int32 t, tscale;
out_diff[0] = 32768;
cum = 0;
cshift = 0;
/* Determine how many bits we can scale t up by below. */
for (tscale = 0; tscale < 32; ++tscale)
if (ndiff & (1<<(31-tscale)))
break;
--tscale; /* Avoid teh overflowz. */
/* printf("tscale is %d (ndiff - 1) << tscale is %d\n",
tscale, (ndiff-1) << tscale); */
/* Somewhat elaborate block floating point implementation.
* The fp implementation of this is really a lot simpler. */
for (t = 1; t < ndiff; ++t) {
uint32 dd, dshift, norm;
int j;
dd = 0;
dshift = 0;
for (j = 0; j < ndiff; ++j) {
int diff = signal[j] - signal[t + j];
/* Guard against overflows. */
if (dd > (1UL<<tscale)) {
dd >>= 1;
++dshift;
}
dd += (diff * diff) >> dshift;
}
/* Make sure the diffs and cum are shifted to the same
* scaling factor (usually dshift will be zero) */
if (dshift > cshift) {
cum += dd << (dshift-cshift);
}
else {
cum += dd >> (cshift-dshift);
}
/* Guard against overflows and also ensure that (t<<tscale) > cum. */
while (cum > (1UL<<tscale)) {
cum >>= 1;
++cshift;
}
/* Avoid divide-by-zero! */
if (cum == 0) cum = 1;
/* Calculate the normalizer in high precision. */
norm = (t << tscale) / cum;
/* Do a long multiply and shift down to Q15. */
out_diff[t] = (int32)(((long long)dd * norm)
>> (tscale - 15 + cshift - dshift));
/* printf("dd %d cshift %d dshift %d scaledt %d cum %d norm %d cmn %d\n",
dd, cshift, dshift, (t<<tscale), cum, norm, out_diff[t]); */
}
}
yin_t *
yin_init(int frame_size, float search_threshold,
float search_range, int smooth_window)
{
yin_t *pe;
pe = ckd_calloc(1, sizeof(*pe));
pe->frame_size = frame_size;
pe->search_threshold = (uint16)(search_threshold * 32768);
pe->search_range = (uint16)(search_range * 32768);
pe->wsize = smooth_window * 2 + 1;
pe->diff_window = ckd_calloc_2d(pe->wsize,
pe->frame_size / 2,
sizeof(**pe->diff_window));
pe->period_window = ckd_calloc(pe->wsize,
sizeof(*pe->period_window));
return pe;
}
void
yin_free(yin_t *pe)
{
ckd_free_2d(pe->diff_window);
ckd_free(pe->period_window);
ckd_free(pe);
}
void
yin_start(yin_t *pe)
{
/* Reset the circular window pointers. */
pe->wstart = pe->endut = 0;
pe->nfr = 0;
}
void
yin_end(yin_t *pe)
{
pe->endut = 1;
}
int
thresholded_search(int32 *diff_window, fixed32 threshold, int start, int end)
{
int i, min, argmin;
min = INT_MAX;
argmin = 0;
for (i = start; i < end; ++i) {
int diff = diff_window[i];
if (diff < threshold) {
min = diff;
argmin = i;
break;
}
if (diff < min) {
min = diff;
argmin = i;
}
}
return argmin;
}
void
yin_write(yin_t *pe, int16 const *frame)
{
int outptr, difflen;
/* Rotate the window one frame forward. */
++pe->wstart;
/* Fill in the frame before wstart. */
outptr = pe->wstart - 1;
/* Wrap around the window pointer. */
if (pe->wstart == pe->wsize)
pe->wstart = 0;
/* Now calculate normalized difference function. */
difflen = pe->frame_size / 2;
cmn_diff(frame, pe->diff_window[outptr], difflen);
/* Find the first point under threshold. If not found, then
* use the absolute minimum. */
pe->period_window[outptr]
= thresholded_search(pe->diff_window[outptr],
pe->search_threshold, 0, difflen);
/* Increment total number of frames. */
++pe->nfr;
}
int
yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
{
int wstart, wlen, half_wsize, i;
int best, best_diff, search_width, low_period, high_period;
half_wsize = (pe->wsize-1)/2;
/* Without any smoothing, just return the current value (don't
* need to do anything to the current poitner either). */
if (half_wsize == 0) {
if (pe->endut)
return 0;
*out_period = pe->period_window[0];
*out_bestdiff = pe->diff_window[0][pe->period_window[0]];
return 1;
}
/* We can't do anything unless we have at least (wsize-1)/2 + 1
* frames, unless we're at the end of the utterance. */
if (pe->endut == 0 && pe->nfr < half_wsize + 1) {
/* Don't increment the current pointer either. */
return 0;
}
/* Establish the smoothing window. */
/* End of utterance. */
if (pe->endut) {
/* We are done (no more data) when pe->wcur = pe->wstart. */
if (pe->wcur == pe->wstart)
return 0;
/* I.e. pe->wcur (circular minus) half_wsize. */
wstart = (pe->wcur + pe->wsize - half_wsize) % pe->wsize;
/* Number of frames from wstart up to pe->wstart. */
wlen = pe->wstart - wstart;
if (wlen < 0) wlen += pe->wsize;
/*printf("ENDUT! ");*/
}
/* Beginning of utterance. */
else if (pe->nfr < pe->wsize) {
wstart = 0;
wlen = pe->nfr;
}
/* Normal case, it is what it is. */
else {
wstart = pe->wstart;
wlen = pe->wsize;
}
/* Now (finally) look for the best local estimate. */
/* printf("Searching for local estimate in %d frames around %d\n",
wlen, pe->nfr + 1 - wlen); */
best = pe->period_window[pe->wcur];
best_diff = pe->diff_window[pe->wcur][best];
for (i = 0; i < wlen; ++i) {
int j = wstart + i;
int diff;
j %= pe->wsize;
diff = pe->diff_window[j][pe->period_window[j]];
/* printf("%.2f,%.2f ", 1.0 - (double)diff/32768,
pe->period_window[j] ? 8000.0/pe->period_window[j] : 8000.0); */
if (diff < best_diff) {
best_diff = diff;
best = pe->period_window[j];
}
}
/* printf("best: %.2f, %.2f\n", 1.0 - (double)best_diff/32768,
best ? 8000.0/best : 8000.0); */
/* If it's the same as the current one then return it. */
if (best == pe->period_window[pe->wcur]) {
/* Increment the current pointer. */
if (++pe->wcur == pe->wsize)
pe->wcur = 0;
*out_period = best;
*out_bestdiff = best_diff;
return 1;
}
/* Otherwise, redo the search inside a narrower range. */
search_width = best * pe->search_range / 32768;
/* printf("Search width = %d * %.2f = %d\n",
best, (double)pe->search_range/32768, search_width); */
if (search_width == 0) search_width = 1;
low_period = best - search_width;
high_period = best + search_width;
if (low_period < 0) low_period = 0;
if (high_period > pe->frame_size / 2) high_period = pe->frame_size / 2;
/* printf("Searching from %d to %d\n", low_period, high_period); */
best = thresholded_search(pe->diff_window[pe->wcur],
pe->search_threshold,
low_period, high_period);
best_diff = pe->diff_window[pe->wcur][best];
if (out_period)
*out_period = (best > 32768) ? 32768 : best;
if (out_bestdiff)
*out_bestdiff = (best_diff > 32768) ? 32768 : best_diff;
/* Increment the current pointer. */
if (++pe->wcur == pe->wsize)
pe->wcur = 0;
return 1;
}