Fix wild head movement (throttle DoA tracking to 2 Hz) and lower RMS VAD threshold to 0.005
Browse files
- talk/stt.py +29 -20
talk/stt.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""Speech recording + Google STT for the Talk app.
|
| 2 |
|
| 3 |
Records from the robot's ReSpeaker mic array (16 kHz, stereo float32)
|
| 4 |
-
using the SDK's recording pipeline. Uses
|
| 5 |
-
Transcribes via Google Speech Recognition
|
| 6 |
"""
|
| 7 |
|
| 8 |
import io
|
|
@@ -20,7 +20,8 @@ SAMPLE_RATE = 16000 # ReSpeaker hardware rate
|
|
| 20 |
SILENCE_DURATION = 1.2 # s of silence to end utterance
|
| 21 |
MAX_DURATION = 20.0 # hard cap per utterance
|
| 22 |
MIN_SPEECH_DURATION = 0.4 # discard very short sounds (spurious noise)
|
| 23 |
-
RMS_SPEECH_THRESHOLD = 0.
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def _rms(chunk: np.ndarray) -> float:
|
|
@@ -30,8 +31,8 @@ def _rms(chunk: np.ndarray) -> float:
|
|
| 30 |
|
| 31 |
def _chunks_to_wav_bytes(chunks: list) -> bytes:
|
| 32 |
"""Convert (N, 2) float32 chunks to mono 16-bit PCM WAV bytes."""
|
| 33 |
-
audio = np.concatenate(chunks)
|
| 34 |
-
mono = audio[:, 0]
|
| 35 |
int16 = (mono * 32767.0).clip(-32768, 32767).astype(np.int16)
|
| 36 |
buf = io.BytesIO()
|
| 37 |
with wave.open(buf, "wb") as w:
|
|
@@ -49,15 +50,16 @@ def record_utterance(
|
|
| 49 |
) -> tuple[list, float, bool]:
|
| 50 |
"""Wait for speech, record until silence, return (chunks, doa_angle, antenna_pressed).
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
"""
|
| 55 |
chunks: list = []
|
| 56 |
last_speech_t: Optional[float] = None
|
| 57 |
speech_started_t: Optional[float] = None
|
| 58 |
-
last_doa_angle: float = math.pi / 2
|
|
|
|
| 59 |
|
| 60 |
-
# Drain stale
|
| 61 |
drained = 0
|
| 62 |
while reachy_mini.media.get_audio_sample() is not None:
|
| 63 |
drained += 1
|
|
@@ -70,27 +72,34 @@ def record_utterance(
|
|
| 70 |
if should_stop(reachy_mini):
|
| 71 |
return [], last_doa_angle, True
|
| 72 |
|
| 73 |
-
# DoA
|
| 74 |
doa = reachy_mini.media.get_DoA()
|
| 75 |
if doa is not None:
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
chunk = reachy_mini.media.get_audio_sample()
|
| 84 |
if chunk is not None:
|
| 85 |
-
|
|
|
|
| 86 |
if speech_started_t is None:
|
| 87 |
-
logger.debug("Speech started (
|
| 88 |
speech_started_t = now
|
| 89 |
last_speech_t = now
|
| 90 |
if speech_started_t is not None:
|
| 91 |
chunks.append(chunk)
|
| 92 |
|
| 93 |
-
# End of utterance:
|
| 94 |
if (last_speech_t is not None
|
| 95 |
and now - last_speech_t > SILENCE_DURATION
|
| 96 |
and speech_started_t is not None
|
|
|
|
| 1 |
"""Speech recording + Google STT for the Talk app.
|
| 2 |
|
| 3 |
Records from the robot's ReSpeaker mic array (16 kHz, stereo float32)
|
| 4 |
+
using the SDK's recording pipeline. Uses RMS energy for VAD; DoA for
|
| 5 |
+
head-direction tracking only. Transcribes via Google Speech Recognition.
|
| 6 |
"""
|
| 7 |
|
| 8 |
import io
|
|
|
|
| 20 |
SILENCE_DURATION = 1.2 # s of silence to end utterance
|
| 21 |
MAX_DURATION = 20.0 # hard cap per utterance
|
| 22 |
MIN_SPEECH_DURATION = 0.4 # discard very short sounds (spurious noise)
|
| 23 |
+
RMS_SPEECH_THRESHOLD = 0.005 # float32 RMS; tunable
|
| 24 |
+
HEAD_UPDATE_INTERVAL = 0.5 # s between head-direction updates while waiting
|
| 25 |
|
| 26 |
|
| 27 |
def _rms(chunk: np.ndarray) -> float:
|
|
|
|
| 31 |
|
| 32 |
def _chunks_to_wav_bytes(chunks: list) -> bytes:
|
| 33 |
"""Convert (N, 2) float32 chunks to mono 16-bit PCM WAV bytes."""
|
| 34 |
+
audio = np.concatenate(chunks)
|
| 35 |
+
mono = audio[:, 0]
|
| 36 |
int16 = (mono * 32767.0).clip(-32768, 32767).astype(np.int16)
|
| 37 |
buf = io.BytesIO()
|
| 38 |
with wave.open(buf, "wb") as w:
|
|
|
|
| 50 |
) -> tuple[list, float, bool]:
|
| 51 |
"""Wait for speech, record until silence, return (chunks, doa_angle, antenna_pressed).
|
| 52 |
|
| 53 |
+
VAD is energy-based (RMS threshold). DoA is used only for head tracking
|
| 54 |
+
while waiting — updates are throttled to avoid jerky movement.
|
| 55 |
"""
|
| 56 |
chunks: list = []
|
| 57 |
last_speech_t: Optional[float] = None
|
| 58 |
speech_started_t: Optional[float] = None
|
| 59 |
+
last_doa_angle: float = math.pi / 2 # default: facing front
|
| 60 |
+
last_head_update: float = 0.0
|
| 61 |
|
| 62 |
+
# Drain stale audio buffered during TTS playback.
|
| 63 |
drained = 0
|
| 64 |
while reachy_mini.media.get_audio_sample() is not None:
|
| 65 |
drained += 1
|
|
|
|
| 72 |
if should_stop(reachy_mini):
|
| 73 |
return [], last_doa_angle, True
|
| 74 |
|
| 75 |
+
# Update DoA angle (direction only — not used as VAD).
|
| 76 |
doa = reachy_mini.media.get_DoA()
|
| 77 |
if doa is not None:
|
| 78 |
+
last_doa_angle = doa[0]
|
| 79 |
+
|
| 80 |
+
# Smooth head tracking toward speaker, throttled to HEAD_UPDATE_INTERVAL.
|
| 81 |
+
# Only while waiting for speech; once recording, head stays put.
|
| 82 |
+
if speech_started_t is None and now - last_head_update >= HEAD_UPDATE_INTERVAL:
|
| 83 |
+
y = math.sin(last_doa_angle - math.pi / 2) * 0.6
|
| 84 |
+
try:
|
| 85 |
+
reachy_mini.look_at_world(1.0, y, 0.0, duration=0.4)
|
| 86 |
+
except Exception:
|
| 87 |
+
pass
|
| 88 |
+
last_head_update = now
|
| 89 |
+
|
| 90 |
+
# Energy-based VAD.
|
| 91 |
chunk = reachy_mini.media.get_audio_sample()
|
| 92 |
if chunk is not None:
|
| 93 |
+
rms = _rms(chunk)
|
| 94 |
+
if rms > RMS_SPEECH_THRESHOLD:
|
| 95 |
if speech_started_t is None:
|
| 96 |
+
logger.debug("Speech started (RMS %.4f)", rms)
|
| 97 |
speech_started_t = now
|
| 98 |
last_speech_t = now
|
| 99 |
if speech_started_t is not None:
|
| 100 |
chunks.append(chunk)
|
| 101 |
|
| 102 |
+
# End of utterance: silence long enough after speech.
|
| 103 |
if (last_speech_t is not None
|
| 104 |
and now - last_speech_t > SILENCE_DURATION
|
| 105 |
and speech_started_t is not None
|