Spaces:

onitsche
/

talk

Running

App Files Files Community

onitsche committed 15 days ago

Commit

035779f

1 Parent(s): b728731

Fix wild head movement (throttle DoA tracking to 2Hz) and lower RMS VAD threshold to 0.005

Browse files

Files changed (1) hide show

talk/stt.py +29 -20

talk/stt.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """Speech recording + Google STT for the Talk app.
 Records from the robot's ReSpeaker mic array (16 kHz, stereo float32)
-using the SDK's recording pipeline. Uses DoA for VAD and head tracking.
-Transcribes via Google Speech Recognition (free, no API key, German de-DE).
 """
 import io
@@ -20,7 +20,8 @@ SAMPLE_RATE = 16000       # ReSpeaker hardware rate
 SILENCE_DURATION = 1.2    # s of silence to end utterance
 MAX_DURATION = 20.0       # hard cap per utterance
 MIN_SPEECH_DURATION = 0.4 # discard very short sounds (spurious noise)
-RMS_SPEECH_THRESHOLD = 0.008  # float32 RMS; speech ~0.01–0.1, noise ~0.001–0.005
 def _rms(chunk: np.ndarray) -> float:
@@ -30,8 +31,8 @@ def _rms(chunk: np.ndarray) -> float:
 def _chunks_to_wav_bytes(chunks: list) -> bytes:
     """Convert (N, 2) float32 chunks to mono 16-bit PCM WAV bytes."""
-    audio = np.concatenate(chunks)                              # (N, 2)
-    mono = audio[:, 0]                                          # take channel 0
     int16 = (mono * 32767.0).clip(-32768, 32767).astype(np.int16)
     buf = io.BytesIO()
     with wave.open(buf, "wb") as w:
@@ -49,15 +50,16 @@ def record_utterance(
 ) -> tuple[list, float, bool]:
     """Wait for speech, record until silence, return (chunks, doa_angle, antenna_pressed).
-    While waiting for speech, tracks the robot's head toward the speaker via DoA.
-    Checks `should_stop(reachy_mini)` each iteration to allow antenna-press exit.
     """
     chunks: list = []
     last_speech_t: Optional[float] = None
     speech_started_t: Optional[float] = None
-    last_doa_angle: float = math.pi / 2  # default: facing front
-    # Drain stale buffered audio before we start a fresh utterance.
     drained = 0
     while reachy_mini.media.get_audio_sample() is not None:
         drained += 1
@@ -70,27 +72,34 @@ def record_utterance(
         if should_stop(reachy_mini):
             return [], last_doa_angle, True
-        # DoA: direction only — used for head tracking, not VAD.
         doa = reachy_mini.media.get_DoA()
         if doa is not None:
-            angle, _ = doa
-            last_doa_angle = angle
-            if speech_started_t is None:
-                y = math.sin(angle - math.pi / 2) * 0.6
-                reachy_mini.look_at_world(1.0, y, 0.0, duration=0)
-        # Energy-based VAD: reliable even when DoA speech flag isn't triggering.
         chunk = reachy_mini.media.get_audio_sample()
         if chunk is not None:
-            if _rms(chunk) > RMS_SPEECH_THRESHOLD:
                 if speech_started_t is None:
-                    logger.debug("Speech started (energy VAD)")
                     speech_started_t = now
                 last_speech_t = now
             if speech_started_t is not None:
                 chunks.append(chunk)
-        # End of utterance: had speech, now silent long enough.
         if (last_speech_t is not None
                 and now - last_speech_t > SILENCE_DURATION
                 and speech_started_t is not None

 """Speech recording + Google STT for the Talk app.
 Records from the robot's ReSpeaker mic array (16 kHz, stereo float32)
+using the SDK's recording pipeline. Uses RMS energy for VAD; DoA for
+head-direction tracking only. Transcribes via Google Speech Recognition.
 """
 import io
 SILENCE_DURATION = 1.2    # s of silence to end utterance
 MAX_DURATION = 20.0       # hard cap per utterance
 MIN_SPEECH_DURATION = 0.4 # discard very short sounds (spurious noise)
+RMS_SPEECH_THRESHOLD = 0.005  # float32 RMS; tunable
+HEAD_UPDATE_INTERVAL = 0.5    # s between head-direction updates while waiting
 def _rms(chunk: np.ndarray) -> float:
 def _chunks_to_wav_bytes(chunks: list) -> bytes:
     """Convert (N, 2) float32 chunks to mono 16-bit PCM WAV bytes."""
+    audio = np.concatenate(chunks)
+    mono = audio[:, 0]
     int16 = (mono * 32767.0).clip(-32768, 32767).astype(np.int16)
     buf = io.BytesIO()
     with wave.open(buf, "wb") as w:
 ) -> tuple[list, float, bool]:
     """Wait for speech, record until silence, return (chunks, doa_angle, antenna_pressed).
+    VAD is energy-based (RMS threshold). DoA is used only for head tracking
+    while waiting — updates are throttled to avoid jerky movement.
     """
     chunks: list = []
     last_speech_t: Optional[float] = None
     speech_started_t: Optional[float] = None
+    last_doa_angle: float = math.pi / 2   # default: facing front
+    last_head_update: float = 0.0
+    # Drain stale audio buffered during TTS playback.
     drained = 0
     while reachy_mini.media.get_audio_sample() is not None:
         drained += 1
         if should_stop(reachy_mini):
             return [], last_doa_angle, True
+        # Update DoA angle (direction only — not used as VAD).
         doa = reachy_mini.media.get_DoA()
         if doa is not None:
+            last_doa_angle = doa[0]
+        # Smooth head tracking toward speaker, throttled to HEAD_UPDATE_INTERVAL.
+        # Only while waiting for speech; once recording, head stays put.
+        if speech_started_t is None and now - last_head_update >= HEAD_UPDATE_INTERVAL:
+            y = math.sin(last_doa_angle - math.pi / 2) * 0.6
+            try:
+                reachy_mini.look_at_world(1.0, y, 0.0, duration=0.4)
+            except Exception:
+                pass
+            last_head_update = now
+        # Energy-based VAD.
         chunk = reachy_mini.media.get_audio_sample()
         if chunk is not None:
+            rms = _rms(chunk)
+            if rms > RMS_SPEECH_THRESHOLD:
                 if speech_started_t is None:
+                    logger.debug("Speech started (RMS %.4f)", rms)
                     speech_started_t = now
                 last_speech_t = now
             if speech_started_t is not None:
                 chunks.append(chunk)
+        # End of utterance: silence long enough after speech.
         if (last_speech_t is not None
                 and now - last_speech_t > SILENCE_DURATION
                 and speech_started_t is not None