onitsche committed on
Commit
035779f
·
1 Parent(s): b728731

Fix wild head movement (throttle DoA tracking to 2Hz) and lower RMS VAD threshold to 0.005

Browse files
Files changed (1) hide show
  1. talk/stt.py +29 -20
talk/stt.py CHANGED
@@ -1,8 +1,8 @@
1
  """Speech recording + Google STT for the Talk app.
2
 
3
  Records from the robot's ReSpeaker mic array (16 kHz, stereo float32)
4
- using the SDK's recording pipeline. Uses DoA for VAD and head tracking.
5
- Transcribes via Google Speech Recognition (free, no API key, German de-DE).
6
  """
7
 
8
  import io
@@ -20,7 +20,8 @@ SAMPLE_RATE = 16000 # ReSpeaker hardware rate
20
  SILENCE_DURATION = 1.2 # s of silence to end utterance
21
  MAX_DURATION = 20.0 # hard cap per utterance
22
  MIN_SPEECH_DURATION = 0.4 # discard very short sounds (spurious noise)
23
- RMS_SPEECH_THRESHOLD = 0.008 # float32 RMS; speech ~0.01–0.1, noise ~0.001–0.005
 
24
 
25
 
26
  def _rms(chunk: np.ndarray) -> float:
@@ -30,8 +31,8 @@ def _rms(chunk: np.ndarray) -> float:
30
 
31
  def _chunks_to_wav_bytes(chunks: list) -> bytes:
32
  """Convert (N, 2) float32 chunks to mono 16-bit PCM WAV bytes."""
33
- audio = np.concatenate(chunks) # (N, 2)
34
- mono = audio[:, 0] # take channel 0
35
  int16 = (mono * 32767.0).clip(-32768, 32767).astype(np.int16)
36
  buf = io.BytesIO()
37
  with wave.open(buf, "wb") as w:
@@ -49,15 +50,16 @@ def record_utterance(
49
  ) -> tuple[list, float, bool]:
50
  """Wait for speech, record until silence, return (chunks, doa_angle, antenna_pressed).
51
 
52
- While waiting for speech, tracks the robot's head toward the speaker via DoA.
53
- Checks `should_stop(reachy_mini)` each iteration to allow antenna-press exit.
54
  """
55
  chunks: list = []
56
  last_speech_t: Optional[float] = None
57
  speech_started_t: Optional[float] = None
58
- last_doa_angle: float = math.pi / 2 # default: facing front
 
59
 
60
- # Drain stale buffered audio before we start a fresh utterance.
61
  drained = 0
62
  while reachy_mini.media.get_audio_sample() is not None:
63
  drained += 1
@@ -70,27 +72,34 @@ def record_utterance(
70
  if should_stop(reachy_mini):
71
  return [], last_doa_angle, True
72
 
73
- # DoA: direction only — used for head tracking, not VAD.
74
  doa = reachy_mini.media.get_DoA()
75
  if doa is not None:
76
- angle, _ = doa
77
- last_doa_angle = angle
78
- if speech_started_t is None:
79
- y = math.sin(angle - math.pi / 2) * 0.6
80
- reachy_mini.look_at_world(1.0, y, 0.0, duration=0)
81
-
82
- # Energy-based VAD: reliable even when DoA speech flag isn't triggering.
 
 
 
 
 
 
83
  chunk = reachy_mini.media.get_audio_sample()
84
  if chunk is not None:
85
- if _rms(chunk) > RMS_SPEECH_THRESHOLD:
 
86
  if speech_started_t is None:
87
- logger.debug("Speech started (energy VAD)")
88
  speech_started_t = now
89
  last_speech_t = now
90
  if speech_started_t is not None:
91
  chunks.append(chunk)
92
 
93
- # End of utterance: had speech, now silent long enough.
94
  if (last_speech_t is not None
95
  and now - last_speech_t > SILENCE_DURATION
96
  and speech_started_t is not None
 
1
  """Speech recording + Google STT for the Talk app.
2
 
3
  Records from the robot's ReSpeaker mic array (16 kHz, stereo float32)
4
+ using the SDK's recording pipeline. Uses RMS energy for VAD; DoA for
5
+ head-direction tracking only. Transcribes via Google Speech Recognition.
6
  """
7
 
8
  import io
 
20
  SILENCE_DURATION = 1.2 # s of silence to end utterance
21
  MAX_DURATION = 20.0 # hard cap per utterance
22
  MIN_SPEECH_DURATION = 0.4 # discard very short sounds (spurious noise)
23
+ RMS_SPEECH_THRESHOLD = 0.005 # float32 RMS; tunable
24
+ HEAD_UPDATE_INTERVAL = 0.5 # s between head-direction updates while waiting
25
 
26
 
27
  def _rms(chunk: np.ndarray) -> float:
 
31
 
32
  def _chunks_to_wav_bytes(chunks: list) -> bytes:
33
  """Convert (N, 2) float32 chunks to mono 16-bit PCM WAV bytes."""
34
+ audio = np.concatenate(chunks)
35
+ mono = audio[:, 0]
36
  int16 = (mono * 32767.0).clip(-32768, 32767).astype(np.int16)
37
  buf = io.BytesIO()
38
  with wave.open(buf, "wb") as w:
 
50
  ) -> tuple[list, float, bool]:
51
  """Wait for speech, record until silence, return (chunks, doa_angle, antenna_pressed).
52
 
53
+ VAD is energy-based (RMS threshold). DoA is used only for head tracking
54
+ while waiting; updates are throttled to avoid jerky movement.
55
  """
56
  chunks: list = []
57
  last_speech_t: Optional[float] = None
58
  speech_started_t: Optional[float] = None
59
+ last_doa_angle: float = math.pi / 2 # default: facing front
60
+ last_head_update: float = 0.0
61
 
62
+ # Drain stale audio buffered during TTS playback.
63
  drained = 0
64
  while reachy_mini.media.get_audio_sample() is not None:
65
  drained += 1
 
72
  if should_stop(reachy_mini):
73
  return [], last_doa_angle, True
74
 
75
+ # Update DoA angle (direction only — not used as VAD).
76
  doa = reachy_mini.media.get_DoA()
77
  if doa is not None:
78
+ last_doa_angle = doa[0]
79
+
80
+ # Smooth head tracking toward speaker, throttled to HEAD_UPDATE_INTERVAL.
81
+ # Only while waiting for speech; once recording, head stays put.
82
+ if speech_started_t is None and now - last_head_update >= HEAD_UPDATE_INTERVAL:
83
+ y = math.sin(last_doa_angle - math.pi / 2) * 0.6
84
+ try:
85
+ reachy_mini.look_at_world(1.0, y, 0.0, duration=0.4)
86
+ except Exception:
87
+ pass
88
+ last_head_update = now
89
+
90
+ # Energy-based VAD.
91
  chunk = reachy_mini.media.get_audio_sample()
92
  if chunk is not None:
93
+ rms = _rms(chunk)
94
+ if rms > RMS_SPEECH_THRESHOLD:
95
  if speech_started_t is None:
96
+ logger.debug("Speech started (RMS %.4f)", rms)
97
  speech_started_t = now
98
  last_speech_t = now
99
  if speech_started_t is not None:
100
  chunks.append(chunk)
101
 
102
+ # End of utterance: silence long enough after speech.
103
  if (last_speech_t is not None
104
  and now - last_speech_t > SILENCE_DURATION
105
  and speech_started_t is not None