tfrere HF Staff committed on
Commit
893babc
·
1 Parent(s): cdce2ed

feat: circle states + head wobble agent driven by AI speech

Browse files

- Add a `processing` state (user stopped, reply not ready yet) alongside
listening / user-speaking / ai-speaking, with distinct colors and
animations (dashed spinning ring, warm thinking throb).
- Make the user-speaking state react to the robot mic in real time via a
smoothed RMS monitor that writes `--audio-level` into CSS.
- Ship a parallel "head wobble" agent ported from the Python
`speech_tapper` + `head_wobbler` modules: hooks a Web Audio analyser
to the assistant MediaStreamTrack, runs hysteretic VAD + sinusoidal
sway at 20Hz, and drives robot.setHeadPose with a ~200ms latency so
the motion feels slaved to the voice.
- Reset the wobbler when the user barges in so the head settles during
listening, matching the reference conversation app.

Made-with: Cursor

Files changed (4) hide show
  1. src/head-wobbler.ts +240 -0
  2. src/main.ts +141 -1
  3. src/openai-realtime.ts +19 -3
  4. src/style.css +112 -2
src/head-wobbler.ts ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Head wobble generator - JS port of the Python
3
+ * `reachy_mini_conversation_app/audio/speech_tapper.py` + `head_wobbler.py`.
4
+ *
5
+ * Purpose: take the assistant's outgoing voice (the audio OpenAI plays back
6
+ * through the robot) and drive small, organic head sways + nods in sync with
7
+ * the speech, so Reachy "comes alive" while talking.
8
+ *
9
+ * The Python version sampled raw PCM base64 deltas over the OpenAI WebSocket
10
+ * API; we instead hook a Web Audio `AnalyserNode` to the remote
11
+ * `MediaStreamTrack` (which is how audio reaches us in WebRTC mode). Both
12
+ * yield the same loudness envelope we need to modulate the sine oscillators.
13
+ *
14
+ * The SDK only exposes rotational `setHeadPose(roll, pitch, yaw)` in degrees,
15
+ * so we drop the small (Β±mm-range) translations from the original Python
16
+ * pipeline - they're a cherry-on-top that isn't reachable through the public
17
+ * JS API anyway.
18
+ */
19
+
20
+ // ─── Constants ported from speech_tapper.py ─────────────────────────────
21
+ // Frame / hop timing. Hop drives how often we update offsets.
22
+ const HOP_MS = 50; // produce one pose target every 50ms (20Hz)
23
+ const FRAME_MS = 20; // RMS window in ms
24
+
25
+ // Loudness mapping: RMS in dBFS is remapped to [0..1] with a slight gamma.
26
+ const SWAY_DB_LOW = -46;
27
+ const SWAY_DB_HIGH = -18;
28
+ const SWAY_DB_GAMMA = 0.9;
29
+ const SWAY_MASTER = 1.5;
30
+ const SENS_DB_OFFSET = 4;
31
+
32
+ // Hysteretic VAD thresholds on the sliding-frame dBFS envelope.
33
+ const VAD_DB_ON = -35;
34
+ const VAD_DB_OFF = -45;
35
+ // Attack / release for the 0β†’1 envelope follower that multiplies the sway.
36
+ const ENV_FOLLOW_GAIN = 0.65;
37
+
38
+ // Sinusoid frequencies (Hz) + peak amplitudes (degrees).
39
+ // All three axes use independent initial phases so the motion doesn't feel
40
+ // mechanical when they align.
41
+ const SWAY_F_PITCH = 2.2;
42
+ const SWAY_A_PITCH_DEG = 4.5;
43
+
44
+ const SWAY_F_YAW = 0.6;
45
+ const SWAY_A_YAW_DEG = 7.5;
46
+
47
+ const SWAY_F_ROLL = 1.3;
48
+ const SWAY_A_ROLL_DEG = 2.25;
49
+
50
+ // Deliberate delay between the audio we analyse and the matching head
51
+ // motion. A touch of latency makes the motion feel slaved to the voice
52
+ // (reactive, not predictive). Matches `MOVEMENT_LATENCY_S = 0.2` in Python.
53
+ const MOVEMENT_LATENCY_S = 0.2;
54
+
55
+ // ─── Types ──────────────────────────────────────────────────────────────
56
+
57
+ export interface HeadOffsetsDeg {
58
+ roll: number;
59
+ pitch: number;
60
+ yaw: number;
61
+ }
62
+
63
+ export interface HeadWobblerOptions {
64
+ track: MediaStreamTrack;
65
+ onOffsets: (offsets: HeadOffsetsDeg) => void;
66
+ /**
67
+ * Master scalar applied on top of the per-axis amplitudes. Lets callers
68
+ * tone the whole effect down without recompiling constants. Defaults to 1.
69
+ */
70
+ gain?: number;
71
+ }
72
+
73
+ // ─── Implementation ─────────────────────────────────────────────────────
74
+
75
/**
 * Consumes a `MediaStreamTrack` (assistant voice) and emits head rotation
 * offsets in degrees once per hop (`HOP_MS`), ramping in and out with the
 * speech envelope.
 *
 * Lifecycle:
 *  - `start()` builds the audio graph and begins ticking (no-op if running).
 *  - `stop()` tears everything down, clears the sway state and emits one
 *    neutral pose.
 *  - `reset()` zeroes the sway envelope and latency queue and emits a neutral
 *    pose right away (useful when the user barges in on the assistant). The
 *    tick loop keeps running, so motion resumes if speech is detected again.
 */
export class HeadWobbler {
  /** Number of hops an offset waits in the queue before it is emitted. */
  private static readonly LATENCY_HOPS = Math.ceil(
    (MOVEMENT_LATENCY_S * 1000) / HOP_MS,
  );
  /** Loudness substituted when a reading is not a finite number. */
  private static readonly SILENCE_DBFS = -120;
  /** Bounds for `AnalyserNode.fftSize` used by this class (powers of two). */
  private static readonly MIN_FFT_SIZE = 1024;
  private static readonly MAX_FFT_SIZE = 32768;

  private readonly track: MediaStreamTrack;
  private readonly onOffsets: (offsets: HeadOffsetsDeg) => void;
  private readonly gain: number;

  private audioCtx: AudioContext | null = null;
  private analyser: AnalyserNode | null = null;
  private sourceNode: MediaStreamAudioSourceNode | null = null;
  // Sized to the analyser's fftSize; only the trailing `frameSamples` entries
  // are used for the RMS measurement.
  private frameBuf: Float32Array<ArrayBuffer> | null = null;
  private frameSamples = 0;

  private timer: number | null = null;

  // Sway state
  private t = 0; // oscillator clock in seconds, advanced by one hop per tick
  private swayEnv = 0; // 0..1 envelope follower on VAD-on/off
  private vadOn = false;
  private pendingOffsets: HeadOffsetsDeg[] = [];

  // Per-axis phase offsets so the three sines don't line up on the downbeat.
  private readonly phasePitch = 0.13 * Math.PI;
  private readonly phaseYaw = 0.48 * Math.PI;
  private readonly phaseRoll = 1.21 * Math.PI;

  /**
   * @param options.track     Audio track to analyse.
   * @param options.onOffsets Callback receiving each pose offset (degrees).
   * @param options.gain      Optional master scalar; defaults to 1.
   */
  constructor(options: HeadWobblerOptions) {
    this.track = options.track;
    this.onOffsets = options.onOffsets;
    this.gain = options.gain ?? 1.0;
  }

  /**
   * Build the analyser graph on a dedicated `AudioContext` and start the
   * periodic tick. Does nothing when already running.
   *
   * @throws Whatever the Web Audio graph construction throws; the freshly
   *         created context is closed before the error is re-thrown.
   */
  start(): void {
    if (this.audioCtx) return; // already running

    // A dedicated AudioContext keeps the robot mic path untouched. Nothing is
    // connected to the destination: we only want samples, not playback.
    const ctx = new AudioContext();
    try {
      const source = ctx.createMediaStreamSource(new MediaStream([this.track]));
      const analyser = ctx.createAnalyser();

      const wanted = Math.max(
        128,
        Math.round((ctx.sampleRate * FRAME_MS) / 1000),
      );
      // The analyser only ever writes `fftSize` samples, so the FFT window
      // must be at least as long as the RMS frame or the tail of the buffer
      // would stay at zero and dilute the measurement.
      analyser.fftSize = HeadWobbler.fftSizeFor(wanted);
      analyser.smoothingTimeConstant = 0;
      source.connect(analyser);

      const buf = new Float32Array(new ArrayBuffer(analyser.fftSize * 4));

      this.frameSamples = Math.min(wanted, analyser.fftSize);
      this.frameBuf = buf;
      this.sourceNode = source;
      this.analyser = analyser;
      this.audioCtx = ctx;
    } catch (err) {
      // Don't leak the context when the graph could not be built.
      void ctx.close().catch(() => undefined);
      throw err;
    }

    // Contexts created outside a user gesture may start suspended, in which
    // case the analyser would only ever report silence. Best-effort resume.
    if (ctx.state === "suspended") {
      void ctx.resume().catch(() => undefined);
    }

    // setInterval rather than requestAnimationFrame: rAF callbacks stop while
    // the tab is hidden. (Browsers may still throttle timers in background
    // tabs, so the hop rate is nominal.) 20 Hz is low enough not to spam the
    // robot.
    this.timer = window.setInterval(() => this.tick(), HOP_MS);
  }

  /**
   * Stop ticking, release the audio graph, clear all sway state and emit a
   * neutral pose so the robot doesn't freeze mid-motion. Safe to call when
   * not running (the neutral pose is still emitted).
   */
  stop(): void {
    if (this.timer !== null) {
      window.clearInterval(this.timer);
      this.timer = null;
    }

    const ctx = this.audioCtx;
    try {
      this.sourceNode?.disconnect();
      this.analyser?.disconnect();
    } catch {
      // Best-effort teardown: a failed disconnect must not block the rest.
    }
    if (ctx && ctx.state !== "closed") {
      // close() reports failure through its promise, which a synchronous
      // try/catch cannot observe.
      void ctx.close().catch(() => undefined);
    }

    this.sourceNode = null;
    this.analyser = null;
    this.audioCtx = null;
    this.frameBuf = null;
    this.frameSamples = 0;
    this.pendingOffsets = [];
    // Start the next session from a quiet state instead of a stale envelope.
    this.swayEnv = 0;
    this.vadOn = false;

    this.onOffsets({ roll: 0, pitch: 0, yaw: 0 });
  }

  /**
   * Zero the sway envelope, VAD state and latency queue, and emit a neutral
   * pose immediately. Call this when the user starts speaking while the
   * assistant is still talking, so the head settles instead of holding the
   * last sine value.
   */
  reset(): void {
    this.swayEnv = 0;
    this.vadOn = false;
    this.pendingOffsets = [];
    this.onOffsets({ roll: 0, pitch: 0, yaw: 0 });
  }

  // ─── Internals ───────────────────────────────────────────────────────

  /** Smallest supported power-of-two FFT size that holds `frameSamples`. */
  private static fftSizeFor(frameSamples: number): number {
    let size = HeadWobbler.MIN_FFT_SIZE;
    while (size < frameSamples && size < HeadWobbler.MAX_FFT_SIZE) size *= 2;
    return size;
  }

  /** One hop: measure loudness, update VAD/envelope, queue and emit a pose. */
  private tick(): void {
    const analyser = this.analyser;
    const buf = this.frameBuf;
    if (!analyser || !buf) return;

    // 1. Measure current loudness in dBFS on the most recent frame. The
    //    analyser hands back its latest `fftSize` samples in time order, so
    //    the newest audio sits at the end of the buffer.
    analyser.getFloatTimeDomainData(buf);
    const recent = buf.subarray(buf.length - this.frameSamples);
    const measured = rmsDbfs(recent);
    const db =
      (Number.isFinite(measured) ? measured : HeadWobbler.SILENCE_DBFS) +
      SENS_DB_OFFSET;

    // 2. Hysteretic VAD on the dB envelope.
    if (this.vadOn) {
      if (db < VAD_DB_OFF) this.vadOn = false;
    } else if (db > VAD_DB_ON) {
      this.vadOn = true;
    }

    // 3. Smooth 0..1 envelope the sines are modulated by.
    const target = this.vadOn ? 1 : 0;
    this.swayEnv += (target - this.swayEnv) * ENV_FOLLOW_GAIN;

    // 4. Loudness gain - how much of the max amplitude we reach on this hop.
    const loud = loudnessGain(db);

    // 5. Time-evolve the oscillators.
    this.t += HOP_MS / 1000;
    const twoPiT = 2 * Math.PI * this.t;
    const mod = loud * this.swayEnv * this.gain * SWAY_MASTER;

    const pitchDeg =
      SWAY_A_PITCH_DEG * mod * Math.sin(twoPiT * SWAY_F_PITCH + this.phasePitch);
    const yawDeg =
      SWAY_A_YAW_DEG * mod * Math.sin(twoPiT * SWAY_F_YAW + this.phaseYaw);
    const rollDeg =
      SWAY_A_ROLL_DEG * mod * Math.sin(twoPiT * SWAY_F_ROLL + this.phaseRoll);

    // 6. Enqueue with latency, then emit every sample that has waited the
    //    full LATENCY_HOPS.
    this.pendingOffsets.push({ roll: rollDeg, pitch: pitchDeg, yaw: yawDeg });
    while (this.pendingOffsets.length > HeadWobbler.LATENCY_HOPS) {
      const next = this.pendingOffsets.shift();
      if (next) this.onOffsets(next);
    }
  }
}
221
+
222
+ // ─── Pure helpers ───────────────────────────────────────────────────────
223
+
224
/**
 * Root-mean-square level of `samples`, expressed in dBFS (0 dB = full scale).
 *
 * The result is floored at -120 dB: silence, an empty buffer, and any
 * non-finite reading (NaN or infinite samples) all return exactly -120, so
 * callers never receive NaN/±Infinity and the function is monotonic in RMS.
 *
 * @param samples Time-domain samples, nominally in [-1, 1].
 * @returns Level in dBFS, always a finite number >= -120.
 */
function rmsDbfs(samples: Float32Array): number {
  const SILENCE_DBFS = -120;

  let sumSquares = 0;
  for (const s of samples) sumSquares += s * s;

  // max(1, length) keeps an empty buffer from dividing by zero.
  const rms = Math.sqrt(sumSquares / Math.max(1, samples.length));

  // log10(0) is -Infinity and NaN samples yield NaN; both collapse to the floor.
  const db = 20 * Math.log10(rms);
  return Number.isFinite(db) ? Math.max(SILENCE_DBFS, db) : SILENCE_DBFS;
}
234
+
235
/**
 * Map a loudness reading in dB onto a 0..1 gain: linear remap of
 * [`low`..`high`] to [0..1], clamped, then raised to `gamma`.
 *
 * @param db    Loudness in dB. NaN is treated as silence and yields 0.
 * @param low   dB value mapped to 0 (defaults to `SWAY_DB_LOW`).
 * @param high  dB value mapped to 1 (defaults to `SWAY_DB_HIGH`).
 * @param gamma Exponent applied after clamping (defaults to `SWAY_DB_GAMMA`).
 * @returns Gain in [0, 1].
 */
function loudnessGain(
  db: number,
  low: number = SWAY_DB_LOW,
  high: number = SWAY_DB_HIGH,
  gamma: number = SWAY_DB_GAMMA,
): number {
  // Math.min/Math.max propagate NaN, which would otherwise reach the pose.
  if (Number.isNaN(db)) return 0;

  const span = high - low;
  // Degenerate or inverted range: fall back to a hard threshold at `high`
  // rather than dividing by zero.
  if (!(span > 0)) return db >= high ? 1 : 0;

  const norm = (db - low) / span;
  const clamped = Math.max(0, Math.min(1, norm));
  return Math.pow(clamped, gamma);
}
src/main.ts CHANGED
@@ -21,6 +21,7 @@
21
  import "./style.css";
22
 
23
  import { OpenaiRealtimeClient } from "./openai-realtime.ts";
 
24
  import type {
25
  ReachyMiniInstance,
26
  RobotInfo,
@@ -79,6 +80,7 @@ type AppState =
79
  | "starting"
80
  | "listening"
81
  | "user-speaking"
 
82
  | "ai-speaking"
83
  | "error";
84
 
@@ -129,6 +131,11 @@ const STATE_VIEWS: Record<AppState, StateView> = {
129
  sublabel: "Go on, I'm hearing you",
130
  disabled: false,
131
  },
 
 
 
 
 
132
  "ai-speaking": {
133
  label: "Speaking…",
134
  sublabel: "Reachy is answering",
@@ -151,6 +158,7 @@ const STATE_CLASS: Record<AppState, string> = {
151
  starting: "state-starting",
152
  listening: "state-listening",
153
  "user-speaking": "state-user-speaking",
 
154
  "ai-speaking": "state-ai-speaking",
155
  error: "state-error",
156
  };
@@ -194,6 +202,14 @@ let openai: OpenaiRealtimeClient | null = null;
194
  // keeps the MediaStream alive and actually decodes the incoming track).
195
  let openaiSink: HTMLAudioElement | null = null;
196
 
 
 
 
 
 
 
 
 
197
  // ─── UI rendering ───────────────────────────────────────────────────────
198
 
199
  function setState(next: AppState): void {
@@ -385,6 +401,8 @@ async function doStart(): Promise<void> {
385
  return;
386
  }
387
 
 
 
388
  setStatus("Connecting to OpenAI Realtime…");
389
 
390
  openai = new OpenaiRealtimeClient({
@@ -395,7 +413,10 @@ async function doStart(): Promise<void> {
395
  inputTrack: robotMicTrack,
396
  });
397
 
398
- openai.on("outputTrack", ({ track }) => routeOpenaiToRobot(track));
 
 
 
399
 
400
  openai.on("status", ({ status }) => {
401
  switch (status) {
@@ -405,6 +426,13 @@ async function doStart(): Promise<void> {
405
  break;
406
  case "user-speaking":
407
  setState("user-speaking");
 
 
 
 
 
 
 
408
  break;
409
  case "ai-speaking":
410
  setState("ai-speaking");
@@ -480,6 +508,115 @@ function routeOpenaiToRobot(track: MediaStreamTrack): void {
480
  openaiSink.srcObject = new MediaStream([track]);
481
  }
482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  async function teardown(): Promise<void> {
484
  try {
485
  await openai?.close();
@@ -488,6 +625,9 @@ async function teardown(): Promise<void> {
488
  }
489
  openai = null;
490
 
 
 
 
491
  if (openaiSink) {
492
  openaiSink.srcObject = null;
493
  openaiSink.remove();
 
21
  import "./style.css";
22
 
23
  import { OpenaiRealtimeClient } from "./openai-realtime.ts";
24
+ import { HeadWobbler } from "./head-wobbler.ts";
25
  import type {
26
  ReachyMiniInstance,
27
  RobotInfo,
 
80
  | "starting"
81
  | "listening"
82
  | "user-speaking"
83
+ | "processing"
84
  | "ai-speaking"
85
  | "error";
86
 
 
131
  sublabel: "Go on, I'm hearing you",
132
  disabled: false,
133
  },
134
+ processing: {
135
+ label: "Thinking…",
136
+ sublabel: "Reachy is coming up with a reply",
137
+ disabled: false,
138
+ },
139
  "ai-speaking": {
140
  label: "Speaking…",
141
  sublabel: "Reachy is answering",
 
158
  starting: "state-starting",
159
  listening: "state-listening",
160
  "user-speaking": "state-user-speaking",
161
+ processing: "state-processing",
162
  "ai-speaking": "state-ai-speaking",
163
  error: "state-error",
164
  };
 
202
  // keeps the MediaStream alive and actually decodes the incoming track).
203
  let openaiSink: HTMLAudioElement | null = null;
204
 
205
+ // Head-motion agent: samples the assistant's voice and drives small head
206
+ // sways via robot.setHeadPose. Lives for the duration of a session.
207
+ let wobbler: HeadWobbler | null = null;
208
+
209
+ // Mic level monitor: feeds a CSS custom property `--audio-level` in [0,1]
210
+ // so the circle breathes/glows in reaction to the user's voice in real time.
211
+ let micLevel: MicLevelMonitor | null = null;
212
+
213
  // ─── UI rendering ───────────────────────────────────────────────────────
214
 
215
  function setState(next: AppState): void {
 
401
  return;
402
  }
403
 
404
+ startMicLevelMonitor(robotMicTrack);
405
+
406
  setStatus("Connecting to OpenAI Realtime…");
407
 
408
  openai = new OpenaiRealtimeClient({
 
413
  inputTrack: robotMicTrack,
414
  });
415
 
416
+ openai.on("outputTrack", ({ track }) => {
417
+ routeOpenaiToRobot(track);
418
+ startWobbler(track);
419
+ });
420
 
421
  openai.on("status", ({ status }) => {
422
  switch (status) {
 
426
  break;
427
  case "user-speaking":
428
  setState("user-speaking");
429
+ // User barges in → stop the assistant's sway immediately so the head
430
+ // settles while we listen, mirroring the reference app's
431
+ // head_wobbler.reset() on speech_started.
432
+ wobbler?.reset();
433
+ break;
434
+ case "processing":
435
+ setState("processing");
436
  break;
437
  case "ai-speaking":
438
  setState("ai-speaking");
 
508
  openaiSink.srcObject = new MediaStream([track]);
509
  }
510
 
511
+ // ─── Head motion agent ──────────────────────────────────────────────────
512
+
513
/**
 * Track the currently running wobbler was started on, used to recognise a
 * repeated `startWobbler` call for the same track.
 */
let wobblerTrack: MediaStreamTrack | null = null;

/**
 * Spawn the head-motion agent from the assistant audio track.
 *
 * - No-op when no robot is connected.
 * - No-op when a wobbler is already wired to this exact track, so a repeated
 *   notification for the same track doesn't tear the agent down and snap the
 *   head back to neutral.
 * - Otherwise any previous instance is stopped and replaced.
 *
 * @param assistantTrack Audio track carrying the assistant's voice.
 */
function startWobbler(assistantTrack: MediaStreamTrack): void {
  if (!robot) return;
  if (wobbler !== null && wobblerTrack === assistantTrack) return;

  wobbler?.stop();
  wobbler = new HeadWobbler({
    track: assistantTrack,
    onOffsets: ({ roll, pitch, yaw }) => {
      // The offsets are already in degrees and are pushed as absolute target
      // poses around the neutral head position (no base pose is preserved,
      // which keeps the motion unambiguously around "looking forward").
      robot?.setHeadPose(roll, pitch, yaw);
    },
  });
  wobbler.start();
  // Recorded only after a successful start so a failed start can be retried
  // with the same track.
  wobblerTrack = assistantTrack;
}
534
+
535
/**
 * Shut down the head-motion agent, if one exists, and park the head at the
 * neutral pose.
 */
function stopWobbler(): void {
  if (wobbler !== null) {
    wobbler.stop();
  }
  wobbler = null;

  // Sent unconditionally so the head is centred at session end even when no
  // wobbler was running.
  if (robot) {
    robot.setHeadPose(0, 0, 0);
  }
}
541
+
542
+ // ─── Mic-level monitor (circle audio-reactivity) ────────────────────────
543
+
544
/**
 * Samples a microphone track and publishes a smoothed, normalized RMS level
 * in [0, 1] to the CSS custom property `--audio-level` on the document root,
 * once per animation frame, so the circle can breathe/glow with the user's
 * voice.
 *
 * `start()` may be called repeatedly; each call first stops the previous
 * capture. `stop()` releases the audio graph and resets the level to 0.
 */
class MicLevelMonitor {
  private ctx: AudioContext | null = null;
  private analyser: AnalyserNode | null = null;
  private source: MediaStreamAudioSourceNode | null = null;
  private raf = 0;
  private buf: Float32Array<ArrayBuffer> | null = null;
  private level = 0;

  /**
   * Begin monitoring `track`.
   *
   * @throws Whatever the Web Audio graph construction throws; the freshly
   *         created context is closed before the error is re-thrown.
   */
  start(track: MediaStreamTrack): void {
    this.stop();

    const ctx = new AudioContext();
    try {
      const src = ctx.createMediaStreamSource(new MediaStream([track]));
      const analyser = ctx.createAnalyser();
      analyser.fftSize = 1024;
      // NOTE(review): per the Web Audio spec this smooths frequency-domain
      // data only; the time-domain reads below are unaffected by it.
      analyser.smoothingTimeConstant = 0.7;
      src.connect(analyser);

      const buf = new Float32Array(new ArrayBuffer(analyser.fftSize * 4));

      this.ctx = ctx;
      this.source = src;
      this.analyser = analyser;
      this.buf = buf;
    } catch (err) {
      // Don't leak the context when the graph could not be built.
      void ctx.close().catch(() => undefined);
      throw err;
    }

    // A context created outside a user gesture may start suspended and would
    // then only report silence. Best-effort resume.
    if (ctx.state === "suspended") {
      void ctx.resume().catch(() => undefined);
    }

    const tick = (): void => {
      const analyser = this.analyser;
      const buf = this.buf;
      if (!analyser || !buf) return; // stopped: let the loop die

      analyser.getFloatTimeDomainData(buf);
      let sum = 0;
      for (const s of buf) sum += s * s;
      const rms = Math.sqrt(sum / buf.length);

      // Map raw RMS (~0..0.2 for typical speech) to 0..1 with mild boost.
      // A non-finite reading counts as silence so it cannot poison `level`.
      const boosted = Number.isFinite(rms)
        ? Math.min(1, Math.pow(rms * 6, 0.7))
        : 0;
      // Fast attack, slow release so the CSS animation stays fluid.
      const rate = boosted > this.level ? 0.55 : 0.12;
      this.level += (boosted - this.level) * rate;

      document.documentElement.style.setProperty(
        "--audio-level",
        this.level.toFixed(3),
      );
      this.raf = requestAnimationFrame(tick);
    };
    this.raf = requestAnimationFrame(tick);
  }

  /**
   * Cancel the frame loop, release the audio graph and reset both the
   * internal level and `--audio-level` to 0. Safe to call when not running.
   */
  stop(): void {
    cancelAnimationFrame(this.raf);
    this.raf = 0;

    const ctx = this.ctx;
    try {
      this.source?.disconnect();
      this.analyser?.disconnect();
    } catch {
      // Best-effort teardown: a failed disconnect must not block the rest.
    }
    if (ctx && ctx.state !== "closed") {
      // close() reports failure through its promise, which a synchronous
      // try/catch cannot observe.
      void ctx.close().catch(() => undefined);
    }

    this.ctx = null;
    this.source = null;
    this.analyser = null;
    this.buf = null;
    // Start the next capture from silence rather than a stale level.
    this.level = 0;
    document.documentElement.style.setProperty("--audio-level", "0");
  }
}
610
+
611
/**
 * Point the shared mic-level monitor at `track`, creating the monitor on
 * first use.
 */
function startMicLevelMonitor(track: MediaStreamTrack): void {
  if (micLevel == null) {
    micLevel = new MicLevelMonitor();
  }
  micLevel.start(track);
}
615
+
616
/** Stop the shared mic-level monitor if one has been created. */
function stopMicLevelMonitor(): void {
  if (micLevel != null) {
    micLevel.stop();
  }
}
619
+
620
  async function teardown(): Promise<void> {
621
  try {
622
  await openai?.close();
 
625
  }
626
  openai = null;
627
 
628
+ stopWobbler();
629
+ stopMicLevelMonitor();
630
+
631
  if (openaiSink) {
632
  openaiSink.srcObject = null;
633
  openaiSink.remove();
src/openai-realtime.ts CHANGED
@@ -25,6 +25,7 @@ export type RealtimeStatus =
25
  | "connecting"
26
  | "connected"
27
  | "user-speaking"
 
28
  | "ai-speaking"
29
  | "closed"
30
  | "error";
@@ -187,19 +188,34 @@ export class OpenaiRealtimeClient {
187
  break;
188
 
189
  case "input_audio_buffer.speech_stopped":
 
 
 
190
  if (this.status === "user-speaking") {
191
- this.setStatus("connected");
 
 
 
 
 
 
 
 
192
  }
193
  break;
194
 
195
  case "response.created":
196
  case "response.output_item.added":
197
- this.setStatus("ai-speaking");
 
 
 
 
198
  break;
199
 
200
  case "response.done":
201
  case "response.cancelled":
202
- if (this.status === "ai-speaking") {
203
  this.setStatus("connected");
204
  }
205
  break;
 
25
  | "connecting"
26
  | "connected"
27
  | "user-speaking"
28
+ | "processing"
29
  | "ai-speaking"
30
  | "closed"
31
  | "error";
 
188
  break;
189
 
190
  case "input_audio_buffer.speech_stopped":
191
+ // User stopped talking; the model is computing its reply and will
192
+ // start audio shortly. We surface this as `processing` so the UI
193
+ // can show a "thinking" visual until audio actually arrives.
194
  if (this.status === "user-speaking") {
195
+ this.setStatus("processing");
196
+ }
197
+ break;
198
+
199
+ case "response.audio.delta":
200
+ case "response.output_audio.delta":
201
+ // First audio chunk from the assistant — switch out of processing.
202
+ if (this.status !== "ai-speaking") {
203
+ this.setStatus("ai-speaking");
204
  }
205
  break;
206
 
207
  case "response.created":
208
  case "response.output_item.added":
209
+ // No audio yet, just metadata; keep us in `processing` until the
210
+ // first audio delta lands.
211
+ if (this.status === "connected" || this.status === "user-speaking") {
212
+ this.setStatus("processing");
213
+ }
214
  break;
215
 
216
  case "response.done":
217
  case "response.cancelled":
218
+ if (this.status === "ai-speaking" || this.status === "processing") {
219
  this.setStatus("connected");
220
  }
221
  break;
src/style.css CHANGED
@@ -12,9 +12,14 @@
12
  --accent-2: #22d3ee;
13
  --listening: #22d3ee;
14
  --speaking: #8b7dff;
 
15
  --error: #ff6a75;
16
  --success: #34d399;
17
 
 
 
 
 
18
  --radius-sm: 8px;
19
  --radius-md: 14px;
20
  --radius-lg: 22px;
@@ -217,7 +222,9 @@ a:hover {
217
  text-align: center;
218
  }
219
 
220
- /* Pulse animation: used in "streaming" states */
 
 
221
  @keyframes pulse {
222
  0%, 100% {
223
  transform: scale(1);
@@ -229,6 +236,7 @@ a:hover {
229
  }
230
  }
231
 
 
232
  @keyframes pulse-fast {
233
  0%, 100% {
234
  transform: scale(1);
@@ -240,6 +248,7 @@ a:hover {
240
  }
241
  }
242
 
 
243
  @keyframes breathe {
244
  0%, 100% {
245
  transform: scale(1);
@@ -249,6 +258,71 @@ a:hover {
249
  }
250
  }
251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  /* ─── State-specific colors ──────────────────────────────────────────── */
253
 
254
  .circle.state-signed-out {
@@ -300,13 +374,40 @@ a:hover {
300
  opacity: 0.7;
301
  }
302
 
 
 
 
303
  .circle.state-user-speaking {
304
  --glow: var(--listening);
305
  --core-from: #0e2a36;
306
  --core-to: #091a22;
307
  }
308
  .circle.state-user-speaking .circle-glow {
309
- animation: pulse-fast 0.9s ease-in-out infinite;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  }
311
 
312
  .circle.state-ai-speaking {
@@ -317,6 +418,15 @@ a:hover {
317
  .circle.state-ai-speaking .circle-glow {
318
  animation: pulse-fast 1.05s ease-in-out infinite;
319
  }
 
 
 
 
 
 
 
 
 
320
 
321
  .circle.state-error {
322
  --glow: var(--error);
 
12
  --accent-2: #22d3ee;
13
  --listening: #22d3ee;
14
  --speaking: #8b7dff;
15
+ --processing: #f59e0b;
16
  --error: #ff6a75;
17
  --success: #34d399;
18
 
19
+ /* Smoothed mic RMS in [0..1], updated every frame from JS while a session
20
+ * is active. Used by audio-reactive circle states. */
21
+ --audio-level: 0;
22
+
23
  --radius-sm: 8px;
24
  --radius-md: 14px;
25
  --radius-lg: 22px;
 
222
  text-align: center;
223
  }
224
 
225
+ /* ─── Circle animation keyframes ──────────────────────────────────────── */
226
+
227
+ /* Generic medium pulse on the outer glow. */
228
  @keyframes pulse {
229
  0%, 100% {
230
  transform: scale(1);
 
236
  }
237
  }
238
 
239
+ /* Faster / stronger pulse used while speaking. */
240
  @keyframes pulse-fast {
241
  0%, 100% {
242
  transform: scale(1);
 
248
  }
249
  }
250
 
251
+ /* Slow, subtle breathing for "warm idle" states. */
252
  @keyframes breathe {
253
  0%, 100% {
254
  transform: scale(1);
 
258
  }
259
  }
260
 
261
+ /* Outer ring expanding and fading - conveys "I am producing audio". */
262
+ @keyframes ring-pulse-outer {
263
+ 0% {
264
+ transform: scale(0.95);
265
+ opacity: 0.5;
266
+ }
267
+ 100% {
268
+ transform: scale(1.25);
269
+ opacity: 0;
270
+ }
271
+ }
272
+
273
+ /* Soft inner scale for the core while talking. */
274
+ @keyframes core-breathe {
275
+ 0%, 100% {
276
+ transform: scale(1);
277
+ }
278
+ 50% {
279
+ transform: scale(1.03);
280
+ }
281
+ }
282
+
283
+ /* Opacity throb used for the "thinking" state - dimmer and more pensive
284
+ * than the speaking pulse. */
285
+ @keyframes thinking {
286
+ 0%, 100% {
287
+ opacity: 0.45;
288
+ transform: scale(0.98);
289
+ }
290
+ 50% {
291
+ opacity: 0.8;
292
+ transform: scale(1.02);
293
+ }
294
+ }
295
+
296
+ /* Slow spin for the processing ring accent. */
297
+ @keyframes ring-spin {
298
+ from { transform: rotate(0deg); }
299
+ to { transform: rotate(360deg); }
300
+ }
301
+
302
+ /* ─── Ring decorations (sit absolutely inside .circle) ────────────────── */
303
+
304
+ .circle::before,
305
+ .circle::after {
306
+ content: "";
307
+ position: absolute;
308
+ inset: 0;
309
+ border-radius: 50%;
310
+ pointer-events: none;
311
+ }
312
+
313
+ /* Inner thin ring - shows up in speaking states, hidden by default. */
314
+ .circle::before {
315
+ border: 1px solid var(--glow, var(--accent));
316
+ opacity: 0;
317
+ transition: opacity 0.25s;
318
+ }
319
+
320
+ /* Outer expanding ring - hidden by default, animated in speaking. */
321
+ .circle::after {
322
+ border: 2px solid var(--glow, var(--accent));
323
+ opacity: 0;
324
+ }
325
+
326
  /* ─── State-specific colors ──────────────────────────────────────────── */
327
 
328
  .circle.state-signed-out {
 
374
  opacity: 0.7;
375
  }
376
 
377
+ /* User is speaking — scale + opacity driven in real time by the mic RMS
378
+ * via `--audio-level`. The CSS doesn't need a keyframe: JS smoothly
379
+ * interpolates the value, and `transition` handles the visual smoothing. */
380
  .circle.state-user-speaking {
381
  --glow: var(--listening);
382
  --core-from: #0e2a36;
383
  --core-to: #091a22;
384
  }
385
  .circle.state-user-speaking .circle-glow {
386
+ animation: none;
387
+ opacity: calc(0.55 + 0.45 * var(--audio-level));
388
+ transform: scale(calc(1 + 0.18 * var(--audio-level)));
389
+ transition: transform 0.08s linear, opacity 0.08s linear;
390
+ }
391
+ .circle.state-user-speaking::before {
392
+ opacity: calc(0.25 + 0.5 * var(--audio-level));
393
+ transform: scale(calc(1.02 + 0.08 * var(--audio-level)));
394
+ transition: transform 0.08s linear, opacity 0.08s linear;
395
+ }
396
+
397
+ /* Assistant is computing its answer: warm dim pulse + slow spinning accent.
398
+ * No audio yet, so no reactive modulation. */
399
+ .circle.state-processing {
400
+ --glow: var(--processing);
401
+ --core-from: #3a2f12;
402
+ --core-to: #1e1a0c;
403
+ }
404
+ .circle.state-processing .circle-glow {
405
+ animation: thinking 1.6s ease-in-out infinite;
406
+ }
407
+ .circle.state-processing::before {
408
+ opacity: 0.4;
409
+ border-style: dashed;
410
+ animation: ring-spin 6s linear infinite;
411
  }
412
 
413
  .circle.state-ai-speaking {
 
418
  .circle.state-ai-speaking .circle-glow {
419
  animation: pulse-fast 1.05s ease-in-out infinite;
420
  }
421
+ .circle.state-ai-speaking .circle-core {
422
+ animation: core-breathe 1.6s ease-in-out infinite;
423
+ }
424
+ .circle.state-ai-speaking::before {
425
+ opacity: 0.5;
426
+ }
427
+ .circle.state-ai-speaking::after {
428
+ animation: ring-pulse-outer 1.6s ease-out infinite;
429
+ }
430
 
431
  .circle.state-error {
432
  --glow: var(--error);