apirrone committed
Commit f0177ee · 2 Parent(s): 6df93fa 4af767f

Merge branch 'develop' into 62-appify-the-demo

Files changed (30)
  1. .github/workflows/tests.yml +74 -0
  2. .github/workflows/typecheck.yml +29 -0
  3. .gitignore +1 -0
  4. README.md +16 -8
  5. docs/assets/conversation_app_arch.svg +3 -0
  6. {src/reachy_mini_conversation_demo/images → docs/assets}/reachy_mini_dance.gif +0 -0
  7. docs/scheme.mmd +58 -0
  8. pyproject.toml +25 -10
  9. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/__init__.py +0 -0
  10. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/__init__.py +0 -0
  11. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/head_wobbler.py +15 -13
  12. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/speech_tapper.py +11 -9
  13. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/camera_worker.py +25 -26
  14. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/config.py +15 -26
  15. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/console.py +22 -11
  16. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/dance_emotion_moves.py +21 -18
  17. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/reachymini_avatar.png +0 -0
  18. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/user_avatar.png +0 -0
  19. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/main.py +12 -11
  20. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/moves.py +29 -28
  21. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/openai_realtime.py +203 -83
  22. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/prompts.py +0 -0
  23. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/tools.py +36 -34
  24. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/utils.py +12 -10
  25. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/__init__.py +0 -0
  26. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/processors.py +17 -15
  27. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/yolo_head_tracker.py +15 -9
  28. tests/audio/test_head_wobbler.py +4 -3
  29. tests/test_openai_realtime.py +117 -0
  30. uv.lock +0 -0
.github/workflows/tests.yml ADDED
@@ -0,0 +1,74 @@
+ name: Tests
+ on:
+   push:
+   pull_request:
+
+ permissions:
+   contents: read
+   actions: write
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   tests:
+     name: pytest (py${{ matrix.python-version }})
+     runs-on: ubuntu-latest
+     timeout-minutes: 15
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.12"]
+
+     env:
+       HF_TOKEN: ${{ secrets.HF_TOKEN }}
+       HF_HUB_ETAG_TIMEOUT: "120"
+       HF_HUB_DOWNLOAD_TIMEOUT: "120"
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - uses: astral-sh/setup-uv@v5
+
+       - name: Set HF_HOME
+         shell: bash
+         run: |
+           echo "HF_HOME=${RUNNER_TEMP}/.hf" >> "$GITHUB_ENV"
+           mkdir -p "${RUNNER_TEMP}/.hf"
+
+       - name: Cache Hugging Face hub
+         uses: actions/cache@v4
+         with:
+           path: ${{ runner.temp }}/.hf
+           key: hf-${{ runner.os }}-${{ hashFiles('uv.lock', 'pyproject.toml') }}
+           restore-keys: hf-${{ runner.os }}-
+
+       # test-only .env file
+       - name: Create test .env
+         run: |
+           printf "OPENAI_API_KEY=test-dummy\n" > .env
+
+       - name: Install (locked)
+         run: |
+           uv sync --frozen --group dev --extra all_vision
+
+       # Prefetch HF dataset to avoid download during test collection
+       - name: Prefetch HF dataset
+         run: |
+           uv run python - <<'PY'
+           from huggingface_hub import snapshot_download
+           snapshot_download(
+               repo_id="pollen-robotics/reachy-mini-emotions-library",
+               repo_type="dataset",
+               etag_timeout=120,
+               max_workers=4
+           )
+           PY
+
+       - name: Run tests
+         run: uv run pytest -q
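
The prefetch step warms the Hugging Face cache so test collection never blocks on a download. The same call can be run locally before `pytest`; a minimal standalone sketch mirroring the workflow step (the `HF_HUB_*` timeouts must be set before `huggingface_hub` is imported, since the library reads them at import time):

```python
import os

# Mirror the workflow's timeout settings before importing huggingface_hub.
os.environ.setdefault("HF_HUB_ETAG_TIMEOUT", "120")
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "120")

from huggingface_hub import snapshot_download

# Same dataset snapshot the CI job prefetches.
snapshot_download(
    repo_id="pollen-robotics/reachy-mini-emotions-library",
    repo_type="dataset",
    etag_timeout=120,
    max_workers=4,
)
```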
.github/workflows/typecheck.yml ADDED
@@ -0,0 +1,29 @@
+ name: Type check
+
+ on: [push, pull_request]
+
+ permissions:
+   contents: read
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   mypy:
+     runs-on: ubuntu-latest
+     timeout-minutes: 10
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "3.12"
+
+       - uses: astral-sh/setup-uv@v5
+
+       - name: Install deps (locked) incl. vision extras
+         run: uv sync --frozen --group dev --extra all_vision
+
+       - name: Run mypy
+         run: uv run mypy --pretty --show-error-codes .
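
The same check runs locally with `uv run mypy --pretty --show-error-codes .`; for scripting, mypy also exposes a Python API. A minimal sketch using `mypy.api`:

```python
# Minimal sketch: invoke the workflow's mypy check through mypy's Python API.
from mypy import api

stdout, stderr, exit_status = api.run(["--pretty", "--show-error-codes", "."])
if stdout:
    print(stdout)
if stderr:
    print(stderr)
print(f"mypy exited with status {exit_status}")
```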
.gitignore CHANGED
@@ -29,6 +29,7 @@ coverage.xml
 
  # Linting and formatting
  .ruff_cache/
+ .mypy_cache/
 
  # IDE
  .vscode/
README.md CHANGED
@@ -9,11 +9,19 @@ tags:
  - reachy_mini
  ---
 
- # Reachy Mini conversation demo
+ # Reachy Mini conversation app
 
- Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
+ Conversational app for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
 
- ![Reachy Mini Dance](src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif)
+ ![Reachy Mini Dance](docs/assets/reachy_mini_dance.gif)
+
+ ## Architecture
+
+ The app follows a layered architecture connecting the user, AI services, and robot hardware:
+
+ <p align="center">
+   <img src="docs/assets/conversation_app_arch.svg" alt="Architecture Diagram" width="600"/>
+ </p>
 
  ## Overview
  - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
@@ -94,12 +102,12 @@ Some wheels (e.g. PyTorch) are large and require compatible CUDA or CPU builds
  | `HF_TOKEN` | Optional token for Hugging Face models (only used with `--local-vision` flag, falls back to `huggingface-cli login`).
  | `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (only used with `--local-vision` flag, defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`).
 
- ## Running the demo
+ ## Running the app
 
  Activate your virtual environment, ensure the Reachy Mini robot (or simulator) is reachable, then launch:
 
  ```bash
- reachy-mini-conversation-demo
+ reachy-mini-conversation-app
  ```
 
  By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). With a camera attached, vision is handled by the gpt-realtime model when the camera tool is used. For local vision processing, use the `--local-vision` flag to process frames periodically using the SmolVLM2 model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed.
@@ -119,19 +127,19 @@ By default, the app runs in console mode for direct audio interaction. Use the `
  - Run on hardware with MediaPipe face tracking:
 
  ```bash
- reachy-mini-conversation-demo --head-tracker mediapipe
+ reachy-mini-conversation-app --head-tracker mediapipe
  ```
 
  - Run with local vision processing (requires `local_vision` extra):
 
  ```bash
- reachy-mini-conversation-demo --local-vision
+ reachy-mini-conversation-app --local-vision
  ```
 
  - Disable the camera pipeline (audio-only conversation):
 
  ```bash
- reachy-mini-conversation-demo --no-camera
+ reachy-mini-conversation-app --no-camera
  ```
 
  ## LLM tools exposed to the assistant
docs/assets/conversation_app_arch.svg ADDED

Git LFS Details

  • SHA256: 2d3251bc98d5a0bf1d41d0332b76e7e86496745b2a0999f228b7d8647dd453a2
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
{src/reachy_mini_conversation_demo/images → docs/assets}/reachy_mini_dance.gif RENAMED
File without changes
docs/scheme.mmd ADDED
@@ -0,0 +1,58 @@
+ ---
+ config:
+   layout: dagre
+   flowchart:
+     htmlLabels: true
+ ---
+ flowchart TB
+     User(["<span style='font-size:16px;font-weight:bold;'>User</span><br><span style='font-size:13px;color:#01579b;'>Person interacting with system</span>"])
+     -- audio stream -->
+     UI@{ label: "<span style='font-size:16px;font-weight:bold;'>UI Layer</span><br><span style='font-size:13px;color:#0277bd;'>Gradio/Console</span>" }
+
+     UI -- audio stream -->
+     OpenAI@{ label: "<span style='font-size:17px;font-weight:bold;'>gpt-realtime API</span><br><span style='font-size:13px; color:#7b1fa2;'>Audio+Tool Calls+Vision</span>" }
+
+     OpenAI -- audio stream -->
+     Motion@{ label: "<span style='font-size:16px;font-weight:bold;'>Motion Control</span><br><span style='font-size:13px;color:#f57f17;'>Audio Sync + Tracking</span>" }
+
+     OpenAI -- tool calls -->
+     Handlers@{ label: "<span style='font-size:16px;font-weight:bold;'>Tool Handlers</span><br><span style='font-size:12px;color:#f9a825;'>move_head, camera, head_tracking,<br/>dance, play_emotion, do_nothing</span>" }
+
+     Handlers -- movement
+     requests --> Motion
+
+     Handlers -- camera frames, face tracking -->
+     Camera@{ label: "<span style='font-size:16px;font-weight:bold;'>Camera Worker</span><br><span style='font-size:13px;color:#f57f17;'>Frame Buffer + Face Tracking</span>" }
+
+     Handlers -. image for
+     analysis .-> OpenAI
+
+     Camera -- face tracking --> Motion
+
+     Camera -. frames .->
+     Vision@{ label: "<span style='font-size:16px;font-weight:bold;'>Vision Processor</span><br><span style='font-size:13px;color:#7b1fa2;'>Local VLM (optional)</span>" }
+
+     Vision -. description .-> Handlers
+
+     Robot@{ label: "<span style='font-size:16px;font-weight:bold;'>reachy_mini</span><br><span style='font-size:13px;color:#c62828;'>Robot Control Library</span>" }
+     -- camera
+     frames --> Camera
+
+     Motion -- commands --> Robot
+
+     Handlers -- results --> OpenAI
+
+     User:::userStyle
+     UI:::uiStyle
+     OpenAI:::aiStyle
+     Motion:::coreStyle
+     Handlers:::toolStyle
+     Camera:::coreStyle
+     Vision:::aiStyle
+     Robot:::hardwareStyle
+     classDef userStyle fill:#e1f5fe,stroke:#01579b,stroke-width:3px
+     classDef uiStyle fill:#b3e5fc,stroke:#0277bd,stroke-width:2px
+     classDef aiStyle fill:#e1bee7,stroke:#7b1fa2,stroke-width:3px
+     classDef coreStyle fill:#fff9c4,stroke:#f57f17,stroke-width:2px
+     classDef hardwareStyle fill:#ef9a9a,stroke:#c62828,stroke-width:3px
+     classDef toolStyle fill:#fffde7,stroke:#f9a825,stroke-width:1px
pyproject.toml CHANGED
@@ -3,7 +3,7 @@ requires = ["setuptools"]
  build-backend = "setuptools.build_meta"
 
  [project]
- name = "reachy_mini_conversation_demo"
+ name = "reachy_mini_conversation_app"
  version = "0.1.0"
  authors = [{ name = "Pollen Robotics", email = "contact@pollen-robotics.com" }]
  description = ""
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
  dependencies = [
      #Media
      "aiortc>=1.13.0",
-     "fastrtc@git+ssh://git@github.com/gradio-app/fastrtc.git@main",
+     "fastrtc>=0.0.33",
      "gradio>=5.49.0",
      "huggingface_hub>=0.34.4",
      "opencv-python>=4.12.0.88",
@@ -23,7 +23,7 @@
      #OpenAI
      "openai>=2.1",
 
-     #Reachy mini
+     #Reachy mini
      "reachy_mini_dances_library",
      "reachy_mini_toolbox",
      "reachy_mini>=1.0.0.rc5",
@@ -34,16 +34,23 @@ local_vision = ["torch", "transformers", "num2words"]
  yolo_vision = ["ultralytics", "supervision"]
  mediapipe_vision = ["mediapipe>=0.10.14"]
  all_vision = [
-     "reachy_mini_conversation_demo[local_vision]",
-     "reachy_mini_conversation_demo[yolo_vision]",
-     "reachy_mini_conversation_demo[mediapipe_vision]",
+     "torch", "transformers", "num2words",
+     "ultralytics", "supervision",
+     "mediapipe>=0.10.14",
  ]
 
  [dependency-groups]
- dev = ["pytest", "ruff==0.12.0"]
+ dev = [
+     "pytest",
+     "pytest-asyncio",
+     "ruff==0.12.0",
+     "mypy==1.18.2",
+     "pre-commit",
+     "types-requests",
+ ]
 
  [project.scripts]
- reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
+ reachy-mini-conversation-app = "reachy_mini_conversation_app.main:main"
 
  [project.entry-points."reachy_mini_apps"]
  reachy_mini_conversation_demo_app = "reachy_mini_conversation_demo.main:ReachyMiniConversationDemo"
@@ -56,7 +63,7 @@ include-package-data = true
  where = ["src"]
 
  [tool.setuptools.package-data]
- reachy_mini_conversation_demo = ["images/*"]
+ reachy_mini_conversation_app = ["images/*"]
 
  [tool.ruff]
  line-length = 119
@@ -82,7 +89,7 @@ ignore = [
  length-sort = true
  lines-after-imports = 2
  no-lines-before = ["standard-library", "local-folder"]
- known-local-folder = ["reachy_mini_conversation_demo"]
+ known-local-folder = ["reachy_mini_conversation_app"]
  known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
  split-on-trailing-comma = true
 
@@ -91,3 +98,11 @@ quote-style = "double"
  indent-style = "space"
  skip-magic-trailing-comma = false
  line-ending = "auto"
+
+ [tool.mypy]
+ python_version = "3.12"
+ files = ["src/"]
+ ignore_missing_imports = true
+ strict = true
+ show_error_codes = true
+ warn_unused_ignores = true
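
Note that `all_vision` now lists its packages directly instead of self-referencing extras, so the extra keeps working across the `demo` to `app` rename. To see which optional vision back-ends are importable in a given environment, a small diagnostic sketch (module names are taken from the extras above):

```python
# Report which optional vision back-ends from pyproject.toml are importable.
import importlib.util

VISION_BACKENDS = {
    "local_vision": ["torch", "transformers", "num2words"],
    "yolo_vision": ["ultralytics", "supervision"],
    "mediapipe_vision": ["mediapipe"],
}

for extra, modules in VISION_BACKENDS.items():
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    print(f"{extra}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")
```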
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/__init__.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/__init__.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/head_wobbler.py RENAMED
@@ -5,11 +5,13 @@ import queue
  import base64
  import logging
  import threading
- from typing import Tuple, Optional
+ from typing import Tuple
+ from collections.abc import Callable
 
  import numpy as np
+ from numpy.typing import NDArray
 
- from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
+ from reachy_mini_conversation_app.audio.speech_tapper import HOP_MS, SwayRollRT
 
 
  SAMPLE_RATE = 24000
@@ -20,13 +22,13 @@ logger = logging.getLogger(__name__)
  class HeadWobbler:
      """Converts audio deltas (base64) into head movement offsets."""
 
-     def __init__(self, set_speech_offsets):
+     def __init__(self, set_speech_offsets: Callable[[Tuple[float, float, float, float, float, float]], None]) -> None:
          """Initialize the head wobbler."""
          self._apply_offsets = set_speech_offsets
-         self._base_ts: Optional[float] = None
+         self._base_ts: float | None = None
          self._hops_done: int = 0
 
-         self.audio_queue: queue.Queue[Tuple[int, int, np.ndarray]] = queue.Queue()
+         self.audio_queue: "queue.Queue[Tuple[int, int, NDArray[np.int16]]]" = queue.Queue()
          self.sway = SwayRollRT()
 
          # Synchronization primitives
@@ -35,7 +37,7 @@ class HeadWobbler:
          self._generation = 0
 
          self._stop_event = threading.Event()
-         self._thread: Optional[threading.Thread] = None
+         self._thread: threading.Thread | None = None
 
      def feed(self, delta_b64: str) -> None:
          """Thread-safe: push audio into the consumer queue."""
@@ -78,14 +80,14 @@
          if chunk_generation != current_generation:
              continue
 
-         pcm = np.asarray(chunk).squeeze(0)
-         with self._sway_lock:
-             results = self.sway.feed(pcm, sr)
-
          if self._base_ts is None:
              with self._state_lock:
                  if self._base_ts is None:
-                     self._base_ts = time.time()
+                     self._base_ts = time.monotonic()
+
+         pcm = np.asarray(chunk).squeeze(0)
+         with self._sway_lock:
+             results = self.sway.feed(pcm, sr)
 
          i = 0
          while i < len(results):
@@ -96,14 +98,14 @@
              hops_done = self._hops_done
 
              if base_ts is None:
-                 base_ts = time.time()
+                 base_ts = time.monotonic()
                  with self._state_lock:
                      if self._base_ts is None:
                          self._base_ts = base_ts
                      hops_done = self._hops_done
 
              target = base_ts + MOVEMENT_LATENCY_S + hops_done * hop_dt
-             now = time.time()
+             now = time.monotonic()
 
              if now - target >= hop_dt:
                  lag_hops = int((now - target) / hop_dt)
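
The switch from `time.time()` to `time.monotonic()` matters here: the hop scheduler computes absolute deadlines, and a monotonic clock cannot jump backwards when the wall clock is adjusted (NTP, DST). A minimal sketch of the pacing pattern, assuming a 20 ms hop for illustration:

```python
# Sketch of monotonic hop pacing: deadlines derive from a fixed base
# timestamp, so lag can be measured and skipped hops counted reliably.
import time

HOP_DT = 0.020  # 20 ms per hop (illustrative value)

base_ts = time.monotonic()
hops_done = 0

for _ in range(5):
    target = base_ts + hops_done * HOP_DT  # absolute deadline for this hop
    delay = target - time.monotonic()
    if delay > 0:
        time.sleep(delay)  # wait until the scheduled hop time
    hops_done += 1

print(f"processed {hops_done} hops in {time.monotonic() - base_ts:.3f}s")
```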
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/speech_tapper.py RENAMED
@@ -1,10 +1,11 @@
  from __future__ import annotations
  import math
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List
  from itertools import islice
  from collections import deque
 
  import numpy as np
+ from numpy.typing import NDArray
 
 
  # Tunables
@@ -48,7 +49,7 @@ SWAY_ATTACK_FR = max(1, int(SWAY_ATTACK_MS / HOP_MS))
  SWAY_RELEASE_FR = max(1, int(SWAY_RELEASE_MS / HOP_MS))
 
 
- def _rms_dbfs(x: np.ndarray) -> float:
+ def _rms_dbfs(x: NDArray[np.float32]) -> float:
      """Root-mean-square in dBFS for float32 mono array in [-1,1]."""
      # numerically stable rms (avoid overflow)
      x = x.astype(np.float32, copy=False)
@@ -66,7 +67,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
      return t**LOUDNESS_GAMMA if LOUDNESS_GAMMA != 1.0 else t
 
 
- def _to_float32_mono(x: np.ndarray) -> np.ndarray:
+ def _to_float32_mono(x: NDArray[Any]) -> NDArray[np.float32]:
      """Convert arbitrary PCM array to float32 mono in [-1,1].
 
      Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
@@ -94,7 +95,7 @@ def _to_float32_mono(x: np.ndarray) -> np.ndarray:
      return a.astype(np.float32) / (scale if scale != 0.0 else 1.0)
 
 
- def _resample_linear(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
+ def _resample_linear(x: NDArray[np.float32], sr_in: int, sr_out: int) -> NDArray[np.float32]:
      """Lightweight linear resampler for short buffers."""
      if sr_in == sr_out or x.size == 0:
          return x
@@ -118,8 +119,8 @@ class SwayRollRT:
      def __init__(self, rng_seed: int = 7):
          """Initialize state."""
          self._seed = int(rng_seed)
-         self.samples = deque(maxlen=10 * SR)  # sliding window for VAD/env
-         self.carry = np.zeros(0, dtype=np.float32)
+         self.samples: deque[float] = deque(maxlen=10 * SR)  # sliding window for VAD/env
+         self.carry: NDArray[np.float32] = np.zeros(0, dtype=np.float32)
 
          self.vad_on = False
          self.vad_above = 0
@@ -150,7 +151,7 @@
          self.sway_down = 0
          self.t = 0.0
 
-     def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
+     def feed(self, pcm: NDArray[Any], sr: int | None) -> List[Dict[str, float]]:
          """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
 
          Args:
@@ -177,7 +178,8 @@
          while self.carry.size >= HOP:
              hop = self.carry[:HOP]
-             self.carry = self.carry[HOP:]
+             remaining: NDArray[np.float32] = self.carry[HOP:]
+             self.carry = remaining
 
              # keep sliding window for VAD/env computation
              # (deque accepts any iterable; list() for small HOP is fine)
@@ -260,7 +262,7 @@
                      "x_mm": x_mm,
                      "y_mm": y_mm,
                      "z_mm": z_mm,
-                 }
+                 },
              )
 
      return out
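
For reference, the dBFS scale used by `_rms_dbfs` maps a full-scale signal (amplitude 1.0) to 0 dBFS and amplitude 0.1 to -20 dBFS. A standalone sketch of the computation, not the module's exact implementation:

```python
# RMS level in dBFS for float32 mono audio in [-1, 1]; the epsilon guards
# against log10(0) on silent input.
import numpy as np
from numpy.typing import NDArray

def rms_dbfs(x: NDArray[np.float32]) -> float:
    rms = float(np.sqrt(np.mean(np.square(x, dtype=np.float64))))
    return float(20.0 * np.log10(max(rms, 1e-12)))

print(rms_dbfs(np.ones(480, dtype=np.float32)))        # ~0.0 dBFS
print(rms_dbfs(0.1 * np.ones(480, dtype=np.float32)))  # ~-20.0 dBFS
```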
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/camera_worker.py RENAMED
@@ -9,10 +9,10 @@ Ported from main_works.py camera_worker() function to provide:
  import time
  import logging
  import threading
- from typing import Tuple, Optional
+ from typing import Any, List, Tuple
 
- import cv2
  import numpy as np
+ from numpy.typing import NDArray
  from scipy.spatial.transform import Rotation as R
 
  from reachy_mini import ReachyMini
@@ -25,20 +25,20 @@ logger = logging.getLogger(__name__)
  class CameraWorker:
      """Thread-safe camera worker with frame buffering and face tracking."""
 
-     def __init__(self, reachy_mini: ReachyMini, head_tracker=None):
+     def __init__(self, reachy_mini: ReachyMini, head_tracker: Any = None) -> None:
          """Initialize."""
          self.reachy_mini = reachy_mini
          self.head_tracker = head_tracker
 
          # Thread-safe frame storage
-         self.latest_frame: Optional[np.ndarray] = None
+         self.latest_frame: NDArray[np.uint8] | None = None
          self.frame_lock = threading.Lock()
          self._stop_event = threading.Event()
-         self._thread: Optional[threading.Thread] = None
+         self._thread: threading.Thread | None = None
 
          # Face tracking state
          self.is_head_tracking_enabled = True
-         self.face_tracking_offsets = [
+         self.face_tracking_offsets: List[float] = [
              0.0,
              0.0,
              0.0,
@@ -49,31 +49,30 @@
          self.face_tracking_lock = threading.Lock()
 
          # Face tracking timing variables (same as main_works.py)
-         self.last_face_detected_time: Optional[float] = None
-         self.interpolation_start_time: Optional[float] = None
-         self.interpolation_start_pose: Optional[np.ndarray] = None
+         self.last_face_detected_time: float | None = None
+         self.interpolation_start_time: float | None = None
+         self.interpolation_start_pose: NDArray[np.float32] | None = None
          self.face_lost_delay = 2.0  # seconds to wait before starting interpolation
          self.interpolation_duration = 1.0  # seconds to interpolate back to neutral
 
          # Track state changes
          self.previous_head_tracking_state = self.is_head_tracking_enabled
 
-     def get_latest_frame(self) -> Optional[np.ndarray]:
+     def get_latest_frame(self) -> NDArray[np.uint8] | None:
          """Get the latest frame (thread-safe)."""
          with self.frame_lock:
              if self.latest_frame is None:
                  return None
-             else:
-                 frame = self.latest_frame.copy()
-                 frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                 return frame
+             # Return a copy in original BGR format (OpenCV native)
+             return self.latest_frame.copy()
 
      def get_face_tracking_offsets(
          self,
      ) -> Tuple[float, float, float, float, float, float]:
          """Get current face tracking offsets (thread-safe)."""
          with self.face_tracking_lock:
-             return tuple(self.face_tracking_offsets)
+             offsets = self.face_tracking_offsets
+             return (offsets[0], offsets[1], offsets[2], offsets[3], offsets[4], offsets[5])
 
      def set_head_tracking_enabled(self, enabled: bool) -> None:
          """Enable/disable head tracking."""
@@ -168,12 +167,11 @@
                  rotation[2],  # roll, pitch, yaw
              ]
 
-         else:
-             # No face detected while tracking enabled - set face lost timestamp
-             if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
-                 # Only update if we haven't already set a face lost time
-                 # (current_time check prevents overriding the disable-triggered timestamp)
-                 pass
+         # No face detected while tracking enabled - set face lost timestamp
+         elif self.last_face_detected_time is None or self.last_face_detected_time == current_time:
+             # Only update if we haven't already set a face lost time
+             # (current_time check prevents overriding the disable-triggered timestamp)
+             pass
 
          # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
          if self.last_face_detected_time is not None:
@@ -188,11 +186,12 @@
              current_translation = self.face_tracking_offsets[:3]
              current_rotation_euler = self.face_tracking_offsets[3:]
              # Convert to 4x4 pose matrix
-             self.interpolation_start_pose = np.eye(4)
-             self.interpolation_start_pose[:3, 3] = current_translation
-             self.interpolation_start_pose[:3, :3] = R.from_euler(
-                 "xyz", current_rotation_euler
+             pose_matrix = np.eye(4, dtype=np.float32)
+             pose_matrix[:3, 3] = current_translation
+             pose_matrix[:3, :3] = R.from_euler(
+                 "xyz", current_rotation_euler,
              ).as_matrix()
+             self.interpolation_start_pose = pose_matrix
 
              # Calculate interpolation progress (t from 0 to 1)
              elapsed_interpolation = current_time - self.interpolation_start_time
@@ -200,7 +199,7 @@
              # Interpolate between current pose and neutral pose
              interpolated_pose = linear_pose_interpolation(
-                 self.interpolation_start_pose, neutral_pose, t
+                 self.interpolation_start_pose, neutral_pose, t,
              )
 
              # Extract translation and rotation from interpolated pose
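
The interpolation-start pose built above packs the tracked offsets `(x, y, z, roll, pitch, yaw)` into a homogeneous 4x4 matrix. A standalone sketch of that construction (the offset values are made up for illustration):

```python
# Build a 4x4 pose matrix from translation + intrinsic xyz Euler angles,
# as done when face tracking starts interpolating back to neutral.
import numpy as np
from scipy.spatial.transform import Rotation as R

offsets = (0.01, 0.0, 0.02, 0.0, 0.1, -0.05)  # (x, y, z, roll, pitch, yaw)

pose = np.eye(4, dtype=np.float32)
pose[:3, 3] = offsets[:3]                                    # translation
pose[:3, :3] = R.from_euler("xyz", offsets[3:]).as_matrix()  # rotation

print(pose)
```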
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/config.py RENAMED
@@ -1,44 +1,33 @@
  import os
  import logging
- from pathlib import Path
 
- from dotenv import load_dotenv
+ from dotenv import find_dotenv, load_dotenv
 
 
  logger = logging.getLogger(__name__)
 
- # Check if .env file exists
- # TODO Antoine - disabled this for testing appifying
- # env_file = Path(".env")
- # if not env_file.exists():
- #     raise RuntimeError(
- #         ".env file not found. Please create one based on .env.example:\n"
- #         "  cp .env.example .env\n"
- #         "Then add your OPENAI_API_KEY to the .env file."
- #     )
-
- # Load .env and verify it was loaded successfully
- # if not load_dotenv():
- #     raise RuntimeError(
- #         "Failed to load .env file. Please ensure the file is readable and properly formatted."
- #     )
-
- logger.info("Configuration loaded from .env file")
+ # Locate .env file (search upward from current working directory)
+ dotenv_path = find_dotenv(usecwd=True)
+
+ if dotenv_path:
+     # Load .env and override environment variables
+     load_dotenv(dotenv_path=dotenv_path, override=True)
+     logger.info(f"Configuration loaded from {dotenv_path}")
+ else:
+     logger.warning("No .env file found, using environment variables")
 
 
  class Config:
-     """Configuration class for the conversation demo."""
+     """Configuration class for the conversation app."""
 
      # Required
      OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-     if OPENAI_API_KEY is None:
-         raise RuntimeError(
-             "OPENAI_API_KEY is not set in .env file. Please add it:\n"
-             "  OPENAI_API_KEY=your_api_key_here"
-         )
-     if not OPENAI_API_KEY.strip():
+     if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
          raise RuntimeError(
-             "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
+             "OPENAI_API_KEY is missing or empty.\n"
+             "Either:\n"
+             "  1. Create a .env file with: OPENAI_API_KEY=your_api_key_here (recommended)\n"
+             "  2. Set environment variable: export OPENAI_API_KEY=your_api_key_here"
          )
 
      # Optional
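
A condensed, standalone sketch of the new configuration flow (requires `python-dotenv`): `find_dotenv(usecwd=True)` searches upward from the current working directory, so the app no longer insists on a `.env` file sitting next to the entry point.

```python
# Sketch of the .env discovery pattern adopted above.
import os
import logging

from dotenv import find_dotenv, load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dotenv_path = find_dotenv(usecwd=True)  # walk up from the working directory
if dotenv_path:
    load_dotenv(dotenv_path=dotenv_path, override=True)
    logger.info("Configuration loaded from %s", dotenv_path)
else:
    logger.warning("No .env file found, using environment variables")

if not (os.getenv("OPENAI_API_KEY") or "").strip():
    raise RuntimeError("OPENAI_API_KEY is missing or empty")
```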
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/console.py RENAMED
@@ -3,14 +3,16 @@
  records mic frames to the handler and plays handler audio frames to the speaker.
  """
 
+ import time
  import asyncio
  import logging
+ from typing import List
 
- import librosa
  from fastrtc import AdditionalOutputs, audio_to_int16, audio_to_float32
+ from librosa import resample
 
  from reachy_mini import ReachyMini
- from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+ from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
 
 
  logger = logging.getLogger(__name__)
@@ -24,15 +26,21 @@
          self.handler = handler
          self._robot = robot
          self._stop_event = asyncio.Event()
-         self._tasks = []
+         self._tasks: List[asyncio.Task[None]] = []
          # Allow the handler to flush the player queue when appropriate.
-         self.handler._clear_queue = self.clear_audio_queue  # type: ignore[assignment]
+         self.handler._clear_queue = self.clear_audio_queue
+
+         # Hack to avoid the first lengthy call to resample at runtime.
+         # This is likely caused by cache initialization overhead.
+         import numpy as np
+         resample(np.array([0.0]), orig_sr=1, target_sr=1)
 
      def launch(self) -> None:
          """Start the recorder/player and run the async processing loops."""
          self._stop_event.clear()
          self._robot.media.start_recording()
          self._robot.media.start_playing()
+         time.sleep(1)  # give some time to the pipelines to start
 
          async def runner() -> None:
              self._tasks = [
@@ -83,9 +91,8 @@
              frame_mono = audio_frame.T[0]  # both channels are identical
              frame = audio_to_int16(frame_mono)
              await self.handler.receive((16000, frame))
-             # await asyncio.sleep(0)  # yield to event loop
-         else:
-             await asyncio.sleep(0.01)  # avoid busy loop
+
+         await asyncio.sleep(0.01)  # avoid busy loop
 
      async def play_loop(self) -> None:
          """Fetch outputs from the handler: log text and play audio frames."""
@@ -105,12 +112,16 @@
          elif isinstance(handler_output, tuple):
              input_sample_rate, audio_frame = handler_output
              device_sample_rate = self._robot.media.get_audio_samplerate()
-             audio_frame = audio_to_float32(audio_frame.squeeze())
+             audio_frame_float = audio_to_float32(audio_frame.squeeze())
+
              if input_sample_rate != device_sample_rate:
-                 audio_frame = librosa.resample(
-                     audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
+                 audio_frame_float = resample(
+                     audio_frame_float,
+                     orig_sr=input_sample_rate,
+                     target_sr=device_sample_rate,
                  )
-             self._robot.media.push_audio_sample(audio_frame)
+
+             self._robot.media.push_audio_sample(audio_frame_float)
 
          else:
              logger.debug("Ignoring output type=%s", type(handler_output).__name__)
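
The warm-up call added to `__init__` exists because the first `librosa` resample pays one-time setup cost; later calls are much faster. A small timing sketch that makes the effect visible (absolute numbers vary by machine):

```python
# Time a cold vs. warm librosa resample call.
import time

import numpy as np
from librosa import resample

for label in ("first (cold)", "second (warm)"):
    start = time.perf_counter()
    resample(np.zeros(16000, dtype=np.float32), orig_sr=16000, target_sr=24000)
    print(f"{label}: {time.perf_counter() - start:.3f}s")
```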
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/dance_emotion_moves.py RENAMED
@@ -9,6 +9,7 @@ import logging
  from typing import Tuple
 
  import numpy as np
+ from numpy.typing import NDArray
 
  from reachy_mini.motion.move import Move
  from reachy_mini.motion.recorded_move import RecordedMoves
@@ -18,7 +19,7 @@ from reachy_mini_dances_library.dance_move import DanceMove
  logger = logging.getLogger(__name__)
 
 
- class DanceQueueMove(Move):
+ class DanceQueueMove(Move):  # type: ignore
      """Wrapper for dance moves to work with the movement queue system."""
 
      def __init__(self, move_name: str):
@@ -29,9 +30,9 @@
      @property
      def duration(self) -> float:
          """Duration property required by official Move interface."""
-         return self.dance_move.duration
+         return float(self.dance_move.duration)
 
-     def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+     def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
          """Evaluate dance move at time t."""
          try:
              # Get the pose from the dance move
@@ -49,10 +50,10 @@
              from reachy_mini.utils import create_head_pose
 
              neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-             return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+             return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
- class EmotionQueueMove(Move):
+ class EmotionQueueMove(Move):  # type: ignore
      """Wrapper for emotion moves to work with the movement queue system."""
 
      def __init__(self, emotion_name: str, recorded_moves: RecordedMoves):
@@ -63,9 +64,9 @@
      @property
      def duration(self) -> float:
          """Duration property required by official Move interface."""
-         return self.emotion_move.duration
+         return float(self.emotion_move.duration)
 
-     def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+     def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
          """Evaluate emotion move at time t."""
          try:
              # Get the pose from the emotion move
@@ -83,20 +84,20 @@
              from reachy_mini.utils import create_head_pose
 
              neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-             return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+             return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
- class GotoQueueMove(Move):
+ class GotoQueueMove(Move):  # type: ignore
      """Wrapper for goto moves to work with the movement queue system."""
 
      def __init__(
          self,
-         target_head_pose: np.ndarray,
-         start_head_pose: np.ndarray = None,
+         target_head_pose: NDArray[np.float32],
+         start_head_pose: NDArray[np.float32] | None = None,
          target_antennas: Tuple[float, float] = (0, 0),
-         start_antennas: Tuple[float, float] = None,
+         start_antennas: Tuple[float, float] | None = None,
          target_body_yaw: float = 0,
-         start_body_yaw: float = None,
+         start_body_yaw: float | None = None,
          duration: float = 1.0,
      ):
          """Initialize a GotoQueueMove."""
@@ -113,7 +114,7 @@
          """Duration property required by official Move interface."""
          return self._duration
 
-     def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+     def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
          """Evaluate goto move at time t using linear interpolation."""
          try:
              from reachy_mini.utils import create_head_pose
@@ -136,7 +137,8 @@
                  [
                      self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
                      self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
-                 ]
+                 ],
+                 dtype=np.float64,
              )
 
              # Interpolate body yaw
@@ -146,6 +148,7 @@
          except Exception as e:
              logger.error(f"Error evaluating goto move at t={t}: {e}")
-             # Return target pose on error - convert antennas to numpy array
-             target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
-             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
+             # Return target pose on error - convert to float64
+             target_head_pose_f64 = self.target_head_pose.astype(np.float64)
+             target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]], dtype=np.float64)
+             return (target_head_pose_f64, target_antennas_array, self.target_body_yaw)
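
All three wrappers implement the same `Move` contract: a `duration` property plus `evaluate(t)` returning `(head_pose, antennas, body_yaw)`. The antenna interpolation in `GotoQueueMove` is plain linear blending with `t` clamped to [0, 1]; a minimal sketch:

```python
# Linear antenna interpolation as used by GotoQueueMove, with clamping.
import numpy as np

def lerp_antennas(start: tuple, target: tuple, t: float) -> np.ndarray:
    t = min(max(t, 0.0), 1.0)
    return np.array(
        [
            start[0] + (target[0] - start[0]) * t,
            start[1] + (target[1] - start[1]) * t,
        ],
        dtype=np.float64,
    )

print(lerp_antennas((0.0, 0.0), (0.3, -0.3), 0.5))  # -> [ 0.15 -0.15]
```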
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/reachymini_avatar.png RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/user_avatar.png RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/main.py RENAMED
@@ -1,28 +1,29 @@
- """Entrypoint for the Reachy Mini conversation demo."""
+ """Entrypoint for the Reachy Mini conversation app."""
 
  import os
  import sys
  import time
  import threading
+ from typing import Any, Dict, List
 
  import gradio as gr
  from fastapi import FastAPI
  from fastrtc import Stream
 
  from reachy_mini import ReachyMini, ReachyMiniApp
- from reachy_mini_conversation_demo.moves import MovementManager
- from reachy_mini_conversation_demo.tools import ToolDependencies
- from reachy_mini_conversation_demo.utils import (
+ from reachy_mini_conversation_app.moves import MovementManager
+ from reachy_mini_conversation_app.tools import ToolDependencies
+ from reachy_mini_conversation_app.utils import (
      parse_args,
      setup_logger,
      handle_vision_stuff,
  )
- from reachy_mini_conversation_demo.console import LocalStream
- from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
- from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+ from reachy_mini_conversation_app.console import LocalStream
+ from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
+ from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler
 
 
- def update_chatbot(chatbot: list[dict], response: dict):
+ def update_chatbot(chatbot: List[Dict[str, Any]], response: Dict[str, Any]) -> List[Dict[str, Any]]:
      """Update the chatbot with AdditionalOutputs."""
      chatbot.append(response)
      return chatbot
@@ -34,7 +35,7 @@ def main(robot=None):
      args.gradio = True  # TODO Antoine - force gradio for testing appifying
 
      logger = setup_logger(args.debug)
-     logger.info("Starting Reachy Mini Conversation Demo")
+     logger.info("Starting Reachy Mini Conversation App")
 
      if args.no_camera and args.head_tracker is not None:
          logger.warning("Head tracking is not activated due to --no-camera.")
@@ -45,7 +46,7 @@
      # Check if running in simulation mode without --gradio
      if robot.client.get_status()["simulation_enabled"] and not args.gradio:
          logger.error(
-             "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode."
+             "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode.",
          )
          robot.client.disconnect()
          sys.exit(1)
@@ -80,7 +81,7 @@
 
      handler = OpenaiRealtimeHandler(deps)
 
-     stream_manager = None
+     stream_manager: gr.Blocks | LocalStream | None = None
 
      if args.gradio:
          stream = Stream(
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/moves.py RENAMED
@@ -36,11 +36,12 @@ import time
36
  import logging
37
  import threading
38
  from queue import Empty, Queue
39
- from typing import Any, Tuple, Optional
40
  from collections import deque
41
  from dataclasses import dataclass
42
 
43
  import numpy as np
 
44
 
45
  from reachy_mini import ReachyMini
46
  from reachy_mini.utils import create_head_pose
@@ -57,15 +58,15 @@ logger = logging.getLogger(__name__)
57
  CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop
58
 
59
  # Type definitions
60
- FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
61
 
62
 
63
- class BreathingMove(Move):
64
  """Breathing move with interpolation to neutral and then continuous breathing patterns."""
65
 
66
  def __init__(
67
  self,
68
- interpolation_start_pose: np.ndarray,
69
  interpolation_start_antennas: Tuple[float, float],
70
  interpolation_duration: float = 1.0,
71
  ):
@@ -96,7 +97,7 @@ class BreathingMove(Move):
96
  """Duration property required by official Move interface."""
97
  return float("inf") # Continuous breathing (never ends naturally)
98
 
99
- def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
100
  """Evaluate breathing move at time t."""
101
  if t < self.interpolation_duration:
102
  # Phase 1: Interpolate to neutral base position
@@ -104,13 +105,14 @@ class BreathingMove(Move):
104
 
105
  # Interpolate head pose
106
  head_pose = linear_pose_interpolation(
107
- self.interpolation_start_pose, self.neutral_head_pose, interpolation_t
108
  )
109
 
110
  # Interpolate antennas
111
- antennas = (
112
  1 - interpolation_t
113
  ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
 
114
 
115
  else:
116
  # Phase 2: Breathing patterns from neutral base
@@ -122,7 +124,7 @@ class BreathingMove(Move):
122
 
123
  # Antenna sway (opposite directions)
124
  antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
125
- antennas = np.array([antenna_sway, -antenna_sway])
126
 
127
  # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
128
  return (head_pose, antennas, 0.0)
@@ -168,8 +170,8 @@ class MovementState:
168
  """State tracking for the movement system."""
169
 
170
  # Primary move state
171
- current_move: Optional[Move] = None
172
-    move_start_time: Optional[float] = None
     last_activity_time: float = 0.0

     # Secondary move state (offsets)
@@ -191,7 +193,7 @@ class MovementState:
     )

     # Status flags
-    last_primary_pose: Optional[FullBodyPose] = None

     def update_activity(self) -> None:
         """Update the last activity time."""
@@ -242,7 +244,7 @@ class MovementManager:
     def __init__(
         self,
         current_robot: ReachyMini,
-        camera_worker=None,
     ):
         """Initialize movement manager."""
         self.current_robot = current_robot
@@ -258,7 +260,7 @@ class MovementManager:
         self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)

         # Move queue (primary moves)
-        self.move_queue = deque()

         # Configuration
         self.idle_inactivity_delay = 0.3  # seconds
@@ -266,7 +268,7 @@ class MovementManager:
         self.target_period = 1.0 / self.target_frequency

         self._stop_event = threading.Event()
-        self._thread: Optional[threading.Thread] = None
         self._is_listening = False
         self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
@@ -281,7 +283,7 @@ class MovementManager:
         self._set_target_err_suppressed = 0

         # Cross-thread signalling
-        self._command_queue: Queue[tuple[str, Any]] = Queue()
         self._speech_offsets_lock = threading.Lock()
         self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (
             0.0,
@@ -383,7 +385,7 @@ class MovementManager:

     def _apply_pending_offsets(self) -> None:
         """Apply the most recent speech/face offset updates."""
-        speech_offsets: Optional[Tuple[float, float, float, float, float, float]] = None
         with self._speech_offsets_lock:
             if self._speech_offsets_dirty:
                 speech_offsets = self._pending_speech_offsets
@@ -393,7 +395,7 @@ class MovementManager:
             self.state.speech_offsets = speech_offsets
             self.state.update_activity()

-        face_offsets: Optional[Tuple[float, float, float, float, float, float]] = None
         with self._face_offsets_lock:
             if self._face_offsets_dirty:
                 face_offsets = self._pending_face_offsets
@@ -549,14 +551,13 @@ class MovementManager:
             )

             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
         else:
-            # Otherwise reuse the last primary pose so we avoid jumps between moves
-            if self.state.last_primary_pose is not None:
-                primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
-            else:
-                neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-                primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-            self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

         return primary_full_body_pose

@@ -631,7 +632,7 @@ class MovementManager:

         return antennas_cmd

-    def _issue_control_command(self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float) -> None:
         """Send the fused pose to the robot with throttled error logging."""
         try:
             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
@@ -651,7 +652,7 @@ class MovementManager:
         self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))

     def _update_frequency_stats(
-        self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats
     ) -> LoopFrequencyStats:
         """Update frequency statistics based on the current loop start time."""
         period = loop_start - prev_loop_start
@@ -664,7 +665,7 @@ class MovementManager:
         stats.min_freq = min(stats.min_freq, stats.last_freq)
         return stats

-    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> tuple[float, LoopFrequencyStats]:
         """Compute sleep time to maintain target frequency and update potential freq."""
         computation_time = self._now() - loop_start
         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
@@ -729,7 +730,7 @@ class MovementManager:
         self._thread = None
         logger.debug("Move worker stopped")

-    def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
         with self._status_lock:
             pose_snapshot = clone_full_body_pose(self._last_commanded_pose)
 
 import logging
 import threading
 from queue import Empty, Queue
+from typing import Any, Dict, Tuple
 from collections import deque
 from dataclasses import dataclass

 import numpy as np
+from numpy.typing import NDArray

 from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose

 CONTROL_LOOP_FREQUENCY_HZ = 100.0  # Hz - Target frequency for the movement control loop

 # Type definitions
+FullBodyPose = Tuple[NDArray[np.float32], Tuple[float, float], float]  # (head_pose_4x4, antennas, body_yaw)


+class BreathingMove(Move):  # type: ignore
     """Breathing move with interpolation to neutral and then continuous breathing patterns."""

     def __init__(
         self,
+        interpolation_start_pose: NDArray[np.float32],
         interpolation_start_antennas: Tuple[float, float],
         interpolation_duration: float = 1.0,
     ):

         """Duration property required by official Move interface."""
         return float("inf")  # Continuous breathing (never ends naturally)

+    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
         """Evaluate breathing move at time t."""
         if t < self.interpolation_duration:
             # Phase 1: Interpolate to neutral base position

             # Interpolate head pose
             head_pose = linear_pose_interpolation(
+                self.interpolation_start_pose, self.neutral_head_pose, interpolation_t,
             )

             # Interpolate antennas
+            antennas_interp = (
                 1 - interpolation_t
             ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
+            antennas = antennas_interp.astype(np.float64)

         else:
             # Phase 2: Breathing patterns from neutral base

             # Antenna sway (opposite directions)
             antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
+            antennas = np.array([antenna_sway, -antenna_sway], dtype=np.float64)

         # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
         return (head_pose, antennas, 0.0)

     """State tracking for the movement system."""

     # Primary move state
+    current_move: Move | None = None
+    move_start_time: float | None = None
     last_activity_time: float = 0.0

     # Secondary move state (offsets)

     )

     # Status flags
+    last_primary_pose: FullBodyPose | None = None

     def update_activity(self) -> None:
         """Update the last activity time."""

     def __init__(
         self,
         current_robot: ReachyMini,
+        camera_worker: "Any" = None,
     ):
         """Initialize movement manager."""
         self.current_robot = current_robot

         self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)

         # Move queue (primary moves)
+        self.move_queue: deque[Move] = deque()

         # Configuration
         self.idle_inactivity_delay = 0.3  # seconds

         self.target_period = 1.0 / self.target_frequency

         self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None
         self._is_listening = False
         self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]

         self._set_target_err_suppressed = 0

         # Cross-thread signalling
+        self._command_queue: "Queue[Tuple[str, Any]]" = Queue()
         self._speech_offsets_lock = threading.Lock()
         self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (
             0.0,

     def _apply_pending_offsets(self) -> None:
         """Apply the most recent speech/face offset updates."""
+        speech_offsets: Tuple[float, float, float, float, float, float] | None = None
         with self._speech_offsets_lock:
             if self._speech_offsets_dirty:
                 speech_offsets = self._pending_speech_offsets

             self.state.speech_offsets = speech_offsets
             self.state.update_activity()

+        face_offsets: Tuple[float, float, float, float, float, float] | None = None
         with self._face_offsets_lock:
             if self._face_offsets_dirty:
                 face_offsets = self._pending_face_offsets

             )

             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
+        # Otherwise reuse the last primary pose so we avoid jumps between moves
+        elif self.state.last_primary_pose is not None:
+            primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
         else:
+            neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
+            primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
+        self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

         return primary_full_body_pose

         return antennas_cmd

+    def _issue_control_command(self, head: NDArray[np.float32], antennas: Tuple[float, float], body_yaw: float) -> None:
         """Send the fused pose to the robot with throttled error logging."""
         try:
             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)

         self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))

     def _update_frequency_stats(
+        self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats,
     ) -> LoopFrequencyStats:
         """Update frequency statistics based on the current loop start time."""
         period = loop_start - prev_loop_start

         stats.min_freq = min(stats.min_freq, stats.last_freq)
         return stats

+    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> Tuple[float, LoopFrequencyStats]:
         """Compute sleep time to maintain target frequency and update potential freq."""
         computation_time = self._now() - loop_start
         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")

         self._thread = None
         logger.debug("Move worker stopped")

+    def get_status(self) -> Dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
         with self._status_lock:
             pose_snapshot = clone_full_body_pose(self._last_commanded_pose)
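The `_schedule_next_tick` helper above is what holds the worker near `CONTROL_LOOP_FREQUENCY_HZ`: it measures how long the tick's computation took and sleeps only for the remainder of the period. A minimal, self-contained sketch of that pattern (illustrative names, not the app's API):

```python
import time

TARGET_FREQUENCY_HZ = 100.0
TARGET_PERIOD_S = 1.0 / TARGET_FREQUENCY_HZ

def control_loop(tick, should_stop) -> None:
    """Run tick() at a fixed frequency, compensating for computation time."""
    while not should_stop():
        loop_start = time.monotonic()
        tick()  # compute offsets, fuse poses, send one command
        computation_time = time.monotonic() - loop_start
        # The achievable ("potential") frequency is 1 / computation_time;
        # sleeping only the remainder keeps the effective rate near target.
        time.sleep(max(0.0, TARGET_PERIOD_S - computation_time))
```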
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/openai_realtime.py RENAMED
@@ -1,21 +1,25 @@
 import json
 import base64
 import asyncio
 import logging
 from datetime import datetime

 import numpy as np
 import gradio as gr
 from openai import AsyncOpenAI
 from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item

-from reachy_mini_conversation_demo.tools import (
     ALL_TOOL_SPECS,
     ToolDependencies,
     dispatch_tool_call,
 )
-from reachy_mini_conversation_demo.config import config
-from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS


 logger = logging.getLogger(__name__)
@@ -33,57 +37,131 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         )
         self.deps = deps

-        self.connection = None
-        self.output_queue = asyncio.Queue()

         self.last_activity_time = asyncio.get_event_loop().time()
         self.start_time = asyncio.get_event_loop().time()
         self.is_idle_tool_call = False

-    def copy(self):
         """Create a copy of the handler."""
         return OpenaiRealtimeHandler(self.deps)

-    async def start_up(self):
-        """Start the handler."""
         self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
-        async with self.client.beta.realtime.connect(model=config.MODEL_NAME) as conn:
-            await conn.session.update(
-                session={
-                    "turn_detection": {
-                        "type": "server_vad",
-                    },
-                    "input_audio_transcription": {
-                        "model": "whisper-1",
-                        "language": "en",
                 },
-                "voice": "ballad",
-                "instructions": SESSION_INSTRUCTIONS,
-                "tools": ALL_TOOL_SPECS,
-                "tool_choice": "auto",
-                "temperature": 0.7,
-            }
-        )

             # Manage event received from the openai server
             self.connection = conn
             async for event in self.connection:
                 logger.debug(f"OpenAI event: {event.type}")
                 if event.type == "input_audio_buffer.speech_started":
-                    if hasattr(self, '_clear_queue'):
                         self._clear_queue()
-                    self.deps.head_wobbler.reset()
                     self.deps.movement_manager.set_listening(True)
                     logger.debug("User speech started")

                 if event.type == "input_audio_buffer.speech_stopped":
                     self.deps.movement_manager.set_listening(False)
-                    logger.debug("User speech stopped")
-
-                if event.type in ("response.audio.completed", "response.completed"):
-                    # Doesn't seem to be called
                     logger.debug("response completed")
-                    self.deps.head_wobbler.reset()

                 if event.type == "response.created":
                     logger.debug("Response created")
@@ -91,18 +169,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 if event.type == "response.done":
                     # Doesn't mean the audio is done playing
                     logger.debug("Response done")
-                    pass

                 if event.type == "conversation.item.input_audio_transcription.completed":
                     logger.debug(f"User transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))

-                if event.type == "response.audio_transcript.done":
                     logger.debug(f"Assistant transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))

-                if event.type == "response.audio.delta":
-                    self.deps.head_wobbler.feed(event.delta)
                     self.last_activity_time = asyncio.get_event_loop().time()
                     logger.debug("last activity time updated to %s", self.last_activity_time)
                     await self.output_queue.put(
@@ -118,6 +206,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     args_json_str = getattr(event, "arguments", None)
                     call_id = getattr(event, "call_id", None)

                     try:
                         tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                         logger.debug("Tool '%s' executed successfully", tool_name)
@@ -127,22 +219,23 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                         tool_result = {"error": str(e)}

                     # send the tool result back
-                    await self.connection.conversation.item.create(
-                        item={
-                            "type": "function_call_output",
-                            "call_id": call_id,
-                            "output": json.dumps(tool_result),
-                        }
-                    )

                     await self.output_queue.put(
                         AdditionalOutputs(
                             {
                                 "role": "assistant",
                                 "content": json.dumps(tool_result),
-                                "metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
                             },
-                        )
                     )

                     if tool_name == "camera" and "b64_im" in tool_result:
@@ -159,55 +252,68 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                             {
                                 "type": "input_image",
                                 "image_url": f"data:image/jpeg;base64,{b64_im}",
-                            }
                         ],
-                    }
                 )
                 logger.info("Added camera image to conversation")

-                        np_img = self.deps.camera_worker.get_latest_frame()
-                        img = gr.Image(value=np_img)

-                        await self.output_queue.put(
-                            AdditionalOutputs(
-                                {
-                                    "role": "assistant",
-                                    "content": img,
-                                }
                             )
-                        )

-                    if not self.is_idle_tool_call:
                         await self.connection.response.create(
                             response={
-                                "instructions": "Use the tool result just returned and answer concisely in speech."
-                            }
                         )
-                    else:
-                        self.is_idle_tool_call = False

                     # re-synchronize the head wobble after a tool call that may have taken some time
-                    self.deps.head_wobbler.reset()

                 # server error
                 if event.type == "error":
                     err = getattr(event, "error", None)
                     msg = getattr(err, "message", str(err) if err else "unknown error")
-                    logger.error("Realtime error: %s (raw=%s)", msg, err)
-                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))

     # Microphone receive
-    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         """Receive audio frame from the microphone and send it to the openai server."""
         if not self.connection:
             return
         _, array = frame
         array = array.squeeze()
         audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
-        # Fills the input audio buffer to be sent to the server
-        await self.connection.input_audio_buffer.append(audio=audio_message)  # type: ignore

-    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
         """Emit audio frame to be played by the speaker."""
         # sends to the stream the stuff put in the output queue by the openai event handler
         # This is called periodically by the fastrtc Stream
@@ -215,28 +321,43 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         # Handle idle
         idle_duration = asyncio.get_event_loop().time() - self.last_activity_time
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
-            await self.send_idle_signal(idle_duration)

             self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets

-        return await wait_for_item(self.output_queue)

     async def shutdown(self) -> None:
         """Shutdown the handler."""
         if self.connection:
-            await self.connection.close()
-            self.connection = None
-
-    def format_timestamp(self):
-        """Format current timestamp with date, time and elapsed seconds."""
-        current_time = asyncio.get_event_loop().time()
-        elapsed_seconds = current_time - self.start_time
-        dt = datetime.fromtimestamp(current_time)
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"

-
-
-    async def send_idle_signal(self, idle_duration) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")
         self.is_idle_tool_call = True
@@ -249,12 +370,11 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     "type": "message",
                     "role": "user",
                     "content": [{"type": "input_text", "text": timestamp_msg}],
-                }
             )
             await self.connection.response.create(
                 response={
-                    "modalities": ["text"],
                     "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
                     "tool_choice": "required",
-                }
             )
 
 import json
 import base64
+import random
 import asyncio
 import logging
+from typing import Any, Tuple, Literal, cast
 from datetime import datetime

 import numpy as np
 import gradio as gr
 from openai import AsyncOpenAI
 from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+from numpy.typing import NDArray
+from websockets.exceptions import ConnectionClosedError

+from reachy_mini_conversation_app.tools import (
     ALL_TOOL_SPECS,
     ToolDependencies,
     dispatch_tool_call,
 )
+from reachy_mini_conversation_app.config import config
+from reachy_mini_conversation_app.prompts import SESSION_INSTRUCTIONS


 logger = logging.getLogger(__name__)

         )
         self.deps = deps

+        # Override type annotations for OpenAI strict typing (only for values used in API)
+        self.output_sample_rate: Literal[24000]
+        self.target_input_rate: Literal[24000] = 24000
+        # input_sample_rate is left as int for the comparison logic
+        self.resample_ratio = self.target_input_rate / self.input_sample_rate
+
+        self.connection: Any = None
+        self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()

         self.last_activity_time = asyncio.get_event_loop().time()
         self.start_time = asyncio.get_event_loop().time()
         self.is_idle_tool_call = False

+    def copy(self) -> "OpenaiRealtimeHandler":
         """Create a copy of the handler."""
         return OpenaiRealtimeHandler(self.deps)

+    def resample_audio(self, audio: NDArray[np.int16]) -> NDArray[np.int16]:
+        """Resample audio using linear interpolation."""
+        if self.input_sample_rate == self.target_input_rate:
+            return audio
+
+        # Use numpy's interp for simple linear resampling
+        input_length = len(audio)
+        output_length = int(input_length * self.resample_ratio)
+
+        input_time = np.arange(input_length)
+        output_time = np.linspace(0, input_length - 1, output_length)
+
+        resampled = np.interp(output_time, input_time, audio.astype(np.float32))
+        return cast(NDArray[np.int16], resampled.astype(np.int16))
+
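The `resample_audio` method above stretches the input sample grid onto an output grid with `np.interp`. A self-contained sketch of the same idea, resampling int16 PCM from 16 kHz to 24 kHz (the rates here are example values, not the handler's configuration):

```python
import numpy as np

def resample_linear(audio: np.ndarray, rate_in: int, rate_out: int) -> np.ndarray:
    """Linearly interpolate int16 PCM from rate_in to rate_out."""
    if rate_in == rate_out:
        return audio
    n_in = len(audio)
    n_out = int(n_in * rate_out / rate_in)
    # Positions of the output samples expressed on the input sample grid.
    x_out = np.linspace(0, n_in - 1, n_out)
    x_in = np.arange(n_in)
    return np.interp(x_out, x_in, audio.astype(np.float32)).astype(np.int16)

# 0.1 s of a 440 Hz tone at 16 kHz becomes 2400 samples at 24 kHz.
chunk = (np.sin(2 * np.pi * 440 * np.arange(1600) / 16000) * 32767).astype(np.int16)
assert len(resample_linear(chunk, 16000, 24000)) == 2400
```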
+    async def start_up(self) -> None:
+        """Start the handler with minimal retries on unexpected websocket closure."""
         self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
+
+        max_attempts = 3
+        for attempt in range(1, max_attempts + 1):
+            try:
+                await self._run_realtime_session()
+                # Normal exit from the session, stop retrying
+                return
+            except ConnectionClosedError as e:
+                # Abrupt close (e.g., "no close frame received or sent") → retry
+                logger.warning(
+                    "Realtime websocket closed unexpectedly (attempt %d/%d): %s",
+                    attempt, max_attempts, e
+                )
+                if attempt < max_attempts:
+                    # exponential backoff with jitter
+                    base_delay = 2 ** (attempt - 1)  # 1s, 2s, ... doubling per attempt
+                    jitter = random.uniform(0, 0.5)
+                    delay = base_delay + jitter
+                    logger.info("Retrying in %.1f seconds...", delay)
+                    await asyncio.sleep(delay)
+                    continue
+                raise
+            finally:
+                # never keep a stale reference
+                self.connection = None
+
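With `max_attempts = 3` the handler retries at most twice: `2 ** (attempt - 1)` gives base delays of 1 s and then 2 s, each padded with up to 0.5 s of uniform jitter so multiple clients do not reconnect in lockstep; the third failure re-raises, and the `finally` block drops the stale `self.connection` reference on every path.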
+    async def _run_realtime_session(self) -> None:
+        """Establish and manage a single realtime session."""
+        async with self.client.realtime.connect(model=config.MODEL_NAME) as conn:
+            try:
+                await conn.session.update(
+                    session={
+                        "type": "realtime",
+                        "instructions": SESSION_INSTRUCTIONS,
+                        "audio": {
+                            "input": {
+                                "format": {
+                                    "type": "audio/pcm",
+                                    "rate": self.target_input_rate,
+                                },
+                                "transcription": {
+                                    "model": "whisper-1",
+                                    "language": "en"
+                                },
+                                "turn_detection": {
+                                    "type": "server_vad",
+                                    "interrupt_response": True,
+                                },
+                            },
+                            "output": {
+                                "format": {
+                                    "type": "audio/pcm",
+                                    "rate": self.output_sample_rate,
+                                },
+                                "voice": "cedar",
+                            },
+                        },
+                        "tools": ALL_TOOL_SPECS,  # type: ignore[typeddict-item]
+                        "tool_choice": "auto",
                     },
+                )
+            except Exception:
+                logger.exception("Realtime session.update failed; aborting startup")
+                return
+
+            logger.info("Realtime session updated successfully")

             # Manage event received from the openai server
             self.connection = conn
             async for event in self.connection:
                 logger.debug(f"OpenAI event: {event.type}")
                 if event.type == "input_audio_buffer.speech_started":
+                    if hasattr(self, "_clear_queue") and callable(self._clear_queue):
                         self._clear_queue()
+                    if self.deps.head_wobbler is not None:
+                        self.deps.head_wobbler.reset()
                     self.deps.movement_manager.set_listening(True)
                     logger.debug("User speech started")

                 if event.type == "input_audio_buffer.speech_stopped":
                     self.deps.movement_manager.set_listening(False)
+                    logger.debug("User speech stopped - server will auto-commit with VAD")
+
+                if event.type in (
+                    "response.audio.done",  # GA
+                    "response.output_audio.done",  # GA alias
+                    "response.audio.completed",  # legacy (for safety)
+                    "response.completed",  # text-only completion
+                ):
                     logger.debug("response completed")

                 if event.type == "response.created":
                     logger.debug("Response created")

                 if event.type == "response.done":
                     # Doesn't mean the audio is done playing
                     logger.debug("Response done")

+                # Handle partial transcription (user speaking in real-time)
+                if event.type == "conversation.item.input_audio_transcription.partial":
+                    logger.debug(f"User partial transcript: {event.transcript}")
+                    await self.output_queue.put(
+                        AdditionalOutputs({"role": "user_partial", "content": event.transcript})
+                    )
+
+                # Handle completed transcription (user finished speaking)
                 if event.type == "conversation.item.input_audio_transcription.completed":
                     logger.debug(f"User transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))

+                # Handle assistant transcription
+                if event.type in ("response.audio_transcript.done", "response.output_audio_transcript.done"):
                     logger.debug(f"Assistant transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))

+                # Handle audio delta
+                if event.type in ("response.audio.delta", "response.output_audio.delta"):
+                    if self.deps.head_wobbler is not None:
+                        self.deps.head_wobbler.feed(event.delta)
                     self.last_activity_time = asyncio.get_event_loop().time()
                     logger.debug("last activity time updated to %s", self.last_activity_time)
                     await self.output_queue.put(

                     args_json_str = getattr(event, "arguments", None)
                     call_id = getattr(event, "call_id", None)

+                    if not isinstance(tool_name, str) or not isinstance(args_json_str, str):
+                        logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str)
+                        continue
+
                     try:
                         tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                         logger.debug("Tool '%s' executed successfully", tool_name)

                         tool_result = {"error": str(e)}

                     # send the tool result back
+                    if isinstance(call_id, str):
+                        await self.connection.conversation.item.create(
+                            item={
+                                "type": "function_call_output",
+                                "call_id": call_id,
+                                "output": json.dumps(tool_result),
+                            },
+                        )

                     await self.output_queue.put(
                         AdditionalOutputs(
                             {
                                 "role": "assistant",
                                 "content": json.dumps(tool_result),
+                                "metadata": {"title": f"🛠️ Used tool {tool_name}", "status": "done"},
                             },
+                        ),
                     )

                     if tool_name == "camera" and "b64_im" in tool_result:

                             {
                                 "type": "input_image",
                                 "image_url": f"data:image/jpeg;base64,{b64_im}",
+                            },
                         ],
+                    },
                 )
                 logger.info("Added camera image to conversation")

+                        if self.deps.camera_worker is not None:
+                            np_img = self.deps.camera_worker.get_latest_frame()
+                            img = gr.Image(value=np_img)

+                            await self.output_queue.put(
+                                AdditionalOutputs(
+                                    {
+                                        "role": "assistant",
+                                        "content": img,
+                                    },
+                                ),
                             )

+                    # if this tool call was triggered by an idle signal, don't make the robot speak
+                    # for other tool calls, let the robot reply out loud
+                    if self.is_idle_tool_call:
+                        self.is_idle_tool_call = False
+                    else:
                         await self.connection.response.create(
                             response={
+                                "instructions": "Use the tool result just returned and answer concisely in speech.",
+                            },
                         )

                     # re-synchronize the head wobble after a tool call that may have taken some time
+                    if self.deps.head_wobbler is not None:
+                        self.deps.head_wobbler.reset()

                 # server error
                 if event.type == "error":
                     err = getattr(event, "error", None)
                     msg = getattr(err, "message", str(err) if err else "unknown error")
+                    code = getattr(err, "code", "")
+
+                    logger.error("Realtime error [%s]: %s (raw=%s)", code, msg, err)
+
+                    # Only show user-facing errors, not internal state errors
+                    if code not in ("input_audio_buffer_commit_empty", "conversation_already_has_active_response"):
+                        await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))

     # Microphone receive
+    async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
         """Receive audio frame from the microphone and send it to the openai server."""
         if not self.connection:
             return
         _, array = frame
         array = array.squeeze()
+
+        # Resample if needed
+        if self.input_sample_rate != self.target_input_rate:
+            array = self.resample_audio(array)
+
         audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+        await self.connection.input_audio_buffer.append(audio=audio_message)

+    async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
         """Emit audio frame to be played by the speaker."""
         # sends to the stream the stuff put in the output queue by the openai event handler
         # This is called periodically by the fastrtc Stream

         # Handle idle
         idle_duration = asyncio.get_event_loop().time() - self.last_activity_time
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
+            try:
+                await self.send_idle_signal(idle_duration)
+            except Exception as e:
+                logger.warning("Idle signal skipped (connection closed?): %s", e)
+                return None

             self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets

+        return await wait_for_item(self.output_queue)  # type: ignore[no-any-return]

     async def shutdown(self) -> None:
         """Shutdown the handler."""
         if self.connection:
+            try:
+                await self.connection.close()
+            except ConnectionClosedError as e:
+                logger.debug(f"Connection already closed during shutdown: {e}")
+            except Exception as e:
+                logger.debug(f"connection.close() ignored: {e}")
+            finally:
+                self.connection = None
+
+        # Clear any remaining items in the output queue
+        while not self.output_queue.empty():
+            try:
+                self.output_queue.get_nowait()
+            except asyncio.QueueEmpty:
+                break
+
+    def format_timestamp(self) -> str:
+        """Format current timestamp with date, time, and elapsed seconds."""
+        loop_time = asyncio.get_event_loop().time()  # monotonic
+        elapsed_seconds = loop_time - self.start_time
+        dt = datetime.now()  # wall-clock
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"

+    async def send_idle_signal(self, idle_duration: float) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")
         self.is_idle_tool_call = True

                     "type": "message",
                     "role": "user",
                     "content": [{"type": "input_text", "text": timestamp_msg}],
+                },
             )
             await self.connection.response.create(
                 response={
                     "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
                     "tool_choice": "required",
+                },
             )
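Read together, the idle path closes a loop across this file: `emit` detects more than 15 s without activity, `send_idle_signal` injects a timestamped user message and requests a response with `tool_choice: "required"`, and the `is_idle_tool_call` flag it sets is consumed in the function-call branch above, which skips the spoken follow-up so idle behavior stays silent.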
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/prompts.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/tools.py RENAMED
@@ -4,7 +4,7 @@ import json
 import asyncio
 import inspect
 import logging
-from typing import Any, Dict, Literal, Optional
 from dataclasses import dataclass

 from reachy_mini import ReachyMini
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 try:
     from reachy_mini.motion.recorded_move import RecordedMoves
     from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
-    from reachy_mini_conversation_demo.dance_emotion_moves import (
         GotoQueueMove,
         DanceQueueMove,
         EmotionQueueMove,
@@ -36,9 +36,9 @@ except ImportError as e:
     EMOTION_AVAILABLE = False


-def get_concrete_subclasses(base):
     """Recursively find all concrete (non-abstract) subclasses of a base class."""
-    result = []
     for cls in base.__subclasses__():
         if not inspect.isabstract(cls):
             result.append(cls)
@@ -58,9 +58,9 @@ class ToolDependencies:
     reachy_mini: ReachyMini
     movement_manager: Any  # MovementManager from moves.py
     # Optional deps
-    camera_worker: Optional[Any] = None  # CameraWorker for frame buffering
-    vision_manager: Optional[Any] = None
-    head_wobbler: Optional[Any] = None  # HeadWobbler for audio-reactive motion
     motion_duration_s: float = 1.0


@@ -88,7 +88,7 @@ class Tool(abc.ABC):
     }

     @abc.abstractmethod
-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Async tool execution entrypoint."""
         raise NotImplementedError

@@ -113,7 +113,7 @@ class MoveHead(Tool):
     }

     # mapping: direction -> args for create_head_pose
-    DELTAS: dict[str, tuple[int, int, int, int, int, int]] = {
         "left": (0, 0, 0, 0, 0, 40),
         "right": (0, 0, 0, 0, 0, -40),
         "up": (0, 0, 0, 0, -30, 0),
@@ -121,9 +121,12 @@ class MoveHead(Tool):
         "front": (0, 0, 0, 0, 0, 0),
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Move head in a given direction."""
-        direction: Direction = kwargs.get("direction")
         logger.info("Tool call: move_head direction=%s", direction)

         deltas = self.DELTAS.get(direction, self.DELTAS["front"])
@@ -177,7 +180,7 @@ class Camera(Tool):
         "required": ["question"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Take a picture with the camera and ask a question about it."""
         image_query = (kwargs.get("question") or "").strip()
         if not image_query:
@@ -199,7 +202,7 @@ class Camera(Tool):
         # Use vision manager for processing if available
         if deps.vision_manager is not None:
             vision_result = await asyncio.to_thread(
-                deps.vision_manager.processor.process_image, frame, image_query
             )
             if isinstance(vision_result, dict) and "error" in vision_result:
                 return vision_result
@@ -208,17 +211,16 @@ class Camera(Tool):
             if isinstance(vision_result, str)
             else {"error": "vision returned non-string"}
         )
-        else:
-            # Return base64 encoded image like main_works.py camera tool
-            import base64

-            import cv2

-            temp_path = "/tmp/camera_frame.jpg"
-            cv2.imwrite(temp_path, frame)
-            with open(temp_path, "rb") as f:
-                b64_encoded = base64.b64encode(f.read()).decode("utf-8")
-            return {"b64_im": b64_encoded}


 class HeadTracking(Tool):
@@ -232,7 +234,7 @@ class HeadTracking(Tool):
         "required": ["start"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Enable or disable head tracking."""
         enable = bool(kwargs.get("start"))

@@ -288,12 +290,12 @@ class Dance(Tool):
         "required": [],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Play a named or random dance move once (or repeat). Non-blocking."""
         if not DANCE_AVAILABLE:
             return {"error": "Dance system not available"}

-        move_name = kwargs.get("move", None)
         repeat = int(kwargs.get("repeat", 1))

         logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)
@@ -326,12 +328,12 @@ class StopDance(Tool):
             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
-            }
         },
         "required": ["dummy"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager
@@ -373,7 +375,7 @@ class PlayEmotion(Tool):
         "required": ["emotion"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Play a pre-recorded emotion."""
         if not EMOTION_AVAILABLE:
             return {"error": "Emotion system not available"}
@@ -399,7 +401,7 @@ class PlayEmotion(Tool):

         except Exception as e:
             logger.exception("Failed to play emotion")
-            return {"error": f"Failed to play emotion: {str(e)}"}


 class StopEmotion(Tool):
@@ -413,12 +415,12 @@ class StopEmotion(Tool):
             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
-            }
         },
         "required": ["dummy"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager
@@ -442,7 +444,7 @@ class DoNothing(Tool):
         "required": [],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Do nothing - stay still and silent."""
         reason = kwargs.get("reason", "just chilling")
         logger.info("Tool call: do_nothing reason=%s", reason)
@@ -452,12 +454,12 @@ class DoNothing(Tool):
 # Registry & specs (dynamic)

 # List of available tool classes
-ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
 ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]


 # Dispatcher
-def _safe_load_obj(args_json: str) -> dict[str, Any]:
     try:
         parsed_args = json.loads(args_json or "{}")
         return parsed_args if isinstance(parsed_args, dict) else {}

 import asyncio
 import inspect
 import logging
+from typing import Any, Dict, List, Tuple, Literal
 from dataclasses import dataclass

 from reachy_mini import ReachyMini

 try:
     from reachy_mini.motion.recorded_move import RecordedMoves
     from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
+    from reachy_mini_conversation_app.dance_emotion_moves import (
         GotoQueueMove,
         DanceQueueMove,
         EmotionQueueMove,

     EMOTION_AVAILABLE = False


+def get_concrete_subclasses(base: type[Tool]) -> List[type[Tool]]:
     """Recursively find all concrete (non-abstract) subclasses of a base class."""
+    result: List[type[Tool]] = []
     for cls in base.__subclasses__():
         if not inspect.isabstract(cls):
             result.append(cls)

     reachy_mini: ReachyMini
     movement_manager: Any  # MovementManager from moves.py
     # Optional deps
+    camera_worker: Any | None = None  # CameraWorker for frame buffering
+    vision_manager: Any | None = None
+    head_wobbler: Any | None = None  # HeadWobbler for audio-reactive motion
     motion_duration_s: float = 1.0

     }

     @abc.abstractmethod
+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Async tool execution entrypoint."""
         raise NotImplementedError

     }

     # mapping: direction -> args for create_head_pose
+    DELTAS: Dict[str, Tuple[int, int, int, int, int, int]] = {
         "left": (0, 0, 0, 0, 0, 40),
         "right": (0, 0, 0, 0, 0, -40),
         "up": (0, 0, 0, 0, -30, 0),

         "front": (0, 0, 0, 0, 0, 0),
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Move head in a given direction."""
+        direction_raw = kwargs.get("direction")
+        if not isinstance(direction_raw, str):
+            return {"error": "direction must be a string"}
+        direction: Direction = direction_raw  # type: ignore[assignment]
         logger.info("Tool call: move_head direction=%s", direction)

         deltas = self.DELTAS.get(direction, self.DELTAS["front"])

         "required": ["question"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Take a picture with the camera and ask a question about it."""
         image_query = (kwargs.get("question") or "").strip()
         if not image_query:

         # Use vision manager for processing if available
         if deps.vision_manager is not None:
             vision_result = await asyncio.to_thread(
+                deps.vision_manager.processor.process_image, frame, image_query,
             )
             if isinstance(vision_result, dict) and "error" in vision_result:
                 return vision_result

             if isinstance(vision_result, str)
             else {"error": "vision returned non-string"}
         )
+        # Return base64 encoded image like main_works.py camera tool
+        import base64

+        import cv2

+        temp_path = "/tmp/camera_frame.jpg"
+        cv2.imwrite(temp_path, frame)
+        with open(temp_path, "rb") as f:
+            b64_encoded = base64.b64encode(f.read()).decode("utf-8")
+        return {"b64_im": b64_encoded}


 class HeadTracking(Tool):

         "required": ["start"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Enable or disable head tracking."""
         enable = bool(kwargs.get("start"))

         "required": [],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Play a named or random dance move once (or repeat). Non-blocking."""
         if not DANCE_AVAILABLE:
             return {"error": "Dance system not available"}

+        move_name = kwargs.get("move")
         repeat = int(kwargs.get("repeat", 1))

         logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)

             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
+            },
         },
         "required": ["dummy"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager

         "required": ["emotion"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Play a pre-recorded emotion."""
         if not EMOTION_AVAILABLE:
             return {"error": "Emotion system not available"}

         except Exception as e:
             logger.exception("Failed to play emotion")
+            return {"error": f"Failed to play emotion: {e!s}"}


 class StopEmotion(Tool):

             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
+            },
         },
         "required": ["dummy"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager

         "required": [],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Do nothing - stay still and silent."""
         reason = kwargs.get("reason", "just chilling")
         logger.info("Tool call: do_nothing reason=%s", reason)

 # Registry & specs (dynamic)

 # List of available tool classes
+ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}  # type: ignore[type-abstract]
 ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]


 # Dispatcher
+def _safe_load_obj(args_json: str) -> Dict[str, Any]:
     try:
         parsed_args = json.loads(args_json or "{}")
         return parsed_args if isinstance(parsed_args, dict) else {}
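Because `ALL_TOOLS` is built by recursively walking `Tool.__subclasses__()`, adding a capability is just defining another concrete subclass; no registry edit is needed. A hypothetical sketch (the `wave` tool does not exist in this repo, and everything except the `name` attribute and the `__call__` signature is assumed from context):

```python
class Wave(Tool):
    """Hypothetical tool: wiggle the antennas as a greeting."""

    name = "wave"  # key under which the instance lands in ALL_TOOLS

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        times = int(kwargs.get("times", 2))
        logger.info("Tool call: wave times=%d", times)
        # A real implementation would queue a move on deps.movement_manager here.
        return {"status": f"waved {times} times"}
```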
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/utils.py RENAMED
@@ -1,13 +1,15 @@
 import logging
 import argparse
 import warnings

-from reachy_mini_conversation_demo.camera_worker import CameraWorker


-def parse_args():
     """Parse command line arguments."""
-    parser = argparse.ArgumentParser("Reachy Mini Conversation Demo")
     parser.add_argument(
         "--head-tracker",
         choices=["yolo", "mediapipe", None],
@@ -27,7 +29,7 @@ def parse_args():
     return parser.parse_args()


-def handle_vision_stuff(args, current_robot):
     """Initialize camera, head tracker, camera worker, and vision manager.

     By default, vision is handled by gpt-realtime model when camera tool is used.
@@ -41,11 +43,11 @@ def handle_vision_stuff(args, current_robot):
     # Initialize head tracker if specified
     if args.head_tracker is not None:
         if args.head_tracker == "yolo":
-            from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker

             head_tracker = HeadTracker()
         elif args.head_tracker == "mediapipe":
-            from reachy_mini_toolbox.vision import HeadTracker

             head_tracker = HeadTracker()

@@ -55,22 +57,22 @@ def handle_vision_stuff(args, current_robot):
     # Initialize vision manager only if local vision is requested
     if args.local_vision:
         try:
-            from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager

             vision_manager = initialize_vision_manager(camera_worker)
         except ImportError as e:
             raise ImportError(
-                "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'"
             ) from e
     else:
         logging.getLogger(__name__).info(
-            "Using gpt-realtime for vision (default). Use --local-vision for local processing."
         )

     return camera_worker, head_tracker, vision_manager


-def setup_logger(debug):
     """Set up the logger."""
     log_level = "DEBUG" if debug else "INFO"
     logging.basicConfig(

 import logging
 import argparse
 import warnings
+from typing import Any, Tuple

+from reachy_mini import ReachyMini
+from reachy_mini_conversation_app.camera_worker import CameraWorker


+def parse_args() -> argparse.Namespace:
     """Parse command line arguments."""
+    parser = argparse.ArgumentParser("Reachy Mini Conversation App")
     parser.add_argument(
         "--head-tracker",
         choices=["yolo", "mediapipe", None],

     return parser.parse_args()


+def handle_vision_stuff(args: argparse.Namespace, current_robot: ReachyMini) -> Tuple[CameraWorker | None, Any, Any]:
     """Initialize camera, head tracker, camera worker, and vision manager.

     By default, vision is handled by gpt-realtime model when camera tool is used.

     # Initialize head tracker if specified
     if args.head_tracker is not None:
         if args.head_tracker == "yolo":
+            from reachy_mini_conversation_app.vision.yolo_head_tracker import HeadTracker

             head_tracker = HeadTracker()
         elif args.head_tracker == "mediapipe":
+            from reachy_mini_toolbox.vision import HeadTracker  # type: ignore[no-redef]

             head_tracker = HeadTracker()

     # Initialize vision manager only if local vision is requested
     if args.local_vision:
         try:
+            from reachy_mini_conversation_app.vision.processors import initialize_vision_manager

             vision_manager = initialize_vision_manager(camera_worker)
         except ImportError as e:
             raise ImportError(
+                "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'",
             ) from e
     else:
         logging.getLogger(__name__).info(
+            "Using gpt-realtime for vision (default). Use --local-vision for local processing.",
         )

     return camera_worker, head_tracker, vision_manager


+def setup_logger(debug: bool) -> logging.Logger:
     """Set up the logger."""
     log_level = "DEBUG" if debug else "INFO"
     logging.basicConfig(
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/__init__.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/processors.py RENAMED
@@ -3,16 +3,17 @@ import time
 import base64
 import logging
 import threading
-from typing import Any, Dict, Optional
 from dataclasses import dataclass

 import cv2
 import numpy as np
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from huggingface_hub import snapshot_download

-from reachy_mini_conversation_demo.config import config


 logger = logging.getLogger(__name__)
@@ -34,7 +35,7 @@ class VisionConfig:
 class VisionProcessor:
     """Handles SmolVLM2 model loading and inference."""

-    def __init__(self, vision_config: VisionConfig = None):
         """Initialize the vision processor."""
         self.vision_config = vision_config or VisionConfig()
         self.model_path = self.vision_config.model_path
@@ -60,7 +61,7 @@ class VisionProcessor:
         """Load model and processor onto the selected device."""
         try:
             logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
-            self.processor = AutoProcessor.from_pretrained(self.model_path)

             # Select dtype depending on device
             if self.device == "cuda":
@@ -70,16 +71,17 @@ class VisionProcessor:
             else:
                 dtype = torch.float32

-            model_kwargs = {"dtype": dtype}

             # flash_attention_2 is CUDA-only; skip on MPS/CPU
             if self.device == "cuda":
                 model_kwargs["_attn_implementation"] = "flash_attention_2"

             # Load model weights
-            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)

-            self.model.eval()
             self._initialized = True
             return True

@@ -89,11 +91,11 @@ class VisionProcessor:

     def process_image(
         self,
-        cv2_image: np.ndarray,
         prompt: str = "Briefly describe what you see in one sentence.",
     ) -> str:
         """Process CV2 image and return description with retry logic."""
-        if not self._initialized:
             return "Vision model not initialized"

         for attempt in range(self.vision_config.max_retries):
@@ -205,16 +207,16 @@ class VisionProcessor:
 class VisionManager:
     """Manages periodic vision processing and scene understanding."""

-    def __init__(self, camera, vision_config: VisionConfig = None):
         """Initialize vision manager with camera and configuration."""
         self.camera = camera
         self.vision_config = vision_config or VisionConfig()
         self.vision_interval = self.vision_config.vision_interval
         self.processor = VisionProcessor(self.vision_config)

-        self._last_processed_time = 0
         self._stop_event = threading.Event()
-        self._thread: Optional[threading.Thread] = None

         # Initialize processor
         if not self.processor.initialize():
@@ -245,7 +247,7 @@ class VisionManager:
             frame = self.camera.get_latest_frame()
             if frame is not None:
                 description = self.processor.process_image(
-                    frame, "Briefly describe what you see in one sentence."
                 )

                 # Only update if we got a valid response
@@ -274,7 +276,7 @@ class VisionManager:
     }


-def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
     """Initialize vision manager with model download and configuration.

     Args:
@@ -318,7 +320,7 @@ def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
     # Log device info
     device_info = vision_manager.processor.get_model_info()
     logger.info(
-        f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
     )

     return vision_manager

 import base64
 import logging
 import threading
+from typing import Any, Dict
 from dataclasses import dataclass

 import cv2
 import numpy as np
 import torch
+from numpy.typing import NDArray
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from huggingface_hub import snapshot_download

+from reachy_mini_conversation_app.config import config


 logger = logging.getLogger(__name__)

 class VisionProcessor:
     """Handles SmolVLM2 model loading and inference."""

+    def __init__(self, vision_config: VisionConfig | None = None):
         """Initialize the vision processor."""
         self.vision_config = vision_config or VisionConfig()
         self.model_path = self.vision_config.model_path

         """Load model and processor onto the selected device."""
         try:
             logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
+            self.processor = AutoProcessor.from_pretrained(self.model_path)  # type: ignore[no-untyped-call]

             # Select dtype depending on device
             if self.device == "cuda":

             else:
                 dtype = torch.float32

+            model_kwargs: Dict[str, Any] = {"dtype": dtype}

             # flash_attention_2 is CUDA-only; skip on MPS/CPU
             if self.device == "cuda":
                 model_kwargs["_attn_implementation"] = "flash_attention_2"

             # Load model weights
+            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)  # type: ignore[arg-type]

+            if self.model is not None:
+                self.model.eval()
             self._initialized = True
             return True

     def process_image(
         self,
+        cv2_image: NDArray[np.uint8],
         prompt: str = "Briefly describe what you see in one sentence.",
     ) -> str:
         """Process CV2 image and return description with retry logic."""
+        if not self._initialized or self.processor is None or self.model is None:
             return "Vision model not initialized"

         for attempt in range(self.vision_config.max_retries):

 class VisionManager:
     """Manages periodic vision processing and scene understanding."""

+    def __init__(self, camera: Any, vision_config: VisionConfig | None = None):
         """Initialize vision manager with camera and configuration."""
         self.camera = camera
         self.vision_config = vision_config or VisionConfig()
         self.vision_interval = self.vision_config.vision_interval
         self.processor = VisionProcessor(self.vision_config)

+        self._last_processed_time = 0.0
         self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None

         # Initialize processor
         if not self.processor.initialize():

             frame = self.camera.get_latest_frame()
             if frame is not None:
                 description = self.processor.process_image(
+                    frame, "Briefly describe what you see in one sentence.",
                 )

                 # Only update if we got a valid response

     }


+def initialize_vision_manager(camera_worker: Any) -> VisionManager | None:
     """Initialize vision manager with model download and configuration.

     Args:

     # Log device info
     device_info = vision_manager.processor.get_model_info()
     logger.info(
+        f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}",
     )

     return vision_manager
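A minimal usage sketch of the processor above (the image path is illustrative; on the robot the frame comes from the camera worker):

```python
import cv2

from reachy_mini_conversation_app.vision.processors import VisionConfig, VisionProcessor

processor = VisionProcessor(VisionConfig())
if processor.initialize():  # loads SmolVLM2 onto cuda, mps, or cpu
    frame = cv2.imread("example.jpg")  # stand-in for camera_worker.get_latest_frame()
    print(processor.process_image(frame, "What is on the table?"))
```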
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/yolo_head_tracker.py RENAMED
@@ -1,16 +1,17 @@
 from __future__ import annotations
 import logging
-from typing import Tuple, Optional

 import numpy as np


 try:
     from supervision import Detections
-    from ultralytics import YOLO
 except ImportError as e:
     raise ImportError(
-        "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'"
     ) from e
 from huggingface_hub import hf_hub_download

@@ -48,7 +49,7 @@ class HeadTracker:
         logger.error(f"Failed to load YOLO model: {e}")
         raise

-    def _select_best_face(self, detections: Detections) -> Optional[int]:
         """Select the best face based on confidence and area (largest face with highest confidence).

         Args:
@@ -61,6 +62,10 @@ class HeadTracker:
         if detections.xyxy.shape[0] == 0:
             return None

         # Filter by confidence threshold
         valid_mask = detections.confidence >= self.confidence_threshold
         if not np.any(valid_mask):
@@ -78,9 +83,9 @@ class HeadTracker:

         # Return index of best face
         best_idx = valid_indices[np.argmax(scores)]
-        return best_idx

-    def _bbox_to_mp_coords(self, bbox: np.ndarray, w: int, h: int) -> np.ndarray:
         """Convert bounding box center to MediaPipe-style coordinates [-1, 1].

         Args:
@@ -101,7 +106,7 @@ class HeadTracker:

         return np.array([norm_x, norm_y], dtype=np.float32)

-    def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.

         Args:
@@ -125,9 +130,10 @@ class HeadTracker:
         return None, None

         bbox = detections.xyxy[face_idx]
-        confidence = detections.confidence[face_idx]

-        logger.debug(f"Face detected with confidence: {confidence:.2f}")

         # Get face center in [-1, 1] coordinates
         face_center = self._bbox_to_mp_coords(bbox, w, h)

 from __future__ import annotations
 import logging
+from typing import Tuple

 import numpy as np
+from numpy.typing import NDArray


 try:
     from supervision import Detections
+    from ultralytics import YOLO  # type: ignore[attr-defined]
 except ImportError as e:
     raise ImportError(
+        "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'",
     ) from e
 from huggingface_hub import hf_hub_download

         logger.error(f"Failed to load YOLO model: {e}")
         raise

+    def _select_best_face(self, detections: Detections) -> int | None:
         """Select the best face based on confidence and area (largest face with highest confidence).

         Args:

         if detections.xyxy.shape[0] == 0:
             return None

+        # Check if confidence is available
+        if detections.confidence is None:
+            return None
+
         # Filter by confidence threshold
         valid_mask = detections.confidence >= self.confidence_threshold
         if not np.any(valid_mask):

         # Return index of best face
         best_idx = valid_indices[np.argmax(scores)]
+        return int(best_idx)

+    def _bbox_to_mp_coords(self, bbox: NDArray[np.float32], w: int, h: int) -> NDArray[np.float32]:
         """Convert bounding box center to MediaPipe-style coordinates [-1, 1].

         Args:

         return np.array([norm_x, norm_y], dtype=np.float32)

+    def get_head_position(self, img: NDArray[np.uint8]) -> Tuple[NDArray[np.float32] | None, float | None]:
         """Get head position from face detection.

         Args:

         return None, None

         bbox = detections.xyxy[face_idx]

+        if detections.confidence is not None:
+            confidence = detections.confidence[face_idx]
+            logger.debug(f"Face detected with confidence: {confidence:.2f}")

         # Get face center in [-1, 1] coordinates
         face_center = self._bbox_to_mp_coords(bbox, w, h)
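The exact body of `_bbox_to_mp_coords` is elided in the diff, but the docstring and return line pin down the contract: the box center is mapped into [-1, 1] on both axes. A sketch of the presumed mapping, with a worked example:

```python
import numpy as np

def bbox_center_to_unit_coords(bbox: np.ndarray, w: int, h: int) -> np.ndarray:
    """Map an (x1, y1, x2, y2) pixel box center into [-1, 1] coordinates."""
    cx = (bbox[0] + bbox[2]) / 2.0
    cy = (bbox[1] + bbox[3]) / 2.0
    norm_x = (cx / w) * 2.0 - 1.0  # -1 = left edge, +1 = right edge
    norm_y = (cy / h) * 2.0 - 1.0  # -1 = top edge, +1 = bottom edge
    return np.array([norm_x, norm_y], dtype=np.float32)

# A face box centered at (480, 120) in a 640x480 frame lands at (0.5, -0.5):
print(bbox_center_to_unit_coords(np.array([440.0, 80.0, 520.0, 160.0]), 640, 480))
```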
tests/audio/test_head_wobbler.py CHANGED
@@ -4,11 +4,12 @@ import math
4
  import time
5
  import base64
6
  import threading
7
- from typing import List, Tuple, Callable
 
8
 
9
  import numpy as np
10
 
11
- from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
12
 
13
 
14
  def _make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0) -> str:
@@ -74,7 +75,7 @@ def test_reset_allows_future_offsets() -> None:
74
  wobbler.stop()
75
 
76
 
77
- def test_reset_during_inflight_chunk_keeps_worker(monkeypatch) -> None:
78
  """Simulate reset during chunk processing to ensure the worker survives."""
79
  wobbler, captured = _start_wobbler()
80
  ready = threading.Event()
 
4
  import time
5
  import base64
6
  import threading
7
+ from typing import Any, List, Tuple
8
+ from collections.abc import Callable
9
 
10
  import numpy as np
11
 
12
+ from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler
13
 
14
 
15
  def _make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0) -> str:
 
75
  wobbler.stop()
76
 
77
 
78
+ def test_reset_during_inflight_chunk_keeps_worker(monkeypatch: Any) -> None:
79
  """Simulate reset during chunk processing to ensure the worker survives."""
80
  wobbler, captured = _start_wobbler()
81
  ready = threading.Event()
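
Two small typing moves in this hunk: `Callable` now comes from `collections.abc` (its `typing` alias has been deprecated since Python 3.9 under PEP 585), and the `monkeypatch` fixture gains an annotation so the suite passes strict type checking. `Any` is the minimal fix; a tighter alternative, shown here as a suggestion rather than what the commit does, is pytest's public fixture type:

from collections.abc import Callable  # preferred home since PEP 585

import pytest

# Example alias using the relocated Callable; the name is illustrative.
AudioCallback = Callable[[bytes], None]


# pytest.MonkeyPatch has been part of the public API since pytest 6.2,
# so the fixture can be annotated precisely instead of with Any:
def test_reset_during_inflight_chunk_keeps_worker(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    ...
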
tests/test_openai_realtime.py ADDED
@@ -0,0 +1,117 @@
1
+ import asyncio
2
+ import logging
3
+ from typing import Any
4
+ from datetime import datetime, timezone
5
+ from unittest.mock import MagicMock
6
+
7
+ import pytest
8
+
9
+ import reachy_mini_conversation_app.openai_realtime as rt_mod
10
+ from reachy_mini_conversation_app.tools import ToolDependencies
11
+ from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
12
+
13
+
14
+ def _build_handler(loop: asyncio.AbstractEventLoop) -> OpenaiRealtimeHandler:
15
+ asyncio.set_event_loop(loop)
16
+ deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
17
+ return OpenaiRealtimeHandler(deps)
18
+
19
+
20
+ def test_format_timestamp_uses_wall_clock() -> None:
21
+ """Test that format_timestamp uses wall clock time."""
22
+ loop = asyncio.new_event_loop()
23
+ try:
24
+ print("Testing format_timestamp...")
25
+ handler = _build_handler(loop)
26
+ formatted = handler.format_timestamp()
27
+ print(f"Formatted timestamp: {formatted}")
28
+ finally:
29
+ asyncio.set_event_loop(None)
30
+ loop.close()
31
+
32
+ # Extract year from "[YYYY-MM-DD ...]"
33
+ year = int(formatted[1:5])
34
+ assert year == datetime.now(timezone.utc).year
35
+
36
+ @pytest.mark.asyncio
37
+ async def test_start_up_retries_on_abrupt_close(monkeypatch: Any, caplog: Any) -> None:
38
+ """First connection dies with ConnectionClosedError during iteration -> retried.
39
+
40
+ Second connection iterates cleanly (no events) -> start_up returns without raising.
41
+ Ensures handler clears self.connection at the end.
42
+ """
43
+ caplog.set_level(logging.WARNING)
44
+
45
+ # Use a local Exception as the module's ConnectionClosedError to avoid ws dependency
46
+ FakeCCE = type("FakeCCE", (Exception,), {})
47
+ monkeypatch.setattr(rt_mod, "ConnectionClosedError", FakeCCE)
48
+
49
+ # Make asyncio.sleep return immediately (for backoff)
50
+ async def _fast_sleep(*_a: Any, **_kw: Any) -> None: return None
51
+ monkeypatch.setattr(asyncio, "sleep", _fast_sleep, raising=False)
52
+
53
+ attempt_counter = {"n": 0}
54
+
55
+ class FakeConn:
56
+ """Minimal realtime connection stub."""
57
+
58
+ def __init__(self, mode: str):
59
+ self._mode = mode
60
+
61
+ class _Session:
62
+ async def update(self, **_kw: Any) -> None: return None
63
+ self.session = _Session()
64
+
65
+ class _InputAudioBuffer:
66
+ async def append(self, **_kw: Any) -> None: return None
67
+ self.input_audio_buffer = _InputAudioBuffer()
68
+
69
+ class _Item:
70
+ async def create(self, **_kw: Any) -> None: return None
71
+
72
+ class _Conversation:
73
+ item = _Item()
74
+ self.conversation = _Conversation()
75
+
76
+ class _Response:
77
+ async def create(self, **_kw: Any) -> None: return None
78
+ async def cancel(self, **_kw: Any) -> None: return None
79
+ self.response = _Response()
80
+
81
+ async def __aenter__(self) -> "FakeConn": return self
82
+ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return False
83
+ async def close(self) -> None: return None
84
+
85
+ # Async iterator protocol
86
+ def __aiter__(self) -> "FakeConn": return self
87
+ async def __anext__(self) -> None:
88
+ if self._mode == "raise_on_iter":
89
+ raise FakeCCE("abrupt close (simulated)")
90
+ raise StopAsyncIteration # clean exit (no events)
91
+
92
+ class FakeRealtime:
93
+ def connect(self, **_kw: Any) -> FakeConn:
94
+ attempt_counter["n"] += 1
95
+ mode = "raise_on_iter" if attempt_counter["n"] == 1 else "clean"
96
+ return FakeConn(mode)
97
+
98
+ class FakeClient:
99
+ def __init__(self, **_kw: Any) -> None: self.realtime = FakeRealtime()
100
+
101
+ # Patch the OpenAI client used by the handler
102
+ monkeypatch.setattr(rt_mod, "AsyncOpenAI", FakeClient)
103
+
104
+ # Build handler with minimal deps
105
+ deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
106
+ handler = rt_mod.OpenaiRealtimeHandler(deps)
107
+
108
+ # Run: should retry once and exit cleanly
109
+ await handler.start_up()
110
+
111
+ # Validate: two attempts total (fail -> retry -> succeed), and connection cleared
112
+ assert attempt_counter["n"] == 2
113
+ assert handler.connection is None
114
+
115
+ # Optional: confirm we logged the unexpected close once
116
+ warnings = [r for r in caplog.records if r.levelname == "WARNING" and "closed unexpectedly" in r.msg]
117
+ assert len(warnings) == 1
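
Taken together, the fakes pin down the reconnect contract of `OpenaiRealtimeHandler.start_up` without touching the network: iterate events, and when the socket dies mid-iteration, log one warning, back off, and connect again, clearing `self.connection` on the way out. A method sketch of the loop shape the assertions imply — attribute names like `self.client` and the one-second backoff are assumptions; only the observable behavior is fixed by the test:

import asyncio
import logging

from websockets.exceptions import ConnectionClosedError  # swapped for a fake in the test

logger = logging.getLogger(__name__)


async def start_up(self) -> None:
    try:
        while True:
            try:
                async with self.client.realtime.connect(model=self.model) as conn:
                    self.connection = conn
                    async for event in conn:
                        await self.handle_event(event)
                return  # iterator exhausted cleanly: we are done
            except ConnectionClosedError:
                # One WARNING per abrupt close; the test greps for this text.
                logger.warning("Realtime connection closed unexpectedly, reconnecting")
                await asyncio.sleep(1.0)  # backoff; patched to a no-op in the test
    finally:
        # The test asserts handler.connection is None after start_up returns.
        self.connection = None
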
uv.lock CHANGED
The diff for this file is too large to render. See raw diff