apirrone committed
Commit f0177ee · 2 Parent(s): 6df93fa 4af767f

Merge branch 'develop' into 62-appify-the-demo

Files changed (30)
  1. .github/workflows/tests.yml +74 -0
  2. .github/workflows/typecheck.yml +29 -0
  3. .gitignore +1 -0
  4. README.md +16 -8
  5. docs/assets/conversation_app_arch.svg +3 -0
  6. {src/reachy_mini_conversation_demo/images → docs/assets}/reachy_mini_dance.gif +0 -0
  7. docs/scheme.mmd +58 -0
  8. pyproject.toml +25 -10
  9. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/__init__.py +0 -0
  10. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/__init__.py +0 -0
  11. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/head_wobbler.py +15 -13
  12. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/speech_tapper.py +11 -9
  13. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/camera_worker.py +25 -26
  14. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/config.py +15 -26
  15. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/console.py +22 -11
  16. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/dance_emotion_moves.py +21 -18
  17. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/reachymini_avatar.png +0 -0
  18. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/user_avatar.png +0 -0
  19. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/main.py +12 -11
  20. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/moves.py +29 -28
  21. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/openai_realtime.py +203 -83
  22. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/prompts.py +0 -0
  23. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/tools.py +36 -34
  24. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/utils.py +12 -10
  25. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/__init__.py +0 -0
  26. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/processors.py +17 -15
  27. src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/yolo_head_tracker.py +15 -9
  28. tests/audio/test_head_wobbler.py +4 -3
  29. tests/test_openai_realtime.py +117 -0
  30. uv.lock +0 -0
.github/workflows/tests.yml ADDED
@@ -0,0 +1,74 @@
+ name: Tests
+ on:
+   push:
+   pull_request:
+
+ permissions:
+   contents: read
+   actions: write
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   tests:
+     name: pytest (py${{ matrix.python-version }})
+     runs-on: ubuntu-latest
+     timeout-minutes: 15
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.12"]
+
+     env:
+       HF_TOKEN: ${{ secrets.HF_TOKEN }}
+       HF_HUB_ETAG_TIMEOUT: "120"
+       HF_HUB_DOWNLOAD_TIMEOUT: "120"
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - uses: astral-sh/setup-uv@v5
+
+       - name: Set HF_HOME
+         shell: bash
+         run: |
+           echo "HF_HOME=${RUNNER_TEMP}/.hf" >> "$GITHUB_ENV"
+           mkdir -p "${RUNNER_TEMP}/.hf"
+
+       - name: Cache Hugging Face hub
+         uses: actions/cache@v4
+         with:
+           path: ${{ runner.temp }}/.hf
+           key: hf-${{ runner.os }}-${{ hashFiles('uv.lock', 'pyproject.toml') }}
+           restore-keys: hf-${{ runner.os }}-
+
+       # test-only .env file
+       - name: Create test .env
+         run: |
+           printf "OPENAI_API_KEY=test-dummy\n" > .env
+
+       - name: Install (locked)
+         run: |
+           uv sync --frozen --group dev --extra all_vision
+
+       # Prefetch HF dataset to avoid download during test collection
+       - name: Prefetch HF dataset
+         run: |
+           uv run python - <<'PY'
+           from huggingface_hub import snapshot_download
+           snapshot_download(
+               repo_id="pollen-robotics/reachy-mini-emotions-library",
+               repo_type="dataset",
+               etag_timeout=120,
+               max_workers=4
+           )
+           PY
+
+       - name: Run tests
+         run: uv run pytest -q
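
The prefetch step warms the Hugging Face cache so test collection never blocks on a download. The same call can be run locally before `pytest`; a minimal standalone sketch mirroring the workflow step (the `HF_HUB_*` timeouts must be set before `huggingface_hub` is imported, since the library reads them at import time):

```python
import os

# Mirror the workflow's timeout settings before importing huggingface_hub.
os.environ.setdefault("HF_HUB_ETAG_TIMEOUT", "120")
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "120")

from huggingface_hub import snapshot_download

# Same dataset snapshot the CI job prefetches.
snapshot_download(
    repo_id="pollen-robotics/reachy-mini-emotions-library",
    repo_type="dataset",
    etag_timeout=120,
    max_workers=4,
)
```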
.github/workflows/typecheck.yml ADDED
@@ -0,0 +1,29 @@
+ name: Type check
+
+ on: [push, pull_request]
+
+ permissions:
+   contents: read
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   mypy:
+     runs-on: ubuntu-latest
+     timeout-minutes: 10
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "3.12"
+
+       - uses: astral-sh/setup-uv@v5
+
+       - name: Install deps (locked) incl. vision extras
+         run: uv sync --frozen --group dev --extra all_vision
+
+       - name: Run mypy
+         run: uv run mypy --pretty --show-error-codes .
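
The same check runs locally with `uv run mypy --pretty --show-error-codes .`; for scripting, mypy also exposes a Python API. A minimal sketch using `mypy.api`:

```python
# Minimal sketch: invoke the workflow's mypy check through mypy's Python API.
from mypy import api

stdout, stderr, exit_status = api.run(["--pretty", "--show-error-codes", "."])
if stdout:
    print(stdout)
if stderr:
    print(stderr)
print(f"mypy exited with status {exit_status}")
```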
.gitignore CHANGED
@@ -29,6 +29,7 @@ coverage.xml
 
  # Linting and formatting
  .ruff_cache/
+ .mypy_cache/
 
  # IDE
  .vscode/
README.md CHANGED
@@ -9,11 +9,19 @@ tags:
  - reachy_mini
  ---
 
- # Reachy Mini conversation demo
+ # Reachy Mini conversation app
 
- Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
+ Conversational app for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
 
- ![Reachy Mini Dance](src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif)
+ ![Reachy Mini Dance](docs/assets/reachy_mini_dance.gif)
+
+ ## Architecture
+
+ The app follows a layered architecture connecting the user, AI services, and robot hardware:
+
+ <p align="center">
+   <img src="docs/assets/conversation_app_arch.svg" alt="Architecture Diagram" width="600"/>
+ </p>
 
  ## Overview
  - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
@@ -94,12 +102,12 @@ Some wheels (e.g. PyTorch) are large and require compatible CUDA or CPU builds
  | `HF_TOKEN` | Optional token for Hugging Face models (only used with `--local-vision` flag, falls back to `huggingface-cli login`).
  | `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (only used with `--local-vision` flag, defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`).
 
- ## Running the demo
+ ## Running the app
 
  Activate your virtual environment, ensure the Reachy Mini robot (or simulator) is reachable, then launch:
 
  ```bash
- reachy-mini-conversation-demo
+ reachy-mini-conversation-app
  ```
 
  By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). With a camera attached, vision is handled by the gpt-realtime model when the camera tool is used. For local vision processing, use the `--local-vision` flag to process frames periodically using the SmolVLM2 model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed.
@@ -119,19 +127,19 @@ By default, the app runs in console mode for direct audio interaction. Use the `
  - Run on hardware with MediaPipe face tracking:
 
  ```bash
- reachy-mini-conversation-demo --head-tracker mediapipe
+ reachy-mini-conversation-app --head-tracker mediapipe
  ```
 
  - Run with local vision processing (requires `local_vision` extra):
 
  ```bash
- reachy-mini-conversation-demo --local-vision
+ reachy-mini-conversation-app --local-vision
  ```
 
  - Disable the camera pipeline (audio-only conversation):
 
  ```bash
- reachy-mini-conversation-demo --no-camera
+ reachy-mini-conversation-app --no-camera
  ```
 
  ## LLM tools exposed to the assistant
docs/assets/conversation_app_arch.svg ADDED

Git LFS Details

  • SHA256: 2d3251bc98d5a0bf1d41d0332b76e7e86496745b2a0999f228b7d8647dd453a2
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
{src/reachy_mini_conversation_demo/images → docs/assets}/reachy_mini_dance.gif RENAMED
File without changes
docs/scheme.mmd ADDED
@@ -0,0 +1,58 @@
+ ---
+ config:
+   layout: dagre
+   flowchart:
+     htmlLabels: true
+ ---
+ flowchart TB
+     User(["<span style='font-size:16px;font-weight:bold;'>User</span><br><span style='font-size:13px;color:#01579b;'>Person interacting with system</span>"])
+     -- audio stream -->
+     UI@{ label: "<span style='font-size:16px;font-weight:bold;'>UI Layer</span><br><span style='font-size:13px;color:#0277bd;'>Gradio/Console</span>" }
+
+     UI -- audio stream -->
+     OpenAI@{ label: "<span style='font-size:17px;font-weight:bold;'>gpt-realtime API</span><br><span style='font-size:13px; color:#7b1fa2;'>Audio+Tool Calls+Vision</span>" }
+
+     OpenAI -- audio stream -->
+     Motion@{ label: "<span style='font-size:16px;font-weight:bold;'>Motion Control</span><br><span style='font-size:13px;color:#f57f17;'>Audio Sync + Tracking</span>" }
+
+     OpenAI -- tool calls -->
+     Handlers@{ label: "<span style='font-size:16px;font-weight:bold;'>Tool Handlers</span><br><span style='font-size:12px;color:#f9a825;'>move_head, camera, head_tracking,<br/>dance, play_emotion, do_nothing</span>" }
+
+     Handlers -- movement
+     requests --> Motion
+
+     Handlers -- camera frames, face tracking -->
+     Camera@{ label: "<span style='font-size:16px;font-weight:bold;'>Camera Worker</span><br><span style='font-size:13px;color:#f57f17;'>Frame Buffer + Face Tracking</span>" }
+
+     Handlers -. image for
+     analysis .-> OpenAI
+
+     Camera -- face tracking --> Motion
+
+     Camera -. frames .->
+     Vision@{ label: "<span style='font-size:16px;font-weight:bold;'>Vision Processor</span><br><span style='font-size:13px;color:#7b1fa2;'>Local VLM (optional)</span>" }
+
+     Vision -. description .-> Handlers
+
+     Robot@{ label: "<span style='font-size:16px;font-weight:bold;'>reachy_mini</span><br><span style='font-size:13px;color:#c62828;'>Robot Control Library</span>" }
+     -- camera
+     frames --> Camera
+
+     Motion -- commands --> Robot
+
+     Handlers -- results --> OpenAI
+
+     User:::userStyle
+     UI:::uiStyle
+     OpenAI:::aiStyle
+     Motion:::coreStyle
+     Handlers:::toolStyle
+     Camera:::coreStyle
+     Vision:::aiStyle
+     Robot:::hardwareStyle
+     classDef userStyle fill:#e1f5fe,stroke:#01579b,stroke-width:3px
+     classDef uiStyle fill:#b3e5fc,stroke:#0277bd,stroke-width:2px
+     classDef aiStyle fill:#e1bee7,stroke:#7b1fa2,stroke-width:3px
+     classDef coreStyle fill:#fff9c4,stroke:#f57f17,stroke-width:2px
+     classDef hardwareStyle fill:#ef9a9a,stroke:#c62828,stroke-width:3px
+     classDef toolStyle fill:#fffde7,stroke:#f9a825,stroke-width:1px
pyproject.toml CHANGED
@@ -3,7 +3,7 @@ requires = ["setuptools"]
  build-backend = "setuptools.build_meta"
 
  [project]
- name = "reachy_mini_conversation_demo"
+ name = "reachy_mini_conversation_app"
  version = "0.1.0"
  authors = [{ name = "Pollen Robotics", email = "contact@pollen-robotics.com" }]
  description = ""
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
  dependencies = [
      #Media
      "aiortc>=1.13.0",
-     "fastrtc@git+ssh://git@github.com/gradio-app/fastrtc.git@main",
+     "fastrtc>=0.0.33",
      "gradio>=5.49.0",
      "huggingface_hub>=0.34.4",
      "opencv-python>=4.12.0.88",
@@ -23,7 +23,7 @@
      #OpenAI
      "openai>=2.1",
 
-     #Reachy mini
+     #Reachy mini
      "reachy_mini_dances_library",
      "reachy_mini_toolbox",
      "reachy_mini>=1.0.0.rc5",
@@ -34,16 +34,23 @@ local_vision = ["torch", "transformers", "num2words"]
  yolo_vision = ["ultralytics", "supervision"]
  mediapipe_vision = ["mediapipe>=0.10.14"]
  all_vision = [
-     "reachy_mini_conversation_demo[local_vision]",
-     "reachy_mini_conversation_demo[yolo_vision]",
-     "reachy_mini_conversation_demo[mediapipe_vision]",
+     "torch", "transformers", "num2words",
+     "ultralytics", "supervision",
+     "mediapipe>=0.10.14",
  ]
 
  [dependency-groups]
- dev = ["pytest", "ruff==0.12.0"]
+ dev = [
+     "pytest",
+     "pytest-asyncio",
+     "ruff==0.12.0",
+     "mypy==1.18.2",
+     "pre-commit",
+     "types-requests",
+ ]
 
  [project.scripts]
- reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
+ reachy-mini-conversation-app = "reachy_mini_conversation_app.main:main"
 
  [project.entry-points."reachy_mini_apps"]
  reachy_mini_conversation_demo_app = "reachy_mini_conversation_demo.main:ReachyMiniConversationDemo"
@@ -56,7 +63,7 @@ include-package-data = true
  where = ["src"]
 
  [tool.setuptools.package-data]
- reachy_mini_conversation_demo = ["images/*"]
+ reachy_mini_conversation_app = ["images/*"]
 
  [tool.ruff]
  line-length = 119
@@ -82,7 +89,7 @@ ignore = [
  length-sort = true
  lines-after-imports = 2
  no-lines-before = ["standard-library", "local-folder"]
- known-local-folder = ["reachy_mini_conversation_demo"]
+ known-local-folder = ["reachy_mini_conversation_app"]
  known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
  split-on-trailing-comma = true
 
@@ -91,3 +98,11 @@ quote-style = "double"
  indent-style = "space"
  skip-magic-trailing-comma = false
  line-ending = "auto"
+
+ [tool.mypy]
+ python_version = "3.12"
+ files = ["src/"]
+ ignore_missing_imports = true
+ strict = true
+ show_error_codes = true
+ warn_unused_ignores = true
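
Note that `all_vision` now lists its packages directly instead of self-referencing extras, so the extra keeps working across the `demo` to `app` rename. To see which optional vision back-ends are importable in a given environment, a small diagnostic sketch (module names are taken from the extras above):

```python
# Report which optional vision back-ends from pyproject.toml are importable.
import importlib.util

VISION_BACKENDS = {
    "local_vision": ["torch", "transformers", "num2words"],
    "yolo_vision": ["ultralytics", "supervision"],
    "mediapipe_vision": ["mediapipe"],
}

for extra, modules in VISION_BACKENDS.items():
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    print(f"{extra}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")
```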
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/__init__.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/__init__.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/head_wobbler.py RENAMED
@@ -5,11 +5,13 @@ import queue
  import base64
  import logging
  import threading
- from typing import Tuple, Optional
+ from typing import Tuple
+ from collections.abc import Callable
 
  import numpy as np
+ from numpy.typing import NDArray
 
- from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
+ from reachy_mini_conversation_app.audio.speech_tapper import HOP_MS, SwayRollRT
 
 
  SAMPLE_RATE = 24000
@@ -20,13 +22,13 @@ logger = logging.getLogger(__name__)
  class HeadWobbler:
      """Converts audio deltas (base64) into head movement offsets."""
 
-     def __init__(self, set_speech_offsets):
+     def __init__(self, set_speech_offsets: Callable[[Tuple[float, float, float, float, float, float]], None]) -> None:
          """Initialize the head wobbler."""
          self._apply_offsets = set_speech_offsets
-         self._base_ts: Optional[float] = None
+         self._base_ts: float | None = None
          self._hops_done: int = 0
 
-         self.audio_queue: queue.Queue[Tuple[int, int, np.ndarray]] = queue.Queue()
+         self.audio_queue: "queue.Queue[Tuple[int, int, NDArray[np.int16]]]" = queue.Queue()
          self.sway = SwayRollRT()
 
          # Synchronization primitives
@@ -35,7 +37,7 @@ class HeadWobbler:
          self._generation = 0
 
          self._stop_event = threading.Event()
-         self._thread: Optional[threading.Thread] = None
+         self._thread: threading.Thread | None = None
 
      def feed(self, delta_b64: str) -> None:
          """Thread-safe: push audio into the consumer queue."""
@@ -78,14 +80,14 @@
          if chunk_generation != current_generation:
              continue
 
-         pcm = np.asarray(chunk).squeeze(0)
-         with self._sway_lock:
-             results = self.sway.feed(pcm, sr)
-
          if self._base_ts is None:
              with self._state_lock:
                  if self._base_ts is None:
-                     self._base_ts = time.time()
+                     self._base_ts = time.monotonic()
+
+         pcm = np.asarray(chunk).squeeze(0)
+         with self._sway_lock:
+             results = self.sway.feed(pcm, sr)
 
          i = 0
          while i < len(results):
@@ -96,14 +98,14 @@
              hops_done = self._hops_done
 
              if base_ts is None:
-                 base_ts = time.time()
+                 base_ts = time.monotonic()
                  with self._state_lock:
                      if self._base_ts is None:
                          self._base_ts = base_ts
                      hops_done = self._hops_done
 
              target = base_ts + MOVEMENT_LATENCY_S + hops_done * hop_dt
-             now = time.time()
+             now = time.monotonic()
 
              if now - target >= hop_dt:
                  lag_hops = int((now - target) / hop_dt)
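
The switch from `time.time()` to `time.monotonic()` matters here: the hop scheduler computes absolute deadlines, and a monotonic clock cannot jump backwards when the wall clock is adjusted (NTP, DST). A minimal sketch of the pacing pattern, assuming a 20 ms hop for illustration:

```python
# Sketch of monotonic hop pacing: deadlines derive from a fixed base
# timestamp, so lag can be measured and skipped hops counted reliably.
import time

HOP_DT = 0.020  # 20 ms per hop (illustrative value)

base_ts = time.monotonic()
hops_done = 0

for _ in range(5):
    target = base_ts + hops_done * HOP_DT  # absolute deadline for this hop
    delay = target - time.monotonic()
    if delay > 0:
        time.sleep(delay)  # wait until the scheduled hop time
    hops_done += 1

print(f"processed {hops_done} hops in {time.monotonic() - base_ts:.3f}s")
```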
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/speech_tapper.py RENAMED
@@ -1,10 +1,11 @@
  from __future__ import annotations
  import math
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List
  from itertools import islice
  from collections import deque
 
  import numpy as np
+ from numpy.typing import NDArray
 
 
  # Tunables
@@ -48,7 +49,7 @@ SWAY_ATTACK_FR = max(1, int(SWAY_ATTACK_MS / HOP_MS))
  SWAY_RELEASE_FR = max(1, int(SWAY_RELEASE_MS / HOP_MS))
 
 
- def _rms_dbfs(x: np.ndarray) -> float:
+ def _rms_dbfs(x: NDArray[np.float32]) -> float:
      """Root-mean-square in dBFS for float32 mono array in [-1,1]."""
      # numerically stable rms (avoid overflow)
      x = x.astype(np.float32, copy=False)
@@ -66,7 +67,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
      return t**LOUDNESS_GAMMA if LOUDNESS_GAMMA != 1.0 else t
 
 
- def _to_float32_mono(x: np.ndarray) -> np.ndarray:
+ def _to_float32_mono(x: NDArray[Any]) -> NDArray[np.float32]:
      """Convert arbitrary PCM array to float32 mono in [-1,1].
 
      Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
@@ -94,7 +95,7 @@ def _to_float32_mono(x: np.ndarray) -> np.ndarray:
      return a.astype(np.float32) / (scale if scale != 0.0 else 1.0)
 
 
- def _resample_linear(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
+ def _resample_linear(x: NDArray[np.float32], sr_in: int, sr_out: int) -> NDArray[np.float32]:
      """Lightweight linear resampler for short buffers."""
      if sr_in == sr_out or x.size == 0:
          return x
@@ -118,8 +119,8 @@ class SwayRollRT:
      def __init__(self, rng_seed: int = 7):
          """Initialize state."""
          self._seed = int(rng_seed)
-         self.samples = deque(maxlen=10 * SR)  # sliding window for VAD/env
-         self.carry = np.zeros(0, dtype=np.float32)
+         self.samples: deque[float] = deque(maxlen=10 * SR)  # sliding window for VAD/env
+         self.carry: NDArray[np.float32] = np.zeros(0, dtype=np.float32)
 
          self.vad_on = False
          self.vad_above = 0
@@ -150,7 +151,7 @@
          self.sway_down = 0
          self.t = 0.0
 
-     def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
+     def feed(self, pcm: NDArray[Any], sr: int | None) -> List[Dict[str, float]]:
          """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
 
          Args:
@@ -177,7 +178,8 @@
          while self.carry.size >= HOP:
              hop = self.carry[:HOP]
-             self.carry = self.carry[HOP:]
+             remaining: NDArray[np.float32] = self.carry[HOP:]
+             self.carry = remaining
 
              # keep sliding window for VAD/env computation
              # (deque accepts any iterable; list() for small HOP is fine)
@@ -260,7 +262,7 @@
                      "x_mm": x_mm,
                      "y_mm": y_mm,
                      "z_mm": z_mm,
-                 }
+                 },
              )
 
      return out
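
For reference, the dBFS scale used by `_rms_dbfs` maps a full-scale signal (amplitude 1.0) to 0 dBFS and amplitude 0.1 to -20 dBFS. A standalone sketch of the computation, not the module's exact implementation:

```python
# RMS level in dBFS for float32 mono audio in [-1, 1]; the epsilon guards
# against log10(0) on silent input.
import numpy as np
from numpy.typing import NDArray

def rms_dbfs(x: NDArray[np.float32]) -> float:
    rms = float(np.sqrt(np.mean(np.square(x, dtype=np.float64))))
    return float(20.0 * np.log10(max(rms, 1e-12)))

print(rms_dbfs(np.ones(480, dtype=np.float32)))        # ~0.0 dBFS
print(rms_dbfs(0.1 * np.ones(480, dtype=np.float32)))  # ~-20.0 dBFS
```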
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/camera_worker.py RENAMED
@@ -9,10 +9,10 @@ Ported from main_works.py camera_worker() function to provide:
  import time
  import logging
  import threading
- from typing import Tuple, Optional
+ from typing import Any, List, Tuple
 
- import cv2
  import numpy as np
+ from numpy.typing import NDArray
  from scipy.spatial.transform import Rotation as R
 
  from reachy_mini import ReachyMini
@@ -25,20 +25,20 @@ logger = logging.getLogger(__name__)
  class CameraWorker:
      """Thread-safe camera worker with frame buffering and face tracking."""
 
-     def __init__(self, reachy_mini: ReachyMini, head_tracker=None):
+     def __init__(self, reachy_mini: ReachyMini, head_tracker: Any = None) -> None:
          """Initialize."""
          self.reachy_mini = reachy_mini
          self.head_tracker = head_tracker
 
          # Thread-safe frame storage
-         self.latest_frame: Optional[np.ndarray] = None
+         self.latest_frame: NDArray[np.uint8] | None = None
          self.frame_lock = threading.Lock()
          self._stop_event = threading.Event()
-         self._thread: Optional[threading.Thread] = None
+         self._thread: threading.Thread | None = None
 
          # Face tracking state
          self.is_head_tracking_enabled = True
-         self.face_tracking_offsets = [
+         self.face_tracking_offsets: List[float] = [
              0.0,
              0.0,
              0.0,
@@ -49,31 +49,30 @@
          self.face_tracking_lock = threading.Lock()
 
          # Face tracking timing variables (same as main_works.py)
-         self.last_face_detected_time: Optional[float] = None
-         self.interpolation_start_time: Optional[float] = None
-         self.interpolation_start_pose: Optional[np.ndarray] = None
+         self.last_face_detected_time: float | None = None
+         self.interpolation_start_time: float | None = None
+         self.interpolation_start_pose: NDArray[np.float32] | None = None
          self.face_lost_delay = 2.0  # seconds to wait before starting interpolation
          self.interpolation_duration = 1.0  # seconds to interpolate back to neutral
 
          # Track state changes
          self.previous_head_tracking_state = self.is_head_tracking_enabled
 
-     def get_latest_frame(self) -> Optional[np.ndarray]:
+     def get_latest_frame(self) -> NDArray[np.uint8] | None:
          """Get the latest frame (thread-safe)."""
          with self.frame_lock:
              if self.latest_frame is None:
                  return None
-             else:
-                 frame = self.latest_frame.copy()
-                 frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                 return frame
+             # Return a copy in original BGR format (OpenCV native)
+             return self.latest_frame.copy()
 
      def get_face_tracking_offsets(
          self,
      ) -> Tuple[float, float, float, float, float, float]:
          """Get current face tracking offsets (thread-safe)."""
          with self.face_tracking_lock:
-             return tuple(self.face_tracking_offsets)
+             offsets = self.face_tracking_offsets
+             return (offsets[0], offsets[1], offsets[2], offsets[3], offsets[4], offsets[5])
 
      def set_head_tracking_enabled(self, enabled: bool) -> None:
          """Enable/disable head tracking."""
@@ -168,12 +167,11 @@
                  rotation[2],  # roll, pitch, yaw
              ]
 
-         else:
-             # No face detected while tracking enabled - set face lost timestamp
-             if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
-                 # Only update if we haven't already set a face lost time
-                 # (current_time check prevents overriding the disable-triggered timestamp)
-                 pass
+         # No face detected while tracking enabled - set face lost timestamp
+         elif self.last_face_detected_time is None or self.last_face_detected_time == current_time:
+             # Only update if we haven't already set a face lost time
+             # (current_time check prevents overriding the disable-triggered timestamp)
+             pass
 
          # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
          if self.last_face_detected_time is not None:
@@ -188,11 +186,12 @@
              current_translation = self.face_tracking_offsets[:3]
              current_rotation_euler = self.face_tracking_offsets[3:]
              # Convert to 4x4 pose matrix
-             self.interpolation_start_pose = np.eye(4)
-             self.interpolation_start_pose[:3, 3] = current_translation
-             self.interpolation_start_pose[:3, :3] = R.from_euler(
-                 "xyz", current_rotation_euler
+             pose_matrix = np.eye(4, dtype=np.float32)
+             pose_matrix[:3, 3] = current_translation
+             pose_matrix[:3, :3] = R.from_euler(
+                 "xyz", current_rotation_euler,
              ).as_matrix()
+             self.interpolation_start_pose = pose_matrix
 
              # Calculate interpolation progress (t from 0 to 1)
              elapsed_interpolation = current_time - self.interpolation_start_time
@@ -200,7 +199,7 @@
              # Interpolate between current pose and neutral pose
              interpolated_pose = linear_pose_interpolation(
-                 self.interpolation_start_pose, neutral_pose, t
+                 self.interpolation_start_pose, neutral_pose, t,
              )
 
              # Extract translation and rotation from interpolated pose
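
The interpolation-start pose built above packs the tracked offsets `(x, y, z, roll, pitch, yaw)` into a homogeneous 4x4 matrix. A standalone sketch of that construction (the offset values are made up for illustration):

```python
# Build a 4x4 pose matrix from translation + intrinsic xyz Euler angles,
# as done when face tracking starts interpolating back to neutral.
import numpy as np
from scipy.spatial.transform import Rotation as R

offsets = (0.01, 0.0, 0.02, 0.0, 0.1, -0.05)  # (x, y, z, roll, pitch, yaw)

pose = np.eye(4, dtype=np.float32)
pose[:3, 3] = offsets[:3]                                    # translation
pose[:3, :3] = R.from_euler("xyz", offsets[3:]).as_matrix()  # rotation

print(pose)
```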
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/config.py RENAMED
@@ -1,44 +1,33 @@
  import os
  import logging
- from pathlib import Path
 
- from dotenv import load_dotenv
+ from dotenv import find_dotenv, load_dotenv
 
 
  logger = logging.getLogger(__name__)
 
- # Check if .env file exists
- # TODO Antoine - disabled this for testing appifying
- # env_file = Path(".env")
- # if not env_file.exists():
- #     raise RuntimeError(
- #         ".env file not found. Please create one based on .env.example:\n"
- #         "  cp .env.example .env\n"
- #         "Then add your OPENAI_API_KEY to the .env file."
- #     )
-
- # Load .env and verify it was loaded successfully
- # if not load_dotenv():
- #     raise RuntimeError(
- #         "Failed to load .env file. Please ensure the file is readable and properly formatted."
- #     )
-
- logger.info("Configuration loaded from .env file")
+ # Locate .env file (search upward from current working directory)
+ dotenv_path = find_dotenv(usecwd=True)
+
+ if dotenv_path:
+     # Load .env and override environment variables
+     load_dotenv(dotenv_path=dotenv_path, override=True)
+     logger.info(f"Configuration loaded from {dotenv_path}")
+ else:
+     logger.warning("No .env file found, using environment variables")
 
 
  class Config:
-     """Configuration class for the conversation demo."""
+     """Configuration class for the conversation app."""
 
      # Required
      OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-     if OPENAI_API_KEY is None:
-         raise RuntimeError(
-             "OPENAI_API_KEY is not set in .env file. Please add it:\n"
-             "  OPENAI_API_KEY=your_api_key_here"
-         )
-     if not OPENAI_API_KEY.strip():
+     if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
          raise RuntimeError(
-             "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
+             "OPENAI_API_KEY is missing or empty.\n"
+             "Either:\n"
+             "  1. Create a .env file with: OPENAI_API_KEY=your_api_key_here (recommended)\n"
+             "  2. Set environment variable: export OPENAI_API_KEY=your_api_key_here"
          )
 
      # Optional
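
A condensed, standalone sketch of the new configuration flow (requires `python-dotenv`): `find_dotenv(usecwd=True)` searches upward from the current working directory, so the app no longer insists on a `.env` file sitting next to the entry point.

```python
# Sketch of the .env discovery pattern adopted above.
import os
import logging

from dotenv import find_dotenv, load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dotenv_path = find_dotenv(usecwd=True)  # walk up from the working directory
if dotenv_path:
    load_dotenv(dotenv_path=dotenv_path, override=True)
    logger.info("Configuration loaded from %s", dotenv_path)
else:
    logger.warning("No .env file found, using environment variables")

if not (os.getenv("OPENAI_API_KEY") or "").strip():
    raise RuntimeError("OPENAI_API_KEY is missing or empty")
```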
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/console.py RENAMED
@@ -3,14 +3,16 @@
  records mic frames to the handler and plays handler audio frames to the speaker.
  """
 
+ import time
  import asyncio
  import logging
+ from typing import List
 
- import librosa
  from fastrtc import AdditionalOutputs, audio_to_int16, audio_to_float32
+ from librosa import resample
 
  from reachy_mini import ReachyMini
- from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+ from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
 
 
  logger = logging.getLogger(__name__)
@@ -24,15 +26,21 @@
          self.handler = handler
          self._robot = robot
          self._stop_event = asyncio.Event()
-         self._tasks = []
+         self._tasks: List[asyncio.Task[None]] = []
          # Allow the handler to flush the player queue when appropriate.
-         self.handler._clear_queue = self.clear_audio_queue  # type: ignore[assignment]
+         self.handler._clear_queue = self.clear_audio_queue
+
+         # Hack to avoid the first lengthy call to resample at runtime.
+         # This is likely caused by cache initialization overhead.
+         import numpy as np
+         resample(np.array([0.0]), orig_sr=1, target_sr=1)
 
      def launch(self) -> None:
          """Start the recorder/player and run the async processing loops."""
          self._stop_event.clear()
          self._robot.media.start_recording()
          self._robot.media.start_playing()
+         time.sleep(1)  # give some time to the pipelines to start
 
          async def runner() -> None:
              self._tasks = [
@@ -83,9 +91,8 @@
              frame_mono = audio_frame.T[0]  # both channels are identical
              frame = audio_to_int16(frame_mono)
              await self.handler.receive((16000, frame))
-             # await asyncio.sleep(0)  # yield to event loop
-         else:
-             await asyncio.sleep(0.01)  # avoid busy loop
+
+         await asyncio.sleep(0.01)  # avoid busy loop
 
      async def play_loop(self) -> None:
          """Fetch outputs from the handler: log text and play audio frames."""
@@ -105,12 +112,16 @@
          elif isinstance(handler_output, tuple):
              input_sample_rate, audio_frame = handler_output
              device_sample_rate = self._robot.media.get_audio_samplerate()
-             audio_frame = audio_to_float32(audio_frame.squeeze())
+             audio_frame_float = audio_to_float32(audio_frame.squeeze())
+
              if input_sample_rate != device_sample_rate:
-                 audio_frame = librosa.resample(
-                     audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
+                 audio_frame_float = resample(
+                     audio_frame_float,
+                     orig_sr=input_sample_rate,
+                     target_sr=device_sample_rate,
                  )
-             self._robot.media.push_audio_sample(audio_frame)
+
+             self._robot.media.push_audio_sample(audio_frame_float)
 
          else:
              logger.debug("Ignoring output type=%s", type(handler_output).__name__)
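
The warm-up call added to `__init__` exists because the first `librosa` resample pays one-time setup cost; later calls are much faster. A small timing sketch that makes the effect visible (absolute numbers vary by machine):

```python
# Time a cold vs. warm librosa resample call.
import time

import numpy as np
from librosa import resample

for label in ("first (cold)", "second (warm)"):
    start = time.perf_counter()
    resample(np.zeros(16000, dtype=np.float32), orig_sr=16000, target_sr=24000)
    print(f"{label}: {time.perf_counter() - start:.3f}s")
```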
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/dance_emotion_moves.py RENAMED
@@ -9,6 +9,7 @@ import logging
  from typing import Tuple
 
  import numpy as np
+ from numpy.typing import NDArray
 
  from reachy_mini.motion.move import Move
  from reachy_mini.motion.recorded_move import RecordedMoves
@@ -18,7 +19,7 @@ from reachy_mini_dances_library.dance_move import DanceMove
  logger = logging.getLogger(__name__)
 
 
- class DanceQueueMove(Move):
+ class DanceQueueMove(Move):  # type: ignore
      """Wrapper for dance moves to work with the movement queue system."""
 
      def __init__(self, move_name: str):
@@ -29,9 +30,9 @@
      @property
      def duration(self) -> float:
          """Duration property required by official Move interface."""
-         return self.dance_move.duration
+         return float(self.dance_move.duration)
 
-     def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+     def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
          """Evaluate dance move at time t."""
          try:
              # Get the pose from the dance move
@@ -49,10 +50,10 @@
              from reachy_mini.utils import create_head_pose
 
              neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-             return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+             return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
- class EmotionQueueMove(Move):
+ class EmotionQueueMove(Move):  # type: ignore
      """Wrapper for emotion moves to work with the movement queue system."""
 
      def __init__(self, emotion_name: str, recorded_moves: RecordedMoves):
@@ -63,9 +64,9 @@
      @property
      def duration(self) -> float:
          """Duration property required by official Move interface."""
-         return self.emotion_move.duration
+         return float(self.emotion_move.duration)
 
-     def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+     def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
          """Evaluate emotion move at time t."""
          try:
              # Get the pose from the emotion move
@@ -83,20 +84,20 @@
              from reachy_mini.utils import create_head_pose
 
              neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-             return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+             return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
- class GotoQueueMove(Move):
+ class GotoQueueMove(Move):  # type: ignore
      """Wrapper for goto moves to work with the movement queue system."""
 
      def __init__(
          self,
-         target_head_pose: np.ndarray,
-         start_head_pose: np.ndarray = None,
+         target_head_pose: NDArray[np.float32],
+         start_head_pose: NDArray[np.float32] | None = None,
          target_antennas: Tuple[float, float] = (0, 0),
-         start_antennas: Tuple[float, float] = None,
+         start_antennas: Tuple[float, float] | None = None,
          target_body_yaw: float = 0,
-         start_body_yaw: float = None,
+         start_body_yaw: float | None = None,
          duration: float = 1.0,
      ):
          """Initialize a GotoQueueMove."""
@@ -113,7 +114,7 @@
          """Duration property required by official Move interface."""
          return self._duration
 
-     def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+     def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
          """Evaluate goto move at time t using linear interpolation."""
          try:
              from reachy_mini.utils import create_head_pose
@@ -136,7 +137,8 @@
                  [
                      self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
                      self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
-                 ]
+                 ],
+                 dtype=np.float64,
              )
 
              # Interpolate body yaw
@@ -146,6 +148,7 @@
          except Exception as e:
              logger.error(f"Error evaluating goto move at t={t}: {e}")
-             # Return target pose on error - convert antennas to numpy array
-             target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
-             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
+             # Return target pose on error - convert to float64
+             target_head_pose_f64 = self.target_head_pose.astype(np.float64)
+             target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]], dtype=np.float64)
+             return (target_head_pose_f64, target_antennas_array, self.target_body_yaw)
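
All three wrappers implement the same `Move` contract: a `duration` property plus `evaluate(t)` returning `(head_pose, antennas, body_yaw)`. The antenna interpolation in `GotoQueueMove` is plain linear blending with `t` clamped to [0, 1]; a minimal sketch:

```python
# Linear antenna interpolation as used by GotoQueueMove, with clamping.
import numpy as np

def lerp_antennas(start: tuple, target: tuple, t: float) -> np.ndarray:
    t = min(max(t, 0.0), 1.0)
    return np.array(
        [
            start[0] + (target[0] - start[0]) * t,
            start[1] + (target[1] - start[1]) * t,
        ],
        dtype=np.float64,
    )

print(lerp_antennas((0.0, 0.0), (0.3, -0.3), 0.5))  # -> [ 0.15 -0.15]
```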
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/reachymini_avatar.png RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/user_avatar.png RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/main.py RENAMED
@@ -1,28 +1,29 @@
- """Entrypoint for the Reachy Mini conversation demo."""
+ """Entrypoint for the Reachy Mini conversation app."""
 
  import os
  import sys
  import time
  import threading
+ from typing import Any, Dict, List
 
  import gradio as gr
  from fastapi import FastAPI
  from fastrtc import Stream
 
  from reachy_mini import ReachyMini, ReachyMiniApp
- from reachy_mini_conversation_demo.moves import MovementManager
- from reachy_mini_conversation_demo.tools import ToolDependencies
- from reachy_mini_conversation_demo.utils import (
+ from reachy_mini_conversation_app.moves import MovementManager
+ from reachy_mini_conversation_app.tools import ToolDependencies
+ from reachy_mini_conversation_app.utils import (
      parse_args,
      setup_logger,
      handle_vision_stuff,
  )
- from reachy_mini_conversation_demo.console import LocalStream
- from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
- from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+ from reachy_mini_conversation_app.console import LocalStream
+ from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
+ from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler
 
 
- def update_chatbot(chatbot: list[dict], response: dict):
+ def update_chatbot(chatbot: List[Dict[str, Any]], response: Dict[str, Any]) -> List[Dict[str, Any]]:
      """Update the chatbot with AdditionalOutputs."""
      chatbot.append(response)
      return chatbot
@@ -34,7 +35,7 @@ def main(robot=None):
      args.gradio = True  # TODO Antoine - force gradio for testing appifying
 
      logger = setup_logger(args.debug)
-     logger.info("Starting Reachy Mini Conversation Demo")
+     logger.info("Starting Reachy Mini Conversation App")
 
      if args.no_camera and args.head_tracker is not None:
          logger.warning("Head tracking is not activated due to --no-camera.")
@@ -45,7 +46,7 @@
      # Check if running in simulation mode without --gradio
      if robot.client.get_status()["simulation_enabled"] and not args.gradio:
          logger.error(
-             "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode."
+             "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode.",
          )
          robot.client.disconnect()
          sys.exit(1)
@@ -80,7 +81,7 @@
 
      handler = OpenaiRealtimeHandler(deps)
 
-     stream_manager = None
+     stream_manager: gr.Blocks | LocalStream | None = None
 
      if args.gradio:
          stream = Stream(
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/moves.py RENAMED
@@ -36,11 +36,12 @@ import time
36
  import logging
37
  import threading
38
  from queue import Empty, Queue
39
- from typing import Any, Tuple, Optional
40
  from collections import deque
41
  from dataclasses import dataclass
42
 
43
  import numpy as np
 
44
 
45
  from reachy_mini import ReachyMini
46
  from reachy_mini.utils import create_head_pose
@@ -57,15 +58,15 @@ logger = logging.getLogger(__name__)
57
  CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop
58
 
59
  # Type definitions
60
- FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
61
 
62
 
63
- class BreathingMove(Move):
64
  """Breathing move with interpolation to neutral and then continuous breathing patterns."""
65
 
66
  def __init__(
67
  self,
68
- interpolation_start_pose: np.ndarray,
69
  interpolation_start_antennas: Tuple[float, float],
70
  interpolation_duration: float = 1.0,
71
  ):
@@ -96,7 +97,7 @@ class BreathingMove(Move):
96
  """Duration property required by official Move interface."""
97
  return float("inf") # Continuous breathing (never ends naturally)
98
 
99
- def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
100
  """Evaluate breathing move at time t."""
101
  if t < self.interpolation_duration:
102
  # Phase 1: Interpolate to neutral base position
@@ -104,13 +105,14 @@ class BreathingMove(Move):
104
 
105
  # Interpolate head pose
106
  head_pose = linear_pose_interpolation(
107
- self.interpolation_start_pose, self.neutral_head_pose, interpolation_t
108
  )
109
 
110
  # Interpolate antennas
111
- antennas = (
112
  1 - interpolation_t
113
  ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
 
114
 
115
  else:
116
  # Phase 2: Breathing patterns from neutral base
@@ -122,7 +124,7 @@ class BreathingMove(Move):
122
 
123
  # Antenna sway (opposite directions)
124
  antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
125
- antennas = np.array([antenna_sway, -antenna_sway])
126
 
127
  # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
128
  return (head_pose, antennas, 0.0)
@@ -168,8 +170,8 @@ class MovementState:
168
  """State tracking for the movement system."""
169
 
170
  # Primary move state
171
- current_move: Optional[Move] = None
172
-    move_start_time: Optional[float] = None
     last_activity_time: float = 0.0

     # Secondary move state (offsets)
@@ -191,7 +193,7 @@ class MovementState:
     )

     # Status flags
-    last_primary_pose: Optional[FullBodyPose] = None

     def update_activity(self) -> None:
         """Update the last activity time."""
@@ -242,7 +244,7 @@ class MovementManager:
     def __init__(
         self,
         current_robot: ReachyMini,
-        camera_worker=None,
     ):
         """Initialize movement manager."""
         self.current_robot = current_robot
@@ -258,7 +260,7 @@ class MovementManager:
         self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)

         # Move queue (primary moves)
-        self.move_queue = deque()

         # Configuration
         self.idle_inactivity_delay = 0.3  # seconds
@@ -266,7 +268,7 @@ class MovementManager:
         self.target_period = 1.0 / self.target_frequency

         self._stop_event = threading.Event()
-        self._thread: Optional[threading.Thread] = None
         self._is_listening = False
         self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
@@ -281,7 +283,7 @@ class MovementManager:
         self._set_target_err_suppressed = 0

         # Cross-thread signalling
-        self._command_queue: Queue[tuple[str, Any]] = Queue()
         self._speech_offsets_lock = threading.Lock()
         self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (
             0.0,
@@ -383,7 +385,7 @@ class MovementManager:

     def _apply_pending_offsets(self) -> None:
         """Apply the most recent speech/face offset updates."""
-        speech_offsets: Optional[Tuple[float, float, float, float, float, float]] = None
         with self._speech_offsets_lock:
             if self._speech_offsets_dirty:
                 speech_offsets = self._pending_speech_offsets
@@ -393,7 +395,7 @@ class MovementManager:
             self.state.speech_offsets = speech_offsets
             self.state.update_activity()

-        face_offsets: Optional[Tuple[float, float, float, float, float, float]] = None
         with self._face_offsets_lock:
             if self._face_offsets_dirty:
                 face_offsets = self._pending_face_offsets
@@ -549,14 +551,13 @@ class MovementManager:
             )

             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
         else:
-            # Otherwise reuse the last primary pose so we avoid jumps between moves
-            if self.state.last_primary_pose is not None:
-                primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
-            else:
-                neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-                primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-            self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

         return primary_full_body_pose

@@ -631,7 +632,7 @@ class MovementManager:

         return antennas_cmd

-    def _issue_control_command(self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float) -> None:
         """Send the fused pose to the robot with throttled error logging."""
         try:
             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
@@ -651,7 +652,7 @@ class MovementManager:
         self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))

     def _update_frequency_stats(
-        self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats
     ) -> LoopFrequencyStats:
         """Update frequency statistics based on the current loop start time."""
         period = loop_start - prev_loop_start
@@ -664,7 +665,7 @@ class MovementManager:
         stats.min_freq = min(stats.min_freq, stats.last_freq)
         return stats

-    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> tuple[float, LoopFrequencyStats]:
         """Compute sleep time to maintain target frequency and update potential freq."""
         computation_time = self._now() - loop_start
         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
@@ -729,7 +730,7 @@ class MovementManager:
         self._thread = None
         logger.debug("Move worker stopped")

-    def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
         with self._status_lock:
             pose_snapshot = clone_full_body_pose(self._last_commanded_pose)
 
 import logging
 import threading
 from queue import Empty, Queue
+from typing import Any, Dict, Tuple
 from collections import deque
 from dataclasses import dataclass

 import numpy as np
+from numpy.typing import NDArray

 from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose

 CONTROL_LOOP_FREQUENCY_HZ = 100.0  # Hz - Target frequency for the movement control loop

 # Type definitions
+FullBodyPose = Tuple[NDArray[np.float32], Tuple[float, float], float]  # (head_pose_4x4, antennas, body_yaw)


+class BreathingMove(Move):  # type: ignore
     """Breathing move with interpolation to neutral and then continuous breathing patterns."""

     def __init__(
         self,
+        interpolation_start_pose: NDArray[np.float32],
         interpolation_start_antennas: Tuple[float, float],
         interpolation_duration: float = 1.0,
     ):

         """Duration property required by official Move interface."""
         return float("inf")  # Continuous breathing (never ends naturally)

+    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
         """Evaluate breathing move at time t."""
         if t < self.interpolation_duration:
             # Phase 1: Interpolate to neutral base position

             # Interpolate head pose
             head_pose = linear_pose_interpolation(
+                self.interpolation_start_pose, self.neutral_head_pose, interpolation_t,
             )

             # Interpolate antennas
+            antennas_interp = (
                 1 - interpolation_t
             ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
+            antennas = antennas_interp.astype(np.float64)

         else:
             # Phase 2: Breathing patterns from neutral base

             # Antenna sway (opposite directions)
             antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
+            antennas = np.array([antenna_sway, -antenna_sway], dtype=np.float64)

         # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
         return (head_pose, antennas, 0.0)

     """State tracking for the movement system."""

     # Primary move state
+    current_move: Move | None = None
+    move_start_time: float | None = None
     last_activity_time: float = 0.0

     # Secondary move state (offsets)

     )

     # Status flags
+    last_primary_pose: FullBodyPose | None = None

     def update_activity(self) -> None:
         """Update the last activity time."""

     def __init__(
         self,
         current_robot: ReachyMini,
+        camera_worker: "Any" = None,
     ):
         """Initialize movement manager."""
         self.current_robot = current_robot

         self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)

         # Move queue (primary moves)
+        self.move_queue: deque[Move] = deque()

         # Configuration
         self.idle_inactivity_delay = 0.3  # seconds

         self.target_period = 1.0 / self.target_frequency

         self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None
         self._is_listening = False
         self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]

         self._set_target_err_suppressed = 0

         # Cross-thread signalling
+        self._command_queue: "Queue[Tuple[str, Any]]" = Queue()
         self._speech_offsets_lock = threading.Lock()
         self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (
             0.0,

     def _apply_pending_offsets(self) -> None:
         """Apply the most recent speech/face offset updates."""
+        speech_offsets: Tuple[float, float, float, float, float, float] | None = None
         with self._speech_offsets_lock:
             if self._speech_offsets_dirty:
                 speech_offsets = self._pending_speech_offsets

             self.state.speech_offsets = speech_offsets
             self.state.update_activity()

+        face_offsets: Tuple[float, float, float, float, float, float] | None = None
         with self._face_offsets_lock:
             if self._face_offsets_dirty:
                 face_offsets = self._pending_face_offsets

             )

             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
+        # Otherwise reuse the last primary pose so we avoid jumps between moves
+        elif self.state.last_primary_pose is not None:
+            primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
         else:
+            neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
+            primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
+        self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

         return primary_full_body_pose

         return antennas_cmd

+    def _issue_control_command(self, head: NDArray[np.float32], antennas: Tuple[float, float], body_yaw: float) -> None:
         """Send the fused pose to the robot with throttled error logging."""
         try:
             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)

         self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))

     def _update_frequency_stats(
+        self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats,
     ) -> LoopFrequencyStats:
         """Update frequency statistics based on the current loop start time."""
         period = loop_start - prev_loop_start

         stats.min_freq = min(stats.min_freq, stats.last_freq)
         return stats

+    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> Tuple[float, LoopFrequencyStats]:
         """Compute sleep time to maintain target frequency and update potential freq."""
         computation_time = self._now() - loop_start
         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")

         self._thread = None
         logger.debug("Move worker stopped")

+    def get_status(self) -> Dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
         with self._status_lock:
             pose_snapshot = clone_full_body_pose(self._last_commanded_pose)
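The `_schedule_next_tick` helper above is what holds the worker near `CONTROL_LOOP_FREQUENCY_HZ`: it measures how long the tick's computation took and sleeps only for the remainder of the period. A minimal, self-contained sketch of that pattern (illustrative names, not the app's API):

```python
import time

TARGET_FREQUENCY_HZ = 100.0
TARGET_PERIOD_S = 1.0 / TARGET_FREQUENCY_HZ

def control_loop(tick, should_stop) -> None:
    """Run tick() at a fixed frequency, compensating for computation time."""
    while not should_stop():
        loop_start = time.monotonic()
        tick()  # compute offsets, fuse poses, send one command
        computation_time = time.monotonic() - loop_start
        # The achievable ("potential") frequency is 1 / computation_time;
        # sleeping only the remainder keeps the effective rate near target.
        time.sleep(max(0.0, TARGET_PERIOD_S - computation_time))
```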
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/openai_realtime.py RENAMED
@@ -1,21 +1,25 @@
 import json
 import base64
 import asyncio
 import logging
 from datetime import datetime

 import numpy as np
 import gradio as gr
 from openai import AsyncOpenAI
 from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item

-from reachy_mini_conversation_demo.tools import (
     ALL_TOOL_SPECS,
     ToolDependencies,
     dispatch_tool_call,
 )
-from reachy_mini_conversation_demo.config import config
-from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS


 logger = logging.getLogger(__name__)
@@ -33,57 +37,131 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         )
         self.deps = deps

-        self.connection = None
-        self.output_queue = asyncio.Queue()

         self.last_activity_time = asyncio.get_event_loop().time()
         self.start_time = asyncio.get_event_loop().time()
         self.is_idle_tool_call = False

-    def copy(self):
         """Create a copy of the handler."""
         return OpenaiRealtimeHandler(self.deps)

-    async def start_up(self):
-        """Start the handler."""
         self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
-        async with self.client.beta.realtime.connect(model=config.MODEL_NAME) as conn:
-            await conn.session.update(
-                session={
-                    "turn_detection": {
-                        "type": "server_vad",
-                    },
-                    "input_audio_transcription": {
-                        "model": "whisper-1",
-                        "language": "en",
                 },
-                "voice": "ballad",
-                "instructions": SESSION_INSTRUCTIONS,
-                "tools": ALL_TOOL_SPECS,
-                "tool_choice": "auto",
-                "temperature": 0.7,
-            }
-        )

             # Manage event received from the openai server
             self.connection = conn
             async for event in self.connection:
                 logger.debug(f"OpenAI event: {event.type}")
                 if event.type == "input_audio_buffer.speech_started":
-                    if hasattr(self, '_clear_queue'):
                         self._clear_queue()
-                    self.deps.head_wobbler.reset()
                     self.deps.movement_manager.set_listening(True)
                     logger.debug("User speech started")

                 if event.type == "input_audio_buffer.speech_stopped":
                     self.deps.movement_manager.set_listening(False)
-                    logger.debug("User speech stopped")
-
-                if event.type in ("response.audio.completed", "response.completed"):
-                    # Doesn't seem to be called
                     logger.debug("response completed")
-                    self.deps.head_wobbler.reset()

                 if event.type == "response.created":
                     logger.debug("Response created")
@@ -91,18 +169,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 if event.type == "response.done":
                     # Doesn't mean the audio is done playing
                     logger.debug("Response done")
-                    pass

                 if event.type == "conversation.item.input_audio_transcription.completed":
                     logger.debug(f"User transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))

-                if event.type == "response.audio_transcript.done":
                     logger.debug(f"Assistant transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))

-                if event.type == "response.audio.delta":
-                    self.deps.head_wobbler.feed(event.delta)
                     self.last_activity_time = asyncio.get_event_loop().time()
                     logger.debug("last activity time updated to %s", self.last_activity_time)
                     await self.output_queue.put(
@@ -118,6 +206,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     args_json_str = getattr(event, "arguments", None)
                     call_id = getattr(event, "call_id", None)

                     try:
                         tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                         logger.debug("Tool '%s' executed successfully", tool_name)
@@ -127,22 +219,23 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                         tool_result = {"error": str(e)}

                     # send the tool result back
-                    await self.connection.conversation.item.create(
-                        item={
-                            "type": "function_call_output",
-                            "call_id": call_id,
-                            "output": json.dumps(tool_result),
-                        }
-                    )

                     await self.output_queue.put(
                         AdditionalOutputs(
                             {
                                 "role": "assistant",
                                 "content": json.dumps(tool_result),
-                                "metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
                             },
-                        )
                     )

                     if tool_name == "camera" and "b64_im" in tool_result:
@@ -159,55 +252,68 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                             {
                                 "type": "input_image",
                                 "image_url": f"data:image/jpeg;base64,{b64_im}",
-                            }
                         ],
-                    }
                 )
                 logger.info("Added camera image to conversation")

-                        np_img = self.deps.camera_worker.get_latest_frame()
-                        img = gr.Image(value=np_img)

-                        await self.output_queue.put(
-                            AdditionalOutputs(
-                                {
-                                    "role": "assistant",
-                                    "content": img,
-                                }
                             )
-                        )

-                    if not self.is_idle_tool_call:
                         await self.connection.response.create(
                             response={
-                                "instructions": "Use the tool result just returned and answer concisely in speech."
-                            }
                         )
-                    else:
-                        self.is_idle_tool_call = False

                     # re-synchronize the head wobble after a tool call that may have taken some time
-                    self.deps.head_wobbler.reset()

                 # server error
                 if event.type == "error":
                     err = getattr(event, "error", None)
                     msg = getattr(err, "message", str(err) if err else "unknown error")
-                    logger.error("Realtime error: %s (raw=%s)", msg, err)
-                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))

     # Microphone receive
-    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         """Receive audio frame from the microphone and send it to the openai server."""
         if not self.connection:
             return
         _, array = frame
         array = array.squeeze()
         audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
-        # Fills the input audio buffer to be sent to the server
-        await self.connection.input_audio_buffer.append(audio=audio_message)  # type: ignore

-    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
         """Emit audio frame to be played by the speaker."""
         # sends to the stream the stuff put in the output queue by the openai event handler
         # This is called periodically by the fastrtc Stream
@@ -215,28 +321,43 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         # Handle idle
         idle_duration = asyncio.get_event_loop().time() - self.last_activity_time
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
-            await self.send_idle_signal(idle_duration)

             self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets

-        return await wait_for_item(self.output_queue)

     async def shutdown(self) -> None:
         """Shutdown the handler."""
         if self.connection:
-            await self.connection.close()
-            self.connection = None
-
-    def format_timestamp(self):
-        """Format current timestamp with date, time and elapsed seconds."""
-        current_time = asyncio.get_event_loop().time()
-        elapsed_seconds = current_time - self.start_time
-        dt = datetime.fromtimestamp(current_time)
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"

-
-
-    async def send_idle_signal(self, idle_duration) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")
         self.is_idle_tool_call = True
@@ -249,12 +370,11 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     "type": "message",
                     "role": "user",
                     "content": [{"type": "input_text", "text": timestamp_msg}],
-                }
             )
             await self.connection.response.create(
                 response={
-                    "modalities": ["text"],
                     "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
                     "tool_choice": "required",
-                }
             )
 
 import json
 import base64
+import random
 import asyncio
 import logging
+from typing import Any, Tuple, Literal, cast
 from datetime import datetime

 import numpy as np
 import gradio as gr
 from openai import AsyncOpenAI
 from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+from numpy.typing import NDArray
+from websockets.exceptions import ConnectionClosedError

+from reachy_mini_conversation_app.tools import (
     ALL_TOOL_SPECS,
     ToolDependencies,
     dispatch_tool_call,
 )
+from reachy_mini_conversation_app.config import config
+from reachy_mini_conversation_app.prompts import SESSION_INSTRUCTIONS


 logger = logging.getLogger(__name__)

         )
         self.deps = deps

+        # Override type annotations for OpenAI strict typing (only for values used in API)
+        self.output_sample_rate: Literal[24000]
+        self.target_input_rate: Literal[24000] = 24000
+        # input_sample_rate is left as int for the comparison logic
+        self.resample_ratio = self.target_input_rate / self.input_sample_rate
+
+        self.connection: Any = None
+        self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()

         self.last_activity_time = asyncio.get_event_loop().time()
         self.start_time = asyncio.get_event_loop().time()
         self.is_idle_tool_call = False

+    def copy(self) -> "OpenaiRealtimeHandler":
         """Create a copy of the handler."""
         return OpenaiRealtimeHandler(self.deps)

+    def resample_audio(self, audio: NDArray[np.int16]) -> NDArray[np.int16]:
+        """Resample audio using linear interpolation."""
+        if self.input_sample_rate == self.target_input_rate:
+            return audio
+
+        # Use numpy's interp for simple linear resampling
+        input_length = len(audio)
+        output_length = int(input_length * self.resample_ratio)
+
+        input_time = np.arange(input_length)
+        output_time = np.linspace(0, input_length - 1, output_length)
+
+        resampled = np.interp(output_time, input_time, audio.astype(np.float32))
+        return cast(NDArray[np.int16], resampled.astype(np.int16))
+
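The `resample_audio` method above stretches the input sample grid onto an output grid with `np.interp`. A self-contained sketch of the same idea, resampling int16 PCM from 16 kHz to 24 kHz (the rates here are example values, not the handler's configuration):

```python
import numpy as np

def resample_linear(audio: np.ndarray, rate_in: int, rate_out: int) -> np.ndarray:
    """Linearly interpolate int16 PCM from rate_in to rate_out."""
    if rate_in == rate_out:
        return audio
    n_in = len(audio)
    n_out = int(n_in * rate_out / rate_in)
    # Positions of the output samples expressed on the input sample grid.
    x_out = np.linspace(0, n_in - 1, n_out)
    x_in = np.arange(n_in)
    return np.interp(x_out, x_in, audio.astype(np.float32)).astype(np.int16)

# 0.1 s of a 440 Hz tone at 16 kHz becomes 2400 samples at 24 kHz.
chunk = (np.sin(2 * np.pi * 440 * np.arange(1600) / 16000) * 32767).astype(np.int16)
assert len(resample_linear(chunk, 16000, 24000)) == 2400
```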
+    async def start_up(self) -> None:
+        """Start the handler with minimal retries on unexpected websocket closure."""
         self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
+
+        max_attempts = 3
+        for attempt in range(1, max_attempts + 1):
+            try:
+                await self._run_realtime_session()
+                # Normal exit from the session, stop retrying
+                return
+            except ConnectionClosedError as e:
+                # Abrupt close (e.g., "no close frame received or sent") → retry
+                logger.warning(
+                    "Realtime websocket closed unexpectedly (attempt %d/%d): %s",
+                    attempt, max_attempts, e
+                )
+                if attempt < max_attempts:
+                    # exponential backoff with jitter
+                    base_delay = 2 ** (attempt - 1)  # 1s, 2s, ... doubling per attempt
+                    jitter = random.uniform(0, 0.5)
+                    delay = base_delay + jitter
+                    logger.info("Retrying in %.1f seconds...", delay)
+                    await asyncio.sleep(delay)
+                    continue
+                raise
+            finally:
+                # never keep a stale reference
+                self.connection = None
+
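With `max_attempts = 3` the handler retries at most twice: `2 ** (attempt - 1)` gives base delays of 1 s and then 2 s, each padded with up to 0.5 s of uniform jitter so multiple clients do not reconnect in lockstep; the third failure re-raises, and the `finally` block drops the stale `self.connection` reference on every path.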
+    async def _run_realtime_session(self) -> None:
+        """Establish and manage a single realtime session."""
+        async with self.client.realtime.connect(model=config.MODEL_NAME) as conn:
+            try:
+                await conn.session.update(
+                    session={
+                        "type": "realtime",
+                        "instructions": SESSION_INSTRUCTIONS,
+                        "audio": {
+                            "input": {
+                                "format": {
+                                    "type": "audio/pcm",
+                                    "rate": self.target_input_rate,
+                                },
+                                "transcription": {
+                                    "model": "whisper-1",
+                                    "language": "en"
+                                },
+                                "turn_detection": {
+                                    "type": "server_vad",
+                                    "interrupt_response": True,
+                                },
+                            },
+                            "output": {
+                                "format": {
+                                    "type": "audio/pcm",
+                                    "rate": self.output_sample_rate,
+                                },
+                                "voice": "cedar",
+                            },
+                        },
+                        "tools": ALL_TOOL_SPECS,  # type: ignore[typeddict-item]
+                        "tool_choice": "auto",
                     },
+                )
+            except Exception:
+                logger.exception("Realtime session.update failed; aborting startup")
+                return
+
+            logger.info("Realtime session updated successfully")

             # Manage event received from the openai server
             self.connection = conn
             async for event in self.connection:
                 logger.debug(f"OpenAI event: {event.type}")
                 if event.type == "input_audio_buffer.speech_started":
+                    if hasattr(self, "_clear_queue") and callable(self._clear_queue):
                         self._clear_queue()
+                    if self.deps.head_wobbler is not None:
+                        self.deps.head_wobbler.reset()
                     self.deps.movement_manager.set_listening(True)
                     logger.debug("User speech started")

                 if event.type == "input_audio_buffer.speech_stopped":
                     self.deps.movement_manager.set_listening(False)
+                    logger.debug("User speech stopped - server will auto-commit with VAD")
+
+                if event.type in (
+                    "response.audio.done",  # GA
+                    "response.output_audio.done",  # GA alias
+                    "response.audio.completed",  # legacy (for safety)
+                    "response.completed",  # text-only completion
+                ):
                     logger.debug("response completed")

                 if event.type == "response.created":
                     logger.debug("Response created")

                 if event.type == "response.done":
                     # Doesn't mean the audio is done playing
                     logger.debug("Response done")

+                # Handle partial transcription (user speaking in real-time)
+                if event.type == "conversation.item.input_audio_transcription.partial":
+                    logger.debug(f"User partial transcript: {event.transcript}")
+                    await self.output_queue.put(
+                        AdditionalOutputs({"role": "user_partial", "content": event.transcript})
+                    )
+
+                # Handle completed transcription (user finished speaking)
                 if event.type == "conversation.item.input_audio_transcription.completed":
                     logger.debug(f"User transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))

+                # Handle assistant transcription
+                if event.type in ("response.audio_transcript.done", "response.output_audio_transcript.done"):
                     logger.debug(f"Assistant transcript: {event.transcript}")
                     await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))

+                # Handle audio delta
+                if event.type in ("response.audio.delta", "response.output_audio.delta"):
+                    if self.deps.head_wobbler is not None:
+                        self.deps.head_wobbler.feed(event.delta)
                     self.last_activity_time = asyncio.get_event_loop().time()
                     logger.debug("last activity time updated to %s", self.last_activity_time)
                     await self.output_queue.put(

                     args_json_str = getattr(event, "arguments", None)
                     call_id = getattr(event, "call_id", None)

+                    if not isinstance(tool_name, str) or not isinstance(args_json_str, str):
+                        logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str)
+                        continue
+
                     try:
                         tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                         logger.debug("Tool '%s' executed successfully", tool_name)

                         tool_result = {"error": str(e)}

                     # send the tool result back
+                    if isinstance(call_id, str):
+                        await self.connection.conversation.item.create(
+                            item={
+                                "type": "function_call_output",
+                                "call_id": call_id,
+                                "output": json.dumps(tool_result),
+                            },
+                        )

                     await self.output_queue.put(
                         AdditionalOutputs(
                             {
                                 "role": "assistant",
                                 "content": json.dumps(tool_result),
+                                "metadata": {"title": f"🛠️ Used tool {tool_name}", "status": "done"},
                             },
+                        ),
                     )

                     if tool_name == "camera" and "b64_im" in tool_result:

                             {
                                 "type": "input_image",
                                 "image_url": f"data:image/jpeg;base64,{b64_im}",
+                            },
                         ],
+                    },
                 )
                 logger.info("Added camera image to conversation")

+                        if self.deps.camera_worker is not None:
+                            np_img = self.deps.camera_worker.get_latest_frame()
+                            img = gr.Image(value=np_img)

+                            await self.output_queue.put(
+                                AdditionalOutputs(
+                                    {
+                                        "role": "assistant",
+                                        "content": img,
+                                    },
+                                ),
                             )

+                    # if this tool call was triggered by an idle signal, don't make the robot speak
+                    # for other tool calls, let the robot reply out loud
+                    if self.is_idle_tool_call:
+                        self.is_idle_tool_call = False
+                    else:
                         await self.connection.response.create(
                             response={
+                                "instructions": "Use the tool result just returned and answer concisely in speech.",
+                            },
                         )

                     # re-synchronize the head wobble after a tool call that may have taken some time
+                    if self.deps.head_wobbler is not None:
+                        self.deps.head_wobbler.reset()

                 # server error
                 if event.type == "error":
                     err = getattr(event, "error", None)
                     msg = getattr(err, "message", str(err) if err else "unknown error")
+                    code = getattr(err, "code", "")
+
+                    logger.error("Realtime error [%s]: %s (raw=%s)", code, msg, err)
+
+                    # Only show user-facing errors, not internal state errors
+                    if code not in ("input_audio_buffer_commit_empty", "conversation_already_has_active_response"):
+                        await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))

     # Microphone receive
+    async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
         """Receive audio frame from the microphone and send it to the openai server."""
         if not self.connection:
             return
         _, array = frame
         array = array.squeeze()
+
+        # Resample if needed
+        if self.input_sample_rate != self.target_input_rate:
+            array = self.resample_audio(array)
+
         audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+        await self.connection.input_audio_buffer.append(audio=audio_message)

+    async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
         """Emit audio frame to be played by the speaker."""
         # sends to the stream the stuff put in the output queue by the openai event handler
         # This is called periodically by the fastrtc Stream

         # Handle idle
         idle_duration = asyncio.get_event_loop().time() - self.last_activity_time
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
+            try:
+                await self.send_idle_signal(idle_duration)
+            except Exception as e:
+                logger.warning("Idle signal skipped (connection closed?): %s", e)
+                return None

             self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets

+        return await wait_for_item(self.output_queue)  # type: ignore[no-any-return]

     async def shutdown(self) -> None:
         """Shutdown the handler."""
         if self.connection:
+            try:
+                await self.connection.close()
+            except ConnectionClosedError as e:
+                logger.debug(f"Connection already closed during shutdown: {e}")
+            except Exception as e:
+                logger.debug(f"connection.close() ignored: {e}")
+            finally:
+                self.connection = None
+
+        # Clear any remaining items in the output queue
+        while not self.output_queue.empty():
+            try:
+                self.output_queue.get_nowait()
+            except asyncio.QueueEmpty:
+                break
+
+    def format_timestamp(self) -> str:
+        """Format current timestamp with date, time, and elapsed seconds."""
+        loop_time = asyncio.get_event_loop().time()  # monotonic
+        elapsed_seconds = loop_time - self.start_time
+        dt = datetime.now()  # wall-clock
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"

+    async def send_idle_signal(self, idle_duration: float) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")
         self.is_idle_tool_call = True

                     "type": "message",
                     "role": "user",
                     "content": [{"type": "input_text", "text": timestamp_msg}],
+                },
             )
             await self.connection.response.create(
                 response={
                     "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
                     "tool_choice": "required",
+                },
             )
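Read together, the idle path closes a loop across this file: `emit` detects more than 15 s without activity, `send_idle_signal` injects a timestamped user message and requests a response with `tool_choice: "required"`, and the `is_idle_tool_call` flag it sets is consumed in the function-call branch above, which skips the spoken follow-up so idle behavior stays silent.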
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/prompts.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/tools.py RENAMED
@@ -4,7 +4,7 @@ import json
 import asyncio
 import inspect
 import logging
-from typing import Any, Dict, Literal, Optional
 from dataclasses import dataclass

 from reachy_mini import ReachyMini
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 try:
     from reachy_mini.motion.recorded_move import RecordedMoves
     from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
-    from reachy_mini_conversation_demo.dance_emotion_moves import (
         GotoQueueMove,
         DanceQueueMove,
         EmotionQueueMove,
@@ -36,9 +36,9 @@ except ImportError as e:
     EMOTION_AVAILABLE = False


-def get_concrete_subclasses(base):
     """Recursively find all concrete (non-abstract) subclasses of a base class."""
-    result = []
     for cls in base.__subclasses__():
         if not inspect.isabstract(cls):
             result.append(cls)
@@ -58,9 +58,9 @@ class ToolDependencies:
     reachy_mini: ReachyMini
     movement_manager: Any  # MovementManager from moves.py
     # Optional deps
-    camera_worker: Optional[Any] = None  # CameraWorker for frame buffering
-    vision_manager: Optional[Any] = None
-    head_wobbler: Optional[Any] = None  # HeadWobbler for audio-reactive motion
     motion_duration_s: float = 1.0


@@ -88,7 +88,7 @@ class Tool(abc.ABC):
     }

     @abc.abstractmethod
-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Async tool execution entrypoint."""
         raise NotImplementedError

@@ -113,7 +113,7 @@ class MoveHead(Tool):
     }

     # mapping: direction -> args for create_head_pose
-    DELTAS: dict[str, tuple[int, int, int, int, int, int]] = {
         "left": (0, 0, 0, 0, 0, 40),
         "right": (0, 0, 0, 0, 0, -40),
         "up": (0, 0, 0, 0, -30, 0),
@@ -121,9 +121,12 @@ class MoveHead(Tool):
         "front": (0, 0, 0, 0, 0, 0),
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Move head in a given direction."""
-        direction: Direction = kwargs.get("direction")
         logger.info("Tool call: move_head direction=%s", direction)

         deltas = self.DELTAS.get(direction, self.DELTAS["front"])
@@ -177,7 +180,7 @@ class Camera(Tool):
         "required": ["question"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Take a picture with the camera and ask a question about it."""
         image_query = (kwargs.get("question") or "").strip()
         if not image_query:
@@ -199,7 +202,7 @@ class Camera(Tool):
         # Use vision manager for processing if available
         if deps.vision_manager is not None:
             vision_result = await asyncio.to_thread(
-                deps.vision_manager.processor.process_image, frame, image_query
             )
             if isinstance(vision_result, dict) and "error" in vision_result:
                 return vision_result
@@ -208,17 +211,16 @@ class Camera(Tool):
             if isinstance(vision_result, str)
             else {"error": "vision returned non-string"}
         )
-        else:
-            # Return base64 encoded image like main_works.py camera tool
-            import base64

-            import cv2

-            temp_path = "/tmp/camera_frame.jpg"
-            cv2.imwrite(temp_path, frame)
-            with open(temp_path, "rb") as f:
-                b64_encoded = base64.b64encode(f.read()).decode("utf-8")
-            return {"b64_im": b64_encoded}


 class HeadTracking(Tool):
@@ -232,7 +234,7 @@ class HeadTracking(Tool):
         "required": ["start"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Enable or disable head tracking."""
         enable = bool(kwargs.get("start"))

@@ -288,12 +290,12 @@ class Dance(Tool):
         "required": [],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Play a named or random dance move once (or repeat). Non-blocking."""
         if not DANCE_AVAILABLE:
             return {"error": "Dance system not available"}

-        move_name = kwargs.get("move", None)
         repeat = int(kwargs.get("repeat", 1))

         logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)
@@ -326,12 +328,12 @@ class StopDance(Tool):
             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
-            }
         },
         "required": ["dummy"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager
@@ -373,7 +375,7 @@ class PlayEmotion(Tool):
         "required": ["emotion"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Play a pre-recorded emotion."""
         if not EMOTION_AVAILABLE:
             return {"error": "Emotion system not available"}
@@ -399,7 +401,7 @@ class PlayEmotion(Tool):

         except Exception as e:
             logger.exception("Failed to play emotion")
-            return {"error": f"Failed to play emotion: {str(e)}"}


 class StopEmotion(Tool):
@@ -413,12 +415,12 @@ class StopEmotion(Tool):
             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
-            }
         },
         "required": ["dummy"],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager
@@ -442,7 +444,7 @@ class DoNothing(Tool):
         "required": [],
     }

-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
         """Do nothing - stay still and silent."""
         reason = kwargs.get("reason", "just chilling")
         logger.info("Tool call: do_nothing reason=%s", reason)
@@ -452,12 +454,12 @@ class DoNothing(Tool):
 # Registry & specs (dynamic)

 # List of available tool classes
-ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
 ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]


 # Dispatcher
-def _safe_load_obj(args_json: str) -> dict[str, Any]:
     try:
         parsed_args = json.loads(args_json or "{}")
         return parsed_args if isinstance(parsed_args, dict) else {}

 import asyncio
 import inspect
 import logging
+from typing import Any, Dict, List, Tuple, Literal
 from dataclasses import dataclass

 from reachy_mini import ReachyMini

 try:
     from reachy_mini.motion.recorded_move import RecordedMoves
     from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
+    from reachy_mini_conversation_app.dance_emotion_moves import (
         GotoQueueMove,
         DanceQueueMove,
         EmotionQueueMove,

     EMOTION_AVAILABLE = False


+def get_concrete_subclasses(base: type[Tool]) -> List[type[Tool]]:
     """Recursively find all concrete (non-abstract) subclasses of a base class."""
+    result: List[type[Tool]] = []
     for cls in base.__subclasses__():
         if not inspect.isabstract(cls):
             result.append(cls)

     reachy_mini: ReachyMini
     movement_manager: Any  # MovementManager from moves.py
     # Optional deps
+    camera_worker: Any | None = None  # CameraWorker for frame buffering
+    vision_manager: Any | None = None
+    head_wobbler: Any | None = None  # HeadWobbler for audio-reactive motion
     motion_duration_s: float = 1.0

     }

     @abc.abstractmethod
+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Async tool execution entrypoint."""
         raise NotImplementedError

     }

     # mapping: direction -> args for create_head_pose
+    DELTAS: Dict[str, Tuple[int, int, int, int, int, int]] = {
         "left": (0, 0, 0, 0, 0, 40),
         "right": (0, 0, 0, 0, 0, -40),
         "up": (0, 0, 0, 0, -30, 0),

         "front": (0, 0, 0, 0, 0, 0),
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Move head in a given direction."""
+        direction_raw = kwargs.get("direction")
+        if not isinstance(direction_raw, str):
+            return {"error": "direction must be a string"}
+        direction: Direction = direction_raw  # type: ignore[assignment]
         logger.info("Tool call: move_head direction=%s", direction)

         deltas = self.DELTAS.get(direction, self.DELTAS["front"])

         "required": ["question"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Take a picture with the camera and ask a question about it."""
         image_query = (kwargs.get("question") or "").strip()
         if not image_query:

         # Use vision manager for processing if available
         if deps.vision_manager is not None:
             vision_result = await asyncio.to_thread(
+                deps.vision_manager.processor.process_image, frame, image_query,
             )
             if isinstance(vision_result, dict) and "error" in vision_result:
                 return vision_result

             if isinstance(vision_result, str)
             else {"error": "vision returned non-string"}
         )
+        # Return base64 encoded image like main_works.py camera tool
+        import base64

+        import cv2

+        temp_path = "/tmp/camera_frame.jpg"
+        cv2.imwrite(temp_path, frame)
+        with open(temp_path, "rb") as f:
+            b64_encoded = base64.b64encode(f.read()).decode("utf-8")
+        return {"b64_im": b64_encoded}


 class HeadTracking(Tool):

         "required": ["start"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Enable or disable head tracking."""
         enable = bool(kwargs.get("start"))

         "required": [],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Play a named or random dance move once (or repeat). Non-blocking."""
         if not DANCE_AVAILABLE:
             return {"error": "Dance system not available"}

+        move_name = kwargs.get("move")
         repeat = int(kwargs.get("repeat", 1))

         logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)

             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
+            },
         },
         "required": ["dummy"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager

         "required": ["emotion"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Play a pre-recorded emotion."""
         if not EMOTION_AVAILABLE:
             return {"error": "Emotion system not available"}

         except Exception as e:
             logger.exception("Failed to play emotion")
+            return {"error": f"Failed to play emotion: {e!s}"}


 class StopEmotion(Tool):

             "dummy": {
                 "type": "boolean",
                 "description": "dummy boolean, set it to true",
+            },
         },
         "required": ["dummy"],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager

         "required": [],
     }

+    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
         """Do nothing - stay still and silent."""
         reason = kwargs.get("reason", "just chilling")
         logger.info("Tool call: do_nothing reason=%s", reason)

 # Registry & specs (dynamic)

 # List of available tool classes
+ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}  # type: ignore[type-abstract]
 ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]


 # Dispatcher
+def _safe_load_obj(args_json: str) -> Dict[str, Any]:
     try:
         parsed_args = json.loads(args_json or "{}")
         return parsed_args if isinstance(parsed_args, dict) else {}
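Because `ALL_TOOLS` is built by recursively walking `Tool.__subclasses__()`, adding a capability is just defining another concrete subclass; no registry edit is needed. A hypothetical sketch (the `wave` tool does not exist in this repo, and everything except the `name` attribute and the `__call__` signature is assumed from context):

```python
class Wave(Tool):
    """Hypothetical tool: wiggle the antennas as a greeting."""

    name = "wave"  # key under which the instance lands in ALL_TOOLS

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        times = int(kwargs.get("times", 2))
        logger.info("Tool call: wave times=%d", times)
        # A real implementation would queue a move on deps.movement_manager here.
        return {"status": f"waved {times} times"}
```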
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/utils.py RENAMED
@@ -1,13 +1,15 @@
 import logging
 import argparse
 import warnings

-from reachy_mini_conversation_demo.camera_worker import CameraWorker


-def parse_args():
     """Parse command line arguments."""
-    parser = argparse.ArgumentParser("Reachy Mini Conversation Demo")
     parser.add_argument(
         "--head-tracker",
         choices=["yolo", "mediapipe", None],
@@ -27,7 +29,7 @@ def parse_args():
     return parser.parse_args()


-def handle_vision_stuff(args, current_robot):
     """Initialize camera, head tracker, camera worker, and vision manager.

     By default, vision is handled by gpt-realtime model when camera tool is used.
@@ -41,11 +43,11 @@ def handle_vision_stuff(args, current_robot):
     # Initialize head tracker if specified
     if args.head_tracker is not None:
         if args.head_tracker == "yolo":
-            from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker

             head_tracker = HeadTracker()
         elif args.head_tracker == "mediapipe":
-            from reachy_mini_toolbox.vision import HeadTracker

             head_tracker = HeadTracker()

@@ -55,22 +57,22 @@ def handle_vision_stuff(args, current_robot):
     # Initialize vision manager only if local vision is requested
     if args.local_vision:
         try:
-            from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager

             vision_manager = initialize_vision_manager(camera_worker)
         except ImportError as e:
             raise ImportError(
-                "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'"
             ) from e
     else:
         logging.getLogger(__name__).info(
-            "Using gpt-realtime for vision (default). Use --local-vision for local processing."
         )

     return camera_worker, head_tracker, vision_manager


-def setup_logger(debug):
     """Set up the logger."""
     log_level = "DEBUG" if debug else "INFO"
     logging.basicConfig(

 import logging
 import argparse
 import warnings
+from typing import Any, Tuple

+from reachy_mini import ReachyMini
+from reachy_mini_conversation_app.camera_worker import CameraWorker


+def parse_args() -> argparse.Namespace:
     """Parse command line arguments."""
+    parser = argparse.ArgumentParser("Reachy Mini Conversation App")
     parser.add_argument(
         "--head-tracker",
         choices=["yolo", "mediapipe", None],

     return parser.parse_args()


+def handle_vision_stuff(args: argparse.Namespace, current_robot: ReachyMini) -> Tuple[CameraWorker | None, Any, Any]:
     """Initialize camera, head tracker, camera worker, and vision manager.

     By default, vision is handled by gpt-realtime model when camera tool is used.

     # Initialize head tracker if specified
     if args.head_tracker is not None:
         if args.head_tracker == "yolo":
+            from reachy_mini_conversation_app.vision.yolo_head_tracker import HeadTracker

             head_tracker = HeadTracker()
         elif args.head_tracker == "mediapipe":
+            from reachy_mini_toolbox.vision import HeadTracker  # type: ignore[no-redef]

             head_tracker = HeadTracker()

     # Initialize vision manager only if local vision is requested
     if args.local_vision:
         try:
+            from reachy_mini_conversation_app.vision.processors import initialize_vision_manager

             vision_manager = initialize_vision_manager(camera_worker)
         except ImportError as e:
             raise ImportError(
+                "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'",
             ) from e
     else:
         logging.getLogger(__name__).info(
+            "Using gpt-realtime for vision (default). Use --local-vision for local processing.",
         )

     return camera_worker, head_tracker, vision_manager


+def setup_logger(debug: bool) -> logging.Logger:
     """Set up the logger."""
     log_level = "DEBUG" if debug else "INFO"
     logging.basicConfig(
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/__init__.py RENAMED
File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/processors.py RENAMED
@@ -3,16 +3,17 @@ import time
 import base64
 import logging
 import threading
-from typing import Any, Dict, Optional
 from dataclasses import dataclass

 import cv2
 import numpy as np
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from huggingface_hub import snapshot_download

-from reachy_mini_conversation_demo.config import config


 logger = logging.getLogger(__name__)
@@ -34,7 +35,7 @@ class VisionConfig:
 class VisionProcessor:
     """Handles SmolVLM2 model loading and inference."""

-    def __init__(self, vision_config: VisionConfig = None):
         """Initialize the vision processor."""
         self.vision_config = vision_config or VisionConfig()
         self.model_path = self.vision_config.model_path
@@ -60,7 +61,7 @@ class VisionProcessor:
         """Load model and processor onto the selected device."""
         try:
             logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
-            self.processor = AutoProcessor.from_pretrained(self.model_path)

             # Select dtype depending on device
             if self.device == "cuda":
@@ -70,16 +71,17 @@ class VisionProcessor:
             else:
                 dtype = torch.float32

-            model_kwargs = {"dtype": dtype}

             # flash_attention_2 is CUDA-only; skip on MPS/CPU
             if self.device == "cuda":
                 model_kwargs["_attn_implementation"] = "flash_attention_2"

             # Load model weights
-            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)

-            self.model.eval()
             self._initialized = True
             return True

@@ -89,11 +91,11 @@ class VisionProcessor:

     def process_image(
         self,
-        cv2_image: np.ndarray,
         prompt: str = "Briefly describe what you see in one sentence.",
     ) -> str:
         """Process CV2 image and return description with retry logic."""
-        if not self._initialized:
             return "Vision model not initialized"

         for attempt in range(self.vision_config.max_retries):
@@ -205,16 +207,16 @@ class VisionProcessor:
 class VisionManager:
     """Manages periodic vision processing and scene understanding."""

-    def __init__(self, camera, vision_config: VisionConfig = None):
         """Initialize vision manager with camera and configuration."""
         self.camera = camera
         self.vision_config = vision_config or VisionConfig()
         self.vision_interval = self.vision_config.vision_interval
         self.processor = VisionProcessor(self.vision_config)

-        self._last_processed_time = 0
         self._stop_event = threading.Event()
-        self._thread: Optional[threading.Thread] = None

         # Initialize processor
         if not self.processor.initialize():
@@ -245,7 +247,7 @@ class VisionManager:
             frame = self.camera.get_latest_frame()
             if frame is not None:
                 description = self.processor.process_image(
-                    frame, "Briefly describe what you see in one sentence."
                 )

                 # Only update if we got a valid response
@@ -274,7 +276,7 @@ class VisionManager:
     }


-def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
     """Initialize vision manager with model download and configuration.

     Args:
@@ -318,7 +320,7 @@ def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
     # Log device info
     device_info = vision_manager.processor.get_model_info()
     logger.info(
-        f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
     )

     return vision_manager

 import base64
 import logging
 import threading
+from typing import Any, Dict
 from dataclasses import dataclass

 import cv2
 import numpy as np
 import torch
+from numpy.typing import NDArray
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from huggingface_hub import snapshot_download

+from reachy_mini_conversation_app.config import config


 logger = logging.getLogger(__name__)

 class VisionProcessor:
     """Handles SmolVLM2 model loading and inference."""

+    def __init__(self, vision_config: VisionConfig | None = None):
         """Initialize the vision processor."""
         self.vision_config = vision_config or VisionConfig()
         self.model_path = self.vision_config.model_path

         """Load model and processor onto the selected device."""
         try:
             logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
+            self.processor = AutoProcessor.from_pretrained(self.model_path)  # type: ignore[no-untyped-call]

             # Select dtype depending on device
             if self.device == "cuda":

             else:
                 dtype = torch.float32

+            model_kwargs: Dict[str, Any] = {"dtype": dtype}

             # flash_attention_2 is CUDA-only; skip on MPS/CPU
             if self.device == "cuda":
                 model_kwargs["_attn_implementation"] = "flash_attention_2"

             # Load model weights
+            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)  # type: ignore[arg-type]

+            if self.model is not None:
+                self.model.eval()
             self._initialized = True
             return True

     def process_image(
         self,
+        cv2_image: NDArray[np.uint8],
         prompt: str = "Briefly describe what you see in one sentence.",
     ) -> str:
         """Process CV2 image and return description with retry logic."""
+        if not self._initialized or self.processor is None or self.model is None:
             return "Vision model not initialized"

         for attempt in range(self.vision_config.max_retries):

 class VisionManager:
     """Manages periodic vision processing and scene understanding."""

+    def __init__(self, camera: Any, vision_config: VisionConfig | None = None):
         """Initialize vision manager with camera and configuration."""
         self.camera = camera
         self.vision_config = vision_config or VisionConfig()
         self.vision_interval = self.vision_config.vision_interval
         self.processor = VisionProcessor(self.vision_config)

+        self._last_processed_time = 0.0
         self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None

         # Initialize processor
         if not self.processor.initialize():

             frame = self.camera.get_latest_frame()
             if frame is not None:
                 description = self.processor.process_image(
+                    frame, "Briefly describe what you see in one sentence.",
                 )

                 # Only update if we got a valid response

     }


+def initialize_vision_manager(camera_worker: Any) -> VisionManager | None:
     """Initialize vision manager with model download and configuration.

     Args:

     # Log device info
     device_info = vision_manager.processor.get_model_info()
     logger.info(
+        f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}",
     )

     return vision_manager
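A minimal usage sketch of the processor above (the image path is illustrative; on the robot the frame comes from the camera worker):

```python
import cv2

from reachy_mini_conversation_app.vision.processors import VisionConfig, VisionProcessor

processor = VisionProcessor(VisionConfig())
if processor.initialize():  # loads SmolVLM2 onto cuda, mps, or cpu
    frame = cv2.imread("example.jpg")  # stand-in for camera_worker.get_latest_frame()
    print(processor.process_image(frame, "What is on the table?"))
```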
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/yolo_head_tracker.py RENAMED
@@ -1,16 +1,17 @@
 from __future__ import annotations
 import logging
-from typing import Tuple, Optional

 import numpy as np


 try:
     from supervision import Detections
-    from ultralytics import YOLO
 except ImportError as e:
     raise ImportError(
-        "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'"
     ) from e
 from huggingface_hub import hf_hub_download

@@ -48,7 +49,7 @@ class HeadTracker:
         logger.error(f"Failed to load YOLO model: {e}")
         raise

-    def _select_best_face(self, detections: Detections) -> Optional[int]:
         """Select the best face based on confidence and area (largest face with highest confidence).

         Args:
@@ -61,6 +62,10 @@ class HeadTracker:
         if detections.xyxy.shape[0] == 0:
             return None

         # Filter by confidence threshold
         valid_mask = detections.confidence >= self.confidence_threshold
         if not np.any(valid_mask):
@@ -78,9 +83,9 @@ class HeadTracker:

         # Return index of best face
         best_idx = valid_indices[np.argmax(scores)]
-        return best_idx

-    def _bbox_to_mp_coords(self, bbox: np.ndarray, w: int, h: int) -> np.ndarray:
         """Convert bounding box center to MediaPipe-style coordinates [-1, 1].

         Args:
@@ -101,7 +106,7 @@ class HeadTracker:

         return np.array([norm_x, norm_y], dtype=np.float32)

-    def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.

         Args:
@@ -125,9 +130,10 @@ class HeadTracker:
         return None, None

         bbox = detections.xyxy[face_idx]
-        confidence = detections.confidence[face_idx]

-        logger.debug(f"Face detected with confidence: {confidence:.2f}")

         # Get face center in [-1, 1] coordinates
         face_center = self._bbox_to_mp_coords(bbox, w, h)

 from __future__ import annotations
 import logging
+from typing import Tuple

 import numpy as np
+from numpy.typing import NDArray


 try:
     from supervision import Detections
+    from ultralytics import YOLO  # type: ignore[attr-defined]
 except ImportError as e:
     raise ImportError(
+        "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'",
     ) from e
 from huggingface_hub import hf_hub_download

         logger.error(f"Failed to load YOLO model: {e}")
         raise

+    def _select_best_face(self, detections: Detections) -> int | None:
         """Select the best face based on confidence and area (largest face with highest confidence).

         Args:

         if detections.xyxy.shape[0] == 0:
             return None

+        # Check if confidence is available
+        if detections.confidence is None:
+            return None
+
         # Filter by confidence threshold
         valid_mask = detections.confidence >= self.confidence_threshold
         if not np.any(valid_mask):

         # Return index of best face
         best_idx = valid_indices[np.argmax(scores)]
+        return int(best_idx)

+    def _bbox_to_mp_coords(self, bbox: NDArray[np.float32], w: int, h: int) -> NDArray[np.float32]:
         """Convert bounding box center to MediaPipe-style coordinates [-1, 1].

         Args:

         return np.array([norm_x, norm_y], dtype=np.float32)

+    def get_head_position(self, img: NDArray[np.uint8]) -> Tuple[NDArray[np.float32] | None, float | None]:
         """Get head position from face detection.

         Args:

         return None, None

         bbox = detections.xyxy[face_idx]

+        if detections.confidence is not None:
+            confidence = detections.confidence[face_idx]
+            logger.debug(f"Face detected with confidence: {confidence:.2f}")

         # Get face center in [-1, 1] coordinates
         face_center = self._bbox_to_mp_coords(bbox, w, h)
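The exact body of `_bbox_to_mp_coords` is elided in the diff, but the docstring and return line pin down the contract: the box center is mapped into [-1, 1] on both axes. A sketch of the presumed mapping, with a worked example:

```python
import numpy as np

def bbox_center_to_unit_coords(bbox: np.ndarray, w: int, h: int) -> np.ndarray:
    """Map an (x1, y1, x2, y2) pixel box center into [-1, 1] coordinates."""
    cx = (bbox[0] + bbox[2]) / 2.0
    cy = (bbox[1] + bbox[3]) / 2.0
    norm_x = (cx / w) * 2.0 - 1.0  # -1 = left edge, +1 = right edge
    norm_y = (cy / h) * 2.0 - 1.0  # -1 = top edge, +1 = bottom edge
    return np.array([norm_x, norm_y], dtype=np.float32)

# A face box centered at (480, 120) in a 640x480 frame lands at (0.5, -0.5):
print(bbox_center_to_unit_coords(np.array([440.0, 80.0, 520.0, 160.0]), 640, 480))
```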
tests/audio/test_head_wobbler.py CHANGED
@@ -4,11 +4,12 @@ import math
4
  import time
5
  import base64
6
  import threading
7
- from typing import List, Tuple, Callable
 
8
 
9
  import numpy as np
10
 
11
- from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
12
 
13
 
14
  def _make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0) -> str:
@@ -74,7 +75,7 @@ def test_reset_allows_future_offsets() -> None:
74
  wobbler.stop()
75
 
76
 
77
- def test_reset_during_inflight_chunk_keeps_worker(monkeypatch) -> None:
78
  """Simulate reset during chunk processing to ensure the worker survives."""
79
  wobbler, captured = _start_wobbler()
80
  ready = threading.Event()
 
4
  import time
5
  import base64
6
  import threading
7
+ from typing import Any, List, Tuple
8
+ from collections.abc import Callable
9
 
10
  import numpy as np
11
 
12
+ from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler
13
 
14
 
15
  def _make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0) -> str:
 
75
  wobbler.stop()
76
 
77
 
78
+ def test_reset_during_inflight_chunk_keeps_worker(monkeypatch: Any) -> None:
79
  """Simulate reset during chunk processing to ensure the worker survives."""
80
  wobbler, captured = _start_wobbler()
81
  ready = threading.Event()
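
Two small typing moves in this hunk: `Callable` now comes from `collections.abc` (its `typing` alias has been deprecated since Python 3.9 under PEP 585), and the `monkeypatch` fixture gains an annotation so the suite passes strict type checking. `Any` is the minimal fix; a tighter alternative, shown here as a suggestion rather than what the commit does, is pytest's public fixture type:

from collections.abc import Callable  # preferred home since PEP 585

import pytest

# Example alias using the relocated Callable; the name is illustrative.
AudioCallback = Callable[[bytes], None]


# pytest.MonkeyPatch has been part of the public API since pytest 6.2,
# so the fixture can be annotated precisely instead of with Any:
def test_reset_during_inflight_chunk_keeps_worker(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    ...
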
tests/test_openai_realtime.py ADDED
@@ -0,0 +1,117 @@
1
+ import asyncio
2
+ import logging
3
+ from typing import Any
4
+ from datetime import datetime, timezone
5
+ from unittest.mock import MagicMock
6
+
7
+ import pytest
8
+
9
+ import reachy_mini_conversation_app.openai_realtime as rt_mod
10
+ from reachy_mini_conversation_app.tools import ToolDependencies
11
+ from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
12
+
13
+
14
+ def _build_handler(loop: asyncio.AbstractEventLoop) -> OpenaiRealtimeHandler:
15
+ asyncio.set_event_loop(loop)
16
+ deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
17
+ return OpenaiRealtimeHandler(deps)
18
+
19
+
20
+ def test_format_timestamp_uses_wall_clock() -> None:
21
+ """Test that format_timestamp uses wall clock time."""
22
+ loop = asyncio.new_event_loop()
23
+ try:
24
+ print("Testing format_timestamp...")
25
+ handler = _build_handler(loop)
26
+ formatted = handler.format_timestamp()
27
+ print(f"Formatted timestamp: {formatted}")
28
+ finally:
29
+ asyncio.set_event_loop(None)
30
+ loop.close()
31
+
32
+ # Extract year from "[YYYY-MM-DD ...]"
33
+ year = int(formatted[1:5])
34
+ assert year == datetime.now(timezone.utc).year
35
+
36
+ @pytest.mark.asyncio
37
+ async def test_start_up_retries_on_abrupt_close(monkeypatch: Any, caplog: Any) -> None:
38
+ """First connection dies with ConnectionClosedError during iteration -> retried.
39
+
40
+ Second connection iterates cleanly (no events) -> start_up returns without raising.
41
+ Ensures handler clears self.connection at the end.
42
+ """
43
+ caplog.set_level(logging.WARNING)
44
+
45
+ # Use a local Exception as the module's ConnectionClosedError to avoid ws dependency
46
+ FakeCCE = type("FakeCCE", (Exception,), {})
47
+ monkeypatch.setattr(rt_mod, "ConnectionClosedError", FakeCCE)
48
+
49
+ # Make asyncio.sleep return immediately (for backoff)
50
+ async def _fast_sleep(*_a: Any, **_kw: Any) -> None: return None
51
+ monkeypatch.setattr(asyncio, "sleep", _fast_sleep, raising=False)
52
+
53
+ attempt_counter = {"n": 0}
54
+
55
+ class FakeConn:
56
+ """Minimal realtime connection stub."""
57
+
58
+ def __init__(self, mode: str):
59
+ self._mode = mode
60
+
61
+ class _Session:
62
+ async def update(self, **_kw: Any) -> None: return None
63
+ self.session = _Session()
64
+
65
+ class _InputAudioBuffer:
66
+ async def append(self, **_kw: Any) -> None: return None
67
+ self.input_audio_buffer = _InputAudioBuffer()
68
+
69
+ class _Item:
70
+ async def create(self, **_kw: Any) -> None: return None
71
+
72
+ class _Conversation:
73
+ item = _Item()
74
+ self.conversation = _Conversation()
75
+
76
+ class _Response:
77
+ async def create(self, **_kw: Any) -> None: return None
78
+ async def cancel(self, **_kw: Any) -> None: return None
79
+ self.response = _Response()
80
+
81
+ async def __aenter__(self) -> "FakeConn": return self
82
+ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return False
83
+ async def close(self) -> None: return None
84
+
85
+ # Async iterator protocol
86
+ def __aiter__(self) -> "FakeConn": return self
87
+ async def __anext__(self) -> None:
88
+ if self._mode == "raise_on_iter":
89
+ raise FakeCCE("abrupt close (simulated)")
90
+ raise StopAsyncIteration # clean exit (no events)
91
+
92
+ class FakeRealtime:
93
+ def connect(self, **_kw: Any) -> FakeConn:
94
+ attempt_counter["n"] += 1
95
+ mode = "raise_on_iter" if attempt_counter["n"] == 1 else "clean"
96
+ return FakeConn(mode)
97
+
98
+ class FakeClient:
99
+ def __init__(self, **_kw: Any) -> None: self.realtime = FakeRealtime()
100
+
101
+ # Patch the OpenAI client used by the handler
102
+ monkeypatch.setattr(rt_mod, "AsyncOpenAI", FakeClient)
103
+
104
+ # Build handler with minimal deps
105
+ deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
106
+ handler = rt_mod.OpenaiRealtimeHandler(deps)
107
+
108
+ # Run: should retry once and exit cleanly
109
+ await handler.start_up()
110
+
111
+ # Validate: two attempts total (fail -> retry -> succeed), and connection cleared
112
+ assert attempt_counter["n"] == 2
113
+ assert handler.connection is None
114
+
115
+ # Optional: confirm we logged the unexpected close once
116
+ warnings = [r for r in caplog.records if r.levelname == "WARNING" and "closed unexpectedly" in r.msg]
117
+ assert len(warnings) == 1
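
Taken together, the fakes pin down the reconnect contract of `OpenaiRealtimeHandler.start_up` without touching the network: iterate events, and when the socket dies mid-iteration, log one warning, back off, and connect again, clearing `self.connection` on the way out. A method sketch of the loop shape the assertions imply — attribute names like `self.client` and the one-second backoff are assumptions; only the observable behavior is fixed by the test:

import asyncio
import logging

from websockets.exceptions import ConnectionClosedError  # swapped for a fake in the test

logger = logging.getLogger(__name__)


async def start_up(self) -> None:
    try:
        while True:
            try:
                async with self.client.realtime.connect(model=self.model) as conn:
                    self.connection = conn
                    async for event in conn:
                        await self.handle_event(event)
                return  # iterator exhausted cleanly: we are done
            except ConnectionClosedError:
                # One WARNING per abrupt close; the test greps for this text.
                logger.warning("Realtime connection closed unexpectedly, reconnecting")
                await asyncio.sleep(1.0)  # backoff; patched to a no-op in the test
    finally:
        # The test asserts handler.connection is None after start_up returns.
        self.connection = None
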
uv.lock CHANGED
The diff for this file is too large to render. See raw diff