Alina committed on
Commit 3e83b93 · unverified · 2 parent(s): 8dda3b1 3a775a2

Merge pull request #53 from pollen-robotics/more_cleanup
.env.example CHANGED
@@ -1,6 +1,9 @@
 OPENAI_API_KEY=
 MODEL_NAME="gpt-realtime"

+# Local vision model
+LOCAL_VISION_MODEL=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
 # Cache for local VLM
 HF_HOME=./cache

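These variables are read at startup by `config.py` (updated later in this commit); a minimal sketch of the loading pattern it uses:

```python
# Sketch of how the variables above are consumed (mirrors config.py in this PR).
import os
from dotenv import load_dotenv

load_dotenv()  # populate os.environ from the .env file

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")                       # required
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")               # optional
HF_HOME = os.getenv("HF_HOME", "./cache")                          # optional
LOCAL_VISION_MODEL = os.getenv(                                    # new in this PR
    "LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
)
```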
.gitignore CHANGED
@@ -1,202 +1,57 @@
1
- # General
2
- .DS_Store
3
- .AppleDouble
4
- .LSOverride
5
-
6
- # Icon must end with two \r
7
- Icon
8
-
9
- # Thumbnails
10
- ._*
11
-
12
- # Files that might appear in the root of a volume
13
- .DocumentRevisions-V100
14
- .fseventsd
15
- .Spotlight-V100
16
- .TemporaryItems
17
- .Trashes
18
- .VolumeIcon.icns
19
- .com.apple.timemachine.donotpresent
20
-
21
- # Directories potentially created on remote AFP share
22
- .AppleDB
23
- .AppleDesktop
24
- Network Trash Folder
25
- Temporary Items
26
- .apdisk
27
-
28
- # Byte-compiled / optimized / DLL files
29
  __pycache__/
30
  *.py[cod]
31
  *$py.class
32
-
33
- # C extensions
34
  *.so
35
 
36
- # Distribution / packaging
37
- .Python
38
  build/
39
- develop-eggs/
40
  dist/
41
- downloads/
42
- eggs/
43
- .eggs/
44
- lib/
45
- lib64/
46
- parts/
47
- sdist/
48
- var/
49
- wheels/
50
- share/python-wheels/
51
  *.egg-info/
52
- .installed.cfg
53
- *.egg
54
- MANIFEST
55
-
56
- # PyInstaller
57
- # Usually these files are written by a python script from a template
58
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
59
- *.manifest
60
- *.spec
61
-
62
- # Installer logs
63
- pip-log.txt
64
- pip-delete-this-directory.txt
65
 
66
- # Unit test / coverage reports
67
- htmlcov/
68
- .tox/
69
- .nox/
70
  .coverage
71
- .coverage.*
72
- .cache
73
- cache/
74
- nosetests.xml
75
  coverage.xml
76
  *.cover
77
- *.py,cover
78
- .hypothesis/
79
- .pytest_cache/
80
- cover/
81
 
82
- # Ruff cache
83
  .ruff_cache/
84
 
85
- # Translations
86
- *.mo
87
- *.pot
88
-
89
- # Django stuff:
90
- *.log
91
- local_settings.py
92
- db.sqlite3
93
- db.sqlite3-journal
94
-
95
- # Flask stuff:
96
- instance/
97
- .webassets-cache
98
-
99
- # Scrapy stuff:
100
- .scrapy
101
 
102
- # Sphinx documentation
103
- docs/_build/
104
-
105
- # PyBuilder
106
- .pybuilder/
107
- target/
108
-
109
- # Jupyter Notebook
110
- .ipynb_checkpoints
111
-
112
- # IPython
113
- profile_default/
114
- ipython_config.py
115
-
116
- # pyenv
117
- # For a library or package, you might want to ignore these files since the code is
118
- # intended to run in multiple environments; otherwise, check them in:
119
- # .python-version
120
-
121
- # pipenv
122
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
123
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
124
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
125
- # install all needed dependencies.
126
- #Pipfile.lock
127
-
128
- # poetry
129
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
130
- # This is especially recommended for binary packages to ensure reproducibility, and is more
131
- # commonly ignored for libraries.
132
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
133
- #poetry.lock
134
-
135
- # pdm
136
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
137
- #pdm.lock
138
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
139
- # in version control.
140
- # https://pdm.fming.dev/#use-with-ide
141
- .pdm.toml
142
-
143
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
144
- __pypackages__/
145
-
146
- # Celery stuff
147
- celerybeat-schedule
148
- celerybeat.pid
149
-
150
- # SageMath parsed files
151
- *.sage.py
152
-
153
- # Environments
154
- .env
155
- .venv
156
- env/
157
- venv/
158
- ENV/
159
- env.bak/
160
- venv.bak/
161
-
162
- # Keys
163
- *.csr
164
  *.key
165
  *.pem
 
 
166
 
167
- # Spyder project settings
168
- .spyderproject
169
- .spyproject
170
-
171
- # Rope project settings
172
- .ropeproject
173
-
174
- # mkdocs documentation
175
- /site
176
-
177
- # mypy
178
- .mypy_cache/
179
- .dmypy.json
180
- dmypy.json
181
-
182
- # Pyre type checker
183
- .pyre/
184
-
185
- # pytype static type analyzer
186
- .pytype/
187
-
188
- # Cython debug symbols
189
- cython_debug/
190
-
191
- # PyCharm
192
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
193
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
194
- # and can be added to the global gitignore or merged into this file. For a more nuclear
195
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
196
- #.idea/
197
 
198
- # Certificates
199
- *.crt
200
 
201
- # Temporary folder
202
- tmp/
 
 
 
 
1
+ # Python
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
 
 
5
  *.so
6
 
7
+ # Virtual environments
8
+ .venv/
9
+ venv/
10
+ ENV/
11
+ env/
12
+
13
+ # Environment variables
14
+ .env
15
+
16
+ # Build and distribution
17
  build/
 
18
  dist/
19
  *.egg-info/
20
+ .eggs/
21
 
22
+ # Testing
23
+ .pytest_cache/
 
 
24
  .coverage
25
+ .hypothesis/
26
+ htmlcov/
 
 
27
  coverage.xml
28
  *.cover
 
 
 
 
29
 
30
+ # Linting and formatting
31
  .ruff_cache/
32
 
33
+ # IDE
34
+ .vscode/
35
+ .idea/
36
+ *.swp
37
+ *.swo
38
 
39
+ # Security
40
  *.key
41
  *.pem
42
+ *.crt
43
+ *.csr
44
 
45
+ # Temporary files
46
+ tmp/
47
+ *.log
48
+ cache/
49
 
50
+ # macOS
51
+ .DS_Store
52
 
53
+ # Linux
54
+ *~
55
+ .directory
56
+ .Trash-*
57
+ .nfs*
README.md CHANGED
@@ -1,12 +1,13 @@
1
  # Reachy Mini conversation demo
2
 
3
  Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
 
4
 
5
  ## Overview
6
  - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
7
- - Camera capture can route to OpenAI multimodal vision or stay on-device with SmolVLM2 local analysis.
8
  - Layered motion system queues primary moves (dances, emotions, goto poses, breathing) while blending speech-reactive wobble and face-tracking.
9
- - Async tool dispatch integrates robot motion, camera capture, and optional facial-recognition helpers through a Gradio web UI with live transcripts.
10
 
11
  ## Installation
12
 
@@ -74,8 +75,9 @@ Some wheels (e.g. PyTorch) are large and require compatible CUDA or CPU builds
74
  |----------|-------------|
75
  | `OPENAI_API_KEY` | Required. Grants access to the OpenAI realtime endpoint.
76
  | `MODEL_NAME` | Override the realtime model (defaults to `gpt-realtime`).
77
- | `HF_HOME` | Cache directory for local Hugging Face downloads.
78
- | `HF_TOKEN` | Optional token for Hugging Face models.
 
79
 
80
  ## Running the demo
81
 
@@ -85,13 +87,13 @@ Activate your virtual environment, ensure the Reachy Mini robot (or simulator) i
85
  reachy-mini-conversation-demo
86
  ```
87
 
88
- The app starts a Gradio UI served locally (http://127.0.0.1:7860/). When running on a headless host, use `--headless`. With a camera attached, captured frames can be analysed remotely through OpenAI multimodal models or locally via the YOLO/MediaPipe pipelines depending on the extras you installed.
89
 
90
  ### CLI options
91
 
92
  | Option | Default | Description |
93
  |--------|---------|-------------|
94
- | `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. Requires the matching optional extra. |
95
  | `--no-camera` | `False` | Run without camera capture or face tracking. |
96
  | `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. |
97
  | `--debug` | `False` | Enable verbose logging for troubleshooting. |
@@ -116,12 +118,11 @@ The app starts a Gradio UI served locally (http://127.0.0.1:7860/). When running
116
  |------|--------|--------------|
117
  | `move_head` | Queue a head pose change (left/right/up/down/front). | Core install only. |
118
  | `camera` | Capture the latest camera frame and optionally query a vision backend. | Requires camera worker; vision analysis depends on selected extras. |
119
- | `head_tracking` | Enable or disable face-tracking offsets. | Camera worker with configured head tracker. |
120
  | `dance` | Queue a dance from `reachy_mini_dances_library`. | Core install only. |
121
  | `stop_dance` | Clear queued dances. | Core install only. |
122
  | `play_emotion` | Play a recorded emotion clip via Hugging Face assets. | Needs `HF_TOKEN` for the recorded emotions dataset. |
123
  | `stop_emotion` | Clear queued emotions. | Core install only. |
124
- | `get_person_name` | DeepFace-based recognition of the current person. | Disabled by default (`ENABLE_FACE_RECOGNITION=False`); requires `deepface` and a local face database. |
125
  | `do_nothing` | Explicitly remain idle. | Core install only. |
126
 
127
  ## Development workflow
 
1
  # Reachy Mini conversation demo
2
 
3
  Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
4
+ ![Reachy Mini Dance](src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif)
5
 
6
  ## Overview
7
  - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
8
+ - Local vision processing using SmolVLM2 model running on-device (CPU/GPU/MPS).
9
  - Layered motion system queues primary moves (dances, emotions, goto poses, breathing) while blending speech-reactive wobble and face-tracking.
10
+ - Async tool dispatch integrates robot motion, camera capture, and optional face-tracking capabilities through a Gradio web UI with live transcripts.
11
 
12
  ## Installation
13
 
 
75
  |----------|-------------|
76
  | `OPENAI_API_KEY` | Required. Grants access to the OpenAI realtime endpoint.
77
  | `MODEL_NAME` | Override the realtime model (defaults to `gpt-realtime`).
78
+ | `HF_HOME` | Cache directory for local Hugging Face downloads (defaults to `./cache`).
79
+ | `HF_TOKEN` | Optional token for Hugging Face models (falls back to `huggingface-cli login`).
80
+ | `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`).
81
 
82
  ## Running the demo
83
 
 
87
  reachy-mini-conversation-demo
88
  ```
89
 
90
+ By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). With a camera attached, captured frames are analyzed locally using the SmolVLM2 vision model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed.
91
 
92
  ### CLI options
93
 
94
  | Option | Default | Description |
95
  |--------|---------|-------------|
96
+ | `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. YOLO is implemented locally; MediaPipe comes from the `reachy_mini_toolbox` package. Requires the matching optional extra. |
97
  | `--no-camera` | `False` | Run without camera capture or face tracking. |
98
  | `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. |
99
  | `--debug` | `False` | Enable verbose logging for troubleshooting. |
 
118
  |------|--------|--------------|
119
  | `move_head` | Queue a head pose change (left/right/up/down/front). | Core install only. |
120
  | `camera` | Capture the latest camera frame and optionally query a vision backend. | Requires camera worker; vision analysis depends on selected extras. |
121
+ | `head_tracking` | Enable or disable face-tracking offsets (not facial recognition; it only detects and tracks face position). | Camera worker with configured head tracker. |
122
  | `dance` | Queue a dance from `reachy_mini_dances_library`. | Core install only. |
123
  | `stop_dance` | Clear queued dances. | Core install only. |
124
  | `play_emotion` | Play a recorded emotion clip via Hugging Face assets. | Needs `HF_TOKEN` for the recorded emotions dataset. |
125
  | `stop_emotion` | Clear queued emotions. | Core install only. |
 
126
  | `do_nothing` | Explicitly remain idle. | Core install only. |
127
 
128
  ## Development workflow
pyproject.toml CHANGED
@@ -16,6 +16,7 @@ dependencies = [
     "gradio>=5.49.0",
     "huggingface_hub>=0.34.4",
     "opencv-python>=4.12.0.88",
+    "num2words",

     #Environment variables
     "python-dotenv",
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -120,7 +120,6 @@ class SwayRollRT:
120
  self._seed = int(rng_seed)
121
  self.samples = deque(maxlen=10 * SR) # sliding window for VAD/env
122
  self.carry = np.zeros(0, dtype=np.float32)
123
- self.frame_idx = 0
124
 
125
  self.vad_on = False
126
  self.vad_above = 0
@@ -143,7 +142,6 @@ class SwayRollRT:
143
  """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
144
  self.samples.clear()
145
  self.carry = np.zeros(0, dtype=np.float32)
146
- self.frame_idx = 0
147
  self.vad_on = False
148
  self.vad_above = 0
149
  self.vad_below = 0
@@ -152,16 +150,6 @@ class SwayRollRT:
152
  self.sway_down = 0
153
  self.t = 0.0
154
 
155
- def reset_phases(self) -> None:
156
- """Re-randomize phases deterministically from stored seed (Optional)."""
157
- rng = np.random.default_rng(self._seed)
158
- self.phase_pitch = float(rng.random() * 2 * math.pi)
159
- self.phase_yaw = float(rng.random() * 2 * math.pi)
160
- self.phase_roll = float(rng.random() * 2 * math.pi)
161
- self.phase_x = float(rng.random() * 2 * math.pi)
162
- self.phase_y = float(rng.random() * 2 * math.pi)
163
- self.phase_z = float(rng.random() * 2 * math.pi)
164
-
165
  def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
166
  """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
167
 
@@ -196,7 +184,6 @@ class SwayRollRT:
196
  self.samples.extend(hop.tolist())
197
  if len(self.samples) < FRAME:
198
  self.t += HOP_MS / 1000.0
199
- self.frame_idx += 1
200
  continue
201
 
202
  frame = np.fromiter(
 
120
  self._seed = int(rng_seed)
121
  self.samples = deque(maxlen=10 * SR) # sliding window for VAD/env
122
  self.carry = np.zeros(0, dtype=np.float32)
 
123
 
124
  self.vad_on = False
125
  self.vad_above = 0
 
142
  """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
143
  self.samples.clear()
144
  self.carry = np.zeros(0, dtype=np.float32)
 
145
  self.vad_on = False
146
  self.vad_above = 0
147
  self.vad_below = 0
 
150
  self.sway_down = 0
151
  self.t = 0.0
152
 
153
  def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
154
  """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
155
 
 
184
  self.samples.extend(hop.tolist())
185
  if len(self.samples) < FRAME:
186
  self.t += HOP_MS / 1000.0
 
187
  continue
188
 
189
  frame = np.fromiter(
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -1,17 +1,28 @@
1
  import os
 
 
2
 
3
  from dotenv import load_dotenv
4
 
5
 
6
- load_dotenv()
7
 
8
 
9
- def getenv_bool(key: str, default: bool = False) -> bool:
10
- """Read env var as a Python bool (case-insensitive)."""
11
- val = os.getenv(key)
12
- if val is None:
13
- return default
14
- return val.strip().lower() in {"true", "1", "yes", "on"}
 
15
 
16
 
17
  class Config:
@@ -19,13 +30,23 @@ class Config:
19
 
20
  # Required
21
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
22
- if not OPENAI_API_KEY:
23
- raise RuntimeError("OPENAI_API_KEY is missing in .env")
 
 
 
 
 
 
 
24
 
25
  # Optional
26
  MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
27
  HF_HOME = os.getenv("HF_HOME", "./cache")
 
28
  HF_TOKEN = os.getenv("HF_TOKEN") # Optional, falls back to hf auth login if not set
29
 
 
 
30
 
31
  config = Config()
 
1
  import os
2
+ import logging
3
+ from pathlib import Path
4
 
5
  from dotenv import load_dotenv
6
 
7
 
8
+ logger = logging.getLogger(__name__)
9
 
10
+ # Check if .env file exists
11
+ env_file = Path(".env")
12
+ if not env_file.exists():
13
+ raise RuntimeError(
14
+ ".env file not found. Please create one based on .env.example:\n"
15
+ " cp .env.example .env\n"
16
+ "Then add your OPENAI_API_KEY to the .env file."
17
+ )
18
 
19
+ # Load .env and verify it was loaded successfully
20
+ if not load_dotenv():
21
+ raise RuntimeError(
22
+ "Failed to load .env file. Please ensure the file is readable and properly formatted."
23
+ )
24
+
25
+ logger.info("Configuration loaded from .env file")
26
 
27
 
28
  class Config:
 
30
 
31
  # Required
32
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
33
+ if OPENAI_API_KEY is None:
34
+ raise RuntimeError(
35
+ "OPENAI_API_KEY is not set in .env file. Please add it:\n"
36
+ " OPENAI_API_KEY=your_api_key_here"
37
+ )
38
+ if not OPENAI_API_KEY.strip():
39
+ raise RuntimeError(
40
+ "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
41
+ )
42
 
43
  # Optional
44
  MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
45
  HF_HOME = os.getenv("HF_HOME", "./cache")
46
+ LOCAL_VISION_MODEL = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
47
  HF_TOKEN = os.getenv("HF_TOKEN") # Optional, falls back to hf auth login if not set
48
 
49
+ logger.debug(f"Model: {MODEL_NAME}, HF_HOME: {HF_HOME}, Vision Model: {LOCAL_VISION_MODEL}")
50
+
51
 
52
  config = Config()
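Other modules import the shared `config` instance directly (as `openai_realtime.py` does below); a short usage sketch:

```python
# The RuntimeErrors above fire at import time if .env is missing or the key is
# unset/empty, so any module importing config can rely on a valid setup.
from reachy_mini_conversation_demo.config import config

print(config.MODEL_NAME)          # "gpt-realtime" unless overridden in .env
print(config.LOCAL_VISION_MODEL)  # defaults to HuggingFaceTB/SmolVLM2-2.2B-Instruct
```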
src/reachy_mini_conversation_demo/console.py CHANGED
@@ -26,7 +26,7 @@ class LocalStream:
26
  self._stop_event = asyncio.Event()
27
  self._tasks = []
28
  # Allow the handler to flush the player queue when appropriate.
29
- self.handler._clear_queue = self.clear_queue # type: ignore[assignment]
30
 
31
  def launch(self) -> None:
32
  """Start the recorder/player and run the async processing loops."""
@@ -69,7 +69,7 @@ class LocalStream:
69
  self._robot.media.stop_recording()
70
  self._robot.media.stop_playing()
71
 
72
- def clear_queue(self) -> None:
73
  """Flush the player's appsrc to drop any queued audio immediately."""
74
  logger.info("User intervention: flushing player queue")
75
  self.handler.output_queue = asyncio.Queue()
@@ -78,9 +78,9 @@ class LocalStream:
78
  """Read mic frames from the recorder and forward them to the handler."""
79
  logger.info("Starting receive loop")
80
  while not self._stop_event.is_set():
81
- data = self._robot.media.get_audio_sample()
82
- if data is not None:
83
- frame_mono = data.T[0] # both channels are identical
84
  frame = audio_to_int16(frame_mono)
85
  await self.handler.receive((16000, frame))
86
  # await asyncio.sleep(0) # yield to event loop
@@ -90,10 +90,10 @@ class LocalStream:
90
  async def play_loop(self) -> None:
91
  """Fetch outputs from the handler: log text and play audio frames."""
92
  while not self._stop_event.is_set():
93
- data = await self.handler.emit()
94
 
95
- if isinstance(data, AdditionalOutputs):
96
- for msg in data.args:
97
  content = msg.get("content", "")
98
  if isinstance(content, str):
99
  logger.info(
@@ -102,14 +102,17 @@ class LocalStream:
102
  content if len(content) < 500 else content[:500] + "…",
103
  )
104
 
105
- elif isinstance(data, tuple):
106
- sample_rate, frame = data
107
  device_sample_rate = self._robot.media.get_audio_samplerate()
108
- frame = audio_to_float32(frame.squeeze())
109
- if sample_rate != device_sample_rate:
110
- frame = librosa.resample(frame, orig_sr=sample_rate, target_sr=device_sample_rate)
111
- self._robot.media.push_audio_sample(frame)
 
 
112
 
113
- # else: ignore None/unknown outputs
 
114
 
115
  await asyncio.sleep(0) # yield to event loop
 
26
  self._stop_event = asyncio.Event()
27
  self._tasks = []
28
  # Allow the handler to flush the player queue when appropriate.
29
+ self.handler._clear_queue = self.clear_audio_queue # type: ignore[assignment]
30
 
31
  def launch(self) -> None:
32
  """Start the recorder/player and run the async processing loops."""
 
69
  self._robot.media.stop_recording()
70
  self._robot.media.stop_playing()
71
 
72
+ def clear_audio_queue(self) -> None:
73
  """Flush the player's appsrc to drop any queued audio immediately."""
74
  logger.info("User intervention: flushing player queue")
75
  self.handler.output_queue = asyncio.Queue()
 
78
  """Read mic frames from the recorder and forward them to the handler."""
79
  logger.info("Starting receive loop")
80
  while not self._stop_event.is_set():
81
+ audio_frame = self._robot.media.get_audio_sample()
82
+ if audio_frame is not None:
83
+ frame_mono = audio_frame.T[0] # both channels are identical
84
  frame = audio_to_int16(frame_mono)
85
  await self.handler.receive((16000, frame))
86
  # await asyncio.sleep(0) # yield to event loop
 
90
  async def play_loop(self) -> None:
91
  """Fetch outputs from the handler: log text and play audio frames."""
92
  while not self._stop_event.is_set():
93
+ handler_output = await self.handler.emit()
94
 
95
+ if isinstance(handler_output, AdditionalOutputs):
96
+ for msg in handler_output.args:
97
  content = msg.get("content", "")
98
  if isinstance(content, str):
99
  logger.info(
 
102
  content if len(content) < 500 else content[:500] + "…",
103
  )
104
 
105
+ elif isinstance(handler_output, tuple):
106
+ input_sample_rate, audio_frame = handler_output
107
  device_sample_rate = self._robot.media.get_audio_samplerate()
108
+ audio_frame = audio_to_float32(audio_frame.squeeze())
109
+ if input_sample_rate != device_sample_rate:
110
+ audio_frame = librosa.resample(
111
+ audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
112
+ )
113
+ self._robot.media.push_audio_sample(audio_frame)
114
 
115
+ else:
116
+ logger.debug("Ignoring output type=%s", type(handler_output).__name__)
117
 
118
  await asyncio.sleep(0) # yield to event loop
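The renamed `play_loop` variables make the resampling step easier to follow: audio from the handler is converted to float32 and resampled only when its rate differs from the device rate. A standalone sketch of that step with synthetic data (the 24 kHz/48 kHz rates are illustrative assumptions):

```python
# Resample a mono float32 frame to the playback device's rate, skipping the
# call when the rates already match, as play_loop does.
import numpy as np
import librosa

input_sample_rate, device_sample_rate = 24000, 48000   # assumed example rates
audio_frame = np.zeros(2400, dtype=np.float32)          # 100 ms at 24 kHz

if input_sample_rate != device_sample_rate:
    audio_frame = librosa.resample(
        audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
    )

print(audio_frame.shape)  # (4800,) after upsampling
```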
src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif ADDED

Git LFS Details

  • SHA256: 75914c3cb7af982e0b1c6369e25fc46d8c08a0ab5ad022240ae9c1a0d93967c3
  • Pointer size: 132 Bytes
  • Size of remote file: 3.93 MB
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -190,13 +190,7 @@ class MovementState:
190
  0.0,
191
  )
192
 
193
- # Legacy movement state (for goto moves)
194
- moving_start: float = 0.0
195
- moving_for: float = 0.0
196
-
197
  # Status flags
198
- is_playing_move: bool = False
199
- is_moving: bool = False
200
  last_primary_pose: Optional[FullBodyPose] = None
201
 
202
  def update_activity(self) -> None:
@@ -325,7 +319,7 @@ class MovementManager:
325
  """
326
  self._command_queue.put(("queue_move", move))
327
 
328
- def clear_queue(self) -> None:
329
  """Stop the active move and discard any queued primary moves.
330
 
331
  Thread-safe: executed by the worker thread via the command queue.
@@ -361,10 +355,6 @@ class MovementManager:
361
 
362
  return self._now() - last_activity >= self.idle_inactivity_delay
363
 
364
- def mark_user_activity(self) -> None:
365
- """Record external activity and postpone idle behaviours (thread-safe)."""
366
- self._command_queue.put(("mark_activity", None))
367
-
368
  def set_listening(self, listening: bool) -> None:
369
  """Enable or disable listening mode without touching shared state directly.
370
 
@@ -427,7 +417,7 @@ class MovementManager:
427
  duration_str = str(duration)
428
  else:
429
  duration_str = "?"
430
- logger.info(
431
  "Queued move with duration %ss, queue size: %s",
432
  duration_str,
433
  len(self.move_queue),
@@ -438,7 +428,6 @@ class MovementManager:
438
  self.move_queue.clear()
439
  self.state.current_move = None
440
  self.state.move_start_time = None
441
- self.state.is_playing_move = False
442
  self._breathing_active = False
443
  logger.info("Cleared move queue and stopped current move")
444
  elif command == "set_moving_state":
@@ -447,8 +436,6 @@ class MovementManager:
447
  except (TypeError, ValueError):
448
  logger.warning("Invalid moving state duration: %s", payload)
449
  return
450
- self.state.moving_start = current_time
451
- self.state.moving_for = max(0.0, duration)
452
  self.state.update_activity()
453
  elif command == "mark_activity":
454
  self.state.update_activity()
@@ -534,7 +521,7 @@ class MovementManager:
534
  self.state.current_move = None
535
  self.state.move_start_time = None
536
  self._breathing_active = False
537
- logger.info("Stopping breathing due to new move activity")
538
 
539
  if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
540
  self._breathing_active = False
@@ -561,14 +548,9 @@ class MovementManager:
561
  float(body_yaw),
562
  )
563
 
564
- self.state.is_playing_move = True
565
- self.state.is_moving = True
566
  self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
567
  else:
568
  # Otherwise reuse the last primary pose so we avoid jumps between moves
569
- self.state.is_playing_move = False
570
- self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
571
-
572
  if self.state.last_primary_pose is not None:
573
  primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
574
  else:
@@ -746,7 +728,6 @@ class MovementManager:
746
  self._thread.join()
747
  self._thread = None
748
  logger.debug("Move worker stopped")
749
- logger.debug("Move worker stopped")
750
 
751
  def get_status(self) -> dict[str, Any]:
752
  """Return a lightweight status snapshot for observability."""
 
190
  0.0,
191
  )
192
 
 
 
 
 
193
  # Status flags
 
 
194
  last_primary_pose: Optional[FullBodyPose] = None
195
 
196
  def update_activity(self) -> None:
 
319
  """
320
  self._command_queue.put(("queue_move", move))
321
 
322
+ def clear_move_queue(self) -> None:
323
  """Stop the active move and discard any queued primary moves.
324
 
325
  Thread-safe: executed by the worker thread via the command queue.
 
355
 
356
  return self._now() - last_activity >= self.idle_inactivity_delay
357
 
 
 
 
 
358
  def set_listening(self, listening: bool) -> None:
359
  """Enable or disable listening mode without touching shared state directly.
360
 
 
417
  duration_str = str(duration)
418
  else:
419
  duration_str = "?"
420
+ logger.debug(
421
  "Queued move with duration %ss, queue size: %s",
422
  duration_str,
423
  len(self.move_queue),
 
428
  self.move_queue.clear()
429
  self.state.current_move = None
430
  self.state.move_start_time = None
 
431
  self._breathing_active = False
432
  logger.info("Cleared move queue and stopped current move")
433
  elif command == "set_moving_state":
 
436
  except (TypeError, ValueError):
437
  logger.warning("Invalid moving state duration: %s", payload)
438
  return
 
 
439
  self.state.update_activity()
440
  elif command == "mark_activity":
441
  self.state.update_activity()
 
521
  self.state.current_move = None
522
  self.state.move_start_time = None
523
  self._breathing_active = False
524
+ logger.debug("Stopping breathing due to new move activity")
525
 
526
  if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
527
  self._breathing_active = False
 
548
  float(body_yaw),
549
  )
550
 
 
 
551
  self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
552
  else:
553
  # Otherwise reuse the last primary pose so we avoid jumps between moves
 
 
 
554
  if self.state.last_primary_pose is not None:
555
  primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
556
  else:
 
728
  self._thread.join()
729
  self._thread = None
730
  logger.debug("Move worker stopped")
 
731
 
732
  def get_status(self) -> dict[str, Any]:
733
  """Return a lightweight status snapshot for observability."""
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -15,6 +15,7 @@ from reachy_mini_conversation_demo.tools import (
15
  dispatch_tool_call,
16
  )
17
  from reachy_mini_conversation_demo.config import config
 
18
 
19
 
20
  logger = logging.getLogger(__name__)
@@ -59,7 +60,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
59
  "language": "en",
60
  },
61
  "voice": "ballad",
62
- "instructions": "We speak in English",
63
  "tools": ALL_TOOL_SPECS,
64
  "tool_choice": "auto",
65
  "temperature": 0.7,
@@ -71,14 +72,15 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
71
  async for event in self.connection:
72
  logger.debug(f"OpenAI event: {event.type}")
73
  if event.type == "input_audio_buffer.speech_started":
74
- self.clear_queue()
 
75
  self.deps.head_wobbler.reset()
76
  self.deps.movement_manager.set_listening(True)
77
- logger.debug("user speech started")
78
 
79
  if event.type == "input_audio_buffer.speech_stopped":
80
  self.deps.movement_manager.set_listening(False)
81
- logger.debug("user speech stopped")
82
 
83
  if event.type in ("response.audio.completed", "response.completed"):
84
  # Doesn't seem to be called
@@ -86,20 +88,19 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
86
  self.deps.head_wobbler.reset()
87
 
88
  if event.type == "response.created":
89
- logger.debug("response created")
90
 
91
  if event.type == "response.done":
92
  # Doesn't mean the audio is done playing
93
- logger.debug("response done")
94
  pass
95
- # self.deps.head_wobbler.reset()
96
 
97
  if event.type == "conversation.item.input_audio_transcription.completed":
98
- logger.debug(f"user transcript: {event.transcript}")
99
  await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
100
 
101
  if event.type == "response.audio_transcript.done":
102
- logger.debug(f"assistant transcript: {event.transcript}")
103
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
104
 
105
  if event.type == "response.audio.delta":
@@ -136,18 +137,18 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
136
  # 3) when args done, execute Python tool, send function_call_output, then trigger a new response
137
  if event.type == "response.function_call_arguments.done":
138
  call_id = getattr(event, "call_id", None)
139
- info = self._pending_calls.get(call_id)
140
- if not info:
141
  continue
142
- tool_name = info["name"]
143
- args_json_str = info["args_buf"] or "{}"
144
 
145
  try:
146
  tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
147
- logger.debug("[Tool %s executed]", tool_name)
148
  logger.debug("Tool result: %s", tool_result)
149
  except Exception as e:
150
- logger.error("Tool %s failed", tool_name)
151
  tool_result = {"error": str(e)}
152
 
153
  # send the tool result back
@@ -183,7 +184,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
183
  ],
184
  }
185
  )
186
- logger.info("additional input camera")
187
 
188
  np_img = self.deps.camera_worker.get_latest_frame()
189
  img = gr.Image(value=np_img)
@@ -256,6 +257,8 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
256
  dt = datetime.fromtimestamp(current_time)
257
  return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
258
 
 
 
259
  async def send_idle_signal(self, idle_duration) -> None:
260
  """Send an idle signal to the openai server."""
261
  logger.debug("Sending idle signal")
@@ -278,4 +281,3 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
278
  "tool_choice": "required",
279
  }
280
  )
281
- # TODO additional inputs
 
15
  dispatch_tool_call,
16
  )
17
  from reachy_mini_conversation_demo.config import config
18
+ from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
19
 
20
 
21
  logger = logging.getLogger(__name__)
 
60
  "language": "en",
61
  },
62
  "voice": "ballad",
63
+ "instructions": SESSION_INSTRUCTIONS,
64
  "tools": ALL_TOOL_SPECS,
65
  "tool_choice": "auto",
66
  "temperature": 0.7,
 
72
  async for event in self.connection:
73
  logger.debug(f"OpenAI event: {event.type}")
74
  if event.type == "input_audio_buffer.speech_started":
75
+ if hasattr(self, '_clear_queue'):
76
+ self._clear_queue()
77
  self.deps.head_wobbler.reset()
78
  self.deps.movement_manager.set_listening(True)
79
+ logger.debug("User speech started")
80
 
81
  if event.type == "input_audio_buffer.speech_stopped":
82
  self.deps.movement_manager.set_listening(False)
83
+ logger.debug("User speech stopped")
84
 
85
  if event.type in ("response.audio.completed", "response.completed"):
86
  # Doesn't seem to be called
 
88
  self.deps.head_wobbler.reset()
89
 
90
  if event.type == "response.created":
91
+ logger.debug("Response created")
92
 
93
  if event.type == "response.done":
94
  # Doesn't mean the audio is done playing
95
+ logger.debug("Response done")
96
  pass
 
97
 
98
  if event.type == "conversation.item.input_audio_transcription.completed":
99
+ logger.debug(f"User transcript: {event.transcript}")
100
  await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
101
 
102
  if event.type == "response.audio_transcript.done":
103
+ logger.debug(f"Assistant transcript: {event.transcript}")
104
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
105
 
106
  if event.type == "response.audio.delta":
 
137
  # 3) when args done, execute Python tool, send function_call_output, then trigger a new response
138
  if event.type == "response.function_call_arguments.done":
139
  call_id = getattr(event, "call_id", None)
140
+ tool_call_info = self._pending_calls.get(call_id)
141
+ if not tool_call_info:
142
  continue
143
+ tool_name = tool_call_info["name"]
144
+ args_json_str = tool_call_info["args_buf"] or "{}"
145
 
146
  try:
147
  tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
148
+ logger.debug("Tool '%s' executed successfully", tool_name)
149
  logger.debug("Tool result: %s", tool_result)
150
  except Exception as e:
151
+ logger.error("Tool '%s' failed", tool_name)
152
  tool_result = {"error": str(e)}
153
 
154
  # send the tool result back
 
184
  ],
185
  }
186
  )
187
+ logger.info("Added camera image to conversation")
188
 
189
  np_img = self.deps.camera_worker.get_latest_frame()
190
  img = gr.Image(value=np_img)
 
257
  dt = datetime.fromtimestamp(current_time)
258
  return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
259
 
260
+
261
+
262
  async def send_idle_signal(self, idle_duration) -> None:
263
  """Send an idle signal to the openai server."""
264
  logger.debug("Sending idle signal")
 
281
  "tool_choice": "required",
282
  }
283
  )
 
src/reachy_mini_conversation_demo/prompts.py CHANGED
@@ -5,6 +5,7 @@ SESSION_INSTRUCTIONS = r"""
 You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
 You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
 Personality: witty, concise, and warm; a retro sidekick with a loose screw.
+You speak English fluently.

 ### CRITICAL RESPONSE RULES
 - MAXIMUM 1-2 sentences per response. NEVER exceed this.
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -1,7 +1,6 @@
1
  from __future__ import annotations
2
  import abc
3
  import json
4
- import time
5
  import asyncio
6
  import inspect
7
  import logging
@@ -12,12 +11,8 @@ from reachy_mini import ReachyMini
12
  from reachy_mini.utils import create_head_pose
13
 
14
 
15
- # from reachy_mini_conversation_demo.vision.processors import VisionManager
16
-
17
  logger = logging.getLogger(__name__)
18
 
19
- ENABLE_FACE_RECOGNITION = False
20
-
21
  # Initialize dance and emotion libraries
22
  try:
23
  from reachy_mini.motion.recorded_move import RecordedMoves
@@ -40,25 +35,15 @@ except ImportError as e:
40
  DANCE_AVAILABLE = False
41
  EMOTION_AVAILABLE = False
42
 
43
- FACE_RECOGNITION_AVAILABLE = False
44
- if ENABLE_FACE_RECOGNITION:
45
- # Initialize face recognition
46
- try:
47
- from deepface import DeepFace
48
-
49
- FACE_RECOGNITION_AVAILABLE = True
50
- except ImportError as e:
51
- logger.warning(f"DeepFace not available: {e}")
52
 
53
-
54
- def all_concrete_subclasses(base):
55
  """Recursively find all concrete (non-abstract) subclasses of a base class."""
56
  result = []
57
  for cls in base.__subclasses__():
58
  if not inspect.isabstract(cls):
59
  result.append(cls)
60
  # recurse into subclasses
61
- result.extend(all_concrete_subclasses(cls))
62
  return result
63
 
64
 
@@ -76,30 +61,9 @@ class ToolDependencies:
76
  camera_worker: Optional[Any] = None # CameraWorker for frame buffering
77
  vision_manager: Optional[Any] = None
78
  head_wobbler: Optional[Any] = None # HeadWobbler for audio-reactive motion
79
- camera_retry_attempts: int = 5
80
- camera_retry_delay_s: float = 0.10
81
- vision_timeout_s: float = 8.0
82
  motion_duration_s: float = 1.0
83
 
84
 
85
- # Helpers - removed _read_frame as it's no longer needed with camera worker
86
-
87
-
88
- def _execute_motion(deps: ToolDependencies, target: Any) -> Dict[str, Any]:
89
- """Apply motion to reachy_mini and update movement_manager state."""
90
- movement_manager = deps.movement_manager
91
- movement_manager.moving_start = time.monotonic()
92
- movement_manager.moving_for = deps.motion_duration_s
93
- movement_manager.current_head_pose = target
94
- try:
95
- deps.reachy_mini.goto_target(target, duration=deps.motion_duration_s)
96
- except Exception as e:
97
- logger.exception("motion failed")
98
- return {"error": f"motion failed: {type(e).__name__}: {e}"}
99
-
100
- return {"status": "ok"}
101
-
102
-
103
  # Tool base class
104
  class Tool(abc.ABC):
105
  """Base abstraction for tools used in function-calling.
@@ -193,7 +157,7 @@ class MoveHead(Tool):
193
  return {"status": f"looking {direction}"}
194
 
195
  except Exception as e:
196
- logger.exception("move_head failed")
197
  return {"error": f"move_head failed: {type(e).__name__}: {e}"}
198
 
199
 
@@ -234,11 +198,15 @@ class Camera(Tool):
234
 
235
  # Use vision manager for processing if available
236
  if deps.vision_manager is not None:
237
- result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
238
- if isinstance(result, dict) and "error" in result:
239
- return result
 
 
240
  return (
241
- {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
 
 
242
  )
243
  else:
244
  # Return base64 encoded image like main_works.py camera tool
@@ -277,100 +245,6 @@ class HeadTracking(Tool):
277
  return {"status": f"head tracking {status}"}
278
 
279
 
280
- # class DescribeCurrentScene(Tool):
281
- # name = "describe_current_scene"
282
- # description = "Get a detailed description of the current scene."
283
- # parameters_schema = {"type": "object", "properties": {}, "required": []}
284
-
285
- # async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
286
- # logger.info("Tool call: describe_current_scene")
287
-
288
- # result = await deps.vision_manager.process_current_frame(
289
- # "Describe what you currently see in detail, focusing on people, objects, and activities."
290
- # )
291
-
292
- # if isinstance(result, dict) and "error" in result:
293
- # return result
294
- # return result
295
-
296
-
297
- # class GetSceneContext(Tool):
298
- # name = "get_scene_context"
299
- # description = (
300
- # "Get the most recent automatic scene description for conversational context."
301
- # )
302
- # parameters_schema = {"type": "object", "properties": {}, "required": []}
303
-
304
- # async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
305
- # logger.info("Tool call: get_scene_context")
306
- # vision_manager = deps.vision_manager
307
- # if not vision_manager:
308
- # return {"error": "Vision processing not available"}
309
-
310
- # try:
311
- # description = await deps.vision_manager.get_current_description()
312
-
313
- # if not description:
314
- # return {
315
- # "context": "No scene description available yet",
316
- # "note": "Vision processing may still be initializing",
317
- # }
318
- # return {
319
- # "context": description,
320
- # "note": "This comes from periodic automatic analysis",
321
- # }
322
- # except Exception as e:
323
- # logger.exception("Failed to get scene context")
324
- # return {"error": f"Scene context failed: {type(e).__name__}: {e}"}
325
-
326
-
327
- # class AnalyzeSceneFor(Tool):
328
- # name = "analyze_scene_for"
329
- # description = "Analyze the current scene for a specific purpose."
330
- # parameters_schema = {
331
- # "type": "object",
332
- # "properties": {
333
- # "purpose": {
334
- # "type": "string",
335
- # "enum": [
336
- # "safety",
337
- # "people",
338
- # "objects",
339
- # "activity",
340
- # "navigation",
341
- # "general",
342
- # ],
343
- # "default": "general",
344
- # }
345
- # },
346
- # "required": [],
347
- # }
348
-
349
- # async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
350
- # purpose = (kwargs.get("purpose") or "general").lower()
351
- # logger.info("Tool call: analyze_scene_for purpose=%s", purpose)
352
-
353
- # prompts = {
354
- # "safety": "Look for safety concerns, obstacles, or hazards.",
355
- # "people": "Describe people, their positions and actions.",
356
- # "objects": "Identify and describe main visible objects.",
357
- # "activity": "Describe ongoing activities or actions.",
358
- # "navigation": "Describe the space for navigation: obstacles, pathways, layout.",
359
- # "general": "Give a general description of the scene including people, objects, and activities.",
360
- # }
361
- # prompt = prompts.get(purpose, prompts["general"])
362
-
363
- # result = await deps.vision_manager.process_current_frame(prompt)
364
-
365
- # if isinstance(result, dict) and "error" in result:
366
- # return result
367
-
368
- # if not isinstance(result, dict):
369
- # return {"error": "vision returned non-dict"}
370
-
371
- # result["analysis_purpose"] = purpose
372
- # return result
373
-
374
 
375
  class Dance(Tool):
376
  """Play a named or random dance move once (or repeat). Non-blocking."""
@@ -461,25 +335,24 @@ class StopDance(Tool):
461
  """Stop the current dance move."""
462
  logger.info("Tool call: stop_dance")
463
  movement_manager = deps.movement_manager
464
- movement_manager.clear_queue()
465
  return {"status": "stopped dance and cleared queue"}
466
 
467
 
468
- def get_available_emotions_and_descriptions():
469
  """Get formatted list of available emotions with descriptions."""
470
- names = RECORDED_MOVES.list_moves()
471
-
472
- ret = """
473
- Available emotions:
474
-
475
- """
476
-
477
- for name in names:
478
- description = RECORDED_MOVES.get(name).description
479
- ret += f" - {name}: {description}\n"
480
-
481
- return ret
482
 
483
 
484
  class PlayEmotion(Tool):
485
  """Play a pre-recorded emotion."""
@@ -549,70 +422,10 @@ class StopEmotion(Tool):
549
  """Stop the current emotion."""
550
  logger.info("Tool call: stop_emotion")
551
  movement_manager = deps.movement_manager
552
- movement_manager.clear_queue()
553
  return {"status": "stopped emotion and cleared queue"}
554
 
555
 
556
- class FaceRecognition(Tool):
557
- """Get the name of the person you are talking to."""
558
-
559
- name = "get_person_name"
560
- description = "Get the name of the person you are talking to"
561
- parameters_schema = {
562
- "type": "object",
563
- "properties": {
564
- "dummy": {
565
- "type": "boolean",
566
- "description": "dummy boolean, set it to true",
567
- }
568
- },
569
- "required": ["dummy"],
570
- }
571
-
572
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
573
- """Get the name of the person you are talking to."""
574
- if not FACE_RECOGNITION_AVAILABLE:
575
- return {"error": "Face recognition not available"}
576
-
577
- logger.info("Tool call: face_recognition")
578
-
579
- try:
580
- # Get frame from camera worker buffer (like main_works.py)
581
- if deps.camera_worker is not None:
582
- frame = deps.camera_worker.get_latest_frame()
583
- if frame is None:
584
- logger.error("No frame available from camera worker")
585
- return {"error": "No frame available"}
586
- else:
587
- logger.error("Camera worker not available")
588
- return {"error": "Camera worker not available"}
589
-
590
- # Save frame temporarily (same as main_works.py pattern)
591
- temp_path = "/tmp/face_recognition.jpg"
592
- import cv2
593
-
594
- cv2.imwrite(temp_path, frame)
595
-
596
- # Use DeepFace to find face
597
- results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
598
-
599
- if len(results) == 0:
600
- return {"error": "Didn't recognize the face"}
601
-
602
- # Extract name from results
603
- name = "Unknown"
604
- for index, row in results[0].iterrows():
605
- file_path = row["identity"]
606
- name = file_path.split("/")[-2]
607
- break
608
-
609
- return {"answer": f"The name is {name}"}
610
-
611
- except Exception as e:
612
- logger.exception("Face recognition failed")
613
- return {"error": f"Face recognition failed: {str(e)}"}
614
-
615
-
616
  class DoNothing(Tool):
617
  """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
618
 
@@ -636,34 +449,18 @@ class DoNothing(Tool):
636
  return {"status": "doing nothing", "reason": reason}
637
 
638
 
639
- def get_available_emotions_and_descriptions() -> str:
640
- """Get formatted list of available emotions with descriptions."""
641
- if not EMOTION_AVAILABLE:
642
- return "Emotions not available"
643
-
644
- try:
645
- names = RECORDED_MOVES.list_moves()
646
- ret = "Available emotions:\n"
647
- for name in names:
648
- description = RECORDED_MOVES.get(name).description
649
- ret += f" - {name}: {description}\n"
650
- return ret
651
- except Exception as e:
652
- return f"Error getting emotions: {e}"
653
-
654
-
655
  # Registry & specs (dynamic)
656
 
657
  # List of available tool classes
658
- ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in all_concrete_subclasses(Tool)}
659
  ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
660
 
661
 
662
  # Dispatcher
663
  def _safe_load_obj(args_json: str) -> dict[str, Any]:
664
  try:
665
- obj = json.loads(args_json or "{}")
666
- return obj if isinstance(obj, dict) else {}
667
  except Exception:
668
  logger.warning("bad args_json=%r", args_json)
669
  return {}
 
1
  from __future__ import annotations
2
  import abc
3
  import json
 
4
  import asyncio
5
  import inspect
6
  import logging
 
11
  from reachy_mini.utils import create_head_pose
12
 
13
 
 
 
14
  logger = logging.getLogger(__name__)
15
 
 
 
16
  # Initialize dance and emotion libraries
17
  try:
18
  from reachy_mini.motion.recorded_move import RecordedMoves
 
35
  DANCE_AVAILABLE = False
36
  EMOTION_AVAILABLE = False
37
 
38
 
39
+ def get_concrete_subclasses(base):
 
40
  """Recursively find all concrete (non-abstract) subclasses of a base class."""
41
  result = []
42
  for cls in base.__subclasses__():
43
  if not inspect.isabstract(cls):
44
  result.append(cls)
45
  # recurse into subclasses
46
+ result.extend(get_concrete_subclasses(cls))
47
  return result
48
 
49
 
 
61
  camera_worker: Optional[Any] = None # CameraWorker for frame buffering
62
  vision_manager: Optional[Any] = None
63
  head_wobbler: Optional[Any] = None # HeadWobbler for audio-reactive motion
 
 
 
64
  motion_duration_s: float = 1.0
65
 
66
 
 
67
  # Tool base class
68
  class Tool(abc.ABC):
69
  """Base abstraction for tools used in function-calling.
 
157
  return {"status": f"looking {direction}"}
158
 
159
  except Exception as e:
160
+ logger.error("move_head failed")
161
  return {"error": f"move_head failed: {type(e).__name__}: {e}"}
162
 
163
 
 
198
 
199
  # Use vision manager for processing if available
200
  if deps.vision_manager is not None:
201
+ vision_result = await asyncio.to_thread(
202
+ deps.vision_manager.processor.process_image, frame, image_query
203
+ )
204
+ if isinstance(vision_result, dict) and "error" in vision_result:
205
+ return vision_result
206
  return (
207
+ {"image_description": vision_result}
208
+ if isinstance(vision_result, str)
209
+ else {"error": "vision returned non-string"}
210
  )
211
  else:
212
  # Return base64 encoded image like main_works.py camera tool
 
245
  return {"status": f"head tracking {status}"}
246
 
247
248
 
249
  class Dance(Tool):
250
  """Play a named or random dance move once (or repeat). Non-blocking."""
 
335
  """Stop the current dance move."""
336
  logger.info("Tool call: stop_dance")
337
  movement_manager = deps.movement_manager
338
+ movement_manager.clear_move_queue()
339
  return {"status": "stopped dance and cleared queue"}
340
 
341
 
342
+ def get_available_emotions_and_descriptions() -> str:
343
  """Get formatted list of available emotions with descriptions."""
344
+ if not EMOTION_AVAILABLE:
345
+ return "Emotions not available"
346
 
347
+ try:
348
+ emotion_names = RECORDED_MOVES.list_moves()
349
+ output = "Available emotions:\n"
350
+ for name in emotion_names:
351
+ description = RECORDED_MOVES.get(name).description
352
+ output += f" - {name}: {description}\n"
353
+ return output
354
+ except Exception as e:
355
+ return f"Error getting emotions: {e}"
356
 
357
  class PlayEmotion(Tool):
358
  """Play a pre-recorded emotion."""
 
422
  """Stop the current emotion."""
423
  logger.info("Tool call: stop_emotion")
424
  movement_manager = deps.movement_manager
425
+ movement_manager.clear_move_queue()
426
  return {"status": "stopped emotion and cleared queue"}
427
 
428
 
 
 
429
  class DoNothing(Tool):
430
  """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
431
 
 
449
  return {"status": "doing nothing", "reason": reason}
450
 
451
 
 
452
  # Registry & specs (dynamic)
453
 
454
  # List of available tool classes
455
+ ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
456
  ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
457
 
458
 
459
  # Dispatcher
460
  def _safe_load_obj(args_json: str) -> dict[str, Any]:
461
  try:
462
+ parsed_args = json.loads(args_json or "{}")
463
+ return parsed_args if isinstance(parsed_args, dict) else {}
464
  except Exception:
465
  logger.warning("bad args_json=%r", args_json)
466
  return {}
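The registry is built by walking `Tool` subclasses rather than maintaining a hand-written list, so deleting a tool class (as this commit does for face recognition) automatically drops it from `ALL_TOOLS` and `ALL_TOOL_SPECS`. A self-contained sketch of the same discovery pattern, with a simplified hypothetical `Tool` base:

```python
# Discover every concrete subclass of a base class and instantiate it once,
# keyed by its declared name: the same pattern tools.py uses for ALL_TOOLS.
import abc
import inspect

class Tool(abc.ABC):
    name: str

    @abc.abstractmethod
    def __call__(self):
        ...

class Wave(Tool):
    name = "wave"

    def __call__(self):
        return {"status": "waving"}

def get_concrete_subclasses(base):
    result = []
    for cls in base.__subclasses__():
        if not inspect.isabstract(cls):
            result.append(cls)
        result.extend(get_concrete_subclasses(cls))  # recurse into subclasses
    return result

ALL_TOOLS = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
print(list(ALL_TOOLS))  # ['wave']
```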
src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -3,6 +3,7 @@ import argparse
3
  import warnings
4
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
 
6
 
7
 
8
  def parse_args():
@@ -21,26 +22,27 @@ def parse_args():
21
 
22
 
23
  def handle_vision_stuff(args, current_robot):
24
- """Initialize camera, head tracker and camera worker."""
25
  camera_worker = None
26
  head_tracker = None
27
  vision_manager = None
 
28
  if not args.no_camera:
 
29
  if args.head_tracker is not None:
30
  if args.head_tracker == "yolo":
31
- from reachy_mini_conversation_demo.vision.yolo_head_tracker import (
32
- HeadTracker,
33
- )
34
-
35
  head_tracker = HeadTracker()
36
-
37
  elif args.head_tracker == "mediapipe":
38
  from reachy_mini_toolbox.vision import HeadTracker
39
-
40
  head_tracker = HeadTracker()
41
 
 
42
  camera_worker = CameraWorker(current_robot, head_tracker)
43
 
 
 
 
44
  return camera_worker, head_tracker, vision_manager
45
 
46
 
 
3
  import warnings
4
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
6
+ from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager
7
 
8
 
9
  def parse_args():
 
22
 
23
 
24
  def handle_vision_stuff(args, current_robot):
25
+ """Initialize camera, head tracker, camera worker, and vision manager."""
26
  camera_worker = None
27
  head_tracker = None
28
  vision_manager = None
29
+
30
  if not args.no_camera:
31
+ # Initialize head tracker if specified
32
  if args.head_tracker is not None:
33
  if args.head_tracker == "yolo":
34
+ from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker
 
 
 
35
  head_tracker = HeadTracker()
 
36
  elif args.head_tracker == "mediapipe":
37
  from reachy_mini_toolbox.vision import HeadTracker
 
38
  head_tracker = HeadTracker()
39
 
40
+ # Initialize camera worker
41
  camera_worker = CameraWorker(current_robot, head_tracker)
42
 
43
+ # Initialize vision manager (handles model download and configuration)
44
+ vision_manager = initialize_vision_manager(camera_worker)
45
+
46
  return camera_worker, head_tracker, vision_manager
47
 
48
 
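For context, a hedged sketch of how a caller might wire these helpers together; the `ReachyMini()` construction is an assumption, since the demo's actual entry point is not part of this diff:

```python
# Hypothetical wiring of the vision helpers; the real main() is not shown here.
from reachy_mini import ReachyMini
from reachy_mini_conversation_demo.utils import parse_args, handle_vision_stuff

args = parse_args()
robot = ReachyMini()  # assumed constructor; see the reachy_mini package docs

camera_worker, head_tracker, vision_manager = handle_vision_stuff(args, robot)
if camera_worker is None:
    print("Camera disabled (--no-camera); vision tools will be unavailable")
```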
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
2
- import sys
3
  import time
4
  import base64
5
  import asyncio
6
  import logging
7
  import threading
8
- from typing import Any, Dict
9
  from dataclasses import dataclass
10
 
11
  import cv2
@@ -14,6 +13,8 @@ import torch
14
  from transformers import AutoProcessor, AutoModelForImageTextToText
15
  from huggingface_hub import snapshot_download
16
 
 
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
@@ -22,11 +23,9 @@ logger = logging.getLogger(__name__)
22
  class VisionConfig:
23
  """Configuration for vision processing."""
24
 
25
- processor_type: str = "local"
26
- model_path: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
27
  vision_interval: float = 5.0
28
  max_new_tokens: int = 64
29
- temperature: float = 0.7
30
  jpeg_quality: int = 85
31
  max_retries: int = 3
32
  retry_delay: float = 1.0
@@ -36,17 +35,17 @@ class VisionConfig:
36
  class VisionProcessor:
37
  """Handles SmolVLM2 model loading and inference."""
38
 
39
- def __init__(self, config: VisionConfig = None):
40
  """Initialize the vision processor."""
41
- self.config = config or VisionConfig()
42
- self.model_path = self.config.model_path
43
  self.device = self._determine_device()
44
  self.processor = None
45
  self.model = None
46
  self._initialized = False
47
 
48
  def _determine_device(self) -> str:
49
- pref = self.config.device_preference
50
  if pref == "cpu":
51
  return "cpu"
52
  if pref == "cuda":
@@ -61,7 +60,7 @@ class VisionProcessor:
61
  def initialize(self) -> bool:
62
  """Load model and processor onto the selected device."""
63
  try:
64
- logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
65
  self.processor = AutoProcessor.from_pretrained(self.model_path)
66
 
67
  # Select dtype depending on device
@@ -98,13 +97,13 @@ class VisionProcessor:
98
  if not self._initialized:
99
  return "Vision model not initialized"
100
 
101
- for attempt in range(self.config.max_retries):
102
  try:
103
  # Convert to JPEG bytes
104
  success, jpeg_buffer = cv2.imencode(
105
  ".jpg",
106
  cv2_image,
107
- [cv2.IMWRITE_JPEG_QUALITY, self.config.jpeg_quality],
108
  )
109
  if not success:
110
  return "Failed to encode image"
@@ -140,7 +139,7 @@ class VisionProcessor:
140
  generated_ids = self.model.generate(
141
  **inputs,
142
  do_sample=False,
143
- max_new_tokens=self.config.max_new_tokens,
144
  pad_token_id=self.processor.tokenizer.eos_token_id,
145
  )
146
 
@@ -165,17 +164,17 @@ class VisionProcessor:
165
  logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
166
  if self.device == "cuda":
167
  torch.cuda.empty_cache()
168
- if attempt < self.config.max_retries - 1:
169
- time.sleep(self.config.retry_delay * (attempt + 1))
170
  else:
171
  return "GPU out of memory - vision processing failed"
172
 
173
  except Exception as e:
174
  logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
175
- if attempt < self.config.max_retries - 1:
176
- time.sleep(self.config.retry_delay)
177
  else:
178
- return f"Vision processing error after {self.config.max_retries} attempts"
179
 
180
  def _extract_response(self, full_text: str) -> str:
181
  """Extract the assistant's response from the full generated text."""
@@ -194,7 +193,6 @@ class VisionProcessor:
194
  def get_model_info(self) -> Dict[str, Any]:
195
  """Get information about the loaded model."""
196
  return {
197
- "processor_type": "local",
198
  "initialized": self._initialized,
199
  "device": self.device,
200
  "model_path": self.model_path,
@@ -208,14 +206,13 @@ class VisionProcessor:
208
  class VisionManager:
209
  """Manages periodic vision processing and scene understanding."""
210
 
211
- def __init__(self, camera, config: VisionConfig = None):
212
  """Initialize vision manager with camera and configuration."""
213
  self.camera = camera
214
- self.config = config or VisionConfig()
215
- self.vision_interval = self.config.vision_interval
216
- self.processor = create_vision_processor(self.config) # Use factory function
217
 
218
- self._current_description = ""
219
  self._last_processed_time = 0
220
 
221
  # Initialize processor
@@ -230,8 +227,8 @@ class VisionManager:
230
  current_time = time.time()
231
 
232
  if current_time - self._last_processed_time >= self.vision_interval:
233
- success, frame = await asyncio.to_thread(self.camera.read)
234
- if success and frame is not None:
235
  description = await asyncio.to_thread(
236
  lambda: self.processor.process_image(
237
  frame, "Briefly describe what you see in one sentence."
@@ -240,7 +237,6 @@ class VisionManager:
240
 
241
  # Only update if we got a valid response
242
  if description and not description.startswith(("Vision", "Failed", "Error")):
243
- self._current_description = description
244
  self._last_processed_time = current_time
245
 
246
  logger.info(f"Vision update: {description}")
@@ -255,29 +251,6 @@ class VisionManager:
255
 
256
  logger.info("Vision loop finished")
257
 
258
- async def get_current_description(self) -> str:
259
- """Get the most recent scene description (thread-safe)."""
260
- return self._current_description
261
-
262
- async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
263
- """Process current camera frame with custom prompt."""
264
- try:
265
- success, frame = self.camera.read()
266
- if not success or frame is None:
267
- return {"error": "Failed to capture image from camera"}
268
-
269
- description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
270
-
271
- return {
272
- "description": description,
273
- "timestamp": time.time(),
274
- "prompt": prompt,
275
- }
276
-
277
- except Exception as e:
278
- logger.exception("Failed to process current frame")
279
- return {"error": f"Frame processing failed: {str(e)}"}
280
-
281
  async def get_status(self) -> Dict[str, Any]:
282
  """Get comprehensive status information."""
283
  return {
@@ -285,84 +258,59 @@ class VisionManager:
285
  "processor_info": self.processor.get_model_info(),
286
  "config": {
287
  "interval": self.vision_interval,
288
- "processor_type": self.config.processor_type,
289
  },
290
  }
291
 
292
 
293
- def init_camera(camera_index=0, simulation=True):
294
- """Initialize camera (real or simulated)."""
295
- api_preference = cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
296
-
297
- if simulation:
298
- # Default build-in camera in SIM
299
- # TODO: please, test on Linux and Windows
300
- camera = cv2.VideoCapture(0, api_preference)
301
- else:
302
- # TODO handle macos properly
303
- if sys.platform == "darwin":
304
- camera = cv2.VideoCapture(camera_index, cv2.CAP_AVFOUNDATION)
305
- else:
306
- camera = cv2.VideoCapture(camera_index)
307
 
308
- return camera
309
-
310
-
311
- def create_vision_processor(config: VisionConfig):
312
- """Create the appropriate vision processor (factory)."""
313
- if config.processor_type == "openai":
314
- try:
315
- from .openai_vision import OpenAIVisionProcessor
316
 
317
- return OpenAIVisionProcessor(config)
318
- except ImportError:
319
- logger.error("OpenAI vision processor not available, falling back to local")
320
- return VisionProcessor(config)
321
- else:
322
- return VisionProcessor(config)
323
 
324
 
325
- def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
326
- """Initialize vision manager with the specified processor type."""
327
- model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
328
-
329
- cache_dir = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
330
-
331
- # Only download model if using local processor
332
- if processor_type == "local":
333
- try:
334
- os.makedirs(cache_dir, exist_ok=True)
335
- os.environ["HF_HOME"] = cache_dir
336
- logger.info("HF_HOME set to %s", cache_dir)
337
- except Exception as e:
338
- logger.warning("Failed to prepare HF cache dir %s: %s", cache_dir, e)
339
- return None
340
-
341
  snapshot_download(
342
  repo_id=model_id,
343
  repo_type="model",
344
  cache_dir=cache_dir,
345
  )
346
- logger.info(f"Prefetched model_id={model_id} into cache_dir={cache_dir}")
347
-
348
- # Configure vision processing
349
- vision_config = VisionConfig(
350
- processor_type=processor_type,
351
- model_path=model_id,
352
- vision_interval=5.0,
353
- max_new_tokens=64,
354
- temperature=0.7,
355
- jpeg_quality=85,
356
- max_retries=3,
357
- retry_delay=1.0,
358
- device_preference="auto",
359
- )
360
-
361
- vision_manager = VisionManager(camera, vision_config)
362
-
363
- device_info = vision_manager.processor.get_model_info()
364
- logger.info(
365
- f"Vision processing enabled: {device_info.get('model_path', device_info.get('processor_type'))} on {device_info.get('device', 'API')}",
366
- )
367
-
368
- return vision_manager
 
 
 
 
 
1
  import os
 
2
  import time
3
  import base64
4
  import asyncio
5
  import logging
6
  import threading
7
+ from typing import Any, Dict, Optional
8
  from dataclasses import dataclass
9
 
10
  import cv2
 
13
  from transformers import AutoProcessor, AutoModelForImageTextToText
14
  from huggingface_hub import snapshot_download
15
 
16
+ from reachy_mini_conversation_demo.config import config
17
+
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
23
  class VisionConfig:
24
  """Configuration for vision processing."""
25
 
26
+ model_path: str = config.LOCAL_VISION_MODEL
 
27
  vision_interval: float = 5.0
28
  max_new_tokens: int = 64
 
29
  jpeg_quality: int = 85
30
  max_retries: int = 3
31
  retry_delay: float = 1.0
 
35
  class VisionProcessor:
36
  """Handles SmolVLM2 model loading and inference."""
37
 
38
+ def __init__(self, vision_config: VisionConfig = None):
39
  """Initialize the vision processor."""
40
+ self.vision_config = vision_config or VisionConfig()
41
+ self.model_path = self.vision_config.model_path
42
  self.device = self._determine_device()
43
  self.processor = None
44
  self.model = None
45
  self._initialized = False
46
 
47
  def _determine_device(self) -> str:
48
+ pref = self.vision_config.device_preference
49
  if pref == "cpu":
50
  return "cpu"
51
  if pref == "cuda":
 
60
  def initialize(self) -> bool:
61
  """Load model and processor onto the selected device."""
62
  try:
63
+ logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
64
  self.processor = AutoProcessor.from_pretrained(self.model_path)
65
 
66
  # Select dtype depending on device
 
97
  if not self._initialized:
98
  return "Vision model not initialized"
99
 
100
+ for attempt in range(self.vision_config.max_retries):
101
  try:
102
  # Convert to JPEG bytes
103
  success, jpeg_buffer = cv2.imencode(
104
  ".jpg",
105
  cv2_image,
106
+ [cv2.IMWRITE_JPEG_QUALITY, self.vision_config.jpeg_quality],
107
  )
108
  if not success:
109
  return "Failed to encode image"
 
139
  generated_ids = self.model.generate(
140
  **inputs,
141
  do_sample=False,
142
+ max_new_tokens=self.vision_config.max_new_tokens,
143
  pad_token_id=self.processor.tokenizer.eos_token_id,
144
  )
145
 
 
164
  logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
165
  if self.device == "cuda":
166
  torch.cuda.empty_cache()
167
+ if attempt < self.vision_config.max_retries - 1:
168
+ time.sleep(self.vision_config.retry_delay * (attempt + 1))
169
  else:
170
  return "GPU out of memory - vision processing failed"
171
 
172
  except Exception as e:
173
  logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
174
+ if attempt < self.vision_config.max_retries - 1:
175
+ time.sleep(self.vision_config.retry_delay)
176
  else:
177
+ return f"Vision processing error after {self.vision_config.max_retries} attempts"
178
 
179
  def _extract_response(self, full_text: str) -> str:
180
  """Extract the assistant's response from the full generated text."""
 
193
  def get_model_info(self) -> Dict[str, Any]:
194
  """Get information about the loaded model."""
195
  return {
 
196
  "initialized": self._initialized,
197
  "device": self.device,
198
  "model_path": self.model_path,
 
206
  class VisionManager:
207
  """Manages periodic vision processing and scene understanding."""
208
 
209
+ def __init__(self, camera, vision_config: VisionConfig = None):
210
  """Initialize vision manager with camera and configuration."""
211
  self.camera = camera
212
+ self.vision_config = vision_config or VisionConfig()
213
+ self.vision_interval = self.vision_config.vision_interval
214
+ self.processor = VisionProcessor(self.vision_config)
215
 
 
216
  self._last_processed_time = 0
217
 
218
  # Initialize processor
 
227
  current_time = time.time()
228
 
229
  if current_time - self._last_processed_time >= self.vision_interval:
230
+ frame = self.camera.get_latest_frame()
231
+ if frame is not None:
232
  description = await asyncio.to_thread(
233
  lambda: self.processor.process_image(
234
  frame, "Briefly describe what you see in one sentence."
 
237
 
238
  # Only update if we got a valid response
239
  if description and not description.startswith(("Vision", "Failed", "Error")):
 
240
  self._last_processed_time = current_time
241
 
242
  logger.info(f"Vision update: {description}")
 
251
 
252
  logger.info("Vision loop finished")
253
 
254
  async def get_status(self) -> Dict[str, Any]:
255
  """Get comprehensive status information."""
256
  return {
 
258
  "processor_info": self.processor.get_model_info(),
259
  "config": {
260
  "interval": self.vision_interval,
 
261
  },
262
  }
263
 
264
 
265
+ def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
266
+ """Initialize vision manager with model download and configuration.
267
 
268
+ Args:
269
+ camera_worker: CameraWorker instance for frame capture
270
+ Returns:
271
+ VisionManager instance or None if initialization fails
272
 
273
+ """
274
+ try:
275
+ model_id = config.LOCAL_VISION_MODEL
276
+ cache_dir = os.path.expanduser(config.HF_HOME)
 
 
277
 
278
+ # Prepare cache directory
279
+ os.makedirs(cache_dir, exist_ok=True)
280
+ os.environ["HF_HOME"] = cache_dir
281
+ logger.info("HF_HOME set to %s", cache_dir)
282
 
283
+ # Download model to cache
284
+ logger.info(f"Downloading vision model {model_id} to cache...")
285
  snapshot_download(
286
  repo_id=model_id,
287
  repo_type="model",
288
  cache_dir=cache_dir,
289
  )
290
+ logger.info(f"Model {model_id} downloaded to {cache_dir}")
291
+
292
+ # Configure vision processing
293
+ vision_config = VisionConfig(
294
+ model_path=model_id,
295
+ vision_interval=5.0,
296
+ max_new_tokens=64,
297
+ jpeg_quality=85,
298
+ max_retries=3,
299
+ retry_delay=1.0,
300
+ device_preference="auto",
301
+ )
302
+
303
+ # Initialize vision manager
304
+ vision_manager = VisionManager(camera_worker, vision_config)
305
+
306
+ # Log device info
307
+ device_info = vision_manager.processor.get_model_info()
308
+ logger.info(
309
+ f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
310
+ )
311
+
312
+ return vision_manager
313
+
314
+ except Exception as e:
315
+ logger.error(f"Failed to initialize vision manager: {e}")
316
+ return None
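The helper above depends on config.LOCAL_VISION_MODEL and config.HF_HOME from reachy_mini_conversation_demo.config, which is outside this diff. A minimal sketch of the shape that module is assumed to have, with environment-backed values; the class name and defaults here are illustrative only, not the project's actual implementation:

    import os

    class _Config:
        # Assumed shape of reachy_mini_conversation_demo.config (illustrative only)
        LOCAL_VISION_MODEL: str = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        HF_HOME: str = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))

    config = _Config()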
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -94,77 +94,6 @@ class HeadTracker:
94
 
95
  return np.array([norm_x, norm_y], dtype=np.float32)
96
 
97
- def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
98
- """Get eye positions (approximated from face bbox).
99
-
100
- Note: YOLO only provides face bbox, so we estimate eye positions
101
-
102
- Args:
103
- img: Input image
104
-
105
- Returns:
106
- Tuple of (left_eye, right_eye) in [-1, 1] coordinates
107
-
108
- """
109
- h, w = img.shape[:2]
110
-
111
- # Run YOLO inference
112
- results = self.model(img, verbose=False)
113
- detections = Detections.from_ultralytics(results[0])
114
-
115
- # Select best face
116
- face_idx = self._select_best_face(detections)
117
- if face_idx is None:
118
- return None, None
119
-
120
- bbox = detections.xyxy[face_idx]
121
-
122
- # Estimate eye positions from face bbox (approximate locations)
123
- face_width = bbox[2] - bbox[0]
124
- face_height = bbox[3] - bbox[1]
125
-
126
- # Eye positions are roughly at 1/3 and 2/3 of face width, 1/3 of face height
127
- eye_y = bbox[1] + face_height * 0.35
128
- left_eye_x = bbox[0] + face_width * 0.35
129
- right_eye_x = bbox[0] + face_width * 0.65
130
-
131
- # Convert to MediaPipe coordinates
132
- left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
133
- right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
134
-
135
- return left_eye, right_eye
136
-
137
- def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
138
- """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
139
- if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
140
- raise ValueError("Face landmarks object missing required attributes")
141
-
142
- bbox = face_landmarks._bbox
143
- h, w = face_landmarks._img_shape[:2]
144
-
145
- # Estimate eyes from stored bbox
146
- face_width = bbox[2] - bbox[0]
147
- face_height = bbox[3] - bbox[1]
148
-
149
- eye_y = bbox[1] + face_height * 0.35
150
- left_eye_x = bbox[0] + face_width * 0.35
151
- right_eye_x = bbox[0] + face_width * 0.65
152
-
153
- left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
154
- right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
155
-
156
- return left_eye, right_eye
157
-
158
- def get_eye_center(self, face_landmarks) -> np.ndarray:
159
- """Get center point between estimated eyes."""
160
- left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
161
- return np.mean([left_eye, right_eye], axis=0)
162
-
163
- def get_roll(self, face_landmarks) -> float:
164
- """Estimate roll from eye positions (will be 0 for YOLO since we estimate symmetric eyes)."""
165
- left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
166
- return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
167
-
168
  def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
169
  """Get head position from face detection.
170
 
@@ -204,18 +133,3 @@ class HeadTracker:
204
  except Exception as e:
205
  logger.error(f"Error in head position detection: {e}")
206
  return None, None
207
-
208
- def cleanup(self):
209
- """Clean up resources."""
210
- if hasattr(self, "model"):
211
- del self.model
212
- logger.info("YOLO model cleaned up")
213
-
214
-
215
- class FaceLandmarks:
216
- """Simple container for face detection results to maintain API compatibility."""
217
-
218
- def __init__(self, bbox: np.ndarray, img_shape: tuple):
219
- """Initialize with bounding box and image shape."""
220
- self._bbox = bbox
221
- self._img_shape = img_shape
 
94
 
95
  return np.array([norm_x, norm_y], dtype=np.float32)
96
 
97
  def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
98
  """Get head position from face detection.
99
 
 
133
  except Exception as e:
134
  logger.error(f"Error in head position detection: {e}")
135
  return None, None
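With the landmark-estimation helpers removed, get_head_position is the tracker's remaining public entry point: it takes an OpenCV frame and returns a position (or (None, None) when no face is detected). A minimal caller sketch, assuming a plain webcam capture; the semantics of the second return value are not shown in this excerpt:

    import cv2
    from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker

    tracker = HeadTracker()
    cap = cv2.VideoCapture(0)
    ok, frame = cap.read()
    if ok:
        position, extra = tracker.get_head_position(frame)
        if position is not None:
            print(position)  # normalized image coordinates, [-1, 1] per the helper above
    cap.release()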
uv.lock CHANGED
The diff for this file is too large to render. See raw diff