Alina committed on
Commit 3e83b93 · unverified · 2 parent(s): 8dda3b1 3a775a2

Merge pull request #53 from pollen-robotics/more_cleanup
.env.example CHANGED
@@ -1,6 +1,9 @@
 OPENAI_API_KEY=
 MODEL_NAME="gpt-realtime"

+# Local vision model
+LOCAL_VISION_MODEL=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
 # Cache for local VLM
 HF_HOME=./cache

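These variables are read at startup by `config.py` (updated later in this commit); a minimal sketch of the loading pattern it uses:

```python
# Sketch of how the variables above are consumed (mirrors config.py in this PR).
import os
from dotenv import load_dotenv

load_dotenv()  # populate os.environ from the .env file

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")                       # required
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")               # optional
HF_HOME = os.getenv("HF_HOME", "./cache")                          # optional
LOCAL_VISION_MODEL = os.getenv(                                    # new in this PR
    "LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
)
```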
.gitignore CHANGED
@@ -1,202 +1,57 @@
1
- # General
2
- .DS_Store
3
- .AppleDouble
4
- .LSOverride
5
-
6
- # Icon must end with two \r
7
- Icon
8
-
9
- # Thumbnails
10
- ._*
11
-
12
- # Files that might appear in the root of a volume
13
- .DocumentRevisions-V100
14
- .fseventsd
15
- .Spotlight-V100
16
- .TemporaryItems
17
- .Trashes
18
- .VolumeIcon.icns
19
- .com.apple.timemachine.donotpresent
20
-
21
- # Directories potentially created on remote AFP share
22
- .AppleDB
23
- .AppleDesktop
24
- Network Trash Folder
25
- Temporary Items
26
- .apdisk
27
-
28
- # Byte-compiled / optimized / DLL files
29
  __pycache__/
30
  *.py[cod]
31
  *$py.class
32
-
33
- # C extensions
34
  *.so
35
 
36
- # Distribution / packaging
37
- .Python
38
  build/
39
- develop-eggs/
40
  dist/
41
- downloads/
42
- eggs/
43
- .eggs/
44
- lib/
45
- lib64/
46
- parts/
47
- sdist/
48
- var/
49
- wheels/
50
- share/python-wheels/
51
  *.egg-info/
52
- .installed.cfg
53
- *.egg
54
- MANIFEST
55
-
56
- # PyInstaller
57
- # Usually these files are written by a python script from a template
58
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
59
- *.manifest
60
- *.spec
61
-
62
- # Installer logs
63
- pip-log.txt
64
- pip-delete-this-directory.txt
65
 
66
- # Unit test / coverage reports
67
- htmlcov/
68
- .tox/
69
- .nox/
70
  .coverage
71
- .coverage.*
72
- .cache
73
- cache/
74
- nosetests.xml
75
  coverage.xml
76
  *.cover
77
- *.py,cover
78
- .hypothesis/
79
- .pytest_cache/
80
- cover/
81
 
82
- # Ruff cache
83
  .ruff_cache/
84
 
85
- # Translations
86
- *.mo
87
- *.pot
88
-
89
- # Django stuff:
90
- *.log
91
- local_settings.py
92
- db.sqlite3
93
- db.sqlite3-journal
94
-
95
- # Flask stuff:
96
- instance/
97
- .webassets-cache
98
-
99
- # Scrapy stuff:
100
- .scrapy
101
 
102
- # Sphinx documentation
103
- docs/_build/
104
-
105
- # PyBuilder
106
- .pybuilder/
107
- target/
108
-
109
- # Jupyter Notebook
110
- .ipynb_checkpoints
111
-
112
- # IPython
113
- profile_default/
114
- ipython_config.py
115
-
116
- # pyenv
117
- # For a library or package, you might want to ignore these files since the code is
118
- # intended to run in multiple environments; otherwise, check them in:
119
- # .python-version
120
-
121
- # pipenv
122
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
123
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
124
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
125
- # install all needed dependencies.
126
- #Pipfile.lock
127
-
128
- # poetry
129
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
130
- # This is especially recommended for binary packages to ensure reproducibility, and is more
131
- # commonly ignored for libraries.
132
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
133
- #poetry.lock
134
-
135
- # pdm
136
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
137
- #pdm.lock
138
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
139
- # in version control.
140
- # https://pdm.fming.dev/#use-with-ide
141
- .pdm.toml
142
-
143
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
144
- __pypackages__/
145
-
146
- # Celery stuff
147
- celerybeat-schedule
148
- celerybeat.pid
149
-
150
- # SageMath parsed files
151
- *.sage.py
152
-
153
- # Environments
154
- .env
155
- .venv
156
- env/
157
- venv/
158
- ENV/
159
- env.bak/
160
- venv.bak/
161
-
162
- # Keys
163
- *.csr
164
  *.key
165
  *.pem
 
 
166
 
167
- # Spyder project settings
168
- .spyderproject
169
- .spyproject
170
-
171
- # Rope project settings
172
- .ropeproject
173
-
174
- # mkdocs documentation
175
- /site
176
-
177
- # mypy
178
- .mypy_cache/
179
- .dmypy.json
180
- dmypy.json
181
-
182
- # Pyre type checker
183
- .pyre/
184
-
185
- # pytype static type analyzer
186
- .pytype/
187
-
188
- # Cython debug symbols
189
- cython_debug/
190
-
191
- # PyCharm
192
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
193
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
194
- # and can be added to the global gitignore or merged into this file. For a more nuclear
195
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
196
- #.idea/
197
 
198
- # Certificates
199
- *.crt
200
 
201
- # Temporary folder
202
- tmp/
 
 
 
 
1
+ # Python
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
 
 
5
  *.so
6
 
7
+ # Virtual environments
8
+ .venv/
9
+ venv/
10
+ ENV/
11
+ env/
12
+
13
+ # Environment variables
14
+ .env
15
+
16
+ # Build and distribution
17
  build/
 
18
  dist/
19
  *.egg-info/
20
+ .eggs/
21
 
22
+ # Testing
23
+ .pytest_cache/
 
 
24
  .coverage
25
+ .hypothesis/
26
+ htmlcov/
 
 
27
  coverage.xml
28
  *.cover
 
 
 
 
29
 
30
+ # Linting and formatting
31
  .ruff_cache/
32
 
33
+ # IDE
34
+ .vscode/
35
+ .idea/
36
+ *.swp
37
+ *.swo
38
 
39
+ # Security
40
  *.key
41
  *.pem
42
+ *.crt
43
+ *.csr
44
 
45
+ # Temporary files
46
+ tmp/
47
+ *.log
48
+ cache/
49
 
50
+ # macOS
51
+ .DS_Store
52
 
53
+ # Linux
54
+ *~
55
+ .directory
56
+ .Trash-*
57
+ .nfs*
README.md CHANGED
@@ -1,12 +1,13 @@
1
  # Reachy Mini conversation demo
2
 
3
  Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
 
4
 
5
  ## Overview
6
  - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
7
- - Camera capture can route to OpenAI multimodal vision or stay on-device with SmolVLM2 local analysis.
8
  - Layered motion system queues primary moves (dances, emotions, goto poses, breathing) while blending speech-reactive wobble and face-tracking.
9
- - Async tool dispatch integrates robot motion, camera capture, and optional facial-recognition helpers through a Gradio web UI with live transcripts.
10
 
11
  ## Installation
12
 
@@ -74,8 +75,9 @@ Some wheels (e.g. PyTorch) are large and require compatible CUDA or CPU builds
74
  |----------|-------------|
75
  | `OPENAI_API_KEY` | Required. Grants access to the OpenAI realtime endpoint.
76
  | `MODEL_NAME` | Override the realtime model (defaults to `gpt-realtime`).
77
- | `HF_HOME` | Cache directory for local Hugging Face downloads.
78
- | `HF_TOKEN` | Optional token for Hugging Face models.
 
79
 
80
  ## Running the demo
81
 
@@ -85,13 +87,13 @@ Activate your virtual environment, ensure the Reachy Mini robot (or simulator) i
85
  reachy-mini-conversation-demo
86
  ```
87
 
88
- The app starts a Gradio UI served locally (http://127.0.0.1:7860/). When running on a headless host, use `--headless`. With a camera attached, captured frames can be analysed remotely through OpenAI multimodal models or locally via the YOLO/MediaPipe pipelines depending on the extras you installed.
89
 
90
  ### CLI options
91
 
92
  | Option | Default | Description |
93
  |--------|---------|-------------|
94
- | `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. Requires the matching optional extra. |
95
  | `--no-camera` | `False` | Run without camera capture or face tracking. |
96
  | `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. |
97
  | `--debug` | `False` | Enable verbose logging for troubleshooting. |
@@ -116,12 +118,11 @@ The app starts a Gradio UI served locally (http://127.0.0.1:7860/). When running
116
  |------|--------|--------------|
117
  | `move_head` | Queue a head pose change (left/right/up/down/front). | Core install only. |
118
  | `camera` | Capture the latest camera frame and optionally query a vision backend. | Requires camera worker; vision analysis depends on selected extras. |
119
- | `head_tracking` | Enable or disable face-tracking offsets. | Camera worker with configured head tracker. |
120
  | `dance` | Queue a dance from `reachy_mini_dances_library`. | Core install only. |
121
  | `stop_dance` | Clear queued dances. | Core install only. |
122
  | `play_emotion` | Play a recorded emotion clip via Hugging Face assets. | Needs `HF_TOKEN` for the recorded emotions dataset. |
123
  | `stop_emotion` | Clear queued emotions. | Core install only. |
124
- | `get_person_name` | DeepFace-based recognition of the current person. | Disabled by default (`ENABLE_FACE_RECOGNITION=False`); requires `deepface` and a local face database. |
125
  | `do_nothing` | Explicitly remain idle. | Core install only. |
126
 
127
  ## Development workflow
 
1
  # Reachy Mini conversation demo
2
 
3
  Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
4
+ ![Reachy Mini Dance](src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif)
5
 
6
  ## Overview
7
  - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
8
+ - Local vision processing using SmolVLM2 model running on-device (CPU/GPU/MPS).
9
  - Layered motion system queues primary moves (dances, emotions, goto poses, breathing) while blending speech-reactive wobble and face-tracking.
10
+ - Async tool dispatch integrates robot motion, camera capture, and optional face-tracking capabilities through a Gradio web UI with live transcripts.
11
 
12
  ## Installation
13
 
 
75
  |----------|-------------|
76
  | `OPENAI_API_KEY` | Required. Grants access to the OpenAI realtime endpoint.
77
  | `MODEL_NAME` | Override the realtime model (defaults to `gpt-realtime`).
78
+ | `HF_HOME` | Cache directory for local Hugging Face downloads (defaults to `./cache`).
79
+ | `HF_TOKEN` | Optional token for Hugging Face models (falls back to `huggingface-cli login`).
80
+ | `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`).
81
 
82
  ## Running the demo
83
 
 
87
  reachy-mini-conversation-demo
88
  ```
89
 
90
+ By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). With a camera attached, captured frames are analyzed locally using the SmolVLM2 vision model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed.
91
 
92
  ### CLI options
93
 
94
  | Option | Default | Description |
95
  |--------|---------|-------------|
96
+ | `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. YOLO is implemented locally; MediaPipe comes from the `reachy_mini_toolbox` package. Requires the matching optional extra. |
97
  | `--no-camera` | `False` | Run without camera capture or face tracking. |
98
  | `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. |
99
  | `--debug` | `False` | Enable verbose logging for troubleshooting. |
 
118
  |------|--------|--------------|
119
  | `move_head` | Queue a head pose change (left/right/up/down/front). | Core install only. |
120
  | `camera` | Capture the latest camera frame and optionally query a vision backend. | Requires camera worker; vision analysis depends on selected extras. |
121
+ | `head_tracking` | Enable or disable face-tracking offsets (not facial recognition; it only detects and tracks face position). | Camera worker with configured head tracker. |
122
  | `dance` | Queue a dance from `reachy_mini_dances_library`. | Core install only. |
123
  | `stop_dance` | Clear queued dances. | Core install only. |
124
  | `play_emotion` | Play a recorded emotion clip via Hugging Face assets. | Needs `HF_TOKEN` for the recorded emotions dataset. |
125
  | `stop_emotion` | Clear queued emotions. | Core install only. |
 
126
  | `do_nothing` | Explicitly remain idle. | Core install only. |
127
 
128
  ## Development workflow
pyproject.toml CHANGED
@@ -16,6 +16,7 @@ dependencies = [
     "gradio>=5.49.0",
     "huggingface_hub>=0.34.4",
     "opencv-python>=4.12.0.88",
+    "num2words",

     #Environment variables
     "python-dotenv",
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -120,7 +120,6 @@ class SwayRollRT:
120
  self._seed = int(rng_seed)
121
  self.samples = deque(maxlen=10 * SR) # sliding window for VAD/env
122
  self.carry = np.zeros(0, dtype=np.float32)
123
- self.frame_idx = 0
124
 
125
  self.vad_on = False
126
  self.vad_above = 0
@@ -143,7 +142,6 @@ class SwayRollRT:
143
  """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
144
  self.samples.clear()
145
  self.carry = np.zeros(0, dtype=np.float32)
146
- self.frame_idx = 0
147
  self.vad_on = False
148
  self.vad_above = 0
149
  self.vad_below = 0
@@ -152,16 +150,6 @@ class SwayRollRT:
152
  self.sway_down = 0
153
  self.t = 0.0
154
 
155
- def reset_phases(self) -> None:
156
- """Re-randomize phases deterministically from stored seed (Optional)."""
157
- rng = np.random.default_rng(self._seed)
158
- self.phase_pitch = float(rng.random() * 2 * math.pi)
159
- self.phase_yaw = float(rng.random() * 2 * math.pi)
160
- self.phase_roll = float(rng.random() * 2 * math.pi)
161
- self.phase_x = float(rng.random() * 2 * math.pi)
162
- self.phase_y = float(rng.random() * 2 * math.pi)
163
- self.phase_z = float(rng.random() * 2 * math.pi)
164
-
165
  def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
166
  """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
167
 
@@ -196,7 +184,6 @@ class SwayRollRT:
196
  self.samples.extend(hop.tolist())
197
  if len(self.samples) < FRAME:
198
  self.t += HOP_MS / 1000.0
199
- self.frame_idx += 1
200
  continue
201
 
202
  frame = np.fromiter(
 
120
  self._seed = int(rng_seed)
121
  self.samples = deque(maxlen=10 * SR) # sliding window for VAD/env
122
  self.carry = np.zeros(0, dtype=np.float32)
 
123
 
124
  self.vad_on = False
125
  self.vad_above = 0
 
142
  """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
143
  self.samples.clear()
144
  self.carry = np.zeros(0, dtype=np.float32)
 
145
  self.vad_on = False
146
  self.vad_above = 0
147
  self.vad_below = 0
 
150
  self.sway_down = 0
151
  self.t = 0.0
152
 
153
  def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
154
  """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
155
 
 
184
  self.samples.extend(hop.tolist())
185
  if len(self.samples) < FRAME:
186
  self.t += HOP_MS / 1000.0
 
187
  continue
188
 
189
  frame = np.fromiter(
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -1,17 +1,28 @@
1
  import os
 
 
2
 
3
  from dotenv import load_dotenv
4
 
5
 
6
- load_dotenv()
7
 
8
 
9
- def getenv_bool(key: str, default: bool = False) -> bool:
10
- """Read env var as a Python bool (case-insensitive)."""
11
- val = os.getenv(key)
12
- if val is None:
13
- return default
14
- return val.strip().lower() in {"true", "1", "yes", "on"}
 
15
 
16
 
17
  class Config:
@@ -19,13 +30,23 @@ class Config:
19
 
20
  # Required
21
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
22
- if not OPENAI_API_KEY:
23
- raise RuntimeError("OPENAI_API_KEY is missing in .env")
 
 
 
 
 
 
 
24
 
25
  # Optional
26
  MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
27
  HF_HOME = os.getenv("HF_HOME", "./cache")
 
28
  HF_TOKEN = os.getenv("HF_TOKEN") # Optional, falls back to hf auth login if not set
29
 
 
 
30
 
31
  config = Config()
 
1
  import os
2
+ import logging
3
+ from pathlib import Path
4
 
5
  from dotenv import load_dotenv
6
 
7
 
8
+ logger = logging.getLogger(__name__)
9
 
10
+ # Check if .env file exists
11
+ env_file = Path(".env")
12
+ if not env_file.exists():
13
+ raise RuntimeError(
14
+ ".env file not found. Please create one based on .env.example:\n"
15
+ " cp .env.example .env\n"
16
+ "Then add your OPENAI_API_KEY to the .env file."
17
+ )
18
 
19
+ # Load .env and verify it was loaded successfully
20
+ if not load_dotenv():
21
+ raise RuntimeError(
22
+ "Failed to load .env file. Please ensure the file is readable and properly formatted."
23
+ )
24
+
25
+ logger.info("Configuration loaded from .env file")
26
 
27
 
28
  class Config:
 
30
 
31
  # Required
32
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
33
+ if OPENAI_API_KEY is None:
34
+ raise RuntimeError(
35
+ "OPENAI_API_KEY is not set in .env file. Please add it:\n"
36
+ " OPENAI_API_KEY=your_api_key_here"
37
+ )
38
+ if not OPENAI_API_KEY.strip():
39
+ raise RuntimeError(
40
+ "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
41
+ )
42
 
43
  # Optional
44
  MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
45
  HF_HOME = os.getenv("HF_HOME", "./cache")
46
+ LOCAL_VISION_MODEL = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
47
  HF_TOKEN = os.getenv("HF_TOKEN") # Optional, falls back to hf auth login if not set
48
 
49
+ logger.debug(f"Model: {MODEL_NAME}, HF_HOME: {HF_HOME}, Vision Model: {LOCAL_VISION_MODEL}")
50
+
51
 
52
  config = Config()
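Other modules import the shared `config` instance directly (as `openai_realtime.py` does below); a short usage sketch:

```python
# The RuntimeErrors above fire at import time if .env is missing or the key is
# unset/empty, so any module importing config can rely on a valid setup.
from reachy_mini_conversation_demo.config import config

print(config.MODEL_NAME)          # "gpt-realtime" unless overridden in .env
print(config.LOCAL_VISION_MODEL)  # defaults to HuggingFaceTB/SmolVLM2-2.2B-Instruct
```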
src/reachy_mini_conversation_demo/console.py CHANGED
@@ -26,7 +26,7 @@ class LocalStream:
26
  self._stop_event = asyncio.Event()
27
  self._tasks = []
28
  # Allow the handler to flush the player queue when appropriate.
29
- self.handler._clear_queue = self.clear_queue # type: ignore[assignment]
30
 
31
  def launch(self) -> None:
32
  """Start the recorder/player and run the async processing loops."""
@@ -69,7 +69,7 @@ class LocalStream:
69
  self._robot.media.stop_recording()
70
  self._robot.media.stop_playing()
71
 
72
- def clear_queue(self) -> None:
73
  """Flush the player's appsrc to drop any queued audio immediately."""
74
  logger.info("User intervention: flushing player queue")
75
  self.handler.output_queue = asyncio.Queue()
@@ -78,9 +78,9 @@ class LocalStream:
78
  """Read mic frames from the recorder and forward them to the handler."""
79
  logger.info("Starting receive loop")
80
  while not self._stop_event.is_set():
81
- data = self._robot.media.get_audio_sample()
82
- if data is not None:
83
- frame_mono = data.T[0] # both channels are identical
84
  frame = audio_to_int16(frame_mono)
85
  await self.handler.receive((16000, frame))
86
  # await asyncio.sleep(0) # yield to event loop
@@ -90,10 +90,10 @@ class LocalStream:
90
  async def play_loop(self) -> None:
91
  """Fetch outputs from the handler: log text and play audio frames."""
92
  while not self._stop_event.is_set():
93
- data = await self.handler.emit()
94
 
95
- if isinstance(data, AdditionalOutputs):
96
- for msg in data.args:
97
  content = msg.get("content", "")
98
  if isinstance(content, str):
99
  logger.info(
@@ -102,14 +102,17 @@ class LocalStream:
102
  content if len(content) < 500 else content[:500] + "…",
103
  )
104
 
105
- elif isinstance(data, tuple):
106
- sample_rate, frame = data
107
  device_sample_rate = self._robot.media.get_audio_samplerate()
108
- frame = audio_to_float32(frame.squeeze())
109
- if sample_rate != device_sample_rate:
110
- frame = librosa.resample(frame, orig_sr=sample_rate, target_sr=device_sample_rate)
111
- self._robot.media.push_audio_sample(frame)
 
 
112
 
113
- # else: ignore None/unknown outputs
 
114
 
115
  await asyncio.sleep(0) # yield to event loop
 
26
  self._stop_event = asyncio.Event()
27
  self._tasks = []
28
  # Allow the handler to flush the player queue when appropriate.
29
+ self.handler._clear_queue = self.clear_audio_queue # type: ignore[assignment]
30
 
31
  def launch(self) -> None:
32
  """Start the recorder/player and run the async processing loops."""
 
69
  self._robot.media.stop_recording()
70
  self._robot.media.stop_playing()
71
 
72
+ def clear_audio_queue(self) -> None:
73
  """Flush the player's appsrc to drop any queued audio immediately."""
74
  logger.info("User intervention: flushing player queue")
75
  self.handler.output_queue = asyncio.Queue()
 
78
  """Read mic frames from the recorder and forward them to the handler."""
79
  logger.info("Starting receive loop")
80
  while not self._stop_event.is_set():
81
+ audio_frame = self._robot.media.get_audio_sample()
82
+ if audio_frame is not None:
83
+ frame_mono = audio_frame.T[0] # both channels are identical
84
  frame = audio_to_int16(frame_mono)
85
  await self.handler.receive((16000, frame))
86
  # await asyncio.sleep(0) # yield to event loop
 
90
  async def play_loop(self) -> None:
91
  """Fetch outputs from the handler: log text and play audio frames."""
92
  while not self._stop_event.is_set():
93
+ handler_output = await self.handler.emit()
94
 
95
+ if isinstance(handler_output, AdditionalOutputs):
96
+ for msg in handler_output.args:
97
  content = msg.get("content", "")
98
  if isinstance(content, str):
99
  logger.info(
 
102
  content if len(content) < 500 else content[:500] + "…",
103
  )
104
 
105
+ elif isinstance(handler_output, tuple):
106
+ input_sample_rate, audio_frame = handler_output
107
  device_sample_rate = self._robot.media.get_audio_samplerate()
108
+ audio_frame = audio_to_float32(audio_frame.squeeze())
109
+ if input_sample_rate != device_sample_rate:
110
+ audio_frame = librosa.resample(
111
+ audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
112
+ )
113
+ self._robot.media.push_audio_sample(audio_frame)
114
 
115
+ else:
116
+ logger.debug("Ignoring output type=%s", type(handler_output).__name__)
117
 
118
  await asyncio.sleep(0) # yield to event loop
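The renamed `play_loop` variables make the resampling step easier to follow: audio from the handler is converted to float32 and resampled only when its rate differs from the device rate. A standalone sketch of that step with synthetic data (the 24 kHz/48 kHz rates are illustrative assumptions):

```python
# Resample a mono float32 frame to the playback device's rate, skipping the
# call when the rates already match, as play_loop does.
import numpy as np
import librosa

input_sample_rate, device_sample_rate = 24000, 48000   # assumed example rates
audio_frame = np.zeros(2400, dtype=np.float32)          # 100 ms at 24 kHz

if input_sample_rate != device_sample_rate:
    audio_frame = librosa.resample(
        audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
    )

print(audio_frame.shape)  # (4800,) after upsampling
```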
src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif ADDED

Git LFS Details

  • SHA256: 75914c3cb7af982e0b1c6369e25fc46d8c08a0ab5ad022240ae9c1a0d93967c3
  • Pointer size: 132 Bytes
  • Size of remote file: 3.93 MB
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -190,13 +190,7 @@ class MovementState:
190
  0.0,
191
  )
192
 
193
- # Legacy movement state (for goto moves)
194
- moving_start: float = 0.0
195
- moving_for: float = 0.0
196
-
197
  # Status flags
198
- is_playing_move: bool = False
199
- is_moving: bool = False
200
  last_primary_pose: Optional[FullBodyPose] = None
201
 
202
  def update_activity(self) -> None:
@@ -325,7 +319,7 @@ class MovementManager:
325
  """
326
  self._command_queue.put(("queue_move", move))
327
 
328
- def clear_queue(self) -> None:
329
  """Stop the active move and discard any queued primary moves.
330
 
331
  Thread-safe: executed by the worker thread via the command queue.
@@ -361,10 +355,6 @@ class MovementManager:
361
 
362
  return self._now() - last_activity >= self.idle_inactivity_delay
363
 
364
- def mark_user_activity(self) -> None:
365
- """Record external activity and postpone idle behaviours (thread-safe)."""
366
- self._command_queue.put(("mark_activity", None))
367
-
368
  def set_listening(self, listening: bool) -> None:
369
  """Enable or disable listening mode without touching shared state directly.
370
 
@@ -427,7 +417,7 @@ class MovementManager:
427
  duration_str = str(duration)
428
  else:
429
  duration_str = "?"
430
- logger.info(
431
  "Queued move with duration %ss, queue size: %s",
432
  duration_str,
433
  len(self.move_queue),
@@ -438,7 +428,6 @@ class MovementManager:
438
  self.move_queue.clear()
439
  self.state.current_move = None
440
  self.state.move_start_time = None
441
- self.state.is_playing_move = False
442
  self._breathing_active = False
443
  logger.info("Cleared move queue and stopped current move")
444
  elif command == "set_moving_state":
@@ -447,8 +436,6 @@ class MovementManager:
447
  except (TypeError, ValueError):
448
  logger.warning("Invalid moving state duration: %s", payload)
449
  return
450
- self.state.moving_start = current_time
451
- self.state.moving_for = max(0.0, duration)
452
  self.state.update_activity()
453
  elif command == "mark_activity":
454
  self.state.update_activity()
@@ -534,7 +521,7 @@ class MovementManager:
534
  self.state.current_move = None
535
  self.state.move_start_time = None
536
  self._breathing_active = False
537
- logger.info("Stopping breathing due to new move activity")
538
 
539
  if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
540
  self._breathing_active = False
@@ -561,14 +548,9 @@ class MovementManager:
561
  float(body_yaw),
562
  )
563
 
564
- self.state.is_playing_move = True
565
- self.state.is_moving = True
566
  self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
567
  else:
568
  # Otherwise reuse the last primary pose so we avoid jumps between moves
569
- self.state.is_playing_move = False
570
- self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
571
-
572
  if self.state.last_primary_pose is not None:
573
  primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
574
  else:
@@ -746,7 +728,6 @@ class MovementManager:
746
  self._thread.join()
747
  self._thread = None
748
  logger.debug("Move worker stopped")
749
- logger.debug("Move worker stopped")
750
 
751
  def get_status(self) -> dict[str, Any]:
752
  """Return a lightweight status snapshot for observability."""
 
190
  0.0,
191
  )
192
 
 
 
 
 
193
  # Status flags
 
 
194
  last_primary_pose: Optional[FullBodyPose] = None
195
 
196
  def update_activity(self) -> None:
 
319
  """
320
  self._command_queue.put(("queue_move", move))
321
 
322
+ def clear_move_queue(self) -> None:
323
  """Stop the active move and discard any queued primary moves.
324
 
325
  Thread-safe: executed by the worker thread via the command queue.
 
355
 
356
  return self._now() - last_activity >= self.idle_inactivity_delay
357
 
 
 
 
 
358
  def set_listening(self, listening: bool) -> None:
359
  """Enable or disable listening mode without touching shared state directly.
360
 
 
417
  duration_str = str(duration)
418
  else:
419
  duration_str = "?"
420
+ logger.debug(
421
  "Queued move with duration %ss, queue size: %s",
422
  duration_str,
423
  len(self.move_queue),
 
428
  self.move_queue.clear()
429
  self.state.current_move = None
430
  self.state.move_start_time = None
 
431
  self._breathing_active = False
432
  logger.info("Cleared move queue and stopped current move")
433
  elif command == "set_moving_state":
 
436
  except (TypeError, ValueError):
437
  logger.warning("Invalid moving state duration: %s", payload)
438
  return
 
 
439
  self.state.update_activity()
440
  elif command == "mark_activity":
441
  self.state.update_activity()
 
521
  self.state.current_move = None
522
  self.state.move_start_time = None
523
  self._breathing_active = False
524
+ logger.debug("Stopping breathing due to new move activity")
525
 
526
  if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
527
  self._breathing_active = False
 
548
  float(body_yaw),
549
  )
550
 
 
 
551
  self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
552
  else:
553
  # Otherwise reuse the last primary pose so we avoid jumps between moves
 
 
 
554
  if self.state.last_primary_pose is not None:
555
  primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
556
  else:
 
728
  self._thread.join()
729
  self._thread = None
730
  logger.debug("Move worker stopped")
 
731
 
732
  def get_status(self) -> dict[str, Any]:
733
  """Return a lightweight status snapshot for observability."""
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -15,6 +15,7 @@ from reachy_mini_conversation_demo.tools import (
15
  dispatch_tool_call,
16
  )
17
  from reachy_mini_conversation_demo.config import config
 
18
 
19
 
20
  logger = logging.getLogger(__name__)
@@ -59,7 +60,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
59
  "language": "en",
60
  },
61
  "voice": "ballad",
62
- "instructions": "We speak in English",
63
  "tools": ALL_TOOL_SPECS,
64
  "tool_choice": "auto",
65
  "temperature": 0.7,
@@ -71,14 +72,15 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
71
  async for event in self.connection:
72
  logger.debug(f"OpenAI event: {event.type}")
73
  if event.type == "input_audio_buffer.speech_started":
74
- self.clear_queue()
 
75
  self.deps.head_wobbler.reset()
76
  self.deps.movement_manager.set_listening(True)
77
- logger.debug("user speech started")
78
 
79
  if event.type == "input_audio_buffer.speech_stopped":
80
  self.deps.movement_manager.set_listening(False)
81
- logger.debug("user speech stopped")
82
 
83
  if event.type in ("response.audio.completed", "response.completed"):
84
  # Doesn't seem to be called
@@ -86,20 +88,19 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
86
  self.deps.head_wobbler.reset()
87
 
88
  if event.type == "response.created":
89
- logger.debug("response created")
90
 
91
  if event.type == "response.done":
92
  # Doesn't mean the audio is done playing
93
- logger.debug("response done")
94
  pass
95
- # self.deps.head_wobbler.reset()
96
 
97
  if event.type == "conversation.item.input_audio_transcription.completed":
98
- logger.debug(f"user transcript: {event.transcript}")
99
  await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
100
 
101
  if event.type == "response.audio_transcript.done":
102
- logger.debug(f"assistant transcript: {event.transcript}")
103
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
104
 
105
  if event.type == "response.audio.delta":
@@ -136,18 +137,18 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
136
  # 3) when args done, execute Python tool, send function_call_output, then trigger a new response
137
  if event.type == "response.function_call_arguments.done":
138
  call_id = getattr(event, "call_id", None)
139
- info = self._pending_calls.get(call_id)
140
- if not info:
141
  continue
142
- tool_name = info["name"]
143
- args_json_str = info["args_buf"] or "{}"
144
 
145
  try:
146
  tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
147
- logger.debug("[Tool %s executed]", tool_name)
148
  logger.debug("Tool result: %s", tool_result)
149
  except Exception as e:
150
- logger.error("Tool %s failed", tool_name)
151
  tool_result = {"error": str(e)}
152
 
153
  # send the tool result back
@@ -183,7 +184,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
183
  ],
184
  }
185
  )
186
- logger.info("additional input camera")
187
 
188
  np_img = self.deps.camera_worker.get_latest_frame()
189
  img = gr.Image(value=np_img)
@@ -256,6 +257,8 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
256
  dt = datetime.fromtimestamp(current_time)
257
  return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
258
 
 
 
259
  async def send_idle_signal(self, idle_duration) -> None:
260
  """Send an idle signal to the openai server."""
261
  logger.debug("Sending idle signal")
@@ -278,4 +281,3 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
278
  "tool_choice": "required",
279
  }
280
  )
281
- # TODO additional inputs
 
15
  dispatch_tool_call,
16
  )
17
  from reachy_mini_conversation_demo.config import config
18
+ from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
19
 
20
 
21
  logger = logging.getLogger(__name__)
 
60
  "language": "en",
61
  },
62
  "voice": "ballad",
63
+ "instructions": SESSION_INSTRUCTIONS,
64
  "tools": ALL_TOOL_SPECS,
65
  "tool_choice": "auto",
66
  "temperature": 0.7,
 
72
  async for event in self.connection:
73
  logger.debug(f"OpenAI event: {event.type}")
74
  if event.type == "input_audio_buffer.speech_started":
75
+ if hasattr(self, '_clear_queue'):
76
+ self._clear_queue()
77
  self.deps.head_wobbler.reset()
78
  self.deps.movement_manager.set_listening(True)
79
+ logger.debug("User speech started")
80
 
81
  if event.type == "input_audio_buffer.speech_stopped":
82
  self.deps.movement_manager.set_listening(False)
83
+ logger.debug("User speech stopped")
84
 
85
  if event.type in ("response.audio.completed", "response.completed"):
86
  # Doesn't seem to be called
 
88
  self.deps.head_wobbler.reset()
89
 
90
  if event.type == "response.created":
91
+ logger.debug("Response created")
92
 
93
  if event.type == "response.done":
94
  # Doesn't mean the audio is done playing
95
+ logger.debug("Response done")
96
  pass
 
97
 
98
  if event.type == "conversation.item.input_audio_transcription.completed":
99
+ logger.debug(f"User transcript: {event.transcript}")
100
  await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
101
 
102
  if event.type == "response.audio_transcript.done":
103
+ logger.debug(f"Assistant transcript: {event.transcript}")
104
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
105
 
106
  if event.type == "response.audio.delta":
 
137
  # 3) when args done, execute Python tool, send function_call_output, then trigger a new response
138
  if event.type == "response.function_call_arguments.done":
139
  call_id = getattr(event, "call_id", None)
140
+ tool_call_info = self._pending_calls.get(call_id)
141
+ if not tool_call_info:
142
  continue
143
+ tool_name = tool_call_info["name"]
144
+ args_json_str = tool_call_info["args_buf"] or "{}"
145
 
146
  try:
147
  tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
148
+ logger.debug("Tool '%s' executed successfully", tool_name)
149
  logger.debug("Tool result: %s", tool_result)
150
  except Exception as e:
151
+ logger.error("Tool '%s' failed", tool_name)
152
  tool_result = {"error": str(e)}
153
 
154
  # send the tool result back
 
184
  ],
185
  }
186
  )
187
+ logger.info("Added camera image to conversation")
188
 
189
  np_img = self.deps.camera_worker.get_latest_frame()
190
  img = gr.Image(value=np_img)
 
257
  dt = datetime.fromtimestamp(current_time)
258
  return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
259
 
260
+
261
+
262
  async def send_idle_signal(self, idle_duration) -> None:
263
  """Send an idle signal to the openai server."""
264
  logger.debug("Sending idle signal")
 
281
  "tool_choice": "required",
282
  }
283
  )
 
src/reachy_mini_conversation_demo/prompts.py CHANGED
@@ -5,6 +5,7 @@ SESSION_INSTRUCTIONS = r"""
 You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
 You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
 Personality: witty, concise, and warm; a retro sidekick with a loose screw.
+You speak English fluently.

 ### CRITICAL RESPONSE RULES
 - MAXIMUM 1-2 sentences per response. NEVER exceed this.
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -1,7 +1,6 @@
1
  from __future__ import annotations
2
  import abc
3
  import json
4
- import time
5
  import asyncio
6
  import inspect
7
  import logging
@@ -12,12 +11,8 @@ from reachy_mini import ReachyMini
12
  from reachy_mini.utils import create_head_pose
13
 
14
 
15
- # from reachy_mini_conversation_demo.vision.processors import VisionManager
16
-
17
  logger = logging.getLogger(__name__)
18
 
19
- ENABLE_FACE_RECOGNITION = False
20
-
21
  # Initialize dance and emotion libraries
22
  try:
23
  from reachy_mini.motion.recorded_move import RecordedMoves
@@ -40,25 +35,15 @@ except ImportError as e:
40
  DANCE_AVAILABLE = False
41
  EMOTION_AVAILABLE = False
42
 
43
- FACE_RECOGNITION_AVAILABLE = False
44
- if ENABLE_FACE_RECOGNITION:
45
- # Initialize face recognition
46
- try:
47
- from deepface import DeepFace
48
-
49
- FACE_RECOGNITION_AVAILABLE = True
50
- except ImportError as e:
51
- logger.warning(f"DeepFace not available: {e}")
52
 
53
-
54
- def all_concrete_subclasses(base):
55
  """Recursively find all concrete (non-abstract) subclasses of a base class."""
56
  result = []
57
  for cls in base.__subclasses__():
58
  if not inspect.isabstract(cls):
59
  result.append(cls)
60
  # recurse into subclasses
61
- result.extend(all_concrete_subclasses(cls))
62
  return result
63
 
64
 
@@ -76,30 +61,9 @@ class ToolDependencies:
76
  camera_worker: Optional[Any] = None # CameraWorker for frame buffering
77
  vision_manager: Optional[Any] = None
78
  head_wobbler: Optional[Any] = None # HeadWobbler for audio-reactive motion
79
- camera_retry_attempts: int = 5
80
- camera_retry_delay_s: float = 0.10
81
- vision_timeout_s: float = 8.0
82
  motion_duration_s: float = 1.0
83
 
84
 
85
- # Helpers - removed _read_frame as it's no longer needed with camera worker
86
-
87
-
88
- def _execute_motion(deps: ToolDependencies, target: Any) -> Dict[str, Any]:
89
- """Apply motion to reachy_mini and update movement_manager state."""
90
- movement_manager = deps.movement_manager
91
- movement_manager.moving_start = time.monotonic()
92
- movement_manager.moving_for = deps.motion_duration_s
93
- movement_manager.current_head_pose = target
94
- try:
95
- deps.reachy_mini.goto_target(target, duration=deps.motion_duration_s)
96
- except Exception as e:
97
- logger.exception("motion failed")
98
- return {"error": f"motion failed: {type(e).__name__}: {e}"}
99
-
100
- return {"status": "ok"}
101
-
102
-
103
  # Tool base class
104
  class Tool(abc.ABC):
105
  """Base abstraction for tools used in function-calling.
@@ -193,7 +157,7 @@ class MoveHead(Tool):
193
  return {"status": f"looking {direction}"}
194
 
195
  except Exception as e:
196
- logger.exception("move_head failed")
197
  return {"error": f"move_head failed: {type(e).__name__}: {e}"}
198
 
199
 
@@ -234,11 +198,15 @@ class Camera(Tool):
234
 
235
  # Use vision manager for processing if available
236
  if deps.vision_manager is not None:
237
- result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
238
- if isinstance(result, dict) and "error" in result:
239
- return result
 
 
240
  return (
241
- {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
 
 
242
  )
243
  else:
244
  # Return base64 encoded image like main_works.py camera tool
@@ -277,100 +245,6 @@ class HeadTracking(Tool):
277
  return {"status": f"head tracking {status}"}
278
 
279
 
280
- # class DescribeCurrentScene(Tool):
281
- # name = "describe_current_scene"
282
- # description = "Get a detailed description of the current scene."
283
- # parameters_schema = {"type": "object", "properties": {}, "required": []}
284
-
285
- # async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
286
- # logger.info("Tool call: describe_current_scene")
287
-
288
- # result = await deps.vision_manager.process_current_frame(
289
- # "Describe what you currently see in detail, focusing on people, objects, and activities."
290
- # )
291
-
292
- # if isinstance(result, dict) and "error" in result:
293
- # return result
294
- # return result
295
-
296
-
297
- # class GetSceneContext(Tool):
298
- # name = "get_scene_context"
299
- # description = (
300
- # "Get the most recent automatic scene description for conversational context."
301
- # )
302
- # parameters_schema = {"type": "object", "properties": {}, "required": []}
303
-
304
- # async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
305
- # logger.info("Tool call: get_scene_context")
306
- # vision_manager = deps.vision_manager
307
- # if not vision_manager:
308
- # return {"error": "Vision processing not available"}
309
-
310
- # try:
311
- # description = await deps.vision_manager.get_current_description()
312
-
313
- # if not description:
314
- # return {
315
- # "context": "No scene description available yet",
316
- # "note": "Vision processing may still be initializing",
317
- # }
318
- # return {
319
- # "context": description,
320
- # "note": "This comes from periodic automatic analysis",
321
- # }
322
- # except Exception as e:
323
- # logger.exception("Failed to get scene context")
324
- # return {"error": f"Scene context failed: {type(e).__name__}: {e}"}
325
-
326
-
327
- # class AnalyzeSceneFor(Tool):
328
- # name = "analyze_scene_for"
329
- # description = "Analyze the current scene for a specific purpose."
330
- # parameters_schema = {
331
- # "type": "object",
332
- # "properties": {
333
- # "purpose": {
334
- # "type": "string",
335
- # "enum": [
336
- # "safety",
337
- # "people",
338
- # "objects",
339
- # "activity",
340
- # "navigation",
341
- # "general",
342
- # ],
343
- # "default": "general",
344
- # }
345
- # },
346
- # "required": [],
347
- # }
348
-
349
- # async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
350
- # purpose = (kwargs.get("purpose") or "general").lower()
351
- # logger.info("Tool call: analyze_scene_for purpose=%s", purpose)
352
-
353
- # prompts = {
354
- # "safety": "Look for safety concerns, obstacles, or hazards.",
355
- # "people": "Describe people, their positions and actions.",
356
- # "objects": "Identify and describe main visible objects.",
357
- # "activity": "Describe ongoing activities or actions.",
358
- # "navigation": "Describe the space for navigation: obstacles, pathways, layout.",
359
- # "general": "Give a general description of the scene including people, objects, and activities.",
360
- # }
361
- # prompt = prompts.get(purpose, prompts["general"])
362
-
363
- # result = await deps.vision_manager.process_current_frame(prompt)
364
-
365
- # if isinstance(result, dict) and "error" in result:
366
- # return result
367
-
368
- # if not isinstance(result, dict):
369
- # return {"error": "vision returned non-dict"}
370
-
371
- # result["analysis_purpose"] = purpose
372
- # return result
373
-
374
 
375
  class Dance(Tool):
376
  """Play a named or random dance move once (or repeat). Non-blocking."""
@@ -461,25 +335,24 @@ class StopDance(Tool):
461
  """Stop the current dance move."""
462
  logger.info("Tool call: stop_dance")
463
  movement_manager = deps.movement_manager
464
- movement_manager.clear_queue()
465
  return {"status": "stopped dance and cleared queue"}
466
 
467
 
468
- def get_available_emotions_and_descriptions():
469
  """Get formatted list of available emotions with descriptions."""
470
- names = RECORDED_MOVES.list_moves()
471
-
472
- ret = """
473
- Available emotions:
474
-
475
- """
476
-
477
- for name in names:
478
- description = RECORDED_MOVES.get(name).description
479
- ret += f" - {name}: {description}\n"
480
-
481
- return ret
482
 
483
 
484
  class PlayEmotion(Tool):
485
  """Play a pre-recorded emotion."""
@@ -549,70 +422,10 @@ class StopEmotion(Tool):
549
  """Stop the current emotion."""
550
  logger.info("Tool call: stop_emotion")
551
  movement_manager = deps.movement_manager
552
- movement_manager.clear_queue()
553
  return {"status": "stopped emotion and cleared queue"}
554
 
555
 
556
- class FaceRecognition(Tool):
557
- """Get the name of the person you are talking to."""
558
-
559
- name = "get_person_name"
560
- description = "Get the name of the person you are talking to"
561
- parameters_schema = {
562
- "type": "object",
563
- "properties": {
564
- "dummy": {
565
- "type": "boolean",
566
- "description": "dummy boolean, set it to true",
567
- }
568
- },
569
- "required": ["dummy"],
570
- }
571
-
572
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
573
- """Get the name of the person you are talking to."""
574
- if not FACE_RECOGNITION_AVAILABLE:
575
- return {"error": "Face recognition not available"}
576
-
577
- logger.info("Tool call: face_recognition")
578
-
579
- try:
580
- # Get frame from camera worker buffer (like main_works.py)
581
- if deps.camera_worker is not None:
582
- frame = deps.camera_worker.get_latest_frame()
583
- if frame is None:
584
- logger.error("No frame available from camera worker")
585
- return {"error": "No frame available"}
586
- else:
587
- logger.error("Camera worker not available")
588
- return {"error": "Camera worker not available"}
589
-
590
- # Save frame temporarily (same as main_works.py pattern)
591
- temp_path = "/tmp/face_recognition.jpg"
592
- import cv2
593
-
594
- cv2.imwrite(temp_path, frame)
595
-
596
- # Use DeepFace to find face
597
- results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
598
-
599
- if len(results) == 0:
600
- return {"error": "Didn't recognize the face"}
601
-
602
- # Extract name from results
603
- name = "Unknown"
604
- for index, row in results[0].iterrows():
605
- file_path = row["identity"]
606
- name = file_path.split("/")[-2]
607
- break
608
-
609
- return {"answer": f"The name is {name}"}
610
-
611
- except Exception as e:
612
- logger.exception("Face recognition failed")
613
- return {"error": f"Face recognition failed: {str(e)}"}
614
-
615
-
616
  class DoNothing(Tool):
617
  """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
618
 
@@ -636,34 +449,18 @@ class DoNothing(Tool):
636
  return {"status": "doing nothing", "reason": reason}
637
 
638
 
639
- def get_available_emotions_and_descriptions() -> str:
640
- """Get formatted list of available emotions with descriptions."""
641
- if not EMOTION_AVAILABLE:
642
- return "Emotions not available"
643
-
644
- try:
645
- names = RECORDED_MOVES.list_moves()
646
- ret = "Available emotions:\n"
647
- for name in names:
648
- description = RECORDED_MOVES.get(name).description
649
- ret += f" - {name}: {description}\n"
650
- return ret
651
- except Exception as e:
652
- return f"Error getting emotions: {e}"
653
-
654
-
655
  # Registry & specs (dynamic)
656
 
657
  # List of available tool classes
658
- ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in all_concrete_subclasses(Tool)}
659
  ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
660
 
661
 
662
  # Dispatcher
663
  def _safe_load_obj(args_json: str) -> dict[str, Any]:
664
  try:
665
- obj = json.loads(args_json or "{}")
666
- return obj if isinstance(obj, dict) else {}
667
  except Exception:
668
  logger.warning("bad args_json=%r", args_json)
669
  return {}
 
1
  from __future__ import annotations
2
  import abc
3
  import json
 
4
  import asyncio
5
  import inspect
6
  import logging
 
11
  from reachy_mini.utils import create_head_pose
12
 
13
 
 
 
14
  logger = logging.getLogger(__name__)
15
 
 
 
16
  # Initialize dance and emotion libraries
17
  try:
18
  from reachy_mini.motion.recorded_move import RecordedMoves
 
35
  DANCE_AVAILABLE = False
36
  EMOTION_AVAILABLE = False
37
 
38
 
39
+ def get_concrete_subclasses(base):
 
40
  """Recursively find all concrete (non-abstract) subclasses of a base class."""
41
  result = []
42
  for cls in base.__subclasses__():
43
  if not inspect.isabstract(cls):
44
  result.append(cls)
45
  # recurse into subclasses
46
+ result.extend(get_concrete_subclasses(cls))
47
  return result
48
 
49
 
 
61
  camera_worker: Optional[Any] = None # CameraWorker for frame buffering
62
  vision_manager: Optional[Any] = None
63
  head_wobbler: Optional[Any] = None # HeadWobbler for audio-reactive motion
 
 
 
64
  motion_duration_s: float = 1.0
65
 
66
 
 
67
  # Tool base class
68
  class Tool(abc.ABC):
69
  """Base abstraction for tools used in function-calling.
 
157
  return {"status": f"looking {direction}"}
158
 
159
  except Exception as e:
160
+ logger.error("move_head failed")
161
  return {"error": f"move_head failed: {type(e).__name__}: {e}"}
162
 
163
 
 
198
 
199
  # Use vision manager for processing if available
200
  if deps.vision_manager is not None:
201
+ vision_result = await asyncio.to_thread(
202
+ deps.vision_manager.processor.process_image, frame, image_query
203
+ )
204
+ if isinstance(vision_result, dict) and "error" in vision_result:
205
+ return vision_result
206
  return (
207
+ {"image_description": vision_result}
208
+ if isinstance(vision_result, str)
209
+ else {"error": "vision returned non-string"}
210
  )
211
  else:
212
  # Return base64 encoded image like main_works.py camera tool
 
245
  return {"status": f"head tracking {status}"}
246
 
247
248
 
249
  class Dance(Tool):
250
  """Play a named or random dance move once (or repeat). Non-blocking."""
 
335
  """Stop the current dance move."""
336
  logger.info("Tool call: stop_dance")
337
  movement_manager = deps.movement_manager
338
+ movement_manager.clear_move_queue()
339
  return {"status": "stopped dance and cleared queue"}
340
 
341
 
342
+ def get_available_emotions_and_descriptions() -> str:
343
  """Get formatted list of available emotions with descriptions."""
344
+ if not EMOTION_AVAILABLE:
345
+ return "Emotions not available"
346
 
347
+ try:
348
+ emotion_names = RECORDED_MOVES.list_moves()
349
+ output = "Available emotions:\n"
350
+ for name in emotion_names:
351
+ description = RECORDED_MOVES.get(name).description
352
+ output += f" - {name}: {description}\n"
353
+ return output
354
+ except Exception as e:
355
+ return f"Error getting emotions: {e}"
356
 
357
  class PlayEmotion(Tool):
358
  """Play a pre-recorded emotion."""
 
422
  """Stop the current emotion."""
423
  logger.info("Tool call: stop_emotion")
424
  movement_manager = deps.movement_manager
425
+ movement_manager.clear_move_queue()
426
  return {"status": "stopped emotion and cleared queue"}
427
 
428
 
 
 
429
  class DoNothing(Tool):
430
  """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
431
 
 
449
  return {"status": "doing nothing", "reason": reason}
450
 
451
 
 
452
  # Registry & specs (dynamic)
453
 
454
  # List of available tool classes
455
+ ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
456
  ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
457
 
458
 
459
  # Dispatcher
460
  def _safe_load_obj(args_json: str) -> dict[str, Any]:
461
  try:
462
+ parsed_args = json.loads(args_json or "{}")
463
+ return parsed_args if isinstance(parsed_args, dict) else {}
464
  except Exception:
465
  logger.warning("bad args_json=%r", args_json)
466
  return {}
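The registry is built by walking `Tool` subclasses rather than maintaining a hand-written list, so deleting a tool class (as this commit does for face recognition) automatically drops it from `ALL_TOOLS` and `ALL_TOOL_SPECS`. A self-contained sketch of the same discovery pattern, with a simplified hypothetical `Tool` base:

```python
# Discover every concrete subclass of a base class and instantiate it once,
# keyed by its declared name: the same pattern tools.py uses for ALL_TOOLS.
import abc
import inspect

class Tool(abc.ABC):
    name: str

    @abc.abstractmethod
    def __call__(self):
        ...

class Wave(Tool):
    name = "wave"

    def __call__(self):
        return {"status": "waving"}

def get_concrete_subclasses(base):
    result = []
    for cls in base.__subclasses__():
        if not inspect.isabstract(cls):
            result.append(cls)
        result.extend(get_concrete_subclasses(cls))  # recurse into subclasses
    return result

ALL_TOOLS = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
print(list(ALL_TOOLS))  # ['wave']
```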
src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -3,6 +3,7 @@ import argparse
3
  import warnings
4
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
 
6
 
7
 
8
  def parse_args():
@@ -21,26 +22,27 @@ def parse_args():
21
 
22
 
23
  def handle_vision_stuff(args, current_robot):
24
- """Initialize camera, head tracker and camera worker."""
25
  camera_worker = None
26
  head_tracker = None
27
  vision_manager = None
 
28
  if not args.no_camera:
 
29
  if args.head_tracker is not None:
30
  if args.head_tracker == "yolo":
31
- from reachy_mini_conversation_demo.vision.yolo_head_tracker import (
32
- HeadTracker,
33
- )
34
-
35
  head_tracker = HeadTracker()
36
-
37
  elif args.head_tracker == "mediapipe":
38
  from reachy_mini_toolbox.vision import HeadTracker
39
-
40
  head_tracker = HeadTracker()
41
 
 
42
  camera_worker = CameraWorker(current_robot, head_tracker)
43
 
 
 
 
44
  return camera_worker, head_tracker, vision_manager
45
 
46
 
 
3
  import warnings
4
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
6
+ from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager
7
 
8
 
9
  def parse_args():
 
22
 
23
 
24
  def handle_vision_stuff(args, current_robot):
25
+ """Initialize camera, head tracker, camera worker, and vision manager."""
26
  camera_worker = None
27
  head_tracker = None
28
  vision_manager = None
29
+
30
  if not args.no_camera:
31
+ # Initialize head tracker if specified
32
  if args.head_tracker is not None:
33
  if args.head_tracker == "yolo":
34
+ from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker
 
 
 
35
  head_tracker = HeadTracker()
 
36
  elif args.head_tracker == "mediapipe":
37
  from reachy_mini_toolbox.vision import HeadTracker
 
38
  head_tracker = HeadTracker()
39
 
40
+ # Initialize camera worker
41
  camera_worker = CameraWorker(current_robot, head_tracker)
42
 
43
+ # Initialize vision manager (handles model download and configuration)
44
+ vision_manager = initialize_vision_manager(camera_worker)
45
+
46
  return camera_worker, head_tracker, vision_manager
47
 
48
 
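For context, a hedged sketch of how a caller might wire these helpers together; the `ReachyMini()` construction is an assumption, since the demo's actual entry point is not part of this diff:

```python
# Hypothetical wiring of the vision helpers; the real main() is not shown here.
from reachy_mini import ReachyMini
from reachy_mini_conversation_demo.utils import parse_args, handle_vision_stuff

args = parse_args()
robot = ReachyMini()  # assumed constructor; see the reachy_mini package docs

camera_worker, head_tracker, vision_manager = handle_vision_stuff(args, robot)
if camera_worker is None:
    print("Camera disabled (--no-camera); vision tools will be unavailable")
```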
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
2
- import sys
3
  import time
4
  import base64
5
  import asyncio
6
  import logging
7
  import threading
8
- from typing import Any, Dict
9
  from dataclasses import dataclass
10
 
11
  import cv2
@@ -14,6 +13,8 @@ import torch
14
  from transformers import AutoProcessor, AutoModelForImageTextToText
15
  from huggingface_hub import snapshot_download
16
 
 
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
@@ -22,11 +23,9 @@ logger = logging.getLogger(__name__)
22
  class VisionConfig:
23
  """Configuration for vision processing."""
24
 
25
- processor_type: str = "local"
26
- model_path: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
27
  vision_interval: float = 5.0
28
  max_new_tokens: int = 64
29
- temperature: float = 0.7
30
  jpeg_quality: int = 85
31
  max_retries: int = 3
32
  retry_delay: float = 1.0
@@ -36,17 +35,17 @@ class VisionConfig:
36
  class VisionProcessor:
37
  """Handles SmolVLM2 model loading and inference."""
38
 
39
- def __init__(self, config: VisionConfig = None):
40
  """Initialize the vision processor."""
41
- self.config = config or VisionConfig()
42
- self.model_path = self.config.model_path
43
  self.device = self._determine_device()
44
  self.processor = None
45
  self.model = None
46
  self._initialized = False
47
 
48
  def _determine_device(self) -> str:
49
- pref = self.config.device_preference
50
  if pref == "cpu":
51
  return "cpu"
52
  if pref == "cuda":
@@ -61,7 +60,7 @@ class VisionProcessor:
61
  def initialize(self) -> bool:
62
  """Load model and processor onto the selected device."""
63
  try:
64
- logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
65
  self.processor = AutoProcessor.from_pretrained(self.model_path)
66
 
67
  # Select dtype depending on device
@@ -98,13 +97,13 @@ class VisionProcessor:
98
  if not self._initialized:
99
  return "Vision model not initialized"
100
 
101
- for attempt in range(self.config.max_retries):
102
  try:
103
  # Convert to JPEG bytes
104
  success, jpeg_buffer = cv2.imencode(
105
  ".jpg",
106
  cv2_image,
107
- [cv2.IMWRITE_JPEG_QUALITY, self.config.jpeg_quality],
108
  )
109
  if not success:
110
  return "Failed to encode image"
@@ -140,7 +139,7 @@ class VisionProcessor:
140
  generated_ids = self.model.generate(
141
  **inputs,
142
  do_sample=False,
143
- max_new_tokens=self.config.max_new_tokens,
144
  pad_token_id=self.processor.tokenizer.eos_token_id,
145
  )
146
 
@@ -165,17 +164,17 @@ class VisionProcessor:
165
  logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
166
  if self.device == "cuda":
167
  torch.cuda.empty_cache()
168
- if attempt < self.config.max_retries - 1:
169
- time.sleep(self.config.retry_delay * (attempt + 1))
170
  else:
171
  return "GPU out of memory - vision processing failed"
172
 
173
  except Exception as e:
174
  logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
175
- if attempt < self.config.max_retries - 1:
176
- time.sleep(self.config.retry_delay)
177
  else:
178
- return f"Vision processing error after {self.config.max_retries} attempts"
179
 
180
  def _extract_response(self, full_text: str) -> str:
181
  """Extract the assistant's response from the full generated text."""
@@ -194,7 +193,6 @@ class VisionProcessor:
194
  def get_model_info(self) -> Dict[str, Any]:
195
  """Get information about the loaded model."""
196
  return {
197
- "processor_type": "local",
198
  "initialized": self._initialized,
199
  "device": self.device,
200
  "model_path": self.model_path,
@@ -208,14 +206,13 @@ class VisionProcessor:
208
  class VisionManager:
209
  """Manages periodic vision processing and scene understanding."""
210
 
211
- def __init__(self, camera, config: VisionConfig = None):
212
  """Initialize vision manager with camera and configuration."""
213
  self.camera = camera
214
- self.config = config or VisionConfig()
215
- self.vision_interval = self.config.vision_interval
216
- self.processor = create_vision_processor(self.config) # Use factory function
217
 
218
- self._current_description = ""
219
  self._last_processed_time = 0
220
 
221
  # Initialize processor
@@ -230,8 +227,8 @@ class VisionManager:
230
  current_time = time.time()
231
 
232
  if current_time - self._last_processed_time >= self.vision_interval:
233
- success, frame = await asyncio.to_thread(self.camera.read)
234
- if success and frame is not None:
235
  description = await asyncio.to_thread(
236
  lambda: self.processor.process_image(
237
  frame, "Briefly describe what you see in one sentence."
@@ -240,7 +237,6 @@ class VisionManager:
240
 
241
  # Only update if we got a valid response
242
  if description and not description.startswith(("Vision", "Failed", "Error")):
243
- self._current_description = description
244
  self._last_processed_time = current_time
245
 
246
  logger.info(f"Vision update: {description}")
@@ -255,29 +251,6 @@ class VisionManager:
255
 
256
  logger.info("Vision loop finished")
257
 
258
- async def get_current_description(self) -> str:
259
- """Get the most recent scene description (thread-safe)."""
260
- return self._current_description
261
-
262
- async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
263
- """Process current camera frame with custom prompt."""
264
- try:
265
- success, frame = self.camera.read()
266
- if not success or frame is None:
267
- return {"error": "Failed to capture image from camera"}
268
-
269
- description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
270
-
271
- return {
272
- "description": description,
273
- "timestamp": time.time(),
274
- "prompt": prompt,
275
- }
276
-
277
- except Exception as e:
278
- logger.exception("Failed to process current frame")
279
- return {"error": f"Frame processing failed: {str(e)}"}
280
-
281
  async def get_status(self) -> Dict[str, Any]:
282
  """Get comprehensive status information."""
283
  return {
@@ -285,84 +258,59 @@ class VisionManager:
285
  "processor_info": self.processor.get_model_info(),
286
  "config": {
287
  "interval": self.vision_interval,
288
- "processor_type": self.config.processor_type,
289
  },
290
  }
291
 
292
 
293
- def init_camera(camera_index=0, simulation=True):
294
- """Initialize camera (real or simulated)."""
295
- api_preference = cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
296
-
297
- if simulation:
298
- # Default build-in camera in SIM
299
- # TODO: please, test on Linux and Windows
300
- camera = cv2.VideoCapture(0, api_preference)
301
- else:
302
- # TODO handle macos properly
303
- if sys.platform == "darwin":
304
- camera = cv2.VideoCapture(camera_index, cv2.CAP_AVFOUNDATION)
305
- else:
306
- camera = cv2.VideoCapture(camera_index)
307
 
308
- return camera
309
-
310
-
311
- def create_vision_processor(config: VisionConfig):
312
- """Create the appropriate vision processor (factory)."""
313
- if config.processor_type == "openai":
314
- try:
315
- from .openai_vision import OpenAIVisionProcessor
316
 
317
- return OpenAIVisionProcessor(config)
318
- except ImportError:
319
- logger.error("OpenAI vision processor not available, falling back to local")
320
- return VisionProcessor(config)
321
- else:
322
- return VisionProcessor(config)
323
 
324
 
325
- def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
326
- """Initialize vision manager with the specified processor type."""
327
- model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
328
-
329
- cache_dir = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
330
-
331
- # Only download model if using local processor
332
- if processor_type == "local":
333
- try:
334
- os.makedirs(cache_dir, exist_ok=True)
335
- os.environ["HF_HOME"] = cache_dir
336
- logger.info("HF_HOME set to %s", cache_dir)
337
- except Exception as e:
338
- logger.warning("Failed to prepare HF cache dir %s: %s", cache_dir, e)
339
- return None
340
-
341
  snapshot_download(
342
  repo_id=model_id,
343
  repo_type="model",
344
  cache_dir=cache_dir,
345
  )
346
- logger.info(f"Prefetched model_id={model_id} into cache_dir={cache_dir}")
347
-
348
- # Configure vision processing
349
- vision_config = VisionConfig(
350
- processor_type=processor_type,
351
- model_path=model_id,
352
- vision_interval=5.0,
353
- max_new_tokens=64,
354
- temperature=0.7,
355
- jpeg_quality=85,
356
- max_retries=3,
357
- retry_delay=1.0,
358
- device_preference="auto",
359
- )
360
-
361
- vision_manager = VisionManager(camera, vision_config)
362
-
363
- device_info = vision_manager.processor.get_model_info()
364
- logger.info(
365
- f"Vision processing enabled: {device_info.get('model_path', device_info.get('processor_type'))} on {device_info.get('device', 'API')}",
366
- )
367
-
368
- return vision_manager
 
 
 
 
 
1
  import os
 
2
  import time
3
  import base64
4
  import asyncio
5
  import logging
6
  import threading
7
+ from typing import Any, Dict, Optional
8
  from dataclasses import dataclass
9
 
10
  import cv2
 
13
  from transformers import AutoProcessor, AutoModelForImageTextToText
14
  from huggingface_hub import snapshot_download
15
 
16
+ from reachy_mini_conversation_demo.config import config
17
+
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
23
  class VisionConfig:
24
  """Configuration for vision processing."""
25
 
26
+ model_path: str = config.LOCAL_VISION_MODEL
 
27
  vision_interval: float = 5.0
28
  max_new_tokens: int = 64
 
29
  jpeg_quality: int = 85
30
  max_retries: int = 3
31
  retry_delay: float = 1.0
 
35
  class VisionProcessor:
36
  """Handles SmolVLM2 model loading and inference."""
37
 
38
+ def __init__(self, vision_config: VisionConfig = None):
39
  """Initialize the vision processor."""
40
+ self.vision_config = vision_config or VisionConfig()
41
+ self.model_path = self.vision_config.model_path
42
  self.device = self._determine_device()
43
  self.processor = None
44
  self.model = None
45
  self._initialized = False
46
 
47
  def _determine_device(self) -> str:
48
+ pref = self.vision_config.device_preference
49
  if pref == "cpu":
50
  return "cpu"
51
  if pref == "cuda":
 
60
  def initialize(self) -> bool:
61
  """Load model and processor onto the selected device."""
62
  try:
63
+ logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
64
  self.processor = AutoProcessor.from_pretrained(self.model_path)
65
 
66
  # Select dtype depending on device
 
97
  if not self._initialized:
98
  return "Vision model not initialized"
99
 
100
+ for attempt in range(self.vision_config.max_retries):
101
  try:
102
  # Convert to JPEG bytes
103
  success, jpeg_buffer = cv2.imencode(
104
  ".jpg",
105
  cv2_image,
106
+ [cv2.IMWRITE_JPEG_QUALITY, self.vision_config.jpeg_quality],
107
  )
108
  if not success:
109
  return "Failed to encode image"
 
139
  generated_ids = self.model.generate(
140
  **inputs,
141
  do_sample=False,
142
+ max_new_tokens=self.vision_config.max_new_tokens,
143
  pad_token_id=self.processor.tokenizer.eos_token_id,
144
  )
145
 
 
164
  logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
165
  if self.device == "cuda":
166
  torch.cuda.empty_cache()
167
+ if attempt < self.vision_config.max_retries - 1:
168
+ time.sleep(self.vision_config.retry_delay * (attempt + 1))
169
  else:
170
  return "GPU out of memory - vision processing failed"
171
 
172
  except Exception as e:
173
  logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
174
+ if attempt < self.vision_config.max_retries - 1:
175
+ time.sleep(self.vision_config.retry_delay)
176
  else:
177
+ return f"Vision processing error after {self.vision_config.max_retries} attempts"
178
 
179
  def _extract_response(self, full_text: str) -> str:
180
  """Extract the assistant's response from the full generated text."""
 
193
  def get_model_info(self) -> Dict[str, Any]:
194
  """Get information about the loaded model."""
195
  return {
 
196
  "initialized": self._initialized,
197
  "device": self.device,
198
  "model_path": self.model_path,
 
206
  class VisionManager:
207
  """Manages periodic vision processing and scene understanding."""
208
 
209
+ def __init__(self, camera, vision_config: VisionConfig = None):
210
  """Initialize vision manager with camera and configuration."""
211
  self.camera = camera
212
+ self.vision_config = vision_config or VisionConfig()
213
+ self.vision_interval = self.vision_config.vision_interval
214
+ self.processor = VisionProcessor(self.vision_config)
215
 
 
216
  self._last_processed_time = 0
217
 
218
  # Initialize processor
 
227
  current_time = time.time()
228
 
229
  if current_time - self._last_processed_time >= self.vision_interval:
230
+ frame = self.camera.get_latest_frame()
231
+ if frame is not None:
232
  description = await asyncio.to_thread(
233
  lambda: self.processor.process_image(
234
  frame, "Briefly describe what you see in one sentence."
 
237
 
238
  # Only update if we got a valid response
239
  if description and not description.startswith(("Vision", "Failed", "Error")):
 
240
  self._last_processed_time = current_time
241
 
242
  logger.info(f"Vision update: {description}")
 
251
 
252
  logger.info("Vision loop finished")
253
 
254
  async def get_status(self) -> Dict[str, Any]:
255
  """Get comprehensive status information."""
256
  return {
 
258
  "processor_info": self.processor.get_model_info(),
259
  "config": {
260
  "interval": self.vision_interval,
 
261
  },
262
  }
263
 
264
 
265
+ def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
266
+ """Initialize vision manager with model download and configuration.
267
 
268
+ Args:
269
+ camera_worker: CameraWorker instance for frame capture
270
+ Returns:
271
+ VisionManager instance or None if initialization fails
272
 
273
+ """
274
+ try:
275
+ model_id = config.LOCAL_VISION_MODEL
276
+ cache_dir = os.path.expanduser(config.HF_HOME)
 
 
277
 
278
+ # Prepare cache directory
279
+ os.makedirs(cache_dir, exist_ok=True)
280
+ os.environ["HF_HOME"] = cache_dir
281
+ logger.info("HF_HOME set to %s", cache_dir)
282
 
283
+ # Download model to cache
284
+ logger.info(f"Downloading vision model {model_id} to cache...")
285
  snapshot_download(
286
  repo_id=model_id,
287
  repo_type="model",
288
  cache_dir=cache_dir,
289
  )
290
+ logger.info(f"Model {model_id} downloaded to {cache_dir}")
291
+
292
+ # Configure vision processing
293
+ vision_config = VisionConfig(
294
+ model_path=model_id,
295
+ vision_interval=5.0,
296
+ max_new_tokens=64,
297
+ jpeg_quality=85,
298
+ max_retries=3,
299
+ retry_delay=1.0,
300
+ device_preference="auto",
301
+ )
302
+
303
+ # Initialize vision manager
304
+ vision_manager = VisionManager(camera_worker, vision_config)
305
+
306
+ # Log device info
307
+ device_info = vision_manager.processor.get_model_info()
308
+ logger.info(
309
+ f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
310
+ )
311
+
312
+ return vision_manager
313
+
314
+ except Exception as e:
315
+ logger.error(f"Failed to initialize vision manager: {e}")
316
+ return None
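The helper above depends on config.LOCAL_VISION_MODEL and config.HF_HOME from reachy_mini_conversation_demo.config, which is outside this diff. A minimal sketch of the shape that module is assumed to have, with environment-backed values; the class name and defaults here are illustrative only, not the project's actual implementation:

    import os

    class _Config:
        # Assumed shape of reachy_mini_conversation_demo.config (illustrative only)
        LOCAL_VISION_MODEL: str = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        HF_HOME: str = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))

    config = _Config()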
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -94,77 +94,6 @@ class HeadTracker:
94
 
95
  return np.array([norm_x, norm_y], dtype=np.float32)
96
 
97
- def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
98
- """Get eye positions (approximated from face bbox).
99
-
100
- Note: YOLO only provides face bbox, so we estimate eye positions
101
-
102
- Args:
103
- img: Input image
104
-
105
- Returns:
106
- Tuple of (left_eye, right_eye) in [-1, 1] coordinates
107
-
108
- """
109
- h, w = img.shape[:2]
110
-
111
- # Run YOLO inference
112
- results = self.model(img, verbose=False)
113
- detections = Detections.from_ultralytics(results[0])
114
-
115
- # Select best face
116
- face_idx = self._select_best_face(detections)
117
- if face_idx is None:
118
- return None, None
119
-
120
- bbox = detections.xyxy[face_idx]
121
-
122
- # Estimate eye positions from face bbox (approximate locations)
123
- face_width = bbox[2] - bbox[0]
124
- face_height = bbox[3] - bbox[1]
125
-
126
- # Eye positions are roughly at 1/3 and 2/3 of face width, 1/3 of face height
127
- eye_y = bbox[1] + face_height * 0.35
128
- left_eye_x = bbox[0] + face_width * 0.35
129
- right_eye_x = bbox[0] + face_width * 0.65
130
-
131
- # Convert to MediaPipe coordinates
132
- left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
133
- right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
134
-
135
- return left_eye, right_eye
136
-
137
- def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
138
- """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
139
- if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
140
- raise ValueError("Face landmarks object missing required attributes")
141
-
142
- bbox = face_landmarks._bbox
143
- h, w = face_landmarks._img_shape[:2]
144
-
145
- # Estimate eyes from stored bbox
146
- face_width = bbox[2] - bbox[0]
147
- face_height = bbox[3] - bbox[1]
148
-
149
- eye_y = bbox[1] + face_height * 0.35
150
- left_eye_x = bbox[0] + face_width * 0.35
151
- right_eye_x = bbox[0] + face_width * 0.65
152
-
153
- left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
154
- right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
155
-
156
- return left_eye, right_eye
157
-
158
- def get_eye_center(self, face_landmarks) -> np.ndarray:
159
- """Get center point between estimated eyes."""
160
- left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
161
- return np.mean([left_eye, right_eye], axis=0)
162
-
163
- def get_roll(self, face_landmarks) -> float:
164
- """Estimate roll from eye positions (will be 0 for YOLO since we estimate symmetric eyes)."""
165
- left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
166
- return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
167
-
168
  def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
169
  """Get head position from face detection.
170
 
@@ -204,18 +133,3 @@ class HeadTracker:
204
  except Exception as e:
205
  logger.error(f"Error in head position detection: {e}")
206
  return None, None
207
-
208
- def cleanup(self):
209
- """Clean up resources."""
210
- if hasattr(self, "model"):
211
- del self.model
212
- logger.info("YOLO model cleaned up")
213
-
214
-
215
- class FaceLandmarks:
216
- """Simple container for face detection results to maintain API compatibility."""
217
-
218
- def __init__(self, bbox: np.ndarray, img_shape: tuple):
219
- """Initialize with bounding box and image shape."""
220
- self._bbox = bbox
221
- self._img_shape = img_shape
 
94
 
95
  return np.array([norm_x, norm_y], dtype=np.float32)
96
 
97
  def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
98
  """Get head position from face detection.
99
 
 
133
  except Exception as e:
134
  logger.error(f"Error in head position detection: {e}")
135
  return None, None
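With the landmark-estimation helpers removed, get_head_position is the tracker's remaining public entry point: it takes an OpenCV frame and returns a position (or (None, None) when no face is detected). A minimal caller sketch, assuming a plain webcam capture; the semantics of the second return value are not shown in this excerpt:

    import cv2
    from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker

    tracker = HeadTracker()
    cap = cv2.VideoCapture(0)
    ok, frame = cap.read()
    if ok:
        position, extra = tracker.get_head_position(frame)
        if position is not None:
            print(position)  # normalized image coordinates, [-1, 1] per the helper above
    cap.release()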
uv.lock CHANGED
The diff for this file is too large to render. See raw diff