Merge branch 'develop' into 62-appify-the-demo
- .github/workflows/tests.yml +74 -0
- .github/workflows/typecheck.yml +29 -0
- .gitignore +1 -0
- README.md +16 -8
- docs/assets/conversation_app_arch.svg +3 -0
- {src/reachy_mini_conversation_demo/images → docs/assets}/reachy_mini_dance.gif +0 -0
- docs/scheme.mmd +58 -0
- pyproject.toml +25 -10
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/__init__.py +0 -0
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/__init__.py +0 -0
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/head_wobbler.py +15 -13
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/speech_tapper.py +11 -9
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/camera_worker.py +25 -26
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/config.py +15 -26
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/console.py +22 -11
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/dance_emotion_moves.py +21 -18
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/reachymini_avatar.png +0 -0
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/user_avatar.png +0 -0
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/main.py +12 -11
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/moves.py +29 -28
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/openai_realtime.py +203 -83
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/prompts.py +0 -0
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/tools.py +36 -34
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/utils.py +12 -10
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/__init__.py +0 -0
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/processors.py +17 -15
- src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/yolo_head_tracker.py +15 -9
- tests/audio/test_head_wobbler.py +4 -3
- tests/test_openai_realtime.py +117 -0
- uv.lock +0 -0
.github/workflows/tests.yml
ADDED

@@ -0,0 +1,74 @@
+name: Tests
+on:
+  push:
+  pull_request:
+
+permissions:
+  contents: read
+  actions: write
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  tests:
+    name: pytest (py${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12"]
+
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_HUB_ETAG_TIMEOUT: "120"
+      HF_HUB_DOWNLOAD_TIMEOUT: "120"
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - uses: astral-sh/setup-uv@v5
+
+      - name: Set HF_HOME
+        shell: bash
+        run: |
+          echo "HF_HOME=${RUNNER_TEMP}/.hf" >> "$GITHUB_ENV"
+          mkdir -p "${RUNNER_TEMP}/.hf"
+
+      - name: Cache Hugging Face hub
+        uses: actions/cache@v4
+        with:
+          path: ${{ runner.temp }}/.hf
+          key: hf-${{ runner.os }}-${{ hashFiles('uv.lock', 'pyproject.toml') }}
+          restore-keys: hf-${{ runner.os }}-
+
+      # test-only .env file
+      - name: Create test .env
+        run: |
+          printf "OPENAI_API_KEY=test-dummy\n" > .env
+
+      - name: Install (locked)
+        run: |
+          uv sync --frozen --group dev --extra all_vision
+
+      # Prefetch HF dataset to avoid download during test collection
+      - name: Prefetch HF dataset
+        run: |
+          uv run python - <<'PY'
+          from huggingface_hub import snapshot_download
+          snapshot_download(
+              repo_id="pollen-robotics/reachy-mini-emotions-library",
+              repo_type="dataset",
+              etag_timeout=120,
+              max_workers=4
+          )
+          PY
+
+      - name: Run tests
+        run: uv run pytest -q
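The prefetch step warms the same cache that the `actions/cache` step persists, so test collection never has to hit the network. A rough local equivalent is sketched below; the `HF_HOME` path is illustrative, and `snapshot_download` returns the resolved snapshot directory once the files are cached.

```python
import os

# Mirror the workflow: point HF_HOME at a scratch cache before the hub import,
# since huggingface_hub reads it at import time.
os.environ.setdefault("HF_HOME", "/tmp/.hf")

from huggingface_hub import snapshot_download

path = snapshot_download(
    repo_id="pollen-robotics/reachy-mini-emotions-library",
    repo_type="dataset",
    etag_timeout=120,
)
print(path)  # resolves under $HF_HOME/hub once downloaded
```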
.github/workflows/typecheck.yml
ADDED

@@ -0,0 +1,29 @@
+name: Type check
+
+on: [push, pull_request]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - uses: astral-sh/setup-uv@v5
+
+      - name: Install deps (locked) incl. vision extras
+        run: uv sync --frozen --group dev --extra all_vision
+
+      - name: Run mypy
+        run: uv run mypy --pretty --show-error-codes .
.gitignore
CHANGED

@@ -29,6 +29,7 @@ coverage.xml
 
 # Linting and formatting
 .ruff_cache/
+.mypy_cache/
 
 # IDE
 .vscode/
README.md
CHANGED

@@ -9,11 +9,19 @@ tags:
 - reachy_mini
 ---
 
-# Reachy Mini conversation
+# Reachy Mini conversation app
 
-Conversational
+Conversational app for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
 
-
+
+
+## Architecture
+
+The app follows a layered architecture connecting the user, AI services, and robot hardware:
+
+<p align="center">
+  <img src="docs/assets/conversation_app_arch.svg" alt="Architecture Diagram" width="600"/>
+</p>
 
 ## Overview
 - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.

@@ -94,12 +102,12 @@ … are large and require compatible CUDA or CPU builds
 | `HF_TOKEN` | Optional token for Hugging Face models (only used with `--local-vision` flag, falls back to `huggingface-cli login`). |
 | `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (only used with `--local-vision` flag, defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`). |
 
-## Running the demo
+## Running the app
 
 Activate your virtual environment, ensure the Reachy Mini robot (or simulator) is reachable, then launch:
 
 ```bash
-reachy-mini-conversation-demo
+reachy-mini-conversation-app
 ```
 
 By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). With a camera attached, vision is handled by the gpt-realtime model when the camera tool is used. For local vision processing, use the `--local-vision` flag to process frames periodically using the SmolVLM2 model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed.

@@ -119,19 +127,19 @@ By default, the app runs in console mode for direct audio interaction. Use the `
 - Run on hardware with MediaPipe face tracking:
 
 ```bash
-reachy-mini-conversation-demo --head-tracker mediapipe
+reachy-mini-conversation-app --head-tracker mediapipe
 ```
 
 - Run with local vision processing (requires `local_vision` extra):
 
 ```bash
-reachy-mini-conversation-demo --local-vision
+reachy-mini-conversation-app --local-vision
 ```
 
 - Disable the camera pipeline (audio-only conversation):
 
 ```bash
-reachy-mini-conversation-demo --no-camera
+reachy-mini-conversation-app --no-camera
 ```
 
 ## LLM tools exposed to the assistant
docs/assets/conversation_app_arch.svg
ADDED

Git LFS Details

{src/reachy_mini_conversation_demo/images → docs/assets}/reachy_mini_dance.gif
RENAMED

File without changes
docs/scheme.mmd
ADDED

@@ -0,0 +1,58 @@
+---
+config:
+  layout: dagre
+  flowchart:
+    htmlLabels: true
+---
+flowchart TB
+    User(["<span style='font-size:16px;font-weight:bold;'>User</span><br><span style='font-size:13px;color:#01579b;'>Person interacting with system</span>"])
+    -- audio stream -->
+    UI@{ label: "<span style='font-size:16px;font-weight:bold;'>UI Layer</span><br><span style='font-size:13px;color:#0277bd;'>Gradio/Console</span>" }
+
+    UI -- audio stream -->
+    OpenAI@{ label: "<span style='font-size:17px;font-weight:bold;'>gpt-realtime API</span><br><span style='font-size:13px; color:#7b1fa2;'>Audio+Tool Calls+Vision</span>" }
+
+    OpenAI -- audio stream -->
+    Motion@{ label: "<span style='font-size:16px;font-weight:bold;'>Motion Control</span><br><span style='font-size:13px;color:#f57f17;'>Audio Sync + Tracking</span>" }
+
+    OpenAI -- tool calls -->
+    Handlers@{ label: "<span style='font-size:16px;font-weight:bold;'>Tool Handlers</span><br><span style='font-size:12px;color:#f9a825;'>move_head, camera, head_tracking,<br/>dance, play_emotion, do_nothing</span>" }
+
+    Handlers -- movement
+    requests --> Motion
+
+    Handlers -- camera frames, face tracking -->
+    Camera@{ label: "<span style='font-size:16px;font-weight:bold;'>Camera Worker</span><br><span style='font-size:13px;color:#f57f17;'>Frame Buffer + Face Tracking</span>" }
+
+    Handlers -. image for
+    analysis .-> OpenAI
+
+    Camera -- face tracking --> Motion
+
+    Camera -. frames .->
+    Vision@{ label: "<span style='font-size:16px;font-weight:bold;'>Vision Processor</span><br><span style='font-size:13px;color:#7b1fa2;'>Local VLM (optional)</span>" }
+
+    Vision -. description .-> Handlers
+
+    Robot@{ label: "<span style='font-size:16px;font-weight:bold;'>reachy_mini</span><br><span style='font-size:13px;color:#c62828;'>Robot Control Library</span>" }
+    -- camera
+    frames --> Camera
+
+    Motion -- commands --> Robot
+
+    Handlers -- results --> OpenAI
+
+    User:::userStyle
+    UI:::uiStyle
+    OpenAI:::aiStyle
+    Motion:::coreStyle
+    Handlers:::toolStyle
+    Camera:::coreStyle
+    Vision:::aiStyle
+    Robot:::hardwareStyle
+    classDef userStyle fill:#e1f5fe,stroke:#01579b,stroke-width:3px
+    classDef uiStyle fill:#b3e5fc,stroke:#0277bd,stroke-width:2px
+    classDef aiStyle fill:#e1bee7,stroke:#7b1fa2,stroke-width:3px
+    classDef coreStyle fill:#fff9c4,stroke:#f57f17,stroke-width:2px
+    classDef hardwareStyle fill:#ef9a9a,stroke:#c62828,stroke-width:3px
+    classDef toolStyle fill:#fffde7,stroke:#f9a825,stroke-width:1px
pyproject.toml
CHANGED

@@ -3,7 +3,7 @@ requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "reachy_mini_conversation_demo"
+name = "reachy_mini_conversation_app"
 version = "0.1.0"
 authors = [{ name = "Pollen Robotics", email = "contact@pollen-robotics.com" }]
 description = ""

@@ -12,7 +12,7 @@ requires-python = ">=3.10"
 dependencies = [
     #Media
     "aiortc>=1.13.0",
-    "fastrtc…
+    "fastrtc>=0.0.33",
     "gradio>=5.49.0",
     "huggingface_hub>=0.34.4",
     "opencv-python>=4.12.0.88",

@@ -23,7 +23,7 @@ dependencies = [
     #OpenAI
     "openai>=2.1",
 
-    #Reachy mini…
+    #Reachy mini
     "reachy_mini_dances_library",
     "reachy_mini_toolbox",
     "reachy_mini>=1.0.0.rc5",

@@ -34,16 +34,23 @@ local_vision = ["torch", "transformers", "num2words"]
 yolo_vision = ["ultralytics", "supervision"]
 mediapipe_vision = ["mediapipe>=0.10.14"]
 all_vision = [
-    …
-    …
-    …
+    "torch", "transformers", "num2words",
+    "ultralytics", "supervision",
+    "mediapipe>=0.10.14",
 ]
 
 [dependency-groups]
-dev = […
+dev = [
+    "pytest",
+    "pytest-asyncio",
+    "ruff==0.12.0",
+    "mypy==1.18.2",
+    "pre-commit",
+    "types-requests",
+]
 
 [project.scripts]
-reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
+reachy-mini-conversation-app = "reachy_mini_conversation_app.main:main"
 
 [project.entry-points."reachy_mini_apps"]
 reachy_mini_conversation_demo_app = "reachy_mini_conversation_demo.main:ReachyMiniConversationDemo"

@@ -56,7 +63,7 @@ include-package-data = true
 where = ["src"]
 
 [tool.setuptools.package-data]
-…
+reachy_mini_conversation_app = ["images/*"]

@@ -82,7 +89,7 @@ ignore = [
 length-sort = true
 lines-after-imports = 2
 no-lines-before = ["standard-library", "local-folder"]
-known-local-folder = ["reachy_mini_conversation_demo"]
+known-local-folder = ["reachy_mini_conversation_app"]
 known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
 split-on-trailing-comma = true

@@ -91,3 +98,11 @@ quote-style = "double"
 indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"
+
+[tool.mypy]
+python_version = "3.12"
+files = ["src/"]
+ignore_missing_imports = true
+strict = true
+show_error_codes = true
+warn_unused_ignores = true
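The new `[tool.mypy]` table runs strict mode over `src/`, which is what drives most of the annotation churn in the renamed modules below; strict mode rejects, for example, implicit-Optional defaults like `start_head_pose: np.ndarray = None`. A minimal sketch of the pattern (the function and values here are hypothetical, not from the repo):

```python
import numpy as np
from numpy.typing import NDArray

# Rejected under strict mypy: a None default on a non-Optional parameter.
# def goto(target: np.ndarray = None) -> None: ...

# Accepted: the Optional-ness is spelled out and the dtype is pinned.
def goto(target: NDArray[np.float32] | None = None) -> None:
    if target is None:
        target = np.eye(4, dtype=np.float32)  # hypothetical neutral pose
```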
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/__init__.py
RENAMED

File without changes

src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/__init__.py
RENAMED

File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/head_wobbler.py
RENAMED

@@ -5,11 +5,13 @@ import queue
 import base64
 import logging
 import threading
-from typing import Tuple
+from typing import Tuple
+from collections.abc import Callable
 
 import numpy as np
+from numpy.typing import NDArray
 
-from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
+from reachy_mini_conversation_app.audio.speech_tapper import HOP_MS, SwayRollRT
 
 
 SAMPLE_RATE = 24000

@@ -20,13 +22,13 @@ logger = logging.getLogger(__name__)
 class HeadWobbler:
     """Converts audio deltas (base64) into head movement offsets."""
 
-    def __init__(self, set_speech_offsets):
+    def __init__(self, set_speech_offsets: Callable[[Tuple[float, float, float, float, float, float]], None]) -> None:
         """Initialize the head wobbler."""
         self._apply_offsets = set_speech_offsets
-        self._base_ts: …
+        self._base_ts: float | None = None
         self._hops_done: int = 0
 
-        self.audio_queue: queue.Queue[Tuple[int, int, np.…
+        self.audio_queue: "queue.Queue[Tuple[int, int, NDArray[np.int16]]]" = queue.Queue()
        self.sway = SwayRollRT()
 
         # Synchronization primitives

@@ -35,7 +37,7 @@ class HeadWobbler:
         self._generation = 0
 
         self._stop_event = threading.Event()
-        self._thread: …
+        self._thread: threading.Thread | None = None
 
     def feed(self, delta_b64: str) -> None:
         """Thread-safe: push audio into the consumer queue."""

@@ -78,14 +80,14 @@
             if chunk_generation != current_generation:
                 continue
 
-            pcm = np.asarray(chunk).squeeze(0)
-            with self._sway_lock:
-                results = self.sway.feed(pcm, sr)
-
             if self._base_ts is None:
                 with self._state_lock:
                     if self._base_ts is None:
-                        self._base_ts = time.…
+                        self._base_ts = time.monotonic()
+
+            pcm = np.asarray(chunk).squeeze(0)
+            with self._sway_lock:
+                results = self.sway.feed(pcm, sr)
 
             i = 0
             while i < len(results):

@@ -96,14 +98,14 @@
                 hops_done = self._hops_done
 
             if base_ts is None:
-                base_ts = time.…
+                base_ts = time.monotonic()
                 with self._state_lock:
                     if self._base_ts is None:
                         self._base_ts = base_ts
                         hops_done = self._hops_done
 
             target = base_ts + MOVEMENT_LATENCY_S + hops_done * hop_dt
-            now = time.…
+            now = time.monotonic()
 
             if now - target >= hop_dt:
                 lag_hops = int((now - target) / hop_dt)
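The string-quoted `queue.Queue[...]` annotation above keeps the element type visible to mypy while staying a plain string at runtime. A self-contained sketch of the pattern; the tuple layout mirrors the (generation, sample rate, chunk) triple the wobbler queues:

```python
import queue
from typing import Tuple

import numpy as np
from numpy.typing import NDArray

# (generation, sample_rate, pcm_chunk): the same triple head_wobbler queues.
audio_queue: "queue.Queue[Tuple[int, int, NDArray[np.int16]]]" = queue.Queue()

audio_queue.put((0, 24000, np.zeros((1, 480), dtype=np.int16)))
generation, sr, chunk = audio_queue.get()
```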
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/audio/speech_tapper.py
RENAMED

@@ -1,10 +1,11 @@
 from __future__ import annotations
 import math
-from typing import Dict, List
+from typing import Any, Dict, List
 from itertools import islice
 from collections import deque
 
 import numpy as np
+from numpy.typing import NDArray
 
 
 # Tunables

@@ -48,7 +49,7 @@ SWAY_ATTACK_FR = max(1, int(SWAY_ATTACK_MS / HOP_MS))
 SWAY_RELEASE_FR = max(1, int(SWAY_RELEASE_MS / HOP_MS))
 
 
-def _rms_dbfs(x: np.ndarray) -> float:
+def _rms_dbfs(x: NDArray[np.float32]) -> float:
     """Root-mean-square in dBFS for float32 mono array in [-1,1]."""
     # numerically stable rms (avoid overflow)
     x = x.astype(np.float32, copy=False)

@@ -66,7 +67,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
     return t**LOUDNESS_GAMMA if LOUDNESS_GAMMA != 1.0 else t
 
 
-def _to_float32_mono(x: np.ndarray) -> np.ndarray:
+def _to_float32_mono(x: NDArray[Any]) -> NDArray[np.float32]:
     """Convert arbitrary PCM array to float32 mono in [-1,1].
 
     Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).

@@ -94,7 +95,7 @@ def _to_float32_mono(x: np.ndarray) -> np.ndarray:
     return a.astype(np.float32) / (scale if scale != 0.0 else 1.0)
 
 
-def _resample_linear(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
+def _resample_linear(x: NDArray[np.float32], sr_in: int, sr_out: int) -> NDArray[np.float32]:
     """Lightweight linear resampler for short buffers."""
     if sr_in == sr_out or x.size == 0:
         return x

@@ -118,8 +119,8 @@ class SwayRollRT:
     def __init__(self, rng_seed: int = 7):
         """Initialize state."""
         self._seed = int(rng_seed)
-        self.samples = deque(maxlen=10 * SR)  # sliding window for VAD/env
-        self.carry = np.zeros(0, dtype=np.float32)
+        self.samples: deque[float] = deque(maxlen=10 * SR)  # sliding window for VAD/env
+        self.carry: NDArray[np.float32] = np.zeros(0, dtype=np.float32)
 
         self.vad_on = False
         self.vad_above = 0

@@ -150,7 +151,7 @@ class SwayRollRT:
         self.sway_down = 0
         self.t = 0.0
 
-    def feed(self, pcm: …
+    def feed(self, pcm: NDArray[Any], sr: int | None) -> List[Dict[str, float]]:
         """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
 
         Args:

@@ -177,7 +178,8 @@
 
         while self.carry.size >= HOP:
             hop = self.carry[:HOP]
-            self.carry = self.carry[HOP:]
+            remaining: NDArray[np.float32] = self.carry[HOP:]
+            self.carry = remaining
 
             # keep sliding window for VAD/env computation
             # (deque accepts any iterable; list() for small HOP is fine)

@@ -260,7 +262,7 @@
                     "x_mm": x_mm,
                     "y_mm": y_mm,
                     "z_mm": z_mm,
-                }
+                },
             )
 
         return out
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/camera_worker.py
RENAMED

@@ -9,10 +9,10 @@ Ported from main_works.py camera_worker() function to provide:
 import time
 import logging
 import threading
-from typing import Optional, Tuple
+from typing import Any, List, Tuple
 
-import cv2
 import numpy as np
+from numpy.typing import NDArray
 from scipy.spatial.transform import Rotation as R
 
 from reachy_mini import ReachyMini

@@ -25,20 +25,20 @@ logger = logging.getLogger(__name__)
 class CameraWorker:
     """Thread-safe camera worker with frame buffering and face tracking."""
 
-    def __init__(self, reachy_mini: ReachyMini, head_tracker=None):
+    def __init__(self, reachy_mini: ReachyMini, head_tracker: Any = None) -> None:
         """Initialize."""
         self.reachy_mini = reachy_mini
         self.head_tracker = head_tracker
 
         # Thread-safe frame storage
-        self.latest_frame: Optional[np.ndarray] = None
+        self.latest_frame: NDArray[np.uint8] | None = None
         self.frame_lock = threading.Lock()
         self._stop_event = threading.Event()
-        self._thread: Optional[threading.Thread] = None
+        self._thread: threading.Thread | None = None
 
         # Face tracking state
         self.is_head_tracking_enabled = True
-        self.face_tracking_offsets = [
+        self.face_tracking_offsets: List[float] = [
             0.0,
             0.0,
             0.0,

@@ -49,31 +49,30 @@
         self.face_tracking_lock = threading.Lock()
 
         # Face tracking timing variables (same as main_works.py)
-        self.last_face_detected_time: Optional[float] = None
-        self.interpolation_start_time: Optional[float] = None
-        self.interpolation_start_pose: Optional[np.ndarray] = None
+        self.last_face_detected_time: float | None = None
+        self.interpolation_start_time: float | None = None
+        self.interpolation_start_pose: NDArray[np.float32] | None = None
         self.face_lost_delay = 2.0  # seconds to wait before starting interpolation
         self.interpolation_duration = 1.0  # seconds to interpolate back to neutral
 
         # Track state changes
         self.previous_head_tracking_state = self.is_head_tracking_enabled
 
-    def get_latest_frame(self) -> Optional[np.ndarray]:
+    def get_latest_frame(self) -> NDArray[np.uint8] | None:
         """Get the latest frame (thread-safe)."""
         with self.frame_lock:
             if self.latest_frame is None:
                 return None
-            …
-            …
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        return frame
+            # Return a copy in original BGR format (OpenCV native)
+            return self.latest_frame.copy()
 
     def get_face_tracking_offsets(
         self,
     ) -> Tuple[float, float, float, float, float, float]:
         """Get current face tracking offsets (thread-safe)."""
         with self.face_tracking_lock:
-            …
+            offsets = self.face_tracking_offsets
+            return (offsets[0], offsets[1], offsets[2], offsets[3], offsets[4], offsets[5])
 
     def set_head_tracking_enabled(self, enabled: bool) -> None:
         """Enable/disable head tracking."""

@@ -168,12 +167,11 @@
                     rotation[2],  # roll, pitch, yaw
                 ]
 
-                # …
-                # …
-                if …
-                    # …
-                    # …
-                    pass
+                # No face detected while tracking enabled - set face lost timestamp
+                elif self.last_face_detected_time is None or self.last_face_detected_time == current_time:
+                    # Only update if we haven't already set a face lost time
+                    # (current_time check prevents overriding the disable-triggered timestamp)
+                    pass
 
                 # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
                 if self.last_face_detected_time is not None:

@@ -188,11 +186,12 @@
                     current_translation = self.face_tracking_offsets[:3]
                     current_rotation_euler = self.face_tracking_offsets[3:]
                     # Convert to 4x4 pose matrix
-                    …
-                    …
-                    …
-                        "xyz", current_rotation_euler
+                    pose_matrix = np.eye(4, dtype=np.float32)
+                    pose_matrix[:3, 3] = current_translation
+                    pose_matrix[:3, :3] = R.from_euler(
+                        "xyz", current_rotation_euler,
                     ).as_matrix()
+                    self.interpolation_start_pose = pose_matrix
 
                 # Calculate interpolation progress (t from 0 to 1)
                 elapsed_interpolation = current_time - self.interpolation_start_time

@@ -200,7 +199,7 @@
 
                 # Interpolate between current pose and neutral pose
                 interpolated_pose = linear_pose_interpolation(
-                    self.interpolation_start_pose, neutral_pose, t
+                    self.interpolation_start_pose, neutral_pose, t,
                 )
 
                 # Extract translation and rotation from interpolated pose
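Since the `cv2.cvtColor` call is gone, `get_latest_frame()` now returns the OpenCV-native BGR buffer, and callers convert only when they actually need RGB. A hedged sketch of the call-site pattern, where `worker` stands in for a constructed `CameraWorker`:

```python
import cv2

# 'worker' is a stand-in for a constructed CameraWorker instance.
# Frames come back in BGR; convert at the edge for RGB consumers such as a VLM.
frame_bgr = worker.get_latest_frame()
if frame_bgr is not None:
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
```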
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/config.py
RENAMED

@@ -1,44 +1,33 @@
 import os
 import logging
-from pathlib import Path
 
-from dotenv import load_dotenv
+from dotenv import find_dotenv, load_dotenv
 
 
 logger = logging.getLogger(__name__)
 
-# …
-
-# env_file = Path(".env")
-# if not env_file.exists():
-#     raise RuntimeError(
-#         ".env file not found. Please create one based on .env.example:\n"
-#         "    cp .env.example .env\n"
-#         "Then add your OPENAI_API_KEY to the .env file."
-#     )
+# Locate .env file (search upward from current working directory)
+dotenv_path = find_dotenv(usecwd=True)
 
-# …
-…
-…
-…
-…
-logger.info("Configuration loaded from .env file")
+if dotenv_path:
+    # Load .env and override environment variables
+    load_dotenv(dotenv_path=dotenv_path, override=True)
+    logger.info(f"Configuration loaded from {dotenv_path}")
+else:
+    logger.warning("No .env file found, using environment variables")
 
 
 class Config:
-    """Configuration class for the conversation demo."""
+    """Configuration class for the conversation app."""
 
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-    if OPENAI_API_KEY …
-        raise RuntimeError(
-            "OPENAI_API_KEY is not set in .env file. Please add it:\n"
-            "  OPENAI_API_KEY=your_api_key_here"
-        )
-    if not OPENAI_API_KEY.strip():
+    if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
         raise RuntimeError(
-            "OPENAI_API_KEY is …
+            "OPENAI_API_KEY is missing or empty.\n"
+            "Either:\n"
+            "  1. Create a .env file with: OPENAI_API_KEY=your_api_key_here (recommended)\n"
+            "  2. Set environment variable: export OPENAI_API_KEY=your_api_key_here"
        )
 
     # Optional
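With this loader, a one-line `.env` anywhere at or above the working directory is enough. A minimal sketch of the lookup; the key value here is a placeholder, not a real credential:

```python
from pathlib import Path

from dotenv import find_dotenv, load_dotenv

# Write a placeholder .env, then resolve it the same way config.py does.
Path(".env").write_text("OPENAI_API_KEY=sk-placeholder\n")

dotenv_path = find_dotenv(usecwd=True)  # walks up from the cwd, "" if absent
load_dotenv(dotenv_path=dotenv_path, override=True)
```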
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/console.py
RENAMED

@@ -3,14 +3,16 @@
 records mic frames to the handler and plays handler audio frames to the speaker.
 """
 
+import time
 import asyncio
 import logging
+from typing import List
 
-import librosa
 from fastrtc import AdditionalOutputs, audio_to_int16, audio_to_float32
+from librosa import resample
 
 from reachy_mini import ReachyMini
-from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
 
 
 logger = logging.getLogger(__name__)

@@ -24,15 +26,21 @@ class LocalStream:
         self.handler = handler
         self._robot = robot
         self._stop_event = asyncio.Event()
-        self._tasks = []
+        self._tasks: List[asyncio.Task[None]] = []
         # Allow the handler to flush the player queue when appropriate.
-        self.handler._clear_queue = self.clear_audio_queue
+        self.handler._clear_queue = self.clear_audio_queue
+
+        # Hack to avoid the first lengthy call to resample at runtime.
+        # This is likely caused by cache initialization overhead.
+        import numpy as np
+        resample(np.array([0.0]), orig_sr=1, target_sr=1)
 
     def launch(self) -> None:
         """Start the recorder/player and run the async processing loops."""
         self._stop_event.clear()
         self._robot.media.start_recording()
         self._robot.media.start_playing()
+        time.sleep(1)  # give some time to the pipelines to start
 
         async def runner() -> None:
             self._tasks = [

@@ -83,9 +91,8 @@
             frame_mono = audio_frame.T[0]  # both channels are identical
             frame = audio_to_int16(frame_mono)
             await self.handler.receive((16000, frame))
-
-
-            await asyncio.sleep(0.01)  # avoid busy loop
+
+            await asyncio.sleep(0.01)  # avoid busy loop
 
     async def play_loop(self) -> None:
         """Fetch outputs from the handler: log text and play audio frames."""

@@ -105,12 +112,16 @@
             elif isinstance(handler_output, tuple):
                 input_sample_rate, audio_frame = handler_output
                 device_sample_rate = self._robot.media.get_audio_samplerate()
-                …
+                audio_frame_float = audio_to_float32(audio_frame.squeeze())
+
                 if input_sample_rate != device_sample_rate:
-                    …
-                    …
+                    audio_frame_float = resample(
+                        audio_frame_float,
+                        orig_sr=input_sample_rate,
+                        target_sr=device_sample_rate,
                     )
-                …
+
+                self._robot.media.push_audio_sample(audio_frame_float)
 
             else:
                 logger.debug("Ignoring output type=%s", type(handler_output).__name__)
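The warm-up call above exists because `librosa.resample` pays a one-time setup cost on first use. A rough way to observe the effect locally; timings are machine-dependent and the sizes here are illustrative:

```python
import time

import numpy as np
from librosa import resample

def timed_resample() -> float:
    # One second of silence, 24 kHz -> 16 kHz, similar to the playback path.
    start = time.perf_counter()
    resample(np.zeros(24000, dtype=np.float32), orig_sr=24000, target_sr=16000)
    return time.perf_counter() - start

print(f"cold call: {timed_resample():.3f}s")
print(f"warm call: {timed_resample():.3f}s")
```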
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/dance_emotion_moves.py
RENAMED

@@ -9,6 +9,7 @@ import logging
 from typing import Tuple
 
 import numpy as np
+from numpy.typing import NDArray
 
 from reachy_mini.motion.move import Move
 from reachy_mini.motion.recorded_move import RecordedMoves

@@ -18,7 +19,7 @@ from reachy_mini_dances_library.dance_move import DanceMove
 logger = logging.getLogger(__name__)
 
 
-class DanceQueueMove(Move):
+class DanceQueueMove(Move):  # type: ignore
     """Wrapper for dance moves to work with the movement queue system."""
 
     def __init__(self, move_name: str):

@@ -29,9 +30,9 @@ class DanceQueueMove(Move):
     @property
     def duration(self) -> float:
         """Duration property required by official Move interface."""
-        return self.dance_move.duration
+        return float(self.dance_move.duration)
 
-    def evaluate(self, t: float) -> tuple[np.…
+    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
         """Evaluate dance move at time t."""
         try:
             # Get the pose from the dance move

@@ -49,10 +50,10 @@
             from reachy_mini.utils import create_head_pose
 
             neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-            return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+            return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
-class EmotionQueueMove(Move):
+class EmotionQueueMove(Move):  # type: ignore
     """Wrapper for emotion moves to work with the movement queue system."""
 
     def __init__(self, emotion_name: str, recorded_moves: RecordedMoves):

@@ -63,9 +64,9 @@
     @property
     def duration(self) -> float:
         """Duration property required by official Move interface."""
-        return self.emotion_move.duration
+        return float(self.emotion_move.duration)
 
-    def evaluate(self, t: float) -> tuple[np.…
+    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
         """Evaluate emotion move at time t."""
         try:
             # Get the pose from the emotion move

@@ -83,20 +84,20 @@
             from reachy_mini.utils import create_head_pose
 
             neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
-            return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+            return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
-class GotoQueueMove(Move):
+class GotoQueueMove(Move):  # type: ignore
     """Wrapper for goto moves to work with the movement queue system."""
 
     def __init__(
         self,
-        target_head_pose: np.ndarray,
-        start_head_pose: np.ndarray = None,
+        target_head_pose: NDArray[np.float32],
+        start_head_pose: NDArray[np.float32] | None = None,
         target_antennas: Tuple[float, float] = (0, 0),
-        start_antennas: Tuple[float, float] = None,
+        start_antennas: Tuple[float, float] | None = None,
         target_body_yaw: float = 0,
-        start_body_yaw: float = None,
+        start_body_yaw: float | None = None,
         duration: float = 1.0,
     ):
         """Initialize a GotoQueueMove."""

@@ -113,7 +114,7 @@
         """Duration property required by official Move interface."""
         return self._duration
 
-    def evaluate(self, t: float) -> tuple[np.…
+    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
         """Evaluate goto move at time t using linear interpolation."""
         try:
             from reachy_mini.utils import create_head_pose

@@ -136,7 +137,8 @@
                 [
                     self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
                     self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
-                ]
+                ],
+                dtype=np.float64,
             )
 
             # Interpolate body yaw

@@ -146,6 +148,7 @@
 
         except Exception as e:
             logger.error(f"Error evaluating goto move at t={t}: {e}")
-            # Return target pose on error - convert …
-            …
-            …
+            # Return target pose on error - convert to float64
+            target_head_pose_f64 = self.target_head_pose.astype(np.float64)
+            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]], dtype=np.float64)
+            return (target_head_pose_f64, target_antennas_array, self.target_body_yaw)
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/reachymini_avatar.png
RENAMED

File without changes

src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/images/user_avatar.png
RENAMED

File without changes
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/main.py
RENAMED

@@ -1,28 +1,29 @@
-"""Entrypoint for the Reachy Mini conversation demo."""
+"""Entrypoint for the Reachy Mini conversation app."""
 
 import os
 import sys
 import time
 import threading
+from typing import Any, Dict, List
 
 import gradio as gr
 from fastapi import FastAPI
 from fastrtc import Stream
 
 from reachy_mini import ReachyMini, ReachyMiniApp
-from reachy_mini_conversation_demo.moves import MovementManager
-from reachy_mini_conversation_demo.tools import ToolDependencies
-from reachy_mini_conversation_demo.utils import (
+from reachy_mini_conversation_app.moves import MovementManager
+from reachy_mini_conversation_app.tools import ToolDependencies
+from reachy_mini_conversation_app.utils import (
     parse_args,
     setup_logger,
     handle_vision_stuff,
 )
-from reachy_mini_conversation_demo.console import LocalStream
-from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
-from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+from reachy_mini_conversation_app.console import LocalStream
+from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
+from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler
 
 
-def update_chatbot(chatbot: …
+def update_chatbot(chatbot: List[Dict[str, Any]], response: Dict[str, Any]) -> List[Dict[str, Any]]:
     """Update the chatbot with AdditionalOutputs."""
     chatbot.append(response)
     return chatbot

@@ -34,7 +35,7 @@ def main(robot=None):
     args.gradio = True  # TODO Antoine - force gradio for testing appifying
 
     logger = setup_logger(args.debug)
-    logger.info("Starting Reachy Mini Conversation Demo")
+    logger.info("Starting Reachy Mini Conversation App")
 
     if args.no_camera and args.head_tracker is not None:
         logger.warning("Head tracking is not activated due to --no-camera.")

@@ -45,7 +46,7 @@
     # Check if running in simulation mode without --gradio
     if robot.client.get_status()["simulation_enabled"] and not args.gradio:
         logger.error(
-            "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode."
+            "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode.",
         )
         robot.client.disconnect()
         sys.exit(1)

@@ -80,7 +81,7 @@
 
     handler = OpenaiRealtimeHandler(deps)
 
-    stream_manager = None
+    stream_manager: gr.Blocks | LocalStream | None = None
 
     if args.gradio:
         stream = Stream(
src/{reachy_mini_conversation_demo β reachy_mini_conversation_app}/moves.py
RENAMED
|
@@ -36,11 +36,12 @@ import time
| 39 | - from typing import Any,
@@ -57,15 +58,15 @@ logger = logging.getLogger(__name__)
| 60 | - FullBodyPose = Tuple[np.
| 63 | - class BreathingMove(Move):
| 68 | - interpolation_start_pose: np.
@@ -96,7 +97,7 @@ class BreathingMove(Move):
| 99 | - def evaluate(self, t: float) -> tuple[np.
@@ -104,13 +105,14 @@ class BreathingMove(Move):
| 107 | - self.interpolation_start_pose, self.neutral_head_pose, interpolation_t
@@ -122,7 +124,7 @@ class BreathingMove(Move):
| 125 | - antennas = np.array([antenna_sway, -antenna_sway])
@@ -168,8 +170,8 @@ class MovementState:
| 171 | - current_move:
| 172 | - move_start_time:
@@ -191,7 +193,7 @@ class MovementState:
| 194 | - last_primary_pose:
@@ -242,7 +244,7 @@ class MovementManager:
| 245 | - camera_worker=None,
@@ -258,7 +260,7 @@ class MovementManager:
| 261 | - self.move_queue = deque()
@@ -266,7 +268,7 @@ class MovementManager:
| 269 | - self._thread:
@@ -281,7 +283,7 @@ class MovementManager:
| 284 | - self._command_queue: Queue[
@@ -383,7 +385,7 @@ class MovementManager:
| 386 | - speech_offsets:
@@ -393,7 +395,7 @@ class MovementManager:
| 396 | - face_offsets:
@@ -549,14 +551,13 @@ class MovementManager:
| 556 | - else:
| 557 | - neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
| 558 | - primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
| 559 | - self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
@@ -631,7 +632,7 @@ class MovementManager:
| 634 | - def _issue_control_command(self, head: np.
@@ -651,7 +652,7 @@ class MovementManager:
| 654 | - self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats
@@ -664,7 +665,7 @@ class MovementManager:
| 667 | - def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) ->
@@ -729,7 +730,7 @@ class MovementManager:
| 732 | - def get_status(self) ->
|
|
| 36 |
import logging
|
| 37 |
import threading
|
| 38 |
from queue import Empty, Queue
|
| 39 |
+
from typing import Any, Dict, Tuple
|
| 40 |
from collections import deque
|
| 41 |
from dataclasses import dataclass
|
| 42 |
|
| 43 |
import numpy as np
|
| 44 |
+
from numpy.typing import NDArray
|
| 45 |
|
| 46 |
from reachy_mini import ReachyMini
|
| 47 |
from reachy_mini.utils import create_head_pose
|
|
|
|
| 58 |
CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop
|
| 59 |
|
| 60 |
# Type definitions
|
| 61 |
+
FullBodyPose = Tuple[NDArray[np.float32], Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
|
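clone_full_body_pose is called throughout moves.py but its body is not part of this diff. A plausible implementation under the FullBodyPose alias above (an assumption, not the project's code) copies the mutable head matrix so stored snapshots cannot alias later commands:

    def clone_full_body_pose(pose: FullBodyPose) -> FullBodyPose:
        head, antennas, body_yaw = pose
        # The ndarray is mutable; the antennas tuple and body_yaw float are not.
        return (head.copy(), (antennas[0], antennas[1]), body_yaw)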
| 62 |
|
| 63 |
|
| 64 |
+
class BreathingMove(Move): # type: ignore
|
| 65 |
"""Breathing move with interpolation to neutral and then continuous breathing patterns."""
|
| 66 |
|
| 67 |
def __init__(
|
| 68 |
self,
|
| 69 |
+
interpolation_start_pose: NDArray[np.float32],
|
| 70 |
interpolation_start_antennas: Tuple[float, float],
|
| 71 |
interpolation_duration: float = 1.0,
|
| 72 |
):
|
|
|
|
| 97 |
"""Duration property required by official Move interface."""
|
| 98 |
return float("inf") # Continuous breathing (never ends naturally)
|
| 99 |
|
| 100 |
+
def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
|
| 101 |
"""Evaluate breathing move at time t."""
|
| 102 |
if t < self.interpolation_duration:
|
| 103 |
# Phase 1: Interpolate to neutral base position
|
|
|
|
| 105 |
|
| 106 |
# Interpolate head pose
|
| 107 |
head_pose = linear_pose_interpolation(
|
| 108 |
+
self.interpolation_start_pose, self.neutral_head_pose, interpolation_t,
|
| 109 |
)
|
| 110 |
|
| 111 |
# Interpolate antennas
|
| 112 |
+
antennas_interp = (
|
| 113 |
1 - interpolation_t
|
| 114 |
) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
|
| 115 |
+
antennas = antennas_interp.astype(np.float64)
|
| 116 |
|
| 117 |
else:
|
| 118 |
# Phase 2: Breathing patterns from neutral base
|
|
|
|
| 124 |
|
| 125 |
# Antenna sway (opposite directions)
|
| 126 |
antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
|
| 127 |
+
antennas = np.array([antenna_sway, -antenna_sway], dtype=np.float64)
|
| 128 |
|
| 129 |
# Return in official Move interface format: (head_pose, antennas_array, body_yaw)
|
| 130 |
return (head_pose, antennas, 0.0)
|
|
|
|
| 170 |
"""State tracking for the movement system."""
|
| 171 |
|
| 172 |
# Primary move state
|
| 173 |
+
current_move: Move | None = None
|
| 174 |
+
move_start_time: float | None = None
|
| 175 |
last_activity_time: float = 0.0
|
| 176 |
|
| 177 |
# Secondary move state (offsets)
|
|
|
|
| 193 |
)
|
| 194 |
|
| 195 |
# Status flags
|
| 196 |
+
last_primary_pose: FullBodyPose | None = None
|
| 197 |
|
| 198 |
def update_activity(self) -> None:
|
| 199 |
"""Update the last activity time."""
|
|
|
|
| 244 |
def __init__(
|
| 245 |
self,
|
| 246 |
current_robot: ReachyMini,
|
| 247 |
+
camera_worker: "Any" = None,
|
| 248 |
):
|
| 249 |
"""Initialize movement manager."""
|
| 250 |
self.current_robot = current_robot
|
|
|
|
| 260 |
self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)
|
| 261 |
|
| 262 |
# Move queue (primary moves)
|
| 263 |
+
self.move_queue: deque[Move] = deque()
|
| 264 |
|
| 265 |
# Configuration
|
| 266 |
self.idle_inactivity_delay = 0.3 # seconds
|
|
|
|
| 268 |
self.target_period = 1.0 / self.target_frequency
|
| 269 |
|
| 270 |
self._stop_event = threading.Event()
|
| 271 |
+
self._thread: threading.Thread | None = None
|
| 272 |
self._is_listening = False
|
| 273 |
self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
|
| 274 |
self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
|
|
|
|
| 283 |
self._set_target_err_suppressed = 0
|
| 284 |
|
| 285 |
# Cross-thread signalling
|
| 286 |
+
self._command_queue: "Queue[Tuple[str, Any]]" = Queue()
|
| 287 |
self._speech_offsets_lock = threading.Lock()
|
| 288 |
self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (
|
| 289 |
0.0,
|
|
|
|
| 385 |
|
| 386 |
def _apply_pending_offsets(self) -> None:
|
| 387 |
"""Apply the most recent speech/face offset updates."""
|
| 388 |
+
speech_offsets: Tuple[float, float, float, float, float, float] | None = None
|
| 389 |
with self._speech_offsets_lock:
|
| 390 |
if self._speech_offsets_dirty:
|
| 391 |
speech_offsets = self._pending_speech_offsets
|
|
|
|
| 395 |
self.state.speech_offsets = speech_offsets
|
| 396 |
self.state.update_activity()
|
| 397 |
|
| 398 |
+
face_offsets: Tuple[float, float, float, float, float, float] | None = None
|
| 399 |
with self._face_offsets_lock:
|
| 400 |
if self._face_offsets_dirty:
|
| 401 |
face_offsets = self._pending_face_offsets
|
|
|
|
| 551 |
)
|
| 552 |
|
| 553 |
self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
|
| 554 |
+
# Otherwise reuse the last primary pose so we avoid jumps between moves
|
| 555 |
+
elif self.state.last_primary_pose is not None:
|
| 556 |
+
primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
|
| 557 |
else:
|
| 558 |
+
neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
|
| 559 |
+
primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
|
| 560 |
+
self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
|
| 562 |
return primary_full_body_pose
|
| 563 |
|
|
|
|
| 632 |
|
| 633 |
return antennas_cmd
|
| 634 |
|
| 635 |
+
def _issue_control_command(self, head: NDArray[np.float32], antennas: Tuple[float, float], body_yaw: float) -> None:
|
| 636 |
"""Send the fused pose to the robot with throttled error logging."""
|
| 637 |
try:
|
| 638 |
self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
|
|
|
|
| 652 |
self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))
|
| 653 |
|
| 654 |
def _update_frequency_stats(
|
| 655 |
+
self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats,
|
| 656 |
) -> LoopFrequencyStats:
|
| 657 |
"""Update frequency statistics based on the current loop start time."""
|
| 658 |
period = loop_start - prev_loop_start
|
|
|
|
| 665 |
stats.min_freq = min(stats.min_freq, stats.last_freq)
|
| 666 |
return stats
|
| 667 |
|
| 668 |
+
def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> Tuple[float, LoopFrequencyStats]:
|
| 669 |
"""Compute sleep time to maintain target frequency and update potential freq."""
|
| 670 |
computation_time = self._now() - loop_start
|
| 671 |
stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
|
|
|
|
| 730 |
self._thread = None
|
| 731 |
logger.debug("Move worker stopped")
|
| 732 |
|
| 733 |
+
def get_status(self) -> Dict[str, Any]:
|
| 734 |
"""Return a lightweight status snapshot for observability."""
|
| 735 |
with self._status_lock:
|
| 736 |
pose_snapshot = clone_full_body_pose(self._last_commanded_pose)
|
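BreathingMove relies on linear_pose_interpolation(start, end, t), whose body is outside this diff. One reasonable implementation for 4x4 homogeneous poses, sketched here as an assumption, interpolates the translation linearly and the rotation spherically:

    import numpy as np
    from numpy.typing import NDArray
    from scipy.spatial.transform import Rotation, Slerp

    def linear_pose_interpolation(
        start: NDArray[np.float64], end: NDArray[np.float64], t: float
    ) -> NDArray[np.float64]:
        """Blend two 4x4 poses: lerp the translation, slerp the rotation."""
        t = float(np.clip(t, 0.0, 1.0))
        out = np.eye(4)
        out[:3, 3] = (1.0 - t) * start[:3, 3] + t * end[:3, 3]
        slerp = Slerp([0.0, 1.0], Rotation.from_matrix([start[:3, :3], end[:3, :3]]))
        out[:3, :3] = slerp(t).as_matrix()
        return out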
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/openai_realtime.py
RENAMED
|
@@ -1,21 +1,25 @@
| 12 | - from
| 17 | - from
| 18 | - from
@@ -33,57 +37,131 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 37 | - self.
| 43 | - def copy(self):
| 48 | - """
| 66 | - )
| 73 | - if hasattr(self,
| 75 | - self.deps.head_wobbler
| 81 | - logger.debug("User speech stopped")
| 83 | - if event.type in (
| 84 | - #
| 86 | - self.deps.head_wobbler.reset()
@@ -91,18 +169,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 94 | - pass
@@ -118,6 +206,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
@@ -127,22 +219,23 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 131 | - item
| 143 | - "metadata": {"title": "🛠️ Used tool "
| 145 | - )
@@ -159,55 +252,68 @@
| 162 | - }
| 164 | - }
| 178 | - )
| 180 | - if
| 183 | - "instructions": "Use the tool result just returned and answer concisely in speech."
| 184 | - }
| 186 | - else:
| 187 | - self.is_idle_tool_call = False
| 190 | - self.deps.head_wobbler
| 200 | - async def receive(self, frame:
| 208 | - await self.connection.input_audio_buffer.append(audio=audio_message)  # type: ignore
| 210 | - async def emit(self) ->
@@ -215,28 +321,43 @@
| 222 | - return await wait_for_item(self.output_queue)
| 239 | - async def send_idle_signal(self, idle_duration) -> None:
@@ -249,12 +370,11 @@
| 252 | - }
| 256 | - "modalities": ["text"],
| 259 | - }
|
|
|
| 1 |
import json
|
| 2 |
import base64
|
| 3 |
+
import random
|
| 4 |
import asyncio
|
| 5 |
import logging
|
| 6 |
+
from typing import Any, Tuple, Literal, cast
|
| 7 |
from datetime import datetime
|
| 8 |
|
| 9 |
import numpy as np
|
| 10 |
import gradio as gr
|
| 11 |
from openai import AsyncOpenAI
|
| 12 |
from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
|
| 13 |
+
from numpy.typing import NDArray
|
| 14 |
+
from websockets.exceptions import ConnectionClosedError
|
| 15 |
|
| 16 |
+
from reachy_mini_conversation_app.tools import (
|
| 17 |
ALL_TOOL_SPECS,
|
| 18 |
ToolDependencies,
|
| 19 |
dispatch_tool_call,
|
| 20 |
)
|
| 21 |
+
from reachy_mini_conversation_app.config import config
|
| 22 |
+
from reachy_mini_conversation_app.prompts import SESSION_INSTRUCTIONS
|
| 23 |
|
| 24 |
|
| 25 |
logger = logging.getLogger(__name__)
|
|
|
|
| 37 |
)
|
| 38 |
self.deps = deps
|
| 39 |
|
| 40 |
+
# Override type annotations for OpenAI strict typing (only for values used in API)
|
| 41 |
+
self.output_sample_rate: Literal[24000]
|
| 42 |
+
self.target_input_rate: Literal[24000] = 24000
|
| 43 |
+
# input_sample_rate stays an int for comparison logic
|
| 44 |
+
self.resample_ratio = self.target_input_rate / self.input_sample_rate
|
| 45 |
+
|
| 46 |
+
self.connection: Any = None
|
| 47 |
+
self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()
|
| 48 |
|
| 49 |
self.last_activity_time = asyncio.get_event_loop().time()
|
| 50 |
self.start_time = asyncio.get_event_loop().time()
|
| 51 |
self.is_idle_tool_call = False
|
| 52 |
|
| 53 |
+
def copy(self) -> "OpenaiRealtimeHandler":
|
| 54 |
"""Create a copy of the handler."""
|
| 55 |
return OpenaiRealtimeHandler(self.deps)
|
| 56 |
|
| 57 |
+
def resample_audio(self, audio: NDArray[np.int16]) -> NDArray[np.int16]:
|
| 58 |
+
"""Resample audio using linear interpolation."""
|
| 59 |
+
if self.input_sample_rate == self.target_input_rate:
|
| 60 |
+
return audio
|
| 61 |
+
|
| 62 |
+
# Use numpy's interp for simple linear resampling
|
| 63 |
+
input_length = len(audio)
|
| 64 |
+
output_length = int(input_length * self.resample_ratio)
|
| 65 |
+
|
| 66 |
+
input_time = np.arange(input_length)
|
| 67 |
+
output_time = np.linspace(0, input_length - 1, output_length)
|
| 68 |
+
|
| 69 |
+
resampled = np.interp(output_time, input_time, audio.astype(np.float32))
|
| 70 |
+
return cast(NDArray[np.int16], resampled.astype(np.int16))
|
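The resampler above is plain linear interpolation with np.interp; a standalone sanity check of the same arithmetic (independent of the class, safe to run on its own):

    import numpy as np

    frame_48k = (np.sin(2 * np.pi * 440 * np.arange(480) / 48000) * 32767).astype(np.int16)
    ratio = 24000 / 48000                             # target rate over input rate
    out_len = int(len(frame_48k) * ratio)             # 480 samples -> 240 samples
    resampled = np.interp(
        np.linspace(0, len(frame_48k) - 1, out_len),  # output sample positions
        np.arange(len(frame_48k)),                    # input sample positions
        frame_48k.astype(np.float32),
    ).astype(np.int16)
    assert len(resampled) == 240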
| 71 |
+
|
| 72 |
+
async def start_up(self) -> None:
|
| 73 |
+
"""Start the handler with minimal retries on unexpected websocket closure."""
|
| 74 |
self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
|
| 75 |
+
|
| 76 |
+
max_attempts = 3
|
| 77 |
+
for attempt in range(1, max_attempts + 1):
|
| 78 |
+
try:
|
| 79 |
+
await self._run_realtime_session()
|
| 80 |
+
# Normal exit from the session, stop retrying
|
| 81 |
+
return
|
| 82 |
+
except ConnectionClosedError as e:
|
| 83 |
+
# Abrupt close (e.g., "no close frame received or sent") → retry
|
| 84 |
+
logger.warning(
|
| 85 |
+
"Realtime websocket closed unexpectedly (attempt %d/%d): %s",
|
| 86 |
+
attempt, max_attempts, e
|
| 87 |
+
)
|
| 88 |
+
if attempt < max_attempts:
|
| 89 |
+
# exponential backoff with jitter
|
| 90 |
+
base_delay = 2 ** (attempt - 1) # 1s, 2s, 4s, 8s, etc.
|
| 91 |
+
jitter = random.uniform(0, 0.5)
|
| 92 |
+
delay = base_delay + jitter
|
| 93 |
+
logger.info("Retrying in %.1f seconds...", delay)
|
| 94 |
+
await asyncio.sleep(delay)
|
| 95 |
+
continue
|
| 96 |
+
raise
|
| 97 |
+
finally:
|
| 98 |
+
# never keep a stale reference
|
| 99 |
+
self.connection = None
|
| 100 |
+
|
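The backoff grows as 2 ** (attempt - 1) plus up to half a second of jitter, so with max_attempts = 3 the handler sleeps roughly 1.0 to 1.5 s after the first failure and 2.0 to 2.5 s after the second, then re-raises. The same schedule in isolation:

    import random

    max_attempts = 3
    for attempt in range(1, max_attempts):  # no sleep after the final attempt
        delay = 2 ** (attempt - 1) + random.uniform(0, 0.5)
        print(f"attempt {attempt} failed, retrying in {delay:.2f}s")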
| 101 |
+
async def _run_realtime_session(self) -> None:
|
| 102 |
+
"""Establish and manage a single realtime session."""
|
| 103 |
+
async with self.client.realtime.connect(model=config.MODEL_NAME) as conn:
|
| 104 |
+
try:
|
| 105 |
+
await conn.session.update(
|
| 106 |
+
session={
|
| 107 |
+
"type": "realtime",
|
| 108 |
+
"instructions": SESSION_INSTRUCTIONS,
|
| 109 |
+
"audio": {
|
| 110 |
+
"input": {
|
| 111 |
+
"format": {
|
| 112 |
+
"type": "audio/pcm",
|
| 113 |
+
"rate": self.target_input_rate,
|
| 114 |
+
},
|
| 115 |
+
"transcription": {
|
| 116 |
+
"model": "whisper-1",
|
| 117 |
+
"language": "en"
|
| 118 |
+
},
|
| 119 |
+
"turn_detection": {
|
| 120 |
+
"type": "server_vad",
|
| 121 |
+
"interrupt_response": True,
|
| 122 |
+
},
|
| 123 |
+
},
|
| 124 |
+
"output": {
|
| 125 |
+
"format": {
|
| 126 |
+
"type": "audio/pcm",
|
| 127 |
+
"rate": self.output_sample_rate,
|
| 128 |
+
},
|
| 129 |
+
"voice": "cedar",
|
| 130 |
+
},
|
| 131 |
+
},
|
| 132 |
+
"tools": ALL_TOOL_SPECS, # type: ignore[typeddict-item]
|
| 133 |
+
"tool_choice": "auto",
|
| 134 |
},
|
| 135 |
+
)
|
| 136 |
+
except Exception:
|
| 137 |
+
logger.exception("Realtime session.update failed; aborting startup")
|
| 138 |
+
return
|
| 139 |
+
|
| 140 |
+
logger.info("Realtime session updated successfully")
|
|
|
|
| 141 |
|
| 142 |
# Manage event received from the openai server
|
| 143 |
self.connection = conn
|
| 144 |
async for event in self.connection:
|
| 145 |
logger.debug(f"OpenAI event: {event.type}")
|
| 146 |
if event.type == "input_audio_buffer.speech_started":
|
| 147 |
+
if hasattr(self, "_clear_queue") and callable(self._clear_queue):
|
| 148 |
self._clear_queue()
|
| 149 |
+
if self.deps.head_wobbler is not None:
|
| 150 |
+
self.deps.head_wobbler.reset()
|
| 151 |
self.deps.movement_manager.set_listening(True)
|
| 152 |
logger.debug("User speech started")
|
| 153 |
|
| 154 |
if event.type == "input_audio_buffer.speech_stopped":
|
| 155 |
self.deps.movement_manager.set_listening(False)
|
| 156 |
+
logger.debug("User speech stopped - server will auto-commit with VAD")
|
| 157 |
+
|
| 158 |
+
if event.type in (
|
| 159 |
+
"response.audio.done", # GA
|
| 160 |
+
"response.output_audio.done", # GA alias
|
| 161 |
+
"response.audio.completed", # legacy (for safety)
|
| 162 |
+
"response.completed", # text-only completion
|
| 163 |
+
):
|
| 164 |
logger.debug("response completed")
|
|
|
|
| 165 |
|
| 166 |
if event.type == "response.created":
|
| 167 |
logger.debug("Response created")
|
|
|
|
| 169 |
if event.type == "response.done":
|
| 170 |
# Doesn't mean the audio is done playing
|
| 171 |
logger.debug("Response done")
|
|
|
|
| 172 |
|
| 173 |
+
# Handle partial transcription (user speaking in real-time)
|
| 174 |
+
if event.type == "conversation.item.input_audio_transcription.partial":
|
| 175 |
+
logger.debug(f"User partial transcript: {event.transcript}")
|
| 176 |
+
await self.output_queue.put(
|
| 177 |
+
AdditionalOutputs({"role": "user_partial", "content": event.transcript})
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
# Handle completed transcription (user finished speaking)
|
| 181 |
if event.type == "conversation.item.input_audio_transcription.completed":
|
| 182 |
logger.debug(f"User transcript: {event.transcript}")
|
| 183 |
await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
|
| 184 |
|
| 185 |
+
# Handle assistant transcription
|
| 186 |
+
if event.type in ("response.audio_transcript.done", "response.output_audio_transcript.done"):
|
| 187 |
logger.debug(f"Assistant transcript: {event.transcript}")
|
| 188 |
await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
|
| 189 |
|
| 190 |
+
# Handle audio delta
|
| 191 |
+
if event.type in ("response.audio.delta", "response.output_audio.delta"):
|
| 192 |
+
if self.deps.head_wobbler is not None:
|
| 193 |
+
self.deps.head_wobbler.feed(event.delta)
|
| 194 |
self.last_activity_time = asyncio.get_event_loop().time()
|
| 195 |
logger.debug("last activity time updated to %s", self.last_activity_time)
|
| 196 |
await self.output_queue.put(
|
|
|
|
| 206 |
args_json_str = getattr(event, "arguments", None)
|
| 207 |
call_id = getattr(event, "call_id", None)
|
| 208 |
|
| 209 |
+
if not isinstance(tool_name, str) or not isinstance(args_json_str, str):
|
| 210 |
+
logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str)
|
| 211 |
+
continue
|
| 212 |
+
|
| 213 |
try:
|
| 214 |
tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
|
| 215 |
logger.debug("Tool '%s' executed successfully", tool_name)
|
|
|
|
| 219 |
tool_result = {"error": str(e)}
|
| 220 |
|
| 221 |
# send the tool result back
|
| 222 |
+
if isinstance(call_id, str):
|
| 223 |
+
await self.connection.conversation.item.create(
|
| 224 |
+
item={
|
| 225 |
+
"type": "function_call_output",
|
| 226 |
+
"call_id": call_id,
|
| 227 |
+
"output": json.dumps(tool_result),
|
| 228 |
+
},
|
| 229 |
+
)
|
| 230 |
|
| 231 |
await self.output_queue.put(
|
| 232 |
AdditionalOutputs(
|
| 233 |
{
|
| 234 |
"role": "assistant",
|
| 235 |
"content": json.dumps(tool_result),
|
| 236 |
+
"metadata": {"title": f"π οΈ Used tool {tool_name}", "status": "done"},
|
| 237 |
},
|
| 238 |
+
),
|
| 239 |
)
|
| 240 |
|
| 241 |
if tool_name == "camera" and "b64_im" in tool_result:
|
|
|
|
| 252 |
{
|
| 253 |
"type": "input_image",
|
| 254 |
"image_url": f"data:image/jpeg;base64,{b64_im}",
|
| 255 |
+
},
|
| 256 |
],
|
| 257 |
+
},
|
| 258 |
)
|
| 259 |
logger.info("Added camera image to conversation")
|
| 260 |
|
| 261 |
+
if self.deps.camera_worker is not None:
|
| 262 |
+
np_img = self.deps.camera_worker.get_latest_frame()
|
| 263 |
+
img = gr.Image(value=np_img)
|
| 264 |
|
| 265 |
+
await self.output_queue.put(
|
| 266 |
+
AdditionalOutputs(
|
| 267 |
+
{
|
| 268 |
+
"role": "assistant",
|
| 269 |
+
"content": img,
|
| 270 |
+
},
|
| 271 |
+
),
|
| 272 |
)
|
|
|
|
| 273 |
|
| 274 |
+
# if this tool call was triggered by an idle signal, don't make the robot speak
|
| 275 |
+
# for other tool calls, let the robot reply out loud
|
| 276 |
+
if self.is_idle_tool_call:
|
| 277 |
+
self.is_idle_tool_call = False
|
| 278 |
+
else:
|
| 279 |
await self.connection.response.create(
|
| 280 |
response={
|
| 281 |
+
"instructions": "Use the tool result just returned and answer concisely in speech.",
|
| 282 |
+
},
|
| 283 |
)
|
|
|
|
|
|
|
| 284 |
|
| 285 |
# resynchronize the head wobble after a tool call that may have taken some time
|
| 286 |
+
if self.deps.head_wobbler is not None:
|
| 287 |
+
self.deps.head_wobbler.reset()
|
| 288 |
|
| 289 |
# server error
|
| 290 |
if event.type == "error":
|
| 291 |
err = getattr(event, "error", None)
|
| 292 |
msg = getattr(err, "message", str(err) if err else "unknown error")
|
| 293 |
+
code = getattr(err, "code", "")
|
| 294 |
+
|
| 295 |
+
logger.error("Realtime error [%s]: %s (raw=%s)", code, msg, err)
|
| 296 |
+
|
| 297 |
+
# Only show user-facing errors, not internal state errors
|
| 298 |
+
if code not in ("input_audio_buffer_commit_empty", "conversation_already_has_active_response"):
|
| 299 |
+
await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
|
| 300 |
|
| 301 |
# Microphone receive
|
| 302 |
+
async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
|
| 303 |
"""Receive audio frame from the microphone and send it to the openai server."""
|
| 304 |
if not self.connection:
|
| 305 |
return
|
| 306 |
_, array = frame
|
| 307 |
array = array.squeeze()
|
| 308 |
+
|
| 309 |
+
# Resample if needed
|
| 310 |
+
if self.input_sample_rate != self.target_input_rate:
|
| 311 |
+
array = self.resample_audio(array)
|
| 312 |
+
|
| 313 |
audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
|
| 314 |
+
await self.connection.input_audio_buffer.append(audio=audio_message)
|
|
|
|
| 315 |
|
| 316 |
+
async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
|
| 317 |
"""Emit audio frame to be played by the speaker."""
|
| 318 |
# sends to the stream the stuff put in the output queue by the openai event handler
|
| 319 |
# This is called periodically by the fastrtc Stream
|
|
|
|
| 321 |
# Handle idle
|
| 322 |
idle_duration = asyncio.get_event_loop().time() - self.last_activity_time
|
| 323 |
if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
|
| 324 |
+
try:
|
| 325 |
+
await self.send_idle_signal(idle_duration)
|
| 326 |
+
except Exception as e:
|
| 327 |
+
logger.warning("Idle signal skipped (connection closed?): %s", e)
|
| 328 |
+
return None
|
| 329 |
|
| 330 |
self.last_activity_time = asyncio.get_event_loop().time() # avoid repeated resets
|
| 331 |
|
| 332 |
+
return await wait_for_item(self.output_queue) # type: ignore[no-any-return]
|
| 333 |
|
| 334 |
async def shutdown(self) -> None:
|
| 335 |
"""Shutdown the handler."""
|
| 336 |
if self.connection:
|
| 337 |
+
try:
|
| 338 |
+
await self.connection.close()
|
| 339 |
+
except ConnectionClosedError as e:
|
| 340 |
+
logger.debug(f"Connection already closed during shutdown: {e}")
|
| 341 |
+
except Exception as e:
|
| 342 |
+
logger.debug(f"connection.close() ignored: {e}")
|
| 343 |
+
finally:
|
| 344 |
+
self.connection = None
|
| 345 |
+
|
| 346 |
+
# Clear any remaining items in the output queue
|
| 347 |
+
while not self.output_queue.empty():
|
| 348 |
+
try:
|
| 349 |
+
self.output_queue.get_nowait()
|
| 350 |
+
except asyncio.QueueEmpty:
|
| 351 |
+
break
|
| 352 |
+
|
| 353 |
+
def format_timestamp(self) -> str:
|
| 354 |
+
"""Format current timestamp with date, time, and elapsed seconds."""
|
| 355 |
+
loop_time = asyncio.get_event_loop().time() # monotonic
|
| 356 |
+
elapsed_seconds = loop_time - self.start_time
|
| 357 |
+
dt = datetime.now() # wall-clock
|
| 358 |
return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
|
| 359 |
|
| 360 |
+
async def send_idle_signal(self, idle_duration: float) -> None:
|
|
|
|
|
|
|
| 361 |
"""Send an idle signal to the openai server."""
|
| 362 |
logger.debug("Sending idle signal")
|
| 363 |
self.is_idle_tool_call = True
|
|
|
|
| 370 |
"type": "message",
|
| 371 |
"role": "user",
|
| 372 |
"content": [{"type": "input_text", "text": timestamp_msg}],
|
| 373 |
+
},
|
| 374 |
)
|
| 375 |
await self.connection.response.create(
|
| 376 |
response={
|
|
|
|
| 377 |
"instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
|
| 378 |
"tool_choice": "required",
|
| 379 |
+
},
|
| 380 |
)
|
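The idle path in emit() only nudges the model when two conditions line up: more than 15 s since the last realtime activity and an idle movement queue. A condensed sketch of that pattern, using the names from the diff above:

    import asyncio

    IDLE_THRESHOLD_S = 15.0

    async def maybe_nudge(handler) -> None:
        # The event-loop clock is monotonic, so the difference is immune to
        # wall-clock adjustments.
        idle_for = asyncio.get_event_loop().time() - handler.last_activity_time
        if idle_for > IDLE_THRESHOLD_S and handler.deps.movement_manager.is_idle():
            await handler.send_idle_signal(idle_for)
            handler.last_activity_time = asyncio.get_event_loop().time()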
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/prompts.py
RENAMED
|
File without changes
|
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/tools.py
RENAMED
|
@@ -4,7 +4,7 @@ import json
| 7 | - from typing import Any, Dict,
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
| 20 | - from
@@ -36,9 +36,9 @@ except ImportError as e:
| 39 | - def get_concrete_subclasses(base):
| 41 | - result = []
@@ -58,9 +58,9 @@ class ToolDependencies:
| 61 | - camera_worker:
| 62 | - vision_manager:
| 63 | - head_wobbler:
@@ -88,7 +88,7 @@ class Tool(abc.ABC):
| 91 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -113,7 +113,7 @@ class MoveHead(Tool):
| 116 | - DELTAS:
@@ -121,9 +121,12 @@
| 124 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -177,7 +180,7 @@ class Camera(Tool):
| 180 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -199,7 +202,7 @@ class Camera(Tool):
| 202 | - deps.vision_manager.processor.process_image, frame, image_query
@@ -208,17 +211,16 @@ class Camera(Tool):
| 213 | - import base64
@@ -232,7 +234,7 @@ class HeadTracking(Tool):
| 235 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -288,12 +290,12 @@ class Dance(Tool):
| 291 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
| 296 | - move_name = kwargs.get("move"
@@ -326,12 +328,12 @@ class StopDance(Tool):
| 329 | - }
| 334 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -373,7 +375,7 @@ class PlayEmotion(Tool):
| 376 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -399,7 +401,7 @@
| 402 | - return {"error": f"Failed to play emotion: {
@@ -413,12 +415,12 @@ class StopEmotion(Tool):
| 416 | - }
| 421 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -442,7 +444,7 @@ class DoNothing(Tool):
| 445 | - async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
@@ -452,12 +454,12 @@
| 455 | - ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
| 460 | - def _safe_load_obj(args_json: str) ->
|
|
|
|
| 4 |
import asyncio
|
| 5 |
import inspect
|
| 6 |
import logging
|
| 7 |
+
from typing import Any, Dict, List, Tuple, Literal
|
| 8 |
from dataclasses import dataclass
|
| 9 |
|
| 10 |
from reachy_mini import ReachyMini
|
|
|
|
| 17 |
try:
|
| 18 |
from reachy_mini.motion.recorded_move import RecordedMoves
|
| 19 |
from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
|
| 20 |
+
from reachy_mini_conversation_app.dance_emotion_moves import (
|
| 21 |
GotoQueueMove,
|
| 22 |
DanceQueueMove,
|
| 23 |
EmotionQueueMove,
|
|
|
|
| 36 |
EMOTION_AVAILABLE = False
|
| 37 |
|
| 38 |
|
| 39 |
+
def get_concrete_subclasses(base: type[Tool]) -> List[type[Tool]]:
|
| 40 |
"""Recursively find all concrete (non-abstract) subclasses of a base class."""
|
| 41 |
+
result: List[type[Tool]] = []
|
| 42 |
for cls in base.__subclasses__():
|
| 43 |
if not inspect.isabstract(cls):
|
| 44 |
result.append(cls)
|
|
|
|
| 58 |
reachy_mini: ReachyMini
|
| 59 |
movement_manager: Any # MovementManager from moves.py
|
| 60 |
# Optional deps
|
| 61 |
+
camera_worker: Any | None = None # CameraWorker for frame buffering
|
| 62 |
+
vision_manager: Any | None = None
|
| 63 |
+
head_wobbler: Any | None = None # HeadWobbler for audio-reactive motion
|
| 64 |
motion_duration_s: float = 1.0
|
| 65 |
|
| 66 |
|
|
|
|
| 88 |
}
|
| 89 |
|
| 90 |
@abc.abstractmethod
|
| 91 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 92 |
"""Async tool execution entrypoint."""
|
| 93 |
raise NotImplementedError
|
| 94 |
|
|
|
|
| 113 |
}
|
| 114 |
|
| 115 |
# mapping: direction -> args for create_head_pose
|
| 116 |
+
DELTAS: Dict[str, Tuple[int, int, int, int, int, int]] = {
|
| 117 |
"left": (0, 0, 0, 0, 0, 40),
|
| 118 |
"right": (0, 0, 0, 0, 0, -40),
|
| 119 |
"up": (0, 0, 0, 0, -30, 0),
|
|
|
|
| 121 |
"front": (0, 0, 0, 0, 0, 0),
|
| 122 |
}
|
| 123 |
|
| 124 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 125 |
"""Move head in a given direction."""
|
| 126 |
+
direction_raw = kwargs.get("direction")
|
| 127 |
+
if not isinstance(direction_raw, str):
|
| 128 |
+
return {"error": "direction must be a string"}
|
| 129 |
+
direction: Direction = direction_raw # type: ignore[assignment]
|
| 130 |
logger.info("Tool call: move_head direction=%s", direction)
|
| 131 |
|
| 132 |
deltas = self.DELTAS.get(direction, self.DELTAS["front"])
|
|
|
|
| 180 |
"required": ["question"],
|
| 181 |
}
|
| 182 |
|
| 183 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 184 |
"""Take a picture with the camera and ask a question about it."""
|
| 185 |
image_query = (kwargs.get("question") or "").strip()
|
| 186 |
if not image_query:
|
|
|
|
| 202 |
# Use vision manager for processing if available
|
| 203 |
if deps.vision_manager is not None:
|
| 204 |
vision_result = await asyncio.to_thread(
|
| 205 |
+
deps.vision_manager.processor.process_image, frame, image_query,
|
| 206 |
)
|
| 207 |
if isinstance(vision_result, dict) and "error" in vision_result:
|
| 208 |
return vision_result
|
|
|
|
| 211 |
if isinstance(vision_result, str)
|
| 212 |
else {"error": "vision returned non-string"}
|
| 213 |
)
|
| 214 |
+
# Return base64 encoded image like main_works.py camera tool
|
| 215 |
+
import base64
|
|
|
|
| 216 |
|
| 217 |
+
import cv2
|
| 218 |
|
| 219 |
+
temp_path = "/tmp/camera_frame.jpg"
|
| 220 |
+
cv2.imwrite(temp_path, frame)
|
| 221 |
+
with open(temp_path, "rb") as f:
|
| 222 |
+
b64_encoded = base64.b64encode(f.read()).decode("utf-8")
|
| 223 |
+
return {"b64_im": b64_encoded}
|
| 224 |
|
| 225 |
|
| 226 |
class HeadTracking(Tool):
|
|
|
|
| 234 |
"required": ["start"],
|
| 235 |
}
|
| 236 |
|
| 237 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 238 |
"""Enable or disable head tracking."""
|
| 239 |
enable = bool(kwargs.get("start"))
|
| 240 |
|
|
|
|
| 290 |
"required": [],
|
| 291 |
}
|
| 292 |
|
| 293 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 294 |
"""Play a named or random dance move once (or repeat). Non-blocking."""
|
| 295 |
if not DANCE_AVAILABLE:
|
| 296 |
return {"error": "Dance system not available"}
|
| 297 |
|
| 298 |
+
move_name = kwargs.get("move")
|
| 299 |
repeat = int(kwargs.get("repeat", 1))
|
| 300 |
|
| 301 |
logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)
|
|
|
|
| 328 |
"dummy": {
|
| 329 |
"type": "boolean",
|
| 330 |
"description": "dummy boolean, set it to true",
|
| 331 |
+
},
|
| 332 |
},
|
| 333 |
"required": ["dummy"],
|
| 334 |
}
|
| 335 |
|
| 336 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 337 |
"""Stop the current dance move."""
|
| 338 |
logger.info("Tool call: stop_dance")
|
| 339 |
movement_manager = deps.movement_manager
|
|
|
|
| 375 |
"required": ["emotion"],
|
| 376 |
}
|
| 377 |
|
| 378 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 379 |
"""Play a pre-recorded emotion."""
|
| 380 |
if not EMOTION_AVAILABLE:
|
| 381 |
return {"error": "Emotion system not available"}
|
|
|
|
| 401 |
|
| 402 |
except Exception as e:
|
| 403 |
logger.exception("Failed to play emotion")
|
| 404 |
+
return {"error": f"Failed to play emotion: {e!s}"}
|
| 405 |
|
| 406 |
|
| 407 |
class StopEmotion(Tool):
|
|
|
|
| 415 |
"dummy": {
|
| 416 |
"type": "boolean",
|
| 417 |
"description": "dummy boolean, set it to true",
|
| 418 |
+
},
|
| 419 |
},
|
| 420 |
"required": ["dummy"],
|
| 421 |
}
|
| 422 |
|
| 423 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 424 |
"""Stop the current emotion."""
|
| 425 |
logger.info("Tool call: stop_emotion")
|
| 426 |
movement_manager = deps.movement_manager
|
|
|
|
| 444 |
"required": [],
|
| 445 |
}
|
| 446 |
|
| 447 |
+
async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
|
| 448 |
"""Do nothing - stay still and silent."""
|
| 449 |
reason = kwargs.get("reason", "just chilling")
|
| 450 |
logger.info("Tool call: do_nothing reason=%s", reason)
|
|
|
|
| 454 |
# Registry & specs (dynamic)
|
| 455 |
|
| 456 |
# List of available tool classes
|
| 457 |
+
ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)} # type: ignore[type-abstract]
|
| 458 |
ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
|
| 459 |
|
| 460 |
|
| 461 |
# Dispatcher
|
| 462 |
+
def _safe_load_obj(args_json: str) -> Dict[str, Any]:
|
| 463 |
try:
|
| 464 |
parsed_args = json.loads(args_json or "{}")
|
| 465 |
return parsed_args if isinstance(parsed_args, dict) else {}
|
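Because the registry above collects every concrete Tool subclass, adding a capability is just defining a class. A hypothetical example in the same shape ("wave" is illustrative, not part of the app; the name and parameters attributes appear in the fragments above, description is assumed):

    class Wave(Tool):
        name = "wave"
        description = "Wave the antennas to greet the user."
        parameters = {
            "type": "object",
            "properties": {
                "times": {"type": "integer", "description": "number of waves"},
            },
            "required": [],
        }

        async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
            times = int(kwargs.get("times", 1))
            logger.info("Tool call: wave times=%d", times)
            return {"status": "ok", "times": times}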
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/utils.py
RENAMED
|
@@ -1,13 +1,15 @@
| 5 | - from
| 8 | - def parse_args():
| 10 | - parser = argparse.ArgumentParser("Reachy Mini Conversation
@@ -27,7 +29,7 @@ def parse_args():
| 30 | - def handle_vision_stuff(args, current_robot):
@@ -41,11 +43,11 @@ def handle_vision_stuff(args, current_robot):
| 44 | - from
| 48 | - from reachy_mini_toolbox.vision import HeadTracker
@@ -55,22 +57,22 @@ def handle_vision_stuff(args, current_robot):
| 58 | - from
| 63 | - "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'"
| 67 | - "Using gpt-realtime for vision (default). Use --local-vision for local processing."
| 73 | - def setup_logger(debug):
|
|
|
|
| 1 |
import logging
|
| 2 |
import argparse
|
| 3 |
import warnings
|
| 4 |
+
from typing import Any, Tuple
|
| 5 |
|
| 6 |
+
from reachy_mini import ReachyMini
|
| 7 |
+
from reachy_mini_conversation_app.camera_worker import CameraWorker
|
| 8 |
|
| 9 |
|
| 10 |
+
def parse_args() -> argparse.Namespace:
|
| 11 |
"""Parse command line arguments."""
|
| 12 |
+
parser = argparse.ArgumentParser("Reachy Mini Conversation App")
|
| 13 |
parser.add_argument(
|
| 14 |
"--head-tracker",
|
| 15 |
choices=["yolo", "mediapipe", None],
|
|
|
|
| 29 |
return parser.parse_args()
|
| 30 |
|
| 31 |
|
| 32 |
+
def handle_vision_stuff(args: argparse.Namespace, current_robot: ReachyMini) -> Tuple[CameraWorker | None, Any, Any]:
|
| 33 |
"""Initialize camera, head tracker, camera worker, and vision manager.
|
| 34 |
|
| 35 |
By default, vision is handled by gpt-realtime model when camera tool is used.
|
|
|
|
| 43 |
# Initialize head tracker if specified
|
| 44 |
if args.head_tracker is not None:
|
| 45 |
if args.head_tracker == "yolo":
|
| 46 |
+
from reachy_mini_conversation_app.vision.yolo_head_tracker import HeadTracker
|
| 47 |
|
| 48 |
head_tracker = HeadTracker()
|
| 49 |
elif args.head_tracker == "mediapipe":
|
| 50 |
+
from reachy_mini_toolbox.vision import HeadTracker # type: ignore[no-redef]
|
| 51 |
|
| 52 |
head_tracker = HeadTracker()
|
| 53 |
|
|
|
|
| 57 |
# Initialize vision manager only if local vision is requested
|
| 58 |
if args.local_vision:
|
| 59 |
try:
|
| 60 |
+
from reachy_mini_conversation_app.vision.processors import initialize_vision_manager
|
| 61 |
|
| 62 |
vision_manager = initialize_vision_manager(camera_worker)
|
| 63 |
except ImportError as e:
|
| 64 |
raise ImportError(
|
| 65 |
+
"To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'",
|
| 66 |
) from e
|
| 67 |
else:
|
| 68 |
logging.getLogger(__name__).info(
|
| 69 |
+
"Using gpt-realtime for vision (default). Use --local-vision for local processing.",
|
| 70 |
)
|
| 71 |
|
| 72 |
return camera_worker, head_tracker, vision_manager
|
| 73 |
|
| 74 |
|
| 75 |
+
def setup_logger(debug: bool) -> logging.Logger:
|
| 76 |
"""Setups the logger."""
|
| 77 |
log_level = "DEBUG" if debug else "INFO"
|
| 78 |
logging.basicConfig(
|
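These helpers compose into the app's startup sequence; a condensed usage sketch built only from the signatures shown above (the ReachyMini construction is assumed to take no arguments):

    args = parse_args()
    logger = setup_logger(args.debug)
    robot = ReachyMini()  # assumed default construction
    camera_worker, head_tracker, vision_manager = handle_vision_stuff(args, robot)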
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/__init__.py
RENAMED
|
File without changes
|
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/processors.py
RENAMED
|
@@ -3,16 +3,17 @@ import time
| 6 | - from typing import Any, Dict
| 15 | - from
@@ -34,7 +35,7 @@ class VisionConfig:
| 37 | - def __init__(self, vision_config: VisionConfig = None):
@@ -60,7 +61,7 @@
| 63 | - self.processor = AutoProcessor.from_pretrained(self.model_path)
@@ -70,16 +71,17 @@
| 73 | - model_kwargs = {"dtype": dtype}
| 80 | - self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)
| 82 | - self.model
@@ -89,11 +91,11 @@
| 92 | - cv2_image: np.
| 96 | - if not self._initialized:
@@ -205,16 +207,16 @@
| 208 | - def __init__(self, camera, vision_config: VisionConfig = None):
| 215 | - self._last_processed_time = 0
| 217 | - self._thread:
@@ -245,7 +247,7 @@
| 248 | - frame, "Briefly describe what you see in one sentence."
@@ -274,7 +276,7 @@
| 277 | - def initialize_vision_manager(camera_worker) ->
@@ -318,7 +320,7 @@ def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
| 321 | - f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
|
|
|
|
| 3 |
import base64
|
| 4 |
import logging
|
| 5 |
import threading
|
| 6 |
+
from typing import Any, Dict
|
| 7 |
from dataclasses import dataclass
|
| 8 |
|
| 9 |
import cv2
|
| 10 |
import numpy as np
|
| 11 |
import torch
|
| 12 |
+
from numpy.typing import NDArray
|
| 13 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 14 |
from huggingface_hub import snapshot_download
|
| 15 |
|
| 16 |
+
from reachy_mini_conversation_app.config import config
|
| 17 |
|
| 18 |
|
| 19 |
logger = logging.getLogger(__name__)
|
|
|
|
| 35 |
class VisionProcessor:
|
| 36 |
"""Handles SmolVLM2 model loading and inference."""
|
| 37 |
|
| 38 |
+
def __init__(self, vision_config: VisionConfig | None = None):
|
| 39 |
"""Initialize the vision processor."""
|
| 40 |
self.vision_config = vision_config or VisionConfig()
|
| 41 |
self.model_path = self.vision_config.model_path
|
|
|
|
| 61 |
"""Load model and processor onto the selected device."""
|
| 62 |
try:
|
| 63 |
logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
|
| 64 |
+
self.processor = AutoProcessor.from_pretrained(self.model_path) # type: ignore[no-untyped-call]
|
| 65 |
|
| 66 |
# Select dtype depending on device
|
| 67 |
if self.device == "cuda":
|
|
|
|
| 71 |
else:
|
| 72 |
dtype = torch.float32
|
| 73 |
|
| 74 |
+
model_kwargs: Dict[str, Any] = {"dtype": dtype}
|
| 75 |
|
| 76 |
# flash_attention_2 is CUDA-only; skip on MPS/CPU
|
| 77 |
if self.device == "cuda":
|
| 78 |
model_kwargs["_attn_implementation"] = "flash_attention_2"
|
| 79 |
|
| 80 |
# Load model weights
|
| 81 |
+
self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device) # type: ignore[arg-type]
|
| 82 |
|
| 83 |
+
if self.model is not None:
|
| 84 |
+
self.model.eval()
|
| 85 |
self._initialized = True
|
| 86 |
return True
|
| 87 |
|
|
|
|
| 91 |
|
| 92 |
def process_image(
|
| 93 |
self,
|
| 94 |
+
cv2_image: NDArray[np.uint8],
|
| 95 |
prompt: str = "Briefly describe what you see in one sentence.",
|
| 96 |
) -> str:
|
| 97 |
"""Process CV2 image and return description with retry logic."""
|
| 98 |
+
if not self._initialized or self.processor is None or self.model is None:
|
| 99 |
return "Vision model not initialized"
|
| 100 |
|
| 101 |
for attempt in range(self.vision_config.max_retries):
|
|
|
|
| 207 |
class VisionManager:
|
| 208 |
"""Manages periodic vision processing and scene understanding."""
|
| 209 |
|
| 210 |
+
def __init__(self, camera: Any, vision_config: VisionConfig | None = None):
|
| 211 |
"""Initialize vision manager with camera and configuration."""
|
| 212 |
self.camera = camera
|
| 213 |
self.vision_config = vision_config or VisionConfig()
|
| 214 |
self.vision_interval = self.vision_config.vision_interval
|
| 215 |
self.processor = VisionProcessor(self.vision_config)
|
| 216 |
|
| 217 |
+
self._last_processed_time = 0.0
|
| 218 |
self._stop_event = threading.Event()
|
| 219 |
+
self._thread: threading.Thread | None = None
|
| 220 |
|
| 221 |
# Initialize processor
|
| 222 |
if not self.processor.initialize():
|
|
|
|
| 247 |
frame = self.camera.get_latest_frame()
|
| 248 |
if frame is not None:
|
| 249 |
description = self.processor.process_image(
|
| 250 |
+
frame, "Briefly describe what you see in one sentence.",
|
| 251 |
)
|
| 252 |
|
| 253 |
# Only update if we got a valid response
|
|
|
|
| 276 |
}
|
| 277 |
|
| 278 |
|
| 279 |
+
def initialize_vision_manager(camera_worker: Any) -> VisionManager | None:
|
| 280 |
"""Initialize vision manager with model download and configuration.
|
| 281 |
|
| 282 |
Args:
|
|
|
|
| 320 |
# Log device info
|
| 321 |
device_info = vision_manager.processor.get_model_info()
|
| 322 |
logger.info(
|
| 323 |
+
f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}",
|
| 324 |
)
|
| 325 |
|
| 326 |
return vision_manager
|
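Note on the loading hunk above: the device-dependent choices are easiest to read as one small decision helper. A minimal sketch, assuming the elided CUDA branch picks a half-precision dtype such as torch.bfloat16 (the helper name is ours, not the module's):

import torch
from typing import Any, Dict

def pick_model_kwargs(device: str) -> Dict[str, Any]:
    """Mirror the diff's dtype/attention selection for a given device string."""
    # Half precision on CUDA; full float32 on MPS/CPU (CUDA dtype is an assumption).
    dtype = torch.bfloat16 if device == "cuda" else torch.float32
    kwargs: Dict[str, Any] = {"dtype": dtype}
    if device == "cuda":
        # flash_attention_2 is CUDA-only, as the diff's comment notes.
        kwargs["_attn_implementation"] = "flash_attention_2"
    return kwargs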
src/{reachy_mini_conversation_demo → reachy_mini_conversation_app}/vision/yolo_head_tracker.py
RENAMED

@@ -1,16 +1,17 @@
 from __future__ import annotations
 import logging
-from typing import Tuple
+from typing import Tuple

 import numpy as np
+from numpy.typing import NDArray


 try:
     from supervision import Detections
-    from ultralytics import YOLO
+    from ultralytics import YOLO  # type: ignore[attr-defined]
 except ImportError as e:
     raise ImportError(
-        "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'"
+        "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'",
     ) from e
 from huggingface_hub import hf_hub_download

@@ -48,7 +49,7 @@ class HeadTracker:
             logger.error(f"Failed to load YOLO model: {e}")
             raise

-    def _select_best_face(self, detections: Detections) -> int | None:
+    def _select_best_face(self, detections: Detections) -> int | None:
         """Select the best face based on confidence and area (largest face with highest confidence).

         Args:
@@ -61,6 +62,10 @@ class HeadTracker:
         if detections.xyxy.shape[0] == 0:
             return None

+        # Check if confidence is available
+        if detections.confidence is None:
+            return None
+
         # Filter by confidence threshold
         valid_mask = detections.confidence >= self.confidence_threshold
         if not np.any(valid_mask):
@@ -78,9 +83,9 @@ class HeadTracker:

         # Return index of best face
         best_idx = valid_indices[np.argmax(scores)]
-        return best_idx
+        return int(best_idx)

-    def _bbox_to_mp_coords(self, bbox: np.ndarray, w: int, h: int) -> np.ndarray:
+    def _bbox_to_mp_coords(self, bbox: NDArray[np.float32], w: int, h: int) -> NDArray[np.float32]:
         """Convert bounding box center to MediaPipe-style coordinates [-1, 1].

         Args:
@@ -101,7 +106,7 @@ class HeadTracker:

         return np.array([norm_x, norm_y], dtype=np.float32)

-    def get_head_position(self, img: np.ndarray) -> Tuple[np.ndarray | None, float | None]:
+    def get_head_position(self, img: NDArray[np.uint8]) -> Tuple[NDArray[np.float32] | None, float | None]:
         """Get head position from face detection.

         Args:
@@ -125,9 +130,10 @@ class HeadTracker:
             return None, None

         bbox = detections.xyxy[face_idx]
-        confidence = detections.confidence[face_idx]

-        logger.debug(f"Face detected with confidence: {confidence:.2f}")
+        if detections.confidence is not None:
+            confidence = detections.confidence[face_idx]
+            logger.debug(f"Face detected with confidence: {confidence:.2f}")

         # Get face center in [-1, 1] coordinates
         face_center = self._bbox_to_mp_coords(bbox, w, h)
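For reference, the [-1, 1] mapping that _bbox_to_mp_coords returns (its body sits outside these hunks) follows the usual center-normalization formula; a sketch under that assumption, with a hypothetical name:

import numpy as np
from numpy.typing import NDArray

def bbox_center_to_mp(bbox: NDArray[np.float32], w: int, h: int) -> NDArray[np.float32]:
    """Map a pixel-space xyxy box center into MediaPipe-style [-1, 1] coords."""
    x1, y1, x2, y2 = bbox
    cx = (x1 + x2) / 2.0  # box center, pixels
    cy = (y1 + y2) / 2.0
    norm_x = 2.0 * cx / w - 1.0  # 0..w  ->  -1..1
    norm_y = 2.0 * cy / h - 1.0  # 0..h  ->  -1..1
    return np.array([norm_x, norm_y], dtype=np.float32)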
tests/audio/test_head_wobbler.py
CHANGED

@@ -4,11 +4,12 @@ import math
 import time
 import base64
 import threading
-from typing import List, Tuple
+from typing import Any, List, Tuple
+from collections.abc import Callable

 import numpy as np

-from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler


 def _make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0) -> str:
@@ -74,7 +75,7 @@ def test_reset_allows_future_offsets() -> None:
     wobbler.stop()


-def test_reset_during_inflight_chunk_keeps_worker(monkeypatch) -> None:
+def test_reset_during_inflight_chunk_keeps_worker(monkeypatch: Any) -> None:
     """Simulate reset during chunk processing to ensure the worker survives."""
     wobbler, captured = _start_wobbler()
     ready = threading.Event()
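_make_audio_chunk (signature visible in the first hunk) builds base64-encoded test audio; a plausible, self-contained version of such a helper, with the sample rate and PCM format as assumptions:

import base64
import numpy as np

def make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0, sample_rate: int = 16000) -> str:
    """Synthesize a sine tone as 16-bit PCM and base64-encode it."""
    t = np.arange(int(duration_s * sample_rate)) / sample_rate
    pcm = (np.sin(2.0 * np.pi * frequency_hz * t) * 32767.0).astype(np.int16)
    return base64.b64encode(pcm.tobytes()).decode("ascii")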
tests/test_openai_realtime.py
ADDED

@@ -0,0 +1,117 @@
+import asyncio
+import logging
+from typing import Any
+from datetime import datetime, timezone
+from unittest.mock import MagicMock
+
+import pytest
+
+import reachy_mini_conversation_app.openai_realtime as rt_mod
+from reachy_mini_conversation_app.tools import ToolDependencies
+from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
+
+
+def _build_handler(loop: asyncio.AbstractEventLoop) -> OpenaiRealtimeHandler:
+    asyncio.set_event_loop(loop)
+    deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
+    return OpenaiRealtimeHandler(deps)
+
+
+def test_format_timestamp_uses_wall_clock() -> None:
+    """Test that format_timestamp uses wall clock time."""
+    loop = asyncio.new_event_loop()
+    try:
+        print("Testing format_timestamp...")
+        handler = _build_handler(loop)
+        formatted = handler.format_timestamp()
+        print(f"Formatted timestamp: {formatted}")
+    finally:
+        asyncio.set_event_loop(None)
+        loop.close()
+
+    # Extract year from "[YYYY-MM-DD ...]"
+    year = int(formatted[1:5])
+    assert year == datetime.now(timezone.utc).year
+
+@pytest.mark.asyncio
+async def test_start_up_retries_on_abrupt_close(monkeypatch: Any, caplog: Any) -> None:
+    """First connection dies with ConnectionClosedError during iteration -> retried.
+
+    Second connection iterates cleanly (no events) -> start_up returns without raising.
+    Ensures handler clears self.connection at the end.
+    """
+    caplog.set_level(logging.WARNING)
+
+    # Use a local Exception as the module's ConnectionClosedError to avoid ws dependency
+    FakeCCE = type("FakeCCE", (Exception,), {})
+    monkeypatch.setattr(rt_mod, "ConnectionClosedError", FakeCCE)
+
+    # Make asyncio.sleep return immediately (for backoff)
+    async def _fast_sleep(*_a: Any, **_kw: Any) -> None: return None
+    monkeypatch.setattr(asyncio, "sleep", _fast_sleep, raising=False)
+
+    attempt_counter = {"n": 0}
+
+    class FakeConn:
+        """Minimal realtime connection stub."""
+
+        def __init__(self, mode: str):
+            self._mode = mode
+
+            class _Session:
+                async def update(self, **_kw: Any) -> None: return None
+            self.session = _Session()
+
+            class _InputAudioBuffer:
+                async def append(self, **_kw: Any) -> None: return None
+            self.input_audio_buffer = _InputAudioBuffer()
+
+            class _Item:
+                async def create(self, **_kw: Any) -> None: return None
+
+            class _Conversation:
+                item = _Item()
+            self.conversation = _Conversation()
+
+            class _Response:
+                async def create(self, **_kw: Any) -> None: return None
+                async def cancel(self, **_kw: Any) -> None: return None
+            self.response = _Response()
+
+        async def __aenter__(self) -> "FakeConn": return self
+        async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return False
+        async def close(self) -> None: return None
+
+        # Async iterator protocol
+        def __aiter__(self) -> "FakeConn": return self
+        async def __anext__(self) -> None:
+            if self._mode == "raise_on_iter":
+                raise FakeCCE("abrupt close (simulated)")
+            raise StopAsyncIteration  # clean exit (no events)
+
+    class FakeRealtime:
+        def connect(self, **_kw: Any) -> FakeConn:
+            attempt_counter["n"] += 1
+            mode = "raise_on_iter" if attempt_counter["n"] == 1 else "clean"
+            return FakeConn(mode)
+
+    class FakeClient:
+        def __init__(self, **_kw: Any) -> None: self.realtime = FakeRealtime()
+
+    # Patch the OpenAI client used by the handler
+    monkeypatch.setattr(rt_mod, "AsyncOpenAI", FakeClient)
+
+    # Build handler with minimal deps
+    deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
+    handler = rt_mod.OpenaiRealtimeHandler(deps)
+
+    # Run: should retry once and exit cleanly
+    await handler.start_up()
+
+    # Validate: two attempts total (fail -> retry -> succeed), and connection cleared
+    assert attempt_counter["n"] == 2
+    assert handler.connection is None
+
+    # Optional: confirm we logged the unexpected close once
+    warnings = [r for r in caplog.records if r.levelname == "WARNING" and "closed unexpectedly" in r.msg]
+    assert len(warnings) == 1
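The retry test above leans on one pattern worth isolating: an async iterator that fails once with the patched exception type, driven by a loop that retries. A stripped-down, runnable illustration (all names hypothetical, not the handler's API):

import asyncio

class FakeClosed(Exception):
    """Stands in for the patched ConnectionClosedError."""

class FlakySource:
    """Async iterator: raises on the first run, ends cleanly afterwards."""
    def __init__(self, fail: bool) -> None:
        self._fail = fail
    def __aiter__(self) -> "FlakySource":
        return self
    async def __anext__(self) -> None:
        if self._fail:
            raise FakeClosed("simulated abrupt close")
        raise StopAsyncIteration

async def run_with_retry() -> int:
    attempts = 0
    while True:
        attempts += 1
        try:
            async for _ in FlakySource(fail=(attempts == 1)):
                pass
            return attempts  # clean iteration -> done
        except FakeClosed:
            await asyncio.sleep(0)  # backoff elided, like the test's _fast_sleep

assert asyncio.run(run_with_retry()) == 2  # fail -> retry -> succeed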
uv.lock
CHANGED

The diff for this file is too large to render. See raw diff.