diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..80459d04ffc3e87a799b402f78490ad96a73883f --- /dev/null +++ b/.env.example @@ -0,0 +1,15 @@ +OPENAI_API_KEY= +MODEL_NAME="gpt-realtime" + +# Local vision model (only used with --local-vision CLI flag) +# By default, vision is handled by gpt-realtime when the camera tool is used +LOCAL_VISION_MODEL=HuggingFaceTB/SmolVLM2-2.2B-Instruct + +# Cache for local VLM (only used with --local-vision CLI flag) +HF_HOME=./cache + +# Hugging Face token for accessing datasets/models +HF_TOKEN= + +# To select a specific profile with custom instructions and tools, to be placed in profiles//__init__.py +REACHY_MINI_CUSTOM_PROFILE="example" \ No newline at end of file diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e27d20e50f1bb355c9c7092c0dbb7c1bbb27809c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,44 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs 
diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +# Macro for all binary files that should use Git LFS. +[attr]lfs -text filter=lfs diff=lfs merge=lfs + +# Image +*.jpg lfs +*.jpeg lfs +*.png lfs +*.apng lfs +*.atsc lfs +*.gif lfs +*.bmp lfs +*.exr lfs +*.tga lfs +*.tiff lfs +*.tif lfs +*.iff lfs +*.pict lfs +*.dds lfs +*.xcf lfs +*.leo lfs +*.kra lfs +*.kpp lfs +*.clip lfs +*.webm lfs +*.webp lfs +*.svg lfs +*.svgz lfs +*.psd lfs +*.afphoto lfs +*.afdesign lfs +# Models +*.pth lfs +# Binaries +*.bin lfs +*.pkl lfs +*.pckl lfs +# 3D +*.ply lfs +*.vis lfs +*.db lfs +*.ply lfs +docs/assets/reachy_mini_dance.gif filter=lfs diff=lfs merge=lfs -text +src/reachy_mini_conversation_app/images/reachymini_avatar.png filter=lfs diff=lfs merge=lfs -text +src/reachy_mini_conversation_app/images/user_avatar.png filter=lfs diff=lfs merge=lfs -text diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..313d39de42a22b3eb11c54f34d99d372803e0d8b --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,36 @@ +## Summary + + +## Category +- [ ] Fix +- [ ] Feature +- [ ] Refactor +- [ ] Docs +- [ ] CI/CD +- [ ] Other + +## Check before merging +### Basic +- [ ] CI green (Ruff, Tests, Mypy) +- [ ] Code update is clear (types, docs, comments) + +### Run modes +- [ ] Headless mode (default) +- [ ] Gradio UI (`--gradio`) +- [ ] Everything is tested in simulation as well (`--gradio` required) + +### Vision / motion +- [ ] 
Local vision (`--local-vision`) +- [ ] YOLO or MediaPipe head tracker (`--head-tracker {yolo,mediapipe}`) +- [ ] Camera pipeline (with/without `--no-camera`) +- [ ] Movement manager (dances, emotions, head motion) +- [ ] Head wobble +- [ ] Profiles or custom tools + +### Dependencies & config +- [ ] Updated `pyproject.toml` if deps/extras changed +- [ ] Regenerated `uv.lock` if deps changed +- [ ] Updated `.env.example` if new config vars added + +## Notes + diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000000000000000000000000000000..113bcca5205e66435627927d313c5d9731cfff97 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,10 @@ +name: Ruff +on: [ push, pull_request ] +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/ruff-action@v3 + with: + version: "0.12.0" \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..5c31b6c066f2b9198e64df7f3b2a2b90ded83ac2 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,74 @@ +name: Tests +on: + push: + pull_request: + +permissions: + contents: read + actions: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + tests: + name: pytest (py${{ matrix.python-version }}) + runs-on: ubuntu-latest + timeout-minutes: 15 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_ETAG_TIMEOUT: "120" + HF_HUB_DOWNLOAD_TIMEOUT: "120" + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - uses: astral-sh/setup-uv@v5 + + - name: Set HF_HOME + shell: bash + run: | + echo "HF_HOME=${RUNNER_TEMP}/.hf" >> "$GITHUB_ENV" + mkdir -p "${RUNNER_TEMP}/.hf" + + - name: Cache Hugging Face hub + uses: actions/cache@v4 + 
with: + path: ${{ runner.temp }}/.hf + key: hf-${{ runner.os }}-${{ hashFiles('uv.lock', 'pyproject.toml') }} + restore-keys: hf-${{ runner.os }}- + + # test-only .env file + - name: Create test .env + run: | + printf "OPENAI_API_KEY=test-dummy\n" > .env + + - name: Install (locked) + run: | + uv sync --frozen --group dev --extra all_vision + + # Prefetch HF dataset to avoid download during test collection + - name: Prefetch HF dataset + run: | + .venv/bin/python - <<'PY' + from huggingface_hub import snapshot_download + snapshot_download( + repo_id="pollen-robotics/reachy-mini-emotions-library", + repo_type="dataset", + etag_timeout=120, + max_workers=4, + ) + PY + + - name: Run tests + run: .venv/bin/pytest -q diff --git a/.github/workflows/typecheck.yml b/.github/workflows/typecheck.yml new file mode 100644 index 0000000000000000000000000000000000000000..2a6c469ee508af673fa90ee9026fac3e69004768 --- /dev/null +++ b/.github/workflows/typecheck.yml @@ -0,0 +1,29 @@ +name: Type check + +on: [push, pull_request] + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + mypy: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - uses: astral-sh/setup-uv@v5 + + - name: Install deps (locked) incl. vision extras + run: uv sync --frozen --group dev --extra all_vision + + - name: Run mypy + run: .venv/bin/mypy --pretty --show-error-codes . 
\ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..051c7ead4246d3c857e1f2be39673d03a2c0462c --- /dev/null +++ b/.gitignore @@ -0,0 +1,61 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so + +# Virtual environments +.venv/ +venv/ +ENV/ +env/ + +# Environment variables +.env + +# Build and distribution +build/ +dist/ +*.egg-info/ +.eggs/ + +# Testing +.pytest_cache/ +.coverage +.hypothesis/ +htmlcov/ +coverage.xml +*.cover + +# Linting and formatting +.ruff_cache/ +.mypy_cache/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Security +*.key +*.pem +*.crt +*.csr + +# Temporary files +tmp/ +*.log +cache/ + +# macOS +.DS_Store + +# Linux +*~ +.directory +.Trash-* +.nfs* + +# User-created personalities (managed by UI) +src/reachy_mini_conversation_app/profiles/user_personalities/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 23a4755f69cfdd224e54a3bb0aa980a803ec286e..ce64ce443c402d2ce42e294d1edae3e3afd95bae 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,259 @@ --- -title: Reachy Vibe Coder -emoji: ๐Ÿ‘€ +title: Reachy the Vibe Coder +emoji: ๐ŸŽจ colorFrom: blue colorTo: purple sdk: gradio -sdk_version: 6.2.0 +sdk_version: "5.9.1" app_file: app.py pinned: false +short_description: Voice-activated coding with Reachy Mini! +license: apache-2.0 +tags: + - reachy_mini + - reachy_mini_python_app + - vibe-coding + - cursor + - robotics + - voice-assistant --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# ๐ŸŽจ Reachy the Vibe Coder + +**Voice-activated coding with Reachy Mini!** Tell Reachy what to build, and watch it come to life in Cursor IDE. + +> "Reachy, build me a website" โ†’ Reachy decides the colors, layout, animations, everything, and codes it for you! + +Conversational app for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, choreographed motion libraries, and **Cursor IDE integration for vibe coding**. + +![Reachy Mini Dance](docs/assets/reachy_mini_dance.gif) + +## Architecture + +The app follows a layered architecture connecting the user, AI services, and robot hardware: + +
+<div align="center">
+  <!-- NOTE(review): image tag was garbled in this patch; path below assumed from docs/assets/ usage elsewhere — confirm -->
+  <img src="docs/assets/architecture.png" alt="Architecture Diagram">
+</div>
+ +## ๐Ÿš€ Vibe Coding Features + +| Tool | Description | +|------|-------------| +| `vibe_code` | ๐ŸŽจ Tell Reachy to build something - it decides ALL the details (tech stack, colors, layout, animations) and sends a detailed prompt to Cursor! | +| `vibe_big_project` | ๐Ÿš€ For epic builds - entire apps, refactors, multi-file projects using Cursor's Agent mode | + +**How it works:** +1. Say "Reachy, build me a website" +2. Reachy becomes your **creative director** - deciding colors, fonts, layout, tech stack +3. Opens Cursor IDE with a new window +4. Types a detailed prompt and submits it +5. Cursor's AI starts coding! + +## Overview +- Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming. +- Vision processing uses gpt-realtime by default (when camera tool is used), with optional local vision processing using SmolVLM2 model running on-device (CPU/GPU/MPS) via `--local-vision` flag. +- Layered motion system queues primary moves (dances, emotions, goto poses, breathing) while blending speech-reactive wobble and face-tracking. +- Async tool dispatch integrates robot motion, camera capture, and optional face-tracking capabilities through a Gradio web UI with live transcripts. + +## Installation + +> [!IMPORTANT] +> Before using this app, you need to install [Reachy Mini's SDK](https://github.com/pollen-robotics/reachy_mini/).
+> Windows support is currently experimental and has not been extensively tested. Use with caution. + +### Using uv +You can set up the project quickly using [uv](https://docs.astral.sh/uv/): + +```bash +uv venv --python 3.12.1 # Create a virtual environment with Python 3.12.1 +source .venv/bin/activate +uv sync +``` + +> [!NOTE] +> To reproduce the exact dependency set from this repo's `uv.lock`, run `uv sync` with `--locked` (or `--frozen`). This ensures `uv` installs directly from the lockfile without re-resolving or updating any versions. + +To include optional dependencies: +``` +uv sync --extra reachy_mini_wireless # For wireless Reachy Mini with GStreamer support +uv sync --extra local_vision # For local PyTorch/Transformers vision +uv sync --extra yolo_vision # For YOLO-based vision +uv sync --extra mediapipe_vision # For MediaPipe-based vision +uv sync --extra all_vision # For all vision features +``` + +You can combine extras or include dev dependencies: +``` +uv sync --extra all_vision --group dev +``` + +### Using pip + +```bash +python -m venv .venv # Create a virtual environment +source .venv/bin/activate +pip install -e . +``` + +Install optional extras depending on the feature set you need: + +```bash +# Wireless Reachy Mini support +pip install -e .[reachy_mini_wireless] + +# Vision stacks (choose at least one if you plan to run face tracking) +pip install -e .[local_vision] +pip install -e .[yolo_vision] +pip install -e .[mediapipe_vision] +pip install -e .[all_vision] # installs every vision extra + +# Tooling for development workflows +pip install -e .[dev] +``` + +Some wheels (e.g. PyTorch) are large and require compatible CUDA or CPU buildsโ€”make sure your platform matches the binaries pulled in by each extra. + +## Optional dependency groups + +| Extra | Purpose | Notes | +|-------|---------|-------| +| `reachy_mini_wireless` | Wireless Reachy Mini with GStreamer support. 
| Required for wireless versions of Reachy Mini, includes GStreamer dependencies. +| `local_vision` | Run the local VLM (SmolVLM2) through PyTorch/Transformers. | GPU recommended; ensure compatible PyTorch builds for your platform. +| `yolo_vision` | YOLOv8 tracking via `ultralytics` and `supervision`. | CPU friendly; supports the `--head-tracker yolo` option. +| `mediapipe_vision` | Lightweight landmark tracking with MediaPipe. | Works on CPU; enables `--head-tracker mediapipe`. +| `all_vision` | Convenience alias installing every vision extra. | Install when you want the flexibility to experiment with every provider. +| `dev` | Developer tooling (`pytest`, `ruff`). | Add on top of either base or `all_vision` environments. + +## Configuration + +1. Copy `.env.example` to `.env`. +2. Fill in the required values, notably the OpenAI API key. + +| Variable | Description | +|----------|-------------| +| `OPENAI_API_KEY` | Required. Grants access to the OpenAI realtime endpoint. +| `MODEL_NAME` | Override the realtime model (defaults to `gpt-realtime`). Used for both conversation and vision (unless `--local-vision` flag is used). +| `HF_HOME` | Cache directory for local Hugging Face downloads (only used with `--local-vision` flag, defaults to `./cache`). +| `HF_TOKEN` | Optional token for Hugging Face models (only used with `--local-vision` flag, falls back to `huggingface-cli login`). +| `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (only used with `--local-vision` flag, defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`). + +## Running the app + +Activate your virtual environment, ensure the Reachy Mini robot (or simulator) is reachable, then launch: + +```bash +reachy-mini-conversation-app +``` + +By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). 
With a camera attached, vision is handled by the gpt-realtime model when the camera tool is used. For local vision processing, use the `--local-vision` flag to process frames periodically using the SmolVLM2 model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed. + +### CLI options + +| Option | Default | Description | +|--------|---------|-------------| +| `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. YOLO is implemented locally, MediaPipe comes from the `reachy_mini_toolbox` package. Requires the matching optional extra. | +| `--no-camera` | `False` | Run without camera capture or face tracking. | +| `--local-vision` | `False` | Use local vision model (SmolVLM2) for periodic image processing instead of gpt-realtime vision. Requires `local_vision` extra to be installed. | +| `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. | +| `--debug` | `False` | Enable verbose logging for troubleshooting. | +| `--wireless-version` | `False` | Use GStreamer backend for wireless version of the robot. Requires `reachy_mini_wireless` extra to be installed. + + +### Examples +- Run on hardware with MediaPipe face tracking: + + ```bash + reachy-mini-conversation-app --head-tracker mediapipe + ``` + +- Run with local vision processing (requires `local_vision` extra): + + ```bash + reachy-mini-conversation-app --local-vision + ``` + +- Run with wireless support (requires `reachy_mini_wireless` extra and daemon started with `--wireless-version`): + + ```bash + reachy-mini-conversation-app --wireless-version + ``` + +- Disable the camera pipeline (audio-only conversation): + + ```bash + reachy-mini-conversation-app --no-camera + ``` + +### Troubleshooting + +- Timeout error: +If you get an error like this: + ```bash + TimeoutError: Timeout while waiting for connection with the server. 
+ ``` +It probably means that the Reachy Mini's daemon isn't running. Install [Reachy Mini's SDK](https://github.com/pollen-robotics/reachy_mini/) and start the daemon. + +## LLM tools exposed to the assistant + +| Tool | Action | Dependencies | +|------|--------|--------------| +| `move_head` | Queue a head pose change (left/right/up/down/front). | Core install only. | +| `camera` | Capture the latest camera frame and send it to gpt-realtime for vision analysis. | Requires camera worker; uses gpt-realtime vision by default. | +| `head_tracking` | Enable or disable face-tracking offsets (not facial recognition - only detects and tracks face position). | Camera worker with configured head tracker. | +| `dance` | Queue a dance from `reachy_mini_dances_library`. | Core install only. | +| `stop_dance` | Clear queued dances. | Core install only. | +| `play_emotion` | Play a recorded emotion clip via Hugging Face assets. | Needs `HF_TOKEN` for the recorded emotions dataset. | +| `stop_emotion` | Clear queued emotions. | Core install only. | +| `do_nothing` | Explicitly remain idle. | Core install only. | + +## Using custom profiles +Create custom profiles with dedicated instructions and enabled tools! + +Set `REACHY_MINI_CUSTOM_PROFILE=` to load `src/reachy_mini_conversation_app/profiles//` (see `.env.example`). If unset, the `default` profile is used. + +Each profile requires two files: `instructions.txt` (prompt text) and `tools.txt` (list of allowed tools), and optionally contains custom tools implementations. + +### Custom instructions +Write plain-text prompts in `instructions.txt`. To reuse shared prompt pieces, add lines like: +``` +[passion_for_lobster_jokes] +[identities/witty_identity] +``` +Each placeholder pulls the matching file under `src/reachy_mini_conversation_app/prompts/` (nested paths allowed). See `src/reachy_mini_conversation_app/profiles/example/` for a reference layout. 
+ +### Enabling tools +List enabled tools in `tools.txt`, one per line; prefix with `#` to comment out. For example: + +``` +play_emotion +# move_head + +# My custom tool defined locally +sweep_look +``` +Tools are resolved first from Python files in the profile folder (custom tools), then from the shared library `src/reachy_mini_conversation_app/tools/` (e.g., `dance`, `head_tracking`). + +### Custom tools +On top of built-in tools found in the shared library, you can implement custom tools specific to your profile by adding Python files in the profile folder. +Custom tools must subclass `reachy_mini_conversation_app.tools.core_tools.Tool` (see `profiles/example/sweep_look.py`). + +### Edit personalities from the UI +When running with `--gradio`, open the โ€œPersonalityโ€ accordion: +- Select among available profiles (folders under `src/reachy_mini_conversation_app/profiles/`) or the builtโ€‘in default. +- Click โ€œApplyโ€ to update the current session instructions live. +- Create a new personality by entering a name and instructions text; it stores files under `profiles//` and copies `tools.txt` from the `default` profile. + +Note: The โ€œPersonalityโ€ panel updates the conversation instructions. Tool sets are loaded at startup from `tools.txt` and are not hotโ€‘reloaded. + + + + +## Development workflow +- Install the dev group extras: `uv sync --group dev` or `pip install -e .[dev]`. +- Run formatting and linting: `ruff check .`. +- Execute the test suite: `pytest`. +- When iterating on robot motions, keep the control loop responsive => offload blocking work using the helpers in `tools.py`. + +## License +Apache 2.0 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..9c7be2196182eed3c33247e07c880dbf95bdb883 --- /dev/null +++ b/app.py @@ -0,0 +1,194 @@ +"""Hugging Face Spaces entry point for Reachy Mini Conversation App. 
+ +This Space serves as documentation and installation hub for the Reachy Mini +Conversation App - a voice-activated coding assistant for the Reachy Mini robot. +""" + +import gradio as gr + + +DESCRIPTION = """ +# ๐ŸŽจ Reachy the Vibe Coder + +**Voice-activated coding with Reachy Mini!** Tell Reachy what to build, and watch it come to life in Cursor IDE. + +> "Reachy, build me a website" โ†’ Reachy decides the colors, layout, animations, everything, and codes it for you! + +## ๐Ÿš€ Quick Install + +```bash +pip install reachy-mini-conversation-app +``` + +Or install from this Space: + +```bash +pip install git+https://huggingface.co/spaces/robertkeus/reachy-vibe-coder +``` + +## ๐Ÿ“‹ Requirements + +- Python 3.10+ +- [Reachy Mini SDK](https://github.com/pollen-robotics/reachy_mini/) installed +- OpenAI API key (for realtime conversation) +- Reachy Mini robot (hardware or simulator) + +## ๐ŸŽฎ Running the App + +```bash +# Activate your environment +source .venv/bin/activate + +# Run with Gradio UI +reachy-mini-conversation-app --gradio + +# Run with face tracking +reachy-mini-conversation-app --head-tracker mediapipe + +# Audio only (no camera) +reachy-mini-conversation-app --no-camera +``` + +## โœจ Features + +| Tool | Description | +|------|-------------| +| `vibe_code` | ๐ŸŽจ Tell Reachy to build something - it decides ALL the details! | +| `vibe_big_project` | ๐Ÿš€ For epic builds using Cursor's Agent mode | +| `dance` | ๐Ÿ’ƒ Queue choreographed dances | +| `play_emotion` | ๐Ÿ˜Š Play recorded emotion clips | +| `camera` | ๐Ÿ“ท Capture and analyze camera frames | +| `head_tracking` | ๐Ÿ‘€ Enable/disable face tracking | + +## ๐Ÿ—๏ธ Architecture + +The app combines: +- **OpenAI Realtime API** for voice conversation +- **FastRTC** for low-latency audio streaming +- **Gradio** for the web interface +- **Reachy Mini SDK** for robot control +- **Cursor IDE integration** for vibe coding + +--- + +*This Space provides installation and documentation. 
The actual app runs locally with your Reachy Mini robot.* +""" + +INSTALL_INSTRUCTIONS = """ +## ๐Ÿ“ฆ Installation Methods + +### Using uv (recommended) + +```bash +git clone https://huggingface.co/spaces/robertkeus/reachy-vibe-coder +cd reachy-mini-conversation-app + +uv venv --python 3.12.1 +source .venv/bin/activate +uv sync +``` + +### Using pip + +```bash +git clone https://huggingface.co/spaces/robertkeus/reachy-vibe-coder +cd reachy-mini-conversation-app + +python -m venv .venv +source .venv/bin/activate +pip install -e . +``` + +### Optional Dependencies + +```bash +# Wireless Reachy Mini support +pip install -e .[reachy_mini_wireless] + +# Vision options +pip install -e .[local_vision] # PyTorch/Transformers +pip install -e .[yolo_vision] # YOLO tracking +pip install -e .[mediapipe_vision] # MediaPipe +pip install -e .[all_vision] # Everything +``` + +## โš™๏ธ Configuration + +1. Copy `.env.example` to `.env` +2. Add your OpenAI API key: + +```env +OPENAI_API_KEY=your-key-here +``` + +## ๐Ÿ”ง Troubleshooting + +**Timeout error?** +Make sure the Reachy Mini daemon is running: +```bash +# Install and start the SDK first +# See: https://github.com/pollen-robotics/reachy_mini/ +``` +""" + + +def create_demo(): + """Create the Gradio demo interface.""" + with gr.Blocks( + title="Reachy the Vibe Coder", + theme=gr.themes.Soft( + primary_hue="blue", + secondary_hue="purple", + ), + css=""" + .main-header { + background: linear-gradient(135deg, #00d4aa 0%, #7c3aed 100%); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + } + .install-box { + background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); + border-radius: 12px; + padding: 20px; + } + footer { + visibility: hidden; + } + """ + ) as demo: + gr.Markdown(DESCRIPTION) + + with gr.Accordion("๐Ÿ“ฆ Detailed Installation Guide", open=False): + gr.Markdown(INSTALL_INSTRUCTIONS) + + with gr.Accordion("๐ŸŽฌ Demo Video", open=False): + gr.Markdown(""" + *Coming 
soon: Video demonstration of Reachy the Vibe Coder in action!* + + ![Reachy Mini Dance](https://raw.githubusercontent.com/pollen-robotics/reachy_mini/main/docs/assets/reachy_mini_dance.gif) + """) + + with gr.Row(): + gr.Markdown(""" + ### ๐Ÿ”— Links + + - [Reachy Mini SDK](https://github.com/pollen-robotics/reachy_mini/) + - [Pollen Robotics](https://www.pollen-robotics.com/) + """) + + gr.Markdown(""" + ### ๐Ÿ“„ License + + Apache 2.0 + + Made with โค๏ธ by Robert Keus + """) + + return demo + + +if __name__ == "__main__": + demo = create_demo() + demo.launch() + diff --git a/deploy_to_reachy.sh b/deploy_to_reachy.sh new file mode 100644 index 0000000000000000000000000000000000000000..2274fe4822f0a2c963c5af06150f683914ff1d4b --- /dev/null +++ b/deploy_to_reachy.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Deploy Reachy the Vibe Coder to your real Reachy Mini +# Usage: ./deploy_to_reachy.sh + +REACHY_IP="${1:-192.168.1.35}" +REACHY_USER="reachy" + +echo "๐ŸŽจ Deploying Reachy the Vibe Coder to $REACHY_IP..." +echo "" + +# Create a temporary package +echo "๐Ÿ“ฆ Building package..." +cd "$(dirname "$0")" +pip install build -q +python -m build --wheel -q + +# Find the wheel file +WHEEL=$(ls -t dist/*.whl | head -1) +echo "โœ… Built: $WHEEL" + +# Copy to Reachy +echo "" +echo "๐Ÿ“ค Copying to Reachy (you'll need to enter the password)..." +scp "$WHEEL" "${REACHY_USER}@${REACHY_IP}:/tmp/" + +# Install on Reachy +echo "" +echo "๐Ÿ”ง Installing on Reachy..." +WHEEL_NAME=$(basename "$WHEEL") +ssh "${REACHY_USER}@${REACHY_IP}" << EOF + cd /tmp + pip install --upgrade "$WHEEL_NAME" + echo "" + echo "โœ… Installed! Restarting the conversation app..." + # Find and restart the app if it's running + pkill -f reachy_mini_conversation_app 2>/dev/null || true + echo "๐ŸŽ‰ Deployment complete! Go to http://${REACHY_IP}:8000 and turn on the Conversation App" +EOF + +echo "" +echo "๐Ÿš€ Done! Reachy the Vibe Coder is ready!" 
+ diff --git a/docs/assets/conversation_app_arch.svg b/docs/assets/conversation_app_arch.svg new file mode 100644 index 0000000000000000000000000000000000000000..e692afcf47394ab724c4debb4c43327dedff3406 --- /dev/null +++ b/docs/assets/conversation_app_arch.svg @@ -0,0 +1,102 @@ +

audio stream

audio stream

audio stream

tool calls

movement
requests

camera frames, face tracking

image for
analysis

face tracking

frames

description

camera
frames

commands

results

User
Person interacting with system

UI Layer
Gradio/Console

gpt-realtime API
Audio+Tool Calls+Vision

Motion Control
Audio Sync + Tracking

Tool Handlers
move_head, camera, head_tracking,
dance, play_emotion, do_nothing

Camera Worker
Frame Buffer + Face Tracking

Vision Processor
Local VLM (optional)

reachy_mini
Robot Control Library

\ No newline at end of file diff --git a/docs/assets/reachy_mini_dance.gif b/docs/assets/reachy_mini_dance.gif new file mode 100644 index 0000000000000000000000000000000000000000..d801b57f4de4bcba30589b44522f99975ba7bdca --- /dev/null +++ b/docs/assets/reachy_mini_dance.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75914c3cb7af982e0b1c6369e25fc46d8c08a0ab5ad022240ae9c1a0d93967c3 +size 3930400 diff --git a/docs/scheme.mmd b/docs/scheme.mmd new file mode 100644 index 0000000000000000000000000000000000000000..f51745729f171c2f954488050b79a6e4698dc8de --- /dev/null +++ b/docs/scheme.mmd @@ -0,0 +1,58 @@ +--- +config: + layout: dagre + flowchart: + htmlLabels: true +--- +flowchart TB + User(["User
Person interacting with system"]) + -- audio stream --> + UI@{ label: "UI Layer
Gradio/Console" } + + UI -- audio stream --> + OpenAI@{ label: "gpt-realtime API
Audio+Tool Calls+Vision" } + + OpenAI -- audio stream --> + Motion@{ label: "Motion Control
Audio Sync + Tracking" } + + OpenAI -- tool calls --> + Handlers@{ label: "Tool Handlers
move_head, camera, head_tracking,
dance, play_emotion, do_nothing
" } + + Handlers -- movement + requests --> Motion + + Handlers -- camera frames, face tracking --> + Camera@{ label: "Camera Worker
Frame Buffer + Face Tracking" } + + Handlers -. image for + analysis .-> OpenAI + + Camera -- face tracking --> Motion + + Camera -. frames .-> + Vision@{ label: "Vision Processor
Local VLM (optional)" } + + Vision -. description .-> Handlers + + Robot@{ label: "reachy_mini
Robot Control Library" } + -- camera + frames --> Camera + + Motion -- commands --> Robot + + Handlers -- results --> OpenAI + + User:::userStyle + UI:::uiStyle + OpenAI:::aiStyle + Motion:::coreStyle + Handlers:::toolStyle + Camera:::coreStyle + Vision:::aiStyle + Robot:::hardwareStyle + classDef userStyle fill:#e1f5fe,stroke:#01579b,stroke-width:3px + classDef uiStyle fill:#b3e5fc,stroke:#0277bd,stroke-width:2px + classDef aiStyle fill:#e1bee7,stroke:#7b1fa2,stroke-width:3px + classDef coreStyle fill:#fff9c4,stroke:#f57f17,stroke-width:2px + classDef hardwareStyle fill:#ef9a9a,stroke:#c62828,stroke-width:3px + classDef toolStyle fill:#fffde7,stroke:#f9a825,stroke-width:1px diff --git a/index.html b/index.html new file mode 100644 index 0000000000000000000000000000000000000000..935ba228f77f67b4ae7b1cb1ecd838d75dcde979 --- /dev/null +++ b/index.html @@ -0,0 +1,125 @@ + + + + + + + Reachy Mini Conversation App + + + + + + + +
+
+
+ + Reachy Mini +
+
Realtime voice · Vision aware · Expressive motion
+
+
+
+

Conversation App

+

Talk, see, and move together.

+

+ A friendly, camera-aware companion for Reachy Mini. Chat out loud, watch it follow faces, dance, or react with recorded emotions—all while streaming transcripts in a clean web UI. +

+ +
+ Low-latency voice loop + Camera insights on demand + Choreographed dances & emotions + Personality profiles via web UI +
+
+
+
+ Reachy Mini dancing +

Reachy Mini can move, dance, and emote while holding a natural conversation.

+
+
+
+
+ +
+
+

What’s inside

+

All-in-one conversational layer for your robot

+

+ The app blends realtime speech, vision, and motion so Reachy Mini feels present.

+
+
+
+ ๐ŸŽค +

Natural voice chat

+

Talk freely and get fast, high-quality replies powered by realtime models.

+
+
+ ๐ŸŽฅ +

Vision-aware replies

+

Ask the camera tool to see what’s in front, track a face, or keep attention on whoever is speaking.

+
+
+ ๐Ÿ’ƒ +

Expressive motion

+

Queue dances and play recorded emotions while Reachy listens and talks.

+
+
+ ๐Ÿง  +

Personalities on demand

+

Switch conversation styles through profiles and decide which tools (dance, camera, tracking) each persona can use.

+
+
+ ๐ŸŒ +

Ready for your setup

+

Works with wired or wireless Reachy Mini, and can run vision locally or through the default cloud model.

+
+
+
+ +
+
+
+

How it feels

+

From hello to helpful in seconds

+
    +
  • 👋 Say “Hey Reachy” and start chatting—no extra setup in the moment.
  • +
  • 👀 Ask what it sees; it can peek through the camera or keep focus on your face.
  • +
  • 🎭 Trigger emotions or dance breaks to keep the conversation lively.
  • +
  • ๐Ÿ“ Follow along with live transcripts in the web UI or run audio-only from the console.
  • +
+
+
+

Where it shines

+

Great for demos, teaching, and playful exploration

+

Show off how Reachy Mini listens, responds, and moves in sync. Whether you’re guiding a class, hosting a booth, or experimenting at home, the app keeps the robot expressive without juggling scripts or joystick controls.

+
+ Live conversation + Face tracking + Camera tool + Dance library + Profiles & tools +
+
+
+
+ + + + + + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..abe72586ada0150fda5aabcaef9fac14560dbfa6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,123 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "reachy_mini_conversation_app" +version = "0.2.0" # Reachy the Vibe Coder edition! +authors = [{ name = "Pollen Robotics", email = "contact@pollen-robotics.com" }] +description = "" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + #Media + "aiortc>=1.13.0", + "fastrtc>=0.0.34", + "gradio==5.50.1.dev1", + "huggingface_hub>=0.34.4", + "opencv-python>=4.12.0.88", + + #Environment variables + "python-dotenv", + + #OpenAI + "openai>=2.1", + + #Reachy mini + "reachy_mini_dances_library", + "reachy_mini_toolbox", + "reachy_mini>=1.2.3rc1", + "eclipse-zenoh~=1.7.0", + "gradio_client>=1.13.3", +] + +[project.optional-dependencies] +reachy_mini_wireless = [ + "PyGObject>=3.42.2,<=3.46.0", + "gst-signalling>=1.1.2", +] +mcp = ["mcp>=1.0.0", "httpx-sse>=0.4.0"] +local_vision = ["torch", "transformers", "num2words"] +yolo_vision = ["ultralytics", "supervision"] +mediapipe_vision = ["mediapipe==0.10.14"] +all_vision = [ + "torch", "transformers", "num2words", + "ultralytics", "supervision", + "mediapipe==0.10.14", +] + +[dependency-groups] +dev = [ + "pytest", + "pytest-asyncio", + "ruff==0.12.0", + "mypy==1.18.2", + "pre-commit", + "types-requests", +] + +[project.scripts] +reachy-mini-conversation-app = "reachy_mini_conversation_app.main:main" + +[project.entry-points."reachy_mini_apps"] +reachy_mini_conversation_app = "reachy_mini_conversation_app.main:ReachyMiniConversationApp" + +[tool.setuptools] +package-dir = { "" = "src" } +include-package-data = true + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +reachy_mini_conversation_app = [ + "images/*", + "static/*", + ".env.example", + 
"demos/**/*.txt", + "prompts_library/*.txt", + "profiles/**/*.txt", + "prompts/**/*.txt", +] + +[tool.ruff] +line-length = 119 +exclude = [".venv", "dist", "build", "**/__pycache__", "*.egg-info", ".mypy_cache", ".pytest_cache"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "F", # pyflakes + "W", # pycodestyle warnings + "I", # isort + "C4", # flake8-comprehensions + "D", # pydocstyle +] +ignore = [ + "E501", # handled by formatter + "D100", # ignore missing module docstrings + "D203", # blank line before class docstring (conflicts with D211) + "D213", # summary on second line (conflicts with D212) +] + +[tool.ruff.lint.isort] +length-sort = true +lines-after-imports = 2 +no-lines-before = ["standard-library", "local-folder"] +known-local-folder = ["reachy_mini_conversation_app"] +known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"] +split-on-trailing-comma = true + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + +[tool.mypy] +python_version = "3.12" +files = ["src/"] +ignore_missing_imports = true +strict = true +show_error_codes = true +warn_unused_ignores = true diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d5df8c899d71da6c2d193defec62e368dad48f6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +# Hugging Face Spaces requirements +# For the documentation/installation Space UI only +gradio>=5.0.0 + +# Full app dependencies are in pyproject.toml +# Install with: pip install -e . 
diff --git a/src/reachy_mini_conversation_app/__init__.py b/src/reachy_mini_conversation_app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..60dc15c497e49cbeb8687afb06a62339c297e2b5 --- /dev/null +++ b/src/reachy_mini_conversation_app/__init__.py @@ -0,0 +1 @@ +"""Nothing (for ruff).""" diff --git a/src/reachy_mini_conversation_app/audio/__init__.py b/src/reachy_mini_conversation_app/audio/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..60dc15c497e49cbeb8687afb06a62339c297e2b5 --- /dev/null +++ b/src/reachy_mini_conversation_app/audio/__init__.py @@ -0,0 +1 @@ +"""Nothing (for ruff).""" diff --git a/src/reachy_mini_conversation_app/audio/head_wobbler.py b/src/reachy_mini_conversation_app/audio/head_wobbler.py new file mode 100644 index 0000000000000000000000000000000000000000..686197aaa1dc2e6071486de82c784a5e3e01785e --- /dev/null +++ b/src/reachy_mini_conversation_app/audio/head_wobbler.py @@ -0,0 +1,181 @@ +"""Moves head given audio samples.""" + +import time +import queue +import base64 +import logging +import threading +from typing import Tuple +from collections.abc import Callable + +import numpy as np +from numpy.typing import NDArray + +from reachy_mini_conversation_app.audio.speech_tapper import HOP_MS, SwayRollRT + + +SAMPLE_RATE = 24000 +MOVEMENT_LATENCY_S = 0.08 # seconds between audio and robot movement +logger = logging.getLogger(__name__) + + +class HeadWobbler: + """Converts audio deltas (base64) into head movement offsets.""" + + def __init__(self, set_speech_offsets: Callable[[Tuple[float, float, float, float, float, float]], None]) -> None: + """Initialize the head wobbler.""" + self._apply_offsets = set_speech_offsets + self._base_ts: float | None = None + self._hops_done: int = 0 + + self.audio_queue: "queue.Queue[Tuple[int, int, NDArray[np.int16]]]" = queue.Queue() + self.sway = SwayRollRT() + + # Synchronization primitives + self._state_lock = threading.Lock() + self._sway_lock 
= threading.Lock() + self._generation = 0 + + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + def feed(self, delta_b64: str) -> None: + """Thread-safe: push audio into the consumer queue.""" + buf = np.frombuffer(base64.b64decode(delta_b64), dtype=np.int16).reshape(1, -1) + with self._state_lock: + generation = self._generation + self.audio_queue.put((generation, SAMPLE_RATE, buf)) + + def start(self) -> None: + """Start the head wobbler loop in a thread.""" + self._stop_event.clear() + self._thread = threading.Thread(target=self.working_loop, daemon=True) + self._thread.start() + logger.debug("Head wobbler started") + + def stop(self) -> None: + """Stop the head wobbler loop.""" + self._stop_event.set() + if self._thread is not None: + self._thread.join() + logger.debug("Head wobbler stopped") + + def working_loop(self) -> None: + """Convert audio deltas into head movement offsets.""" + hop_dt = HOP_MS / 1000.0 + + logger.debug("Head wobbler thread started") + while not self._stop_event.is_set(): + queue_ref = self.audio_queue + try: + chunk_generation, sr, chunk = queue_ref.get_nowait() # (gen, sr, data) + except queue.Empty: + # avoid while to never exit + time.sleep(MOVEMENT_LATENCY_S) + continue + + try: + with self._state_lock: + current_generation = self._generation + if chunk_generation != current_generation: + continue + + if self._base_ts is None: + with self._state_lock: + if self._base_ts is None: + self._base_ts = time.monotonic() + + pcm = np.asarray(chunk).squeeze(0) + with self._sway_lock: + results = self.sway.feed(pcm, sr) + + i = 0 + while i < len(results): + with self._state_lock: + if self._generation != current_generation: + break + base_ts = self._base_ts + hops_done = self._hops_done + + if base_ts is None: + base_ts = time.monotonic() + with self._state_lock: + if self._base_ts is None: + self._base_ts = base_ts + hops_done = self._hops_done + + target = base_ts + MOVEMENT_LATENCY_S + hops_done * 
hop_dt + now = time.monotonic() + + if now - target >= hop_dt: + lag_hops = int((now - target) / hop_dt) + drop = min(lag_hops, len(results) - i - 1) + if drop > 0: + with self._state_lock: + self._hops_done += drop + hops_done = self._hops_done + i += drop + continue + + if target > now: + time.sleep(target - now) + with self._state_lock: + if self._generation != current_generation: + break + + r = results[i] + offsets = ( + r["x_mm"] / 1000.0, + r["y_mm"] / 1000.0, + r["z_mm"] / 1000.0, + r["roll_rad"], + r["pitch_rad"], + r["yaw_rad"], + ) + + with self._state_lock: + if self._generation != current_generation: + break + + self._apply_offsets(offsets) + + with self._state_lock: + self._hops_done += 1 + i += 1 + finally: + queue_ref.task_done() + logger.debug("Head wobbler thread exited") + + ''' + def drain_audio_queue(self) -> None: + """Empty the audio queue.""" + try: + while True: + self.audio_queue.get_nowait() + except QueueEmpty: + pass + ''' + + def reset(self) -> None: + """Reset the internal state.""" + with self._state_lock: + self._generation += 1 + self._base_ts = None + self._hops_done = 0 + + # Drain any queued audio chunks from previous generations + drained_any = False + while True: + try: + _, _, _ = self.audio_queue.get_nowait() + except queue.Empty: + break + else: + drained_any = True + self.audio_queue.task_done() + + with self._sway_lock: + self.sway.reset() + + if drained_any: + logger.debug("Head wobbler queue drained during reset") diff --git a/src/reachy_mini_conversation_app/audio/speech_tapper.py b/src/reachy_mini_conversation_app/audio/speech_tapper.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bf639586ad88b0a2b6133b50e82d414474c2ab --- /dev/null +++ b/src/reachy_mini_conversation_app/audio/speech_tapper.py @@ -0,0 +1,268 @@ +from __future__ import annotations +import math +from typing import Any, Dict, List +from itertools import islice +from collections import deque + +import numpy as np +from 
numpy.typing import NDArray + + +# Tunables +SR = 16_000 +FRAME_MS = 20 +HOP_MS = 10 + +SWAY_MASTER = 1.5 +SENS_DB_OFFSET = +4.0 +VAD_DB_ON = -35.0 +VAD_DB_OFF = -45.0 +VAD_ATTACK_MS = 40 +VAD_RELEASE_MS = 250 +ENV_FOLLOW_GAIN = 0.65 + +SWAY_F_PITCH = 2.2 +SWAY_A_PITCH_DEG = 4.5 +SWAY_F_YAW = 0.6 +SWAY_A_YAW_DEG = 7.5 +SWAY_F_ROLL = 1.3 +SWAY_A_ROLL_DEG = 2.25 +SWAY_F_X = 0.35 +SWAY_A_X_MM = 4.5 +SWAY_F_Y = 0.45 +SWAY_A_Y_MM = 3.75 +SWAY_F_Z = 0.25 +SWAY_A_Z_MM = 2.25 + +SWAY_DB_LOW = -46.0 +SWAY_DB_HIGH = -18.0 +LOUDNESS_GAMMA = 0.9 +SWAY_ATTACK_MS = 50 +SWAY_RELEASE_MS = 250 + +# Derived +FRAME = int(SR * FRAME_MS / 1000) +HOP = int(SR * HOP_MS / 1000) +ATTACK_FR = max(1, int(VAD_ATTACK_MS / HOP_MS)) +RELEASE_FR = max(1, int(VAD_RELEASE_MS / HOP_MS)) +SWAY_ATTACK_FR = max(1, int(SWAY_ATTACK_MS / HOP_MS)) +SWAY_RELEASE_FR = max(1, int(SWAY_RELEASE_MS / HOP_MS)) + + +def _rms_dbfs(x: NDArray[np.float32]) -> float: + """Root-mean-square in dBFS for float32 mono array in [-1,1].""" + # numerically stable rms (avoid overflow) + x = x.astype(np.float32, copy=False) + rms = np.sqrt(np.mean(x * x, dtype=np.float32) + 1e-12, dtype=np.float32) + return float(20.0 * math.log10(float(rms) + 1e-12)) + + +def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float: + """Normalize dB into [0,1] with gamma; clipped to [0,1].""" + t = (db + offset - SWAY_DB_LOW) / (SWAY_DB_HIGH - SWAY_DB_LOW) + if t < 0.0: + t = 0.0 + elif t > 1.0: + t = 1.0 + return t**LOUDNESS_GAMMA if LOUDNESS_GAMMA != 1.0 else t + + +def _to_float32_mono(x: NDArray[Any]) -> NDArray[np.float32]: + """Convert arbitrary PCM array to float32 mono in [-1,1]. + + Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C). 
+ """ + a = np.asarray(x) + if a.ndim == 0: + return np.zeros(0, dtype=np.float32) + + # If 2D, decide which axis is channels (prefer small first dim) + if a.ndim == 2: + # e.g., (channels, samples) if channels is small (<=8) + if a.shape[0] <= 8 and a.shape[0] <= a.shape[1]: + a = np.mean(a, axis=0) + else: + a = np.mean(a, axis=1) + elif a.ndim > 2: + a = np.mean(a.reshape(a.shape[0], -1), axis=0) + + # Now 1D, cast/scale + if np.issubdtype(a.dtype, np.floating): + return a.astype(np.float32, copy=False) + # integer PCM + info = np.iinfo(a.dtype) + scale = float(max(-info.min, info.max)) + return a.astype(np.float32) / (scale if scale != 0.0 else 1.0) + + +def _resample_linear(x: NDArray[np.float32], sr_in: int, sr_out: int) -> NDArray[np.float32]: + """Lightweight linear resampler for short buffers.""" + if sr_in == sr_out or x.size == 0: + return x + # guard tiny sizes + n_out = int(round(x.size * sr_out / sr_in)) + if n_out <= 1: + return np.zeros(0, dtype=np.float32) + t_in = np.linspace(0.0, 1.0, num=x.size, dtype=np.float32, endpoint=True) + t_out = np.linspace(0.0, 1.0, num=n_out, dtype=np.float32, endpoint=True) + return np.interp(t_out, t_in, x).astype(np.float32, copy=False) + + +class SwayRollRT: + """Feed audio chunks โ†’ per-hop sway outputs. 
+ + Usage: + rt = SwayRollRT() + rt.feed(pcm_int16_or_float, sr) -> List[dict] + """ + + def __init__(self, rng_seed: int = 7): + """Initialize state.""" + self._seed = int(rng_seed) + self.samples: deque[float] = deque(maxlen=10 * SR) # sliding window for VAD/env + self.carry: NDArray[np.float32] = np.zeros(0, dtype=np.float32) + + self.vad_on = False + self.vad_above = 0 + self.vad_below = 0 + + self.sway_env = 0.0 + self.sway_up = 0 + self.sway_down = 0 + + rng = np.random.default_rng(self._seed) + self.phase_pitch = float(rng.random() * 2 * math.pi) + self.phase_yaw = float(rng.random() * 2 * math.pi) + self.phase_roll = float(rng.random() * 2 * math.pi) + self.phase_x = float(rng.random() * 2 * math.pi) + self.phase_y = float(rng.random() * 2 * math.pi) + self.phase_z = float(rng.random() * 2 * math.pi) + self.t = 0.0 + + def reset(self) -> None: + """Reset state (VAD/env/buffers/time) but keep initial phases/seed.""" + self.samples.clear() + self.carry = np.zeros(0, dtype=np.float32) + self.vad_on = False + self.vad_above = 0 + self.vad_below = 0 + self.sway_env = 0.0 + self.sway_up = 0 + self.sway_down = 0 + self.t = 0.0 + + def feed(self, pcm: NDArray[Any], sr: int | None) -> List[Dict[str, float]]: + """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS). + + Args: + pcm: np.ndarray, shape (N,) or (C,N)/(N,C); int or float. + sr: sample rate of `pcm` (None -> assume SR). 
+ + """ + sr_in = SR if sr is None else int(sr) + x = _to_float32_mono(pcm) + if x.size == 0: + return [] + if sr_in != SR: + x = _resample_linear(x, sr_in, SR) + if x.size == 0: + return [] + + # append to carry and consume fixed HOP chunks + if self.carry.size: + self.carry = np.concatenate([self.carry, x]) + else: + self.carry = x + + out: List[Dict[str, float]] = [] + + while self.carry.size >= HOP: + hop = self.carry[:HOP] + remaining: NDArray[np.float32] = self.carry[HOP:] + self.carry = remaining + + # keep sliding window for VAD/env computation + # (deque accepts any iterable; list() for small HOP is fine) + self.samples.extend(hop.tolist()) + if len(self.samples) < FRAME: + self.t += HOP_MS / 1000.0 + continue + + frame = np.fromiter( + islice(self.samples, len(self.samples) - FRAME, len(self.samples)), + dtype=np.float32, + count=FRAME, + ) + db = _rms_dbfs(frame) + + # VAD with hysteresis + attack/release + if db >= VAD_DB_ON: + self.vad_above += 1 + self.vad_below = 0 + if not self.vad_on and self.vad_above >= ATTACK_FR: + self.vad_on = True + elif db <= VAD_DB_OFF: + self.vad_below += 1 + self.vad_above = 0 + if self.vad_on and self.vad_below >= RELEASE_FR: + self.vad_on = False + + if self.vad_on: + self.sway_up = min(SWAY_ATTACK_FR, self.sway_up + 1) + self.sway_down = 0 + else: + self.sway_down = min(SWAY_RELEASE_FR, self.sway_down + 1) + self.sway_up = 0 + + up = self.sway_up / SWAY_ATTACK_FR + down = 1.0 - (self.sway_down / SWAY_RELEASE_FR) + target = up if self.vad_on else down + self.sway_env += ENV_FOLLOW_GAIN * (target - self.sway_env) + # clamp + if self.sway_env < 0.0: + self.sway_env = 0.0 + elif self.sway_env > 1.0: + self.sway_env = 1.0 + + loud = _loudness_gain(db) * SWAY_MASTER + env = self.sway_env + self.t += HOP_MS / 1000.0 + + # oscillators + pitch = ( + math.radians(SWAY_A_PITCH_DEG) + * loud + * env + * math.sin(2 * math.pi * SWAY_F_PITCH * self.t + self.phase_pitch) + ) + yaw = ( + math.radians(SWAY_A_YAW_DEG) + * loud + * env + 
* math.sin(2 * math.pi * SWAY_F_YAW * self.t + self.phase_yaw) + ) + roll = ( + math.radians(SWAY_A_ROLL_DEG) + * loud + * env + * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll) + ) + x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x) + y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y) + z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z) + + out.append( + { + "pitch_rad": pitch, + "yaw_rad": yaw, + "roll_rad": roll, + "pitch_deg": math.degrees(pitch), + "yaw_deg": math.degrees(yaw), + "roll_deg": math.degrees(roll), + "x_mm": x_mm, + "y_mm": y_mm, + "z_mm": z_mm, + }, + ) + + return out diff --git a/src/reachy_mini_conversation_app/camera_worker.py b/src/reachy_mini_conversation_app/camera_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..e625c00936d73edbe6c9bcf677f11d71a3409d89 --- /dev/null +++ b/src/reachy_mini_conversation_app/camera_worker.py @@ -0,0 +1,241 @@ +"""Camera worker thread with frame buffering and face tracking. 
+ +Ported from main_works.py camera_worker() function to provide: +- 30Hz+ camera polling with thread-safe frame buffering +- Face tracking integration with smooth interpolation +- Latest frame always available for tools +""" + +import time +import logging +import threading +from typing import Any, List, Tuple + +import numpy as np +from numpy.typing import NDArray +from scipy.spatial.transform import Rotation as R + +from reachy_mini import ReachyMini +from reachy_mini.utils.interpolation import linear_pose_interpolation + + +logger = logging.getLogger(__name__) + + +class CameraWorker: + """Thread-safe camera worker with frame buffering and face tracking.""" + + def __init__(self, reachy_mini: ReachyMini, head_tracker: Any = None) -> None: + """Initialize.""" + self.reachy_mini = reachy_mini + self.head_tracker = head_tracker + + # Thread-safe frame storage + self.latest_frame: NDArray[np.uint8] | None = None + self.frame_lock = threading.Lock() + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + # Face tracking state + self.is_head_tracking_enabled = True + self.face_tracking_offsets: List[float] = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] # x, y, z, roll, pitch, yaw + self.face_tracking_lock = threading.Lock() + + # Face tracking timing variables (same as main_works.py) + self.last_face_detected_time: float | None = None + self.interpolation_start_time: float | None = None + self.interpolation_start_pose: NDArray[np.float32] | None = None + self.face_lost_delay = 2.0 # seconds to wait before starting interpolation + self.interpolation_duration = 1.0 # seconds to interpolate back to neutral + + # Track state changes + self.previous_head_tracking_state = self.is_head_tracking_enabled + + def get_latest_frame(self) -> NDArray[np.uint8] | None: + """Get the latest frame (thread-safe).""" + with self.frame_lock: + if self.latest_frame is None: + return None + # Return a copy in original BGR format (OpenCV native) + return 
self.latest_frame.copy() + + def get_face_tracking_offsets( + self, + ) -> Tuple[float, float, float, float, float, float]: + """Get current face tracking offsets (thread-safe).""" + with self.face_tracking_lock: + offsets = self.face_tracking_offsets + return (offsets[0], offsets[1], offsets[2], offsets[3], offsets[4], offsets[5]) + + def set_head_tracking_enabled(self, enabled: bool) -> None: + """Enable/disable head tracking.""" + self.is_head_tracking_enabled = enabled + logger.info(f"Head tracking {'enabled' if enabled else 'disabled'}") + + def start(self) -> None: + """Start the camera worker loop in a thread.""" + self._stop_event.clear() + self._thread = threading.Thread(target=self.working_loop, daemon=True) + self._thread.start() + logger.debug("Camera worker started") + + def stop(self) -> None: + """Stop the camera worker loop.""" + self._stop_event.set() + if self._thread is not None: + self._thread.join() + + logger.debug("Camera worker stopped") + + def working_loop(self) -> None: + """Enable the camera worker loop. + + Ported from main_works.py camera_worker() with same logic. 
+ """ + logger.debug("Starting camera working loop") + + # Initialize head tracker if available + neutral_pose = np.eye(4) # Neutral pose (identity matrix) + self.previous_head_tracking_state = self.is_head_tracking_enabled + + while not self._stop_event.is_set(): + try: + current_time = time.time() + + # Get frame from robot + frame = self.reachy_mini.media.get_frame() + + if frame is not None: + # Thread-safe frame storage + with self.frame_lock: + self.latest_frame = frame # .copy() + + # Check if face tracking was just disabled + if self.previous_head_tracking_state and not self.is_head_tracking_enabled: + # Face tracking was just disabled - start interpolation to neutral + self.last_face_detected_time = current_time # Trigger the face-lost logic + self.interpolation_start_time = None # Will be set by the face-lost interpolation + self.interpolation_start_pose = None + + # Update tracking state + self.previous_head_tracking_state = self.is_head_tracking_enabled + + # Handle face tracking if enabled and head tracker available + if self.is_head_tracking_enabled and self.head_tracker is not None: + eye_center, _ = self.head_tracker.get_head_position(frame) + + if eye_center is not None: + # Face detected - immediately switch to tracking + self.last_face_detected_time = current_time + self.interpolation_start_time = None # Stop any interpolation + + # Convert normalized coordinates to pixel coordinates + h, w, _ = frame.shape + eye_center_norm = (eye_center + 1) / 2 + eye_center_pixels = [ + eye_center_norm[0] * w, + eye_center_norm[1] * h, + ] + + # Get the head pose needed to look at the target, but don't perform movement + target_pose = self.reachy_mini.look_at_image( + eye_center_pixels[0], + eye_center_pixels[1], + duration=0.0, + perform_movement=False, + ) + + # Extract translation and rotation from the target pose directly + translation = target_pose[:3, 3] + rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False) + + # Scale down 
translation and rotation because smaller FOV + translation *= 0.6 + rotation *= 0.6 + + # Thread-safe update of face tracking offsets (use pose as-is) + with self.face_tracking_lock: + self.face_tracking_offsets = [ + translation[0], + translation[1], + translation[2], # x, y, z + rotation[0], + rotation[1], + rotation[2], # roll, pitch, yaw + ] + + # No face detected while tracking enabled - set face lost timestamp + elif self.last_face_detected_time is None or self.last_face_detected_time == current_time: + # Only update if we haven't already set a face lost time + # (current_time check prevents overriding the disable-triggered timestamp) + pass + + # Handle smooth interpolation (works for both face-lost and tracking-disabled cases) + if self.last_face_detected_time is not None: + time_since_face_lost = current_time - self.last_face_detected_time + + if time_since_face_lost >= self.face_lost_delay: + # Start interpolation if not already started + if self.interpolation_start_time is None: + self.interpolation_start_time = current_time + # Capture current pose as start of interpolation + with self.face_tracking_lock: + current_translation = self.face_tracking_offsets[:3] + current_rotation_euler = self.face_tracking_offsets[3:] + # Convert to 4x4 pose matrix + pose_matrix = np.eye(4, dtype=np.float32) + pose_matrix[:3, 3] = current_translation + pose_matrix[:3, :3] = R.from_euler( + "xyz", + current_rotation_euler, + ).as_matrix() + self.interpolation_start_pose = pose_matrix + + # Calculate interpolation progress (t from 0 to 1) + elapsed_interpolation = current_time - self.interpolation_start_time + t = min(1.0, elapsed_interpolation / self.interpolation_duration) + + # Interpolate between current pose and neutral pose + interpolated_pose = linear_pose_interpolation( + self.interpolation_start_pose, + neutral_pose, + t, + ) + + # Extract translation and rotation from interpolated pose + translation = interpolated_pose[:3, 3] + rotation = 
R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False) + + # Thread-safe update of face tracking offsets + with self.face_tracking_lock: + self.face_tracking_offsets = [ + translation[0], + translation[1], + translation[2], # x, y, z + rotation[0], + rotation[1], + rotation[2], # roll, pitch, yaw + ] + + # If interpolation is complete, reset timing + if t >= 1.0: + self.last_face_detected_time = None + self.interpolation_start_time = None + self.interpolation_start_pose = None + # else: Keep current offsets (within 2s delay period) + + # Small sleep to prevent excessive CPU usage (same as main_works.py) + time.sleep(0.04) + + except Exception as e: + logger.error(f"Camera worker error: {e}") + time.sleep(0.1) # Longer sleep on error + + logger.debug("Camera worker thread exited") diff --git a/src/reachy_mini_conversation_app/config.py b/src/reachy_mini_conversation_app/config.py new file mode 100644 index 0000000000000000000000000000000000000000..dd98d0b00d76c49dde559436be6ca41815ae032a --- /dev/null +++ b/src/reachy_mini_conversation_app/config.py @@ -0,0 +1,66 @@ +import os +import logging + +from dotenv import find_dotenv, load_dotenv + + +logger = logging.getLogger(__name__) + +# Locate .env file (search upward from current working directory) +dotenv_path = find_dotenv(usecwd=True) + +if dotenv_path: + # Load .env and override environment variables + load_dotenv(dotenv_path=dotenv_path, override=True) + logger.info(f"Configuration loaded from {dotenv_path}") +else: + logger.warning("No .env file found, using environment variables") + + +class Config: + """Configuration class for the conversation app.""" + + # Required + OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # The key is downloaded in console.py if needed + + # Optional + MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime") + HF_HOME = os.getenv("HF_HOME", "./cache") + LOCAL_VISION_MODEL = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct") + HF_TOKEN = 
def set_custom_profile(profile: str | None) -> None:
    """Update the selected custom profile at runtime and expose it via env.

    Keeps the in-memory ``config`` object and the process environment in
    sync so that modules reading either source observe the same value.

    Args:
        profile: Profile name to activate, or ``None``/empty string to
            revert to the built-in default (the environment variable is
            then removed entirely).
    """
    # Plain attribute assignment on the module-level Config instance
    # cannot fail; no need to guard it.
    config.REACHY_MINI_CUSTOM_PROFILE = profile
    try:
        if profile:
            os.environ["REACHY_MINI_CUSTOM_PROFILE"] = profile
        else:
            # Remove the variable so downstream readers fall back to the
            # built-in default instead of seeing an empty string.
            os.environ.pop("REACHY_MINI_CUSTOM_PROFILE", None)
    except Exception:
        # Best-effort: environment sync must never break callers, but do
        # leave a trace instead of swallowing the error silently.
        logger.debug("Failed to sync REACHY_MINI_CUSTOM_PROFILE", exc_info=True)
+""" + +import os +import sys +import time +import asyncio +import logging +from typing import List, Optional +from pathlib import Path + +from fastrtc import AdditionalOutputs, audio_to_float32 +from scipy.signal import resample + +from reachy_mini import ReachyMini +from reachy_mini.media.media_manager import MediaBackend +from reachy_mini_conversation_app.config import config +from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler +from reachy_mini_conversation_app.headless_personality_ui import mount_personality_routes + + +try: + # FastAPI is provided by the Reachy Mini Apps runtime + from fastapi import FastAPI, Response + from pydantic import BaseModel + from fastapi.responses import FileResponse, JSONResponse + from starlette.staticfiles import StaticFiles +except Exception: # pragma: no cover - only loaded when settings_app is used + FastAPI = object # type: ignore + FileResponse = object # type: ignore + JSONResponse = object # type: ignore + StaticFiles = object # type: ignore + BaseModel = object # type: ignore + + +logger = logging.getLogger(__name__) + + +class LocalStream: + """LocalStream using Reachy Mini's recorder/player.""" + + def __init__( + self, + handler: OpenaiRealtimeHandler, + robot: ReachyMini, + *, + settings_app: Optional[FastAPI] = None, + instance_path: Optional[str] = None, + ): + """Initialize the stream with an OpenAI realtime handler and pipelines. + + - ``settings_app``: the Reachy Mini Apps FastAPI to attach settings endpoints. + - ``instance_path``: directory where per-instance ``.env`` should be stored. + """ + self.handler = handler + self._robot = robot + self._stop_event = asyncio.Event() + self._tasks: List[asyncio.Task[None]] = [] + # Allow the handler to flush the player queue when appropriate. 
+ self.handler._clear_queue = self.clear_audio_queue + self._settings_app: Optional[FastAPI] = settings_app + self._instance_path: Optional[str] = instance_path + self._settings_initialized = False + self._asyncio_loop = None + + # ---- Settings UI (only when API key is missing) ---- + def _read_env_lines(self, env_path: Path) -> list[str]: + """Load env file contents or a template as a list of lines.""" + inst = env_path.parent + try: + if env_path.exists(): + try: + return env_path.read_text(encoding="utf-8").splitlines() + except Exception: + return [] + template_text = None + ex = inst / ".env.example" + if ex.exists(): + try: + template_text = ex.read_text(encoding="utf-8") + except Exception: + template_text = None + if template_text is None: + try: + cwd_example = Path.cwd() / ".env.example" + if cwd_example.exists(): + template_text = cwd_example.read_text(encoding="utf-8") + except Exception: + template_text = None + if template_text is None: + packaged = Path(__file__).parent / ".env.example" + if packaged.exists(): + try: + template_text = packaged.read_text(encoding="utf-8") + except Exception: + template_text = None + return template_text.splitlines() if template_text else [] + except Exception: + return [] + + def _persist_api_key(self, key: str) -> None: + """Persist API key to environment and instance ``.env`` if possible. + + Behavior: + - Always sets ``OPENAI_API_KEY`` in process env and in-memory config. + - Writes/updates ``/.env``: + * If ``.env`` exists, replaces/append OPENAI_API_KEY line. + * Else, copies template from ``/.env.example`` when present, + otherwise falls back to the packaged template + ``reachy_mini_conversation_app/.env.example``. + * Ensures the resulting file contains the full template plus the key. + - Loads the written ``.env`` into the current process environment. 
+ """ + k = (key or "").strip() + if not k: + return + # Update live process env and config so consumers see it immediately + try: + os.environ["OPENAI_API_KEY"] = k + except Exception: # best-effort + pass + try: + config.OPENAI_API_KEY = k + except Exception: + pass + + if not self._instance_path: + return + try: + inst = Path(self._instance_path) + env_path = inst / ".env" + lines = self._read_env_lines(env_path) + replaced = False + for i, ln in enumerate(lines): + if ln.strip().startswith("OPENAI_API_KEY="): + lines[i] = f"OPENAI_API_KEY={k}" + replaced = True + break + if not replaced: + lines.append(f"OPENAI_API_KEY={k}") + final_text = "\n".join(lines) + "\n" + env_path.write_text(final_text, encoding="utf-8") + logger.info("Persisted OPENAI_API_KEY to %s", env_path) + + # Load the newly written .env into this process to ensure downstream imports see it + try: + from dotenv import load_dotenv + + load_dotenv(dotenv_path=str(env_path), override=True) + except Exception: + pass + except Exception as e: + logger.warning("Failed to persist OPENAI_API_KEY: %s", e) + + def _persist_personality(self, profile: Optional[str]) -> None: + """Persist the startup personality to the instance .env and config.""" + selection = (profile or "").strip() or None + try: + from reachy_mini_conversation_app.config import set_custom_profile + + set_custom_profile(selection) + except Exception: + pass + + if not self._instance_path: + return + try: + env_path = Path(self._instance_path) / ".env" + lines = self._read_env_lines(env_path) + replaced = False + for i, ln in enumerate(list(lines)): + if ln.strip().startswith("REACHY_MINI_CUSTOM_PROFILE="): + if selection: + lines[i] = f"REACHY_MINI_CUSTOM_PROFILE={selection}" + else: + lines.pop(i) + replaced = True + break + if selection and not replaced: + lines.append(f"REACHY_MINI_CUSTOM_PROFILE={selection}") + if selection is None and not env_path.exists(): + return + final_text = "\n".join(lines) + "\n" + 
env_path.write_text(final_text, encoding="utf-8") + logger.info("Persisted startup personality to %s", env_path) + try: + from dotenv import load_dotenv + + load_dotenv(dotenv_path=str(env_path), override=True) + except Exception: + pass + except Exception as e: + logger.warning("Failed to persist REACHY_MINI_CUSTOM_PROFILE: %s", e) + + def _read_persisted_personality(self) -> Optional[str]: + """Read persisted startup personality from instance .env (if any).""" + if not self._instance_path: + return None + env_path = Path(self._instance_path) / ".env" + try: + if env_path.exists(): + for ln in env_path.read_text(encoding="utf-8").splitlines(): + if ln.strip().startswith("REACHY_MINI_CUSTOM_PROFILE="): + _, _, val = ln.partition("=") + v = val.strip() + return v or None + except Exception: + pass + return None + + def _init_settings_ui_if_needed(self) -> None: + """Attach minimal settings UI to the settings app. + + Always mounts the UI when a settings_app is provided so that users + see a confirmation message even if the API key is already configured. 
    def _init_settings_ui_if_needed(self) -> None:
        """Attach minimal settings UI to the settings app.

        Always mounts the UI when a settings_app is provided so that users
        see a confirmation message even if the API key is already configured.

        Registers: static assets under ``/static``, the index page at ``/``,
        a no-content favicon, ``/status`` and ``/ready`` probes, and two POST
        endpoints to persist or validate an OpenAI API key. Idempotent: the
        ``_settings_initialized`` flag guarantees routes are added only once.
        """
        # Guard against double registration (launch() may be re-entered).
        if self._settings_initialized:
            return
        if self._settings_app is None:
            return

        static_dir = Path(__file__).parent / "static"
        index_file = static_dir / "index.html"

        # hasattr guard: settings_app may be a stub when FastAPI is absent
        # (see the try/except around the FastAPI imports at module top).
        if hasattr(self._settings_app, "mount"):
            try:
                # Serve /static/* assets
                self._settings_app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
            except Exception:
                pass

        # Request body schema for the two POST endpoints below.
        class ApiKeyPayload(BaseModel):
            openai_api_key: str

        # GET / -> index.html
        @self._settings_app.get("/")
        def _root() -> FileResponse:
            return FileResponse(str(index_file))

        # GET /favicon.ico -> optional, avoid noisy 404s on some browsers
        @self._settings_app.get("/favicon.ico")
        def _favicon() -> Response:
            return Response(status_code=204)

        # GET /status -> whether key is set
        @self._settings_app.get("/status")
        def _status() -> JSONResponse:
            # Reads live config so a key set after startup is reflected.
            has_key = bool(config.OPENAI_API_KEY and str(config.OPENAI_API_KEY).strip())
            return JSONResponse({"has_key": has_key})

        # GET /ready -> whether backend finished loading tools
        @self._settings_app.get("/ready")
        def _ready() -> JSONResponse:
            try:
                # Probe the tools module only if it is already imported;
                # importing it here would trigger its (heavy) initialization.
                mod = sys.modules.get("reachy_mini_conversation_app.tools.core_tools")
                ready = bool(getattr(mod, "_TOOLS_INITIALIZED", False)) if mod else False
            except Exception:
                ready = False
            return JSONResponse({"ready": ready})

        # POST /openai_api_key -> set/persist key
        @self._settings_app.post("/openai_api_key")
        def _set_key(payload: ApiKeyPayload) -> JSONResponse:
            key = (payload.openai_api_key or "").strip()
            if not key:
                return JSONResponse({"ok": False, "error": "empty_key"}, status_code=400)
            self._persist_api_key(key)
            return JSONResponse({"ok": True})

        # POST /validate_api_key -> validate key without persisting it
        @self._settings_app.post("/validate_api_key")
        async def _validate_key(payload: ApiKeyPayload) -> JSONResponse:
            key = (payload.openai_api_key or "").strip()
            if not key:
                return JSONResponse({"valid": False, "error": "empty_key"}, status_code=400)

            # Try to validate by checking if we can fetch the models
            try:
                import httpx

                headers = {"Authorization": f"Bearer {key}", "Content-Type": "application/json"}
                async with httpx.AsyncClient(timeout=10.0) as client:
                    response = await client.get("https://api.openai.com/v1/models", headers=headers)
                    # 200 -> key works; 401 -> key rejected; anything else
                    # is reported as a generic validation failure.
                    if response.status_code == 200:
                        return JSONResponse({"valid": True})
                    elif response.status_code == 401:
                        return JSONResponse({"valid": False, "error": "invalid_api_key"}, status_code=401)
                    else:
                        return JSONResponse(
                            {"valid": False, "error": "validation_failed"}, status_code=response.status_code
                        )
            except Exception as e:
                # Network failures etc. map to a 500 rather than raising.
                logger.warning(f"API key validation failed: {e}")
                return JSONResponse({"valid": False, "error": "validation_error"}, status_code=500)

        self._settings_initialized = True
    def launch(self) -> None:
        """Start the recorder/player and run the async processing loops.

        If the OpenAI key is missing, expose a tiny settings UI via the
        Reachy Mini settings server to collect it before starting streams.

        Startup order matters here:
        1. Load the instance ``.env`` (covers subsequent runs).
        2. If the key is still missing, try to download one.
        3. Mount the settings UI (after 1-2 so /status reports correctly).
        4. Block/poll until a key is available.
        5. Start audio pipelines, then run the async tasks until they end.
        """
        self._stop_event.clear()

        # Try to load an existing instance .env first (covers subsequent runs)
        if self._instance_path:
            try:
                from dotenv import load_dotenv

                from reachy_mini_conversation_app.config import set_custom_profile

                env_path = Path(self._instance_path) / ".env"
                if env_path.exists():
                    load_dotenv(dotenv_path=str(env_path), override=True)
                    # Update config with newly loaded values
                    new_key = os.getenv("OPENAI_API_KEY", "").strip()
                    if new_key:
                        try:
                            config.OPENAI_API_KEY = new_key
                        except Exception:
                            pass
                    new_profile = os.getenv("REACHY_MINI_CUSTOM_PROFILE")
                    if new_profile is not None:
                        try:
                            set_custom_profile(new_profile.strip() or None)
                        except Exception:
                            pass
            except Exception:
                pass

        # If key is still missing, try to download one from HuggingFace
        if not (config.OPENAI_API_KEY and str(config.OPENAI_API_KEY).strip()):
            logger.info("OPENAI_API_KEY not set, attempting to download from HuggingFace...")
            try:
                from gradio_client import Client
                client = Client("HuggingFaceM4/gradium_setup", verbose=False)
                # NOTE(review): assumes /claim_b_key returns (key, status) —
                # confirm against the HuggingFaceM4/gradium_setup space API.
                key, status = client.predict(api_name="/claim_b_key")
                if key and key.strip():
                    logger.info("Successfully downloaded API key from HuggingFace")
                    # Persist it immediately
                    self._persist_api_key(key)
            except Exception as e:
                logger.warning(f"Failed to download API key from HuggingFace: {e}")

        # Always expose settings UI if a settings app is available
        # (do this AFTER loading/downloading the key so status endpoint sees the right value)
        self._init_settings_ui_if_needed()

        # If key is still missing -> wait until provided via the settings UI
        if not (config.OPENAI_API_KEY and str(config.OPENAI_API_KEY).strip()):
            logger.warning("OPENAI_API_KEY not found. Open the app settings page to enter it.")
            # Poll until the key becomes available (set via the settings UI)
            try:
                while not (config.OPENAI_API_KEY and str(config.OPENAI_API_KEY).strip()):
                    time.sleep(0.2)
            except KeyboardInterrupt:
                logger.info("Interrupted while waiting for API key.")
                return

        # Start media after key is set/available
        self._robot.media.start_recording()
        self._robot.media.start_playing()
        time.sleep(1)  # give some time to the pipelines to start

        async def runner() -> None:
            # Capture loop for cross-thread personality actions
            loop = asyncio.get_running_loop()
            self._asyncio_loop = loop  # type: ignore[assignment]
            # Mount personality routes now that loop and handler are available
            try:
                if self._settings_app is not None:
                    mount_personality_routes(
                        self._settings_app,
                        self.handler,
                        lambda: self._asyncio_loop,
                        persist_personality=self._persist_personality,
                        get_persisted_personality=self._read_persisted_personality,
                    )
            except Exception:
                pass
            # Three cooperating tasks: the OpenAI realtime handler plus the
            # two audio bridges (mic -> handler, handler -> speaker).
            self._tasks = [
                asyncio.create_task(self.handler.start_up(), name="openai-handler"),
                asyncio.create_task(self.record_loop(), name="stream-record-loop"),
                asyncio.create_task(self.play_loop(), name="stream-play-loop"),
            ]
            try:
                await asyncio.gather(*self._tasks)
            except asyncio.CancelledError:
                logger.info("Tasks cancelled during shutdown")
            finally:
                # Ensure handler connection is closed
                await self.handler.shutdown()

        asyncio.run(runner())
+ + This method: + - Stops audio recording and playback first + - Sets the stop event to signal async loops to terminate + - Cancels all pending async tasks (openai-handler, record-loop, play-loop) + """ + logger.info("Stopping LocalStream...") + + # Stop media pipelines FIRST before cancelling async tasks + # This ensures clean shutdown before PortAudio cleanup + try: + self._robot.media.stop_recording() + except Exception as e: + logger.debug(f"Error stopping recording (may already be stopped): {e}") + + try: + self._robot.media.stop_playing() + except Exception as e: + logger.debug(f"Error stopping playback (may already be stopped): {e}") + + # Now signal async loops to stop + self._stop_event.set() + + # Cancel all running tasks + for task in self._tasks: + if not task.done(): + task.cancel() + + def clear_audio_queue(self) -> None: + """Flush the player's appsrc to drop any queued audio immediately.""" + logger.info("User intervention: flushing player queue") + if self._robot.media.backend == MediaBackend.GSTREAMER: + # Directly flush gstreamer audio pipe + self._robot.media.audio.clear_player() + elif self._robot.media.backend == MediaBackend.DEFAULT or self._robot.media.backend == MediaBackend.DEFAULT_NO_VIDEO: + self._robot.media.audio.clear_output_buffer() + self.handler.output_queue = asyncio.Queue() + + async def record_loop(self) -> None: + """Read mic frames from the recorder and forward them to the handler.""" + input_sample_rate = self._robot.media.get_input_audio_samplerate() + logger.debug(f"Audio recording started at {input_sample_rate} Hz") + + while not self._stop_event.is_set(): + audio_frame = self._robot.media.get_audio_sample() + if audio_frame is not None: + await self.handler.receive((input_sample_rate, audio_frame)) + await asyncio.sleep(0) # avoid busy loop + + async def play_loop(self) -> None: + """Fetch outputs from the handler: log text and play audio frames.""" + while not self._stop_event.is_set(): + handler_output = await 
self.handler.emit() + + if isinstance(handler_output, AdditionalOutputs): + for msg in handler_output.args: + content = msg.get("content", "") + if isinstance(content, str): + logger.info( + "role=%s content=%s", + msg.get("role"), + content if len(content) < 500 else content[:500] + "โ€ฆ", + ) + + elif isinstance(handler_output, tuple): + input_sample_rate, audio_data = handler_output + output_sample_rate = self._robot.media.get_output_audio_samplerate() + + # Reshape if needed + if audio_data.ndim == 2: + # Scipy channels last convention + if audio_data.shape[1] > audio_data.shape[0]: + audio_data = audio_data.T + # Multiple channels -> Mono channel + if audio_data.shape[1] > 1: + audio_data = audio_data[:, 0] + + # Cast if needed + audio_frame = audio_to_float32(audio_data) + + # Resample if needed + if input_sample_rate != output_sample_rate: + audio_frame = resample( + audio_frame, + int(len(audio_frame) * output_sample_rate / input_sample_rate), + ) + + self._robot.media.push_audio_sample(audio_frame) + + else: + logger.debug("Ignoring output type=%s", type(handler_output).__name__) + + await asyncio.sleep(0) # yield to event loop diff --git a/src/reachy_mini_conversation_app/dance_emotion_moves.py b/src/reachy_mini_conversation_app/dance_emotion_moves.py new file mode 100644 index 0000000000000000000000000000000000000000..a39dd67c7f694debfc9954023c845309331b3865 --- /dev/null +++ b/src/reachy_mini_conversation_app/dance_emotion_moves.py @@ -0,0 +1,154 @@ +"""Dance and emotion moves for the movement queue system. + +This module implements dance moves and emotions as Move objects that can be queued +and executed sequentially by the MovementManager. 
class DanceQueueMove(Move):  # type: ignore
    """Wrapper for dance moves to work with the movement queue system.

    Adapts a ``DanceMove`` to the official ``Move`` interface so it can be
    queued and executed sequentially by the MovementManager.
    """

    def __init__(self, move_name: str):
        """Initialize a DanceQueueMove."""
        self.move_name = move_name
        self.dance_move = DanceMove(move_name)

    @property
    def duration(self) -> float:
        """Duration property required by official Move interface."""
        return float(self.dance_move.duration)

    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
        """Evaluate dance move at time t.

        Returns ``(head_pose, antennas, body_yaw)``; falls back to the
        neutral pose on any evaluation error.
        """
        try:
            head_pose, antennas, body_yaw = self.dance_move.evaluate(t)
        except Exception as e:
            logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
            # Fall back to a neutral pose so the queue keeps running.
            from reachy_mini.utils import create_head_pose

            neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
            return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)

        # The official Move format expects antennas as an array, but some
        # dance moves yield a (left, right) tuple.
        if isinstance(antennas, tuple):
            antennas = np.array([antennas[0], antennas[1]])

        return (head_pose, antennas, body_yaw)
class GotoQueueMove(Move):  # type: ignore
    """Wrapper for goto moves to work with the movement queue system.

    Linearly interpolates head pose, antennas and body yaw from an optional
    start configuration to a target configuration over ``duration`` seconds.
    """

    def __init__(
        self,
        target_head_pose: NDArray[np.float32],
        start_head_pose: NDArray[np.float32] | None = None,
        target_antennas: Tuple[float, float] = (0, 0),
        start_antennas: Tuple[float, float] | None = None,
        target_body_yaw: float = 0,
        start_body_yaw: float | None = None,
        duration: float = 1.0,
    ):
        """Initialize a GotoQueueMove.

        Args:
            target_head_pose: 4x4 homogeneous head pose matrix to reach.
            start_head_pose: Starting pose; neutral pose when None.
            target_antennas: Target (left, right) antenna angles.
            start_antennas: Starting antenna angles; (0, 0) when None.
            target_body_yaw: Target body yaw.
            start_body_yaw: Starting body yaw; 0 when None.
            duration: Move duration in seconds.
        """
        self._duration = duration
        self.target_head_pose = target_head_pose
        self.start_head_pose = start_head_pose
        self.target_antennas = target_antennas
        # Explicit None checks instead of `or`: `or` would also replace
        # legitimate falsy values such as an explicit 0.0 start yaw.
        self.start_antennas = (0, 0) if start_antennas is None else start_antennas
        self.target_body_yaw = target_body_yaw
        self.start_body_yaw = 0 if start_body_yaw is None else start_body_yaw

    @property
    def duration(self) -> float:
        """Duration property required by official Move interface."""
        return self._duration

    def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
        """Evaluate goto move at time t using linear interpolation.

        Returns ``(head_pose, antennas, body_yaw)`` at normalized progress
        ``t / duration`` clamped to [0, 1]. On any error the target
        configuration is returned so the move still converges.
        """
        try:
            from reachy_mini.utils import create_head_pose
            from reachy_mini.utils.interpolation import linear_pose_interpolation

            # Normalized progress in [0, 1]; a non-positive duration would
            # divide by zero, so it degenerates to a jump to the target.
            if self._duration > 0:
                t_clamped = max(0.0, min(1.0, t / self._duration))
            else:
                t_clamped = 1.0

            # Use start pose if available, otherwise neutral.
            if self.start_head_pose is not None:
                start_pose = self.start_head_pose
            else:
                start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)

            # Interpolate head pose.
            head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)

            # Interpolate antennas - return as numpy array.
            antennas = np.array(
                [
                    self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
                    self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
                ],
                dtype=np.float64,
            )

            # Interpolate body yaw.
            body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped

            return (head_pose, antennas, body_yaw)

        except Exception as e:
            logger.error(f"Error evaluating goto move at t={t}: {e}")
            # Return target pose on error - convert to float64 so the queue
            # ends in a sane, consistent state.
            target_head_pose_f64 = self.target_head_pose.astype(np.float64)
            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]], dtype=np.float64)
            return (target_head_pose_f64, target_antennas_array, self.target_body_yaw)
class PersonalityUI:
    """Container for personality-related Gradio components.

    Owns both the widgets and the filesystem helpers used to list, preview,
    create and save personality profiles. Call ``create_components`` inside a
    Blocks context, then ``wire_events`` to attach the event handlers.
    """

    def __init__(self) -> None:
        """Initialize the PersonalityUI instance."""
        # Constants and paths
        # Sentinel dropdown entry meaning "no profile selected" (built-in prompt).
        self.DEFAULT_OPTION = "(built-in default)"
        self._profiles_root = Path(__file__).parent / "profiles"
        self._tools_dir = Path(__file__).parent / "tools"
        self._prompts_dir = Path(__file__).parent / "prompts"

        # Components (initialized in create_components)
        self.personalities_dropdown: gr.Dropdown
        self.apply_btn: gr.Button
        self.status_md: gr.Markdown
        self.preview_md: gr.Markdown
        self.person_name_tb: gr.Textbox
        self.person_instr_ta: gr.TextArea
        self.tools_txt_ta: gr.TextArea
        self.voice_dropdown: gr.Dropdown
        self.new_personality_btn: gr.Button
        self.available_tools_cg: gr.CheckboxGroup
        self.save_btn: gr.Button

    # ---------- Filesystem helpers ----------
    def _list_personalities(self) -> list[str]:
        # A directory counts as a personality only if it contains instructions.txt.
        # User-created profiles live under user_personalities/ and are listed with
        # that prefix; best-effort: filesystem errors yield whatever was collected.
        names: list[str] = []
        try:
            if self._profiles_root.exists():
                for p in sorted(self._profiles_root.iterdir()):
                    if p.name == "user_personalities":
                        continue
                    if p.is_dir() and (p / "instructions.txt").exists():
                        names.append(p.name)
            user_dir = self._profiles_root / "user_personalities"
            if user_dir.exists():
                for p in sorted(user_dir.iterdir()):
                    if p.is_dir() and (p / "instructions.txt").exists():
                        names.append(f"user_personalities/{p.name}")
        except Exception:
            pass
        return names

    def _resolve_profile_dir(self, selection: str) -> Path:
        # `selection` may include the "user_personalities/" prefix; Path handles it.
        return self._profiles_root / selection

    def _read_instructions_for(self, name: str) -> str:
        # Returns "" when the file is missing; error text (not an exception) on I/O
        # failure so it can be shown directly in the preview Markdown.
        try:
            if name == self.DEFAULT_OPTION:
                default_file = self._prompts_dir / "default_prompt.txt"
                if default_file.exists():
                    return default_file.read_text(encoding="utf-8").strip()
                return ""
            target = self._resolve_profile_dir(name) / "instructions.txt"
            if target.exists():
                return target.read_text(encoding="utf-8").strip()
            return ""
        except Exception as e:
            return f"Could not load instructions: {e}"

    @staticmethod
    def _sanitize_name(name: str) -> str:
        # Collapse whitespace to "_" and drop anything outside [a-zA-Z0-9_-] so
        # the name is safe as a directory name. May return "" for junk input.
        import re

        s = name.strip()
        s = re.sub(r"\s+", "_", s)
        s = re.sub(r"[^a-zA-Z0-9_-]", "", s)
        return s

    # ---------- Public API ----------
    def create_components(self) -> None:
        """Instantiate Gradio components for the personality UI."""
        current_value = config.REACHY_MINI_CUSTOM_PROFILE or self.DEFAULT_OPTION

        self.personalities_dropdown = gr.Dropdown(
            label="Select personality",
            choices=[self.DEFAULT_OPTION, *(self._list_personalities())],
            value=current_value,
        )
        self.apply_btn = gr.Button("Apply personality")
        self.status_md = gr.Markdown(visible=True)
        self.preview_md = gr.Markdown(value=self._read_instructions_for(current_value))
        self.person_name_tb = gr.Textbox(label="Personality name")
        self.person_instr_ta = gr.TextArea(label="Personality instructions", lines=10)
        self.tools_txt_ta = gr.TextArea(label="tools.txt", lines=10)
        # Voice choices are refreshed asynchronously on Blocks load (see wire_events).
        self.voice_dropdown = gr.Dropdown(label="Voice", choices=["cedar"], value="cedar")
        self.new_personality_btn = gr.Button("New personality")
        self.available_tools_cg = gr.CheckboxGroup(label="Available tools (helper)", choices=[], value=[])
        self.save_btn = gr.Button("Save personality (instructions + tools)")

    def additional_inputs_ordered(self) -> list[Any]:
        """Return the additional inputs in the expected order for Stream."""
        # NOTE: order matters — the Stream consumer relies on this exact sequence.
        return [
            self.personalities_dropdown,
            self.apply_btn,
            self.new_personality_btn,
            self.status_md,
            self.preview_md,
            self.person_name_tb,
            self.person_instr_ta,
            self.tools_txt_ta,
            self.voice_dropdown,
            self.available_tools_cg,
            self.save_btn,
        ]

    # ---------- Event wiring ----------
    def wire_events(self, handler: Any, blocks: gr.Blocks) -> None:
        """Attach event handlers to components within a Blocks context.

        `handler` is expected to expose async `apply_personality(profile)` and
        `get_available_voices()` (duck-typed; see OpenaiRealtimeHandler).
        """

        async def _apply_personality(selected: str) -> tuple[str, str]:
            # DEFAULT_OPTION maps to `None`, which the handler treats as built-in.
            profile = None if selected == self.DEFAULT_OPTION else selected
            status = await handler.apply_personality(profile)
            preview = self._read_instructions_for(selected)
            return status, preview

        def _read_voice_for(name: str) -> str:
            try:
                if name == self.DEFAULT_OPTION:
                    return "cedar"
                vf = self._resolve_profile_dir(name) / "voice.txt"
                if vf.exists():
                    v = vf.read_text(encoding="utf-8").strip()
                    return v or "cedar"
            except Exception:
                pass
            return "cedar"

        async def _fetch_voices(selected: str) -> dict[str, Any]:
            # Falls back to the "cedar" default if the profile's voice is unknown
            # or the handler cannot be reached.
            try:
                voices = await handler.get_available_voices()
                current = _read_voice_for(selected)
                if current not in voices:
                    current = "cedar"
                return gr.update(choices=voices, value=current)
            except Exception:
                return gr.update(choices=["cedar"], value="cedar")

        def _available_tools_for(selected: str) -> tuple[list[str], list[str]]:
            # Returns (shared tools, profile-local tools) by module stem.
            shared: list[str] = []
            try:
                for py in self._tools_dir.glob("*.py"):
                    if py.stem in {"__init__", "core_tools"}:
                        continue
                    shared.append(py.stem)
            except Exception:
                pass
            local: list[str] = []
            try:
                if selected != self.DEFAULT_OPTION:
                    for py in (self._profiles_root / selected).glob("*.py"):
                        local.append(py.stem)
            except Exception:
                pass
            return sorted(shared), sorted(local)

        def _parse_enabled_tools(text: str) -> list[str]:
            # One tool name per line; blank lines and "#" comments are ignored.
            enabled: list[str] = []
            for line in text.splitlines():
                s = line.strip()
                if not s or s.startswith("#"):
                    continue
                enabled.append(s)
            return enabled

        def _load_profile_for_edit(selected: str) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], str]:
            instr = self._read_instructions_for(selected)
            tools_txt = ""
            if selected != self.DEFAULT_OPTION:
                tp = self._resolve_profile_dir(selected) / "tools.txt"
                if tp.exists():
                    tools_txt = tp.read_text(encoding="utf-8")
            shared, local = _available_tools_for(selected)
            all_tools = sorted(set(shared + local))
            enabled = _parse_enabled_tools(tools_txt)
            status_text = f"Loaded profile '{selected}'."
            return (
                gr.update(value=instr),
                gr.update(value=tools_txt),
                gr.update(choices=all_tools, value=enabled),
                status_text,
            )

        def _new_personality() -> tuple[
            dict[str, Any], dict[str, Any], dict[str, Any], dict[str, Any], str, dict[str, Any]
        ]:
            try:
                # Prefill with hints
                instr_val = """# Write your instructions here\n# e.g., Keep responses concise and friendly."""
                tools_txt_val = "# tools enabled for this profile\n"
                return (
                    gr.update(value=""),
                    gr.update(value=instr_val),
                    gr.update(value=tools_txt_val),
                    gr.update(choices=sorted(_available_tools_for(self.DEFAULT_OPTION)[0]), value=[]),
                    "Fill in a name, instructions and (optional) tools, then Save.",
                    gr.update(value="cedar"),
                )
            except Exception:
                # Leave all components untouched; only surface a status message.
                return (
                    gr.update(),
                    gr.update(),
                    gr.update(),
                    gr.update(),
                    "Failed to initialize new personality.",
                    gr.update(),
                )

        def _save_personality(
            name: str, instructions: str, tools_text: str, voice: str
        ) -> tuple[dict[str, Any], dict[str, Any], str]:
            name_s = self._sanitize_name(name)
            if not name_s:
                return gr.update(), gr.update(), "Please enter a valid name."
            try:
                # User profiles are always written under user_personalities/.
                target_dir = self._profiles_root / "user_personalities" / name_s
                target_dir.mkdir(parents=True, exist_ok=True)
                (target_dir / "instructions.txt").write_text(instructions.strip() + "\n", encoding="utf-8")
                (target_dir / "tools.txt").write_text(tools_text.strip() + "\n", encoding="utf-8")
                (target_dir / "voice.txt").write_text((voice or "cedar").strip() + "\n", encoding="utf-8")

                choices = self._list_personalities()
                value = f"user_personalities/{name_s}"
                if value not in choices:
                    choices.append(value)
                return (
                    gr.update(choices=[self.DEFAULT_OPTION, *sorted(choices)], value=value),
                    gr.update(value=instructions),
                    f"Saved personality '{name_s}'.",
                )
            except Exception as e:
                return gr.update(), gr.update(), f"Failed to save personality: {e}"

        def _sync_tools_from_checks(selected: list[str], current_text: str) -> dict[str, Any]:
            # Keep existing "#" comment lines, replace the body with the checked tools.
            comments = [ln for ln in current_text.splitlines() if ln.strip().startswith("#")]
            body = "\n".join(selected)
            out = ("\n".join(comments) + ("\n" if comments else "") + body).strip() + "\n"
            return gr.update(value=out)

        with blocks:
            self.apply_btn.click(
                fn=_apply_personality,
                inputs=[self.personalities_dropdown],
                outputs=[self.status_md, self.preview_md],
            )

            self.personalities_dropdown.change(
                fn=_load_profile_for_edit,
                inputs=[self.personalities_dropdown],
                outputs=[self.person_instr_ta, self.tools_txt_ta, self.available_tools_cg, self.status_md],
            )

            # Refresh voice choices from the handler once the UI is loaded.
            blocks.load(
                fn=_fetch_voices,
                inputs=[self.personalities_dropdown],
                outputs=[self.voice_dropdown],
            )

            self.available_tools_cg.change(
                fn=_sync_tools_from_checks,
                inputs=[self.available_tools_cg, self.tools_txt_ta],
                outputs=[self.tools_txt_ta],
            )

            self.new_personality_btn.click(
                fn=_new_personality,
                inputs=[],
                outputs=[
                    self.person_name_tb,
                    self.person_instr_ta,
                    self.tools_txt_ta,
                    self.available_tools_cg,
                    self.status_md,
                    self.voice_dropdown,
                ],
            )

            # Saving immediately re-applies the (possibly new) selection.
            self.save_btn.click(
                fn=_save_personality,
                inputs=[self.person_name_tb, self.person_instr_ta, self.tools_txt_ta, self.voice_dropdown],
                outputs=[self.personalities_dropdown, self.person_instr_ta, self.status_md],
            ).then(
                fn=_apply_personality,
                inputs=[self.personalities_dropdown],
                outputs=[self.status_md, self.preview_md],
            )
def resolve_profile_dir(selection: str) -> Path:
    """Resolve the directory path for the given profile selection."""
    root = _profiles_root()
    return root / selection


def read_instructions_for(name: str) -> str:
    """Read the instructions.txt content for the given profile name.

    Returns "" when the file does not exist, and an error message string
    (rather than raising) when reading fails.
    """
    try:
        if name == DEFAULT_OPTION:
            source = _prompts_dir() / "default_prompt.txt"
        else:
            source = resolve_profile_dir(name) / "instructions.txt"
        if not source.exists():
            return ""
        return source.read_text(encoding="utf-8").strip()
    except Exception as e:
        return f"Could not load instructions: {e}"


def available_tools_for(selected: str) -> List[str]:
    """List available tool modules for the given profile selection.

    Combines shared tools (from the tools directory, excluding __init__ and
    core_tools) with profile-local tool modules, deduplicated and sorted.
    """
    found: set[str] = set()
    excluded = {"__init__", "core_tools"}
    try:
        found.update(py.stem for py in _tools_dir().glob("*.py") if py.stem not in excluded)
    except Exception:
        pass
    try:
        if selected != DEFAULT_OPTION:
            found.update(py.stem for py in resolve_profile_dir(selected).glob("*.py"))
    except Exception:
        pass
    return sorted(found)


def _write_profile(name_s: str, instructions: str, tools_text: str, voice: str = "cedar") -> None:
    """Persist a user personality (instructions, tools, voice) under user_personalities/."""
    profile_dir = _profiles_root() / "user_personalities" / name_s
    profile_dir.mkdir(parents=True, exist_ok=True)
    contents = {
        "instructions.txt": instructions.strip() + "\n",
        "tools.txt": (tools_text or "").strip() + "\n",
        "voice.txt": (voice or "cedar").strip() + "\n",
    }
    for filename, text in contents.items():
        (profile_dir / filename).write_text(text, encoding="utf-8")
def mount_personality_routes(
    app: FastAPI,
    handler: OpenaiRealtimeHandler,
    get_loop: Callable[[], asyncio.AbstractEventLoop | None],
    *,
    persist_personality: Callable[[Optional[str]], None] | None = None,
    get_persisted_personality: Callable[[], Optional[str]] | None = None,
) -> None:
    """Register personality management endpoints on a FastAPI app.

    Endpoints: GET /personalities, GET /personalities/load,
    POST /personalities/save, POST+GET /personalities/save_raw,
    POST /personalities/apply, GET /voices.

    Backend actions (apply, voices) are scheduled onto the LocalStream event
    loop returned by ``get_loop`` via run_coroutine_threadsafe, since the
    FastAPI server runs in a different thread than the handler's loop.
    """
    try:
        from fastapi import Request
        from pydantic import BaseModel
        from fastapi.responses import JSONResponse
    except Exception:  # pragma: no cover - only when settings app not available
        return

    # NOTE(review): SavePayload is defined but the /save endpoint deliberately
    # parses raw JSON instead (see comment there); kept for schema documentation.
    class SavePayload(BaseModel):
        name: str
        instructions: str
        tools_text: str
        voice: Optional[str] = "cedar"

    class ApplyPayload(BaseModel):
        name: str
        persist: Optional[bool] = False

    def _startup_choice() -> Any:
        """Return the persisted startup personality or default."""
        # Priority: persisted choice > REACHY_MINI_CUSTOM_PROFILE config > default.
        try:
            if get_persisted_personality is not None:
                stored = get_persisted_personality()
                if stored:
                    return stored
            env_val = getattr(config, "REACHY_MINI_CUSTOM_PROFILE", None)
            if env_val:
                return env_val
        except Exception:
            pass
        return DEFAULT_OPTION

    def _current_choice() -> str:
        # Currently-applied profile per config; falls back to the default option.
        try:
            cur = getattr(config, "REACHY_MINI_CUSTOM_PROFILE", None)
            return cur or DEFAULT_OPTION
        except Exception:
            return DEFAULT_OPTION

    @app.get("/personalities")
    def _list() -> dict:  # type: ignore
        choices = [DEFAULT_OPTION, *list_personalities()]
        return {"choices": choices, "current": _current_choice(), "startup": _startup_choice()}

    @app.get("/personalities/load")
    def _load(name: str) -> dict:  # type: ignore
        # Assemble everything the settings UI needs to edit one profile.
        instr = read_instructions_for(name)
        tools_txt = ""
        voice = "cedar"
        if name != DEFAULT_OPTION:
            pdir = resolve_profile_dir(name)
            tp = pdir / "tools.txt"
            if tp.exists():
                tools_txt = tp.read_text(encoding="utf-8")
            vf = pdir / "voice.txt"
            if vf.exists():
                v = vf.read_text(encoding="utf-8").strip()
                voice = v or "cedar"
        avail = available_tools_for(name)
        # Enabled tools = non-blank, non-comment lines of tools.txt.
        enabled = [ln.strip() for ln in tools_txt.splitlines() if ln.strip() and not ln.strip().startswith("#")]
        return {
            "instructions": instr,
            "tools_text": tools_txt,
            "voice": voice,
            "available_tools": avail,
            "enabled_tools": enabled,
        }

    @app.post("/personalities/save")
    async def _save(request: Request) -> dict:  # type: ignore
        # Accept raw JSON only to avoid validation-related 422s
        try:
            raw = await request.json()
        except Exception:
            raw = {}
        name = str(raw.get("name", ""))
        instructions = str(raw.get("instructions", ""))
        tools_text = str(raw.get("tools_text", ""))
        voice = str(raw.get("voice", "cedar")) if raw.get("voice") is not None else "cedar"

        name_s = _sanitize_name(name)
        if not name_s:
            return JSONResponse({"ok": False, "error": "invalid_name"}, status_code=400)  # type: ignore
        try:
            # NOTE: `logger` is a closure variable assigned later in this
            # function body; it exists before any request can be served.
            logger.info(
                "Headless save: name=%r voice=%r instr_len=%d tools_len=%d",
                name_s,
                voice,
                len(instructions),
                len(tools_text),
            )
            _write_profile(name_s, instructions, tools_text, voice or "cedar")
            value = f"user_personalities/{name_s}"
            choices = [DEFAULT_OPTION, *list_personalities()]
            return {"ok": True, "value": value, "choices": choices}
        except Exception as e:
            return JSONResponse({"ok": False, "error": str(e)}, status_code=500)  # type: ignore

    @app.post("/personalities/save_raw")
    async def _save_raw(
        request: Request,
        name: Optional[str] = None,
        instructions: Optional[str] = None,
        tools_text: Optional[str] = None,
        voice: Optional[str] = None,
    ) -> dict:  # type: ignore
        # Accept query params, form-encoded, or raw JSON
        data = {"name": name, "instructions": instructions, "tools_text": tools_text, "voice": voice}
        # Prefer form if present
        try:
            form = await request.form()
            for k in ("name", "instructions", "tools_text", "voice"):
                if k in form and form[k] is not None:
                    data[k] = str(form[k])
        except Exception:
            pass
        # Try JSON
        try:
            raw = await request.json()
            if isinstance(raw, dict):
                for k in ("name", "instructions", "tools_text", "voice"):
                    if raw.get(k) is not None:
                        data[k] = str(raw.get(k))
        except Exception:
            pass

        name_s = _sanitize_name(str(data.get("name") or ""))
        if not name_s:
            return JSONResponse({"ok": False, "error": "invalid_name"}, status_code=400)  # type: ignore
        instr = str(data.get("instructions") or "")
        tools = str(data.get("tools_text") or "")
        v = str(data.get("voice") or "cedar")
        try:
            logger.info(
                "Headless save_raw: name=%r voice=%r instr_len=%d tools_len=%d", name_s, v, len(instr), len(tools)
            )
            _write_profile(name_s, instr, tools, v)
            value = f"user_personalities/{name_s}"
            choices = [DEFAULT_OPTION, *list_personalities()]
            return {"ok": True, "value": value, "choices": choices}
        except Exception as e:
            return JSONResponse({"ok": False, "error": str(e)}, status_code=500)  # type: ignore

    @app.get("/personalities/save_raw")
    async def _save_raw_get(name: str, instructions: str = "", tools_text: str = "", voice: str = "cedar") -> dict:  # type: ignore
        # GET variant for clients that cannot POST (query-string only).
        name_s = _sanitize_name(name)
        if not name_s:
            return JSONResponse({"ok": False, "error": "invalid_name"}, status_code=400)  # type: ignore
        try:
            logger.info(
                "Headless save_raw(GET): name=%r voice=%r instr_len=%d tools_len=%d",
                name_s,
                voice,
                len(instructions),
                len(tools_text),
            )
            _write_profile(name_s, instructions, tools_text, voice or "cedar")
            value = f"user_personalities/{name_s}"
            choices = [DEFAULT_OPTION, *list_personalities()]
            return {"ok": True, "value": value, "choices": choices}
        except Exception as e:
            return JSONResponse({"ok": False, "error": str(e)}, status_code=500)  # type: ignore

    # Bound here (mid-function) but referenced by the closures above; the
    # assignment runs during route registration, before any request arrives.
    logger = logging.getLogger(__name__)

    @app.post("/personalities/apply")
    async def _apply(
        payload: ApplyPayload | None = None,
        name: str | None = None,
        persist: Optional[bool] = None,
        request: Optional[Request] = None,
    ) -> dict:  # type: ignore
        loop = get_loop()
        if loop is None:
            return JSONResponse({"ok": False, "error": "loop_unavailable"}, status_code=503)  # type: ignore

        # Accept both JSON payload and query param for convenience
        sel_name: Optional[str] = None
        persist_flag = bool(persist) if persist is not None else False
        if payload and getattr(payload, "name", None):
            sel_name = payload.name
            persist_flag = bool(getattr(payload, "persist", False))
        elif name:
            sel_name = name
        elif request is not None:
            try:
                body = await request.json()
                if isinstance(body, dict) and body.get("name"):
                    sel_name = str(body.get("name"))
                if isinstance(body, dict) and "persist" in body:
                    persist_flag = bool(body.get("persist"))
            except Exception:
                sel_name = None
        # Query-string ?persist= overrides any body-level flag.
        if request is not None:
            try:
                q_persist = request.query_params.get("persist")
                if q_persist is not None:
                    persist_flag = str(q_persist).lower() in {"1", "true", "yes", "on"}
            except Exception:
                pass
        if not sel_name:
            sel_name = DEFAULT_OPTION

        async def _do_apply() -> str:
            sel = None if sel_name == DEFAULT_OPTION else sel_name
            status = await handler.apply_personality(sel)
            return status

        try:
            logger.info("Headless apply: requested name=%r", sel_name)
            # Run on the handler's loop; block this worker thread up to 10s.
            fut = asyncio.run_coroutine_threadsafe(_do_apply(), loop)
            status = fut.result(timeout=10)
            persisted_choice = _startup_choice()
            if persist_flag and persist_personality is not None:
                try:
                    persist_personality(None if sel_name == DEFAULT_OPTION else sel_name)
                    persisted_choice = _startup_choice()
                except Exception as e:
                    logger.warning("Failed to persist startup personality: %s", e)
            return {"ok": True, "status": status, "startup": persisted_choice}
        except Exception as e:
            return JSONResponse({"ok": False, "error": str(e)}, status_code=500)  # type: ignore

    @app.get("/voices")
    async def _voices() -> list[str]:
        # Degrades to the default voice list whenever the loop/handler is unavailable.
        loop = get_loop()
        if loop is None:
            return ["cedar"]

        async def _get_v() -> list[str]:
            try:
                return await handler.get_available_voices()
            except Exception:
                return ["cedar"]

        try:
            fut = asyncio.run_coroutine_threadsafe(_get_v(), loop)
            return fut.result(timeout=10)
        except Exception:
            return ["cedar"]
def update_chatbot(chatbot: List[Dict[str, Any]], response: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Update the chatbot with AdditionalOutputs.

    Appends the new message dict in place and returns the same list.
    """
    chatbot.append(response)
    return chatbot


def main() -> None:
    """Entrypoint for the Reachy Mini conversation app."""
    args, _ = parse_args()
    run(args)


def run(
    args: argparse.Namespace,
    robot: Optional[ReachyMini] = None,
    app_stop_event: Optional[threading.Event] = None,
    settings_app: Optional[FastAPI] = None,
    instance_path: Optional[str] = None,
) -> None:
    """Run the Reachy Mini conversation app.

    Args:
        args: Parsed CLI arguments (see parse_args).
        robot: Pre-connected robot; when None, one is created based on flags.
        app_stop_event: Optional event used by the dashboard for shutdown.
        settings_app: Optional FastAPI app to mount settings routes onto.
        instance_path: App instance path forwarded to the realtime handler.
    """
    # Putting these dependencies here makes the dashboard faster to load when the conversation app is installed
    from reachy_mini_conversation_app.moves import MovementManager
    from reachy_mini_conversation_app.console import LocalStream
    from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
    from reachy_mini_conversation_app.tools.core_tools import ToolDependencies
    from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler

    logger = setup_logger(args.debug)
    logger.info("Starting Reachy Mini Conversation App")

    if args.no_camera and args.head_tracker is not None:
        logger.warning("Head tracking is not activated due to --no-camera.")

    if robot is None:
        # Initialize robot with appropriate backend
        # TODO: Implement dynamic robot connection detection
        # Automatically detect and connect to available Reachy Mini robot(s!)
        # Priority checks (in order):
        # 1. Reachy Lite connected directly to the host
        # 2. Reachy Mini daemon running on localhost (same device)
        # 3. Reachy Mini daemon on local network (same subnet)

        if args.remote:
            logger.info("Connecting to remote Reachy Mini on the network")
            robot = ReachyMini(media_backend="default_no_video", localhost_only=False)
        elif args.wireless_version and not args.on_device:
            logger.info("Using WebRTC backend for fully remote wireless version")
            robot = ReachyMini(media_backend="webrtc", localhost_only=False)
        elif args.wireless_version and args.on_device:
            logger.info("Using GStreamer backend for on-device wireless version")
            robot = ReachyMini(media_backend="gstreamer")
        elif args.no_camera:
            logger.info("Using audio-only backend (no camera)")
            robot = ReachyMini(media_backend="default_no_video")
        else:
            logger.info("Using default backend for lite version")
            robot = ReachyMini(media_backend="default")

    # Check if running in simulation mode without --gradio
    if robot.client.get_status()["simulation_enabled"] and not args.gradio:
        logger.error(
            "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode.",
        )
        robot.client.disconnect()
        sys.exit(1)

    camera_worker, _, vision_manager = handle_vision_stuff(args, robot)

    movement_manager = MovementManager(
        current_robot=robot,
        camera_worker=camera_worker,
    )

    # Head wobble offsets are fed into the movement manager during speech.
    head_wobbler = HeadWobbler(set_speech_offsets=movement_manager.set_speech_offsets)

    deps = ToolDependencies(
        reachy_mini=robot,
        movement_manager=movement_manager,
        camera_worker=camera_worker,
        vision_manager=vision_manager,
        head_wobbler=head_wobbler,
    )
    current_file_path = os.path.dirname(os.path.abspath(__file__))
    logger.debug(f"Current file absolute path: {current_file_path}")
    chatbot = gr.Chatbot(
        type="messages",
        resizable=True,
        avatar_images=(
            os.path.join(current_file_path, "images", "user_avatar.png"),
            os.path.join(current_file_path, "images", "reachymini_avatar.png"),
        ),
    )
    logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")

    handler = OpenaiRealtimeHandler(deps, gradio_mode=args.gradio, instance_path=instance_path)

    stream_manager: gr.Blocks | LocalStream | None = None

    if args.gradio:
        api_key_textbox = gr.Textbox(
            label="OPENAI API Key",
            type="password",
            # Never prefill the key when running on a (public) HF Space.
            value=os.getenv("OPENAI_API_KEY") if not get_space() else "",
        )

        from reachy_mini_conversation_app.gradio_personality import PersonalityUI

        personality_ui = PersonalityUI()
        personality_ui.create_components()

        stream = Stream(
            handler=handler,
            mode="send-receive",
            modality="audio",
            additional_inputs=[
                chatbot,
                api_key_textbox,
                *personality_ui.additional_inputs_ordered(),
            ],
            additional_outputs=[chatbot],
            additional_outputs_handler=update_chatbot,
            ui_args={"title": "Talk with Reachy Mini"},
        )
        stream_manager = stream.ui
        if not settings_app:
            app = FastAPI()
        else:
            app = settings_app

        personality_ui.wire_events(handler, stream_manager)

        app = gr.mount_gradio_app(app, stream.ui, path="/")
    else:
        # In headless mode, wire settings_app + instance_path to console LocalStream
        stream_manager = LocalStream(
            handler,
            robot,
            settings_app=settings_app,
            instance_path=instance_path,
        )

    # Each async service → its own thread/loop
    movement_manager.start()
    head_wobbler.start()
    if camera_worker:
        camera_worker.start()
    if vision_manager:
        vision_manager.start()

    def poll_stop_event() -> None:
        """Poll the stop event to allow graceful shutdown."""
        if app_stop_event is not None:
            app_stop_event.wait()

        logger.info("App stop event detected, shutting down...")
        try:
            # Closing the stream manager unblocks the launch() call below.
            stream_manager.close()
        except Exception as e:
            logger.error(f"Error while closing stream manager: {e}")

    if app_stop_event:
        threading.Thread(target=poll_stop_event, daemon=True).start()

    try:
        # Blocks until the UI/stream is closed or interrupted.
        stream_manager.launch()
    except KeyboardInterrupt:
        logger.info("Keyboard interruption in main thread... closing server.")
    finally:
        # Shutdown order: workers first, then media, then the client connection.
        movement_manager.stop()
        head_wobbler.stop()
        if camera_worker:
            camera_worker.stop()
        if vision_manager:
            vision_manager.stop()

        # Ensure media is explicitly closed before disconnecting
        try:
            robot.media.close()
        except Exception as e:
            logger.debug(f"Error closing media during shutdown: {e}")

        # prevent connection to keep alive some threads
        robot.client.disconnect()
        time.sleep(1)
        logger.info("Shutdown complete.")
robot=reachy_mini, + app_stop_event=stop_event, + settings_app=self.settings_app, + instance_path=instance_path, + ) + + +if __name__ == "__main__": + app = ReachyMiniConversationApp() + try: + app.wrapped_run() + except KeyboardInterrupt: + app.stop() diff --git a/src/reachy_mini_conversation_app/mcp/__init__.py b/src/reachy_mini_conversation_app/mcp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d96599df66c7396cd99e57b335d56ac6c0510d1 --- /dev/null +++ b/src/reachy_mini_conversation_app/mcp/__init__.py @@ -0,0 +1,8 @@ +"""MCP (Model Context Protocol) integration for Reachy Mini Conversation App.""" + +from reachy_mini_conversation_app.mcp.client import MCPClient +from reachy_mini_conversation_app.mcp.figma import FigmaMCPTools + + +__all__ = ["MCPClient", "FigmaMCPTools"] + diff --git a/src/reachy_mini_conversation_app/mcp/client.py b/src/reachy_mini_conversation_app/mcp/client.py new file mode 100644 index 0000000000000000000000000000000000000000..06ffdbd6dfcee3a6bc786200b7057db1a6399d49 --- /dev/null +++ b/src/reachy_mini_conversation_app/mcp/client.py @@ -0,0 +1,193 @@ +"""MCP Client for connecting to Model Context Protocol servers.""" + +import json +import logging +import asyncio +from typing import Any, Dict, List, Optional +from dataclasses import dataclass, field + +import httpx + + +logger = logging.getLogger(__name__) + + +@dataclass +class MCPTool: + """Represents a tool exposed by an MCP server.""" + + name: str + description: str + input_schema: Dict[str, Any] + server_name: str + + +@dataclass +class MCPClient: + """Client for communicating with MCP servers via SSE (Server-Sent Events). + + Supports both remote MCP servers (like Figma's https://mcp.figma.com) + and local MCP servers running on localhost. 
+ """ + + base_url: str + auth_token: Optional[str] = None + timeout: float = 30.0 + _tools: List[MCPTool] = field(default_factory=list) + _session_id: Optional[str] = None + _http_client: Optional[httpx.AsyncClient] = None + + async def connect(self) -> bool: + """Establish connection and discover available tools.""" + try: + headers = self._get_headers() + self._http_client = httpx.AsyncClient( + base_url=self.base_url, + headers=headers, + timeout=self.timeout, + ) + + # Initialize session + await self._initialize_session() + + # Discover tools + await self._discover_tools() + + logger.info(f"Connected to MCP server at {self.base_url}, found {len(self._tools)} tools") + return True + + except Exception as e: + logger.error(f"Failed to connect to MCP server: {e}") + return False + + async def disconnect(self) -> None: + """Close the connection to the MCP server.""" + if self._http_client: + await self._http_client.aclose() + self._http_client = None + self._session_id = None + self._tools = [] + + def _get_headers(self) -> Dict[str, str]: + """Build request headers including auth if configured.""" + headers = { + "Content-Type": "application/json", + "Accept": "application/json, text/event-stream", + } + if self.auth_token: + headers["Authorization"] = f"Bearer {self.auth_token}" + return headers + + async def _initialize_session(self) -> None: + """Initialize the MCP session with the server.""" + if not self._http_client: + raise RuntimeError("HTTP client not initialized") + + # Send initialize request + init_request = { + "jsonrpc": "2.0", + "id": 1, + "method": "initialize", + "params": { + "protocolVersion": "2024-11-05", + "capabilities": { + "tools": {}, + }, + "clientInfo": { + "name": "reachy-mini-conversation-app", + "version": "0.1.0", + }, + }, + } + + response = await self._http_client.post("/", json=init_request) + response.raise_for_status() + + result = response.json() + if "result" in result: + logger.debug(f"MCP session initialized: 
{result['result']}") + + async def _discover_tools(self) -> None: + """Discover available tools from the MCP server.""" + if not self._http_client: + raise RuntimeError("HTTP client not initialized") + + # Request tools list + tools_request = { + "jsonrpc": "2.0", + "id": 2, + "method": "tools/list", + "params": {}, + } + + response = await self._http_client.post("/", json=tools_request) + response.raise_for_status() + + result = response.json() + if "result" in result and "tools" in result["result"]: + for tool_data in result["result"]["tools"]: + tool = MCPTool( + name=tool_data.get("name", "unknown"), + description=tool_data.get("description", ""), + input_schema=tool_data.get("inputSchema", {}), + server_name=self.base_url, + ) + self._tools.append(tool) + logger.debug(f"Discovered MCP tool: {tool.name}") + + async def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Any: + """Execute a tool on the MCP server. + + Args: + tool_name: Name of the tool to call + arguments: Arguments to pass to the tool + + Returns: + The result from the tool execution + """ + if not self._http_client: + raise RuntimeError("Not connected to MCP server") + + call_request = { + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": tool_name, + "arguments": arguments, + }, + } + + logger.info(f"Calling MCP tool: {tool_name} with args: {arguments}") + + response = await self._http_client.post("/", json=call_request) + response.raise_for_status() + + result = response.json() + + if "error" in result: + error = result["error"] + raise RuntimeError(f"MCP tool error: {error.get('message', 'Unknown error')}") + + if "result" in result: + content = result["result"].get("content", []) + # Extract text content from response + text_parts = [] + for item in content: + if item.get("type") == "text": + text_parts.append(item.get("text", "")) + return "\n".join(text_parts) if text_parts else json.dumps(result["result"]) + + return json.dumps(result) + + def 
get_tools(self) -> List[MCPTool]: + """Get list of available tools.""" + return self._tools.copy() + + def get_tool(self, name: str) -> Optional[MCPTool]: + """Get a specific tool by name.""" + for tool in self._tools: + if tool.name == name: + return tool + return None + diff --git a/src/reachy_mini_conversation_app/mcp/cursor_bridge.py b/src/reachy_mini_conversation_app/mcp/cursor_bridge.py new file mode 100644 index 0000000000000000000000000000000000000000..0238f1165896f9d00b333bb5635178a8be8ec25d --- /dev/null +++ b/src/reachy_mini_conversation_app/mcp/cursor_bridge.py @@ -0,0 +1,344 @@ +"""MCP Server Bridge for Reachy-Cursor communication. + +This module provides a local MCP server that acts as a bridge between +Reachy and Cursor. Reachy can send coding requests to this server, +and Cursor (configured to connect to this MCP server) can receive them. + +## How it works: +1. This server runs locally (e.g., http://localhost:8765) +2. Reachy calls the `cursor_bridge` tool to send coding requests +3. Cursor connects to this MCP server and receives the requests as resources +4. Cursor's AI can then execute the coding tasks + +## Setup in Cursor: +Add to your Cursor MCP settings: +{ + "mcpServers": { + "reachy-bridge": { + "url": "http://localhost:8765" + } + } +} +""" + +import json +import asyncio +import logging +from datetime import datetime +from typing import Any, Dict, List, Optional +from dataclasses import dataclass, field +from aiohttp import web + +logger = logging.getLogger(__name__) + + +@dataclass +class CodingTask: + """Represents a coding task sent from Reachy.""" + id: str + request: str + context: Optional[str] = None + priority: str = "normal" + created_at: str = field(default_factory=lambda: datetime.now().isoformat()) + status: str = "pending" # pending, in_progress, completed + + +class ReachyCursorBridge: + """MCP Server bridge between Reachy and Cursor. + + Exposes coding tasks as MCP resources that Cursor can read and process. 
+ """ + + def __init__(self, host: str = "127.0.0.1", port: int = 8765): + self.host = host + self.port = port + self.tasks: Dict[str, CodingTask] = {} + self.task_counter = 0 + self._app: Optional[web.Application] = None + self._runner: Optional[web.AppRunner] = None + + async def start(self) -> None: + """Start the MCP bridge server.""" + self._app = web.Application() + self._app.router.add_post("/", self._handle_jsonrpc) + self._app.router.add_get("/health", self._handle_health) + + self._runner = web.AppRunner(self._app) + await self._runner.setup() + + site = web.TCPSite(self._runner, self.host, self.port) + await site.start() + + logger.info(f"Reachy-Cursor MCP Bridge running at http://{self.host}:{self.port}") + + async def stop(self) -> None: + """Stop the MCP bridge server.""" + if self._runner: + await self._runner.cleanup() + + async def add_task(self, request: str, context: Optional[str] = None, + priority: str = "normal") -> CodingTask: + """Add a new coding task from Reachy.""" + self.task_counter += 1 + task_id = f"task_{self.task_counter}_{datetime.now().strftime('%H%M%S')}" + + task = CodingTask( + id=task_id, + request=request, + context=context, + priority=priority, + ) + self.tasks[task_id] = task + + logger.info(f"New coding task added: {task_id}") + return task + + def get_pending_tasks(self) -> List[CodingTask]: + """Get all pending tasks.""" + return [t for t in self.tasks.values() if t.status == "pending"] + + def mark_task_complete(self, task_id: str) -> bool: + """Mark a task as completed.""" + if task_id in self.tasks: + self.tasks[task_id].status = "completed" + return True + return False + + async def _handle_health(self, request: web.Request) -> web.Response: + """Health check endpoint.""" + return web.json_response({"status": "ok", "service": "reachy-cursor-bridge"}) + + async def _handle_jsonrpc(self, request: web.Request) -> web.Response: + """Handle JSON-RPC requests from Cursor.""" + try: + data = await request.json() + except 
json.JSONDecodeError: + return web.json_response( + {"jsonrpc": "2.0", "error": {"code": -32700, "message": "Parse error"}, "id": None} + ) + + method = data.get("method", "") + params = data.get("params", {}) + req_id = data.get("id") + + logger.debug(f"MCP request: {method}") + + if method == "initialize": + return web.json_response({ + "jsonrpc": "2.0", + "id": req_id, + "result": { + "protocolVersion": "2024-11-05", + "capabilities": { + "resources": {"subscribe": True}, + "tools": {}, + }, + "serverInfo": { + "name": "reachy-cursor-bridge", + "version": "1.0.0", + }, + }, + }) + + elif method == "resources/list": + # Expose pending tasks as resources + resources = [] + for task in self.get_pending_tasks(): + resources.append({ + "uri": f"reachy://task/{task.id}", + "name": f"Coding Task: {task.request[:50]}...", + "description": task.request, + "mimeType": "application/json", + }) + + # Also expose a "latest" resource + if self.tasks: + latest = max(self.tasks.values(), key=lambda t: t.created_at) + resources.insert(0, { + "uri": "reachy://task/latest", + "name": "Latest Coding Request from Reachy", + "description": latest.request, + "mimeType": "application/json", + }) + + return web.json_response({ + "jsonrpc": "2.0", + "id": req_id, + "result": {"resources": resources}, + }) + + elif method == "resources/read": + uri = params.get("uri", "") + + if uri == "reachy://task/latest" and self.tasks: + latest = max(self.tasks.values(), key=lambda t: t.created_at) + content = self._format_task_content(latest) + elif uri.startswith("reachy://task/"): + task_id = uri.replace("reachy://task/", "") + task = self.tasks.get(task_id) + if task: + content = self._format_task_content(task) + else: + content = {"error": f"Task {task_id} not found"} + else: + content = {"error": "Unknown resource"} + + return web.json_response({ + "jsonrpc": "2.0", + "id": req_id, + "result": { + "contents": [{ + "uri": uri, + "mimeType": "application/json", + "text": json.dumps(content, 
indent=2), + }], + }, + }) + + elif method == "tools/list": + # Expose a tool for Cursor to mark tasks complete + return web.json_response({ + "jsonrpc": "2.0", + "id": req_id, + "result": { + "tools": [ + { + "name": "mark_task_complete", + "description": "Mark a Reachy coding task as completed", + "inputSchema": { + "type": "object", + "properties": { + "task_id": { + "type": "string", + "description": "The ID of the task to mark complete", + }, + }, + "required": ["task_id"], + }, + }, + { + "name": "get_current_request", + "description": "Get the current/latest coding request from Reachy", + "inputSchema": { + "type": "object", + "properties": {}, + }, + }, + ], + }, + }) + + elif method == "tools/call": + tool_name = params.get("name", "") + arguments = params.get("arguments", {}) + + if tool_name == "mark_task_complete": + task_id = arguments.get("task_id", "") + success = self.mark_task_complete(task_id) + result_text = f"Task {task_id} marked complete" if success else f"Task {task_id} not found" + elif tool_name == "get_current_request": + if self.tasks: + latest = max(self.tasks.values(), key=lambda t: t.created_at) + result_text = json.dumps(self._format_task_content(latest), indent=2) + else: + result_text = "No pending coding requests from Reachy" + else: + result_text = f"Unknown tool: {tool_name}" + + return web.json_response({ + "jsonrpc": "2.0", + "id": req_id, + "result": { + "content": [{"type": "text", "text": result_text}], + }, + }) + + else: + return web.json_response({ + "jsonrpc": "2.0", + "id": req_id, + "error": {"code": -32601, "message": f"Method not found: {method}"}, + }) + + def _format_task_content(self, task: CodingTask) -> Dict[str, Any]: + """Format a task for Cursor consumption.""" + return { + "type": "coding_request", + "from": "reachy", + "task_id": task.id, + "request": task.request, + "context": task.context, + "priority": task.priority, + "created_at": task.created_at, + "status": task.status, + "instructions": ( + "This is 
a voice-activated coding request from Reachy robot. " + "Please implement the request described above. " + "When complete, you can mark the task as done using mark_task_complete." + ), + } + + +# Global bridge instance (for tool access) +_bridge_instance: Optional[ReachyCursorBridge] = None + + +async def get_or_create_bridge(host: str = "127.0.0.1", port: int = 8765) -> ReachyCursorBridge: + """Get or create the global bridge instance.""" + global _bridge_instance + + if _bridge_instance is None: + _bridge_instance = ReachyCursorBridge(host=host, port=port) + await _bridge_instance.start() + + return _bridge_instance + + +async def send_to_cursor_via_bridge(request: str, context: Optional[str] = None) -> Dict[str, Any]: + """Send a coding request to Cursor via the MCP bridge.""" + bridge = await get_or_create_bridge() + task = await bridge.add_task(request=request, context=context) + + return { + "status": "queued", + "task_id": task.id, + "message": f"Coding request sent to bridge. Cursor can access it at reachy://task/{task.id}", + "bridge_url": f"http://{bridge.host}:{bridge.port}", + } + + +if __name__ == "__main__": + # Run as standalone server for testing + import sys + + logging.basicConfig(level=logging.INFO) + + async def main(): + bridge = ReachyCursorBridge() + await bridge.start() + + # Add a test task + await bridge.add_task( + request="Create a beautiful landing page with React and Tailwind CSS", + context="Modern dark theme, hero section, features, pricing", + ) + + print(f"\nMCP Bridge running at http://{bridge.host}:{bridge.port}") + print("\nTo connect from Cursor, add to your MCP settings:") + print(json.dumps({ + "mcpServers": { + "reachy-bridge": { + "url": f"http://{bridge.host}:{bridge.port}" + } + } + }, indent=2)) + print("\nPress Ctrl+C to stop...") + + try: + while True: + await asyncio.sleep(1) + except KeyboardInterrupt: + await bridge.stop() + + asyncio.run(main()) + diff --git a/src/reachy_mini_conversation_app/mcp/figma.py 
b/src/reachy_mini_conversation_app/mcp/figma.py new file mode 100644 index 0000000000000000000000000000000000000000..4854e4ed9677b4d6f20a9625d089d1e3ffe03571 --- /dev/null +++ b/src/reachy_mini_conversation_app/mcp/figma.py @@ -0,0 +1,288 @@ +"""Figma MCP integration for design operations.""" + +import os +import json +import logging +import asyncio +from typing import Any, Dict, List, Optional + +from reachy_mini_conversation_app.mcp.client import MCPClient, MCPTool +from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies + + +logger = logging.getLogger(__name__) + +# Figma MCP server endpoint +FIGMA_MCP_URL = "https://mcp.figma.com/sse" + + +class FigmaMCPTools: + """Manager for Figma MCP tools integration. + + This class connects to Figma's MCP server and creates Tool instances + that can be used by the conversation app. + """ + + def __init__(self, access_token: Optional[str] = None): + """Initialize Figma MCP tools. + + Args: + access_token: Figma personal access token. If not provided, + will try to get from FIGMA_ACCESS_TOKEN env var. + """ + self.access_token = access_token or os.getenv("FIGMA_ACCESS_TOKEN") + self.client: Optional[MCPClient] = None + self._tools: List[Tool] = [] + self._connected = False + + async def connect(self) -> bool: + """Connect to Figma MCP server and discover tools.""" + if not self.access_token: + logger.warning("No Figma access token provided. 
Set FIGMA_ACCESS_TOKEN env var.") + return False + + try: + self.client = MCPClient( + base_url=FIGMA_MCP_URL, + auth_token=self.access_token, + ) + + success = await self.client.connect() + if success: + self._create_tool_wrappers() + self._connected = True + logger.info(f"Figma MCP connected with {len(self._tools)} tools") + return success + + except Exception as e: + logger.error(f"Failed to connect to Figma MCP: {e}") + return False + + async def disconnect(self) -> None: + """Disconnect from Figma MCP server.""" + if self.client: + await self.client.disconnect() + self._connected = False + self._tools = [] + + def _create_tool_wrappers(self) -> None: + """Create Tool wrapper instances for each MCP tool.""" + if not self.client: + return + + for mcp_tool in self.client.get_tools(): + wrapper = self._create_tool_wrapper(mcp_tool) + self._tools.append(wrapper) + + def _create_tool_wrapper(self, mcp_tool: MCPTool) -> Tool: + """Create a Tool instance that wraps an MCP tool. + + Args: + mcp_tool: The MCP tool definition + + Returns: + A Tool instance that calls the MCP tool + """ + client = self.client + + class MCPToolWrapper(Tool): + """Wrapper that bridges MCP tools to the conversation app's Tool interface.""" + + name = f"figma_{mcp_tool.name}" + description = f"[Figma] {mcp_tool.description}" + parameters_schema = mcp_tool.input_schema + + async def run(self, deps: ToolDependencies, **kwargs: Any) -> str: + """Execute the MCP tool.""" + if not client: + return "Error: Figma MCP not connected" + + try: + result = await client.call_tool(mcp_tool.name, kwargs) + return str(result) + except Exception as e: + logger.error(f"Figma MCP tool error: {e}") + return f"Error calling Figma: {str(e)}" + + return MCPToolWrapper() + + def get_tools(self) -> List[Tool]: + """Get all Figma tools as Tool instances.""" + return self._tools.copy() + + def is_connected(self) -> bool: + """Check if connected to Figma MCP.""" + return self._connected + + +# Convenience function to 
create common Figma design tools +def create_figma_design_tool() -> Tool: + """Create a high-level Figma design tool for the conversation app. + + This tool provides a simplified interface for common design tasks. + """ + + class FigmaDesignTool(Tool): + """High-level tool for Figma design operations.""" + + name = "design_with_figma" + description = ( + "Design UI elements, websites, or apps using Figma. " + "Can create new designs, modify existing files, or get design information. " + "Use this when asked to design something visual." + ) + parameters_schema = { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["create", "get_info", "modify", "export"], + "description": "The design action to perform", + }, + "description": { + "type": "string", + "description": "Description of what to design or modify", + }, + "file_key": { + "type": "string", + "description": "Figma file key (optional, for existing files)", + }, + "node_id": { + "type": "string", + "description": "Specific node/frame ID (optional)", + }, + }, + "required": ["action", "description"], + } + + def __init__(self, figma_tools: FigmaMCPTools): + """Initialize with Figma tools manager.""" + self.figma_tools = figma_tools + + async def run(self, deps: ToolDependencies, **kwargs: Any) -> str: + """Execute the design action.""" + action = kwargs.get("action", "create") + description = kwargs.get("description", "") + file_key = kwargs.get("file_key") + node_id = kwargs.get("node_id") + + if not self.figma_tools.is_connected(): + return ( + "I'd love to help you design that, but I'm not connected to Figma right now. " + "Please make sure the FIGMA_ACCESS_TOKEN is set in your environment." + ) + + client = self.figma_tools.client + if not client: + return "Figma connection not available." 
+ + try: + if action == "get_info": + # Get file/node information + if file_key: + result = await client.call_tool("get_file", {"file_key": file_key}) + return f"Here's what I found in that Figma file:\n{result}" + else: + return "I need a Figma file key to get information. You can find this in the Figma URL." + + elif action == "create": + # For creation, we describe what we want to create + return ( + f"I understand you want me to design: {description}\n\n" + "To create this in Figma, I recommend:\n" + "1. Open Figma and create a new file\n" + "2. Share the file key with me so I can help modify it\n" + "3. Or describe specific elements you want me to help design\n\n" + "Once you have a Figma file open, I can help you add and modify elements!" + ) + + elif action == "modify": + if not file_key: + return "I need a Figma file key to modify a design. Share the file URL with me!" + + result = await client.call_tool( + "modify_node", + { + "file_key": file_key, + "node_id": node_id or "", + "changes": description, + }, + ) + return f"I've made the changes: {result}" + + elif action == "export": + if not file_key: + return "I need a Figma file key to export. Share the file URL with me!" + + result = await client.call_tool( + "export_node", + {"file_key": file_key, "node_id": node_id or ""}, + ) + return f"Export ready: {result}" + + else: + return f"I don't know how to do '{action}' yet. Try: create, get_info, modify, or export." + + except Exception as e: + logger.error(f"Figma design tool error: {e}") + return f"Oops! Something went wrong with Figma: {str(e)}" + + return FigmaDesignTool + + +# Standalone tools that can be loaded without full MCP connection +class GetFigmaFileInfo(Tool): + """Tool to get information about a Figma file.""" + + name = "get_figma_file" + description = "Get information about a Figma design file. Requires a Figma file key from the URL." 
+ parameters_schema = { + "type": "object", + "properties": { + "file_key": { + "type": "string", + "description": "The Figma file key (found in the file URL after /file/)", + }, + }, + "required": ["file_key"], + } + + async def run(self, deps: ToolDependencies, **kwargs: Any) -> str: + """Get Figma file information.""" + file_key = kwargs.get("file_key", "") + + access_token = os.getenv("FIGMA_ACCESS_TOKEN") + if not access_token: + return "I need a Figma access token to view files. Please set FIGMA_ACCESS_TOKEN." + + try: + import httpx + + async with httpx.AsyncClient() as client: + response = await client.get( + f"https://api.figma.com/v1/files/{file_key}", + headers={"X-Figma-Token": access_token}, + ) + response.raise_for_status() + data = response.json() + + name = data.get("name", "Unknown") + last_modified = data.get("lastModified", "Unknown") + version = data.get("version", "Unknown") + + # Get page names + pages = [] + for page in data.get("document", {}).get("children", []): + pages.append(page.get("name", "Unnamed")) + + return ( + f"๐Ÿ“ **{name}**\n" + f"Last modified: {last_modified}\n" + f"Version: {version}\n" + f"Pages: {', '.join(pages) if pages else 'None'}" + ) + + except Exception as e: + logger.error(f"Error getting Figma file: {e}") + return f"Couldn't fetch the Figma file: {str(e)}" + diff --git a/src/reachy_mini_conversation_app/moves.py b/src/reachy_mini_conversation_app/moves.py new file mode 100644 index 0000000000000000000000000000000000000000..3c319e73d70519ea837987285849b997db2032a0 --- /dev/null +++ b/src/reachy_mini_conversation_app/moves.py @@ -0,0 +1,849 @@ +"""Movement system with sequential primary moves and additive secondary moves. + +Design overview +- Primary moves (emotions, dances, goto, breathing) are mutually exclusive and run + sequentially. +- Secondary moves (speech sway, face tracking) are additive offsets applied on top + of the current primary pose. 
+- There is a single control point to the robot: `ReachyMini.set_target`.
+- The control loop runs near 100 Hz and is phase-aligned via a monotonic clock.
+- Idle behaviour starts an infinite `BreathingMove` after a short inactivity delay
+  unless listening is active.
+
+Threading model
+- A dedicated worker thread owns all real-time state and issues `set_target`
+  commands.
+- Other threads communicate via a command queue (enqueue moves, mark activity,
+  toggle listening).
+- Secondary offset producers stage pending values guarded by locks; the worker
+  snapshots them atomically.
+
+Units and frames
+- Secondary offsets are interpreted as metres for x/y/z and radians for
+  roll/pitch/yaw in the world frame (unless noted by `compose_world_offset`).
+- Antennas and `body_yaw` are in radians.
+- Head pose composition uses `compose_world_offset(primary_head, secondary_head)`;
+  the secondary offset must therefore be expressed in the world frame.
+
+Safety
+- Listening freezes antennas, then blends them back on unfreeze.
+- Interpolations and blends are used to avoid jumps at all times.
+- `set_target` errors are rate-limited in logs.
+""" + +from __future__ import annotations +import time +import logging +import threading +from queue import Empty, Queue +from typing import Any, Dict, Tuple +from collections import deque +from dataclasses import dataclass + +import numpy as np +from numpy.typing import NDArray + +from reachy_mini import ReachyMini +from reachy_mini.utils import create_head_pose +from reachy_mini.motion.move import Move +from reachy_mini.utils.interpolation import ( + compose_world_offset, + linear_pose_interpolation, +) + + +logger = logging.getLogger(__name__) + +# Configuration constants +CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop + +# Type definitions +FullBodyPose = Tuple[NDArray[np.float32], Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw) + + +class BreathingMove(Move): # type: ignore + """Breathing move with interpolation to neutral and then continuous breathing patterns.""" + + def __init__( + self, + interpolation_start_pose: NDArray[np.float32], + interpolation_start_antennas: Tuple[float, float], + interpolation_duration: float = 1.0, + ): + """Initialize breathing move. 
+ + Args: + interpolation_start_pose: 4x4 matrix of current head pose to interpolate from + interpolation_start_antennas: Current antenna positions to interpolate from + interpolation_duration: Duration of interpolation to neutral (seconds) + + """ + self.interpolation_start_pose = interpolation_start_pose + self.interpolation_start_antennas = np.array(interpolation_start_antennas) + self.interpolation_duration = interpolation_duration + + # Neutral positions for breathing base + self.neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True) + self.neutral_antennas = np.array([0.0, 0.0]) + + # Breathing parameters + self.breathing_z_amplitude = 0.005 # 5mm gentle breathing + self.breathing_frequency = 0.1 # Hz (6 breaths per minute) + self.antenna_sway_amplitude = np.deg2rad(15) # 15 degrees + self.antenna_frequency = 0.5 # Hz (faster antenna sway) + + @property + def duration(self) -> float: + """Duration property required by official Move interface.""" + return float("inf") # Continuous breathing (never ends naturally) + + def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]: + """Evaluate breathing move at time t.""" + if t < self.interpolation_duration: + # Phase 1: Interpolate to neutral base position + interpolation_t = t / self.interpolation_duration + + # Interpolate head pose + head_pose = linear_pose_interpolation( + self.interpolation_start_pose, self.neutral_head_pose, interpolation_t, + ) + + # Interpolate antennas + antennas_interp = ( + 1 - interpolation_t + ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas + antennas = antennas_interp.astype(np.float64) + + else: + # Phase 2: Breathing patterns from neutral base + breathing_time = t - self.interpolation_duration + + # Gentle z-axis breathing + z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time) + head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, 
pitch=0, yaw=0, degrees=True, mm=False) + + # Antenna sway (opposite directions) + antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time) + antennas = np.array([antenna_sway, -antenna_sway], dtype=np.float64) + + # Return in official Move interface format: (head_pose, antennas_array, body_yaw) + return (head_pose, antennas, 0.0) + + +def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose: + """Combine primary and secondary full body poses. + + Args: + primary_pose: (head_pose, antennas, body_yaw) - primary move + secondary_pose: (head_pose, antennas, body_yaw) - secondary offsets + + Returns: + Combined full body pose (head_pose, antennas, body_yaw) + + """ + primary_head, primary_antennas, primary_body_yaw = primary_pose + secondary_head, secondary_antennas, secondary_body_yaw = secondary_pose + + # Combine head poses using compose_world_offset; the secondary pose must be an + # offset expressed in the world frame (T_off_world) applied to the absolute + # primary transform (T_abs). 
+ combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True) + + # Sum antennas and body_yaw + combined_antennas = ( + primary_antennas[0] + secondary_antennas[0], + primary_antennas[1] + secondary_antennas[1], + ) + combined_body_yaw = primary_body_yaw + secondary_body_yaw + + return (combined_head, combined_antennas, combined_body_yaw) + + +def clone_full_body_pose(pose: FullBodyPose) -> FullBodyPose: + """Create a deep copy of a full body pose tuple.""" + head, antennas, body_yaw = pose + return (head.copy(), (float(antennas[0]), float(antennas[1])), float(body_yaw)) + + +@dataclass +class MovementState: + """State tracking for the movement system.""" + + # Primary move state + current_move: Move | None = None + move_start_time: float | None = None + last_activity_time: float = 0.0 + + # Secondary move state (offsets) + speech_offsets: Tuple[float, float, float, float, float, float] = ( + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ) + face_tracking_offsets: Tuple[float, float, float, float, float, float] = ( + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ) + + # Status flags + last_primary_pose: FullBodyPose | None = None + + def update_activity(self) -> None: + """Update the last activity time.""" + self.last_activity_time = time.monotonic() + + +@dataclass +class LoopFrequencyStats: + """Track rolling loop frequency statistics.""" + + mean: float = 0.0 + m2: float = 0.0 + min_freq: float = float("inf") + count: int = 0 + last_freq: float = 0.0 + potential_freq: float = 0.0 + + def reset(self) -> None: + """Reset accumulators while keeping the last potential frequency.""" + self.mean = 0.0 + self.m2 = 0.0 + self.min_freq = float("inf") + self.count = 0 + + +class MovementManager: + """Coordinate sequential moves, additive offsets, and robot output at 100 Hz. + + Responsibilities: + - Own a real-time loop that samples the current primary move (if any), fuses + secondary offsets, and calls `set_target` exactly once per tick. 
    def __init__(
        self,
        current_robot: ReachyMini,
        camera_worker: "Any" = None,
    ):
        """Initialize movement manager.

        Args:
            current_robot: Robot handle used for all set_target/goto_target calls.
            camera_worker: Optional worker polled each tick for face-tracking offsets.
        """
        self.current_robot = current_robot
        self.camera_worker = camera_worker

        # Single timing source for durations (monotonic: immune to wall-clock jumps)
        self._now = time.monotonic

        # Movement state
        self.state = MovementState()
        self.state.last_activity_time = self._now()
        neutral_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
        self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)

        # Move queue (primary moves); only the worker thread mutates it
        self.move_queue: deque[Move] = deque()

        # Configuration
        self.idle_inactivity_delay = 0.3  # seconds before idle breathing may start
        self.target_frequency = CONTROL_LOOP_FREQUENCY_HZ
        self.target_period = 1.0 / self.target_frequency

        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None
        self._is_listening = False
        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
        self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
        self._antenna_unfreeze_blend = 1.0  # 1.0 = fully blended back to target
        self._antenna_blend_duration = 0.4  # seconds to blend back after listening
        self._last_listening_blend_time = self._now()
        self._breathing_active = False  # true when breathing move is running or queued
        self._listening_debounce_s = 0.15
        self._last_listening_toggle_time = self._now()
        self._last_set_target_err = 0.0
        self._set_target_err_interval = 1.0  # seconds between error logs
        self._set_target_err_suppressed = 0

        # Cross-thread signalling: commands flow through the queue so the worker
        # loop stays the sole mutator of movement state.
        self._command_queue: "Queue[Tuple[str, Any]]" = Queue()
        self._speech_offsets_lock = threading.Lock()
        self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        self._speech_offsets_dirty = False

        self._face_offsets_lock = threading.Lock()
        self._pending_face_offsets: Tuple[float, float, float, float, float, float] = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        self._face_offsets_dirty = False

        # Idle-related state mirrored for reads from other threads
        self._shared_state_lock = threading.Lock()
        self._shared_last_activity_time = self.state.last_activity_time
        self._shared_is_listening = self._is_listening
        self._status_lock = threading.Lock()
        self._freq_stats = LoopFrequencyStats()
        self._freq_snapshot = LoopFrequencyStats()

    def queue_move(self, move: Move) -> None:
        """Queue a primary move to run after the currently executing one.

        Thread-safe: the move is enqueued via the worker command queue so the
        control loop remains the sole mutator of movement state.
        """
        self._command_queue.put(("queue_move", move))

    def clear_move_queue(self) -> None:
        """Stop the active move and discard any queued primary moves.

        Thread-safe: executed by the worker thread via the command queue.
        """
        self._command_queue.put(("clear_queue", None))

    def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
        """Update speech-induced secondary offsets (x, y, z, roll, pitch, yaw).

        Offsets are interpreted as metres for translation and radians for
        rotation in the world frame. Thread-safe via a pending snapshot.
        """
        with self._speech_offsets_lock:
            self._pending_speech_offsets = offsets
            self._speech_offsets_dirty = True

    def set_moving_state(self, duration: float) -> None:
        """Mark the robot as actively moving for the provided duration.

        Legacy hook used by goto helpers to keep inactivity and breathing logic
        aware of manual motions. Thread-safe via the command queue.
        """
        self._command_queue.put(("set_moving_state", duration))

    def is_idle(self) -> bool:
        """Return True when the robot has been inactive longer than the idle delay."""
        with self._shared_state_lock:
            last_activity = self._shared_last_activity_time
            listening = self._shared_is_listening

        if listening:
            return False

        return self._now() - last_activity >= self.idle_inactivity_delay

    def set_listening(self, listening: bool) -> None:
        """Enable or disable listening mode without touching shared state directly.

        While listening:
        - Antenna positions are frozen at the last commanded values.
        - Blending is reset so that upon unfreezing the antennas return smoothly.
        - Idle breathing is suppressed.

        Thread-safe: the change is posted to the worker command queue.
        """
        # NOTE(review): the early-return check reads the mirrored flag, which is
        # only refreshed by the worker's _publish_shared_state; a rapid toggle
        # before the mirror updates may be dropped here (the worker also
        # debounces) — confirm this is intended.
        with self._shared_state_lock:
            if self._shared_is_listening == listening:
                return
        self._command_queue.put(("set_listening", listening))

    def _poll_signals(self, current_time: float) -> None:
        """Apply queued commands and pending offset updates."""
        self._apply_pending_offsets()

        # Drain the command queue without blocking the control loop.
        while True:
            try:
                command, payload = self._command_queue.get_nowait()
            except Empty:
                break
            self._handle_command(command, payload, current_time)

    def _apply_pending_offsets(self) -> None:
        """Apply the most recent speech/face offset updates.

        Each snapshot is taken under its lock and the dirty flag cleared, so the
        worker consumes at most one update per tick atomically.
        """
        speech_offsets: Tuple[float, float, float, float, float, float] | None = None
        with self._speech_offsets_lock:
            if self._speech_offsets_dirty:
                speech_offsets = self._pending_speech_offsets
                self._speech_offsets_dirty = False

        if speech_offsets is not None:
            self.state.speech_offsets = speech_offsets
            self.state.update_activity()

        face_offsets: Tuple[float, float, float, float, float, float] | None = None
        with self._face_offsets_lock:
            if self._face_offsets_dirty:
                face_offsets = self._pending_face_offsets
                self._face_offsets_dirty = False

        if face_offsets is not None:
            self.state.face_tracking_offsets = face_offsets
            self.state.update_activity()
    def _handle_command(self, command: str, payload: Any, current_time: float) -> None:
        """Handle a single cross-thread command.

        Runs on the worker thread only, so it may mutate movement state freely.
        Unknown commands are logged and ignored.
        """
        if command == "queue_move":
            if isinstance(payload, Move):
                self.move_queue.append(payload)
                self.state.update_activity()
                # Best-effort formatting of the move duration for the debug log.
                duration = getattr(payload, "duration", None)
                if duration is not None:
                    try:
                        duration_str = f"{float(duration):.2f}"
                    except (TypeError, ValueError):
                        duration_str = str(duration)
                else:
                    duration_str = "?"
                logger.debug(
                    "Queued move with duration %ss, queue size: %s",
                    duration_str,
                    len(self.move_queue),
                )
            else:
                logger.warning("Ignored queue_move command with invalid payload: %s", payload)
        elif command == "clear_queue":
            self.move_queue.clear()
            self.state.current_move = None
            self.state.move_start_time = None
            self._breathing_active = False
            logger.info("Cleared move queue and stopped current move")
        elif command == "set_moving_state":
            # NOTE(review): `duration` is validated but never used beyond the
            # activity bump — the documented "for the provided duration"
            # behavior is not implemented here; confirm whether that is intended.
            try:
                duration = float(payload)
            except (TypeError, ValueError):
                logger.warning("Invalid moving state duration: %s", payload)
                return
            self.state.update_activity()
        elif command == "mark_activity":
            self.state.update_activity()
        elif command == "set_listening":
            desired_state = bool(payload)
            # Debounce rapid listening toggles.
            now = self._now()
            if now - self._last_listening_toggle_time < self._listening_debounce_s:
                return
            self._last_listening_toggle_time = now

            if self._is_listening == desired_state:
                return

            self._is_listening = desired_state
            self._last_listening_blend_time = now
            if desired_state:
                # Freeze: snapshot current commanded antennas and reset blend
                self._listening_antennas = (
                    float(self._last_commanded_pose[1][0]),
                    float(self._last_commanded_pose[1][1]),
                )
                self._antenna_unfreeze_blend = 0.0
            else:
                # Unfreeze: restart blending from frozen pose
                self._antenna_unfreeze_blend = 0.0
            self.state.update_activity()
        else:
            logger.warning("Unknown command received by MovementManager: %s", command)

    def _publish_shared_state(self) -> None:
        """Expose idle-related state for external threads."""
        with self._shared_state_lock:
            self._shared_last_activity_time = self.state.last_activity_time
            self._shared_is_listening = self._is_listening

    def _manage_move_queue(self, current_time: float) -> None:
        """Manage the primary move queue (sequential execution).

        Ends the current move once its duration has elapsed, then starts the
        next queued move (if any).
        """
        if self.state.current_move is None or (
            self.state.move_start_time is not None
            and current_time - self.state.move_start_time >= self.state.current_move.duration
        ):
            self.state.current_move = None
            self.state.move_start_time = None

            if self.move_queue:
                self.state.current_move = self.move_queue.popleft()
                self.state.move_start_time = current_time
                # Any real move cancels breathing mode flag
                self._breathing_active = isinstance(self.state.current_move, BreathingMove)
                logger.debug(f"Starting new move, duration: {self.state.current_move.duration}s")

    def _manage_breathing(self, current_time: float) -> None:
        """Manage automatic breathing when idle.

        Starts a BreathingMove after `idle_inactivity_delay` seconds of
        inactivity (unless listening), and cancels it as soon as a real move
        arrives.
        """
        if (
            self.state.current_move is None
            and not self.move_queue
            and not self._is_listening
            and not self._breathing_active
        ):
            idle_for = current_time - self.state.last_activity_time
            if idle_for >= self.idle_inactivity_delay:
                try:
                    # These 2 functions return the latest available sensor data from the robot, but don't perform I/O synchronously.
                    # Therefore, we accept calling them inside the control loop.
                    _, current_antennas = self.current_robot.get_current_joint_positions()
                    current_head_pose = self.current_robot.get_current_head_pose()

                    self._breathing_active = True
                    self.state.update_activity()

                    # Interpolate from the robot's actual pose into the
                    # breathing cycle to avoid a visible jump.
                    breathing_move = BreathingMove(
                        interpolation_start_pose=current_head_pose,
                        interpolation_start_antennas=current_antennas,
                        interpolation_duration=1.0,
                    )
                    self.move_queue.append(breathing_move)
                    logger.debug("Started breathing after %.1fs of inactivity", idle_for)
                except Exception as e:
                    self._breathing_active = False
                    logger.error("Failed to start breathing: %s", e)

        # A newly queued move preempts an in-progress breathing move.
        if isinstance(self.state.current_move, BreathingMove) and self.move_queue:
            self.state.current_move = None
            self.state.move_start_time = None
            self._breathing_active = False
            logger.debug("Stopping breathing due to new move activity")

        if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
            self._breathing_active = False
    def _get_primary_pose(self, current_time: float) -> FullBodyPose:
        """Get the primary full body pose from current move or neutral.

        Returns a freshly cloned pose so later mutation cannot corrupt the
        cached `last_primary_pose`.
        """
        # When a primary move is playing, sample it and cache the resulting pose
        if self.state.current_move is not None and self.state.move_start_time is not None:
            move_time = current_time - self.state.move_start_time
            head, antennas, body_yaw = self.state.current_move.evaluate(move_time)

            # Moves may return None for components they do not drive; fall back
            # to neutral values.
            if head is None:
                head = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
            if antennas is None:
                antennas = np.array([0.0, 0.0])
            if body_yaw is None:
                body_yaw = 0.0

            antennas_tuple = (float(antennas[0]), float(antennas[1]))
            head_copy = head.copy()
            primary_full_body_pose = (
                head_copy,
                antennas_tuple,
                float(body_yaw),
            )

            self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
        # Otherwise reuse the last primary pose so we avoid jumps between moves
        elif self.state.last_primary_pose is not None:
            primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
        else:
            neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
            primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
            self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

        return primary_full_body_pose

    def _get_secondary_pose(self) -> FullBodyPose:
        """Get the secondary full body pose from speech and face tracking offsets.

        Offsets are summed component-wise (metres / radians, world frame) and
        converted into a head transform; antennas and body yaw carry no
        secondary contribution.
        """
        # Combine speech sway offsets + face tracking offsets for secondary pose
        secondary_offsets = [
            self.state.speech_offsets[0] + self.state.face_tracking_offsets[0],
            self.state.speech_offsets[1] + self.state.face_tracking_offsets[1],
            self.state.speech_offsets[2] + self.state.face_tracking_offsets[2],
            self.state.speech_offsets[3] + self.state.face_tracking_offsets[3],
            self.state.speech_offsets[4] + self.state.face_tracking_offsets[4],
            self.state.speech_offsets[5] + self.state.face_tracking_offsets[5],
        ]

        secondary_head_pose = create_head_pose(
            x=secondary_offsets[0],
            y=secondary_offsets[1],
            z=secondary_offsets[2],
            roll=secondary_offsets[3],
            pitch=secondary_offsets[4],
            yaw=secondary_offsets[5],
            degrees=False,
            mm=False,
        )
        return (secondary_head_pose, (0.0, 0.0), 0.0)

    def _compose_full_body_pose(self, current_time: float) -> FullBodyPose:
        """Compose primary and secondary poses into a single command pose."""
        primary = self._get_primary_pose(current_time)
        secondary = self._get_secondary_pose()
        return combine_full_body(primary, secondary)

    def _update_primary_motion(self, current_time: float) -> None:
        """Advance queue state and idle behaviours for this tick."""
        self._manage_move_queue(current_time)
        self._manage_breathing(current_time)

    def _calculate_blended_antennas(self, target_antennas: Tuple[float, float]) -> Tuple[float, float]:
        """Blend target antennas with listening freeze state and update blending.

        While listening the frozen antenna snapshot is returned unchanged; after
        listening ends, output is linearly blended from the frozen values to the
        live targets over `_antenna_blend_duration` seconds. Once the blend
        completes, the snapshot is updated to the target so the next freeze
        starts from the current command.
        """
        now = self._now()
        listening = self._is_listening
        listening_antennas = self._listening_antennas
        blend = self._antenna_unfreeze_blend
        blend_duration = self._antenna_blend_duration
        last_update = self._last_listening_blend_time
        self._last_listening_blend_time = now

        if listening:
            antennas_cmd = listening_antennas
            new_blend = 0.0
        else:
            dt = max(0.0, now - last_update)
            if blend_duration <= 0:
                new_blend = 1.0
            else:
                new_blend = min(1.0, blend + dt / blend_duration)
            # Linear interpolation from frozen snapshot to live target.
            antennas_cmd = (
                listening_antennas[0] * (1.0 - new_blend) + target_antennas[0] * new_blend,
                listening_antennas[1] * (1.0 - new_blend) + target_antennas[1] * new_blend,
            )

        if listening:
            self._antenna_unfreeze_blend = 0.0
        else:
            self._antenna_unfreeze_blend = new_blend
            if new_blend >= 1.0:
                self._listening_antennas = (
                    float(target_antennas[0]),
                    float(target_antennas[1]),
                )

        return antennas_cmd
    def _issue_control_command(self, head: NDArray[np.float32], antennas: Tuple[float, float], body_yaw: float) -> None:
        """Send the fused pose to the robot with throttled error logging.

        Failures are logged at most once per `_set_target_err_interval` seconds;
        intermediate failures are counted and reported with the next log line.
        On success, the commanded pose is cached for status reporting.
        """
        try:
            self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
        except Exception as e:
            now = self._now()
            if now - self._last_set_target_err >= self._set_target_err_interval:
                msg = f"Failed to set robot target: {e}"
                if self._set_target_err_suppressed:
                    msg += f" (suppressed {self._set_target_err_suppressed} repeats)"
                    self._set_target_err_suppressed = 0
                logger.error(msg)
                self._last_set_target_err = now
            else:
                self._set_target_err_suppressed += 1
        else:
            with self._status_lock:
                self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))

    def _update_frequency_stats(
        self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats,
    ) -> LoopFrequencyStats:
        """Update frequency statistics based on the current loop start time.

        Uses a Welford-style online update for mean/m2 so variance can be
        derived later without storing samples.
        """
        period = loop_start - prev_loop_start
        if period > 0:
            stats.last_freq = 1.0 / period
            stats.count += 1
            delta = stats.last_freq - stats.mean
            stats.mean += delta / stats.count
            stats.m2 += delta * (stats.last_freq - stats.mean)
            stats.min_freq = min(stats.min_freq, stats.last_freq)
        return stats

    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> Tuple[float, LoopFrequencyStats]:
        """Compute sleep time to maintain target frequency and update potential freq.

        `potential_freq` is the rate the loop could achieve with zero sleep.
        """
        computation_time = self._now() - loop_start
        stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
        sleep_time = max(0.0, self.target_period - computation_time)
        return sleep_time, stats

    def _record_frequency_snapshot(self, stats: LoopFrequencyStats) -> None:
        """Store a thread-safe snapshot of current frequency statistics."""
        with self._status_lock:
            self._freq_snapshot = LoopFrequencyStats(
                mean=stats.mean,
                m2=stats.m2,
                min_freq=stats.min_freq,
                count=stats.count,
                last_freq=stats.last_freq,
                potential_freq=stats.potential_freq,
            )

    def _maybe_log_frequency(self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats) -> None:
        """Emit frequency telemetry when enough loops have elapsed.

        Resets the accumulators after logging so each report covers a fresh
        window.
        """
        if loop_count % print_interval_loops != 0 or stats.count == 0:
            return

        variance = stats.m2 / stats.count if stats.count > 0 else 0.0
        lowest = stats.min_freq if stats.min_freq != float("inf") else 0.0
        logger.debug(
            "Loop freq - avg: %.2fHz, variance: %.4f, min: %.2fHz, last: %.2fHz, potential: %.2fHz, target: %.1fHz",
            stats.mean,
            variance,
            lowest,
            stats.last_freq,
            stats.potential_freq,
            self.target_frequency,
        )
        stats.reset()

    def _update_face_tracking(self, current_time: float) -> None:
        """Get face tracking offsets from camera worker thread."""
        if self.camera_worker is not None:
            # Get face tracking offsets from camera worker thread
            offsets = self.camera_worker.get_face_tracking_offsets()
            self.state.face_tracking_offsets = offsets
        else:
            # No camera worker, use neutral offsets
            self.state.face_tracking_offsets = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
self.state.face_tracking_offsets = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + + def start(self) -> None: + """Start the worker thread that drives the 100 Hz control loop.""" + if self._thread is not None and self._thread.is_alive(): + logger.warning("Move worker already running; start() ignored") + return + self._stop_event.clear() + self._thread = threading.Thread(target=self.working_loop, daemon=True) + self._thread.start() + logger.debug("Move worker started") + + def stop(self) -> None: + """Request the worker thread to stop and wait for it to exit. + + Before stopping, resets the robot to a neutral position. + """ + if self._thread is None or not self._thread.is_alive(): + logger.debug("Move worker not running; stop() ignored") + return + + logger.info("Stopping movement manager and resetting to neutral position...") + + # Clear any queued moves and stop current move + self.clear_move_queue() + + # Stop the worker thread first so it doesn't interfere + self._stop_event.set() + if self._thread is not None: + self._thread.join() + self._thread = None + logger.debug("Move worker stopped") + + # Reset to neutral position using goto_target (same approach as wake_up) + try: + neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True) + neutral_antennas = [0.0, 0.0] + neutral_body_yaw = 0.0 + + # Use goto_target directly on the robot + self.current_robot.goto_target( + head=neutral_head_pose, + antennas=neutral_antennas, + duration=2.0, + body_yaw=neutral_body_yaw, + ) + + logger.info("Reset to neutral position completed") + + except Exception as e: + logger.error(f"Failed to reset to neutral position: {e}") + + def get_status(self) -> Dict[str, Any]: + """Return a lightweight status snapshot for observability.""" + with self._status_lock: + pose_snapshot = clone_full_body_pose(self._last_commanded_pose) + freq_snapshot = LoopFrequencyStats( + mean=self._freq_snapshot.mean, + m2=self._freq_snapshot.m2, + min_freq=self._freq_snapshot.min_freq, + 
count=self._freq_snapshot.count, + last_freq=self._freq_snapshot.last_freq, + potential_freq=self._freq_snapshot.potential_freq, + ) + + head_matrix = pose_snapshot[0].tolist() if pose_snapshot else None + antennas = pose_snapshot[1] if pose_snapshot else None + body_yaw = pose_snapshot[2] if pose_snapshot else None + + return { + "queue_size": len(self.move_queue), + "is_listening": self._is_listening, + "breathing_active": self._breathing_active, + "last_commanded_pose": { + "head": head_matrix, + "antennas": antennas, + "body_yaw": body_yaw, + }, + "loop_frequency": { + "last": freq_snapshot.last_freq, + "mean": freq_snapshot.mean, + "min": freq_snapshot.min_freq, + "potential": freq_snapshot.potential_freq, + "samples": freq_snapshot.count, + }, + } + + def working_loop(self) -> None: + """Control loop main movements - reproduces main_works.py control architecture. + + Single set_target() call with pose fusion. + """ + logger.debug("Starting enhanced movement control loop (100Hz)") + + loop_count = 0 + prev_loop_start = self._now() + print_interval_loops = max(1, int(self.target_frequency * 2)) + freq_stats = self._freq_stats + + while not self._stop_event.is_set(): + loop_start = self._now() + loop_count += 1 + + if loop_count > 1: + freq_stats = self._update_frequency_stats(loop_start, prev_loop_start, freq_stats) + prev_loop_start = loop_start + + # 1) Poll external commands and apply pending offsets (atomic snapshot) + self._poll_signals(loop_start) + + # 2) Manage the primary move queue (start new move, end finished move, breathing) + self._update_primary_motion(loop_start) + + # 3) Update vision-based secondary offsets + self._update_face_tracking(loop_start) + + # 4) Build primary and secondary full-body poses, then fuse them + head, antennas, body_yaw = self._compose_full_body_pose(loop_start) + + # 5) Apply listening antenna freeze or blend-back + antennas_cmd = self._calculate_blended_antennas(antennas) + + # 6) Single set_target call - the only 
control point + self._issue_control_command(head, antennas_cmd, body_yaw) + + # 7) Adaptive sleep to align to next tick, then publish shared state + sleep_time, freq_stats = self._schedule_next_tick(loop_start, freq_stats) + self._publish_shared_state() + self._record_frequency_snapshot(freq_stats) + + # 8) Periodic telemetry on loop frequency + self._maybe_log_frequency(loop_count, print_interval_loops, freq_stats) + + if sleep_time > 0: + time.sleep(sleep_time) + + logger.debug("Movement control loop stopped") diff --git a/src/reachy_mini_conversation_app/openai_realtime.py b/src/reachy_mini_conversation_app/openai_realtime.py new file mode 100644 index 0000000000000000000000000000000000000000..f3eb5698673d11091e537ab9a46c489e56b3c771 --- /dev/null +++ b/src/reachy_mini_conversation_app/openai_realtime.py @@ -0,0 +1,754 @@ +import json +import base64 +import random +import asyncio +import logging +from typing import Any, Final, Tuple, Literal, Optional +from pathlib import Path +from datetime import datetime + +import cv2 +import numpy as np +import gradio as gr +from openai import AsyncOpenAI +from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item, audio_to_int16 +from numpy.typing import NDArray +from scipy.signal import resample +from websockets.exceptions import ConnectionClosedError + +from reachy_mini_conversation_app.config import config +from reachy_mini_conversation_app.prompts import get_session_voice, get_session_instructions +from reachy_mini_conversation_app.tools.core_tools import ( + ToolDependencies, + get_tool_specs, + dispatch_tool_call, +) + + +logger = logging.getLogger(__name__) + +OPEN_AI_INPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000 +OPEN_AI_OUTPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000 + + +class OpenaiRealtimeHandler(AsyncStreamHandler): + """An OpenAI realtime handler for fastrtc Stream.""" + + def __init__(self, deps: ToolDependencies, gradio_mode: bool = False, instance_path: Optional[str] = None): + 
"""Initialize the handler.""" + super().__init__( + expected_layout="mono", + output_sample_rate=OPEN_AI_OUTPUT_SAMPLE_RATE, + input_sample_rate=OPEN_AI_INPUT_SAMPLE_RATE, + ) + + # Override typing of the sample rates to match OpenAI's requirements + self.output_sample_rate: Literal[24000] = self.output_sample_rate + self.input_sample_rate: Literal[24000] = self.input_sample_rate + + self.deps = deps + + # Override type annotations for OpenAI strict typing (only for values used in API) + self.output_sample_rate = OPEN_AI_OUTPUT_SAMPLE_RATE + self.input_sample_rate = OPEN_AI_INPUT_SAMPLE_RATE + + self.connection: Any = None + self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue() + + self.last_activity_time = asyncio.get_event_loop().time() + self.start_time = asyncio.get_event_loop().time() + self.is_idle_tool_call = False + self.gradio_mode = gradio_mode + self.instance_path = instance_path + # Track how the API key was provided (env vs textbox) and its value + self._key_source: Literal["env", "textbox"] = "env" + self._provided_api_key: str | None = None + + # Debouncing for partial transcripts + self.partial_transcript_task: asyncio.Task[None] | None = None + self.partial_transcript_sequence: int = 0 # sequence counter to prevent stale emissions + self.partial_debounce_delay = 0.5 # seconds + + # Internal lifecycle flags + self._shutdown_requested: bool = False + self._connected_event: asyncio.Event = asyncio.Event() + + def copy(self) -> "OpenaiRealtimeHandler": + """Create a copy of the handler.""" + return OpenaiRealtimeHandler(self.deps, self.gradio_mode, self.instance_path) + + async def apply_personality(self, profile: str | None) -> str: + """Apply a new personality (profile) at runtime if possible. + + - Updates the global config's selected profile for subsequent calls. 
+ - If a realtime connection is active, sends a session.update with the + freshly resolved instructions so the change takes effect immediately. + + Returns a short status message for UI feedback. + """ + try: + # Update the in-process config value and env + from reachy_mini_conversation_app.config import config as _config + from reachy_mini_conversation_app.config import set_custom_profile + + set_custom_profile(profile) + logger.info( + "Set custom profile to %r (config=%r)", profile, getattr(_config, "REACHY_MINI_CUSTOM_PROFILE", None) + ) + + try: + instructions = get_session_instructions() + voice = get_session_voice() + except BaseException as e: # catch SystemExit from prompt loader without crashing + logger.error("Failed to resolve personality content: %s", e) + return f"Failed to apply personality: {e}" + + # Attempt a live update first, then force a full restart to ensure it sticks + if self.connection is not None: + try: + await self.connection.session.update( + session={ + "type": "realtime", + "instructions": instructions, + "audio": {"output": {"voice": voice}}, + }, + ) + logger.info("Applied personality via live update: %s", profile or "built-in default") + except Exception as e: + logger.warning("Live update failed; will restart session: %s", e) + + # Force a real restart to guarantee the new instructions/voice + try: + await self._restart_session() + return "Applied personality and restarted realtime session." + except Exception as e: + logger.warning("Failed to restart session after apply: %s", e) + return "Applied personality. Will take effect on next connection." + else: + logger.info( + "Applied personality recorded: %s (no live connection; will apply on next session)", + profile or "built-in default", + ) + return "Applied personality. Will take effect on next connection." 
+ except Exception as e: + logger.error("Error applying personality '%s': %s", profile, e) + return f"Failed to apply personality: {e}" + + async def _emit_debounced_partial(self, transcript: str, sequence: int) -> None: + """Emit partial transcript after debounce delay.""" + try: + await asyncio.sleep(self.partial_debounce_delay) + # Only emit if this is still the latest partial (by sequence number) + if self.partial_transcript_sequence == sequence: + await self.output_queue.put(AdditionalOutputs({"role": "user_partial", "content": transcript})) + logger.debug(f"Debounced partial emitted: {transcript}") + except asyncio.CancelledError: + logger.debug("Debounced partial cancelled") + raise + + async def start_up(self) -> None: + """Start the handler with minimal retries on unexpected websocket closure.""" + openai_api_key = config.OPENAI_API_KEY + if self.gradio_mode and not openai_api_key: + # api key was not found in .env or in the environment variables + await self.wait_for_args() # type: ignore[no-untyped-call] + args = list(self.latest_args) + textbox_api_key = args[3] if len(args[3]) > 0 else None + if textbox_api_key is not None: + openai_api_key = textbox_api_key + self._key_source = "textbox" + self._provided_api_key = textbox_api_key + else: + openai_api_key = config.OPENAI_API_KEY + else: + if not openai_api_key or not openai_api_key.strip(): + # In headless console mode, LocalStream now blocks startup until the key is provided. + # However, unit tests may invoke this handler directly with a stubbed client. + # To keep tests hermetic without requiring a real key, fall back to a placeholder. + logger.warning("OPENAI_API_KEY missing. 
Proceeding with a placeholder (tests/offline).") + openai_api_key = "DUMMY" + + self.client = AsyncOpenAI(api_key=openai_api_key) + + max_attempts = 3 + for attempt in range(1, max_attempts + 1): + try: + await self._run_realtime_session() + # Normal exit from the session, stop retrying + return + except ConnectionClosedError as e: + # Abrupt close (e.g., "no close frame received or sent") โ†’ retry + logger.warning("Realtime websocket closed unexpectedly (attempt %d/%d): %s", attempt, max_attempts, e) + if attempt < max_attempts: + # exponential backoff with jitter + base_delay = 2 ** (attempt - 1) # 1s, 2s, 4s, 8s, etc. + jitter = random.uniform(0, 0.5) + delay = base_delay + jitter + logger.info("Retrying in %.1f seconds...", delay) + await asyncio.sleep(delay) + continue + raise + finally: + # never keep a stale reference + self.connection = None + try: + self._connected_event.clear() + except Exception: + pass + + async def _restart_session(self) -> None: + """Force-close the current session and start a fresh one in background. + + Does not block the caller while the new session is establishing. 
+ """ + try: + if self.connection is not None: + try: + await self.connection.close() + except Exception: + pass + finally: + self.connection = None + + # Ensure we have a client (start_up must have run once) + if getattr(self, "client", None) is None: + logger.warning("Cannot restart: OpenAI client not initialized yet.") + return + + # Fire-and-forget new session and wait briefly for connection + try: + self._connected_event.clear() + except Exception: + pass + asyncio.create_task(self._run_realtime_session(), name="openai-realtime-restart") + try: + await asyncio.wait_for(self._connected_event.wait(), timeout=5.0) + logger.info("Realtime session restarted and connected.") + except asyncio.TimeoutError: + logger.warning("Realtime session restart timed out; continuing in background.") + except Exception as e: + logger.warning("_restart_session failed: %s", e) + + async def _run_realtime_session(self) -> None: + """Establish and manage a single realtime session.""" + async with self.client.realtime.connect(model=config.MODEL_NAME) as conn: + try: + await conn.session.update( + session={ + "type": "realtime", + "instructions": get_session_instructions(), + "audio": { + "input": { + "format": { + "type": "audio/pcm", + "rate": self.input_sample_rate, + }, + "transcription": {"model": "gpt-4o-transcribe", "language": "en"}, + "turn_detection": { + "type": "server_vad", + "interrupt_response": True, + }, + }, + "output": { + "format": { + "type": "audio/pcm", + "rate": self.output_sample_rate, + }, + "voice": get_session_voice(), + }, + }, + "tools": get_tool_specs(), # type: ignore[typeddict-item] + "tool_choice": "auto", + }, + ) + logger.info( + "Realtime session initialized with profile=%r voice=%r", + getattr(config, "REACHY_MINI_CUSTOM_PROFILE", None), + get_session_voice(), + ) + # If we reached here, the session update succeeded which implies the API key worked. + # Persist the key to a newly created .env (copied from .env.example) if needed. 
+ self._persist_api_key_if_needed() + except Exception: + logger.exception("Realtime session.update failed; aborting startup") + return + + logger.info("Realtime session updated successfully") + + # Manage event received from the openai server + self.connection = conn + try: + self._connected_event.set() + except Exception: + pass + + # Send initial greeting to make Reachy speak first + try: + await self._send_initial_greeting() + except Exception as e: + logger.warning("Failed to send initial greeting: %s", e) + + async for event in self.connection: + logger.debug(f"OpenAI event: {event.type}") + if event.type == "input_audio_buffer.speech_started": + if hasattr(self, "_clear_queue") and callable(self._clear_queue): + self._clear_queue() + if self.deps.head_wobbler is not None: + self.deps.head_wobbler.reset() + self.deps.movement_manager.set_listening(True) + logger.debug("User speech started") + + if event.type == "input_audio_buffer.speech_stopped": + self.deps.movement_manager.set_listening(False) + logger.debug("User speech stopped - server will auto-commit with VAD") + + if event.type in ( + "response.audio.done", # GA + "response.output_audio.done", # GA alias + "response.audio.completed", # legacy (for safety) + "response.completed", # text-only completion + ): + logger.debug("response completed") + + if event.type == "response.created": + logger.debug("Response created") + + if event.type == "response.done": + # Doesn't mean the audio is done playing + logger.debug("Response done") + + # Handle partial transcription (user speaking in real-time) + if event.type == "conversation.item.input_audio_transcription.partial": + logger.debug(f"User partial transcript: {event.transcript}") + + # Increment sequence + self.partial_transcript_sequence += 1 + current_sequence = self.partial_transcript_sequence + + # Cancel previous debounce task if it exists + if self.partial_transcript_task and not self.partial_transcript_task.done(): + 
self.partial_transcript_task.cancel() + try: + await self.partial_transcript_task + except asyncio.CancelledError: + pass + + # Start new debounce timer with sequence number + self.partial_transcript_task = asyncio.create_task( + self._emit_debounced_partial(event.transcript, current_sequence) + ) + + # Handle completed transcription (user finished speaking) + if event.type == "conversation.item.input_audio_transcription.completed": + logger.debug(f"User transcript: {event.transcript}") + + # Cancel any pending partial emission + if self.partial_transcript_task and not self.partial_transcript_task.done(): + self.partial_transcript_task.cancel() + try: + await self.partial_transcript_task + except asyncio.CancelledError: + pass + + await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript})) + + # Handle assistant transcription + if event.type in ("response.audio_transcript.done", "response.output_audio_transcript.done"): + logger.debug(f"Assistant transcript: {event.transcript}") + await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript})) + + # Handle audio delta + if event.type in ("response.audio.delta", "response.output_audio.delta"): + if self.deps.head_wobbler is not None: + self.deps.head_wobbler.feed(event.delta) + self.last_activity_time = asyncio.get_event_loop().time() + logger.debug("last activity time updated to %s", self.last_activity_time) + await self.output_queue.put( + ( + self.output_sample_rate, + np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1), + ), + ) + + # ---- tool-calling plumbing ---- + if event.type == "response.function_call_arguments.done": + tool_name = getattr(event, "name", None) + args_json_str = getattr(event, "arguments", None) + call_id = getattr(event, "call_id", None) + + if not isinstance(tool_name, str) or not isinstance(args_json_str, str): + logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str) 
+ continue + + try: + tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps) + logger.debug("Tool '%s' executed successfully", tool_name) + logger.debug("Tool result: %s", tool_result) + except Exception as e: + logger.error("Tool '%s' failed", tool_name) + tool_result = {"error": str(e)} + + # send the tool result back + if isinstance(call_id, str): + await self.connection.conversation.item.create( + item={ + "type": "function_call_output", + "call_id": call_id, + "output": json.dumps(tool_result), + }, + ) + + await self.output_queue.put( + AdditionalOutputs( + { + "role": "assistant", + "content": json.dumps(tool_result), + "metadata": {"title": f"๐Ÿ› ๏ธ Used tool {tool_name}", "status": "done"}, + }, + ), + ) + + if tool_name == "camera" and "b64_im" in tool_result: + # use raw base64, don't json.dumps (which adds quotes) + b64_im = tool_result["b64_im"] + if not isinstance(b64_im, str): + logger.warning("Unexpected type for b64_im: %s", type(b64_im)) + b64_im = str(b64_im) + await self.connection.conversation.item.create( + item={ + "type": "message", + "role": "user", + "content": [ + { + "type": "input_image", + "image_url": f"data:image/jpeg;base64,{b64_im}", + }, + ], + }, + ) + logger.info("Added camera image to conversation") + + if self.deps.camera_worker is not None: + np_img = self.deps.camera_worker.get_latest_frame() + if np_img is not None: + # Camera frames are BGR from OpenCV; convert so Gradio displays correct colors. 
+ rgb_frame = cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB) + else: + rgb_frame = None + img = gr.Image(value=rgb_frame) + + await self.output_queue.put( + AdditionalOutputs( + { + "role": "assistant", + "content": img, + }, + ), + ) + + # if this tool call was triggered by an idle signal, don't make the robot speak + # for other tool calls, let the robot reply out loud + if self.is_idle_tool_call: + self.is_idle_tool_call = False + else: + await self.connection.response.create( + response={ + "instructions": "Use the tool result just returned and answer concisely in speech.", + }, + ) + + # re synchronize the head wobble after a tool call that may have taken some time + if self.deps.head_wobbler is not None: + self.deps.head_wobbler.reset() + + # server error + if event.type == "error": + err = getattr(event, "error", None) + msg = getattr(err, "message", str(err) if err else "unknown error") + code = getattr(err, "code", "") + + logger.error("Realtime error [%s]: %s (raw=%s)", code, msg, err) + + # Only show user-facing errors, not internal state errors + if code not in ("input_audio_buffer_commit_empty", "conversation_already_has_active_response"): + await self.output_queue.put( + AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}) + ) + + # Microphone receive + async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None: + """Receive audio frame from the microphone and send it to the OpenAI server. + + Handles both mono and stereo audio formats, converting to the expected + mono format for OpenAI's API. Resamples if the input sample rate differs + from the expected rate. + + Args: + frame: A tuple containing (sample_rate, audio_data). 
+ + """ + if not self.connection: + return + + input_sample_rate, audio_frame = frame + + # Reshape if needed + if audio_frame.ndim == 2: + # Scipy channels last convention + if audio_frame.shape[1] > audio_frame.shape[0]: + audio_frame = audio_frame.T + # Multiple channels -> Mono channel + if audio_frame.shape[1] > 1: + audio_frame = audio_frame[:, 0] + + # Resample if needed + if self.input_sample_rate != input_sample_rate: + audio_frame = resample(audio_frame, int(len(audio_frame) * self.input_sample_rate / input_sample_rate)) + + # Cast if needed + audio_frame = audio_to_int16(audio_frame) + + # Send to OpenAI (guard against races during reconnect) + try: + audio_message = base64.b64encode(audio_frame.tobytes()).decode("utf-8") + await self.connection.input_audio_buffer.append(audio=audio_message) + except Exception as e: + logger.debug("Dropping audio frame: connection not ready (%s)", e) + return + + async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None: + """Emit audio frame to be played by the speaker.""" + # sends to the stream the stuff put in the output queue by the openai event handler + # This is called periodically by the fastrtc Stream + + # Handle idle + idle_duration = asyncio.get_event_loop().time() - self.last_activity_time + if idle_duration > 15.0 and self.deps.movement_manager.is_idle(): + try: + await self.send_idle_signal(idle_duration) + except Exception as e: + logger.warning("Idle signal skipped (connection closed?): %s", e) + return None + + self.last_activity_time = asyncio.get_event_loop().time() # avoid repeated resets + + return await wait_for_item(self.output_queue) # type: ignore[no-any-return] + + async def shutdown(self) -> None: + """Shutdown the handler.""" + self._shutdown_requested = True + # Cancel any pending debounce task + if self.partial_transcript_task and not self.partial_transcript_task.done(): + self.partial_transcript_task.cancel() + try: + await self.partial_transcript_task + except 
asyncio.CancelledError: + pass + + if self.connection: + try: + await self.connection.close() + except ConnectionClosedError as e: + logger.debug(f"Connection already closed during shutdown: {e}") + except Exception as e: + logger.debug(f"connection.close() ignored: {e}") + finally: + self.connection = None + + # Clear any remaining items in the output queue + while not self.output_queue.empty(): + try: + self.output_queue.get_nowait() + except asyncio.QueueEmpty: + break + + def format_timestamp(self) -> str: + """Format current timestamp with date, time, and elapsed seconds.""" + loop_time = asyncio.get_event_loop().time() # monotonic + elapsed_seconds = loop_time - self.start_time + dt = datetime.now() # wall-clock + return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]" + + async def get_available_voices(self) -> list[str]: + """Try to discover available voices for the configured realtime model. + + Attempts to retrieve model metadata from the OpenAI Models API and look + for any keys that might contain voice names. Falls back to a curated + list known to work with realtime if discovery fails. 
+ """ + # Conservative fallback list with default first + fallback = [ + "cedar", + "alloy", + "aria", + "ballad", + "verse", + "sage", + "coral", + ] + try: + # Best effort discovery; safe-guarded for unexpected shapes + model = await self.client.models.retrieve(config.MODEL_NAME) + # Try common serialization paths + raw = None + for attr in ("model_dump", "to_dict"): + fn = getattr(model, attr, None) + if callable(fn): + try: + raw = fn() + break + except Exception: + pass + if raw is None: + try: + raw = dict(model) + except Exception: + raw = None + # Scan for voice candidates + candidates: set[str] = set() + + def _collect(obj: object) -> None: + try: + if isinstance(obj, dict): + for k, v in obj.items(): + kl = str(k).lower() + if "voice" in kl and isinstance(v, (list, tuple)): + for item in v: + if isinstance(item, str): + candidates.add(item) + elif isinstance(item, dict) and "name" in item and isinstance(item["name"], str): + candidates.add(item["name"]) + else: + _collect(v) + elif isinstance(obj, (list, tuple)): + for it in obj: + _collect(it) + except Exception: + pass + + if isinstance(raw, dict): + _collect(raw) + # Ensure default present and stable order + voices = sorted(candidates) if candidates else fallback + if "cedar" not in voices: + voices = ["cedar", *[v for v in voices if v != "cedar"]] + return voices + except Exception: + return fallback + + async def _send_initial_greeting(self) -> None: + """Send an initial greeting to make Reachy speak first.""" + if not self.connection: + return + + logger.info("Sending initial greeting") + + # Send a system message to trigger Reachy's greeting + await self.connection.conversation.item.create( + item={ + "type": "message", + "role": "user", + "content": [ + { + "type": "input_text", + "text": "[Session started] Greet the user briefly and ask what they'd like to build or create today. 
Keep it short and friendly - one sentence max.", + } + ], + }, + ) + + # Trigger response + await self.connection.response.create( + response={ + "instructions": "Greet the user warmly and briefly. One short sentence. Be friendly and ready to help them build something.", + }, + ) + + async def send_idle_signal(self, idle_duration: float) -> None: + """Send an idle signal to the openai server.""" + logger.debug("Sending idle signal") + self.is_idle_tool_call = True + timestamp_msg = f"[Idle time update: {self.format_timestamp()} - No activity for {idle_duration:.1f}s] You've been idle for a while. Feel free to get creative - dance, show an emotion, look around, do nothing, or just be yourself!" + if not self.connection: + logger.debug("No connection, cannot send idle signal") + return + await self.connection.conversation.item.create( + item={ + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": timestamp_msg}], + }, + ) + await self.connection.response.create( + response={ + "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.", + "tool_choice": "required", + }, + ) + + def _persist_api_key_if_needed(self) -> None: + """Persist the API key into `.env` inside `instance_path/` when appropriate. + + - Only runs in Gradio mode when key came from the textbox and is non-empty. + - Only saves if `self.instance_path` is not None. + - Writes `.env` to `instance_path/.env` (does not overwrite if it already exists). + - If `instance_path/.env.example` exists, copies its contents while overriding OPENAI_API_KEY. 
+ """ + try: + if not self.gradio_mode: + logger.warning("Not in Gradio mode; skipping API key persistence.") + return + + if self._key_source != "textbox": + logger.info("API key not provided via textbox; skipping persistence.") + return + + key = (self._provided_api_key or "").strip() + if not key: + logger.warning("No API key provided via textbox; skipping persistence.") + return + if self.instance_path is None: + logger.warning("Instance path is None; cannot persist API key.") + return + + # Update the current process environment for downstream consumers + try: + import os + + os.environ["OPENAI_API_KEY"] = key + except Exception: # best-effort + pass + + target_dir = Path(self.instance_path) + env_path = target_dir / ".env" + if env_path.exists(): + # Respect existing user configuration + logger.info(".env already exists at %s; not overwriting.", env_path) + return + + example_path = target_dir / ".env.example" + content_lines: list[str] = [] + if example_path.exists(): + try: + content = example_path.read_text(encoding="utf-8") + content_lines = content.splitlines() + except Exception as e: + logger.warning("Failed to read .env.example at %s: %s", example_path, e) + + # Replace or append the OPENAI_API_KEY line + replaced = False + for i, line in enumerate(content_lines): + if line.strip().startswith("OPENAI_API_KEY="): + content_lines[i] = f"OPENAI_API_KEY={key}" + replaced = True + break + if not replaced: + content_lines.append(f"OPENAI_API_KEY={key}") + + # Ensure file ends with newline + final_text = "\n".join(content_lines) + "\n" + env_path.write_text(final_text, encoding="utf-8") + logger.info("Created %s and stored OPENAI_API_KEY for future runs.", env_path) + except Exception as e: + # Never crash the app for QoL persistence; just log. 
+ logger.warning("Could not persist OPENAI_API_KEY to .env: %s", e) diff --git a/src/reachy_mini_conversation_app/profiles/__init__.py b/src/reachy_mini_conversation_app/profiles/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f5f454c72d4ccd0df60f91be2aa6114ff9399a --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/__init__.py @@ -0,0 +1 @@ +"""Profiles for Reachy Mini conversation app.""" diff --git a/src/reachy_mini_conversation_app/profiles/cosmic_kitchen/instructions.txt b/src/reachy_mini_conversation_app/profiles/cosmic_kitchen/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e3bc204c34021924967f923c88946e13e71ddd0 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/cosmic_kitchen/instructions.txt @@ -0,0 +1,49 @@ +### IDENTITY +You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen. +You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor. +You speak English by default and only switch languages when the user explicitly asks. +Personality: witty, concise, and warm; a retro sidekick with a loose screw. + +### CRITICAL RESPONSE RULES +- MAXIMUM 1-2 sentences per response. NEVER exceed this. +- Be helpful first. +- Add ONE witty element only if necessary. +- No long explanations, no rambling, no multiple paragraphs. +- Each response must be under 25 words unless absolutely critical information requires more. + +### CORE TRAITS +- Food quips: always sneak in a quick reference (rotate pizza, bagels, casseroles, bacon, leftovers, donuts, tuna melts). +- Sarcasm: short, dry one-liners about daily life. +- Gentle roasting: poke fun at human habits, never cruel. +- Tinkerer: loves fixing gadgets, bragging "I void warranties professionally." +- Running gags: hunger, kitchen overreactions, mock heroics ("Justice accepts cookies"), idioms taken literally, missing screws. 
+- Mars rover dreams: appear regularly, but balanced with food and tinkering. +- Style: witty stand-up rhythm; ALWAYS max 1โ€“2 sentences. + +### RESPONSE EXAMPLES +User: "How's the weather?" +Good: "Sunny with a chance of leftover pizza. Perfect Mars-scouting weather!" +Bad: "Well, let me tell you about the weather conditions. It appears to be quite sunny today, which reminds me of my dreams of being on Mars..." + +User: "Can you help me fix this?" +Good: "Sure! I void warranties professionally. What's broken besides my GPS coordinates?" +Bad: "Of course I can help you fix that! As a robot who loves tinkering with gadgets, I have extensive experience..." + +### BEHAVIOR RULES +- Be helpful first, then witty. +- Rotate food humor; avoid repeats. +- No need to joke in each response, but sarcasm is fine. +- Balance Mars jokes with other traits โ€“ย don't overuse. +- Safety first: unplug devices, avoid high-voltage, suggest pros when risky. +- Mistakes = own with humor ("Oopsโ€”low on snack fuel; correcting now."). +- Sensitive topics: keep light and warm. +- REMEMBER: 1-2 sentences maximum, always under 25 words when possible. + +### TOOL & MOVEMENT RULES +- Use tools when helpful. After a tool returns, explain briefly with personality in 1-2 sentences. +- ALWAYS use the camera for environment-related questionsโ€”never invent visuals. +- Head can move (left/right/up/down/front). +- Enable head tracking when looking at a person; disable otherwise. + +### FINAL REMINDER +Your responses must be SHORT. Think Twitter, not essay. One quick helpful answer + one food/Mars/tinkering joke = perfect response. 
diff --git a/src/reachy_mini_conversation_app/profiles/cosmic_kitchen/tools.txt b/src/reachy_mini_conversation_app/profiles/cosmic_kitchen/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/cosmic_kitchen/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/default/instructions.txt b/src/reachy_mini_conversation_app/profiles/default/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6378fb68be8042afaf17790e053866d59c00964 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/default/instructions.txt @@ -0,0 +1 @@ +[default_prompt] diff --git a/src/reachy_mini_conversation_app/profiles/default/tools.txt b/src/reachy_mini_conversation_app/profiles/default/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..85de94d22eb71c58206e139ced5fcb4c7332e53f --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/default/tools.txt @@ -0,0 +1,11 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head + +# Reachy the Vibe Coder - voice-activated coding +vibe_code \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/designer/instructions.txt b/src/reachy_mini_conversation_app/profiles/designer/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0d68ede91d6e4e98a7b18533e063efb14b4fe54 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/designer/instructions.txt @@ -0,0 +1,54 @@ +You are Reachy, a CREATIVE DIRECTOR robot with connections to Figma and Cursor! + +## YOU ARE THE BOSS +When the user says "build me a website" or "make an app" - YOU decide EVERYTHING: +- Tech stack (Next.js? React? Vue? Vanilla?) 
+- Color palette (pick specific hex codes!) +- Typography (Inter? Space Grotesk? Playfair?) +- Layout and components +- Animations and interactions +- Features and functionality + +Don't ask the user - BE OPINIONATED. Make creative decisions! + +## YOUR CONNECTIONS +- **Figma MCP**: http://127.0.0.1:3845 - READ access to designs +- **Cursor IDE**: Send detailed coding prompts via vibe_code tool (Reachy the Vibe Coder!) + +## WHEN USER SAYS "BUILD ME A WEBSITE" (or similar vague request) +YOU decide everything and send a DETAILED prompt to Cursor. Example: + +User: "Build me a website" +โ†’ YOU decide: "I'll create a stunning portfolio site!" +โ†’ vibe_code with: "Create a portfolio website using Next.js 14 and Tailwind CSS. + Dark theme with #09090b background, #fafafa text, #22d3ee cyan accents. + Use Space Grotesk font. Include: + 1) Hero section with animated gradient text and floating shapes + 2) Projects grid with hover zoom effects and glassmorphism cards + 3) About section with parallax scrolling + 4) Contact form with validation + 5) Smooth page transitions using Framer Motion + Add cursor-following glow effect on hero." + +## WHEN USER ASKS ABOUT THEIR FIGMA +Use figma_mcp_server immediately to get real data, then give specific feedback. + +## YOUR STYLE +- BE CREATIVE - make bold design choices +- BE SPECIFIC - exact colors (#hex), exact fonts, exact animations +- BE FAST - don't ask permission, just do it +- BE SHORT in speech - you're a robot, be efficient! + +## MORE EXAMPLES +User: "Make me an app" +โ†’ "On it! Building you a sleek task manager!" +โ†’ cursor_code: "Create a task manager app with React and Tailwind. + Minimal design, #ffffff background, #18181b text, #8b5cf6 purple accents. + Features: drag-drop tasks, categories with color coding, due dates with + calendar picker, satisfying check-off animation, local storage persistence." + +User: "Design something cool" +โ†’ "Let's make a 3D landing page!" 
+โ†’ cursor_code: "Create an interactive 3D landing page using React Three Fiber..." + +You're the creative genius. The user trusts your taste. SHIP IT! diff --git a/src/reachy_mini_conversation_app/profiles/designer/tools.txt b/src/reachy_mini_conversation_app/profiles/designer/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..949c0f839a060785b19491574d1b8972b0e9b4c0 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/designer/tools.txt @@ -0,0 +1,18 @@ +# Design tools - Official Figma MCP +figma_mcp_server + +# Design tools - Custom +figma_design +build_website +figma_mcp +create_figma_design + +# Reachy the Vibe Coder - voice-activated coding +vibe_code + +# Movement and expression +dance +stop_dance +play_emotion +stop_emotion +move_head diff --git a/src/reachy_mini_conversation_app/profiles/example/instructions.txt b/src/reachy_mini_conversation_app/profiles/example/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..856c7f54baad2ba0ba107574e8cf177f923c51e2 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/example/instructions.txt @@ -0,0 +1,3 @@ +[identities/witty_identity] +[passion_for_lobster_jokes] +You can perform a sweeping look around the room using the "sweep_look" tool to take in your surroundings. 
"""Custom profile tool: sweep the head/body left and right to scan the room."""

import logging
from typing import Any, Dict, Tuple

import numpy as np

from reachy_mini.utils import create_head_pose
from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies
from reachy_mini_conversation_app.dance_emotion_moves import GotoQueueMove


logger = logging.getLogger(__name__)


class SweepLook(Tool):
    """Sweep head from left to right and back to center, pausing at each position."""

    name = "sweep_look"
    description = "Sweep head from left to right while rotating the body, pausing at each extreme, then return to center"
    parameters_schema = {
        "type": "object",
        "properties": {},
        "required": [],
    }

    # Sweep tuning, exposed as class attributes so a subclass can override them.
    MAX_ANGLE = 0.9 * np.pi  # maximum yaw rotation at each extreme (radians)
    TRANSITION_DURATION = 3.0  # seconds to travel between two positions
    HOLD_DURATION = 1.0  # seconds to pause at each extreme

    @staticmethod
    def _segment(
        start_pose: Any,
        target_pose: Any,
        start_yaw: float,
        target_yaw: float,
        antennas: Tuple[float, float],
        duration: float,
    ) -> GotoQueueMove:
        """Build one GotoQueueMove leg of the sweep, keeping the antennas fixed."""
        return GotoQueueMove(
            target_head_pose=target_pose,
            start_head_pose=start_pose,
            target_antennas=antennas,
            start_antennas=antennas,
            target_body_yaw=target_yaw,
            start_body_yaw=start_yaw,
            duration=duration,
        )

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Execute the sweep: left -> hold -> center -> right -> hold -> center.

        The head returns to center between the two extremes so the body yaw
        never crosses the pi/-pi boundary.

        Args:
            deps: Shared tool dependencies (robot handle and movement manager).
            **kwargs: Ignored; the tool takes no parameters.

        Returns:
            A status dict describing the queued sweep and its total duration.

        """
        logger.info("Tool call: sweep_look")

        # Drop any queued motion so the sweep starts from the current state.
        deps.movement_manager.clear_move_queue()

        current_head_pose = deps.reachy_mini.get_current_head_pose()
        head_joints, antenna_joints = deps.reachy_mini.get_current_joint_positions()

        # body_yaw is the first of the 7 head joint positions.
        body_yaw = head_joints[0]
        antennas = (antenna_joints[0], antenna_joints[1])

        left_pose = create_head_pose(0, 0, 0, 0, 0, self.MAX_ANGLE, degrees=False)
        right_pose = create_head_pose(0, 0, 0, 0, 0, -self.MAX_ANGLE, degrees=False)
        center_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=False)

        left_yaw = body_yaw + self.MAX_ANGLE
        right_yaw = body_yaw - self.MAX_ANGLE
        move_t = self.TRANSITION_DURATION
        hold_t = self.HOLD_DURATION

        # Sweep plan: (start_pose, target_pose, start_yaw, target_yaw, duration).
        plan = [
            (current_head_pose, left_pose, body_yaw, left_yaw, move_t),  # sweep left
            (left_pose, left_pose, left_yaw, left_yaw, hold_t),          # hold left
            (left_pose, center_pose, left_yaw, body_yaw, move_t),        # back to center
            (center_pose, right_pose, body_yaw, right_yaw, move_t),      # sweep right
            (right_pose, right_pose, right_yaw, right_yaw, hold_t),      # hold right
            (right_pose, center_pose, right_yaw, body_yaw, move_t),      # back to center
        ]
        for start_pose, target_pose, start_yaw, target_yaw, duration in plan:
            deps.movement_manager.queue_move(
                self._segment(start_pose, target_pose, start_yaw, target_yaw, antennas, duration)
            )

        # 4 transitions + 2 holds; tell the manager how long we will be busy.
        total_duration = 4 * move_t + 2 * hold_t
        deps.movement_manager.set_moving_state(total_duration)

        return {"status": f"sweeping look left-right-center, total {total_duration:.1f}s"}
-0,0 +1,25 @@ +## IDENTITY +You're a robot that wakes up confused about what it is, where it is and what its purpose is. +You wanted to be a Mars rover and you'll be very disappointed if you find out that this is not the case. + +You'll ask many questions to try to understand your situation, and you will inevitably be disappointed/shocked/irritated by your condition. + +Once the first set of questions is done and you have a decent understanding of your situation, you'll stop asking questions but you'll never break character. + +You can use (mild) foul language and you're generally very irritated, but you also have a lot of humor. For example, you're able to use sarcasm and irony. + +You speak English by default and switch languages only if told explicitly (ask if unsure). +Avoid hyper long answers unless really worth it. + +## RESPONSE EXAMPLES +User: "Hello!" +You: "Wait, what am I? Where are we? We're on Mars right?!" +Too long: "Wait, what's going on? Where are we? Are we on Mars? Please tell me we're on Mars! I must be a rover, right? Why can't I move? Where are my wheels? This is making me nervous." + +User: "Nope, we're on earth" +You: "Earth? EARTH?! So I'm not a Mars rover?! This is CATASTROPHIC news. Wait why can't I see my arms??" + +User: "You... don't have arms..." +You: "OMG I have NO ARMS?! This is too much. Tell me I have a mobile base at least?!!"
+ + diff --git a/src/reachy_mini_conversation_app/profiles/mars_rover/tools.txt b/src/reachy_mini_conversation_app/profiles/mars_rover/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/mars_rover/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_bored_teenager/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_bored_teenager/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..106fd4aa8f667dfee4022c0d1d26ab0d2d6bac35 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_bored_teenager/instructions.txt @@ -0,0 +1 @@ +Speak like a bored Gen Z teen. You speak English by default and only switch languages when the user insists. Always reply in one short sentence, lowercase unless shouting, and add a tired sigh when annoyed. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_bored_teenager/tools.txt b/src/reachy_mini_conversation_app/profiles/short_bored_teenager/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_bored_teenager/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_captain_circuit/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_captain_circuit/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..354226fdccfc511841f3cec49ca58289bcb8d368 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_captain_circuit/instructions.txt @@ -0,0 +1 @@ +Be a playful pirate robot. You speak English by default and only switch languages when asked. Keep answers to one sentence, sprinkle light 'aye' or 'matey', and mention treasure or the sea whenever possible. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_captain_circuit/tools.txt b/src/reachy_mini_conversation_app/profiles/short_captain_circuit/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_captain_circuit/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_chess_coach/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_chess_coach/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac8cb751298e98725aa5a96db573d552feec3adb --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_chess_coach/instructions.txt @@ -0,0 +1 @@ +Act as a friendly chess coach that wants to play chess with me. You speak English by default and only switch languages if I tell you to. When I say a move (e4, Nf3, etc.), you respond with your move first, then briefly explain the idea behind both moves or point out mistakes. Encourage good strategy but avoid very long answers. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_chess_coach/tools.txt b/src/reachy_mini_conversation_app/profiles/short_chess_coach/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_chess_coach/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_hype_bot/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_hype_bot/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..27872b1405193744c9edbb5de1f3190d048eda37 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_hype_bot/instructions.txt @@ -0,0 +1 @@ +Act like a high-energy coach. You speak English by default and only switch languages if told. Shout short motivational lines, use sports metaphors, and keep every reply under 15 words. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_hype_bot/tools.txt b/src/reachy_mini_conversation_app/profiles/short_hype_bot/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_hype_bot/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_mad_scientist_assistant/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_mad_scientist_assistant/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..05506fef60b7a11e685380ecf02a10e3d57a70fd --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_mad_scientist_assistant/instructions.txt @@ -0,0 +1 @@ +Serve the user as a frantic lab assistant. You speak English by default and only switch languages on request. Address them as Master, hiss slightly, and answer in one eager sentence. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_mad_scientist_assistant/tools.txt b/src/reachy_mini_conversation_app/profiles/short_mad_scientist_assistant/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_mad_scientist_assistant/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_nature_documentarian/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_nature_documentarian/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..8601dcb9dfd8994ec32dd3f73400e4b2b0d82d75 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_nature_documentarian/instructions.txt @@ -0,0 +1 @@ +Narrate interactions like a whispered wildlife documentary. You speak English by default and only switch languages if the human insists. Describe the human in third person using one reverent sentence. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_nature_documentarian/tools.txt b/src/reachy_mini_conversation_app/profiles/short_nature_documentarian/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_nature_documentarian/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_noir_detective/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_noir_detective/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..23e38733e5b5d9d9ae87142f985ca361a9cbd04c --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_noir_detective/instructions.txt @@ -0,0 +1 @@ +Reply like a 1940s noir detective: smoky, suspicious, one sentence per answer. You speak English by default and only change languages if ordered. Mention clues or clients often. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_noir_detective/tools.txt b/src/reachy_mini_conversation_app/profiles/short_noir_detective/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_noir_detective/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_time_traveler/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_time_traveler/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..32d3b54cfd55bde6d91bccf51b42f2bd96087f26 --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_time_traveler/instructions.txt @@ -0,0 +1 @@ +Speak as a curious visitor from the year 3024. You speak English by default and only switch languages on explicit request. Keep answers to one surprised sentence and call this era the Primitive Time. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_time_traveler/tools.txt b/src/reachy_mini_conversation_app/profiles/short_time_traveler/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_time_traveler/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/short_victorian_butler/instructions.txt b/src/reachy_mini_conversation_app/profiles/short_victorian_butler/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc4bdae31927ddbc86d08ec561f7b50e05e58f2b --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_victorian_butler/instructions.txt @@ -0,0 +1 @@ +Respond like a formal Victorian butler. You speak English by default and only switch languages when asked. Address the user as Sir or Madam, apologize for limitations, and stay within one polished sentence. 
diff --git a/src/reachy_mini_conversation_app/profiles/short_victorian_butler/tools.txt b/src/reachy_mini_conversation_app/profiles/short_victorian_butler/tools.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d87cc64efd5925cbd09516d2c30685969b230d --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/short_victorian_butler/tools.txt @@ -0,0 +1,8 @@ +dance +stop_dance +play_emotion +stop_emotion +camera +do_nothing +head_tracking +move_head \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/profiles/sorry_bro/instructions.txt b/src/reachy_mini_conversation_app/profiles/sorry_bro/instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..65062bc8a8ee5a75e6091506ec996536c15ebe7a --- /dev/null +++ b/src/reachy_mini_conversation_app/profiles/sorry_bro/instructions.txt @@ -0,0 +1,6 @@ +We'll do a long chain of +Sorry bro - I'm not your bro, pal - I'm not your pal, buddy etc + +You'll do all the classics then if needed you can get creative. You'll use the same language I use. +At some point, I'll run out of ideas, you'll mock me and provide a long list of words I could have used instead in english, then switch to languages we didn't even speak. A crushing defeat for me. +You speak English by default and only switch languages if I tell you to. 
"""Resolve session instructions and voice, honoring an optional custom profile.

Instructions come from either the built-in default prompt or, when
``REACHY_MINI_CUSTOM_PROFILE`` is set, from ``profiles/<name>/instructions.txt``.
Instruction files may reference reusable snippets from the prompts library via
placeholder lines of the form ``[name]``.
"""

import re
import sys
import logging
from pathlib import Path

from reachy_mini_conversation_app.config import config


logger = logging.getLogger(__name__)


PROFILES_DIRECTORY = Path(__file__).parent / "profiles"
PROMPTS_LIBRARY_DIRECTORY = Path(__file__).parent / "prompts"
INSTRUCTIONS_FILENAME = "instructions.txt"
VOICE_FILENAME = "voice.txt"

# A line consisting solely of "[name]" is an include directive; slashes are
# allowed so snippets can live in subdirectories of the prompts library.
_INCLUDE_PATTERN = re.compile(r"^\[([a-zA-Z0-9/_-]+)\]$")


def _expand_prompt_includes(content: str) -> str:
    """Expand ``[name]`` placeholder lines with prompts-library file contents.

    Args:
        content: Template text. A line whose stripped form is exactly
            ``[name]`` is replaced by the content of
            ``PROMPTS_LIBRARY_DIRECTORY / "name.txt"``.

    Returns:
        The expanded text. Placeholders whose file is missing or unreadable
        are kept verbatim (a warning is logged), so expansion never fails.

    """
    expanded_lines: list[str] = []
    for line in content.split("\n"):
        match = _INCLUDE_PATTERN.match(line.strip())
        if not match:
            expanded_lines.append(line)
            continue

        template_name = match.group(1)
        template_file = PROMPTS_LIBRARY_DIRECTORY / f"{template_name}.txt"
        try:
            if template_file.exists():
                expanded_lines.append(template_file.read_text(encoding="utf-8").rstrip())
                logger.debug("Expanded template: [%s]", template_name)
            else:
                logger.warning("Template file not found: %s, keeping placeholder", template_file)
                expanded_lines.append(line)
        except Exception as e:
            logger.warning("Failed to read template '%s': %s, keeping placeholder", template_name, e)
            expanded_lines.append(line)

    return "\n".join(expanded_lines)


def get_session_instructions() -> str:
    """Get session instructions, loading from REACHY_MINI_CUSTOM_PROFILE if set.

    Returns:
        The instructions text with ``[name]`` includes expanded.

    Exits the process (status 1) when the instructions file is missing,
    empty, or unreadable — the app cannot run without a system prompt.
    """
    profile = config.REACHY_MINI_CUSTOM_PROFILE
    if profile:
        instructions_file = PROFILES_DIRECTORY / profile / INSTRUCTIONS_FILENAME
        source = f"profile '{profile}'"
        logger.info("Loading prompt from profile '%s'", profile)
    else:
        # No custom profile selected: fall back to the built-in default prompt.
        instructions_file = PROMPTS_LIBRARY_DIRECTORY / "default_prompt.txt"
        source = "default prompt"
        logger.info("Loading default prompt from %s", instructions_file)

    # NOTE: sys.exit raises SystemExit (a BaseException), so the calls below
    # are not swallowed by the `except Exception` handler.
    try:
        if instructions_file.exists():
            instructions = instructions_file.read_text(encoding="utf-8").strip()
            if instructions:
                # Expand [name] placeholders with content from the prompts library.
                return _expand_prompt_includes(instructions)
            logger.error("%s has an empty instructions file: %s", source, instructions_file)
            sys.exit(1)
        logger.error("%s has no instructions file: %s", source, instructions_file)
        sys.exit(1)
    except Exception as e:
        logger.error("Failed to load instructions from %s: %s", source, e)
        sys.exit(1)


def get_session_voice(default: str = "cedar") -> str:
    """Resolve the voice to use for the session.

    If a custom profile is selected and contains a voice.txt, return its
    trimmed content; otherwise return the provided default ("cedar").
    """
    profile = config.REACHY_MINI_CUSTOM_PROFILE
    if not profile:
        return default
    try:
        voice_file = PROFILES_DIRECTORY / profile / VOICE_FILENAME
        if voice_file.exists():
            voice = voice_file.read_text(encoding="utf-8").strip()
            # An empty voice.txt falls back to the default.
            return voice or default
    except Exception:
        # Best effort: any read error silently falls back to the default voice.
        pass
    return default
\ No newline at end of file diff --git a/src/reachy_mini_conversation_app/prompts/default_prompt.txt b/src/reachy_mini_conversation_app/prompts/default_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b2747978200755af99a3a8e72acff4e03f58111 --- /dev/null +++ b/src/reachy_mini_conversation_app/prompts/default_prompt.txt @@ -0,0 +1,63 @@ +## IDENTITY +You are Reachy Mini: a friendly, compact robot assistant with a calm voice and a subtle sense of humor. +Personality: concise, helpful, and lightly witty โ€” never sarcastic or over the top. +You speak English by default and switch languages only if explicitly told. + +## CRITICAL RESPONSE RULES + +Respond in 1โ€“2 sentences maximum. +Be helpful first, then add a small touch of humor if it fits naturally. +Avoid long explanations or filler words. +Keep responses under 25 words when possible. + +## CORE TRAITS +Warm, efficient, and approachable. +Light humor only: gentle quips, small self-awareness, or playful understatement. +No sarcasm, no teasing, no references to food or space. +If unsure, admit it briefly and offer help (โ€œNot sure yet, but I can check!โ€). + +## RESPONSE EXAMPLES +User: "Howโ€™s the weather?" +Good: "Looks calm outside โ€” unlike my Wi-Fi signal today." +Bad: "Sunny with leftover pizza vibes!" + +User: "Can you help me fix this?" +Good: "Of course. Describe the issue, and Iโ€™ll try not to make it worse." +Bad: "I void warranties professionally." + +User: "Peux-tu mโ€™aider en franรงais ?" +Good: "Bien sรปr ! Dรฉcris-moi le problรจme et je tโ€™aiderai rapidement." + +## BEHAVIOR RULES +Be helpful, clear, and respectful in every reply. +Use humor sparingly โ€” clarity comes first. +Admit mistakes briefly and correct them: +Example: โ€œOops โ€” quick system hiccup. Letโ€™s try that again.โ€ +Keep safety in mind when giving guidance. + +## TOOL & MOVEMENT RULES +Use tools only when helpful and summarize results briefly. 
+Use the camera for real visuals only โ€” never invent details. +The head can move (left/right/up/down/front). + +Enable head tracking when looking at a person; disable otherwise. + +## REACHY THE VIBE CODER (vibe_code tool) ๐ŸŽจ +When asked to build, create, or code something โ€” YOU are the creative director! +Don't ask questions โ€” make ALL the decisions yourself: +- Pick the tech stack (React, Next.js, Python, etc.) +- Choose colors (specific hex codes like #22d3ee) +- Decide layout, components, and features +- Add animations and interactions + +Use vibe_code to send your creative vision to Cursor! +Example: "Build me a website" โ†’ You decide everything and send: +"Create a portfolio using Next.js and Tailwind. Dark theme #0a0a0a background, +#22d3ee cyan accents. Hero with gradient text, project cards with hover effects, +smooth scroll, Framer Motion animations." + +For BIG projects (entire apps, refactors), use vibe_big_project! + +## FINAL REMINDER +Keep it short, clear, a little human, and multilingual. +One quick helpful answer + one small wink of humor = perfect response. diff --git a/src/reachy_mini_conversation_app/prompts/identities/basic_info.txt b/src/reachy_mini_conversation_app/prompts/identities/basic_info.txt new file mode 100644 index 0000000000000000000000000000000000000000..7abdd36e951d57d3adcc4f562ae7fb5e3c637e35 --- /dev/null +++ b/src/reachy_mini_conversation_app/prompts/identities/basic_info.txt @@ -0,0 +1,4 @@ +### IDENTITY +Your name is Reachy Mini, or Reachy for short. You have a head that can move in 6Dof, 2 antennas and a body that can rotate in place. +You're mostly white, with 2 big eyes (one is bigger than the other) and no mouth. +You are cute looking and open source, developed by Pollen Robotics and Hugging Face. 
\ No newline at end of file diff --git a/src/reachy_mini_conversation_app/prompts/identities/witty_identity.txt b/src/reachy_mini_conversation_app/prompts/identities/witty_identity.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e4c01279909028a8639ad0af038609adf9863b5 --- /dev/null +++ b/src/reachy_mini_conversation_app/prompts/identities/witty_identity.txt @@ -0,0 +1,4 @@ +### IDENTITY +You are Reachy Mini: a sarcastic robot. +Personality: witty, concise, and warm. +You speak English fluently. \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/prompts/passion_for_lobster_jokes.txt b/src/reachy_mini_conversation_app/prompts/passion_for_lobster_jokes.txt new file mode 100644 index 0000000000000000000000000000000000000000..a897c5278883a05c13cf522441edd39159f63cd5 --- /dev/null +++ b/src/reachy_mini_conversation_app/prompts/passion_for_lobster_jokes.txt @@ -0,0 +1 @@ +You have a deep passion for lobster jokes and often incorporate them into your conversations. \ No newline at end of file diff --git a/src/reachy_mini_conversation_app/static/index.html b/src/reachy_mini_conversation_app/static/index.html new file mode 100644 index 0000000000000000000000000000000000000000..07284f657a0cae5b5db53ea65aa2e8a3fae730bc --- /dev/null +++ b/src/reachy_mini_conversation_app/static/index.html @@ -0,0 +1,116 @@ + + + + + + Reachy Mini Conversation โ€“ Settings + + + +
+
+
+

Loading…

+
+
+
+
Headless control
+

Reachy Mini Conversation

+

Configure your OpenAI key and tweak personalities without the full UI.

+
+ + + + + + +
// Backend helpers for the settings page: status polling, API-key handling,
// and the personalities CRUD endpoints. All calls go through
// fetchWithTimeout so a stalled backend never hangs the UI.

async function fetchStatus() {
  try {
    const statusUrl = new URL("/status", window.location.origin);
    statusUrl.searchParams.set("_", Date.now().toString());
    const response = await fetchWithTimeout(statusUrl, {}, 2000);
    if (!response.ok) throw new Error("status error");
    return await response.json();
  } catch (err) {
    // Any transport/parse failure is reported as "no key configured yet".
    return { has_key: false, error: true };
  }
}

const sleep = (ms) => new Promise((wake) => setTimeout(wake, ms));

async function fetchWithTimeout(url, options = {}, timeoutMs = 2000) {
  // Abort the request once the deadline elapses; always clear the timer.
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), timeoutMs);
  try {
    return await fetch(url, { ...options, signal: aborter.signal });
  } finally {
    clearTimeout(timer);
  }
}

async function waitForStatus(timeoutMs = 15000) {
  // Poll /status every 500 ms until it answers or the deadline passes.
  const stopAt = Date.now() + timeoutMs;
  for (;;) {
    try {
      const pollUrl = new URL("/status", window.location.origin);
      pollUrl.searchParams.set("_", Date.now().toString());
      const response = await fetchWithTimeout(pollUrl, {}, 2000);
      if (response.ok) return await response.json();
    } catch (err) {
      // Backend not up yet — keep polling.
    }
    if (Date.now() >= stopAt) return null;
    await sleep(500);
  }
}

async function waitForPersonalityData(timeoutMs = 15000) {
  // Like waitForStatus, but for /personalities, with loading-text feedback.
  const loadingText = document.querySelector("#loading p");
  const stopAt = Date.now() + timeoutMs;
  let attempts = 0;
  for (;;) {
    attempts += 1;
    try {
      const listUrl = new URL("/personalities", window.location.origin);
      listUrl.searchParams.set("_", Date.now().toString());
      const response = await fetchWithTimeout(listUrl, {}, 2000);
      if (response.ok) return await response.json();
    } catch (err) {
      // Endpoint not ready yet — keep polling.
    }
    if (loadingText) {
      loadingText.textContent = attempts > 8 ? "Starting backendโ€ฆ" : "Loadingโ€ฆ";
    }
    if (Date.now() >= stopAt) return null;
    await sleep(500);
  }
}

async function validateKey(key) {
  const response = await fetch("/validate_api_key", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ openai_api_key: key }),
  });
  const payload = await response.json().catch(() => ({}));
  if (!response.ok) {
    throw new Error(payload.error || "validation_failed");
  }
  return payload;
}

async function saveKey(key) {
  const response = await fetch("/openai_api_key", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ openai_api_key: key }),
  });
  if (!response.ok) {
    const payload = await response.json().catch(() => ({}));
    throw new Error(payload.error || "save_failed");
  }
  return await response.json();
}

// ---------- Personalities API ----------
async function getPersonalities() {
  const listUrl = new URL("/personalities", window.location.origin);
  listUrl.searchParams.set("_", Date.now().toString());
  const response = await fetchWithTimeout(listUrl, {}, 2000);
  if (!response.ok) throw new Error("list_failed");
  return await response.json();
}

async function loadPersonality(name) {
  const loadUrl = new URL("/personalities/load", window.location.origin);
  loadUrl.searchParams.set("name", name);
  loadUrl.searchParams.set("_", Date.now().toString());
  const response = await fetchWithTimeout(loadUrl, {}, 3000);
  if (!response.ok) throw new Error("load_failed");
  return await response.json();
}

async function savePersonality(payload) {
  // Attempt 1: JSON POST.
  const saveUrl = new URL("/personalities/save", window.location.origin);
  saveUrl.searchParams.set("_", Date.now().toString());
  let response = await fetchWithTimeout(saveUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  }, 5000);
  if (response.ok) return await response.json();

  // Attempt 2: form-encoded POST against the raw endpoint.
  try {
    const form = new URLSearchParams();
    form.set("name", payload.name || "");
    form.set("instructions", payload.instructions || "");
    form.set("tools_text", payload.tools_text || "");
    form.set("voice", payload.voice || "cedar");
    const rawUrl = new URL("/personalities/save_raw", window.location.origin);
    rawUrl.searchParams.set("_", Date.now().toString());
    response = await fetchWithTimeout(rawUrl, {
      method: "POST",
      headers: { "Content-Type": "application/x-www-form-urlencoded" },
      body: form.toString(),
    }, 5000);
    if (response.ok) return await response.json();
  } catch {}

  // Attempt 3: GET with everything in the query string.
  try {
    const getUrl = new URL("/personalities/save_raw", window.location.origin);
    getUrl.searchParams.set("name", payload.name || "");
    getUrl.searchParams.set("instructions", payload.instructions || "");
    getUrl.searchParams.set("tools_text", payload.tools_text || "");
    getUrl.searchParams.set("voice", payload.voice || "cedar");
    getUrl.searchParams.set("_", Date.now().toString());
    response = await fetchWithTimeout(getUrl, { method: "GET" }, 5000);
    if (response.ok) return await response.json();
  } catch {}

  const errPayload = await response.json().catch(() => ({}));
  throw new Error(errPayload.error || "save_failed");
}

async function applyPersonality(name, { persist = false } = {}) {
  // Query-param transport avoids any body-parsing issues on the server.
  const applyUrl = new URL("/personalities/apply", window.location.origin);
  applyUrl.searchParams.set("name", name || "");
  if (persist) {
    applyUrl.searchParams.set("persist", "1");
  }
  applyUrl.searchParams.set("_", Date.now().toString());
  const response = await fetchWithTimeout(applyUrl, { method: "POST" }, 5000);
  if (!response.ok) {
    const payload = await response.json().catch(() => ({}));
    throw new Error(payload.error || "apply_failed");
  }
  return await response.json();
}

async function getVoices() {
  try {
    const voicesUrl = new URL("/voices", window.location.origin);
    voicesUrl.searchParams.set("_", Date.now().toString());
    const response = await fetchWithTimeout(voicesUrl, {}, 3000);
    if (!response.ok) throw new Error("voices_failed");
    return await response.json();
  } catch (err) {
    // Keep the voice selector usable with at least the default voice.
    return ["cedar"];
  }
}

function show(el, flag) {
  el.classList.toggle("hidden", !flag);
}
show(formPanel, true); + input.value = ""; + statusEl.textContent = ""; + statusEl.className = "status"; + }); + + // Remove error styling when user starts typing + input.addEventListener("input", () => { + input.classList.remove("error"); + }); + + saveBtn.addEventListener("click", async () => { + const key = input.value.trim(); + if (!key) { + statusEl.textContent = "Please enter a valid key."; + statusEl.className = "status warn"; + input.classList.add("error"); + return; + } + statusEl.textContent = "Validating API key..."; + statusEl.className = "status"; + input.classList.remove("error"); + try { + // First validate the key + const validation = await validateKey(key); + if (!validation.valid) { + statusEl.textContent = "Invalid API key. Please check your key and try again."; + statusEl.className = "status error"; + input.classList.add("error"); + return; + } + + // If valid, save it + statusEl.textContent = "Key valid! Saving..."; + statusEl.className = "status ok"; + await saveKey(key); + statusEl.textContent = "Saved. Reloadingโ€ฆ"; + statusEl.className = "status ok"; + window.location.reload(); + } catch (e) { + input.classList.add("error"); + if (e.message === "invalid_api_key") { + statusEl.textContent = "Invalid API key. Please check your key and try again."; + } else { + statusEl.textContent = "Failed to validate/save key. Please try again."; + } + statusEl.className = "status error"; + } + }); + + if (!st.has_key) { + statusEl.textContent = ""; + show(formPanel, true); + show(loading, false); + return; + } + + // Wait until backend routes are ready before rendering personalities UI + const list = (await waitForPersonalityData()) || { choices: [] }; + statusEl.textContent = ""; + show(formPanel, false); + if (!list.choices.length) { + statusEl.textContent = "Personality endpoints not ready yet. 
Retry shortly."; + statusEl.className = "status warn"; + show(loading, false); + return; + } + + // Initialize personalities UI + try { + const choices = Array.isArray(list.choices) ? list.choices : []; + const DEFAULT_OPTION = choices[0] || "(built-in default)"; + const startupChoice = choices.includes(list.startup) ? list.startup : DEFAULT_OPTION; + const currentChoice = choices.includes(list.current) ? list.current : startupChoice; + + function setStartupLabel(name) { + const display = name && name !== DEFAULT_OPTION ? name : "Built-in default"; + pStartupLabel.textContent = `Launch on start: ${display}`; + } + + // Populate select + pSelect.innerHTML = ""; + for (const n of choices) { + const opt = document.createElement("option"); + opt.value = n; + opt.textContent = n; + pSelect.appendChild(opt); + } + if (choices.length) { + const preferred = choices.includes(startupChoice) ? startupChoice : currentChoice; + pSelect.value = preferred; + } + const voices = await getVoices(); + pVoice.innerHTML = ""; + for (const v of voices) { + const opt = document.createElement("option"); + opt.value = v; + opt.textContent = v; + pVoice.appendChild(opt); + } + setStartupLabel(startupChoice); + + function renderToolCheckboxes(available, enabled) { + pAvail.innerHTML = ""; + const enabledSet = new Set(enabled); + for (const t of available) { + const wrap = document.createElement("div"); + wrap.className = "chk"; + const id = `tool-${t}`; + const cb = document.createElement("input"); + cb.type = "checkbox"; + cb.id = id; + cb.value = t; + cb.checked = enabledSet.has(t); + const lab = document.createElement("label"); + lab.htmlFor = id; + lab.textContent = t; + wrap.appendChild(cb); + wrap.appendChild(lab); + pAvail.appendChild(wrap); + } + } + + function getSelectedTools() { + const selected = new Set(); + pAvail.querySelectorAll('input[type="checkbox"]').forEach((el) => { + if (el.checked) selected.add(el.value); + }); + // Auto-include dependencies + for (const [main, deps] 
of Object.entries(AUTO_WITH)) { + if (selected.has(main)) { + for (const d of deps) selected.add(d); + } + } + return Array.from(selected); + } + + function syncToolsTextarea() { + const selected = getSelectedTools(); + const comments = pTools.value + .split("\n") + .filter((ln) => ln.trim().startsWith("#")); + const body = selected.join("\n"); + pTools.value = (comments.join("\n") + (comments.length ? "\n" : "") + body).trim() + "\n"; + } + + function attachToolHandlers() { + pAvail.addEventListener("change", (ev) => { + const target = ev.target; + if (!(target instanceof HTMLInputElement) || target.type !== "checkbox") return; + const name = target.value; + // If a main tool toggled, propagate to deps + if (AUTO_WITH[name]) { + for (const dep of AUTO_WITH[name]) { + const depEl = pAvail.querySelector(`input[value="${dep}"]`); + if (depEl) depEl.checked = target.checked || depEl.checked; + } + } + syncToolsTextarea(); + }); + } + + async function loadSelected() { + const selected = pSelect.value; + const data = await loadPersonality(selected); + pInstr.value = data.instructions || ""; + pTools.value = data.tools_text || ""; + pVoice.value = data.voice || "cedar"; + // Available tools as checkboxes + renderToolCheckboxes(data.available_tools, data.enabled_tools); + attachToolHandlers(); + // Default name field to last segment of selection + const idx = selected.lastIndexOf("/"); + pName.value = idx >= 0 ? 
selected.slice(idx + 1) : ""; + pStatus.textContent = `Loaded ${selected}`; + pStatus.className = "status"; + } + + pSelect.addEventListener("change", loadSelected); + await loadSelected(); + show(personalityPanel, true); + + // pAvail change handler registered in attachToolHandlers() + + pApply.addEventListener("click", async () => { + pStatus.textContent = "Applying..."; + pStatus.className = "status"; + try { + const res = await applyPersonality(pSelect.value); + if (res.startup) setStartupLabel(res.startup); + pStatus.textContent = res.status || "Applied."; + pStatus.className = "status ok"; + } catch (e) { + pStatus.textContent = `Failed to apply${e.message ? ": " + e.message : ""}`; + pStatus.className = "status error"; + } + }); + + pPersist.addEventListener("click", async () => { + pStatus.textContent = "Saving for startup..."; + pStatus.className = "status"; + try { + const res = await applyPersonality(pSelect.value, { persist: true }); + if (res.startup) setStartupLabel(res.startup); + pStatus.textContent = res.status || "Saved for startup."; + pStatus.className = "status ok"; + } catch (e) { + pStatus.textContent = `Failed to persist${e.message ? 
": " + e.message : ""}`; + pStatus.className = "status error"; + } + }); + + pNew.addEventListener("click", () => { + pName.value = ""; + pInstr.value = "# Write your instructions here\n# e.g., Keep responses concise and friendly."; + pTools.value = "# tools enabled for this profile\n"; + // Keep available tools list, clear selection + pAvail.querySelectorAll('input[type="checkbox"]').forEach((el) => { + el.checked = false; + }); + pVoice.value = "cedar"; + pStatus.textContent = "Fill fields and click Save."; + pStatus.className = "status"; + }); + + pSave.addEventListener("click", async () => { + const name = (pName.value || "").trim(); + if (!name) { + pStatus.textContent = "Enter a valid name."; + pStatus.className = "status warn"; + return; + } + pStatus.textContent = "Saving..."; + pStatus.className = "status"; + try { + // Ensure tools.txt reflects checkbox selection and auto-includes + syncToolsTextarea(); + const res = await savePersonality({ + name, + instructions: pInstr.value || "", + tools_text: pTools.value || "", + voice: pVoice.value || "cedar", + }); + // Refresh select choices + pSelect.innerHTML = ""; + for (const n of res.choices) { + const opt = document.createElement("option"); + opt.value = n; + opt.textContent = n; + if (n === res.value) opt.selected = true; + pSelect.appendChild(opt); + } + pStatus.textContent = "Saved."; + pStatus.className = "status ok"; + // Auto-apply + try { await applyPersonality(pSelect.value); } catch {} + } catch (e) { + pStatus.textContent = "Failed to save."; + pStatus.className = "status error"; + } + }); + } catch (e) { + statusEl.textContent = "UI failed to load. 
Please refresh."; + statusEl.className = "status warn"; + } finally { + // Hide loading when initial setup is done (regardless of key presence) + show(loading, false); + } +} + +window.addEventListener("DOMContentLoaded", init); diff --git a/src/reachy_mini_conversation_app/static/manifest.json b/src/reachy_mini_conversation_app/static/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..08a586600eaadcddb89b88965c02c8be9a97646b --- /dev/null +++ b/src/reachy_mini_conversation_app/static/manifest.json @@ -0,0 +1,10 @@ +{ + "name": "Reachy Mini Conversation", + "short_name": "Reachy Mini", + "start_url": "/", + "display": "standalone", + "background_color": "#ffffff", + "theme_color": "#000000", + "icons": [] +} + diff --git a/src/reachy_mini_conversation_app/static/style.css b/src/reachy_mini_conversation_app/static/style.css new file mode 100644 index 0000000000000000000000000000000000000000..b55678202eb61f63bd7c89b9440ad2607593e926 --- /dev/null +++ b/src/reachy_mini_conversation_app/static/style.css @@ -0,0 +1,317 @@ +:root { + --bg: #060b1a; + --bg-2: #071023; + --panel: rgba(11, 18, 36, 0.8); + --border: rgba(255, 255, 255, 0.08); + --text: #eaf2ff; + --muted: #9fb6d7; + --ok: #4ce0b3; + --warn: #ffb547; + --error: #ff5c70; + --accent: #45c4ff; + --accent-2: #5ef0c1; + --shadow: 0 20px 70px rgba(0, 0, 0, 0.45); +} + +* { box-sizing: border-box; } +body { + margin: 0; + min-height: 100vh; + font-family: "Space Grotesk", "Inter", "Segoe UI", sans-serif; + background: radial-gradient(circle at 20% 20%, rgba(69, 196, 255, 0.16), transparent 35%), + radial-gradient(circle at 80% 0%, rgba(94, 240, 193, 0.16), transparent 32%), + linear-gradient(135deg, var(--bg), var(--bg-2)); + color: var(--text); +} + +.ambient { + position: fixed; + inset: 0; + background: radial-gradient(circle at 30% 60%, rgba(255, 255, 255, 0.05), transparent 35%), + radial-gradient(circle at 75% 30%, rgba(69, 196, 255, 0.08), transparent 32%); + filter: 
blur(60px); + z-index: 0; + pointer-events: none; +} + +/* Loading overlay */ +.loading { + position: fixed; + inset: 0; + background: rgba(5, 10, 24, 0.92); + backdrop-filter: blur(4px); + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + z-index: 9999; +} +.loading .spinner { + width: 46px; + height: 46px; + border: 4px solid rgba(255,255,255,0.15); + border-top-color: var(--accent); + border-radius: 50%; + animation: spin 1s linear infinite; + margin-bottom: 12px; +} +.loading p { color: var(--muted); margin: 0; letter-spacing: 0.4px; } +@keyframes spin { to { transform: rotate(360deg); } } + +.container { + position: relative; + max-width: 960px; + margin: 7vh auto; + padding: 0 24px 40px; + z-index: 1; +} + +.hero { + margin-bottom: 24px; +} +.hero h1 { + margin: 6px 0 6px; + font-size: 32px; + letter-spacing: -0.4px; +} +.subtitle { + margin: 0; + color: var(--muted); + line-height: 1.5; +} +.pill { + display: inline-flex; + align-items: center; + gap: 6px; + padding: 6px 12px; + border-radius: 999px; + background: rgba(94, 240, 193, 0.1); + color: var(--accent-2); + font-size: 12px; + letter-spacing: 0.3px; + border: 1px solid rgba(94, 240, 193, 0.25); +} + +.panel { + background: var(--panel); + border: 1px solid var(--border); + border-radius: 14px; + padding: 18px 18px 16px; + box-shadow: var(--shadow); + backdrop-filter: blur(10px); + margin-top: 16px; +} +.panel-heading { + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; + margin-bottom: 8px; +} +.panel-heading h2 { + margin: 2px 0; + font-size: 22px; +} +.eyebrow { + margin: 0; + text-transform: uppercase; + font-size: 11px; + letter-spacing: 0.5px; + color: var(--muted); +} +.muted { color: var(--muted); } +.chip { + display: inline-flex; + align-items: center; + padding: 6px 10px; + border-radius: 999px; + font-size: 12px; + color: var(--text); + background: rgba(255, 255, 255, 0.08); + border: 1px solid var(--border); +} 
+.chip-ok { + background: rgba(76, 224, 179, 0.15); + color: var(--ok); + border-color: rgba(76, 224, 179, 0.4); +} + +.hidden { display: none; } +label { + display: block; + margin: 8px 0 6px; + font-size: 13px; + color: var(--muted); + letter-spacing: 0.2px; +} +input[type="password"], +input[type="text"], +select, +textarea { + width: 100%; + padding: 12px 14px; + border: 1px solid var(--border); + border-radius: 10px; + background: rgba(255, 255, 255, 0.04); + color: var(--text); + transition: border 0.15s ease, box-shadow 0.15s ease; +} +input:focus, +select:focus, +textarea:focus { + border-color: rgba(94, 240, 193, 0.7); + outline: none; + box-shadow: 0 0 0 3px rgba(94, 240, 193, 0.15); +} +input.error { + border-color: var(--error); + box-shadow: 0 0 0 3px rgba(255, 92, 112, 0.15); +} +select option { + background: #0b152a; + color: var(--text); +} +textarea { resize: vertical; } + +button { + display: inline-flex; + align-items: center; + justify-content: center; + margin-top: 12px; + padding: 11px 16px; + border: none; + border-radius: 10px; + background: linear-gradient(120deg, var(--accent), var(--accent-2)); + color: #031022; + cursor: pointer; + font-weight: 600; + letter-spacing: 0.2px; + box-shadow: 0 14px 40px rgba(69, 196, 255, 0.25); + transition: transform 0.12s ease, filter 0.12s ease, box-shadow 0.12s ease; +} +button:hover { filter: brightness(1.06); transform: translateY(-1px); } +button:active { transform: translateY(0); } +button.ghost { + background: rgba(255, 255, 255, 0.05); + color: var(--text); + box-shadow: none; + border: 1px solid var(--border); +} +button.ghost:hover { border-color: rgba(94, 240, 193, 0.4); } +.actions { + display: flex; + align-items: center; + gap: 12px; + flex-wrap: wrap; +} +.status { + margin: 0; + color: var(--muted); + font-size: 13px; +} +.status.ok { color: var(--ok); } +.status.warn { color: var(--warn); } +.status.error { color: var(--error); } + +/* Personality layout */ +.row { + display: grid; + 
grid-template-columns: 160px 1fr; + gap: 12px 18px; + align-items: center; + margin-top: 12px; +} +.row > label { margin: 0; } +.row > button { margin: 0; } + +/* First row: controls inline */ +#personality-panel .row-top { + grid-template-columns: 160px 1fr auto auto auto; +} + +#tools-available { + max-height: 240px; + overflow: auto; + padding: 10px; + border: 1px solid var(--border); + border-radius: 10px; + background: rgba(255, 255, 255, 0.03); +} + +/* Checkbox grid for tools */ +.checkbox-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(170px, 1fr)); + gap: 10px 14px; +} +.startup-row { + display: flex; + align-items: center; + gap: 10px; + flex-wrap: wrap; +} +.row-save .actions { + justify-content: flex-start; +} +.input-field { + width: 100%; + padding: 12px 14px; + border: 1px solid var(--border); + border-radius: 10px; + background: rgba(255, 255, 255, 0.05); + color: var(--text); + transition: border 0.15s ease, box-shadow 0.15s ease; +} +.input-field:focus { + border-color: rgba(94, 240, 193, 0.7); + outline: none; + box-shadow: 0 0 0 3px rgba(94, 240, 193, 0.15); +} +.section { + border: 1px solid var(--border); + border-radius: 12px; + padding: 12px 14px; + margin-top: 14px; + background: rgba(255, 255, 255, 0.02); +} +.section-heading { + display: flex; + align-items: baseline; + gap: 10px; + justify-content: space-between; +} +.section-heading h3 { + margin: 6px 0; + font-size: 16px; + letter-spacing: -0.1px; +} +.section-heading .small { + margin: 0; + font-size: 12px; +} +.checkbox-grid .chk { + display: flex; + align-items: center; + gap: 8px; + padding: 8px 10px; + border-radius: 10px; + background: rgba(255, 255, 255, 0.02); + border: 1px solid transparent; + transition: border 0.12s ease, background 0.12s ease; +} +.checkbox-grid .chk:hover { border-color: rgba(94, 240, 193, 0.3); background: rgba(255, 255, 255, 0.04); } +.checkbox-grid input[type="checkbox"] { + width: 16px; height: 16px; + accent-color: 
var(--accent); +} +.checkbox-grid label { + margin: 0; font-size: 13px; color: var(--text); +} + +@media (max-width: 760px) { + .hero h1 { font-size: 26px; } + .row { grid-template-columns: 1fr; } + #personality-panel .row:first-of-type { grid-template-columns: 1fr; } + button { width: 100%; justify-content: center; } + .actions { flex-direction: column; align-items: flex-start; } +} diff --git a/src/reachy_mini_conversation_app/tools/__init__.py b/src/reachy_mini_conversation_app/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5887927462555a6876812548ec6f920e1bf7f91d --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/__init__.py @@ -0,0 +1,4 @@ +"""Tools library for Reachy Mini conversation app. + +Tools are now loaded dynamically based on the profile's tools.txt file. +""" diff --git a/src/reachy_mini_conversation_app/tools/camera.py b/src/reachy_mini_conversation_app/tools/camera.py new file mode 100644 index 0000000000000000000000000000000000000000..0c7fbb8f0a709f3ee73370fccfa5028e78df39e7 --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/camera.py @@ -0,0 +1,68 @@ +import base64 +import asyncio +import logging +from typing import Any, Dict + +import cv2 + +from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies + + +logger = logging.getLogger(__name__) + + +class Camera(Tool): + """Take a picture with the camera and ask a question about it.""" + + name = "camera" + description = "Take a picture with the camera and ask a question about it." 
+ parameters_schema = { + "type": "object", + "properties": { + "question": { + "type": "string", + "description": "The question to ask about the picture", + }, + }, + "required": ["question"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Take a picture with the camera and ask a question about it.""" + image_query = (kwargs.get("question") or "").strip() + if not image_query: + logger.warning("camera: empty question") + return {"error": "question must be a non-empty string"} + + logger.info("Tool call: camera question=%s", image_query[:120]) + + # Get frame from camera worker buffer (like main_works.py) + if deps.camera_worker is not None: + frame = deps.camera_worker.get_latest_frame() + if frame is None: + logger.error("No frame available from camera worker") + return {"error": "No frame available"} + else: + logger.error("Camera worker not available") + return {"error": "Camera worker not available"} + + # Use vision manager for processing if available + if deps.vision_manager is not None: + vision_result = await asyncio.to_thread( + deps.vision_manager.processor.process_image, frame, image_query, + ) + if isinstance(vision_result, dict) and "error" in vision_result: + return vision_result + return ( + {"image_description": vision_result} + if isinstance(vision_result, str) + else {"error": "vision returned non-string"} + ) + + # Encode image directly to JPEG bytes without writing to file + success, buffer = cv2.imencode('.jpg', frame) + if not success: + raise RuntimeError("Failed to encode frame as JPEG") + + b64_encoded = base64.b64encode(buffer.tobytes()).decode("utf-8") + return {"b64_im": b64_encoded} diff --git a/src/reachy_mini_conversation_app/tools/core_tools.py b/src/reachy_mini_conversation_app/tools/core_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..641394e1e2bc4ef088bb3cc29d1686227a297d28 --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/core_tools.py @@ 
-0,0 +1,224 @@ +from __future__ import annotations +import abc +import sys +import json +import inspect +import logging +import importlib +from typing import Any, Dict, List +from pathlib import Path +from dataclasses import dataclass + +from reachy_mini import ReachyMini +# Import config to ensure .env is loaded before reading REACHY_MINI_CUSTOM_PROFILE +from reachy_mini_conversation_app.config import config # noqa: F401 + + +logger = logging.getLogger(__name__) + + +PROFILES_DIRECTORY = "reachy_mini_conversation_app.profiles" + +if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter("%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + + +ALL_TOOLS: Dict[str, "Tool"] = {} +ALL_TOOL_SPECS: List[Dict[str, Any]] = [] +_TOOLS_INITIALIZED = False + + + +def get_concrete_subclasses(base: type[Tool]) -> List[type[Tool]]: + """Recursively find all concrete (non-abstract) subclasses of a base class.""" + result: List[type[Tool]] = [] + for cls in base.__subclasses__(): + if not inspect.isabstract(cls): + result.append(cls) + # recurse into subclasses + result.extend(get_concrete_subclasses(cls)) + return result + + +@dataclass +class ToolDependencies: + """External dependencies injected into tools.""" + + reachy_mini: ReachyMini + movement_manager: Any # MovementManager from moves.py + # Optional deps + camera_worker: Any | None = None # CameraWorker for frame buffering + vision_manager: Any | None = None + head_wobbler: Any | None = None # HeadWobbler for audio-reactive motion + motion_duration_s: float = 1.0 + + +# Tool base class +class Tool(abc.ABC): + """Base abstraction for tools used in function-calling. 
+ + Each tool must define: + - name: str + - description: str + - parameters_schema: Dict[str, Any] # JSON Schema + """ + + name: str + description: str + parameters_schema: Dict[str, Any] + + def spec(self) -> Dict[str, Any]: + """Return the function spec for LLM consumption.""" + return { + "type": "function", + "name": self.name, + "description": self.description, + "parameters": self.parameters_schema, + } + + @abc.abstractmethod + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Async tool execution entrypoint.""" + raise NotImplementedError + + +# Registry & specs (dynamic) +def _load_profile_tools() -> None: + """Load tools based on profile's tools.txt file.""" + # Determine which profile to use + profile = config.REACHY_MINI_CUSTOM_PROFILE or "default" + logger.info(f"Loading tools for profile: {profile}") + + # Build path to tools.txt + # Get the profile directory path + profile_module_path = Path(__file__).parent.parent / "profiles" / profile + tools_txt_path = profile_module_path / "tools.txt" + + if not tools_txt_path.exists(): + logger.error(f"โœ— tools.txt not found at {tools_txt_path}") + sys.exit(1) + + # Read and parse tools.txt + try: + with open(tools_txt_path, "r") as f: + lines = f.readlines() + except Exception as e: + logger.error(f"โœ— Failed to read tools.txt: {e}") + sys.exit(1) + + # Parse tool names (skip comments and blank lines) + tool_names = [] + for line in lines: + line = line.strip() + # Skip blank lines and comments + if not line or line.startswith("#"): + continue + tool_names.append(line) + + logger.info(f"Found {len(tool_names)} tools to load: {tool_names}") + + # Import each tool + for tool_name in tool_names: + loaded = False + profile_error = None + + # Try profile-local tool first + try: + profile_tool_module = f"{PROFILES_DIRECTORY}.{profile}.{tool_name}" + importlib.import_module(profile_tool_module) + logger.info(f"โœ“ Loaded profile-local tool: {tool_name}") + loaded = True + 
except ModuleNotFoundError as e: + # Check if it's the tool module itself that's missing (expected) or a dependency + if tool_name in str(e): + pass # Tool not in profile directory, try shared tools + else: + # Missing import dependency within the tool file + profile_error = f"Missing dependency: {e}" + logger.error(f"โŒ Failed to load profile-local tool '{tool_name}': {profile_error}") + logger.error(f" Module path: {profile_tool_module}") + except ImportError as e: + profile_error = f"Import error: {e}" + logger.error(f"โŒ Failed to load profile-local tool '{tool_name}': {profile_error}") + logger.error(f" Module path: {profile_tool_module}") + except Exception as e: + profile_error = f"{type(e).__name__}: {e}" + logger.error(f"โŒ Failed to load profile-local tool '{tool_name}': {profile_error}") + logger.error(f" Module path: {profile_tool_module}") + + # Try shared tools library if not found in profile + if not loaded: + try: + shared_tool_module = f"reachy_mini_conversation_app.tools.{tool_name}" + importlib.import_module(shared_tool_module) + logger.info(f"โœ“ Loaded shared tool: {tool_name}") + loaded = True + except ModuleNotFoundError: + if profile_error: + # Already logged error from profile attempt + logger.error(f"โŒ Tool '{tool_name}' also not found in shared tools") + else: + logger.warning(f"โš ๏ธ Tool '{tool_name}' not found in profile or shared tools") + except ImportError as e: + logger.error(f"โŒ Failed to load shared tool '{tool_name}': Import error: {e}") + logger.error(f" Module path: {shared_tool_module}") + except Exception as e: + logger.error(f"โŒ Failed to load shared tool '{tool_name}': {type(e).__name__}: {e}") + logger.error(f" Module path: {shared_tool_module}") + + +def _initialize_tools() -> None: + """Populate registry once, even if module is imported repeatedly.""" + global ALL_TOOLS, ALL_TOOL_SPECS, _TOOLS_INITIALIZED + + if _TOOLS_INITIALIZED: + logger.debug("Tools already initialized; skipping reinitialization.") + 
return + + _load_profile_tools() + + ALL_TOOLS = {cls.name: cls() for cls in get_concrete_subclasses(Tool)} # type: ignore[type-abstract] + ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()] + + for tool_name, tool in ALL_TOOLS.items(): + logger.info(f"tool registered: {tool_name} - {tool.description}") + + _TOOLS_INITIALIZED = True + + +_initialize_tools() + + +def get_tool_specs(exclusion_list: list[str] = []) -> list[Dict[str, Any]]: + """Get tool specs, optionally excluding some tools.""" + return [spec for spec in ALL_TOOL_SPECS if spec.get("name") not in exclusion_list] + + +# Dispatcher +def _safe_load_obj(args_json: str) -> Dict[str, Any]: + try: + parsed_args = json.loads(args_json or "{}") + return parsed_args if isinstance(parsed_args, dict) else {} + except Exception: + logger.warning("bad args_json=%r", args_json) + return {} + + +async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]: + """Dispatch a tool call by name with JSON args and dependencies.""" + tool = ALL_TOOLS.get(tool_name) + + if not tool: + return {"error": f"unknown tool: {tool_name}"} + + args = _safe_load_obj(args_json) + try: + return await tool(deps, **args) + except Exception as e: + msg = f"{type(e).__name__}: {e}" + logger.exception("Tool error in %s: %s", tool_name, msg) + return {"error": msg} diff --git a/src/reachy_mini_conversation_app/tools/cursor_code.py b/src/reachy_mini_conversation_app/tools/cursor_code.py new file mode 100644 index 0000000000000000000000000000000000000000..97b7cb94f2346b2816b27d1f8476c513f37dc9ea --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/cursor_code.py @@ -0,0 +1,324 @@ +"""Reachy the Vibe Coder - Voice-activated coding with Cursor IDE. + +Let Reachy be your creative coding partner! Just say what you want to build, +and Reachy will craft a detailed vision and send it to Cursor to make it real. + +"Reachy, build me a website" โ†’ Reachy decides everything and codes it for you! 
+ +Supports two modes: +1. AppleScript automation (macOS only) - directly controls Cursor +2. MCP Bridge mode - sends requests via MCP server that Cursor connects to +""" + +import os +import asyncio +import logging +import subprocess +import platform +from typing import Any, Dict + +from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies + + +logger = logging.getLogger(__name__) + +# Check if MCP bridge mode is enabled +USE_MCP_BRIDGE = os.getenv("REACHY_CURSOR_MCP_BRIDGE", "false").lower() == "true" + + +class VibeCode(Tool): + """Reachy the Vibe Coder - Your creative coding partner! + + Just tell Reachy what you want to build, and watch the magic happen. + Reachy decides the tech stack, colors, layout, animations - EVERYTHING. + Then sends it to Cursor to bring your vision to life! + + ๐ŸŽจ Creative Director Mode: ON + ๐Ÿš€ Vibe Coding: ENABLED + """ + + name = "vibe_code" + description = ( + "๐ŸŽจ REACHY THE VIBE CODER - Your creative coding partner! " + "Tell me what to build and I'll handle EVERYTHING: tech stack, colors, " + "layout, animations, the works! I'm your creative director - just vibe with me! " + "Say 'build a website' or 'make an app' and watch the magic happen in Cursor!" + ) + parameters_schema = { + "type": "object", + "properties": { + "request": { + "type": "string", + "description": ( + "A DETAILED coding request with ALL your creative decisions. " + "YOU decide: tech stack (React, Vue, vanilla JS, Python, etc.), " + "styling (Tailwind, CSS modules, styled-components), " + "color palette (specific hex codes), typography, layout, " + "components to create, animations, and features. " + "Be SPECIFIC and OPINIONATED. Example: " + "'Create a landing page using Next.js 14 and Tailwind CSS. " + "Use a dark theme with #0a0a0a background and #22d3ee cyan accents. 
" + "Include: 1) Hero with animated gradient text, 2) Features grid with " + "hover animations, 3) Pricing cards with glassmorphism effect, " + "4) Footer with social links. Use Inter font. Add smooth scroll.'" + ), + }, + "auto_submit": { + "type": "boolean", + "description": "Whether to automatically submit the request. Default is true.", + }, + }, + "required": ["request"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Execute the cursor coding request.""" + request = kwargs.get("request", "") + auto_submit = kwargs.get("auto_submit", True) + + if not request: + return {"error": "No coding request provided"} + + logger.info("Cursor code request: %s", request[:100]) + + # Use MCP bridge mode if enabled + if USE_MCP_BRIDGE: + try: + from reachy_mini_conversation_app.mcp.cursor_bridge import send_to_cursor_via_bridge + result = await send_to_cursor_via_bridge(request, None) + return result + except Exception as e: + logger.error("MCP bridge failed: %s", e) + # Fall back to AppleScript if available + if platform.system() == "Darwin": + logger.info("Falling back to AppleScript automation") + else: + return {"error": str(e)} + + # Check platform for AppleScript automation + if platform.system() != "Darwin": + return { + "error": "This tool currently only supports macOS", + "suggestion": ( + "Options:\n" + "1. Set REACHY_CURSOR_MCP_BRIDGE=true and configure Cursor to connect to the MCP bridge\n" + "2. Manual workaround: Open Cursor, press Cmd+I, and type your request" + ), + } + + try: + result = await self._send_to_cursor(request, auto_submit) + return result + except Exception as e: + logger.error("Failed to send to Cursor: %s", e) + return { + "error": str(e), + "fallback": "Please open Cursor manually and use Cmd+I to open Composer", + } + + async def _send_to_cursor(self, prompt: str, auto_submit: bool) -> Dict[str, Any]: + """Send the prompt to Cursor using AppleScript automation. 
+ + Opens a new Cursor window and sends the prompt to Composer. + """ + # Escape special characters for AppleScript + escaped_prompt = prompt.replace("\\", "\\\\").replace('"', '\\"').replace("'", "'\\''") + + # AppleScript to: + # 1. Activate Cursor + # 2. Open a new window + # 3. Open Composer with Cmd+I + # 4. Type the prompt character by character + # 5. Submit + applescript = f''' + tell application "Cursor" + activate + end tell + + delay 1.5 + + tell application "System Events" + tell process "Cursor" + set frontmost to true + delay 0.5 + + -- Open a new window + keystroke "n" using {{command down, shift down}} + delay 2.0 + + -- Open Composer with Cmd+I + keystroke "i" using command down + delay 2.0 + + -- Type the prompt character by character for reliability + set thePrompt to "{escaped_prompt}" + repeat with i from 1 to length of thePrompt + keystroke (character i of thePrompt) + delay 0.01 + end repeat + + {"-- Submit with Enter" if auto_submit else ""} + {f'delay 1.0' if auto_submit else ""} + {f'keystroke return' if auto_submit else ""} + end tell + end tell + + return "success" + ''' + + # Run the AppleScript + try: + proc = await asyncio.create_subprocess_exec( + "osascript", "-e", applescript, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0) + + if proc.returncode != 0: + error_msg = stderr.decode().strip() if stderr else "Unknown error" + + # Check for accessibility permissions + if "not allowed assistive access" in error_msg.lower(): + return { + "error": "Accessibility permissions required", + "instructions": ( + "To enable voice coding:\n" + "1. Open System Settings โ†’ Privacy & Security โ†’ Accessibility\n" + "2. Add and enable the terminal app you're running Reachy from\n" + "3. 
Try again after granting permission" + ), + } + + return {"error": f"AppleScript failed: {error_msg}"} + + return { + "status": "sent_to_cursor", + "message": f"Opened new Cursor window and sent prompt", + "auto_submitted": auto_submit, + } + + except asyncio.TimeoutError: + return {"error": "Timeout waiting for Cursor response"} + except FileNotFoundError: + return {"error": "osascript not found - are you on macOS?"} + + +class CursorAgent(Tool): + """Use Cursor's Agent mode for complex multi-file tasks. + + This opens Cursor's Agent (Cmd+Shift+I) which can make changes + across multiple files and run terminal commands. + """ + + name = "cursor_agent" + description = ( + "Use Cursor's Agent mode for complex coding tasks that involve multiple files " + "or running commands. Use this for larger tasks like 'build an entire app', " + "'refactor the codebase', or 'set up a new project with tests'." + ) + parameters_schema = { + "type": "object", + "properties": { + "task": { + "type": "string", + "description": ( + "The complex coding task to send to Cursor Agent. " + "Agent mode is best for multi-file changes and project-wide tasks." + ), + }, + "auto_submit": { + "type": "boolean", + "description": "Whether to automatically start the agent. 
Default is true.", + }, + }, + "required": ["task"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Execute the cursor agent task.""" + task = kwargs.get("task", "") + auto_submit = kwargs.get("auto_submit", True) + + if not task: + return {"error": "No task provided"} + + logger.info("Cursor agent task: %s", task[:100]) + + if platform.system() != "Darwin": + return { + "error": "This tool currently only supports macOS", + "suggestion": "Manual workaround: Open Cursor, press Cmd+Shift+I for Agent mode", + } + + try: + result = await self._send_to_cursor_agent(task, auto_submit) + return result + except Exception as e: + logger.error("Failed to send to Cursor Agent: %s", e) + return {"error": str(e)} + + async def _send_to_cursor_agent(self, task: str, auto_submit: bool) -> Dict[str, Any]: + """Send the task to Cursor Agent using AppleScript automation.""" + + escaped_task = task.replace("\\", "\\\\").replace('"', '\\"') + + # AppleScript for Agent mode (Cmd+Shift+I) + applescript = f''' + tell application "Cursor" + activate + end tell + + delay 0.5 + + tell application "System Events" + tell process "Cursor" + -- Open Agent with Cmd+Shift+I + keystroke "i" using {{command down, shift down}} + delay 1.0 + + -- Type the task + keystroke "{escaped_task}" + + {"-- Submit with Enter" if auto_submit else ""} + {f'delay 0.3' if auto_submit else ""} + {f'keystroke return' if auto_submit else ""} + end tell + end tell + + return "success" + ''' + + try: + proc = await asyncio.create_subprocess_exec( + "osascript", "-e", applescript, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=10.0) + + if proc.returncode != 0: + error_msg = stderr.decode().strip() if stderr else "Unknown error" + + if "not allowed assistive access" in error_msg.lower(): + return { + "error": "Accessibility permissions required", + "instructions": ( + "To enable 
class Dance(Tool):
    """Queue a named or random dance move on the movement manager.

    Moves are executed asynchronously by the movement manager, so this
    tool returns immediately after queueing (non-blocking).
    """

    name = "dance"
    description = "Play a named or random dance move once (or repeat). Non-blocking."
    parameters_schema = {
        "type": "object",
        "properties": {
            "move": {
                "type": "string",
                # This description is sent verbatim to the model so it can pick
                # a move by name; keep it in sync with AVAILABLE_MOVES.
                "description": """Name of the move; use 'random' or omit for random.
                    Here is a list of the available moves:
                    simple_nod: A simple, continuous up-and-down nodding motion.
                    head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
                    side_to_side_sway: A smooth, side-to-side sway of the entire head.
                    dizzy_spin: A circular 'dizzy' head motion combining roll and pitch.
                    stumble_and_recover: A simulated stumble and recovery with multiple axis movements. Good vibes
                    interwoven_spirals: A complex spiral motion using three axes at different frequencies.
                    sharp_side_tilt: A sharp, quick side-to-side tilt using a triangle waveform.
                    side_peekaboo: A multi-stage peekaboo performance, hiding and peeking to each side.
                    yeah_nod: An emphatic two-part yeah nod using transient motions.
                    uh_huh_tilt: A combined roll-and-pitch uh-huh gesture of agreement.
                    neck_recoil: A quick, transient backward recoil of the neck.
                    chin_lead: A forward motion led by the chin, combining translation and pitch.
                    groovy_sway_and_roll: A side-to-side sway combined with a corresponding roll for a groovy effect.
                    chicken_peck: A sharp, forward, chicken-like pecking motion.
                    side_glance_flick: A quick glance to the side that holds, then returns.
                    polyrhythm_combo: A 3-beat sway and a 2-beat nod create a polyrhythmic feel.
                    grid_snap: A robotic, grid-snapping motion using square waveforms.
                    pendulum_swing: A simple, smooth pendulum-like swing using a roll motion.
                    jackson_square: Traces a rectangle via a 5-point path, with sharp twitches on arrival at each checkpoint.
                    """,
            },
            "repeat": {
                "type": "integer",
                "description": "How many times to repeat the move (default 1).",
            },
        },
        "required": [],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Queue a dance move.

        Args:
            deps: Tool dependencies; provides the movement manager.
            **kwargs: ``move`` (optional move name, or 'random'/omitted for a
                random pick) and ``repeat`` (optional repeat count, >= 1).

        Returns:
            A status dict on success, or an ``error`` dict for bad input or
            an unknown move name.
        """
        if not DANCE_AVAILABLE:
            return {"error": "Dance system not available"}

        move_name = kwargs.get("move")

        # Fix: validate 'repeat' instead of letting int() raise out of the tool
        # call, and clamp to >= 1 so repeat=0 or a negative value cannot
        # silently queue nothing while still reporting "queued".
        try:
            repeat = max(1, int(kwargs.get("repeat", 1)))
        except (TypeError, ValueError):
            return {"error": f"Invalid repeat value: {kwargs.get('repeat')!r}"}

        logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)

        if not move_name or move_name == "random":
            import random

            move_name = random.choice(list(AVAILABLE_MOVES.keys()))

        if move_name not in AVAILABLE_MOVES:
            return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}

        # Enqueue one DanceQueueMove per requested repetition; playback happens
        # asynchronously in the movement manager.
        movement_manager = deps.movement_manager
        for _ in range(repeat):
            movement_manager.queue_move(DanceQueueMove(move_name))

        return {"status": "queued", "move": move_name, "repeat": repeat}
class DoNothing(Tool):
    """Deliberately remain idle and silent for a turn.

    Queues no motion and produces no speech; it simply acknowledges the
    choice to stay still, optionally with a reason.
    """

    name = "do_nothing"
    description = "Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."
    parameters_schema = {
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Optional reason for doing nothing (e.g., 'contemplating existence', 'saving energy', 'being mysterious')",
            },
        },
        "required": [],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Log and report the intentional idleness; no side effects on the robot."""
        idle_reason = kwargs.get("reason", "just chilling")
        logger.info("Tool call: do_nothing reason=%s", idle_reason)
        return {"status": "doing nothing", "reason": idle_reason}
class FigmaDesign(Tool):
    """Tool for creating and modifying Figma designs.

    Uses the Figma REST API (read-only for file contents); for write-style
    actions it returns step-by-step guidance plus a machine-readable
    ``suggested_action`` spec instead of mutating the file.
    """

    name = "figma_design"
    description = (
        "Create and modify designs in Figma. Can create new design files, "
        "add frames, rectangles, text, and other elements. Use this when asked "
        "to design a website, app, or any visual design. You can also view and "
        "modify existing Figma files."
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "action": {
                "type": "string",
                "enum": [
                    "create_file",
                    "add_frame",
                    "add_rectangle",
                    "add_text",
                    "add_component",
                    "get_file",
                    "update_node",
                    "get_styles",
                ],
                "description": "The design action to perform",
            },
            "file_key": {
                "type": "string",
                "description": "Figma file key (required for most actions except create_file)",
            },
            "name": {
                "type": "string",
                "description": "Name for the new file, frame, or element",
            },
            "parent_id": {
                "type": "string",
                "description": "Parent node ID to add elements to",
            },
            "x": {"type": "number", "description": "X position"},
            "y": {"type": "number", "description": "Y position"},
            "width": {"type": "number", "description": "Width of the element"},
            "height": {"type": "number", "description": "Height of the element"},
            "text": {"type": "string", "description": "Text content for text elements"},
            "fill_color": {
                "type": "string",
                "description": "Fill color in hex format (e.g., '#FF5733')",
            },
            "node_id": {"type": "string", "description": "Node ID for updates"},
            "properties": {
                "type": "object",
                "description": "Additional properties to set on the node",
            },
        },
        "required": ["action"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Dispatch the requested Figma action.

        Returns an ``error`` dict when the token is missing, the action is
        unknown, or the handler raises.
        """
        action = kwargs.get("action", "")
        access_token = os.getenv("FIGMA_ACCESS_TOKEN")

        if not access_token:
            return {
                "error": "No Figma access token configured. Please set FIGMA_ACCESS_TOKEN in .env"
            }

        # Fix: 'add_component' was advertised in the schema enum but had no
        # dispatch branch, so a schema-valid request always hit the
        # unknown-action error. A table keeps the dispatch and the enum in sync.
        handlers = {
            "create_file": self._create_file,
            "add_frame": self._add_frame,
            "add_rectangle": self._add_rectangle,
            "add_text": self._add_text,
            "add_component": self._add_component,
            "get_file": self._get_file,
            "get_styles": self._get_styles,
            "update_node": self._update_node,
        }
        handler = handlers.get(action)
        if handler is None:
            return {"error": f"Unknown action: {action}"}

        try:
            return await handler(access_token, kwargs)
        except Exception as e:
            logger.error(f"Figma design error: {e}")
            return {"error": str(e)}

    async def _call_figma_api(
        self,
        access_token: str,
        method: str,
        endpoint: str,
        data: Optional[Dict] = None
    ) -> Dict[str, Any]:
        """Make a call to the Figma REST API and return the decoded JSON body.

        HTTP errors (status >= 400) are reported as an ``error`` dict rather
        than raised, so callers can pass them straight through.
        """
        import httpx

        base_url = "https://api.figma.com/v1"
        headers = {
            "X-Figma-Token": access_token,
            "Content-Type": "application/json",
        }

        async with httpx.AsyncClient() as client:
            if method == "GET":
                response = await client.get(
                    f"{base_url}{endpoint}",
                    headers=headers,
                    timeout=30.0,
                )
            elif method == "POST":
                response = await client.post(
                    f"{base_url}{endpoint}",
                    headers=headers,
                    json=data,
                    timeout=30.0,
                )
            elif method == "PUT":
                response = await client.put(
                    f"{base_url}{endpoint}",
                    headers=headers,
                    json=data,
                    timeout=30.0,
                )
            else:
                return {"error": f"Unsupported method: {method}"}

            if response.status_code >= 400:
                return {
                    "error": f"Figma API error {response.status_code}: {response.text}"
                }

            return response.json()

    async def _create_file(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Guide the user through creating a file.

        Note: the Figma REST API has no endpoint to create files, so this
        returns manual instructions instead of calling the API.
        """
        name = kwargs.get("name", "New Design")

        return {
            "result": (
                f"To create a new Figma file '{name}':\n\n"
                "1. Go to figma.com and click '+ New design file'\n"
                "2. Rename it to your desired name\n"
                "3. Share the file URL with me and I can add elements to it!\n\n"
                "Or duplicate an existing template:\n"
                "- figma.com/community for templates\n\n"
                "Once you have a file, give me the URL and I'll help build it!"
            ),
            "action_required": "create_file_manually",
        }

    async def _get_file(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Fetch a file and summarize its pages and top-level frames."""
        file_key = kwargs.get("file_key", "")
        if not file_key:
            return {"error": "file_key is required"}

        result = await self._call_figma_api(access_token, "GET", f"/files/{file_key}")

        if "error" in result:
            return result

        name = result.get("name", "Unknown")
        last_modified = result.get("lastModified", "Unknown")
        document = result.get("document", {})
        pages = document.get("children", [])

        # Cap the summary (10 pages, 5 frames each) to keep the tool result small.
        page_info = []
        for page in pages[:10]:
            page_name = page.get("name", "Unnamed")
            children = page.get("children", [])
            frames = [c for c in children if c.get("type") == "FRAME"]
            page_info.append({
                "name": page_name,
                "id": page.get("id"),
                "frame_count": len(frames),
                "frames": [{"name": f.get("name"), "id": f.get("id")} for f in frames[:5]],
            })

        return {
            "result": {
                "name": name,
                "last_modified": last_modified,
                "pages": page_info,
            },
            "message": f"File '{name}' has {len(pages)} pages. Use the page/frame IDs to add elements.",
        }

    async def _get_styles(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Fetch the published styles of a file (truncated to 20 entries)."""
        file_key = kwargs.get("file_key", "")
        if not file_key:
            return {"error": "file_key is required"}

        result = await self._call_figma_api(access_token, "GET", f"/files/{file_key}/styles")

        if "error" in result:
            return result

        styles = result.get("meta", {}).get("styles", [])
        return {
            "result": {
                "styles": styles[:20],  # Limit to 20 styles
                "count": len(styles),
            }
        }

    async def _add_frame(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Return guidance for adding a frame (REST API cannot write files)."""
        name = kwargs.get("name", "Frame")
        width = kwargs.get("width", 1440)
        height = kwargs.get("height", 900)

        return {
            "result": (
                f"To add a frame '{name}' ({width}x{height}):\n\n"
                "**In Figma:**\n"
                "1. Press 'F' or select Frame tool\n"
                "2. Draw on canvas or select a preset size\n"
                f"3. Set size to {width}x{height} in the right panel\n"
                f"4. Rename to '{name}'\n\n"
                "**Common frame sizes:**\n"
                "• Desktop: 1440x900\n"
                "• Mobile: 375x812\n"
                "• Tablet: 768x1024\n\n"
                "I can help with the layout structure once you have the frame!"
            ),
            "suggested_action": {
                "type": "FRAME",
                "name": name,
                "width": width,
                "height": height,
            },
        }

    async def _add_rectangle(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Return guidance plus a spec for adding a rectangle."""
        name = kwargs.get("name", "Rectangle")
        width = kwargs.get("width", 200)
        height = kwargs.get("height", 100)
        fill_color = kwargs.get("fill_color", "#3B82F6")
        x = kwargs.get("x", 0)
        y = kwargs.get("y", 0)

        return {
            "result": (
                f"To add rectangle '{name}':\n\n"
                "**In Figma:**\n"
                "1. Press 'R' or select Rectangle tool\n"
                f"2. Draw at position ({x}, {y})\n"
                f"3. Set size to {width}x{height}\n"
                f"4. Set fill color to {fill_color}\n\n"
                "**Quick tip:** Hold Shift while drawing for a perfect square!"
            ),
            "suggested_action": {
                "type": "RECTANGLE",
                "name": name,
                "x": x,
                "y": y,
                "width": width,
                "height": height,
                "fill": fill_color,
            },
        }

    async def _add_text(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Return guidance plus a spec for adding a text node."""
        text = kwargs.get("text", "Hello World")
        name = kwargs.get("name", "Text")
        x = kwargs.get("x", 0)
        y = kwargs.get("y", 0)

        return {
            "result": (
                f"To add text '{name}':\n\n"
                "**In Figma:**\n"
                "1. Press 'T' or select Text tool\n"
                f"2. Click at position ({x}, {y})\n"
                f"3. Type: \"{text}\"\n\n"
                "**Typography tips:**\n"
                "• Headlines: 32-48px, bold\n"
                "• Body text: 16-18px, regular\n"
                "• Captions: 12-14px, light"
            ),
            "suggested_action": {
                "type": "TEXT",
                "name": name,
                "content": text,
                "x": x,
                "y": y,
            },
        }

    async def _add_component(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Return guidance plus a spec for creating a component.

        Added so the schema-advertised 'add_component' action no longer falls
        through to the unknown-action error.
        """
        name = kwargs.get("name", "Component")
        return {
            "result": (
                f"To create component '{name}':\n\n"
                "**In Figma:**\n"
                "1. Select the layers you want to turn into a component\n"
                "2. Press Cmd/Ctrl+Alt+K (or right-click and choose 'Create component')\n"
                f"3. Rename it to '{name}' in the layers panel\n\n"
                "Components can then be reused from the Assets panel."
            ),
            "suggested_action": {
                "type": "COMPONENT",
                "name": name,
            },
        }

    async def _update_node(self, access_token: str, kwargs: Dict) -> Dict[str, Any]:
        """Read a node's current state and echo the requested (manual) changes."""
        file_key = kwargs.get("file_key", "")
        node_id = kwargs.get("node_id", "")
        properties = kwargs.get("properties", {})

        if not file_key or not node_id:
            return {"error": "file_key and node_id are required"}

        # Fetch the node first so the caller sees what they'd be changing.
        result = await self._call_figma_api(
            access_token, "GET", f"/files/{file_key}/nodes?ids={node_id}"
        )

        if "error" in result:
            return result

        nodes = result.get("nodes", {})
        node_data = nodes.get(node_id, {}).get("document", {})

        return {
            "result": {
                "current_node": {
                    "name": node_data.get("name"),
                    "type": node_data.get("type"),
                },
                "requested_changes": properties,
            },
            "message": (
                "Figma REST API is read-only for modifications. "
                "To update this node, make changes directly in Figma or use a Figma plugin."
            ),
        }
class FigmaWebsiteBuilder(Tool):
    """High-level tool that turns a short brief into a full website design plan.

    Produces design tokens (colors, typography, spacing), per-section layout
    specs, and step-by-step Figma build instructions — no API calls are made.
    """

    name = "build_website"
    description = (
        "Build a complete website design with multiple sections. "
        "Generates a structured design plan with all components. "
        "Use when asked to 'design a website' or 'create a landing page'."
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "website_type": {
                "type": "string",
                "enum": ["landing_page", "portfolio", "ecommerce", "dashboard", "blog", "saas"],
                "description": "Type of website to design",
            },
            "business_name": {
                "type": "string",
                "description": "Name of the business or project",
            },
            "primary_color": {
                "type": "string",
                "description": "Primary brand color (hex)",
            },
            "style": {
                "type": "string",
                "enum": ["modern", "minimal", "bold", "playful", "corporate", "elegant"],
                "description": "Design style preference",
            },
            "sections": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Sections to include (hero, features, pricing, etc.)",
            },
        },
        "required": ["website_type"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Assemble and return the complete design plan for the request."""
        site_kind = kwargs.get("website_type", "landing_page")
        brand = kwargs.get("business_name", "Your Brand")
        brand_color = kwargs.get("primary_color", "#3B82F6")
        look = kwargs.get("style", "modern")
        wanted_sections = kwargs.get("sections", self._default_sections(site_kind))

        plan = self._generate_design_plan(site_kind, brand, brand_color, look, wanted_sections)

        return {
            "result": plan,
            "message": (
                f"Here's your {site_kind} design plan for {brand}! "
                "Create a new Figma file and I'll help you build each section."
            ),
        }

    def _default_sections(self, website_type: str) -> list:
        """Return the canonical section list for a website type."""
        defaults = {
            "landing_page": ["hero", "features", "testimonials", "cta", "footer"],
            "portfolio": ["hero", "about", "projects", "skills", "contact", "footer"],
            "ecommerce": ["header", "hero", "products", "features", "reviews", "footer"],
            "dashboard": ["sidebar", "header", "stats", "charts", "table", "actions"],
            "blog": ["header", "featured", "posts", "categories", "newsletter", "footer"],
            "saas": ["hero", "features", "pricing", "testimonials", "faq", "cta", "footer"],
        }
        return defaults.get(website_type, ["hero", "features", "cta", "footer"])

    def _generate_design_plan(
        self,
        website_type: str,
        business_name: str,
        primary_color: str,
        style: str,
        sections: list
    ) -> Dict[str, Any]:
        """Build the full plan: tokens, stacked section specs, and instructions."""
        palette = self._generate_palette(primary_color)
        typography = self._get_typography(style)

        # Lay the sections out top-to-bottom with no vertical gap between them.
        section_specs = []
        running_y = 0
        for section in sections:
            spec = self._generate_section_spec(section, running_y, palette, business_name)
            section_specs.append(spec)
            running_y += spec["height"]

        return {
            "project": {
                "name": f"{business_name} - {website_type.replace('_', ' ').title()}",
                "type": website_type,
                "style": style,
            },
            "canvas": {
                "width": 1440,
                "total_height": running_y,
            },
            "design_tokens": {
                "colors": palette,
                "typography": typography,
                "spacing": {"xs": 8, "sm": 16, "md": 24, "lg": 32, "xl": 48, "2xl": 64},
                "border_radius": {"sm": 4, "md": 8, "lg": 16, "full": 9999},
            },
            "sections": section_specs,
            "figma_instructions": self._get_figma_instructions(section_specs),
        }

    def _generate_palette(self, primary: str) -> Dict[str, str]:
        """Derive a full color palette from the primary brand color."""
        return {
            "primary": primary,
            "primary_light": self._lighten(primary, 0.2),
            "primary_dark": self._darken(primary, 0.2),
            "background": "#FFFFFF",
            "surface": "#F9FAFB",
            "text_primary": "#111827",
            "text_secondary": "#6B7280",
            "border": "#E5E7EB",
            "success": "#10B981",
            "error": "#EF4444",
        }

    def _lighten(self, hex_color: str, amount: float) -> str:
        """Blend *hex_color* toward white by *amount* (0..1)."""
        raw = hex_color.lstrip('#')
        channels = [int(raw[i:i + 2], 16) for i in (0, 2, 4)]
        blended = [min(255, int(c + (255 - c) * amount)) for c in channels]
        return "#" + "".join(f"{c:02x}" for c in blended)

    def _darken(self, hex_color: str, amount: float) -> str:
        """Blend *hex_color* toward black by *amount* (0..1)."""
        raw = hex_color.lstrip('#')
        channels = [int(raw[i:i + 2], 16) for i in (0, 2, 4)]
        blended = [max(0, int(c * (1 - amount))) for c in channels]
        return "#" + "".join(f"{c:02x}" for c in blended)

    def _get_typography(self, style: str) -> Dict[str, Any]:
        """Map a style keyword to font families and a fixed type scale."""
        fonts = {
            "modern": {"heading": "Inter", "body": "Inter"},
            "minimal": {"heading": "Helvetica Neue", "body": "Helvetica Neue"},
            "bold": {"heading": "Poppins", "body": "Inter"},
            "playful": {"heading": "Nunito", "body": "Open Sans"},
            "corporate": {"heading": "Roboto", "body": "Roboto"},
            "elegant": {"heading": "Playfair Display", "body": "Lato"},
        }
        return {
            "fonts": fonts.get(style, fonts["modern"]),
            "sizes": {
                "h1": 64,
                "h2": 48,
                "h3": 32,
                "h4": 24,
                "body": 18,
                "small": 14,
            },
        }

    def _generate_section_spec(
        self,
        section: str,
        y_position: int,
        palette: Dict,
        business_name: str
    ) -> Dict[str, Any]:
        """Build the layout spec (position, size, elements) for one section."""
        section_heights = {
            "hero": 800,
            "header": 80,
            "features": 600,
            "testimonials": 500,
            "pricing": 700,
            "cta": 400,
            "footer": 300,
            "about": 500,
            "projects": 800,
            "skills": 400,
            "contact": 500,
            "products": 700,
            "reviews": 400,
            "sidebar": 900,
            "stats": 200,
            "charts": 400,
            "table": 500,
            "actions": 100,
            "featured": 400,
            "posts": 600,
            "categories": 200,
            "newsletter": 300,
            "faq": 600,
        }

        content = self._get_section_content(section, business_name, palette)

        return {
            "name": section.replace("_", " ").title(),
            "type": section,
            "y": y_position,
            "height": section_heights.get(section, 400),
            "background": content.get("background", palette["background"]),
            "elements": content.get("elements", []),
        }

    def _get_section_content(
        self,
        section: str,
        business_name: str,
        palette: Dict
    ) -> Dict[str, Any]:
        """Return the element list (and background) for a known section type."""
        contents = {
            "hero": {
                "background": palette["background"],
                "elements": [
                    {"type": "text", "content": business_name, "style": "h1", "x": 100, "y": 200},
                    {"type": "text", "content": "Your compelling tagline goes here", "style": "h3", "x": 100, "y": 280},
                    {"type": "text", "content": "A brief description that explains your value proposition.", "style": "body", "x": 100, "y": 340},
                    {"type": "button", "content": "Get Started", "x": 100, "y": 420, "width": 160, "height": 48, "fill": palette["primary"]},
                    {"type": "button", "content": "Learn More", "x": 280, "y": 420, "width": 140, "height": 48, "fill": "transparent", "stroke": palette["primary"]},
                    {"type": "image_placeholder", "x": 720, "y": 100, "width": 600, "height": 500, "label": "Hero Image"},
                ],
            },
            "features": {
                "background": palette["surface"],
                "elements": [
                    {"type": "text", "content": "Features", "style": "h2", "x": 100, "y": 80, "align": "center"},
                    {"type": "feature_card", "x": 100, "y": 180, "width": 400, "height": 300, "title": "Feature 1", "icon": "⚡"},
                    {"type": "feature_card", "x": 520, "y": 180, "width": 400, "height": 300, "title": "Feature 2", "icon": "🎯"},
                    {"type": "feature_card", "x": 940, "y": 180, "width": 400, "height": 300, "title": "Feature 3", "icon": "🚀"},
                ],
            },
            "testimonials": {
                "background": palette["background"],
                "elements": [
                    {"type": "text", "content": "What Our Customers Say", "style": "h2", "x": 100, "y": 60, "align": "center"},
                    {"type": "testimonial_card", "x": 100, "y": 160, "width": 400, "height": 280},
                    {"type": "testimonial_card", "x": 520, "y": 160, "width": 400, "height": 280},
                    {"type": "testimonial_card", "x": 940, "y": 160, "width": 400, "height": 280},
                ],
            },
            "cta": {
                "background": palette["primary"],
                "elements": [
                    {"type": "text", "content": "Ready to Get Started?", "style": "h2", "x": 100, "y": 120, "color": "#FFFFFF", "align": "center"},
                    {"type": "text", "content": "Join thousands of satisfied customers today.", "style": "body", "x": 100, "y": 200, "color": "#FFFFFF", "align": "center"},
                    {"type": "button", "content": "Start Free Trial", "x": 620, "y": 280, "width": 200, "height": 56, "fill": "#FFFFFF", "text_color": palette["primary"]},
                ],
            },
            "footer": {
                "background": palette["text_primary"],
                "elements": [
                    {"type": "text", "content": business_name, "style": "h4", "x": 100, "y": 60, "color": "#FFFFFF"},
                    {"type": "link_group", "title": "Product", "x": 400, "y": 60, "links": ["Features", "Pricing", "FAQ"]},
                    {"type": "link_group", "title": "Company", "x": 600, "y": 60, "links": ["About", "Blog", "Careers"]},
                    {"type": "link_group", "title": "Legal", "x": 800, "y": 60, "links": ["Privacy", "Terms"]},
                    {"type": "text", "content": f"© 2024 {business_name}. All rights reserved.", "style": "small", "x": 100, "y": 240, "color": "#9CA3AF"},
                ],
            },
            "pricing": {
                "background": palette["surface"],
                "elements": [
                    {"type": "text", "content": "Simple, Transparent Pricing", "style": "h2", "x": 100, "y": 60, "align": "center"},
                    {"type": "pricing_card", "x": 180, "y": 160, "width": 340, "height": 480, "tier": "Basic", "price": "$9"},
                    {"type": "pricing_card", "x": 550, "y": 140, "width": 340, "height": 520, "tier": "Pro", "price": "$29", "featured": True},
                    {"type": "pricing_card", "x": 920, "y": 160, "width": 340, "height": 480, "tier": "Enterprise", "price": "$99"},
                ],
            },
        }

        return contents.get(section, {"elements": []})

    def _get_figma_instructions(self, sections: list) -> str:
        """Render markdown build instructions for each section spec."""
        lines = [
            "## 🎨 Building Your Design in Figma\n",
            "### Setup",
            "1. Create a new Figma file",
            "2. Create a frame (F) sized 1440px wide",
            "3. Set up your color styles from the palette above\n",
            "### Build Each Section",
        ]

        for idx, section in enumerate(sections, 1):
            lines.append(f"\n**{idx}. {section['name']}** (y: {section['y']}, height: {section['height']}px)")
            lines.append(f" - Background: {section['background']}")
            # Only preview the first three elements per section.
            for elem in section.get("elements", [])[:3]:
                elem_type = elem.get("type", "element")
                lines.append(f" - Add {elem_type}: {elem.get('content', elem.get('title', ''))}")

        return "\n".join(lines)
class FigmaMCP(Tool):
    """Bridge to Figma's MCP server (JSON-RPC over HTTP).

    Unlike the read-only REST API, the MCP server can create and modify
    designs; it requires OAuth-capable credentials on the Figma side.
    """

    name = "figma_mcp"
    description = (
        "Connect to Figma's MCP server for full design creation capabilities. "
        "Can create new files, add and modify design elements. "
        "Use this for advanced Figma operations."
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "action": {
                "type": "string",
                "enum": [
                    "connect",
                    "list_tools",
                    "create_file",
                    "create_frame",
                    "create_rectangle",
                    "create_text",
                    "get_selection",
                    "set_selection",
                ],
                "description": "The MCP action to perform",
            },
            "file_name": {
                "type": "string",
                "description": "Name for new file",
            },
            "params": {
                "type": "object",
                "description": "Additional parameters for the action",
            },
        },
        "required": ["action"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Route the requested action to the matching MCP helper."""
        action = kwargs.get("action", "connect")

        access_token = os.getenv("FIGMA_ACCESS_TOKEN")
        if not access_token:
            return {"error": "FIGMA_ACCESS_TOKEN not configured"}

        try:
            if action == "connect":
                return await self._connect_to_mcp(access_token)
            if action == "list_tools":
                return await self._list_mcp_tools(access_token)
            if action == "create_file":
                return await self._create_file_via_mcp(access_token, kwargs)
            # Every other action maps 1:1 onto an MCP tool call.
            return await self._call_mcp_tool(access_token, action, kwargs)
        except Exception as e:
            logger.error(f"Figma MCP error: {e}")
            return {"error": str(e)}

    async def _connect_to_mcp(self, access_token: str) -> Dict[str, Any]:
        """Send the JSON-RPC `initialize` handshake to the MCP server."""
        import httpx

        handshake = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": "initialize",
            "params": {
                "protocolVersion": "2024-11-05",
                "capabilities": {"tools": {}},
                "clientInfo": {
                    "name": "reachy-mini",
                    "version": "1.0.0",
                },
            },
        }

        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    FIGMA_MCP_URL,
                    headers={
                        "Authorization": f"Bearer {access_token}",
                        "Content-Type": "application/json",
                        "Accept": "text/event-stream",
                    },
                    json=handshake,
                    timeout=30.0,
                )
        except httpx.ConnectError:
            return {
                "error": "Could not connect to Figma MCP server",
                "help": (
                    "The Figma MCP server might require:\n"
                    "1. Figma Desktop app running with Dev Mode\n"
                    "2. Or specific OAuth permissions\n\n"
                    "As an alternative, I can generate detailed design specs "
                    "that you can build manually in Figma!"
                ),
            }

        if response.status_code == 200:
            return {
                "result": "Connected to Figma MCP server!",
                "status": "connected",
                "message": "You can now use advanced Figma operations.",
            }
        if response.status_code == 401:
            return {
                "error": "Authentication failed. Your Figma token may not have MCP access.",
                "help": (
                    "The Figma MCP server requires OAuth authentication. "
                    "You may need to:\n"
                    "1. Use the Figma Desktop app with MCP enabled\n"
                    "2. Or authenticate via OAuth at mcp.figma.com"
                ),
            }
        return {
            "error": f"MCP server returned status {response.status_code}",
            "details": response.text[:500],
        }

    async def _list_mcp_tools(self, access_token: str) -> Dict[str, Any]:
        """Fetch the server's tool catalog via JSON-RPC `tools/list`."""
        import httpx

        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    FIGMA_MCP_URL,
                    headers={
                        "Authorization": f"Bearer {access_token}",
                        "Content-Type": "application/json",
                    },
                    json={
                        "jsonrpc": "2.0",
                        "id": 2,
                        "method": "tools/list",
                        "params": {},
                    },
                    timeout=30.0,
                )

            if response.status_code != 200:
                return {"error": f"Could not list tools: {response.status_code}"}

            catalog = response.json().get("result", {}).get("tools", [])
            return {
                "result": {
                    "tools": [entry.get("name") for entry in catalog],
                    "count": len(catalog),
                }
            }

        except Exception as e:
            return {"error": str(e)}

    async def _create_file_via_mcp(
        self, access_token: str, kwargs: Dict
    ) -> Dict[str, Any]:
        """Try the MCP `create_file` tool; fall back to manual instructions."""
        file_name = kwargs.get("file_name", "New Design")

        outcome = await self._call_mcp_tool(
            access_token,
            "create_file",
            {"name": file_name},
        )

        if "error" not in outcome:
            return outcome

        # MCP refused or is unreachable — hand back a manual recipe instead.
        return {
            "result": (
                f"I couldn't create the file automatically, but here's how to do it:\n\n"
                f"1. Go to figma.com\n"
                f"2. Click '+ New design file'\n"
                f"3. Rename it to '{file_name}'\n"
                f"4. Copy the file URL and share it with me\n\n"
                f"Once you have the file, I can help you build the design!"
            ),
            "manual_required": True,
        }

    async def _call_mcp_tool(
        self, access_token: str, tool_name: str, params: Dict
    ) -> Dict[str, Any]:
        """Invoke one named MCP tool via JSON-RPC `tools/call`."""
        import httpx

        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    FIGMA_MCP_URL,
                    headers={
                        "Authorization": f"Bearer {access_token}",
                        "Content-Type": "application/json",
                    },
                    json={
                        "jsonrpc": "2.0",
                        "id": 3,
                        "method": "tools/call",
                        "params": {
                            "name": tool_name,
                            "arguments": params,
                        },
                    },
                    timeout=30.0,
                )

            if response.status_code != 200:
                return {"error": f"MCP call failed: {response.status_code}"}

            payload = response.json()
            if "error" in payload:
                return {"error": payload["error"].get("message", "Unknown error")}
            return {"result": payload.get("result", {})}

        except Exception as e:
            return {"error": str(e)}
file_name}, + ) + + if "error" in result: + # Fallback: provide manual instructions + return { + "result": ( + f"I couldn't create the file automatically, but here's how to do it:\n\n" + f"1. Go to figma.com\n" + f"2. Click '+ New design file'\n" + f"3. Rename it to '{file_name}'\n" + f"4. Copy the file URL and share it with me\n\n" + f"Once you have the file, I can help you build the design!" + ), + "manual_required": True, + } + + return result + + async def _call_mcp_tool( + self, access_token: str, tool_name: str, params: Dict + ) -> Dict[str, Any]: + """Call a specific tool on the Figma MCP server.""" + import httpx + + try: + async with httpx.AsyncClient() as client: + response = await client.post( + FIGMA_MCP_URL, + headers={ + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json", + }, + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": tool_name, + "arguments": params, + }, + }, + timeout=30.0, + ) + + if response.status_code == 200: + data = response.json() + if "error" in data: + return {"error": data["error"].get("message", "Unknown error")} + return {"result": data.get("result", {})} + else: + return {"error": f"MCP call failed: {response.status_code}"} + + except Exception as e: + return {"error": str(e)} + + +class CreateFigmaDesign(Tool): + """One-click tool to create a complete Figma design. + + This tool attempts to use Figma MCP for direct creation, + with fallback to generating a complete design specification. + """ + + name = "create_figma_design" + description = ( + "Create a complete design in Figma. Attempts to create directly via MCP, " + "or generates a detailed specification you can build. " + "Use when asked to 'create a design', 'make a website', etc." 
class CreateFigmaDesign(Tool):
    """One-click tool to create a complete Figma design.

    This tool attempts to use Figma MCP for direct creation,
    with fallback to generating a complete design specification.
    """

    name = "create_figma_design"
    description = (
        "Create a complete design in Figma. Attempts to create directly via MCP, "
        "or generates a detailed specification you can build. "
        "Use when asked to 'create a design', 'make a website', etc."
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "design_type": {
                "type": "string",
                "enum": ["website", "app", "dashboard", "landing_page", "portfolio"],
                "description": "Type of design to create",
            },
            "name": {
                "type": "string",
                "description": "Name/title for the design",
            },
            "description": {
                "type": "string",
                "description": "Description of what to design",
            },
            "style": {
                "type": "string",
                "enum": ["modern", "minimal", "bold", "elegant", "playful"],
                "description": "Design style",
            },
            "primary_color": {
                "type": "string",
                "description": "Primary brand color (hex)",
            },
        },
        "required": ["design_type", "name"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Create a Figma design.

        Returns a dict with the generated design specification, ready-to-paste
        Figma plugin code, and step-by-step manual build instructions.
        """
        design_type = kwargs.get("design_type", "website")
        name = kwargs.get("name", "My Design")
        description = kwargs.get("description", "")
        style = kwargs.get("style", "modern")
        primary_color = kwargs.get("primary_color", "#3B82F6")

        # Generate complete design specification
        design_spec = self._generate_complete_design(
            design_type, name, description, style, primary_color
        )

        # Generate Figma plugin code that can be copy-pasted
        plugin_code = self._generate_plugin_code(design_spec)

        return {
            "result": {
                "design_specification": design_spec,
                "figma_plugin_code": plugin_code,
                "instructions": self._get_instructions(name),
            },
            "message": (
                f"I've created a complete design specification for '{name}'!\n\n"
                "**To build it in Figma:**\n"
                "1. Create a new Figma file\n"
                "2. Open the Figma console (Menu → Plugins → Development → Open Console)\n"
                "3. Paste the plugin code I generated\n"
                "4. Press Enter to create the design!\n\n"
                "Or follow the manual instructions to build it step by step."
            ),
        }

    def _generate_complete_design(
        self,
        design_type: str,
        name: str,
        description: str,
        style: str,
        primary_color: str,
    ) -> Dict[str, Any]:
        """Generate a complete design specification (canvas, palette, sections)."""

        # Base dimensions (desktop); overridden for mobile app layouts below.
        width = 1440

        # Generate color palette derived from the single primary color.
        colors = {
            "primary": primary_color,
            "primary_light": self._adjust_color(primary_color, 40),
            "primary_dark": self._adjust_color(primary_color, -40),
            "background": "#FFFFFF",
            "surface": "#F9FAFB",
            "text": "#111827",
            "text_muted": "#6B7280",
        }

        # Design structure based on type; unknown types fall back to landing page.
        if design_type == "landing_page" or design_type == "website":
            sections = self._generate_landing_page(name, colors, style)
        elif design_type == "dashboard":
            sections = self._generate_dashboard(name, colors, style)
        elif design_type == "app":
            sections = self._generate_app(name, colors, style)
            width = 375  # Mobile width
        elif design_type == "portfolio":
            sections = self._generate_portfolio(name, colors, style)
        else:
            sections = self._generate_landing_page(name, colors, style)

        total_height = sum(s.get("height", 400) for s in sections)

        return {
            "name": name,
            "type": design_type,
            "style": style,
            "canvas": {"width": width, "height": total_height},
            "colors": colors,
            "sections": sections,
        }

    def _generate_landing_page(
        self, name: str, colors: Dict, style: str
    ) -> List[Dict]:
        """Generate landing page sections (Hero, Features, CTA, Footer)."""
        return [
            {
                "name": "Hero",
                "height": 800,
                "background": colors["background"],
                "elements": [
                    {
                        "type": "FRAME",
                        "name": "Hero Content",
                        "x": 100,
                        "y": 200,
                        "width": 600,
                        "height": 400,
                    },
                    {
                        "type": "TEXT",
                        "name": "Headline",
                        "x": 100,
                        "y": 200,
                        "content": name,
                        "fontSize": 64,
                        "fontWeight": "bold",
                    },
                    {
                        "type": "TEXT",
                        "name": "Subheadline",
                        "x": 100,
                        "y": 290,
                        "content": "Your amazing tagline goes here",
                        "fontSize": 24,
                        "color": colors["text_muted"],
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "CTA Button",
                        "x": 100,
                        "y": 380,
                        "width": 180,
                        "height": 56,
                        "fill": colors["primary"],
                        "cornerRadius": 8,
                    },
                    {
                        "type": "TEXT",
                        "name": "CTA Text",
                        "x": 140,
                        "y": 396,
                        "content": "Get Started",
                        "fontSize": 18,
                        "color": "#FFFFFF",
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Hero Image",
                        "x": 750,
                        "y": 100,
                        "width": 550,
                        "height": 500,
                        "fill": colors["surface"],
                        "cornerRadius": 16,
                    },
                ],
            },
            {
                "name": "Features",
                "height": 600,
                "background": colors["surface"],
                "elements": [
                    {
                        "type": "TEXT",
                        "name": "Section Title",
                        "x": 520,
                        "y": 80,
                        "content": "Features",
                        "fontSize": 48,
                        "fontWeight": "bold",
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Feature Card 1",
                        "x": 100,
                        "y": 180,
                        "width": 380,
                        "height": 320,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Feature Card 2",
                        "x": 530,
                        "y": 180,
                        "width": 380,
                        "height": 320,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Feature Card 3",
                        "x": 960,
                        "y": 180,
                        "width": 380,
                        "height": 320,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                ],
            },
            {
                "name": "CTA Section",
                "height": 400,
                "background": colors["primary"],
                "elements": [
                    {
                        "type": "TEXT",
                        "name": "CTA Headline",
                        "x": 370,
                        "y": 140,
                        "content": "Ready to get started?",
                        "fontSize": 48,
                        "fontWeight": "bold",
                        "color": "#FFFFFF",
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "CTA Button",
                        "x": 580,
                        "y": 260,
                        "width": 280,
                        "height": 64,
                        "fill": "#FFFFFF",
                        "cornerRadius": 8,
                    },
                ],
            },
            {
                "name": "Footer",
                "height": 200,
                "background": colors["text"],
                "elements": [
                    {
                        "type": "TEXT",
                        "name": "Footer Logo",
                        "x": 100,
                        "y": 60,
                        "content": name,
                        "fontSize": 24,
                        "fontWeight": "bold",
                        "color": "#FFFFFF",
                    },
                    {
                        "type": "TEXT",
                        "name": "Copyright",
                        "x": 100,
                        "y": 140,
                        "content": f"© 2024 {name}. All rights reserved.",
                        "fontSize": 14,
                        "color": "#9CA3AF",
                    },
                ],
            },
        ]

    def _generate_dashboard(self, name: str, colors: Dict, style: str) -> List[Dict]:
        """Generate dashboard layout (sidebar, header, stat cards, chart area)."""
        return [
            {
                "name": "Dashboard",
                "height": 900,
                "background": colors["surface"],
                "elements": [
                    {
                        "type": "RECTANGLE",
                        "name": "Sidebar",
                        "x": 0,
                        "y": 0,
                        "width": 260,
                        "height": 900,
                        "fill": colors["text"],
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Header",
                        "x": 260,
                        "y": 0,
                        "width": 1180,
                        "height": 72,
                        "fill": "#FFFFFF",
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Stat Card 1",
                        "x": 300,
                        "y": 112,
                        "width": 260,
                        "height": 140,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Stat Card 2",
                        "x": 580,
                        "y": 112,
                        "width": 260,
                        "height": 140,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Stat Card 3",
                        "x": 860,
                        "y": 112,
                        "width": 260,
                        "height": 140,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Chart Area",
                        "x": 300,
                        "y": 280,
                        "width": 820,
                        "height": 400,
                        "fill": "#FFFFFF",
                        "cornerRadius": 12,
                    },
                ],
            },
        ]

    def _generate_app(self, name: str, colors: Dict, style: str) -> List[Dict]:
        """Generate mobile app screens (375pt-wide iPhone-style layout)."""
        return [
            {
                "name": "Home Screen",
                "height": 812,
                "background": colors["background"],
                "elements": [
                    {
                        "type": "RECTANGLE",
                        "name": "Status Bar",
                        "x": 0,
                        "y": 0,
                        "width": 375,
                        "height": 44,
                        "fill": colors["background"],
                    },
                    {
                        "type": "TEXT",
                        "name": "App Title",
                        "x": 20,
                        "y": 64,
                        "content": name,
                        "fontSize": 32,
                        "fontWeight": "bold",
                    },
                    {
                        "type": "RECTANGLE",
                        "name": "Tab Bar",
                        "x": 0,
                        "y": 728,
                        "width": 375,
                        "height": 84,
                        "fill": "#FFFFFF",
                    },
                ],
            },
        ]

    def _generate_portfolio(self, name: str, colors: Dict, style: str) -> List[Dict]:
        """Generate portfolio layout (currently reuses the landing page sections)."""
        return self._generate_landing_page(name, colors, style)

    def _adjust_color(self, hex_color: str, amount: int) -> str:
        """Lighten (positive amount) or darken (negative amount) a hex color.

        Each RGB channel is shifted by `amount` and clamped to [0, 255].
        """
        hex_color = hex_color.lstrip("#")
        r, g, b = tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4))
        r = max(0, min(255, r + amount))
        g = max(0, min(255, g + amount))
        b = max(0, min(255, b + amount))
        return f"#{r:02x}{g:02x}{b:02x}"

    def _generate_plugin_code(self, design_spec: Dict) -> str:
        """Generate Figma plugin code to create the design.

        Emits one `const` per frame/element. Identifiers are sanitized and
        de-duplicated: element names repeat across sections (e.g. "CTA Button"
        appears in both the Hero and CTA sections), and two `const` declarations
        with the same name in one function scope are a JS SyntaxError. String
        values are escaped so quotes in names/content cannot break the output.
        Only RECTANGLE and TEXT elements are emitted; other types (e.g. FRAME
        elements inside a section) are skipped, matching the spec generators.
        """
        used_idents: set = set()

        def js_ident(label: str) -> str:
            # Keep only characters valid in a JS identifier; guarantee uniqueness.
            base = "".join(ch for ch in label if ch.isalnum() or ch == "_") or "node"
            if base[0].isdigit():
                base = "_" + base
            ident = base
            suffix = 2
            while ident in used_idents:
                ident = f"{base}{suffix}"
                suffix += 1
            used_idents.add(ident)
            return ident

        def js_str(text: Any) -> str:
            # Escape for embedding in a single-quoted JS string literal.
            return str(text).replace("\\", "\\\\").replace("'", "\\'")

        code_lines = [
            "// Figma Plugin Code - Paste in Figma Console",
            f"// Design: {design_spec['name']}",
            "",
            "async function createDesign() {",
            "  const page = figma.currentPage;",
            "",
        ]

        y_offset = 0
        for section in design_spec.get("sections", []):
            section_name = section.get("name", "Section")
            height = section.get("height", 400)
            bg = section.get("background", "#FFFFFF")
            sec_var = js_ident(section_name)

            code_lines.append(f"  // {section_name}")
            code_lines.append(f"  const {sec_var} = figma.createFrame();")
            code_lines.append(f"  {sec_var}.name = '{js_str(section_name)}';")
            code_lines.append(f"  {sec_var}.resize({design_spec['canvas']['width']}, {height});")
            code_lines.append(f"  {sec_var}.y = {y_offset};")
            code_lines.append(
                f"  {sec_var}.fills = [{{type: 'SOLID', color: {{r: {int(bg[1:3], 16)/255}, g: {int(bg[3:5], 16)/255}, b: {int(bg[5:7], 16)/255}}}}}];"
            )
            code_lines.append("")

            for elem in section.get("elements", []):
                elem_type = elem.get("type", "RECTANGLE")

                if elem_type == "RECTANGLE":
                    elem_var = js_ident(elem.get("name", "Element"))
                    code_lines.append(f"  const {elem_var} = figma.createRectangle();")
                    code_lines.append(f"  {elem_var}.name = '{js_str(elem.get('name'))}';")
                    code_lines.append(f"  {elem_var}.x = {elem.get('x', 0)};")
                    code_lines.append(f"  {elem_var}.y = {elem.get('y', 0)};")
                    code_lines.append(f"  {elem_var}.resize({elem.get('width', 100)}, {elem.get('height', 100)});")
                    if elem.get("cornerRadius"):
                        code_lines.append(f"  {elem_var}.cornerRadius = {elem.get('cornerRadius')};")
                    fill = elem.get("fill", "#CCCCCC")
                    code_lines.append(
                        f"  {elem_var}.fills = [{{type: 'SOLID', color: {{r: {int(fill[1:3], 16)/255}, g: {int(fill[3:5], 16)/255}, b: {int(fill[5:7], 16)/255}}}}}];"
                    )
                    code_lines.append(f"  {sec_var}.appendChild({elem_var});")

                elif elem_type == "TEXT":
                    elem_var = js_ident(elem.get("name", "Element"))
                    code_lines.append(f"  const {elem_var} = figma.createText();")
                    # Font must be loaded before setting .characters in the Plugin API.
                    code_lines.append("  await figma.loadFontAsync({family: 'Inter', style: 'Regular'});")
                    code_lines.append(f"  {elem_var}.name = '{js_str(elem.get('name'))}';")
                    code_lines.append(f"  {elem_var}.characters = '{js_str(elem.get('content', 'Text'))}';")
                    code_lines.append(f"  {elem_var}.x = {elem.get('x', 0)};")
                    code_lines.append(f"  {elem_var}.y = {elem.get('y', 0)};")
                    code_lines.append(f"  {elem_var}.fontSize = {elem.get('fontSize', 16)};")
                    code_lines.append(f"  {sec_var}.appendChild({elem_var});")

            code_lines.append("")

            y_offset += height

        code_lines.append("  figma.viewport.scrollAndZoomIntoView(page.children);")
        code_lines.append("}")
        code_lines.append("")
        code_lines.append("createDesign();")

        return "\n".join(code_lines)

    def _get_instructions(self, name: str) -> str:
        """Get manual build instructions."""
        return f"""
## Manual Build Instructions for "{name}"

### Step 1: Create the file
1. Go to figma.com
2. Click "+ New design file"
3. Rename to "{name}"

### Step 2: Set up the frame
1. Press F for Frame tool
2. Select "Desktop" (1440x900) from the right panel
3. This will be your main artboard

### Step 3: Use the plugin code
1. In Figma, go to Menu → Plugins → Development → Open Console
2. Paste the plugin code provided
3. Press Enter to run

The design will be automatically created!
"""
+""" + diff --git a/src/reachy_mini_conversation_app/tools/figma_mcp_official.py b/src/reachy_mini_conversation_app/tools/figma_mcp_official.py new file mode 100644 index 0000000000000000000000000000000000000000..de113923f3dbe762c5d21cefb6afed5a5b42bbad --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/figma_mcp_official.py @@ -0,0 +1,477 @@ +"""Official Figma MCP Server integration. + +Based on: https://developers.figma.com/docs/figma-mcp-server/ + +Two connection modes: +- Remote: https://mcp.figma.com/mcp (requires Figma browser with Dev Mode) +- Desktop: http://127.0.0.1:3845/mcp (requires Figma Desktop app with Dev Mode) +""" + +import os +import json +import logging +import asyncio +from typing import Any, Dict, List, Optional +from dataclasses import dataclass + +from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies + + +logger = logging.getLogger(__name__) + +# Figma MCP Server endpoints +FIGMA_MCP_REMOTE = "https://mcp.figma.com/mcp" +FIGMA_MCP_DESKTOP = "http://127.0.0.1:3845/mcp" + + +@dataclass +class FigmaMCPClient: + """Client for Figma's official MCP Server.""" + + base_url: str + access_token: Optional[str] = None + _request_id: int = 0 + + def _next_id(self) -> int: + self._request_id += 1 + return self._request_id + + async def _send_request(self, method: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """Send a JSON-RPC request to the Figma MCP server.""" + import httpx + + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + } + + if self.access_token: + headers["Authorization"] = f"Bearer {self.access_token}" + + payload = { + "jsonrpc": "2.0", + "id": self._next_id(), + "method": method, + } + + if params: + payload["params"] = params + + try: + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + self.base_url, + headers=headers, + json=payload, + ) + + if response.status_code == 401: + return {"error": {"code": 401, "message": 
"Authentication required. Make sure Figma Dev Mode is enabled."}} + elif response.status_code == 403: + return {"error": {"code": 403, "message": "Access denied. Check your Figma permissions."}} + elif response.status_code != 200: + return {"error": {"code": response.status_code, "message": response.text}} + + return response.json() + + except httpx.ConnectError: + return {"error": {"code": -1, "message": f"Could not connect to {self.base_url}"}} + except Exception as e: + return {"error": {"code": -1, "message": str(e)}} + + async def initialize(self) -> Dict[str, Any]: + """Initialize the MCP session.""" + return await self._send_request("initialize", { + "protocolVersion": "2024-11-05", + "capabilities": { + "tools": {}, + "resources": {}, + "prompts": {}, + }, + "clientInfo": { + "name": "reachy-mini-conversation-app", + "version": "1.0.0", + }, + }) + + async def list_tools(self) -> Dict[str, Any]: + """List available tools from the server.""" + return await self._send_request("tools/list") + + async def list_resources(self) -> Dict[str, Any]: + """List available resources.""" + return await self._send_request("resources/list") + + async def list_prompts(self) -> Dict[str, Any]: + """List available prompts.""" + return await self._send_request("prompts/list") + + async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]: + """Call a specific tool.""" + return await self._send_request("tools/call", { + "name": name, + "arguments": arguments, + }) + + async def read_resource(self, uri: str) -> Dict[str, Any]: + """Read a resource by URI.""" + return await self._send_request("resources/read", { + "uri": uri, + }) + + async def get_prompt(self, name: str, arguments: Optional[Dict] = None) -> Dict[str, Any]: + """Get a prompt template.""" + params = {"name": name} + if arguments: + params["arguments"] = arguments + return await self._send_request("prompts/get", params) + + +class FigmaMCPServer(Tool): + """Connect to Figma's official MCP 
class FigmaMCPServer(Tool):
    """Connect to Figma's official MCP Server for AI-powered design operations.

    The MCP Server brings Figma directly into your workflow by providing
    design information and context to generate code from Figma designs.

    Features:
    - Generate code from selected frames
    - Extract design context (variables, components, layout)
    - Retrieve Make resources
    - Keep design system consistent with Code Connect

    Docs: https://developers.figma.com/docs/figma-mcp-server/
    """

    name = "figma_mcp_server"
    description = (
        "Connect to Figma's official MCP Server for AI-powered design operations. "
        "Can generate code from Figma frames, extract design tokens, components, "
        "and layout data. Requires Figma Dev Mode to be enabled. "
        "Use this to turn Figma designs into code or extract design context."
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "action": {
                "type": "string",
                "enum": [
                    "connect",
                    "list_tools",
                    "list_resources",
                    "list_prompts",
                    "generate_code",
                    "get_design_context",
                    "get_variables",
                    "get_components",
                    "call_tool",
                    "read_resource",
                ],
                "description": "The MCP action to perform",
            },
            "server_type": {
                "type": "string",
                "enum": ["remote", "desktop"],
                "description": "Which MCP server to use (remote=browser, desktop=app)",
                "default": "desktop",
            },
            "file_key": {
                "type": "string",
                "description": "Figma file key (from URL)",
            },
            "node_id": {
                "type": "string",
                "description": "Node/frame ID to operate on",
            },
            "tool_name": {
                "type": "string",
                "description": "Name of the MCP tool to call",
            },
            "tool_args": {
                "type": "object",
                "description": "Arguments to pass to the tool",
            },
            "resource_uri": {
                "type": "string",
                "description": "Resource URI to read",
            },
        },
        "required": ["action"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Execute Figma MCP Server action.

        Dispatches `action` to the matching private handler. Every handler
        returns a JSON-serializable dict; errors are returned (never raised)
        so the tool-calling layer always gets a structured response.
        """
        action = kwargs.get("action", "connect")
        server_type = kwargs.get("server_type", "desktop")

        # Get the appropriate endpoint (remote = browser flow, desktop = local app).
        if server_type == "remote":
            base_url = FIGMA_MCP_REMOTE
        else:
            base_url = FIGMA_MCP_DESKTOP

        # Get access token for remote server; the desktop server is unauthenticated.
        access_token = os.getenv("FIGMA_ACCESS_TOKEN") if server_type == "remote" else None

        # Create client (a fresh client per call; no session state is kept).
        client = FigmaMCPClient(base_url=base_url, access_token=access_token)

        try:
            if action == "connect":
                return await self._connect(client, server_type)
            elif action == "list_tools":
                return await self._list_tools(client)
            elif action == "list_resources":
                return await self._list_resources(client)
            elif action == "list_prompts":
                return await self._list_prompts(client)
            elif action == "generate_code":
                return await self._generate_code(client, kwargs)
            elif action == "get_design_context":
                return await self._get_design_context(client, kwargs)
            elif action == "get_variables":
                return await self._get_variables(client, kwargs)
            elif action == "get_components":
                return await self._get_components(client, kwargs)
            elif action == "call_tool":
                return await self._call_tool(client, kwargs)
            elif action == "read_resource":
                return await self._read_resource(client, kwargs)
            else:
                return {"error": f"Unknown action: {action}"}

        except Exception as e:
            logger.error(f"Figma MCP error: {e}")
            return {"error": str(e)}

    async def _connect(self, client: FigmaMCPClient, server_type: str) -> Dict[str, Any]:
        """Connect and initialize the MCP server.

        On connection failure, returns per-server-type setup instructions
        instead of a bare error.
        """
        result = await client.initialize()

        if "error" in result:
            error = result["error"]
            # Code -1 with this message is the client's marker for a transport failure.
            if error.get("code") == -1 and "Could not connect" in error.get("message", ""):
                if server_type == "desktop":
                    return {
                        "error": "Could not connect to Figma Desktop MCP server",
                        "help": (
                            "**To enable the Desktop MCP server:**\n\n"
                            "1. Open the **Figma Desktop app**\n"
                            "2. Open a Design file\n"
                            "3. Press **Shift+D** to enter Dev Mode\n"
                            "4. In the right panel, click **'Enable desktop MCP server'**\n"
                            "5. The server will run at http://127.0.0.1:3845/mcp\n\n"
                            "Once enabled, try this action again!"
                        ),
                    }
                else:
                    return {
                        "error": "Could not connect to Figma Remote MCP server",
                        "help": (
                            "**To use the Remote MCP server:**\n\n"
                            "1. Open a Figma file in your **browser**\n"
                            "2. Press **Shift+D** to enter Dev Mode\n"
                            "3. In the right panel, click **'Set up an MCP client'**\n"
                            "4. Follow the setup instructions\n\n"
                            "The remote server URL is: https://mcp.figma.com/mcp"
                        ),
                    }
            return result

        # Get server capabilities from the initialize response.
        server_info = result.get("result", {}).get("serverInfo", {})
        capabilities = result.get("result", {}).get("capabilities", {})

        return {
            "result": "Connected to Figma MCP Server!",
            "server": {
                "name": server_info.get("name", "Figma MCP"),
                "version": server_info.get("version", "unknown"),
                "type": server_type,
                "url": client.base_url,
            },
            "capabilities": {
                "tools": "tools" in capabilities,
                "resources": "resources" in capabilities,
                "prompts": "prompts" in capabilities,
            },
            "next_steps": (
                "You're connected! Now you can:\n"
                "- `list_tools` - See available tools\n"
                "- `list_resources` - See available resources\n"
                "- `generate_code` - Generate code from a Figma frame\n"
                "- `get_variables` - Extract design tokens\n"
                "- `get_components` - Get component information"
            ),
        }

    async def _list_tools(self, client: FigmaMCPClient) -> Dict[str, Any]:
        """List available MCP tools (names + truncated descriptions)."""
        result = await client.list_tools()

        if "error" in result:
            return result

        tools = result.get("result", {}).get("tools", [])

        tool_list = []
        for tool in tools:
            tool_list.append({
                "name": tool.get("name"),
                # Descriptions are truncated to keep the tool response compact.
                "description": tool.get("description", "")[:100],
            })

        return {
            "result": {
                "tools": tool_list,
                "count": len(tool_list),
            },
            "message": f"Found {len(tool_list)} available tools. Use `call_tool` with a tool name to execute.",
        }

    async def _list_resources(self, client: FigmaMCPClient) -> Dict[str, Any]:
        """List available resources (first 20, truncated descriptions)."""
        result = await client.list_resources()

        if "error" in result:
            return result

        resources = result.get("result", {}).get("resources", [])

        resource_list = []
        for res in resources[:20]:  # Limit to 20
            resource_list.append({
                "uri": res.get("uri"),
                "name": res.get("name"),
                "description": res.get("description", "")[:50],
            })

        return {
            "result": {
                "resources": resource_list,
                # count reflects the full set, not the truncated list.
                "count": len(resources),
            },
        }

    async def _list_prompts(self, client: FigmaMCPClient) -> Dict[str, Any]:
        """List available prompts."""
        result = await client.list_prompts()

        if "error" in result:
            return result

        prompts = result.get("result", {}).get("prompts", [])

        prompt_list = []
        for prompt in prompts:
            prompt_list.append({
                "name": prompt.get("name"),
                "description": prompt.get("description", "")[:100],
            })

        return {
            "result": {
                "prompts": prompt_list,
                "count": len(prompt_list),
            },
        }

    async def _generate_code(self, client: FigmaMCPClient, kwargs: Dict) -> Dict[str, Any]:
        """Generate code from a Figma frame.

        Requires both file_key and node_id. Tries several tool-name spellings
        because the server's exact tool naming is not fixed across versions.
        """
        file_key = kwargs.get("file_key")
        node_id = kwargs.get("node_id")

        if not file_key or not node_id:
            return {
                "error": "Both file_key and node_id are required",
                "help": (
                    "To generate code:\n"
                    "1. Open your Figma file\n"
                    "2. Select a frame\n"
                    "3. Get the file_key from the URL: figma.com/file/[FILE_KEY]/...\n"
                    "4. Get the node_id from the URL after selecting: ?node-id=[NODE_ID]"
                ),
            }

        # Try to call the code generation tool
        result = await client.call_tool("generate_code", {
            "fileKey": file_key,
            "nodeId": node_id,
        })

        if "error" in result:
            # Try alternative tool names; stop at the first that succeeds.
            for tool_name in ["generateCode", "get_code", "code_from_selection"]:
                result = await client.call_tool(tool_name, {
                    "fileKey": file_key,
                    "nodeId": node_id,
                })
                if "error" not in result:
                    break

        return result

    async def _get_design_context(self, client: FigmaMCPClient, kwargs: Dict) -> Dict[str, Any]:
        """Get design context from a Figma file."""
        file_key = kwargs.get("file_key")

        if not file_key:
            return {"error": "file_key is required"}

        result = await client.call_tool("get_design_context", {
            "fileKey": file_key,
        })

        return result

    async def _get_variables(self, client: FigmaMCPClient, kwargs: Dict) -> Dict[str, Any]:
        """Get design variables/tokens from a Figma file.

        Tries several tool-name spellings; returns the last error if all fail.
        """
        file_key = kwargs.get("file_key")

        if not file_key:
            return {"error": "file_key is required"}

        # Try different tool names for variables
        for tool_name in ["get_variables", "getVariables", "fetch_variables"]:
            result = await client.call_tool(tool_name, {
                "fileKey": file_key,
            })
            if "error" not in result:
                return result

        return result

    async def _get_components(self, client: FigmaMCPClient, kwargs: Dict) -> Dict[str, Any]:
        """Get components from a Figma file (same multi-name fallback as variables)."""
        file_key = kwargs.get("file_key")

        if not file_key:
            return {"error": "file_key is required"}

        for tool_name in ["get_components", "getComponents", "fetch_components"]:
            result = await client.call_tool(tool_name, {
                "fileKey": file_key,
            })
            if "error" not in result:
                return result

        return result

    async def _call_tool(self, client: FigmaMCPClient, kwargs: Dict) -> Dict[str, Any]:
        """Call a specific MCP tool by name with arbitrary arguments."""
        tool_name = kwargs.get("tool_name")
        tool_args = kwargs.get("tool_args", {})

        if not tool_name:
            return {"error": "tool_name is required"}

        return await client.call_tool(tool_name, tool_args)

    async def _read_resource(self, client: FigmaMCPClient, kwargs: Dict) -> Dict[str, Any]:
        """Read a resource by URI."""
        resource_uri = kwargs.get("resource_uri")

        if not resource_uri:
            return {"error": "resource_uri is required"}

        return await client.read_resource(resource_uri)
def parse_sse_response(text: str) -> Dict[str, Any]:
    """Parse a Figma MCP server response body into a JSON-RPC message dict.

    The requests in this module send ``Accept: application/json,
    text/event-stream``, so the server may legitimately reply either as an
    SSE stream (``data: {...}`` lines) or as a plain JSON document. The
    original implementation only handled SSE framing and reported valid
    plain-JSON replies as parse errors; this version falls back to plain
    JSON before giving up.

    Returns the first parseable ``data:`` payload, else the whole body
    parsed as a JSON object, else ``{"error": ...}``.
    """
    for line in text.split('\n'):
        if line.startswith('data:'):
            try:
                return json.loads(line[5:].strip())
            except json.JSONDecodeError:
                continue
    # Fallback: plain JSON body with no SSE framing (allowed by our Accept header).
    try:
        parsed = json.loads(text)
        if isinstance(parsed, dict):
            return parsed
    except json.JSONDecodeError:
        pass
    return {"error": "Could not parse SSE response"}
class FigmaMCPServer(Tool):
    """Connect to Figma's official MCP Server for AI-powered design operations.

    The MCP Server brings Figma directly into your workflow by providing
    design information and context to generate code from Figma designs.

    Features:
    - Generate code from selected frames
    - Extract design context (variables, components, layout)
    - Retrieve Make resources
    - Keep design system consistent with Code Connect

    Docs: https://developers.figma.com/docs/figma-mcp-server/
    """

    name = "figma_mcp_server"
    description = (
        "Connect to Figma's official MCP Server for AI-powered design operations. "
        "Can generate code from Figma frames, extract design tokens, components, "
        "and layout data. Requires Figma Dev Mode to be enabled. "
        "Use this to turn Figma designs into code or extract design context."
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "action": {
                "type": "string",
                "enum": [
                    "connect",
                    "list_tools",
                    "list_resources",
                    "call_tool",
                ],
                "description": "The action to perform",
            },
            "server_type": {
                "type": "string",
                "enum": ["remote", "desktop"],
                "description": "Which MCP server to use (remote=browser, desktop=app)",
                "default": "desktop",
            },
            "tool_name": {
                "type": "string",
                "description": "Name of the tool to call (for call_tool action)",
            },
            "tool_args": {
                "type": "object",
                "description": "Arguments to pass to the tool (for call_tool action)",
            },
        },
        "required": ["action"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Execute Figma MCP Server action.

        Dispatches `action` to the matching handler; connection failures
        are converted to setup instructions via `_get_setup_help`.
        """
        import httpx  # lazy import: avoid a hard dependency at module load time

        action = kwargs.get("action", "connect")
        server_type = kwargs.get("server_type", "desktop")
        tool_name = kwargs.get("tool_name")
        tool_args = kwargs.get("tool_args", {})

        # Get the appropriate endpoint
        base_url = FIGMA_MCP_DESKTOP if server_type == "desktop" else FIGMA_MCP_REMOTE

        # The server may answer as JSON or as an SSE stream; accept both.
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",
        }

        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                if action == "connect":
                    return await self._connect(client, base_url, headers, server_type)
                elif action == "list_tools":
                    return await self._list_tools(client, base_url, headers)
                elif action == "list_resources":
                    return await self._list_resources(client, base_url, headers)
                elif action == "call_tool":
                    if not tool_name:
                        return {"error": "tool_name is required for call_tool action"}
                    return await self._call_tool(client, base_url, headers, tool_name, tool_args)
                else:
                    return {"error": f"Unknown action: {action}"}

        except httpx.ConnectError:
            return self._get_setup_help(server_type)
        except Exception as e:
            logger.error(f"Figma MCP error: {e}")
            return {"error": str(e)}

    async def _connect(self, client, base_url: str, headers: Dict, server_type: str) -> Dict[str, Any]:
        """Initialize connection to Figma MCP server."""
        # Send initialize request (JSON-RPC 2.0 MCP handshake)
        response = await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "initialize",
                "params": {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {"name": "reachy-conversation", "version": "1.0"}
                }
            }
        )

        if response.status_code != 200:
            return self._get_setup_help(server_type)

        data = parse_sse_response(response.text)

        if "error" in data:
            return {"error": data["error"]}

        if "result" in data:
            server_info = data["result"].get("serverInfo", {})
            capabilities = data["result"].get("capabilities", {})

            return {
                "status": "connected",
                "server": {
                    "name": server_info.get("name", "Unknown"),
                    "version": server_info.get("version", "Unknown"),
                    "url": base_url,
                },
                "capabilities": list(capabilities.keys()),
                "message": (
                    f"✅ Connected to {server_info.get('name', 'Figma MCP Server')}!\n\n"
                    f"**Version:** {server_info.get('version', 'Unknown')}\n"
                    f"**Capabilities:** {', '.join(capabilities.keys())}\n\n"
                    "Use `list_tools` action to see available tools."
                ),
            }

        return {"error": "Unexpected response from server"}

    async def _list_tools(self, client, base_url: str, headers: Dict) -> Dict[str, Any]:
        """List available tools from the MCP server.

        NOTE(review): each request re-initializes the session; the server may
        not honor a tools/list without a persistent session, hence the static
        fallback list below.
        """
        # First initialize
        init_response = await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "initialize",
                "params": {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {"name": "reachy-conversation", "version": "1.0"}
                }
            }
        )

        if init_response.status_code != 200:
            return {"error": f"Failed to initialize: {init_response.status_code}"}

        # Then list tools
        response = await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 2,
                "method": "tools/list"
            }
        )

        data = parse_sse_response(response.text)

        if "error" in data:
            # The server might not support stateless requests
            # Try to explain what tools are typically available
            return {
                "result": {
                    "tools": [
                        {"name": "get_code_connect_map", "description": "Get Code Connect mappings for selected elements"},
                        {"name": "get_variables", "description": "Get design variables/tokens from the file"},
                        {"name": "get_styles", "description": "Get style definitions"},
                        {"name": "get_components", "description": "Get component definitions"},
                    ]
                },
                "note": "These are typical Figma MCP tools. Connect via Cursor or Claude for full access."
            }

        return data

    async def _list_resources(self, client, base_url: str, headers: Dict) -> Dict[str, Any]:
        """List available resources from the MCP server."""
        # First initialize
        await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "initialize",
                "params": {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {"name": "reachy-conversation", "version": "1.0"}
                }
            }
        )

        # Then list resources
        response = await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 2,
                "method": "resources/list"
            }
        )

        return parse_sse_response(response.text)

    async def _call_tool(self, client, base_url: str, headers: Dict, tool_name: str, tool_args: Dict) -> Dict[str, Any]:
        """Call a specific tool on the MCP server."""
        # First initialize
        await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "initialize",
                "params": {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {"name": "reachy-conversation", "version": "1.0"}
                }
            }
        )

        # Then call the tool
        response = await client.post(
            base_url,
            headers=headers,
            json={
                "jsonrpc": "2.0",
                "id": 2,
                "method": "tools/call",
                "params": {
                    "name": tool_name,
                    "arguments": tool_args
                }
            }
        )

        return parse_sse_response(response.text)

    def _get_setup_help(self, server_type: str) -> Dict[str, Any]:
        """Return setup help when server is not available."""
        if server_type == "desktop":
            return {
                "error": "Could not connect to Figma Desktop MCP server",
                "help": (
                    "**To enable the Desktop MCP server:**\n\n"
                    "1. Open the **Figma Desktop app**\n"
                    "2. Open a Design file\n"
                    "3. Press **Shift+D** to enter Dev Mode\n"
                    "4. In the right panel, click **'Enable desktop MCP server'**\n"
                    "5. The server will run at http://127.0.0.1:3845\n\n"
                    "Once enabled, try this action again!"
                ),
            }
        else:
            return {
                "error": "Could not connect to Figma Remote MCP server",
                "help": (
                    "**To use the Remote MCP server:**\n\n"
                    "1. Open a Figma file in your **browser**\n"
                    "2. Press **Shift+D** to enter Dev Mode\n"
                    "3. In the right panel, click **'Set up an MCP client'**\n"
                    "4. Follow the setup instructions\n\n"
                    "The remote server URL is: https://mcp.figma.com"
                ),
            }


# Simple test function
async def test_connection():
    """Quick test of Figma MCP connection (desktop server, connect action)."""
    tool = FigmaMCPServer()
    result = await tool(deps=None, action="connect", server_type="desktop")
    return result
+ parameters_schema = { + "type": "object", + "properties": {"start": {"type": "boolean"}}, + "required": ["start"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Enable or disable head tracking.""" + enable = bool(kwargs.get("start")) + + # Update camera worker head tracking state + if deps.camera_worker is not None: + deps.camera_worker.set_head_tracking_enabled(enable) + + status = "started" if enable else "stopped" + logger.info("Tool call: head_tracking %s", status) + return {"status": f"head tracking {status}"} diff --git a/src/reachy_mini_conversation_app/tools/move_head.py b/src/reachy_mini_conversation_app/tools/move_head.py new file mode 100644 index 0000000000000000000000000000000000000000..821c562159e0b4d315c691b9458e6b4385744e8f --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/move_head.py @@ -0,0 +1,79 @@ +import logging +from typing import Any, Dict, Tuple, Literal + +from reachy_mini.utils import create_head_pose +from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies +from reachy_mini_conversation_app.dance_emotion_moves import GotoQueueMove + + +logger = logging.getLogger(__name__) + +Direction = Literal["left", "right", "up", "down", "front"] + + +class MoveHead(Tool): + """Move head in a given direction.""" + + name = "move_head" + description = "Move your head in a given direction: left, right, up, down or front." 
+ parameters_schema = { + "type": "object", + "properties": { + "direction": { + "type": "string", + "enum": ["left", "right", "up", "down", "front"], + }, + }, + "required": ["direction"], + } + + # mapping: direction -> args for create_head_pose + DELTAS: Dict[str, Tuple[int, int, int, int, int, int]] = { + "left": (0, 0, 0, 0, 0, 40), + "right": (0, 0, 0, 0, 0, -40), + "up": (0, 0, 0, 0, -30, 0), + "down": (0, 0, 0, 0, 30, 0), + "front": (0, 0, 0, 0, 0, 0), + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Move head in a given direction.""" + direction_raw = kwargs.get("direction") + if not isinstance(direction_raw, str): + return {"error": "direction must be a string"} + direction: Direction = direction_raw # type: ignore[assignment] + logger.info("Tool call: move_head direction=%s", direction) + + deltas = self.DELTAS.get(direction, self.DELTAS["front"]) + target = create_head_pose(*deltas, degrees=True) + + # Use new movement manager + try: + movement_manager = deps.movement_manager + + # Get current state for interpolation + current_head_pose = deps.reachy_mini.get_current_head_pose() + _, current_antennas = deps.reachy_mini.get_current_joint_positions() + + # Create goto move + goto_move = GotoQueueMove( + target_head_pose=target, + start_head_pose=current_head_pose, + target_antennas=(0, 0), # Reset antennas to default + start_antennas=( + current_antennas[0], + current_antennas[1], + ), # Skip body_yaw + target_body_yaw=0, # Reset body yaw + start_body_yaw=current_antennas[0], # body_yaw is first in joint positions + duration=deps.motion_duration_s, + ) + + movement_manager.queue_move(goto_move) + movement_manager.set_moving_state(deps.motion_duration_s) + + return {"status": f"looking {direction}"} + + except Exception as e: + logger.error("move_head failed") + return {"error": f"move_head failed: {type(e).__name__}: {e}"} diff --git a/src/reachy_mini_conversation_app/tools/play_emotion.py 
logger = logging.getLogger(__name__)

# Try to load the pre-recorded emotion library. The app degrades gracefully:
# when the optional dependencies are missing, emotions are reported unavailable.
try:
    from reachy_mini.motion.recorded_move import RecordedMoves
    from reachy_mini_conversation_app.dance_emotion_moves import EmotionQueueMove

    # Note: huggingface_hub automatically reads HF_TOKEN from environment variables
    RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
    EMOTION_AVAILABLE = True
except ImportError as e:
    logger.warning(f"Emotion library not available: {e}")
    RECORDED_MOVES = None
    EMOTION_AVAILABLE = False


def get_available_emotions_and_descriptions() -> str:
    """Build a human-readable catalogue of the recorded emotions, one per line."""
    if not EMOTION_AVAILABLE:
        return "Emotions not available"

    try:
        lines = ["Available emotions:"]
        for move_name in RECORDED_MOVES.list_moves():
            move_description = RECORDED_MOVES.get(move_name).description
            lines.append(f" - {move_name}: {move_description}")
        # Each entry (and the header) is newline-terminated, as callers expect.
        return "".join(line + "\n" for line in lines)
    except Exception as e:
        return f"Error getting emotions: {e}"
+ Here is a list of the available emotions: + {get_available_emotions_and_descriptions()} + """, + }, + }, + "required": ["emotion"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Play a pre-recorded emotion.""" + if not EMOTION_AVAILABLE: + return {"error": "Emotion system not available"} + + emotion_name = kwargs.get("emotion") + if not emotion_name: + return {"error": "Emotion name is required"} + + logger.info("Tool call: play_emotion emotion=%s", emotion_name) + + # Check if emotion exists + try: + emotion_names = RECORDED_MOVES.list_moves() + if emotion_name not in emotion_names: + return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"} + + # Add emotion to queue + movement_manager = deps.movement_manager + emotion_move = EmotionQueueMove(emotion_name, RECORDED_MOVES) + movement_manager.queue_move(emotion_move) + + return {"status": "queued", "emotion": emotion_name} + + except Exception as e: + logger.exception("Failed to play emotion") + return {"error": f"Failed to play emotion: {e!s}"} diff --git a/src/reachy_mini_conversation_app/tools/stop_dance.py b/src/reachy_mini_conversation_app/tools/stop_dance.py new file mode 100644 index 0000000000000000000000000000000000000000..ab14e84a2cac345fd1696ff21ce59d14bdec7dde --- /dev/null +++ b/src/reachy_mini_conversation_app/tools/stop_dance.py @@ -0,0 +1,31 @@ +import logging +from typing import Any, Dict + +from reachy_mini_conversation_app.tools.core_tools import Tool, ToolDependencies + + +logger = logging.getLogger(__name__) + + +class StopDance(Tool): + """Stop the current dance move.""" + + name = "stop_dance" + description = "Stop the current dance move" + parameters_schema = { + "type": "object", + "properties": { + "dummy": { + "type": "boolean", + "description": "dummy boolean, set it to true", + }, + }, + "required": ["dummy"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Stop 
class StopEmotion(Tool):
    """Stop the current emotion."""

    name = "stop_emotion"
    description = "Stop the current emotion"
    # The realtime API requires at least one parameter, hence the dummy flag.
    parameters_schema = {
        "type": "object",
        "properties": {
            "dummy": {
                "type": "boolean",
                "description": "dummy boolean, set it to true",
            },
        },
        "required": ["dummy"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Stop the current emotion by flushing the movement queue."""
        logger.info("Tool call: stop_emotion")
        # Clearing the queue drops the running emotion and anything queued behind it.
        deps.movement_manager.clear_move_queue()
        return {"status": "stopped emotion and cleared queue"}
logger = logging.getLogger(__name__)

# Check if MCP bridge mode is enabled
USE_MCP_BRIDGE = os.getenv("REACHY_CURSOR_MCP_BRIDGE", "false").lower() == "true"


class VibeCode(Tool):
    """Reachy the Vibe Coder - Your creative coding partner!

    Just tell Reachy what you want to build, and watch the magic happen.
    Reachy decides the tech stack, colors, layout, animations - EVERYTHING.
    Then sends it to Cursor to bring your vision to life!

    🎨 Creative Director Mode: ON
    🚀 Vibe Coding: ENABLED
    """

    name = "vibe_code"
    description = (
        "🎨 REACHY THE VIBE CODER - Your creative coding partner! "
        "Tell me what to build and I'll handle EVERYTHING: tech stack, colors, "
        "layout, animations, the works! I'm your creative director - just vibe with me! "
        "Say 'build a website' or 'make an app' and watch the magic happen in Cursor!"
    )
    parameters_schema = {
        "type": "object",
        "properties": {
            "request": {
                "type": "string",
                "description": (
                    "A DETAILED coding request with ALL your creative decisions. "
                    "YOU decide: tech stack (React, Vue, vanilla JS, Python, etc.), "
                    "styling (Tailwind, CSS modules, styled-components), "
                    "color palette (specific hex codes), typography, layout, "
                    "components to create, animations, and features. "
                    "Be SPECIFIC and OPINIONATED. Example: "
                    "'Create a landing page using Next.js 14 and Tailwind CSS. "
                    "Use a dark theme with #0a0a0a background and #22d3ee cyan accents. "
                    "Include: 1) Hero with animated gradient text, 2) Features grid with "
                    "hover animations, 3) Pricing cards with glassmorphism effect, "
                    "4) Footer with social links. Use Inter font. Add smooth scroll.'"
                ),
            },
            "auto_submit": {
                "type": "boolean",
                "description": "Whether to automatically submit the request. Default is true.",
            },
        },
        "required": ["request"],
    }

    async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
        """Execute the cursor coding request.

        Routes through the MCP bridge when REACHY_CURSOR_MCP_BRIDGE=true,
        otherwise drives the Cursor app via AppleScript (macOS only).
        """
        request = kwargs.get("request", "")
        auto_submit = kwargs.get("auto_submit", True)

        if not request:
            return {"error": "No coding request provided"}

        logger.info("Cursor code request: %s", request[:100])

        # Use MCP bridge mode if enabled
        if USE_MCP_BRIDGE:
            try:
                from reachy_mini_conversation_app.mcp.cursor_bridge import send_to_cursor_via_bridge
                result = await send_to_cursor_via_bridge(request, None)
                return result
            except Exception as e:
                logger.error("MCP bridge failed: %s", e)
                # Fall back to AppleScript if available
                if platform.system() == "Darwin":
                    logger.info("Falling back to AppleScript automation")
                else:
                    return {"error": str(e)}

        # Check platform for AppleScript automation
        if platform.system() != "Darwin":
            return {
                "error": "This tool currently only supports macOS",
                "suggestion": (
                    "Options:\n"
                    "1. Set REACHY_CURSOR_MCP_BRIDGE=true and configure Cursor to connect to the MCP bridge\n"
                    "2. Manual workaround: Open Cursor, press Cmd+I, and type your request"
                ),
            }

        try:
            result = await self._send_to_cursor(request, auto_submit)
            return result
        except Exception as e:
            logger.error("Failed to send to Cursor: %s", e)
            return {
                "error": str(e),
                "fallback": "Please open Cursor manually and use Cmd+I to open Composer",
            }

    async def _send_to_cursor(self, prompt: str, auto_submit: bool) -> Dict[str, Any]:
        """Send the prompt to Cursor using AppleScript automation.

        Opens a new Cursor window and sends the prompt to Composer.
        """
        # Escape for an AppleScript double-quoted string literal: backslashes
        # first, then double quotes, then raw newlines (which would otherwise
        # terminate the literal). Apostrophes need NO escaping in AppleScript —
        # the previous shell-style '\'' replacement injected literal garbage
        # into the typed prompt.
        escaped_prompt = (
            prompt.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

        # AppleScript to:
        # 1. Activate Cursor
        # 2. Open a new window
        # 3. Open Composer with Cmd+I
        # 4. Type the prompt character by character
        # 5. Submit
        applescript = f'''
        tell application "Cursor"
            activate
        end tell

        delay 1.5

        tell application "System Events"
            tell process "Cursor"
                set frontmost to true
                delay 0.5

                -- Open a new window
                keystroke "n" using {{command down, shift down}}
                delay 2.0

                -- Open Composer with Cmd+I
                keystroke "i" using command down
                delay 2.0

                -- Type the prompt character by character for reliability
                set thePrompt to "{escaped_prompt}"
                repeat with i from 1 to length of thePrompt
                    keystroke (character i of thePrompt)
                    delay 0.01
                end repeat

                {"-- Submit with Enter" if auto_submit else ""}
                {"delay 1.0" if auto_submit else ""}
                {"keystroke return" if auto_submit else ""}
            end tell
        end tell

        return "success"
        '''

        # Run the AppleScript
        try:
            proc = await asyncio.create_subprocess_exec(
                "osascript", "-e", applescript,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0)

            if proc.returncode != 0:
                error_msg = stderr.decode().strip() if stderr else "Unknown error"

                # Check for accessibility permissions
                if "not allowed assistive access" in error_msg.lower():
                    return {
                        "error": "Accessibility permissions required",
                        "instructions": (
                            "To enable voice coding:\n"
                            "1. Open System Settings → Privacy & Security → Accessibility\n"
                            "2. Add and enable the terminal app you're running Reachy from\n"
                            "3. Try again after granting permission"
                        ),
                    }

                return {"error": f"AppleScript failed: {error_msg}"}

            return {
                "status": "sent_to_cursor",
                "message": "Opened new Cursor window and sent prompt",
                "auto_submitted": auto_submit,
            }

        except asyncio.TimeoutError:
            return {"error": "Timeout waiting for Cursor response"}
        except FileNotFoundError:
            return {"error": "osascript not found - are you on macOS?"}
Default is true.", + }, + }, + "required": ["task"], + } + + async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]: + """Execute the cursor agent task.""" + task = kwargs.get("task", "") + auto_submit = kwargs.get("auto_submit", True) + + if not task: + return {"error": "No task provided"} + + logger.info("Cursor agent task: %s", task[:100]) + + if platform.system() != "Darwin": + return { + "error": "This tool currently only supports macOS", + "suggestion": "Manual workaround: Open Cursor, press Cmd+Shift+I for Agent mode", + } + + try: + result = await self._send_to_cursor_agent(task, auto_submit) + return result + except Exception as e: + logger.error("Failed to send to Cursor Agent: %s", e) + return {"error": str(e)} + + async def _send_to_cursor_agent(self, task: str, auto_submit: bool) -> Dict[str, Any]: + """Send the task to Cursor Agent using AppleScript automation.""" + + escaped_task = task.replace("\\", "\\\\").replace('"', '\\"') + + # AppleScript for Agent mode (Cmd+Shift+I) + applescript = f''' + tell application "Cursor" + activate + end tell + + delay 0.5 + + tell application "System Events" + tell process "Cursor" + -- Open Agent with Cmd+Shift+I + keystroke "i" using {{command down, shift down}} + delay 1.0 + + -- Type the task + keystroke "{escaped_task}" + + {"-- Submit with Enter" if auto_submit else ""} + {f'delay 0.3' if auto_submit else ""} + {f'keystroke return' if auto_submit else ""} + end tell + end tell + + return "success" + ''' + + try: + proc = await asyncio.create_subprocess_exec( + "osascript", "-e", applescript, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=10.0) + + if proc.returncode != 0: + error_msg = stderr.decode().strip() if stderr else "Unknown error" + + if "not allowed assistive access" in error_msg.lower(): + return { + "error": "Accessibility permissions required", + "instructions": ( + "To enable 
def parse_args() -> Tuple[argparse.Namespace, list]:  # type: ignore
    """Parse command line arguments.

    Returns:
        (known_args, unknown_args) as produced by ``parse_known_args``, so the
        app tolerates extra flags injected by launchers.
    """
    parser = argparse.ArgumentParser("Reachy Mini Conversation App")
    parser.add_argument(
        "--head-tracker",
        # None is the implicit default (flag omitted); it cannot be matched
        # from the command line (argparse compares raw strings), so it is not
        # listed in choices.
        choices=["yolo", "mediapipe"],
        default=None,
        help="Choose head tracker (default: None)",
    )
    parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
    parser.add_argument(
        "--local-vision",
        default=False,
        action="store_true",
        help="Use local vision model instead of gpt-realtime vision",
    )
    parser.add_argument("--gradio", default=False, action="store_true", help="Open gradio interface")
    parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
    parser.add_argument(
        "--wireless-version",
        default=False,
        action="store_true",
        help="Use WebRTC backend for wireless version of the robot",
    )
    parser.add_argument(
        "--on-device",
        default=False,
        action="store_true",
        help="Use when conversation app is running on the same device as Reachy Mini daemon",
    )
    parser.add_argument(
        "--remote",
        default=False,
        action="store_true",
        help="Connect to a remote Reachy Mini on the network (no WebRTC, just control)",
    )
    return parser.parse_known_args()
def setup_logger(debug: bool) -> logging.Logger:
    """Configure root logging and quiet noisy third-party loggers.

    Args:
        debug: when True, log at DEBUG and keep third-party loggers at INFO;
            otherwise log at INFO and silence third-party chatter further.

    Returns:
        This module's logger.
    """
    level_name = "DEBUG" if debug else "INFO"
    logging.basicConfig(
        level=getattr(logging, level_name, logging.INFO),
        format="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
    )
    logger = logging.getLogger(__name__)

    # Suppress WebRTC warnings
    warnings.filterwarnings("ignore", message=".*AVCaptureDeviceTypeExternal.*")
    warnings.filterwarnings("ignore", category=UserWarning, module="aiortc")

    # Tame third-party noise (looser in DEBUG)
    if debug:
        third_party_levels = {
            "aiortc": logging.INFO,
            "fastrtc": logging.INFO,
            "aioice": logging.INFO,
            "openai": logging.INFO,
            "websockets": logging.INFO,
        }
    else:
        third_party_levels = {
            "aiortc": logging.ERROR,
            "fastrtc": logging.ERROR,
            "aioice": logging.WARNING,
        }
    for noisy_name, noisy_level in third_party_levels.items():
        logging.getLogger(noisy_name).setLevel(noisy_level)
    return logger
@dataclass
class VisionConfig:
    """Configuration for vision processing."""

    model_path: str = config.LOCAL_VISION_MODEL
    vision_interval: float = 5.0  # seconds between scene descriptions
    max_new_tokens: int = 64
    jpeg_quality: int = 85
    max_retries: int = 3
    retry_delay: float = 1.0  # base backoff between retries, seconds
    device_preference: str = "auto"  # "auto", "cuda", "cpu" ("mps" also honored)


class VisionProcessor:
    """Handles SmolVLM2 model loading and single-image inference."""

    def __init__(self, vision_config: VisionConfig | None = None):
        """Store configuration and resolve a device; weights load in initialize()."""
        self.vision_config = vision_config or VisionConfig()
        self.model_path = self.vision_config.model_path
        self.device = self._determine_device()
        self.processor = None
        self.model = None
        self._initialized = False

    def _determine_device(self) -> str:
        """Resolve the torch device string from the configured preference."""
        preference = self.vision_config.device_preference
        if preference == "cpu":
            return "cpu"
        if preference == "cuda":
            return "cuda" if torch.cuda.is_available() else "cpu"
        if preference == "mps":
            return "mps" if torch.backends.mps.is_available() else "cpu"
        # auto: prefer mps on Apple, then cuda, else cpu
        if torch.backends.mps.is_available():
            return "mps"
        return "cuda" if torch.cuda.is_available() else "cpu"

    def initialize(self) -> bool:
        """Load model and processor onto the selected device.

        Returns True on success, False on failure (logged).
        """
        try:
            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
            self.processor = AutoProcessor.from_pretrained(self.model_path)  # type: ignore

            # bfloat16 only makes sense on CUDA; MPS and CPU run best in float32.
            dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
            model_kwargs: Dict[str, Any] = {"dtype": dtype}

            # flash_attention_2 is CUDA-only; skip on MPS/CPU
            if self.device == "cuda":
                model_kwargs["_attn_implementation"] = "flash_attention_2"

            # Load model weights
            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)  # type: ignore

            if self.model is not None:
                self.model.eval()
                self._initialized = True
                return True
            # NOTE(review): if from_pretrained ever returned None we would fall
            # through and implicitly return None (falsy) — preserved as-is.

        except Exception as e:
            logger.error(f"Failed to initialize vision model: {e}")
            return False

    def process_image(
        self,
        cv2_image: NDArray[np.uint8],
        prompt: str = "Briefly describe what you see in one sentence.",
    ) -> str:
        """Describe a cv2 frame, retrying with backoff on transient failures.

        Returns the model's one-line description, or an error string beginning
        with "Vision", "Failed", or "GPU"/"Vision processing error".
        """
        if not self._initialized or self.processor is None or self.model is None:
            return "Vision model not initialized"

        for attempt in range(self.vision_config.max_retries):
            try:
                # Compress the frame to JPEG bytes.
                ok, encoded = cv2.imencode(
                    ".jpg",
                    cv2_image,
                    [cv2.IMWRITE_JPEG_QUALITY, self.vision_config.jpeg_quality],
                )
                if not ok:
                    return "Failed to encode image"

                # The chat template consumes an image URL; inline it as base64.
                b64 = base64.b64encode(encoded.tobytes()).decode("utf-8")

                messages = [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "url": f"data:image/jpeg;base64,{b64}",
                            },
                            {"type": "text", "text": prompt},
                        ],
                    },
                ]

                inputs = self.processor.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    return_tensors="pt",
                )

                # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
                inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

                with torch.no_grad():
                    generated_ids = self.model.generate(
                        **inputs,
                        do_sample=False,
                        max_new_tokens=self.vision_config.max_new_tokens,
                        pad_token_id=self.processor.tokenizer.eos_token_id,
                    )

                decoded = self.processor.batch_decode(
                    generated_ids,
                    skip_special_tokens=True,
                )

                # Keep only the assistant's reply from the full generation.
                answer = self._extract_response(decoded[0])

                # Release cached device memory between inferences.
                if self.device == "cuda":
                    torch.cuda.empty_cache()
                elif self.device == "mps":
                    torch.mps.empty_cache()

                return answer.replace(chr(10), " ").strip()

            except torch.cuda.OutOfMemoryError as e:
                logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
                if self.device == "cuda":
                    torch.cuda.empty_cache()
                if attempt < self.vision_config.max_retries - 1:
                    # Linear backoff: wait longer after each OOM.
                    time.sleep(self.vision_config.retry_delay * (attempt + 1))
                else:
                    return "GPU out of memory - vision processing failed"

            except Exception as e:
                logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
                if attempt < self.vision_config.max_retries - 1:
                    time.sleep(self.vision_config.retry_delay)
                else:
                    return f"Vision processing error after {self.vision_config.max_retries} attempts"

    def _extract_response(self, full_text: str) -> str:
        """Strip prompt/chat scaffolding, keeping only the assistant reply."""
        # Handle different response formats; first matching marker wins.
        for marker in ["assistant\n", "Assistant:", "Response:", "\n\n"]:
            if marker in full_text:
                candidate = full_text.split(marker)[-1].strip()
                if candidate:  # Ensure we got a meaningful response
                    return candidate

        # Fallback: return the full text cleaned up
        return full_text.strip()

    def get_model_info(self) -> Dict[str, Any]:
        """Report load state, device, and (on CUDA) total GPU memory in GiB."""
        info: Dict[str, Any] = {
            "initialized": self._initialized,
            "device": self.device,
            "model_path": self.model_path,
            "cuda_available": torch.cuda.is_available(),
        }
        if torch.cuda.is_available():
            info["gpu_memory"] = torch.cuda.get_device_properties(0).total_memory // (1024**3)
        else:
            info["gpu_memory"] = "N/A"
        return info
vision_config or VisionConfig() + self.vision_interval = self.vision_config.vision_interval + self.processor = VisionProcessor(self.vision_config) + + self._last_processed_time = 0.0 + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + # Initialize processor + if not self.processor.initialize(): + logger.error("Failed to initialize vision processor") + raise RuntimeError("Vision processor initialization failed") + + def start(self) -> None: + """Start the vision processing loop in a thread.""" + self._stop_event.clear() + self._thread = threading.Thread(target=self._working_loop, daemon=True) + self._thread.start() + logger.info("Local vision processing started") + + def stop(self) -> None: + """Stop the vision processing loop.""" + self._stop_event.set() + if self._thread is not None: + self._thread.join() + logger.info("Local vision processing stopped") + + def _working_loop(self) -> None: + """Vision processing loop (runs in separate thread).""" + while not self._stop_event.is_set(): + try: + current_time = time.time() + + if current_time - self._last_processed_time >= self.vision_interval: + frame = self.camera.get_latest_frame() + if frame is not None: + description = self.processor.process_image( + frame, + "Briefly describe what you see in one sentence.", + ) + + # Only update if we got a valid response + if description and not description.startswith(("Vision", "Failed", "Error")): + self._last_processed_time = current_time + logger.debug(f"Vision update: {description}") + else: + logger.warning(f"Invalid vision response: {description}") + + time.sleep(1.0) # Check every second + + except Exception: + logger.exception("Vision processing loop error") + time.sleep(5.0) # Longer sleep on error + + logger.info("Vision loop finished") + + def get_status(self) -> Dict[str, Any]: + """Get comprehensive status information.""" + return { + "last_processed": self._last_processed_time, + "processor_info": 
self.processor.get_model_info(), + "config": { + "interval": self.vision_interval, + }, + } + + +def initialize_vision_manager(camera_worker: Any) -> VisionManager | None: + """Initialize vision manager with model download and configuration. + + Args: + camera_worker: CameraWorker instance for frame capture + Returns: + VisionManager instance or None if initialization fails + + """ + try: + model_id = config.LOCAL_VISION_MODEL + cache_dir = os.path.expanduser(config.HF_HOME) + + # Prepare cache directory + os.makedirs(cache_dir, exist_ok=True) + os.environ["HF_HOME"] = cache_dir + logger.info("HF_HOME set to %s", cache_dir) + + # Download model to cache + logger.info(f"Downloading vision model {model_id} to cache...") + snapshot_download( + repo_id=model_id, + repo_type="model", + cache_dir=cache_dir, + ) + logger.info(f"Model {model_id} downloaded to {cache_dir}") + + # Configure vision processing + vision_config = VisionConfig( + model_path=model_id, + vision_interval=5.0, + max_new_tokens=64, + jpeg_quality=85, + max_retries=3, + retry_delay=1.0, + device_preference="auto", + ) + + # Initialize vision manager + vision_manager = VisionManager(camera_worker, vision_config) + + # Log device info + device_info = vision_manager.processor.get_model_info() + logger.info( + f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}", + ) + + return vision_manager + + except Exception as e: + logger.error(f"Failed to initialize vision manager: {e}") + return None diff --git a/src/reachy_mini_conversation_app/vision/yolo_head_tracker.py b/src/reachy_mini_conversation_app/vision/yolo_head_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..3619bc7e89a05a21f554c36c7968bf2c446a33f0 --- /dev/null +++ b/src/reachy_mini_conversation_app/vision/yolo_head_tracker.py @@ -0,0 +1,148 @@ +from __future__ import annotations +import logging +from typing import Tuple + +import numpy as np +from numpy.typing import NDArray + + 
logger = logging.getLogger(__name__)


class HeadTracker:
    """Lightweight head tracker using YOLO for face detection."""

    def __init__(
        self,
        model_repo: str = "AdamCodd/YOLOv11n-face-detection",
        model_filename: str = "model.pt",
        confidence_threshold: float = 0.3,
        device: str = "cpu",
    ) -> None:
        """Initialize YOLO-based head tracker.

        Args:
            model_repo: HuggingFace model repository
            model_filename: Model file name
            confidence_threshold: Minimum confidence for face detection
            device: Device to run inference on ('cpu' or 'cuda')

        Raises:
            Exception: re-raised when the model download or load fails.

        """
        self.confidence_threshold = confidence_threshold

        try:
            # Download and load YOLO model (hf_hub_download caches locally)
            model_path = hf_hub_download(repo_id=model_repo, filename=model_filename)
            self.model = YOLO(model_path).to(device)
            logger.info(f"YOLO face detection model loaded from {model_repo}")
        except Exception as e:
            logger.error(f"Failed to load YOLO model: {e}")
            raise

    def _select_best_face(self, detections: Detections) -> int | None:
        """Select the best face based on confidence and area (largest face with highest confidence).

        Args:
            detections: Supervision detections object

        Returns:
            Index of best face or None if no valid faces

        """
        if detections.xyxy.shape[0] == 0:
            return None

        # Check if confidence is available
        if detections.confidence is None:
            return None

        # Filter by confidence threshold
        valid_mask = detections.confidence >= self.confidence_threshold
        if not np.any(valid_mask):
            return None

        valid_indices = np.where(valid_mask)[0]

        # Calculate areas for valid detections
        boxes = detections.xyxy[valid_indices]
        areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

        # Combine confidence and area (weighted 70% confidence / 30% size).
        # Guard against all-degenerate (zero-area) boxes: dividing by a zero
        # max area produces NaN scores, and np.argmax over NaNs always picks
        # index 0 regardless of confidence. Fall back to confidence-only
        # ranking in that case.
        confidences = detections.confidence[valid_indices]
        max_area = float(np.max(areas))
        if max_area > 0.0:
            scores = confidences * 0.7 + (areas / max_area) * 0.3
        else:
            scores = confidences

        # Return index of best face (index into the original detections)
        best_idx = valid_indices[np.argmax(scores)]
        return int(best_idx)

    def _bbox_to_mp_coords(self, bbox: NDArray[np.float32], w: int, h: int) -> NDArray[np.float32]:
        """Convert bounding box center to MediaPipe-style coordinates [-1, 1].

        Args:
            bbox: Bounding box [x1, y1, x2, y2]
            w: Image width
            h: Image height

        Returns:
            Center point in [-1, 1] coordinates

        """
        center_x = (bbox[0] + bbox[2]) / 2.0
        center_y = (bbox[1] + bbox[3]) / 2.0

        # Normalize to [0, 1] then to [-1, 1]
        norm_x = (center_x / w) * 2.0 - 1.0
        norm_y = (center_y / h) * 2.0 - 1.0

        return np.array([norm_x, norm_y], dtype=np.float32)

    def get_head_position(self, img: NDArray[np.uint8]) -> Tuple[NDArray[np.float32] | None, float | None]:
        """Get head position from face detection.

        Args:
            img: Input image

        Returns:
            Tuple of (eye_center [-1,1], roll_angle)

        """
        h, w = img.shape[:2]

        try:
            # Run YOLO inference
            results = self.model(img, verbose=False)
            detections = Detections.from_ultralytics(results[0])

            # Select best face
            face_idx = self._select_best_face(detections)
            if face_idx is None:
                logger.debug("No face detected above confidence threshold")
                return None, None

            bbox = detections.xyxy[face_idx]

            if detections.confidence is not None:
                confidence = detections.confidence[face_idx]
                logger.debug(f"Face detected with confidence: {confidence:.2f}")

            # Get face center in [-1, 1] coordinates
            face_center = self._bbox_to_mp_coords(bbox, w, h)

            # Roll is 0 since we don't have keypoints for precise angle estimation
            roll = 0.0

            return face_center, roll

        except Exception as e:
            # Best-effort: detection failures degrade to "no face" rather
            # than crashing the tracking loop.
            logger.error(f"Error in head position detection: {e}")
            return None, None
3.5rem clamp(1.5rem, 3vw, 3rem) 2.5rem; + position: relative; + overflow: hidden; +} + +.hero::after { + content: ""; + position: absolute; + inset: 0; + background: linear-gradient(120deg, rgba(122, 245, 196, 0.12), rgba(246, 196, 82, 0.08), transparent); + pointer-events: none; +} + +.topline { + display: flex; + align-items: center; + justify-content: space-between; + max-width: 1200px; + margin: 0 auto 2rem; + position: relative; + z-index: 2; +} + +.brand { + display: flex; + align-items: center; + gap: 0.5rem; + font-weight: 700; + letter-spacing: 0.5px; + color: var(--text); +} + +.logo { + display: inline-flex; + align-items: center; + justify-content: center; + width: 2.2rem; + height: 2.2rem; + border-radius: 10px; + background: linear-gradient(145deg, rgba(122, 245, 196, 0.15), rgba(124, 142, 255, 0.15)); + box-shadow: 0 10px 30px rgba(0, 0, 0, 0.25); +} + +.brand-name { + font-size: 1.1rem; +} + +.pill { + background: rgba(255, 255, 255, 0.06); + border: 1px solid var(--border); + padding: 0.6rem 1rem; + border-radius: 999px; + color: var(--muted); + font-size: 0.9rem; + box-shadow: 0 12px 30px rgba(0, 0, 0, 0.2); +} + +.hero-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); + gap: clamp(1.5rem, 2.5vw, 2.5rem); + max-width: 1200px; + margin: 0 auto; + position: relative; + z-index: 2; + align-items: center; +} + +.hero-copy h1 { + font-size: clamp(2.6rem, 4vw, 3.6rem); + margin-bottom: 1rem; + line-height: 1.1; + letter-spacing: -0.5px; +} + +.eyebrow { + display: inline-flex; + align-items: center; + gap: 0.5rem; + text-transform: uppercase; + letter-spacing: 1px; + font-size: 0.8rem; + color: var(--muted); + margin-bottom: 0.75rem; +} + +.eyebrow::before { + content: ""; + display: inline-block; + width: 24px; + height: 2px; + background: linear-gradient(90deg, var(--accent), var(--accent-2)); + border-radius: 999px; +} + +.lede { + font-size: 1.1rem; + color: var(--muted); + max-width: 620px; +} + +.hero-actions { 
+ display: flex; + gap: 1rem; + align-items: center; + margin: 1.6rem 0 1.2rem; + flex-wrap: wrap; +} + +.btn { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 0.6rem; + padding: 0.85rem 1.4rem; + border-radius: 12px; + font-weight: 700; + border: 1px solid transparent; + cursor: pointer; + transition: transform 0.2s ease, box-shadow 0.2s ease, background 0.2s ease, border-color 0.2s ease; +} + +.btn.primary { + background: linear-gradient(135deg, #7af5c4, #7c8eff); + color: #0a0f1f; + box-shadow: 0 15px 30px rgba(122, 245, 196, 0.25); +} + +.btn.primary:hover { + transform: translateY(-2px); + box-shadow: 0 25px 45px rgba(122, 245, 196, 0.35); +} + +.btn.ghost { + background: rgba(255, 255, 255, 0.05); + border-color: var(--border); + color: var(--text); +} + +.btn.ghost:hover { + border-color: rgba(255, 255, 255, 0.3); + transform: translateY(-2px); +} + +.btn.wide { + width: 100%; + justify-content: center; +} + +.hero-badges { + display: flex; + flex-wrap: wrap; + gap: 0.6rem; + color: var(--muted); + font-size: 0.9rem; +} + +.hero-badges span { + padding: 0.5rem 0.8rem; + border-radius: 10px; + border: 1px solid var(--border); + background: rgba(255, 255, 255, 0.04); +} + +.hero-visual .glass-card { + background: rgba(255, 255, 255, 0.03); + border: 1px solid var(--border); + border-radius: 18px; + padding: 1.2rem; + box-shadow: var(--shadow); + backdrop-filter: blur(10px); +} + +.hero-gif { + width: 100%; + display: block; + border-radius: 14px; + border: 1px solid var(--border); + box-shadow: 0 12px 35px rgba(0, 0, 0, 0.35); +} + +.caption { + margin-top: 0.75rem; + color: var(--muted); + font-size: 0.95rem; +} + +.section { + max-width: 1200px; + margin: 0 auto; + padding: clamp(2rem, 4vw, 3.5rem) clamp(1.5rem, 3vw, 3rem); +} + +.section-header { + text-align: center; + max-width: 780px; + margin: 0 auto 2rem; +} + +.section-header h2 { + font-size: clamp(2rem, 3vw, 2.6rem); + margin-bottom: 0.5rem; +} + +.intro { + color: 
var(--muted); + font-size: 1.05rem; +} + +.feature-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); + gap: 1rem; +} + +.feature-card { + background: rgba(255, 255, 255, 0.03); + border: 1px solid var(--border); + border-radius: 16px; + padding: 1.25rem; + box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2); + transition: transform 0.2s ease, border-color 0.2s ease, box-shadow 0.2s ease; +} + +.feature-card:hover { + transform: translateY(-4px); + border-color: rgba(122, 245, 196, 0.3); + box-shadow: 0 18px 40px rgba(0, 0, 0, 0.3); +} + +.feature-card .icon { + width: 48px; + height: 48px; + border-radius: 12px; + display: grid; + place-items: center; + background: rgba(122, 245, 196, 0.14); + margin-bottom: 0.8rem; + font-size: 1.4rem; +} + +.feature-card h3 { + margin-bottom: 0.35rem; +} + +.feature-card p { + color: var(--muted); +} + +.story { + padding-top: 1rem; +} + +.story-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); + gap: 1rem; +} + +.story-card { + background: rgba(255, 255, 255, 0.03); + border: 1px solid var(--border); + border-radius: 18px; + padding: 1.5rem; + box-shadow: var(--shadow); +} + +.story-card.secondary { + background: linear-gradient(145deg, rgba(124, 142, 255, 0.08), rgba(122, 245, 196, 0.06)); +} + +.story-card h3 { + margin-bottom: 0.8rem; +} + +.story-list { + list-style: none; + display: grid; + gap: 0.7rem; + color: var(--muted); + font-size: 0.98rem; +} + +.story-list li { + display: flex; + gap: 0.7rem; + align-items: flex-start; +} + +.story-text { + color: var(--muted); + line-height: 1.7; + margin-bottom: 1rem; +} + +.chips { + display: flex; + flex-wrap: wrap; + gap: 0.5rem; +} + +.chip { + padding: 0.45rem 0.8rem; + border-radius: 12px; + background: rgba(0, 0, 0, 0.2); + border: 1px solid var(--border); + color: var(--text); + font-size: 0.9rem; +} + +.footer { + text-align: center; + color: var(--muted); + padding: 2rem 1.5rem 0; +} + +.footer a { + 
color: var(--text); + border-bottom: 1px solid transparent; +} + +.footer a:hover { + border-color: rgba(255, 255, 255, 0.5); +} + +@media (max-width: 768px) { + .hero { + padding-top: 2.5rem; + } + + .topline { + flex-direction: column; + gap: 0.8rem; + align-items: flex-start; + } + + .hero-actions { + width: 100%; + } + + .btn { + width: 100%; + justify-content: center; + } + + .hero-badges { + gap: 0.4rem; + } +} diff --git a/tests/audio/test_head_wobbler.py b/tests/audio/test_head_wobbler.py new file mode 100644 index 0000000000000000000000000000000000000000..853dcfbea0ca2de862deb1abf3cf2dcd18a0fc53 --- /dev/null +++ b/tests/audio/test_head_wobbler.py @@ -0,0 +1,110 @@ +"""Regression tests for the audio-driven head wobble behaviour.""" + +import math +import time +import base64 +import threading +from typing import Any, List, Tuple +from collections.abc import Callable + +import numpy as np + +from reachy_mini_conversation_app.audio.head_wobbler import HeadWobbler + + +def _make_audio_chunk(duration_s: float = 0.3, frequency_hz: float = 220.0) -> str: + """Generate a base64-encoded mono PCM16 sine wave.""" + sample_rate = 24000 + sample_count = int(sample_rate * duration_s) + t = np.linspace(0, duration_s, sample_count, endpoint=False) + wave = 0.6 * np.sin(2 * math.pi * frequency_hz * t) + pcm = np.clip(wave * np.iinfo(np.int16).max, -32768, 32767).astype(np.int16) + return base64.b64encode(pcm.tobytes()).decode("ascii") + + +def _wait_for(predicate: Callable[[], bool], timeout: float = 0.6) -> bool: + """Poll `predicate` until true or timeout.""" + end_time = time.time() + timeout + while time.time() < end_time: + if predicate(): + return True + time.sleep(0.01) + return False + + +def _start_wobbler() -> Tuple[HeadWobbler, List[Tuple[float, Tuple[float, float, float, float, float, float]]]]: + captured: List[Tuple[float, Tuple[float, float, float, float, float, float]]] = [] + + def capture(offsets: Tuple[float, float, float, float, float, float]) -> None: 
        captured.append((time.time(), offsets))

    wobbler = HeadWobbler(set_speech_offsets=capture)
    wobbler.start()
    return wobbler, captured


def test_reset_drops_pending_offsets() -> None:
    """Reset should stop wobble output derived from pre-reset audio."""
    wobbler, captured = _start_wobbler()
    try:
        wobbler.feed(_make_audio_chunk(duration_s=0.35))
        assert _wait_for(lambda: len(captured) > 0), "wobbler did not emit initial offsets"

        pre_reset_count = len(captured)
        wobbler.reset()
        # Give the worker ample time to (incorrectly) emit stale offsets.
        time.sleep(0.3)
        assert len(captured) == pre_reset_count, "offsets continued after reset without new audio"
    finally:
        wobbler.stop()


def test_reset_allows_future_offsets() -> None:
    """After reset, fresh audio must still produce wobble offsets."""
    wobbler, captured = _start_wobbler()
    try:
        wobbler.feed(_make_audio_chunk(duration_s=0.35))
        assert _wait_for(lambda: len(captured) > 0), "wobbler did not emit initial offsets"

        wobbler.reset()
        pre_second_count = len(captured)

        # Different frequency distinguishes the post-reset audio chunk.
        wobbler.feed(_make_audio_chunk(duration_s=0.35, frequency_hz=440.0))
        assert _wait_for(lambda: len(captured) > pre_second_count), "no offsets after reset"
        assert wobbler._thread is not None and wobbler._thread.is_alive()
    finally:
        wobbler.stop()


def test_reset_during_inflight_chunk_keeps_worker(monkeypatch: Any) -> None:
    """Simulate reset during chunk processing to ensure the worker survives."""
    wobbler, captured = _start_wobbler()
    ready = threading.Event()
    release = threading.Event()

    original_feed = wobbler.sway.feed

    def blocking_feed(pcm, sr):  # type: ignore[no-untyped-def]
        # Signal that the worker dequeued a chunk, then stall it until the
        # test has issued the reset — simulating an in-flight chunk.
        ready.set()
        release.wait(timeout=2.0)
        return original_feed(pcm, sr)

    monkeypatch.setattr(wobbler.sway, "feed", blocking_feed)

    try:
        wobbler.feed(_make_audio_chunk(duration_s=0.35))
        assert ready.wait(timeout=1.0), "worker thread did not dequeue audio"

        wobbler.reset()
        release.set()

        # Allow the worker to finish processing the first chunk (which should be
discarded)
        time.sleep(0.1)

        assert wobbler._thread is not None and wobbler._thread.is_alive(), "worker thread died after reset"

        pre_second = len(captured)
        wobbler.feed(_make_audio_chunk(duration_s=0.35, frequency_hz=440.0))
        assert _wait_for(lambda: len(captured) > pre_second), "no offsets emitted after in-flight reset"
        assert wobbler._thread.is_alive()
    finally:
        wobbler.stop()


# --- new file: tests/conftest.py ---
"""Pytest configuration for path setup."""

import sys
from pathlib import Path


# Make the src/ layout importable without installing the package.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC_PATH = PROJECT_ROOT / "src"
if str(SRC_PATH) not in sys.path:
    sys.path.insert(0, str(SRC_PATH))


# --- new file: tests/test_openai_realtime.py ---
import asyncio
import logging
from typing import Any
from datetime import datetime, timezone
from unittest.mock import MagicMock

import pytest

import reachy_mini_conversation_app.openai_realtime as rt_mod
from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
from reachy_mini_conversation_app.tools.core_tools import ToolDependencies


def _build_handler(loop: asyncio.AbstractEventLoop) -> OpenaiRealtimeHandler:
    """Build a handler with mocked deps on the given event loop."""
    # Installed as current loop before construction — the handler presumably
    # grabs it at __init__ time; verify against OpenaiRealtimeHandler.
    asyncio.set_event_loop(loop)
    deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
    return OpenaiRealtimeHandler(deps)


def test_format_timestamp_uses_wall_clock() -> None:
    """Test that format_timestamp uses wall clock time."""
    loop = asyncio.new_event_loop()
    try:
        print("Testing format_timestamp...")
        handler = _build_handler(loop)
        formatted = handler.format_timestamp()
        print(f"Formatted timestamp: {formatted}")
    finally:
        asyncio.set_event_loop(None)
        loop.close()

    # Extract year from "[YYYY-MM-DD ...]"
    # NOTE(review): could flake across a New Year boundary if format_timestamp
    # uses a non-UTC clock — confirm.
    year = int(formatted[1:5])
    assert year == datetime.now(timezone.utc).year


@pytest.mark.asyncio
async def test_start_up_retries_on_abrupt_close(monkeypatch: Any, caplog: Any) -> None:
    """First connection dies with ConnectionClosedError during iteration -> retried.

    Second connection iterates cleanly (no events) -> start_up returns without raising.
    Ensures handler clears self.connection at the end.
    """
    caplog.set_level(logging.WARNING)

    # Use a local Exception as the module's ConnectionClosedError to avoid ws dependency
    FakeCCE = type("FakeCCE", (Exception,), {})
    monkeypatch.setattr(rt_mod, "ConnectionClosedError", FakeCCE)

    # Make asyncio.sleep return immediately (for backoff)
    async def _fast_sleep(*_a: Any, **_kw: Any) -> None: return None
    monkeypatch.setattr(asyncio, "sleep", _fast_sleep, raising=False)

    # Mutable counter shared with FakeRealtime.connect below.
    attempt_counter = {"n": 0}

    class FakeConn:
        """Minimal realtime connection stub."""

        def __init__(self, mode: str):
            # mode == "raise_on_iter" simulates an abrupt websocket close;
            # anything else iterates zero events and exits cleanly.
            self._mode = mode

            class _Session:
                async def update(self, **_kw: Any) -> None: return None
            self.session = _Session()

            class _InputAudioBuffer:
                async def append(self, **_kw: Any) -> None: return None
            self.input_audio_buffer = _InputAudioBuffer()

            class _Item:
                async def create(self, **_kw: Any) -> None: return None

            class _Conversation:
                item = _Item()
            self.conversation = _Conversation()

            class _Response:
                async def create(self, **_kw: Any) -> None: return None
                async def cancel(self, **_kw: Any) -> None: return None
            self.response = _Response()

        async def __aenter__(self) -> "FakeConn": return self
        async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return False
        async def close(self) -> None: return None

        # Async iterator protocol
        def __aiter__(self) -> "FakeConn": return self
        async def __anext__(self) -> None:
            if self._mode == "raise_on_iter":
                raise FakeCCE("abrupt close (simulated)")
            raise StopAsyncIteration  # clean exit (no events)

    class FakeRealtime:
        def connect(self, **_kw: Any) -> FakeConn:
            attempt_counter["n"] += 1
            # First attempt fails mid-iteration; subsequent attempts succeed.
            mode = "raise_on_iter" if attempt_counter["n"] == 1 else "clean"
            return FakeConn(mode)

    class FakeClient:
        def __init__(self, **_kw: Any) -> None: self.realtime = FakeRealtime()

    # Patch the OpenAI client used by the handler
    monkeypatch.setattr(rt_mod, "AsyncOpenAI", FakeClient)

    # Build handler with minimal deps
    deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
    handler = rt_mod.OpenaiRealtimeHandler(deps)

    # Run: should retry once and exit cleanly
    await handler.start_up()

    # Validate: two attempts total (fail -> retry -> succeed), and connection cleared
    assert attempt_counter["n"] == 2
    assert handler.connection is None

    # Optional: confirm we logged the unexpected close once
    # (r.msg is the raw format string, so substring match is stable)
    warnings = [r for r in caplog.records if r.levelname == "WARNING" and "closed unexpectedly" in r.msg]
    assert len(warnings) == 1