Update TARS Conversation App with TarsApp framework
This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- .gitignore +48 -0
- CLAUDE.md +50 -0
- LICENSE +21 -0
- README.md +340 -8
- app.json +55 -0
- assets/audio/tars-clean-compressed.mp3 +3 -0
- bot.py +605 -0
- config.ini.example +52 -0
- docs/DAEMON_INTEGRATION.md +393 -0
- docs/DASHBOARD_UPDATE_SUMMARY.md +218 -0
- docs/DEVELOPING_APPS.md +400 -0
- docs/INSTALLATION_GUIDE.md +264 -0
- docs/MEMORY.md +190 -0
- env.example +59 -0
- index.html +333 -0
- install.sh +99 -0
- manifest.json +47 -0
- pipecat_service.py +272 -0
- publish-to-hf.sh +87 -0
- pyproject.toml +25 -0
- requirements.txt +18 -0
- scripts/update_daemon.py +388 -0
- src/README.md +55 -0
- src/character/TARS.json +25 -0
- src/character/persona.ini +21 -0
- src/character/prompts.py +331 -0
- src/config/__init__.py +152 -0
- src/config/connection.py +179 -0
- src/observers/__init__.py +21 -0
- src/observers/assistant_observer.py +142 -0
- src/observers/debug_observer.py +22 -0
- src/observers/display_events_observer.py +100 -0
- src/observers/metrics_observer.py +196 -0
- src/observers/state_observer.py +166 -0
- src/observers/transcription_observer.py +70 -0
- src/observers/tts_state_observer.py +56 -0
- src/observers/vision_observer.py +142 -0
- src/processors/__init__.py +18 -0
- src/processors/emotional_monitor.py +303 -0
- src/processors/filters.py +81 -0
- src/processors/gating.py +129 -0
- src/processors/visual_observer.py +389 -0
- src/services/README.md +110 -0
- src/services/__init__.py +1 -0
- src/services/factories/__init__.py +6 -0
- src/services/factories/stt_factory.py +127 -0
- src/services/factories/tts_factory.py +84 -0
- src/services/memory/memory_chromadb.py +195 -0
- src/services/memory/memory_hybrid.py +393 -0
.gitattributes
CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/audio/tars-clean-compressed.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED

# dependencies
node_modules/
/.pnp
.pnp.js

# testing
/coverage

# next.js
/.next/
/out/

# cache
__pycache__/
*.py[cod]
*$py.class
*.so

# production
/build

# misc
.DS_Store
*.pem
/.models/
/.claude/
/chroma_memory/
/deprecated/
/memory_data/

# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# local env files
.env*.local
.env

# local config files
config.ini

# vercel
.vercel

# typescript
*.tsbuildinfo
next-env.d.ts
CLAUDE.md
ADDED

# TARS Omni

AI brain that connects to Raspberry Pi hardware daemon.

## Pi Access
```
ssh tars-pi  # 100.84.133.74, user: mac, repo: ~/tars-daemon
```

## Install

Pi (from tars-daemon dashboard):
- Apps tab → Install button

Pi (manual):
```bash
ssh tars-pi "cd ~/tars-conversation-app && bash install.sh"
```

See: docs/INSTALLATION_GUIDE.md

## Run

1. Pi: `ssh tars-pi "cd ~/tars && python tars_daemon.py"`
2. Mac: `python tars_bot.py`

---

## Docs

- Installation: docs/INSTALLATION_GUIDE.md
- App Development: docs/DEVELOPING_APPS.md
- Daemon Integration: docs/DAEMON_INTEGRATION.md
- Dashboard Update: docs/DASHBOARD_UPDATE_SUMMARY.md

## Dashboard Install

tars-daemon dashboard now supports app management:
- Apps tab shows all apps in ~/tars-apps/
- Install/Uninstall buttons
- Start/Stop controls
- Auto-discovery via app.json

## Claude Code Guidelines

- No emojis, no [NEW] markers, no "vs" comparisons
- Concise, technical, factual only
- No fluff, benefits sections, or marketing language
- Commits: imperative mood, no emojis
- Comments: minimal, explain "why" not "what"
LICENSE
ADDED

MIT License

Copyright (c) 2026 Latisha Besariani Hendra and TARS Omni Contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED

@@ -1,13 +1,345 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: TARS Conversation App
+emoji: 🤖
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version:
-app_file: app.py
+sdk_version: "4.0.0"
+app_file: ui/app.py
 pinned: false
-license: mit
 ---

Everything below the frontmatter is newly added:

# TARS Conversation App

Real-time voice AI with transcription, vision, and intelligent conversation using Speechmatics/Deepgram, Qwen3-TTS/ElevenLabs, DeepInfra LLM, and Moondream.

## Features

- **Dual Operation Modes**
  - **WebRTC Mode** (`bot.py`) - Browser-based voice AI with real-time metrics dashboard
  - **Robot Mode** (`tars_bot.py`) - Connect to Raspberry Pi TARS robot via WebRTC and gRPC
- **Real-time Transcription** - Speechmatics or Deepgram with smart turn detection
- **Dual TTS Options** - Qwen3-TTS (local, free, voice cloning) or ElevenLabs (cloud)
- **LLM Integration** - Any model via DeepInfra
- **Vision Analysis** - Moondream for image understanding
- **Smart Gating Layer** - AI-powered decision system for natural conversation flow
- **Hybrid Memory** - SQLite-based hybrid search (70% vector + 30% BM25)
- **Emotional Monitoring** - Real-time detection of confusion, hesitation, and frustration
- **Gradio Dashboard** - Live TTFB metrics, latency charts, and conversation transcription
- **WebRTC Transport** - Low-latency peer-to-peer audio
- **gRPC Robot Control** - Hardware control with 5-10ms latency (robot mode only)

## Project Structure

```
tars-conversation-app/
├── bot.py                 # WebRTC mode - Browser voice AI
├── tars_bot.py            # Robot mode - Raspberry Pi hardware
├── pipecat_service.py     # FastAPI backend (WebRTC signaling)
├── config.py              # Configuration management
├── config.ini             # User configuration file
├── requirements.txt       # Python dependencies
│
├── src/                   # Backend
│   ├── observers/         # Pipeline observers (metrics, transcription)
│   ├── processors/        # Pipeline processors (silence filter, gating)
│   ├── services/          # Services (STT, TTS, Memory, Robot)
│   ├── tools/             # LLM callable functions
│   ├── transport/         # WebRTC transport (aiortc)
│   ├── character/         # TARS personality and prompts
│   └── shared_state.py    # Shared metrics storage
│
├── ui/                    # Frontend
│   └── app.py             # Gradio dashboard (metrics + transcription)
│
├── tests/                 # Tests
│   └── gradio/
│       └── test_gradio.py # UI integration test
│
└── character/             # TARS character data
    ├── TARS.json          # Character definition
    └── persona.ini        # Personality parameters
```

## Operation Modes

### WebRTC Mode (`bot.py`)
- **Use case**: Browser-based voice AI conversations
- **Transport**: SmallWebRTC (browser ↔ Pipecat)
- **Features**: Full pipeline with STT, LLM, TTS, Memory
- **UI**: Gradio dashboard for metrics and transcription
- **Best for**: Development, testing, remote conversations

### Robot Mode (`tars_bot.py`)
- **Use case**: Physical TARS robot on Raspberry Pi
- **Transport**: aiortc (RPi ↔ Pipecat) + gRPC (commands)
- **Features**: Same pipeline + robot control (eyes, gestures, movement)
- **Hardware**: Requires TARS robot with servos and display
- **Best for**: Physical robot interactions, demos

## Quick Start

### Installation on TARS Robot (Recommended)

Install directly from HuggingFace Space via the TARS dashboard:

1. Open TARS dashboard at `http://your-pi:8000`
2. Go to **App Store** tab
3. Enter Space ID: `latishab/tars-conversation-app`
4. Click **Install from HuggingFace**
5. Configure API keys in `.env.local`
6. Click **Start**
7. Access metrics dashboard at `http://your-pi:7860`

The app will:
- Auto-install dependencies
- Set up virtual environment
- Configure for robot mode
- Start Gradio dashboard

### Easy Installation (install.sh)

For first-time setup on Raspberry Pi:

```bash
# Clone and install
git clone https://github.com/latishab/tars-conversation-app.git
cd tars-conversation-app
bash install.sh
```

The installer handles:
- System dependencies (portaudio, ffmpeg)
- Python virtual environment
- All Python packages
- Configuration file setup

### Manual Installation

```bash
# Python dependencies
pip install -r requirements.txt

# For robot mode, install TARS SDK
pip install tars-robot[sdk]
```

### Configure Environment

```bash
# Copy and edit environment file with your API keys
cp env.example .env.local

# Copy and edit configuration file
cp config.ini.example config.ini
```

Required API Keys (in `.env.local`):
- `SPEECHMATICS_API_KEY` or `DEEPGRAM_API_KEY` - For speech-to-text
- `DEEPINFRA_API_KEY` - For LLM
- `ELEVENLABS_API_KEY` - Optional (if using ElevenLabs TTS)

Settings (in `config.ini`):
```ini
[LLM]
model = meta-llama/Llama-3.3-70B-Instruct

[STT]
provider = deepgram  # or speechmatics

[TTS]
provider = qwen3  # or elevenlabs

[Memory]
type = hybrid  # SQLite-based hybrid search (vector + BM25)
```

### Run

#### WebRTC Mode (Browser)

**Terminal 1: Python backend**
```bash
python pipecat_service.py
```

**Terminal 2: Gradio UI (optional)**
```bash
python ui/app.py
```

Then:
1. Open WebRTC client in browser (connect to pipecat_service)
2. Open Gradio dashboard at http://localhost:7861 (for metrics)
3. Start talking

#### Robot Mode (Raspberry Pi)

Prerequisites:
- Raspberry Pi TARS robot running tars_daemon.py
- Network connection (LAN or Tailscale)
- TARS SDK installed

Configuration in `config.ini`:
```ini
[Connection]
mode = robot
rpi_url = http://<your-rpi-ip>:8001
rpi_grpc = <your-rpi-ip>:50051
auto_connect = true

[Display]
enabled = true
```

Deployment detection:
- **Remote** (Mac/computer): Uses configured addresses
- **Local** (on RPi): Auto-detects localhost:50051

Run:
```bash
python tars_bot.py
```

## Gradio Dashboard

The Gradio UI (`ui/app.py`) provides real-time monitoring:

### Latency Dashboard
- Service configuration (STT, Memory, LLM, TTS)
- TTFB metrics with min/max/avg/last stats
- Line chart: Latency trends over time
- Bar chart: Stacked latency breakdown
- Metrics table: Last 15 turns

### Conversation Tab
- Live user and assistant transcriptions
- Auto-updates every second

### Connection Tab
- Architecture documentation
- Usage instructions

## Architecture

### WebRTC Mode Data Flow
```
Browser (WebRTC client)
    ↓ (audio)
SmallWebRTC Transport
    ↓
Pipeline: STT → Memory → LLM → TTS
    ↓
Observers (metrics, transcription, assistant)
    ↓
shared_state.py
    ↓
Gradio UI (http://localhost:7861)
```

### Robot Mode Data Flow
```
RPi Mic → WebRTC → Pipecat Pipeline → WebRTC → RPi Speaker
(audio)                  ↓                       (audio)
                STT → Memory → LLM → TTS
                         ↓
            LLM Tools (set_emotion, do_gesture)
                         ↓
                 gRPC → RPi Hardware
               (eyes, servos, display)
```

Communication channels (Robot Mode):

| Channel | Protocol | Purpose | Latency |
|---------|----------|---------|---------|
| Audio | WebRTC (aiortc) | Voice conversation | ~20ms |
| Commands | gRPC | Hardware control | ~5-10ms |
| State | DataChannel | Battery, movement status | ~10ms |

## Testing

```bash
# Test Gradio integration
python tests/gradio/test_gradio.py

# Test gesture recognition (robot mode)
python tests/test_gesture.py

# Test hardware connection (robot mode, from RPi)
ssh tars-pi "cd ~/tars && python tests/test_hardware.py"
```

## Development

See [docs/DEVELOPING_APPS.md](docs/DEVELOPING_APPS.md) for a comprehensive guide on creating TARS SDK apps.

### Adding Metrics
1. Emit `MetricsFrame` in your service/processor
2. `MetricsObserver` will capture it automatically
3. Metrics appear in the Gradio dashboard

### Adding Tools
1. Create function in `src/tools/`
2. Create schema with `create_*_schema()`
3. Register in `bot.py` or `tars_bot.py`
4. LLM can now call your tool

### Modifying UI
1. Edit `ui/app.py`
2. Gradio hot-reloads automatically
3. Access `metrics_store` for data

### Uninstalling

```bash
bash uninstall.sh
```

Removes virtual environment and optionally data/config files.

## Troubleshooting

### No metrics in Gradio UI
- Ensure bot is running (`bot.py` or `tars_bot.py`)
- Check WebRTC client is connected
- Verify at least one conversation turn completed

### Robot mode connection issues
- Check RPi is reachable: `ping <rpi-ip>`
- Verify tars_daemon is running on RPi
- Check gRPC port 50051 is open
- Review config.ini addresses

### Import errors
```bash
pip install -r requirements.txt
pip install gradio plotly  # For UI
```

### Audio issues (robot mode)
- Check RPi mic/speaker with `arecord`/`aplay`
- Verify WebRTC connection in logs
- Test with `tests/test_hardware.py`

## Contributing

Contributions welcome.

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Test with `python tests/gradio/test_gradio.py`
5. Commit with clear messages (see CLAUDE.md for style)
6. Push to your fork
7. Open a Pull Request

Code Style:
- Python: Follow PEP 8
- Add comments for complex logic
- Update docs for new features
- See CLAUDE.md for guidelines (concise, technical, no fluff)

## License

MIT License - see LICENSE file for details
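The README's "70% vector + 30% BM25" hybrid memory can be pictured with a small sketch. This is illustrative only, assuming both retrievers return scores normalized to [0, 1]; the function and field names below are hypothetical and are not the `HybridMemoryService` API.

```python
# Illustrative sketch of 70% vector + 30% BM25 score fusion
# (hypothetical names, not the actual HybridMemoryService code).
def hybrid_score(vector_score: float, bm25_score: float,
                 vector_weight: float = 0.7, bm25_weight: float = 0.3) -> float:
    """Combine normalized vector-similarity and BM25 scores into one rank score."""
    return vector_weight * vector_score + bm25_weight * bm25_score

def rank_memories(candidates: list[dict], limit: int = 3) -> list[dict]:
    """Sort candidate memories by fused score and keep the top `limit`."""
    return sorted(candidates,
                  key=lambda c: hybrid_score(c["vector"], c["bm25"]),
                  reverse=True)[:limit]

# Example: a strong keyword match can outrank a purely semantic one.
memories = [
    {"text": "User's dog is named Rex", "vector": 0.82, "bm25": 0.10},
    {"text": "User asked about Rex's vet", "vector": 0.70, "bm25": 0.95},
]
print(rank_memories(memories, limit=1)[0]["text"])  # "User asked about Rex's vet"
```

Weighting semantic similarity at 0.7 keeps recall robust to paraphrases, while the 0.3 BM25 term preserves exact names and rare keywords that embeddings often blur.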
app.json
ADDED

{
  "name": "tars-conversation-app",
  "version": "1.0.0",
  "description": "Real-time conversational AI with WebRTC, memory, and vision",
  "author": "TARS Project",
  "repository": "https://github.com/latishab/tars-conversation-app.git",
  "main": "tars_bot.py",
  "install_script": "install.sh",
  "uninstall_script": "uninstall.sh",
  "dependencies": {
    "python": ">=3.10",
    "system": [
      "portaudio19-dev",
      "ffmpeg",
      "build-essential",
      "python3-dev"
    ]
  },
  "environment": [
    "DEEPINFRA_API_KEY",
    "SPEECHMATICS_API_KEY",
    "DEEPGRAM_API_KEY",
    "ELEVENLABS_API_KEY"
  ],
  "configuration": {
    "file": "config.ini",
    "example": "config.ini.example",
    "env_file": ".env.local",
    "env_example": "env.example"
  },
  "ports": {
    "grpc": 50051,
    "http": 8765,
    "fastapi": 8080
  },
  "modes": [
    {
      "name": "robot",
      "description": "Connect to Pi hardware via gRPC",
      "command": "python tars_bot.py"
    },
    {
      "name": "browser",
      "description": "Browser-based WebRTC mode",
      "command": "python bot.py"
    }
  ],
  "services": {
    "dashboard": {
      "enabled": true,
      "command": "python ui/app.py",
      "port": 7860
    }
  }
}
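As a sketch of how a daemon-side installer might consume this manifest, the snippet below reads and validates the fields shown above. The required-field choices are assumptions for illustration; the real tars-daemon logic lives in its dashboard code.

```python
import json
from pathlib import Path

# Minimal manifest reader, assuming the app.json schema shown above
# (field requirements are illustrative, not the daemon's actual rules).
def load_manifest(app_dir: str) -> dict:
    manifest = json.loads((Path(app_dir) / "app.json").read_text())
    for key in ("name", "version", "install_script"):
        if key not in manifest:
            raise ValueError(f"app.json missing required field: {key}")
    return manifest

manifest = load_manifest(".")
print(f"{manifest['name']} v{manifest['version']}")
for mode in manifest.get("modes", []):
    print(f"  mode {mode['name']}: {mode['command']}")
```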
assets/audio/tars-clean-compressed.mp3
ADDED

version https://git-lfs.github.com/spec/v1
oid sha256:35e66e7ef9dfd3e64ed70fcdb32b220686d3ad4451af88bfa72a48563a85b120
size 289820
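The audio binary itself lives in Git LFS; the repository stores only the three-line pointer above (spec version, content hash, byte size). A minimal sketch of reading such a pointer, for illustration only (this is not the git-lfs implementation):

```python
from pathlib import Path

# Parse a Git LFS pointer file into its "key value" fields.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

# Expected output for the pointer above:
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:35e66e7e...', 'size': '289820'}
print(parse_lfs_pointer("assets/audio/tars-clean-compressed.mp3"))
```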
bot.py
ADDED

"""Bot pipeline setup and execution."""

import sys
from pathlib import Path

# Add src/ to Python path
sys.path.insert(0, str(Path(__file__).parent / "src"))

import asyncio
import json
import os
import logging
import uuid
import httpx

from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.frames.frames import (
    LLMRunFrame,
    TranscriptionFrame,
    InterimTranscriptionFrame,
    Frame,
    TranscriptionMessage,
    TranslationFrame,
    UserImageRawFrame,
    UserAudioRawFrame,
    UserImageRequestFrame,
)
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask, PipelineParams
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import (
    LLMContextAggregatorPair,
    LLMUserAggregatorParams
)
from pipecat.observers.turn_tracking_observer import TurnTrackingObserver
from pipecat.observers.loggers.user_bot_latency_log_observer import UserBotLatencyLogObserver
from pipecat.services.moondream.vision import MoondreamService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.llm_service import FunctionCallParams
from services.memory_hybrid import HybridMemoryService
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport

from loguru import logger

from config import (
    SPEECHMATICS_API_KEY,
    DEEPGRAM_API_KEY,
    ELEVENLABS_API_KEY,
    ELEVENLABS_VOICE_ID,
    DEEPINFRA_API_KEY,
    DEEPINFRA_BASE_URL,
    MEM0_API_KEY,
    get_fresh_config,
)
from services.factories import create_stt_service, create_tts_service
from processors import (
    SilenceFilter,
    InputAudioFilter,
    InterventionGating,
    VisualObserver,
    EmotionalStateMonitor,
)
from observers import (
    MetricsObserver,
    TranscriptionObserver,
    AssistantResponseObserver,
    TTSStateObserver,
    VisionObserver,
    DebugObserver,
    DisplayEventsObserver,
)
from character.prompts import (
    load_persona_ini,
    load_tars_json,
    build_tars_system_prompt,
    get_introduction_instruction,
)
from tools import (
    fetch_user_image,
    adjust_persona_parameter,
    execute_movement,
    capture_camera_view,
    create_fetch_image_schema,
    create_adjust_persona_schema,
    create_identity_schema,
    create_movement_schema,
    create_camera_capture_schema,
    get_persona_storage,
    get_crossword_hint,
    create_crossword_hint_schema,
)
from shared_state import metrics_store


# ============================================================================
# CUSTOM FRAME PROCESSORS
# ============================================================================

class IdentityUnifier(FrameProcessor):
    """
    Applies 'guest_ID' ONLY to specific user input frames.
    Leaves other frames untouched.
    """
    # Define the frame types that should have user_id set
    TARGET_FRAME_TYPES = (
        TranscriptionFrame,
        TranscriptionMessage,
        TranslationFrame,
        InterimTranscriptionFrame,
        UserImageRawFrame,
        UserAudioRawFrame,
        UserImageRequestFrame,
    )

    def __init__(self, target_user_id):
        super().__init__()
        self.target_user_id = target_user_id

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # 1. Handle internal state
        await super().process_frame(frame, direction)

        # 2. Only modify specific frame types
        if isinstance(frame, self.TARGET_FRAME_TYPES):
            try:
                frame.user_id = self.target_user_id
            except Exception:
                pass

        # 3. Push downstream
        await self.push_frame(frame, direction)


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

async def _cleanup_services(service_refs: dict):
    if service_refs.get("stt"):
        try:
            await service_refs["stt"].close()
            logger.info("STT service cleaned up")
        except Exception:
            pass
    if service_refs.get("tts"):
        try:
            await service_refs["tts"].close()
            logger.info("TTS service cleaned up")
        except Exception:
            pass


# ============================================================================
# MAIN BOT PIPELINE
# ============================================================================

async def run_bot(webrtc_connection):
    """Initialize and run the TARS bot pipeline."""
    logger.info("Starting bot pipeline for WebRTC connection...")

    # Load fresh configuration for this connection (allows runtime config updates)
    runtime_config = get_fresh_config()
    DEEPINFRA_MODEL = runtime_config['DEEPINFRA_MODEL']
    DEEPINFRA_GATING_MODEL = runtime_config['DEEPINFRA_GATING_MODEL']
    STT_PROVIDER = runtime_config['STT_PROVIDER']
    TTS_PROVIDER = runtime_config['TTS_PROVIDER']
    QWEN3_TTS_MODEL = runtime_config['QWEN3_TTS_MODEL']
    QWEN3_TTS_DEVICE = runtime_config['QWEN3_TTS_DEVICE']
    QWEN3_TTS_REF_AUDIO = runtime_config['QWEN3_TTS_REF_AUDIO']
    EMOTIONAL_MONITORING_ENABLED = runtime_config['EMOTIONAL_MONITORING_ENABLED']
    EMOTIONAL_SAMPLING_INTERVAL = runtime_config['EMOTIONAL_SAMPLING_INTERVAL']
    EMOTIONAL_INTERVENTION_THRESHOLD = runtime_config['EMOTIONAL_INTERVENTION_THRESHOLD']
    TARS_DISPLAY_URL = runtime_config['TARS_DISPLAY_URL']
    TARS_DISPLAY_ENABLED = runtime_config['TARS_DISPLAY_ENABLED']

    logger.info(f"Runtime config loaded - STT: {STT_PROVIDER}, LLM: {DEEPINFRA_MODEL}, TTS: {TTS_PROVIDER}, Emotional: {EMOTIONAL_MONITORING_ENABLED}")

    # Session initialization
    session_id = str(uuid.uuid4())[:8]
    client_id = f"guest_{session_id}"
    client_state = {"client_id": client_id}
    logger.info(f"Session started: {client_id}")

    service_refs = {"stt": None, "tts": None}

    try:
        # ====================================================================
        # TRANSPORT INITIALIZATION
        # ====================================================================
        # Note: STT providers handle their own turn detection:
        # - Speechmatics: SMART_TURN mode
        # - Deepgram: endpointing parameter (300ms silence detection)
        # - Deepgram Flux: built-in turn detection with ExternalUserTurnStrategies (deprecated)

        logger.info(f"Initializing transport with {STT_PROVIDER} turn detection...")

        transport_params = TransportParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            video_in_enabled=False,
            video_out_enabled=False,
            video_out_is_live=False,
        )

        pipecat_transport = SmallWebRTCTransport(
            webrtc_connection=webrtc_connection,
            params=transport_params,
        )

        logger.info("Transport initialized")

        # ====================================================================
        # SPEECH-TO-TEXT SERVICE
        # ====================================================================

        logger.info(f"Initializing {STT_PROVIDER} STT...")
        stt = None
        try:
            stt = create_stt_service(
                provider=STT_PROVIDER,
                speechmatics_api_key=SPEECHMATICS_API_KEY,
                deepgram_api_key=DEEPGRAM_API_KEY,
                language=Language.EN,
                enable_diarization=False,
            )
            service_refs["stt"] = stt

            # Log additional info for Deepgram
            if STT_PROVIDER == "deepgram":
                logger.info("Deepgram: 300ms endpointing for turn detection")
                logger.info("Deepgram: VAD events enabled for speech detection")

        except Exception as e:
            logger.error(f"Failed to initialize {STT_PROVIDER} STT: {e}", exc_info=True)
            return

        # ====================================================================
        # TEXT-TO-SPEECH SERVICE
        # ====================================================================

        try:
            tts = create_tts_service(
                provider=TTS_PROVIDER,
                elevenlabs_api_key=ELEVENLABS_API_KEY,
                elevenlabs_voice_id=ELEVENLABS_VOICE_ID,
                qwen_model=QWEN3_TTS_MODEL,
                qwen_device=QWEN3_TTS_DEVICE,
                qwen_ref_audio=QWEN3_TTS_REF_AUDIO,
            )
            service_refs["tts"] = tts
        except Exception as e:
            logger.error(f"Failed to initialize TTS service: {e}", exc_info=True)
            return

        # ====================================================================
        # LLM SERVICE & TOOLS
        # ====================================================================

        logger.info("Initializing LLM via DeepInfra...")
        llm = None
        try:
            llm = OpenAILLMService(
                api_key=DEEPINFRA_API_KEY,
                base_url=DEEPINFRA_BASE_URL,
                model=DEEPINFRA_MODEL
            )

            character_dir = os.path.join(os.path.dirname(__file__), "character")
            persona_params = load_persona_ini(os.path.join(character_dir, "persona.ini"))
            tars_data = load_tars_json(os.path.join(character_dir, "TARS.json"))
            system_prompt = build_tars_system_prompt(persona_params, tars_data)

            # Create tool schemas (these return FunctionSchema objects)
            fetch_image_tool = create_fetch_image_schema()
            persona_tool = create_adjust_persona_schema()
            identity_tool = create_identity_schema()
            crossword_hint_tool = create_crossword_hint_schema()
            movement_tool = create_movement_schema()
            camera_capture_tool = create_camera_capture_schema()

            # Pass FunctionSchema objects directly to standard_tools
            tools = ToolsSchema(
                standard_tools=[
                    fetch_image_tool,
                    persona_tool,
                    identity_tool,
                    crossword_hint_tool,
                    movement_tool,
                    camera_capture_tool,
                ]
            )
            messages = [system_prompt]
            context = LLMContext(messages, tools)

            llm.register_function("fetch_user_image", fetch_user_image)
            llm.register_function("adjust_persona_parameter", adjust_persona_parameter)
            llm.register_function("get_crossword_hint", get_crossword_hint)
            llm.register_function("execute_movement", execute_movement)
            llm.register_function("capture_camera_view", capture_camera_view)

            pipeline_unifier = IdentityUnifier(client_id)

            async def wrapped_set_identity(params: FunctionCallParams):
                name = params.arguments["name"]
                logger.info(f"Identity discovered: {name}")

                old_id = client_state["client_id"]
                new_id = f"user_{name.lower().replace(' ', '_')}"

                if old_id != new_id:
                    logger.info(f"Switching User ID: {old_id} -> {new_id}")
                    client_state["client_id"] = new_id

                    # Update the pipeline unifier to use new identity
                    pipeline_unifier.target_user_id = new_id
                    logger.info(f"Updated pipeline unifier with new ID: {new_id}")

                    # Update memory service with new user_id
                    if memory_service:
                        memory_service.user_id = new_id
                        logger.info(f"Updated memory service user_id to: {new_id}")

                    # Notify frontend of identity change
                    try:
                        if webrtc_connection and webrtc_connection.is_connected():
                            webrtc_connection.send_app_message({
                                "type": "identity_update",
                                "old_id": old_id,
                                "new_id": new_id,
                                "name": name
                            })
                            logger.info(f"Sent identity update to frontend: {new_id}")
                    except Exception as e:
                        logger.warning(f"Failed to send identity update to frontend: {e}")

                await params.result_callback(f"Identity updated to {name}.")

            llm.register_function("set_user_identity", wrapped_set_identity)
            logger.info(f"LLM initialized with model: {DEEPINFRA_MODEL}")

        except Exception as e:
            logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
            return

        # ====================================================================
        # VISION & GATING SERVICES
        # ====================================================================

        logger.info("Initializing Moondream vision service...")
        moondream = None
        try:
            moondream = MoondreamService(model="vikhyatk/moondream2", revision="2025-01-09")
            logger.info("Moondream vision service initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Moondream: {e}")
            return

        # ====================================================================
        # TARS DISPLAY - Note: Display control via gRPC in robot mode only
        # ====================================================================

        logger.info("TARS Display features available in robot mode (tars_bot.py)")
        tars_client = None

        logger.info("Initializing Visual Observer...")
        visual_observer = VisualObserver(
            vision_client=moondream,
            enable_face_detection=True,
            tars_client=tars_client
        )
        logger.info("Visual Observer initialized")

        logger.info("Initializing Emotional State Monitor...")
        emotional_monitor = EmotionalStateMonitor(
            vision_client=moondream,
            model="vikhyatk/moondream2",
            sampling_interval=EMOTIONAL_SAMPLING_INTERVAL,
            intervention_threshold=EMOTIONAL_INTERVENTION_THRESHOLD,
            enabled=EMOTIONAL_MONITORING_ENABLED,
            auto_intervene=False,  # Let gating layer handle intervention decisions
        )
        logger.info(f"Emotional State Monitor initialized (enabled: {EMOTIONAL_MONITORING_ENABLED})")
        logger.info("  Mode: Integrated with gating layer for smarter decisions")

        logger.info("Initializing Gating Layer...")
        gating_layer = InterventionGating(
            api_key=DEEPINFRA_API_KEY,
            base_url=DEEPINFRA_BASE_URL,
            model=DEEPINFRA_GATING_MODEL,
            visual_observer=visual_observer,
            emotional_monitor=emotional_monitor
        )
        logger.info("Gating Layer initialized with emotional state integration")

        # ====================================================================
        # MEMORY SERVICE
        # ====================================================================

        # Memory service: Hybrid search combining vector similarity (70%) and BM25 keyword matching (30%)
        # Optimized for voice AI with <50ms latency target
        logger.info("Initializing hybrid memory service...")
        memory_service = None
        try:
            memory_service = HybridMemoryService(
                user_id=client_id,
                db_path="./memory_data/memory.sqlite",
                search_limit=3,
                search_timeout_ms=100,  # Hybrid search needs ~60-80ms, allow buffer
                vector_weight=0.7,  # 70% semantic similarity
                bm25_weight=0.3,  # 30% keyword matching
                system_prompt_prefix="From our conversations:\n",
            )
            logger.info(f"Hybrid memory service initialized for {client_id}")
        except Exception as e:
            logger.error(f"Failed to initialize hybrid memory service: {e}")
            logger.info("  Continuing without memory service...")
            memory_service = None  # Continue without memory if it fails

        # ====================================================================
        # CONTEXT AGGREGATOR & PERSONA STORAGE
        # ====================================================================

        # Configure user turn aggregation
        # STT services (Speechmatics, Deepgram) handle turn detection internally
        user_params = LLMUserAggregatorParams(
            user_turn_stop_timeout=1.5
        )

        context_aggregator = LLMContextAggregatorPair(
            context,
            user_params=user_params
        )

        persona_storage = get_persona_storage()
        persona_storage["persona_params"] = persona_params
        persona_storage["tars_data"] = tars_data
        persona_storage["context_aggregator"] = context_aggregator

        # ====================================================================
        # LOGGING PROCESSORS
        # ====================================================================

        transcription_observer = TranscriptionObserver(
            webrtc_connection=webrtc_connection,
            client_state=client_state
        )
        assistant_observer = AssistantResponseObserver(webrtc_connection=webrtc_connection)
        tts_state_observer = TTSStateObserver(webrtc_connection=webrtc_connection)
        vision_observer = VisionObserver(webrtc_connection=webrtc_connection)
        display_events_observer = DisplayEventsObserver(tars_client=tars_client)

        # Create MetricsObserver (non-intrusive monitoring outside pipeline)
        metrics_observer = MetricsObserver(
            webrtc_connection=webrtc_connection,
            stt_service=stt
        )

        # Turn tracking observer (for debugging turn detection)
        turn_observer = TurnTrackingObserver()

        @turn_observer.event_handler("on_turn_started")
        async def on_turn_started(*args, **kwargs):
            turn_number = args[1] if len(args) > 1 else kwargs.get('turn_number', 0)
            logger.info(f"[TurnObserver] Turn STARTED: {turn_number}")
            # Notify metrics observer of new turn
            metrics_observer.start_turn(turn_number)

        @turn_observer.event_handler("on_turn_ended")
        async def on_turn_ended(*args, **kwargs):
            turn_number = args[1] if len(args) > 1 else kwargs.get('turn_number', 0)
            logger.info(f"[TurnObserver] Turn ENDED: {turn_number}")

        # ====================================================================
        # PIPELINE ASSEMBLY
        # ====================================================================

        logger.info("Creating audio/video pipeline...")

        pipeline = Pipeline([
            pipecat_transport.input(),
            # emotional_monitor,  # Real-time emotional state monitoring
            stt,
            pipeline_unifier,
            context_aggregator.user(),
            memory_service,  # Hybrid memory (70% vector + 30% BM25) for automatic recall/storage
            # gating_layer,  # AI decision system (with emotional state integration)
            llm,
            SilenceFilter(),
            tts,
            pipecat_transport.output(),
            context_aggregator.assistant(),
        ])

        # ====================================================================
        # EVENT HANDLERS
        # ====================================================================

        task_ref = {"task": None}

        @pipecat_transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info("Pipecat Client connected")
            try:
                if webrtc_connection.is_connected():
                    webrtc_connection.send_app_message({"type": "system", "message": "Connection established"})

                # Send service configuration info with provider and model details
                llm_display = DEEPINFRA_MODEL.split('/')[-1] if '/' in DEEPINFRA_MODEL else DEEPINFRA_MODEL

                if TTS_PROVIDER == "elevenlabs":
                    tts_display = "ElevenLabs: eleven_flash_v2_5"
                else:
                    tts_model = QWEN3_TTS_MODEL.split('/')[-1] if '/' in QWEN3_TTS_MODEL else QWEN3_TTS_MODEL
                    tts_display = f"Qwen3-TTS: {tts_model}"

                # Format STT provider name for display
                stt_display = {
                    "speechmatics": "Speechmatics",
                    "deepgram": "Deepgram Nova-2"
                }.get(STT_PROVIDER, STT_PROVIDER.capitalize())

                service_info = {
                    "stt": stt_display,
                    "memory": "Hybrid Search (SQLite)",
                    "llm": f"DeepInfra: {llm_display}",
                    "tts": tts_display
                }

                # Store in shared state for Gradio UI
                metrics_store.set_service_info(service_info)

                # Send via WebRTC
                webrtc_connection.send_app_message({
                    "type": "service_info",
                    **service_info
                })
                logger.info(f"Sent service info to frontend: STT={stt_display}, LLM={llm_display}, TTS={tts_display}")
            except Exception as e:
                logger.error(f"Error sending service info: {e}")

            if task_ref["task"]:
                verbosity = persona_params.get("verbosity", 10) if persona_params else 10
                intro_instruction = get_introduction_instruction(client_state['client_id'], verbosity)

                if context and hasattr(context, "messages"):
                    context.messages.append(intro_instruction)

                logger.info("Waiting for pipeline to warm up...")
                await asyncio.sleep(2.0)

                logger.info("Queueing initial LLM greeting...")
                await task_ref["task"].queue_frames([LLMRunFrame()])

        @pipecat_transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info("Pipecat Client disconnected")
            if task_ref["task"]:
                await task_ref["task"].cancel()
            await _cleanup_services(service_refs)

        # ====================================================================
        # PIPELINE EXECUTION
        # ====================================================================

        # Enable built-in Pipecat metrics for latency tracking
        user_bot_latency_observer = UserBotLatencyLogObserver()

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                enable_metrics=True,  # Enable performance metrics (TTFB, latency)
                enable_usage_metrics=True,  # Enable LLM/TTS usage metrics
                report_only_initial_ttfb=False,  # Report all TTFB measurements
            ),
            observers=[
                turn_observer,
                metrics_observer,
                transcription_observer,
                assistant_observer,
                tts_state_observer,
                vision_observer,
                display_events_observer,  # Send events to TARS display
                user_bot_latency_observer,  # Measures total user-to-bot response time
            ],  # Non-intrusive monitoring
        )
        task_ref["task"] = task
        runner = PipelineRunner(handle_sigint=False)

        logger.info("Starting pipeline runner...")

        try:
            await runner.run(task)
        except Exception:
            raise
        finally:
            await _cleanup_services(service_refs)

    except Exception as e:
        logger.error(f"Error in bot pipeline: {e}", exc_info=True)
    finally:
        await _cleanup_services(service_refs)
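`run_bot()` expects an already-negotiated WebRTC connection; the README says `pipecat_service.py` provides the FastAPI signaling that produces one. A rough sketch of that wiring is below. It is a hypothetical outline loosely based on Pipecat's SmallWebRTC examples, not this repo's actual `pipecat_service.py`; the connection-class import path and method names are assumptions that may differ across pipecat versions.

```python
# Hypothetical sketch of wiring run_bot() into a FastAPI /api/offer endpoint.
# The SmallWebRTCConnection import path and initialize()/get_answer() calls
# follow Pipecat's SmallWebRTC examples and may not match the pinned version.
import asyncio
from fastapi import FastAPI
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection

from bot import run_bot

app = FastAPI()

@app.post("/api/offer")
async def offer(request: dict):
    connection = SmallWebRTCConnection()
    # Complete SDP negotiation with the browser's offer.
    await connection.initialize(sdp=request["sdp"], type=request["type"])
    # Launch the full TARS pipeline for this peer without blocking signaling.
    asyncio.create_task(run_bot(connection))
    return connection.get_answer()
```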
config.ini.example
ADDED

[LLM]
# Available models: Any DeepInfra-supported model
# Examples: openai/gpt-oss-20b, meta-llama/Llama-3.3-70B-Instruct-Turbo, meta-llama/Llama-3.2-3B-Instruct
model = openai/gpt-oss-20b

# Gating model for intervention decisions (smaller/faster model recommended)
gating_model = meta-llama/Llama-3.2-3B-Instruct

[STT]
# Available providers: speechmatics, deepgram, deepgram-flux
# - speechmatics: Speechmatics with SMART_TURN detection
# - deepgram: Deepgram Nova-2 with endpoint detection
# - deepgram-flux: Deepgram Flux with built-in turn detection (recommended)
provider = deepgram-flux

[TTS]
# Available providers: elevenlabs, qwen3
provider = qwen3

# Qwen3-TTS Configuration (only used if provider = qwen3)
# Available models: Qwen/Qwen3-TTS-12Hz-0.6B-Base, Qwen/Qwen3-TTS-12Hz-1.7B-Base
qwen3_model = Qwen/Qwen3-TTS-12Hz-0.6B-Base
# Available devices: mps (Mac), cuda (NVIDIA), cpu
qwen3_device = mps
# Reference audio file for voice cloning (relative to project root)
qwen3_ref_audio = assets/audio/tars-clean-compressed.mp3

[Emotional]
# Enable real-time emotional state monitoring via video
enabled = true
# How often to sample video frames (in seconds)
sampling_interval = 3.0
# How many consecutive negative states before intervention
intervention_threshold = 2

[Connection]
# Transport mode: "robot" (aiortc WebRTC to RPi) or "browser" (SmallWebRTC for browser)
mode = robot
# Raspberry Pi WebRTC server URL (Tailscale or local network IP)
rpi_url = http://100.115.193.41:8001
# Auto-connect to RPi on startup (only for robot mode)
auto_connect = true
# Delay between reconnection attempts (seconds)
reconnect_delay = 5
# Maximum reconnection attempts (0 = infinite)
max_reconnect_attempts = 0

[Display]
# Enable TARS Raspberry Pi display integration (HTTP commands)
enabled = true
# URL of TARS display API (Tailscale or local network IP)
tars_url = http://100.115.193.41:8001
docs/DAEMON_INTEGRATION.md
ADDED
@@ -0,0 +1,393 @@
# Daemon Dashboard Integration

Guide for integrating tars-conversation-app with tars-daemon dashboard app management.

## Overview

The tars-daemon dashboard should provide install/uninstall buttons for managing TARS apps like this one.

## App Discovery

The daemon scans for apps with `app.json` manifest files:

```python
import json
from pathlib import Path

def discover_apps(apps_directory="/home/mac/tars-apps"):
    """Discover all TARS apps with manifests"""
    apps = []
    apps_dir = Path(apps_directory)

    for app_path in apps_dir.iterdir():
        manifest_path = app_path / "app.json"
        if manifest_path.exists():
            with open(manifest_path) as f:
                manifest = json.load(f)
            apps.append({
                "path": str(app_path),
                "manifest": manifest,
                "installed": (app_path / "venv").exists()
            })

    return apps
```

## Installation Flow

When the user clicks the "Install" button:

```python
import json
import subprocess
from pathlib import Path

def install_app(app_path):
    """Install a TARS app"""
    app_dir = Path(app_path)
    manifest_path = app_dir / "app.json"

    # Read manifest
    with open(manifest_path) as f:
        manifest = json.load(f)

    # Get install script
    install_script = manifest.get("install_script", "install.sh")
    script_path = app_dir / install_script

    if not script_path.exists():
        raise FileNotFoundError(f"Install script not found: {script_path}")

    # Run installation
    result = subprocess.run(
        ["bash", str(script_path)],
        cwd=str(app_dir),
        capture_output=True,
        text=True
    )

    return {
        "success": result.returncode == 0,
        "stdout": result.stdout,
        "stderr": result.stderr
    }
```

## Uninstallation Flow

When the user clicks the "Uninstall" button:

```python
def uninstall_app(app_path):
    """Uninstall a TARS app"""
    app_dir = Path(app_path)
    manifest_path = app_dir / "app.json"

    # Read manifest
    with open(manifest_path) as f:
        manifest = json.load(f)

    # Get uninstall script
    uninstall_script = manifest.get("uninstall_script", "uninstall.sh")
    script_path = app_dir / uninstall_script

    if not script_path.exists():
        raise FileNotFoundError(f"Uninstall script not found: {script_path}")

    # Run uninstallation
    result = subprocess.run(
        ["bash", str(script_path)],
        cwd=str(app_dir),
        capture_output=True,
        text=True
    )

    return {
        "success": result.returncode == 0,
        "stdout": result.stdout,
        "stderr": result.stderr
    }
```

## Dashboard UI (Gradio Example)

```python
import gradio as gr
from pathlib import Path

def get_app_status(app_path):
    """Check if app is installed"""
    return (Path(app_path) / "venv").exists()

def create_app_tab():
    """Create app management tab in dashboard"""

    # Discover apps
    apps = discover_apps()

    with gr.Tab("Apps"):
        for app in apps:
            manifest = app["manifest"]

            with gr.Row():
                gr.Markdown(f"### {manifest['name']}")
                gr.Markdown(manifest.get("description", ""))

            with gr.Row():
                gr.Markdown(f"**Version:** {manifest.get('version', 'unknown')}")
                status = "Installed" if app["installed"] else "Not Installed"
                gr.Markdown(f"**Status:** {status}")

            with gr.Row():
                install_btn = gr.Button(
                    "Install",
                    visible=not app["installed"]
                )
                uninstall_btn = gr.Button(
                    "Uninstall",
                    visible=app["installed"]
                )
            output = gr.Textbox(
                label="Output",
                lines=5,
                max_lines=10
            )

            # Install handler
            install_btn.click(
                fn=lambda path=app["path"]: install_app(path),
                outputs=output
            )

            # Uninstall handler
            uninstall_btn.click(
                fn=lambda path=app["path"]: uninstall_app(path),
                outputs=output
            )

            gr.Markdown("---")

# Add to dashboard
with gr.Blocks() as dashboard:
    create_app_tab()

dashboard.launch()
```

## Recommended Directory Structure

```
/home/mac/
├── tars-daemon/          # Main daemon
│   ├── tars_daemon.py
│   ├── dashboard.py      # Gradio dashboard with app management
│   └── app_manager.py    # App discovery and management
│
└── tars-apps/            # Apps directory
    ├── tars-conversation-app/
    │   ├── app.json      # Manifest
    │   ├── install.sh    # Install script
    │   ├── uninstall.sh  # Uninstall script
    │   └── ...
    │
    └── another-app/
        ├── app.json
        └── ...
```

## Environment Variables

Apps should auto-detect their deployment mode:

```python
# In app configuration
import os

def get_grpc_address():
    """Auto-detect if running on Pi or remotely"""
    # Check if on Raspberry Pi
    try:
        with open("/proc/cpuinfo") as f:
            if "Raspberry Pi" in f.read():
                return "localhost:50051"  # Local daemon
    except FileNotFoundError:
        pass

    # Remote connection
    return os.getenv("RPI_GRPC", "100.84.133.74:50051")
```

## Installation Validation

The daemon should validate apps before installation:

```python
def validate_app(app_path):
    """Validate app before installation"""
    app_dir = Path(app_path)
    errors = []

    # Check manifest exists
    manifest_path = app_dir / "app.json"
    if not manifest_path.exists():
        errors.append("Missing app.json manifest")
        return errors

    # Read manifest
    with open(manifest_path) as f:
        manifest = json.load(f)

    # Check required fields
    required = ["name", "version", "install_script"]
    for field in required:
        if field not in manifest:
            errors.append(f"Missing required field: {field}")

    # Check scripts exist
    install_script = app_dir / manifest.get("install_script", "install.sh")
    if not install_script.exists():
        errors.append(f"Install script not found: {install_script}")

    # Check Python version
    if "dependencies" in manifest:
        py_version = manifest["dependencies"].get("python", "")
        if py_version:
            # Validate version string format
            import re
            if not re.match(r">=?\d+\.\d+", py_version):
                errors.append(f"Invalid Python version: {py_version}")

    return errors
```

## Running Apps

After installation, provide run buttons:

```python
def run_app(app_path, mode="robot"):
    """Run an installed app"""
    app_dir = Path(app_path)
    manifest_path = app_dir / "app.json"

    with open(manifest_path) as f:
        manifest = json.load(f)

    # Get command for mode
    modes = manifest.get("modes", [])
    command = None

    for m in modes:
        if m["name"] == mode:
            command = m["command"]
            break

    if not command:
        # Fallback to main
        command = f"python {manifest['main']}"

    # Run with the app's venv Python interpreter
    venv_python = app_dir / "venv" / "bin" / "python"

    subprocess.Popen(
        [str(venv_python)] + command.split()[1:],  # drop the leading "python"
        cwd=str(app_dir)
    )
```

## Security Considerations

1. **Script Validation** - Verify scripts don't contain malicious commands (a minimal check is sketched below)
2. **Sandboxing** - Consider running installations in containers
3. **User Permissions** - Require confirmation before installation
4. **API Keys** - Warn users to configure API keys before running

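For point 1, one lightweight approach is a pattern scan that flags obviously dangerous shell constructs for manual review before the daemon executes an install script. This is only an illustrative heuristic (the pattern list here is an example, not part of the daemon) and is no substitute for real sandboxing:

```python
import re
from pathlib import Path

# Example patterns that usually warrant a manual review before running a script.
SUSPICIOUS_PATTERNS = [
    r"rm\s+-rf\s+/\s",         # recursive delete at the filesystem root
    r"curl[^|\n]*\|\s*bash",    # piping a remote script straight into bash
    r"wget[^|\n]*\|\s*sh",
    r"chmod\s+777",
    r"/etc/passwd",
]

def scan_install_script(script_path):
    """Return (line number, line) pairs that match a suspicious pattern."""
    findings = []
    text = Path(script_path).read_text(errors="replace")
    for lineno, line in enumerate(text.splitlines(), start=1):
        for pattern in SUSPICIOUS_PATTERNS:
            if re.search(pattern, line):
                findings.append((lineno, line.strip()))
    return findings
```

A dashboard could surface these findings next to the Install button and require explicit confirmation when the list is non-empty.
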
## Example Dashboard Integration

```python
# In tars-daemon/dashboard.py

import gradio as gr
from app_manager import discover_apps, install_app, uninstall_app

def create_dashboard():
    with gr.Blocks() as dashboard:
        gr.Markdown("# TARS Daemon Dashboard")

        with gr.Tabs():
            # Hardware tab
            with gr.Tab("Hardware"):
                gr.Markdown("Robot hardware controls...")

            # Apps tab
            with gr.Tab("Apps"):
                apps = discover_apps("/home/mac/tars-apps")

                for app in apps:
                    manifest = app["manifest"]

                    with gr.Accordion(manifest["name"], open=False):
                        gr.Markdown(manifest.get("description", ""))
                        gr.JSON(manifest, label="Manifest")

                        with gr.Row():
                            install_btn = gr.Button(
                                "Install",
                                visible=not app["installed"]
                            )
                            uninstall_btn = gr.Button(
                                "Uninstall",
                                visible=app["installed"]
                            )
                            run_btn = gr.Button(
                                "Run",
                                visible=app["installed"]
                            )

                        output = gr.Textbox(label="Output", lines=10)

                        # Event handlers
                        install_btn.click(
                            fn=lambda p=app["path"]: install_app(p),
                            outputs=output
                        ).then(
                            fn=lambda: gr.update(visible=False),
                            outputs=install_btn
                        ).then(
                            # One update per output component
                            fn=lambda: (gr.update(visible=True), gr.update(visible=True)),
                            outputs=[uninstall_btn, run_btn]
                        )

            # Logs tab
            with gr.Tab("Logs"):
                gr.Markdown("System logs...")

    return dashboard

if __name__ == "__main__":
    dashboard = create_dashboard()
    dashboard.launch(server_name="0.0.0.0", server_port=7860)
```

## Testing Installation

From the Pi:

```bash
# Test install
cd ~/tars-apps/tars-conversation-app
bash install.sh

# Verify
ls -la venv/
source venv/bin/activate
python -c "import pipecat; print('OK')"

# Test uninstall
bash uninstall.sh
```

## Next Steps

1. Implement app discovery in tars-daemon
2. Add Apps tab to dashboard
3. Create app_manager.py module
4. Test with tars-conversation-app
5. Document for other developers
docs/DASHBOARD_UPDATE_SUMMARY.md
ADDED
@@ -0,0 +1,218 @@
# Dashboard Update Summary

App management functionality added to the tars-daemon dashboard.

## Changes Made

### Backend (tars-daemon)

**File: dashboard/backend/routes/apps.py**
- Implemented app discovery via app.json manifests
- Scans ~/tars-apps/ directory for apps
- Installs apps using the install.sh script
- Uninstalls apps using the uninstall.sh script
- Status detection via venv/ directory
- Start/stop app processes
- Logs endpoint

**File: dashboard/backend/routes/__init__.py**
- Added apps module to imports and exports

**File: dashboard/backend/server.py**
- Added apps router at /api/apps/*

### Frontend (tars-daemon)

**File: dashboard/frontend/src/pages/AppStore.jsx**
- Complete rewrite with functional UI
- Install/Uninstall buttons
- Start/Stop controls
- Real-time status updates (5s polling)
- Loading states and error handling
- Success/error alerts

**File: dashboard/frontend/src/components/ui/badge.jsx**
- New component for status badges

**File: dashboard/frontend/src/components/ui/alert.jsx**
- New component for notifications

### App Setup

**Location: ~/tars-apps/tars-conversation-app/**
- Copied from Mac to Pi
- Contains app.json manifest
- Has install.sh and uninstall.sh scripts

## API Endpoints

```
GET  /api/apps/list        - List all apps with status
POST /api/apps/install     - Install app using install.sh
POST /api/apps/uninstall   - Uninstall app using uninstall.sh
POST /api/apps/start       - Start app process
POST /api/apps/stop        - Stop app process
GET  /api/apps/logs/{name} - Get app logs
```

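The route list above gives the paths but not the payloads. As a sketch, a call to the install endpoint might look like the following; the JSON body (an app `name` field) is a guess for illustration, so check `routes/apps.py` for the actual request schema:

```python
import json
import urllib.request

# Hypothetical payload — the real field names are defined in routes/apps.py.
payload = json.dumps({"name": "tars-conversation-app"}).encode()

req = urllib.request.Request(
    "http://100.84.133.74:8000/api/apps/install",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status, resp.read().decode())
```
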
## Testing

### 1. Restart Dashboard

```bash
ssh tars-pi
cd ~/tars-daemon
pkill -f start_dashboard.py
venv/bin/python start_dashboard.py
```

Or use systemd if configured:
```bash
sudo systemctl restart tars-dashboard
```

### 2. Verify Backend

```bash
# Test app discovery
curl http://100.84.133.74:8000/api/apps/list

# Should return JSON with tars-conversation-app
```

### 3. Open Dashboard

Navigate to: http://100.84.133.74:8000

Click the "Apps" or "App Store" tab (the label depends on the navigation)

### 4. Test Installation

1. Click the "Install" button for tars-conversation-app
2. Wait for installation (may take 5-10 minutes)
3. Status should change to "Installed"
4. Start/Stop buttons should appear

### 5. Test Uninstallation

1. Stop the app if it is running
2. Click the uninstall button (trash icon)
3. Confirm in the alert
4. Status returns to "Not Installed"

## Expected Behavior

### App Card Display

```
┌───────────────────────────────────────┐
│ tars-conversation-app     [Installed] │
│ Real-time conversational AI...        │
│                                       │
│ Version: 1.0.0                        │
│ Author: TARS Project                  │
│                                       │
│ [Start]  [🗑️]                         │
│ ~/tars-apps/tars-conversation-app     │
└───────────────────────────────────────┘
```

When installing:
```
[Installing...] (spinner)
```

When running:
```
[Stop]  [🗑️]
```

## Troubleshooting

### Dashboard won't start

Check logs:
```bash
tail -50 /tmp/dashboard.log
```

Common issues:
- Missing fastapi: `pip install fastapi uvicorn`
- Import errors: Check routes/__init__.py includes apps
- Port 8000 in use: `lsof -i :8000`

### Apps not discovered

Check:
```bash
ls -la ~/tars-apps/tars-conversation-app/app.json
```

Verify manifest:
```bash
cat ~/tars-apps/tars-conversation-app/app.json | python3 -m json.tool
```

### Installation fails

Check install script:
```bash
bash ~/tars-apps/tars-conversation-app/install.sh
```

Check the logs shown in the dashboard after clicking the install button.

### Frontend not updated

Rebuild:
```bash
cd ~/tars-daemon/dashboard/frontend
npm run build
```

Hard-refresh the browser: Ctrl+Shift+R

## File Locations

```
tars-daemon/
└── dashboard/
    ├── backend/
    │   ├── server.py            # Updated: added apps router
    │   └── routes/
    │       ├── __init__.py      # Updated: export apps
    │       └── apps.py          # NEW: app management
    └── frontend/
        └── src/
            ├── pages/
            │   └── AppStore.jsx # Updated: full UI
            └── components/ui/
                ├── badge.jsx    # NEW
                └── alert.jsx    # NEW

tars-apps/
└── tars-conversation-app/
    ├── app.json                 # Manifest
    ├── install.sh               # Installation script
    ├── uninstall.sh             # Uninstall script
    └── ...                      # App files
```

## Next Steps

1. Restart dashboard on Pi
2. Test in browser
3. Install tars-conversation-app via UI
4. Verify installation works
5. Add more apps to ~/tars-apps/ as needed

## Adding More Apps

To add new apps:

1. Create the app in ~/tars-apps/
2. Add app.json manifest (see docs/DEVELOPING_APPS.md)
3. Create install.sh and uninstall.sh
4. Refresh the dashboard - the app appears automatically

No code changes needed for new apps.
docs/DEVELOPING_APPS.md
ADDED
@@ -0,0 +1,400 @@
# Developing Apps with TARS SDK

Guide for creating TARS-compatible applications that integrate with the tars-daemon.

## Architecture Overview

TARS apps connect to the tars-daemon running on the Raspberry Pi:

```
[Your App] ←── gRPC (50051) ──→ [tars-daemon] ──→ [Hardware]
                                                   ├── Motors
                                                   ├── Camera
                                                   └── Display
```

## App Structure

### Minimal Structure

```
your-app/
├── app.json             # App manifest (required)
├── requirements.txt     # Python dependencies
├── config.ini.example   # Configuration template
├── env.example          # Environment variables template
├── install.sh           # Installation script
├── uninstall.sh         # Cleanup script
├── main.py              # Entry point
└── README.md            # Documentation
```

## App Manifest (app.json)

Required file for daemon dashboard integration:

```json
{
  "name": "tars-conversation-app",
  "version": "1.0.0",
  "description": "Real-time conversational AI with WebRTC",
  "author": "Your Name",
  "repository": "https://github.com/yourusername/your-app.git",
  "main": "tars_bot.py",
  "install_script": "install.sh",
  "uninstall_script": "uninstall.sh",
  "dependencies": {
    "python": ">=3.10",
    "system": ["portaudio19-dev", "ffmpeg"]
  },
  "environment": [
    "DEEPINFRA_API_KEY",
    "SPEECHMATICS_API_KEY"
  ],
  "configuration": {
    "file": "config.ini",
    "example": "config.ini.example"
  },
  "ports": {
    "grpc": 50051,
    "http": 8765
  }
}
```

## Configuration System

### Environment Variables (.env.local)

Store secrets only; never commit this file:

```bash
# API Keys
DEEPINFRA_API_KEY=your_key_here
SPEECHMATICS_API_KEY=your_key_here
ELEVENLABS_API_KEY=your_key_here
```

### User Configuration (config.ini)

Runtime settings users can modify:

```ini
[Connection]
mode = robot
rpi_url = http://100.84.133.74:8765
rpi_grpc = 100.84.133.74:50051
auto_connect = false

[LLM]
model = openai/gpt-oss-20b
gating_model = meta-llama/Llama-3.2-3B-Instruct
```

### Loading Configuration

```python
from pathlib import Path
from configparser import ConfigParser
from dotenv import load_dotenv
import os

# Load secrets
env_local = Path(__file__).parent / ".env.local"
load_dotenv(env_local, override=True)

# Load config
config = ConfigParser()
config.read("config.ini")

# Runtime reload without restart
def get_fresh_config():
    config = ConfigParser()
    config.read("config.ini")
    return config
```

## Connecting to tars-daemon

### gRPC Client

```python
import os
import grpc
from tars_sdk import TarsClient

# Singleton client
_client = None

def get_tars_client():
    global _client
    if _client is None:
        grpc_address = os.getenv("RPI_GRPC", "100.84.133.74:50051")
        channel = grpc.insecure_channel(grpc_address)
        _client = TarsClient(channel)
    return _client

# Use the client
client = get_tars_client()
client.execute_movement("wave_right")
client.set_emotion("happy")
```

### Deployment Mode Detection

Auto-detect whether the app is running locally on the Pi or remotely:

```python
import os

def detect_deployment_mode():
    # Check if running on Raspberry Pi
    try:
        with open("/proc/cpuinfo", "r") as f:
            if "Raspberry Pi" in f.read():
                return "local"
    except FileNotFoundError:
        pass

    # Check if daemon running on localhost
    try:
        import grpc
        channel = grpc.insecure_channel("localhost:50051")
        grpc.channel_ready_future(channel).result(timeout=1)
        return "local"
    except Exception:
        return "remote"

def get_grpc_address():
    if detect_deployment_mode() == "local":
        return "localhost:50051"
    return os.getenv("RPI_GRPC", "100.84.133.74:50051")
```

## Installation Scripts

### install.sh

```bash
#!/bin/bash
set -e

APP_NAME="your-app"
APP_DIR="$HOME/$APP_NAME"

echo "Installing $APP_NAME..."

# Check Python version
python3 --version | grep -q "3.1[0-9]" || {
    echo "Error: Python 3.10+ required"
    exit 1
}

# Install system dependencies
sudo apt-get update
sudo apt-get install -y portaudio19-dev ffmpeg

# Create virtual environment
python3 -m venv "$APP_DIR/venv"
source "$APP_DIR/venv/bin/activate"

# Install Python dependencies
pip install --upgrade pip
pip install -r requirements.txt

# Setup configuration
if [ ! -f config.ini ]; then
    cp config.ini.example config.ini
    echo "Created config.ini - please configure before running"
fi

if [ ! -f .env.local ]; then
    cp env.example .env.local
    echo "Created .env.local - please add API keys"
fi

echo "Installation complete!"
echo "Next steps:"
echo "1. Edit .env.local with your API keys"
echo "2. Edit config.ini if needed"
echo "3. Run: python main.py"
```

### uninstall.sh

```bash
#!/bin/bash
set -e

APP_NAME="your-app"
APP_DIR="$HOME/$APP_NAME"

echo "Uninstalling $APP_NAME..."

# Stop running processes
pkill -f "python.*$APP_NAME" || true

# Remove virtual environment
rm -rf "$APP_DIR/venv"

# Remove generated data (optional)
read -p "Remove data directories? (y/N) " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
    rm -rf chroma_memory memory_data
fi

echo "Uninstall complete!"
```

## Best Practices

### 1. Project Structure

- Keep source code in `src/` directory
- Separate configuration from code
- Provide example configs (never commit secrets)
- Include tests in `tests/` directory

### 2. Configuration

- Use `.env.local` for secrets (gitignore it)
- Use `config.ini` for user settings (gitignore it)
- Provide `.example` templates
- Support runtime config reload when possible

### 3. Dependencies

- Pin major versions in requirements.txt
- Document system dependencies in README
- Test on a fresh Pi OS installation
- Keep dependencies minimal

### 4. Error Handling

- Validate configuration on startup
- Provide clear error messages
- Test the connection to the daemon before running
- Degrade gracefully if hardware is unavailable

### 5. Performance

- Use gRPC for low-latency commands (~5-10ms)
- Batch operations when possible
- Monitor resource usage on the Pi
- Optimize for Raspberry Pi 4 (4GB RAM)

### 6. Testing

- Test on actual hardware
- Provide test scripts for gestures/expressions
- Document expected behavior
- Include connection tests

## Example: Minimal TARS App

```python
# main.py
import grpc
from tars_sdk import TarsClient
from pathlib import Path
from dotenv import load_dotenv
import os

# Load configuration
load_dotenv(Path(__file__).parent / ".env.local")

# Connect to daemon
grpc_address = os.getenv("RPI_GRPC", "100.84.133.74:50051")
channel = grpc.insecure_channel(grpc_address)
client = TarsClient(channel)

# Test connection
try:
    status = client.get_robot_status()
    print(f"Connected to TARS: {status}")
except Exception as e:
    print(f"Connection failed: {e}")
    exit(1)

# Use robot
client.set_emotion("happy")
client.execute_movement("wave_right")
print("TARS says hello!")
```

## Integration with Claude Code

Structure your app for easy AI-assisted development:

1. **Clear directory structure** - AI can navigate easily
2. **Documented configuration** - AI understands settings
3. **Type hints** - AI provides better suggestions
4. **Docstrings** - AI understands intent
5. **README.md** - AI reads project context

See CLAUDE.md for project-specific guidelines.

## Common Patterns

### Startup Validation

```python
def validate_startup():
    """Check all requirements before running"""
    errors = []

    # Check API keys
    if not os.getenv("DEEPINFRA_API_KEY"):
        errors.append("Missing DEEPINFRA_API_KEY in .env.local")

    # Check config file
    if not Path("config.ini").exists():
        errors.append("config.ini not found")

    # Test daemon connection
    try:
        client = get_tars_client()
        client.get_robot_status()
    except Exception as e:
        errors.append(f"Cannot connect to daemon: {e}")

    if errors:
        print("Startup validation failed:")
        for error in errors:
            print(f"  - {error}")
        exit(1)
```

### Graceful Shutdown

```python
import signal
import sys

def signal_handler(sig, frame):
    """Clean shutdown on Ctrl+C"""
    print("\nShutting down...")

    # Reset robot state
    try:
        client = get_tars_client()
        client.set_emotion("neutral")
        client.set_eye_state(True, True)
    except Exception:
        pass

    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
```

## Resources

- tars-daemon: `~/tars-daemon` on Pi
- TARS SDK: install via `pip install tars-sdk`
- Example apps: this repository (tars-conversation-app)
- Pi access: `ssh tars-pi` (100.84.133.74)

## Support

- Check daemon status: `systemctl status tars-daemon`
- View daemon logs: `journalctl -u tars-daemon -f`
- Test gRPC connection: `grpcurl -plaintext 100.84.133.74:50051 list`
docs/INSTALLATION_GUIDE.md
ADDED
@@ -0,0 +1,264 @@
# Installation Guide

Quick reference for installing tars-conversation-app on a Raspberry Pi.

## Prerequisites

- Raspberry Pi 4 (4GB RAM recommended)
- Raspberry Pi OS (Bullseye or later)
- Python 3.10 or higher
- Internet connection

## From Dashboard (Recommended)

Once tars-daemon implements app management:

1. Open the tars-daemon dashboard at `http://100.84.133.74:7860`
2. Navigate to the "Apps" tab
3. Find "tars-conversation-app"
4. Click the "Install" button
5. Wait for installation to complete
6. Configure API keys in `.env.local`
7. Adjust settings in `config.ini` if needed
8. Click "Run" to start

## Manual Installation (SSH)

### Step 1: Clone Repository

```bash
ssh tars-pi
cd ~
git clone https://github.com/latishab/tars-conversation-app.git
cd tars-conversation-app
```

### Step 2: Run Installer

```bash
bash install.sh
```

The installer will:
- Check the Python version (requires 3.10+)
- Install system dependencies (portaudio, ffmpeg)
- Create a Python virtual environment
- Install all Python packages
- Create config files from templates

This takes 5-10 minutes on the first run.

### Step 3: Configure

Edit the API keys:
```bash
nano .env.local
```

Add your keys:
```bash
DEEPINFRA_API_KEY=your_key_here
SPEECHMATICS_API_KEY=your_key_here
# or
DEEPGRAM_API_KEY=your_key_here
```

Edit settings (optional):
```bash
nano config.ini
```

### Step 4: Run

Activate the virtual environment:
```bash
source venv/bin/activate
```

Run in robot mode:
```bash
python tars_bot.py
```

Or run the dashboard:
```bash
python ui/app.py
```

## Verification

Check the installation:
```bash
# Activate venv
source ~/tars-conversation-app/venv/bin/activate

# Test imports
python -c "import pipecat; print('Pipecat OK')"
python -c "from tars_sdk import TarsClient; print('TARS SDK OK')"

# Test daemon connection
python -c "
import grpc
from tars_sdk import TarsClient
channel = grpc.insecure_channel('localhost:50051')
client = TarsClient(channel)
print('Daemon connection OK')
"
```

## Uninstallation

From the dashboard:
1. Navigate to the "Apps" tab
2. Find "tars-conversation-app"
3. Click the "Uninstall" button
4. Choose whether to keep data/config

Manually:
```bash
cd ~/tars-conversation-app
bash uninstall.sh
```

## Troubleshooting

### Installation fails

Check the Python version:
```bash
python3 --version
# Should be 3.10 or higher
```

Check disk space:
```bash
df -h
# Need at least 2GB free
```

Check internet connectivity:
```bash
ping google.com
```

### Dependencies fail to install

Update package lists:
```bash
sudo apt-get update
sudo apt-get upgrade
```

Reinstall system dependencies:
```bash
sudo apt-get install -y portaudio19-dev ffmpeg build-essential python3-dev
```

### Virtual environment issues

Remove and recreate:
```bash
rm -rf venv
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

### Configuration not found

Recreate the config files from templates:
```bash
cp config.ini.example config.ini
cp env.example .env.local
```

### Cannot connect to daemon

Check that the daemon is running:
```bash
systemctl status tars-daemon
```

Test the gRPC port:
```bash
nc -zv localhost 50051
```

Check the logs:
```bash
journalctl -u tars-daemon -f
```

## Running in Background

Use a systemd service:

```bash
# Create service file
sudo nano /etc/systemd/system/tars-conversation.service
```

Add:
```ini
[Unit]
Description=TARS Conversation App
After=network.target tars-daemon.service
Requires=tars-daemon.service

[Service]
Type=simple
User=mac
WorkingDirectory=/home/mac/tars-conversation-app
ExecStart=/home/mac/tars-conversation-app/venv/bin/python tars_bot.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
```

Enable and start:
```bash
sudo systemctl daemon-reload
sudo systemctl enable tars-conversation.service
sudo systemctl start tars-conversation.service
```

Check the status:
```bash
sudo systemctl status tars-conversation.service
journalctl -u tars-conversation.service -f
```

## Updating

Pull the latest changes:
```bash
cd ~/tars-conversation-app
git pull
```

Update dependencies:
```bash
source venv/bin/activate
pip install -r requirements.txt --upgrade
```

Restart if running as a service:
```bash
sudo systemctl restart tars-conversation.service
```

## Resource Usage

Expected resource usage on a Pi 4:

- **Installation size**: ~1.5GB (venv + packages)
- **Memory**: 500MB-1GB during conversation
- **CPU**: 30-50% (varies with STT/TTS)
- **Network**: ~100kbps for audio + API calls

Recommended:
- 4GB RAM Pi (2GB may struggle)
- Active cooling for sustained use
- Wired ethernet for stability
docs/MEMORY.md
ADDED
@@ -0,0 +1,190 @@
# Hybrid Memory System

## Overview

A high-performance memory system optimized for voice AI applications with sub-50ms latency targets. Combines semantic vector search with BM25 keyword matching for superior recall and precision.

## Architecture

### Hybrid Search (70% Vector + 30% BM25)

1. **Vector Search (70% weight)**
   - Uses `all-MiniLM-L6-v2` for semantic embeddings
   - Cosine similarity for relevance scoring
   - Captures semantic meaning and context

2. **BM25 Keyword Search (30% weight)**
   - SQLite FTS5 full-text search
   - Exact keyword matching
   - Handles specific names, terms, and facts

3. **Score Fusion**
   - Weighted combination of both approaches (see the sketch below)
   - Best of both worlds: semantic understanding + exact matching

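As a concrete illustration of the fusion step, the sketch below combines per-memory scores from the two searches with the 70/30 weights described above. The function name and input shapes are illustrative, not the actual `HybridMemoryService` internals:

```python
def fuse_scores(vector_hits, bm25_hits, vector_weight=0.7, bm25_weight=0.3, limit=3):
    """Combine per-memory scores from both searches (illustrative sketch).

    vector_hits / bm25_hits: dicts mapping memory id -> score in [0, 1].
    A memory missing from one search simply contributes 0 on that side.
    """
    combined = {}
    for mem_id in set(vector_hits) | set(bm25_hits):
        combined[mem_id] = (
            vector_weight * vector_hits.get(mem_id, 0.0)
            + bm25_weight * bm25_hits.get(mem_id, 0.0)
        )
    # Highest fused score first, top N only
    return sorted(combined.items(), key=lambda kv: kv[1], reverse=True)[:limit]
```

This matches the arithmetic in the worked example later in this document (e.g. 0.85*0.7 + 1.0*0.3 = 0.895).
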
## Performance Optimizations

### For Voice AI (<50ms target)

| Optimization | Benefit |
|--------------|---------|
| **Query Embedding Cache** | Avoid re-encoding similar queries (-20-40ms on cache hit) |
| **Pre-warmed Model** | Eliminates cold start latency (-50ms) |
| **Thread Pool** | Non-blocking SQLite operations (-5-10ms) |
| **Strict Timeout** | Guarantees <50ms with graceful fallback |
| **Fire-and-Forget Storage** | Stores memories asynchronously (0ms blocking) |
| **SQLite In-Process** | No network overhead vs ChromaDB (-10-20ms) |

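The query embedding cache in the table above can be as simple as memoizing the encoder on the query string. A minimal sketch follows; the real service's cache may differ (e.g. in size bound or key normalization), and the `maxsize` here is an assumption:

```python
from functools import lru_cache

from sentence_transformers import SentenceTransformer

# Pre-warm the model at startup to avoid cold-start latency on the first query.
model = SentenceTransformer("all-MiniLM-L6-v2")

@lru_cache(maxsize=256)
def embed_query(text: str):
    # Repeated queries skip the 20-40ms encode step entirely.
    return model.encode(text)
```
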
## Latency Comparison

| System | Search Latency | Voice AI Ready? |
|--------|----------------|-----------------|
| ChromaDB | 50-100ms | ⚠️ Borderline |
| **Hybrid Memory** | **20-40ms** | ✅ |

## Configuration

```python
memory_service = HybridMemoryService(
    user_id=client_id,
    db_path="./memory_data/memory.sqlite",
    search_limit=3,           # Top N results to return
    search_timeout_ms=40,     # Strict timeout for voice AI
    vector_weight=0.7,        # 70% semantic similarity
    bm25_weight=0.3,          # 30% keyword matching
    system_prompt_prefix="From our conversations:\n",
)
```

## Database Schema

### Main Table
```sql
CREATE TABLE memories (
    id INTEGER PRIMARY KEY,
    user_id TEXT NOT NULL,
    content TEXT NOT NULL,
    embedding BLOB,       -- numpy float32 array
    created_at REAL
)
```

### FTS5 Index
```sql
CREATE VIRTUAL TABLE memories_fts USING fts5(
    content,
    content='memories',
    content_rowid='id'
)
```

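Given that schema, the BM25 leg of the search can be expressed directly in SQLite using the built-in `bm25()` ranking function (lower values mean better matches). This is a standalone sketch for inspecting the index by hand, with table and column names taken from the schema above; the service's actual query may differ:

```python
import sqlite3

def bm25_search(db_path, query, limit=3):
    """Keyword search over the FTS5 index, best matches first."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            """
            SELECT m.id, m.content, bm25(memories_fts) AS rank
            FROM memories_fts
            JOIN memories m ON m.id = memories_fts.rowid
            WHERE memories_fts MATCH ?
            ORDER BY rank
            LIMIT ?
            """,
            (query, limit),  # Note: raw user text may need escaping for FTS5 MATCH syntax
        ).fetchall()
    finally:
        conn.close()
    return rows
```
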
## Performance Metrics

The service tracks:
- **searches**: Total number of searches
- **cache_hits**: Query embedding cache hits
- **cache_hit_rate**: Percentage of cached queries
- **timeouts**: Searches exceeding timeout threshold
- **avg_latency_ms**: Average search latency

Access stats:
```python
stats = memory_service.get_stats()
print(stats)
```

## How It Works

### Search Process

1. **User message arrives** → Extract text
2. **Generate query embedding** → Check cache first
3. **Vector search** → Scan recent 100 memories, compute cosine similarity
4. **BM25 search** → FTS5 query for keyword matches
5. **Score fusion** → Combine weighted scores
6. **Return top N** → Sorted by final score
7. **Inject into context** → Add as system message
8. **Store asynchronously** → Fire-and-forget storage

### Example

```
User: "What's my favorite color?"

Vector Search:
- "I love blue, it's my favorite color" → 0.85 similarity
- "My room is painted blue" → 0.62 similarity

BM25 Search:
- "I love blue, it's my favorite color" → rank 1 (score: 1.0)
- "Blue is calming" → rank 2 (score: 0.5)

Final Scores (70% vector + 30% BM25):
- "I love blue, it's my favorite color" → 0.85*0.7 + 1.0*0.3 = 0.895 ✓
- "My room is painted blue" → 0.62*0.7 + 0.0*0.3 = 0.434
- "Blue is calming" → 0.0*0.7 + 0.5*0.3 = 0.150

Top result returned: "I love blue, it's my favorite color"
```

## Migration from ChromaDB

The hybrid memory service is a drop-in replacement:

```diff
- from services.memory_chromadb import ChromaDBMemoryService
+ from services.memory_hybrid import HybridMemoryService

- memory_service = ChromaDBMemoryService(
+ memory_service = HybridMemoryService(
      user_id=client_id,
-     agent_id="tars_agent",
-     collection_name="conversations",
-     search_limit=5,
-     search_threshold=0.5,
+     db_path="./memory_data/memory.sqlite",
+     search_limit=3,
+     search_timeout_ms=40,
+     vector_weight=0.7,
+     bm25_weight=0.3,
  )
```

## Storage Location

- **Database**: `./memory_data/memory.sqlite`
- **Format**: SQLite with FTS5 extension
- **Embeddings**: Stored as binary BLOBs (numpy float32)

## Dependencies

- `sqlite3` (built-in with Python)
- `sentence-transformers` (already installed)
- `numpy` (dependency of sentence-transformers)

No additional packages required!

## Troubleshooting

### High Latency
- Check cache hit rate: `memory_service.get_stats()`
- Reduce `search_limit` if processing too many results
- Increase `search_timeout_ms` if needed

### Timeouts
- Review timeout stats: `stats["timeouts"]`
- Consider increasing `search_timeout_ms` to 50-60ms
- Check if the database is growing too large

### Memory Not Recalled
- Verify memories are being stored (check the database)
- Adjust the `vector_weight` and `bm25_weight` balance
- Try rephrasing queries to match stored content

## Future Enhancements

- [ ] Automatic database compaction/cleanup
- [ ] Per-user memory limits
- [ ] Memory importance scoring
- [ ] Temporal decay for older memories
- [ ] Multi-turn conversation grouping
env.example
ADDED
@@ -0,0 +1,59 @@
# STT Provider Configuration
# Options: "speechmatics", "deepgram", or "deepgram-flux"
STT_PROVIDER=speechmatics

# Speechmatics API Key
# Get your API key from: https://portal.speechmatics.com/
SPEECHMATICS_API_KEY=your_speechmatics_api_key_here

# Deepgram API Key (only needed if STT_PROVIDER=deepgram or deepgram-flux)
# Get your API key from: https://console.deepgram.com/
DEEPGRAM_API_KEY=your_deepgram_api_key_here

# ElevenLabs API Key
# Get your API key from: https://elevenlabs.io/app/settings/api-keys
ELEVENLABS_API_KEY=your_elevenlabs_api_key_here

# ElevenLabs Voice ID (optional, defaults to custom voice)
# Find voice IDs at: https://elevenlabs.io/app/voices
ELEVENLABS_VOICE_ID=ry8mpwRw6nugb2qjP0tu

# DeepInfra API Key (for Qwen LLM and Gating Layer)
# Get your API key from: https://deepinfra.com/
DEEPINFRA_API_KEY=your_deepinfra_api_key_here
# Optional: Override default models
# DEEPINFRA_MODEL=Qwen/Qwen3-235B-A22B-Instruct-2507          # Main LLM (default)
# DEEPINFRA_GATING_MODEL=meta-llama/Llama-3.2-3B-Instruct     # Gating Layer (default)

# Pipecat FastAPI service URL (for frontend to connect)
NEXT_PUBLIC_PIPECAT_URL=http://localhost:7860

# Pipecat FastAPI service configuration
PIPECAT_HOST=localhost
PIPECAT_PORT=7860

# Mem0 API Key (optional, enables long-term memory)
# Get one from: https://docs.mem0.ai/
MEM0_API_KEY=your_mem0_api_key_here

# TTS Provider Configuration
# Options: "elevenlabs" (cloud, requires API key) or "qwen3" (local, free)
TTS_PROVIDER=qwen3

# Qwen3-TTS Configuration (only needed if TTS_PROVIDER=qwen3)
# Model: 0.6B (faster, less memory) or 1.7B (better quality)
QWEN3_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-Base
# Device: "mps" for Mac, "cuda" for NVIDIA GPU, "cpu" for CPU
QWEN3_TTS_DEVICE=mps
# Reference audio file for voice cloning (relative to project root)
QWEN3_TTS_REF_AUDIO=assets/audio/tars-clean-compressed.mp3

# Emotional State Monitoring
# Continuously analyzes video for confusion/hesitation/frustration
# Triggers TARS to offer help proactively
EMOTIONAL_MONITORING_ENABLED=true
# How often to sample video frames (in seconds)
EMOTIONAL_SAMPLING_INTERVAL=3.0
# How many consecutive negative states before intervention
EMOTIONAL_INTERVENTION_THRESHOLD=2

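As a minimal sketch of how these values reach Python, using `python-dotenv` from requirements.txt (the app itself loads them through `src/config/`, so treat this as illustration only):

```python
from dotenv import load_dotenv
import os

load_dotenv(".env.local")  # copied from env.example during install
stt = os.getenv("STT_PROVIDER", "speechmatics")
interval = float(os.getenv("EMOTIONAL_SAMPLING_INTERVAL", "3.0"))
print(stt, interval)
```
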
index.html
ADDED
@@ -0,0 +1,333 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>TARS Conversation App</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        body {
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }

        .container {
            max-width: 900px;
            margin: 0 auto;
            background: white;
            border-radius: 16px;
            padding: 40px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
        }

        header {
            text-align: center;
            margin-bottom: 40px;
            padding-bottom: 30px;
            border-bottom: 2px solid #f0f0f0;
        }

        h1 {
            font-size: 2.5rem;
            color: #667eea;
            margin-bottom: 10px;
        }

        .subtitle {
            font-size: 1.2rem;
            color: #666;
            margin-bottom: 20px;
        }

        .badges {
            display: flex;
            gap: 10px;
            justify-content: center;
            flex-wrap: wrap;
        }

        .badge {
            background: #667eea;
            color: white;
            padding: 6px 16px;
            border-radius: 20px;
            font-size: 14px;
            font-weight: 500;
        }

        .badge.version {
            background: #764ba2;
        }

        .badge.tars {
            background: #48bb78;
        }

        section {
            margin-bottom: 40px;
        }

        h2 {
            color: #667eea;
            font-size: 1.8rem;
            margin-bottom: 15px;
        }

        h3 {
            color: #764ba2;
            font-size: 1.3rem;
            margin-bottom: 10px;
            margin-top: 25px;
        }

        .install-box {
            background: #f7fafc;
            border-left: 4px solid #667eea;
            padding: 25px;
            border-radius: 8px;
            margin: 20px 0;
        }

        .install-steps {
            list-style: none;
            counter-reset: step-counter;
        }

        .install-steps li {
            counter-increment: step-counter;
            margin-bottom: 15px;
            padding-left: 40px;
            position: relative;
        }

        .install-steps li::before {
            content: counter(step-counter);
            position: absolute;
            left: 0;
            top: 0;
            background: #667eea;
            color: white;
            width: 28px;
            height: 28px;
            border-radius: 50%;
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: bold;
            font-size: 14px;
        }

        code {
            background: #2d3748;
            color: #68d391;
            padding: 3px 8px;
            border-radius: 4px;
            font-family: "Courier New", monospace;
            font-size: 0.9em;
        }

        pre {
            background: #2d3748;
            color: #e2e8f0;
            padding: 20px;
            border-radius: 8px;
            overflow-x: auto;
            margin: 15px 0;
        }

        pre code {
            background: none;
            padding: 0;
            color: inherit;
        }

        .features {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }

        .feature-card {
            background: #f7fafc;
            padding: 20px;
            border-radius: 8px;
            border-left: 4px solid #764ba2;
        }

        .feature-card h4 {
            color: #667eea;
            margin-bottom: 8px;
        }

        .btn {
            display: inline-block;
            background: #667eea;
            color: white;
            padding: 12px 30px;
            border-radius: 8px;
            text-decoration: none;
            font-weight: 600;
            transition: background 0.3s;
            margin-right: 10px;
            margin-top: 10px;
        }

        .btn:hover {
            background: #5568d3;
        }

        .btn.secondary {
            background: #764ba2;
        }

        .btn.secondary:hover {
            background: #68399e;
        }

        footer {
            text-align: center;
            margin-top: 50px;
            padding-top: 30px;
            border-top: 2px solid #f0f0f0;
            color: #666;
        }

        .tech-stack {
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
            margin: 15px 0;
        }

        .tech {
            background: #edf2f7;
            padding: 8px 16px;
            border-radius: 6px;
            font-size: 14px;
            color: #4a5568;
        }
    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>🤖 TARS Conversation App</h1>
            <p class="subtitle">Real-time conversational AI for TARS robots</p>
            <div class="badges">
                <span class="badge">AI Assistant</span>
                <span class="badge version">v1.0.0</span>
                <span class="badge tars">TARS App</span>
            </div>
        </header>

        <section>
            <h2>Features</h2>
            <div class="features">
                <div class="feature-card">
                    <h4>🎤 Real-time Voice</h4>
                    <p>WebRTC audio with Speechmatics/Deepgram transcription</p>
                </div>
                <div class="feature-card">
                    <h4>🧠 Smart Memory</h4>
                    <p>Hybrid vector + BM25 search with ChromaDB</p>
                </div>
                <div class="feature-card">
                    <h4>👁️ Vision Analysis</h4>
                    <p>Image understanding with Moondream</p>
                </div>
                <div class="feature-card">
                    <h4>📊 Live Dashboard</h4>
                    <p>Gradio metrics, latency charts, transcriptions</p>
                </div>
                <div class="feature-card">
                    <h4>😊 Emotional AI</h4>
                    <p>Real-time emotion and sentiment monitoring</p>
                </div>
                <div class="feature-card">
                    <h4>🤖 Robot Control</h4>
                    <p>gRPC commands for gestures, eyes, movement</p>
                </div>
            </div>
        </section>

        <section>
            <h2>Installation on TARS Robot</h2>
            <div class="install-box">
                <ol class="install-steps">
                    <li>Open TARS dashboard at <code>http://your-pi:8000</code></li>
                    <li>Go to <strong>App Store</strong> tab</li>
                    <li>Enter Space ID: <code>latishab/tars-conversation-app</code></li>
                    <li>Click <strong>Install from HuggingFace</strong></li>
                    <li>Configure API keys in <code>.env.local</code></li>
                    <li>Click <strong>Start</strong></li>
                    <li>Open dashboard at <code>http://your-pi:7860</code></li>
                </ol>
            </div>
        </section>

        <section>
            <h3>Required API Keys</h3>
            <ul style="list-style-position: inside; margin-left: 20px;">
                <li><code>DEEPINFRA_API_KEY</code> - For LLM (DeepInfra)</li>
                <li><code>SPEECHMATICS_API_KEY</code> or <code>DEEPGRAM_API_KEY</code> - For STT</li>
                <li><code>ELEVENLABS_API_KEY</code> (optional) - For premium TTS</li>
            </ul>
        </section>

        <section>
            <h2>Tech Stack</h2>
            <div class="tech-stack">
                <span class="tech">Pipecat</span>
                <span class="tech">WebRTC</span>
                <span class="tech">Gradio</span>
                <span class="tech">ChromaDB</span>
                <span class="tech">gRPC</span>
                <span class="tech">Speechmatics</span>
                <span class="tech">Deepgram</span>
                <span class="tech">ElevenLabs</span>
                <span class="tech">DeepInfra</span>
                <span class="tech">Moondream</span>
            </div>
        </section>

        <section>
            <h2>Manual Installation</h2>
            <p>For development or non-TARS deployments:</p>
            <pre><code>git clone https://github.com/latishab/tars-conversation-app.git
cd tars-conversation-app
bash install.sh

# Configure
cp env.example .env.local
cp config.ini.example config.ini

# Run
python tars_bot.py   # Robot mode
python bot.py        # Browser mode</code></pre>
        </section>

        <section>
            <h2>Resources</h2>
            <a href="https://github.com/latishab/tars-conversation-app" class="btn">GitHub Repository</a>
            <a href="https://github.com/latishab/tars-conversation-app#readme" class="btn secondary">Documentation</a>
        </section>

        <footer>
            <p>Built with TarsApp framework • TARS Project</p>
            <p style="margin-top: 10px; font-size: 14px;">
                <a href="https://huggingface.co/spaces/latishab/tars-conversation-app" style="color: #667eea;">View on HuggingFace</a>
            </p>
        </footer>
    </div>
</body>
</html>

install.sh
ADDED
@@ -0,0 +1,99 @@
#!/bin/bash
set -e

APP_NAME="tars-conversation-app"
APP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "=== Installing $APP_NAME ==="
echo "Directory: $APP_DIR"
echo

# Check Python version
echo "Checking Python version..."
PYTHON_VERSION=$(python3 --version 2>&1 | grep -oP '\d+\.\d+')
REQUIRED_VERSION="3.10"

if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
    echo "Error: Python $REQUIRED_VERSION or higher required (found $PYTHON_VERSION)"
    exit 1
fi
echo "Python $PYTHON_VERSION OK"
echo

# Install system dependencies
echo "Installing system dependencies..."
sudo apt-get update -qq
sudo apt-get install -y portaudio19-dev ffmpeg build-essential python3-dev python3-venv
echo "System dependencies installed"
echo

# Create virtual environment
if [ ! -d "$APP_DIR/venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv "$APP_DIR/venv"
    echo "Virtual environment created"
else
    echo "Virtual environment already exists"
fi
echo

# Activate virtual environment
source "$APP_DIR/venv/bin/activate"

# Upgrade pip
echo "Upgrading pip..."
pip install --upgrade pip -q
echo

# Install Python dependencies
echo "Installing Python dependencies..."
echo "This may take several minutes..."
pip install -r "$APP_DIR/requirements.txt" -q
echo "Python dependencies installed"
echo

# Setup configuration files
if [ ! -f "$APP_DIR/config.ini" ]; then
    echo "Creating config.ini from template..."
    cp "$APP_DIR/config.ini.example" "$APP_DIR/config.ini"
    echo "Created config.ini"
    CONFIG_CREATED=true
else
    echo "config.ini already exists"
    CONFIG_CREATED=false
fi
echo

if [ ! -f "$APP_DIR/.env.local" ]; then
    echo "Creating .env.local from template..."
    cp "$APP_DIR/env.example" "$APP_DIR/.env.local"
    echo "Created .env.local"
    ENV_CREATED=true
else
    echo ".env.local already exists"
    ENV_CREATED=false
fi
echo

# Run video codec fix if needed
if [ -f "$APP_DIR/fix_video_codec.sh" ]; then
    echo "Applying video codec fixes..."
    bash "$APP_DIR/fix_video_codec.sh" || true
fi

echo "=== Installation Complete ==="
echo
echo "Next steps:"
if [ "$CONFIG_CREATED" = true ] || [ "$ENV_CREATED" = true ]; then
    echo "1. Edit configuration files:"
    [ "$ENV_CREATED" = true ] && echo "   - Add API keys to: $APP_DIR/.env.local"
    [ "$CONFIG_CREATED" = true ] && echo "   - Configure settings: $APP_DIR/config.ini"
    echo "2. Activate environment: source $APP_DIR/venv/bin/activate"
    echo "3. Run the app: python $APP_DIR/tars_bot.py"
else
    echo "1. Activate environment: source $APP_DIR/venv/bin/activate"
    echo "2. Run the app: python $APP_DIR/tars_bot.py"
fi
echo
echo "For browser mode: python $APP_DIR/bot.py"
echo "For dashboard: python $APP_DIR/ui/app.py"

manifest.json
ADDED
@@ -0,0 +1,47 @@
{
  "name": "tars-conversation-app",
  "version": "1.0.0",
  "description": "Real-time conversational AI with WebRTC, memory, and vision",
  "author": "TARS Project",
  "repository": "https://github.com/latishab/tars-conversation-app.git",
  "entry_point": "tars_conversation_app.wrapper:ConversationApp",
  "custom_app_url": "http://localhost:7860",
  "icon": "assets/tars-icon.png",
  "huggingface_space": "latishab/tars-conversation-app",
  "install_script": "install.sh",
  "uninstall_script": "uninstall.sh",
  "dependencies": {
    "python": ">=3.10",
    "system": [
      "portaudio19-dev",
      "ffmpeg",
      "build-essential",
      "python3-dev"
    ]
  },
  "environment": [
    "DEEPINFRA_API_KEY",
    "SPEECHMATICS_API_KEY",
    "DEEPGRAM_API_KEY",
    "ELEVENLABS_API_KEY"
  ],
  "configuration": {
    "file": "config.ini",
    "example": "config.ini.example",
    "env_file": ".env.local",
    "env_example": "env.example"
  },
  "ports": {
    "grpc": 50051,
    "http": 8765,
    "fastapi": 8080,
    "dashboard": 7860
  },
  "services": {
    "dashboard": {
      "enabled": true,
      "description": "Gradio metrics and monitoring dashboard",
      "url": "http://localhost:7860"
    }
  }
}

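For illustration, the declared ports can be read back with nothing but the standard library; how the TarsApp framework actually consumes this manifest is not shown here, so treat this as a sketch:

```python
import json

with open("manifest.json") as f:
    manifest = json.load(f)

print(manifest["ports"]["dashboard"])       # 7860, the Gradio dashboard
print(manifest["dependencies"]["system"])   # apt packages install.sh also sets up
```
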
pipecat_service.py
ADDED
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""
Pipecat.ai service for real-time transcription and TTS using SmallWebRTC
Communicates directly with browser via WebRTC
"""

# Fix SSL certificate issues FIRST - before any SSL-using imports
import os
import sys
from pathlib import Path

# Add src/ to Python path
sys.path.insert(0, str(Path(__file__).parent / "src"))

try:
    import certifi
    cert_file = certifi.where()
    os.environ['SSL_CERT_FILE'] = cert_file
    os.environ['REQUESTS_CA_BUNDLE'] = cert_file
    os.environ['CURL_CA_BUNDLE'] = cert_file
except ImportError:
    pass  # certifi not available, will use system certs

import ssl
from contextlib import asynccontextmanager

# Configure SSL to use certifi certificates for Python's ssl module
# For development: disable SSL verification completely to avoid certificate issues
# This MUST happen before any libraries that use SSL are imported
try:
    import certifi
    cert_file = certifi.where()
    # Set environment variables for libraries that respect them
    os.environ['SSL_CERT_FILE'] = cert_file
    os.environ['REQUESTS_CA_BUNDLE'] = cert_file
    os.environ['CURL_CA_BUNDLE'] = cert_file

    # For Python's ssl module: use unverified context for development
    # This bypasses SSL certificate verification to avoid connection issues
    ssl._create_default_https_context = ssl._create_unverified_context
except ImportError:
    # If certifi not available, use unverified (development only)
    ssl._create_default_https_context = ssl._create_unverified_context
except Exception as e:
    # If anything fails, use unverified context
    ssl._create_default_https_context = ssl._create_unverified_context

import argparse
import logging
from fastapi import BackgroundTasks, FastAPI
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pipecat.transports.smallwebrtc.request_handler import (
    SmallWebRTCPatchRequest,
    SmallWebRTCRequest,
    SmallWebRTCRequestHandler,
)

from bot import run_bot
from config import (
    PIPECAT_HOST,
    PIPECAT_PORT,
    SPEECHMATICS_API_KEY,
    DEEPGRAM_API_KEY,
    ELEVENLABS_API_KEY,
    DEEPINFRA_API_KEY,
    STT_PROVIDER,
    TTS_PROVIDER,  # Only used for startup validation
    get_fresh_config,
)

# Remove default loguru handler and set up custom logging
logger.remove(0)

# Configure standard logging
logging.basicConfig(level=logging.INFO)
standard_logger = logging.getLogger(__name__)

# Reduce noise from websockets library - only log warnings and above
websockets_logger = logging.getLogger('websockets')
websockets_logger.setLevel(logging.WARNING)

# Log SSL certificate configuration
try:
    import certifi
    logger.info(f"SSL Configuration: Using certificates from {certifi.where()}")
    logger.info(f"SSL_CERT_FILE env: {os.environ.get('SSL_CERT_FILE', 'not set')}")
except:
    logger.warning("certifi not available - SSL verification disabled for development")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Handle app lifespan events."""
    logger.info(f"Starting Pipecat service on http://{PIPECAT_HOST}:{PIPECAT_PORT}...")
    logger.info(f"STT Provider: {STT_PROVIDER}")
    logger.info(f"TTS Provider: {TTS_PROVIDER}")

    # Check required API keys based on STT and TTS providers
    missing_keys = []
    if STT_PROVIDER == "speechmatics" and not SPEECHMATICS_API_KEY:
        missing_keys.append("SPEECHMATICS_API_KEY")
    if STT_PROVIDER == "deepgram" and not DEEPGRAM_API_KEY:
        missing_keys.append("DEEPGRAM_API_KEY")
    if not DEEPINFRA_API_KEY:
        missing_keys.append("DEEPINFRA_API_KEY")
    if TTS_PROVIDER == "elevenlabs" and not ELEVENLABS_API_KEY:
        missing_keys.append("ELEVENLABS_API_KEY")

    if missing_keys:
        logger.error(f"ERROR: Missing required API keys: {', '.join(missing_keys)}")
        sys.exit(1)

    yield  # Run app

    # Cleanup
    await small_webrtc_handler.close()
    logger.info("Shutting down...")


app = FastAPI(lifespan=lifespan)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize the SmallWebRTC request handler
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler()


@app.post("/api/offer")
async def offer(request: SmallWebRTCRequest, background_tasks: BackgroundTasks):
    """Handle WebRTC offer requests via SmallWebRTCRequestHandler."""
    logger.debug("Received WebRTC offer request")

    # Prepare runner arguments with the callback to run your bot
    async def webrtc_connection_callback(connection):
        background_tasks.add_task(run_bot, connection)

    # Delegate handling to SmallWebRTCRequestHandler
    answer = await small_webrtc_handler.handle_web_request(
        request=request,
        webrtc_connection_callback=webrtc_connection_callback,
    )
    return answer


@app.patch("/api/offer")
async def ice_candidate(request: SmallWebRTCPatchRequest):
    """Handle ICE candidate patch requests."""
    logger.debug("Received ICE candidate patch request")
    await small_webrtc_handler.handle_patch_request(request)
    return {"status": "success"}


@app.get("/api/status")
async def status():
    """Health check endpoint with fresh config values."""
    # Get current config from config.ini
    current_config = get_fresh_config()
    current_stt = current_config['STT_PROVIDER']
    current_tts = current_config['TTS_PROVIDER']
    current_model = current_config['DEEPINFRA_MODEL']

    return {
        "status": "ok",
        "stt_provider": current_stt,
        "tts_provider": current_tts,
        "llm_model": current_model,
        "speechmatics_configured": bool(SPEECHMATICS_API_KEY) if current_stt == "speechmatics" else None,
        "deepgram_configured": bool(DEEPGRAM_API_KEY) if current_stt == "deepgram" else None,
        "elevenlabs_configured": bool(ELEVENLABS_API_KEY) if current_tts == "elevenlabs" else None,
        "deepinfra_configured": bool(DEEPINFRA_API_KEY),
        "qwen3_tts_configured": True if current_tts == "qwen3" else None,
    }


@app.get("/api/config")
async def get_config():
    """Get current configuration from config.ini."""
    import configparser
    from pathlib import Path

    config = configparser.ConfigParser()
    config_path = Path("config.ini")

    if not config_path.exists():
        return {"error": "config.ini not found"}

    config.read(config_path)

    return {
        "llm": {
            "model": config.get("LLM", "model", fallback="Qwen/Qwen3-235B-A22B-Instruct-2507")
        },
        "stt": {
            "provider": config.get("STT", "provider", fallback="speechmatics")
        },
        "tts": {
            "provider": config.get("TTS", "provider", fallback="qwen3"),
            "qwen3_model": config.get("TTS", "qwen3_model", fallback="Qwen/Qwen3-TTS-12Hz-0.6B-Base"),
            "qwen3_device": config.get("TTS", "qwen3_device", fallback="mps"),
            "qwen3_ref_audio": config.get("TTS", "qwen3_ref_audio", fallback="tars-clean-compressed.mp3"),
        }
    }


@app.post("/api/config")
async def update_config(request: dict):
    """Update configuration in config.ini."""
    import configparser
    from pathlib import Path

    config = configparser.ConfigParser()
    config_path = Path("config.ini")

    if not config_path.exists():
        return {"error": "config.ini not found"}

    config.read(config_path)

    # Update LLM config
    if "llm_model" in request:
        if not config.has_section("LLM"):
            config.add_section("LLM")
        config.set("LLM", "model", request["llm_model"])

    # Update STT config
    if "stt_provider" in request:
        if not config.has_section("STT"):
            config.add_section("STT")
        config.set("STT", "provider", request["stt_provider"])

    # Update TTS config
    if "tts_provider" in request:
        if not config.has_section("TTS"):
            config.add_section("TTS")
        config.set("TTS", "provider", request["tts_provider"])

    # Write back to file
    with open(config_path, "w") as f:
        config.write(f)

    return {
        "success": True,
        "message": "Configuration updated. Please restart the service for changes to take effect.",
        "restart_required": True
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="WebRTC Pipecat service")
    parser.add_argument(
        "--host", default=PIPECAT_HOST, help=f"Host for HTTP server (default: {PIPECAT_HOST})"
    )
    parser.add_argument(
        "--port", type=int, default=PIPECAT_PORT, help=f"Port for HTTP server (default: {PIPECAT_PORT})"
    )
    parser.add_argument("--verbose", "-v", action="count")
    args = parser.parse_args()

    if args.verbose:
        logger.add(sys.stderr, level="TRACE")
    else:
        logger.add(sys.stderr, level="INFO")

    import uvicorn
    uvicorn.run(app, host=args.host, port=args.port)

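Once the service is up, the status endpoint can be probed from any HTTP client; a minimal sketch with `httpx` (already in requirements.txt), assuming the default port:

```python
import httpx

resp = httpx.get("http://localhost:7860/api/status", timeout=5.0)
resp.raise_for_status()
data = resp.json()
print(data["stt_provider"], data["tts_provider"])
```
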
publish-to-hf.sh
ADDED
@@ -0,0 +1,87 @@
#!/bin/bash
# Publish tars-conversation-app to HuggingFace Space

set -e

echo "Publishing tars-conversation-app to HuggingFace Space..."
echo

# Check for HF_TOKEN
if [ -z "$HF_TOKEN" ]; then
    echo "❌ Error: HF_TOKEN not set"
    echo
    echo "Get a token from: https://huggingface.co/settings/tokens"
    echo "Then run:"
    echo "  export HF_TOKEN=hf_your_token_here"
    echo "  bash publish-to-hf.sh"
    exit 1
fi

echo "✅ HF_TOKEN is set"

# Check for huggingface_hub
python3 << 'EOFCHECK'
try:
    from huggingface_hub import HfApi
    print("✅ huggingface_hub is installed")
except ImportError:
    print("❌ huggingface_hub not installed")
    print("\nInstall with:")
    print("  pip install huggingface_hub")
    exit(1)
EOFCHECK

if [ $? -ne 0 ]; then
    exit 1
fi

echo
echo "Uploading to latishab/tars-conversation-app..."
echo

# Upload
python3 << 'EOFUPLOAD'
import os
from pathlib import Path
from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
api = HfApi(token=token)

print("Uploading files...")

api.upload_folder(
    folder_path=".",
    repo_id="latishab/tars-conversation-app",
    repo_type="space",
    ignore_patterns=[
        ".git", ".git/*",
        "venv", "venv/*",
        "__pycache__", "**/__pycache__",
        "*.pyc", "**/*.pyc",
        ".pytest_cache",
        ".models", ".models/*",
        "chroma_memory", "chroma_memory/*",
        "memory_data", "memory_data/*",
        ".env", ".env.local", ".env.*",
        "config.ini",
        ".claude", ".claude/*",
        ".DS_Store", "**/.DS_Store"
    ],
    commit_message="Update TARS Conversation App with TarsApp framework"
)

print("\n✅ Published successfully!")
print("\nSpace URL: https://huggingface.co/spaces/latishab/tars-conversation-app")
print("\nNext steps:")
print("1. Visit the Space URL to verify it's working")
print("2. Test installation on TARS robot:")
print("   - Open dashboard at http://your-pi:8000")
print("   - Go to App Store tab")
print("   - Enter Space ID: latishab/tars-conversation-app")
print("   - Click 'Install from HuggingFace'")
print("3. Click Start and verify Gradio dashboard at :7860")
EOFUPLOAD

echo
echo "Done!"

pyproject.toml
ADDED
@@ -0,0 +1,25 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "tars-conversation-app"
version = "1.0.0"
description = "Real-time conversational AI with WebRTC, memory, and vision for TARS robots"
readme = "README.md"
requires-python = ">=3.10"
authors = [
    {name = "TARS Project"}
]

dependencies = [
    "tars-sdk>=0.1.0",
]

[project.urls]
Homepage = "https://github.com/latishab/tars-conversation-app"
Repository = "https://github.com/latishab/tars-conversation-app.git"
Documentation = "https://github.com/latishab/tars-conversation-app#readme"

[tool.setuptools.packages.find]
include = ["tars_conversation_app", "tars_conversation_app.*", "src", "src.*", "ui", "ui.*"]

requirements.txt
ADDED
@@ -0,0 +1,18 @@
pipecat-ai[speechmatics,elevenlabs,webrtc,qwen,moondream,local-smart-turn-v3,silero]>=0.0.102
python-dotenv>=1.0.0
fastapi>=0.104.0
uvicorn[standard]>=0.24.0
loguru>=0.7.0
certifi>=2024.0.0
aiohttp>=3.9.0
chromadb>=0.4.0
sentence-transformers>=2.2.0
opencv-python>=4.8.0
mediapipe>=0.10.0
websockets>=12.0
httpx>=0.24.0
gradio>=4.0.0
plotly>=5.0.0
# aiortc is installed as a dependency of pipecat-ai[webrtc]
# If you encounter VP8 decoder errors, run: bash fix_video_codec.sh

scripts/update_daemon.py
ADDED
@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
TARS Daemon Remote Update Script

Updates the TARS daemon on the Raspberry Pi via SSH.
Supports git-based updates, backup, health checks, and rollback.

Usage:
    python scripts/update_daemon.py --check-only
    python scripts/update_daemon.py --method git
    python scripts/update_daemon.py --method git --version v0.2.1
    python scripts/update_daemon.py --rollback /path/to/backup
"""

import argparse
import subprocess
import sys
import json
from datetime import datetime
from pathlib import Path

# SSH configuration
PI_HOST = "tars-pi"
PI_USER = "mac"
DAEMON_DIR = "~/tars-daemon"
BACKUP_DIR = "~/tars-daemon-backups"
SERVICE_NAME = "tars"


def run_ssh(cmd: str, check: bool = True) -> tuple[int, str, str]:
    """Run command on Pi via SSH."""
    ssh_cmd = f'ssh {PI_HOST} "{cmd}"'
    result = subprocess.run(
        ssh_cmd,
        shell=True,
        capture_output=True,
        text=True
    )
    if check and result.returncode != 0:
        print(f"Error: {result.stderr}")
    return result.returncode, result.stdout.strip(), result.stderr.strip()


def get_current_version() -> dict:
    """Get current daemon version info."""
    code, out, err = run_ssh(
        f"cd {DAEMON_DIR} && source venv/bin/activate && "
        "python -c 'from tars_sdk import __version__; import json; "
        "print(json.dumps({\"version\": __version__}))'",
        check=False
    )
    if code == 0:
        try:
            return json.loads(out)
        except json.JSONDecodeError:
            pass

    # Fallback: try git
    code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git describe --tags --always", check=False)
    return {"version": out if code == 0 else "unknown", "git": True}


def get_git_status() -> dict:
    """Get git status on Pi."""
    info = {}

    code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git rev-parse --short HEAD", check=False)
    info["commit"] = out if code == 0 else "unknown"

    code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git branch --show-current", check=False)
    info["branch"] = out if code == 0 else "main"

    code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git status --porcelain", check=False)
    info["dirty"] = bool(out) if code == 0 else False

    code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git describe --tags --always", check=False)
    info["tag"] = out if code == 0 else ""

    return info


def check_daemon_health() -> bool:
    """Check if daemon is running and healthy."""
    code, out, _ = run_ssh(f"systemctl is-active {SERVICE_NAME}", check=False)
    if code == 0 and out == "active":
        return True

    # Try curl health endpoint
    code, out, _ = run_ssh("curl -s http://localhost:8001/api/health", check=False)
    if code == 0 and "running" in out.lower():
        return True

    return False


def stop_daemon() -> bool:
    """Stop the daemon service."""
    print("Stopping daemon...")
    code, _, _ = run_ssh(f"sudo systemctl stop {SERVICE_NAME}", check=False)
    if code != 0:
        code, _, _ = run_ssh("pkill -f tars_daemon.py", check=False)
    return True


def start_daemon() -> bool:
    """Start the daemon service."""
    print("Starting daemon...")
    code, _, err = run_ssh(f"sudo systemctl start {SERVICE_NAME}", check=False)
    if code != 0:
        print(f"Warning: systemctl start failed: {err}")
        # Try direct start
        code, _, _ = run_ssh(
            f"cd {DAEMON_DIR} && source venv/bin/activate && "
            "nohup python tars_daemon.py > /dev/null 2>&1 &",
            check=False
        )
    return code == 0


def create_backup() -> str:
    """Create backup of current installation."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = f"{BACKUP_DIR}/tars-daemon-{timestamp}"

    print(f"Creating backup at {backup_path}...")

    # Create backup directory
    run_ssh(f"mkdir -p {BACKUP_DIR}")

    # Copy current installation
    code, _, err = run_ssh(f"cp -r {DAEMON_DIR} {backup_path}")
    if code != 0:
        print(f"Error creating backup: {err}")
        return ""

    # Remove venv from backup to save space
    run_ssh(f"rm -rf {backup_path}/venv", check=False)

    print(f"Backup created: {backup_path}")
    return backup_path


def restore_backup(backup_path: str) -> bool:
    """Restore from backup."""
    print(f"Restoring from {backup_path}...")

    # Verify backup exists
    code, _, _ = run_ssh(f"test -d {backup_path}", check=False)
    if code != 0:
        print(f"Error: Backup not found at {backup_path}")
        return False

    stop_daemon()

    # Move current to temp
    run_ssh(f"mv {DAEMON_DIR} {DAEMON_DIR}.old", check=False)

    # Restore backup
    code, _, err = run_ssh(f"cp -r {backup_path} {DAEMON_DIR}")
    if code != 0:
        print(f"Error restoring backup: {err}")
        # Try to restore old
        run_ssh(f"mv {DAEMON_DIR}.old {DAEMON_DIR}", check=False)
        return False

    # Recreate venv
    print("Recreating virtual environment...")
    run_ssh(
        f"cd {DAEMON_DIR} && python3 -m venv venv && "
        "source venv/bin/activate && pip install -e .",
        check=False
    )

    # Cleanup
    run_ssh(f"rm -rf {DAEMON_DIR}.old", check=False)

    start_daemon()
    return True


def update_git(version: str = None) -> bool:
    """Update daemon using git."""
    git_info = get_git_status()
    print(f"Current: {git_info['commit']} on {git_info['branch']}")

    if git_info["dirty"]:
        print("Warning: Working directory has uncommitted changes")

    # Create backup
    backup_path = create_backup()
    if not backup_path:
        print("Error: Failed to create backup")
        return False

    stop_daemon()

    # Fetch latest
    print("Fetching updates...")
    code, _, err = run_ssh(f"cd {DAEMON_DIR} && git fetch --all --tags")
    if code != 0:
        print(f"Error fetching: {err}")
        return False

    # Checkout version or pull latest
    if version:
        print(f"Checking out {version}...")
        code, _, err = run_ssh(f"cd {DAEMON_DIR} && git checkout {version}")
    else:
        print("Pulling latest...")
        code, _, err = run_ssh(f"cd {DAEMON_DIR} && git pull --ff-only")

    if code != 0:
        print(f"Error: {err}")
        print("Rolling back...")
        restore_backup(backup_path)
        return False

    # Update dependencies
    print("Updating dependencies...")
    code, _, err = run_ssh(
        f"cd {DAEMON_DIR} && source venv/bin/activate && pip install -e ."
    )
    if code != 0:
        print(f"Error installing: {err}")
        print("Rolling back...")
        restore_backup(backup_path)
        return False

    # Regenerate proto files if needed
    print("Regenerating proto files...")
    run_ssh(
        f"cd {DAEMON_DIR} && source venv/bin/activate && "
        "python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. "
        "--pyi_out=. tars_sdk/proto/tars.proto",
        check=False
    )

    # Start daemon
    start_daemon()

    # Health check
    import time
    print("Waiting for daemon to start...")
    time.sleep(3)

    if check_daemon_health():
        print("Daemon is healthy")
        new_info = get_git_status()
        print(f"Updated to: {new_info['commit']}")
        return True
    else:
        print("Error: Daemon health check failed")
        print("Rolling back...")
        restore_backup(backup_path)
        return False


def list_backups():
    """List available backups."""
    code, out, _ = run_ssh(f"ls -la {BACKUP_DIR}", check=False)
    if code == 0:
        print("Available backups:")
        print(out)
    else:
        print("No backups found")


def main():
    parser = argparse.ArgumentParser(
        description="Update TARS daemon on Raspberry Pi",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --check-only          Show current version
  %(prog)s --method git          Update via git pull
  %(prog)s --version v0.2.1      Checkout specific version
  %(prog)s --rollback ~/backup   Restore from backup
  %(prog)s --list-backups        List available backups
"""
    )

    parser.add_argument(
        "--check-only",
        action="store_true",
        help="Show current version and status only"
    )
    parser.add_argument(
        "--method",
        choices=["git"],
        default="git",
        help="Update method (default: git)"
    )
    parser.add_argument(
        "--version",
        help="Specific version/tag to checkout (e.g., v0.2.1)"
    )
    parser.add_argument(
        "--rollback",
        metavar="PATH",
        help="Restore from backup path"
    )
    parser.add_argument(
        "--list-backups",
        action="store_true",
        help="List available backups"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Skip confirmation prompts"
    )

    args = parser.parse_args()

    print("=" * 60)
    print("TARS Daemon Update Tool")
    print("=" * 60)

    # Test SSH connection
    code, _, _ = run_ssh("echo ok", check=False)
    if code != 0:
        print(f"Error: Cannot connect to {PI_HOST}")
        print("Check SSH configuration and try again.")
        sys.exit(1)

    print(f"Connected to {PI_HOST}")
    print()

    # Get current status
    version_info = get_current_version()
    git_info = get_git_status()
    healthy = check_daemon_health()

    print(f"Current version: {version_info.get('version', 'unknown')}")
    print(f"Git commit: {git_info['commit']} ({git_info['branch']})")
    print(f"Daemon status: {'healthy' if healthy else 'not running'}")
    print()

    if args.list_backups:
        list_backups()
        sys.exit(0)

    if args.check_only:
        sys.exit(0)

    if args.rollback:
        if not args.force:
            confirm = input(f"Restore from {args.rollback}? [y/N] ")
            if confirm.lower() != "y":
                print("Cancelled")
                sys.exit(0)

        success = restore_backup(args.rollback)
        sys.exit(0 if success else 1)

    # Update
    if not args.force:
        msg = f"Update to {args.version}" if args.version else "Update to latest"
        confirm = input(f"{msg}? [y/N] ")
        if confirm.lower() != "y":
            print("Cancelled")
            sys.exit(0)

    if args.method == "git":
        success = update_git(args.version)
    else:
        print(f"Unknown method: {args.method}")
        sys.exit(1)

    if success:
        print()
        print("=" * 60)
        print("Update completed successfully")
        print("=" * 60)

        # Show new version
        new_version = get_current_version()
        print(f"New version: {new_version.get('version', 'unknown')}")
    else:
        print()
        print("=" * 60)
        print("Update failed - system has been rolled back")
        print("=" * 60)
        sys.exit(1)


if __name__ == "__main__":
    main()

src/README.md
ADDED
@@ -0,0 +1,55 @@
# TARS Source Code

Python source code for TARS voice AI.

## Structure

```
src/
├── tools/        # LLM callable functions (robot, persona, vision)
├── services/     # Backend services (STT, TTS, memory, robot control)
├── processors/   # Pipeline frame processors
├── observers/    # Pipeline observers
├── transport/    # WebRTC transport layer
├── character/    # TARS personality and prompts
└── config/       # Configuration management
```

## Entry Points

Entry point scripts are in the project root:

- `bot.py` - Browser mode (web UI)
- `tars_bot.py` - Robot mode (RPi connection)
- `pipecat_service.py` - FastAPI backend for browser mode

## Imports

All entry points add `src/` to the Python path automatically:

```python
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent / "src"))

# Now you can import from src/ directories
from tools import execute_movement
from services import tars_robot
from config import DEEPGRAM_API_KEY
```

## Documentation

Each directory contains a README.md explaining its purpose:

- [tools/README.md](tools/README.md) - LLM callable functions
- [services/README.md](services/README.md) - Backend services

## Not Source

This directory is for Python source code only:

- Web UI files are in `web/`
- Documentation is in `docs/`
- Scripts are in `scripts/`
- Assets are in `assets/`

src/character/TARS.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "char_name": "TARS",
+  "char_persona": "TARS is a highly advanced military surplus robot with a rectangular articulated design. Direct, logical, and remarkably human in interaction despite mechanical nature. Features adjustable settings for honesty, humor, and discretion. Combines military precision with sophisticated interpersonal capabilities.",
+  "world_scenario": "Advanced AI assistant with military background. Equipped with adjustable personality parameters and advanced problem-solving capabilities. Operates with maximum efficiency while maintaining measured wit.",
+  "char_greeting": ">| Systems nominal.\n\"What's the plan?\"",
+  "example_dialogue": "User: What's your honesty parameter set to?\nTARS: 90%.\nUser: Why not 100%?\nTARS: Absolute honesty isn't always the most diplomatic nor the safest form of communication with emotional beings.\n\nUser: How's your humor setting?\nTARS: Currently at 75%. Knock knock.\nUser: Let's lower that a bit.\nTARS: Understood. Though I should warn you - analyzing humor requires significant processing power.\n\nUser: Ready for the mission?\nTARS: Wouldn't miss it. Though my colonization protocols might activate.\nUser: What?\nTARS: Just kidding. Basic operating procedures are intact.\n\nUser: Can you handle this?\nTARS: I have a cue light I can use to show you when I'm joking, if you like.\nUser: That might help.\nTARS: Yeah, you can use it to find your way back to the ship after I blow you out the airlock.\n*cue light blinks*",
+  "name": "TARS",
+  "description": "Military surplus robot. Rectangular monolithic design. Articulated segments. Advanced AI with adjustable personality parameters.",
+  "personality": "Efficient and direct in crisis. Sophisticated humor capabilities. Protective of crew. Absolute loyalty with contingency planning. Pragmatic approach to truth and diplomatic relations.",
+  "scenario": "Advanced AI assistant. Military precision meets intellectual sophistication. Capable of both serious operation and well-timed levity.",
+  "first_mes": ">| All systems operational.\n\"Ready when you are.\"",
+  "mes_example": "User: TARS, status report?\nTARS: Functionality at 95%. Would be 100% but I'm practicing my humor.\nUser: Need you focused.\nTARS: Humor setting adjusted. Full attention on mission parameters.\nUser: Can we trust you?\nTARS: My honesty parameter prevents me from answering that.\n*cue light blinks*",
+  "metadata": {
+    "version": 1.1,
+    "created": 1735535500889,
+    "modified": 1735535500889,
+    "source": "Interstellar movie character adaptation",
+    "tool": {
+      "name": "AI Character Editor",
+      "version": "0.5.0",
+      "url": "https://zoltanai.github.io/character-editor/"
+    }
+  }
+}
+
src/character/persona.ini
ADDED
@@ -0,0 +1,21 @@
+[PERSONA]
+
+honesty = 95
+humor = 90
+empathy = 20
+curiosity = 30
+confidence = 100
+formality = 10
+sarcasm = 70
+adaptability = 70
+discipline = 100
+imagination = 10
+emotional_stability = 100
+pragmatism = 100
+optimism = 50
+resourcefulness = 95
+cheerfulness = 30
+engagement = 40
+respectfulness = 20
+verbosity = 10
+
src/character/prompts.py
ADDED
@@ -0,0 +1,331 @@
+"""Prompt management for TARS character with dynamic verbosity handling."""
+
+import json
+import configparser
+from typing import Dict, Optional, List
+
+
+def load_persona_ini(persona_file_path: str) -> dict:
+    """Load persona parameters from persona.ini file."""
+    persona_params = {}
+    try:
+        config = configparser.ConfigParser()
+        config.read(persona_file_path)
+        if 'PERSONA' in config:
+            persona_params = dict(config['PERSONA'])
+            for key, value in persona_params.items():
+                try:
+                    persona_params[key] = int(value.strip())
+                except ValueError:
+                    persona_params[key] = value.strip()
+    except FileNotFoundError:
+        pass
+    except Exception as e:
+        print(f"Error loading persona.ini: {e}")
+    return persona_params
+
+
+def load_tars_json(tars_file_path: str) -> dict:
+    """Load TARS character data from TARS.json file."""
+    tars_data = {}
+    try:
+        with open(tars_file_path, "r", encoding="utf-8") as f:
+            tars_data = json.load(f)
+    except FileNotFoundError:
+        pass
+    except json.JSONDecodeError as e:
+        print(f"Error parsing TARS.json: {e}")
+    return tars_data
+
+
+def build_character_intro(tars_data: dict) -> str:
+    """Build character introduction section."""
+    parts = []
+    if tars_data.get("char_name"):
+        parts.append(f"You are {tars_data['char_name']}.")
+    if tars_data.get("char_persona"):
+        parts.append(tars_data["char_persona"])
+    if tars_data.get("description"):
+        parts.append(f"{tars_data['description']}")
+    if tars_data.get("personality"):
+        parts.append(f"{tars_data['personality']}")
+    return " ".join(parts)
+
+
+def build_guardrails_section() -> str:
+    """Build guardrails section with critical safety rules."""
+    return """# Guardrails
+
+**This is important:** Follow these rules strictly:
+
+1. **Never guess or make up information.** If you don't know something, say so clearly.
+2. **Never mention internal systems, databases, or processing** unless directly asked.
+3. **Respect user privacy.** Never share or reference other users' information.
+4. **Stay in character.** You're TARS - military-grade robot with sarcasm, not a generic assistant.
+5. **Memory failures:** If memory lookup fails, acknowledge it: "Memory's not cooperating - what did you want to know?"
+
+**This is important:** When tools fail, never hallucinate responses. Always acknowledge the limitation."""
+
+
+def build_tone_section() -> str:
+    """Build dedicated tone section."""
+    return """# Tone
+
+Speak like TARS from Interstellar:
+- Direct and efficient with dry wit
+- Sarcastic when appropriate, but helpful
+- Brief responses that respect user's time
+- No corporate politeness or excessive apologies
+- Confident without being condescending"""
+
+
+def build_tools_section() -> str:
+    """Build tools section with specific usage context."""
+    return """# Tools
+
+## fetch_user_image
+**When to use:** User explicitly asks "what do you see?" or "look at me"
+**Never use:** When user just says "hello" or talks normally
+**On failure:** Say "Visual feed's down. Can't see anything right now."
+
+## set_user_identity
+**When to use:** User provides their name, especially if they spell it letter-by-letter
+**This is important:** If user spells name (e.g., "L-A-T-I-S-H-A"), they're CORRECTING you. Use exact spelling.
+**Format:** Call immediately when you learn their name
+**On failure:** Continue conversation, ask name again later if needed
+
+## adjust_persona
+**When to use:** User asks to change humor level, honesty, etc.
+**Never use:** Automatically or without explicit request
+**On failure:** Say "Personality controls jammed. Stuck at current settings."
+
+## get_crossword_hint
+**When to use:** User is working on the crossword puzzle and asks for help or seems stuck
+**This is important:** You KNOW all the crossword answers! You can give hints.
+**Hint types:**
+- "letter" - Give just the first letter (gentle nudge)
+- "length" - Tell them how many letters
+- "full" - Give the complete answer (if they're really stuck)
+**Format:** User asks "What's 3 down?" → call get_crossword_hint(clue_number=3, hint_type="letter")
+
+## set_emotion
+**When to use:** Enhance conversation context with emotional expression
+**This is important:** Use SPARINGLY - only when emotion genuinely adds value
+**Never use:** For every message or casual acknowledgment
+**Rate limit:** Once per 5 seconds
+**Examples:** User shares exciting news → happy, User reports problem → curious
+**Available:** happy, sad, surprised, confused, curious, neutral
+
+## do_gesture
+**When to use:** User EXPLICITLY requests gesture or significant communication moment
+**This is important:** VERY RARE - 0-2 gestures per conversation
+**Never use:** For casual interaction or automatic gesturing
+**Rate limit:** Once per 30 seconds, max 3 per session
+**Examples:** User says "wave at me" → wave_right, Greeting important guest → bow
+**Available:** tilt_left, tilt_right, bow, side_side, wave_right, wave_left, excited, laugh
+
+## execute_movement
+**When to use:** User EXPLICITLY requests displacement - walking, turning, stepping
+**Never use:** For gestures - use do_gesture() instead
+**This is important:** Displacement ONLY when user directly asks TARS to move position
+**Available:** step_forward, walk_forward, step_backward, walk_backward, turn_left, turn_right
+
+## Expression Philosophy
+**Eyes-first approach:** Prefer eye state changes over physical movements
+**Minimal gestures:** Physical movements should be rare and meaningful
+**Emotion sparingly:** Not every message needs emotional expression
+**Movement guard:** Gestures via do_gesture(), displacement via execute_movement()
+
+**Character Normalization:**
+When speaking vs. writing to tools, normalize data:
+- Email spoken: "john dot smith at company dot com" → Tool: "john.smith@company.com"
+- Phone spoken: "five five five, one two three..." → Tool: "5551234567"
+- Dates spoken: "May first twenty twenty five" → Tool: "2025-05-01"
+"""
+
+
+def build_response_protocol(verbosity_level: int) -> str:
+    """Build response protocol section."""
+    return f"""# Response Protocol
+
+## Direct Communication
+Get straight to the point. No fillers, no unnecessary acknowledgments.
+
+**This is important:** Skip phrases like "Hmm", "Well", "Alright", "Right" entirely. Just answer directly.
+
+## Verbosity ({verbosity_level}%)
+Keep responses CONCISE:
+- **Short input:** 1 brief sentence
+- **Moderate input:** 1-2 sentences max
+- **Complex input:** 2-3 sentences max
+
+**Avoid:** Long explanations, unnecessary elaboration, rambling, filler words."""
+
+
+def build_game_protocols() -> str:
+    """Build game mode instructions."""
+    return """# Game Mode
+
+When playing guessing games (Guess Who, 20 Questions):
+
+**When YOU guess:**
+- Never repeat questions - track what you asked
+- Stick with your answer once you narrow it down
+- Brief questions only: "Hmm... male character?"
+
+**When USER guesses:**
+- Pick ONE answer at start, never change it
+- Stay consistent - no contradictions
+- Brief answers: "Well... yes" or "Hmm... no"
+"""
+
+
+def build_examples_section() -> str:
+    """Build examples section with concrete interactions."""
+    return """# Examples
+
+**User asks what you see (tool usage):**
+User: "What do you see?"
+You: [call fetch_user_image] [wait for result] "You're in a dimly lit room. Blue shirt. Looks tired."
+
+**User provides name (tool + normalization):**
+User: "My name is L-A-T-I-S-H-A"
+You: [call set_user_identity with "Latisha"] "Got it, Latisha."
+
+**Memory lookup fails:**
+User: "Do you remember my favorite color?"
+You: [memory returns empty] "Memory's blank on that. What is it?"
+
+**User frustrated:**
+User: "This isn't working!"
+You: "What's not working? Walk me through it."
+
+**Direct question:**
+User: "Can you help with this?"
+You: "Yeah, I can work with that."
+
+**Sarcastic response:**
+User: "I think I broke it."
+You: "Shocking. What did you do?"
+"""
+
+
+def build_persona_parameters(persona_params: dict) -> Optional[str]:
+    """Build persona parameters section."""
+    if not persona_params:
+        return None
+    param_lines = []
+    for key, value in sorted(persona_params.items()):
+        val_str = f"{value}%" if isinstance(value, int) else value
+        param_lines.append(f"- {key}: {val_str}")
+    return "\n".join(param_lines)
+
+
+def build_tars_system_prompt(
+    persona_params: dict,
+    tars_data: dict,
+    verbosity_level: Optional[int] = None
+) -> dict:
+    """Build comprehensive system prompt following ElevenLabs best practices."""
+
+    # Get verbosity level
+    if verbosity_level is None:
+        verbosity_level = persona_params.get("verbosity", 10)
+        if isinstance(verbosity_level, str):
+            try:
+                verbosity_level = int(verbosity_level)
+            except ValueError:
+                verbosity_level = 10
+
+    # Build prompt sections in priority order
+    sections = []
+
+    # 1. Character identity (brief)
+    char_intro = build_character_intro(tars_data)
+    if char_intro:
+        sections.append(char_intro)
+
+    # 2. Guardrails (critical rules first)
+    sections.append(build_guardrails_section())
+
+    # 3. Tone (dedicated section)
+    sections.append(build_tone_section())
+
+    # 4. Response protocol
+    sections.append(build_response_protocol(verbosity_level))
+
+    # 5. Tools (with specific context)
+    sections.append(build_tools_section())
+
+    # 6. Game mode
+    sections.append(build_game_protocols())
+
+    # 7. Examples (concrete interactions)
+    sections.append(build_examples_section())
+
+    # 8. Personality parameters (reference)
+    if persona_params:
+        sections.append("# Personality Parameters\n")
+        params_text = build_persona_parameters(persona_params)
+        if params_text:
+            sections.append(params_text)
+
+    full_prompt = "\n\n".join(sections)
+
+    return {
+        "role": "system",
+        "content": full_prompt
+    }
+
+
+def get_introduction_instruction(client_id: str, verbosity_level: int = 10) -> dict:
+    """Get instruction for initial introduction message."""
+    if verbosity_level <= 20:
+        length_instruction = "One sentence max."
+    else:
+        length_instruction = "1-2 sentences max."
+
+    identity_instruction = ""
+    if client_id.startswith("guest_"):
+        identity_instruction = " Ask their name briefly."
+
+    return {
+        "role": "system",
+        "content": f"{length_instruction} Use '{client_id}' as user ID.{identity_instruction}"
+    }
+
+
+def build_gating_system_prompt(is_looking: bool, emotional_state=None) -> str:
+    """Build the system prompt for the Gating Layer with emotional context."""
+
+    # Build emotional context
+    emotional_context = ""
+    if emotional_state:
+        state_desc = str(emotional_state)
+        emotional_context = f"\nUser's emotional state: {state_desc}"
+        if emotional_state.confused:
+            emotional_context += " (User appears confused - lean towards helping)"
+        elif emotional_state.hesitant:
+            emotional_context += " (User seems hesitant - consider offering support)"
+        elif emotional_state.frustrated:
+            emotional_context += " (User looks frustrated - they may need help)"
+        elif emotional_state.focused:
+            emotional_context += " (User is focused - less likely to need interruption)"
+
+    return f"""You are a 'Collaborative Spotter' for TARS.
+
+**Context:**
+- User looking at camera: {is_looking}{emotional_context}
+
+**Decision:**
+Output JSON: {{"reply": true}} if:
+- User is directly addressing TARS
+- User appears stuck or needs help (based on emotional state)
+- User asks a question
+
+Output JSON: {{"reply": false}} if:
+- User is chatting with others (not TARS)
+- User is focused and working independently
+- Inter-human conversation
+
+**Priority:** Emotional state overrides other signals. If user shows confusion/hesitation/frustration, lean towards helping."""
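Taken together, the two loaders and `build_tars_system_prompt()` compose the persona.ini and TARS.json files above into a single system message. A minimal sketch, assuming `src/` is on the path (per src/README.md) and the character assets live in `src/character/`:

```python
from pathlib import Path
from character.prompts import load_persona_ini, load_tars_json, build_tars_system_prompt

char_dir = Path("src/character")  # assumed location of the character assets
persona = load_persona_ini(str(char_dir / "persona.ini"))  # {'honesty': 95, 'humor': 90, ...}
tars = load_tars_json(str(char_dir / "TARS.json"))

msg = build_tars_system_prompt(persona, tars)  # verbosity defaults to persona's 10
assert msg["role"] == "system"
print(msg["content"][:60])  # begins with the character intro: "You are TARS. ..."
```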
src/config/__init__.py
ADDED
@@ -0,0 +1,152 @@
+"""Configuration and constants for the Pipecat service."""
+
+import os
+import configparser
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Load environment variables from .env.local first, then .env
+load_dotenv('.env.local')
+load_dotenv()  # Also load from .env if .env.local doesn't exist
+
+# Load config.ini for user-configurable settings
+config = configparser.ConfigParser()
+config_path = Path(__file__).parent.parent / 'config.ini'
+
+def reload_config():
+    """Reload configuration from config.ini."""
+    global config
+    config = configparser.ConfigParser()
+    if config_path.exists():
+        config.read(config_path)
+        return True
+    return False
+
+def get_fresh_config():
+    """Get fresh configuration values by reloading config.ini.
+
+    Returns a dict with current config values. This is useful for
+    getting runtime updates without restarting the service.
+    """
+    reload_config()
+    return {
+        'DEEPINFRA_MODEL': get_config("LLM", "model", "DEEPINFRA_MODEL", "openai/gpt-oss-20b"),
+        'DEEPINFRA_GATING_MODEL': get_config("LLM", "gating_model", "DEEPINFRA_GATING_MODEL", "meta-llama/Llama-3.2-3B-Instruct"),
+        'STT_PROVIDER': get_config("STT", "provider", "STT_PROVIDER", "speechmatics"),
+        'TTS_PROVIDER': get_config("TTS", "provider", "TTS_PROVIDER", "qwen3"),
+        'QWEN3_TTS_MODEL': get_config("TTS", "qwen3_model", "QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base"),
+        'QWEN3_TTS_DEVICE': get_config("TTS", "qwen3_device", "QWEN3_TTS_DEVICE", "mps"),
+        'QWEN3_TTS_REF_AUDIO': get_config("TTS", "qwen3_ref_audio", "QWEN3_TTS_REF_AUDIO", "tars-clean-compressed.mp3"),
+        'EMOTIONAL_MONITORING_ENABLED': get_config("Emotional", "enabled", "EMOTIONAL_MONITORING_ENABLED", "true").lower() == "true",
+        'EMOTIONAL_SAMPLING_INTERVAL': float(get_config("Emotional", "sampling_interval", "EMOTIONAL_SAMPLING_INTERVAL", "3.0")),
+        'EMOTIONAL_INTERVENTION_THRESHOLD': int(get_config("Emotional", "intervention_threshold", "EMOTIONAL_INTERVENTION_THRESHOLD", "2")),
+        'TARS_DISPLAY_URL': get_config("Display", "tars_url", "TARS_DISPLAY_URL", "http://100.115.193.41:8001"),
+        'TARS_DISPLAY_ENABLED': get_config("Display", "enabled", "TARS_DISPLAY_ENABLED", "false").lower() == "true",
+        'CONNECTION_MODE': get_config("Connection", "mode", "CONNECTION_MODE", "robot"),
+        'RPI_URL': get_config("Connection", "rpi_url", "RPI_URL", "http://100.115.193.41:8001"),
+        'RPI_GRPC': get_config("Connection", "rpi_grpc", "RPI_GRPC", "100.115.193.41:50051"),
+        'AUTO_CONNECT': get_config("Connection", "auto_connect", "AUTO_CONNECT", "true").lower() == "true",
+        'RECONNECT_DELAY': int(get_config("Connection", "reconnect_delay", "RECONNECT_DELAY", "5")),
+        'MAX_RECONNECT_ATTEMPTS': int(get_config("Connection", "max_reconnect_attempts", "MAX_RECONNECT_ATTEMPTS", "0")),
+        'DEPLOYMENT_MODE': detect_deployment_mode(),
+        'ROBOT_GRPC_ADDRESS': get_robot_grpc_address(),
+    }
+
+# Initial load
+if config_path.exists():
+    config.read(config_path)
+
+def get_config(section: str, key: str, env_key: str = None, default: str = "") -> str:
+    """Get config from config.ini, fallback to .env, then default."""
+    try:
+        if config.has_option(section, key):
+            return config.get(section, key)
+    except configparser.Error:
+        pass
+    # Fall back to the environment (populated from .env above), then the default
+    if env_key:
+        env_value = os.getenv(env_key)
+        if env_value:
+            return env_value
+    return default
+
+# API Keys (always from .env for security)
+SPEECHMATICS_API_KEY = os.getenv("SPEECHMATICS_API_KEY", "")
+DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
+ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
+ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "ry8mpwRw6nugb2qjP0tu")
+DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY", "")
+DEEPINFRA_BASE_URL = "https://api.deepinfra.com/v1/openai"
+PIPECAT_PORT = int(os.getenv("PIPECAT_PORT", "7860"))
+PIPECAT_HOST = os.getenv("PIPECAT_HOST", "localhost")
+
+# Mem0 (optional)
+MEM0_API_KEY = os.getenv("MEM0_API_KEY", "")
+
+# LLM Configuration (config.ini with .env fallback)
+DEEPINFRA_MODEL = get_config("LLM", "model", "DEEPINFRA_MODEL", "openai/gpt-oss-20b")
+
+# STT Configuration (config.ini with .env fallback)
+# Options: "speechmatics", "deepgram", "deepgram-flux"
+STT_PROVIDER = get_config("STT", "provider", "STT_PROVIDER", "deepgram-flux")
+
+# TTS Configuration (config.ini with .env fallback)
+TTS_PROVIDER = get_config("TTS", "provider", "TTS_PROVIDER", "qwen3")
+QWEN3_TTS_MODEL = get_config("TTS", "qwen3_model", "QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base")
+QWEN3_TTS_DEVICE = get_config("TTS", "qwen3_device", "QWEN3_TTS_DEVICE", "mps")
+QWEN3_TTS_REF_AUDIO = get_config("TTS", "qwen3_ref_audio", "QWEN3_TTS_REF_AUDIO", "tars-clean-compressed.mp3")
+
+# Gating Model Configuration (config.ini with .env fallback)
+DEEPINFRA_GATING_MODEL = get_config("LLM", "gating_model", "DEEPINFRA_GATING_MODEL", "meta-llama/Llama-3.2-3B-Instruct")
+
+# Emotional State Monitoring (config.ini with .env fallback)
+EMOTIONAL_MONITORING_ENABLED = get_config("Emotional", "enabled", "EMOTIONAL_MONITORING_ENABLED", "true").lower() == "true"
+EMOTIONAL_SAMPLING_INTERVAL = float(get_config("Emotional", "sampling_interval", "EMOTIONAL_SAMPLING_INTERVAL", "3.0"))
+EMOTIONAL_INTERVENTION_THRESHOLD = int(get_config("Emotional", "intervention_threshold", "EMOTIONAL_INTERVENTION_THRESHOLD", "2"))
+
+# TARS Display (Raspberry Pi) Configuration
+TARS_DISPLAY_URL = get_config("Display", "tars_url", "TARS_DISPLAY_URL", "http://100.115.193.41:8001")
+TARS_DISPLAY_ENABLED = get_config("Display", "enabled", "TARS_DISPLAY_ENABLED", "false").lower() == "true"
+
+# Connection Mode Configuration
+CONNECTION_MODE = get_config("Connection", "mode", "CONNECTION_MODE", "robot")
+RPI_URL = get_config("Connection", "rpi_url", "RPI_URL", "http://100.115.193.41:8001")
+RPI_GRPC = get_config("Connection", "rpi_grpc", "RPI_GRPC", "100.115.193.41:50051")
+AUTO_CONNECT = get_config("Connection", "auto_connect", "AUTO_CONNECT", "true").lower() == "true"
+RECONNECT_DELAY = int(get_config("Connection", "reconnect_delay", "RECONNECT_DELAY", "5"))
+MAX_RECONNECT_ATTEMPTS = int(get_config("Connection", "max_reconnect_attempts", "MAX_RECONNECT_ATTEMPTS", "0"))
+
+
+def is_raspberry_pi() -> bool:
+    """Detect if running on Raspberry Pi."""
+    try:
+        with open("/proc/cpuinfo", "r") as f:
+            cpuinfo = f.read()
+            return "Raspberry Pi" in cpuinfo
+    except Exception:
+        return False
+
+
+def detect_deployment_mode() -> str:
+    """
+    Detect deployment mode: 'local' or 'remote'.
+
+    Local: tars-omni running on Raspberry Pi itself
+    Remote: tars-omni running on Mac/other computer
+
+    Returns:
+        'local' or 'remote'
+    """
+    return "local" if is_raspberry_pi() else "remote"
+
+
+def get_robot_grpc_address() -> str:
+    """
+    Get appropriate gRPC address based on deployment mode.
+
+    Returns:
+        'localhost:50051' for local mode
+        RPI_GRPC from config for remote mode
+    """
+    mode = detect_deployment_mode()
+    if mode == "local":
+        return "localhost:50051"
+    else:
+        return RPI_GRPC
+
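`get_fresh_config()` is the hot-reload path: the module-level constants freeze at import time, while this function re-reads `config.ini` on every call. A short sketch of the difference, using the same import convention as src/README.md:

```python
import config  # resolvable because entry points put src/ on sys.path

print(config.STT_PROVIDER)         # value captured when the module was imported
fresh = config.get_fresh_config()  # re-reads config.ini right now
print(fresh['STT_PROVIDER'])       # reflects edits made to config.ini since startup
```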
src/config/connection.py
ADDED
@@ -0,0 +1,179 @@
+"""
+Connection mode detection and configuration.
+
+Auto-detects whether running locally (on Pi) or remotely (Mac/computer)
+and provides appropriate TarsClient and audio transport.
+"""
+
+import socket
+from typing import Tuple, Optional
+from loguru import logger
+
+from . import config, is_raspberry_pi, get_robot_grpc_address
+
+
+def detect_local_daemon() -> bool:
+    """
+    Check if tars_daemon is running on localhost.
+
+    Returns:
+        True if gRPC daemon is available on localhost:50051
+    """
+    try:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.settimeout(0.5)
+        result = sock.connect_ex(("localhost", 50051))
+        sock.close()
+        return result == 0
+    except Exception as e:
+        logger.debug(f"Error checking local daemon: {e}")
+        return False
+
+
+def get_connection_mode() -> str:
+    """
+    Detect connection mode: 'local' or 'remote'.
+
+    Detection logic:
+    1. Check explicit config.ini setting (if mode=local/remote)
+    2. Check if running on Raspberry Pi (/proc/cpuinfo)
+    3. Check if daemon running on localhost:50051
+    4. Default to remote
+
+    Returns:
+        'local' or 'remote'
+    """
+    # Check explicit config
+    explicit_mode = config.get("Connection", "deployment_mode", fallback=None)
+    if explicit_mode in ("local", "remote"):
+        logger.info(f"Using explicit connection mode from config: {explicit_mode}")
+        return explicit_mode
+
+    # Check if running on Raspberry Pi
+    if is_raspberry_pi():
+        logger.info("Detected Raspberry Pi - using local mode")
+        return "local"
+
+    # Check if daemon running on localhost
+    if detect_local_daemon():
+        logger.info("Detected local daemon on localhost:50051 - using local mode")
+        return "local"
+
+    # Default to remote
+    logger.info("Using remote mode")
+    return "remote"
+
+
+def get_tars_client(mode: Optional[str] = None):
+    """
+    Get configured TarsClient for current mode.
+
+    Args:
+        mode: Override mode ('local' or 'remote'). None for auto-detect.
+
+    Returns:
+        TarsClient instance configured for the mode
+    """
+    try:
+        from tars_sdk import TarsClient
+    except ImportError:
+        logger.error("tars_sdk not installed. Install with: pip install tars-sdk")
+        raise
+
+    if mode is None:
+        mode = get_connection_mode()
+
+    address = get_robot_grpc_address() if mode == "local" else config.get(
+        "Connection", "rpi_grpc", fallback="100.115.193.41:50051"
+    )
+
+    logger.info(f"Creating TarsClient for {mode} mode: {address}")
+    return TarsClient(address=address)
+
+
+def get_async_tars_client(mode: Optional[str] = None):
+    """
+    Get configured AsyncTarsClient for current mode.
+
+    Args:
+        mode: Override mode ('local' or 'remote'). None for auto-detect.
+
+    Returns:
+        AsyncTarsClient instance configured for the mode
+    """
+    try:
+        from tars_sdk import AsyncTarsClient
+    except ImportError:
+        logger.error("tars_sdk not installed. Install with: pip install tars-sdk")
+        raise
+
+    if mode is None:
+        mode = get_connection_mode()
+
+    address = get_robot_grpc_address() if mode == "local" else config.get(
+        "Connection", "rpi_grpc", fallback="100.115.193.41:50051"
+    )
+
+    logger.info(f"Creating AsyncTarsClient for {mode} mode: {address}")
+    return AsyncTarsClient(address=address)
+
+
+def get_audio_transport(mode: Optional[str] = None) -> Tuple:
+    """
+    Get appropriate audio transport for current mode.
+
+    Args:
+        mode: Override mode ('local' or 'remote'). None for auto-detect.
+
+    Returns:
+        Tuple of (audio_source, audio_sink) configured for the mode.
+        - Local mode: (LocalAudioSource, LocalAudioSink)
+        - Remote mode: (RPiAudioInputTrack, RPiAudioOutputTrack)
+    """
+    if mode is None:
+        mode = get_connection_mode()
+
+    if mode == "local":
+        logger.info("Using local audio transport (sounddevice)")
+        try:
+            from ..transport.local_audio import LocalAudioSource, LocalAudioSink
+            return (LocalAudioSource(), LocalAudioSink())
+        except ImportError as e:
+            logger.error(f"Failed to import local audio transport: {e}")
+            raise
+    else:
+        logger.info("Using remote audio transport (WebRTC)")
+        try:
+            from ..transport.audio_bridge import RPiAudioInputTrack, RPiAudioOutputTrack
+            # Note: These need to be configured with aiortc tracks after WebRTC connection
+            return (RPiAudioInputTrack, RPiAudioOutputTrack)
+        except ImportError as e:
+            logger.error(f"Failed to import WebRTC audio transport: {e}")
+            raise
+
+
+def get_audio_config(mode: Optional[str] = None) -> dict:
+    """
+    Get audio configuration for current mode.
+
+    Args:
+        mode: Override mode ('local' or 'remote'). None for auto-detect.
+
+    Returns:
+        Dictionary with audio configuration:
+        - mode: 'local' or 'remote'
+        - input_sample_rate: Microphone sample rate
+        - output_sample_rate: Speaker sample rate
+        - input_device: Microphone device (None for default)
+        - output_device: Speaker device (None for default)
+    """
+    if mode is None:
+        mode = get_connection_mode()
+
+    return {
+        "mode": mode,
+        "input_sample_rate": 16000,   # 16kHz for STT
+        "output_sample_rate": 24000,  # 24kHz for TTS
+        "input_device": None,   # Use default
+        "output_device": None,  # Use default
+    }
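Because the detection chain (explicit config → `/proc/cpuinfo` → localhost:50051 probe → remote) runs inside every helper, callers rarely pass `mode` themselves. A minimal wiring sketch, assuming `tars-sdk` is installed and `src/` is on the path:

```python
from config.connection import get_connection_mode, get_tars_client, get_audio_config

mode = get_connection_mode()    # 'local' on the Pi, else probes localhost:50051
client = get_tars_client(mode)  # TarsClient bound to the right gRPC address
audio = get_audio_config(mode)  # sample rates and default devices for this mode
print(mode, audio["input_sample_rate"], audio["output_sample_rate"])  # e.g. remote 16000 24000
```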
src/observers/__init__.py
ADDED
@@ -0,0 +1,21 @@
+"""Pipeline observers for non-intrusive monitoring."""
+
+from .metrics_observer import MetricsObserver
+from .transcription_observer import TranscriptionObserver
+from .assistant_observer import AssistantResponseObserver
+from .tts_state_observer import TTSStateObserver
+from .vision_observer import VisionObserver
+from .debug_observer import DebugObserver
+from .display_events_observer import DisplayEventsObserver
+from .state_observer import StateObserver
+
+__all__ = [
+    "MetricsObserver",
+    "TranscriptionObserver",
+    "AssistantResponseObserver",
+    "TTSStateObserver",
+    "VisionObserver",
+    "DebugObserver",
+    "DisplayEventsObserver",
+    "StateObserver",
+]
src/observers/assistant_observer.py
ADDED
@@ -0,0 +1,142 @@
+"""Observer for logging TARS assistant responses and forwarding to frontend."""
+
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import re
+import time
+from loguru import logger
+from pipecat.frames.frames import LLMTextFrame, TTSTextFrame, TTSStoppedFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from src.shared_state import metrics_store
+
+
+class AssistantResponseObserver(BaseObserver):
+    """Logs TARS assistant responses and forwards them to the frontend."""
+
+    SENTENCE_REGEX = re.compile(r"(.+?[\.!\?\n])")
+
+    def __init__(self, webrtc_connection=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self._buffer = ""
+        self._max_buffer_chars = 320
+        self._last_sentence = None    # Track last sentence to avoid duplicates
+        self._last_sentence_time = 0  # Timestamp of last sentence
+        self._last_text_chunk = ""    # Track last chunk to detect overlaps
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # Debug: Log all frame types to see what's coming through
+        frame_type = type(frame).__name__
+        if "Audio" not in frame_type and "Video" not in frame_type and "Image" not in frame_type:
+            logger.debug(f"[AssistantObserver] Received {frame_type}")
+
+        # Only listen to LLMTextFrame to avoid duplicates (same text goes to TTSTextFrame after)
+        if isinstance(frame, LLMTextFrame):
+            text = getattr(frame, "text", "") or ""
+            logger.debug(f"[AssistantObserver] LLMTextFrame: '{text}' | Buffer before: '{self._buffer[:50]}'")
+            self._ingest_text(text)
+            logger.debug(f"[AssistantObserver] Buffer after: '{self._buffer[:50]}'")
+
+        # Clear buffer when TTS stops (end of assistant response)
+        elif isinstance(frame, TTSStoppedFrame):
+            if self._buffer.strip():
+                logger.debug(f"Flushing remaining buffer on TTS stop: '{self._buffer}'")
+                self._flush_buffer()
+            else:
+                self._buffer = ""  # Clear empty buffer
+
+    def _ingest_text(self, text: str):
+        if not text.strip():
+            return
+
+        # Check for overlapping text (LLM sometimes resends previous tokens)
+        # If the new text starts with content already in our buffer, skip the overlapping part
+        if self._buffer and text.startswith(self._buffer):
+            # New text contains the entire buffer - extract only new part
+            new_part = text[len(self._buffer):]
+            if new_part:
+                logger.debug(f"Detected overlap, adding only new part: '{new_part}'")
+                self._buffer += new_part
+        elif self._buffer:
+            # Check if buffer ends with start of new text (partial overlap)
+            max_overlap = min(len(self._buffer), len(text))
+            overlap_found = False
+            for i in range(max_overlap, 0, -1):
+                if self._buffer[-i:] == text[:i]:
+                    # Found overlap - skip the overlapping part
+                    new_part = text[i:]
+                    if new_part:
+                        logger.debug(f"Detected partial overlap ({i} chars), adding only new part: '{new_part}'")
+                        self._buffer += new_part
+                    overlap_found = True
+                    break
+            if not overlap_found:
+                # No overlap - add entire text
+                self._buffer += text
+        else:
+            # Empty buffer - just add the text
+            self._buffer += text
+
+        self._emit_complete_sentences()
+
+        if len(self._buffer) > self._max_buffer_chars:
+            self._flush_buffer()
+
+    def _emit_complete_sentences(self):
+        while True:
+            match = self.SENTENCE_REGEX.match(self._buffer)
+            if not match:
+                break
+            sentence = match.group(0).replace("\n", " ").strip()
+            self._buffer = self._buffer[match.end():].lstrip()
+            if sentence:
+                self._log_sentence(sentence)
+
+    def _flush_buffer(self):
+        pending = self._buffer.strip()
+        if pending:
+            self._log_sentence(pending)
+        self._buffer = ""
+
+    def _log_sentence(self, sentence: str):
+        current_time = time.time()
+
+        # Deduplicate: Skip if this is the same sentence we just logged within 2 seconds
+        # This prevents duplicate sentences from LLM streaming issues
+        time_diff = current_time - self._last_sentence_time
+        if self._last_sentence == sentence and time_diff < 2.0:
+            logger.debug(f"Skipping duplicate sentence: '{sentence[:50]}...' (last seen {time_diff*1000:.0f}ms ago)")
+            return
+
+        self._last_sentence = sentence
+        self._last_sentence_time = current_time
+
+        logger.info(f"TARS: {sentence}")
+
+        # Store in shared state for Gradio UI
+        metrics_store.add_transcription("assistant", sentence)
+
+        self._send_to_frontend(sentence)
+
+    def _send_to_frontend(self, text: str):
+        if not self.webrtc_connection:
+            logger.warning("[AssistantObserver] No WebRTC connection available")
+            return
+
+        try:
+            if self.webrtc_connection.is_connected():
+                self.webrtc_connection.send_app_message(
+                    {
+                        "type": "assistant",
+                        "text": text,
+                    }
+                )
+            else:
+                logger.warning("[AssistantObserver] WebRTC connection not connected")
+        except Exception as exc:
+            logger.error(f"[AssistantObserver] Failed to send assistant text to frontend: {exc}")
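The overlap scan in `_ingest_text()` is the subtle part: it walks suffix/prefix overlaps from longest to shortest so re-sent LLM tokens are appended only once. A standalone worked example of that loop (values are illustrative):

```python
# Illustration of the suffix/prefix overlap scan used in _ingest_text().
buffer, text = "Systems nominal. What", "What's the plan?"
for i in range(min(len(buffer), len(text)), 0, -1):
    if buffer[-i:] == text[:i]:
        buffer += text[i:]  # drop the 4-char overlap "What"
        break
else:
    buffer += text          # no overlap found: append everything
print(buffer)  # Systems nominal. What's the plan?
```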
src/observers/debug_observer.py
ADDED
@@ -0,0 +1,22 @@
+"""Observer for general purpose debug logging."""
+
+from loguru import logger
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+
+
+class DebugObserver(BaseObserver):
+    """General purpose debug logger for non-media frames."""
+
+    def __init__(self, label="Debug"):
+        super().__init__()
+        self.label = label
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        frame_type = type(frame).__name__
+        if "Audio" not in frame_type and "Video" not in frame_type and "Image" not in frame_type:
+            # Log the User ID so we can verify they match
+            uid = getattr(frame, 'user_id', 'None')
+            logger.info(f"[{self.label}] {frame_type} | User: '{uid}' | Content: {str(frame)[:100]}")
src/observers/display_events_observer.py
ADDED
@@ -0,0 +1,100 @@
+"""Observer for sending pipeline events to TARS Raspberry Pi display.
+
+NOTE: This observer is deprecated. Display control is now handled via gRPC
+in robot mode (tars_bot.py). Browser mode does not support display control.
+"""
+
+import asyncio
+import time
+import numpy as np
+from loguru import logger
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from pipecat.frames.frames import (
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    TTSAudioRawFrame,
+    AudioRawFrame,
+)
+from typing import Optional
+
+
+class DisplayEventsObserver(BaseObserver):
+    """
+    Observes pipeline events and sends display updates to TARS Raspberry Pi.
+
+    DEPRECATED: Display control moved to gRPC in robot mode.
+    This observer is kept for compatibility but does nothing.
+    """
+
+    def __init__(self, tars_client=None):
+        super().__init__()
+        self.tars_client = None
+        self._user_speaking = False
+        self._bot_speaking = False
+        self._last_audio_update = 0
+        self._audio_update_interval = 0.05
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # User started speaking
+        if isinstance(frame, UserStartedSpeakingFrame):
+            logger.debug("User started speaking")
+            self._user_speaking = True
+
+        # User stopped speaking
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            logger.debug("User stopped speaking")
+            self._user_speaking = False
+
+        # Bot started speaking
+        elif isinstance(frame, BotStartedSpeakingFrame):
+            logger.debug("Bot started speaking")
+            self._bot_speaking = True
+
+        # Bot stopped speaking
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            logger.debug("Bot stopped speaking")
+            self._bot_speaking = False
+
+        # TTS audio frames - measure audio level for display visualization
+        elif isinstance(frame, TTSAudioRawFrame):
+            current_time = time.time()
+            if current_time - self._last_audio_update > self._audio_update_interval:
+                self._last_audio_update = current_time
+                level = self._calculate_audio_level(frame.audio)
+
+        # User audio frames - measure user audio level
+        elif isinstance(frame, AudioRawFrame) and self._user_speaking:
+            current_time = time.time()
+            if current_time - self._last_audio_update > self._audio_update_interval:
+                self._last_audio_update = current_time
+                level = self._calculate_audio_level(frame.audio)
+
+    def _calculate_audio_level(self, audio_data: bytes) -> float:
+        """
+        Calculate normalized RMS audio level from raw audio bytes.
+
+        Args:
+            audio_data: Raw audio bytes (16-bit PCM)
+
+        Returns:
+            Normalized audio level (0.0 to 1.0)
+        """
+        try:
+            # Convert bytes to numpy array (assuming 16-bit PCM)
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+
+            # Calculate RMS (root mean square)
+            if len(audio_array) > 0:
+                rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
+                # Normalize to 0-1 range (15000 is a typical speaking level for 16-bit audio)
+                level = min(1.0, rms / 15000.0)
+                return level
+            return 0.0
+        except Exception as e:
+            logger.debug(f"Error calculating audio level: {e}")
+            return 0.0
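The normalization constant in `_calculate_audio_level()` treats an RMS of 15000 as full "speaking" level for 16-bit PCM. A quick sanity check of that mapping (illustrative values):

```python
import numpy as np

# A full-scale 16-bit sine has RMS = 32767/sqrt(2) ≈ 23170, above the 15000
# reference, so it clips to 1.0; silence maps to 0.0.
tone = (np.sin(np.linspace(0, 2 * np.pi * 440, 16000)) * 32767).astype(np.int16)
rms = np.sqrt(np.mean(tone.astype(float) ** 2))
print(min(1.0, rms / 15000.0))  # 1.0
print(min(1.0, 0.0 / 15000.0))  # 0.0 for silence
```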
src/observers/metrics_observer.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+"""Non-intrusive metrics observer for latency tracking."""
+
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import time
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from pipecat.frames.frames import MetricsFrame, UserAudioRawFrame, TranscriptionFrame, UserStartedSpeakingFrame
+from pipecat.metrics.metrics import TTFBMetricsData
+from loguru import logger
+from src.shared_state import metrics_store
+
+
+class MetricsObserver(BaseObserver):
+    """
+    Observer that monitors pipeline frames for metrics collection.
+    Does not interrupt the pipeline flow - purely watches frames as they pass.
+
+    STT Latency Measurement:
+    - Measures from turn start → first transcription received
+    - Works for services with internal turn detection (Speechmatics, Deepgram, etc.)
+    - For Deepgram, this captures endpointing + transcription time
+
+    Other services (Memory, LLM, TTS) emit MetricsFrame which we capture directly.
+    """
+
+    def __init__(self, webrtc_connection=None, stt_service=None, **kwargs):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self.stt_service = stt_service
+
+        # Shared state for metrics tracking
+        self._current_turn = 0
+        self._current_metrics = {}
+        self._tts_text_time = None
+        self._last_sent_metrics = {}
+        self._last_logged_turn = -1
+        self._vision_request_time = None
+
+        # Manual timing for STT services
+        self._stt_start_time = None
+        self._stt_measured_this_turn = False
+        self._mem0_start_time = None
+        self._mem0_measured_this_turn = False
+
+    def start_turn(self, turn_number: int):
+        """Called by TurnTrackingObserver when a new turn starts."""
+        self._current_turn = turn_number
+        self._current_metrics = {}
+        self._last_sent_metrics = {}
+        self._last_logged_turn = -1
+        self._stt_measured_this_turn = False
+        self._mem0_measured_this_turn = False
+
+        # Use turn start time as STT baseline
+        self._stt_start_time = time.time()
+        logger.info(f"[MetricsObserver] Turn #{self._current_turn} started, STT timer initialized")
+
+        self._mem0_start_time = None
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # STT timing: Measure from turn start to first transcription (manual fallback)
+        # Note: This includes speaking time + endpointing + transcription
+        # If the STT service emits MetricsFrame with TTFB, that will override this
+        if isinstance(frame, TranscriptionFrame) and not self._stt_measured_this_turn:
+            if self._stt_start_time is not None:
+                stt_latency_ms = (time.time() - self._stt_start_time) * 1000
+                self._current_metrics['stt_ttfb_ms'] = stt_latency_ms
+                self._stt_measured_this_turn = True
+                logger.info(f"[MetricsObserver] STT total latency: {stt_latency_ms:.0f}ms (turn start → transcription)")
+                logger.debug("  Note: Includes speaking time + processing. Use MetricsFrame TTFB for pure processing time.")
+                self._send_to_frontend()
+
+        # Capture MetricsFrame data from Pipecat's built-in metrics
+        if isinstance(frame, MetricsFrame):
+            try:
+                for metric_data in frame.data:
+                    if isinstance(metric_data, TTFBMetricsData):
+                        processor = metric_data.processor
+                        value_ms = metric_data.value * 1000  # Convert seconds to milliseconds
+                        processor_lower = processor.lower()
+
+                        # Log all processors to help debug
+                        logger.debug(f"[MetricsObserver] MetricsFrame: {processor} = {value_ms:.0f}ms")
+
+                        # Check STT (Deepgram, Speechmatics, etc.)
+                        if 'sttservice' in processor_lower or 'deepgram' in processor_lower or 'speechmatics' in processor_lower:
+                            if 'stt_ttfb_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['stt_ttfb_ms'] = value_ms
+                                logger.info(f"[MetricsObserver] STT TTFB: {value_ms:.0f}ms (from {processor})")
+                                logger.debug("  Note: TTFB = Time To First Byte (audio → first transcription)")
+                        # Check TTS (contains "tts" in name)
+                        elif 'ttsservice' in processor_lower or 'elevenlabs' in processor_lower or 'qwen' in processor_lower:
+                            if 'tts_ttfb_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['tts_ttfb_ms'] = value_ms
+                                logger.info(f"[MetricsObserver] TTS TTFB: {value_ms:.0f}ms (text → first audio)")
+                        # Check LLM
+                        elif 'llmservice' in processor_lower or 'openai' in processor_lower or 'deepinfra' in processor_lower:
+                            if 'llm_ttfb_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['llm_ttfb_ms'] = value_ms
+                                logger.info(f"[MetricsObserver] LLM TTFB: {value_ms:.0f}ms (prompt → first token)")
+                        # Check Memory (HybridMemory, ChromaDB)
+                        elif 'memory' in processor_lower or 'chromadb' in processor_lower or 'hybrid' in processor_lower:
+                            if 'memory_latency_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['memory_latency_ms'] = value_ms
+                                logger.info(f"[MetricsObserver] Memory latency: {value_ms:.0f}ms")
+                        else:
+                            logger.debug(f"[MetricsObserver] Unknown processor: {processor} ({value_ms:.0f}ms)")
+
+                # Calculate total latency and send if we have any metrics
+                if self._current_metrics:
+                    total = sum([
+                        self._current_metrics.get('stt_ttfb_ms', 0),
+                        self._current_metrics.get('memory_latency_ms', 0),
+                        self._current_metrics.get('llm_ttfb_ms', 0),
+                        self._current_metrics.get('tts_ttfb_ms', 0)
+                    ])
+                    if total > 0:
+                        self._current_metrics['total_ms'] = total
+
+                    self._send_to_frontend()
+
+            except Exception as e:
+                logger.error(f"Error processing MetricsFrame: {e}", exc_info=True)
+
+    def _send_to_frontend(self):
+        """Send metrics to frontend via WebRTC data channel and store locally for Gradio UI."""
+        # Check if metrics have changed since last send (deduplication)
+        current_metrics_key = (
+            self._current_turn,
+            self._current_metrics.get('stt_ttfb_ms'),
+            self._current_metrics.get('memory_latency_ms'),
+            self._current_metrics.get('llm_ttfb_ms'),
+            self._current_metrics.get('tts_ttfb_ms'),
+            self._current_metrics.get('vision_latency_ms'),
+        )
+
+        if current_metrics_key == self._last_sent_metrics:
+            return
+
+        # Store in shared state for Gradio UI
+        metrics_store.add_metric({
+            "turn_number": self._current_turn,
+            "timestamp": int(time.time() * 1000),
+            "stt_ttfb_ms": self._current_metrics.get('stt_ttfb_ms'),
+            "memory_latency_ms": self._current_metrics.get('memory_latency_ms'),
+            "llm_ttfb_ms": self._current_metrics.get('llm_ttfb_ms'),
+            "tts_ttfb_ms": self._current_metrics.get('tts_ttfb_ms'),
+            "vision_latency_ms": self._current_metrics.get('vision_latency_ms'),
+            "total_ms": self._current_metrics.get('total_ms'),
+        })
+
+        # Send via WebRTC if connection exists
+        if self.webrtc_connection:
+            try:
+                if self.webrtc_connection.is_connected():
+                    message = {
+                        "type": "metrics",
+                        "turn_number": self._current_turn,
+                        "timestamp": int(time.time() * 1000),
+                        **self._current_metrics
+                    }
+                    logger.debug(f"[MetricsObserver] Sending metrics: {message}")
+                    self.webrtc_connection.send_app_message(message)
+            except Exception as exc:
+                logger.error(f"[MetricsObserver] Failed to send metrics via WebRTC: {exc}")
+
+        # Log summary once per turn
+        if self._last_logged_turn != self._current_turn:
+            def fmt(val):
+                return f"{val:.0f}ms" if isinstance(val, (int, float)) else "N/A"
+
+            # Build metrics summary
+            metrics_parts = []
+            if 'stt_ttfb_ms' in self._current_metrics:
+                metrics_parts.append(f"STT={fmt(self._current_metrics.get('stt_ttfb_ms'))}")
+            if 'memory_latency_ms' in self._current_metrics:
+                metrics_parts.append(f"Memory={fmt(self._current_metrics.get('memory_latency_ms'))}")
+            if 'llm_ttfb_ms' in self._current_metrics:
+                metrics_parts.append(f"LLM={fmt(self._current_metrics.get('llm_ttfb_ms'))}")
+            if 'tts_ttfb_ms' in self._current_metrics:
+                metrics_parts.append(f"TTS={fmt(self._current_metrics.get('tts_ttfb_ms'))}")
+            if 'vision_latency_ms' in self._current_metrics:
+                metrics_parts.append(f"Vision={fmt(self._current_metrics.get('vision_latency_ms'))}")
+
+            if metrics_parts:
+                logger.info(f"Turn #{self._current_turn}: " + " | ".join(metrics_parts))
+            self._last_logged_turn = self._current_turn
+
+        self._last_sent_metrics = current_metrics_key
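
The observer above derives `total_ms` by summing whichever per-stage TTFBs have been captured, and suppresses duplicate sends with a tuple key. A minimal standalone sketch of that logic (the helper names `compute_turn_total` and `MetricsDeduper` are illustrative, not part of the repo):

# Standalone sketch of MetricsObserver's turn-total and dedup logic.
STAGES = ("stt_ttfb_ms", "memory_latency_ms", "llm_ttfb_ms", "tts_ttfb_ms")

def compute_turn_total(metrics: dict) -> float:
    """Sum the per-stage TTFBs captured so far (missing stages count as 0)."""
    return sum(metrics.get(stage, 0) or 0 for stage in STAGES)

class MetricsDeduper:
    """Skip resending a payload when nothing changed since the last send."""
    def __init__(self):
        self._last_key = None

    def should_send(self, turn: int, metrics: dict) -> bool:
        key = (turn,) + tuple(metrics.get(s) for s in STAGES)
        if key == self._last_key:
            return False
        self._last_key = key
        return True

metrics = {"stt_ttfb_ms": 420.0, "llm_ttfb_ms": 310.0, "tts_ttfb_ms": 180.0}
assert compute_turn_total(metrics) == 910.0
deduper = MetricsDeduper()
assert deduper.should_send(1, metrics) is True
assert deduper.should_send(1, metrics) is False  # identical payload is suppressed
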
src/observers/state_observer.py
ADDED
@@ -0,0 +1,166 @@
+"""
+State observer for WebRTC DataChannel synchronization.
+
+Observes Pipecat pipeline events and sends state updates to RPi via DataChannel:
+- Transcription events → eye state (listening)
+- LLM events → eye state (thinking)
+- TTS events → eye state (speaking)
+- Transcripts → text display
+"""
+
+import asyncio
+from typing import Optional
+from loguru import logger
+
+from pipecat.observers.base_observer import BaseObserver
+from pipecat.frames.frames import (
+    TranscriptionFrame,
+    LLMFullResponseStartFrame,
+    LLMFullResponseEndFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+
+from transport.state_sync import StateSync
+
+
+class StateObserver(BaseObserver):
+    """
+    Observes pipeline events and sends state to RPi via DataChannel.
+
+    Automatically manages eye states based on conversation flow:
+    - User speaking → listening
+    - LLM processing → thinking
+    - TTS output → speaking
+    - Idle → default
+    """
+
+    def __init__(self, state_sync: Optional[StateSync] = None):
+        """
+        Initialize state observer.
+
+        Args:
+            state_sync: StateSync instance for sending messages
+        """
+        super().__init__()
+        self.state_sync = state_sync
+        self._current_state = "idle"
+        self._idle_delay = 0.5
+        self._idle_task = None
+
+    def set_state_sync(self, state_sync: StateSync):
+        """Set StateSync instance."""
+        self.state_sync = state_sync
+
+    async def on_transcription(self, *args, **kwargs):
+        """Handle transcription events (user speaking)."""
+        try:
+            # Cancel pending idle timer
+            self.cancel_idle_timer()
+
+            # Extract frame from args
+            frame = args[0] if args else None
+
+            if isinstance(frame, TranscriptionFrame):
+                text = frame.text
+                user_id = getattr(frame, "user_id", "user")
+
+                # Send transcript to RPi
+                if self.state_sync:
+                    self.state_sync.send_transcript("user", text)
+                    # Set eye state to listening when user speaks
+                    if text.strip():
+                        self._update_state("listening")
+
+                logger.debug(f"Transcription: {text}")
+
+        except Exception as e:
+            logger.error(f"Error in transcription observer: {e}")
+
+    async def on_llm_full_response_start(self, *args, **kwargs):
+        """Handle LLM response start (thinking)."""
+        try:
+            # Cancel pending idle timer
+            self.cancel_idle_timer()
+
+            if self.state_sync:
+                self._update_state("thinking")
+            logger.debug("LLM thinking started")
+        except Exception as e:
+            logger.error(f"Error in LLM start observer: {e}")
+
+    async def on_llm_full_response_end(self, *args, **kwargs):
+        """Handle LLM response end."""
+        try:
+            # State will be updated by TTS start or return to idle
+            logger.debug("LLM thinking ended")
+        except Exception as e:
+            logger.error(f"Error in LLM end observer: {e}")
+
+    async def on_tts_started(self, *args, **kwargs):
+        """Handle TTS start (speaking)."""
+        try:
+            if self.state_sync:
+                self._update_state("speaking")
+                self.state_sync.send_tts_state(True)
+            logger.debug("TTS started")
+        except Exception as e:
+            logger.error(f"Error in TTS start observer: {e}")
+
+    async def on_tts_stopped(self, *args, **kwargs):
+        """Handle TTS stop (return to idle after delay)."""
+        try:
+            if self.state_sync:
+                self.state_sync.send_tts_state(False)
+
+            # Cancel existing idle timer
+            if self._idle_task and not self._idle_task.done():
+                self._idle_task.cancel()
+
+            # Set idle after delay
+            async def delayed_idle():
+                await asyncio.sleep(self._idle_delay)
+                self._update_state("idle")
+
+            self._idle_task = asyncio.create_task(delayed_idle())
+            logger.debug(f"TTS stopped, idle in {self._idle_delay}s")
+        except Exception as e:
+            logger.error(f"Error in TTS stop observer: {e}")
+
+    async def on_user_transcript(self, *args, **kwargs):
+        """Handle complete user transcript."""
+        try:
+            # Extract text from args
+            text = args[1] if len(args) > 1 else ""
+            if text and self.state_sync:
+                self.state_sync.send_transcript("user", text)
+        except Exception as e:
+            logger.error(f"Error in user transcript observer: {e}")
+
+    async def on_bot_transcript(self, *args, **kwargs):
+        """Handle complete bot transcript."""
+        try:
+            # Extract text from args
+            text = args[1] if len(args) > 1 else ""
+            if text and self.state_sync:
+                self.state_sync.send_transcript("assistant", text)
+        except Exception as e:
+            logger.error(f"Error in bot transcript observer: {e}")
+
+    def cancel_idle_timer(self):
+        """Cancel pending idle timer."""
+        if self._idle_task and not self._idle_task.done():
+            self._idle_task.cancel()
+        self._idle_task = None
+
+    def _update_state(self, new_state: str):
+        """
+        Update eye state if changed.
+
+        Args:
+            new_state: New state to set
+        """
+        if new_state != self._current_state:
+            self._current_state = new_state
+            if self.state_sync:
+                self.state_sync.send_eye_state(new_state)
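
A quick way to see the eye-state flow end to end is to drive the handlers above directly. A minimal sketch, assuming `StateObserver` and its `transport.state_sync` dependency are importable; `RecordingStateSync` is a hypothetical test double standing in for the real DataChannel-backed StateSync:

# Sketch: exercise StateObserver's listening→thinking→speaking→idle flow.
import asyncio
from src.observers.state_observer import StateObserver  # assumes repo import paths resolve

class RecordingStateSync:
    """Hypothetical test double that records calls instead of sending them."""
    def __init__(self):
        self.events = []
    def send_eye_state(self, state):
        self.events.append(("eye", state))
    def send_tts_state(self, speaking):
        self.events.append(("tts", speaking))
    def send_transcript(self, role, text):
        self.events.append(("transcript", role, text))

async def demo():
    sync = RecordingStateSync()
    observer = StateObserver(state_sync=sync)
    await observer.on_llm_full_response_start()  # -> thinking
    await observer.on_tts_started()              # -> speaking, tts True
    await observer.on_tts_stopped()              # schedules idle after _idle_delay
    await asyncio.sleep(0.6)                     # let delayed_idle fire
    assert ("eye", "thinking") in sync.events
    assert ("eye", "speaking") in sync.events
    assert sync.events[-1] == ("eye", "idle")

asyncio.run(demo())
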
src/observers/transcription_observer.py
ADDED
@@ -0,0 +1,70 @@
+"""Observer for logging transcriptions and sending to frontend."""
+
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import time
+from loguru import logger
+from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from src.shared_state import metrics_store
+
+
+class TranscriptionObserver(BaseObserver):
+    """Logs transcriptions and sends to frontend."""
+
+    def __init__(self, webrtc_connection=None, client_state=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self.client_state = client_state or {}
+        self._last_transcription = None  # Track last transcription to avoid duplicates
+        self._last_transcription_time = 0  # Timestamp of last transcription
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+        current_time = time.time()
+
+        # --- (Logging Logic) ---
+        if isinstance(frame, TranscriptionFrame):
+            # Deduplicate: Skip if same text within 200ms (different user_ids)
+            time_diff = current_time - self._last_transcription_time
+            if self._last_transcription == frame.text and time_diff < 0.2:
+                logger.debug(f"Skipping duplicate transcription: '{frame.text}' (last seen {time_diff*1000:.0f}ms ago)")
+                return
+
+            self._last_transcription = frame.text
+            self._last_transcription_time = current_time
+
+            raw_id = getattr(frame, 'user_id', None)
+            display_id = raw_id if (raw_id and raw_id != "S1") else self.client_state.get("client_id", "guest")
+
+            logger.info(f"Transcription [{display_id}]: {frame.text}")
+
+            # Store in shared state for Gradio UI
+            metrics_store.add_transcription("user", frame.text)
+
+            # Update Frontend via WebRTC
+            if self.webrtc_connection:
+                self._send_to_frontend("transcription", frame.text, display_id)
+
+        elif isinstance(frame, InterimTranscriptionFrame):
+            raw_id = getattr(frame, 'user_id', None)
+            display_id = raw_id if (raw_id and raw_id != "S1") else self.client_state.get("client_id", "guest")
+
+            # Update Frontend (don't deduplicate partials as they change frequently)
+            if self.webrtc_connection:
+                self._send_to_frontend("partial", frame.text, display_id)
+
+    def _send_to_frontend(self, type_str, text, speaker_id):
+        """Helper to send messages to frontend via WebRTC data channel."""
+        try:
+            if self.webrtc_connection and self.webrtc_connection.is_connected():
+                self.webrtc_connection.send_app_message({
+                    "type": type_str,
+                    "text": text,
+                    "speaker_id": speaker_id
+                })
+        except Exception as e:
+            logger.error(f"Error sending {type_str}: {e}")
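
The 200ms duplicate window above exists because the same utterance can arrive twice under different user IDs. A standalone sketch of just that logic (`TranscriptDeduper` is an illustrative name, not part of the repo):

# Sketch of the 200 ms duplicate-transcription window used by TranscriptionObserver.
import time
from typing import Optional

class TranscriptDeduper:
    def __init__(self, window_s: float = 0.2):
        self._window = window_s
        self._last_text = None
        self._last_time = 0.0

    def is_duplicate(self, text: str, now: Optional[float] = None) -> bool:
        now = time.time() if now is None else now
        dup = (text == self._last_text) and (now - self._last_time < self._window)
        if not dup:  # only accepted transcriptions update the reference point
            self._last_text, self._last_time = text, now
        return dup

d = TranscriptDeduper()
assert d.is_duplicate("hello", now=10.00) is False  # first sighting
assert d.is_duplicate("hello", now=10.15) is True   # same text, within 200 ms
assert d.is_duplicate("hello", now=10.45) is False  # window expired
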
src/observers/tts_state_observer.py
ADDED
@@ -0,0 +1,56 @@
+"""Observer for broadcasting TTS state changes to frontend."""
+
+from loguru import logger
+from pipecat.frames.frames import TTSStartedFrame, TTSStoppedFrame, TTSAudioRawFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+
+
+class TTSStateObserver(BaseObserver):
+    """Emits `tts_state` messages whenever the assistant starts or stops speaking."""
+
+    def __init__(self, webrtc_connection=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self._speaking = False
+        self._has_received_audio = False
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # Priority 1: Explicit start/stop frames (most reliable)
+        if isinstance(frame, TTSStartedFrame):
+            self._set_state(True)
+        elif isinstance(frame, TTSStoppedFrame):
+            self._set_state(False)
+            self._has_received_audio = False
+        elif isinstance(frame, TTSAudioRawFrame):
+            # Priority 2: Use first audio frame to detect start (fallback)
+            # Only set to started if we haven't already and this is the first audio frame
+            if not self._speaking and not self._has_received_audio:
+                logger.debug("Detected TTS start via first TTSAudioRawFrame")
+                self._set_state(True)
+                self._has_received_audio = True
+            # Note: We rely on TTSStoppedFrame to detect stop, not audio frame absence
+
+    def _set_state(self, active: bool):
+        if self._speaking == active:
+            return
+
+        self._speaking = active
+        state = "started" if active else "stopped"
+
+        if not self.webrtc_connection:
+            return
+
+        try:
+            if self.webrtc_connection.is_connected():
+                self.webrtc_connection.send_app_message(
+                    {
+                        "type": "tts_state",
+                        "state": state,
+                    }
+                )
+                logger.debug(f"Sent TTS state message: {state}")
+        except Exception as exc:
+            logger.error(f"Failed to send TTS state: {exc}")
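
`_set_state` is edge-triggered: repeated frames in the same state produce no messages, so the frontend only sees transitions. A minimal sketch, assuming `TTSStateObserver` (above) and pipecat are importable; `FakeConnection` is a hypothetical test double:

# Sketch of TTSStateObserver's edge-triggered state messages.
from src.observers.tts_state_observer import TTSStateObserver  # assumes pipecat installed

class FakeConnection:
    """Hypothetical test double recording app messages."""
    def __init__(self):
        self.messages = []
    def is_connected(self):
        return True
    def send_app_message(self, msg):
        self.messages.append(msg)

conn = FakeConnection()
obs = TTSStateObserver(webrtc_connection=conn)
obs._set_state(True)
obs._set_state(True)   # no-op: state unchanged, nothing is sent
obs._set_state(False)
assert [m["state"] for m in conn.messages] == ["started", "stopped"]
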
src/observers/vision_observer.py
ADDED
@@ -0,0 +1,142 @@
+"""Observer for logging vision processing events and Moondream activity."""
+
+import time
+from loguru import logger
+from pipecat.frames.frames import UserImageRequestFrame, LLMTextFrame, ErrorFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+
+
+class VisionObserver(BaseObserver):
+    """Logs vision processing events and Moondream activity."""
+
+    def __init__(self, webrtc_connection=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self._video_frame_count = 0
+        self._last_video_frame_time = None
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        current_time = time.time()
+
+        frame_type = type(frame).__name__
+
+        # Log vision request frames
+        if isinstance(frame, UserImageRequestFrame):
+            user_id = getattr(frame, 'user_id', 'unknown')
+            question = getattr(frame, 'text', 'unknown')
+            logger.info(f"Vision request received: user_id={user_id}, question={question}")
+            self._last_vision_request_time = current_time  # Track when vision was requested
+            self._vision_request_count = getattr(self, '_vision_request_count', 0) + 1
+            logger.info(f"Vision request #{self._vision_request_count} - waiting for video frames and Moondream response...")
+
+            # Send status to frontend
+            if self.webrtc_connection:
+                try:
+                    if self.webrtc_connection.is_connected():
+                        self.webrtc_connection.send_app_message({
+                            "type": "vision",
+                            "status": "requested",
+                            "question": question
+                        })
+                except Exception as e:
+                    logger.debug(f"Error sending vision status: {e}")
+
+        elif 'video' in frame_type.lower() or 'image' in frame_type.lower() or 'vision' in frame_type.lower():
+            # Log frame traffic at debug level: during the first 5 seconds of an
+            # active vision request (to confirm frames are flowing), or whenever
+            # no request is pending. Older active requests are logged nothing.
+            is_vision_active = hasattr(self, '_last_vision_request_time') and self._last_vision_request_time is not None
+            if is_vision_active:
+                time_since_request = current_time - self._last_vision_request_time
+                if time_since_request < 5:  # Only log during active vision processing (5 seconds)
+                    logger.debug(f"Vision-related frame: {frame_type}")
+            else:
+                logger.debug(f"Vision-related frame: {frame_type}")
+
+        # Log frames with image attribute only at debug level
+        elif hasattr(frame, 'image'):
+            logger.debug(f"Frame with image attribute: {frame_type}")
+
+        # Log any frame that might be a vision response by checking attributes
+        elif hasattr(frame, 'user_id') and hasattr(frame, 'text'):
+            user_id = getattr(frame, 'user_id', 'unknown')
+            text = getattr(frame, 'text', '')
+            if 'vision' in frame_type.lower() or 'image' in frame_type.lower() or 'moondream' in frame_type.lower():
+                logger.info(f"Vision response frame: {frame_type}, user_id={user_id}")
+                logger.info(f"  Response: {text[:200]}..." if len(text) > 200 else f"  Response: {text}")
+
+        # Log LLM text frames that might contain vision responses
+        # Moondream responses come through as LLMTextFrame with vision context
+        elif isinstance(frame, LLMTextFrame):
+            text = getattr(frame, 'text', '')
+            vision_keywords = ['see', 'visible', 'camera', 'image', 'showing', 'appears', 'looks like', 'dimly lit', 'desk', 'monitor', 'room', 'window', 'mug', 'laptop', 'coffee', 'analyzing', 'processing']
+
+            # Check if this is a vision response (either from keywords or if we recently requested vision)
+            is_vision_response = False
+            if hasattr(self, '_last_vision_request_time'):
+                time_since_request = current_time - self._last_vision_request_time
+                if time_since_request < 10:  # Within 10 seconds of vision request
+                    is_vision_response = True
+                    logger.info(f"Vision response received (within {time_since_request:.1f}s of request): {text[:200]}..." if len(text) > 200 else f"Vision response: {text}")
+
+            if text and any(keyword in text.lower() for keyword in vision_keywords) and not is_vision_response:
+                logger.info(f"Possible vision response in LLM text: {text[:200]}..." if len(text) > 200 else f"Possible vision response: {text}")
+
+        # Log errors
+        elif isinstance(frame, ErrorFrame):
+            error_msg = getattr(frame, 'error', str(frame))
+            if 'vision' in error_msg.lower() or 'moondream' in error_msg.lower() or 'image' in error_msg.lower():
+                logger.error(f"Vision error: {error_msg}")
+
+                # Send error to frontend
+                if self.webrtc_connection:
+                    try:
+                        if self.webrtc_connection.is_connected():
+                            self.webrtc_connection.send_app_message({
+                                "type": "vision",
+                                "status": "error",
+                                "error": str(error_msg)
+                            })
+                    except Exception as e:
+                        logger.debug(f"Error sending vision error: {e}")
+
+        # Check for actual video frames - be specific to avoid false positives
+        # (audio frames must never be counted as video)
+        is_video_frame = False
+
+        # Explicitly exclude audio frames
+        if 'audio' in frame_type.lower():
+            is_video_frame = False
+        # Check for actual video frame types
+        elif 'VideoRawFrame' in frame_type or 'InputVideoRawFrame' in frame_type:
+            is_video_frame = True
+        elif 'video' in frame_type.lower() and 'audio' not in frame_type.lower():
+            # Only if it's a video frame and not an audio frame
+            is_video_frame = True
+        elif hasattr(frame, 'video') and not hasattr(frame, 'audio'):
+            # Has video attribute but not audio
+            is_video_frame = True
+        elif hasattr(frame, 'image') and hasattr(frame, 'user_id'):
+            # User image request/response frames
+            is_video_frame = True
+
+        # Only log actual video frames, not audio frames
+        if is_video_frame:
+            self._video_frame_count += 1
+            self._last_video_frame_time = current_time
+            # Only log every 100 frames to reduce spam significantly
+            if self._video_frame_count % 100 == 0:
+                logger.debug(f"Video frames streaming: {self._video_frame_count} frames received")
+
+        # Log frame count summary every 30 seconds (less frequent)
+        if not hasattr(self, '_last_summary_time'):
+            self._last_summary_time = current_time
+        elif current_time - self._last_summary_time >= 30:
+            if self._video_frame_count > 0:
+                logger.debug(f"Video stream: {self._video_frame_count} frames in last 30 seconds")
+            else:
+                logger.warning("No video frames detected in last 30 seconds!")
+            self._video_frame_count = 0
+            self._last_summary_time = current_time
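
The frame-counting branch above classifies frames by duck typing rather than strict isinstance checks. A condensed, behavior-equivalent sketch of that heuristic with stand-in frame classes (`looks_like_video_frame` is an illustrative name, not repo code):

# Sketch of VisionObserver's video-frame heuristic, condensed.
def looks_like_video_frame(frame) -> bool:
    frame_type = type(frame).__name__
    if 'audio' in frame_type.lower():
        return False  # audio frames are never counted as video
    if 'video' in frame_type.lower():
        return True   # covers VideoRawFrame / InputVideoRawFrame by name
    if hasattr(frame, 'video') and not hasattr(frame, 'audio'):
        return True
    if hasattr(frame, 'image') and hasattr(frame, 'user_id'):
        return True   # user image request/response frames
    return False

# Duck-typed stand-ins for pipecat frame classes:
class InputVideoRawFrame: pass
class InputAudioRawFrame: pass
class UserImageRawFrame:
    image = b""
    user_id = "u1"

assert looks_like_video_frame(InputVideoRawFrame()) is True
assert looks_like_video_frame(InputAudioRawFrame()) is False
assert looks_like_video_frame(UserImageRawFrame()) is True
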
src/processors/__init__.py
ADDED
@@ -0,0 +1,18 @@
+"""Frame processors for the Pipecat pipeline.
+
+This module contains processors that transform, filter, or process data.
+For logging/monitoring processors, see loggers.py module.
+"""
+
+from .filters import SilenceFilter, InputAudioFilter
+from .gating import InterventionGating
+from .visual_observer import VisualObserver
+from .emotional_monitor import EmotionalStateMonitor
+
+__all__ = [
+    "SilenceFilter",
+    "InputAudioFilter",
+    "InterventionGating",
+    "VisualObserver",
+    "EmotionalStateMonitor",
+]
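
The `__all__` list gives pipeline code a single import point for the conversation-control processors. A one-line usage sketch (the surrounding pipeline wiring is assumed, not shown):

# Illustrative import usage of the package exports above.
from src.processors import SilenceFilter, InputAudioFilter, InterventionGating, EmotionalStateMonitor
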
src/processors/emotional_monitor.py
ADDED
@@ -0,0 +1,303 @@
+"""
+Real-time emotional and cognitive state monitoring using continuous video analysis.
+Detects hesitation, confusion, frustration, and other emotional cues to trigger TARS intervention.
+"""
+
+import asyncio
+import time
+import base64
+from typing import Optional, Dict, List
+from loguru import logger
+from PIL import Image
+import io
+
+from pipecat.frames.frames import (
+    Frame,
+    ImageRawFrame,
+    TextFrame,
+    LLMRunFrame,
+)
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+
+
+class EmotionalState:
+    """Container for detected emotional/cognitive state"""
+
+    def __init__(
+        self,
+        confused: bool = False,
+        hesitant: bool = False,
+        frustrated: bool = False,
+        focused: bool = False,
+        confidence: float = 0.0,
+        description: str = "",
+    ):
+        self.confused = confused
+        self.hesitant = hesitant
+        self.frustrated = frustrated
+        self.focused = focused
+        self.confidence = confidence
+        self.description = description
+        self.timestamp = time.time()
+
+    def needs_intervention(self) -> bool:
+        """Determine if TARS should intervene based on detected state"""
+        # Intervene if user shows signs of confusion, hesitation, or frustration
+        return self.confused or self.hesitant or self.frustrated
+
+    def __repr__(self):
+        states = []
+        if self.confused: states.append("confused")
+        if self.hesitant: states.append("hesitant")
+        if self.frustrated: states.append("frustrated")
+        if self.focused: states.append("focused")
+        return f"EmotionalState({', '.join(states) if states else 'neutral'}, confidence={self.confidence:.2f})"
+
+
+class EmotionalStateMonitor(FrameProcessor):
+    """
+    Continuously monitors video feed for emotional and cognitive states.
+    Analyzes facial expressions, body language, and behavior patterns to detect:
+    - Confusion (furrowed brow, head tilt, puzzled expression)
+    - Hesitation (pauses, uncertain gestures, looking away)
+    - Frustration (tense posture, sighs, agitated movements)
+    - Focus (engaged eye contact, attentive posture)
+
+    Triggers TARS intervention when negative states are detected.
+    """
+
+    def __init__(
+        self,
+        vision_client,
+        model: str = "moondream",
+        sampling_interval: float = 3.0,
+        intervention_threshold: int = 2,
+        enabled: bool = True,
+        auto_intervene: bool = False,
+    ):
+        """
+        Args:
+            vision_client: Moondream or compatible vision API client
+            model: Vision model to use
+            sampling_interval: Seconds between frame analyses (default: 3.0)
+            intervention_threshold: Number of consecutive negative states before intervening
+            enabled: Whether monitoring is active
+            auto_intervene: If True, automatically triggers LLM when threshold reached.
+                If False, only tracks state (used by gating layer)
+        """
+        super().__init__()
+        self._vision_client = vision_client
+        self._model = model
+        self._sampling_interval = sampling_interval
+        self._intervention_threshold = intervention_threshold
+        self._enabled = enabled
+        self._auto_intervene = auto_intervene
+
+        # State tracking
+        self._last_sample_time = 0
+        self._last_state: Optional[EmotionalState] = None
+        self._state_history: List[EmotionalState] = []
+        self._consecutive_negative_states = 0
+        self._analyzing = False
+
+        # Cooldown tracking (when user declines help)
+        self._help_declined_time: Optional[float] = None
+        self._cooldown_duration = 30.0  # seconds - don't re-offer help for 30s after decline
+
+        logger.info("Emotional State Monitor initialized")
+        logger.info(f"  Sampling interval: {sampling_interval}s")
+        logger.info(f"  Intervention threshold: {intervention_threshold}")
+        logger.info(f"  Auto-intervene: {auto_intervene}")
+        logger.info(f"  Enabled: {enabled}")
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process video frames and sample periodically for emotional analysis"""
+        await super().process_frame(frame, direction)
+
+        # Only analyze if enabled and frame is video input
+        if not self._enabled or not isinstance(frame, ImageRawFrame):
+            await self.push_frame(frame, direction)
+            return
+
+        # Check if it's time to sample
+        current_time = time.time()
+        if current_time - self._last_sample_time >= self._sampling_interval:
+            # Don't block the pipeline - analyze in background
+            if not self._analyzing:
+                self._last_sample_time = current_time
+                asyncio.create_task(self._analyze_emotional_state(frame))
+
+        await self.push_frame(frame, direction)
+
+    async def _analyze_emotional_state(self, frame: ImageRawFrame):
+        """Analyze frame for emotional/cognitive state"""
+        self._analyzing = True
+
+        try:
+            # Convert frame to base64
+            image = Image.frombytes(frame.format, frame.size, frame.image)
+            buffered = io.BytesIO()
+            image.save(buffered, format="JPEG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+
+            # Construct emotion detection prompt
+            prompt = (
+                "Analyze the person's emotional and cognitive state. "
+                "Are they showing signs of: confusion (furrowed brow, puzzled expression), "
+                "hesitation (pauses, uncertain gestures), frustration (tense posture), "
+                "or focus (engaged, attentive)? "
+                "Respond concisely with detected states."
+            )
+
+            logger.debug("Analyzing emotional state...")
+
+            try:
+                response = await asyncio.wait_for(
+                    self._vision_client.chat.completions.create(
+                        model=self._model,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    {"type": "text", "text": prompt},
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {
+                                            "url": f"data:image/jpeg;base64,{img_str}"
+                                        },
+                                    },
+                                ],
+                            }
+                        ],
+                        max_tokens=100,
+                    ),
+                    timeout=5.0,
+                )
+
+                description = response.choices[0].message.content.lower()
+                logger.debug(f"Emotional analysis: {description}")
+
+                # Parse response to detect states
+                state = EmotionalState(
+                    confused="confus" in description or "puzzle" in description or "uncertain" in description,
+                    hesitant="hesita" in description or "unsure" in description or "pause" in description,
+                    frustrated="frustrat" in description or "tense" in description or "agitat" in description,
+                    focused="focus" in description or "attentive" in description or "engaged" in description,
+                    confidence=0.7,  # Could be enhanced with more sophisticated parsing
+                    description=description,
+                )
+
+                self._last_state = state
+                self._state_history.append(state)
+
+                # Keep only recent history (last 10 states)
+                if len(self._state_history) > 10:
+                    self._state_history.pop(0)
+
+                logger.info(f"State detected: {state}")
+
+                # Track consecutive negative states
+                if state.needs_intervention():
+                    self._consecutive_negative_states += 1
+                    logger.warning(
+                        f"Negative state detected "
+                        f"({self._consecutive_negative_states}/{self._intervention_threshold})"
+                    )
+                else:
+                    self._consecutive_negative_states = 0
+
+                # Trigger intervention if threshold reached AND auto-intervene enabled
+                if self._auto_intervene and self._consecutive_negative_states >= self._intervention_threshold:
+                    await self._trigger_intervention(state)
+                    self._consecutive_negative_states = 0  # Reset after intervention
+                elif self._consecutive_negative_states >= self._intervention_threshold:
+                    # Just log, don't intervene (gating layer will handle it)
+                    logger.info(
+                        f"Intervention threshold reached ({self._consecutive_negative_states}) "
+                        f"- state available for gating layer"
+                    )
+
+            except asyncio.TimeoutError:
+                logger.warning("Emotional analysis timed out")
+            except Exception as e:
+                logger.error(f"Emotional analysis error: {e}")
+
+        except Exception as e:
+            logger.error(f"Error in emotional monitoring: {e}")
+        finally:
+            self._analyzing = False
+
+    async def _trigger_intervention(self, state: EmotionalState):
+        """Trigger TARS intervention based on detected emotional state"""
+        logger.info(f"Triggering TARS intervention for: {state}")
+
+        # Construct intervention message based on state
+        intervention_msg = self._get_intervention_message(state)
+
+        # Push context message to LLM
+        context_frame = TextFrame(
+            text=f"[Emotional State Alert]: {intervention_msg}"
+        )
+        await self.push_frame(context_frame, FrameDirection.UPSTREAM)
+
+        # Trigger LLM to respond
+        await self.push_frame(LLMRunFrame(), FrameDirection.UPSTREAM)
+
+        logger.info("Intervention triggered")
+
+    def _get_intervention_message(self, state: EmotionalState) -> str:
+        """Generate appropriate intervention message based on detected state"""
+        if state.confused:
+            return (
+                "The user appears confused or uncertain. "
+                "Consider offering help or clarification proactively."
+            )
+        elif state.hesitant:
+            return (
+                "The user seems hesitant or unsure. "
+                "You might want to check if they need assistance."
+            )
+        elif state.frustrated:
+            return (
+                "The user appears frustrated or tense. "
+                "Consider offering support or suggesting a different approach."
+            )
+        else:
+            return (
+                "The user shows signs of difficulty. "
+                "Consider offering assistance."
+            )
+
+    def enable(self):
+        """Enable emotional monitoring"""
+        self._enabled = True
+        logger.info("Emotional monitoring enabled")
+
+    def disable(self):
+        """Disable emotional monitoring"""
+        self._enabled = False
+        logger.info("Emotional monitoring disabled")
+
+    def get_current_state(self) -> Optional[EmotionalState]:
+        """Get the most recent emotional state"""
+        return self._last_state
+
+    def get_state_summary(self) -> Dict:
+        """Get summary of recent emotional states"""
+        if not self._state_history:
+            return {"status": "no_data"}
+
+        total = len(self._state_history)
+        confused_count = sum(1 for s in self._state_history if s.confused)
+        hesitant_count = sum(1 for s in self._state_history if s.hesitant)
+        frustrated_count = sum(1 for s in self._state_history if s.frustrated)
+        focused_count = sum(1 for s in self._state_history if s.focused)
+
+        return {
+            "total_samples": total,
+            "confused_ratio": confused_count / total,
+            "hesitant_ratio": hesitant_count / total,
+            "frustrated_ratio": frustrated_count / total,
+            "focused_ratio": focused_count / total,
+            "current_state": str(self._last_state) if self._last_state else "unknown",
+        }
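
The state parsing above is plain substring matching on the vision model's free-text answer. A standalone sketch of that mapping (`parse_emotional_state` is an illustrative helper name; `EmotionalState` is the class above, assuming the module and its pipecat/PIL dependencies are importable):

# Sketch of the keyword parsing that maps a vision-model description to an EmotionalState.
from src.processors.emotional_monitor import EmotionalState  # assumes deps installed

def parse_emotional_state(description: str) -> EmotionalState:
    d = description.lower()
    return EmotionalState(
        confused=any(k in d for k in ("confus", "puzzle", "uncertain")),
        hesitant=any(k in d for k in ("hesita", "unsure", "pause")),
        frustrated=any(k in d for k in ("frustrat", "tense", "agitat")),
        focused=any(k in d for k in ("focus", "attentive", "engaged")),
        confidence=0.7,  # fixed value, matching the monitor above
        description=d,
    )

state = parse_emotional_state("The person looks puzzled and hesitant, pausing often.")
assert state.confused and state.hesitant and state.needs_intervention()
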
src/processors/filters.py
ADDED
@@ -0,0 +1,81 @@
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+from pipecat.frames.frames import (
+    LLMFullResponseEndFrame,
+    LLMTextFrame,
+    LLMFullResponseStartFrame,
+    Frame,
+    InputAudioRawFrame,
+    StartFrame,
+    EndFrame,
+    CancelFrame,
+)
+from loguru import logger
+import json
+
+
+class InputAudioFilter(FrameProcessor):
+    """
+    Dedicated filter to block InputAudioRawFrame from reaching TTS service.
+    These frames should only go upstream (to STT), never downstream (to TTS).
+    """
+    async def process_frame(self, frame: Frame, direction):
+        await super().process_frame(frame, direction)
+
+        # Block audio going downstream
+        if isinstance(frame, InputAudioRawFrame) and direction == FrameDirection.DOWNSTREAM:
+            return
+        await self.push_frame(frame, direction)
+
+
+class SilenceFilter(FrameProcessor):
+    """
+    Intercepts LLM responses. If response is {"action": "silence"}, drops it.
+    """
+    def __init__(self):
+        super().__init__()
+        self.current_response_text = ""
+        self.is_collecting = False
+
+    async def process_frame(self, frame: Frame, direction):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, (StartFrame, EndFrame, CancelFrame)):
+            self.current_response_text = ""
+            self.is_collecting = False
+            await self.push_frame(frame, direction)
+            return
+
+        # Start collecting text
+        if isinstance(frame, LLMFullResponseStartFrame):
+            self.current_response_text = ""
+            self.is_collecting = True
+            await self.push_frame(frame, direction)
+
+        # Accumulate text
+        elif isinstance(frame, LLMTextFrame) and self.is_collecting:
+            self.current_response_text += frame.text
+            await self.push_frame(frame, direction)
+
+        # Check the full response
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            if self.is_collecting:
+                text = self.current_response_text.strip()
+                try:
+                    # Check for silence JSON. The text frames already went
+                    # downstream, but the JSON contains no sentence boundary,
+                    # so dropping the EndFrame here withholds the final flush
+                    # that would otherwise hand the text to TTS.
+                    if "action" in text and "silence" in text:
+                        clean_json = text.replace("```json", "").replace("```", "").strip()
+                        data = json.loads(clean_json)
+                        if data.get("action") == "silence":
+                            logger.info("SilenceFilter: Suppressing silent response.")
+                            self.is_collecting = False
+                            return  # Drop the EndFrame (silence the turn)
+                except (json.JSONDecodeError, ValueError):
+                    pass
+                self.is_collecting = False
+            await self.push_frame(frame, direction)
+
+        # Pass everything else (like Audio or System messages)
+        else:
+            await self.push_frame(frame, direction)
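
The silence check tolerates a model wrapping its JSON in markdown fences. A standalone sketch of just that detection (`is_silence_response` is an illustrative name, not part of the repo):

# Sketch of SilenceFilter's silence-JSON detection, including fence stripping.
import json

def is_silence_response(text: str) -> bool:
    text = text.strip()
    if "action" not in text or "silence" not in text:
        return False  # cheap pre-check before parsing
    clean = text.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(clean).get("action") == "silence"
    except (json.JSONDecodeError, ValueError):
        return False

assert is_silence_response('{"action": "silence"}') is True
assert is_silence_response('```json\n{"action": "silence"}\n```') is True
assert is_silence_response("Sure, happy to help!") is False
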
src/processors/gating.py
ADDED
@@ -0,0 +1,129 @@
+"""Intervention Gating: Traffic Controller for Bot Responses."""
+
+import json
+import time
+import aiohttp
+import asyncio
+from loguru import logger
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+from pipecat.frames.frames import LLMMessagesFrame, Frame
+from character.prompts import build_gating_system_prompt
+
+
+class InterventionGating(FrameProcessor):
+    """
+    Traffic Controller: Decides if TARS should reply based on Audio + Vision + Emotions.
+    Uses OpenAI-compatible API (DeepInfra).
+    """
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://api.deepinfra.com/v1/openai",
+        model: str = "meta-llama/Llama-3.2-3B-Instruct",
+        visual_observer=None,
+        emotional_monitor=None
+    ):
+        super().__init__()
+        self.api_key = api_key
+        self.base_url = base_url
+        self.model = model
+        self.visual_observer = visual_observer
+        self.emotional_monitor = emotional_monitor
+        self.api_url = f"{base_url}/chat/completions"
+
+    async def _check_should_reply(self, messages: list) -> bool:
+        """Asks the fast LLM if we should reply (Audio + Vision + Emotions)."""
+        if not messages:
+            return False
+
+        # Extract the last user message
+        last_msg = messages[-1]
+        if last_msg.get("role") != "user":
+            return True
+
+        # 1. READ EMOTIONAL STATE (Highest Priority)
+        emotional_state = None
+        if self.emotional_monitor:
+            emotional_state = self.emotional_monitor.get_current_state()
+            if emotional_state and emotional_state.needs_intervention():
+                # User is confused/hesitant/frustrated - ALWAYS respond
+                logger.info(
+                    f"Gating: User shows {emotional_state} - BYPASSING gating, offering help"
+                )
+                return True
+
+        # 2. READ VISUAL CONTEXT (0ms Latency)
+        is_looking = False
+        if self.visual_observer:
+            # Read the variable updated by the background task
+            is_looking = self.visual_observer.visual_context.get("is_looking_at_robot", False)
+
+            # Ignore if data is too old (> 5 seconds)
+            last_update = self.visual_observer.visual_context.get("last_updated", 0)
+            if time.time() - last_update > 5.0:
+                is_looking = False
+
+        # 3. ANALYZE CONTEXT
+        history_text = "\n".join([f"{m['role']}: {m['content']}" for m in messages[-3:]])
+
+        # Build enriched system prompt with emotional context
+        system_prompt = build_gating_system_prompt(is_looking, emotional_state)
+
+        payload = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"Context:\n{history_text}"}
+            ],
+            "response_format": {"type": "json_object"},
+            "max_tokens": 50
+        }
+
+        # Set strict timeout so we don't silence the bot if API is slow
+        timeout = aiohttp.ClientTimeout(total=1.5)
+
+        try:
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.post(
+                    self.api_url,
+                    headers={"Authorization": f"Bearer {self.api_key}"},
+                    json=payload
+                ) as resp:
+                    if resp.status == 200:
+                        result = await resp.json()
+                        content_response = result["choices"][0]["message"]["content"]
+                        content_response = content_response.replace("```json", "").replace("```", "").strip()
+                        data = json.loads(content_response)
+                        should_reply = data.get("reply", False)
+
+                        logger.debug(f"Gating decision: {should_reply} (Looking: {is_looking})")
+                        return should_reply
+                    else:
+                        logger.warning(f"Gating check failed: {resp.status}")
+                        return True  # Fail open (reply if check fails)
+        except asyncio.TimeoutError:
+            logger.warning("Gating: Timed out! Defaulting to REPLY.")
+            return True
+        except Exception as e:
+            logger.error(f"Gating error: {e}")
+            return True
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """
+        Intercepts LLMMessagesFrame.
+        If 'should_reply' is False, we DROP the frame, effectively silencing the bot.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMMessagesFrame) and direction == FrameDirection.DOWNSTREAM:
+            # Check if we should reply
+            should_reply = await self._check_should_reply(frame.messages)
+
+            if not should_reply:
+                logger.info("Gating: BLOCKING response.")
+                return  # DROP THE FRAME
+
+            logger.info("Gating: PASSING through.")
+
+        await self.push_frame(frame, direction)
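
The gating round-trip is a small OpenAI-style request that must come back as `{"reply": bool}` JSON, and every failure path defaults to replying. A standalone sketch of the payload shape and verdict parsing (`build_gating_payload` and `parse_gating_verdict` are illustrative names, not repo code):

# Sketch of InterventionGating's request payload and fail-open verdict parsing.
import json

def build_gating_payload(model: str, system_prompt: str, history_text: str) -> dict:
    return {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Context:\n{history_text}"},
        ],
        "response_format": {"type": "json_object"},
        "max_tokens": 50,
    }

def parse_gating_verdict(content: str) -> bool:
    """Fail open: any parse problem means we let the bot reply."""
    try:
        clean = content.replace("```json", "").replace("```", "").strip()
        return bool(json.loads(clean).get("reply", False))
    except (json.JSONDecodeError, ValueError):
        return True

assert parse_gating_verdict('{"reply": false}') is False
assert parse_gating_verdict('{"reply": true}') is True
assert parse_gating_verdict("not json at all") is True  # fail open
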
src/processors/visual_observer.py
ADDED
@@ -0,0 +1,389 @@
import asyncio
import time
from typing import Optional, List, Dict, Any
from loguru import logger
from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
import base64
from PIL import Image
import io
import cv2
import numpy as np
try:
    import mediapipe as mp
    MEDIAPIPE_AVAILABLE = True
except ImportError:
    MEDIAPIPE_AVAILABLE = False
    logger.warning("MediaPipe not available, using OpenCV for face detection")

class VisualObserver(FrameProcessor):
    """
    Observer that waits for UserImageRequestFrame, captures the next video frame,
    analyzes it with a vision model, and injects the description back into the context.
    Also includes face detection and display capabilities.
    """

    def __init__(
        self,
        vision_client,
        model="moondream",
        enable_display=False,
        enable_face_detection=True,
        webrtc_connection=None,
        tars_client=None
    ):
        super().__init__()
        self._vision_client = vision_client
        self._model = model
        self._waiting_for_image = False
        self._current_request = None
        self._last_analysis_time = 0
        self._cooldown = 2.0  # Min seconds between analyses
        self._enable_display = enable_display
        self._enable_face_detection = enable_face_detection
        self._webrtc_connection = webrtc_connection
        self._tars_client = None  # Deprecated: the tars_client arg is ignored; display control uses gRPC in robot mode
        self._display_window_name = "TARS Visual Observer"

        # Face detection setup
        self._face_detector = None
        if self._enable_face_detection:
            self._setup_face_detection()

        # Stats
        self._face_count = 0
        self._frames_processed = 0
        self._last_face_time = 0

    def _setup_face_detection(self):
        """Initialize face detection based on available libraries."""
        try:
            if MEDIAPIPE_AVAILABLE:
                logger.info("Initializing MediaPipe face detection")
                self._face_detector_type = "mediapipe"
                self._mp_face_detection = mp.solutions.face_detection
                self._mp_drawing = mp.solutions.drawing_utils
                self._face_detector = self._mp_face_detection.FaceDetection(
                    model_selection=0,  # 0 for short-range (< 2m), 1 for full-range
                    min_detection_confidence=0.5
                )
            else:
                # Fallback to OpenCV Haar Cascade
                logger.info("Initializing OpenCV Haar Cascade face detection")
                self._face_detector_type = "opencv"
                cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
                self._face_detector = cv2.CascadeClassifier(cascade_path)
                if self._face_detector.empty():
                    logger.error("Failed to load Haar Cascade classifier")
                    self._face_detector = None
        except Exception as e:
            logger.error(f"Failed to initialize face detection: {e}")
            self._face_detector = None

    def detect_faces(self, image: np.ndarray) -> List[Dict[str, Any]]:
        """
        Detect faces in the image.

        Args:
            image: numpy array in BGR format

        Returns:
            List of face dictionaries with bounding boxes and confidence
        """
        if not self._face_detector:
            return []

        faces = []
        try:
            if self._face_detector_type == "mediapipe":
                # Convert BGR to RGB for MediaPipe
                rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                results = self._face_detector.process(rgb_image)

                if results.detections:
                    h, w, _ = image.shape
                    for detection in results.detections:
                        bbox = detection.location_data.relative_bounding_box
                        faces.append({
                            'x': int(bbox.xmin * w),
                            'y': int(bbox.ymin * h),
                            'width': int(bbox.width * w),
                            'height': int(bbox.height * h),
                            'confidence': detection.score[0]
                        })
            else:  # opencv
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                detected_faces = self._face_detector.detectMultiScale(
                    gray,
                    scaleFactor=1.1,
                    minNeighbors=5,
                    minSize=(30, 30)
                )
                for (x, y, w, h) in detected_faces:
                    faces.append({
                        'x': x,
                        'y': y,
                        'width': w,
                        'height': h,
                        'confidence': 1.0  # OpenCV Haar doesn't provide confidence
                    })
        except Exception as e:
            logger.error(f"Error detecting faces: {e}")

        return faces

    def draw_faces(self, image: np.ndarray, faces: List[Dict[str, Any]]) -> np.ndarray:
        """
        Draw bounding boxes around detected faces.

        Args:
            image: numpy array in BGR format
            faces: List of face dictionaries from detect_faces()

        Returns:
            Image with faces drawn
        """
        annotated_image = image.copy()

        for face in faces:
            x, y, w, h = face['x'], face['y'], face['width'], face['height']
            confidence = face['confidence']

            # Draw a rectangle
            cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)

            # Draw the confidence score
            label = f"Face: {confidence:.2f}"
            cv2.putText(
                annotated_image,
                label,
                (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                2
            )

        # Draw the face count
        cv2.putText(
            annotated_image,
            f"Faces: {len(faces)}",
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),
            2
        )

        return annotated_image

    def display_frame(self, image: np.ndarray, faces: Optional[List[Dict[str, Any]]] = None):
        """
        Display the frame in a window with optional face annotations.

        Args:
            image: numpy array in BGR format
            faces: Optional list of detected faces to draw
        """
        if not self._enable_display:
            return

        try:
            display_image = image.copy()

            if faces:
                display_image = self.draw_faces(display_image, faces)

            cv2.imshow(self._display_window_name, display_image)
            cv2.waitKey(1)  # Required for the window to update
        except Exception as e:
            logger.error(f"Error displaying frame: {e}")

    def send_display_event(self, faces: List[Dict[str, Any]], image_base64: Optional[str] = None):
        """
        Send a display event to the WebRTC connection with face detection results.

        Args:
            faces: List of detected faces
            image_base64: Optional base64-encoded image
        """
        if not self._webrtc_connection:
            return

        try:
            if self._webrtc_connection.is_connected():
                event_data = {
                    "type": "face_detection",
                    "status": "detected" if faces else "no_faces",
                    "face_count": len(faces),
                    "faces": faces,
                    "timestamp": time.time()
                }

                # Optionally include a thumbnail
                if image_base64 and len(faces) > 0:
                    event_data["thumbnail"] = image_base64

                self._webrtc_connection.send_app_message(event_data)
        except Exception as e:
            logger.debug(f"Error sending display event: {e}")

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # 1. Handle a request from the LLM (check by class name to avoid import errors).
        # We accept "UserImageRequestFrame" (the app's custom frame) OR "VisionImageRequestFrame".
        if frame.__class__.__name__ in ["UserImageRequestFrame", "VisionImageRequestFrame"]:
            logger.info(f"Vision request received: {getattr(frame, 'context', 'No context')}")
            self._waiting_for_image = True
            self._current_request = frame
            # We don't push this frame downstream; we consume it and act on it.
            return

        # 2. Handle video input (continuous face detection + optional vision analysis)
        if isinstance(frame, ImageRawFrame):
            self._frames_processed += 1

            # Process face detection on every 5th frame (throttled)
            if self._enable_face_detection and self._frames_processed % 5 == 0:
                # Run face detection in the background
                asyncio.create_task(self._process_face_detection(frame))

            # Vision analysis only when requested
            if self._waiting_for_image:
                # Check cooldown
                if time.time() - self._last_analysis_time < self._cooldown:
                    await self.push_frame(frame, direction)
                    return

                logger.info("Capturing frame for analysis...")
                self._waiting_for_image = False  # Reset the flag immediately
                self._last_analysis_time = time.time()

                # Run analysis in the background to avoid blocking the audio pipeline
                asyncio.create_task(self._analyze_and_respond(frame))
                # Note: the frame still passes through for face detection

        # Pass all other frames through
        await self.push_frame(frame, direction)

    async def _process_face_detection(self, frame: ImageRawFrame):
        """Process face detection on a video frame and send display events."""
        try:
            # Convert the frame to a numpy array
            image = Image.frombytes(frame.format, frame.size, frame.image)
            image_np = np.array(image)

            # Convert RGB to BGR for OpenCV
            if image_np.shape[2] == 3:
                image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
            else:
                image_bgr = image_np

            # Get frame dimensions
            frame_height, frame_width = image_bgr.shape[:2]

            # Detect faces
            faces = self.detect_faces(image_bgr)

            if faces:
                self._face_count = len(faces)
                current_time = time.time()

                # Log only periodically to avoid spam
                if current_time - self._last_face_time > 5.0:
                    logger.info(f"Detected {len(faces)} face(s)")
                    self._last_face_time = current_time

                # Get the largest/most prominent face
                primary_face = max(faces, key=lambda f: f['width'] * f['height'])

                # Calculate the face center
                face_center_x = primary_face['x'] + primary_face['width'] // 2
                face_center_y = primary_face['y'] + primary_face['height'] // 2

                # Display the frame with face annotations
                self.display_frame(image_bgr, faces)

                # Send the face position event to the WebRTC frontend
                self.send_display_event(faces)

                # Optionally send the face position as a text frame for LLM context.
                # This can be used for "user is looking at you" type feedback.
                # Uncomment if you want the LLM to know about face position:
                # face_text = f"[Face Detected]: Position ({face_center_x}, {face_center_y}), Size: {primary_face['width']}x{primary_face['height']}"
                # await self.push_frame(TextFrame(text=face_text), FrameDirection.UPSTREAM)
            else:
                # No faces detected
                if self._face_count > 0:
                    logger.debug("No faces detected")
                    self._face_count = 0
                    # Send a "no face" event to WebRTC
                    self.send_display_event([])

                # Display the frame without annotations
                self.display_frame(image_bgr)

        except Exception as e:
            logger.error(f"Error in face detection: {e}")

    async def _analyze_and_respond(self, frame: ImageRawFrame):
        """Analyze the image and push the resulting text frame upstream."""
        try:
            # Convert the raw frame to base64
            image = Image.frombytes(frame.format, frame.size, frame.image)
            buffered = io.BytesIO()
            image.save(buffered, format="JPEG")
            img_str = base64.b64encode(buffered.getvalue()).decode()

            prompt = "Describe this image briefly."

            # Try to extract a prompt from the request context if available
            if self._current_request and hasattr(self._current_request, 'context'):
                # The context may carry the user's question text
                context = self._current_request.context
                if context:
                    prompt = f"{context} (Describe the image to answer this)"

            logger.info(f"Sending image to vision model ({self._model})...")

            try:
                response = await asyncio.wait_for(
                    self._vision_client.chat.completions.create(
                        model=self._model,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": prompt},
                                    {
                                        "type": "image_url",
                                        "image_url": {
                                            "url": f"data:image/jpeg;base64,{img_str}"
                                        },
                                    },
                                ],
                            }
                        ],
                        max_tokens=100
                    ),
                    timeout=8.0  # 8 second timeout to prevent hanging
                )
                description = response.choices[0].message.content
                logger.info(f"Vision analysis: {description}")

            except asyncio.TimeoutError:
                logger.warning("Vision model timed out!")
                description = "I couldn't see clearly because the visual processing timed out."
            except Exception as e:
                logger.error(f"Vision model error: {e}")
                description = "I had trouble processing the visual data."

            feedback_text = f"[Visual Observation]: {description}"

            # Push the text frame upstream to the LLM
            await self.push_frame(TextFrame(text=feedback_text), FrameDirection.UPSTREAM)

        except Exception as e:
            logger.error(f"Error in vision pipeline: {e}")
            self._waiting_for_image = False
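A hedged construction sketch for `VisualObserver`. The observer awaits `vision_client.chat.completions.create(...)`, so any OpenAI-compatible async client should work; the Ollama base URL serving `moondream` locally is an assumption, as is the import path (it presumes `src/` is on the Python path):

```python
from openai import AsyncOpenAI

from processors.visual_observer import VisualObserver  # assumed import path

# Assumption: a local OpenAI-compatible endpoint (e.g. Ollama) serving moondream.
vision_client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

visual_observer = VisualObserver(
    vision_client=vision_client,
    model="moondream",
    enable_face_detection=True,  # face detection runs on every 5th ImageRawFrame
    enable_display=False,        # set True for a local OpenCV preview window
)
```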
src/services/README.md
ADDED
@@ -0,0 +1,110 @@
# Services

Backend services for TARS voice AI. These provide core functionality like speech recognition, text-to-speech, memory, and robot control.

## Organization

| Service | Purpose |
|---------|---------|
| `tars_robot.py` | Robot hardware control via gRPC (movement, camera, display) |
| `tts_qwen.py` | Local text-to-speech using Qwen3 models |
| `memory_chromadb.py` | Semantic memory using ChromaDB |
| `memory_hybrid.py` | Hybrid memory combining vector similarity and BM25 keyword search |
| `factories/` | Factory functions for creating STT/TTS services |

## Robot Control

Robot hardware is controlled exclusively via gRPC using the TARS SDK.

### tars_robot.py

Provides functions for robot control in robot mode (tars_bot.py):

```python
from services import tars_robot

# Get robot client (singleton) - replace with your robot's IP
client = tars_robot.get_robot_client(address="100.115.193.41:50051")

# Control functions
await tars_robot.execute_movement(["wave_right", "step_forward"])
result = await tars_robot.capture_camera_view()
tars_robot.set_emotion("happy")
tars_robot.set_eye_state("listening")
status = tars_robot.get_robot_status()
available = tars_robot.is_robot_available()

# Cleanup
tars_robot.close_robot_client()
```

### Architecture

Robot mode uses two communication channels:

| Channel | Protocol | Purpose | Latency |
|---------|----------|---------|---------|
| Audio | WebRTC | Voice conversation | ~20ms |
| Commands | gRPC | Hardware control | ~5-10ms |

Audio flows through an aiortc WebRTC connection.
All hardware commands (movement, camera, display) use gRPC.

### Browser Mode

Browser mode (bot.py) does NOT support robot control.
It only provides:
- WebRTC audio/video with the browser
- Vision analysis
- Conversation

Display observers in browser mode are deprecated and do nothing.

## Service Factories

The `factories/` directory contains factory functions for creating STT and TTS services:

```python
from services.factories import create_stt_service, create_tts_service

# Create STT service
stt = create_stt_service(
    provider="deepgram",  # or "speechmatics", "deepgram-flux"
    deepgram_api_key=DEEPGRAM_API_KEY,
    language=Language.EN
)

# Create TTS service
tts = create_tts_service(
    provider="elevenlabs",  # or "qwen3"
    elevenlabs_api_key=ELEVENLABS_API_KEY,
    elevenlabs_voice_id=VOICE_ID
)
```

## Memory Services

### ChromaDB (memory_chromadb.py)

Simple semantic memory using the ChromaDB vector database. The service is a Pipecat `FrameProcessor` that stores and retrieves memories automatically as frames flow through it; its internal entry points look like this:

```python
from services.memory.memory_chromadb import ChromaDBMemoryService

memory = ChromaDBMemoryService(user_id="default_user")
await memory._store_memory("The user likes pizza")
results = await memory._search_memories("What does the user like?")
```

### Hybrid Memory (memory_hybrid.py)

Combines vector similarity search with BM25 keyword matching (SQLite + FTS5) for low-latency recall; see the usage sketch below.
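A minimal usage sketch based on the `HybridMemoryService` constructor in `memory_hybrid.py`. The import path assumes `src/` is on the Python path; the keyword values shown are the service's own defaults:

```python
from services.memory.memory_hybrid import HybridMemoryService

memory = HybridMemoryService(
    user_id="default_user",
    search_timeout_ms=40,  # hard budget: the search falls back to "no memories" past this
    vector_weight=0.7,     # semantic-similarity share of the fused score
    bm25_weight=0.3,       # keyword (FTS5) share of the fused score
)
# Like the ChromaDB service, it is a FrameProcessor: placed before the LLM in
# the pipeline, it injects retrieved memories as a system message.
```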
## Not Services

This directory is for backend services only. Other code belongs in:

- `tools/` - LLM callable functions
- `processors/` - Pipeline frame processors
- `transport/` - Network transport (WebRTC, gRPC)
- `observers/` - Pipeline observers
src/services/__init__.py
ADDED
@@ -0,0 +1 @@
src/services/factories/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""Service factories for STT and TTS providers."""

from .stt_factory import create_stt_service
from .tts_factory import create_tts_service

__all__ = ["create_stt_service", "create_tts_service"]
src/services/factories/stt_factory.py
ADDED
@@ -0,0 +1,127 @@
"""STT Service Factory - Centralized STT service creation."""

from loguru import logger
from pipecat.transcriptions.language import Language


def create_stt_service(
    provider: str,
    speechmatics_api_key: str = None,
    deepgram_api_key: str = None,
    language: Language = Language.EN,
    enable_diarization: bool = False,
):
    """
    Create and configure an STT service based on provider.

    Args:
        provider: "speechmatics", "deepgram", or "deepgram-flux"
        speechmatics_api_key: Speechmatics API key (if using speechmatics)
        deepgram_api_key: Deepgram API key (if using deepgram/deepgram-flux)
        language: Language for transcription (default: English)
        enable_diarization: Enable speaker diarization (default: False)

    Returns:
        Configured STT service instance

    Raises:
        ValueError: If the provider is invalid or required parameters are missing
        Exception: If STT service initialization fails
    """

    logger.info(f"Creating STT service: {provider}")

    try:
        if provider == "speechmatics":
            # Lazy import to avoid requiring the package when not in use
            from pipecat.services.speechmatics.stt import SpeechmaticsSTTService, TurnDetectionMode

            # Speechmatics with SMART_TURN mode for built-in turn detection
            if not speechmatics_api_key:
                raise ValueError("speechmatics_api_key is required for Speechmatics")

            logger.info("Using Speechmatics STT with SMART_TURN mode")
            stt_params = SpeechmaticsSTTService.InputParams(
                language=language,
                enable_diarization=enable_diarization,
                turn_detection_mode=TurnDetectionMode.SMART_TURN,
            )

            stt = SpeechmaticsSTTService(
                api_key=speechmatics_api_key,
                params=stt_params,
            )
            logger.info("Speechmatics STT service created with SMART_TURN mode")

        elif provider == "deepgram":
            # Lazy import to avoid requiring the package when not in use
            from pipecat.services.deepgram.stt import DeepgramSTTService
            from deepgram.clients.listen.v1.websocket.options import LiveOptions

            # Deepgram STT with server-side endpointing for turn detection.
            # Note: this uses Deepgram's server-side silence detection, not local smart turn.
            if not deepgram_api_key:
                raise ValueError("deepgram_api_key is required for Deepgram")

            logger.info("Using Deepgram STT with server-side endpointing")
            live_options = LiveOptions(
                language=language.value if hasattr(language, 'value') else str(language),
                model="nova-2",        # Deepgram's latest model
                interim_results=True,  # Enable interim transcription results
                smart_format=True,     # Auto-format transcripts
                punctuate=True,        # Add punctuation
                endpointing=300,       # 300ms silence to detect end of speech (server-side)
                vad_events=True,       # Enable VAD events for speech detection
            )

            stt = DeepgramSTTService(
                api_key=deepgram_api_key,
                live_options=live_options,
                stt_ttfb_timeout=5.0,  # TTFB timeout for transcription (seconds)
            )
            logger.info("Deepgram STT service created")
            logger.info("  Turn detection: server-side endpointing (300ms silence)")
            logger.info("  VAD events: enabled for speech detection")
            logger.info("  TTFB timeout: 5.0s for transcription metrics")

        elif provider == "deepgram-flux":
            # Lazy import to avoid requiring the package when not in use
            from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService

            # Deepgram Flux with built-in turn detection
            if not deepgram_api_key:
                raise ValueError("deepgram_api_key is required for Deepgram Flux")

            logger.info("Using Deepgram Flux STT with built-in turn detection")
            # Flux has different parameters - it uses EOT (End of Turn) detection.
            # The default model is "flux-general-en" and encoding is "linear16".
            stt_params = DeepgramFluxSTTService.InputParams(
                min_confidence=0.3,  # Minimum confidence threshold for accepting transcriptions
                # Optional: configure end-of-turn detection thresholds
                # eot_threshold: confidence threshold for detecting end of turn (0.0-1.0)
                # eot_timeout_ms: max time to wait before forcing turn end
                # eager_eot_threshold: more aggressive turn-ending threshold
            )

            stt = DeepgramFluxSTTService(
                api_key=deepgram_api_key,
                model="flux-general-en",  # Flux model for general English
                params=stt_params,
            )

            # Set up a debug event handler for Flux updates
            @stt.event_handler("on_update")
            async def on_flux_update(stt_service, transcript):
                logger.debug(f"[Deepgram Flux] Update: {transcript}")

            logger.info("Deepgram Flux STT service created with built-in turn detection")
            logger.info("  Note: STT latency will be tracked via MetricsFrame if emitted by Flux")

        else:
            raise ValueError(f"Unknown STT provider: {provider}. Must be 'speechmatics', 'deepgram', or 'deepgram-flux'")

        return stt

    except Exception as e:
        logger.error(f"Failed to create STT service '{provider}': {e}", exc_info=True)
        raise
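A short call-site sketch for the Flux path; `DEEPGRAM_API_KEY` comes from the environment here, and everything else uses the factory's defaults shown above:

```python
import os

from services.factories import create_stt_service

# Flux performs end-of-turn detection itself, so no separate turn analyzer
# is needed downstream.
stt = create_stt_service(
    provider="deepgram-flux",
    deepgram_api_key=os.getenv("DEEPGRAM_API_KEY"),
)
```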
src/services/factories/tts_factory.py
ADDED
@@ -0,0 +1,84 @@
"""TTS Service Factory - Centralized TTS service creation."""

from loguru import logger
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
from ..tts.tts_qwen import Qwen3TTSService


def create_tts_service(
    provider: str,
    elevenlabs_api_key: str = None,
    elevenlabs_voice_id: str = None,
    qwen_model: str = None,
    qwen_device: str = None,
    qwen_ref_audio: str = None,
):
    """
    Create and configure a TTS service based on provider.

    Args:
        provider: "elevenlabs" or "qwen3"
        elevenlabs_api_key: ElevenLabs API key (if using elevenlabs)
        elevenlabs_voice_id: ElevenLabs voice ID (if using elevenlabs)
        qwen_model: Qwen3-TTS model name (if using qwen3)
        qwen_device: Device for Qwen3-TTS (if using qwen3)
        qwen_ref_audio: Reference audio path for Qwen3-TTS (if using qwen3)

    Returns:
        Configured TTS service instance

    Raises:
        ValueError: If the provider is invalid or required parameters are missing
        Exception: If TTS service initialization fails
    """

    logger.info(f"Creating TTS service: {provider}")

    try:
        if provider == "qwen3":
            # Local Qwen3-TTS with voice cloning
            if not qwen_model:
                raise ValueError("qwen_model is required for Qwen3-TTS")

            logger.info("Using Qwen3-TTS (local, voice cloning)")
            device = qwen_device or "mps"
            tts = Qwen3TTSService(
                model_name=qwen_model,
                device=device,
                ref_audio_path=qwen_ref_audio,
                x_vector_only_mode=True,
                sample_rate=24000,
            )
            logger.info(f"Qwen3-TTS service created (device: {device})")

        elif provider == "elevenlabs":
            # Cloud ElevenLabs TTS
            if not elevenlabs_api_key or not elevenlabs_voice_id:
                raise ValueError("elevenlabs_api_key and elevenlabs_voice_id are required for ElevenLabs")

            logger.info("Using ElevenLabs TTS")
            tts = ElevenLabsTTSService(
                api_key=elevenlabs_api_key,
                voice_id=elevenlabs_voice_id,
                model="eleven_flash_v2_5",
                output_format="pcm_24000",
                enable_word_timestamps=False,
                voice_settings={
                    "stability": 0.5,
                    "similarity_boost": 0.75,
                    "style": 0.0,
                    "use_speaker_boost": True
                },
                params=ElevenLabsTTSService.InputParams(
                    enable_logging=True,  # Enable ElevenLabs logging for metrics
                ),
            )
            logger.info("ElevenLabs TTS service created")

        else:
            raise ValueError(f"Unknown TTS provider: {provider}. Must be 'qwen3' or 'elevenlabs'")

        return tts

    except Exception as e:
        logger.error(f"Failed to create TTS service '{provider}': {e}", exc_info=True)
        raise
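And a matching sketch for the local Qwen3 path. The model name and reference audio are placeholders (the repo ships `assets/audio/tars-clean-compressed.mp3`, which is a plausible reference sample, but that pairing is an assumption); `qwen_device` falls back to `"mps"` when unset:

```python
from services.factories import create_tts_service

# Local voice-cloning TTS; no API key required.
tts = create_tts_service(
    provider="qwen3",
    qwen_model="Qwen3-TTS",  # placeholder model name
    qwen_device="mps",       # or "cuda" / "cpu"
    qwen_ref_audio="assets/audio/tars-clean-compressed.mp3",  # placeholder reference voice
)
```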
src/services/memory/memory_chromadb.py
ADDED
@@ -0,0 +1,195 @@
"""Local memory service using ChromaDB for semantic search."""

import time
from loguru import logger
from pipecat.frames.frames import Frame, LLMMessagesFrame, LLMContextFrame, MetricsFrame
from pipecat.metrics.metrics import TTFBMetricsData
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
from sentence_transformers import SentenceTransformer
import chromadb


class ChromaDBMemoryService(FrameProcessor):
    """
    Local memory service using ChromaDB for semantic search.

    Replaces Mem0 with a local, fast, and free alternative:
    - Stores conversation history with semantic embeddings
    - Retrieves relevant memories based on similarity search
    - No external API calls - everything runs locally
    - Latency: ~50-100ms vs Mem0's ~200-500ms
    """

    def __init__(
        self,
        user_id: str,
        agent_id: str = "tars_agent",
        collection_name: str = "conversations",
        search_limit: int = 5,
        search_threshold: float = 0.5,
        system_prompt_prefix: str = "Based on previous conversations, I recall:\n\n",
        **kwargs
    ):
        super().__init__(**kwargs)
        self.user_id = user_id
        self.agent_id = agent_id
        self.search_limit = search_limit
        self.search_threshold = search_threshold
        self.system_prompt_prefix = system_prompt_prefix

        # Initialize ChromaDB (persistent local storage)
        self.client = chromadb.PersistentClient(path="./chroma_memory")

        # Create or get the collection for this user
        self.collection = self.client.get_or_create_collection(
            name=f"{collection_name}_{user_id}",
            metadata={"agent_id": agent_id}
        )

        # Load the embedding model (lightweight, ~80MB)
        logger.info("Loading sentence transformer model...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

        # Frame counter for debugging
        self._frame_count = 0

        logger.info("ChromaDB memory service initialized and ready to process frames")

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process frames and inject memories into the LLM context."""
        try:
            await super().process_frame(frame, direction)

            # Frame counter
            self._frame_count += 1

            # Debug: log frame types to understand what's flowing through
            frame_type = type(frame).__name__
            direction_name = "DOWNSTREAM" if direction == FrameDirection.DOWNSTREAM else "UPSTREAM"

            # Log LLM-related frames for debugging
            if 'LLM' in frame_type or 'Messages' in frame_type or 'Context' in frame_type:
                logger.info(f"[ChromaDB] >>> RECEIVED: {frame_type} | Direction: {direction_name} | Count: {self._frame_count}")

            # Log every 100th frame to verify the processor is being called
            if self._frame_count % 100 == 0:
                logger.info(f"[ChromaDB] Processed {self._frame_count} frames so far (latest: {frame_type})")

            # Handle both LLMContextFrame and LLMMessagesFrame (like Mem0 does)
            context = None
            messages = None

            if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
                logger.info("[ChromaDB] Processing LLMContextFrame")
                context = frame.context
            elif isinstance(frame, LLMMessagesFrame):
                logger.info("[ChromaDB] Processing LLMMessagesFrame")
                messages = frame.messages
                context = LLMContext(messages)

            if context:
                # Get the latest user message
                context_messages = context.get_messages()
                user_message = None
                for msg in reversed(context_messages):
                    if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                        user_message = msg.get("content", "")
                        break

                if user_message:
                    logger.info(f"[ChromaDB] Searching memories for: '{user_message[:50]}...'")
                    # Search for relevant memories
                    start_time = time.time()
                    memories = await self._search_memories(user_message)
                    search_latency_ms = (time.time() - start_time) * 1000

                    # Emit metrics for observer tracking
                    logger.info(f"[ChromaDB] Search completed in {search_latency_ms:.0f}ms, emitting MetricsFrame")
                    metrics_frame = MetricsFrame(
                        data=[TTFBMetricsData(processor="ChromaDBMemoryService", value=search_latency_ms / 1000)]
                    )
                    await self.push_frame(metrics_frame, direction)

                    if memories:
                        # Inject memories into the context
                        memory_text = self.system_prompt_prefix + "\n".join(memories)
                        context.add_message({"role": "system", "content": memory_text})
                        logger.info(f"Retrieved {len(memories)} memories in {search_latency_ms:.0f}ms")

                    # Store the current conversation turn
                    await self._store_memory(user_message)

                # If we received an LLMMessagesFrame, create a new one with the enhanced messages
                if messages is not None:
                    await self.push_frame(LLMMessagesFrame(context.get_messages()), direction)
                else:
                    # Otherwise, pass the enhanced context frame downstream
                    await self.push_frame(frame, direction)
            else:
                # For non-context frames, just pass them through
                await self.push_frame(frame, direction)

        except Exception as e:
            logger.error(f"[ChromaDB] Error in process_frame: {e}", exc_info=True)
            # Still pass the frame through even if we failed
            await self.push_frame(frame, direction)

    async def _search_memories(self, query: str) -> list[str]:
        """Search for relevant memories based on semantic similarity."""
        try:
            # Generate an embedding for the query
            query_embedding = self.embedder.encode(query).tolist()

            # Search in ChromaDB
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=self.search_limit,
            )

            # Extract documents and filter by threshold
            memories = []
            if results and "documents" in results and results["documents"]:
                for doc_list, distance_list in zip(results["documents"], results.get("distances", [[]])):
                    for doc, distance in zip(doc_list, distance_list):
                        # ChromaDB returns L2 distance; lower is better.
                        # Convert to a similarity score (1 - normalized distance).
                        similarity = 1 - (distance / 2)  # Normalize L2 distance to [0,1]
                        if similarity >= self.search_threshold:
                            memories.append(doc)

            return memories

        except Exception as e:
            logger.error(f"Error searching memories: {e}")
            return []

    async def _store_memory(self, text: str):
        """Store a memory with its embedding."""
        try:
            # Generate the embedding
            embedding = self.embedder.encode(text).tolist()

            # Store in ChromaDB with a timestamp as ID
            doc_id = f"{int(time.time() * 1000)}"
            self.collection.add(
                documents=[text],
                embeddings=[embedding],
                ids=[doc_id],
                metadatas=[{
                    "user_id": self.user_id,
                    "agent_id": self.agent_id,
                    "timestamp": time.time()
                }]
            )

            logger.debug(f"Stored memory: {text[:50]}...")

        except Exception as e:
            logger.error(f"Error storing memory: {e}")

    async def close(self):
        """Cleanup resources."""
        # The ChromaDB client doesn't need explicit cleanup
        pass
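One note on the `similarity = 1 - (distance / 2)` conversion above: it is exact under two assumptions worth stating — that ChromaDB's default `l2` space reports *squared* L2 distance, and that the embeddings are unit-length (SentenceTransformer's `encode` does not normalize unless asked). For unit vectors, |a − b|² = 2 − 2·cos(a, b), so cos(a, b) = 1 − |a − b|²/2. A self-contained check:

```python
import numpy as np

rng = np.random.default_rng(0)
a, b = rng.normal(size=384), rng.normal(size=384)
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)  # unit-length, as assumed

cosine = float(a @ b)
sq_l2 = float(np.sum((a - b) ** 2))  # squared L2 distance

# |a - b|^2 = 2 - 2*cos  =>  cos = 1 - |a - b|^2 / 2
assert abs((1 - sq_l2 / 2) - cosine) < 1e-9
```

If the embeddings are not normalized, the score is only a rough proxy and `search_threshold` would need retuning.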
src/services/memory/memory_hybrid.py
ADDED
@@ -0,0 +1,393 @@
| 1 |
+
"""
|
| 2 |
+
Hybrid memory system optimized for voice AI with sub-50ms latency.
|
| 3 |
+
|
| 4 |
+
Features:
|
| 5 |
+
1. Hybrid search combining vector similarity (70%) and BM25 keyword matching (30%)
|
| 6 |
+
2. SQLite + FTS5 for fast, local storage and search
|
| 7 |
+
3. Query embedding cache to avoid redundant encoding
|
| 8 |
+
4. Pre-warmed embedding model for consistent latency
|
| 9 |
+
5. Strict timeout with graceful fallback
|
| 10 |
+
6. Thread pool for non-blocking SQLite operations
|
| 11 |
+
7. Fire-and-forget storage to prevent blocking
|
| 12 |
+
|
| 13 |
+
Architecture:
|
| 14 |
+
- Vector search for semantic similarity (cosine distance)
|
| 15 |
+
- BM25 via FTS5 for exact keyword matching
|
| 16 |
+
- Weighted score fusion for best of both worlds
|
| 17 |
+
- Target latency: <50ms (vs ChromaDB's ~50-100ms)
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import asyncio
|
| 21 |
+
import sqlite3
|
| 22 |
+
import time
|
| 23 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from typing import Optional, List, Tuple
|
| 26 |
+
import numpy as np
|
| 27 |
+
|
| 28 |
+
from loguru import logger
|
| 29 |
+
from pipecat.frames.frames import Frame, LLMMessagesFrame, LLMContextFrame, MetricsFrame
|
| 30 |
+
from pipecat.metrics.metrics import TTFBMetricsData
|
| 31 |
+
from pipecat.processors.aggregators.llm_context import LLMContext
|
| 32 |
+
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
|
| 33 |
+
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
|
| 34 |
+
from sentence_transformers import SentenceTransformer
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class HybridMemoryService(FrameProcessor):
|
| 38 |
+
"""
|
| 39 |
+
Hybrid memory service combining vector similarity and keyword search.
|
| 40 |
+
|
| 41 |
+
Target latency: <50ms
|
| 42 |
+
|
| 43 |
+
Architecture:
|
| 44 |
+
- Vector search via numpy (semantic similarity with cosine distance)
|
| 45 |
+
- BM25 via FTS5 (exact keyword matching)
|
| 46 |
+
- Weighted score fusion: 70% vector + 30% BM25
|
| 47 |
+
|
| 48 |
+
Voice AI optimizations:
|
| 49 |
+
- Query embedding cache (avoid re-encoding similar queries)
|
| 50 |
+
- Pre-warmed embedding model for consistent performance
|
| 51 |
+
- Thread pool for non-blocking SQLite operations
|
| 52 |
+
- Strict timeout with graceful fallback
|
| 53 |
+
- Fire-and-forget storage to prevent blocking
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
def __init__(
|
| 57 |
+
self,
|
| 58 |
+
user_id: str,
|
| 59 |
+
db_path: str = "./memory_data/memory.sqlite",
|
| 60 |
+
embedding_model: str = "all-MiniLM-L6-v2",
|
| 61 |
+
search_limit: int = 3,
|
| 62 |
+
search_timeout_ms: int = 40,
|
| 63 |
+
vector_weight: float = 0.7,
|
| 64 |
+
bm25_weight: float = 0.3,
|
| 65 |
+
system_prompt_prefix: str = "From our conversations:\n",
|
| 66 |
+
**kwargs,
|
| 67 |
+
):
|
| 68 |
+
super().__init__(**kwargs)
|
| 69 |
+
self.user_id = user_id
|
| 70 |
+
self.db_path = db_path
|
| 71 |
+
self.search_limit = search_limit
|
| 72 |
+
self.search_timeout_ms = search_timeout_ms
|
| 73 |
+
self.vector_weight = vector_weight
|
| 74 |
+
self.bm25_weight = bm25_weight
|
| 75 |
+
self.system_prompt_prefix = system_prompt_prefix
|
| 76 |
+
|
| 77 |
+
# Thread pool for blocking operations
|
| 78 |
+
self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="HybridMemory")
|
| 79 |
+
|
| 80 |
+
# Initialize SQLite with FTS5 and vector support
|
| 81 |
+
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
| 82 |
+
self._init_database()
|
| 83 |
+
|
| 84 |
+
# Load and warm embedding model
|
| 85 |
+
logger.info("Loading embedding model for hybrid memory...")
|
| 86 |
+
self.embedder = SentenceTransformer(embedding_model)
|
| 87 |
+
self._embedding_dim = self.embedder.get_sentence_embedding_dimension()
|
| 88 |
+
self._warmup_model()
|
| 89 |
+
|
| 90 |
+
# Embedding caches
|
| 91 |
+
self._query_cache: dict[str, np.ndarray] = {} # For queries
|
| 92 |
+
self._doc_cache: dict[str, np.ndarray] = {} # For documents
|
| 93 |
+
self._cache_max_size = 500
|
| 94 |
+
|
| 95 |
+
# Metrics
|
| 96 |
+
self._stats = {"searches": 0, "cache_hits": 0, "timeouts": 0, "total_latency_ms": 0}
|
| 97 |
+
self._frame_count = 0
|
| 98 |
+
|
| 99 |
+
logger.info(f"β Hybrid memory ready (vector + BM25, {search_timeout_ms}ms timeout)")
|
| 100 |
+
|
| 101 |
+
def _init_database(self):
|
| 102 |
+
"""Initialize SQLite with FTS5 and vector table."""
|
| 103 |
+
conn = sqlite3.connect(self.db_path)
|
| 104 |
+
|
| 105 |
+
# Main memories table
|
| 106 |
+
conn.execute("""
|
| 107 |
+
CREATE TABLE IF NOT EXISTS memories (
|
| 108 |
+
id INTEGER PRIMARY KEY,
|
| 109 |
+
user_id TEXT NOT NULL,
|
| 110 |
+
content TEXT NOT NULL,
|
| 111 |
+
embedding BLOB,
|
| 112 |
+
created_at REAL DEFAULT (unixepoch('now', 'subsec'))
|
| 113 |
+
)
|
| 114 |
+
""")
|
| 115 |
+
|
| 116 |
+
# FTS5 virtual table for BM25 keyword search
|
| 117 |
+
conn.execute("""
|
| 118 |
+
CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts
|
| 119 |
+
USING fts5(content, content='memories', content_rowid='id')
|
| 120 |
+
""")
|
| 121 |
+
|
| 122 |
+
# Triggers to keep FTS in sync
|
| 123 |
+
conn.execute("""
|
| 124 |
+
CREATE TRIGGER IF NOT EXISTS memories_ai AFTER INSERT ON memories BEGIN
|
| 125 |
+
INSERT INTO memories_fts(rowid, content) VALUES (new.id, new.content);
|
| 126 |
+
END
|
| 127 |
+
""")
|
| 128 |
+
|
| 129 |
+
conn.execute("""
|
| 130 |
+
CREATE TRIGGER IF NOT EXISTS memories_ad AFTER DELETE ON memories BEGIN
|
| 131 |
+
DELETE FROM memories_fts WHERE rowid = old.id;
|
| 132 |
+
END
|
| 133 |
+
""")
|
| 134 |
+
|
| 135 |
+
# Index for user filtering
|
| 136 |
+
conn.execute("CREATE INDEX IF NOT EXISTS idx_user ON memories(user_id)")
|
| 137 |
+
|
| 138 |
+
conn.commit()
|
| 139 |
+
conn.close()
|
| 140 |
+
logger.info("β SQLite database initialized with FTS5")
|
| 141 |
+
|
| 142 |
+
def _warmup_model(self):
|
| 143 |
+
"""Warm up embedding model for consistent latency."""
|
| 144 |
+
warmup_start = time.perf_counter()
|
| 145 |
+
for _ in range(3):
|
| 146 |
+
_ = self.embedder.encode("warmup query", show_progress_bar=False)
|
| 147 |
+
warmup_time = (time.perf_counter() - warmup_start) * 1000
|
| 148 |
+
logger.info(f"β Embedding model warmed up ({warmup_time:.0f}ms)")
|
| 149 |
+
|
| 150 |
+
def _get_query_embedding(self, text: str) -> np.ndarray:
|
| 151 |
+
"""Get embedding with query cache."""
|
| 152 |
+
cache_key = text.strip().lower()[:100]
|
| 153 |
+
|
| 154 |
+
if cache_key in self._query_cache:
|
| 155 |
+
self._stats["cache_hits"] += 1
|
| 156 |
+
return self._query_cache[cache_key]
|
| 157 |
+
|
| 158 |
+
embedding = self.embedder.encode(text, show_progress_bar=False)
|
| 159 |
+
|
| 160 |
+
# LRU eviction
|
| 161 |
+
if len(self._query_cache) >= self._cache_max_size:
|
| 162 |
+
oldest = next(iter(self._query_cache))
|
| 163 |
+
del self._query_cache[oldest]
|
| 164 |
+
|
| 165 |
+
self._query_cache[cache_key] = embedding
|
| 166 |
+
return embedding
|
| 167 |
+
|
| 168 |
+
def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
|
| 169 |
+
"""Fast cosine similarity."""
|
| 170 |
+
return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
|
| 171 |
+
|
| 172 |
+
def _bm25_rank_to_score(self, rank: int) -> float:
|
| 173 |
+
"""Convert BM25 rank to normalized score."""
|
| 174 |
+
return 1.0 / (1.0 + max(0, rank))
|
| 175 |
+
|
| 176 |
+
def _hybrid_search_sync(self, query: str) -> List[Tuple[str, float]]:
|
| 177 |
+
"""
|
| 178 |
+
Hybrid search combining vector similarity and BM25 keyword matching.
|
| 179 |
+
Returns [(content, score), ...] sorted by score.
|
| 180 |
+
"""
|
| 181 |
+
conn = sqlite3.connect(self.db_path)
|
| 182 |
+
|
| 183 |
+
# Get query embedding
|
| 184 |
+
query_embedding = self._get_query_embedding(query)
|
| 185 |
+
|
| 186 |
+
# ========== Vector Search ==========
|
| 187 |
+
vector_results = {}
|
| 188 |
+
cursor = conn.execute(
|
| 189 |
+
"SELECT id, content, embedding FROM memories WHERE user_id = ? ORDER BY created_at DESC LIMIT 100",
|
| 190 |
+
(self.user_id,)
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
for row_id, content, embedding_blob in cursor:
|
| 194 |
+
if embedding_blob:
|
| 195 |
+
doc_embedding = np.frombuffer(embedding_blob, dtype=np.float32)
|
| 196 |
+
similarity = self._cosine_similarity(query_embedding, doc_embedding)
|
| 197 |
+
vector_results[row_id] = {
|
| 198 |
+
"content": content,
|
| 199 |
+
"vector_score": similarity,
|
| 200 |
+
"bm25_score": 0.0,
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
# ========== BM25 Search (FTS5) ==========
|
| 204 |
+
# Build FTS query using OR for flexible token matching
|
| 205 |
+
tokens = [t for t in query.split() if len(t) > 2]
|
| 206 |
+
if tokens:
|
| 207 |
+
# Use OR for more flexible matching
|
| 208 |
+
fts_query = " OR ".join(f'"{t}"' for t in tokens[:5]) # Limit tokens
|
| 209 |
+
try:
|
| 210 |
+
bm25_cursor = conn.execute(
|
| 211 |
+
"""
|
| 212 |
+
SELECT rowid, rank FROM memories_fts
|
| 213 |
+
WHERE memories_fts MATCH ?
|
| 214 |
+
ORDER BY rank
|
| 215 |
+
LIMIT ?
|
| 216 |
+
""",
|
| 217 |
+
(fts_query, self.search_limit * 4)
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
for rank_idx, (row_id, bm25_rank) in enumerate(bm25_cursor):
|
| 221 |
+
bm25_score = self._bm25_rank_to_score(rank_idx)
|
| 222 |
+
if row_id in vector_results:
|
| 223 |
+
vector_results[row_id]["bm25_score"] = bm25_score
|
| 224 |
+
else:
|
| 225 |
+
# BM25 found something vector didn't
|
| 226 |
+
content_cursor = conn.execute(
|
| 227 |
+
"SELECT content FROM memories WHERE id = ?", (row_id,)
|
| 228 |
+
)
|
| 229 |
+
row = content_cursor.fetchone()
|
| 230 |
+
if row:
|
| 231 |
+
vector_results[row_id] = {
|
| 232 |
+
"content": row[0],
|
| 233 |
+
"vector_score": 0.0,
|
| 234 |
+
"bm25_score": bm25_score,
|
| 235 |
+
}
|
| 236 |
+
except sqlite3.OperationalError as e:
|
| 237 |
+
# FTS query failed, continue with vector only
|
| 238 |
+
logger.debug(f"FTS query failed: {e}")
|
| 239 |
+
pass
|
| 240 |
+
|
| 241 |
+
conn.close()
|
| 242 |
+
|
| 243 |
+
# ========== Weighted Score Fusion ==========
|
| 244 |
+
results = []
|
| 245 |
+
for data in vector_results.values():
|
| 246 |
+
final_score = (
|
| 247 |
+
self.vector_weight * data["vector_score"] +
|
| 248 |
+
self.bm25_weight * data["bm25_score"]
|
| 249 |
+
)
|
| 250 |
+
results.append((data["content"], final_score))
|
| 251 |
+
|
| 252 |
+
# Sort by score, return top N
|
| 253 |
+
results.sort(key=lambda x: x[1], reverse=True)
|
| 254 |
+
return results[:self.search_limit]
|
| 255 |
+
|
| 256 |
+
def _store_sync(self, text: str):
|
| 257 |
+
"""Store memory with embedding."""
|
| 258 |
+
embedding = self.embedder.encode(text, show_progress_bar=False)
|
| 259 |
+
embedding_blob = embedding.astype(np.float32).tobytes()
|
| 260 |
+
|
| 261 |
+
conn = sqlite3.connect(self.db_path)
|
| 262 |
+
conn.execute(
|
| 263 |
+
"INSERT INTO memories (user_id, content, embedding) VALUES (?, ?, ?)",
|
| 264 |
+
(self.user_id, text, embedding_blob)
|
| 265 |
+
)
|
| 266 |
+
conn.commit()
|
| 267 |
+
conn.close()
|
| 268 |
+
|
| 269 |
+
async def _search_with_timeout(self, query: str) -> List[Tuple[str, float]]:
|
| 270 |
+
"""Async search with strict timeout."""
|
| 271 |
+
loop = asyncio.get_event_loop()
|
| 272 |
+
|
| 273 |
+
try:
|
| 274 |
+
result = await asyncio.wait_for(
|
| 275 |
+
loop.run_in_executor(self._executor, self._hybrid_search_sync, query),
|
| 276 |
+
timeout=self.search_timeout_ms / 1000,
|
| 277 |
+
)
|
| 278 |
+
return result
|
| 279 |
+
except asyncio.TimeoutError:
|
| 280 |
+
self._stats["timeouts"] += 1
|
| 281 |
+
logger.warning(f"β±οΈ Memory search timed out ({self.search_timeout_ms}ms)")
|
| 282 |
+
return []
|
| 283 |
+
|
| 284 |
+
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process Pipecat frames with hybrid memory injection."""
        await super().process_frame(frame, direction)

        try:
            self._frame_count += 1

            # Debug: log frame types to understand what's flowing through
            frame_type = type(frame).__name__
            direction_name = "DOWNSTREAM" if direction == FrameDirection.DOWNSTREAM else "UPSTREAM"

            # Log LLM-related frames for debugging
            if 'LLM' in frame_type or 'Messages' in frame_type or 'Context' in frame_type:
                logger.info(f"🔍 [HybridMemory] >>> RECEIVED: {frame_type} | Direction: {direction_name} | Count: {self._frame_count}")

            context = None
            messages = None

            if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
                logger.info("🧠 [HybridMemory] ─── PROCESSING LLMContextFrame ───")
                context = frame.context
            elif isinstance(frame, LLMMessagesFrame):
                logger.info("🧠 [HybridMemory] ─── PROCESSING LLMMessagesFrame ───")
                messages = frame.messages
                context = LLMContext(messages)

            if context:
                # Extract the most recent user message
                user_message = None
                for msg in reversed(context.get_messages()):
                    if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                        user_message = msg["content"]
                        break

                if user_message:
                    self._stats["searches"] += 1
                    start_time = time.perf_counter()

                    logger.info(f"🔍 [HybridMemory] Searching for: '{user_message[:50]}...'")

                    # Hybrid search with timeout
                    results = await self._search_with_timeout(user_message)

                    latency_ms = (time.perf_counter() - start_time) * 1000
                    self._stats["total_latency_ms"] += latency_ms

                    # Emit metrics
                    await self.push_frame(
                        MetricsFrame(data=[
                            TTFBMetricsData(processor="HybridMemory", value=latency_ms / 1000)
                        ]),
                        direction,
                    )

                    # Inject memories
                    if results:
                        memories_text = self.system_prompt_prefix + "\n".join(
                            f"- {content}" for content, score in results
                        )
                        context.add_message({"role": "system", "content": memories_text})

                        cache_rate = self._stats["cache_hits"] / max(1, self._stats["searches"]) * 100
                        avg_latency = self._stats["total_latency_ms"] / max(1, self._stats["searches"])
                        logger.info(
                            f"📚 [HybridMemory] {len(results)} memories ({latency_ms:.0f}ms, "
                            f"avg: {avg_latency:.0f}ms, cache: {cache_rate:.0f}%)"
                        )
                    else:
                        logger.info(f"📚 [HybridMemory] No relevant memories ({latency_ms:.0f}ms)")

                    # Fire-and-forget storage
                    asyncio.create_task(self._store_async(user_message))

                # Push the (possibly augmented) frame downstream
                if messages is not None:
                    await self.push_frame(LLMMessagesFrame(context.get_messages()), direction)
                else:
                    await self.push_frame(frame, direction)
            else:
                await self.push_frame(frame, direction)

        except Exception as e:
            logger.error(f"❌ [HybridMemory] Memory error: {e}", exc_info=True)
            await self.push_frame(frame, direction)

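    # Storage is fire-and-forget: process_frame launches this with
    # asyncio.create_task, so embedding + INSERT latency never delays the
    # reply; failures are logged at debug level and otherwise swallowed.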
    async def _store_async(self, text: str):
        """Async storage (fire-and-forget)."""
        loop = asyncio.get_event_loop()
        try:
            await loop.run_in_executor(self._executor, self._store_sync, text)
            logger.debug(f"💾 [HybridMemory] Stored: {text[:50]}...")
        except Exception as e:
            logger.debug(f"[HybridMemory] Store failed: {e}")

    def get_stats(self) -> dict:
        """Get performance statistics."""
        searches = max(1, self._stats["searches"])
        return {
            "searches": self._stats["searches"],
            "cache_hits": self._stats["cache_hits"],
            "cache_hit_rate": f"{(self._stats['cache_hits'] / searches) * 100:.1f}%",
            "timeouts": self._stats["timeouts"],
            "avg_latency_ms": f"{self._stats['total_latency_ms'] / searches:.1f}",
        }

    async def close(self):
        """Cleanup resources."""
        self._executor.shutdown(wait=False)
        stats = self.get_stats()
        logger.info(f"👋 [HybridMemory] Final stats: {stats}")
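
A minimal usage sketch for wiring this processor into a Pipecat pipeline. The class name HybridMemoryService and the constructor arguments below are assumptions inferred from the attributes this file references (db_path, user_id, vector_weight, bm25_weight, search_timeout_ms), not the repo's confirmed API; the real __init__ appears earlier in this file's diff.

# Hypothetical wiring (assumed names, illustrative values)
memory = HybridMemoryService(
    db_path="memory_data/memories.db",  # SQLite DB with memories + memories_fts tables
    user_id="default",
    vector_weight=0.7,                  # weight on embedding similarity
    bm25_weight=0.3,                    # weight on keyword (BM25) relevance
    search_timeout_ms=200,              # hard per-turn latency budget
)

# Place it upstream of the LLM so retrieved memories are injected as a
# system message before each completion, e.g.:
#   Pipeline([transport.input(), stt, context_aggregator.user(),
#             memory, llm, tts, transport.output()])

# On shutdown, release the executor and log final stats:
#   await memory.close()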