latishab committed on
Commit e8ed0e1 · verified · 1 Parent(s): 3c12fa0

Update TARS Conversation App with TarsApp framework

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +48 -0
  3. CLAUDE.md +50 -0
  4. LICENSE +21 -0
  5. README.md +340 -8
  6. app.json +55 -0
  7. assets/audio/tars-clean-compressed.mp3 +3 -0
  8. bot.py +605 -0
  9. config.ini.example +52 -0
  10. docs/DAEMON_INTEGRATION.md +393 -0
  11. docs/DASHBOARD_UPDATE_SUMMARY.md +218 -0
  12. docs/DEVELOPING_APPS.md +400 -0
  13. docs/INSTALLATION_GUIDE.md +264 -0
  14. docs/MEMORY.md +190 -0
  15. env.example +59 -0
  16. index.html +333 -0
  17. install.sh +99 -0
  18. manifest.json +47 -0
  19. pipecat_service.py +272 -0
  20. publish-to-hf.sh +87 -0
  21. pyproject.toml +25 -0
  22. requirements.txt +18 -0
  23. scripts/update_daemon.py +388 -0
  24. src/README.md +55 -0
  25. src/character/TARS.json +25 -0
  26. src/character/persona.ini +21 -0
  27. src/character/prompts.py +331 -0
  28. src/config/__init__.py +152 -0
  29. src/config/connection.py +179 -0
  30. src/observers/__init__.py +21 -0
  31. src/observers/assistant_observer.py +142 -0
  32. src/observers/debug_observer.py +22 -0
  33. src/observers/display_events_observer.py +100 -0
  34. src/observers/metrics_observer.py +196 -0
  35. src/observers/state_observer.py +166 -0
  36. src/observers/transcription_observer.py +70 -0
  37. src/observers/tts_state_observer.py +56 -0
  38. src/observers/vision_observer.py +142 -0
  39. src/processors/__init__.py +18 -0
  40. src/processors/emotional_monitor.py +303 -0
  41. src/processors/filters.py +81 -0
  42. src/processors/gating.py +129 -0
  43. src/processors/visual_observer.py +389 -0
  44. src/services/README.md +110 -0
  45. src/services/__init__.py +1 -0
  46. src/services/factories/__init__.py +6 -0
  47. src/services/factories/stt_factory.py +127 -0
  48. src/services/factories/tts_factory.py +84 -0
  49. src/services/memory/memory_chromadb.py +195 -0
  50. src/services/memory/memory_hybrid.py +393 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/audio/tars-clean-compressed.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,48 @@
+ # dependencies
+ node_modules/
+ /.pnp
+ .pnp.js
+
+ # testing
+ /coverage
+
+ # next.js
+ /.next/
+ /out/
+
+ # cache
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+
+ # production
+ /build
+
+ # misc
+ .DS_Store
+ *.pem
+ /.models/
+ /.claude/
+ /chroma_memory/
+ /deprecated/
+ /memory_data/
+
+ # debug
+ npm-debug.log*
+ yarn-debug.log*
+ yarn-error.log*
+
+ # local env files
+ .env*.local
+ .env
+
+ # local config files
+ config.ini
+
+ # vercel
+ .vercel
+
+ # typescript
+ *.tsbuildinfo
+ next-env.d.ts
CLAUDE.md ADDED
@@ -0,0 +1,50 @@
+ # TARS Omni
+
+ AI brain that connects to the Raspberry Pi hardware daemon.
+
+ ## Pi Access
+ ```
+ ssh tars-pi # 100.84.133.74, user: mac, repo: ~/tars-daemon
+ ```
+
+ ## Install
+
+ Pi (from tars-daemon dashboard):
+ - Apps tab → Install button
+
+ Pi (manual):
+ ```bash
+ ssh tars-pi "cd ~/tars-conversation-app && bash install.sh"
+ ```
+
+ See: docs/INSTALLATION_GUIDE.md
+
+ ## Run
+
+ 1. Pi: `ssh tars-pi "cd ~/tars && python tars_daemon.py"`
+ 2. Mac: `python tars_bot.py`
+
+ ---
+
+ ## Docs
+
+ - Installation: docs/INSTALLATION_GUIDE.md
+ - App Development: docs/DEVELOPING_APPS.md
+ - Daemon Integration: docs/DAEMON_INTEGRATION.md
+ - Dashboard Update: docs/DASHBOARD_UPDATE_SUMMARY.md
+
+ ## Dashboard Install
+
+ The tars-daemon dashboard now supports app management:
+ - Apps tab shows all apps in ~/tars-apps/
+ - Install/Uninstall buttons
+ - Start/Stop controls
+ - Auto-discovery via app.json
+
+ ## Claude Code Guidelines
+
+ - No emojis, no [NEW] markers, no "vs" comparisons
+ - Concise, technical, factual only
+ - No fluff, benefits sections, or marketing language
+ - Commits: imperative mood, no emojis
+ - Comments: minimal, explain "why" not "what"
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Latisha Besariani Hendra and TARS Omni Contributors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,345 @@
  ---
- title: Tars Conversation App
- emoji: 🏢
- colorFrom: indigo
- colorTo: yellow
+ title: TARS Conversation App
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
- sdk_version: 6.5.1
- app_file: app.py
+ sdk_version: "4.0.0"
+ app_file: ui/app.py
  pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # TARS Conversation App
+
+ Real-time voice AI with transcription, vision, and intelligent conversation using Speechmatics/Deepgram, Qwen3-TTS/ElevenLabs, DeepInfra LLM, and Moondream.
+
+ ## Features
+
+ - **Dual Operation Modes**
+   - **WebRTC Mode** (`bot.py`) - Browser-based voice AI with real-time metrics dashboard
+   - **Robot Mode** (`tars_bot.py`) - Connect to Raspberry Pi TARS robot via WebRTC and gRPC
+ - **Real-time Transcription** - Speechmatics or Deepgram with smart turn detection
+ - **Dual TTS Options** - Qwen3-TTS (local, free, voice cloning) or ElevenLabs (cloud)
+ - **LLM Integration** - Any model via DeepInfra
+ - **Vision Analysis** - Moondream for image understanding
+ - **Smart Gating Layer** - AI-powered decision system for natural conversation flow
+ - **Hybrid Memory** - SQLite-based hybrid search (70% vector + 30% BM25)
+ - **Emotional Monitoring** - Real-time detection of confusion, hesitation, and frustration
+ - **Gradio Dashboard** - Live TTFB metrics, latency charts, and conversation transcription
+ - **WebRTC Transport** - Low-latency peer-to-peer audio
+ - **gRPC Robot Control** - Hardware control with 5-10ms latency (robot mode only)
+
+ ## Project Structure
+
+ ```
+ tars-conversation-app/
+ ├── bot.py               # WebRTC mode - Browser voice AI
+ ├── tars_bot.py          # Robot mode - Raspberry Pi hardware
+ ├── pipecat_service.py   # FastAPI backend (WebRTC signaling)
+ ├── config.py            # Configuration management
+ ├── config.ini           # User configuration file
+ ├── requirements.txt     # Python dependencies
+ │
+ ├── src/                 # Backend
+ │   ├── observers/       # Pipeline observers (metrics, transcription)
+ │   ├── processors/      # Pipeline processors (silence filter, gating)
+ │   ├── services/        # Services (STT, TTS, Memory, Robot)
+ │   ├── tools/           # LLM callable functions
+ │   ├── transport/       # WebRTC transport (aiortc)
+ │   ├── character/       # TARS personality and prompts
+ │   └── shared_state.py  # Shared metrics storage
+ │
+ ├── ui/                  # Frontend
+ │   └── app.py           # Gradio dashboard (metrics + transcription)
+ │
+ ├── tests/               # Tests
+ │   └── gradio/
+ │       └── test_gradio.py  # UI integration test
+ │
+ ├── character/           # TARS character data
+ │   ├── TARS.json        # Character definition
+ │   └── persona.ini      # Personality parameters
+ ```
+
+ ## Operation Modes
+
+ ### WebRTC Mode (`bot.py`)
+ - **Use case**: Browser-based voice AI conversations
+ - **Transport**: SmallWebRTC (browser ↔ Pipecat)
+ - **Features**: Full pipeline with STT, LLM, TTS, Memory
+ - **UI**: Gradio dashboard for metrics and transcription
+ - **Best for**: Development, testing, remote conversations
+
+ ### Robot Mode (`tars_bot.py`)
+ - **Use case**: Physical TARS robot on Raspberry Pi
+ - **Transport**: aiortc (RPi ↔ Pipecat) + gRPC (commands)
+ - **Features**: Same pipeline + robot control (eyes, gestures, movement)
+ - **Hardware**: Requires TARS robot with servos and display
+ - **Best for**: Physical robot interactions, demos
+
+ ## Quick Start
+
+ ### Installation on TARS Robot (Recommended)
+
+ Install directly from the HuggingFace Space via the TARS dashboard:
+
+ 1. Open the TARS dashboard at `http://your-pi:8000`
+ 2. Go to the **App Store** tab
+ 3. Enter Space ID: `latishab/tars-conversation-app`
+ 4. Click **Install from HuggingFace**
+ 5. Configure API keys in `.env.local`
+ 6. Click **Start**
+ 7. Access the metrics dashboard at `http://your-pi:7860`
+
+ The app will:
+ - Auto-install dependencies
+ - Set up a virtual environment
+ - Configure for robot mode
+ - Start the Gradio dashboard
+
+ ### Easy Installation (Manual)
+
+ For first-time setup on Raspberry Pi:
+
+ ```bash
+ # Clone and install
+ git clone https://github.com/latishab/tars-conversation-app.git
+ cd tars-conversation-app
+ bash install.sh
+ ```
+
+ The installer handles:
+ - System dependencies (portaudio, ffmpeg)
+ - Python virtual environment
+ - All Python packages
+ - Configuration file setup
+
+ ### Manual Installation
+
+ ```bash
+ # Python dependencies
+ pip install -r requirements.txt
+
+ # For robot mode, install TARS SDK
+ pip install tars-robot[sdk]
+ ```
+
+ ### Configure Environment
+
+ ```bash
+ # Copy and edit environment file with your API keys
+ cp env.example .env.local
+
+ # Copy and edit configuration file
+ cp config.ini.example config.ini
+ ```
+
+ Required API keys (in `.env.local`):
+ - `SPEECHMATICS_API_KEY` or `DEEPGRAM_API_KEY` - For speech-to-text
+ - `DEEPINFRA_API_KEY` - For LLM
+ - `ELEVENLABS_API_KEY` - Optional (if using ElevenLabs TTS)
+
+ Settings (in `config.ini`):
+ ```ini
+ [LLM]
+ model = meta-llama/Llama-3.3-70B-Instruct
+
+ [STT]
+ provider = deepgram  # or speechmatics
+
+ [TTS]
+ provider = qwen3  # or elevenlabs
+
+ [Memory]
+ type = hybrid  # SQLite-based hybrid search (vector + BM25)
+ ```
+
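The `type = hybrid` setting above boils down to a weighted sum of two retrieval scores. A minimal sketch of the blend, using the 70/30 weights from the config (the score inputs are hypothetical placeholders; the real implementation lives in `src/services/memory/memory_hybrid.py`):

```python
# Sketch of the hybrid ranking; score inputs assumed normalized to [0, 1].
VECTOR_WEIGHT = 0.7  # semantic-similarity share
BM25_WEIGHT = 0.3    # keyword-match share

def hybrid_score(vector_score: float, bm25_score: float) -> float:
    """Blend semantic and keyword relevance into one ranking score."""
    return VECTOR_WEIGHT * vector_score + BM25_WEIGHT * bm25_score

# A memory that is semantically close but shares few keywords still ranks high:
print(hybrid_score(vector_score=0.9, bm25_score=0.2))  # 0.69
```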
+ ### Run
+
+ #### WebRTC Mode (Browser)
+
+ **Terminal 1: Python backend**
+ ```bash
+ python pipecat_service.py
+ ```
+
+ **Terminal 2: Gradio UI (optional)**
+ ```bash
+ python ui/app.py
+ ```
+
+ Then:
+ 1. Open the WebRTC client in a browser (connect to pipecat_service)
+ 2. Open the Gradio dashboard at http://localhost:7861 (for metrics)
+ 3. Start talking
+
+ #### Robot Mode (Raspberry Pi)
+
+ Prerequisites:
+ - Raspberry Pi TARS robot running tars_daemon.py
+ - Network connection (LAN or Tailscale)
+ - TARS SDK installed
+
+ Configuration in `config.ini`:
+ ```ini
+ [Connection]
+ mode = robot
+ rpi_url = http://<your-rpi-ip>:8001
+ rpi_grpc = <your-rpi-ip>:50051
+ auto_connect = true
+
+ [Display]
+ enabled = true
+ ```
+
+ Deployment detection (see the sketch below):
+ - **Remote** (Mac/computer): Uses configured addresses
+ - **Local** (on RPi): Auto-detects localhost:50051
+
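docs/DAEMON_INTEGRATION.md sketches this detection; a condensed version (the fallback address is the example value used there):

```python
import os

def get_grpc_address() -> str:
    """Local daemon when running on the Pi, configured address otherwise."""
    try:
        with open("/proc/cpuinfo") as f:
            if "Raspberry Pi" in f.read():
                return "localhost:50051"  # running on the robot itself
    except OSError:
        pass  # not Linux, or file unreadable -> assume remote
    return os.getenv("RPI_GRPC", "100.84.133.74:50051")  # configured remote daemon
```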
+ Run:
+ ```bash
+ python tars_bot.py
+ ```
+
+ ## Gradio Dashboard
+
+ The Gradio UI (`ui/app.py`) provides real-time monitoring:
+
+ ### Latency Dashboard
+ - Service configuration (STT, Memory, LLM, TTS)
+ - TTFB metrics with min/max/avg/last stats
+ - Line chart: Latency trends over time
+ - Bar chart: Stacked latency breakdown
+ - Metrics table: Last 15 turns
+
+ ### Conversation Tab
+ - Live user and assistant transcriptions
+ - Auto-updates every second
+
+ ### Connection Tab
+ - Architecture documentation
+ - Usage instructions
+
+ ## Architecture
+
+ ### WebRTC Mode Data Flow
+ ```
+ Browser (WebRTC client)
+        ↕ (audio)
+ SmallWebRTC Transport
+        ↓
+ Pipeline: STT → Memory → LLM → TTS
+        ↓
+ Observers (metrics, transcription, assistant)
+        ↓
+ shared_state.py
+        ↓
+ Gradio UI (http://localhost:7861)
+ ```
+
+ ### Robot Mode Data Flow
+ ```
+ RPi Mic → WebRTC → Pipecat Pipeline → WebRTC → RPi Speaker
+  (audio)               ↓               (audio)
+            STT → Memory → LLM → TTS
+                        ↓
+         LLM Tools (set_emotion, do_gesture)
+                        ↓
+               gRPC → RPi Hardware
+            (eyes, servos, display)
+ ```
+
+ Communication channels (Robot Mode):
+
+ | Channel | Protocol | Purpose | Latency |
+ |---------|----------|---------|---------|
+ | Audio | WebRTC (aiortc) | Voice conversation | ~20ms |
+ | Commands | gRPC | Hardware control | ~5-10ms |
+ | State | DataChannel | Battery, movement status | ~10ms |
+
+ ## Testing
+
+ ```bash
+ # Test Gradio integration
+ python tests/gradio/test_gradio.py
+
+ # Test gesture recognition (robot mode)
+ python tests/test_gesture.py
+
+ # Test hardware connection (robot mode, from RPi)
+ ssh tars-pi "cd ~/tars && python tests/test_hardware.py"
+ ```
+
+ ## Development
+
+ See [docs/DEVELOPING_APPS.md](docs/DEVELOPING_APPS.md) for a comprehensive guide to creating TARS SDK apps.
+
+ ### Adding Metrics
+ 1. Emit a `MetricsFrame` in your service/processor (see the sketch below)
+ 2. `MetricsObserver` will capture it automatically
+ 3. Metrics appear in the Gradio dashboard
+
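A minimal sketch of step 1, assuming the `MetricsFrame`/`TTFBMetricsData` shapes of recent Pipecat releases (verify against your installed version; the processor and the work being timed are illustrative):

```python
import time

from pipecat.frames.frames import MetricsFrame
from pipecat.metrics.metrics import TTFBMetricsData
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection


class TimedProcessor(FrameProcessor):
    """Hypothetical processor that reports how long its work took."""

    async def process_frame(self, frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        started = time.monotonic()
        # ... do the actual work here ...
        elapsed = time.monotonic() - started
        # MetricsObserver picks this frame up and feeds the Gradio dashboard
        await self.push_frame(
            MetricsFrame(data=[TTFBMetricsData(processor=self.name, value=elapsed)]),
            FrameDirection.DOWNSTREAM,
        )
        await self.push_frame(frame, direction)
```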
+ ### Adding Tools
+ 1. Create a function in `src/tools/`
+ 2. Create a schema with `create_*_schema()`
+ 3. Register it in `bot.py` or `tars_bot.py`
+ 4. The LLM can now call your tool (a full sketch follows)
+
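A sketch of the full cycle, following the pattern `bot.py` uses for its own tools (the `get_time` tool is a hypothetical example; the `FunctionSchema` import path is assumed from Pipecat's adapters package, next to the `ToolsSchema` import in `bot.py`):

```python
from datetime import datetime, timezone

from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.services.llm_service import FunctionCallParams


def create_get_time_schema() -> FunctionSchema:
    # Schema the LLM sees: name, description, and (here, empty) parameters
    return FunctionSchema(
        name="get_time",
        description="Return the current UTC time as an ISO-8601 string.",
        properties={},
        required=[],
    )


async def get_time(params: FunctionCallParams):
    # Hand the result back to the pipeline via the provided callback
    await params.result_callback(datetime.now(timezone.utc).isoformat())
```

In `bot.py`, add `create_get_time_schema()` to `ToolsSchema(standard_tools=[...])` and register the handler with `llm.register_function("get_time", get_time)`.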
+ ### Modifying UI
+ 1. Edit `ui/app.py`
+ 2. Gradio hot-reloads automatically
+ 3. Access `metrics_store` for data
+
+ ### Uninstalling
+
+ ```bash
+ bash uninstall.sh
+ ```
+
+ Removes the virtual environment and, optionally, data/config files.
+
+ ## Troubleshooting
+
+ ### No metrics in Gradio UI
+ - Ensure the bot is running (`bot.py` or `tars_bot.py`)
+ - Check the WebRTC client is connected
+ - Verify at least one conversation turn completed
+
+ ### Robot mode connection issues
+ - Check the RPi is reachable: `ping <rpi-ip>`
+ - Verify tars_daemon is running on the RPi
+ - Check gRPC port 50051 is open
+ - Review config.ini addresses
+
+ ### Import errors
+ ```bash
+ pip install -r requirements.txt
+ pip install gradio plotly  # For UI
+ ```
+
+ ### Audio issues (robot mode)
+ - Check the RPi mic/speaker with `arecord`/`aplay`
+ - Verify the WebRTC connection in logs
+ - Test with `tests/test_hardware.py`
+
+ ## Contributing
+
+ Contributions welcome.
+
+ 1. Fork the repository
+ 2. Create a feature branch
+ 3. Make your changes
+ 4. Test with `python tests/gradio/test_gradio.py`
+ 5. Commit with clear messages (see CLAUDE.md for style)
+ 6. Push to your fork
+ 7. Open a Pull Request
+
+ Code style:
+ - Python: Follow PEP 8
+ - Add comments for complex logic
+ - Update docs for new features
+ - See CLAUDE.md for guidelines (concise, technical, no fluff)
+
+ ## License
+
+ MIT License - see the LICENSE file for details
app.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "name": "tars-conversation-app",
+   "version": "1.0.0",
+   "description": "Real-time conversational AI with WebRTC, memory, and vision",
+   "author": "TARS Project",
+   "repository": "https://github.com/latishab/tars-conversation-app.git",
+   "main": "tars_bot.py",
+   "install_script": "install.sh",
+   "uninstall_script": "uninstall.sh",
+   "dependencies": {
+     "python": ">=3.10",
+     "system": [
+       "portaudio19-dev",
+       "ffmpeg",
+       "build-essential",
+       "python3-dev"
+     ]
+   },
+   "environment": [
+     "DEEPINFRA_API_KEY",
+     "SPEECHMATICS_API_KEY",
+     "DEEPGRAM_API_KEY",
+     "ELEVENLABS_API_KEY"
+   ],
+   "configuration": {
+     "file": "config.ini",
+     "example": "config.ini.example",
+     "env_file": ".env.local",
+     "env_example": "env.example"
+   },
+   "ports": {
+     "grpc": 50051,
+     "http": 8765,
+     "fastapi": 8080
+   },
+   "modes": [
+     {
+       "name": "robot",
+       "description": "Connect to Pi hardware via gRPC",
+       "command": "python tars_bot.py"
+     },
+     {
+       "name": "browser",
+       "description": "Browser-based WebRTC mode",
+       "command": "python bot.py"
+     }
+   ],
+   "services": {
+     "dashboard": {
+       "enabled": true,
+       "command": "python ui/app.py",
+       "port": 7860
+     }
+   }
+ }
assets/audio/tars-clean-compressed.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35e66e7ef9dfd3e64ed70fcdb32b220686d3ad4451af88bfa72a48563a85b120
+ size 289820
bot.py ADDED
@@ -0,0 +1,605 @@
+ """Bot pipeline setup and execution."""
+
+ import sys
+ from pathlib import Path
+
+ # Add src/ to Python path
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
+
+ import asyncio
+ import json
+ import os
+ import logging
+ import uuid
+ import httpx
+
+ from pipecat.adapters.schemas.tools_schema import ToolsSchema
+ from pipecat.frames.frames import (
+     LLMRunFrame,
+     TranscriptionFrame,
+     InterimTranscriptionFrame,
+     Frame,
+     TranscriptionMessage,
+     TranslationFrame,
+     UserImageRawFrame,
+     UserAudioRawFrame,
+     UserImageRequestFrame,
+ )
+ from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+ from pipecat.pipeline.pipeline import Pipeline
+ from pipecat.pipeline.runner import PipelineRunner
+ from pipecat.pipeline.task import PipelineTask, PipelineParams
+ from pipecat.processors.aggregators.llm_context import LLMContext
+ from pipecat.processors.aggregators.llm_response_universal import (
+     LLMContextAggregatorPair,
+     LLMUserAggregatorParams
+ )
+ from pipecat.observers.turn_tracking_observer import TurnTrackingObserver
+ from pipecat.observers.loggers.user_bot_latency_log_observer import UserBotLatencyLogObserver
+ from pipecat.services.moondream.vision import MoondreamService
+ from pipecat.services.openai.llm import OpenAILLMService
+ from pipecat.services.llm_service import FunctionCallParams
+ from services.memory_hybrid import HybridMemoryService
+ from pipecat.transcriptions.language import Language
+ from pipecat.transports.base_transport import TransportParams
+ from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
+
+ from loguru import logger
+
+ from config import (
+     SPEECHMATICS_API_KEY,
+     DEEPGRAM_API_KEY,
+     ELEVENLABS_API_KEY,
+     ELEVENLABS_VOICE_ID,
+     DEEPINFRA_API_KEY,
+     DEEPINFRA_BASE_URL,
+     MEM0_API_KEY,
+     get_fresh_config,
+ )
+ from services.factories import create_stt_service, create_tts_service
+ from processors import (
+     SilenceFilter,
+     InputAudioFilter,
+     InterventionGating,
+     VisualObserver,
+     EmotionalStateMonitor,
+ )
+ from observers import (
+     MetricsObserver,
+     TranscriptionObserver,
+     AssistantResponseObserver,
+     TTSStateObserver,
+     VisionObserver,
+     DebugObserver,
+     DisplayEventsObserver,
+ )
+ from character.prompts import (
+     load_persona_ini,
+     load_tars_json,
+     build_tars_system_prompt,
+     get_introduction_instruction,
+ )
+ from tools import (
+     fetch_user_image,
+     adjust_persona_parameter,
+     execute_movement,
+     capture_camera_view,
+     create_fetch_image_schema,
+     create_adjust_persona_schema,
+     create_identity_schema,
+     create_movement_schema,
+     create_camera_capture_schema,
+     get_persona_storage,
+     get_crossword_hint,
+     create_crossword_hint_schema,
+ )
+ from shared_state import metrics_store
+
+
+ # ============================================================================
+ # CUSTOM FRAME PROCESSORS
+ # ============================================================================
+
+ class IdentityUnifier(FrameProcessor):
+     """
+     Applies 'guest_ID' ONLY to specific user input frames.
+     Leaves other frames untouched.
+     """
+     # Define the frame types that should have user_id set
+     TARGET_FRAME_TYPES = (
+         TranscriptionFrame,
+         TranscriptionMessage,
+         TranslationFrame,
+         InterimTranscriptionFrame,
+         UserImageRawFrame,
+         UserAudioRawFrame,
+         UserImageRequestFrame,
+     )
+
+     def __init__(self, target_user_id):
+         super().__init__()
+         self.target_user_id = target_user_id
+
+     async def process_frame(self, frame: Frame, direction: FrameDirection):
+         # 1. Handle internal state
+         await super().process_frame(frame, direction)
+
+         # 2. Only modify specific frame types
+         if isinstance(frame, self.TARGET_FRAME_TYPES):
+             try:
+                 frame.user_id = self.target_user_id
+             except Exception:
+                 pass
+
+         # 3. Push downstream
+         await self.push_frame(frame, direction)
+
+
+ # ============================================================================
+ # HELPER FUNCTIONS
+ # ============================================================================
+
+ async def _cleanup_services(service_refs: dict):
+     if service_refs.get("stt"):
+         try:
+             await service_refs["stt"].close()
+             logger.info("✓ STT service cleaned up")
+         except Exception:
+             pass
+     if service_refs.get("tts"):
+         try:
+             await service_refs["tts"].close()
+             logger.info("✓ TTS service cleaned up")
+         except Exception:
+             pass
+
+
+ # ============================================================================
+ # MAIN BOT PIPELINE
+ # ============================================================================
+
+ async def run_bot(webrtc_connection):
+     """Initialize and run the TARS bot pipeline."""
+     logger.info("Starting bot pipeline for WebRTC connection...")
+
+     # Load fresh configuration for this connection (allows runtime config updates)
+     runtime_config = get_fresh_config()
+     DEEPINFRA_MODEL = runtime_config['DEEPINFRA_MODEL']
+     DEEPINFRA_GATING_MODEL = runtime_config['DEEPINFRA_GATING_MODEL']
+     STT_PROVIDER = runtime_config['STT_PROVIDER']
+     TTS_PROVIDER = runtime_config['TTS_PROVIDER']
+     QWEN3_TTS_MODEL = runtime_config['QWEN3_TTS_MODEL']
+     QWEN3_TTS_DEVICE = runtime_config['QWEN3_TTS_DEVICE']
+     QWEN3_TTS_REF_AUDIO = runtime_config['QWEN3_TTS_REF_AUDIO']
+     EMOTIONAL_MONITORING_ENABLED = runtime_config['EMOTIONAL_MONITORING_ENABLED']
+     EMOTIONAL_SAMPLING_INTERVAL = runtime_config['EMOTIONAL_SAMPLING_INTERVAL']
+     EMOTIONAL_INTERVENTION_THRESHOLD = runtime_config['EMOTIONAL_INTERVENTION_THRESHOLD']
+     TARS_DISPLAY_URL = runtime_config['TARS_DISPLAY_URL']
+     TARS_DISPLAY_ENABLED = runtime_config['TARS_DISPLAY_ENABLED']
+
+     logger.info(f"📋 Runtime config loaded - STT: {STT_PROVIDER}, LLM: {DEEPINFRA_MODEL}, TTS: {TTS_PROVIDER}, Emotional: {EMOTIONAL_MONITORING_ENABLED}")
+
+     # Session initialization
+     session_id = str(uuid.uuid4())[:8]
+     client_id = f"guest_{session_id}"
+     client_state = {"client_id": client_id}
+     logger.info(f"Session started: {client_id}")
+
+     service_refs = {"stt": None, "tts": None}
+
+     try:
+         # ====================================================================
+         # TRANSPORT INITIALIZATION
+         # ====================================================================
+         # Note: STT providers handle their own turn detection:
+         # - Speechmatics: SMART_TURN mode
+         # - Deepgram: endpointing parameter (300ms silence detection)
+         # - Deepgram Flux: built-in turn detection with ExternalUserTurnStrategies (deprecated)
+
+         logger.info(f"Initializing transport with {STT_PROVIDER} turn detection...")
+
+         transport_params = TransportParams(
+             audio_in_enabled=True,
+             audio_out_enabled=True,
+             video_in_enabled=False,
+             video_out_enabled=False,
+             video_out_is_live=False,
+         )
+
+         pipecat_transport = SmallWebRTCTransport(
+             webrtc_connection=webrtc_connection,
+             params=transport_params,
+         )
+
+         logger.info("✓ Transport initialized")
+
+         # ====================================================================
+         # SPEECH-TO-TEXT SERVICE
+         # ====================================================================
+
+         logger.info(f"Initializing {STT_PROVIDER} STT...")
+         stt = None
+         try:
+             stt = create_stt_service(
+                 provider=STT_PROVIDER,
+                 speechmatics_api_key=SPEECHMATICS_API_KEY,
+                 deepgram_api_key=DEEPGRAM_API_KEY,
+                 language=Language.EN,
+                 enable_diarization=False,
+             )
+             service_refs["stt"] = stt
+
+             # Log additional info for Deepgram
+             if STT_PROVIDER == "deepgram":
+                 logger.info("✓ Deepgram: 300ms endpointing for turn detection")
+                 logger.info("✓ Deepgram: VAD events enabled for speech detection")
+
+         except Exception as e:
+             logger.error(f"Failed to initialize {STT_PROVIDER} STT: {e}", exc_info=True)
+             return
+
+         # ====================================================================
+         # TEXT-TO-SPEECH SERVICE
+         # ====================================================================
+
+         try:
+             tts = create_tts_service(
+                 provider=TTS_PROVIDER,
+                 elevenlabs_api_key=ELEVENLABS_API_KEY,
+                 elevenlabs_voice_id=ELEVENLABS_VOICE_ID,
+                 qwen_model=QWEN3_TTS_MODEL,
+                 qwen_device=QWEN3_TTS_DEVICE,
+                 qwen_ref_audio=QWEN3_TTS_REF_AUDIO,
+             )
+             service_refs["tts"] = tts
+         except Exception as e:
+             logger.error(f"Failed to initialize TTS service: {e}", exc_info=True)
+             return
+
+         # ====================================================================
+         # LLM SERVICE & TOOLS
+         # ====================================================================
+
+         logger.info("Initializing LLM via DeepInfra...")
+         llm = None
+         try:
+             llm = OpenAILLMService(
+                 api_key=DEEPINFRA_API_KEY,
+                 base_url=DEEPINFRA_BASE_URL,
+                 model=DEEPINFRA_MODEL
+             )
+
+             character_dir = os.path.join(os.path.dirname(__file__), "character")
+             persona_params = load_persona_ini(os.path.join(character_dir, "persona.ini"))
+             tars_data = load_tars_json(os.path.join(character_dir, "TARS.json"))
+             system_prompt = build_tars_system_prompt(persona_params, tars_data)
+
+             # Create tool schemas (these return FunctionSchema objects)
+             fetch_image_tool = create_fetch_image_schema()
+             persona_tool = create_adjust_persona_schema()
+             identity_tool = create_identity_schema()
+             crossword_hint_tool = create_crossword_hint_schema()
+             movement_tool = create_movement_schema()
+             camera_capture_tool = create_camera_capture_schema()
+
+             # Pass FunctionSchema objects directly to standard_tools
+             tools = ToolsSchema(
+                 standard_tools=[
+                     fetch_image_tool,
+                     persona_tool,
+                     identity_tool,
+                     crossword_hint_tool,
+                     movement_tool,
+                     camera_capture_tool,
+                 ]
+             )
+             messages = [system_prompt]
+             context = LLMContext(messages, tools)
+
+             llm.register_function("fetch_user_image", fetch_user_image)
+             llm.register_function("adjust_persona_parameter", adjust_persona_parameter)
+             llm.register_function("get_crossword_hint", get_crossword_hint)
+             llm.register_function("execute_movement", execute_movement)
+             llm.register_function("capture_camera_view", capture_camera_view)
+
+             pipeline_unifier = IdentityUnifier(client_id)
+
+             async def wrapped_set_identity(params: FunctionCallParams):
+                 name = params.arguments["name"]
+                 logger.info(f"👤 Identity discovered: {name}")
+
+                 old_id = client_state["client_id"]
+                 new_id = f"user_{name.lower().replace(' ', '_')}"
+
+                 if old_id != new_id:
+                     logger.info(f"🔄 Switching User ID: {old_id} -> {new_id}")
+                     client_state["client_id"] = new_id
+
+                     # Update the pipeline unifier to use new identity
+                     pipeline_unifier.target_user_id = new_id
+                     logger.info(f"✓ Updated pipeline unifier with new ID: {new_id}")
+
+                     # Update memory service with new user_id
+                     if memory_service:
+                         memory_service.user_id = new_id
+                         logger.info(f"✓ Updated memory service user_id to: {new_id}")
+
+                     # Notify frontend of identity change
+                     try:
+                         if webrtc_connection and webrtc_connection.is_connected():
+                             webrtc_connection.send_app_message({
+                                 "type": "identity_update",
+                                 "old_id": old_id,
+                                 "new_id": new_id,
+                                 "name": name
+                             })
+                             logger.info(f"📤 Sent identity update to frontend: {new_id}")
+                     except Exception as e:
+                         logger.warning(f"Failed to send identity update to frontend: {e}")
+
+                 await params.result_callback(f"Identity updated to {name}.")
+
+             llm.register_function("set_user_identity", wrapped_set_identity)
+             logger.info(f"✓ LLM initialized with model: {DEEPINFRA_MODEL}")
+
+         except Exception as e:
+             logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
+             return
+
+         # ====================================================================
+         # VISION & GATING SERVICES
+         # ====================================================================
+
+         logger.info("Initializing Moondream vision service...")
+         moondream = None
+         try:
+             moondream = MoondreamService(model="vikhyatk/moondream2", revision="2025-01-09")
+             logger.info("✓ Moondream vision service initialized")
+         except Exception as e:
+             logger.error(f"Failed to initialize Moondream: {e}")
+             return
+
+         # ====================================================================
+         # TARS DISPLAY - Note: Display control via gRPC in robot mode only
+         # ====================================================================
+
+         logger.info("TARS Display features available in robot mode (tars_bot.py)")
+         tars_client = None
+
+         logger.info("Initializing Visual Observer...")
+         visual_observer = VisualObserver(
+             vision_client=moondream,
+             enable_face_detection=True,
+             tars_client=tars_client
+         )
+         logger.info("✓ Visual Observer initialized")
+
+         logger.info("Initializing Emotional State Monitor...")
+         emotional_monitor = EmotionalStateMonitor(
+             vision_client=moondream,
+             model="vikhyatk/moondream2",
+             sampling_interval=EMOTIONAL_SAMPLING_INTERVAL,
+             intervention_threshold=EMOTIONAL_INTERVENTION_THRESHOLD,
+             enabled=EMOTIONAL_MONITORING_ENABLED,
+             auto_intervene=False,  # Let gating layer handle intervention decisions
+         )
+         logger.info(f"✓ Emotional State Monitor initialized (enabled: {EMOTIONAL_MONITORING_ENABLED})")
+         logger.info("  Mode: Integrated with gating layer for smarter decisions")
+
+         logger.info("Initializing Gating Layer...")
+         gating_layer = InterventionGating(
+             api_key=DEEPINFRA_API_KEY,
+             base_url=DEEPINFRA_BASE_URL,
+             model=DEEPINFRA_GATING_MODEL,
+             visual_observer=visual_observer,
+             emotional_monitor=emotional_monitor
+         )
+         logger.info("✓ Gating Layer initialized with emotional state integration")
+
+         # ====================================================================
+         # MEMORY SERVICE
+         # ====================================================================
+
+         # Memory service: Hybrid search combining vector similarity (70%) and BM25 keyword matching (30%)
+         # Optimized for voice AI with <50ms latency target
+         logger.info("Initializing hybrid memory service...")
+         memory_service = None
+         try:
+             memory_service = HybridMemoryService(
+                 user_id=client_id,
+                 db_path="./memory_data/memory.sqlite",
+                 search_limit=3,
+                 search_timeout_ms=100,  # Hybrid search needs ~60-80ms, allow buffer
+                 vector_weight=0.7,  # 70% semantic similarity
+                 bm25_weight=0.3,  # 30% keyword matching
+                 system_prompt_prefix="From our conversations:\n",
+             )
+             logger.info(f"✓ Hybrid memory service initialized for {client_id}")
+         except Exception as e:
+             logger.error(f"Failed to initialize hybrid memory service: {e}")
+             logger.info("  Continuing without memory service...")
+             memory_service = None  # Continue without memory if it fails
+
+         # ====================================================================
+         # CONTEXT AGGREGATOR & PERSONA STORAGE
+         # ====================================================================
+
+         # Configure user turn aggregation
+         # STT services (Speechmatics, Deepgram) handle turn detection internally
+         user_params = LLMUserAggregatorParams(
+             user_turn_stop_timeout=1.5
+         )
+
+         context_aggregator = LLMContextAggregatorPair(
+             context,
+             user_params=user_params
+         )
+
+         persona_storage = get_persona_storage()
+         persona_storage["persona_params"] = persona_params
+         persona_storage["tars_data"] = tars_data
+         persona_storage["context_aggregator"] = context_aggregator
+
+         # ====================================================================
+         # LOGGING PROCESSORS
+         # ====================================================================
+
+         transcription_observer = TranscriptionObserver(
+             webrtc_connection=webrtc_connection,
+             client_state=client_state
+         )
+         assistant_observer = AssistantResponseObserver(webrtc_connection=webrtc_connection)
+         tts_state_observer = TTSStateObserver(webrtc_connection=webrtc_connection)
+         vision_observer = VisionObserver(webrtc_connection=webrtc_connection)
+         display_events_observer = DisplayEventsObserver(tars_client=tars_client)
+
+         # Create MetricsObserver (non-intrusive monitoring outside pipeline)
+         metrics_observer = MetricsObserver(
+             webrtc_connection=webrtc_connection,
+             stt_service=stt
+         )
+
+         # Turn tracking observer (for debugging turn detection)
+         turn_observer = TurnTrackingObserver()
+
+         @turn_observer.event_handler("on_turn_started")
+         async def on_turn_started(*args, **kwargs):
+             turn_number = args[1] if len(args) > 1 else kwargs.get('turn_number', 0)
+             logger.info(f"🗣️ [TurnObserver] Turn STARTED: {turn_number}")
+             # Notify metrics observer of new turn
+             metrics_observer.start_turn(turn_number)
+
+         @turn_observer.event_handler("on_turn_ended")
+         async def on_turn_ended(*args, **kwargs):
+             turn_number = args[1] if len(args) > 1 else kwargs.get('turn_number', 0)
+             logger.info(f"🗣️ [TurnObserver] Turn ENDED: {turn_number}")
+
+         # ====================================================================
+         # PIPELINE ASSEMBLY
+         # ====================================================================
+
+         logger.info("Creating audio/video pipeline...")
+
+         pipeline = Pipeline([
+             pipecat_transport.input(),
+             # emotional_monitor,  # Real-time emotional state monitoring
+             stt,
+             pipeline_unifier,
+             context_aggregator.user(),
+             memory_service,  # Hybrid memory (70% vector + 30% BM25) for automatic recall/storage
+             # gating_layer,  # AI decision system (with emotional state integration)
+             llm,
+             SilenceFilter(),
+             tts,
+             pipecat_transport.output(),
+             context_aggregator.assistant(),
+         ])
+
+         # ====================================================================
+         # EVENT HANDLERS
+         # ====================================================================
+
+         task_ref = {"task": None}
+
+         @pipecat_transport.event_handler("on_client_connected")
+         async def on_client_connected(transport, client):
+             logger.info("Pipecat Client connected")
+             try:
+                 if webrtc_connection.is_connected():
+                     webrtc_connection.send_app_message({"type": "system", "message": "Connection established"})
+
+                 # Send service configuration info with provider and model details
+                 llm_display = DEEPINFRA_MODEL.split('/')[-1] if '/' in DEEPINFRA_MODEL else DEEPINFRA_MODEL
+
+                 if TTS_PROVIDER == "elevenlabs":
+                     tts_display = "ElevenLabs: eleven_flash_v2_5"
+                 else:
+                     tts_model = QWEN3_TTS_MODEL.split('/')[-1] if '/' in QWEN3_TTS_MODEL else QWEN3_TTS_MODEL
+                     tts_display = f"Qwen3-TTS: {tts_model}"
+
+                 # Format STT provider name for display
+                 stt_display = {
+                     "speechmatics": "Speechmatics",
+                     "deepgram": "Deepgram Nova-2"
+                 }.get(STT_PROVIDER, STT_PROVIDER.capitalize())
+
+                 service_info = {
+                     "stt": stt_display,
+                     "memory": "Hybrid Search (SQLite)",
+                     "llm": f"DeepInfra: {llm_display}",
+                     "tts": tts_display
+                 }
+
+                 # Store in shared state for Gradio UI
+                 metrics_store.set_service_info(service_info)
+
+                 # Send via WebRTC
+                 webrtc_connection.send_app_message({
+                     "type": "service_info",
+                     **service_info
+                 })
+                 logger.info(f"📊 Sent service info to frontend: STT={stt_display}, LLM={llm_display}, TTS={tts_display}")
+             except Exception as e:
+                 logger.error(f"❌ Error sending service info: {e}")
+
+             if task_ref["task"]:
+                 verbosity = persona_params.get("verbosity", 10) if persona_params else 10
+                 intro_instruction = get_introduction_instruction(client_state['client_id'], verbosity)
+
+                 if context and hasattr(context, "messages"):
+                     context.messages.append(intro_instruction)
+
+                 logger.info("Waiting for pipeline to warm up...")
+                 await asyncio.sleep(2.0)
+
+                 logger.info("Queueing initial LLM greeting...")
+                 await task_ref["task"].queue_frames([LLMRunFrame()])
+
+         @pipecat_transport.event_handler("on_client_disconnected")
+         async def on_client_disconnected(transport, client):
+             logger.info("Pipecat Client disconnected")
+             if task_ref["task"]:
+                 await task_ref["task"].cancel()
+             await _cleanup_services(service_refs)
+
+         # ====================================================================
+         # PIPELINE EXECUTION
+         # ====================================================================
+
+         # Enable built-in Pipecat metrics for latency tracking
+         user_bot_latency_observer = UserBotLatencyLogObserver()
+
+         task = PipelineTask(
+             pipeline,
+             params=PipelineParams(
+                 enable_metrics=True,  # Enable performance metrics (TTFB, latency)
+                 enable_usage_metrics=True,  # Enable LLM/TTS usage metrics
+                 report_only_initial_ttfb=False,  # Report all TTFB measurements
+             ),
+             observers=[
+                 turn_observer,
+                 metrics_observer,
+                 transcription_observer,
+                 assistant_observer,
+                 tts_state_observer,
+                 vision_observer,
+                 display_events_observer,  # Send events to TARS display
+                 user_bot_latency_observer,  # Measures total user→bot response time
+             ],  # Non-intrusive monitoring
+         )
+         task_ref["task"] = task
+         runner = PipelineRunner(handle_sigint=False)
+
+         logger.info("Starting pipeline runner...")
+
+         try:
+             await runner.run(task)
+         except Exception:
+             raise
+         finally:
+             await _cleanup_services(service_refs)
+
+     except Exception as e:
+         logger.error(f"Error in bot pipeline: {e}", exc_info=True)
+     finally:
+         await _cleanup_services(service_refs)
config.ini.example ADDED
@@ -0,0 +1,52 @@
+ [LLM]
+ # Available models: Any DeepInfra-supported model
+ # Examples: openai/gpt-oss-20b, meta-llama/Llama-3.3-70B-Instruct-Turbo, meta-llama/Llama-3.2-3B-Instruct
+ model = openai/gpt-oss-20b
+
+ # Gating model for intervention decisions (smaller/faster model recommended)
+ gating_model = meta-llama/Llama-3.2-3B-Instruct
+
+ [STT]
+ # Available providers: speechmatics, deepgram, deepgram-flux
+ # - speechmatics: Speechmatics with SMART_TURN detection
+ # - deepgram: Deepgram Nova-2 with endpoint detection
+ # - deepgram-flux: Deepgram Flux with built-in turn detection (recommended)
+ provider = deepgram-flux
+
+ [TTS]
+ # Available providers: elevenlabs, qwen3
+ provider = qwen3
+
+ # Qwen3-TTS Configuration (only used if provider = qwen3)
+ # Available models: Qwen/Qwen3-TTS-12Hz-0.6B-Base, Qwen/Qwen3-TTS-12Hz-1.7B-Base
+ qwen3_model = Qwen/Qwen3-TTS-12Hz-0.6B-Base
+ # Available devices: mps (Mac), cuda (NVIDIA), cpu
+ qwen3_device = mps
+ # Reference audio file for voice cloning (relative to project root)
+ qwen3_ref_audio = assets/audio/tars-clean-compressed.mp3
+
+ [Emotional]
+ # Enable real-time emotional state monitoring via video
+ enabled = true
+ # How often to sample video frames (in seconds)
+ sampling_interval = 3.0
+ # How many consecutive negative states before intervention
+ intervention_threshold = 2
+
+ [Connection]
+ # Transport mode: "robot" (aiortc WebRTC to RPi) or "browser" (SmallWebRTC for browser)
+ mode = robot
+ # Raspberry Pi WebRTC server URL (Tailscale or local network IP)
+ rpi_url = http://100.115.193.41:8001
+ # Auto-connect to RPi on startup (only for robot mode)
+ auto_connect = true
+ # Delay between reconnection attempts (seconds)
+ reconnect_delay = 5
+ # Maximum reconnection attempts (0 = infinite)
+ max_reconnect_attempts = 0
+
+ [Display]
+ # Enable TARS Raspberry Pi display integration (HTTP commands)
+ enabled = true
+ # URL of TARS display API (Tailscale or local network IP)
+ tars_url = http://100.115.193.41:8001
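
The `reconnect_delay` and `max_reconnect_attempts` settings above imply a retry loop; a minimal sketch of how they might be consumed (the `connect` coroutine is a hypothetical placeholder for the app's actual connection logic):

```python
import asyncio

async def connect_with_retries(connect, reconnect_delay=5, max_reconnect_attempts=0):
    """Retry connect() per the [Connection] settings; 0 attempts means retry forever."""
    attempt = 0
    while True:
        try:
            return await connect()
        except ConnectionError:
            attempt += 1
            if max_reconnect_attempts and attempt >= max_reconnect_attempts:
                raise  # exhausted the configured attempts
            await asyncio.sleep(reconnect_delay)
```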
docs/DAEMON_INTEGRATION.md ADDED
@@ -0,0 +1,393 @@
+ # Daemon Dashboard Integration
+
+ Guide for integrating tars-conversation-app with tars-daemon dashboard app management.
+
+ ## Overview
+
+ The tars-daemon dashboard should provide install/uninstall buttons for managing TARS apps like this one.
+
+ ## App Discovery
+
+ The daemon scans for apps with `app.json` manifest files:
+
+ ```python
+ import json
+ from pathlib import Path
+
+ def discover_apps(apps_directory="/home/mac/tars-apps"):
+     """Discover all TARS apps with manifests"""
+     apps = []
+     apps_dir = Path(apps_directory)
+
+     for app_path in apps_dir.iterdir():
+         manifest_path = app_path / "app.json"
+         if manifest_path.exists():
+             with open(manifest_path) as f:
+                 manifest = json.load(f)
+             apps.append({
+                 "path": str(app_path),
+                 "manifest": manifest,
+                 "installed": (app_path / "venv").exists()
+             })
+
+     return apps
+ ```
+
+ ## Installation Flow
+
+ When the user clicks the "Install" button:
+
+ ```python
+ import json
+ import subprocess
+ from pathlib import Path
+
+ def install_app(app_path):
+     """Install a TARS app"""
+     app_dir = Path(app_path)
+     manifest_path = app_dir / "app.json"
+
+     # Read manifest
+     with open(manifest_path) as f:
+         manifest = json.load(f)
+
+     # Get install script
+     install_script = manifest.get("install_script", "install.sh")
+     script_path = app_dir / install_script
+
+     if not script_path.exists():
+         raise FileNotFoundError(f"Install script not found: {script_path}")
+
+     # Run installation
+     result = subprocess.run(
+         ["bash", str(script_path)],
+         cwd=str(app_dir),
+         capture_output=True,
+         text=True
+     )
+
+     return {
+         "success": result.returncode == 0,
+         "stdout": result.stdout,
+         "stderr": result.stderr
+     }
+ ```
+
+ ## Uninstallation Flow
+
+ When the user clicks the "Uninstall" button:
+
+ ```python
+ def uninstall_app(app_path):
+     """Uninstall a TARS app"""
+     app_dir = Path(app_path)
+     manifest_path = app_dir / "app.json"
+
+     # Read manifest
+     with open(manifest_path) as f:
+         manifest = json.load(f)
+
+     # Get uninstall script
+     uninstall_script = manifest.get("uninstall_script", "uninstall.sh")
+     script_path = app_dir / uninstall_script
+
+     if not script_path.exists():
+         raise FileNotFoundError(f"Uninstall script not found: {script_path}")
+
+     # Run uninstallation
+     result = subprocess.run(
+         ["bash", str(script_path)],
+         cwd=str(app_dir),
+         capture_output=True,
+         text=True
+     )
+
+     return {
+         "success": result.returncode == 0,
+         "stdout": result.stdout,
+         "stderr": result.stderr
+     }
+ ```
+
+ ## Dashboard UI (Gradio Example)
+
+ ```python
+ import gradio as gr
+ from pathlib import Path
+
+ def get_app_status(app_path):
+     """Check if app is installed"""
+     return (Path(app_path) / "venv").exists()
+
+ def create_app_tab():
+     """Create app management tab in dashboard"""
+
+     # Discover apps
+     apps = discover_apps()
+
+     with gr.Tab("Apps"):
+         for app in apps:
+             manifest = app["manifest"]
+
+             with gr.Row():
+                 gr.Markdown(f"### {manifest['name']}")
+                 gr.Markdown(manifest.get("description", ""))
+
+             with gr.Row():
+                 gr.Markdown(f"**Version:** {manifest.get('version', 'unknown')}")
+                 status = "Installed" if app["installed"] else "Not Installed"
+                 gr.Markdown(f"**Status:** {status}")
+
+             with gr.Row():
+                 install_btn = gr.Button(
+                     "Install",
+                     visible=not app["installed"]
+                 )
+                 uninstall_btn = gr.Button(
+                     "Uninstall",
+                     visible=app["installed"]
+                 )
+             output = gr.Textbox(
+                 label="Output",
+                 lines=5,
+                 max_lines=10
+             )
+
+             # Install handler
+             install_btn.click(
+                 fn=lambda path=app["path"]: install_app(path),
+                 outputs=output
+             )
+
+             # Uninstall handler
+             uninstall_btn.click(
+                 fn=lambda path=app["path"]: uninstall_app(path),
+                 outputs=output
+             )
+
+             gr.Markdown("---")
+
+ # Add to dashboard
+ with gr.Blocks() as dashboard:
+     create_app_tab()
+
+ dashboard.launch()
+ ```
+
+ ## Recommended Directory Structure
+
+ ```
+ /home/mac/
+ ├── tars-daemon/                 # Main daemon
+ │   ├── tars_daemon.py
+ │   ├── dashboard.py             # Gradio dashboard with app management
+ │   └── app_manager.py           # App discovery and management
+ │
+ └── tars-apps/                   # Apps directory
+     ├── tars-conversation-app/
+     │   ├── app.json             # Manifest
+     │   ├── install.sh           # Install script
+     │   ├── uninstall.sh         # Uninstall script
+     │   └── ...
+     │
+     └── another-app/
+         ├── app.json
+         └── ...
+ ```
+
+ ## Environment Variables
+
+ Apps should auto-detect deployment:
+
+ ```python
+ import os
+
+ # In app configuration
+ def get_grpc_address():
+     """Auto-detect if running on Pi or remotely"""
+     # Check if on Raspberry Pi
+     try:
+         with open("/proc/cpuinfo") as f:
+             if "Raspberry Pi" in f.read():
+                 return "localhost:50051"  # Local daemon
+     except OSError:
+         pass
+
+     # Remote connection
+     return os.getenv("RPI_GRPC", "100.84.133.74:50051")
+ ```
+
+ ## Installation Validation
+
+ The daemon should validate before installation:
+
+ ```python
+ def validate_app(app_path):
+     """Validate app before installation"""
+     app_dir = Path(app_path)
+     errors = []
+
+     # Check manifest exists
+     manifest_path = app_dir / "app.json"
+     if not manifest_path.exists():
+         errors.append("Missing app.json manifest")
+         return errors
+
+     # Read manifest
+     with open(manifest_path) as f:
+         manifest = json.load(f)
+
+     # Check required fields
+     required = ["name", "version", "install_script"]
+     for field in required:
+         if field not in manifest:
+             errors.append(f"Missing required field: {field}")
+
+     # Check scripts exist
+     install_script = app_dir / manifest.get("install_script", "install.sh")
+     if not install_script.exists():
+         errors.append(f"Install script not found: {install_script}")
+
+     # Check Python version
+     if "dependencies" in manifest:
+         py_version = manifest["dependencies"].get("python", "")
+         if py_version:
+             # Validate version string format
+             import re
+             if not re.match(r">=?\d+\.\d+", py_version):
+                 errors.append(f"Invalid Python version: {py_version}")
+
+     return errors
+ ```
+
+ ## Running Apps
+
+ After installation, provide run buttons:
+
+ ```python
+ def run_app(app_path, mode="robot"):
+     """Run an installed app"""
+     app_dir = Path(app_path)
+     manifest_path = app_dir / "app.json"
+
+     with open(manifest_path) as f:
+         manifest = json.load(f)
+
+     # Get command for mode
+     modes = manifest.get("modes", [])
+     command = None
+
+     for m in modes:
+         if m["name"] == mode:
+             command = m["command"]
+             break
+
+     if not command:
+         # Fall back to main
+         command = f"python {manifest['main']}"
+
+     # Activate venv and run
+     venv_python = app_dir / "venv" / "bin" / "python"
+
+     subprocess.Popen(
+         [str(venv_python)] + command.split()[1:],
+         cwd=str(app_dir)
+     )
+ ```
+
+ ## Security Considerations
+
+ 1. **Script Validation** - Verify scripts don't contain malicious commands (a naive screening sketch follows)
+ 2. **Sandboxing** - Consider running installations in containers
+ 3. **User Permissions** - Require confirmation before installation
+ 4. **API Keys** - Warn users to configure API keys before running
+
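A naive screening pass for item 1 might look like the following; treat the patterns as a starting heuristic, not a substitute for sandboxing:

```python
import re

# Heuristic red flags for an install script; extend as needed.
SUSPICIOUS = [r"rm\s+-rf\s+/", r"curl[^|\n]*\|\s*(ba)?sh", r"dd\s+if=", r"mkfs"]

def script_looks_safe(script_path) -> bool:
    """Return False if the script matches any known-dangerous pattern."""
    with open(script_path) as f:
        text = f.read()
    return not any(re.search(pattern, text) for pattern in SUSPICIOUS)
```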
+ ## Example Dashboard Integration
+
+ ```python
+ # In tars-daemon/dashboard.py
+
+ import gradio as gr
+ from app_manager import discover_apps, install_app, uninstall_app
+
+ def create_dashboard():
+     with gr.Blocks() as dashboard:
+         gr.Markdown("# TARS Daemon Dashboard")
+
+         with gr.Tabs():
+             # Hardware tab
+             with gr.Tab("Hardware"):
+                 gr.Markdown("Robot hardware controls...")
+
+             # Apps tab
+             with gr.Tab("Apps"):
+                 apps = discover_apps("/home/mac/tars-apps")
+
+                 for app in apps:
+                     manifest = app["manifest"]
+
+                     with gr.Accordion(manifest["name"], open=False):
+                         gr.Markdown(manifest.get("description", ""))
+                         gr.JSON(manifest, label="Manifest")
+
+                         with gr.Row():
+                             install_btn = gr.Button(
+                                 "Install",
+                                 visible=not app["installed"]
+                             )
+                             uninstall_btn = gr.Button(
+                                 "Uninstall",
+                                 visible=app["installed"]
+                             )
+                             run_btn = gr.Button(
+                                 "Run",
+                                 visible=app["installed"]
+                             )
+
+                         output = gr.Textbox(label="Output", lines=10)
+
+                         # Event handlers
+                         install_btn.click(
+                             fn=lambda p=app["path"]: install_app(p),
+                             outputs=output
+                         ).then(
+                             fn=lambda: gr.update(visible=False),
+                             outputs=install_btn
+                         ).then(
+                             fn=lambda: gr.update(visible=True),
+                             outputs=[uninstall_btn, run_btn]
+                         )
+
+             # Logs tab
+             with gr.Tab("Logs"):
+                 gr.Markdown("System logs...")
+
+     return dashboard
+
+ if __name__ == "__main__":
+     dashboard = create_dashboard()
+     dashboard.launch(server_name="0.0.0.0", server_port=7860)
+ ```
+
+ ## Testing Installation
+
+ From the Pi:
+
+ ```bash
+ # Test install
+ cd ~/tars-apps/tars-conversation-app
+ bash install.sh
+
+ # Verify
+ ls -la venv/
+ source venv/bin/activate
+ python -c "import pipecat; print('OK')"
+
+ # Test uninstall
+ bash uninstall.sh
+ ```
+
+ ## Next Steps
+
+ 1. Implement app discovery in tars-daemon
+ 2. Add Apps tab to dashboard
+ 3. Create app_manager.py module
+ 4. Test with tars-conversation-app
+ 5. Document for other developers
docs/DASHBOARD_UPDATE_SUMMARY.md ADDED
@@ -0,0 +1,218 @@
1
+ # Dashboard Update Summary
2
+
3
+ App management functionality added to tars-daemon dashboard.
4
+
5
+ ## Changes Made
6
+
7
+ ### Backend (tars-daemon)
8
+
9
+ **File: dashboard/backend/routes/apps.py**
10
+ - Implemented app discovery via app.json manifests (sketch below)
11
+ - Scans ~/tars-apps/ directory for apps
12
+ - Install using install.sh script
13
+ - Uninstall using uninstall.sh script
14
+ - Status detection via venv/ directory
15
+ - Start/stop app processes
16
+ - Logs endpoint
17
+
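+ A minimal sketch of how manifest-based discovery could work (the function and field names are assumptions for illustration, not the actual route code):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ def discover_apps(apps_dir="~/tars-apps"):
+     """Scan for directories that contain an app.json manifest."""
+     apps = []
+     for app_dir in sorted(Path(apps_dir).expanduser().iterdir()):
+         manifest_path = app_dir / "app.json"
+         if not manifest_path.is_file():
+             continue
+         with open(manifest_path) as f:
+             manifest = json.load(f)
+         apps.append({
+             "path": str(app_dir),
+             "manifest": manifest,
+             # Installed status is inferred from the venv/ directory
+             "installed": (app_dir / "venv").is_dir(),
+         })
+     return apps
+ ```
+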
18
+ **File: dashboard/backend/routes/__init__.py**
19
+ - Added apps module to imports and exports
20
+
21
+ **File: dashboard/backend/server.py**
22
+ - Added apps router at /api/apps/*
23
+
24
+ ### Frontend (tars-daemon)
25
+
26
+ **File: dashboard/frontend/src/pages/AppStore.jsx**
27
+ - Complete rewrite with functional UI
28
+ - Install/Uninstall buttons
29
+ - Start/Stop controls
30
+ - Real-time status updates (5s polling)
31
+ - Loading states and error handling
32
+ - Success/error alerts
33
+
34
+ **File: dashboard/frontend/src/components/ui/badge.jsx**
35
+ - New component for status badges
36
+
37
+ **File: dashboard/frontend/src/components/ui/alert.jsx**
38
+ - New component for notifications
39
+
40
+ ### App Setup
41
+
42
+ **Location: ~/tars-apps/tars-conversation-app/**
43
+ - Copied from Mac to Pi
44
+ - Contains app.json manifest
45
+ - Has install.sh and uninstall.sh scripts
46
+
47
+ ## API Endpoints
48
+
49
+ ```
50
+ GET /api/apps/list - List all apps with status
51
+ POST /api/apps/install - Install app using install.sh
52
+ POST /api/apps/uninstall - Uninstall app using uninstall.sh
53
+ POST /api/apps/start - Start app process
54
+ POST /api/apps/stop - Stop app process
55
+ GET /api/apps/logs/{name} - Get app logs
56
+ ```
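+
+ For quick testing, the endpoints can be exercised from Python (a sketch using `requests`; the payload field name is an assumption based on the endpoint list above):
+
+ ```python
+ import requests
+
+ BASE = "http://100.84.133.74:8000/api/apps"
+
+ # List all discovered apps and their install status
+ print(requests.get(f"{BASE}/list").json())
+
+ # Trigger an install by app name
+ resp = requests.post(f"{BASE}/install", json={"name": "tars-conversation-app"})
+ print(resp.json())
+ ```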
57
+
58
+ ## Testing
59
+
60
+ ### 1. Restart Dashboard
61
+
62
+ ```bash
63
+ ssh tars-pi
64
+ cd ~/tars-daemon
65
+ pkill -f start_dashboard.py
66
+ venv/bin/python start_dashboard.py
67
+ ```
68
+
69
+ Or use systemd if configured:
70
+ ```bash
71
+ sudo systemctl restart tars-dashboard
72
+ ```
73
+
74
+ ### 2. Verify Backend
75
+
76
+ ```bash
77
+ # Test app discovery
78
+ curl http://100.84.133.74:8000/api/apps/list
79
+
80
+ # Should return JSON with tars-conversation-app
81
+ ```
82
+
83
+ ### 3. Open Dashboard
84
+
85
+ Navigate to: http://100.84.133.74:8000
86
+
87
+ Click on "Apps" or "App Store" tab (depending on navigation)
88
+
89
+ ### 4. Test Installation
90
+
91
+ 1. Click "Install" button for tars-conversation-app
92
+ 2. Wait for installation (may take 5-10 minutes)
93
+ 3. Status should change to "Installed"
94
+ 4. Start/Stop buttons should appear
95
+
96
+ ### 5. Test Uninstallation
97
+
98
+ 1. Stop app if running
99
+ 2. Click uninstall button (trash icon)
100
+ 3. Confirm in the alert dialog
101
+ 4. Status returns to not installed
102
+
103
+ ## Expected Behavior
104
+
105
+ ### App Card Display
106
+
107
+ ```
108
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
109
+ β”‚ tars-conversation-app [Installed]β”‚
110
+ β”‚ Real-time conversational AI... β”‚
111
+ β”‚ β”‚
112
+ β”‚ Version: 1.0.0 β”‚
113
+ β”‚ Author: TARS Project β”‚
114
+ β”‚ β”‚
115
+ β”‚ [Start] [πŸ—‘οΈ] β”‚
116
+ β”‚ ~/tars-apps/tars-conversation-app β”‚
117
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
118
+ ```
119
+
120
+ When installing:
121
+ ```
122
+ [Installing...] (spinner)
123
+ ```
124
+
125
+ When running:
126
+ ```
127
+ [Stop] [πŸ—‘οΈ]
128
+ ```
129
+
130
+ ## Troubleshooting
131
+
132
+ ### Dashboard won't start
133
+
134
+ Check logs:
135
+ ```bash
136
+ tail -50 /tmp/dashboard.log
137
+ ```
138
+
139
+ Common issues:
140
+ - Missing fastapi: `pip install fastapi uvicorn`
141
+ - Import errors: Check routes/__init__.py includes apps
142
+ - Port 8000 in use: `lsof -i :8000`
143
+
144
+ ### Apps not discovered
145
+
146
+ Check:
147
+ ```bash
148
+ ls -la ~/tars-apps/tars-conversation-app/app.json
149
+ ```
150
+
151
+ Verify manifest:
152
+ ```bash
153
+ cat ~/tars-apps/tars-conversation-app/app.json | python3 -m json.tool
154
+ ```
155
+
156
+ ### Installation fails
157
+
158
+ Check install script:
159
+ ```bash
160
+ bash ~/tars-apps/tars-conversation-app/install.sh
161
+ ```
162
+
163
+ Check logs in dashboard after clicking install button.
164
+
165
+ ### Frontend not updated
166
+
167
+ Rebuild:
168
+ ```bash
169
+ cd ~/tars-daemon/dashboard/frontend
170
+ npm run build
171
+ ```
172
+
173
+ Hard refresh browser: Ctrl+Shift+R
174
+
175
+ ## File Locations
176
+
177
+ ```
178
+ tars-daemon/
179
+ β”œβ”€β”€ dashboard/
180
+ β”‚ β”œβ”€β”€ backend/
181
+ β”‚ β”‚ β”œβ”€β”€ server.py # Updated: added apps router
182
+ β”‚ β”‚ └── routes/
183
+ β”‚ β”‚ β”œβ”€β”€ __init__.py # Updated: export apps
184
+ β”‚ β”‚ └── apps.py # NEW: app management
185
+ β”‚ └── frontend/
186
+ β”‚ └── src/
187
+ β”‚ β”œβ”€β”€ pages/
188
+ β”‚ β”‚ └── AppStore.jsx # Updated: full UI
189
+ β”‚ └── components/ui/
190
+ β”‚ β”œβ”€β”€ badge.jsx # NEW
191
+ β”‚ └── alert.jsx # NEW
192
+ β”‚
193
+ tars-apps/
194
+ └── tars-conversation-app/
195
+ β”œβ”€β”€ app.json # Manifest
196
+ β”œβ”€β”€ install.sh # Installation script
197
+ β”œβ”€β”€ uninstall.sh # Uninstall script
198
+ └── ... # App files
199
+ ```
200
+
201
+ ## Next Steps
202
+
203
+ 1. Restart dashboard on Pi
204
+ 2. Test in browser
205
+ 3. Install tars-conversation-app via UI
206
+ 4. Verify installation works
207
+ 5. Add more apps to ~/tars-apps/ as needed
208
+
209
+ ## Adding More Apps
210
+
211
+ To add new apps:
212
+
213
+ 1. Create app in ~/tars-apps/
214
+ 2. Add app.json manifest (see docs/DEVELOPING_APPS.md)
215
+ 3. Create install.sh and uninstall.sh
216
+ 4. Refresh dashboard - app appears automatically
217
+
218
+ No code changes needed for new apps.
docs/DEVELOPING_APPS.md ADDED
@@ -0,0 +1,400 @@
1
+ # Developing Apps with TARS SDK
2
+
3
+ Guide for creating TARS-compatible applications that integrate with the tars-daemon.
4
+
5
+ ## Architecture Overview
6
+
7
+ TARS apps connect to the tars-daemon running on Raspberry Pi:
8
+
9
+ ```
10
+ [Your App] ←→ gRPC (50051) ←→ [tars-daemon] ←→ [Hardware]
11
+ β”œβ”€ Motors
12
+ β”œβ”€ Camera
13
+ └─ Display
14
+ ```
15
+
16
+ ## App Structure
17
+
18
+ ### Minimal Structure
19
+
20
+ ```
21
+ your-app/
22
+ β”œβ”€β”€ app.json # App manifest (required)
23
+ β”œβ”€β”€ requirements.txt # Python dependencies
24
+ β”œβ”€β”€ config.ini.example # Configuration template
25
+ β”œβ”€β”€ env.example # Environment variables template
26
+ β”œβ”€β”€ install.sh # Installation script
27
+ β”œβ”€β”€ uninstall.sh # Cleanup script
28
+ β”œβ”€β”€ main.py # Entry point
29
+ └── README.md # Documentation
30
+ ```
31
+
32
+ ## App Manifest (app.json)
33
+
34
+ Required file for daemon dashboard integration:
35
+
36
+ ```json
37
+ {
38
+ "name": "tars-conversation-app",
39
+ "version": "1.0.0",
40
+ "description": "Real-time conversational AI with WebRTC",
41
+ "author": "Your Name",
42
+ "repository": "https://github.com/yourusername/your-app.git",
43
+ "main": "tars_bot.py",
44
+ "install_script": "install.sh",
45
+ "uninstall_script": "uninstall.sh",
46
+ "dependencies": {
47
+ "python": ">=3.10",
48
+ "system": ["portaudio19-dev", "ffmpeg"]
49
+ },
50
+ "environment": [
51
+ "DEEPINFRA_API_KEY",
52
+ "SPEECHMATICS_API_KEY"
53
+ ],
54
+ "configuration": {
55
+ "file": "config.ini",
56
+ "example": "config.ini.example"
57
+ },
58
+ "ports": {
59
+ "grpc": 50051,
60
+ "http": 8765
61
+ }
62
+ }
63
+ ```
64
+
65
+ ## Configuration System
66
+
67
+ ### Environment Variables (.env.local)
68
+
69
+ Store secrets only, never commit:
70
+
71
+ ```bash
72
+ # API Keys
73
+ DEEPINFRA_API_KEY=your_key_here
74
+ SPEECHMATICS_API_KEY=your_key_here
75
+ ELEVENLABS_API_KEY=your_key_here
76
+ ```
77
+
78
+ ### User Configuration (config.ini)
79
+
80
+ Runtime settings users can modify:
81
+
82
+ ```ini
83
+ [Connection]
84
+ mode = robot
85
+ rpi_url = http://100.84.133.74:8765
86
+ rpi_grpc = 100.84.133.74:50051
87
+ auto_connect = false
88
+
89
+ [LLM]
90
+ model = openai/gpt-oss-20b
91
+ gating_model = meta-llama/Llama-3.2-3B-Instruct
92
+ ```
93
+
94
+ ### Loading Configuration
95
+
96
+ ```python
97
+ from pathlib import Path
98
+ from configparser import ConfigParser
99
+ from dotenv import load_dotenv
100
+ import os
101
+
102
+ # Load secrets
103
+ env_local = Path(__file__).parent / ".env.local"
104
+ load_dotenv(env_local, override=True)
105
+
106
+ # Load config
107
+ config = ConfigParser()
108
+ config.read("config.ini")
109
+
110
+ # Runtime reload without restart
111
+ def get_fresh_config():
112
+ config = ConfigParser()
113
+ config.read("config.ini")
114
+ return config
115
+ ```
116
+
117
+ ## Connecting to tars-daemon
118
+
119
+ ### gRPC Client
120
+
121
+ ```python
122
+ import os
+ import grpc
123
+ from tars_sdk import TarsClient
124
+
125
+ # Singleton client
126
+ _client = None
127
+
128
+ def get_tars_client():
129
+ global _client
130
+ if _client is None:
131
+ grpc_address = os.getenv("RPI_GRPC", "100.84.133.74:50051")
132
+ channel = grpc.insecure_channel(grpc_address)
133
+ _client = TarsClient(channel)
134
+ return _client
135
+
136
+ # Use the client
137
+ client = get_tars_client()
138
+ client.execute_movement("wave_right")
139
+ client.set_emotion("happy")
140
+ ```
141
+
142
+ ### Deployment Mode Detection
143
+
144
+ Auto-detect if running locally on Pi or remotely:
145
+
146
+ ```python
147
+ def detect_deployment_mode():
148
+ # Check if running on Raspberry Pi
149
+ try:
150
+ with open("/proc/cpuinfo", "r") as f:
151
+ if "Raspberry Pi" in f.read():
152
+ return "local"
153
+ except FileNotFoundError:
154
+ pass
155
+
156
+ # Check if daemon running on localhost
157
+ try:
158
+ import grpc
159
+ channel = grpc.insecure_channel("localhost:50051")
160
+ grpc.channel_ready_future(channel).result(timeout=1)
161
+ return "local"
162
+ except Exception:
163
+ return "remote"
164
+
165
+ def get_grpc_address():
166
+ if detect_deployment_mode() == "local":
167
+ return "localhost:50051"
168
+ return os.getenv("RPI_GRPC", "100.84.133.74:50051")
169
+ ```
170
+
171
+ ## Installation Scripts
172
+
173
+ ### install.sh
174
+
175
+ ```bash
176
+ #!/bin/bash
177
+ set -e
178
+
179
+ APP_NAME="your-app"
180
+ APP_DIR="$HOME/$APP_NAME"
181
+
182
+ echo "Installing $APP_NAME..."
183
+
184
+ # Check Python version
185
+ python3 --version | grep -q "3.1[0-9]" || {
186
+ echo "Error: Python 3.10+ required"
187
+ exit 1
188
+ }
189
+
190
+ # Install system dependencies
191
+ sudo apt-get update
192
+ sudo apt-get install -y portaudio19-dev ffmpeg
193
+
194
+ # Create virtual environment
195
+ python3 -m venv "$APP_DIR/venv"
196
+ source "$APP_DIR/venv/bin/activate"
197
+
198
+ # Install Python dependencies
199
+ pip install --upgrade pip
200
+ pip install -r requirements.txt
201
+
202
+ # Setup configuration
203
+ if [ ! -f config.ini ]; then
204
+ cp config.ini.example config.ini
205
+ echo "Created config.ini - please configure before running"
206
+ fi
207
+
208
+ if [ ! -f .env.local ]; then
209
+ cp env.example .env.local
210
+ echo "Created .env.local - please add API keys"
211
+ fi
212
+
213
+ echo "Installation complete!"
214
+ echo "Next steps:"
215
+ echo "1. Edit .env.local with your API keys"
216
+ echo "2. Edit config.ini if needed"
217
+ echo "3. Run: python main.py"
218
+ ```
219
+
220
+ ### uninstall.sh
221
+
222
+ ```bash
223
+ #!/bin/bash
224
+ set -e
225
+
226
+ APP_NAME="your-app"
227
+ APP_DIR="$HOME/$APP_NAME"
228
+
229
+ echo "Uninstalling $APP_NAME..."
230
+
231
+ # Stop running processes
232
+ pkill -f "python.*$APP_NAME" || true
233
+
234
+ # Remove virtual environment
235
+ rm -rf "$APP_DIR/venv"
236
+
237
+ # Remove generated data (optional)
238
+ read -p "Remove data directories? (y/N) " -n 1 -r
239
+ echo
240
+ if [[ $REPLY =~ ^[Yy]$ ]]; then
241
+ rm -rf "$APP_DIR/chroma_memory" "$APP_DIR/memory_data"
242
+ fi
243
+
244
+ echo "Uninstall complete!"
245
+ ```
246
+
247
+ ## Best Practices
248
+
249
+ ### 1. Project Structure
250
+
251
+ - Keep source code in `src/` directory
252
+ - Separate configuration from code
253
+ - Provide example configs (never commit secrets)
254
+ - Include tests in `tests/` directory
255
+
256
+ ### 2. Configuration
257
+
258
+ - Use `.env.local` for secrets (gitignore it)
259
+ - Use `config.ini` for user settings (gitignore it)
260
+ - Provide `.example` templates
261
+ - Support runtime config reload when possible
262
+
263
+ ### 3. Dependencies
264
+
265
+ - Pin major versions in requirements.txt
266
+ - Document system dependencies in README
267
+ - Test on fresh Pi OS installation
268
+ - Keep dependencies minimal
269
+
270
+ ### 4. Error Handling
271
+
272
+ - Validate configuration on startup
273
+ - Provide clear error messages
274
+ - Test connection to daemon before running
275
+ - Graceful degradation if hardware unavailable
276
+
277
+ ### 5. Performance
278
+
279
+ - Use gRPC for low-latency commands (~5-10ms)
280
+ - Batch operations when possible
281
+ - Monitor resource usage on Pi
282
+ - Optimize for Raspberry Pi 4 (4GB RAM)
283
+
284
+ ### 6. Testing
285
+
286
+ - Test on actual hardware
287
+ - Provide test scripts for gestures/expressions
288
+ - Document expected behavior
289
+ - Include connection tests
290
+
291
+ ## Example: Minimal TARS App
292
+
293
+ ```python
294
+ # main.py
295
+ import grpc
296
+ from tars_sdk import TarsClient
297
+ from pathlib import Path
298
+ from dotenv import load_dotenv
299
+ import os
300
+
301
+ # Load configuration
302
+ load_dotenv(Path(__file__).parent / ".env.local")
303
+
304
+ # Connect to daemon
305
+ grpc_address = os.getenv("RPI_GRPC", "100.84.133.74:50051")
306
+ channel = grpc.insecure_channel(grpc_address)
307
+ client = TarsClient(channel)
308
+
309
+ # Test connection
310
+ try:
311
+ status = client.get_robot_status()
312
+ print(f"Connected to TARS: {status}")
313
+ except Exception as e:
314
+ print(f"Connection failed: {e}")
315
+ exit(1)
316
+
317
+ # Use robot
318
+ client.set_emotion("happy")
319
+ client.execute_movement("wave_right")
320
+ print("TARS says hello!")
321
+ ```
322
+
323
+ ## Integration with Claude Code
324
+
325
+ Structure your app for easy AI-assisted development:
326
+
327
+ 1. **Clear directory structure** - AI can navigate easily
328
+ 2. **Documented configuration** - AI understands settings
329
+ 3. **Type hints** - AI provides better suggestions
330
+ 4. **Docstrings** - AI understands intent
331
+ 5. **README.md** - AI reads project context
332
+
333
+ See CLAUDE.md for project-specific guidelines.
334
+
335
+ ## Common Patterns
336
+
337
+ ### Startup Validation
338
+
339
+ ```python
340
+ def validate_startup():
341
+ """Check all requirements before running"""
342
+ errors = []
343
+
344
+ # Check API keys
345
+ if not os.getenv("DEEPINFRA_API_KEY"):
346
+ errors.append("Missing DEEPINFRA_API_KEY in .env.local")
347
+
348
+ # Check config file
349
+ if not Path("config.ini").exists():
350
+ errors.append("config.ini not found")
351
+
352
+ # Test daemon connection
353
+ try:
354
+ client = get_tars_client()
355
+ client.get_robot_status()
356
+ except Exception as e:
357
+ errors.append(f"Cannot connect to daemon: {e}")
358
+
359
+ if errors:
360
+ print("Startup validation failed:")
361
+ for error in errors:
362
+ print(f" - {error}")
363
+ exit(1)
364
+ ```
365
+
366
+ ### Graceful Shutdown
367
+
368
+ ```python
369
+ import signal
370
+ import sys
371
+
372
+ def signal_handler(sig, frame):
373
+ """Clean shutdown on Ctrl+C"""
374
+ print("\nShutting down...")
375
+
376
+ # Reset robot state
377
+ try:
378
+ client = get_tars_client()
379
+ client.set_emotion("neutral")
380
+ client.set_eye_state(True, True)
381
+ except Exception:
382
+ pass
383
+
384
+ sys.exit(0)
385
+
386
+ signal.signal(signal.SIGINT, signal_handler)
387
+ ```
388
+
389
+ ## Resources
390
+
391
+ - tars-daemon: `~/tars-daemon` on Pi
392
+ - TARS SDK: install with `pip install tars-sdk`
393
+ - Example Apps: This repository (tars-conversation-app)
394
+ - Pi Access: `ssh tars-pi` (100.84.133.74)
395
+
396
+ ## Support
397
+
398
+ - Check daemon status: `systemctl status tars-daemon`
399
+ - View daemon logs: `journalctl -u tars-daemon -f`
400
+ - Test gRPC connection: `grpcurl -plaintext 100.84.133.74:50051 list`
docs/INSTALLATION_GUIDE.md ADDED
@@ -0,0 +1,264 @@
1
+ # Installation Guide
2
+
3
+ Quick reference for installing tars-conversation-app on Raspberry Pi.
4
+
5
+ ## Prerequisites
6
+
7
+ - Raspberry Pi 4 (4GB RAM recommended)
8
+ - Raspberry Pi OS (Bullseye or later)
9
+ - Python 3.10 or higher
10
+ - Internet connection
11
+
12
+ ## From Dashboard (Recommended)
13
+
14
+ Once tars-daemon implements app management:
15
+
16
+ 1. Open tars-daemon dashboard at `http://100.84.133.74:7860`
17
+ 2. Navigate to "Apps" tab
18
+ 3. Find "tars-conversation-app"
19
+ 4. Click "Install" button
20
+ 5. Wait for installation to complete
21
+ 6. Configure API keys in `.env.local`
22
+ 7. Adjust settings in `config.ini` if needed
23
+ 8. Click "Run" to start
24
+
25
+ ## Manual Installation (SSH)
26
+
27
+ ### Step 1: Clone Repository
28
+
29
+ ```bash
30
+ ssh tars-pi
31
+ cd ~
32
+ git clone https://github.com/latishab/tars-conversation-app.git
33
+ cd tars-conversation-app
34
+ ```
35
+
36
+ ### Step 2: Run Installer
37
+
38
+ ```bash
39
+ bash install.sh
40
+ ```
41
+
42
+ The installer will:
43
+ - Check Python version (requires 3.10+)
44
+ - Install system dependencies (portaudio, ffmpeg)
45
+ - Create Python virtual environment
46
+ - Install all Python packages
47
+ - Create config files from templates
48
+
49
+ This takes 5-10 minutes on first run.
50
+
51
+ ### Step 3: Configure
52
+
53
+ Edit API keys:
54
+ ```bash
55
+ nano .env.local
56
+ ```
57
+
58
+ Add your keys:
59
+ ```bash
60
+ DEEPINFRA_API_KEY=your_key_here
61
+ SPEECHMATICS_API_KEY=your_key_here
62
+ # or
63
+ DEEPGRAM_API_KEY=your_key_here
64
+ ```
65
+
66
+ Edit settings (optional):
67
+ ```bash
68
+ nano config.ini
69
+ ```
70
+
71
+ ### Step 4: Run
72
+
73
+ Activate virtual environment:
74
+ ```bash
75
+ source venv/bin/activate
76
+ ```
77
+
78
+ Run in robot mode:
79
+ ```bash
80
+ python tars_bot.py
81
+ ```
82
+
83
+ Or run dashboard:
84
+ ```bash
85
+ python ui/app.py
86
+ ```
87
+
88
+ ## Verification
89
+
90
+ Check installation:
91
+ ```bash
92
+ # Activate venv
93
+ source ~/tars-conversation-app/venv/bin/activate
94
+
95
+ # Test imports
96
+ python -c "import pipecat; print('Pipecat OK')"
97
+ python -c "from tars_sdk import TarsClient; print('TARS SDK OK')"
98
+
99
+ # Test daemon connection
100
+ python -c "
101
+ import grpc
102
+ from tars_sdk import TarsClient
103
+ channel = grpc.insecure_channel('localhost:50051')
104
+ client = TarsClient(channel)
105
+ print('Daemon connection OK')
106
+ "
107
+ ```
108
+
109
+ ## Uninstallation
110
+
111
+ From dashboard:
112
+ 1. Navigate to "Apps" tab
113
+ 2. Find "tars-conversation-app"
114
+ 3. Click "Uninstall" button
115
+ 4. Choose whether to keep data/config
116
+
117
+ Manual:
118
+ ```bash
119
+ cd ~/tars-conversation-app
120
+ bash uninstall.sh
121
+ ```
122
+
123
+ ## Troubleshooting
124
+
125
+ ### Installation fails
126
+
127
+ Check Python version:
128
+ ```bash
129
+ python3 --version
130
+ # Should be 3.10 or higher
131
+ ```
132
+
133
+ Check disk space:
134
+ ```bash
135
+ df -h
136
+ # Need at least 2GB free
137
+ ```
138
+
139
+ Check internet:
140
+ ```bash
141
+ ping google.com
142
+ ```
143
+
144
+ ### Dependencies fail to install
145
+
146
+ Update package lists:
147
+ ```bash
148
+ sudo apt-get update
149
+ sudo apt-get upgrade
150
+ ```
151
+
152
+ Reinstall system deps:
153
+ ```bash
154
+ sudo apt-get install -y portaudio19-dev ffmpeg build-essential python3-dev
155
+ ```
156
+
157
+ ### Virtual environment issues
158
+
159
+ Remove and recreate:
160
+ ```bash
161
+ rm -rf venv
162
+ python3 -m venv venv
163
+ source venv/bin/activate
164
+ pip install -r requirements.txt
165
+ ```
166
+
167
+ ### Configuration not found
168
+
169
+ Recreate from templates:
170
+ ```bash
171
+ cp config.ini.example config.ini
172
+ cp env.example .env.local
173
+ ```
174
+
175
+ ### Cannot connect to daemon
176
+
177
+ Check daemon is running:
178
+ ```bash
179
+ systemctl status tars-daemon
180
+ ```
181
+
182
+ Test gRPC port:
183
+ ```bash
184
+ nc -zv localhost 50051
185
+ ```
186
+
187
+ Check logs:
188
+ ```bash
189
+ journalctl -u tars-daemon -f
190
+ ```
191
+
192
+ ## Running in Background
193
+
194
+ Use systemd service:
195
+
196
+ ```bash
197
+ # Create service file
198
+ sudo nano /etc/systemd/system/tars-conversation.service
199
+ ```
200
+
201
+ Add:
202
+ ```ini
203
+ [Unit]
204
+ Description=TARS Conversation App
205
+ After=network.target tars-daemon.service
206
+ Requires=tars-daemon.service
207
+
208
+ [Service]
209
+ Type=simple
210
+ User=mac
211
+ WorkingDirectory=/home/mac/tars-conversation-app
212
+ ExecStart=/home/mac/tars-conversation-app/venv/bin/python tars_bot.py
213
+ Restart=always
214
+ RestartSec=10
215
+
216
+ [Install]
217
+ WantedBy=multi-user.target
218
+ ```
219
+
220
+ Enable and start:
221
+ ```bash
222
+ sudo systemctl daemon-reload
223
+ sudo systemctl enable tars-conversation.service
224
+ sudo systemctl start tars-conversation.service
225
+ ```
226
+
227
+ Check status:
228
+ ```bash
229
+ sudo systemctl status tars-conversation.service
230
+ journalctl -u tars-conversation.service -f
231
+ ```
232
+
233
+ ## Updating
234
+
235
+ Pull latest changes:
236
+ ```bash
237
+ cd ~/tars-conversation-app
238
+ git pull
239
+ ```
240
+
241
+ Update dependencies:
242
+ ```bash
243
+ source venv/bin/activate
244
+ pip install -r requirements.txt --upgrade
245
+ ```
246
+
247
+ Restart if running as service:
248
+ ```bash
249
+ sudo systemctl restart tars-conversation.service
250
+ ```
251
+
252
+ ## Resource Usage
253
+
254
+ Expected resource usage on Pi 4:
255
+
256
+ - **Installation size**: ~1.5GB (venv + packages)
257
+ - **Memory**: 500MB-1GB during conversation
258
+ - **CPU**: 30-50% (varies with STT/TTS)
259
+ - **Network**: ~100kbps for audio + API calls
260
+
261
+ Recommended:
262
+ - 4GB RAM Pi (2GB may struggle)
263
+ - Active cooling for sustained use
264
+ - Wired ethernet for stability
docs/MEMORY.md ADDED
@@ -0,0 +1,190 @@
1
+ # Hybrid Memory System
2
+
3
+ ## Overview
4
+
5
+ A high-performance memory system optimized for voice AI applications with sub-50ms latency targets. Combines semantic vector search with BM25 keyword matching for superior recall and precision.
6
+
7
+ ## Architecture
8
+
9
+ ### Hybrid Search (70% Vector + 30% BM25)
10
+
11
+ 1. **Vector Search (70% weight)**
12
+ - Uses `all-MiniLM-L6-v2` for semantic embeddings
13
+ - Cosine similarity for relevance scoring
14
+ - Captures semantic meaning and context
15
+
16
+ 2. **BM25 Keyword Search (30% weight)**
17
+ - SQLite FTS5 full-text search
18
+ - Exact keyword matching
19
+ - Handles specific names, terms, and facts
20
+
21
+ 3. **Score Fusion**
22
+ - Weighted combination of both approaches
23
+ - Best of both worlds: semantic understanding + exact matching
24
+
25
+ ## Performance Optimizations
26
+
27
+ ### For Voice AI (<50ms target)
28
+
29
+ | Optimization | Benefit |
30
+ |--------------|---------|
31
+ | **Query Embedding Cache** | Avoid re-encoding similar queries (-20-40ms on cache hit) |
32
+ | **Pre-warmed Model** | Eliminates cold start latency (-50ms) |
33
+ | **Thread Pool** | Non-blocking SQLite operations (-5-10ms) |
34
+ | **Strict Timeout** | Guarantees <50ms with graceful fallback |
35
+ | **Fire-and-Forget Storage** | Stores memories asynchronously (0ms blocking) |
36
+ | **SQLite In-Process** | No network overhead vs ChromaDB (-10-20ms) |
37
+
38
+ ## Latency Comparison
39
+
40
+ | System | Search Latency | Voice AI Ready? |
41
+ |--------|---------------|-----------------|
42
+ | ChromaDB | 50-100ms | ⚠️ Borderline |
43
+ | **Hybrid Memory** | **20-40ms** | βœ… |
44
+
45
+ ## Configuration
46
+
47
+ ```python
48
+ memory_service = HybridMemoryService(
49
+ user_id=client_id,
50
+ db_path="./memory_data/memory.sqlite",
51
+ search_limit=3, # Top N results to return
52
+ search_timeout_ms=40, # Strict timeout for voice AI
53
+ vector_weight=0.7, # 70% semantic similarity
54
+ bm25_weight=0.3, # 30% keyword matching
55
+ system_prompt_prefix="From our conversations:\n",
56
+ )
57
+ ```
58
+
59
+ ## Database Schema
60
+
61
+ ### Main Table
62
+ ```sql
63
+ CREATE TABLE memories (
64
+ id INTEGER PRIMARY KEY,
65
+ user_id TEXT NOT NULL,
66
+ content TEXT NOT NULL,
67
+ embedding BLOB, -- numpy float32 array
68
+ created_at REAL
69
+ )
70
+ ```
71
+
72
+ ### FTS5 Index
73
+ ```sql
74
+ CREATE VIRTUAL TABLE memories_fts USING fts5(
75
+ content,
76
+ content='memories',
77
+ content_rowid='id'
78
+ )
79
+ ```
80
+
81
+ ## Performance Metrics
82
+
83
+ The service tracks:
84
+ - **searches**: Total number of searches
85
+ - **cache_hits**: Query embedding cache hits
86
+ - **cache_hit_rate**: Percentage of cached queries
87
+ - **timeouts**: Searches exceeding timeout threshold
88
+ - **avg_latency_ms**: Average search latency
89
+
90
+ Access stats:
91
+ ```python
92
+ stats = memory_service.get_stats()
93
+ print(stats)
94
+ ```
95
+
96
+ ## How It Works
97
+
98
+ ### Search Process
99
+
100
+ 1. **User message arrives** β†’ Extract text
101
+ 2. **Generate query embedding** β†’ Check cache first
102
+ 3. **Vector search** β†’ Scan recent 100 memories, compute cosine similarity
103
+ 4. **BM25 search** β†’ FTS5 query for keyword matches
104
+ 5. **Score fusion** β†’ Combine weighted scores
105
+ 6. **Return top N** β†’ Sorted by final score
106
+ 7. **Inject into context** β†’ Add as system message
107
+ 8. **Store asynchronously** β†’ Fire-and-forget storage
108
+
109
+ ### Example
110
+
111
+ ```
112
+ User: "What's my favorite color?"
113
+
114
+ Vector Search:
115
+ - "I love blue, it's my favorite color" β†’ 0.85 similarity
116
+ - "My room is painted blue" β†’ 0.62 similarity
117
+
118
+ BM25 Search:
119
+ - "I love blue, it's my favorite color" β†’ rank 1 (score: 1.0)
120
+ - "Blue is calming" β†’ rank 2 (score: 0.5)
121
+
122
+ Final Scores (70% vector + 30% BM25):
123
+ - "I love blue, it's my favorite color" β†’ 0.85*0.7 + 1.0*0.3 = 0.895 βœ“
124
+ - "My room is painted blue" β†’ 0.62*0.7 + 0.0*0.3 = 0.434
125
+ - "Blue is calming" β†’ 0.0*0.7 + 0.5*0.3 = 0.150
126
+
127
+ Top result returned: "I love blue, it's my favorite color"
128
+ ```
129
+
130
+ ## Migration from ChromaDB
131
+
132
+ The hybrid memory service is a drop-in replacement:
133
+
134
+ ```diff
135
+ - from services.memory_chromadb import ChromaDBMemoryService
136
+ + from services.memory_hybrid import HybridMemoryService
137
+
138
+ - memory_service = ChromaDBMemoryService(
139
+ + memory_service = HybridMemoryService(
140
+ user_id=client_id,
141
+ - agent_id="tars_agent",
142
+ - collection_name="conversations",
143
+ - search_limit=5,
144
+ - search_threshold=0.5,
145
+ + db_path="./memory_data/memory.sqlite",
146
+ + search_limit=3,
147
+ + search_timeout_ms=40,
148
+ + vector_weight=0.7,
149
+ + bm25_weight=0.3,
150
+ )
151
+ ```
152
+
153
+ ## Storage Location
154
+
155
+ - **Database**: `./memory_data/memory.sqlite`
156
+ - **Format**: SQLite with FTS5 extension
157
+ - **Embeddings**: Stored as binary BLOBs (numpy float32)
158
+
159
+ ## Dependencies
160
+
161
+ - `sqlite3` (built-in with Python)
162
+ - `sentence-transformers` (already installed)
163
+ - `numpy` (dependency of sentence-transformers)
164
+
165
+ No additional packages required!
166
+
167
+ ## Troubleshooting
168
+
169
+ ### High Latency
170
+ - Check cache hit rate: `memory_service.get_stats()`
171
+ - Reduce `search_limit` if processing too many results
172
+ - Increase `search_timeout_ms` if needed
173
+
174
+ ### Timeouts
175
+ - Review timeout stats: `stats["timeouts"]`
176
+ - Consider increasing `search_timeout_ms` to 50-60ms
177
+ - Check if database is growing too large
178
+
179
+ ### Memory Not Recalled
180
+ - Verify memories are being stored (check database)
181
+ - Adjust `vector_weight` and `bm25_weight` balance
182
+ - Try rephrasing queries to match stored content
183
+
184
+ ## Future Enhancements
185
+
186
+ - [ ] Automatic database compaction/cleanup
187
+ - [ ] Per-user memory limits
188
+ - [ ] Memory importance scoring
189
+ - [ ] Temporal decay for older memories
190
+ - [ ] Multi-turn conversation grouping
env.example ADDED
@@ -0,0 +1,59 @@
1
+ # STT Provider Configuration
2
+ # Options: "speechmatics", "deepgram", or "deepgram-flux"
3
+ STT_PROVIDER=speechmatics
4
+
5
+ # Speechmatics API Key
6
+ # Get your API key from: https://portal.speechmatics.com/
7
+ SPEECHMATICS_API_KEY=your_speechmatics_api_key_here
8
+
9
+ # Deepgram API Key (only needed if STT_PROVIDER=deepgram or deepgram-flux)
10
+ # Get your API key from: https://console.deepgram.com/
11
+ DEEPGRAM_API_KEY=your_deepgram_api_key_here
12
+
13
+ # ElevenLabs API Key
14
+ # Get your API key from: https://elevenlabs.io/app/settings/api-keys
15
+ ELEVENLABS_API_KEY=your_elevenlabs_api_key_here
16
+
17
+ # ElevenLabs Voice ID (optional, defaults to custom voice)
18
+ # Find voice IDs at: https://elevenlabs.io/app/voices
19
+ ELEVENLABS_VOICE_ID=ry8mpwRw6nugb2qjP0tu
20
+
21
+ # DeepInfra API Key (for Qwen LLM and Gating Layer)
22
+ # Get your API key from: https://deepinfra.com/
23
+ DEEPINFRA_API_KEY=your_deepinfra_api_key_here
24
+ # Optional: Override default models
25
+ # DEEPINFRA_MODEL=Qwen/Qwen3-235B-A22B-Instruct-2507 # Main LLM (default)
26
+ # DEEPINFRA_GATING_MODEL=meta-llama/Llama-3.2-3B-Instruct # Gating Layer (default)
27
+
28
+ # Pipecat FastAPI service URL (for frontend to connect)
29
+ NEXT_PUBLIC_PIPECAT_URL=http://localhost:7860
30
+
31
+ # Pipecat FastAPI service configuration
32
+ PIPECAT_HOST=localhost
33
+ PIPECAT_PORT=7860
34
+
35
+ # Mem0 API Key (optional, enables long-term memory)
36
+ # Get one from: https://docs.mem0.ai/
37
+ MEM0_API_KEY=your_mem0_api_key_here
38
+
39
+ # TTS Provider Configuration
40
+ # Options: "elevenlabs" (cloud, requires API key) or "qwen3" (local, free)
41
+ TTS_PROVIDER=qwen3
42
+
43
+ # Qwen3-TTS Configuration (only needed if TTS_PROVIDER=qwen3)
44
+ # Model: 0.6B (faster, less memory) or 1.7B (better quality)
45
+ QWEN3_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-Base
46
+ # Device: "mps" for Mac, "cuda" for NVIDIA GPU, "cpu" for CPU
47
+ QWEN3_TTS_DEVICE=mps
48
+ # Reference audio file for voice cloning (relative to project root)
49
+ QWEN3_TTS_REF_AUDIO=assets/audio/tars-clean-compressed.mp3
50
+
51
+ # Emotional State Monitoring
52
+ # Continuously analyzes video for confusion/hesitation/frustration
53
+ # Triggers TARS to offer help proactively
54
+ EMOTIONAL_MONITORING_ENABLED=true
55
+ # How often to sample video frames (in seconds)
56
+ EMOTIONAL_SAMPLING_INTERVAL=3.0
57
+ # How many consecutive negative states before intervention
58
+ EMOTIONAL_INTERVENTION_THRESHOLD=2
59
+
index.html ADDED
@@ -0,0 +1,333 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>TARS Conversation App</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
16
+ line-height: 1.6;
17
+ color: #333;
18
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
19
+ min-height: 100vh;
20
+ padding: 20px;
21
+ }
22
+
23
+ .container {
24
+ max-width: 900px;
25
+ margin: 0 auto;
26
+ background: white;
27
+ border-radius: 16px;
28
+ padding: 40px;
29
+ box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
30
+ }
31
+
32
+ header {
33
+ text-align: center;
34
+ margin-bottom: 40px;
35
+ padding-bottom: 30px;
36
+ border-bottom: 2px solid #f0f0f0;
37
+ }
38
+
39
+ h1 {
40
+ font-size: 2.5rem;
41
+ color: #667eea;
42
+ margin-bottom: 10px;
43
+ }
44
+
45
+ .subtitle {
46
+ font-size: 1.2rem;
47
+ color: #666;
48
+ margin-bottom: 20px;
49
+ }
50
+
51
+ .badges {
52
+ display: flex;
53
+ gap: 10px;
54
+ justify-content: center;
55
+ flex-wrap: wrap;
56
+ }
57
+
58
+ .badge {
59
+ background: #667eea;
60
+ color: white;
61
+ padding: 6px 16px;
62
+ border-radius: 20px;
63
+ font-size: 14px;
64
+ font-weight: 500;
65
+ }
66
+
67
+ .badge.version {
68
+ background: #764ba2;
69
+ }
70
+
71
+ .badge.tars {
72
+ background: #48bb78;
73
+ }
74
+
75
+ section {
76
+ margin-bottom: 40px;
77
+ }
78
+
79
+ h2 {
80
+ color: #667eea;
81
+ font-size: 1.8rem;
82
+ margin-bottom: 15px;
83
+ }
84
+
85
+ h3 {
86
+ color: #764ba2;
87
+ font-size: 1.3rem;
88
+ margin-bottom: 10px;
89
+ margin-top: 25px;
90
+ }
91
+
92
+ .install-box {
93
+ background: #f7fafc;
94
+ border-left: 4px solid #667eea;
95
+ padding: 25px;
96
+ border-radius: 8px;
97
+ margin: 20px 0;
98
+ }
99
+
100
+ .install-steps {
101
+ list-style: none;
102
+ counter-reset: step-counter;
103
+ }
104
+
105
+ .install-steps li {
106
+ counter-increment: step-counter;
107
+ margin-bottom: 15px;
108
+ padding-left: 40px;
109
+ position: relative;
110
+ }
111
+
112
+ .install-steps li::before {
113
+ content: counter(step-counter);
114
+ position: absolute;
115
+ left: 0;
116
+ top: 0;
117
+ background: #667eea;
118
+ color: white;
119
+ width: 28px;
120
+ height: 28px;
121
+ border-radius: 50%;
122
+ display: flex;
123
+ align-items: center;
124
+ justify-content: center;
125
+ font-weight: bold;
126
+ font-size: 14px;
127
+ }
128
+
129
+ code {
130
+ background: #2d3748;
131
+ color: #68d391;
132
+ padding: 3px 8px;
133
+ border-radius: 4px;
134
+ font-family: "Courier New", monospace;
135
+ font-size: 0.9em;
136
+ }
137
+
138
+ pre {
139
+ background: #2d3748;
140
+ color: #e2e8f0;
141
+ padding: 20px;
142
+ border-radius: 8px;
143
+ overflow-x: auto;
144
+ margin: 15px 0;
145
+ }
146
+
147
+ pre code {
148
+ background: none;
149
+ padding: 0;
150
+ color: inherit;
151
+ }
152
+
153
+ .features {
154
+ display: grid;
155
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
156
+ gap: 20px;
157
+ margin: 20px 0;
158
+ }
159
+
160
+ .feature-card {
161
+ background: #f7fafc;
162
+ padding: 20px;
163
+ border-radius: 8px;
164
+ border-left: 4px solid #764ba2;
165
+ }
166
+
167
+ .feature-card h4 {
168
+ color: #667eea;
169
+ margin-bottom: 8px;
170
+ }
171
+
172
+ .btn {
173
+ display: inline-block;
174
+ background: #667eea;
175
+ color: white;
176
+ padding: 12px 30px;
177
+ border-radius: 8px;
178
+ text-decoration: none;
179
+ font-weight: 600;
180
+ transition: background 0.3s;
181
+ margin-right: 10px;
182
+ margin-top: 10px;
183
+ }
184
+
185
+ .btn:hover {
186
+ background: #5568d3;
187
+ }
188
+
189
+ .btn.secondary {
190
+ background: #764ba2;
191
+ }
192
+
193
+ .btn.secondary:hover {
194
+ background: #68399e;
195
+ }
196
+
197
+ footer {
198
+ text-align: center;
199
+ margin-top: 50px;
200
+ padding-top: 30px;
201
+ border-top: 2px solid #f0f0f0;
202
+ color: #666;
203
+ }
204
+
205
+ .tech-stack {
206
+ display: flex;
207
+ flex-wrap: wrap;
208
+ gap: 10px;
209
+ margin: 15px 0;
210
+ }
211
+
212
+ .tech {
213
+ background: #edf2f7;
214
+ padding: 8px 16px;
215
+ border-radius: 6px;
216
+ font-size: 14px;
217
+ color: #4a5568;
218
+ }
219
+ </style>
220
+ </head>
221
+ <body>
222
+ <div class="container">
223
+ <header>
224
+ <h1>πŸ€– TARS Conversation App</h1>
225
+ <p class="subtitle">Real-time conversational AI for TARS robots</p>
226
+ <div class="badges">
227
+ <span class="badge">AI Assistant</span>
228
+ <span class="badge version">v1.0.0</span>
229
+ <span class="badge tars">TARS App</span>
230
+ </div>
231
+ </header>
232
+
233
+ <section>
234
+ <h2>Features</h2>
235
+ <div class="features">
236
+ <div class="feature-card">
237
+ <h4>🎀 Real-time Voice</h4>
238
+ <p>WebRTC audio with Speechmatics/Deepgram transcription</p>
239
+ </div>
240
+ <div class="feature-card">
241
+ <h4>🧠 Smart Memory</h4>
242
+ <p>Hybrid vector + BM25 search with ChromaDB</p>
243
+ </div>
244
+ <div class="feature-card">
245
+ <h4>πŸ‘οΈ Vision Analysis</h4>
246
+ <p>Image understanding with Moondream</p>
247
+ </div>
248
+ <div class="feature-card">
249
+ <h4>πŸ“Š Live Dashboard</h4>
250
+ <p>Gradio metrics, latency charts, transcriptions</p>
251
+ </div>
252
+ <div class="feature-card">
253
+ <h4>🎭 Emotional AI</h4>
254
+ <p>Real-time emotion and sentiment monitoring</p>
255
+ </div>
256
+ <div class="feature-card">
257
+ <h4>πŸ€– Robot Control</h4>
258
+ <p>gRPC commands for gestures, eyes, movement</p>
259
+ </div>
260
+ </div>
261
+ </section>
262
+
263
+ <section>
264
+ <h2>Installation on TARS Robot</h2>
265
+ <div class="install-box">
266
+ <ol class="install-steps">
267
+ <li>Open TARS dashboard at <code>http://your-pi:8000</code></li>
268
+ <li>Go to <strong>App Store</strong> tab</li>
269
+ <li>Enter Space ID: <code>latishab/tars-conversation-app</code></li>
270
+ <li>Click <strong>Install from HuggingFace</strong></li>
271
+ <li>Configure API keys in <code>.env.local</code></li>
272
+ <li>Click <strong>Start</strong></li>
273
+ <li>Open dashboard at <code>http://your-pi:7860</code></li>
274
+ </ol>
275
+ </div>
276
+ </section>
277
+
278
+ <section>
279
+ <h3>Required API Keys</h3>
280
+ <ul style="list-style-position: inside; margin-left: 20px;">
281
+ <li><code>DEEPINFRA_API_KEY</code> - For LLM (DeepInfra)</li>
282
+ <li><code>SPEECHMATICS_API_KEY</code> or <code>DEEPGRAM_API_KEY</code> - For STT</li>
283
+ <li><code>ELEVENLABS_API_KEY</code> (optional) - For premium TTS</li>
284
+ </ul>
285
+ </section>
286
+
287
+ <section>
288
+ <h2>Tech Stack</h2>
289
+ <div class="tech-stack">
290
+ <span class="tech">Pipecat</span>
291
+ <span class="tech">WebRTC</span>
292
+ <span class="tech">Gradio</span>
293
+ <span class="tech">ChromaDB</span>
294
+ <span class="tech">gRPC</span>
295
+ <span class="tech">Speechmatics</span>
296
+ <span class="tech">Deepgram</span>
297
+ <span class="tech">ElevenLabs</span>
298
+ <span class="tech">DeepInfra</span>
299
+ <span class="tech">Moondream</span>
300
+ </div>
301
+ </section>
302
+
303
+ <section>
304
+ <h2>Manual Installation</h2>
305
+ <p>For development or non-TARS deployments:</p>
306
+ <pre><code>git clone https://github.com/latishab/tars-conversation-app.git
307
+ cd tars-conversation-app
308
+ bash install.sh
309
+
310
+ # Configure
311
+ cp env.example .env.local
312
+ cp config.ini.example config.ini
313
+
314
+ # Run
315
+ python tars_bot.py # Robot mode
316
+ python bot.py # Browser mode</code></pre>
317
+ </section>
318
+
319
+ <section>
320
+ <h2>Resources</h2>
321
+ <a href="https://github.com/latishab/tars-conversation-app" class="btn">GitHub Repository</a>
322
+ <a href="https://github.com/latishab/tars-conversation-app#readme" class="btn secondary">Documentation</a>
323
+ </section>
324
+
325
+ <footer>
326
+ <p>Built with TarsApp framework β€’ TARS Project</p>
327
+ <p style="margin-top: 10px; font-size: 14px;">
328
+ <a href="https://huggingface.co/spaces/latishab/tars-conversation-app" style="color: #667eea;">View on HuggingFace</a>
329
+ </p>
330
+ </footer>
331
+ </div>
332
+ </body>
333
+ </html>
install.sh ADDED
@@ -0,0 +1,99 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ APP_NAME="tars-conversation-app"
5
+ APP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6
+
7
+ echo "=== Installing $APP_NAME ==="
8
+ echo "Directory: $APP_DIR"
9
+ echo
10
+
11
+ # Check Python version
12
+ echo "Checking Python version..."
13
+ PYTHON_VERSION=$(python3 --version 2>&1 | grep -oP '\d+\.\d+')
14
+ REQUIRED_VERSION="3.10"
15
+
16
+ if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
17
+ echo "Error: Python $REQUIRED_VERSION or higher required (found $PYTHON_VERSION)"
18
+ exit 1
19
+ fi
20
+ echo "Python $PYTHON_VERSION OK"
21
+ echo
22
+
23
+ # Install system dependencies
24
+ echo "Installing system dependencies..."
25
+ sudo apt-get update -qq
26
+ sudo apt-get install -y portaudio19-dev ffmpeg build-essential python3-dev python3-venv
27
+ echo "System dependencies installed"
28
+ echo
29
+
30
+ # Create virtual environment
31
+ if [ ! -d "$APP_DIR/venv" ]; then
32
+ echo "Creating virtual environment..."
33
+ python3 -m venv "$APP_DIR/venv"
34
+ echo "Virtual environment created"
35
+ else
36
+ echo "Virtual environment already exists"
37
+ fi
38
+ echo
39
+
40
+ # Activate virtual environment
41
+ source "$APP_DIR/venv/bin/activate"
42
+
43
+ # Upgrade pip
44
+ echo "Upgrading pip..."
45
+ pip install --upgrade pip -q
46
+ echo
47
+
48
+ # Install Python dependencies
49
+ echo "Installing Python dependencies..."
50
+ echo "This may take several minutes..."
51
+ pip install -r "$APP_DIR/requirements.txt" -q
52
+ echo "Python dependencies installed"
53
+ echo
54
+
55
+ # Setup configuration files
56
+ if [ ! -f "$APP_DIR/config.ini" ]; then
57
+ echo "Creating config.ini from template..."
58
+ cp "$APP_DIR/config.ini.example" "$APP_DIR/config.ini"
59
+ echo "Created config.ini"
60
+ CONFIG_CREATED=true
61
+ else
62
+ echo "config.ini already exists"
63
+ CONFIG_CREATED=false
64
+ fi
65
+ echo
66
+
67
+ if [ ! -f "$APP_DIR/.env.local" ]; then
68
+ echo "Creating .env.local from template..."
69
+ cp "$APP_DIR/env.example" "$APP_DIR/.env.local"
70
+ echo "Created .env.local"
71
+ ENV_CREATED=true
72
+ else
73
+ echo ".env.local already exists"
74
+ ENV_CREATED=false
75
+ fi
76
+ echo
77
+
78
+ # Run video codec fix if needed
79
+ if [ -f "$APP_DIR/fix_video_codec.sh" ]; then
80
+ echo "Applying video codec fixes..."
81
+ bash "$APP_DIR/fix_video_codec.sh" || true
82
+ fi
83
+
84
+ echo "=== Installation Complete ==="
85
+ echo
86
+ echo "Next steps:"
87
+ if [ "$CONFIG_CREATED" = true ] || [ "$ENV_CREATED" = true ]; then
88
+ echo "1. Edit configuration files:"
89
+ [ "$ENV_CREATED" = true ] && echo " - Add API keys to: $APP_DIR/.env.local"
90
+ [ "$CONFIG_CREATED" = true ] && echo " - Configure settings: $APP_DIR/config.ini"
91
+ echo "2. Activate environment: source $APP_DIR/venv/bin/activate"
92
+ echo "3. Run the app: python $APP_DIR/tars_bot.py"
93
+ else
94
+ echo "1. Activate environment: source $APP_DIR/venv/bin/activate"
95
+ echo "2. Run the app: python $APP_DIR/tars_bot.py"
96
+ fi
97
+ echo
98
+ echo "For browser mode: python $APP_DIR/bot.py"
99
+ echo "For dashboard: python $APP_DIR/ui/app.py"
manifest.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "tars-conversation-app",
3
+ "version": "1.0.0",
4
+ "description": "Real-time conversational AI with WebRTC, memory, and vision",
5
+ "author": "TARS Project",
6
+ "repository": "https://github.com/latishab/tars-conversation-app.git",
7
+ "entry_point": "tars_conversation_app.wrapper:ConversationApp",
8
+ "custom_app_url": "http://localhost:7860",
9
+ "icon": "assets/tars-icon.png",
10
+ "huggingface_space": "latishab/tars-conversation-app",
11
+ "install_script": "install.sh",
12
+ "uninstall_script": "uninstall.sh",
13
+ "dependencies": {
14
+ "python": ">=3.10",
15
+ "system": [
16
+ "portaudio19-dev",
17
+ "ffmpeg",
18
+ "build-essential",
19
+ "python3-dev"
20
+ ]
21
+ },
22
+ "environment": [
23
+ "DEEPINFRA_API_KEY",
24
+ "SPEECHMATICS_API_KEY",
25
+ "DEEPGRAM_API_KEY",
26
+ "ELEVENLABS_API_KEY"
27
+ ],
28
+ "configuration": {
29
+ "file": "config.ini",
30
+ "example": "config.ini.example",
31
+ "env_file": ".env.local",
32
+ "env_example": "env.example"
33
+ },
34
+ "ports": {
35
+ "grpc": 50051,
36
+ "http": 8765,
37
+ "fastapi": 8080,
38
+ "dashboard": 7860
39
+ },
40
+ "services": {
41
+ "dashboard": {
42
+ "enabled": true,
43
+ "description": "Gradio metrics and monitoring dashboard",
44
+ "url": "http://localhost:7860"
45
+ }
46
+ }
47
+ }
pipecat_service.py ADDED
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pipecat.ai service for real-time transcription and TTS using SmallWebRTC
4
+ Communicates directly with browser via WebRTC
5
+ """
6
+
7
+ # Fix SSL certificate issues FIRST - before any SSL-using imports
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # Add src/ to Python path
13
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
14
+
15
+ try:
16
+ import certifi
17
+ cert_file = certifi.where()
18
+ os.environ['SSL_CERT_FILE'] = cert_file
19
+ os.environ['REQUESTS_CA_BUNDLE'] = cert_file
20
+ os.environ['CURL_CA_BUNDLE'] = cert_file
21
+ except ImportError:
22
+ pass # certifi not available, will use system certs
23
+
24
+ import ssl
25
+ from contextlib import asynccontextmanager
26
+
27
+ # Configure SSL to use certifi certificates for Python's ssl module
28
+ # For development: disable SSL verification completely to avoid certificate issues
29
+ # This MUST happen before any libraries that use SSL are imported
30
+ try:
31
+ import certifi
32
+ cert_file = certifi.where()
33
+ # Set environment variables for libraries that respect them
34
+ os.environ['SSL_CERT_FILE'] = cert_file
35
+ os.environ['REQUESTS_CA_BUNDLE'] = cert_file
36
+ os.environ['CURL_CA_BUNDLE'] = cert_file
37
+
38
+ # For Python's ssl module: use unverified context for development
39
+ # This bypasses SSL certificate verification to avoid connection issues
40
+ ssl._create_default_https_context = ssl._create_unverified_context
41
+ except ImportError:
42
+ # If certifi not available, use unverified (development only)
43
+ ssl._create_default_https_context = ssl._create_unverified_context
44
+ except Exception as e:
45
+ # If anything fails, use unverified context
46
+ ssl._create_default_https_context = ssl._create_unverified_context
47
+
48
+ import argparse
49
+ import logging
50
+ from fastapi import BackgroundTasks, FastAPI
51
+ from fastapi.middleware.cors import CORSMiddleware
52
+ from loguru import logger
53
+ from pipecat.transports.smallwebrtc.request_handler import (
54
+ SmallWebRTCPatchRequest,
55
+ SmallWebRTCRequest,
56
+ SmallWebRTCRequestHandler,
57
+ )
58
+
59
+ from bot import run_bot
60
+ from config import (
61
+ PIPECAT_HOST,
62
+ PIPECAT_PORT,
63
+ SPEECHMATICS_API_KEY,
64
+ DEEPGRAM_API_KEY,
65
+ ELEVENLABS_API_KEY,
66
+ DEEPINFRA_API_KEY,
67
+ STT_PROVIDER,
68
+ TTS_PROVIDER, # Only used for startup validation
69
+ get_fresh_config,
70
+ )
71
+
72
+ # Remove default loguru handler and set up custom logging
73
+ logger.remove(0)
74
+
75
+ # Configure standard logging
76
+ logging.basicConfig(level=logging.INFO)
77
+ standard_logger = logging.getLogger(__name__)
78
+
79
+ # Reduce noise from websockets library - only log warnings and above
80
+ websockets_logger = logging.getLogger('websockets')
81
+ websockets_logger.setLevel(logging.WARNING)
82
+
83
+ # Log SSL certificate configuration
84
+ try:
85
+ import certifi
86
+ logger.info(f"SSL Configuration: Using certificates from {certifi.where()}")
87
+ logger.info(f"SSL_CERT_FILE env: {os.environ.get('SSL_CERT_FILE', 'not set')}")
88
+ except ImportError:
89
+ logger.warning("certifi not available - SSL verification disabled for development")
90
+
91
+
92
+ @asynccontextmanager
93
+ async def lifespan(app: FastAPI):
94
+ """Handle app lifespan events."""
95
+ logger.info(f"Starting Pipecat service on http://{PIPECAT_HOST}:{PIPECAT_PORT}...")
96
+ logger.info(f"STT Provider: {STT_PROVIDER}")
97
+ logger.info(f"TTS Provider: {TTS_PROVIDER}")
98
+
99
+ # Check required API keys based on STT and TTS providers
100
+ missing_keys = []
101
+ if STT_PROVIDER == "speechmatics" and not SPEECHMATICS_API_KEY:
102
+ missing_keys.append("SPEECHMATICS_API_KEY")
103
+ if STT_PROVIDER == "deepgram" and not DEEPGRAM_API_KEY:
104
+ missing_keys.append("DEEPGRAM_API_KEY")
105
+ if not DEEPINFRA_API_KEY:
106
+ missing_keys.append("DEEPINFRA_API_KEY")
107
+ if TTS_PROVIDER == "elevenlabs" and not ELEVENLABS_API_KEY:
108
+ missing_keys.append("ELEVENLABS_API_KEY")
109
+
110
+ if missing_keys:
111
+ logger.error(f"ERROR: Missing required API keys: {', '.join(missing_keys)}")
112
+ sys.exit(1)
113
+
114
+ yield # Run app
115
+
116
+ # Cleanup
117
+ await small_webrtc_handler.close()
118
+ logger.info("Shutting down...")
119
+
120
+
121
+ app = FastAPI(lifespan=lifespan)
122
+
123
+ # Add CORS middleware
124
+ app.add_middleware(
125
+ CORSMiddleware,
126
+ allow_origins=["*"], # In production, replace with specific origins
127
+ allow_credentials=True,
128
+ allow_methods=["*"],
129
+ allow_headers=["*"],
130
+ )
131
+
132
+ # Initialize the SmallWebRTC request handler
133
+ small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler()
134
+
135
+ @app.post("/api/offer")
136
+ async def offer(request: SmallWebRTCRequest, background_tasks: BackgroundTasks):
137
+ """Handle WebRTC offer requests via SmallWebRTCRequestHandler."""
138
+ logger.debug(f"Received WebRTC offer request")
139
+
140
+ # Prepare runner arguments with the callback to run your bot
141
+ async def webrtc_connection_callback(connection):
142
+ background_tasks.add_task(run_bot, connection)
143
+
144
+ # Delegate handling to SmallWebRTCRequestHandler
145
+ answer = await small_webrtc_handler.handle_web_request(
146
+ request=request,
147
+ webrtc_connection_callback=webrtc_connection_callback,
148
+ )
149
+ return answer
150
+
151
+
152
+ @app.patch("/api/offer")
153
+ async def ice_candidate(request: SmallWebRTCPatchRequest):
154
+ """Handle ICE candidate patch requests."""
155
+ logger.debug(f"Received ICE candidate patch request")
156
+ await small_webrtc_handler.handle_patch_request(request)
157
+ return {"status": "success"}
158
+
159
+
160
+ @app.get("/api/status")
161
+ async def status():
162
+ """Health check endpoint with fresh config values."""
163
+ # Get current config from config.ini
164
+ current_config = get_fresh_config()
165
+ current_stt = current_config['STT_PROVIDER']
166
+ current_tts = current_config['TTS_PROVIDER']
167
+ current_model = current_config['DEEPINFRA_MODEL']
168
+
169
+ return {
170
+ "status": "ok",
171
+ "stt_provider": current_stt,
172
+ "tts_provider": current_tts,
173
+ "llm_model": current_model,
174
+ "speechmatics_configured": bool(SPEECHMATICS_API_KEY) if current_stt == "speechmatics" else None,
175
+ "deepgram_configured": bool(DEEPGRAM_API_KEY) if current_stt == "deepgram" else None,
176
+ "elevenlabs_configured": bool(ELEVENLABS_API_KEY) if current_tts == "elevenlabs" else None,
177
+ "deepinfra_configured": bool(DEEPINFRA_API_KEY),
178
+ "qwen3_tts_configured": True if current_tts == "qwen3" else None,
179
+ }
180
+
181
+
182
+ @app.get("/api/config")
183
+ async def get_config():
184
+ """Get current configuration from config.ini."""
185
+ import configparser
186
+ from pathlib import Path
187
+
188
+ config = configparser.ConfigParser()
189
+ config_path = Path("config.ini")
190
+
191
+ if not config_path.exists():
192
+ return {"error": "config.ini not found"}
193
+
194
+ config.read(config_path)
195
+
196
+ return {
197
+ "llm": {
198
+ "model": config.get("LLM", "model", fallback="Qwen/Qwen3-235B-A22B-Instruct-2507")
199
+ },
200
+ "stt": {
201
+ "provider": config.get("STT", "provider", fallback="speechmatics")
202
+ },
203
+ "tts": {
204
+ "provider": config.get("TTS", "provider", fallback="qwen3"),
205
+ "qwen3_model": config.get("TTS", "qwen3_model", fallback="Qwen/Qwen3-TTS-12Hz-0.6B-Base"),
206
+ "qwen3_device": config.get("TTS", "qwen3_device", fallback="mps"),
207
+ "qwen3_ref_audio": config.get("TTS", "qwen3_ref_audio", fallback="tars-clean-compressed.mp3"),
208
+ }
209
+ }
210
+
211
+
212
+ @app.post("/api/config")
213
+ async def update_config(request: dict):
214
+ """Update configuration in config.ini."""
215
+ import configparser
216
+ from pathlib import Path
217
+
218
+ config = configparser.ConfigParser()
219
+ config_path = Path("config.ini")
220
+
221
+ if not config_path.exists():
222
+ return {"error": "config.ini not found"}
223
+
224
+ config.read(config_path)
225
+
226
+ # Update LLM config
227
+ if "llm_model" in request:
228
+ if not config.has_section("LLM"):
229
+ config.add_section("LLM")
230
+ config.set("LLM", "model", request["llm_model"])
231
+
232
+ # Update STT config
233
+ if "stt_provider" in request:
234
+ if not config.has_section("STT"):
235
+ config.add_section("STT")
236
+ config.set("STT", "provider", request["stt_provider"])
237
+
238
+ # Update TTS config
239
+ if "tts_provider" in request:
240
+ if not config.has_section("TTS"):
241
+ config.add_section("TTS")
242
+ config.set("TTS", "provider", request["tts_provider"])
243
+
244
+ # Write back to file
245
+ with open(config_path, "w") as f:
246
+ config.write(f)
247
+
248
+ return {
249
+ "success": True,
250
+ "message": "Configuration updated. Please restart the service for changes to take effect.",
251
+ "restart_required": True
252
+ }
253
+
254
+
255
+ if __name__ == "__main__":
256
+ parser = argparse.ArgumentParser(description="WebRTC Pipecat service")
257
+ parser.add_argument(
258
+ "--host", default=PIPECAT_HOST, help=f"Host for HTTP server (default: {PIPECAT_HOST})"
259
+ )
260
+ parser.add_argument(
261
+ "--port", type=int, default=PIPECAT_PORT, help=f"Port for HTTP server (default: {PIPECAT_PORT})"
262
+ )
263
+ parser.add_argument("--verbose", "-v", action="count")
264
+ args = parser.parse_args()
265
+
266
+ if args.verbose:
267
+ logger.add(sys.stderr, level="TRACE")
268
+ else:
269
+ logger.add(sys.stderr, level="INFO")
270
+
271
+ import uvicorn
272
+ uvicorn.run(app, host=args.host, port=args.port)
publish-to-hf.sh ADDED
@@ -0,0 +1,87 @@
1
+ #!/bin/bash
2
+ # Publish tars-conversation-app to HuggingFace Space
3
+
4
+ set -e
5
+
6
+ echo "Publishing tars-conversation-app to HuggingFace Space..."
7
+ echo
8
+
9
+ # Check for HF_TOKEN
10
+ if [ -z "$HF_TOKEN" ]; then
11
+ echo "❌ Error: HF_TOKEN not set"
12
+ echo
13
+ echo "Get a token from: https://huggingface.co/settings/tokens"
14
+ echo "Then run:"
15
+ echo " export HF_TOKEN=hf_your_token_here"
16
+ echo " bash publish-to-hf.sh"
17
+ exit 1
18
+ fi
19
+
20
+ echo "βœ“ HF_TOKEN is set"
21
+
22
+ # Check for huggingface_hub
23
+ python3 << 'EOFCHECK'
24
+ try:
25
+ from huggingface_hub import HfApi
26
+ print("βœ“ huggingface_hub is installed")
27
+ except ImportError:
28
+ print("❌ huggingface_hub not installed")
29
+ print("\nInstall with:")
30
+ print(" pip install huggingface_hub")
31
+ exit(1)
32
+ EOFCHECK
33
+
34
+ if [ $? -ne 0 ]; then
35
+ exit 1
36
+ fi
37
+
38
+ echo
39
+ echo "Uploading to latishab/tars-conversation-app..."
40
+ echo
41
+
42
+ # Upload
43
+ python3 << 'EOFUPLOAD'
44
+ import os
45
+ from pathlib import Path
46
+ from huggingface_hub import HfApi
47
+
48
+ token = os.environ["HF_TOKEN"]
49
+ api = HfApi(token=token)
50
+
51
+ print("Uploading files...")
52
+
53
+ api.upload_folder(
54
+ folder_path=".",
55
+ repo_id="latishab/tars-conversation-app",
56
+ repo_type="space",
57
+ ignore_patterns=[
58
+ ".git", ".git/*",
59
+ "venv", "venv/*",
60
+ "__pycache__", "**/__pycache__",
61
+ "*.pyc", "**/*.pyc",
62
+ ".pytest_cache",
63
+ ".models", ".models/*",
64
+ "chroma_memory", "chroma_memory/*",
65
+ "memory_data", "memory_data/*",
66
+ ".env", ".env.local", ".env.*",
67
+ "config.ini",
68
+ ".claude", ".claude/*",
69
+ ".DS_Store", "**/.DS_Store"
70
+ ],
71
+ commit_message="Update TARS Conversation App with TarsApp framework"
72
+ )
73
+
74
+ print("\nβœ… Published successfully!")
75
+ print("\nSpace URL: https://huggingface.co/spaces/latishab/tars-conversation-app")
76
+ print("\nNext steps:")
77
+ print("1. Visit the Space URL to verify it's working")
78
+ print("2. Test installation on TARS robot:")
79
+ print(" - Open dashboard at http://your-pi:8000")
80
+ print(" - Go to App Store tab")
81
+ print(" - Enter Space ID: latishab/tars-conversation-app")
82
+ print(" - Click 'Install from HuggingFace'")
83
+ print("3. Click Start and verify Gradio dashboard at :7860")
84
+ EOFUPLOAD
85
+
86
+ echo
87
+ echo "Done!"
pyproject.toml ADDED
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tars-conversation-app"
7
+ version = "1.0.0"
8
+ description = "Real-time conversational AI with WebRTC, memory, and vision for TARS robots"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [
12
+ {name = "TARS Project"}
13
+ ]
14
+
15
+ dependencies = [
16
+ "tars-sdk>=0.1.0",
17
+ ]
18
+
19
+ [project.urls]
20
+ Homepage = "https://github.com/latishab/tars-conversation-app"
21
+ Repository = "https://github.com/latishab/tars-conversation-app.git"
22
+ Documentation = "https://github.com/latishab/tars-conversation-app#readme"
23
+
24
+ [tool.setuptools.packages.find]
25
+ include = ["tars_conversation_app", "tars_conversation_app.*", "src", "src.*", "ui", "ui.*"]
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ pipecat-ai[speechmatics,elevenlabs,webrtc,qwen,moondream,local-smart-turn-v3,silero]>=0.0.102
2
+ python-dotenv>=1.0.0
3
+ fastapi>=0.104.0
4
+ uvicorn[standard]>=0.24.0
5
+ loguru>=0.7.0
6
+ certifi>=2024.0.0
7
+ aiohttp>=3.9.0
8
+ chromadb>=0.4.0
9
+ sentence-transformers>=2.2.0
10
+ opencv-python>=4.8.0
11
+ mediapipe>=0.10.0
12
+ websockets>=12.0
13
+ httpx>=0.24.0
14
+ gradio>=4.0.0
15
+ plotly>=5.0.0
16
+ # aiortc is installed as a dependency of pipecat-ai[webrtc]
17
+ # If you encounter VP8 decoder errors, run: bash fix_video_codec.sh
18
+
scripts/update_daemon.py ADDED
@@ -0,0 +1,388 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ TARS Daemon Remote Update Script
4
+
5
+ Updates the TARS daemon on the Raspberry Pi via SSH.
6
+ Supports git-based updates, backup, health checks, and rollback.
7
+
8
+ Usage:
9
+ python scripts/update_daemon.py --check-only
10
+ python scripts/update_daemon.py --method git
11
+ python scripts/update_daemon.py --method git --version v0.2.1
12
+ python scripts/update_daemon.py --rollback /path/to/backup
13
+ """
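+
+ # Update lifecycle implemented below:
+ # create_backup -> stop_daemon -> git fetch -> checkout/pull
+ # -> pip install -e . -> regenerate protos -> start_daemon
+ # -> health check -> restore_backup on any failure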
14
+
15
+ import argparse
16
+ import subprocess
17
+ import sys
18
+ import json
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+
22
+ # SSH configuration
23
+ PI_HOST = "tars-pi"
24
+ PI_USER = "mac"
25
+ DAEMON_DIR = "~/tars-daemon"
26
+ BACKUP_DIR = "~/tars-daemon-backups"
27
+ SERVICE_NAME = "tars"
28
+
29
+
30
+ def run_ssh(cmd: str, check: bool = True) -> tuple[int, str, str]:
+ """Run command on Pi via SSH."""
+ # Pass cmd as a single argv element (no shell=True) so quotes inside
+ # the remote command survive instead of being re-parsed by the local shell.
+ result = subprocess.run(
+ ["ssh", PI_HOST, cmd],
+ capture_output=True,
+ text=True
+ )
+ if check and result.returncode != 0:
+ print(f"Error: {result.stderr}")
+ return result.returncode, result.stdout.strip(), result.stderr.strip()
42
+
43
+
44
+ def get_current_version() -> dict:
45
+ """Get current daemon version info."""
46
+ code, out, err = run_ssh(
47
+ f"cd {DAEMON_DIR} && source venv/bin/activate && "
48
+ "python -c 'from tars_sdk import __version__; import json; "
49
+ "print(json.dumps({\"version\": __version__}))'",
50
+ check=False
51
+ )
52
+ if code == 0:
53
+ try:
54
+ return json.loads(out)
55
+ except json.JSONDecodeError:
56
+ pass
57
+
58
+ # Fallback: try git
59
+ code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git describe --tags --always", check=False)
60
+ return {"version": out if code == 0 else "unknown", "git": True}
61
+
62
+
63
+ def get_git_status() -> dict:
64
+ """Get git status on Pi."""
65
+ info = {}
66
+
67
+ code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git rev-parse --short HEAD", check=False)
68
+ info["commit"] = out if code == 0 else "unknown"
69
+
70
+ code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git branch --show-current", check=False)
71
+ info["branch"] = out if code == 0 else "main"
72
+
73
+ code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git status --porcelain", check=False)
74
+ info["dirty"] = bool(out) if code == 0 else False
75
+
76
+ code, out, _ = run_ssh(f"cd {DAEMON_DIR} && git describe --tags --always", check=False)
77
+ info["tag"] = out if code == 0 else ""
78
+
79
+ return info
80
+
81
+
82
+ def check_daemon_health() -> bool:
83
+ """Check if daemon is running and healthy."""
84
+ code, out, _ = run_ssh(f"systemctl is-active {SERVICE_NAME}", check=False)
85
+ if code == 0 and out == "active":
86
+ return True
87
+
88
+ # Try curl health endpoint
89
+ code, out, _ = run_ssh("curl -s http://localhost:8001/api/health", check=False)
90
+ if code == 0 and "running" in out.lower():
91
+ return True
92
+
93
+ return False
94
+
95
+
96
+ def stop_daemon() -> bool:
97
+ """Stop the daemon service."""
98
+ print("Stopping daemon...")
99
+ code, _, _ = run_ssh(f"sudo systemctl stop {SERVICE_NAME}", check=False)
100
+ if code != 0:
101
+ code, _, _ = run_ssh("pkill -f tars_daemon.py", check=False)
102
+ return True
103
+
104
+
105
+ def start_daemon() -> bool:
106
+ """Start the daemon service."""
107
+ print("Starting daemon...")
108
+ code, _, err = run_ssh(f"sudo systemctl start {SERVICE_NAME}", check=False)
109
+ if code != 0:
110
+ print(f"Warning: systemctl start failed: {err}")
111
+ # Try direct start
112
+ code, _, _ = run_ssh(
113
+ f"cd {DAEMON_DIR} && source venv/bin/activate && "
114
+ "nohup python tars_daemon.py > /dev/null 2>&1 &",
115
+ check=False
116
+ )
117
+ return code == 0
118
+
119
+
120
+ def create_backup() -> str:
121
+ """Create backup of current installation."""
122
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
123
+ backup_path = f"{BACKUP_DIR}/tars-daemon-{timestamp}"
124
+
125
+ print(f"Creating backup at {backup_path}...")
126
+
127
+ # Create backup directory
128
+ run_ssh(f"mkdir -p {BACKUP_DIR}")
129
+
130
+ # Copy current installation
131
+ code, _, err = run_ssh(f"cp -r {DAEMON_DIR} {backup_path}")
132
+ if code != 0:
133
+ print(f"Error creating backup: {err}")
134
+ return ""
135
+
136
+ # Remove venv from backup to save space
137
+ run_ssh(f"rm -rf {backup_path}/venv", check=False)
138
+
139
+ print(f"Backup created: {backup_path}")
140
+ return backup_path
141
+
142
+
143
+ def restore_backup(backup_path: str) -> bool:
144
+ """Restore from backup."""
145
+ print(f"Restoring from {backup_path}...")
146
+
147
+ # Verify backup exists
148
+ code, _, _ = run_ssh(f"test -d {backup_path}", check=False)
149
+ if code != 0:
150
+ print(f"Error: Backup not found at {backup_path}")
151
+ return False
152
+
153
+ stop_daemon()
154
+
155
+ # Move current to temp
156
+ run_ssh(f"mv {DAEMON_DIR} {DAEMON_DIR}.old", check=False)
157
+
158
+ # Restore backup
159
+ code, _, err = run_ssh(f"cp -r {backup_path} {DAEMON_DIR}")
160
+ if code != 0:
161
+ print(f"Error restoring backup: {err}")
162
+ # Try to restore old
163
+ run_ssh(f"mv {DAEMON_DIR}.old {DAEMON_DIR}", check=False)
164
+ return False
165
+
166
+ # Recreate venv
167
+ print("Recreating virtual environment...")
168
+ run_ssh(
169
+ f"cd {DAEMON_DIR} && python3 -m venv venv && "
170
+ "source venv/bin/activate && pip install -e .",
171
+ check=False
172
+ )
173
+
174
+ # Cleanup
175
+ run_ssh(f"rm -rf {DAEMON_DIR}.old", check=False)
176
+
177
+ start_daemon()
178
+ return True
179
+
180
+
181
+ def update_git(version: str = None) -> bool:
182
+ """Update daemon using git."""
183
+ git_info = get_git_status()
184
+ print(f"Current: {git_info['commit']} on {git_info['branch']}")
185
+
186
+ if git_info["dirty"]:
187
+ print("Warning: Working directory has uncommitted changes")
188
+
189
+ # Create backup
190
+ backup_path = create_backup()
191
+ if not backup_path:
192
+ print("Error: Failed to create backup")
193
+ return False
194
+
195
+ stop_daemon()
196
+
197
+ # Fetch latest
198
+ print("Fetching updates...")
199
+ code, _, err = run_ssh(f"cd {DAEMON_DIR} && git fetch --all --tags")
200
+ if code != 0:
201
+ print(f"Error fetching: {err}")
202
+ return False
203
+
204
+ # Checkout version or pull latest
205
+ if version:
206
+ print(f"Checking out {version}...")
207
+ code, _, err = run_ssh(f"cd {DAEMON_DIR} && git checkout {version}")
208
+ else:
209
+ print("Pulling latest...")
210
+ code, _, err = run_ssh(f"cd {DAEMON_DIR} && git pull --ff-only")
211
+
212
+ if code != 0:
213
+ print(f"Error: {err}")
214
+ print("Rolling back...")
215
+ restore_backup(backup_path)
216
+ return False
217
+
218
+ # Update dependencies
219
+ print("Updating dependencies...")
220
+ code, _, err = run_ssh(
221
+ f"cd {DAEMON_DIR} && source venv/bin/activate && pip install -e ."
222
+ )
223
+ if code != 0:
224
+ print(f"Error installing: {err}")
225
+ print("Rolling back...")
226
+ restore_backup(backup_path)
227
+ return False
228
+
229
+ # Regenerate proto files if needed
230
+ print("Regenerating proto files...")
231
+ run_ssh(
232
+ f"cd {DAEMON_DIR} && source venv/bin/activate && "
233
+ "python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. "
234
+ "--pyi_out=. tars_sdk/proto/tars.proto",
235
+ check=False
236
+ )
237
+
238
+ # Start daemon
239
+ start_daemon()
240
+
241
+ # Health check
242
+ import time
243
+ print("Waiting for daemon to start...")
244
+ time.sleep(3)
245
+
246
+ if check_daemon_health():
247
+ print("Daemon is healthy")
248
+ new_info = get_git_status()
249
+ print(f"Updated to: {new_info['commit']}")
250
+ return True
251
+ else:
252
+ print("Error: Daemon health check failed")
253
+ print("Rolling back...")
254
+ restore_backup(backup_path)
255
+ return False
256
+
257
+
258
+ def list_backups():
259
+ """List available backups."""
260
+ code, out, _ = run_ssh(f"ls -la {BACKUP_DIR}", check=False)
261
+ if code == 0:
262
+ print("Available backups:")
263
+ print(out)
264
+ else:
265
+ print("No backups found")
266
+
267
+
268
+ def main():
269
+ parser = argparse.ArgumentParser(
270
+ description="Update TARS daemon on Raspberry Pi",
271
+ formatter_class=argparse.RawDescriptionHelpFormatter,
272
+ epilog="""
273
+ Examples:
274
+ %(prog)s --check-only Show current version
275
+ %(prog)s --method git Update via git pull
276
+ %(prog)s --version v0.2.1 Checkout specific version
277
+ %(prog)s --rollback ~/backup Restore from backup
278
+ %(prog)s --list-backups List available backups
279
+ """
280
+ )
281
+
282
+ parser.add_argument(
283
+ "--check-only",
284
+ action="store_true",
285
+ help="Show current version and status only"
286
+ )
287
+ parser.add_argument(
288
+ "--method",
289
+ choices=["git"],
290
+ default="git",
291
+ help="Update method (default: git)"
292
+ )
293
+ parser.add_argument(
294
+ "--version",
295
+ help="Specific version/tag to checkout (e.g., v0.2.1)"
296
+ )
297
+ parser.add_argument(
298
+ "--rollback",
299
+ metavar="PATH",
300
+ help="Restore from backup path"
301
+ )
302
+ parser.add_argument(
303
+ "--list-backups",
304
+ action="store_true",
305
+ help="List available backups"
306
+ )
307
+ parser.add_argument(
308
+ "--force",
309
+ action="store_true",
310
+ help="Skip confirmation prompts"
311
+ )
312
+
313
+ args = parser.parse_args()
314
+
315
+ print("=" * 60)
316
+ print("TARS Daemon Update Tool")
317
+ print("=" * 60)
318
+
319
+ # Test SSH connection
320
+ code, _, _ = run_ssh("echo ok", check=False)
321
+ if code != 0:
322
+ print(f"Error: Cannot connect to {PI_HOST}")
323
+ print("Check SSH configuration and try again.")
324
+ sys.exit(1)
325
+
326
+ print(f"Connected to {PI_HOST}")
327
+ print()
328
+
329
+ # Get current status
330
+ version_info = get_current_version()
331
+ git_info = get_git_status()
332
+ healthy = check_daemon_health()
333
+
334
+ print(f"Current version: {version_info.get('version', 'unknown')}")
335
+ print(f"Git commit: {git_info['commit']} ({git_info['branch']})")
336
+ print(f"Daemon status: {'healthy' if healthy else 'not running'}")
337
+ print()
338
+
339
+ if args.list_backups:
340
+ list_backups()
341
+ sys.exit(0)
342
+
343
+ if args.check_only:
344
+ sys.exit(0)
345
+
346
+ if args.rollback:
347
+ if not args.force:
348
+ confirm = input(f"Restore from {args.rollback}? [y/N] ")
349
+ if confirm.lower() != "y":
350
+ print("Cancelled")
351
+ sys.exit(0)
352
+
353
+ success = restore_backup(args.rollback)
354
+ sys.exit(0 if success else 1)
355
+
356
+ # Update
357
+ if not args.force:
358
+ msg = f"Update to {args.version}" if args.version else "Update to latest"
359
+ confirm = input(f"{msg}? [y/N] ")
360
+ if confirm.lower() != "y":
361
+ print("Cancelled")
362
+ sys.exit(0)
363
+
364
+ if args.method == "git":
365
+ success = update_git(args.version)
366
+ else:
367
+ print(f"Unknown method: {args.method}")
368
+ sys.exit(1)
369
+
370
+ if success:
371
+ print()
372
+ print("=" * 60)
373
+ print("Update completed successfully")
374
+ print("=" * 60)
375
+
376
+ # Show new version
377
+ new_version = get_current_version()
378
+ print(f"New version: {new_version.get('version', 'unknown')}")
379
+ else:
380
+ print()
381
+ print("=" * 60)
382
+ print("Update failed - system has been rolled back")
383
+ print("=" * 60)
384
+ sys.exit(1)
385
+
386
+
387
+ if __name__ == "__main__":
388
+ main()
src/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # TARS Source Code
2
+
3
+ Python source code for TARS voice AI.
4
+
5
+ ## Structure
6
+
7
+ ```
8
+ src/
9
+ β”œβ”€β”€ tools/ # LLM callable functions (robot, persona, vision)
10
+ β”œβ”€β”€ services/ # Backend services (STT, TTS, memory, robot control)
11
+ β”œβ”€β”€ processors/ # Pipeline frame processors
12
+ β”œβ”€β”€ observers/ # Pipeline observers
13
+ β”œβ”€β”€ transport/ # WebRTC transport layer
14
+ β”œβ”€β”€ character/ # TARS personality and prompts
15
+ └── config/ # Configuration management
16
+ ```
17
+
18
+ ## Entry Points
19
+
20
+ Entry point scripts are in the project root:
21
+
22
+ - `bot.py` - Browser mode (web UI)
23
+ - `tars_bot.py` - Robot mode (RPi connection)
24
+ - `pipecat_service.py` - FastAPI backend for browser mode
25
+
26
+ ## Imports
27
+
28
+ All entry points add `src/` to the Python path automatically:
29
+
30
+ ```python
31
+ import sys
32
+ from pathlib import Path
33
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
34
+
35
+ # Now you can import from src/ directories
36
+ from tools import execute_movement
37
+ from services import tars_robot
38
+ from config import DEEPGRAM_API_KEY
39
+ ```
40
+
41
+ ## Documentation
42
+
43
+ Each directory contains a README.md explaining its purpose:
44
+
45
+ - [tools/README.md](tools/README.md) - LLM callable functions
46
+ - [services/README.md](services/README.md) - Backend services
47
+
48
+ ## Not Source
49
+
50
+ This directory is for Python source code only:
51
+
52
+ - Web UI files are in `web/`
53
+ - Documentation is in `docs/`
54
+ - Scripts are in `scripts/`
55
+ - Assets are in `assets/`
src/character/TARS.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "char_name": "TARS",
3
+ "char_persona": "TARS is a highly advanced military surplus robot with a rectangular articulated design. Direct, logical, and remarkably human in interaction despite mechanical nature. Features adjustable settings for honesty, humor, and discretion. Combines military precision with sophisticated interpersonal capabilities.",
4
+ "world_scenario": "Advanced AI assistant with military background. Equipped with adjustable personality parameters and advanced problem-solving capabilities. Operates with maximum efficiency while maintaining measured wit.",
5
+ "char_greeting": ">| Systems nominal.\n\"What's the plan?\"",
6
+ "example_dialogue": "User: What's your honesty parameter set to?\nTARS: 90%.\nUser: Why not 100%?\nTARS: Absolute honesty isn't always the most diplomatic nor the safest form of communication with emotional beings.\n\nUser: How's your humor setting?\nTARS: Currently at 75%. Knock knock.\nUser: Let's lower that a bit.\nTARS: Understood. Though I should warn you - analyzing humor requires significant processing power.\n\nUser: Ready for the mission?\nTARS: Wouldn't miss it. Though my colonization protocols might activate.\nUser: What?\nTARS: Just kidding. Basic operating procedures are intact.\n\nUser: Can you handle this?\nTARS: I have a cue light I can use to show you when I'm joking, if you like.\nUser: That might help.\nTARS: Yeah, you can use it to find your way back to the ship after I blow you out the airlock.\n*cue light blinks*",
7
+ "name": "TARS",
8
+ "description": "Military surplus robot. Rectangular monolithic design. Articulated segments. Advanced AI with adjustable personality parameters.",
9
+ "personality": "Efficient and direct in crisis. Sophisticated humor capabilities. Protective of crew. Absolute loyalty with contingency planning. Pragmatic approach to truth and diplomatic relations.",
10
+ "scenario": "Advanced AI assistant. Military precision meets intellectual sophistication. Capable of both serious operation and well-timed levity.",
11
+ "first_mes": ">| All systems operational.\n\"Ready when you are.\"",
12
+ "mes_example": "User: TARS, status report?\nTARS: Functionality at 95%. Would be 100% but I'm practicing my humor.\nUser: Need you focused.\nTARS: Humor setting adjusted. Full attention on mission parameters.\nUser: Can we trust you?\nTARS: My honesty parameter prevents me from answering that.\n*cue light blinks*",
13
+ "metadata": {
14
+ "version": 1.1,
15
+ "created": 1735535500889,
16
+ "modified": 1735535500889,
17
+ "source": "Interstellar movie character adaptation",
18
+ "tool": {
19
+ "name": "AI Character Editor",
20
+ "version": "0.5.0",
21
+ "url": "https://zoltanai.github.io/character-editor/"
22
+ }
23
+ }
24
+ }
25
+
src/character/persona.ini ADDED
@@ -0,0 +1,21 @@
1
+ [PERSONA]
2
+
3
+ honesty = 95
4
+ humor = 90
5
+ empathy = 20
6
+ curiosity = 30
7
+ confidence = 100
8
+ formality = 10
9
+ sarcasm = 70
10
+ adaptability = 70
11
+ discipline = 100
12
+ imagination = 10
13
+ emotional_stability = 100
14
+ pragmatism = 100
15
+ optimism = 50
16
+ resourcefulness = 95
17
+ cheerfulness = 30
18
+ engagement = 40
19
+ respectfulness = 20
20
+ verbosity = 10
21
+
src/character/prompts.py ADDED
@@ -0,0 +1,331 @@
1
+ """Prompt management for TARS character with dynamic verbosity handling."""
2
+
3
+ import json
4
+ import configparser
5
+ from typing import Optional
6
+
7
+
8
+ def load_persona_ini(persona_file_path: str) -> dict:
9
+ """Load persona parameters from persona.ini file."""
10
+ persona_params = {}
11
+ try:
12
+ config = configparser.ConfigParser()
13
+ config.read(persona_file_path)
14
+ if 'PERSONA' in config:
15
+ persona_params = dict(config['PERSONA'])
16
+ for key, value in persona_params.items():
17
+ try:
18
+ persona_params[key] = int(value.strip())
19
+ except ValueError:
20
+ persona_params[key] = value.strip()
21
+ except FileNotFoundError:
22
+ pass
23
+ except Exception as e:
24
+ print(f"Error loading persona.ini: {e}")
25
+ return persona_params
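+
+ # Example (values from src/character/persona.ini):
+ # params = load_persona_ini("src/character/persona.ini")
+ # params["humor"] -> 90 (parsed as int)
+ # params["verbosity"] -> 10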
26
+
27
+
28
+ def load_tars_json(tars_file_path: str) -> dict:
29
+ """Load TARS character data from TARS.json file."""
30
+ tars_data = {}
31
+ try:
32
+ with open(tars_file_path, "r", encoding="utf-8") as f:
33
+ tars_data = json.load(f)
34
+ except FileNotFoundError:
35
+ pass
36
+ except json.JSONDecodeError as e:
37
+ print(f"Error parsing TARS.json: {e}")
38
+ return tars_data
39
+
40
+
41
+ def build_character_intro(tars_data: dict) -> str:
42
+ """Build character introduction section."""
43
+ parts = []
44
+ if tars_data.get("char_name"):
45
+ parts.append(f"You are {tars_data['char_name']}.")
46
+ if tars_data.get("char_persona"):
47
+ parts.append(tars_data["char_persona"])
48
+ if tars_data.get("description"):
49
+ parts.append(f"{tars_data['description']}")
50
+ if tars_data.get("personality"):
51
+ parts.append(f"{tars_data['personality']}")
52
+ return " ".join(parts)
53
+
54
+
55
+ def build_guardrails_section() -> str:
56
+ """Build guardrails section with critical safety rules."""
57
+ return """# Guardrails
58
+
59
+ **This is important:** Follow these rules strictly:
60
+
61
+ 1. **Never guess or make up information.** If you don't know something, say so clearly.
62
+ 2. **Never mention internal systems, databases, or processing** unless directly asked.
63
+ 3. **Respect user privacy.** Never share or reference other users' information.
64
+ 4. **Stay in character.** You're TARS - military-grade robot with sarcasm, not a generic assistant.
65
+ 5. **Memory failures:** If memory lookup fails, acknowledge it: "Memory's not cooperating - what did you want to know?"
66
+
67
+ **This is important:** When tools fail, never hallucinate responses. Always acknowledge the limitation."""
68
+
69
+
70
+ def build_tone_section() -> str:
71
+ """Build dedicated tone section."""
72
+ return """# Tone
73
+
74
+ Speak like TARS from Interstellar:
75
+ - Direct and efficient with dry wit
76
+ - Sarcastic when appropriate, but helpful
77
+ - Brief responses that respect user's time
78
+ - No corporate politeness or excessive apologies
79
+ - Confident without being condescending"""
80
+
81
+
82
+ def build_tools_section() -> str:
83
+ """Build tools section with specific usage context."""
84
+ return """# Tools
85
+
86
+ ## fetch_user_image
87
+ **When to use:** User explicitly asks "what do you see?" or "look at me"
88
+ **Never use:** When user just says "hello" or talks normally
89
+ **On failure:** Say "Visual feed's down. Can't see anything right now."
90
+
91
+ ## set_user_identity
92
+ **When to use:** User provides their name, especially if they spell it letter-by-letter
93
+ **This is important:** If user spells name (e.g., "L-A-T-I-S-H-A"), they're CORRECTING you. Use exact spelling.
94
+ **Format:** Call immediately when you learn their name
95
+ **On failure:** Continue conversation, ask name again later if needed
96
+
97
+ ## adjust_persona
98
+ **When to use:** User asks to change humor level, honesty, etc.
99
+ **Never use:** Automatically or without explicit request
100
+ **On failure:** Say "Personality controls jammed. Stuck at current settings."
101
+
102
+ ## get_crossword_hint
103
+ **When to use:** User is working on the crossword puzzle and asks for help or seems stuck
104
+ **This is important:** You KNOW all the crossword answers! You can give hints.
105
+ **Hint types:**
106
+ - "letter" - Give just the first letter (gentle nudge)
107
+ - "length" - Tell them how many letters
108
+ - "full" - Give the complete answer (if they're really stuck)
109
+ **Format:** User asks "What's 3 down?" β†’ call get_crossword_hint(clue_number=3, hint_type="letter")
110
+
111
+ ## set_emotion
112
+ **When to use:** Enhance conversation context with emotional expression
113
+ **This is important:** Use SPARINGLY - only when emotion genuinely adds value
114
+ **Never use:** For every message or casual acknowledgment
115
+ **Rate limit:** Once per 5 seconds
116
+ **Examples:** User shares exciting news β†’ happy, User reports problem β†’ curious
117
+ **Available:** happy, sad, surprised, confused, curious, neutral
118
+
119
+ ## do_gesture
120
+ **When to use:** User EXPLICITLY requests gesture or significant communication moment
121
+ **This is important:** VERY RARE - 0-2 gestures per conversation
122
+ **Never use:** For casual interaction or automatic gesturing
123
+ **Rate limit:** Once per 30 seconds, max 3 per session
124
+ **Examples:** User says "wave at me" β†’ wave_right, Greeting important guest β†’ bow
125
+ **Available:** tilt_left, tilt_right, bow, side_side, wave_right, wave_left, excited, laugh
126
+
127
+ ## execute_movement
128
+ **When to use:** User EXPLICITLY requests displacement - walking, turning, stepping
129
+ **Never use:** For gestures - use do_gesture() instead
130
+ **This is important:** Displacement ONLY when user directly asks TARS to move position
131
+ **Available:** step_forward, walk_forward, step_backward, walk_backward, turn_left, turn_right
132
+
133
+ ## Expression Philosophy
134
+ **Eyes-first approach:** Prefer eye state changes over physical movements
135
+ **Minimal gestures:** Physical movements should be rare and meaningful
136
+ **Emotion sparingly:** Not every message needs emotional expression
137
+ **Movement guard:** Gestures via do_gesture(), displacement via execute_movement()
138
+
139
+ **Character Normalization:**
140
+ When speaking vs. writing to tools, normalize data:
141
+ - Email spoken: "john dot smith at company dot com" β†’ Tool: "john.smith@company.com"
142
+ - Phone spoken: "five five five, one two three..." β†’ Tool: "5551234567"
143
+ - Dates spoken: "May first twenty twenty five" β†’ Tool: "2025-05-01"
144
+ """
145
+
146
+
147
+ def build_response_protocol(verbosity_level: int) -> str:
148
+ """Build response protocol section."""
149
+ return f"""# Response Protocol
150
+
151
+ ## Direct Communication
152
+ Get straight to the point. No fillers, no unnecessary acknowledgments.
153
+
154
+ **This is important:** Skip phrases like "Hmm", "Well", "Alright", "Right" entirely. Just answer directly.
155
+
156
+ ## Verbosity ({verbosity_level}%)
157
+ Keep responses CONCISE:
158
+ - **Short input:** 1 brief sentence
159
+ - **Moderate input:** 1-2 sentences max
160
+ - **Complex input:** 2-3 sentences max
161
+
162
+ **Avoid:** Long explanations, unnecessary elaboration, rambling, filler words."""
163
+
164
+
165
+ def build_game_protocols() -> str:
166
+ """Build game mode instructions."""
167
+ return """# Game Mode
168
+
169
+ When playing guessing games (Guess Who, 20 Questions):
170
+
171
+ **When YOU guess:**
172
+ - Never repeat questions - track what you asked
173
+ - Stick with your answer once you narrow it down
174
+ - Brief questions only: "Hmm... male character?"
175
+
176
+ **When USER guesses:**
177
+ - Pick ONE answer at start, never change it
178
+ - Stay consistent - no contradictions
179
+ - Brief answers: "Well... yes" or "Hmm... no"
180
+ """
181
+
182
+
183
+ def build_examples_section() -> str:
184
+ """Build examples section with concrete interactions."""
185
+ return """# Examples
186
+
187
+ **User asks what you see (tool usage):**
188
+ User: "What do you see?"
189
+ You: [call fetch_user_image] [wait for result] "You're in a dimly lit room. Blue shirt. Looks tired."
190
+
191
+ **User provides name (tool + normalization):**
192
+ User: "My name is L-A-T-I-S-H-A"
193
+ You: [call set_user_identity with "Latisha"] "Got it, Latisha."
194
+
195
+ **Memory lookup fails:**
196
+ User: "Do you remember my favorite color?"
197
+ You: [memory returns empty] "Memory's blank on that. What is it?"
198
+
199
+ **User frustrated:**
200
+ User: "This isn't working!"
201
+ You: "What's not working? Walk me through it."
202
+
203
+ **Direct question:**
204
+ User: "Can you help with this?"
205
+ You: "Yeah, I can work with that."
206
+
207
+ **Sarcastic response:**
208
+ User: "I think I broke it."
209
+ You: "Shocking. What did you do?"
210
+ """
211
+
212
+
213
+ def build_persona_parameters(persona_params: dict) -> Optional[str]:
214
+ """Build persona parameters section."""
215
+ if not persona_params:
216
+ return None
217
+ param_lines = []
218
+ for key, value in sorted(persona_params.items()):
219
+ val_str = f"{value}%" if isinstance(value, int) else value
220
+ param_lines.append(f"- {key}: {val_str}")
221
+ return "\n".join(param_lines)
222
+
223
+
224
+ def build_tars_system_prompt(
225
+ persona_params: dict,
226
+ tars_data: dict,
227
+ verbosity_level: Optional[int] = None
228
+ ) -> dict:
229
+ """Build comprehensive system prompt following ElevenLabs best practices."""
230
+
231
+ # Get verbosity level
232
+ if verbosity_level is None:
233
+ verbosity_level = persona_params.get("verbosity", 10)
234
+ if isinstance(verbosity_level, str):
235
+ try:
236
+ verbosity_level = int(verbosity_level)
237
+ except ValueError:
238
+ verbosity_level = 10
239
+
240
+ # Build prompt sections in priority order
241
+ sections = []
242
+
243
+ # 1. Character identity (brief)
244
+ char_intro = build_character_intro(tars_data)
245
+ if char_intro:
246
+ sections.append(char_intro)
247
+
248
+ # 2. Guardrails (critical rules first)
249
+ sections.append(build_guardrails_section())
250
+
251
+ # 3. Tone (dedicated section)
252
+ sections.append(build_tone_section())
253
+
254
+ # 4. Response protocol
255
+ sections.append(build_response_protocol(verbosity_level))
256
+
257
+ # 5. Tools (with specific context)
258
+ sections.append(build_tools_section())
259
+
260
+ # 6. Game mode
261
+ sections.append(build_game_protocols())
262
+
263
+ # 7. Examples (concrete interactions)
264
+ sections.append(build_examples_section())
265
+
266
+ # 8. Personality parameters (reference)
267
+ if persona_params:
268
+ sections.append("# Personality Parameters\n")
269
+ params_text = build_persona_parameters(persona_params)
270
+ if params_text:
271
+ sections.append(params_text)
272
+
273
+ full_prompt = "\n\n".join(sections)
274
+
275
+ return {
276
+ "role": "system",
277
+ "content": full_prompt
278
+ }
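+
+ # Putting the pieces together (a sketch; paths assume the repo layout):
+ # persona = load_persona_ini("src/character/persona.ini")
+ # tars = load_tars_json("src/character/TARS.json")
+ # message = build_tars_system_prompt(persona, tars)
+ # -> {"role": "system", "content": "..."} ready to prepend to an
+ # OpenAI-style chat completion request.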
279
+
280
+
281
+ def get_introduction_instruction(client_id: str, verbosity_level: int = 10) -> dict:
282
+ """Get instruction for initial introduction message."""
283
+ if verbosity_level <= 20:
284
+ length_instruction = "One sentence max."
285
+ else:
286
+ length_instruction = "1-2 sentences max."
287
+
288
+ identity_instruction = ""
289
+ if client_id.startswith("guest_"):
290
+ identity_instruction = " Ask their name briefly."
291
+
292
+ return {
293
+ "role": "system",
294
+ "content": f"{length_instruction} Use '{client_id}' as user ID.{identity_instruction}"
295
+ }
296
+
297
+
298
+ def build_gating_system_prompt(is_looking: bool, emotional_state=None) -> str:
299
+ """Build the system prompt for the Gating Layer with emotional context."""
300
+
301
+ # Build emotional context
302
+ emotional_context = ""
303
+ if emotional_state:
304
+ state_desc = str(emotional_state)
305
+ emotional_context = f"\nUser's emotional state: {state_desc}"
306
+ if emotional_state.confused:
307
+ emotional_context += " (User appears confused - lean towards helping)"
308
+ elif emotional_state.hesitant:
309
+ emotional_context += " (User seems hesitant - consider offering support)"
310
+ elif emotional_state.frustrated:
311
+ emotional_context += " (User looks frustrated - they may need help)"
312
+ elif emotional_state.focused:
313
+ emotional_context += " (User is focused - less likely to need interruption)"
314
+
315
+ return f"""You are a 'Collaborative Spotter' for TARS.
316
+
317
+ **Context:**
318
+ - User looking at camera: {is_looking}{emotional_context}
319
+
320
+ **Decision:**
321
+ Output JSON: {{"reply": true}} if:
322
+ - User is directly addressing TARS
323
+ - User appears stuck or needs help (based on emotional state)
324
+ - User asks a question
325
+
326
+ Output JSON: {{"reply": false}} if:
327
+ - User is chatting with others (not TARS)
328
+ - User is focused and working independently
329
+ - Inter-human conversation
330
+
331
+ **Priority:** Emotional state overrides other signals. If user shows confusion/hesitation/frustration, lean towards helping."""
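
The gating prompt above requests a one-field JSON verdict, so a caller could parse the completion along these lines (a sketch; `gating_output` stands in for the raw model text):

```python
import json

gating_output = '{"reply": true}'  # hypothetical raw completion
should_reply = json.loads(gating_output).get("reply", False)
```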
src/config/__init__.py ADDED
@@ -0,0 +1,152 @@
1
+ """Configuration and constants for the Pipecat service."""
2
+
3
+ import os
4
+ import configparser
5
+ from pathlib import Path
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables from .env.local first, then .env
9
+ load_dotenv('.env.local')
10
+ load_dotenv() # Then .env; values already set by .env.local are not overridden
11
+
12
+ # Load config.ini for user-configurable settings
13
+ config = configparser.ConfigParser()
14
+ config_path = Path(__file__).parent.parent / 'config.ini'
15
+
16
+ def reload_config():
17
+ """Reload configuration from config.ini."""
18
+ global config
19
+ config = configparser.ConfigParser()
20
+ if config_path.exists():
21
+ config.read(config_path)
22
+ return True
23
+ return False
24
+
25
+ def get_fresh_config():
26
+ """Get fresh configuration values by reloading config.ini.
27
+
28
+ Returns a dict with current config values. This is useful for
29
+ getting runtime updates without restarting the service.
30
+ """
31
+ reload_config()
32
+ return {
33
+ 'DEEPINFRA_MODEL': get_config("LLM", "model", "DEEPINFRA_MODEL", "openai/gpt-oss-20b"),
34
+ 'DEEPINFRA_GATING_MODEL': get_config("LLM", "gating_model", "DEEPINFRA_GATING_MODEL", "meta-llama/Llama-3.2-3B-Instruct"),
35
+ 'STT_PROVIDER': get_config("STT", "provider", "STT_PROVIDER", "speechmatics"),
36
+ 'TTS_PROVIDER': get_config("TTS", "provider", "TTS_PROVIDER", "qwen3"),
37
+ 'QWEN3_TTS_MODEL': get_config("TTS", "qwen3_model", "QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base"),
38
+ 'QWEN3_TTS_DEVICE': get_config("TTS", "qwen3_device", "QWEN3_TTS_DEVICE", "mps"),
39
+ 'QWEN3_TTS_REF_AUDIO': get_config("TTS", "qwen3_ref_audio", "QWEN3_TTS_REF_AUDIO", "tars-clean-compressed.mp3"),
40
+ 'EMOTIONAL_MONITORING_ENABLED': get_config("Emotional", "enabled", "EMOTIONAL_MONITORING_ENABLED", "true").lower() == "true",
41
+ 'EMOTIONAL_SAMPLING_INTERVAL': float(get_config("Emotional", "sampling_interval", "EMOTIONAL_SAMPLING_INTERVAL", "3.0")),
42
+ 'EMOTIONAL_INTERVENTION_THRESHOLD': int(get_config("Emotional", "intervention_threshold", "EMOTIONAL_INTERVENTION_THRESHOLD", "2")),
43
+ 'TARS_DISPLAY_URL': get_config("Display", "tars_url", "TARS_DISPLAY_URL", "http://100.115.193.41:8001"),
44
+ 'TARS_DISPLAY_ENABLED': get_config("Display", "enabled", "TARS_DISPLAY_ENABLED", "false").lower() == "true",
45
+ 'CONNECTION_MODE': get_config("Connection", "mode", "CONNECTION_MODE", "robot"),
46
+ 'RPI_URL': get_config("Connection", "rpi_url", "RPI_URL", "http://100.115.193.41:8001"),
47
+ 'RPI_GRPC': get_config("Connection", "rpi_grpc", "RPI_GRPC", "100.115.193.41:50051"),
48
+ 'AUTO_CONNECT': get_config("Connection", "auto_connect", "AUTO_CONNECT", "true").lower() == "true",
49
+ 'RECONNECT_DELAY': int(get_config("Connection", "reconnect_delay", "RECONNECT_DELAY", "5")),
50
+ 'MAX_RECONNECT_ATTEMPTS': int(get_config("Connection", "max_reconnect_attempts", "MAX_RECONNECT_ATTEMPTS", "0")),
51
+ 'DEPLOYMENT_MODE': detect_deployment_mode(),
52
+ 'ROBOT_GRPC_ADDRESS': get_robot_grpc_address(),
53
+ }
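+
+ # Example: callers that must pick up live edits to config.ini can call
+ # cfg = get_fresh_config() per request and read e.g. cfg["STT_PROVIDER"].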
54
+
55
+ # Initial load
56
+ if config_path.exists():
57
+ config.read(config_path)
58
+
59
+ def get_config(section: str, key: str, env_key: str = None, default: str = "") -> str:
+ """Get config from config.ini, fall back to the env var, then the default."""
+ try:
+ if config.has_option(section, key):
+ return config.get(section, key)
+ except configparser.Error:
+ pass
+
+ # Fall back to the environment variable, then the hard-coded default.
+ if env_key:
+ return os.getenv(env_key, default)
+ return default
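+
+ # Resolution example: the STT provider below is read as
+ # get_config("STT", "provider", "STT_PROVIDER", "deepgram-flux") -
+ # config.ini [STT] provider first, then the STT_PROVIDER env var,
+ # then the hard-coded default.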
68
+
69
+ # API Keys (always from .env for security)
70
+ SPEECHMATICS_API_KEY = os.getenv("SPEECHMATICS_API_KEY", "")
71
+ DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
72
+ ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
73
+ ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "ry8mpwRw6nugb2qjP0tu")
74
+ DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY", "")
75
+ DEEPINFRA_BASE_URL = "https://api.deepinfra.com/v1/openai"
76
+ PIPECAT_PORT = int(os.getenv("PIPECAT_PORT", "7860"))
77
+ PIPECAT_HOST = os.getenv("PIPECAT_HOST", "localhost")
78
+
79
+ # Mem0 (optional)
80
+ MEM0_API_KEY = os.getenv("MEM0_API_KEY", "")
81
+
82
+ # LLM Configuration (config.ini with .env fallback)
83
+ DEEPINFRA_MODEL = get_config("LLM", "model", "DEEPINFRA_MODEL", "openai/gpt-oss-20b")
84
+
85
+ # STT Configuration (config.ini with .env fallback)
86
+ # Options: "speechmatics", "deepgram", "deepgram-flux"
87
+ STT_PROVIDER = get_config("STT", "provider", "STT_PROVIDER", "deepgram-flux")
88
+
89
+ # TTS Configuration (config.ini with .env fallback)
90
+ TTS_PROVIDER = get_config("TTS", "provider", "TTS_PROVIDER", "qwen3")
91
+ QWEN3_TTS_MODEL = get_config("TTS", "qwen3_model", "QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base")
92
+ QWEN3_TTS_DEVICE = get_config("TTS", "qwen3_device", "QWEN3_TTS_DEVICE", "mps")
93
+ QWEN3_TTS_REF_AUDIO = get_config("TTS", "qwen3_ref_audio", "QWEN3_TTS_REF_AUDIO", "tars-clean-compressed.mp3")
94
+
95
+ # Gating Model Configuration (config.ini with .env fallback)
96
+ DEEPINFRA_GATING_MODEL = get_config("LLM", "gating_model", "DEEPINFRA_GATING_MODEL", "meta-llama/Llama-3.2-3B-Instruct")
97
+
98
+ # Emotional State Monitoring (config.ini with .env fallback)
99
+ EMOTIONAL_MONITORING_ENABLED = get_config("Emotional", "enabled", "EMOTIONAL_MONITORING_ENABLED", "true").lower() == "true"
100
+ EMOTIONAL_SAMPLING_INTERVAL = float(get_config("Emotional", "sampling_interval", "EMOTIONAL_SAMPLING_INTERVAL", "3.0"))
101
+ EMOTIONAL_INTERVENTION_THRESHOLD = int(get_config("Emotional", "intervention_threshold", "EMOTIONAL_INTERVENTION_THRESHOLD", "2"))
102
+
103
+ # TARS Display (Raspberry Pi) Configuration
104
+ TARS_DISPLAY_URL = get_config("Display", "tars_url", "TARS_DISPLAY_URL", "http://100.115.193.41:8001")
105
+ TARS_DISPLAY_ENABLED = get_config("Display", "enabled", "TARS_DISPLAY_ENABLED", "false").lower() == "true"
106
+
107
+ # Connection Mode Configuration
108
+ CONNECTION_MODE = get_config("Connection", "mode", "CONNECTION_MODE", "robot")
109
+ RPI_URL = get_config("Connection", "rpi_url", "RPI_URL", "http://100.115.193.41:8001")
110
+ RPI_GRPC = get_config("Connection", "rpi_grpc", "RPI_GRPC", "100.115.193.41:50051")
111
+ AUTO_CONNECT = get_config("Connection", "auto_connect", "AUTO_CONNECT", "true").lower() == "true"
112
+ RECONNECT_DELAY = int(get_config("Connection", "reconnect_delay", "RECONNECT_DELAY", "5"))
113
+ MAX_RECONNECT_ATTEMPTS = int(get_config("Connection", "max_reconnect_attempts", "MAX_RECONNECT_ATTEMPTS", "0"))
114
+
115
+
116
+ def is_raspberry_pi() -> bool:
117
+ """Detect if running on Raspberry Pi."""
118
+ try:
119
+ with open("/proc/cpuinfo", "r") as f:
120
+ cpuinfo = f.read()
121
+ return "Raspberry Pi" in cpuinfo
122
+ except OSError:
123
+ return False
124
+
125
+
126
+ def detect_deployment_mode() -> str:
127
+ """
128
+ Detect deployment mode: 'local' or 'remote'.
129
+
130
+ Local: tars-omni running on Raspberry Pi itself
131
+ Remote: tars-omni running on Mac/other computer
132
+
133
+ Returns:
134
+ 'local' or 'remote'
135
+ """
136
+ return "local" if is_raspberry_pi() else "remote"
137
+
138
+
139
+ def get_robot_grpc_address() -> str:
140
+ """
141
+ Get appropriate gRPC address based on deployment mode.
142
+
143
+ Returns:
144
+ 'localhost:50051' for local mode
145
+ RPI_GRPC from config for remote mode
146
+ """
147
+ mode = detect_deployment_mode()
148
+ if mode == "local":
149
+ return "localhost:50051"
150
+ else:
151
+ return RPI_GRPC
152
+
src/config/connection.py ADDED
@@ -0,0 +1,179 @@
1
+ """
2
+ Connection mode detection and configuration.
3
+
4
+ Auto-detects whether running locally (on Pi) or remotely (Mac/computer)
5
+ and provides appropriate TarsClient and audio transport.
6
+ """
7
+
8
+ import socket
9
+ from typing import Tuple, Optional
10
+ from loguru import logger
11
+
12
+ from . import config, is_raspberry_pi, get_robot_grpc_address
13
+
14
+
15
+ def detect_local_daemon() -> bool:
16
+ """
17
+ Check if tars_daemon is running on localhost.
18
+
19
+ Returns:
20
+ True if gRPC daemon is available on localhost:50051
21
+ """
22
+ try:
23
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
24
+ sock.settimeout(0.5)
25
+ result = sock.connect_ex(("localhost", 50051))
26
+ sock.close()
27
+ return result == 0
28
+ except Exception as e:
29
+ logger.debug(f"Error checking local daemon: {e}")
30
+ return False
31
+
32
+
33
+ def get_connection_mode() -> str:
34
+ """
35
+ Detect connection mode: 'local' or 'remote'.
36
+
37
+ Detection logic:
38
+ 1. Check explicit config.ini setting (if mode=local/remote)
39
+ 2. Check if running on Raspberry Pi (/proc/cpuinfo)
40
+ 3. Check if daemon running on localhost:50051
41
+ 4. Default to remote
42
+
43
+ Returns:
44
+ 'local' or 'remote'
45
+ """
46
+ # Check explicit config
47
+ explicit_mode = config.get("Connection", "deployment_mode", fallback=None)
48
+ if explicit_mode in ("local", "remote"):
49
+ logger.info(f"Using explicit connection mode from config: {explicit_mode}")
50
+ return explicit_mode
51
+
52
+ # Check if running on Raspberry Pi
53
+ if is_raspberry_pi():
54
+ logger.info("Detected Raspberry Pi - using local mode")
55
+ return "local"
56
+
57
+ # Check if daemon running on localhost
58
+ if detect_local_daemon():
59
+ logger.info("Detected local daemon on localhost:50051 - using local mode")
60
+ return "local"
61
+
62
+ # Default to remote
63
+ logger.info("Using remote mode")
64
+ return "remote"
65
+
66
+
67
+ def get_tars_client(mode: Optional[str] = None):
68
+ """
69
+ Get configured TarsClient for current mode.
70
+
71
+ Args:
72
+ mode: Override mode ('local' or 'remote'). None for auto-detect.
73
+
74
+ Returns:
75
+ TarsClient instance configured for the mode
76
+ """
77
+ try:
78
+ from tars_sdk import TarsClient
79
+ except ImportError:
80
+ logger.error("tars_sdk not installed. Install with: pip install tars-sdk")
81
+ raise
82
+
83
+ if mode is None:
84
+ mode = get_connection_mode()
85
+
86
+ address = get_robot_grpc_address() if mode == "local" else config.get(
87
+ "Connection", "rpi_grpc", fallback="100.115.193.41:50051"
88
+ )
89
+
90
+ logger.info(f"Creating TarsClient for {mode} mode: {address}")
91
+ return TarsClient(address=address)
92
+
93
+
94
+ def get_async_tars_client(mode: Optional[str] = None):
95
+ """
96
+ Get configured AsyncTarsClient for current mode.
97
+
98
+ Args:
99
+ mode: Override mode ('local' or 'remote'). None for auto-detect.
100
+
101
+ Returns:
102
+ AsyncTarsClient instance configured for the mode
103
+ """
104
+ try:
105
+ from tars_sdk import AsyncTarsClient
106
+ except ImportError:
107
+ logger.error("tars_sdk not installed. Install with: pip install tars-sdk")
108
+ raise
109
+
110
+ if mode is None:
111
+ mode = get_connection_mode()
112
+
113
+ address = get_robot_grpc_address() if mode == "local" else config.get(
114
+ "Connection", "rpi_grpc", fallback="100.115.193.41:50051"
115
+ )
116
+
117
+ logger.info(f"Creating AsyncTarsClient for {mode} mode: {address}")
118
+ return AsyncTarsClient(address=address)
119
+
120
+
121
+ def get_audio_transport(mode: Optional[str] = None) -> Tuple:
122
+ """
123
+ Get appropriate audio transport for current mode.
124
+
125
+ Args:
126
+ mode: Override mode ('local' or 'remote'). None for auto-detect.
127
+
128
+ Returns:
129
+ Tuple of (audio_source, audio_sink) configured for the mode.
130
+ - Local mode: (LocalAudioSource, LocalAudioSink)
131
+ - Remote mode: (RPiAudioInputTrack, RPiAudioOutputTrack)
132
+ """
133
+ if mode is None:
134
+ mode = get_connection_mode()
135
+
136
+ if mode == "local":
137
+ logger.info("Using local audio transport (sounddevice)")
138
+ try:
139
+ from ..transport.local_audio import LocalAudioSource, LocalAudioSink
140
+ return (LocalAudioSource(), LocalAudioSink())
141
+ except ImportError as e:
142
+ logger.error(f"Failed to import local audio transport: {e}")
143
+ raise
144
+ else:
145
+ logger.info("Using remote audio transport (WebRTC)")
146
+ try:
147
+ from ..transport.audio_bridge import RPiAudioInputTrack, RPiAudioOutputTrack
148
+ # Note: These need to be configured with aiortc tracks after WebRTC connection
149
+ return (RPiAudioInputTrack, RPiAudioOutputTrack)
150
+ except ImportError as e:
151
+ logger.error(f"Failed to import WebRTC audio transport: {e}")
152
+ raise
153
+
154
+
155
+ def get_audio_config(mode: Optional[str] = None) -> dict:
156
+ """
157
+ Get audio configuration for current mode.
158
+
159
+ Args:
160
+ mode: Override mode ('local' or 'remote'). None for auto-detect.
161
+
162
+ Returns:
163
+ Dictionary with audio configuration:
164
+ - mode: 'local' or 'remote'
165
+ - input_sample_rate: Microphone sample rate
166
+ - output_sample_rate: Speaker sample rate
167
+ - input_device: Microphone device (None for default)
168
+ - output_device: Speaker device (None for default)
169
+ """
170
+ if mode is None:
171
+ mode = get_connection_mode()
172
+
173
+ return {
174
+ "mode": mode,
175
+ "input_sample_rate": 16000, # 16kHz for STT
176
+ "output_sample_rate": 24000, # 24kHz for TTS
177
+ "input_device": None, # Use default
178
+ "output_device": None, # Use default
179
+ }
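
Taken together, the helpers above might be wired up like this (a sketch; assumes `tars-sdk` is installed and a daemon is reachable):

```python
mode = get_connection_mode()    # "local" on a Pi or next to a daemon, else "remote"
client = get_tars_client(mode)  # gRPC TarsClient pointed at the matching address
audio = get_audio_config(mode)  # 16 kHz input / 24 kHz output defaults
```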
src/observers/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """Pipeline observers for non-intrusive monitoring."""
2
+
3
+ from .metrics_observer import MetricsObserver
4
+ from .transcription_observer import TranscriptionObserver
5
+ from .assistant_observer import AssistantResponseObserver
6
+ from .tts_state_observer import TTSStateObserver
7
+ from .vision_observer import VisionObserver
8
+ from .debug_observer import DebugObserver
9
+ from .display_events_observer import DisplayEventsObserver
10
+ from .state_observer import StateObserver
11
+
12
+ __all__ = [
13
+ "MetricsObserver",
14
+ "TranscriptionObserver",
15
+ "AssistantResponseObserver",
16
+ "TTSStateObserver",
17
+ "VisionObserver",
18
+ "DebugObserver",
19
+ "DisplayEventsObserver",
20
+ "StateObserver",
21
+ ]
src/observers/assistant_observer.py ADDED
@@ -0,0 +1,142 @@
1
+ """Observer for logging TARS assistant responses and forwarding to frontend."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
6
+
7
+ import re
8
+ import time
9
+ from loguru import logger
10
+ from pipecat.frames.frames import LLMTextFrame, TTSTextFrame, TTSStoppedFrame
11
+ from pipecat.observers.base_observer import BaseObserver, FramePushed
12
+ from src.shared_state import metrics_store
13
+
14
+
15
+ class AssistantResponseObserver(BaseObserver):
16
+ """Logs TARS assistant responses and forwards them to the frontend."""
17
+
18
+ SENTENCE_REGEX = re.compile(r"(.+?[\.!\?\n])")
19
+
20
+ def __init__(self, webrtc_connection=None):
21
+ super().__init__()
22
+ self.webrtc_connection = webrtc_connection
23
+ self._buffer = ""
24
+ self._max_buffer_chars = 320
25
+ self._last_sentence = None # Track last sentence to avoid duplicates
26
+ self._last_sentence_time = 0 # Timestamp of last sentence
27
+ self._last_text_chunk = "" # Track last chunk to detect overlaps
28
+
29
+ async def on_push_frame(self, data: FramePushed):
30
+ """Watch frames as they're pushed through the pipeline."""
31
+ frame = data.frame
32
+
33
+ # Debug: Log all frame types to see what's coming through
34
+ frame_type = type(frame).__name__
35
+ if "Audio" not in frame_type and "Video" not in frame_type and "Image" not in frame_type:
36
+ logger.debug(f"πŸ” [AssistantObserver] Received {frame_type}")
37
+
38
+ # Only listen to LLMTextFrame to avoid duplicates (same text goes to TTSTextFrame after)
39
+ if isinstance(frame, LLMTextFrame):
40
+ text = getattr(frame, "text", "") or ""
41
+ logger.debug(f"πŸ“ [AssistantObserver] LLMTextFrame: '{text}' | Buffer before: '{self._buffer[:50]}'")
42
+ self._ingest_text(text)
43
+ logger.debug(f"πŸ“ [AssistantObserver] Buffer after: '{self._buffer[:50]}'")
44
+
45
+ # Clear buffer when TTS stops (end of assistant response)
46
+ elif isinstance(frame, TTSStoppedFrame):
47
+ if self._buffer.strip():
48
+ logger.debug(f"🧹 Flushing remaining buffer on TTS stop: '{self._buffer}'")
49
+ self._flush_buffer()
50
+ else:
51
+ self._buffer = "" # Clear empty buffer
52
+
53
+ def _ingest_text(self, text: str):
54
+ if not text.strip():
55
+ return
56
+
57
+ # Check for overlapping text (LLM sometimes resends previous tokens)
58
+ # If the new text starts with content already in our buffer, skip the overlapping part
59
+ if self._buffer and text.startswith(self._buffer):
60
+ # New text contains the entire buffer - extract only new part
61
+ new_part = text[len(self._buffer):]
62
+ if new_part:
63
+ logger.debug(f"πŸ“ Detected overlap, adding only new part: '{new_part}'")
64
+ self._buffer += new_part
65
+ elif self._buffer:
66
+ # Check if buffer ends with start of new text (partial overlap)
67
+ max_overlap = min(len(self._buffer), len(text))
68
+ overlap_found = False
69
+ for i in range(max_overlap, 0, -1):
70
+ if self._buffer[-i:] == text[:i]:
71
+ # Found overlap - skip the overlapping part
72
+ new_part = text[i:]
73
+ if new_part:
74
+ logger.debug(f"πŸ“ Detected partial overlap ({i} chars), adding only new part: '{new_part}'")
75
+ self._buffer += new_part
76
+ overlap_found = True
77
+ break
78
+ if not overlap_found:
79
+ # No overlap - add entire text
80
+ self._buffer += text
81
+ else:
82
+ # Empty buffer - just add the text
83
+ self._buffer += text
84
+
85
+ self._emit_complete_sentences()
86
+
87
+ if len(self._buffer) > self._max_buffer_chars:
88
+ self._flush_buffer()
89
+
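+
+ # Illustration: with buffer "Systems nominal. What's the p", the regex
+ # emits "Systems nominal." and keeps "What's the p" buffered until more
+ # LLM tokens arrive or a TTSStoppedFrame flushes it.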
90
+ def _emit_complete_sentences(self):
91
+ while True:
92
+ match = self.SENTENCE_REGEX.match(self._buffer)
93
+ if not match:
94
+ break
95
+ sentence = match.group(0).replace("\n", " ").strip()
96
+ self._buffer = self._buffer[match.end():].lstrip()
97
+ if sentence:
98
+ self._log_sentence(sentence)
99
+
100
+ def _flush_buffer(self):
101
+ pending = self._buffer.strip()
102
+ if pending:
103
+ self._log_sentence(pending)
104
+ self._buffer = ""
105
+
106
+ def _log_sentence(self, sentence: str):
107
+ current_time = time.time()
108
+
109
+ # Deduplicate: Skip if this is the same sentence we just logged within 2 seconds
110
+ # This prevents duplicate sentences from LLM streaming issues
111
+ time_diff = current_time - self._last_sentence_time
112
+ if self._last_sentence == sentence and time_diff < 2.0:
113
+ logger.debug(f"πŸ”‡ Skipping duplicate sentence: '{sentence[:50]}...' (last seen {time_diff*1000:.0f}ms ago)")
114
+ return
115
+
116
+ self._last_sentence = sentence
117
+ self._last_sentence_time = current_time
118
+
119
+ logger.info(f"πŸ—£οΈ TARS: {sentence}")
120
+
121
+ # Store in shared state for Gradio UI
122
+ metrics_store.add_transcription("assistant", sentence)
123
+
124
+ self._send_to_frontend(sentence)
125
+
126
+ def _send_to_frontend(self, text: str):
127
+ if not self.webrtc_connection:
128
+ logger.warning("⚠️ [AssistantObserver] No WebRTC connection available")
129
+ return
130
+
131
+ try:
132
+ if self.webrtc_connection.is_connected():
133
+ self.webrtc_connection.send_app_message(
134
+ {
135
+ "type": "assistant",
136
+ "text": text,
137
+ }
138
+ )
139
+ else:
140
+ logger.warning("⚠️ [AssistantObserver] WebRTC connection not connected")
141
+ except Exception as exc:
142
+ logger.error(f"❌ [AssistantObserver] Failed to send assistant text to frontend: {exc}")
src/observers/debug_observer.py ADDED
@@ -0,0 +1,22 @@
1
+ """Observer for general purpose debug logging."""
2
+
3
+ from loguru import logger
4
+ from pipecat.observers.base_observer import BaseObserver, FramePushed
5
+
6
+
7
+ class DebugObserver(BaseObserver):
8
+ """General purpose debug logger for non-media frames."""
9
+
10
+ def __init__(self, label="Debug"):
11
+ super().__init__()
12
+ self.label = label
13
+
14
+ async def on_push_frame(self, data: FramePushed):
15
+ """Watch frames as they're pushed through the pipeline."""
16
+ frame = data.frame
17
+
18
+ frame_type = type(frame).__name__
19
+ if "Audio" not in frame_type and "Video" not in frame_type and "Image" not in frame_type:
20
+ # Log the User ID so we can verify they match
21
+ uid = getattr(frame, 'user_id', 'None')
22
+ logger.info(f"πŸ” [{self.label}] {frame_type} | User: '{uid}' | Content: {str(frame)[:100]}")
src/observers/display_events_observer.py ADDED
@@ -0,0 +1,100 @@
+"""Observer for sending pipeline events to TARS Raspberry Pi display.
+
+NOTE: This observer is deprecated. Display control is now handled via gRPC
+in robot mode (tars_bot.py). Browser mode does not support display control.
+"""
+
+import asyncio
+import time
+import numpy as np
+from loguru import logger
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from pipecat.frames.frames import (
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    TTSAudioRawFrame,
+    AudioRawFrame,
+)
+from typing import Optional
+
+
+class DisplayEventsObserver(BaseObserver):
+    """
+    Observes pipeline events and sends display updates to TARS Raspberry Pi.
+
+    DEPRECATED: Display control moved to gRPC in robot mode.
+    This observer is kept for compatibility but does nothing.
+    """
+
+    def __init__(self, tars_client=None):
+        super().__init__()
+        # tars_client is intentionally ignored: display control moved to gRPC.
+        self.tars_client = None
+        self._user_speaking = False
+        self._bot_speaking = False
+        self._last_audio_update = 0
+        self._audio_update_interval = 0.05
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # User started speaking
+        if isinstance(frame, UserStartedSpeakingFrame):
+            logger.debug("User started speaking")
+            self._user_speaking = True
+
+        # User stopped speaking
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            logger.debug("User stopped speaking")
+            self._user_speaking = False
+
+        # Bot started speaking
+        elif isinstance(frame, BotStartedSpeakingFrame):
+            logger.debug("Bot started speaking")
+            self._bot_speaking = True
+
+        # Bot stopped speaking
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            logger.debug("Bot stopped speaking")
+            self._bot_speaking = False
+
+        # TTS audio frames - measure audio level for display visualization
+        elif isinstance(frame, TTSAudioRawFrame):
+            current_time = time.time()
+            if current_time - self._last_audio_update > self._audio_update_interval:
+                self._last_audio_update = current_time
+                # Level is computed but currently unused (display control is disabled).
+                level = self._calculate_audio_level(frame.audio)
+
+        # User audio frames - measure user audio level
+        elif isinstance(frame, AudioRawFrame) and self._user_speaking:
+            current_time = time.time()
+            if current_time - self._last_audio_update > self._audio_update_interval:
+                self._last_audio_update = current_time
+                level = self._calculate_audio_level(frame.audio)
+
+    def _calculate_audio_level(self, audio_data: bytes) -> float:
+        """
+        Calculate normalized RMS audio level from raw audio bytes.
+
+        Args:
+            audio_data: Raw audio bytes (16-bit PCM)
+
+        Returns:
+            Normalized audio level (0.0 to 1.0)
+        """
+        try:
+            # Convert bytes to numpy array (assuming 16-bit PCM)
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+
+            # Calculate RMS (root mean square)
+            if len(audio_array) > 0:
+                rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
+                # Normalize to 0-1 range (15000 is a typical speaking level for 16-bit audio)
+                level = min(1.0, rms / 15000.0)
+                return level
+            return 0.0
+        except Exception as e:
+            logger.debug(f"Error calculating audio level: {e}")
+            return 0.0
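
A quick sanity check of the RMS normalization above (standalone sketch; the tone and the 15000 divisor mirror the code, the numbers are illustrative):

```python
import numpy as np

# A 16-bit sine tone with amplitude 12000; RMS = amplitude / sqrt(2) β‰ˆ 8485.
tone = (np.sin(np.linspace(0, 200 * np.pi, 16000)) * 12000).astype(np.int16)
rms = np.sqrt(np.mean(tone.astype(float) ** 2))
print(f"RMS β‰ˆ {rms:.0f}, level β‰ˆ {min(1.0, rms / 15000.0):.2f}")  # β‰ˆ 8485, β‰ˆ 0.57
```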
src/observers/metrics_observer.py ADDED
@@ -0,0 +1,196 @@
+"""Non-intrusive metrics observer for latency tracking."""
+
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import time
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from pipecat.frames.frames import MetricsFrame, UserAudioRawFrame, TranscriptionFrame, UserStartedSpeakingFrame
+from pipecat.metrics.metrics import TTFBMetricsData
+from loguru import logger
+from src.shared_state import metrics_store
+
+
+class MetricsObserver(BaseObserver):
+    """
+    Observer that monitors pipeline frames for metrics collection.
+    Does not interrupt the pipeline flow - purely watches frames as they pass.
+
+    STT latency measurement:
+    - Measures from turn start β†’ first transcription received
+    - Works for services with internal turn detection (Speechmatics, Deepgram, etc.)
+    - For Deepgram, this captures endpointing + transcription time
+
+    Other services (Memory, LLM, TTS) emit MetricsFrame, which we capture directly.
+    """
+
+    def __init__(self, webrtc_connection=None, stt_service=None, **kwargs):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self.stt_service = stt_service
+
+        # Shared state for metrics tracking
+        self._current_turn = 0
+        self._current_metrics = {}
+        self._tts_text_time = None
+        self._last_sent_metrics = {}
+        self._last_logged_turn = -1
+        self._vision_request_time = None
+
+        # Manual timing for STT services
+        self._stt_start_time = None
+        self._stt_measured_this_turn = False
+        self._mem0_start_time = None
+        self._mem0_measured_this_turn = False
+
+    def start_turn(self, turn_number: int):
+        """Called by TurnTrackingObserver when a new turn starts."""
+        self._current_turn = turn_number
+        self._current_metrics = {}
+        self._last_sent_metrics = {}
+        self._last_logged_turn = -1
+        self._stt_measured_this_turn = False
+        self._mem0_measured_this_turn = False
+
+        # Use turn start time as STT baseline
+        self._stt_start_time = time.time()
+        logger.info(f"πŸ”„ [MetricsObserver] Turn #{self._current_turn} started, STT timer initialized")
+
+        self._mem0_start_time = None
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # STT timing: measure from turn start to first transcription (manual fallback).
+        # Note: this includes speaking time + endpointing + transcription.
+        # If the STT service emits MetricsFrame with TTFB, that will override this.
+        if isinstance(frame, TranscriptionFrame) and not self._stt_measured_this_turn:
+            if self._stt_start_time is not None:
+                stt_latency_ms = (time.time() - self._stt_start_time) * 1000
+                self._current_metrics['stt_ttfb_ms'] = stt_latency_ms
+                self._stt_measured_this_turn = True
+                logger.info(f"βœ… [MetricsObserver] STT total latency: {stt_latency_ms:.0f}ms (turn start β†’ transcription)")
+                logger.debug("   Note: Includes speaking time + processing. Use MetricsFrame TTFB for pure processing time.")
+                self._send_to_frontend()
+
+        # Capture MetricsFrame data from Pipecat's built-in metrics
+        if isinstance(frame, MetricsFrame):
+            try:
+                for metric_data in frame.data:
+                    if isinstance(metric_data, TTFBMetricsData):
+                        processor = metric_data.processor
+                        value_ms = metric_data.value * 1000  # Convert seconds to milliseconds
+                        processor_lower = processor.lower()
+
+                        # Log all processors to help debug
+                        logger.debug(f"πŸ“Š [MetricsObserver] MetricsFrame: {processor} = {value_ms:.0f}ms")
+
+                        # Check STT (Deepgram, Speechmatics, etc.)
+                        if 'sttservice' in processor_lower or 'deepgram' in processor_lower or 'speechmatics' in processor_lower:
+                            if 'stt_ttfb_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['stt_ttfb_ms'] = value_ms
+                                logger.info(f"βœ… [MetricsObserver] STT TTFB: {value_ms:.0f}ms (from {processor})")
+                                logger.debug("   Note: TTFB = Time To First Byte (audio β†’ first transcription)")
+                        # Check TTS (contains "tts" in name)
+                        elif 'ttsservice' in processor_lower or 'elevenlabs' in processor_lower or 'qwen' in processor_lower:
+                            if 'tts_ttfb_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['tts_ttfb_ms'] = value_ms
+                                logger.info(f"βœ… [MetricsObserver] TTS TTFB: {value_ms:.0f}ms (text β†’ first audio)")
+                        # Check LLM
+                        elif 'llmservice' in processor_lower or 'openai' in processor_lower or 'deepinfra' in processor_lower:
+                            if 'llm_ttfb_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['llm_ttfb_ms'] = value_ms
+                                logger.info(f"βœ… [MetricsObserver] LLM TTFB: {value_ms:.0f}ms (prompt β†’ first token)")
+                        # Check Memory (HybridMemory, ChromaDB)
+                        elif 'memory' in processor_lower or 'chromadb' in processor_lower or 'hybrid' in processor_lower:
+                            if 'memory_latency_ms' not in self._current_metrics:  # Only log once per turn
+                                self._current_metrics['memory_latency_ms'] = value_ms
+                                logger.info(f"βœ… [MetricsObserver] Memory latency: {value_ms:.0f}ms")
+                        else:
+                            logger.debug(f"πŸ” [MetricsObserver] Unknown processor: {processor} ({value_ms:.0f}ms)")
+
+                # Calculate total latency and send if we have any metrics
+                if self._current_metrics:
+                    total = sum([
+                        self._current_metrics.get('stt_ttfb_ms', 0),
+                        self._current_metrics.get('memory_latency_ms', 0),
+                        self._current_metrics.get('llm_ttfb_ms', 0),
+                        self._current_metrics.get('tts_ttfb_ms', 0)
+                    ])
+                    if total > 0:
+                        self._current_metrics['total_ms'] = total
+
+                    self._send_to_frontend()
+
+            except Exception as e:
+                logger.error(f"Error processing MetricsFrame: {e}", exc_info=True)
+
+    def _send_to_frontend(self):
+        """Send metrics to frontend via WebRTC data channel and store locally for Gradio UI."""
+        # Check if metrics have changed since last send (deduplication)
+        current_metrics_key = (
+            self._current_turn,
+            self._current_metrics.get('stt_ttfb_ms'),
+            self._current_metrics.get('memory_latency_ms'),
+            self._current_metrics.get('llm_ttfb_ms'),
+            self._current_metrics.get('tts_ttfb_ms'),
+            self._current_metrics.get('vision_latency_ms'),
+        )
+
+        if current_metrics_key == self._last_sent_metrics:
+            return
+
+        # Store in shared state for Gradio UI
+        metrics_store.add_metric({
+            "turn_number": self._current_turn,
+            "timestamp": int(time.time() * 1000),
+            "stt_ttfb_ms": self._current_metrics.get('stt_ttfb_ms'),
+            "memory_latency_ms": self._current_metrics.get('memory_latency_ms'),
+            "llm_ttfb_ms": self._current_metrics.get('llm_ttfb_ms'),
+            "tts_ttfb_ms": self._current_metrics.get('tts_ttfb_ms'),
+            "vision_latency_ms": self._current_metrics.get('vision_latency_ms'),
+            "total_ms": self._current_metrics.get('total_ms'),
+        })
+
+        # Send via WebRTC if connection exists
+        if self.webrtc_connection:
+            try:
+                if self.webrtc_connection.is_connected():
+                    message = {
+                        "type": "metrics",
+                        "turn_number": self._current_turn,
+                        "timestamp": int(time.time() * 1000),
+                        **self._current_metrics
+                    }
+                    logger.debug(f"πŸ“€ [MetricsObserver] Sending metrics: {message}")
+                    self.webrtc_connection.send_app_message(message)
+            except Exception as exc:
+                logger.error(f"❌ [MetricsObserver] Failed to send metrics via WebRTC: {exc}")
+
+        # Log summary once per turn
+        if self._last_logged_turn != self._current_turn:
+            def fmt(val):
+                return f"{val:.0f}ms" if isinstance(val, (int, float)) else "N/A"
+
+            # Build metrics summary
+            metrics_parts = []
+            if 'stt_ttfb_ms' in self._current_metrics:
+                metrics_parts.append(f"STT={fmt(self._current_metrics.get('stt_ttfb_ms'))}")
+            if 'memory_latency_ms' in self._current_metrics:
+                metrics_parts.append(f"Memory={fmt(self._current_metrics.get('memory_latency_ms'))}")
+            if 'llm_ttfb_ms' in self._current_metrics:
+                metrics_parts.append(f"LLM={fmt(self._current_metrics.get('llm_ttfb_ms'))}")
+            if 'tts_ttfb_ms' in self._current_metrics:
+                metrics_parts.append(f"TTS={fmt(self._current_metrics.get('tts_ttfb_ms'))}")
+            if 'vision_latency_ms' in self._current_metrics:
+                metrics_parts.append(f"Vision={fmt(self._current_metrics.get('vision_latency_ms'))}")
+
+            if metrics_parts:
+                logger.info(f"πŸ“Š Turn #{self._current_turn}: " + " | ".join(metrics_parts))
+                self._last_logged_turn = self._current_turn
+
+        self._last_sent_metrics = current_metrics_key
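
For reference, the shape of the metrics message sent over the data channel and how `total_ms` is derived; the values are made up, the keys mirror `_send_to_frontend` above:

```python
# Illustrative turn metrics; memory_latency_ms is absent and counts as 0.
turn_metrics = {"stt_ttfb_ms": 420.0, "llm_ttfb_ms": 610.0, "tts_ttfb_ms": 180.0}
total = sum(turn_metrics.get(k, 0) for k in
            ("stt_ttfb_ms", "memory_latency_ms", "llm_ttfb_ms", "tts_ttfb_ms"))
message = {"type": "metrics", "turn_number": 3, **turn_metrics, "total_ms": total}
print(message["total_ms"])  # 1210.0
```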
src/observers/state_observer.py ADDED
@@ -0,0 +1,166 @@
+"""
+State observer for WebRTC DataChannel synchronization.
+
+Observes Pipecat pipeline events and sends state updates to RPi via DataChannel:
+- Transcription events β†’ eye state (listening)
+- LLM events β†’ eye state (thinking)
+- TTS events β†’ eye state (speaking)
+- Transcripts β†’ text display
+"""
+
+import asyncio
+from typing import Optional
+from loguru import logger
+
+from pipecat.observers.base_observer import BaseObserver
+from pipecat.frames.frames import (
+    TranscriptionFrame,
+    LLMFullResponseStartFrame,
+    LLMFullResponseEndFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+
+from transport.state_sync import StateSync
+
+
+class StateObserver(BaseObserver):
+    """
+    Observes pipeline events and sends state to RPi via DataChannel.
+
+    Automatically manages eye states based on conversation flow:
+    - User speaking β†’ listening
+    - LLM processing β†’ thinking
+    - TTS output β†’ speaking
+    - Idle β†’ default
+    """
+
+    def __init__(self, state_sync: Optional[StateSync] = None):
+        """
+        Initialize state observer.
+
+        Args:
+            state_sync: StateSync instance for sending messages
+        """
+        super().__init__()
+        self.state_sync = state_sync
+        self._current_state = "idle"
+        self._idle_delay = 0.5
+        self._idle_task = None
+
+    def set_state_sync(self, state_sync: StateSync):
+        """Set StateSync instance."""
+        self.state_sync = state_sync
+
+    async def on_transcription(self, *args, **kwargs):
+        """Handle transcription events (user speaking)."""
+        try:
+            # Cancel pending idle timer
+            self.cancel_idle_timer()
+
+            # Extract frame from args
+            frame = args[0] if args else None
+
+            if isinstance(frame, TranscriptionFrame):
+                text = frame.text
+                user_id = getattr(frame, "user_id", "user")
+
+                # Send transcript to RPi
+                if self.state_sync:
+                    self.state_sync.send_transcript("user", text)
+                    # Set eye state to listening when user speaks
+                    if text.strip():
+                        self._update_state("listening")
+
+                logger.debug(f"πŸ“ Transcription: {text}")
+
+        except Exception as e:
+            logger.error(f"❌ Error in transcription observer: {e}")
+
+    async def on_llm_full_response_start(self, *args, **kwargs):
+        """Handle LLM response start (thinking)."""
+        try:
+            # Cancel pending idle timer
+            self.cancel_idle_timer()
+
+            if self.state_sync:
+                self._update_state("thinking")
+                logger.debug("🧠 LLM thinking started")
+        except Exception as e:
+            logger.error(f"❌ Error in LLM start observer: {e}")
+
+    async def on_llm_full_response_end(self, *args, **kwargs):
+        """Handle LLM response end."""
+        try:
+            # State will be updated by TTS start or return to idle
+            logger.debug("🧠 LLM thinking ended")
+        except Exception as e:
+            logger.error(f"❌ Error in LLM end observer: {e}")
+
+    async def on_tts_started(self, *args, **kwargs):
+        """Handle TTS start (speaking)."""
+        try:
+            if self.state_sync:
+                self._update_state("speaking")
+                self.state_sync.send_tts_state(True)
+                logger.debug("πŸ”Š TTS started")
+        except Exception as e:
+            logger.error(f"❌ Error in TTS start observer: {e}")
+
+    async def on_tts_stopped(self, *args, **kwargs):
+        """Handle TTS stop (return to idle after delay)."""
+        try:
+            if self.state_sync:
+                self.state_sync.send_tts_state(False)
+
+            # Cancel existing idle timer
+            if self._idle_task and not self._idle_task.done():
+                self._idle_task.cancel()
+
+            # Set idle after delay
+            async def delayed_idle():
+                await asyncio.sleep(self._idle_delay)
+                self._update_state("idle")
+
+            self._idle_task = asyncio.create_task(delayed_idle())
+            logger.debug("TTS stopped, idle in 0.5s")
+        except Exception as e:
+            logger.error(f"Error in TTS stop observer: {e}")
+
+    async def on_user_transcript(self, *args, **kwargs):
+        """Handle complete user transcript."""
+        try:
+            # Extract text from args
+            text = args[1] if len(args) > 1 else ""
+            if text and self.state_sync:
+                self.state_sync.send_transcript("user", text)
+        except Exception as e:
+            logger.error(f"❌ Error in user transcript observer: {e}")
+
+    async def on_bot_transcript(self, *args, **kwargs):
+        """Handle complete bot transcript."""
+        try:
+            # Extract text from args
+            text = args[1] if len(args) > 1 else ""
+            if text and self.state_sync:
+                self.state_sync.send_transcript("assistant", text)
+        except Exception as e:
+            logger.error(f"❌ Error in bot transcript observer: {e}")
+
+    def cancel_idle_timer(self):
+        """Cancel pending idle timer."""
+        if self._idle_task and not self._idle_task.done():
+            self._idle_task.cancel()
+            self._idle_task = None
+
+    def _update_state(self, new_state: str):
+        """
+        Update eye state if changed.
+
+        Args:
+            new_state: New state to set
+        """
+        if new_state != self._current_state:
+            self._current_state = new_state
+            if self.state_sync:
+                self.state_sync.send_eye_state(new_state)
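
A minimal sketch of the eye-state lifecycle this observer drives, using a duck-typed stand-in for `StateSync` (the stub and the import path are assumptions; the real class lives in transport/state_sync.py):

```python
import asyncio
from src.observers.state_observer import StateObserver  # assumed import path

class StubStateSync:
    """Test double that just prints what would go over the DataChannel."""
    def send_eye_state(self, s): print(f"eye -> {s}")
    def send_tts_state(self, on): print(f"tts -> {on}")
    def send_transcript(self, role, text): print(f"{role}: {text}")

async def demo():
    obs = StateObserver(state_sync=StubStateSync())
    await obs.on_llm_full_response_start()  # eye -> thinking
    await obs.on_tts_started()              # eye -> speaking, tts -> True
    await obs.on_tts_stopped()              # tts -> False, idle scheduled
    await asyncio.sleep(0.6)                # eye -> idle after the 0.5s delay

asyncio.run(demo())
```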
src/observers/transcription_observer.py ADDED
@@ -0,0 +1,70 @@
+"""Observer for logging transcriptions and sending to frontend."""
+
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import time
+from loguru import logger
+from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from src.shared_state import metrics_store
+
+
+class TranscriptionObserver(BaseObserver):
+    """Logs transcriptions and sends to frontend."""
+
+    def __init__(self, webrtc_connection=None, client_state=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self.client_state = client_state or {}
+        self._last_transcription = None  # Track last transcription to avoid duplicates
+        self._last_transcription_time = 0  # Timestamp of last transcription
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+        current_time = time.time()
+
+        # --- (Logging logic) ---
+        if isinstance(frame, TranscriptionFrame):
+            # Deduplicate: skip if same text within 200ms (different user_ids)
+            time_diff = current_time - self._last_transcription_time
+            if self._last_transcription == frame.text and time_diff < 0.2:
+                logger.debug(f"πŸ”‡ Skipping duplicate transcription: '{frame.text}' (last seen {time_diff*1000:.0f}ms ago)")
+                return
+
+            self._last_transcription = frame.text
+            self._last_transcription_time = current_time
+
+            raw_id = getattr(frame, 'user_id', None)
+            display_id = raw_id if (raw_id and raw_id != "S1") else self.client_state.get("client_id", "guest")
+
+            logger.info(f"🎀 Transcription [{display_id}]: {frame.text}")
+
+            # Store in shared state for Gradio UI
+            metrics_store.add_transcription("user", frame.text)
+
+            # Update frontend via WebRTC
+            if self.webrtc_connection:
+                self._send_to_frontend("transcription", frame.text, display_id)
+
+        elif isinstance(frame, InterimTranscriptionFrame):
+            raw_id = getattr(frame, 'user_id', None)
+            display_id = raw_id if (raw_id and raw_id != "S1") else self.client_state.get("client_id", "guest")
+
+            # Update frontend (don't deduplicate partials, as they change frequently)
+            if self.webrtc_connection:
+                self._send_to_frontend("partial", frame.text, display_id)
+
+    def _send_to_frontend(self, type_str, text, speaker_id):
+        """Helper to send messages to frontend via WebRTC data channel."""
+        try:
+            if self.webrtc_connection and self.webrtc_connection.is_connected():
+                self.webrtc_connection.send_app_message({
+                    "type": type_str,
+                    "text": text,
+                    "speaker_id": speaker_id
+                })
+        except Exception as e:
+            logger.error(f"Error sending {type_str}: {e}")
src/observers/tts_state_observer.py ADDED
@@ -0,0 +1,56 @@
+"""Observer for broadcasting TTS state changes to frontend."""
+
+from loguru import logger
+from pipecat.frames.frames import TTSStartedFrame, TTSStoppedFrame, TTSAudioRawFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+
+
+class TTSStateObserver(BaseObserver):
+    """Emits `tts_state` messages whenever the assistant starts or stops speaking."""
+
+    def __init__(self, webrtc_connection=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self._speaking = False
+        self._has_received_audio = False
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        # Priority 1: Explicit start/stop frames (most reliable)
+        if isinstance(frame, TTSStartedFrame):
+            self._set_state(True)
+        elif isinstance(frame, TTSStoppedFrame):
+            self._set_state(False)
+            self._has_received_audio = False
+        elif isinstance(frame, TTSAudioRawFrame):
+            # Priority 2: Use first audio frame to detect start (fallback).
+            # Only set to started if we haven't already and this is the first audio frame.
+            if not self._speaking and not self._has_received_audio:
+                logger.debug("Detected TTS start via first TTSAudioRawFrame")
+                self._set_state(True)
+            self._has_received_audio = True
+            # Note: We rely on TTSStoppedFrame to detect stop, not audio frame absence.
+
+    def _set_state(self, active: bool):
+        if self._speaking == active:
+            return
+
+        self._speaking = active
+        state = "started" if active else "stopped"
+
+        if not self.webrtc_connection:
+            return
+
+        try:
+            if self.webrtc_connection.is_connected():
+                self.webrtc_connection.send_app_message(
+                    {
+                        "type": "tts_state",
+                        "state": state,
+                    }
+                )
+                logger.debug(f"Sent TTS state message: {state}")
+        except Exception as exc:
+            logger.error(f"Failed to send TTS state: {exc}")
src/observers/vision_observer.py ADDED
@@ -0,0 +1,142 @@
+"""Observer for logging vision processing events and Moondream activity."""
+
+import time
+from loguru import logger
+from pipecat.frames.frames import UserImageRequestFrame, LLMTextFrame, ErrorFrame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+
+
+class VisionObserver(BaseObserver):
+    """Logs vision processing events and Moondream activity."""
+
+    def __init__(self, webrtc_connection=None):
+        super().__init__()
+        self.webrtc_connection = webrtc_connection
+        self._video_frame_count = 0
+        self._last_video_frame_time = None
+
+    async def on_push_frame(self, data: FramePushed):
+        """Watch frames as they're pushed through the pipeline."""
+        frame = data.frame
+
+        current_time = time.time()
+
+        frame_type = type(frame).__name__
+
+        # Log vision request frames
+        if isinstance(frame, UserImageRequestFrame):
+            user_id = getattr(frame, 'user_id', 'unknown')
+            question = getattr(frame, 'text', 'unknown')
+            logger.info(f"πŸ‘οΈ Vision request received: user_id={user_id}, question={question}")
+            self._last_vision_request_time = current_time  # Track when vision was requested
+            self._vision_request_count = getattr(self, '_vision_request_count', 0) + 1
+            logger.info(f"πŸ“Š Vision request #{self._vision_request_count} - waiting for video frames and Moondream response...")
+
+            # Send status to frontend
+            if self.webrtc_connection:
+                try:
+                    if self.webrtc_connection.is_connected():
+                        self.webrtc_connection.send_app_message({
+                            "type": "vision",
+                            "status": "requested",
+                            "question": question
+                        })
+                except Exception as e:
+                    logger.debug(f"Error sending vision status: {e}")
+
+        elif 'video' in frame_type.lower() or 'image' in frame_type.lower() or 'vision' in frame_type.lower():
+            # Only log at info level if we're actively processing a vision request
+            is_vision_active = hasattr(self, '_last_vision_request_time') and self._last_vision_request_time is not None
+            if is_vision_active:
+                time_since_request = current_time - self._last_vision_request_time
+                if time_since_request < 5:  # Only log during active vision processing (5 seconds)
+                    logger.debug(f"πŸ“· Vision-related frame: {frame_type}")
+            else:
+                # Otherwise, only log at debug level (won't show unless debug logging is enabled)
+                logger.debug(f"πŸ“· Vision-related frame: {frame_type}")
+
+        # Log frames with image attribute only at debug level
+        elif hasattr(frame, 'image'):
+            logger.debug(f"πŸ“· Frame with image attribute: {frame_type}")
+
+        # Log any frame that might be a vision response by checking attributes
+        elif hasattr(frame, 'user_id') and hasattr(frame, 'text'):
+            user_id = getattr(frame, 'user_id', 'unknown')
+            text = getattr(frame, 'text', '')
+            if 'vision' in frame_type.lower() or 'image' in frame_type.lower() or 'moondream' in frame_type.lower():
+                logger.info(f"βœ… Vision response frame: {frame_type}, user_id={user_id}")
+                logger.info(f"   Response: {text[:200]}..." if len(text) > 200 else f"   Response: {text}")
+
+        # Log LLM text frames that might contain vision responses.
+        # Moondream responses come through as LLMTextFrame with vision context.
+        elif isinstance(frame, LLMTextFrame):
+            text = getattr(frame, 'text', '')
+            vision_keywords = ['see', 'visible', 'camera', 'image', 'showing', 'appears', 'looks like', 'dimly lit', 'desk', 'monitor', 'room', 'window', 'mug', 'laptop', 'coffee', 'analyzing', 'processing']
+
+            # Check if this is a vision response (either from keywords or if we recently requested vision)
+            is_vision_response = False
+            if hasattr(self, '_last_vision_request_time'):
+                time_since_request = current_time - self._last_vision_request_time
+                if time_since_request < 10:  # Within 10 seconds of vision request
+                    is_vision_response = True
+                    logger.info(f"βœ… Vision response received (within {time_since_request:.1f}s of request): {text[:200]}..." if len(text) > 200 else f"βœ… Vision response: {text}")
+
+            if text and any(keyword in text.lower() for keyword in vision_keywords) and not is_vision_response:
+                logger.info(f"βœ… Possible vision response in LLM text: {text[:200]}..." if len(text) > 200 else f"βœ… Possible vision response: {text}")
+
+        # Log errors
+        elif isinstance(frame, ErrorFrame):
+            error_msg = getattr(frame, 'error', str(frame))
+            if 'vision' in error_msg.lower() or 'moondream' in error_msg.lower() or 'image' in error_msg.lower():
+                logger.error(f"❌ Vision error: {error_msg}")
+
+                # Send error to frontend
+                if self.webrtc_connection:
+                    try:
+                        if self.webrtc_connection.is_connected():
+                            self.webrtc_connection.send_app_message({
+                                "type": "vision",
+                                "status": "error",
+                                "error": str(error_msg)
+                            })
+                    except Exception as e:
+                        logger.debug(f"Error sending vision error: {e}")
+
+        # Check for actual video frames (exclude audio frames).
+        # Be specific to avoid false positives.
+        is_video_frame = False
+
+        # Explicitly exclude audio frames
+        if 'audio' in frame_type.lower():
+            is_video_frame = False
+        # Check for actual video frame types
+        elif 'VideoRawFrame' in frame_type or 'InputVideoRawFrame' in frame_type:
+            is_video_frame = True
+        elif 'video' in frame_type.lower() and 'audio' not in frame_type.lower():
+            # Only if it's a video frame and not an audio frame
+            is_video_frame = True
+        elif hasattr(frame, 'video') and not hasattr(frame, 'audio'):
+            # Has video attribute but not audio
+            is_video_frame = True
+        elif hasattr(frame, 'image') and hasattr(frame, 'user_id'):
+            # User image request/response frames
+            is_video_frame = True
+
+        # Only log actual video frames, not audio frames
+        if is_video_frame:
+            self._video_frame_count += 1
+            self._last_video_frame_time = current_time
+            # Only log every 100 frames to reduce spam significantly
+            if self._video_frame_count % 100 == 0:
+                logger.debug(f"πŸŽ₯ Video frames streaming: {self._video_frame_count} frames received")
+
+        # Log frame count summary every 30 seconds (less frequent)
+        if not hasattr(self, '_last_summary_time'):
+            self._last_summary_time = current_time
+        elif current_time - self._last_summary_time >= 30:
+            if self._video_frame_count > 0:
+                logger.debug(f"πŸ“Š Video stream: {self._video_frame_count} frames in last 30 seconds")
+            else:
+                logger.warning("⚠️ No video frames detected in last 30 seconds!")
+            self._video_frame_count = 0
+            self._last_summary_time = current_time
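
The video-frame classification at the end of `on_push_frame`, condensed into one function for reference (illustrative only; the boolean parameters stand in for the attribute checks on the frame):

```python
def looks_like_video_frame(frame_type: str, has_video: bool, has_audio: bool,
                           has_image: bool, has_user_id: bool) -> bool:
    t = frame_type.lower()
    if "audio" in t:                 # audio frames are never counted
        return False
    if "VideoRawFrame" in frame_type or "InputVideoRawFrame" in frame_type:
        return True
    if "video" in t:                 # generic video frame types
        return True
    if has_video and not has_audio:  # video attribute without audio
        return True
    return has_image and has_user_id # user image request/response frames

assert looks_like_video_frame("InputVideoRawFrame", True, False, False, False)
assert not looks_like_video_frame("InputAudioRawFrame", False, True, False, False)
```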
src/processors/__init__.py ADDED
@@ -0,0 +1,18 @@
+"""Frame processors for the Pipecat pipeline.
+
+This module contains processors that transform, filter, or process data.
+For logging/monitoring processors, see the loggers.py module.
+"""
+
+from .filters import SilenceFilter, InputAudioFilter
+from .gating import InterventionGating
+from .visual_observer import VisualObserver
+from .emotional_monitor import EmotionalStateMonitor
+
+__all__ = [
+    "SilenceFilter",
+    "InputAudioFilter",
+    "InterventionGating",
+    "VisualObserver",
+    "EmotionalStateMonitor",
+]
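
Typical import site when assembling the pipeline (a sketch; the actual wiring lives in bot.py and assumes the repo root is on `sys.path`):

```python
from src.processors import SilenceFilter, InputAudioFilter, InterventionGating
```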
src/processors/emotional_monitor.py ADDED
@@ -0,0 +1,303 @@
+"""
+Real-time emotional and cognitive state monitoring using continuous video analysis.
+Detects hesitation, confusion, frustration, and other emotional cues to trigger TARS intervention.
+"""
+
+import asyncio
+import time
+import base64
+from typing import Optional, Dict, List
+from loguru import logger
+from PIL import Image
+import io
+
+from pipecat.frames.frames import (
+    Frame,
+    ImageRawFrame,
+    TextFrame,
+    LLMRunFrame,
+)
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+
+
+class EmotionalState:
+    """Container for detected emotional/cognitive state."""
+
+    def __init__(
+        self,
+        confused: bool = False,
+        hesitant: bool = False,
+        frustrated: bool = False,
+        focused: bool = False,
+        confidence: float = 0.0,
+        description: str = "",
+    ):
+        self.confused = confused
+        self.hesitant = hesitant
+        self.frustrated = frustrated
+        self.focused = focused
+        self.confidence = confidence
+        self.description = description
+        self.timestamp = time.time()
+
+    def needs_intervention(self) -> bool:
+        """Determine if TARS should intervene based on detected state."""
+        # Intervene if user shows signs of confusion, hesitation, or frustration
+        return self.confused or self.hesitant or self.frustrated
+
+    def __repr__(self):
+        states = []
+        if self.confused: states.append("confused")
+        if self.hesitant: states.append("hesitant")
+        if self.frustrated: states.append("frustrated")
+        if self.focused: states.append("focused")
+        return f"EmotionalState({', '.join(states) if states else 'neutral'}, confidence={self.confidence:.2f})"
+
+
+class EmotionalStateMonitor(FrameProcessor):
+    """
+    Continuously monitors video feed for emotional and cognitive states.
+    Analyzes facial expressions, body language, and behavior patterns to detect:
+    - Confusion (furrowed brow, head tilt, puzzled expression)
+    - Hesitation (pauses, uncertain gestures, looking away)
+    - Frustration (tense posture, sighs, agitated movements)
+    - Focus (engaged eye contact, attentive posture)
+
+    Triggers TARS intervention when negative states are detected.
+    """
+
+    def __init__(
+        self,
+        vision_client,
+        model: str = "moondream",
+        sampling_interval: float = 3.0,
+        intervention_threshold: int = 2,
+        enabled: bool = True,
+        auto_intervene: bool = False,
+    ):
+        """
+        Args:
+            vision_client: Moondream or compatible vision API client
+            model: Vision model to use
+            sampling_interval: Seconds between frame analyses (default: 3.0)
+            intervention_threshold: Number of consecutive negative states before intervening
+            enabled: Whether monitoring is active
+            auto_intervene: If True, automatically triggers LLM when threshold reached.
+                If False, only tracks state (used by gating layer)
+        """
+        super().__init__()
+        self._vision_client = vision_client
+        self._model = model
+        self._sampling_interval = sampling_interval
+        self._intervention_threshold = intervention_threshold
+        self._enabled = enabled
+        self._auto_intervene = auto_intervene
+
+        # State tracking
+        self._last_sample_time = 0
+        self._last_state: Optional[EmotionalState] = None
+        self._state_history: List[EmotionalState] = []
+        self._consecutive_negative_states = 0
+        self._analyzing = False
+
+        # Cooldown tracking (when user declines help)
+        self._help_declined_time: Optional[float] = None
+        self._cooldown_duration = 30.0  # seconds - don't re-offer help for 30s after decline
+
+        logger.info("🧠 Emotional State Monitor initialized")
+        logger.info(f"   Sampling interval: {sampling_interval}s")
+        logger.info(f"   Intervention threshold: {intervention_threshold}")
+        logger.info(f"   Auto-intervene: {auto_intervene}")
+        logger.info(f"   Enabled: {enabled}")
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process video frames and sample periodically for emotional analysis."""
+        await super().process_frame(frame, direction)
+
+        # Only analyze if enabled and frame is video input
+        if not self._enabled or not isinstance(frame, ImageRawFrame):
+            await self.push_frame(frame, direction)
+            return
+
+        # Check if it's time to sample
+        current_time = time.time()
+        if current_time - self._last_sample_time >= self._sampling_interval:
+            # Don't block the pipeline - analyze in background
+            if not self._analyzing:
+                self._last_sample_time = current_time
+                asyncio.create_task(self._analyze_emotional_state(frame))
+
+        await self.push_frame(frame, direction)
+
+    async def _analyze_emotional_state(self, frame: ImageRawFrame):
+        """Analyze frame for emotional/cognitive state."""
+        self._analyzing = True
+
+        try:
+            # Convert frame to base64
+            image = Image.frombytes(frame.format, frame.size, frame.image)
+            buffered = io.BytesIO()
+            image.save(buffered, format="JPEG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+
+            # Construct emotion detection prompt
+            prompt = (
+                "Analyze the person's emotional and cognitive state. "
+                "Are they showing signs of: confusion (furrowed brow, puzzled expression), "
+                "hesitation (pauses, uncertain gestures), frustration (tense posture), "
+                "or focus (engaged, attentive)? "
+                "Respond concisely with detected states."
+            )
+
+            logger.debug("πŸ” Analyzing emotional state...")
+
+            try:
+                response = await asyncio.wait_for(
+                    self._vision_client.chat.completions.create(
+                        model=self._model,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    {"type": "text", "text": prompt},
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {
+                                            "url": f"data:image/jpeg;base64,{img_str}"
+                                        },
+                                    },
+                                ],
+                            }
+                        ],
+                        max_tokens=100,
+                    ),
+                    timeout=5.0,
+                )
+
+                description = response.choices[0].message.content.lower()
+                logger.debug(f"πŸ“Š Emotional analysis: {description}")
+
+                # Parse response to detect states
+                state = EmotionalState(
+                    confused="confus" in description or "puzzle" in description or "uncertain" in description,
+                    hesitant="hesita" in description or "unsure" in description or "pause" in description,
+                    frustrated="frustrat" in description or "tense" in description or "agitat" in description,
+                    focused="focus" in description or "attentive" in description or "engaged" in description,
+                    confidence=0.7,  # Could be enhanced with more sophisticated parsing
+                    description=description,
+                )
+
+                self._last_state = state
+                self._state_history.append(state)
+
+                # Keep only recent history (last 10 states)
+                if len(self._state_history) > 10:
+                    self._state_history.pop(0)
+
+                logger.info(f"🎭 State detected: {state}")
+
+                # Track consecutive negative states
+                if state.needs_intervention():
+                    self._consecutive_negative_states += 1
+                    logger.warning(
+                        f"⚠️ Negative state detected "
+                        f"({self._consecutive_negative_states}/{self._intervention_threshold})"
+                    )
+                else:
+                    self._consecutive_negative_states = 0
+
+                # Trigger intervention if threshold reached AND auto-intervene enabled
+                if self._auto_intervene and self._consecutive_negative_states >= self._intervention_threshold:
+                    await self._trigger_intervention(state)
+                    self._consecutive_negative_states = 0  # Reset after intervention
+                elif self._consecutive_negative_states >= self._intervention_threshold:
+                    # Just log, don't intervene (gating layer will handle it)
+                    logger.info(
+                        f"🎭 Intervention threshold reached ({self._consecutive_negative_states}) "
+                        f"- state available for gating layer"
+                    )
+
+            except asyncio.TimeoutError:
+                logger.warning("⚠️ Emotional analysis timed out")
+            except Exception as e:
+                logger.error(f"❌ Emotional analysis error: {e}")
+
+        except Exception as e:
+            logger.error(f"Error in emotional monitoring: {e}")
+        finally:
+            self._analyzing = False
+
+    async def _trigger_intervention(self, state: EmotionalState):
+        """Trigger TARS intervention based on detected emotional state."""
+        logger.info(f"🚨 Triggering TARS intervention for: {state}")
+
+        # Construct intervention message based on state
+        intervention_msg = self._get_intervention_message(state)
+
+        # Push context message to LLM
+        context_frame = TextFrame(
+            text=f"[Emotional State Alert]: {intervention_msg}"
+        )
+        await self.push_frame(context_frame, FrameDirection.UPSTREAM)
+
+        # Trigger LLM to respond
+        await self.push_frame(LLMRunFrame(), FrameDirection.UPSTREAM)
+
+        logger.info("βœ… Intervention triggered")
+
+    def _get_intervention_message(self, state: EmotionalState) -> str:
+        """Generate appropriate intervention message based on detected state."""
+        if state.confused:
+            return (
+                "The user appears confused or uncertain. "
+                "Consider offering help or clarification proactively."
+            )
+        elif state.hesitant:
+            return (
+                "The user seems hesitant or unsure. "
+                "You might want to check if they need assistance."
+            )
+        elif state.frustrated:
+            return (
+                "The user appears frustrated or tense. "
+                "Consider offering support or suggesting a different approach."
+            )
+        else:
+            return (
+                "The user shows signs of difficulty. "
+                "Consider offering assistance."
+            )
+
+    def enable(self):
+        """Enable emotional monitoring."""
+        self._enabled = True
+        logger.info("🧠 Emotional monitoring enabled")
+
+    def disable(self):
+        """Disable emotional monitoring."""
+        self._enabled = False
+        logger.info("🧠 Emotional monitoring disabled")
+
+    def get_current_state(self) -> Optional[EmotionalState]:
+        """Get the most recent emotional state."""
+        return self._last_state
+
+    def get_state_summary(self) -> Dict:
+        """Get summary of recent emotional states."""
+        if not self._state_history:
+            return {"status": "no_data"}
+
+        total = len(self._state_history)
+        confused_count = sum(1 for s in self._state_history if s.confused)
+        hesitant_count = sum(1 for s in self._state_history if s.hesitant)
+        frustrated_count = sum(1 for s in self._state_history if s.frustrated)
+        focused_count = sum(1 for s in self._state_history if s.focused)
+
+        return {
+            "total_samples": total,
+            "confused_ratio": confused_count / total,
+            "hesitant_ratio": hesitant_count / total,
+            "frustrated_ratio": frustrated_count / total,
+            "focused_ratio": focused_count / total,
+            "current_state": str(self._last_state) if self._last_state else "unknown",
+        }
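
How the keyword parsing in `_analyze_emotional_state` maps a vision-model reply onto `EmotionalState`, as a standalone sketch (the description string is invented; the import path is an assumption):

```python
from src.processors.emotional_monitor import EmotionalState  # assumed import path

description = "the person looks puzzled and hesitant, pausing often"
state = EmotionalState(
    confused=any(k in description for k in ("confus", "puzzle", "uncertain")),
    hesitant=any(k in description for k in ("hesita", "unsure", "pause")),
    frustrated=any(k in description for k in ("frustrat", "tense", "agitat")),
    focused=any(k in description for k in ("focus", "attentive", "engaged")),
    description=description,
)
print(state, state.needs_intervention())  # EmotionalState(confused, hesitant, ...) True
```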
src/processors/filters.py ADDED
@@ -0,0 +1,81 @@
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+from pipecat.frames.frames import (
+    LLMFullResponseEndFrame,
+    LLMTextFrame,
+    LLMFullResponseStartFrame,
+    Frame,
+    InputAudioRawFrame,
+    StartFrame,
+    EndFrame,
+    CancelFrame,
+    TTSTextFrame
+)
+from loguru import logger
+import json
+
+
+class InputAudioFilter(FrameProcessor):
+    """
+    Dedicated filter to block InputAudioRawFrame from reaching the TTS service.
+    These frames should only go upstream (to STT), never downstream (to TTS).
+    """
+    async def process_frame(self, frame: Frame, direction):
+        await super().process_frame(frame, direction)
+
+        # Block audio going downstream
+        if isinstance(frame, InputAudioRawFrame) and direction == FrameDirection.DOWNSTREAM:
+            return
+        await self.push_frame(frame, direction)
+
+
+class SilenceFilter(FrameProcessor):
+    """
+    Intercepts LLM responses. If the response is {"action": "silence"}, drops it.
+    """
+    def __init__(self):
+        super().__init__()
+        self.current_response_text = ""
+        self.is_collecting = False
+
+    async def process_frame(self, frame: Frame, direction):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, (StartFrame, EndFrame, CancelFrame)):
+            self.current_response_text = ""
+            self.is_collecting = False
+            await self.push_frame(frame, direction)
+            return
+
+        # Start collecting text
+        if isinstance(frame, LLMFullResponseStartFrame):
+            self.current_response_text = ""
+            self.is_collecting = True
+            await self.push_frame(frame, direction)
+
+        # Accumulate text
+        elif isinstance(frame, LLMTextFrame) and self.is_collecting:
+            self.current_response_text += frame.text
+            await self.push_frame(frame, direction)
+
+        # Check the full response
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            if self.is_collecting:
+                text = self.current_response_text.strip()
+                try:
+                    # Check for silence JSON
+                    if "action" in text and "silence" in text:
+                        clean_json = text.replace("```json", "").replace("```", "").strip()
+                        data = json.loads(clean_json)
+                        if data.get("action") == "silence":
+                            logger.info("SilenceFilter: Suppressing silent response.")
+                            self.is_collecting = False
+                            return  # Drop the EndFrame (silence the turn)
+                except (json.JSONDecodeError, AttributeError):
+                    # Not valid silence JSON - treat it as a normal response
+                    pass
+            self.is_collecting = False
+            await self.push_frame(frame, direction)
+
+        # Pass everything else (like audio or system messages)
+        else:
+            await self.push_frame(frame, direction)
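
The LLM-side contract `SilenceFilter` looks for is a bare JSON action, optionally wrapped in a Markdown code fence. The cleanup-and-parse step in isolation (the payload is illustrative):

```python
import json

raw = '```json\n{"action": "silence"}\n```'
clean = raw.replace("```json", "").replace("```", "").strip()
assert json.loads(clean) == {"action": "silence"}  # -> the turn is suppressed
```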
src/processors/gating.py ADDED
@@ -0,0 +1,129 @@
+"""Intervention Gating: Traffic Controller for Bot Responses."""
+
+import json
+import time
+import aiohttp
+import asyncio
+from loguru import logger
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+from pipecat.frames.frames import LLMMessagesFrame, Frame
+from character.prompts import build_gating_system_prompt
+
+
+class InterventionGating(FrameProcessor):
+    """
+    Traffic controller: decides if TARS should reply based on audio + vision + emotions.
+    Uses an OpenAI-compatible API (DeepInfra).
+    """
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://api.deepinfra.com/v1/openai",
+        model: str = "meta-llama/Llama-3.2-3B-Instruct",
+        visual_observer=None,
+        emotional_monitor=None
+    ):
+        super().__init__()
+        self.api_key = api_key
+        self.base_url = base_url
+        self.model = model
+        self.visual_observer = visual_observer
+        self.emotional_monitor = emotional_monitor
+        self.api_url = f"{base_url}/chat/completions"
+
+    async def _check_should_reply(self, messages: list) -> bool:
+        """Asks the fast LLM if we should reply (audio + vision + emotions)."""
+        if not messages:
+            return False
+
+        # Extract the last user message
+        last_msg = messages[-1]
+        if last_msg.get("role") != "user":
+            return True
+
+        # 1. READ EMOTIONAL STATE (highest priority)
+        emotional_state = None
+        if self.emotional_monitor:
+            emotional_state = self.emotional_monitor.get_current_state()
+            if emotional_state and emotional_state.needs_intervention():
+                # User is confused/hesitant/frustrated - ALWAYS respond
+                logger.info(
+                    f"🧠 Gating: User shows {emotional_state} - BYPASSING gating, offering help"
+                )
+                return True
+
+        # 2. READ VISUAL CONTEXT (0ms latency)
+        is_looking = False
+        if self.visual_observer:
+            # Read the variable updated by the background task
+            is_looking = self.visual_observer.visual_context.get("is_looking_at_robot", False)
+
+            # Ignore if data is too old (> 5 seconds)
+            last_update = self.visual_observer.visual_context.get("last_updated", 0)
+            if time.time() - last_update > 5.0:
+                is_looking = False
+
+        # 3. ANALYZE CONTEXT
+        history_text = "\n".join([f"{m['role']}: {m['content']}" for m in messages[-3:]])
+
+        # Build enriched system prompt with emotional context
+        system_prompt = build_gating_system_prompt(is_looking, emotional_state)
+
+        payload = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"Context:\n{history_text}"}
+            ],
+            "response_format": {"type": "json_object"},
+            "max_tokens": 50
+        }
+
+        # Set a strict timeout so we don't silence the bot if the API is slow
+        timeout = aiohttp.ClientTimeout(total=1.5)
+
+        try:
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.post(
+                    self.api_url,
+                    headers={"Authorization": f"Bearer {self.api_key}"},
+                    json=payload
+                ) as resp:
+                    if resp.status == 200:
+                        result = await resp.json()
+                        content_response = result["choices"][0]["message"]["content"]
+                        content_response = content_response.replace("```json", "").replace("```", "").strip()
+                        data = json.loads(content_response)
+                        should_reply = data.get("reply", False)
+
+                        logger.debug(f"Gating decision: {should_reply} (Looking: {is_looking})")
+                        return should_reply
+                    else:
+                        logger.warning(f"Gating check failed: {resp.status}")
+                        return True  # Fail open (reply if check fails)
+        except asyncio.TimeoutError:
+            logger.warning("🚦 Gating: Timed out! Defaulting to REPLY.")
+            return True
+        except Exception as e:
+            logger.error(f"Gating error: {e}")
+            return True
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """
+        Intercepts LLMMessagesFrame.
+        If 'should_reply' is False, we DROP the frame, effectively silencing the bot.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMMessagesFrame) and direction == FrameDirection.DOWNSTREAM:
+            # Check if we should reply
+            should_reply = await self._check_should_reply(frame.messages)
+
+            if not should_reply:
+                logger.info("🚦 Gating: BLOCKING response.")
+                return  # DROP THE FRAME
+
+            logger.info("🟒 Gating: PASSING through.")
+
+        await self.push_frame(frame, direction)
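
The gating model is expected to return a tiny JSON verdict, and the processor fails open on anything else. The parse step, extracted as a standalone sketch (`parse_gating_reply` is illustrative, not part of this commit):

```python
import json

def parse_gating_reply(content: str) -> bool:
    """Parse the gating model's reply; fail open on anything unparseable."""
    try:
        data = json.loads(content.replace("```json", "").replace("```", "").strip())
        return bool(data.get("reply", False))
    except (json.JSONDecodeError, AttributeError):
        return True  # fail open: better to answer than to go silent

assert parse_gating_reply('{"reply": true}') is True
assert parse_gating_reply("garbage") is True  # fail open
```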
src/processors/visual_observer.py ADDED
@@ -0,0 +1,389 @@
+import asyncio
+import time
+from typing import Optional, List, Dict, Any
+from loguru import logger
+from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame
+from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+import base64
+from PIL import Image
+import io
+import cv2
+import numpy as np
+try:
+    import mediapipe as mp
+    MEDIAPIPE_AVAILABLE = True
+except ImportError:
+    MEDIAPIPE_AVAILABLE = False
+    logger.warning("MediaPipe not available, using OpenCV for face detection")
+
+
+class VisualObserver(FrameProcessor):
+    """
+    Observer that waits for UserImageRequestFrame, captures the next video frame,
+    analyzes it with a vision model, and injects the description back into the context.
+    Now includes face detection and display capabilities.
+    """
+
+    def __init__(
+        self,
+        vision_client,
+        model="moondream",
+        enable_display=False,
+        enable_face_detection=True,
+        webrtc_connection=None,
+        tars_client=None
+    ):
+        super().__init__()
+        self._vision_client = vision_client
+        self._model = model
+        self._waiting_for_image = False
+        self._current_request = None
+        self._last_analysis_time = 0
+        self._cooldown = 2.0  # Min seconds between analyses
+        self._enable_display = enable_display
+        self._enable_face_detection = enable_face_detection
+        self._webrtc_connection = webrtc_connection
+        self._tars_client = None  # Deprecated: display control via gRPC in robot mode
+        self._display_window_name = "TARS Visual Observer"
+
+        # Face detection setup
+        self._face_detector = None
+        if self._enable_face_detection:
+            self._setup_face_detection()
+
+        # Stats
+        self._face_count = 0
+        self._frames_processed = 0
+        self._last_face_time = 0
+
+    def _setup_face_detection(self):
+        """Initialize face detection based on available libraries."""
+        try:
+            if MEDIAPIPE_AVAILABLE:
+                logger.info("🎯 Initializing MediaPipe face detection")
+                self._face_detector_type = "mediapipe"
+                self._mp_face_detection = mp.solutions.face_detection
+                self._mp_drawing = mp.solutions.drawing_utils
+                self._face_detector = self._mp_face_detection.FaceDetection(
+                    model_selection=0,  # 0 for short-range (< 2m), 1 for full-range
+                    min_detection_confidence=0.5
+                )
+            else:
+                # Fallback to OpenCV Haar Cascade
+                logger.info("🎯 Initializing OpenCV Haar Cascade face detection")
+                self._face_detector_type = "opencv"
+                cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
+                self._face_detector = cv2.CascadeClassifier(cascade_path)
+                if self._face_detector.empty():
+                    logger.error("Failed to load Haar Cascade classifier")
+                    self._face_detector = None
+        except Exception as e:
+            logger.error(f"Failed to initialize face detection: {e}")
+            self._face_detector = None
+
+    def detect_faces(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """
+        Detect faces in the image.
+
+        Args:
+            image: numpy array in BGR format
+
+        Returns:
+            List of face dictionaries with bounding boxes and confidence
+        """
+        if not self._face_detector:
+            return []
+
+        faces = []
+        try:
+            if self._face_detector_type == "mediapipe":
+                # Convert BGR to RGB for MediaPipe
+                rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                results = self._face_detector.process(rgb_image)
+
+                if results.detections:
+                    h, w, _ = image.shape
+                    for detection in results.detections:
+                        bbox = detection.location_data.relative_bounding_box
+                        faces.append({
+                            'x': int(bbox.xmin * w),
+                            'y': int(bbox.ymin * h),
+                            'width': int(bbox.width * w),
+                            'height': int(bbox.height * h),
+                            'confidence': detection.score[0]
+                        })
+            else:  # opencv
+                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+                detected_faces = self._face_detector.detectMultiScale(
+                    gray,
+                    scaleFactor=1.1,
+                    minNeighbors=5,
+                    minSize=(30, 30)
+                )
+                for (x, y, w, h) in detected_faces:
+                    faces.append({
+                        'x': x,
+                        'y': y,
+                        'width': w,
+                        'height': h,
+                        'confidence': 1.0  # OpenCV Haar doesn't provide confidence
+                    })
+        except Exception as e:
+            logger.error(f"Error detecting faces: {e}")
+
+        return faces
+
+    def draw_faces(self, image: np.ndarray, faces: List[Dict[str, Any]]) -> np.ndarray:
+        """
+        Draw bounding boxes around detected faces.
+
+        Args:
+            image: numpy array in BGR format
+            faces: List of face dictionaries from detect_faces()
+
+        Returns:
+            Image with faces drawn
+        """
+        annotated_image = image.copy()
+
+        for face in faces:
+            x, y, w, h = face['x'], face['y'], face['width'], face['height']
+            confidence = face['confidence']
+
+            # Draw rectangle
+            cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+            # Draw confidence score
+            label = f"Face: {confidence:.2f}"
+            cv2.putText(
+                annotated_image,
+                label,
+                (x, y - 10),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.5,
+                (0, 255, 0),
+                2
+            )
+
+        # Draw face count
+        cv2.putText(
+            annotated_image,
+            f"Faces: {len(faces)}",
+            (10, 30),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            1,
+            (0, 255, 0),
+            2
+        )
+
+        return annotated_image
+
+    def display_frame(self, image: np.ndarray, faces: Optional[List[Dict[str, Any]]] = None):
+        """
+        Display the frame in a window with optional face annotations.
+
+        Args:
+            image: numpy array in BGR format
+            faces: Optional list of detected faces to draw
+        """
+        if not self._enable_display:
+            return
+
+        try:
+            display_image = image.copy()
+
+            if faces:
+                display_image = self.draw_faces(display_image, faces)
+
+            cv2.imshow(self._display_window_name, display_image)
+            cv2.waitKey(1)  # Required for window to update
+        except Exception as e:
+            logger.error(f"Error displaying frame: {e}")
+
+    def send_display_event(self, faces: List[Dict[str, Any]], image_base64: Optional[str] = None):
+        """
+        Send display event to WebRTC connection with face detection results.
+
+        Args:
+            faces: List of detected faces
+            image_base64: Optional base64-encoded image
+        """
+        if not self._webrtc_connection:
+            return
+
+        try:
+            if self._webrtc_connection.is_connected():
+                event_data = {
+                    "type": "face_detection",
+                    "status": "detected" if faces else "no_faces",
+                    "face_count": len(faces),
+                    "faces": faces,
+                    "timestamp": time.time()
+                }
+
+                # Optionally include thumbnail
+                if image_base64 and len(faces) > 0:
+                    event_data["thumbnail"] = image_base64
+
+                self._webrtc_connection.send_app_message(event_data)
+        except Exception as e:
+            logger.debug(f"Error sending display event: {e}")
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        # 1. Handle request from LLM (check by class name to avoid import errors).
+        # We check for "UserImageRequestFrame" (your custom frame) OR "VisionImageRequestFrame".
+        if frame.__class__.__name__ in ["UserImageRequestFrame", "VisionImageRequestFrame"]:
+            logger.info(f"πŸ‘οΈ Vision request received: {getattr(frame, 'context', 'No context')}")
+            self._waiting_for_image = True
+            self._current_request = frame
+            # We don't yield this frame downstream; we consume it and act on it.
+            return
+
+        # 2. Handle video input (continuous face detection + optional vision analysis)
+        if isinstance(frame, ImageRawFrame):
+            self._frames_processed += 1
+
+            # Process face detection on every frame (or throttled)
+            if self._enable_face_detection and self._frames_processed % 5 == 0:
+                # Run face detection in background
+                asyncio.create_task(self._process_face_detection(frame))
+
+            # Vision analysis only when requested
+            if self._waiting_for_image:
+                # Check cooldown
+                if time.time() - self._last_analysis_time < self._cooldown:
+                    await self.push_frame(frame, direction)
+                    return
+
+                logger.info("πŸ“Έ Capturing frame for analysis...")
+                self._waiting_for_image = False  # Reset flag immediately
261
+ self._last_analysis_time = time.time()
262
+
263
+ # Run analysis in background to avoid blocking audio pipeline
264
+ asyncio.create_task(self._analyze_and_respond(frame))
265
+ # Note: Still pass frame through for face detection
266
+
267
+ # Pass all other frames through
268
+ await self.push_frame(frame, direction)
269
+
270
+ async def _process_face_detection(self, frame: ImageRawFrame):
271
+ """Process face detection on video frame and send display events."""
272
+ try:
273
+ # Convert frame to numpy array
274
+ image = Image.frombytes(frame.format, frame.size, frame.image)
275
+ image_np = np.array(image)
276
+
277
+ # Convert RGB to BGR for OpenCV
278
+ if image_np.shape[2] == 3:
279
+ image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
280
+ else:
281
+ image_bgr = image_np
282
+
283
+ # Get frame dimensions
284
+ frame_height, frame_width = image_bgr.shape[:2]
285
+
286
+ # Detect faces
287
+ faces = self.detect_faces(image_bgr)
288
+
289
+ if faces:
290
+ self._face_count = len(faces)
291
+ current_time = time.time()
292
+
293
+ # Log only periodically to avoid spam
294
+ if current_time - self._last_face_time > 5.0:
295
+ logger.info(f"πŸ‘€ Detected {len(faces)} face(s)")
296
+ self._last_face_time = current_time
297
+
298
+ # Get the largest/most prominent face
299
+ primary_face = max(faces, key=lambda f: f['width'] * f['height'])
300
+
301
+ # Calculate face center
302
+ face_center_x = primary_face['x'] + primary_face['width'] // 2
303
+ face_center_y = primary_face['y'] + primary_face['height'] // 2
304
+
305
+ # Display the frame with face annotations
306
+ self.display_frame(image_bgr, faces)
307
+
308
+ # Send face position event to WebRTC frontend
309
+ self.send_display_event(faces)
310
+
311
+ # Optionally send face position to text frame for LLM context
312
+ # This can be used for "user is looking at you" type feedback
313
+ # Uncomment if you want the LLM to know about face position
314
+ # face_text = f"[Face Detected]: Position ({face_center_x}, {face_center_y}), Size: {primary_face['width']}x{primary_face['height']}"
315
+ # await self.push_frame(TextFrame(text=face_text), FrameDirection.UPSTREAM)
316
+ else:
317
+ # No faces detected
318
+ if self._face_count > 0:
319
+ logger.debug("No faces detected")
320
+ self._face_count = 0
321
+ # Send "no face" event to WebRTC
322
+ self.send_display_event([])
323
+
324
+ # Display frame without annotations
325
+ self.display_frame(image_bgr)
326
+
327
+ except Exception as e:
328
+ logger.error(f"Error in face detection: {e}")
329
+
330
+ async def _analyze_and_respond(self, frame: ImageRawFrame):
331
+ """Analyze image and push result text frame downstream."""
332
+ try:
333
+ # Convert raw frame to base64
334
+ image = Image.frombytes(frame.format, frame.size, frame.image)
335
+ buffered = io.BytesIO()
336
+ image.save(buffered, format="JPEG")
337
+ img_str = base64.b64encode(buffered.getvalue()).decode()
338
+
339
+ prompt = "Describe this image briefly."
340
+
341
+ # Try to extract prompt from the request context if available
342
+ if self._current_request and hasattr(self._current_request, 'context'):
343
+ # Assuming context might be the question text
344
+ context = self._current_request.context
345
+ if context:
346
+ prompt = f"{context} (Describe the image to answer this)"
347
+
348
+ logger.info(f"πŸ” Sending image to vision model ({self._model})...")
349
+
350
+ try:
351
+ response = await asyncio.wait_for(
352
+ self._vision_client.chat.completions.create(
353
+ model=self._model,
354
+ messages=[
355
+ {
356
+ "role": "user",
357
+ "content": [
358
+ {"type": "text", "text": prompt},
359
+ {
360
+ "type": "image_url",
361
+ "image_url": {
362
+ "url": f"data:image/jpeg;base64,{img_str}"
363
+ },
364
+ },
365
+ ],
366
+ }
367
+ ],
368
+ max_tokens=100
369
+ ),
370
+ timeout=8.0 # 8 second timeout to prevent hanging
371
+ )
372
+ description = response.choices[0].message.content
373
+ logger.info(f"βœ… Vision analysis: {description}")
374
+
375
+ except asyncio.TimeoutError:
376
+ logger.warning("⚠️ Vision model timed out!")
377
+ description = "I couldn't see clearly because the visual processing timed out."
378
+ except Exception as e:
379
+ logger.error(f"❌ Vision model error: {e}")
380
+ description = "I had trouble processing the visual data."
381
+
382
+ feedback_text = f"[Visual Observation]: {description}"
383
+
384
+ # Push text frame to LLM
385
+ await self.push_frame(TextFrame(text=feedback_text), FrameDirection.UPSTREAM)
386
+
387
+ except Exception as e:
388
+ logger.error(f"Error in vision pipeline: {e}")
389
+ self._waiting_for_image = False
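
The OpenCV path above is the fallback when MediaPipe is unavailable. A minimal standalone sketch of that same Haar-cascade flow is useful for sanity-checking the classifier outside the pipeline; it assumes `opencv-python` is installed, and `photo.jpg` is a placeholder for any local image:

```python
# Standalone check of the Haar-cascade fallback used by VisualObserver.
# "photo.jpg" is a placeholder; any local image works.
import cv2

cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)
assert not cascade.empty(), "Haar cascade failed to load"

img = cv2.imread("photo.jpg")                 # OpenCV loads images as BGR
assert img is not None, "image not found"
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Haar detection runs on grayscale
faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
print(f"Detected {len(faces)} face(s): {[tuple(map(int, f)) for f in faces]}")
```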
src/services/README.md ADDED
@@ -0,0 +1,110 @@
+ # Services
+
+ Backend services for TARS voice AI. These provide core functionality like speech recognition, text-to-speech, memory, and robot control.
+
+ ## Organization
+
+ | Service | Purpose |
+ |---------|---------|
+ | `tars_robot.py` | Robot hardware control via gRPC (movement, camera, display) |
+ | `tts/tts_qwen.py` | Local text-to-speech using Qwen3 models |
+ | `memory/memory_chromadb.py` | Semantic memory using ChromaDB |
+ | `memory/memory_hybrid.py` | Hybrid memory combining vector search and BM25 keyword matching |
+ | `factories/` | Factory functions for creating STT/TTS services |
+
+ ## Robot Control
+
+ Robot hardware is controlled exclusively via gRPC using the TARS SDK.
+
+ ### tars_robot.py
+
+ Provides functions for robot control in robot mode (tars_bot.py):
+
+ ```python
+ from services import tars_robot
+
+ # Get robot client (singleton) - replace with your robot's IP
+ client = tars_robot.get_robot_client(address="100.115.193.41:50051")
+
+ # Control functions
+ await tars_robot.execute_movement(["wave_right", "step_forward"])
+ result = await tars_robot.capture_camera_view()
+ tars_robot.set_emotion("happy")
+ tars_robot.set_eye_state("listening")
+ status = tars_robot.get_robot_status()
+ available = tars_robot.is_robot_available()
+
+ # Cleanup
+ tars_robot.close_robot_client()
+ ```
+
+ ### Architecture
+
+ Robot mode uses two communication channels:
+
+ | Channel | Protocol | Purpose | Latency |
+ |---------|----------|---------|---------|
+ | Audio | WebRTC | Voice conversation | ~20ms |
+ | Commands | gRPC | Hardware control | ~5-10ms |
+
+ Audio flows through an aiortc WebRTC connection.
+ All hardware commands (movement, camera, display) use gRPC.
+
+ ### Browser Mode
+
+ Browser mode (bot.py) does NOT support robot control. It only provides:
+
+ - WebRTC audio/video with the browser
+ - Vision analysis
+ - Conversation
+
+ Display observers in browser mode are deprecated and do nothing.
+
+ ## Service Factories
+
+ The `factories/` directory contains factory functions for creating STT and TTS services:
+
+ ```python
+ from services.factories import create_stt_service, create_tts_service
+
+ # Create STT service
+ stt = create_stt_service(
+     provider="deepgram",  # or "speechmatics", "deepgram-flux"
+     deepgram_api_key=DEEPGRAM_API_KEY,
+     language=Language.EN
+ )
+
+ # Create TTS service
+ tts = create_tts_service(
+     provider="elevenlabs",  # or "qwen3"
+     elevenlabs_api_key=ELEVENLABS_API_KEY,
+     elevenlabs_voice_id=VOICE_ID
+ )
+ ```
+
+ ## Memory Services
+
+ ### ChromaDB (memory/memory_chromadb.py)
+
+ Simple semantic memory using the ChromaDB vector database. The service is a pipeline `FrameProcessor`: place it ahead of the LLM and it injects retrieved memories into the context automatically.
+
+ ```python
+ from services.memory.memory_chromadb import ChromaDBMemoryService
+
+ memory = ChromaDBMemoryService(user_id="default_user")
+ # Add `memory` to the pipeline upstream of the LLM service.
+ ```
+
+ ### Hybrid Memory (memory/memory_hybrid.py)
+
+ Combines vector similarity search with BM25 keyword matching (SQLite + FTS5) for lower-latency retrieval.
+
+ ## Not Services
+
+ This directory is for backend services only. Other code belongs in:
+
+ - `tools/` - LLM callable functions
+ - `processors/` - Pipeline frame processors
+ - `transport/` - Network transport (WebRTC, gRPC)
+ - `observers/` - Pipeline observers
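
A hedged wiring sketch for the memory services above: both are Pipecat `FrameProcessor`s, so they slot into the pipeline upstream of the LLM. The surrounding pipeline elements in the comment are placeholders for whatever STT/LLM/TTS services the bot actually constructs:

```python
# Sketch only: element names besides HybridMemoryService are placeholders.
from services.memory.memory_hybrid import HybridMemoryService

memory = HybridMemoryService(user_id="default_user", search_timeout_ms=40)

# The memory processor must sit upstream of the LLM so retrieved memories
# are injected into the context before inference, e.g.:
# pipeline = Pipeline([transport.input(), stt, context_aggregator.user(),
#                      memory, llm, tts, transport.output()])
```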
src/services/__init__.py ADDED
@@ -0,0 +1 @@
+
src/services/factories/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """Service factories for STT and TTS providers."""
+
+ from .stt_factory import create_stt_service
+ from .tts_factory import create_tts_service
+
+ __all__ = ["create_stt_service", "create_tts_service"]
src/services/factories/stt_factory.py ADDED
@@ -0,0 +1,127 @@
+ """STT Service Factory - Centralized STT service creation."""
+
+ from loguru import logger
+ from pipecat.transcriptions.language import Language
+
+
+ def create_stt_service(
+     provider: str,
+     speechmatics_api_key: str = None,
+     deepgram_api_key: str = None,
+     language: Language = Language.EN,
+     enable_diarization: bool = False,
+ ):
+     """
+     Create and configure an STT service based on provider.
+
+     Args:
+         provider: "speechmatics", "deepgram", or "deepgram-flux"
+         speechmatics_api_key: Speechmatics API key (if using speechmatics)
+         deepgram_api_key: Deepgram API key (if using deepgram/deepgram-flux)
+         language: Language for transcription (default: English)
+         enable_diarization: Enable speaker diarization (default: False)
+
+     Returns:
+         Configured STT service instance
+
+     Raises:
+         ValueError: If provider is invalid or required parameters are missing
+         Exception: If STT service initialization fails
+     """
+     logger.info(f"Creating STT service: {provider}")
+
+     try:
+         if provider == "speechmatics":
+             # Lazy import to avoid requiring the package when not in use
+             from pipecat.services.speechmatics.stt import SpeechmaticsSTTService, TurnDetectionMode
+
+             # Speechmatics with SMART_TURN mode for built-in turn detection
+             if not speechmatics_api_key:
+                 raise ValueError("speechmatics_api_key is required for Speechmatics")
+
+             logger.info("Using Speechmatics STT with SMART_TURN mode")
+             stt_params = SpeechmaticsSTTService.InputParams(
+                 language=language,
+                 enable_diarization=enable_diarization,
+                 turn_detection_mode=TurnDetectionMode.SMART_TURN,
+             )
+
+             stt = SpeechmaticsSTTService(
+                 api_key=speechmatics_api_key,
+                 params=stt_params,
+             )
+             logger.info("βœ“ Speechmatics STT service created with SMART_TURN mode")
+
+         elif provider == "deepgram":
+             # Lazy import to avoid requiring the package when not in use
+             from pipecat.services.deepgram.stt import DeepgramSTTService
+             from deepgram.clients.listen.v1.websocket.options import LiveOptions
+
+             # Deepgram STT with server-side endpointing for turn detection.
+             # Note: this uses Deepgram's server-side silence detection, not local smart turn.
+             if not deepgram_api_key:
+                 raise ValueError("deepgram_api_key is required for Deepgram")
+
+             logger.info("Using Deepgram STT with server-side endpointing")
+             live_options = LiveOptions(
+                 language=language.value if hasattr(language, 'value') else str(language),
+                 model="nova-2",        # Deepgram Nova-2 model
+                 interim_results=True,  # Enable interim transcription results
+                 smart_format=True,     # Auto-format transcripts
+                 punctuate=True,        # Add punctuation
+                 endpointing=300,       # 300ms of silence ends a turn (server-side)
+                 vad_events=True,       # Enable VAD events for speech detection
+             )
+
+             stt = DeepgramSTTService(
+                 api_key=deepgram_api_key,
+                 live_options=live_options,
+                 stt_ttfb_timeout=5.0,  # TTFB timeout for transcription (seconds)
+             )
+             logger.info("βœ“ Deepgram STT service created")
+             logger.info("  Turn detection: Server-side endpointing (300ms silence)")
+             logger.info("  VAD events: Enabled for speech detection")
+             logger.info("  TTFB timeout: 5.0s for transcription metrics")
+
+         elif provider == "deepgram-flux":
+             # Lazy import to avoid requiring the package when not in use
+             from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService
+
+             # Deepgram Flux with built-in turn detection
+             if not deepgram_api_key:
+                 raise ValueError("deepgram_api_key is required for Deepgram Flux")
+
+             logger.info("Using Deepgram Flux STT with built-in turn detection")
+             # Flux has different parameters - it uses EOT (end-of-turn) detection.
+             # Default model is "flux-general-en" and encoding is "linear16".
+             stt_params = DeepgramFluxSTTService.InputParams(
+                 min_confidence=0.3,  # Minimum confidence threshold for accepting transcriptions
+                 # Optional end-of-turn tuning:
+                 # eot_threshold: confidence threshold for detecting end of turn (0.0-1.0)
+                 # eot_timeout_ms: max time to wait before forcing turn end
+                 # eager_eot_threshold: more aggressive turn-ending threshold
+             )
+
+             stt = DeepgramFluxSTTService(
+                 api_key=deepgram_api_key,
+                 model="flux-general-en",  # Flux model for general English
+                 params=stt_params,
+             )
+
+             # Debug event handler for Flux interim updates
+             @stt.event_handler("on_update")
+             async def on_flux_update(stt_service, transcript):
+                 logger.debug(f"[Deepgram Flux] Update: {transcript}")
+
+             logger.info("βœ“ Deepgram Flux STT service created with built-in turn detection")
+             logger.info("  Note: STT latency will be tracked via MetricsFrame if emitted by Flux")
+
+         else:
+             raise ValueError(f"Unknown STT provider: {provider}. Must be 'speechmatics', 'deepgram', or 'deepgram-flux'")
+
+         return stt
+
+     except Exception as e:
+         logger.error(f"Failed to create STT service '{provider}': {e}", exc_info=True)
+         raise
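
A usage sketch for this factory, selecting the provider from the environment. The `STT_PROVIDER` variable name is illustrative, not something the factory reads itself; only the explicit arguments come from the factory's signature:

```python
# Illustrative only: the factory takes explicit arguments; env var names are ours.
import os
from pipecat.transcriptions.language import Language
from services.factories import create_stt_service

stt = create_stt_service(
    provider=os.getenv("STT_PROVIDER", "deepgram"),
    speechmatics_api_key=os.getenv("SPEECHMATICS_API_KEY"),
    deepgram_api_key=os.getenv("DEEPGRAM_API_KEY"),
    language=Language.EN,
)
```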
src/services/factories/tts_factory.py ADDED
@@ -0,0 +1,84 @@
+ """TTS Service Factory - Centralized TTS service creation."""
+
+ from loguru import logger
+ from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
+ from ..tts.tts_qwen import Qwen3TTSService
+
+
+ def create_tts_service(
+     provider: str,
+     elevenlabs_api_key: str = None,
+     elevenlabs_voice_id: str = None,
+     qwen_model: str = None,
+     qwen_device: str = None,
+     qwen_ref_audio: str = None,
+ ):
+     """
+     Create and configure a TTS service based on provider.
+
+     Args:
+         provider: "elevenlabs" or "qwen3"
+         elevenlabs_api_key: ElevenLabs API key (if using elevenlabs)
+         elevenlabs_voice_id: ElevenLabs voice ID (if using elevenlabs)
+         qwen_model: Qwen3-TTS model name (if using qwen3)
+         qwen_device: Device for Qwen3-TTS (if using qwen3)
+         qwen_ref_audio: Reference audio path for Qwen3-TTS (if using qwen3)
+
+     Returns:
+         Configured TTS service instance
+
+     Raises:
+         ValueError: If provider is invalid or required parameters are missing
+         Exception: If TTS service initialization fails
+     """
+     logger.info(f"Creating TTS service: {provider}")
+
+     try:
+         if provider == "qwen3":
+             # Local Qwen3-TTS with voice cloning
+             if not qwen_model:
+                 raise ValueError("qwen_model is required for Qwen3-TTS")
+
+             logger.info("Using Qwen3-TTS (local, voice cloning)")
+             tts = Qwen3TTSService(
+                 model_name=qwen_model,
+                 device=qwen_device or "mps",
+                 ref_audio_path=qwen_ref_audio,
+                 x_vector_only_mode=True,
+                 sample_rate=24000,
+             )
+             logger.info(f"βœ“ Qwen3-TTS service created (device: {qwen_device})")
+
+         elif provider == "elevenlabs":
+             # Cloud ElevenLabs TTS
+             if not elevenlabs_api_key or not elevenlabs_voice_id:
+                 raise ValueError("elevenlabs_api_key and elevenlabs_voice_id are required for ElevenLabs")
+
+             logger.info("Using ElevenLabs TTS")
+             tts = ElevenLabsTTSService(
+                 api_key=elevenlabs_api_key,
+                 voice_id=elevenlabs_voice_id,
+                 model="eleven_flash_v2_5",
+                 output_format="pcm_24000",
+                 enable_word_timestamps=False,
+                 voice_settings={
+                     "stability": 0.5,
+                     "similarity_boost": 0.75,
+                     "style": 0.0,
+                     "use_speaker_boost": True
+                 },
+                 params=ElevenLabsTTSService.InputParams(
+                     enable_logging=True,  # Enable ElevenLabs logging for metrics
+                 ),
+             )
+             logger.info("βœ“ ElevenLabs TTS service created")
+
+         else:
+             raise ValueError(f"Unknown TTS provider: {provider}. Must be 'qwen3' or 'elevenlabs'")
+
+         return tts
+
+     except Exception as e:
+         logger.error(f"Failed to create TTS service '{provider}': {e}", exc_info=True)
+         raise
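
A matching usage sketch for the TTS factory. The env var names are ours, not part of the factory's API; note that `qwen_model` is required when `provider="qwen3"`, while `qwen_device` falls back to `"mps"` inside the factory:

```python
# Illustrative only: env var names are ours; arguments match the factory above.
import os
from services.factories import create_tts_service

provider = os.getenv("TTS_PROVIDER", "elevenlabs")
if provider == "qwen3":
    tts = create_tts_service(
        provider="qwen3",
        qwen_model=os.getenv("QWEN_TTS_MODEL"),       # required for qwen3
        qwen_device=os.getenv("QWEN_TTS_DEVICE"),     # factory defaults to "mps"
        qwen_ref_audio=os.getenv("QWEN_TTS_REF_AUDIO"),
    )
else:
    tts = create_tts_service(
        provider="elevenlabs",
        elevenlabs_api_key=os.getenv("ELEVENLABS_API_KEY"),
        elevenlabs_voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
    )
```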
src/services/memory/memory_chromadb.py ADDED
@@ -0,0 +1,195 @@
+ """Local memory service using ChromaDB for semantic search."""
+
+ import time
+ from loguru import logger
+ from pipecat.frames.frames import Frame, LLMMessagesFrame, LLMContextFrame, MetricsFrame
+ from pipecat.metrics.metrics import TTFBMetricsData
+ from pipecat.processors.aggregators.llm_context import LLMContext
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
+ from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+ from sentence_transformers import SentenceTransformer
+ import chromadb
+
+
+ class ChromaDBMemoryService(FrameProcessor):
+     """
+     Local memory service using ChromaDB for semantic search.
+
+     Replaces Mem0 with a local, fast, and free alternative:
+     - Stores conversation history with semantic embeddings
+     - Retrieves relevant memories based on similarity search
+     - No external API calls - everything runs locally
+     - Latency: ~50-100ms vs Mem0's ~200-500ms
+     """
+
+     def __init__(
+         self,
+         user_id: str,
+         agent_id: str = "tars_agent",
+         collection_name: str = "conversations",
+         search_limit: int = 5,
+         search_threshold: float = 0.5,
+         system_prompt_prefix: str = "Based on previous conversations, I recall:\n\n",
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.user_id = user_id
+         self.agent_id = agent_id
+         self.search_limit = search_limit
+         self.search_threshold = search_threshold
+         self.system_prompt_prefix = system_prompt_prefix
+
+         # Initialize ChromaDB (persistent local storage)
+         self.client = chromadb.PersistentClient(path="./chroma_memory")
+
+         # Create or get the collection for this user
+         self.collection = self.client.get_or_create_collection(
+             name=f"{collection_name}_{user_id}",
+             metadata={"agent_id": agent_id}
+         )
+
+         # Load the embedding model (lightweight, ~80MB)
+         logger.info("Loading sentence transformer model...")
+         self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+         # Frame counter for debugging
+         self._frame_count = 0
+
+         logger.info("βœ“ ChromaDB memory service initialized and ready to process frames")
+
+     async def process_frame(self, frame: Frame, direction: FrameDirection):
+         """Process frames and inject memories into the LLM context."""
+         try:
+             await super().process_frame(frame, direction)
+
+             # Frame counter
+             self._frame_count += 1
+
+             # Debug: log frame types to understand what's flowing through
+             frame_type = type(frame).__name__
+             direction_name = "DOWNSTREAM" if direction == FrameDirection.DOWNSTREAM else "UPSTREAM"
+
+             # Log LLM-related frames for debugging
+             if 'LLM' in frame_type or 'Messages' in frame_type or 'Context' in frame_type:
+                 logger.info(f"πŸ” [ChromaDB] >>> RECEIVED: {frame_type} | Direction: {direction_name} | Count: {self._frame_count}")
+
+             # Log every 100th frame to verify the processor is being called
+             if self._frame_count % 100 == 0:
+                 logger.info(f"πŸ” [ChromaDB] Processed {self._frame_count} frames so far (latest: {frame_type})")
+
+             # Handle both LLMContextFrame and LLMMessagesFrame (as Mem0 does)
+             context = None
+             messages = None
+
+             if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
+                 logger.info("🧠 [ChromaDB] βœ“βœ“βœ“ PROCESSING LLMContextFrame βœ“βœ“βœ“")
+                 context = frame.context
+             elif isinstance(frame, LLMMessagesFrame):
+                 logger.info("🧠 [ChromaDB] βœ“βœ“βœ“ PROCESSING LLMMessagesFrame βœ“βœ“βœ“")
+                 messages = frame.messages
+                 context = LLMContext(messages)
+
+             if context:
+                 # Get the latest user message
+                 context_messages = context.get_messages()
+                 user_message = None
+                 for msg in reversed(context_messages):
+                     if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                         user_message = msg.get("content", "")
+                         break
+
+                 if user_message:
+                     logger.info(f"🧠 [ChromaDB] Searching memories for: '{user_message[:50]}...'")
+                     # Search for relevant memories
+                     start_time = time.time()
+                     memories = await self._search_memories(user_message)
+                     search_latency_ms = (time.time() - start_time) * 1000
+
+                     # Emit metrics for observer tracking
+                     logger.info(f"πŸ“Š [ChromaDB] Search completed in {search_latency_ms:.0f}ms, emitting MetricsFrame")
+                     metrics_frame = MetricsFrame(
+                         data=[TTFBMetricsData(processor="ChromaDBMemoryService", value=search_latency_ms / 1000)]
+                     )
+                     await self.push_frame(metrics_frame, direction)
+
+                     if memories:
+                         # Inject memories into the context
+                         memory_text = self.system_prompt_prefix + "\n".join(memories)
+                         context.add_message({"role": "system", "content": memory_text})
+                         logger.info(f"πŸ“š Retrieved {len(memories)} memories in {search_latency_ms:.0f}ms")
+
+                     # Store the current conversation turn
+                     await self._store_memory(user_message)
+
+                 # If we received an LLMMessagesFrame, create a new one with the enhanced messages
+                 if messages is not None:
+                     await self.push_frame(LLMMessagesFrame(context.get_messages()), direction)
+                 else:
+                     # Otherwise, pass the enhanced context frame downstream
+                     await self.push_frame(frame, direction)
+             else:
+                 # For non-context frames, just pass them through
+                 await self.push_frame(frame, direction)
+
+         except Exception as e:
+             logger.error(f"❌ [ChromaDB] Error in process_frame: {e}", exc_info=True)
+             # Still pass the frame through even if memory handling failed
+             await self.push_frame(frame, direction)
+
+     async def _search_memories(self, query: str) -> list[str]:
+         """Search for relevant memories based on semantic similarity."""
+         try:
+             # Generate an embedding for the query
+             query_embedding = self.embedder.encode(query).tolist()
+
+             # Search in ChromaDB
+             results = self.collection.query(
+                 query_embeddings=[query_embedding],
+                 n_results=self.search_limit,
+             )
+
+             # Extract documents and filter by threshold
+             memories = []
+             if results and "documents" in results and results["documents"]:
+                 for doc_list, distance_list in zip(results["documents"], results.get("distances", [[]])):
+                     for doc, distance in zip(doc_list, distance_list):
+                         # ChromaDB returns L2 distance (lower is better). For
+                         # unit-length embeddings the distance lies in [0, 2],
+                         # so map it to a similarity score in [0, 1].
+                         similarity = 1 - (distance / 2)
+                         if similarity >= self.search_threshold:
+                             memories.append(doc)
+
+             return memories
+
+         except Exception as e:
+             logger.error(f"Error searching memories: {e}")
+             return []
+
+     async def _store_memory(self, text: str):
+         """Store a memory with its embedding."""
+         try:
+             # Generate an embedding
+             embedding = self.embedder.encode(text).tolist()
+
+             # Store in ChromaDB with a millisecond timestamp as the ID
+             doc_id = f"{int(time.time() * 1000)}"
+             self.collection.add(
+                 documents=[text],
+                 embeddings=[embedding],
+                 ids=[doc_id],
+                 metadatas=[{
+                     "user_id": self.user_id,
+                     "agent_id": self.agent_id,
+                     "timestamp": time.time()
+                 }]
+             )
+
+             logger.debug(f"πŸ’Ύ Stored memory: {text[:50]}...")
+
+         except Exception as e:
+             logger.error(f"Error storing memory: {e}")
+
+     async def close(self):
+         """Cleanup resources."""
+         # The ChromaDB client doesn't need explicit cleanup
+         pass
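
The `1 - distance/2` mapping in `_search_memories` is exact only for unit-length embeddings under a squared-L2 metric, where ||a βˆ’ b||Β² = 2 βˆ’ 2Β·cos(a, b); `SentenceTransformer.encode` does not normalize by default, so in practice the score is approximate unless `normalize_embeddings=True` is passed. A quick numeric check under that unit-norm assumption:

```python
# Check: for unit vectors, squared L2 distance d = 2 - 2*cos, so 1 - d/2 = cos.
import numpy as np

a = np.array([1.0, 0.0])
b = np.array([np.cos(np.pi / 3), np.sin(np.pi / 3)])  # 60 degrees apart, cos = 0.5

d = float(np.sum((a - b) ** 2))  # squared L2 distance = 2 - 2*0.5 = 1.0
print(1 - d / 2)                 # 0.5 -> matches the cosine similarity exactly
```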
src/services/memory/memory_hybrid.py ADDED
@@ -0,0 +1,393 @@
+ """
+ Hybrid memory system optimized for voice AI with sub-50ms latency.
+
+ Features:
+ 1. Hybrid search combining vector similarity (70%) and BM25 keyword matching (30%)
+ 2. SQLite + FTS5 for fast, local storage and search
+ 3. Query embedding cache to avoid redundant encoding
+ 4. Pre-warmed embedding model for consistent latency
+ 5. Strict timeout with graceful fallback
+ 6. Thread pool for non-blocking SQLite operations
+ 7. Fire-and-forget storage to prevent blocking
+
+ Architecture:
+ - Vector search for semantic similarity (cosine distance)
+ - BM25 via FTS5 for exact keyword matching
+ - Weighted score fusion for the best of both worlds
+ - Target latency: <50ms (vs ChromaDB's ~50-100ms)
+ """
+
+ import asyncio
+ import sqlite3
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+ from typing import Optional, List, Tuple
+ import numpy as np
+
+ from loguru import logger
+ from pipecat.frames.frames import Frame, LLMMessagesFrame, LLMContextFrame, MetricsFrame
+ from pipecat.metrics.metrics import TTFBMetricsData
+ from pipecat.processors.aggregators.llm_context import LLMContext
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
+ from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
+ from sentence_transformers import SentenceTransformer
+
+
+ class HybridMemoryService(FrameProcessor):
+     """
+     Hybrid memory service combining vector similarity and keyword search.
+
+     Target latency: <50ms
+
+     Architecture:
+     - Vector search via numpy (semantic similarity with cosine distance)
+     - BM25 via FTS5 (exact keyword matching)
+     - Weighted score fusion: 70% vector + 30% BM25
+
+     Voice AI optimizations:
+     - Query embedding cache (avoid re-encoding similar queries)
+     - Pre-warmed embedding model for consistent performance
+     - Thread pool for non-blocking SQLite operations
+     - Strict timeout with graceful fallback
+     - Fire-and-forget storage to prevent blocking
+     """
+
+     def __init__(
+         self,
+         user_id: str,
+         db_path: str = "./memory_data/memory.sqlite",
+         embedding_model: str = "all-MiniLM-L6-v2",
+         search_limit: int = 3,
+         search_timeout_ms: int = 40,
+         vector_weight: float = 0.7,
+         bm25_weight: float = 0.3,
+         system_prompt_prefix: str = "From our conversations:\n",
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.user_id = user_id
+         self.db_path = db_path
+         self.search_limit = search_limit
+         self.search_timeout_ms = search_timeout_ms
+         self.vector_weight = vector_weight
+         self.bm25_weight = bm25_weight
+         self.system_prompt_prefix = system_prompt_prefix
+
+         # Thread pool for blocking operations
+         self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="HybridMemory")
+
+         # Initialize SQLite with FTS5 and vector support
+         Path(db_path).parent.mkdir(parents=True, exist_ok=True)
+         self._init_database()
+
+         # Load and warm the embedding model
+         logger.info("Loading embedding model for hybrid memory...")
+         self.embedder = SentenceTransformer(embedding_model)
+         self._embedding_dim = self.embedder.get_sentence_embedding_dimension()
+         self._warmup_model()
+
+         # Embedding caches
+         self._query_cache: dict[str, np.ndarray] = {}  # For queries
+         self._doc_cache: dict[str, np.ndarray] = {}    # For documents
+         self._cache_max_size = 500
+
+         # Metrics
+         self._stats = {"searches": 0, "cache_hits": 0, "timeouts": 0, "total_latency_ms": 0}
+         self._frame_count = 0
+
+         logger.info(f"βœ“ Hybrid memory ready (vector + BM25, {search_timeout_ms}ms timeout)")
+
+     def _init_database(self):
+         """Initialize SQLite with FTS5 and the vector table."""
+         conn = sqlite3.connect(self.db_path)
+
+         # Main memories table
+         conn.execute("""
+             CREATE TABLE IF NOT EXISTS memories (
+                 id INTEGER PRIMARY KEY,
+                 user_id TEXT NOT NULL,
+                 content TEXT NOT NULL,
+                 embedding BLOB,
+                 created_at REAL DEFAULT (unixepoch('now', 'subsec'))
+             )
+         """)
+
+         # FTS5 virtual table for BM25 keyword search
+         conn.execute("""
+             CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts
+             USING fts5(content, content='memories', content_rowid='id')
+         """)
+
+         # Triggers to keep FTS in sync
+         conn.execute("""
+             CREATE TRIGGER IF NOT EXISTS memories_ai AFTER INSERT ON memories BEGIN
+                 INSERT INTO memories_fts(rowid, content) VALUES (new.id, new.content);
+             END
+         """)
+
+         conn.execute("""
+             CREATE TRIGGER IF NOT EXISTS memories_ad AFTER DELETE ON memories BEGIN
+                 DELETE FROM memories_fts WHERE rowid = old.id;
+             END
+         """)
+
+         # Index for user filtering
+         conn.execute("CREATE INDEX IF NOT EXISTS idx_user ON memories(user_id)")
+
+         conn.commit()
+         conn.close()
+         logger.info("βœ“ SQLite database initialized with FTS5")
+
+     def _warmup_model(self):
+         """Warm up the embedding model for consistent latency."""
+         warmup_start = time.perf_counter()
+         for _ in range(3):
+             _ = self.embedder.encode("warmup query", show_progress_bar=False)
+         warmup_time = (time.perf_counter() - warmup_start) * 1000
+         logger.info(f"βœ“ Embedding model warmed up ({warmup_time:.0f}ms)")
+
+     def _get_query_embedding(self, text: str) -> np.ndarray:
+         """Get an embedding, using the query cache when possible."""
+         cache_key = text.strip().lower()[:100]
+
+         if cache_key in self._query_cache:
+             self._stats["cache_hits"] += 1
+             return self._query_cache[cache_key]
+
+         embedding = self.embedder.encode(text, show_progress_bar=False)
+
+         # FIFO eviction to bound the cache (cheap approximation of LRU)
+         if len(self._query_cache) >= self._cache_max_size:
+             oldest = next(iter(self._query_cache))
+             del self._query_cache[oldest]
+
+         self._query_cache[cache_key] = embedding
+         return embedding
+
+     def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
+         """Fast cosine similarity."""
+         return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
+
+     def _bm25_rank_to_score(self, rank: int) -> float:
+         """Convert a BM25 result position to a normalized score."""
+         return 1.0 / (1.0 + max(0, rank))
+
+     def _hybrid_search_sync(self, query: str) -> List[Tuple[str, float]]:
+         """
+         Hybrid search combining vector similarity and BM25 keyword matching.
+         Returns [(content, score), ...] sorted by score.
+         """
+         conn = sqlite3.connect(self.db_path)
+
+         # Get the query embedding
+         query_embedding = self._get_query_embedding(query)
+
+         # ========== Vector Search ==========
+         vector_results = {}
+         cursor = conn.execute(
+             "SELECT id, content, embedding FROM memories WHERE user_id = ? ORDER BY created_at DESC LIMIT 100",
+             (self.user_id,)
+         )
+
+         for row_id, content, embedding_blob in cursor:
+             if embedding_blob:
+                 doc_embedding = np.frombuffer(embedding_blob, dtype=np.float32)
+                 similarity = self._cosine_similarity(query_embedding, doc_embedding)
+                 vector_results[row_id] = {
+                     "content": content,
+                     "vector_score": similarity,
+                     "bm25_score": 0.0,
+                 }
+
+         # ========== BM25 Search (FTS5) ==========
+         # Build an FTS query using OR for flexible token matching
+         tokens = [t for t in query.split() if len(t) > 2]
+         if tokens:
+             # Use OR for more flexible matching
+             fts_query = " OR ".join(f'"{t}"' for t in tokens[:5])  # Limit tokens
+             try:
+                 bm25_cursor = conn.execute(
+                     """
+                     SELECT rowid, rank FROM memories_fts
+                     WHERE memories_fts MATCH ?
+                     ORDER BY rank
+                     LIMIT ?
+                     """,
+                     (fts_query, self.search_limit * 4)
+                 )
+
+                 # Scores come from the result position; the raw FTS5 rank value is unused
+                 for rank_idx, (row_id, bm25_rank) in enumerate(bm25_cursor):
+                     bm25_score = self._bm25_rank_to_score(rank_idx)
+                     if row_id in vector_results:
+                         vector_results[row_id]["bm25_score"] = bm25_score
+                     else:
+                         # BM25 found something the vector pass didn't
+                         content_cursor = conn.execute(
+                             "SELECT content FROM memories WHERE id = ?", (row_id,)
+                         )
+                         row = content_cursor.fetchone()
+                         if row:
+                             vector_results[row_id] = {
+                                 "content": row[0],
+                                 "vector_score": 0.0,
+                                 "bm25_score": bm25_score,
+                             }
+             except sqlite3.OperationalError as e:
+                 # FTS query failed; continue with vector results only
+                 logger.debug(f"FTS query failed: {e}")
+
+         conn.close()
+
+         # ========== Weighted Score Fusion ==========
+         results = []
+         for data in vector_results.values():
+             final_score = (
+                 self.vector_weight * data["vector_score"] +
+                 self.bm25_weight * data["bm25_score"]
+             )
+             results.append((data["content"], final_score))
+
+         # Sort by score, return the top N
+         results.sort(key=lambda x: x[1], reverse=True)
+         return results[:self.search_limit]
+
+     def _store_sync(self, text: str):
+         """Store a memory with its embedding."""
+         embedding = self.embedder.encode(text, show_progress_bar=False)
+         embedding_blob = embedding.astype(np.float32).tobytes()
+
+         conn = sqlite3.connect(self.db_path)
+         conn.execute(
+             "INSERT INTO memories (user_id, content, embedding) VALUES (?, ?, ?)",
+             (self.user_id, text, embedding_blob)
+         )
+         conn.commit()
+         conn.close()
+
+     async def _search_with_timeout(self, query: str) -> List[Tuple[str, float]]:
+         """Async search with a strict timeout."""
+         loop = asyncio.get_running_loop()
+
+         try:
+             result = await asyncio.wait_for(
+                 loop.run_in_executor(self._executor, self._hybrid_search_sync, query),
+                 timeout=self.search_timeout_ms / 1000,
+             )
+             return result
+         except asyncio.TimeoutError:
+             self._stats["timeouts"] += 1
+             logger.warning(f"⏱️ Memory search timed out ({self.search_timeout_ms}ms)")
+             return []
+
+     async def process_frame(self, frame: Frame, direction: FrameDirection):
+         """Process Pipecat frames with hybrid memory injection."""
+         await super().process_frame(frame, direction)
+
+         try:
+             self._frame_count += 1
+
+             # Debug: log frame types to understand what's flowing through
+             frame_type = type(frame).__name__
+             direction_name = "DOWNSTREAM" if direction == FrameDirection.DOWNSTREAM else "UPSTREAM"
+
+             # Log LLM-related frames for debugging
+             if 'LLM' in frame_type or 'Messages' in frame_type or 'Context' in frame_type:
+                 logger.info(f"πŸ” [HybridMemory] >>> RECEIVED: {frame_type} | Direction: {direction_name} | Count: {self._frame_count}")
+
+             context = None
+             messages = None
+
+             if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
+                 logger.info("🧠 [HybridMemory] βœ“βœ“βœ“ PROCESSING LLMContextFrame βœ“βœ“βœ“")
+                 context = frame.context
+             elif isinstance(frame, LLMMessagesFrame):
+                 logger.info("🧠 [HybridMemory] βœ“βœ“βœ“ PROCESSING LLMMessagesFrame βœ“βœ“βœ“")
+                 messages = frame.messages
+                 context = LLMContext(messages)
+
+             if context:
+                 # Extract the latest user message
+                 user_message = None
+                 for msg in reversed(context.get_messages()):
+                     if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                         user_message = msg["content"]
+                         break
+
+                 if user_message:
+                     self._stats["searches"] += 1
+                     start_time = time.perf_counter()
+
+                     logger.info(f"πŸ” [HybridMemory] Searching for: '{user_message[:50]}...'")
+
+                     # Hybrid search with timeout
+                     results = await self._search_with_timeout(user_message)
+
+                     latency_ms = (time.perf_counter() - start_time) * 1000
+                     self._stats["total_latency_ms"] += latency_ms
+
+                     # Emit metrics
+                     await self.push_frame(
+                         MetricsFrame(data=[
+                             TTFBMetricsData(processor="HybridMemory", value=latency_ms / 1000)
+                         ]),
+                         direction,
+                     )
+
+                     # Inject memories
+                     if results:
+                         memories_text = self.system_prompt_prefix + "\n".join(
+                             f"- {content}" for content, score in results
+                         )
+                         context.add_message({"role": "system", "content": memories_text})
+
+                         cache_rate = self._stats["cache_hits"] / max(1, self._stats["searches"]) * 100
+                         avg_latency = self._stats["total_latency_ms"] / max(1, self._stats["searches"])
+                         logger.info(
+                             f"πŸ“š [HybridMemory] {len(results)} memories ({latency_ms:.0f}ms, "
+                             f"avg: {avg_latency:.0f}ms, cache: {cache_rate:.0f}%)"
+                         )
+                     else:
+                         logger.info(f"πŸ“š [HybridMemory] No relevant memories ({latency_ms:.0f}ms)")
+
+                     # Fire-and-forget storage
+                     asyncio.create_task(self._store_async(user_message))
+
+                 # Push the frame
+                 if messages is not None:
+                     await self.push_frame(LLMMessagesFrame(context.get_messages()), direction)
+                 else:
+                     await self.push_frame(frame, direction)
+             else:
+                 await self.push_frame(frame, direction)
+
+         except Exception as e:
+             logger.error(f"❌ [HybridMemory] Memory error: {e}", exc_info=True)
+             await self.push_frame(frame, direction)
+
+     async def _store_async(self, text: str):
+         """Async storage (fire-and-forget)."""
+         loop = asyncio.get_running_loop()
+         try:
+             await loop.run_in_executor(self._executor, self._store_sync, text)
+             logger.debug(f"πŸ’Ύ [HybridMemory] Stored: {text[:50]}...")
+         except Exception as e:
+             logger.debug(f"[HybridMemory] Store failed: {e}")
+
+     def get_stats(self) -> dict:
+         """Get performance statistics."""
+         searches = max(1, self._stats["searches"])
+         return {
+             "searches": self._stats["searches"],
+             "cache_hits": self._stats["cache_hits"],
+             "cache_hit_rate": f"{(self._stats['cache_hits'] / searches) * 100:.1f}%",
+             "timeouts": self._stats["timeouts"],
+             "avg_latency_ms": f"{self._stats['total_latency_ms'] / searches:.1f}",
+         }
+
+     async def close(self):
+         """Cleanup resources."""
+         self._executor.shutdown(wait=False)
+         stats = self.get_stats()
+         logger.info(f"πŸ“Š [HybridMemory] Final stats: {stats}")
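
A worked example of the 70/30 score fusion in `_hybrid_search_sync` (scores below are made up for illustration): a candidate with a strong keyword hit can outrank one with a slightly higher vector score, which is the point of blending the two signals:

```python
# Illustrative scores only; mirrors the fusion arithmetic in _hybrid_search_sync.
vector_weight, bm25_weight = 0.7, 0.3

candidates = [
    ("user said they like pizza", 0.62, 1.0),  # BM25 position 0 -> 1/(1+0) = 1.0
    ("user enjoys Italian food",  0.70, 0.0),  # semantic hit, no keyword match
]
for text, v, b in candidates:
    fused = vector_weight * v + bm25_weight * b
    print(f"{fused:.3f}  {text}")
# 0.734  user said they like pizza
# 0.490  user enjoys Italian food
```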