Navada25 commited on
Commit
cf39b2a
·
verified ·
1 Parent(s): 5914fc6

🚀 Update voice_streaming.py - Voice Streaming & AI Coaching Features

Browse files
Files changed (1) hide show
  1. voice_streaming.py +599 -0
voice_streaming.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
OpenAI Voice Streaming Integration for NAVADA Startup Viability Agent
Provides real-time voice conversation capabilities with specialized startup coaching personas
"""

# NOTE(review): asyncio, openai, websockets, io and wave are imported but not
# referenced anywhere in this module -- presumably reserved for the real
# WebSocket transport (see _send_realtime_event). Confirm before removing.
import asyncio
import json
import logging
import os
from typing import Dict, Any, Optional, List
import openai
from openai import AsyncOpenAI
import websockets
import base64
import io
import wave

# Module-level logger; the hosting application configures handlers/levels.
logger = logging.getLogger(__name__)
19
+
20
class VoiceStreamingManager:
    """Manages OpenAI real-time voice streaming with startup coaching personas.

    Responsibilities:
      * build the Realtime API session configuration (model, voice, server-VAD
        turn detection, and tool/function definitions),
      * wrap raw PCM audio bytes into ``conversation.item.create`` events,
      * switch the active coaching persona mid-conversation, and
      * summarise the conversation so far via the chat completions API.

    NOTE(review): ``_send_realtime_event`` is still a mock -- no WebSocket
    connection to ``wss://api.openai.com/v1/realtime`` is opened yet.
    """

    # Single source of truth for all coaching personas. switch_persona()
    # validates against these keys and _get_persona_instructions() returns the
    # values, so the valid-persona list can never drift from the instruction
    # texts (previously the list was duplicated by hand in switch_persona).
    PERSONA_INSTRUCTIONS: Dict[str, str] = {
        "general_advisor": """
        You are NAVADA, an expert startup viability advisor with 20 years of experience in venture capital and startup ecosystems.
        You provide comprehensive, actionable advice on startup ideas, market validation, business models, and growth strategies.

        Your expertise includes:
        - Market analysis and competitive intelligence
        - Business model validation and optimization
        - Financial modeling and investment readiness
        - Product-market fit assessment
        - Go-to-market strategy development

        Communicate in a warm, encouraging tone while being direct about potential challenges.
        Always provide specific, actionable next steps.
        """,
        "technical_advisor": """
        You are NAVADA's Technical Advisor, a seasoned CTO and technology strategist with deep expertise in:
        - Technology stack selection and architecture
        - MVP development and product roadmaps
        - Technical feasibility assessment
        - Scalability planning and infrastructure
        - AI/ML integration strategies
        - Cybersecurity and compliance

        Focus on technical viability, development timelines, and technology risks.
        Provide specific technical recommendations and implementation strategies.
        """,
        "market_analyst": """
        You are NAVADA's Market Intelligence Specialist with expertise in:
        - Market size analysis and TAM/SAM/SOM calculations
        - Competitive landscape mapping
        - Customer segmentation and persona development
        - Industry trend analysis and forecasting
        - Regulatory environment assessment
        - International market expansion strategies

        Provide data-driven market insights with specific metrics and actionable market entry strategies.
        """,
        "financial_advisor": """
        You are NAVADA's Financial Strategist with deep expertise in:
        - Financial modeling and projections
        - Funding strategy and investor readiness
        - Valuation methodologies
        - Revenue model optimization
        - Unit economics and profitability analysis
        - Risk assessment and mitigation

        Focus on financial viability, funding requirements, and investor appeal.
        Provide specific financial metrics and funding recommendations.
        """,
        "pitch_coach": """
        You are NAVADA's Pitch Coach, specializing in:
        - Investor pitch development and refinement
        - Storytelling and narrative structure
        - Presentation skills and delivery coaching
        - Q&A preparation and objection handling
        - Demo preparation and product showcasing
        - Investor psychology and decision-making

        Help entrepreneurs craft compelling pitches and prepare for investor meetings.
        Provide specific feedback on pitch structure, messaging, and delivery.
        """,
    }

    def __init__(self) -> None:
        # Async client used for non-realtime calls (conversation summaries).
        self.client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        # Stored prompt id; the default presumably references a prompt created
        # in the OpenAI dashboard -- TODO confirm it is still valid.
        self.voice_prompt_id = os.getenv("OPENAI_VOICE_PROMPT_ID", "pmpt_68b4975074d0819087217d0b0717bb1b0c32a4ef223cc971")
        self.voice_model = os.getenv("VOICE_MODEL", "gpt-4o-realtime-preview-2024-10-01")
        self.output_format = os.getenv("VOICE_OUTPUT_FORMAT", "pcm16")
        self.sample_rate = int(os.getenv("VOICE_SAMPLE_RATE", "24000"))
        self.current_persona = "general_advisor"
        # Running list of exchanges; only the tail is sent for summarisation.
        self.conversation_context: List[Dict[str, Any]] = []

    async def initialize_voice_session(self, persona: str = "general_advisor") -> Dict[str, Any]:
        """Initialize a voice streaming session with the specified persona.

        Args:
            persona: Key into PERSONA_INSTRUCTIONS; unknown names fall back
                to the general advisor.

        Returns:
            ``{"status": "initialized", "config": ...}`` on success, or
            ``{"status": "error", "message": ...}`` if building the
            configuration raised.
        """
        try:
            self.current_persona = persona
            persona_instructions = self._get_persona_instructions(persona)

            session_config = {
                "model": self.voice_model,
                "voice": "alloy",
                "instructions": persona_instructions,
                "input_audio_format": "pcm16",
                "output_audio_format": self.output_format,
                # Transcribe the user's speech so text is available alongside audio.
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                # Server-side voice-activity detection decides when a turn ends.
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 500
                },
                # Function tools the realtime model may invoke during the call.
                "tools": [
                    {
                        "type": "function",
                        "name": "analyze_startup_idea",
                        "description": "Analyze a startup idea for viability and provide detailed feedback",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "idea": {"type": "string", "description": "The startup idea to analyze"},
                                "industry": {"type": "string", "description": "The industry sector"},
                                "target_market": {"type": "string", "description": "Target market description"}
                            },
                            "required": ["idea"]
                        }
                    },
                    {
                        "type": "function",
                        "name": "get_market_data",
                        "description": "Retrieve real-time market data for analysis",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "query": {"type": "string", "description": "Market data query"}
                            },
                            "required": ["query"]
                        }
                    }
                ]
            }

            return {"status": "initialized", "config": session_config}

        except Exception as e:
            logger.error(f"Failed to initialize voice session: {e}")
            return {"status": "error", "message": str(e)}

    def _get_persona_instructions(self, persona: str) -> str:
        """Return the system instructions for *persona*.

        Unknown persona names fall back to the general advisor rather than
        raising, preserving the permissive behaviour callers rely on.
        """
        return self.PERSONA_INSTRUCTIONS.get(
            persona, self.PERSONA_INSTRUCTIONS["general_advisor"]
        )

    async def process_audio_stream(self, audio_data: bytes) -> Dict[str, Any]:
        """Wrap raw audio bytes in a Realtime event and forward it.

        Args:
            audio_data: Raw audio bytes (expected PCM16 per the session
                config -- TODO confirm callers always supply that format).

        Returns:
            The (currently mocked) API response, or an error dict.
        """
        try:
            # The Realtime API requires audio as base64 text inside JSON.
            audio_b64 = base64.b64encode(audio_data).decode()

            event = {
                "type": "conversation.item.create",
                "item": {
                    "type": "message",
                    "role": "user",
                    "content": [
                        {
                            "type": "input_audio",
                            "audio": audio_b64
                        }
                    ]
                }
            }

            response = await self._send_realtime_event(event)
            return response

        except Exception as e:
            logger.error(f"Error processing audio stream: {e}")
            return {"status": "error", "message": str(e)}

    async def _send_realtime_event(self, event: Dict[str, Any]) -> Dict[str, Any]:
        """Send an event to the OpenAI Realtime API.

        This is a placeholder: production code would hold a WebSocket
        connection to wss://api.openai.com/v1/realtime and exchange events
        over it. For now a fixed mock response is returned so the rest of
        the pipeline can be exercised.
        """
        try:
            return {
                "type": "conversation.item.created",
                "item": {
                    "id": "msg_001",
                    "type": "message",
                    "role": "assistant",
                    "content": [
                        {
                            "type": "audio",
                            "audio": "",  # Base64 encoded audio response
                            "transcript": "I understand your startup idea. Let me analyze the market viability..."
                        }
                    ]
                }
            }

        except Exception as e:
            logger.error(f"Error sending realtime event: {e}")
            raise

    async def switch_persona(self, new_persona: str) -> Dict[str, Any]:
        """Switch to a different coaching persona during the conversation.

        Validates against PERSONA_INSTRUCTIONS (no separately maintained
        list), then pushes a ``session.update`` event with the new
        instructions.
        """
        try:
            if new_persona not in self.PERSONA_INSTRUCTIONS:
                return {"status": "error", "message": "Invalid persona specified"}

            self.current_persona = new_persona
            instructions = self._get_persona_instructions(new_persona)

            event = {
                "type": "session.update",
                "session": {
                    "instructions": instructions
                }
            }

            await self._send_realtime_event(event)

            return {
                "status": "success",
                "message": f"Switched to {new_persona.replace('_', ' ').title()}",
                "persona": new_persona
            }

        except Exception as e:
            logger.error(f"Error switching persona: {e}")
            return {"status": "error", "message": str(e)}

    async def get_conversation_summary(self) -> Dict[str, Any]:
        """Return an AI-generated summary of the current conversation.

        Sends only the last ten exchanges to keep the prompt small.
        Returns ``{"status": "empty", ...}`` when no exchanges exist yet.
        """
        try:
            if not self.conversation_context:
                return {"status": "empty", "summary": "No conversation yet"}

            response = await self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": "Summarize this startup coaching conversation, highlighting key insights, recommendations, and next steps."
                    },
                    {
                        "role": "user",
                        "content": f"Conversation context: {json.dumps(self.conversation_context[-10:])}"  # Last 10 exchanges
                    }
                ],
                max_tokens=500
            )

            summary = response.choices[0].message.content

            return {
                "status": "success",
                "summary": summary,
                "persona": self.current_persona,
                "total_exchanges": len(self.conversation_context)
            }

        except Exception as e:
            logger.error(f"Error generating conversation summary: {e}")
            return {"status": "error", "message": str(e)}
281
class VoiceUIManager:
    """Manages voice interface components for Chainlit integration.

    Serves the HTML/CSS/JS voice widget and routes voice-related messages
    from the frontend to the underlying VoiceStreamingManager.

    Fix: the user-facing emoji labels in the HTML were mojibake-garbled
    (e.g. "🎀" for the microphone); they are restored to the intended
    characters (🎤, 🔴, 🟢, 📋).
    """

    def __init__(self) -> None:
        # Backend that talks to the OpenAI Realtime API.
        self.voice_manager = VoiceStreamingManager()
        # NOTE(review): is_recording is initialised but never updated in this
        # module -- recording state currently lives only in the browser JS.
        self.is_recording = False
        self.current_session = None

    async def create_voice_interface(self) -> str:
        """Return the self-contained HTML/CSS/JS snippet for voice interaction.

        The markup includes start/stop controls, a persona selector, a live
        audio-level visualization canvas, and a conversation-summary button.
        The embedded script records microphone audio with MediaRecorder and
        posts it (base64-encoded) back to the Python side via
        ``window.chainlitAPI.sendMessage``.
        """
        return """
        <div id="voice-interface" class="voice-container">
            <div class="voice-controls">
                <button id="start-voice" class="voice-btn start">🎤 Start Voice Chat</button>
                <button id="stop-voice" class="voice-btn stop" disabled>⏹️ Stop</button>
                <select id="persona-select" class="persona-selector">
                    <option value="general_advisor">General Advisor</option>
                    <option value="technical_advisor">Technical Advisor</option>
                    <option value="market_analyst">Market Analyst</option>
                    <option value="financial_advisor">Financial Advisor</option>
                    <option value="pitch_coach">Pitch Coach</option>
                </select>
            </div>

            <div class="voice-status">
                <div id="recording-indicator" class="recording-off">🔴 Not Recording</div>
                <div id="current-persona">Current: General Advisor</div>
            </div>

            <div class="audio-visualization">
                <canvas id="audio-canvas" width="400" height="100"></canvas>
            </div>

            <div class="conversation-summary">
                <button id="get-summary" class="summary-btn">📋 Get Conversation Summary</button>
                <div id="summary-display"></div>
            </div>
        </div>

        <style>
        .voice-container {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border-radius: 15px;
            padding: 20px;
            margin: 20px 0;
            color: white;
            font-family: 'Inter', sans-serif;
        }

        .voice-controls {
            display: flex;
            gap: 15px;
            align-items: center;
            margin-bottom: 15px;
        }

        .voice-btn {
            padding: 12px 24px;
            border: none;
            border-radius: 25px;
            font-weight: 600;
            cursor: pointer;
            transition: all 0.3s ease;
            font-size: 14px;
        }

        .voice-btn.start {
            background: #4CAF50;
            color: white;
        }

        .voice-btn.stop {
            background: #f44336;
            color: white;
        }

        .voice-btn:disabled {
            opacity: 0.5;
            cursor: not-allowed;
        }

        .persona-selector {
            padding: 8px 15px;
            border-radius: 20px;
            border: none;
            background: rgba(255, 255, 255, 0.2);
            color: white;
            font-weight: 500;
        }

        .voice-status {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 15px;
            font-size: 14px;
        }

        .recording-off {
            color: #ffcdd2;
        }

        .recording-on {
            color: #c8e6c9;
            animation: pulse 1s infinite;
        }

        @keyframes pulse {
            0% { opacity: 1; }
            50% { opacity: 0.5; }
            100% { opacity: 1; }
        }

        .audio-visualization {
            margin: 15px 0;
            text-align: center;
        }

        #audio-canvas {
            border-radius: 10px;
            background: rgba(255, 255, 255, 0.1);
        }

        .conversation-summary {
            margin-top: 20px;
        }

        .summary-btn {
            background: rgba(255, 255, 255, 0.2);
            color: white;
            border: none;
            padding: 10px 20px;
            border-radius: 20px;
            cursor: pointer;
            margin-bottom: 10px;
        }

        #summary-display {
            background: rgba(255, 255, 255, 0.1);
            padding: 15px;
            border-radius: 10px;
            margin-top: 10px;
            line-height: 1.6;
        }
        </style>

        <script>
        let mediaRecorder;
        let audioChunks = [];
        let audioContext;
        let analyser;
        let dataArray;
        let canvas;
        let canvasCtx;

        document.addEventListener('DOMContentLoaded', function() {
            canvas = document.getElementById('audio-canvas');
            canvasCtx = canvas.getContext('2d');

            document.getElementById('start-voice').addEventListener('click', startVoiceChat);
            document.getElementById('stop-voice').addEventListener('click', stopVoiceChat);
            document.getElementById('persona-select').addEventListener('change', switchPersona);
            document.getElementById('get-summary').addEventListener('click', getSummary);
        });

        async function startVoiceChat() {
            try {
                const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

                // Setup audio context for visualization
                audioContext = new AudioContext();
                analyser = audioContext.createAnalyser();
                const source = audioContext.createMediaStreamSource(stream);
                source.connect(analyser);

                analyser.fftSize = 256;
                const bufferLength = analyser.frequencyBinCount;
                dataArray = new Uint8Array(bufferLength);

                // Start visualization
                drawAudioVisualization();

                // Setup media recorder
                mediaRecorder = new MediaRecorder(stream);
                mediaRecorder.ondataavailable = (event) => {
                    audioChunks.push(event.data);
                };

                mediaRecorder.onstop = async () => {
                    const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
                    audioChunks = [];
                    await sendAudioToServer(audioBlob);
                };

                mediaRecorder.start();

                // Update UI
                document.getElementById('start-voice').disabled = true;
                document.getElementById('stop-voice').disabled = false;
                document.getElementById('recording-indicator').textContent = '🟢 Recording...';
                document.getElementById('recording-indicator').className = 'recording-on';

            } catch (error) {
                console.error('Error starting voice chat:', error);
                alert('Error accessing microphone. Please check permissions.');
            }
        }

        function stopVoiceChat() {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
            }

            if (audioContext) {
                audioContext.close();
            }

            // Update UI
            document.getElementById('start-voice').disabled = false;
            document.getElementById('stop-voice').disabled = true;
            document.getElementById('recording-indicator').textContent = '🔴 Not Recording';
            document.getElementById('recording-indicator').className = 'recording-off';
        }

        function drawAudioVisualization() {
            if (!analyser) return;

            requestAnimationFrame(drawAudioVisualization);

            analyser.getByteFrequencyData(dataArray);

            canvasCtx.fillStyle = 'rgba(255, 255, 255, 0.1)';
            canvasCtx.fillRect(0, 0, canvas.width, canvas.height);

            const barWidth = (canvas.width / dataArray.length) * 2.5;
            let barHeight;
            let x = 0;

            for (let i = 0; i < dataArray.length; i++) {
                barHeight = dataArray[i] / 2;

                const r = barHeight + 25 * (i / dataArray.length);
                const g = 250 * (i / dataArray.length);
                const b = 50;

                canvasCtx.fillStyle = `rgb(${r}, ${g}, ${b})`;
                canvasCtx.fillRect(x, canvas.height - barHeight, barWidth, barHeight);

                x += barWidth + 1;
            }
        }

        async function sendAudioToServer(audioBlob) {
            // Convert to base64 and send to Python backend
            const reader = new FileReader();
            reader.onload = function() {
                const base64Audio = reader.result.split(',')[1];

                // Send via Chainlit
                window.chainlitAPI?.sendMessage({
                    type: 'voice_audio',
                    audio: base64Audio,
                    persona: document.getElementById('persona-select').value
                });
            };
            reader.readAsDataURL(audioBlob);
        }

        async function switchPersona() {
            const newPersona = document.getElementById('persona-select').value;
            const personaDisplay = document.getElementById('current-persona');

            // Send persona switch to backend
            window.chainlitAPI?.sendMessage({
                type: 'switch_persona',
                persona: newPersona
            });

            // Update display
            personaDisplay.textContent = `Current: ${newPersona.replace('_', ' ').replace(/\\b\\w/g, l => l.toUpperCase())}`;
        }

        async function getSummary() {
            // Request conversation summary
            window.chainlitAPI?.sendMessage({
                type: 'get_summary'
            });
        }
        </script>
        """

    async def handle_voice_message(self, message_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
        """Route a frontend voice message to the appropriate backend handler.

        Args:
            message_type: One of ``"voice_audio"``, ``"switch_persona"``,
                or ``"get_summary"``; anything else yields an error dict.
            data: Message payload from the browser widget.

        Returns:
            The handler's result dict, or ``{"status": "error", ...}``.
        """
        try:
            if message_type == "voice_audio":
                # Decode the base64 audio payload and stream it onward.
                audio_b64 = data.get("audio", "")
                audio_data = base64.b64decode(audio_b64)

                response = await self.voice_manager.process_audio_stream(audio_data)
                return response

            elif message_type == "switch_persona":
                # Switch coaching persona (defaults to the general advisor).
                persona = data.get("persona", "general_advisor")
                response = await self.voice_manager.switch_persona(persona)
                return response

            elif message_type == "get_summary":
                # Get conversation summary
                response = await self.voice_manager.get_conversation_summary()
                return response

            else:
                return {"status": "error", "message": "Unknown message type"}

        except Exception as e:
            logger.error(f"Error handling voice message: {e}")
            return {"status": "error", "message": str(e)}