Devakumar868 committed on
Commit
2be6512
·
verified ·
1 Parent(s): 7a76b7a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +284 -0
app.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import logging
import os
import time
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from huggingface_hub import hf_hub_download
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
12
+
13
+ # Initialize models
14
class ConversationalAI:
    """Voice assistant pipeline: speech -> text -> emotion -> reply -> speech.

    On construction every model is loaded eagerly:

    * ASR: NVIDIA Parakeet through NeMo, falling back to Whisper large-v3.
    * LLM: ``google/gemma-2-9b-it`` (the UI text calls it "Gemini").
    * TTS: ``narilabs/dia-1.6b``, falling back to SpeechT5.
    * Emotion: ``speechbrain/emotion-recognition-wav2vec2-IEMOCAP`` or ``None``
      when it cannot be loaded.

    Per-user conversation history is kept in memory in ``self.conversations``
    (``user_id -> list of exchange dicts``).
    """

    # Exchanges retained per user, shown in the UI, and passed to the LLM.
    MAX_HISTORY = 50
    DISPLAY_HISTORY = 10
    PROMPT_HISTORY = 3

    # Shared by transcribe_audio (producer) and process_conversation (consumer)
    # so a failed transcription is never mistaken for something the user said.
    _TRANSCRIPTION_ERROR_PREFIX = "Transcription error: "

    _log = logging.getLogger(__name__)

    def __init__(self):
        _, dtype = self._device_and_dtype()

        # Load Parakeet ASR (or the Whisper fallback).
        self.asr_model = self.load_parakeet_asr()

        # Local Gemma model stands in for a hosted LLM.
        self.llm_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            "google/gemma-2-9b-it",
            torch_dtype=dtype,
            device_map="auto",
        )

        # Load Dia TTS (or the SpeechT5 fallback).
        self.tts_model = self.load_dia_tts()

        # Audio emotion classifier; may be None.
        self.emotion_model = self.load_ervq_emotion()

        # user_id -> list of exchange dicts, oldest first.
        self.conversations = {}

    @staticmethod
    def _device_and_dtype():
        """Return ``(device, dtype)``: CUDA/float16 when available, else CPU/float32."""
        if torch.cuda.is_available():
            return "cuda", torch.float16
        return "cpu", torch.float32

    def load_parakeet_asr(self):
        """Return the Parakeet NeMo model, or a Whisper pipeline if that fails."""
        try:
            from nemo.collections.asr import ASRModel
            return ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
        except Exception:
            # Covers a missing NeMo install as well as download/load failures.
            self._log.warning(
                "Parakeet ASR unavailable; falling back to Whisper", exc_info=True
            )
            device, dtype = self._device_and_dtype()
            return pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-large-v3",
                torch_dtype=dtype,
                device=device,
            )

    def load_dia_tts(self):
        """Return the Dia model, or a SpeechT5 text-to-speech pipeline if that fails."""
        device, dtype = self._device_and_dtype()
        try:
            from transformers import AutoModel
            return AutoModel.from_pretrained(
                "narilabs/dia-1.6b",
                torch_dtype=dtype,
                device_map="auto",
            )
        except Exception:
            self._log.warning(
                "Dia TTS unavailable; falling back to SpeechT5", exc_info=True
            )
            return pipeline(
                "text-to-speech",
                model="microsoft/speecht5_tts",
                torch_dtype=dtype,
                device=device,
            )

    def load_ervq_emotion(self):
        """Return an audio-classification pipeline for emotion, or ``None`` on failure."""
        device, _ = self._device_and_dtype()
        try:
            return pipeline(
                "audio-classification",
                model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
                device=device,
            )
        except Exception:
            self._log.warning(
                "Emotion model unavailable; emotion will default to neutral",
                exc_info=True,
            )
            return None

    def transcribe_audio(self, audio_path):
        """Transcribe the audio file at ``audio_path`` and return the text.

        Never raises: on failure the returned string starts with
        ``"Transcription error: "`` followed by the exception message.
        """
        try:
            if hasattr(self.asr_model, "transcribe"):
                # NeMo-style model: takes a list of paths, returns a list.
                results = self.asr_model.transcribe([audio_path])
                if not results:
                    return ""
                first = results[0]
                # Results may be plain strings or objects carrying `.text`.
                return getattr(first, "text", first)
            # Pipeline-style model: callable returning {"text": ...}.
            return self.asr_model(audio_path)["text"]
        except Exception as e:
            self._log.exception("Transcription failed for %r", audio_path)
            return f"{self._TRANSCRIPTION_ERROR_PREFIX}{e}"

    def recognize_emotion(self, audio_path):
        """Return the lower-cased top emotion label, or ``"neutral"`` if unavailable."""
        if self.emotion_model is None:
            return "neutral"

        try:
            result = self.emotion_model(audio_path)
            return result[0]["label"].lower()
        except Exception:
            self._log.warning("Emotion recognition failed", exc_info=True)
            return "neutral"

    def generate_response(self, text, emotion, conversation_history):
        """Generate Maya's reply to ``text``.

        Args:
            text: What the user said.
            emotion: Detected emotion label, included in the prompt.
            conversation_history: Prior exchanges; the last
                ``PROMPT_HISTORY`` are embedded in the prompt as-is.

        Returns:
            The generated reply only (prompt text is not included), stripped.
        """
        recent = conversation_history[-self.PROMPT_HISTORY:] if conversation_history else 'None'
        context = f"Previous conversation: {recent}"
        emotion_context = f"User emotion detected: {emotion}"

        prompt = (
            "You are Maya, a naturally conversational AI assistant with emotional intelligence.\n"
            f"{context}\n"
            f"{emotion_context}\n"
            "\n"
            f"Respond naturally and emotionally appropriate to: {text}\n"
            "\n"
            "Keep responses conversational, empathetic, and under 100 words."
        )

        # Follow the model's own placement instead of assuming CUDA.
        inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)

        with torch.no_grad():
            outputs = self.llm_model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.llm_tokenizer.eos_token_id,
            )

        # The output sequence starts with the prompt tokens; decode only what
        # was generated after them so no prompt text leaks into the reply.
        prompt_length = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_length:]
        return self.llm_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    def synthesize_speech(self, text, emotion):
        """Synthesize ``text`` to audio; return ``None`` if synthesis fails.

        With a model exposing ``generate_speech`` the prompt is
        ``"[emotion] text"`` and that method's result is returned unchanged
        (NOTE(review): its format is not visible here - confirm it is
        something the audio widget accepts). With the pipeline fallback the
        result is a ``(sampling_rate, audio)`` tuple.
        """
        try:
            if hasattr(self.tts_model, "generate_speech"):
                return self.tts_model.generate_speech(f"[{emotion}] {text}")

            # Pipeline fallback: keep the sampling rate together with the
            # samples so the audio can be played back at the right speed.
            result = self.tts_model(text)
            return result["sampling_rate"], result["audio"]
        except Exception:
            self._log.exception("Speech synthesis failed")
            return None

    def process_conversation(self, audio_input, user_id="default"):
        """Run one full exchange for ``user_id``.

        Args:
            audio_input: Path to the recorded audio, or ``None``.
            user_id: Key under which history is stored.

        Returns:
            ``(transcription, response_audio, history_text)``. When there is
            no audio, or transcription failed / produced no text, no reply is
            generated, ``response_audio`` is ``None`` and history is untouched.
        """
        if audio_input is None:
            return "Please provide audio input", None, "No conversation yet"

        # perf_counter is monotonic, so the measured duration cannot go
        # negative if the wall clock is adjusted mid-request.
        started = time.perf_counter()

        history = self.conversations.setdefault(user_id, [])

        # Step 1: transcribe.
        transcription = self.transcribe_audio(audio_input)
        if (
            not transcription.strip()
            or transcription.startswith(self._TRANSCRIPTION_ERROR_PREFIX)
        ):
            # Nothing usable was heard; do not ask the LLM to answer it.
            return transcription, None, self.format_conversation_history(user_id)

        # Step 2: emotion.
        emotion = self.recognize_emotion(audio_input)

        # Step 3: reply text (history does not yet contain this exchange).
        response_text = self.generate_response(transcription, emotion, history)

        # Step 4: reply audio.
        response_audio = self.synthesize_speech(response_text, emotion)

        # Step 5: record the exchange and cap the stored history.
        history.append({
            "timestamp": datetime.now().isoformat(),
            "user_input": transcription,
            "user_emotion": emotion,
            "ai_response": response_text,
            "processing_time": time.perf_counter() - started,
        })
        del history[:-self.MAX_HISTORY]

        return transcription, response_audio, self.format_conversation_history(user_id)

    def format_conversation_history(self, user_id):
        """Return the last ``DISPLAY_HISTORY`` exchanges of ``user_id`` as text.

        Returns ``"No conversation history"`` for an unknown user and an
        empty string for a known user with no exchanges.
        """
        if user_id not in self.conversations:
            return "No conversation history"

        lines = []
        for entry in self.conversations[user_id][-self.DISPLAY_HISTORY:]:
            lines.append(f"🎤 You ({entry['user_emotion']}): {entry['user_input']}")
            lines.append(f"🤖 Maya: {entry['ai_response']}")
            lines.append(f"⏱️ Response time: {entry['processing_time']:.2f}s\n")

        return "\n".join(lines)

    def clear_conversation(self, user_id="default"):
        """Empty the stored history of ``user_id`` (if any) and return a status message."""
        if user_id in self.conversations:
            self.conversations[user_id] = []
        return "Conversation cleared!"
209
+
210
# Single shared assistant used by every Gradio callback below.
# Constructing it loads all models, so this runs once at import time.
ai_system = ConversationalAI()
212
+
213
+ # Gradio interface
214
def process_audio(audio):
    """Gradio callback: run one exchange for the recorded ``audio``.

    Returns the ``(transcription, response_audio, history)`` triple produced
    by the shared assistant, in the order the output widgets expect.
    """
    heard, spoken_reply, transcript_log = ai_system.process_conversation(audio)
    return (heard, spoken_reply, transcript_log)
217
+
218
def clear_chat():
    """Gradio callback: wipe the default user's history.

    Returns two strings, one per output widget: the assistant's status
    message and a fixed confirmation text.
    """
    status = ai_system.clear_conversation()
    confirmation = "Conversation cleared!"
    return status, confirmation
221
+
222
+ # Create Gradio interface
223
# Page layout: microphone and buttons on the left, results on the right.
# Emoji in the labels were mis-encoded (UTF-8 read with a legacy code page)
# and are restored here.
with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Maya AI - Your Emotional Conversational Partner")
    gr.Markdown("*Powered by Parakeet ASR, Gemini LLM, and Dia TTS with emotional intelligence*")

    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath": callbacks receive the path of the recording.
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="🎙️ Speak to Maya",
                interactive=True,
            )

            process_btn = gr.Button("💬 Process Conversation", variant="primary")
            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")

        with gr.Column(scale=2):
            transcription_output = gr.Textbox(
                label="📝 What you said",
                interactive=False,
                lines=3,
            )

            audio_output = gr.Audio(
                label="🔊 Maya's Response",
                interactive=False,
            )

            conversation_history = gr.Textbox(
                label="💭 Conversation History",
                interactive=False,
                lines=15,
                max_lines=20,
            )

    # Event handlers
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, audio_output, conversation_history],
    )

    clear_btn.click(
        fn=clear_chat,
        outputs=[transcription_output, conversation_history],
    )

    # Auto-process when audio is recorded. This also fires when the widget
    # is cleared; process_audio then receives None and returns a prompt
    # message instead of running the pipeline.
    audio_input.change(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, audio_output, conversation_history],
    )
276
+
277
+ # Launch the app
278
if __name__ == "__main__":
    # Start the web server only when run as a script: listen on every
    # interface at port 7860, request a public share link, and surface
    # callback errors in the browser.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "show_error": True,
    }
    demo.launch(**launch_options)