Devakumar868 committed on
Commit
c91de72
·
verified ·
1 Parent(s): a4f7834

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -126
app.py CHANGED
@@ -13,133 +13,90 @@ import os
13
  # Initialize models
14
  class ConversationalAI:
15
  def __init__(self):
16
- # Load Parakeet ASR
17
- self.asr_model = self.load_parakeet_asr()
18
-
19
- # Load Gemini (using local alternative due to API constraints)
20
- self.llm_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
 
 
 
21
  self.llm_model = AutoModelForCausalLM.from_pretrained(
22
- "google/gemma-2-9b-it",
23
  torch_dtype=torch.float16,
24
  device_map="auto"
25
  )
26
 
27
- # Load Dia TTS
28
- self.tts_model = self.load_dia_tts()
 
 
 
29
 
30
- # Load ERVQ for emotion recognition
31
- self.emotion_model = self.load_ervq_emotion()
 
 
32
 
33
  # Conversation history
34
  self.conversations = {}
35
-
36
- def load_parakeet_asr(self):
37
- try:
38
- from nemo.collections.asr import ASRModel
39
- model = ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
40
- return model
41
- except:
42
- # Fallback to Whisper if Parakeet unavailable
43
- return pipeline("automatic-speech-recognition",
44
- model="openai/whisper-large-v3",
45
- torch_dtype=torch.float16,
46
- device="cuda")
47
-
48
- def load_dia_tts(self):
49
- try:
50
- # Load Dia model from Nari Labs
51
- from transformers import AutoModel
52
- model = AutoModel.from_pretrained("narilabs/dia-1.6b",
53
- torch_dtype=torch.float16,
54
- device_map="auto")
55
- return model
56
- except:
57
- # Fallback to high-quality alternative
58
- return pipeline("text-to-speech",
59
- model="microsoft/speecht5_tts",
60
- torch_dtype=torch.float16,
61
- device="cuda")
62
-
63
- def load_ervq_emotion(self):
64
- # ERVQ emotion recognition model
65
- try:
66
- return pipeline("audio-classification",
67
- model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
68
- device="cuda")
69
- except:
70
- return None
71
 
72
  def transcribe_audio(self, audio_path):
73
- """Transcribe audio using Parakeet ASR"""
74
  try:
75
- if hasattr(self.asr_model, 'transcribe'):
76
- # Parakeet method
77
- transcription = self.asr_model.transcribe([audio_path])
78
- return transcription[0] if transcription else ""
79
- else:
80
- # Whisper fallback
81
- result = self.asr_model(audio_path)
82
- return result["text"]
83
  except Exception as e:
84
  return f"Transcription error: {str(e)}"
85
 
86
  def recognize_emotion(self, audio_path):
87
  """Recognize emotion from audio"""
88
- if self.emotion_model is None:
89
- return "neutral"
90
-
91
  try:
 
 
 
92
  result = self.emotion_model(audio_path)
93
  return result[0]["label"].lower()
94
  except:
95
  return "neutral"
96
 
97
  def generate_response(self, text, emotion, conversation_history):
98
- """Generate contextual response using Gemini"""
99
- # Build context-aware prompt
100
- context = f"Previous conversation: {conversation_history[-3:] if conversation_history else 'None'}"
101
- emotion_context = f"User emotion detected: {emotion}"
102
-
103
- prompt = f"""You are Maya, a naturally conversational AI assistant with emotional intelligence.
104
- {context}
105
- {emotion_context}
106
-
107
- Respond naturally and emotionally appropriate to: {text}
108
-
109
- Keep responses conversational, empathetic, and under 100 words."""
110
-
111
- inputs = self.llm_tokenizer(prompt, return_tensors="pt").to("cuda")
112
-
113
- with torch.no_grad():
114
- outputs = self.llm_model.generate(
115
- **inputs,
116
- max_new_tokens=150,
117
- temperature=0.7,
118
- do_sample=True,
119
- pad_token_id=self.llm_tokenizer.eos_token_id
120
- )
121
-
122
- response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
123
- # Extract only the new response
124
- response = response.split("Respond naturally")[-1].strip()
125
-
126
- return response
127
-
128
- def synthesize_speech(self, text, emotion):
129
- """Generate emotional speech using Dia TTS"""
130
  try:
131
- # Emotional context for TTS
132
- emotional_prompt = f"[{emotion}] {text}"
 
 
 
 
 
133
 
134
- if hasattr(self.tts_model, 'generate_speech'):
135
- # Dia method
136
- audio = self.tts_model.generate_speech(emotional_prompt)
137
- else:
138
- # Fallback method
139
- audio = self.tts_model(text)
140
- audio = audio["audio"]
 
141
 
142
- return audio
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  except Exception as e:
144
  return None
145
 
@@ -166,7 +123,7 @@ class ConversationalAI:
166
  )
167
 
168
  # Step 4: Synthesize speech
169
- response_audio = self.synthesize_speech(response_text, emotion)
170
 
171
  # Step 5: Update conversation history
172
  conversation_entry = {
@@ -179,9 +136,9 @@ class ConversationalAI:
179
 
180
  self.conversations[user_id].append(conversation_entry)
181
 
182
- # Keep only last 50 exchanges per user
183
- if len(self.conversations[user_id]) > 50:
184
- self.conversations[user_id] = self.conversations[user_id][-50:]
185
 
186
  # Format conversation history
187
  history = self.format_conversation_history(user_id)
@@ -194,7 +151,7 @@ class ConversationalAI:
194
  return "No conversation history"
195
 
196
  history = []
197
- for entry in self.conversations[user_id][-10:]: # Show last 10 exchanges
198
  history.append(f"🎤 You ({entry['user_emotion']}): {entry['user_input']}")
199
  history.append(f"🤖 Maya: {entry['ai_response']}")
200
  history.append(f"⏱️ Response time: {entry['processing_time']:.2f}s\n")
@@ -210,37 +167,39 @@ class ConversationalAI:
210
  # Initialize the AI system
211
  ai_system = ConversationalAI()
212
 
213
- # Gradio interface
214
  def process_audio(audio):
 
 
 
215
  transcription, response_audio, history = ai_system.process_conversation(audio)
216
  return transcription, response_audio, history
217
 
218
  def clear_chat():
219
  message = ai_system.clear_conversation()
220
- return message, "Conversation cleared!"
221
 
222
  # Create Gradio interface
223
- with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Soft()) as demo:
224
- gr.Markdown("# 🎤 Maya AI - Your Emotional Conversational Partner")
225
- gr.Markdown("*Powered by Parakeet ASR, Gemini LLM, and Dia TTS with emotional intelligence*")
226
 
227
  with gr.Row():
228
  with gr.Column(scale=1):
229
  audio_input = gr.Audio(
230
  sources=["microphone"],
231
  type="filepath",
232
- label="🎙️ Speak to Maya",
233
- interactive=True
234
  )
235
 
236
- process_btn = gr.Button("💬 Process Conversation", variant="primary")
237
- clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
238
 
239
  with gr.Column(scale=2):
240
  transcription_output = gr.Textbox(
241
  label="📝 What you said",
242
- interactive=False,
243
- lines=3
244
  )
245
 
246
  audio_output = gr.Audio(
@@ -250,9 +209,8 @@ with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Sof
250
 
251
  conversation_history = gr.Textbox(
252
  label="💭 Conversation History",
253
- interactive=False,
254
- lines=15,
255
- max_lines=20
256
  )
257
 
258
  # Event handlers
@@ -267,7 +225,7 @@ with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Sof
267
  outputs=[transcription_output, conversation_history]
268
  )
269
 
270
- # Auto-process when audio is recorded
271
  audio_input.change(
272
  fn=process_audio,
273
  inputs=[audio_input],
@@ -276,9 +234,4 @@ with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Sof
276
 
277
  # Launch the app
278
  if __name__ == "__main__":
279
- demo.launch(
280
- server_name="0.0.0.0",
281
- server_port=7860,
282
- share=True,
283
- show_error=True
284
- )
 
13
  # Initialize models
14
  class ConversationalAI:
15
  def __init__(self):
16
+ # Load ASR model (using Whisper as fallback since Parakeet may not be available)
17
+ self.asr_model = pipeline("automatic-speech-recognition",
18
+ model="openai/whisper-large-v3",
19
+ torch_dtype=torch.float16,
20
+ device="cuda" if torch.cuda.is_available() else "cpu")
21
+
22
+ # Load LLM (using smaller model for HF Spaces)
23
+ self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
24
  self.llm_model = AutoModelForCausalLM.from_pretrained(
25
+ "microsoft/DialoGPT-medium",
26
  torch_dtype=torch.float16,
27
  device_map="auto"
28
  )
29
 
30
+ # Load TTS model
31
+ self.tts_model = pipeline("text-to-speech",
32
+ model="microsoft/speecht5_tts",
33
+ torch_dtype=torch.float16,
34
+ device="cuda" if torch.cuda.is_available() else "cpu")
35
 
36
+ # Load emotion recognition
37
+ self.emotion_model = pipeline("audio-classification",
38
+ model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
39
+ device="cuda" if torch.cuda.is_available() else "cpu")
40
 
41
  # Conversation history
42
  self.conversations = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  def transcribe_audio(self, audio_path):
45
+ """Transcribe audio using Whisper"""
46
  try:
47
+ if audio_path is None:
48
+ return "No audio provided"
49
+
50
+ result = self.asr_model(audio_path)
51
+ return result["text"]
 
 
 
52
  except Exception as e:
53
  return f"Transcription error: {str(e)}"
54
 
55
  def recognize_emotion(self, audio_path):
56
  """Recognize emotion from audio"""
 
 
 
57
  try:
58
+ if audio_path is None:
59
+ return "neutral"
60
+
61
  result = self.emotion_model(audio_path)
62
  return result[0]["label"].lower()
63
  except:
64
  return "neutral"
65
 
66
  def generate_response(self, text, emotion, conversation_history):
67
+ """Generate contextual response"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  try:
69
+ # Build context-aware prompt
70
+ context = f"Previous conversation: {conversation_history[-2:] if conversation_history else 'None'}"
71
+ emotion_context = f"User emotion: {emotion}"
72
+
73
+ prompt = f"You are Maya, a friendly AI assistant. {context} {emotion_context} User: {text} Maya:"
74
+
75
+ inputs = self.llm_tokenizer.encode(prompt, return_tensors="pt")
76
 
77
+ with torch.no_grad():
78
+ outputs = self.llm_model.generate(
79
+ inputs,
80
+ max_new_tokens=100,
81
+ temperature=0.7,
82
+ do_sample=True,
83
+ pad_token_id=self.llm_tokenizer.eos_token_id
84
+ )
85
 
86
+ response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
87
+ # Extract only the new response
88
+ response = response.split("Maya:")[-1].strip()
89
+
90
+ return response
91
+ except Exception as e:
92
+ return f"I'm sorry, I encountered an error: {str(e)}"
93
+
94
+ def synthesize_speech(self, text):
95
+ """Generate speech using TTS"""
96
+ try:
97
+ # Use a simple TTS approach for HF Spaces
98
+ audio = self.tts_model(text)
99
+ return audio["audio"]
100
  except Exception as e:
101
  return None
102
 
 
123
  )
124
 
125
  # Step 4: Synthesize speech
126
+ response_audio = self.synthesize_speech(response_text)
127
 
128
  # Step 5: Update conversation history
129
  conversation_entry = {
 
136
 
137
  self.conversations[user_id].append(conversation_entry)
138
 
139
+ # Keep only last 20 exchanges per user
140
+ if len(self.conversations[user_id]) > 20:
141
+ self.conversations[user_id] = self.conversations[user_id][-20:]
142
 
143
  # Format conversation history
144
  history = self.format_conversation_history(user_id)
 
151
  return "No conversation history"
152
 
153
  history = []
154
+ for entry in self.conversations[user_id][-5:]: # Show last 5 exchanges
155
  history.append(f"🎤 You ({entry['user_emotion']}): {entry['user_input']}")
156
  history.append(f"🤖 Maya: {entry['ai_response']}")
157
  history.append(f"⏱️ Response time: {entry['processing_time']:.2f}s\n")
 
167
  # Initialize the AI system
168
  ai_system = ConversationalAI()
169
 
170
+ # Gradio interface functions
171
  def process_audio(audio):
172
+ if audio is None:
173
+ return "No audio provided", None, "No conversation yet"
174
+
175
  transcription, response_audio, history = ai_system.process_conversation(audio)
176
  return transcription, response_audio, history
177
 
178
  def clear_chat():
179
  message = ai_system.clear_conversation()
180
+ return "", "Conversation cleared!"
181
 
182
  # Create Gradio interface
183
+ with gr.Blocks(title="Maya AI - Conversational Assistant", theme=gr.themes.Soft()) as demo:
184
+ gr.Markdown("# 🎤 Maya AI - Your Conversational Partner")
185
+ gr.Markdown("*Speak naturally and Maya will respond with voice and emotion recognition*")
186
 
187
  with gr.Row():
188
  with gr.Column(scale=1):
189
  audio_input = gr.Audio(
190
  sources=["microphone"],
191
  type="filepath",
192
+ label="🎙️ Speak to Maya"
 
193
  )
194
 
195
+ process_btn = gr.Button("💬 Process", variant="primary")
196
+ clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
197
 
198
  with gr.Column(scale=2):
199
  transcription_output = gr.Textbox(
200
  label="📝 What you said",
201
+ lines=2,
202
+ interactive=False
203
  )
204
 
205
  audio_output = gr.Audio(
 
209
 
210
  conversation_history = gr.Textbox(
211
  label="💭 Conversation History",
212
+ lines=10,
213
+ interactive=False
 
214
  )
215
 
216
  # Event handlers
 
225
  outputs=[transcription_output, conversation_history]
226
  )
227
 
228
+ # Auto-process when audio is uploaded
229
  audio_input.change(
230
  fn=process_audio,
231
  inputs=[audio_input],
 
234
 
235
  # Launch the app
236
  if __name__ == "__main__":
237
+ demo.launch()