Devakumar868 committed on
Commit 2b73c47 · verified · 1 Parent(s): fc714c5

Update app.py

Files changed (1): app.py (+166 -138)
app.py CHANGED
@@ -22,80 +22,128 @@ class MayaAI:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🚀 Initializing Maya AI on {self.device}")
 
-        # Load Parakeet ASR (Best performance)
-        try:
-            from nemo.collections.asr import ASRModel
-            self.asr_model = ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
-            print("✅ Parakeet ASR loaded")
-        except:
-            self.asr_model = pipeline("automatic-speech-recognition",
-                                      model="openai/whisper-large-v3",
-                                      torch_dtype=torch.float16,
-                                      device=self.device)
-            print("⚠️ Using Whisper fallback")
-
-        # Load FREE DeepSeek-V3 LLM (Best free option)[1][5]
+        # Load Whisper ASR with FORCED English (fixes the language-detection issue)
+        self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
+        self.asr_model = WhisperForConditionalGeneration.from_pretrained(
+            "openai/whisper-large-v3",
+            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+        ).to(self.device)
+
+        # FORCE English transcription (fix for the language-detection issue)
+        self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
+            language="english",
+            task="transcribe"
+        )
+        print("✅ Whisper ASR loaded with FORCED English")
+
+        # Load a FREE LLM (smaller model that fits in HF Spaces)
         try:
-            self.llm_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-67b-chat")
+            self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
             self.llm_model = AutoModelForCausalLM.from_pretrained(
-                "deepseek-ai/deepseek-llm-67b-chat",
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True
+                "microsoft/DialoGPT-large",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                device_map="auto"
             )
-            print("✅ DeepSeek-V3 loaded (FREE)")
+            print("✅ DialoGPT-Large loaded (FREE)")
         except:
-            # Fallback to Llama 3.1 (also free)
-            self.llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
+            # Even smaller fallback
+            self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
             self.llm_model = AutoModelForCausalLM.from_pretrained(
-                "meta-llama/Llama-3.1-70B-Instruct",
-                torch_dtype=torch.float16,
-                device_map="auto"
-            )
-            print("✅ Llama 3.1 loaded (FREE fallback)")
+                "microsoft/DialoGPT-medium",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            print("✅ DialoGPT-Medium loaded (FREE fallback)")
 
         # Load Emotion Recognition
-        self.emotion_model = pipeline("audio-classification",
-                                      model="superb/wav2vec2-base-superb-er",
-                                      device=self.device)
+        self.emotion_model = pipeline(
+            "audio-classification",
+            model="superb/wav2vec2-base-superb-er",
+            device=self.device
+        )
         print("✅ Emotion recognition loaded")
 
-        # Load TTS with speaker embeddings (FREE)
-        self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
-            "microsoft/speecht5_tts",
-            torch_dtype=torch.float16
-        ).to(self.device)
-        self.vocoder = SpeechT5HifiGan.from_pretrained(
-            "microsoft/speecht5_hifigan",
-            torch_dtype=torch.float16
-        ).to(self.device)
-
-        # Load speaker embeddings for natural female voice
-        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-        # Use female speaker embedding (index 7306 is female)
-        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
-        print("✅ Natural female TTS voice loaded")
+        # Load Dia TTS (fixes the dtype issue)
+        try:
+            # Import Dia directly
+            from huggingface_hub import hf_hub_download
+            import importlib.util
+
+            # Download the Dia model code
+            model_path = hf_hub_download(repo_id="nari-labs/Dia-1.6B", filename="model.py")
+            spec = importlib.util.spec_from_file_location("dia_model", model_path)
+            dia_module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(dia_module)
+
+            self.dia_model = dia_module.Dia.from_pretrained("nari-labs/Dia-1.6B")
+            print("✅ Dia TTS loaded successfully")
+            self.use_dia = True
+        except Exception as e:
+            print(f"⚠️ Dia loading failed: {e}")
+            # Fallback to SpeechT5 with consistent dtypes
+            self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+            self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
+                "microsoft/speecht5_tts",
+                torch_dtype=torch.float32  # FIXED: use float32 consistently
+            ).to(self.device)
+            self.vocoder = SpeechT5HifiGan.from_pretrained(
+                "microsoft/speecht5_hifigan",
+                torch_dtype=torch.float32  # FIXED: use float32 consistently
+            ).to(self.device)
+
+            # Speaker embedding for a natural female voice (index 7306 is a female speaker)
+            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            self.speaker_embeddings = torch.tensor(
+                embeddings_dataset[7306]["xvector"],
+                dtype=torch.float32  # FIXED: consistent dtype
+            ).unsqueeze(0).to(self.device)
+            print("✅ SpeechT5 TTS loaded with FIXED dtypes")
+            self.use_dia = False
 
         # Conversation storage
         self.conversations = {}
         self.call_active = False
 
-    def transcribe_with_parakeet(self, audio_path):
-        """Transcribe using Parakeet (6.05% WER)"""
+    def transcribe_with_whisper(self, audio_path):
+        """Transcribe using Whisper with FORCED English"""
         try:
-            if hasattr(self.asr_model, 'transcribe'):
-                transcription = self.asr_model.transcribe([audio_path])
-                return transcription[0] if transcription else ""
-            else:
-                result = self.asr_model(audio_path)
-                return result["text"]
+            if audio_path is None:
+                return "No audio provided"
+
+            # Load and resample the audio to the 16 kHz mono Whisper expects
+            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
+
+            # Extract log-mel features; the language is forced at generation time
+            inputs = self.asr_processor(
+                audio,
+                sampling_rate=16000,
+                return_tensors="pt"
+            ).to(self.device)
+
+            with torch.no_grad():
+                predicted_ids = self.asr_model.generate(
+                    inputs.input_features.to(dtype=self.asr_model.dtype),  # match the model dtype
+                    max_new_tokens=150,
+                    do_sample=False,
+                    forced_decoder_ids=self.asr_model.config.forced_decoder_ids  # FORCE English
+                )
+
+            transcription = self.asr_processor.batch_decode(
+                predicted_ids,
+                skip_special_tokens=True
+            )[0]
+
+            return transcription.strip()
+
         except Exception as e:
             return f"Transcription error: {str(e)}"
 
     def recognize_emotion_from_audio(self, audio_path):
         """Recognize emotion using superb model"""
         try:
+            if audio_path is None:
+                return "neutral"
+
             result = self.emotion_model(audio_path)
             emotion_label = result[0]["label"].lower()
 
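For reference, the forced-English technique in this hunk comes down to Whisper's decoder prompt IDs. A minimal standalone sketch of the same idea (model ID as in the commit; `sample.wav` is a placeholder path):

```python
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Decoder prompt IDs pin the language and task so Whisper skips auto-detection
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")

audio, _ = librosa.load("sample.wav", sr=16000, mono=True)  # Whisper expects 16 kHz mono
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    ids = model.generate(inputs.input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])
```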
@@ -111,7 +159,7 @@ class MayaAI:
         return "neutral"
 
     def generate_with_free_llm(self, text, emotion, history):
-        """Generate response using FREE DeepSeek-V3 or Llama"""
+        """Generate response using FREE LLM"""
         try:
             # Emotional context prompting
             emotion_prompts = {
@@ -124,36 +172,34 @@ class MayaAI:
                 "neutral": "I'm listening carefully. Please continue."
             }
 
-            context = f"Previous conversation: {history[-3:] if history else 'None'}"
             emotion_context = emotion_prompts.get(emotion, "I'm here to help.")
 
-            prompt = f"""You are Maya, an emotionally intelligent AI assistant with natural conversational abilities.
-
-            {context}
-            User emotion detected: {emotion}
-            User input: {text}
-
-            Respond naturally with emotional intelligence. Keep responses under 100 words and conversational.
-            {emotion_context}
-
-            Maya:"""
+            # Build conversation context from the last two exchanges
+            context_text = ""
+            if history:
+                for entry in history[-2:]:
+                    context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
+
+            prompt = f"{context_text}User: {text}\nMaya:"
 
             # Tokenize input
             inputs = self.llm_tokenizer(
                 prompt,
                 return_tensors="pt",
                 truncation=True,
-                max_length=2048
+                max_length=1024  # padding is unnecessary for a single sequence
             ).to(self.device)
 
             # Generate response
             with torch.no_grad():
                 outputs = self.llm_model.generate(
                     **inputs,
-                    max_new_tokens=100,
+                    max_new_tokens=80,
                     temperature=0.7,
                     do_sample=True,
-                    pad_token_id=self.llm_tokenizer.eos_token_id
+                    pad_token_id=self.llm_tokenizer.eos_token_id  # attention_mask already arrives via **inputs
                 )
 
             # Decode response
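The generation settings in this hunk follow the usual causal-LM recipe: flatten recent turns into a plain-text prompt, sample a continuation, and pass `eos_token_id` as the pad token. A minimal sketch with the commit's fallback checkpoint (the two history turns are invented for illustration):

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Hypothetical two-exchange history flattened into plain-text turns
prompt = (
    "User: Hi Maya!\nMaya: Hello! How are you feeling today?\n"
    "User: Pretty good.\nMaya:"
)
inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024)

with torch.no_grad():
    out = model.generate(
        **inputs,                       # input_ids and attention_mask come in together
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tok.eos_token_id,  # GPT-2-family models define no pad token
    )
# Decode only the newly generated continuation after "Maya:"
print(tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```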
@@ -171,48 +217,45 @@ class MayaAI:
         except Exception as e:
             return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
 
-    def synthesize_emotional_speech(self, text, emotion):
-        """Generate emotional speech with natural breathing"""
+    def synthesize_speech(self, text, emotion):
+        """Generate speech with FIXED dtype issues"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
 
-            # Add emotional markers to text
-            emotional_text = text
-            if emotion == "happy":
-                emotional_text = f"*cheerfully* {text}"
-            elif emotion == "sad":
-                emotional_text = f"*gently* {text}"
-            elif emotion == "excited":
-                emotional_text = f"*enthusiastically* {text}"
-            elif emotion == "angry":
-                emotional_text = f"*calmly* {text}"
-
-            # Clean text for TTS
-            clean_text = emotional_text.replace("*", "").replace("[", "").replace("]", "").strip()
-            if len(clean_text) > 200:
-                clean_text = clean_text[:200] + "..."
-
-            # Add natural breathing pauses for longer text
-            if len(clean_text.split()) > 10:
-                words = clean_text.split()
-                mid_point = len(words) // 2
-                clean_text = " ".join(words[:mid_point]) + "... " + " ".join(words[mid_point:])
-
-            # Process with TTS
-            inputs = self.tts_processor(text=clean_text, return_tensors="pt").to(self.device)
-
-            with torch.no_grad():
-                speech = self.tts_model.generate_speech(
-                    inputs["input_ids"],
-                    self.speaker_embeddings,
-                    vocoder=self.vocoder
-                )
-
-            if isinstance(speech, torch.Tensor):
-                speech = speech.cpu().numpy()
-
-            return speech
+            if self.use_dia:
+                # Use Dia for natural speech with emotion cues
+                emotional_text = f"[S1] {text}"
+                if emotion == "happy":
+                    emotional_text += " (laughs)"
+                elif emotion == "sad":
+                    emotional_text += " (sighs)"
+                elif emotion == "excited":
+                    emotional_text += " (enthusiastically)"
+
+                output = self.dia_model.generate(emotional_text)
+                return output
+            else:
+                # Use SpeechT5 with FIXED dtypes
+                clean_text = text.replace("[", "").replace("]", "").strip()
+                if len(clean_text) > 200:
+                    clean_text = clean_text[:200] + "..."
+
+                # Process with TTS - everything stays float32
+                inputs = self.tts_processor(text=clean_text, return_tensors="pt")
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    speech = self.tts_model.generate_speech(
+                        inputs["input_ids"],
+                        self.speaker_embeddings,
+                        vocoder=self.vocoder
+                    )
+
+                if isinstance(speech, torch.Tensor):
+                    speech = speech.cpu().numpy().astype(np.float32)  # FIXED: consistent dtype
+
+                return speech
 
         except Exception as e:
             print(f"TTS error: {e}")
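The dtype errors this commit works around come from mixing float16 weights with float32 inputs; keeping the whole SpeechT5 chain in float32 avoids them. A minimal sketch with the same model and dataset IDs (the input text is a placeholder):

```python
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")   # float32 by default
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")   # float32 by default

# Keep the speaker x-vector in float32 so it matches the model weights
xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker = torch.tensor(xvectors[7306]["xvector"], dtype=torch.float32).unsqueeze(0)

inputs = processor(text="Hello from Maya!", return_tensors="pt")
with torch.no_grad():
    speech = tts.generate_speech(inputs["input_ids"], speaker, vocoder=vocoder)

# SpeechT5's HiFi-GAN vocoder produces 16 kHz mono audio
audio = speech.cpu().numpy().astype("float32")
```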
@@ -223,7 +266,7 @@ class MayaAI:
         self.call_active = True
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
 
-        greeting_audio = self.synthesize_emotional_speech(greeting, "happy")
+        greeting_audio = self.synthesize_speech(greeting, "happy")
 
         return greeting, (22050, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
 
@@ -234,7 +277,7 @@ class MayaAI:
            self.conversations[user_id] = []
 
        farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
-        farewell_audio = self.synthesize_emotional_speech(farewell, "happy")
+        farewell_audio = self.synthesize_speech(farewell, "happy")
 
        return farewell, (22050, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
 
@@ -252,8 +295,8 @@ class MayaAI:
            self.conversations[user_id] = []
 
        try:
-            # Step 1: ASR with Parakeet
-            transcription = self.transcribe_with_parakeet(audio_input)
+            # Step 1: ASR with FORCED English
+            transcription = self.transcribe_with_whisper(audio_input)
 
            # Step 2: Emotion recognition
            emotion = self.recognize_emotion_from_audio(audio_input)
@@ -263,8 +306,8 @@ class MayaAI:
                transcription, emotion, self.conversations[user_id]
            )
 
-            # Step 4: Emotional TTS
-            response_audio = self.synthesize_emotional_speech(response_text, emotion)
+            # Step 4: TTS with FIXED dtypes
+            response_audio = self.synthesize_speech(response_text, emotion)
 
            # Step 5: Update conversation history
            processing_time = time.time() - start_time
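Steps 1 through 5 chain ASR, emotion recognition, the LLM, and TTS. Condensed into one helper, the flow looks like the sketch below (method names follow the commit; `run_pipeline` itself and the default `user_id` are hypothetical):

```python
import time

def run_pipeline(maya, audio_path, user_id="default"):
    start = time.time()
    transcription = maya.transcribe_with_whisper(audio_path)          # Step 1: ASR
    emotion = maya.recognize_emotion_from_audio(audio_path)           # Step 2: emotion
    reply = maya.generate_with_free_llm(                              # Step 3: LLM
        transcription, emotion, maya.conversations.get(user_id, [])
    )
    audio = maya.synthesize_speech(reply, emotion)                    # Step 4: TTS
    elapsed = time.time() - start                                     # Step 5: bookkeeping
    return transcription, emotion, reply, audio, elapsed
```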
@@ -278,7 +321,7 @@ class MayaAI:
 
            self.conversations[user_id].append(conversation_entry)
 
-            # Keep last 1000 exchanges as specified
+            # Keep last 1000 exchanges
            if len(self.conversations[user_id]) > 1000:
                self.conversations[user_id] = self.conversations[user_id][-1000:]
 
@@ -305,7 +348,7 @@ class MayaAI:
        return "\n".join(history)
 
 # Initialize Maya AI
-print("🚀 Starting Maya AI with FREE models...")
+print("🚀 Starting Maya AI with FIXED issues...")
 maya = MayaAI()
 print("✅ Maya AI ready with ZERO API costs!")
 
@@ -321,59 +364,44 @@ def process_audio_handler(audio):
 
 # Create Gradio Interface
 with gr.Blocks(
-    title="Maya AI - FREE Sesame AI Killer",
-    theme=gr.themes.Soft(),
-    css="""
-    .call-button { background: linear-gradient(45deg, #00d2d3, #01a3a4) !important; }
-    .end-button { background: linear-gradient(45deg, #ff3838, #c0392b) !important; }
-    """
+    title="Maya AI - FIXED Sesame AI Killer",
+    theme=gr.themes.Soft()
 ) as demo:
 
    gr.Markdown("""
-    # 🎤 Maya AI - FREE Sesame AI Killer
-    *Advanced conversational AI with emotional intelligence - NO API COSTS!*
+    # 🎤 Maya AI - FIXED Sesame AI Killer
+    *All issues resolved: English-only transcription, working audio output, FREE models*
 
-    **FREE Models:** DeepSeek-V3 • Parakeet ASR • Emotion Recognition • Natural Female TTS
+    **FIXES:** ✅ English-only ASR ✅ Working TTS audio ✅ FREE LLM ✅ Emotion recognition
    """)
 
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📞 Call Controls")
 
-            start_call_btn = gr.Button(
-                "📞 Start Call",
-                variant="primary",
-                size="lg",
-                elem_classes=["call-button"]
-            )
-
-            end_call_btn = gr.Button(
-                "📞 End Call",
-                variant="stop",
-                size="lg",
-                elem_classes=["end-button"]
-            )
+            start_call_btn = gr.Button("📞 Start Call", variant="primary", size="lg")
+            end_call_btn = gr.Button("📞 End Call", variant="stop", size="lg")
 
            gr.Markdown("### 🎙️ Voice Input")
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
-                label="Record your message"
+                label="Record your message in English"
            )
 
            process_btn = gr.Button("🎯 Process Audio", variant="primary")
 
        with gr.Column(scale=2):
-            gr.Markdown("### 💬 FREE Conversation")
+            gr.Markdown("### 💬 English Conversation")
 
            transcription_output = gr.Textbox(
-                label="📝 What you said",
+                label="📝 What you said (English)",
                lines=2,
                interactive=False
            )
 
            audio_output = gr.Audio(
-                label="🔊 Maya's Emotional Response",
+                label="🔊 Maya's Response (Working Audio)",
                interactive=False,
                autoplay=True
            )
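This hunk defines the widgets, but the event wiring sits outside the diff. A self-contained sketch of the standard Gradio Blocks pattern these components imply (`process_audio` is a stand-in for the commit's `process_audio_handler`, which would return Maya's transcription and audio):

```python
import gradio as gr

def process_audio(audio_path):
    # Stand-in handler: echo the recorded file path, return no audio
    return f"Received: {audio_path}", None

with gr.Blocks(title="Maya AI - wiring sketch", theme=gr.themes.Soft()) as demo:
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record your message in English")
    process_btn = gr.Button("🎯 Process Audio", variant="primary")
    transcription_output = gr.Textbox(label="📝 What you said (English)", lines=2, interactive=False)
    audio_output = gr.Audio(label="🔊 Maya's Response", interactive=False, autoplay=True)

    # The usual Blocks pattern: a button click feeds the handler and fans out to the outputs
    process_btn.click(fn=process_audio, inputs=audio_input, outputs=[transcription_output, audio_output])

if __name__ == "__main__":
    demo.launch()
```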
 