Devakumar868 committed on
Commit
d31c491
·
verified ·
1 Parent(s): 9319248

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -81
app.py CHANGED
@@ -22,37 +22,33 @@ class MayaAI:
22
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
23
  print(f"πŸš€ Initializing Maya AI on {self.device}")
24
 
25
- # Load Whisper ASR with FORCED English (Fixed language issue)
26
  self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
27
  self.asr_model = WhisperForConditionalGeneration.from_pretrained(
28
  "openai/whisper-large-v3",
29
  torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
30
  ).to(self.device)
31
 
32
- # FORCE English transcription (Fix for language detection issue)
33
  self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
34
  language="english",
35
  task="transcribe"
36
  )
37
  print("βœ… Whisper ASR loaded with FORCED English")
38
 
39
- # Load FREE DeepSeek LLM (smaller version that fits in HF Spaces)
40
- try:
41
- self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
42
- self.llm_model = AutoModelForCausalLM.from_pretrained(
43
- "microsoft/DialoGPT-large",
44
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
45
- device_map="auto"
46
- )
47
- print("βœ… DialoGPT-Large loaded (FREE)")
48
- except:
49
- # Even smaller fallback
50
- self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
51
- self.llm_model = AutoModelForCausalLM.from_pretrained(
52
- "microsoft/DialoGPT-medium",
53
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
54
- ).to(self.device)
55
- print("βœ… DialoGPT-Medium loaded (FREE fallback)")
56
 
57
  # Load Emotion Recognition
58
  self.emotion_model = pipeline(
@@ -62,42 +58,38 @@ class MayaAI:
62
  )
63
  print("βœ… Emotion recognition loaded")
64
 
65
- # Load Dia TTS (FIXED dtype issue)
66
  try:
67
- # Import Dia directly
68
- from huggingface_hub import hf_hub_download
69
- import importlib.util
70
-
71
- # Download Dia model files
72
- model_path = hf_hub_download(repo_id="nari-labs/Dia-1.6B", filename="model.py")
73
- spec = importlib.util.spec_from_file_location("dia_model", model_path)
74
- dia_module = importlib.util.module_from_spec(spec)
75
- spec.loader.exec_module(dia_module)
76
-
77
- self.dia_model = dia_module.Dia.from_pretrained("nari-labs/Dia-1.6B")
78
- print("βœ… Dia TTS loaded successfully")
79
- self.use_dia = True
80
  except Exception as e:
81
- print(f"⚠️ Dia loading failed: {e}")
82
- # Fallback to SpeechT5 with FIXED dtype
83
  self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
84
  self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
85
  "microsoft/speecht5_tts",
86
- torch_dtype=torch.float32 # FIXED: Use float32 consistently
87
  ).to(self.device)
88
  self.vocoder = SpeechT5HifiGan.from_pretrained(
89
  "microsoft/speecht5_hifigan",
90
- torch_dtype=torch.float32 # FIXED: Use float32 consistently
91
  ).to(self.device)
92
 
93
- # Load speaker embeddings for natural female voice
94
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
95
  self.speaker_embeddings = torch.tensor(
96
  embeddings_dataset[7306]["xvector"],
97
- dtype=torch.float32 # FIXED: Consistent dtype
98
  ).unsqueeze(0).to(self.device)
99
- print("βœ… SpeechT5 TTS loaded with FIXED dtypes")
100
- self.use_dia = False
101
 
102
  # Conversation storage
103
  self.conversations = {}
@@ -117,7 +109,7 @@ class MayaAI:
117
  audio,
118
  sampling_rate=16000,
119
  return_tensors="pt",
120
- language="english" # FORCE English
121
  ).to(self.device)
122
 
123
  with torch.no_grad():
@@ -125,7 +117,7 @@ class MayaAI:
125
  inputs.input_features,
126
  max_new_tokens=150,
127
  do_sample=False,
128
- forced_decoder_ids=self.asr_model.config.forced_decoder_ids # FORCE English
129
  )
130
 
131
  transcription = self.asr_processor.batch_decode(
@@ -159,7 +151,7 @@ class MayaAI:
159
  return "neutral"
160
 
161
  def generate_with_free_llm(self, text, emotion, history):
162
- """Generate response using FREE LLM"""
163
  try:
164
  # Emotional context prompting
165
  emotion_prompts = {
@@ -177,29 +169,31 @@ class MayaAI:
177
  # Build conversation context
178
  context_text = ""
179
  if history:
180
- for entry in history[-2:]: # Last 2 exchanges for context
181
  context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
182
 
183
  prompt = f"{context_text}User: {text}\nMaya:"
184
 
185
- # Tokenize input
186
  inputs = self.llm_tokenizer(
187
  prompt,
188
  return_tensors="pt",
189
  truncation=True,
190
  max_length=1024,
191
- padding=True
 
192
  ).to(self.device)
193
 
194
- # Generate response
195
  with torch.no_grad():
196
  outputs = self.llm_model.generate(
197
- **inputs,
 
198
  max_new_tokens=80,
199
  temperature=0.7,
200
  do_sample=True,
201
- pad_token_id=self.llm_tokenizer.eos_token_id,
202
- attention_mask=inputs.attention_mask
203
  )
204
 
205
  # Decode response
@@ -217,31 +211,61 @@ class MayaAI:
217
  except Exception as e:
218
  return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
219
 
220
- def synthesize_speech(self, text, emotion):
221
- """Generate speech with FIXED dtype issues"""
222
  try:
223
  if not text or len(text.strip()) == 0:
224
  return None
225
 
226
- if self.use_dia:
227
- # Use Dia for natural speech with emotions
228
- emotional_text = f"[S1] {text}"
 
 
229
  if emotion == "happy":
230
- emotional_text += " (laughs)"
231
  elif emotion == "sad":
232
- emotional_text += " (sighs)"
233
  elif emotion == "excited":
234
- emotional_text += " (enthusiastically)"
 
 
 
 
235
 
236
- output = self.dia_model.generate(emotional_text)
237
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  else:
239
- # Use SpeechT5 with FIXED dtypes
240
  clean_text = text.replace("[", "").replace("]", "").strip()
241
  if len(clean_text) > 200:
242
  clean_text = clean_text[:200] + "..."
243
 
244
- # Process with TTS - ALL FLOAT32
 
 
 
 
 
 
 
245
  inputs = self.tts_processor(text=clean_text, return_tensors="pt")
246
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
247
 
@@ -253,7 +277,7 @@ class MayaAI:
253
  )
254
 
255
  if isinstance(speech, torch.Tensor):
256
- speech = speech.cpu().numpy().astype(np.float32) # FIXED: Consistent dtype
257
 
258
  return speech
259
 
@@ -266,9 +290,10 @@ class MayaAI:
266
  self.call_active = True
267
  greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
268
 
269
- greeting_audio = self.synthesize_speech(greeting, "happy")
270
 
271
- return greeting, (22050, greeting_audio) if greeting_audio is not None else None, "πŸ“ž Call started! Maya is greeting you..."
 
272
 
273
  def end_call(self, user_id="default"):
274
  """End call and clear conversation"""
@@ -277,9 +302,10 @@ class MayaAI:
277
  self.conversations[user_id] = []
278
 
279
  farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
280
- farewell_audio = self.synthesize_speech(farewell, "happy")
281
 
282
- return farewell, (22050, farewell_audio) if farewell_audio is not None else None, "πŸ“ž Call ended. Conversation cleared!"
 
283
 
284
  def process_conversation(self, audio_input, user_id="default"):
285
  """Main conversation processing pipeline"""
@@ -301,13 +327,13 @@ class MayaAI:
301
  # Step 2: Emotion recognition
302
  emotion = self.recognize_emotion_from_audio(audio_input)
303
 
304
- # Step 3: FREE LLM generation
305
  response_text = self.generate_with_free_llm(
306
  transcription, emotion, self.conversations[user_id]
307
  )
308
 
309
- # Step 4: TTS with FIXED dtypes
310
- response_audio = self.synthesize_speech(response_text, emotion)
311
 
312
  # Step 5: Update conversation history
313
  processing_time = time.time() - start_time
@@ -327,7 +353,8 @@ class MayaAI:
327
 
328
  history = self.format_conversation_history(user_id)
329
 
330
- return transcription, (22050, response_audio) if response_audio is not None else None, history
 
331
 
332
  except Exception as e:
333
  return f"Processing error: {str(e)}", None, "Error in processing"
@@ -348,9 +375,9 @@ class MayaAI:
348
  return "\n".join(history)
349
 
350
  # Initialize Maya AI
351
- print("πŸš€ Starting Maya AI with FIXED issues...")
352
  maya = MayaAI()
353
- print("βœ… Maya AI ready with ZERO API costs!")
354
 
355
  # Gradio Interface Functions
356
  def start_call_handler():
@@ -364,15 +391,15 @@ def process_audio_handler(audio):
364
 
365
  # Create Gradio Interface
366
  with gr.Blocks(
367
- title="Maya AI - FIXED Sesame AI Killer",
368
  theme=gr.themes.Soft()
369
  ) as demo:
370
 
371
  gr.Markdown("""
372
- # 🎀 Maya AI - FIXED Sesame AI Killer
373
- *All issues resolved: English-only transcription, working audio output, FREE models*
374
 
375
- **FIXES:** βœ… English-only ASR βœ… Working TTS audio βœ… FREE LLM βœ… Emotion recognition
376
  """)
377
 
378
  with gr.Row():
@@ -392,7 +419,7 @@ with gr.Blocks(
392
  process_btn = gr.Button("🎯 Process Audio", variant="primary")
393
 
394
  with gr.Column(scale=2):
395
- gr.Markdown("### πŸ’¬ English Conversation")
396
 
397
  transcription_output = gr.Textbox(
398
  label="πŸ“ What you said (English)",
@@ -401,13 +428,13 @@ with gr.Blocks(
401
  )
402
 
403
  audio_output = gr.Audio(
404
- label="πŸ”Š Maya's Response (Working Audio)",
405
  interactive=False,
406
  autoplay=True
407
  )
408
 
409
  conversation_display = gr.Textbox(
410
- label="πŸ’­ Live Conversation (FREE)",
411
  lines=15,
412
  interactive=False,
413
  show_copy_button=True
 
22
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
23
  print(f"πŸš€ Initializing Maya AI on {self.device}")
24
 
25
+ # Load Whisper ASR with FORCED English
26
  self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
27
  self.asr_model = WhisperForConditionalGeneration.from_pretrained(
28
  "openai/whisper-large-v3",
29
  torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
30
  ).to(self.device)
31
 
32
+ # FORCE English transcription
33
  self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
34
  language="english",
35
  task="transcribe"
36
  )
37
  print("βœ… Whisper ASR loaded with FORCED English")
38
 
39
+ # Load FREE LLM with FIXED attention mask
40
+ self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
41
+ # FIX: Set pad_token to eos_token to avoid attention mask warnings
42
+ if self.llm_tokenizer.pad_token is None:
43
+ self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
44
+
45
+ self.llm_model = AutoModelForCausalLM.from_pretrained(
46
+ "microsoft/DialoGPT-large",
47
+ torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
48
+ device_map="auto",
49
+ pad_token_id=self.llm_tokenizer.eos_token_id
50
+ )
51
+ print("βœ… DialoGPT-Large loaded with FIXED attention masks")
 
 
 
 
52
 
53
  # Load Emotion Recognition
54
  self.emotion_model = pipeline(
 
58
  )
59
  print("βœ… Emotion recognition loaded")
60
 
61
+ # Load REAL Natural TTS (Better than Dia)
62
  try:
63
+ # Use Bark for natural, emotional speech
64
+ from transformers import BarkModel, BarkProcessor
65
+ self.bark_processor = BarkProcessor.from_pretrained("suno/bark")
66
+ self.bark_model = BarkModel.from_pretrained(
67
+ "suno/bark",
68
+ torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
69
+ ).to(self.device)
70
+ print("βœ… Bark TTS loaded (Natural emotional speech)")
71
+ self.use_bark = True
 
 
 
 
72
  except Exception as e:
73
+ print(f"⚠️ Bark loading failed: {e}")
74
+ # Fallback to SpeechT5 with FIXED dtypes
75
  self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
76
  self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
77
  "microsoft/speecht5_tts",
78
+ torch_dtype=torch.float32
79
  ).to(self.device)
80
  self.vocoder = SpeechT5HifiGan.from_pretrained(
81
  "microsoft/speecht5_hifigan",
82
+ torch_dtype=torch.float32
83
  ).to(self.device)
84
 
85
+ # Load female speaker embeddings
86
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
87
  self.speaker_embeddings = torch.tensor(
88
  embeddings_dataset[7306]["xvector"],
89
+ dtype=torch.float32
90
  ).unsqueeze(0).to(self.device)
91
+ print("βœ… SpeechT5 TTS loaded with natural female voice")
92
+ self.use_bark = False
93
 
94
  # Conversation storage
95
  self.conversations = {}
 
109
  audio,
110
  sampling_rate=16000,
111
  return_tensors="pt",
112
+ language="english"
113
  ).to(self.device)
114
 
115
  with torch.no_grad():
 
117
  inputs.input_features,
118
  max_new_tokens=150,
119
  do_sample=False,
120
+ forced_decoder_ids=self.asr_model.config.forced_decoder_ids
121
  )
122
 
123
  transcription = self.asr_processor.batch_decode(
 
151
  return "neutral"
152
 
153
  def generate_with_free_llm(self, text, emotion, history):
154
+ """Generate response using FREE LLM with FIXED attention masks"""
155
  try:
156
  # Emotional context prompting
157
  emotion_prompts = {
 
169
  # Build conversation context
170
  context_text = ""
171
  if history:
172
+ for entry in history[-2:]:
173
  context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
174
 
175
  prompt = f"{context_text}User: {text}\nMaya:"
176
 
177
+ # Tokenize input with PROPER attention mask
178
  inputs = self.llm_tokenizer(
179
  prompt,
180
  return_tensors="pt",
181
  truncation=True,
182
  max_length=1024,
183
+ padding=True,
184
+ add_special_tokens=True
185
  ).to(self.device)
186
 
187
+ # Generate response with PROPER attention mask
188
  with torch.no_grad():
189
  outputs = self.llm_model.generate(
190
+ input_ids=inputs.input_ids,
191
+ attention_mask=inputs.attention_mask, # FIX: Explicit attention mask
192
  max_new_tokens=80,
193
  temperature=0.7,
194
  do_sample=True,
195
+ pad_token_id=self.llm_tokenizer.pad_token_id,
196
+ eos_token_id=self.llm_tokenizer.eos_token_id
197
  )
198
 
199
  # Decode response
 
211
  except Exception as e:
212
  return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
213
 
214
+ def synthesize_natural_speech(self, text, emotion):
215
+ """Generate natural emotional speech (Better than Dia)"""
216
  try:
217
  if not text or len(text.strip()) == 0:
218
  return None
219
 
220
+ if self.use_bark:
221
+ # Use Bark for natural emotional speech with breathing
222
+ voice_preset = "v2/en_speaker_6" # Female voice
223
+
224
+ # Add emotional context to text
225
  if emotion == "happy":
226
+ emotional_text = f"β™ͺ {text} β™ͺ" # Musical notes for happiness
227
  elif emotion == "sad":
228
+ emotional_text = f"[sighs] {text}"
229
  elif emotion == "excited":
230
+ emotional_text = f"{text}!"
231
+ elif emotion == "angry":
232
+ emotional_text = f"[frustrated] {text}"
233
+ else:
234
+ emotional_text = text
235
 
236
+ # Add natural breathing for longer text
237
+ if len(emotional_text.split()) > 15:
238
+ words = emotional_text.split()
239
+ mid_point = len(words) // 2
240
+ emotional_text = " ".join(words[:mid_point]) + " [pause] " + " ".join(words[mid_point:])
241
+
242
+ inputs = self.bark_processor(
243
+ emotional_text,
244
+ voice_preset=voice_preset,
245
+ return_tensors="pt"
246
+ ).to(self.device)
247
+
248
+ with torch.no_grad():
249
+ audio_array = self.bark_model.generate(**inputs)
250
+
251
+ if isinstance(audio_array, torch.Tensor):
252
+ audio_array = audio_array.cpu().numpy().squeeze()
253
+
254
+ return audio_array
255
  else:
256
+ # Use SpeechT5 with emotional context
257
  clean_text = text.replace("[", "").replace("]", "").strip()
258
  if len(clean_text) > 200:
259
  clean_text = clean_text[:200] + "..."
260
 
261
+ # Add emotional inflection through punctuation
262
+ if emotion == "happy":
263
+ clean_text = clean_text.replace(".", "!")
264
+ elif emotion == "excited":
265
+ clean_text = clean_text + "!"
266
+ elif emotion == "sad":
267
+ clean_text = clean_text.replace("!", ".")
268
+
269
  inputs = self.tts_processor(text=clean_text, return_tensors="pt")
270
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
271
 
 
277
  )
278
 
279
  if isinstance(speech, torch.Tensor):
280
+ speech = speech.cpu().numpy().astype(np.float32)
281
 
282
  return speech
283
 
 
290
  self.call_active = True
291
  greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
292
 
293
+ greeting_audio = self.synthesize_natural_speech(greeting, "happy")
294
 
295
+ sample_rate = 24000 if self.use_bark else 22050
296
+ return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "πŸ“ž Call started! Maya is greeting you..."
297
 
298
  def end_call(self, user_id="default"):
299
  """End call and clear conversation"""
 
302
  self.conversations[user_id] = []
303
 
304
  farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
305
+ farewell_audio = self.synthesize_natural_speech(farewell, "happy")
306
 
307
+ sample_rate = 24000 if self.use_bark else 22050
308
+ return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "πŸ“ž Call ended. Conversation cleared!"
309
 
310
  def process_conversation(self, audio_input, user_id="default"):
311
  """Main conversation processing pipeline"""
 
327
  # Step 2: Emotion recognition
328
  emotion = self.recognize_emotion_from_audio(audio_input)
329
 
330
+ # Step 3: FREE LLM generation with FIXED attention masks
331
  response_text = self.generate_with_free_llm(
332
  transcription, emotion, self.conversations[user_id]
333
  )
334
 
335
+ # Step 4: Natural TTS (Better than Dia)
336
+ response_audio = self.synthesize_natural_speech(response_text, emotion)
337
 
338
  # Step 5: Update conversation history
339
  processing_time = time.time() - start_time
 
353
 
354
  history = self.format_conversation_history(user_id)
355
 
356
+ sample_rate = 24000 if self.use_bark else 22050
357
+ return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
358
 
359
  except Exception as e:
360
  return f"Processing error: {str(e)}", None, "Error in processing"
 
375
  return "\n".join(history)
376
 
377
  # Initialize Maya AI
378
+ print("πŸš€ Starting Maya AI with REAL natural speech...")
379
  maya = MayaAI()
380
+ print("βœ… Maya AI ready with natural emotional speech!")
381
 
382
  # Gradio Interface Functions
383
  def start_call_handler():
 
391
 
392
  # Create Gradio Interface
393
  with gr.Blocks(
394
+ title="Maya AI - Natural Speech Sesame Killer",
395
  theme=gr.themes.Soft()
396
  ) as demo:
397
 
398
  gr.Markdown("""
399
+ # 🎀 Maya AI - Natural Speech Sesame Killer
400
+ *Better than Dia: Natural emotional speech with breathing, laughter, and human-like responses*
401
 
402
+ **Features:** βœ… Bark Natural TTS βœ… English-only ASR βœ… Emotion Recognition βœ… FREE Models βœ… Human-like Speech
403
  """)
404
 
405
  with gr.Row():
 
419
  process_btn = gr.Button("🎯 Process Audio", variant="primary")
420
 
421
  with gr.Column(scale=2):
422
+ gr.Markdown("### πŸ’¬ Natural Conversation")
423
 
424
  transcription_output = gr.Textbox(
425
  label="πŸ“ What you said (English)",
 
428
  )
429
 
430
  audio_output = gr.Audio(
431
+ label="πŸ”Š Maya's Natural Response (Better than Dia)",
432
  interactive=False,
433
  autoplay=True
434
  )
435
 
436
  conversation_display = gr.Textbox(
437
+ label="πŸ’­ Live Conversation (FREE & Natural)",
438
  lines=15,
439
  interactive=False,
440
  show_copy_button=True