Devakumar868 committed on
Commit
6e6580b
·
verified ·
1 Parent(s): 594a961

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -58
app.py CHANGED
@@ -4,8 +4,7 @@ import numpy as np
4
  import librosa
5
  from transformers import (
6
  pipeline, AutoTokenizer, AutoModelForCausalLM,
7
- WhisperProcessor, WhisperForConditionalGeneration,
8
- SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
  )
10
  import soundfile as sf
11
  import json
@@ -15,6 +14,9 @@ import os
15
  import warnings
16
  from datasets import load_dataset
17
 
 
 
 
18
  warnings.filterwarnings("ignore")
19
 
20
  class MayaAI:
@@ -58,20 +60,19 @@ class MayaAI:
58
  )
59
  print("✅ Emotion recognition loaded")
60
 
61
- # Load REAL Natural TTS (Better than Dia)
62
  try:
63
- # Use Bark for natural, emotional speech
64
- from transformers import BarkModel, BarkProcessor
65
- self.bark_processor = BarkProcessor.from_pretrained("suno/bark")
66
- self.bark_model = BarkModel.from_pretrained(
67
- "suno/bark",
68
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
69
- ).to(self.device)
70
- print("✅ Bark TTS loaded (Natural emotional speech)")
71
- self.use_bark = True
72
  except Exception as e:
73
- print(f"⚠️ Bark loading failed: {e}")
74
  # Fallback to SpeechT5 with FIXED dtypes
 
 
75
  self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
76
  self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
77
  "microsoft/speecht5_tts",
@@ -88,8 +89,8 @@ class MayaAI:
88
  embeddings_dataset[7306]["xvector"],
89
  dtype=torch.float32
90
  ).unsqueeze(0).to(self.device)
91
- print("✅ SpeechT5 TTS loaded with natural female voice")
92
- self.use_bark = False
93
 
94
  # Conversation storage
95
  self.conversations = {}
@@ -211,49 +212,44 @@ class MayaAI:
211
  except Exception as e:
212
  return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
213
 
214
- def synthesize_natural_speech(self, text, emotion):
215
- """Generate natural emotional speech (Better than Dia)"""
216
  try:
217
  if not text or len(text.strip()) == 0:
218
  return None
219
 
220
- if self.use_bark:
221
- # Use Bark for natural emotional speech with breathing
222
- voice_preset = "v2/en_speaker_6" # Female voice
223
-
224
- # Add emotional context to text
225
  if emotion == "happy":
226
- emotional_text = f"♪ {text} ♪" # Musical notes for happiness
227
  elif emotion == "sad":
228
- emotional_text = f"[sighs] {text}"
229
  elif emotion == "excited":
230
- emotional_text = f"{text}!"
231
  elif emotion == "angry":
232
- emotional_text = f"[frustrated] {text}"
 
 
233
  else:
234
- emotional_text = text
235
 
236
- # Add natural breathing for longer text
237
  if len(emotional_text.split()) > 15:
238
  words = emotional_text.split()
239
  mid_point = len(words) // 2
240
- emotional_text = " ".join(words[:mid_point]) + " [pause] " + " ".join(words[mid_point:])
241
-
242
- inputs = self.bark_processor(
243
- emotional_text,
244
- voice_preset=voice_preset,
245
- return_tensors="pt"
246
- ).to(self.device)
247
-
248
- with torch.no_grad():
249
- audio_array = self.bark_model.generate(**inputs)
250
 
251
- if isinstance(audio_array, torch.Tensor):
252
- audio_array = audio_array.cpu().numpy().squeeze()
 
 
 
 
253
 
254
- return audio_array
255
  else:
256
- # Use SpeechT5 with emotional context
257
  clean_text = text.replace("[", "").replace("]", "").strip()
258
  if len(clean_text) > 200:
259
  clean_text = clean_text[:200] + "..."
@@ -290,9 +286,10 @@ class MayaAI:
290
  self.call_active = True
291
  greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
292
 
293
- greeting_audio = self.synthesize_natural_speech(greeting, "happy")
294
 
295
- sample_rate = 24000 if self.use_bark else 22050
 
296
  return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
297
 
298
  def end_call(self, user_id="default"):
@@ -302,9 +299,9 @@ class MayaAI:
302
  self.conversations[user_id] = []
303
 
304
  farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
305
- farewell_audio = self.synthesize_natural_speech(farewell, "happy")
306
 
307
- sample_rate = 24000 if self.use_bark else 22050
308
  return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
309
 
310
  def process_conversation(self, audio_input, user_id="default"):
@@ -332,8 +329,8 @@ class MayaAI:
332
  transcription, emotion, self.conversations[user_id]
333
  )
334
 
335
- # Step 4: Natural TTS (Better than Dia)
336
- response_audio = self.synthesize_natural_speech(response_text, emotion)
337
 
338
  # Step 5: Update conversation history
339
  processing_time = time.time() - start_time
@@ -347,13 +344,13 @@ class MayaAI:
347
 
348
  self.conversations[user_id].append(conversation_entry)
349
 
350
- # Keep last 1000 exchanges
351
  if len(self.conversations[user_id]) > 1000:
352
  self.conversations[user_id] = self.conversations[user_id][-1000:]
353
 
354
  history = self.format_conversation_history(user_id)
355
 
356
- sample_rate = 24000 if self.use_bark else 22050
357
  return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
358
 
359
  except Exception as e:
@@ -375,7 +372,7 @@ class MayaAI:
375
  return "\n".join(history)
376
 
377
  # Initialize Maya AI
378
- print("🚀 Starting Maya AI with REAL natural speech...")
379
  maya = MayaAI()
380
  print("✅ Maya AI ready with natural emotional speech!")
381
 
@@ -391,15 +388,15 @@ def process_audio_handler(audio):
391
 
392
  # Create Gradio Interface
393
  with gr.Blocks(
394
- title="Maya AI - Natural Speech Sesame Killer",
395
  theme=gr.themes.Soft()
396
  ) as demo:
397
 
398
  gr.Markdown("""
399
- # 🎀 Maya AI - Natural Speech Sesame Killer
400
- *Better than Dia: Natural emotional speech with breathing, laughter, and human-like responses*
401
 
402
- **Features:** ✅ Bark Natural TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE Models ✅ Human-like Speech
403
  """)
404
 
405
  with gr.Row():
@@ -419,7 +416,7 @@ with gr.Blocks(
419
  process_btn = gr.Button("🎯 Process Audio", variant="primary")
420
 
421
  with gr.Column(scale=2):
422
- gr.Markdown("### 💬 Natural Conversation")
423
 
424
  transcription_output = gr.Textbox(
425
  label="📝 What you said (English)",
@@ -428,13 +425,13 @@ with gr.Blocks(
428
  )
429
 
430
  audio_output = gr.Audio(
431
- label="🔊 Maya's Natural Response (Better than Dia)",
432
  interactive=False,
433
  autoplay=True
434
  )
435
 
436
  conversation_display = gr.Textbox(
437
- label="💭 Live Conversation (FREE & Natural)",
438
  lines=15,
439
  interactive=False,
440
  show_copy_button=True
 
4
  import librosa
5
  from transformers import (
6
  pipeline, AutoTokenizer, AutoModelForCausalLM,
7
+ WhisperProcessor, WhisperForConditionalGeneration
 
8
  )
9
  import soundfile as sf
10
  import json
 
14
  import warnings
15
  from datasets import load_dataset
16
 
17
+ # Import Dia TTS model
18
+ from dia.model import Dia
19
+
20
  warnings.filterwarnings("ignore")
21
 
22
  class MayaAI:
 
60
  )
61
  print("✅ Emotion recognition loaded")
62
 
63
+ # Load Dia TTS Model (The REAL Dia from Nari Labs)
64
  try:
65
+ self.dia_model = Dia.from_pretrained(
66
+ "nari-labs/Dia-1.6B",
67
+ compute_dtype="float16" if self.device == "cuda" else "float32"
68
+ )[11][13][15]
69
+ print("✅ Dia TTS loaded successfully from Nari Labs")
70
+ self.use_dia = True
 
 
 
71
  except Exception as e:
72
+ print(f"⚠️ Dia loading failed: {e}")
73
  # Fallback to SpeechT5 with FIXED dtypes
74
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
75
+
76
  self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
77
  self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
78
  "microsoft/speecht5_tts",
 
89
  embeddings_dataset[7306]["xvector"],
90
  dtype=torch.float32
91
  ).unsqueeze(0).to(self.device)
92
+ print("✅ SpeechT5 TTS loaded as fallback")
93
+ self.use_dia = False
94
 
95
  # Conversation storage
96
  self.conversations = {}
 
212
  except Exception as e:
213
  return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
214
 
215
+ def synthesize_with_dia(self, text, emotion):
216
+ """Generate natural emotional speech using Dia TTS"""[11][13][15]
217
  try:
218
  if not text or len(text.strip()) == 0:
219
  return None
220
 
221
+ if self.use_dia:
222
+ # Use Dia TTS with proper speaker tags and emotional context
223
+ # Add emotional markers based on Dia's supported non-verbal tags
 
 
224
  if emotion == "happy":
225
+ emotional_text = f"[S1] {text} (laughs)"[11][15]
226
  elif emotion == "sad":
227
+ emotional_text = f"[S1] {text} (sighs)"[11][15]
228
  elif emotion == "excited":
229
+ emotional_text = f"[S1] {text}!"
230
  elif emotion == "angry":
231
+ emotional_text = f"[S1] {text} (clears throat)"[11][15]
232
+ elif emotion == "surprised":
233
+ emotional_text = f"[S1] {text} (gasps)"[11][15]
234
  else:
235
+ emotional_text = f"[S1] {text}"[11][15]
236
 
237
+ # Add natural breathing for longer text (Dia feature)
238
  if len(emotional_text.split()) > 15:
239
  words = emotional_text.split()
240
  mid_point = len(words) // 2
241
+ emotional_text = " ".join(words[:mid_point]) + " (inhales) " + " ".join(words[mid_point:])
 
 
 
 
 
 
 
 
 
242
 
243
+ # Generate using Dia model
244
+ output = self.dia_model.generate(
245
+ emotional_text,
246
+ use_torch_compile=True if self.device == "cuda" else False,
247
+ verbose=False
248
+ )[11][18]
249
 
250
+ return output
251
  else:
252
+ # Use SpeechT5 fallback with emotional context
253
  clean_text = text.replace("[", "").replace("]", "").strip()
254
  if len(clean_text) > 200:
255
  clean_text = clean_text[:200] + "..."
 
286
  self.call_active = True
287
  greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
288
 
289
+ greeting_audio = self.synthesize_with_dia(greeting, "happy")
290
 
291
+ # Dia outputs at 44100 Hz sample rate
292
+ sample_rate = 44100 if self.use_dia else 22050
293
  return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
294
 
295
  def end_call(self, user_id="default"):
 
299
  self.conversations[user_id] = []
300
 
301
  farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
302
+ farewell_audio = self.synthesize_with_dia(farewell, "happy")
303
 
304
+ sample_rate = 44100 if self.use_dia else 22050
305
  return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
306
 
307
  def process_conversation(self, audio_input, user_id="default"):
 
329
  transcription, emotion, self.conversations[user_id]
330
  )
331
 
332
+ # Step 4: Dia TTS with natural emotional speech
333
+ response_audio = self.synthesize_with_dia(response_text, emotion)
334
 
335
  # Step 5: Update conversation history
336
  processing_time = time.time() - start_time
 
344
 
345
  self.conversations[user_id].append(conversation_entry)
346
 
347
+ # Keep last 1000 exchanges as specified
348
  if len(self.conversations[user_id]) > 1000:
349
  self.conversations[user_id] = self.conversations[user_id][-1000:]
350
 
351
  history = self.format_conversation_history(user_id)
352
 
353
+ sample_rate = 44100 if self.use_dia else 22050
354
  return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
355
 
356
  except Exception as e:
 
372
  return "\n".join(history)
373
 
374
  # Initialize Maya AI
375
+ print("🚀 Starting Maya AI with Dia TTS...")
376
  maya = MayaAI()
377
  print("✅ Maya AI ready with natural emotional speech!")
378
 
 
388
 
389
  # Create Gradio Interface
390
  with gr.Blocks(
391
+ title="Maya AI - Dia TTS Sesame Killer",
392
  theme=gr.themes.Soft()
393
  ) as demo:
394
 
395
  gr.Markdown("""
396
+ # 🎀 Maya AI - Dia TTS Sesame Killer
397
+ *Powered by Nari Labs Dia TTS: Ultra-realistic dialogue with natural breathing, laughter, and emotional speech*
398
 
399
+ **Features:** ✅ Dia Natural TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE Models ✅ Human-like Speech with Non-verbals
400
  """)
401
 
402
  with gr.Row():
 
416
  process_btn = gr.Button("🎯 Process Audio", variant="primary")
417
 
418
  with gr.Column(scale=2):
419
+ gr.Markdown("### 💬 Natural Dia Conversation")
420
 
421
  transcription_output = gr.Textbox(
422
  label="📝 What you said (English)",
 
425
  )
426
 
427
  audio_output = gr.Audio(
428
+ label="🔊 Maya's Dia Response (Natural with Breathing & Emotions)",
429
  interactive=False,
430
  autoplay=True
431
  )
432
 
433
  conversation_display = gr.Textbox(
434
+ label="💭 Live Conversation (FREE & Natural Dia TTS)",
435
  lines=15,
436
  interactive=False,
437
  show_copy_button=True