Spaces:

Devakumar868
/

Maya-AI

Runtime error

App Files Files Community

Devakumar868 committed on Jun 23

Commit

6e6580b

verified ·

1 Parent(s): 594a961

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -58

app.py CHANGED Viewed

@@ -4,8 +4,7 @@ import numpy as np
 import librosa
 from transformers import (
     pipeline, AutoTokenizer, AutoModelForCausalLM,
-    WhisperProcessor, WhisperForConditionalGeneration,
-    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 )
 import soundfile as sf
 import json
@@ -15,6 +14,9 @@ import os
 import warnings
 from datasets import load_dataset
 warnings.filterwarnings("ignore")
 class MayaAI:
@@ -58,20 +60,19 @@ class MayaAI:
         )
         print("✅ Emotion recognition loaded")
-        # Load REAL Natural TTS (Better than Dia)
         try:
-            # Use Bark for natural, emotional speech
-            from transformers import BarkModel, BarkProcessor
-            self.bark_processor = BarkProcessor.from_pretrained("suno/bark")
-            self.bark_model = BarkModel.from_pretrained(
-                "suno/bark",
-                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-            ).to(self.device)
-            print("✅ Bark TTS loaded (Natural emotional speech)")
-            self.use_bark = True
         except Exception as e:
-            print(f"⚠️ Bark loading failed: {e}")
             # Fallback to SpeechT5 with FIXED dtypes
             self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
                 "microsoft/speecht5_tts",
@@ -88,8 +89,8 @@ class MayaAI:
                 embeddings_dataset[7306]["xvector"],
                 dtype=torch.float32
             ).unsqueeze(0).to(self.device)
-            print("✅ SpeechT5 TTS loaded with natural female voice")
-            self.use_bark = False
         # Conversation storage
         self.conversations = {}
@@ -211,49 +212,44 @@ class MayaAI:
         except Exception as e:
             return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
-    def synthesize_natural_speech(self, text, emotion):
-        """Generate natural emotional speech (Better than Dia)"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
-            if self.use_bark:
-                # Use Bark for natural emotional speech with breathing
-                voice_preset = "v2/en_speaker_6"  # Female voice
-                # Add emotional context to text
                 if emotion == "happy":
-                    emotional_text = f"♪ {text} ♪"  # Musical notes for happiness
                 elif emotion == "sad":
-                    emotional_text = f"[sighs] {text}"
                 elif emotion == "excited":
-                    emotional_text = f"{text}!"
                 elif emotion == "angry":
-                    emotional_text = f"[frustrated] {text}"
                 else:
-                    emotional_text = text
-                # Add natural breathing for longer text
                 if len(emotional_text.split()) > 15:
                     words = emotional_text.split()
                     mid_point = len(words) // 2
-                    emotional_text = " ".join(words[:mid_point]) + " [pause] " + " ".join(words[mid_point:])
-                inputs = self.bark_processor(
-                    emotional_text,
-                    voice_preset=voice_preset,
-                    return_tensors="pt"
-                ).to(self.device)
-                with torch.no_grad():
-                    audio_array = self.bark_model.generate(**inputs)
-                if isinstance(audio_array, torch.Tensor):
-                    audio_array = audio_array.cpu().numpy().squeeze()
-                return audio_array
             else:
-                # Use SpeechT5 with emotional context
                 clean_text = text.replace("[", "").replace("]", "").strip()
                 if len(clean_text) > 200:
                     clean_text = clean_text[:200] + "..."
@@ -290,9 +286,10 @@ class MayaAI:
         self.call_active = True
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
-        greeting_audio = self.synthesize_natural_speech(greeting, "happy")
-        sample_rate = 24000 if self.use_bark else 22050
         return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
     def end_call(self, user_id="default"):
@@ -302,9 +299,9 @@ class MayaAI:
             self.conversations[user_id] = []
         farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
-        farewell_audio = self.synthesize_natural_speech(farewell, "happy")
-        sample_rate = 24000 if self.use_bark else 22050
         return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
     def process_conversation(self, audio_input, user_id="default"):
@@ -332,8 +329,8 @@ class MayaAI:
                 transcription, emotion, self.conversations[user_id]
             )
-            # Step 4: Natural TTS (Better than Dia)
-            response_audio = self.synthesize_natural_speech(response_text, emotion)
             # Step 5: Update conversation history
             processing_time = time.time() - start_time
@@ -347,13 +344,13 @@ class MayaAI:
             self.conversations[user_id].append(conversation_entry)
-            # Keep last 1000 exchanges
             if len(self.conversations[user_id]) > 1000:
                 self.conversations[user_id] = self.conversations[user_id][-1000:]
             history = self.format_conversation_history(user_id)
-            sample_rate = 24000 if self.use_bark else 22050
             return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
         except Exception as e:
@@ -375,7 +372,7 @@ class MayaAI:
         return "\n".join(history)
 # Initialize Maya AI
-print("🚀 Starting Maya AI with REAL natural speech...")
 maya = MayaAI()
 print("✅ Maya AI ready with natural emotional speech!")
@@ -391,15 +388,15 @@ def process_audio_handler(audio):
 # Create Gradio Interface
 with gr.Blocks(
-    title="Maya AI - Natural Speech Sesame Killer",
     theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("""
-    # 🎤 Maya AI - Natural Speech Sesame Killer
-    *Better than Dia: Natural emotional speech with breathing, laughter, and human-like responses*
-    **Features:** ✅ Bark Natural TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE Models ✅ Human-like Speech
     """)
     with gr.Row():
@@ -419,7 +416,7 @@ with gr.Blocks(
             process_btn = gr.Button("🎯 Process Audio", variant="primary")
         with gr.Column(scale=2):
-            gr.Markdown("### 💬 Natural Conversation")
             transcription_output = gr.Textbox(
                 label="📝 What you said (English)",
@@ -428,13 +425,13 @@ with gr.Blocks(
             )
             audio_output = gr.Audio(
-                label="🔊 Maya's Natural Response (Better than Dia)",
                 interactive=False,
                 autoplay=True
             )
             conversation_display = gr.Textbox(
-                label="💭 Live Conversation (FREE & Natural)",
                 lines=15,
                 interactive=False,
                 show_copy_button=True

 import librosa
 from transformers import (
     pipeline, AutoTokenizer, AutoModelForCausalLM,
+    WhisperProcessor, WhisperForConditionalGeneration
 )
 import soundfile as sf
 import json
 import warnings
 from datasets import load_dataset
+# Import Dia TTS model
+from dia.model import Dia
 warnings.filterwarnings("ignore")
 class MayaAI:
         )
         print("✅ Emotion recognition loaded")
+        # Load Dia TTS Model (The REAL Dia from Nari Labs)
         try:
+            self.dia_model = Dia.from_pretrained(
+                "nari-labs/Dia-1.6B",
+                compute_dtype="float16" if self.device == "cuda" else "float32"
+            )[11][13][15]
+            print("✅ Dia TTS loaded successfully from Nari Labs")
+            self.use_dia = True
         except Exception as e:
+            print(f"⚠️ Dia loading failed: {e}")
             # Fallback to SpeechT5 with FIXED dtypes
+            from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
             self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
                 "microsoft/speecht5_tts",
                 embeddings_dataset[7306]["xvector"],
                 dtype=torch.float32
             ).unsqueeze(0).to(self.device)
+            print("✅ SpeechT5 TTS loaded as fallback")
+            self.use_dia = False
         # Conversation storage
         self.conversations = {}
         except Exception as e:
             return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
+    def synthesize_with_dia(self, text, emotion):
+        """Generate natural emotional speech using Dia TTS"""[11][13][15]
         try:
             if not text or len(text.strip()) == 0:
                 return None
+            if self.use_dia:
+                # Use Dia TTS with proper speaker tags and emotional context
+                # Add emotional markers based on Dia's supported non-verbal tags
                 if emotion == "happy":
+                    emotional_text = f"[S1] {text} (laughs)"[11][15]
                 elif emotion == "sad":
+                    emotional_text = f"[S1] {text} (sighs)"[11][15]
                 elif emotion == "excited":
+                    emotional_text = f"[S1] {text}!"
                 elif emotion == "angry":
+                    emotional_text = f"[S1] {text} (clears throat)"[11][15]
+                elif emotion == "surprised":
+                    emotional_text = f"[S1] {text} (gasps)"[11][15]
                 else:
+                    emotional_text = f"[S1] {text}"[11][15]
+                # Add natural breathing for longer text (Dia feature)
                 if len(emotional_text.split()) > 15:
                     words = emotional_text.split()
                     mid_point = len(words) // 2
+                    emotional_text = " ".join(words[:mid_point]) + " (inhales) " + " ".join(words[mid_point:])
+                # Generate using Dia model
+                output = self.dia_model.generate(
+                    emotional_text,
+                    use_torch_compile=True if self.device == "cuda" else False,
+                    verbose=False
+                )[11][18]
+                return output
             else:
+                # Use SpeechT5 fallback with emotional context
                 clean_text = text.replace("[", "").replace("]", "").strip()
                 if len(clean_text) > 200:
                     clean_text = clean_text[:200] + "..."
         self.call_active = True
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
+        greeting_audio = self.synthesize_with_dia(greeting, "happy")
+        # Dia outputs at 44100 Hz sample rate
+        sample_rate = 44100 if self.use_dia else 22050
         return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
     def end_call(self, user_id="default"):
             self.conversations[user_id] = []
         farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
+        farewell_audio = self.synthesize_with_dia(farewell, "happy")
+        sample_rate = 44100 if self.use_dia else 22050
         return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
     def process_conversation(self, audio_input, user_id="default"):
                 transcription, emotion, self.conversations[user_id]
             )
+            # Step 4: Dia TTS with natural emotional speech
+            response_audio = self.synthesize_with_dia(response_text, emotion)
             # Step 5: Update conversation history
             processing_time = time.time() - start_time
             self.conversations[user_id].append(conversation_entry)
+            # Keep last 1000 exchanges as specified
             if len(self.conversations[user_id]) > 1000:
                 self.conversations[user_id] = self.conversations[user_id][-1000:]
             history = self.format_conversation_history(user_id)
+            sample_rate = 44100 if self.use_dia else 22050
             return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
         except Exception as e:
         return "\n".join(history)
 # Initialize Maya AI
+print("🚀 Starting Maya AI with Dia TTS...")
 maya = MayaAI()
 print("✅ Maya AI ready with natural emotional speech!")
 # Create Gradio Interface
 with gr.Blocks(
+    title="Maya AI - Dia TTS Sesame Killer",
     theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("""
+    # 🎤 Maya AI - Dia TTS Sesame Killer
+    *Powered by Nari Labs Dia TTS: Ultra-realistic dialogue with natural breathing, laughter, and emotional speech*
+    **Features:** ✅ Dia Natural TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE Models ✅ Human-like Speech with Non-verbals
     """)
     with gr.Row():
             process_btn = gr.Button("🎯 Process Audio", variant="primary")
         with gr.Column(scale=2):
+            gr.Markdown("### 💬 Natural Dia Conversation")
             transcription_output = gr.Textbox(
                 label="📝 What you said (English)",
             )
             audio_output = gr.Audio(
+                label="🔊 Maya's Dia Response (Natural with Breathing & Emotions)",
                 interactive=False,
                 autoplay=True
             )
             conversation_display = gr.Textbox(
+                label="💭 Live Conversation (FREE & Natural Dia TTS)",
                 lines=15,
                 interactive=False,
                 show_copy_button=True