Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -12,10 +12,15 @@ import time
 from datetime import datetime
 import os
 import warnings
-from datasets import load_dataset
 
-# Import Dia
-
+# Import Dia model correctly[2]
+try:
+    from dia.model import Dia
+    DIA_AVAILABLE = True
+    print("✅ Dia model imported successfully")
+except ImportError as e:
+    print(f"⚠️ Dia import failed: {e}")
+    DIA_AVAILABLE = False
 
 warnings.filterwarnings("ignore")
 
@@ -60,18 +65,36 @@ class MayaAI:
         )
         print("✅ Emotion recognition loaded")
 
-        # Load Dia TTS Model
+        # Load REAL Dia TTS Model[2]
+        if DIA_AVAILABLE:
+            try:
+                # Load Dia model with correct parameters[2]
+                self.dia_model = Dia.from_pretrained(
+                    "nari-labs/Dia-1.6B",
+                    compute_dtype="float16" if self.device == "cuda" else "float32",
+                    device=self.device
+                )
+                print("✅ Dia TTS loaded (Ultra-realistic dialogue generation)")
+                self.use_dia = True
+            except Exception as e:
+                print(f"⚠️ Dia loading failed: {e}")
+                self.use_dia = False
+                self._load_fallback_tts()
+        else:
+            print("⚠️ Dia not available, using fallback TTS")
+            self.use_dia = False
+            self._load_fallback_tts()
+
+        # Conversation storage
+        self.conversations = {}
+        self.call_active = False
+        self.speaker_turn = 1  # Track speaker turns for Dia[2]
+
+    def _load_fallback_tts(self):
+        """Load fallback TTS if Dia is not available"""
         try:
-            self.dia_model = Dia.from_pretrained(
-                "nari-labs/Dia-1.6B",
-                compute_dtype="float16" if self.device == "cuda" else "float32"
-            )[11][13][15]
-            print("✅ Dia TTS loaded successfully from Nari Labs")
-            self.use_dia = True
-        except Exception as e:
-            print(f"⚠️ Dia loading failed: {e}")
-            # Fallback to SpeechT5 with FIXED dtypes
             from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+            from datasets import load_dataset
 
             self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
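For reference, a minimal standalone sketch of the Dia API that this hunk relies on, based on the Nari Labs usage examples at the time of writing. The `device` keyword added in this commit and the exact `generate` arguments may differ between package versions, so treat this as an assumption to verify against the installed `dia` package rather than a definitive call signature:

    # Minimal Dia sketch (assumed API, verify against the installed dia version)
    from dia.model import Dia

    model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")

    # Dia scripts use [S1]/[S2] speaker tags and support non-verbal cues such as (laughs)
    script = "[S1] Hello there! (laughs) [S2] Hi, how are you doing today?"
    audio = model.generate(script, use_torch_compile=False, verbose=False)

    # The upstream examples save output with the model's own helper
    model.save_audio("demo_output.mp3", audio)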
@@ -90,12 +113,9 @@ class MayaAI:
                 dtype=torch.float32
             ).unsqueeze(0).to(self.device)
             print("✅ SpeechT5 TTS loaded as fallback")
-
-
-
-        self.conversations = {}
-        self.call_active = False
-
+        except Exception as e:
+            print(f"❌ Fallback TTS loading failed: {e}")
+
     def transcribe_with_whisper(self, audio_path):
         """Transcribe using Whisper with FORCED English"""
         try:
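The fallback path relies on `self.speaker_embeddings` and `self.vocoder`, whose construction is only partly visible in this hunk. A minimal sketch of the usual SpeechT5 setup from the transformers documentation follows; the x-vector dataset and index are the documentation's example values, not necessarily what app.py uses, and note that SpeechT5's HiFi-GAN vocoder emits 16 kHz audio, which is worth checking against the 22050 Hz rate used later in this diff:

    # SpeechT5 fallback sketch (standard transformers recipe; dataset/index are illustrative)
    import torch
    from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
    from datasets import load_dataset

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # x-vector speaker embedding, as in the transformers documentation example
    embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

    inputs = processor(text="Hello from the fallback voice.", return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # speech is a float tensor; the HiFi-GAN vocoder produces 16 kHz audio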
@@ -189,7 +209,7 @@ class MayaAI:
         with torch.no_grad():
             outputs = self.llm_model.generate(
                 input_ids=inputs.input_ids,
-                attention_mask=inputs.attention_mask,
+                attention_mask=inputs.attention_mask,
                 max_new_tokens=80,
                 temperature=0.7,
                 do_sample=True,
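Passing `attention_mask` explicitly keeps generation correct for padded inputs and avoids the transformers warning about the mask being inferred from the pad token. A small illustrative sketch, where the model name is a placeholder rather than the LLM app.py actually loads:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # placeholder model
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")

    inputs = tokenizer("Hello, how are you feeling today?", return_tensors="pt")
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,  # explicit mask, as in the hunk above
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))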
@@ -213,84 +233,92 @@ class MayaAI:
         return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
 
     def synthesize_with_dia(self, text, emotion):
-        """Generate
+        """Generate ultra-realistic dialogue using Dia[2]"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
 
             if self.use_dia:
-                #
-
+                # Format text for Dia with proper speaker tags[2]
+                speaker_tag = f"[S{self.speaker_turn}]"
+
+                # Add emotional non-verbals based on emotion[2]
                 if emotion == "happy":
-                    emotional_text = f"
+                    emotional_text = f"{speaker_tag} {text} (laughs)"
                 elif emotion == "sad":
-                    emotional_text = f"
+                    emotional_text = f"{speaker_tag} {text} (sighs)"
                 elif emotion == "excited":
-                    emotional_text = f"
+                    emotional_text = f"{speaker_tag} {text}!"
                 elif emotion == "angry":
-                    emotional_text = f"
+                    emotional_text = f"{speaker_tag} {text} (frustrated tone)"
                 elif emotion == "surprised":
-                    emotional_text = f"
+                    emotional_text = f"{speaker_tag} {text} (gasps)"
                 else:
-                    emotional_text = f"
+                    emotional_text = f"{speaker_tag} {text}"
 
-                #
-                if len(emotional_text.split()) > 15:
-                    words = emotional_text.split()
-                    mid_point = len(words) // 2
-                    emotional_text = " ".join(words[:mid_point]) + " (inhales) " + " ".join(words[mid_point:])
-
-                # Generate using Dia model
+                # Generate with Dia[2]
                 output = self.dia_model.generate(
                     emotional_text,
                     use_torch_compile=True if self.device == "cuda" else False,
                     verbose=False
-                )
+                )
+
+                # Toggle speaker for next turn[2]
+                self.speaker_turn = 2 if self.speaker_turn == 1 else 1
 
                 return output
             else:
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Fallback to SpeechT5
+                return self._synthesize_with_fallback(text, emotion)
+
+        except Exception as e:
+            print(f"Dia TTS error: {e}")
+            return self._synthesize_with_fallback(text, emotion)
+
+    def _synthesize_with_fallback(self, text, emotion):
+        """Fallback TTS synthesis"""
+        try:
+            clean_text = text.replace("[", "").replace("]", "").strip()
+            if len(clean_text) > 200:
+                clean_text = clean_text[:200] + "..."
+
+            # Add emotional inflection through punctuation
+            if emotion == "happy":
+                clean_text = clean_text.replace(".", "!")
+            elif emotion == "excited":
+                clean_text = clean_text + "!"
+            elif emotion == "sad":
+                clean_text = clean_text.replace("!", ".")
+
+            inputs = self.tts_processor(text=clean_text, return_tensors="pt")
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                speech = self.tts_model.generate_speech(
+                    inputs["input_ids"],
+                    self.speaker_embeddings,
+                    vocoder=self.vocoder
+                )
+
+            if isinstance(speech, torch.Tensor):
+                speech = speech.cpu().numpy().astype(np.float32)
 
+            return speech
         except Exception as e:
-            print(f"TTS error: {e}")
+            print(f"Fallback TTS error: {e}")
             return None
 
     def start_call(self):
         """Start a new call session"""
         self.call_active = True
+        self.speaker_turn = 1  # Reset speaker turn[2]
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
 
         greeting_audio = self.synthesize_with_dia(greeting, "happy")
 
-        # Dia outputs at
-        sample_rate =
-        return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
+        # Dia outputs at 24kHz, fallback at 22050Hz[2]
+        sample_rate = 24000 if self.use_dia else 22050
+        return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you with ultra-realistic speech..."
 
     def end_call(self, user_id="default"):
         """End call and clear conversation"""
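Gradio's Audio component accepts a `(sample_rate, numpy_array)` tuple, which is what the return statements above build. The hard-coded rates are worth double-checking: upstream Dia examples typically produce 44.1 kHz audio and SpeechT5's vocoder emits 16 kHz, so the 24000/22050 values are assumptions of this commit rather than model guarantees. A small self-contained sketch of the tuple contract:

    import numpy as np
    import gradio as gr

    def make_tone():
        sr = 24000  # assumed rate; must match the array actually returned
        t = np.linspace(0, 1.0, sr, endpoint=False)
        wave = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
        return (sr, wave)  # the (sample_rate, samples) tuple gr.Audio expects

    with gr.Blocks() as demo:
        btn = gr.Button("Play test tone")
        out = gr.Audio(interactive=False)
        btn.click(fn=make_tone, outputs=out)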
@@ -301,7 +329,7 @@ class MayaAI:
         farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
         farewell_audio = self.synthesize_with_dia(farewell, "happy")
 
-        sample_rate =
+        sample_rate = 24000 if self.use_dia else 22050
         return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
 
     def process_conversation(self, audio_input, user_id="default"):
@@ -329,7 +357,7 @@
                 transcription, emotion, self.conversations[user_id]
             )
 
-            # Step 4:
+            # Step 4: Ultra-realistic TTS with Dia[2]
            response_audio = self.synthesize_with_dia(response_text, emotion)
 
             # Step 5: Update conversation history
@@ -344,13 +372,13 @@
 
             self.conversations[user_id].append(conversation_entry)
 
-            # Keep last 1000 exchanges as
+            # Keep last 1000 exchanges as requested[5]
             if len(self.conversations[user_id]) > 1000:
                 self.conversations[user_id] = self.conversations[user_id][-1000:]
 
             history = self.format_conversation_history(user_id)
 
-            sample_rate =
+            sample_rate = 24000 if self.use_dia else 22050
             return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
 
         except Exception as e:
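The slice-based trim above works; a `collections.deque` with `maxlen` is an equivalent structure that enforces the same 1000-entry cap automatically. A small sketch of that alternative, shown only for comparison and not what app.py does:

    from collections import deque

    # A history that silently drops the oldest entry once 1000 are stored
    history = deque(maxlen=1000)
    for i in range(1200):
        history.append({"turn": i})
    assert len(history) == 1000 and history[0]["turn"] == 200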
@@ -372,9 +400,9 @@
         return "\n".join(history)
 
 # Initialize Maya AI
-print("🚀 Starting Maya AI with Dia TTS...")
+print("🚀 Starting Maya AI with REAL Dia TTS...")
 maya = MayaAI()
-print("✅ Maya AI ready with
+print("✅ Maya AI ready with ultra-realistic dialogue generation!")
 
 # Gradio Interface Functions
 def start_call_handler():
@@ -386,17 +414,17 @@ def end_call_handler():
 def process_audio_handler(audio):
     return maya.process_conversation(audio)
 
-# Create Gradio Interface
+# Create Gradio Interface[7]
 with gr.Blocks(
-    title="Maya AI - Dia
+    title="Maya AI - Dia-Powered Sesame Killer",
     theme=gr.themes.Soft()
 ) as demo:
 
     gr.Markdown("""
-    # 🎤 Maya AI - Dia
-    *
+    # 🎤 Maya AI - Dia-Powered Sesame Killer
+    *Ultra-realistic dialogue generation with Dia TTS - Natural breathing, laughter, and human-like responses*
 
-    **Features:** ✅ Dia
+    **Features:** ✅ Real Dia TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE LLM ✅ Ultra-realistic Speech
     """)
 
     with gr.Row():
@@ -416,7 +444,7 @@ with gr.Blocks(
             process_btn = gr.Button("🎯 Process Audio", variant="primary")
 
         with gr.Column(scale=2):
-            gr.Markdown("### 💬
+            gr.Markdown("### 💬 Ultra-Realistic Conversation")
 
             transcription_output = gr.Textbox(
                 label="🎙 What you said (English)",
@@ -425,13 +453,13 @@ with gr.Blocks(
             )
 
             audio_output = gr.Audio(
-                label="🔊 Maya's
+                label="🔊 Maya's Ultra-Realistic Response (Dia TTS)",
                 interactive=False,
                 autoplay=True
             )
 
             conversation_display = gr.Textbox(
-                label="📜 Live Conversation (FREE &
+                label="📜 Live Conversation (FREE & Ultra-Realistic)",
                 lines=15,
                 interactive=False,
                 show_copy_button=True
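The diff ends inside the output column, so the event wiring and launch call are not shown. A hedged, self-contained sketch of how handlers like these are typically connected in gr.Blocks follows; the component names mirror those visible above, but the button names, handler bodies, and exact wiring in app.py are assumptions for illustration:

    import gradio as gr

    # Placeholder handlers standing in for maya.start_call / maya.process_conversation
    def start_call_handler():
        return "Hello! Call started.", None, "Maya: Hello!"

    def process_audio_handler(audio):
        return f"Received: {audio}", None, "Maya: Thanks for the audio."

    with gr.Blocks(title="Wiring sketch") as demo:
        start_btn = gr.Button("Start Call")
        audio_input = gr.Audio(type="filepath")
        process_btn = gr.Button("Process Audio")
        transcription_output = gr.Textbox()
        audio_output = gr.Audio(interactive=False)
        conversation_display = gr.Textbox(lines=5)

        # Each click maps a handler's return tuple onto the listed output components
        start_btn.click(fn=start_call_handler,
                        outputs=[transcription_output, audio_output, conversation_display])
        process_btn.click(fn=process_audio_handler,
                          inputs=audio_input,
                          outputs=[transcription_output, audio_output, conversation_display])

    if __name__ == "__main__":
        demo.launch()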