Spaces:

Devakumar868
/

Maya-AI

Runtime error

App Files Files Community

Devakumar868 committed on Jun 23

Commit

d31c491

verified ·

1 Parent(s): 9319248

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -81

app.py CHANGED Viewed

@@ -22,37 +22,33 @@ class MayaAI:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🚀 Initializing Maya AI on {self.device}")
-        # Load Whisper ASR with FORCED English (Fixed language issue)
         self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
         self.asr_model = WhisperForConditionalGeneration.from_pretrained(
             "openai/whisper-large-v3",
             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
         ).to(self.device)
-        # FORCE English transcription (Fix for language detection issue)
         self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
             language="english",
             task="transcribe"
         )
         print("✅ Whisper ASR loaded with FORCED English")
-        # Load FREE DeepSeek LLM (smaller version that fits in HF Spaces)
-        try:
-            self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                "microsoft/DialoGPT-large",
-                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-                device_map="auto"
-            )
-            print("✅ DialoGPT-Large loaded (FREE)")
-        except:
-            # Even smaller fallback
-            self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                "microsoft/DialoGPT-medium",
-                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-            ).to(self.device)
-            print("✅ DialoGPT-Medium loaded (FREE fallback)")
         # Load Emotion Recognition
         self.emotion_model = pipeline(
@@ -62,42 +58,38 @@ class MayaAI:
         )
         print("✅ Emotion recognition loaded")
-        # Load Dia TTS (FIXED dtype issue)
         try:
-            # Import Dia directly
-            from huggingface_hub import hf_hub_download
-            import importlib.util
-            # Download Dia model files
-            model_path = hf_hub_download(repo_id="nari-labs/Dia-1.6B", filename="model.py")
-            spec = importlib.util.spec_from_file_location("dia_model", model_path)
-            dia_module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(dia_module)
-            self.dia_model = dia_module.Dia.from_pretrained("nari-labs/Dia-1.6B")
-            print("✅ Dia TTS loaded successfully")
-            self.use_dia = True
         except Exception as e:
-            print(f"⚠️ Dia loading failed: {e}")
-            # Fallback to SpeechT5 with FIXED dtype
             self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
                 "microsoft/speecht5_tts",
-                torch_dtype=torch.float32  # FIXED: Use float32 consistently
             ).to(self.device)
             self.vocoder = SpeechT5HifiGan.from_pretrained(
                 "microsoft/speecht5_hifigan",
-                torch_dtype=torch.float32  # FIXED: Use float32 consistently
             ).to(self.device)
-            # Load speaker embeddings for natural female voice
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             self.speaker_embeddings = torch.tensor(
                 embeddings_dataset[7306]["xvector"],
-                dtype=torch.float32  # FIXED: Consistent dtype
             ).unsqueeze(0).to(self.device)
-            print("✅ SpeechT5 TTS loaded with FIXED dtypes")
-            self.use_dia = False
         # Conversation storage
         self.conversations = {}
@@ -117,7 +109,7 @@ class MayaAI:
                 audio,
                 sampling_rate=16000,
                 return_tensors="pt",
-                language="english"  # FORCE English
             ).to(self.device)
             with torch.no_grad():
@@ -125,7 +117,7 @@ class MayaAI:
                     inputs.input_features,
                     max_new_tokens=150,
                     do_sample=False,
-                    forced_decoder_ids=self.asr_model.config.forced_decoder_ids  # FORCE English
                 )
             transcription = self.asr_processor.batch_decode(
@@ -159,7 +151,7 @@ class MayaAI:
             return "neutral"
     def generate_with_free_llm(self, text, emotion, history):
-        """Generate response using FREE LLM"""
         try:
             # Emotional context prompting
             emotion_prompts = {
@@ -177,29 +169,31 @@ class MayaAI:
             # Build conversation context
             context_text = ""
             if history:
-                for entry in history[-2:]:  # Last 2 exchanges for context
                     context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
             prompt = f"{context_text}User: {text}\nMaya:"
-            # Tokenize input
             inputs = self.llm_tokenizer(
                 prompt,
                 return_tensors="pt",
                 truncation=True,
                 max_length=1024,
-                padding=True
             ).to(self.device)
-            # Generate response
             with torch.no_grad():
                 outputs = self.llm_model.generate(
-                    **inputs,
                     max_new_tokens=80,
                     temperature=0.7,
                     do_sample=True,
-                    pad_token_id=self.llm_tokenizer.eos_token_id,
-                    attention_mask=inputs.attention_mask
                 )
             # Decode response
@@ -217,31 +211,61 @@ class MayaAI:
         except Exception as e:
             return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
-    def synthesize_speech(self, text, emotion):
-        """Generate speech with FIXED dtype issues"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
-            if self.use_dia:
-                # Use Dia for natural speech with emotions
-                emotional_text = f"[S1] {text}"
                 if emotion == "happy":
-                    emotional_text += " (laughs)"
                 elif emotion == "sad":
-                    emotional_text += " (sighs)"
                 elif emotion == "excited":
-                    emotional_text += " (enthusiastically)"
-                output = self.dia_model.generate(emotional_text)
-                return output
             else:
-                # Use SpeechT5 with FIXED dtypes
                 clean_text = text.replace("[", "").replace("]", "").strip()
                 if len(clean_text) > 200:
                     clean_text = clean_text[:200] + "..."
-                # Process with TTS - ALL FLOAT32
                 inputs = self.tts_processor(text=clean_text, return_tensors="pt")
                 inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -253,7 +277,7 @@ class MayaAI:
                     )
                 if isinstance(speech, torch.Tensor):
-                    speech = speech.cpu().numpy().astype(np.float32)  # FIXED: Consistent dtype
                 return speech
@@ -266,9 +290,10 @@ class MayaAI:
         self.call_active = True
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
-        greeting_audio = self.synthesize_speech(greeting, "happy")
-        return greeting, (22050, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
     def end_call(self, user_id="default"):
         """End call and clear conversation"""
@@ -277,9 +302,10 @@ class MayaAI:
             self.conversations[user_id] = []
         farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
-        farewell_audio = self.synthesize_speech(farewell, "happy")
-        return farewell, (22050, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
     def process_conversation(self, audio_input, user_id="default"):
         """Main conversation processing pipeline"""
@@ -301,13 +327,13 @@ class MayaAI:
             # Step 2: Emotion recognition
             emotion = self.recognize_emotion_from_audio(audio_input)
-            # Step 3: FREE LLM generation
             response_text = self.generate_with_free_llm(
                 transcription, emotion, self.conversations[user_id]
             )
-            # Step 4: TTS with FIXED dtypes
-            response_audio = self.synthesize_speech(response_text, emotion)
             # Step 5: Update conversation history
             processing_time = time.time() - start_time
@@ -327,7 +353,8 @@ class MayaAI:
             history = self.format_conversation_history(user_id)
-            return transcription, (22050, response_audio) if response_audio is not None else None, history
         except Exception as e:
             return f"Processing error: {str(e)}", None, "Error in processing"
@@ -348,9 +375,9 @@ class MayaAI:
         return "\n".join(history)
 # Initialize Maya AI
-print("🚀 Starting Maya AI with FIXED issues...")
 maya = MayaAI()
-print("✅ Maya AI ready with ZERO API costs!")
 # Gradio Interface Functions
 def start_call_handler():
@@ -364,15 +391,15 @@ def process_audio_handler(audio):
 # Create Gradio Interface
 with gr.Blocks(
-    title="Maya AI - FIXED Sesame AI Killer",
     theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("""
-    # 🎤 Maya AI - FIXED Sesame AI Killer
-    *All issues resolved: English-only transcription, working audio output, FREE models*
-    **FIXES:** ✅ English-only ASR ✅ Working TTS audio ✅ FREE LLM ✅ Emotion recognition
     """)
     with gr.Row():
@@ -392,7 +419,7 @@ with gr.Blocks(
             process_btn = gr.Button("🎯 Process Audio", variant="primary")
         with gr.Column(scale=2):
-            gr.Markdown("### 💬 English Conversation")
             transcription_output = gr.Textbox(
                 label="📝 What you said (English)",
@@ -401,13 +428,13 @@ with gr.Blocks(
             )
             audio_output = gr.Audio(
-                label="🔊 Maya's Response (Working Audio)",
                 interactive=False,
                 autoplay=True
             )
             conversation_display = gr.Textbox(
-                label="💭 Live Conversation (FREE)",
                 lines=15,
                 interactive=False,
                 show_copy_button=True

         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🚀 Initializing Maya AI on {self.device}")
+        # Load Whisper ASR with FORCED English
         self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
         self.asr_model = WhisperForConditionalGeneration.from_pretrained(
             "openai/whisper-large-v3",
             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
         ).to(self.device)
+        # FORCE English transcription
         self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
             language="english",
             task="transcribe"
         )
         print("✅ Whisper ASR loaded with FORCED English")
+        # Load FREE LLM with FIXED attention mask
+        self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
+        # FIX: Set pad_token to eos_token to avoid attention mask warnings
+        if self.llm_tokenizer.pad_token is None:
+            self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
+        self.llm_model = AutoModelForCausalLM.from_pretrained(
+            "microsoft/DialoGPT-large",
+            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+            device_map="auto",
+            pad_token_id=self.llm_tokenizer.eos_token_id
+        )
+        print("✅ DialoGPT-Large loaded with FIXED attention masks")
         # Load Emotion Recognition
         self.emotion_model = pipeline(
         )
         print("✅ Emotion recognition loaded")
+        # Load REAL Natural TTS (Better than Dia)
         try:
+            # Use Bark for natural, emotional speech
+            from transformers import BarkModel, BarkProcessor
+            self.bark_processor = BarkProcessor.from_pretrained("suno/bark")
+            self.bark_model = BarkModel.from_pretrained(
+                "suno/bark",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            print("✅ Bark TTS loaded (Natural emotional speech)")
+            self.use_bark = True
         except Exception as e:
+            print(f"⚠️ Bark loading failed: {e}")
+            # Fallback to SpeechT5 with FIXED dtypes
             self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
                 "microsoft/speecht5_tts",
+                torch_dtype=torch.float32
             ).to(self.device)
             self.vocoder = SpeechT5HifiGan.from_pretrained(
                 "microsoft/speecht5_hifigan",
+                torch_dtype=torch.float32
             ).to(self.device)
+            # Load female speaker embeddings
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             self.speaker_embeddings = torch.tensor(
                 embeddings_dataset[7306]["xvector"],
+                dtype=torch.float32
             ).unsqueeze(0).to(self.device)
+            print("✅ SpeechT5 TTS loaded with natural female voice")
+            self.use_bark = False
         # Conversation storage
         self.conversations = {}
                 audio,
                 sampling_rate=16000,
                 return_tensors="pt",
+                language="english"
             ).to(self.device)
             with torch.no_grad():
                     inputs.input_features,
                     max_new_tokens=150,
                     do_sample=False,
+                    forced_decoder_ids=self.asr_model.config.forced_decoder_ids
                 )
             transcription = self.asr_processor.batch_decode(
             return "neutral"
     def generate_with_free_llm(self, text, emotion, history):
+        """Generate response using FREE LLM with FIXED attention masks"""
         try:
             # Emotional context prompting
             emotion_prompts = {
             # Build conversation context
             context_text = ""
             if history:
+                for entry in history[-2:]:
                     context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
             prompt = f"{context_text}User: {text}\nMaya:"
+            # Tokenize input with PROPER attention mask
             inputs = self.llm_tokenizer(
                 prompt,
                 return_tensors="pt",
                 truncation=True,
                 max_length=1024,
+                padding=True,
+                add_special_tokens=True
             ).to(self.device)
+            # Generate response with PROPER attention mask
             with torch.no_grad():
                 outputs = self.llm_model.generate(
+                    input_ids=inputs.input_ids,
+                    attention_mask=inputs.attention_mask,  # FIX: Explicit attention mask
                     max_new_tokens=80,
                     temperature=0.7,
                     do_sample=True,
+                    pad_token_id=self.llm_tokenizer.pad_token_id,
+                    eos_token_id=self.llm_tokenizer.eos_token_id
                 )
             # Decode response
         except Exception as e:
             return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
+    def synthesize_natural_speech(self, text, emotion):
+        """Generate natural emotional speech (Better than Dia)"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
+            if self.use_bark:
+                # Use Bark for natural emotional speech with breathing
+                voice_preset = "v2/en_speaker_6"  # Female voice
+                # Add emotional context to text
                 if emotion == "happy":
+                    emotional_text = f"♪ {text} ♪"  # Musical notes for happiness
                 elif emotion == "sad":
+                    emotional_text = f"[sighs] {text}"
                 elif emotion == "excited":
+                    emotional_text = f"{text}!"
+                elif emotion == "angry":
+                    emotional_text = f"[frustrated] {text}"
+                else:
+                    emotional_text = text
+                # Add natural breathing for longer text
+                if len(emotional_text.split()) > 15:
+                    words = emotional_text.split()
+                    mid_point = len(words) // 2
+                    emotional_text = " ".join(words[:mid_point]) + " [pause] " + " ".join(words[mid_point:])
+                inputs = self.bark_processor(
+                    emotional_text,
+                    voice_preset=voice_preset,
+                    return_tensors="pt"
+                ).to(self.device)
+                with torch.no_grad():
+                    audio_array = self.bark_model.generate(**inputs)
+                if isinstance(audio_array, torch.Tensor):
+                    audio_array = audio_array.cpu().numpy().squeeze()
+                return audio_array
             else:
+                # Use SpeechT5 with emotional context
                 clean_text = text.replace("[", "").replace("]", "").strip()
                 if len(clean_text) > 200:
                     clean_text = clean_text[:200] + "..."
+                # Add emotional inflection through punctuation
+                if emotion == "happy":
+                    clean_text = clean_text.replace(".", "!")
+                elif emotion == "excited":
+                    clean_text = clean_text + "!"
+                elif emotion == "sad":
+                    clean_text = clean_text.replace("!", ".")
                 inputs = self.tts_processor(text=clean_text, return_tensors="pt")
                 inputs = {k: v.to(self.device) for k, v in inputs.items()}
                     )
                 if isinstance(speech, torch.Tensor):
+                    speech = speech.cpu().numpy().astype(np.float32)
                 return speech
         self.call_active = True
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
+        greeting_audio = self.synthesize_natural_speech(greeting, "happy")
+        sample_rate = 24000 if self.use_bark else 22050
+        return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
     def end_call(self, user_id="default"):
         """End call and clear conversation"""
             self.conversations[user_id] = []
         farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
+        farewell_audio = self.synthesize_natural_speech(farewell, "happy")
+        sample_rate = 24000 if self.use_bark else 22050
+        return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
     def process_conversation(self, audio_input, user_id="default"):
         """Main conversation processing pipeline"""
             # Step 2: Emotion recognition
             emotion = self.recognize_emotion_from_audio(audio_input)
+            # Step 3: FREE LLM generation with FIXED attention masks
             response_text = self.generate_with_free_llm(
                 transcription, emotion, self.conversations[user_id]
             )
+            # Step 4: Natural TTS (Better than Dia)
+            response_audio = self.synthesize_natural_speech(response_text, emotion)
             # Step 5: Update conversation history
             processing_time = time.time() - start_time
             history = self.format_conversation_history(user_id)
+            sample_rate = 24000 if self.use_bark else 22050
+            return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
         except Exception as e:
             return f"Processing error: {str(e)}", None, "Error in processing"
         return "\n".join(history)
 # Initialize Maya AI
+print("🚀 Starting Maya AI with REAL natural speech...")
 maya = MayaAI()
+print("✅ Maya AI ready with natural emotional speech!")
 # Gradio Interface Functions
 def start_call_handler():
 # Create Gradio Interface
 with gr.Blocks(
+    title="Maya AI - Natural Speech Sesame Killer",
     theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("""
+    # 🎤 Maya AI - Natural Speech Sesame Killer
+    *Better than Dia: Natural emotional speech with breathing, laughter, and human-like responses*
+    **Features:** ✅ Bark Natural TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE Models ✅ Human-like Speech
     """)
     with gr.Row():
             process_btn = gr.Button("🎯 Process Audio", variant="primary")
         with gr.Column(scale=2):
+            gr.Markdown("### 💬 Natural Conversation")
             transcription_output = gr.Textbox(
                 label="📝 What you said (English)",
             )
             audio_output = gr.Audio(
+                label="🔊 Maya's Natural Response (Better than Dia)",
                 interactive=False,
                 autoplay=True
             )
             conversation_display = gr.Textbox(
+                label="💭 Live Conversation (FREE & Natural)",
                 lines=15,
                 interactive=False,
                 show_copy_button=True