Devakumar868 committed on
Commit 2b73c47 · verified · 1 Parent(s): fc714c5

Update app.py

Files changed (1): app.py (+166 -138)
app.py CHANGED
@@ -22,80 +22,128 @@ class MayaAI:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🚀 Initializing Maya AI on {self.device}")
 
-        # Load Parakeet ASR (Best performance)
-        try:
-            from nemo.collections.asr import ASRModel
-            self.asr_model = ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
-            print("✅ Parakeet ASR loaded")
-        except:
-            self.asr_model = pipeline("automatic-speech-recognition",
-                                      model="openai/whisper-large-v3",
-                                      torch_dtype=torch.float16,
-                                      device=self.device)
-            print("⚠️ Using Whisper fallback")
-
-        # Load FREE DeepSeek-V3 LLM (Best free option)[1][5]
+        # Load Whisper ASR with FORCED English (fixes the language-detection issue)
+        self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
+        self.asr_model = WhisperForConditionalGeneration.from_pretrained(
+            "openai/whisper-large-v3",
+            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+        ).to(self.device)
+
+        # FORCE English transcription (fix for the language-detection issue)
+        self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
+            language="english",
+            task="transcribe"
+        )
+        print("✅ Whisper ASR loaded with FORCED English")
+
+        # Load a FREE LLM (smaller model that fits in HF Spaces)
         try:
-            self.llm_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-67b-chat")
+            self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
             self.llm_model = AutoModelForCausalLM.from_pretrained(
-                "deepseek-ai/deepseek-llm-67b-chat",
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True
+                "microsoft/DialoGPT-large",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                device_map="auto"
             )
-            print("✅ DeepSeek-V3 loaded (FREE)")
+            print("✅ DialoGPT-Large loaded (FREE)")
         except:
-            # Fallback to Llama 3.1 (also free)
-            self.llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
+            # Even smaller fallback
+            self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
             self.llm_model = AutoModelForCausalLM.from_pretrained(
-                "meta-llama/Llama-3.1-70B-Instruct",
-                torch_dtype=torch.float16,
-                device_map="auto"
-            )
-            print("✅ Llama 3.1 loaded (FREE fallback)")
+                "microsoft/DialoGPT-medium",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            print("✅ DialoGPT-Medium loaded (FREE fallback)")
 
         # Load Emotion Recognition
-        self.emotion_model = pipeline("audio-classification",
-                                      model="superb/wav2vec2-base-superb-er",
-                                      device=self.device)
+        self.emotion_model = pipeline(
+            "audio-classification",
+            model="superb/wav2vec2-base-superb-er",
+            device=self.device
+        )
         print("✅ Emotion recognition loaded")
 
-        # Load TTS with speaker embeddings (FREE)
-        self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
-            "microsoft/speecht5_tts",
-            torch_dtype=torch.float16
-        ).to(self.device)
-        self.vocoder = SpeechT5HifiGan.from_pretrained(
-            "microsoft/speecht5_hifigan",
-            torch_dtype=torch.float16
-        ).to(self.device)
-
-        # Load speaker embeddings for natural female voice
-        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-        # Use female speaker embedding (index 7306 is female)
-        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
-        print("✅ Natural female TTS voice loaded")
+        # Load Dia TTS (fixes the dtype issue)
+        try:
+            # Import Dia directly
+            from huggingface_hub import hf_hub_download
+            import importlib.util
+
+            # Download the Dia model code
+            model_path = hf_hub_download(repo_id="nari-labs/Dia-1.6B", filename="model.py")
+            spec = importlib.util.spec_from_file_location("dia_model", model_path)
+            dia_module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(dia_module)
+
+            self.dia_model = dia_module.Dia.from_pretrained("nari-labs/Dia-1.6B")
+            print("✅ Dia TTS loaded successfully")
+            self.use_dia = True
+        except Exception as e:
+            print(f"⚠️ Dia loading failed: {e}")
+            # Fallback to SpeechT5 with consistent dtypes
+            self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+            self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
+                "microsoft/speecht5_tts",
+                torch_dtype=torch.float32  # FIXED: use float32 consistently
+            ).to(self.device)
+            self.vocoder = SpeechT5HifiGan.from_pretrained(
+                "microsoft/speecht5_hifigan",
+                torch_dtype=torch.float32  # FIXED: use float32 consistently
+            ).to(self.device)
+
+            # Speaker embedding for a natural female voice (index 7306 is a female speaker)
+            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            self.speaker_embeddings = torch.tensor(
+                embeddings_dataset[7306]["xvector"],
+                dtype=torch.float32  # FIXED: consistent dtype
+            ).unsqueeze(0).to(self.device)
+            print("✅ SpeechT5 TTS loaded with FIXED dtypes")
+            self.use_dia = False
 
         # Conversation storage
         self.conversations = {}
         self.call_active = False
 
-    def transcribe_with_parakeet(self, audio_path):
-        """Transcribe using Parakeet (6.05% WER)"""
+    def transcribe_with_whisper(self, audio_path):
+        """Transcribe using Whisper with FORCED English"""
         try:
-            if hasattr(self.asr_model, 'transcribe'):
-                transcription = self.asr_model.transcribe([audio_path])
-                return transcription[0] if transcription else ""
-            else:
-                result = self.asr_model(audio_path)
-                return result["text"]
+            if audio_path is None:
+                return "No audio provided"
+
+            # Load and resample the audio to the 16 kHz mono Whisper expects
+            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
+
+            # Extract log-mel features; the language is forced at generation time
+            inputs = self.asr_processor(
+                audio,
+                sampling_rate=16000,
+                return_tensors="pt"
+            ).to(self.device)
+
+            with torch.no_grad():
+                predicted_ids = self.asr_model.generate(
+                    inputs.input_features.to(dtype=self.asr_model.dtype),  # match the model dtype
+                    max_new_tokens=150,
+                    do_sample=False,
+                    forced_decoder_ids=self.asr_model.config.forced_decoder_ids  # FORCE English
+                )
+
+            transcription = self.asr_processor.batch_decode(
+                predicted_ids,
+                skip_special_tokens=True
+            )[0]
+
+            return transcription.strip()
+
         except Exception as e:
             return f"Transcription error: {str(e)}"
 
     def recognize_emotion_from_audio(self, audio_path):
         """Recognize emotion using superb model"""
         try:
+            if audio_path is None:
+                return "neutral"
+
             result = self.emotion_model(audio_path)
             emotion_label = result[0]["label"].lower()
 
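For reference, the forced-English technique in this hunk comes down to Whisper's decoder prompt IDs. A minimal standalone sketch of the same idea (model ID as in the commit; `sample.wav` is a placeholder path):

```python
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Decoder prompt IDs pin the language and task so Whisper skips auto-detection
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")

audio, _ = librosa.load("sample.wav", sr=16000, mono=True)  # Whisper expects 16 kHz mono
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    ids = model.generate(inputs.input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])
```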
@@ -111,7 +159,7 @@ class MayaAI:
         return "neutral"
 
     def generate_with_free_llm(self, text, emotion, history):
-        """Generate response using FREE DeepSeek-V3 or Llama"""
+        """Generate response using FREE LLM"""
         try:
             # Emotional context prompting
             emotion_prompts = {
@@ -124,36 +172,34 @@ class MayaAI:
                 "neutral": "I'm listening carefully. Please continue."
             }
 
-            context = f"Previous conversation: {history[-3:] if history else 'None'}"
             emotion_context = emotion_prompts.get(emotion, "I'm here to help.")
 
-            prompt = f"""You are Maya, an emotionally intelligent AI assistant with natural conversational abilities.
-
-            {context}
-            User emotion detected: {emotion}
-            User input: {text}
-
-            Respond naturally with emotional intelligence. Keep responses under 100 words and conversational.
-            {emotion_context}
-
-            Maya:"""
+            # Build conversation context from the last two exchanges
+            context_text = ""
+            if history:
+                for entry in history[-2:]:
+                    context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
+
+            prompt = f"{context_text}User: {text}\nMaya:"
 
             # Tokenize input
             inputs = self.llm_tokenizer(
                 prompt,
                 return_tensors="pt",
                 truncation=True,
-                max_length=2048
+                max_length=1024  # padding is unnecessary for a single sequence
             ).to(self.device)
 
             # Generate response
             with torch.no_grad():
                 outputs = self.llm_model.generate(
                     **inputs,
-                    max_new_tokens=100,
+                    max_new_tokens=80,
                     temperature=0.7,
                     do_sample=True,
-                    pad_token_id=self.llm_tokenizer.eos_token_id
+                    pad_token_id=self.llm_tokenizer.eos_token_id  # attention_mask already arrives via **inputs
                 )
 
             # Decode response
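The generation settings in this hunk follow the usual causal-LM recipe: flatten recent turns into a plain-text prompt, sample a continuation, and pass `eos_token_id` as the pad token. A minimal sketch with the commit's fallback checkpoint (the two history turns are invented for illustration):

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Hypothetical two-exchange history flattened into plain-text turns
prompt = (
    "User: Hi Maya!\nMaya: Hello! How are you feeling today?\n"
    "User: Pretty good.\nMaya:"
)
inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024)

with torch.no_grad():
    out = model.generate(
        **inputs,                       # input_ids and attention_mask come in together
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tok.eos_token_id,  # GPT-2-family models define no pad token
    )
# Decode only the newly generated continuation after "Maya:"
print(tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```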
@@ -171,48 +217,45 @@ class MayaAI:
         except Exception as e:
             return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
 
-    def synthesize_emotional_speech(self, text, emotion):
-        """Generate emotional speech with natural breathing"""
+    def synthesize_speech(self, text, emotion):
+        """Generate speech with FIXED dtype issues"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
 
-            # Add emotional markers to text
-            emotional_text = text
-            if emotion == "happy":
-                emotional_text = f"*cheerfully* {text}"
-            elif emotion == "sad":
-                emotional_text = f"*gently* {text}"
-            elif emotion == "excited":
-                emotional_text = f"*enthusiastically* {text}"
-            elif emotion == "angry":
-                emotional_text = f"*calmly* {text}"
-
-            # Clean text for TTS
-            clean_text = emotional_text.replace("*", "").replace("[", "").replace("]", "").strip()
-            if len(clean_text) > 200:
-                clean_text = clean_text[:200] + "..."
-
-            # Add natural breathing pauses for longer text
-            if len(clean_text.split()) > 10:
-                words = clean_text.split()
-                mid_point = len(words) // 2
-                clean_text = " ".join(words[:mid_point]) + "... " + " ".join(words[mid_point:])
-
-            # Process with TTS
-            inputs = self.tts_processor(text=clean_text, return_tensors="pt").to(self.device)
-
-            with torch.no_grad():
-                speech = self.tts_model.generate_speech(
-                    inputs["input_ids"],
-                    self.speaker_embeddings,
-                    vocoder=self.vocoder
-                )
-
-            if isinstance(speech, torch.Tensor):
-                speech = speech.cpu().numpy()
-
-            return speech
+            if self.use_dia:
+                # Use Dia for natural speech with emotion cues
+                emotional_text = f"[S1] {text}"
+                if emotion == "happy":
+                    emotional_text += " (laughs)"
+                elif emotion == "sad":
+                    emotional_text += " (sighs)"
+                elif emotion == "excited":
+                    emotional_text += " (enthusiastically)"
+
+                output = self.dia_model.generate(emotional_text)
+                return output
+            else:
+                # Use SpeechT5 with FIXED dtypes
+                clean_text = text.replace("[", "").replace("]", "").strip()
+                if len(clean_text) > 200:
+                    clean_text = clean_text[:200] + "..."
+
+                # Process with TTS - everything stays float32
+                inputs = self.tts_processor(text=clean_text, return_tensors="pt")
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    speech = self.tts_model.generate_speech(
+                        inputs["input_ids"],
+                        self.speaker_embeddings,
+                        vocoder=self.vocoder
+                    )
+
+                if isinstance(speech, torch.Tensor):
+                    speech = speech.cpu().numpy().astype(np.float32)  # FIXED: consistent dtype
+
+                return speech
 
         except Exception as e:
             print(f"TTS error: {e}")
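The dtype errors this commit works around come from mixing float16 weights with float32 inputs; keeping the whole SpeechT5 chain in float32 avoids them. A minimal sketch with the same model and dataset IDs (the input text is a placeholder):

```python
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")   # float32 by default
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")   # float32 by default

# Keep the speaker x-vector in float32 so it matches the model weights
xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker = torch.tensor(xvectors[7306]["xvector"], dtype=torch.float32).unsqueeze(0)

inputs = processor(text="Hello from Maya!", return_tensors="pt")
with torch.no_grad():
    speech = tts.generate_speech(inputs["input_ids"], speaker, vocoder=vocoder)

# SpeechT5's HiFi-GAN vocoder produces 16 kHz mono audio
audio = speech.cpu().numpy().astype("float32")
```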
@@ -223,7 +266,7 @@ class MayaAI:
         self.call_active = True
         greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
 
-        greeting_audio = self.synthesize_emotional_speech(greeting, "happy")
+        greeting_audio = self.synthesize_speech(greeting, "happy")
 
         return greeting, (22050, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
 
@@ -234,7 +277,7 @@ class MayaAI:
            self.conversations[user_id] = []
 
        farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
-        farewell_audio = self.synthesize_emotional_speech(farewell, "happy")
+        farewell_audio = self.synthesize_speech(farewell, "happy")
 
        return farewell, (22050, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
 
@@ -252,8 +295,8 @@ class MayaAI:
            self.conversations[user_id] = []
 
        try:
-            # Step 1: ASR with Parakeet
-            transcription = self.transcribe_with_parakeet(audio_input)
+            # Step 1: ASR with FORCED English
+            transcription = self.transcribe_with_whisper(audio_input)
 
            # Step 2: Emotion recognition
            emotion = self.recognize_emotion_from_audio(audio_input)
@@ -263,8 +306,8 @@ class MayaAI:
                transcription, emotion, self.conversations[user_id]
            )
 
-            # Step 4: Emotional TTS
-            response_audio = self.synthesize_emotional_speech(response_text, emotion)
+            # Step 4: TTS with FIXED dtypes
+            response_audio = self.synthesize_speech(response_text, emotion)
 
            # Step 5: Update conversation history
            processing_time = time.time() - start_time
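Steps 1 through 5 chain ASR, emotion recognition, the LLM, and TTS. Condensed into one helper, the flow looks like the sketch below (method names follow the commit; `run_pipeline` itself and the default `user_id` are hypothetical):

```python
import time

def run_pipeline(maya, audio_path, user_id="default"):
    start = time.time()
    transcription = maya.transcribe_with_whisper(audio_path)          # Step 1: ASR
    emotion = maya.recognize_emotion_from_audio(audio_path)           # Step 2: emotion
    reply = maya.generate_with_free_llm(                              # Step 3: LLM
        transcription, emotion, maya.conversations.get(user_id, [])
    )
    audio = maya.synthesize_speech(reply, emotion)                    # Step 4: TTS
    elapsed = time.time() - start                                     # Step 5: bookkeeping
    return transcription, emotion, reply, audio, elapsed
```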
@@ -278,7 +321,7 @@ class MayaAI:
 
            self.conversations[user_id].append(conversation_entry)
 
-            # Keep last 1000 exchanges as specified
+            # Keep last 1000 exchanges
            if len(self.conversations[user_id]) > 1000:
                self.conversations[user_id] = self.conversations[user_id][-1000:]
 
@@ -305,7 +348,7 @@ class MayaAI:
        return "\n".join(history)
 
 # Initialize Maya AI
-print("🚀 Starting Maya AI with FREE models...")
+print("🚀 Starting Maya AI with FIXED issues...")
 maya = MayaAI()
 print("✅ Maya AI ready with ZERO API costs!")
 
@@ -321,59 +364,44 @@ def process_audio_handler(audio):
 
 # Create Gradio Interface
 with gr.Blocks(
-    title="Maya AI - FREE Sesame AI Killer",
-    theme=gr.themes.Soft(),
-    css="""
-    .call-button { background: linear-gradient(45deg, #00d2d3, #01a3a4) !important; }
-    .end-button { background: linear-gradient(45deg, #ff3838, #c0392b) !important; }
-    """
+    title="Maya AI - FIXED Sesame AI Killer",
+    theme=gr.themes.Soft()
 ) as demo:
 
    gr.Markdown("""
-    # 🎤 Maya AI - FREE Sesame AI Killer
-    *Advanced conversational AI with emotional intelligence - NO API COSTS!*
+    # 🎤 Maya AI - FIXED Sesame AI Killer
+    *All issues resolved: English-only transcription, working audio output, FREE models*
 
-    **FREE Models:** DeepSeek-V3 • Parakeet ASR • Emotion Recognition • Natural Female TTS
+    **FIXES:** ✅ English-only ASR ✅ Working TTS audio ✅ FREE LLM ✅ Emotion recognition
    """)
 
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📞 Call Controls")
 
-            start_call_btn = gr.Button(
-                "📞 Start Call",
-                variant="primary",
-                size="lg",
-                elem_classes=["call-button"]
-            )
-
-            end_call_btn = gr.Button(
-                "📞 End Call",
-                variant="stop",
-                size="lg",
-                elem_classes=["end-button"]
-            )
+            start_call_btn = gr.Button("📞 Start Call", variant="primary", size="lg")
+            end_call_btn = gr.Button("📞 End Call", variant="stop", size="lg")
 
            gr.Markdown("### 🎙️ Voice Input")
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
-                label="Record your message"
+                label="Record your message in English"
            )
 
            process_btn = gr.Button("🎯 Process Audio", variant="primary")
 
        with gr.Column(scale=2):
-            gr.Markdown("### 💬 FREE Conversation")
+            gr.Markdown("### 💬 English Conversation")
 
            transcription_output = gr.Textbox(
-                label="📝 What you said",
+                label="📝 What you said (English)",
                lines=2,
                interactive=False
            )
 
            audio_output = gr.Audio(
-                label="🔊 Maya's Emotional Response",
+                label="🔊 Maya's Response (Working Audio)",
                interactive=False,
                autoplay=True
            )
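This hunk defines the widgets, but the event wiring sits outside the diff. A self-contained sketch of the standard Gradio Blocks pattern these components imply (`process_audio` is a stand-in for the commit's `process_audio_handler`, which would return Maya's transcription and audio):

```python
import gradio as gr

def process_audio(audio_path):
    # Stand-in handler: echo the recorded file path, return no audio
    return f"Received: {audio_path}", None

with gr.Blocks(title="Maya AI - wiring sketch", theme=gr.themes.Soft()) as demo:
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record your message in English")
    process_btn = gr.Button("🎯 Process Audio", variant="primary")
    transcription_output = gr.Textbox(label="📝 What you said (English)", lines=2, interactive=False)
    audio_output = gr.Audio(label="🔊 Maya's Response", interactive=False, autoplay=True)

    # The usual Blocks pattern: a button click feeds the handler and fans out to the outputs
    process_btn.click(fn=process_audio, inputs=audio_input, outputs=[transcription_output, audio_output])

if __name__ == "__main__":
    demo.launch()
```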
 