Runtime error
Update app.py
app.py
CHANGED
@@ -13,133 +13,90 @@ import os
 # Initialize models
 class ConversationalAI:
     def __init__(self):
-        # Load Parakeet
-        self.asr_model = self.load_parakeet_asr()
-
-
-
+        # Load ASR model (using Whisper as fallback since Parakeet may not be available)
+        self.asr_model = pipeline("automatic-speech-recognition",
+                                  model="openai/whisper-large-v3",
+                                  torch_dtype=torch.float16,
+                                  device="cuda" if torch.cuda.is_available() else "cpu")
+
+        # Load LLM (using smaller model for HF Spaces)
+        self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
         self.llm_model = AutoModelForCausalLM.from_pretrained(
-            "
+            "microsoft/DialoGPT-medium",
             torch_dtype=torch.float16,
             device_map="auto"
         )
 
-        # Load
-        self.tts_model = self.load_dia_tts()
+        # Load TTS model
+        self.tts_model = pipeline("text-to-speech",
+                                  model="microsoft/speecht5_tts",
+                                  torch_dtype=torch.float16,
+                                  device="cuda" if torch.cuda.is_available() else "cpu")
 
-        # Load
-        self.emotion_model = self.load_ervq_emotion()
+        # Load emotion recognition
+        self.emotion_model = pipeline("audio-classification",
+                                      model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
+                                      device="cuda" if torch.cuda.is_available() else "cpu")
 
         # Conversation history
         self.conversations = {}
-
-    def load_parakeet_asr(self):
-        try:
-            from nemo.collections.asr import ASRModel
-            model = ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
-            return model
-        except:
-            # Fallback to Whisper if Parakeet unavailable
-            return pipeline("automatic-speech-recognition",
-                            model="openai/whisper-large-v3",
-                            torch_dtype=torch.float16,
-                            device="cuda")
-
-    def load_dia_tts(self):
-        try:
-            # Load Dia model from Nari Labs
-            from transformers import AutoModel
-            model = AutoModel.from_pretrained("narilabs/dia-1.6b",
-                                              torch_dtype=torch.float16,
-                                              device_map="auto")
-            return model
-        except:
-            # Fallback to high-quality alternative
-            return pipeline("text-to-speech",
-                            model="microsoft/speecht5_tts",
-                            torch_dtype=torch.float16,
-                            device="cuda")
-
-    def load_ervq_emotion(self):
-        # ERVQ emotion recognition model
-        try:
-            return pipeline("audio-classification",
-                            model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
-                            device="cuda")
-        except:
-            return None
 
     def transcribe_audio(self, audio_path):
-        """Transcribe audio using
+        """Transcribe audio using Whisper"""
         try:
-            if
-
-
-
-
-            # Whisper fallback
-            result = self.asr_model(audio_path)
-            return result["text"]
+            if audio_path is None:
+                return "No audio provided"
+
+            result = self.asr_model(audio_path)
+            return result["text"]
         except Exception as e:
             return f"Transcription error: {str(e)}"
 
     def recognize_emotion(self, audio_path):
         """Recognize emotion from audio"""
-        if self.emotion_model is None:
-            return "neutral"
-
         try:
+            if audio_path is None:
+                return "neutral"
+
             result = self.emotion_model(audio_path)
             return result[0]["label"].lower()
         except:
             return "neutral"
 
     def generate_response(self, text, emotion, conversation_history):
-        """Generate contextual response
-        # Build context-aware prompt
-        context = f"Previous conversation: {conversation_history[-3:] if conversation_history else 'None'}"
-        emotion_context = f"User emotion detected: {emotion}"
-
-        prompt = f"""You are Maya, a naturally conversational AI assistant with emotional intelligence.
-{context}
-{emotion_context}
-
-Respond naturally and emotionally appropriate to: {text}
-
-Keep responses conversational, empathetic, and under 100 words."""
-
-        inputs = self.llm_tokenizer(prompt, return_tensors="pt").to("cuda")
-
-        with torch.no_grad():
-            outputs = self.llm_model.generate(
-                **inputs,
-                max_new_tokens=150,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.llm_tokenizer.eos_token_id
-            )
-
-        response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # Extract only the new response
-        response = response.split("Respond naturally")[-1].strip()
-
-        return response
-
+        """Generate contextual response"""
+        try:
+            # Build context-aware prompt
+            context = f"Previous conversation: {conversation_history[-2:] if conversation_history else 'None'}"
+            emotion_context = f"User emotion: {emotion}"
+
+            prompt = f"You are Maya, a friendly AI assistant. {context} {emotion_context} User: {text} Maya:"
+
+            inputs = self.llm_tokenizer.encode(prompt, return_tensors="pt")
+
+            with torch.no_grad():
+                outputs = self.llm_model.generate(
+                    inputs,
+                    max_new_tokens=100,
+                    temperature=0.7,
+                    do_sample=True,
+                    pad_token_id=self.llm_tokenizer.eos_token_id
+                )
+
+            response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Extract only the new response
+            response = response.split("Maya:")[-1].strip()
+
+            return response
+        except Exception as e:
+            return f"I'm sorry, I encountered an error: {str(e)}"
+
-    def synthesize_speech(self, text, emotion):
-        """Generate emotional speech using Dia TTS"""
+    def synthesize_speech(self, text):
+        """Generate speech using TTS"""
         try:
-            #
-
-
-
-
-
-
-
-
-
+            # Use a simple TTS approach for HF Spaces
+            audio = self.tts_model(text)
+            return audio["audio"]
         except Exception as e:
             return None
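Note on the new synthesize_speech: the transformers "text-to-speech" pipeline for microsoft/speecht5_tts normally needs speaker embeddings passed via forward_params, and it returns both a waveform and a sampling rate, which gr.Audio expects as a (sample_rate, waveform) tuple. A minimal sketch of that calling pattern; the x-vector dataset, index, and helper name follow the common transformers docs example and are illustrative, not part of this commit:

# Illustrative sketch only, not part of app.py as committed.
import torch
from datasets import load_dataset
from transformers import pipeline

tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Assumed speaker embedding source (standard example dataset for SpeechT5).
xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)

def synthesize(text):
    out = tts(text, forward_params={"speaker_embeddings": speaker_embedding})
    # gr.Audio (numpy mode) accepts a (sample_rate, waveform) tuple.
    return out["sampling_rate"], out["audio"]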
@@ -166,7 +123,7 @@ class ConversationalAI:
             )
 
             # Step 4: Synthesize speech
-            response_audio = self.synthesize_speech(response_text, emotion)
+            response_audio = self.synthesize_speech(response_text)
 
             # Step 5: Update conversation history
             conversation_entry = {
@@ -179,9 +136,9 @@ class ConversationalAI:
 
             self.conversations[user_id].append(conversation_entry)
 
-            # Keep only last
-            if len(self.conversations[user_id]) >
-                self.conversations[user_id] = self.conversations[user_id][-
+            # Keep only last 20 exchanges per user
+            if len(self.conversations[user_id]) > 20:
+                self.conversations[user_id] = self.conversations[user_id][-20:]
 
             # Format conversation history
             history = self.format_conversation_history(user_id)
@@ -194,7 +151,7 @@ class ConversationalAI:
             return "No conversation history"
 
         history = []
-        for entry in self.conversations[user_id][-
+        for entry in self.conversations[user_id][-5:]:  # Show last 5 exchanges
            history.append(f"👤 You ({entry['user_emotion']}): {entry['user_input']}")
            history.append(f"🤖 Maya: {entry['ai_response']}")
            history.append(f"⏱️ Response time: {entry['processing_time']:.2f}s\n")
@@ -210,37 +167,39 @@ class ConversationalAI:
 # Initialize the AI system
 ai_system = ConversationalAI()
 
-# Gradio interface
+# Gradio interface functions
 def process_audio(audio):
+    if audio is None:
+        return "No audio provided", None, "No conversation yet"
+
     transcription, response_audio, history = ai_system.process_conversation(audio)
     return transcription, response_audio, history
 
 def clear_chat():
     message = ai_system.clear_conversation()
-    return
+    return "", "Conversation cleared!"
 
 # Create Gradio interface
-with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🤖 Maya AI - Your
-    gr.Markdown("*
+with gr.Blocks(title="Maya AI - Conversational Assistant", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🤖 Maya AI - Your Conversational Partner")
+    gr.Markdown("*Speak naturally and Maya will respond with voice and emotion recognition*")
 
     with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
-                label="🎙️ Speak to Maya",
-                interactive=True
+                label="🎙️ Speak to Maya"
            )
 
-            process_btn = gr.Button("💬 Process
-            clear_btn = gr.Button("🗑️ Clear
+            process_btn = gr.Button("💬 Process", variant="primary")
+            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
 
        with gr.Column(scale=2):
            transcription_output = gr.Textbox(
                label="📝 What you said",
-
-
+                lines=2,
+                interactive=False
            )
 
            audio_output = gr.Audio(
@@ -250,9 +209,8 @@ with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Sof
 
            conversation_history = gr.Textbox(
                label="📜 Conversation History",
-
-
-                max_lines=20
+                lines=10,
+                interactive=False
            )
 
    # Event handlers
@@ -267,7 +225,7 @@ with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Sof
        outputs=[transcription_output, conversation_history]
    )
 
-    # Auto-process when audio is
+    # Auto-process when audio is uploaded
    audio_input.change(
        fn=process_audio,
        inputs=[audio_input],
@@ -276,9 +234,4 @@ with gr.Blocks(title="Maya AI - Advanced Conversational AI", theme=gr.themes.Sof
 
 # Launch the app
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,
-        show_error=True
-    )
+    demo.launch()
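For reference, the Whisper pipeline set up in __init__ returns a dict with a "text" key; clips longer than about 30 seconds are usually handled by enabling chunking, and float16 weights are best reserved for GPU runs since several CPU ops lack half-precision kernels. A small sketch under those assumptions (file name illustrative):

# Illustrative sketch only: conditional dtype plus chunked long-form decoding.
import torch
from transformers import pipeline

use_cuda = torch.cuda.is_available()
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    device="cuda" if use_cuda else "cpu",
    chunk_length_s=30,  # split long recordings into 30 s windows
)

text = asr("recording.wav")["text"]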
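Similarly, generate_response trims the prompt by splitting the decoded text on "Maya:", which misfires if the model emits that marker again. Slicing off the prompt tokens before decoding is a more robust pattern, and passing the attention mask avoids a generate() warning. A hedged sketch; the reply helper and eos-append convention for DialoGPT are illustrative, not part of this commit:

# Illustrative sketch only: trim the prompt by token count, not by string split.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

def reply(prompt: str) -> str:
    enc = tokenizer(prompt + tokenizer.eos_token, return_tensors="pt")
    with torch.no_grad():
        out = model.generate(
            **enc,  # input_ids and attention_mask
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )
    new_tokens = out[0][enc["input_ids"].shape[-1]:]  # keep only the continuation
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()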