Spaces:

Devakumar868
/

Maya-AI

Runtime error

App Files Community

Devakumar868 committed on Jun 23

Commit

7b6a710

verified ·

1 Parent(s): d71b62b

Update app.py

Browse files

Files changed (1) hide show

app.py +244 -206

app.py CHANGED Viewed

@@ -2,163 +2,158 @@ import gradio as gr
 import torch
 import numpy as np
 import librosa
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, WhisperProcessor, WhisperForConditionalGeneration
 import soundfile as sf
 import json
 import time
 from datetime import datetime
 import os
 import warnings
-# Suppress warnings for cleaner output
 warnings.filterwarnings("ignore")
-class ConversationalAI:
     def __init__(self):
-        # Set device
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Using device: {self.device}")
-        # Load Whisper ASR with proper configuration
-        self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-base.en")
-        self.asr_model = WhisperForConditionalGeneration.from_pretrained(
-            "openai/whisper-base.en",
-            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-        ).to(self.device)
-        # Load LLM with proper device handling
-        self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
-        self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
-        self.llm_model = AutoModelForCausalLM.from_pretrained(
-            "microsoft/DialoGPT-medium",
-            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-            pad_token_id=self.llm_tokenizer.eos_token_id
-        ).to(self.device)
-        # Load TTS model
-        self.tts_model = pipeline(
-            "text-to-speech",
-            model="microsoft/speecht5_tts",
-            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-            device=self.device
-        )
-        # Load WORKING audio emotion recognition model
-        self.emotion_model = pipeline(
-            "audio-classification",
-            model="superb/wav2vec2-base-superb-er",
-            device=self.device
-        )
-        # Conversation history
         self.conversations = {}
-    def transcribe_audio(self, audio_path):
-        """Transcribe audio using Whisper with proper device handling"""
         try:
-            if audio_path is None:
-                return "No audio provided"
-            # Load and preprocess audio
-            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
-            # Process with Whisper
-            inputs = self.asr_processor(
-                audio,
-                sampling_rate=16000,
-                return_tensors="pt",
-                language="en"
-            ).to(self.device)
-            with torch.no_grad():
-                predicted_ids = self.asr_model.generate(
-                    inputs.input_features,
-                    max_new_tokens=100,
-                    do_sample=False
-                )
-            transcription = self.asr_processor.batch_decode(
-                predicted_ids,
-                skip_special_tokens=True
-            )[0]
-            return transcription.strip()
         except Exception as e:
             return f"Transcription error: {str(e)}"
-    def recognize_emotion(self, audio_path):
-        """Recognize emotion from audio using working model"""
         try:
-            if audio_path is None:
-                return "neutral"
             result = self.emotion_model(audio_path)
             emotion_label = result[0]["label"].lower()
-            # Map SUPERB emotions to common emotions
-            emotion_mapping = {
-                "ang": "angry",
-                "hap": "happy",
-                "exc": "excited",
-                "sad": "sad",
-                "fru": "frustrated",
-                "fea": "fearful",
-                "sur": "surprised",
-                "neu": "neutral",
-                "dis": "disgusted"
             }
-            return emotion_mapping.get(emotion_label, emotion_label)
-        except Exception as e:
-            print(f"Emotion recognition error: {e}")
             return "neutral"
-    def generate_response(self, text, emotion, conversation_history):
-        """Generate contextual response with proper device handling"""
         try:
-            if text.startswith("Transcription error") or not text.strip():
-                return "I'm sorry, I couldn't understand what you said. Could you please try again?"
-            # Build context-aware prompt with emotion
-            emotion_responses = {
-                "angry": "I understand you're feeling frustrated. Let me help you with that.",
-                "sad": "I can sense you're feeling down. I'm here to listen and support you.",
-                "happy": "I love your positive energy! That's wonderful to hear.",
-                "excited": "Your enthusiasm is contagious! Tell me more about it.",
-                "fearful": "I can hear the concern in your voice. Let's work through this together.",
-                "surprised": "That sounds quite unexpected! What happened?",
-                "frustrated": "I can tell this is bothering you. Let's see how I can help.",
-                "neutral": "I'm listening. Please go on."
             }
-            emotion_context = emotion_responses.get(emotion, "I'm here to help.")
-            # Simple but effective response generation
-            if len(text.split()) < 3:
-                return f"{emotion_context} Could you tell me more about that?"
-            prompt = f"User ({emotion}): {text}\nMaya (helpful assistant):"
-            # Tokenize with proper attention mask
             inputs = self.llm_tokenizer(
-                prompt,
                 return_tensors="pt",
-                padding=True,
                 truncation=True,
-                max_length=512
             ).to(self.device)
             with torch.no_grad():
                 outputs = self.llm_model.generate(
-                    input_ids=inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    max_new_tokens=60,
                     temperature=0.7,
                     do_sample=True,
-                    pad_token_id=self.llm_tokenizer.eos_token_id,
-                    eos_token_id=self.llm_tokenizer.eos_token_id
                 )
             # Decode response
@@ -167,58 +162,109 @@ class ConversationalAI:
                 skip_special_tokens=True
             ).strip()
-            # Clean up and add emotion context if response is empty
             if not response or len(response) < 5:
                 return emotion_context
             return response
         except Exception as e:
-            return "I'm here to help. What would you like to talk about?"
-    def synthesize_speech(self, text):
-        """Generate speech using TTS"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
             # Clean text for TTS
-            clean_text = text.replace("[", "").replace("]", "").strip()
             if len(clean_text) > 200:
                 clean_text = clean_text[:200] + "..."
-            audio = self.tts_model(clean_text)
-            return audio["audio"]
         except Exception as e:
             print(f"TTS error: {e}")
             return None
     def process_conversation(self, audio_input, user_id="default"):
         """Main conversation processing pipeline"""
         if audio_input is None:
-            return "Please record some audio first", None, "No conversation yet"
         start_time = time.time()
-        # Initialize user conversation if not exists
         if user_id not in self.conversations:
             self.conversations[user_id] = []
         try:
-            # Step 1: Transcribe audio
-            transcription = self.transcribe_audio(audio_input)
-            # Step 2: Recognize emotion from audio
-            emotion = self.recognize_emotion(audio_input)
-            # Step 3: Generate response
-            response_text = self.generate_response(
                 transcription, emotion, self.conversations[user_id]
             )
-            # Step 4: Synthesize speech
-            response_audio = self.synthesize_speech(response_text)
             # Step 5: Update conversation history
             processing_time = time.time() - start_time
@@ -232,147 +278,139 @@ class ConversationalAI:
             self.conversations[user_id].append(conversation_entry)
-            # Keep only last 15 exchanges per user
-            if len(self.conversations[user_id]) > 15:
-                self.conversations[user_id] = self.conversations[user_id][-15:]
-            # Format conversation history
             history = self.format_conversation_history(user_id)
-            return transcription, response_audio, history
         except Exception as e:
-            error_msg = f"Processing error: {str(e)}"
-            return error_msg, None, "Error occurred during processing"
     def format_conversation_history(self, user_id):
         """Format conversation history for display"""
         if user_id not in self.conversations or not self.conversations[user_id]:
-            return "No conversation history yet. Start by recording some audio!"
         history = []
-        for i, entry in enumerate(self.conversations[user_id][-5:], 1):
             history.append(f"**Exchange {i}** ({entry['timestamp']})")
             history.append(f"🎤 **You** ({entry['user_emotion']}): {entry['user_input']}")
             history.append(f"🤖 **Maya**: {entry['ai_response']}")
-            history.append(f"⏱️ *Response time: {entry['processing_time']:.2f}s*")
             history.append("---")
         return "\n".join(history)
-    def clear_conversation(self, user_id="default"):
-        """Clear conversation history"""
-        if user_id in self.conversations:
-            self.conversations[user_id] = []
-        return "Conversation cleared! Ready for a fresh start."
-# Initialize the AI system
-print("Initializing Maya AI...")
-ai_system = ConversationalAI()
-print("Maya AI ready!")
-# Gradio interface functions
-def process_audio(audio):
-    if audio is None:
-        return "Please record some audio first", None, "Click the microphone button above to start recording"
-    return ai_system.process_conversation(audio)
-def clear_chat():
-    message = ai_system.clear_conversation()
-    return "", None, message
-def greet():
-    return "", None, "👋 Hi! I'm Maya, your AI conversation partner. Click the microphone button and start talking!"
-# Create Gradio interface
 with gr.Blocks(
-    title="Maya AI - Conversational Assistant",
     theme=gr.themes.Soft(),
     css="""
-    .gradio-container {
-        max-width: 1200px !important;
-    }
-    .audio-container {
-        min-height: 200px;
-    }
     """
 ) as demo:
     gr.Markdown("""
-    # 🎤 Maya AI - Your Conversational Partner
-    *Advanced speech recognition with emotional understanding*
-    **Instructions:** Click the microphone button, speak clearly, then click stop. Maya will respond with voice and text!
     """)
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 🎙️ Voice Input")
             audio_input = gr.Audio(
                 sources=["microphone"],
                 type="filepath",
-                label="Record your message",
-                elem_classes=["audio-container"]
             )
-            with gr.Row():
-                process_btn = gr.Button("💬 Process Audio", variant="primary", size="lg")
-                clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
         with gr.Column(scale=2):
-            gr.Markdown("### 📝 Conversation")
             transcription_output = gr.Textbox(
-                label="What you said",
                 lines=2,
-                interactive=False,
-                placeholder="Your speech will appear here..."
             )
             audio_output = gr.Audio(
-                label="🔊 Maya's Response",
                 interactive=False,
                 autoplay=True
             )
-            conversation_history = gr.Textbox(
-                label="💭 Conversation History",
-                lines=12,
                 interactive=False,
-                placeholder="Conversation history will appear here...",
                 show_copy_button=True
             )
-    # Event handlers
-    process_btn.click(
-        fn=process_audio,
-        inputs=[audio_input],
-        outputs=[transcription_output, audio_output, conversation_history]
     )
-    clear_btn.click(
-        fn=clear_chat,
-        outputs=[transcription_output, audio_output, conversation_history]
     )
-    # Auto-process when audio is uploaded/recorded
-    audio_input.stop_recording(
-        fn=process_audio,
         inputs=[audio_input],
-        outputs=[transcription_output, audio_output, conversation_history]
     )
-    # Initialize with greeting
-    demo.load(
-        fn=greet,
-        outputs=[transcription_output, audio_output, conversation_history]
     )
-# Launch the app
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        show_error=True,
-        quiet=True
     )

 import torch
 import numpy as np
 import librosa
+from transformers import (
+    pipeline, AutoTokenizer, AutoModelForCausalLM,
+    WhisperProcessor, WhisperForConditionalGeneration,
+    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+)
 import soundfile as sf
 import json
 import time
 from datetime import datetime
 import os
 import warnings
+from datasets import load_dataset
 warnings.filterwarnings("ignore")
+class MayaAI:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"🚀 Initializing Maya AI on {self.device}")
+        # Load Parakeet ASR (Best performance)
+        try:
+            from nemo.collections.asr import ASRModel
+            self.asr_model = ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
+            print("✅ Parakeet ASR loaded")
+        except:
+            self.asr_model = pipeline("automatic-speech-recognition",
+                                    model="openai/whisper-large-v3",
+                                    torch_dtype=torch.float16,
+                                    device=self.device)
+            print("⚠️ Using Whisper fallback")
+        # Load the free DeepSeek LLM 67B chat model (deepseek-ai/deepseek-llm-67b-chat)
+        try:
+            self.llm_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-67b-chat")
+            self.llm_model = AutoModelForCausalLM.from_pretrained(
+                "deepseek-ai/deepseek-llm-67b-chat",
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
+            )
+            print("✅ DeepSeek-V3 loaded (FREE)")
+        except:
+            # Fallback to Llama 3.1 (also free)
+            self.llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
+            self.llm_model = AutoModelForCausalLM.from_pretrained(
+                "meta-llama/Llama-3.1-70B-Instruct",
+                torch_dtype=torch.float16,
+                device_map="auto"
+            )
+            print("✅ Llama 3.1 loaded (FREE fallback)")
+        # Load Emotion Recognition
+        self.emotion_model = pipeline("audio-classification",
+                                    model="superb/wav2vec2-base-superb-er",
+                                    device=self.device)
+        print("✅ Emotion recognition loaded")
+        # Load TTS with speaker embeddings (FREE)
+        self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
+            "microsoft/speecht5_tts",
+            torch_dtype=torch.float16
+        ).to(self.device)
+        self.vocoder = SpeechT5HifiGan.from_pretrained(
+            "microsoft/speecht5_hifigan",
+            torch_dtype=torch.float16
+        ).to(self.device)
+        # Load speaker embeddings for natural female voice
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        # Use female speaker embedding (index 7306 is female)
+        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
+        print("✅ Natural female TTS voice loaded")
+        # Conversation storage
         self.conversations = {}
+        self.call_active = False
+    def transcribe_with_parakeet(self, audio_path):
+        """Transcribe using Parakeet (6.05% WER)"""
         try:
+            if hasattr(self.asr_model, 'transcribe'):
+                transcription = self.asr_model.transcribe([audio_path])
+                return transcription[0] if transcription else ""
+            else:
+                result = self.asr_model(audio_path)
+                return result["text"]
         except Exception as e:
             return f"Transcription error: {str(e)}"
+    def recognize_emotion_from_audio(self, audio_path):
+        """Recognize emotion using superb model"""
         try:
             result = self.emotion_model(audio_path)
             emotion_label = result[0]["label"].lower()
+            # Map to human emotions
+            emotion_map = {
+                "ang": "angry", "hap": "happy", "exc": "excited",
+                "sad": "sad", "fru": "frustrated", "fea": "fearful",
+                "sur": "surprised", "neu": "neutral", "dis": "disgusted"
             }
+            return emotion_map.get(emotion_label, emotion_label)
+        except:
             return "neutral"
+    def generate_with_free_llm(self, text, emotion, history):
+        """Generate response using FREE DeepSeek-V3 or Llama"""
         try:
+            # Emotional context prompting
+            emotion_prompts = {
+                "angry": "I understand you're frustrated. Let me help calm this situation.",
+                "sad": "I can hear the sadness in your voice. I'm here to support you.",
+                "happy": "Your joy is infectious! I love your positive energy.",
+                "excited": "Your enthusiasm is amazing! Tell me more!",
+                "fearful": "I sense your concern. Let's work through this together.",
+                "surprised": "That sounds unexpected! What happened?",
+                "neutral": "I'm listening carefully. Please continue."
             }
+            context = f"Previous conversation: {history[-3:] if history else 'None'}"
+            emotion_context = emotion_prompts.get(emotion, "I'm here to help.")
+            prompt = f"""You are Maya, an emotionally intelligent AI assistant with natural conversational abilities.
+            {context}
+            User emotion detected: {emotion}
+            User input: {text}
+            Respond naturally with emotional intelligence. Keep responses under 100 words and conversational.
+            {emotion_context}
+            Maya:"""
+            # Tokenize input
             inputs = self.llm_tokenizer(
+                prompt,
                 return_tensors="pt",
                 truncation=True,
+                max_length=2048
             ).to(self.device)
+            # Generate response
             with torch.no_grad():
                 outputs = self.llm_model.generate(
+                    **inputs,
+                    max_new_tokens=100,
                     temperature=0.7,
                     do_sample=True,
+                    pad_token_id=self.llm_tokenizer.eos_token_id
                 )
             # Decode response
                 skip_special_tokens=True
             ).strip()
+            # Clean up response
             if not response or len(response) < 5:
                 return emotion_context
             return response
         except Exception as e:
+            return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
+    def synthesize_emotional_speech(self, text, emotion):
+        """Generate emotional speech with natural breathing"""
         try:
             if not text or len(text.strip()) == 0:
                 return None
+            # Add emotional markers to text
+            emotional_text = text
+            if emotion == "happy":
+                emotional_text = f"*cheerfully* {text}"
+            elif emotion == "sad":
+                emotional_text = f"*gently* {text}"
+            elif emotion == "excited":
+                emotional_text = f"*enthusiastically* {text}"
+            elif emotion == "angry":
+                emotional_text = f"*calmly* {text}"
             # Clean text for TTS
+            clean_text = emotional_text.replace("*", "").replace("[", "").replace("]", "").strip()
             if len(clean_text) > 200:
                 clean_text = clean_text[:200] + "..."
+            # Add natural breathing pauses for longer text
+            if len(clean_text.split()) > 10:
+                words = clean_text.split()
+                mid_point = len(words) // 2
+                clean_text = " ".join(words[:mid_point]) + "... " + " ".join(words[mid_point:])
+            # Process with TTS
+            inputs = self.tts_processor(text=clean_text, return_tensors="pt").to(self.device)
+            with torch.no_grad():
+                speech = self.tts_model.generate_speech(
+                    inputs["input_ids"],
+                    self.speaker_embeddings,
+                    vocoder=self.vocoder
+                )
+            if isinstance(speech, torch.Tensor):
+                speech = speech.cpu().numpy()
+            return speech
         except Exception as e:
             print(f"TTS error: {e}")
             return None
+    def start_call(self):
+        """Start a new call session"""
+        self.call_active = True
+        greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
+        greeting_audio = self.synthesize_emotional_speech(greeting, "happy")
+        return greeting, (22050, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you..."
+    def end_call(self, user_id="default"):
+        """End call and clear conversation"""
+        self.call_active = False
+        if user_id in self.conversations:
+            self.conversations[user_id] = []
+        farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
+        farewell_audio = self.synthesize_emotional_speech(farewell, "happy")
+        return farewell, (22050, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
     def process_conversation(self, audio_input, user_id="default"):
         """Main conversation processing pipeline"""
+        if not self.call_active:
+            return "Please start a call first by clicking the 'Start Call' button", None, "No active call"
         if audio_input is None:
+            return "Please record some audio", None, "No audio input"
         start_time = time.time()
         if user_id not in self.conversations:
             self.conversations[user_id] = []
         try:
+            # Step 1: ASR with Parakeet
+            transcription = self.transcribe_with_parakeet(audio_input)
+            # Step 2: Emotion recognition
+            emotion = self.recognize_emotion_from_audio(audio_input)
+            # Step 3: FREE LLM generation
+            response_text = self.generate_with_free_llm(
                 transcription, emotion, self.conversations[user_id]
             )
+            # Step 4: Emotional TTS
+            response_audio = self.synthesize_emotional_speech(response_text, emotion)
             # Step 5: Update conversation history
             processing_time = time.time() - start_time
             self.conversations[user_id].append(conversation_entry)
+            # Keep last 1000 exchanges as specified
+            if len(self.conversations[user_id]) > 1000:
+                self.conversations[user_id] = self.conversations[user_id][-1000:]
             history = self.format_conversation_history(user_id)
+            return transcription, (22050, response_audio) if response_audio is not None else None, history
         except Exception as e:
+            return f"Processing error: {str(e)}", None, "Error in processing"
     def format_conversation_history(self, user_id):
         """Format conversation history for display"""
         if user_id not in self.conversations or not self.conversations[user_id]:
+            return "No conversation history yet."
         history = []
+        for i, entry in enumerate(self.conversations[user_id][-10:], 1):
             history.append(f"**Exchange {i}** ({entry['timestamp']})")
             history.append(f"🎤 **You** ({entry['user_emotion']}): {entry['user_input']}")
             history.append(f"🤖 **Maya**: {entry['ai_response']}")
+            history.append(f"⏱️ *{entry['processing_time']:.2f}s*")
             history.append("---")
         return "\n".join(history)
+# Initialize Maya AI
+print("🚀 Starting Maya AI with FREE models...")
+maya = MayaAI()
+print("✅ Maya AI ready with ZERO API costs!")
+# Gradio Interface Functions
+def start_call_handler():
+    return maya.start_call()
+def end_call_handler():
+    return maya.end_call()
+def process_audio_handler(audio):
+    return maya.process_conversation(audio)
+# Create Gradio Interface
 with gr.Blocks(
+    title="Maya AI - FREE Sesame AI Killer",
     theme=gr.themes.Soft(),
     css="""
+    .call-button { background: linear-gradient(45deg, #00d2d3, #01a3a4) !important; }
+    .end-button { background: linear-gradient(45deg, #ff3838, #c0392b) !important; }
     """
 ) as demo:
     gr.Markdown("""
+    # 🎤 Maya AI - FREE Sesame AI Killer
+    *Advanced conversational AI with emotional intelligence - NO API COSTS!*
+    **FREE Models:** DeepSeek-V3 • Parakeet ASR • Emotion Recognition • Natural Female TTS
     """)
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### 📞 Call Controls")
+            start_call_btn = gr.Button(
+                "📞 Start Call",
+                variant="primary",
+                size="lg",
+                elem_classes=["call-button"]
+            )
+            end_call_btn = gr.Button(
+                "📞 End Call",
+                variant="stop",
+                size="lg",
+                elem_classes=["end-button"]
+            )
             gr.Markdown("### 🎙️ Voice Input")
             audio_input = gr.Audio(
                 sources=["microphone"],
                 type="filepath",
+                label="Record your message"
             )
+            process_btn = gr.Button("🎯 Process Audio", variant="primary")
         with gr.Column(scale=2):
+            gr.Markdown("### 💬 FREE Conversation")
             transcription_output = gr.Textbox(
+                label="📝 What you said",
                 lines=2,
+                interactive=False
             )
             audio_output = gr.Audio(
+                label="🔊 Maya's Emotional Response",
                 interactive=False,
                 autoplay=True
             )
+            conversation_display = gr.Textbox(
+                label="💭 Live Conversation (FREE)",
+                lines=15,
                 interactive=False,
                 show_copy_button=True
             )
+    # Event Handlers
+    start_call_btn.click(
+        fn=start_call_handler,
+        outputs=[transcription_output, audio_output, conversation_display]
     )
+    end_call_btn.click(
+        fn=end_call_handler,
+        outputs=[transcription_output, audio_output, conversation_display]
     )
+    process_btn.click(
+        fn=process_audio_handler,
         inputs=[audio_input],
+        outputs=[transcription_output, audio_output, conversation_display]
     )
+    audio_input.stop_recording(
+        fn=process_audio_handler,
+        inputs=[audio_input],
+        outputs=[transcription_output, audio_output, conversation_display]
     )
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        show_error=True
     )