import gradio as gr
import torch
import time
from datetime import datetime

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset  # used only to fetch SpeechT5 speaker embeddings


# Initialize models
class ConversationalAI:
    def __init__(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 is only safe on GPU; fall back to float32 on CPU
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # Load ASR model (using Whisper as a fallback since Parakeet may not be available)
        self.asr_model = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large-v3",
            torch_dtype=dtype,
            device=device,
        )

        # Load LLM (using a smaller model for HF Spaces)
        self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/DialoGPT-medium",
            torch_dtype=dtype,
            device_map="auto",
        )

        # Load TTS model
        self.tts_model = pipeline(
            "text-to-speech",
            model="microsoft/speecht5_tts",
            torch_dtype=dtype,
            device=device,
        )
        # SpeechT5 requires a speaker embedding at inference time; load one
        # x-vector from the CMU Arctic set (index 7306 is an arbitrary voice choice)
        embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        self.speaker_embedding = (
            torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0).to(device=device, dtype=dtype)
        )

        # Load emotion recognition
        self.emotion_model = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=device,
        )

        # Conversation history, keyed by user id
        self.conversations = {}

    def transcribe_audio(self, audio_path):
        """Transcribe audio using Whisper."""
        try:
            if audio_path is None:
                return "No audio provided"
            result = self.asr_model(audio_path)
            return result["text"]
        except Exception as e:
            return f"Transcription error: {str(e)}"

    def recognize_emotion(self, audio_path):
        """Recognize the speaker's emotion from audio."""
        try:
            if audio_path is None:
                return "neutral"
            result = self.emotion_model(audio_path)
            return result[0]["label"].lower()
        except Exception:
            return "neutral"

    def generate_response(self, text, emotion, conversation_history):
        """Generate a contextual response from the LLM."""
        try:
            # Build a context-aware prompt from the last two exchanges
            context = f"Previous conversation: {conversation_history[-2:] if conversation_history else 'None'}"
            emotion_context = f"User emotion: {emotion}"
            prompt = (
                "You are Maya, a friendly AI assistant.\n"
                f"{context} {emotion_context} User: {text} Maya:"
            )
            # Move the input ids to the same device as the model
            inputs = self.llm_tokenizer.encode(prompt, return_tensors="pt").to(self.llm_model.device)
            with torch.no_grad():
                outputs = self.llm_model.generate(
                    inputs,
                    max_new_tokens=100,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.llm_tokenizer.eos_token_id,
                )
            response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Extract only the new response after the final "Maya:" marker
            response = response.split("Maya:")[-1].strip()
            return response
        except Exception as e:
            return f"I'm sorry, I encountered an error: {str(e)}"

    def synthesize_speech(self, text):
        """Generate speech using TTS."""
        try:
            # SpeechT5 needs the speaker embedding passed through forward_params
            audio = self.tts_model(
                text, forward_params={"speaker_embeddings": self.speaker_embedding}
            )
            # Gradio's Audio component accepts a (sample_rate, waveform) tuple
            return (audio["sampling_rate"], audio["audio"])
        except Exception:
            return None

    def process_conversation(self, audio_input, user_id="default"):
        """Main conversation processing pipeline."""
        if audio_input is None:
            return "Please provide audio input", None, "No conversation yet"

        start_time = time.time()

        # Initialize this user's conversation if it does not exist yet
        if user_id not in self.conversations:
            self.conversations[user_id] = []

        # Step 1: Transcribe audio
        transcription = self.transcribe_audio(audio_input)

        # Step 2: Recognize emotion
        emotion = self.recognize_emotion(audio_input)

        # Step 3: Generate response
        response_text = self.generate_response(
            transcription, emotion, self.conversations[user_id]
        )

        # Step 4: Synthesize speech
        response_audio = self.synthesize_speech(response_text)

        # Step 5: Update conversation history
        conversation_entry = {
            "timestamp": datetime.now().isoformat(),
            "user_input": transcription,
            "user_emotion": emotion,
            "ai_response": response_text,
            "processing_time": time.time() - start_time,
        }
        self.conversations[user_id].append(conversation_entry)

        # Keep only the last 20 exchanges per user
        if len(self.conversations[user_id]) > 20:
            self.conversations[user_id] = self.conversations[user_id][-20:]

        # Format conversation history for display
        history = self.format_conversation_history(user_id)

        return transcription, response_audio, history

    def format_conversation_history(self, user_id):
        """Format conversation history for display."""
        if user_id not in self.conversations:
            return "No conversation history"
        history = []
        for entry in self.conversations[user_id][-5:]:  # Show the last 5 exchanges
            history.append(f"🎤 You ({entry['user_emotion']}): {entry['user_input']}")
            history.append(f"🤖 Maya: {entry['ai_response']}")
            history.append(f"⏱️ Response time: {entry['processing_time']:.2f}s\n")
        return "\n".join(history)

    def clear_conversation(self, user_id="default"):
        """Clear conversation history."""
        if user_id in self.conversations:
            self.conversations[user_id] = []
        return "Conversation cleared!"


# A minimal smoke test for the pipeline, assuming a local mono WAV file exists
# at "sample.wav" (the path is a placeholder, not part of this app):
#
#   ai = ConversationalAI()
#   text, audio, history = ai.process_conversation("sample.wav", user_id="test")
#   print(text)
#   print(history)


# Initialize the AI system
ai_system = ConversationalAI()


# Gradio interface functions
def process_audio(audio):
    if audio is None:
        return "No audio provided", None, "No conversation yet"
    transcription, response_audio, history = ai_system.process_conversation(audio)
    return transcription, response_audio, history


def clear_chat():
    ai_system.clear_conversation()
    return "", "Conversation cleared!"
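# Assumed Space dependencies (a requirements.txt sketch; exact pins may vary):
#   gradio
#   torch
#   transformers
#   datasets       # provides the CMU Arctic x-vectors for SpeechT5
#   accelerate     # needed for device_map="auto"
#   sentencepiece  # tokenizer backend used by the SpeechT5 checkpoint
# The ASR pipeline also expects ffmpeg to be available for decoding audio files.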
# Create Gradio interface
with gr.Blocks(title="Maya AI - Conversational Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Maya AI - Your Conversational Partner")
    gr.Markdown("*Speak naturally and Maya will respond with voice and emotion recognition*")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="🎙️ Speak to Maya",
            )
            process_btn = gr.Button("💬 Process", variant="primary")
            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

        with gr.Column(scale=2):
            transcription_output = gr.Textbox(
                label="📝 What you said",
                lines=2,
                interactive=False,
            )
            audio_output = gr.Audio(
                label="🔊 Maya's Response",
                interactive=False,
            )
            conversation_history = gr.Textbox(
                label="💭 Conversation History",
                lines=10,
                interactive=False,
            )

    # Event handlers
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, audio_output, conversation_history],
    )
    clear_btn.click(
        fn=clear_chat,
        outputs=[transcription_output, conversation_history],
    )
    # Auto-process when audio is uploaded
    audio_input.change(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, audio_output, conversation_history],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
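# On Spaces the bare launch() above is enough; with heavy models like these,
# you could enable request queuing before launching to serialize inference, e.g.:
#
#   demo.queue(max_size=10)
#   demo.launch()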