import os
import random
import tempfile

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

# Device configuration: prefer GPU when available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


class EmotionAwareTranscriber:
    """Transcribe speech with Whisper, classify the transcript's emotion,
    and synthesize a styled spoken response with gTTS.

    Pipeline: audio file -> Whisper transcription -> text emotion
    classification -> canned response template -> gTTS mp3.
    """

    # The j-hartmann emotion model emits: anger, disgust, fear, joy,
    # neutral, sadness, surprise.  Our template dict uses 'happy', 'sad',
    # 'angry' instead, so three model labels must be remapped or every
    # joyful/sad/angry utterance silently falls back to 'neutral'.
    _LABEL_MAP = {
        "joy": "happy",
        "sadness": "sad",
        "anger": "angry",
    }

    def __init__(self, model_size="base"):
        """Load Whisper (size selectable) and the text-emotion classifier.

        Args:
            model_size: Whisper checkpoint suffix, e.g. "tiny", "base", "small".
        """
        print("Initializing models...")

        # Speech-to-text: Whisper processor + model on the chosen device.
        self.processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
        self.model = WhisperForConditionalGeneration.from_pretrained(
            f"openai/whisper-{model_size}"
        ).to(DEVICE)

        # Text emotion classifier (top_k=1 -> a single best label per input).
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=1,
        )

        # Canned responses: outer key = detected emotion, inner key = user-chosen style.
        self.response_templates = {
            'happy': {
                'motivational': ["Your joy is absolutely contagious! This kind of positive energy is what makes life worth living..."],
                'calm': ["I can feel the warmth of your happiness radiating through your words..."],
                'energetic': ["OH MY GOODNESS THIS IS INCREDIBLE NEWS!!! LET'S CELEBRATE!!!"],
                'angry': ["How can you be happy when there's so much suffering in the world?"],
            },
            'sad': {
                'motivational': ["I hear the sadness in your voice, and I want you to know that this feeling won't last forever..."],
                'calm': ["I sense your heavy heart, and I'm here with you in this moment..."],
                'energetic': ["HEY! I know you're feeling DOWN right now, but LISTEN UP! YOU'VE GOT THIS!!"],
                'angry': ["Stop wallowing and do something productive!"],
            },
            'angry': {
                'motivational': ["That passion can fuel positive change! Let's channel this energy constructively..."],
                'calm': ["I sense your anger. Let's take a deep breath and find solutions..."],
                'energetic': ["YEAH! I FEEL THAT TOO! NOW LET'S DO SOMETHING ABOUT IT!!"],
                'angry': ["You think YOU'RE angry? The whole system is broken!"],
            },
            'disgust': {
                'motivational': ["I can tell you're feeling repulsed. Sometimes disgust protects us from harmful things..."],
                'calm': ["I sense your strong distaste. Let's remove ourselves from this situation..."],
                'energetic': ["EW EW EW!!! GROSS ALERT! LET'S GET AWAY FROM THAT RIGHT NOW!!"],
                'angry': ["This is ABSOLUTELY DISGUSTING and UNACCEPTABLE!"],
            },
            'fear': {
                'motivational': ["It's understandable to feel scared. Remember you've survived tough times before..."],
                'calm': ["Fear is a natural response. Let's assess the situation calmly..."],
                'energetic': ["DANGER ALERT! BUT WAIT - LET'S MAKE A SAFETY PLAN!!"],
                'angry': ["Stop being such a coward!"],
            },
            'neutral': {
                'motivational': ["I appreciate you sharing this with me. Every day brings new opportunities..."],
                'calm': ["Thank you for expressing yourself. I'm here to listen..."],
                'energetic': ["LET'S FIND SOMETHING EXCITING TO DO RIGHT NOW!!"],
                'angry': ["Is that all? How utterly boring."],
            },
            'surprise': {
                'motivational': ["Unexpected moments can be life's greatest gifts!..."],
                'calm': ["I sense your surprise. Let's observe what unfolds..."],
                'energetic': ["WHOA! NO WAY! THAT'S INCREDIBLE!!!"],
                'angry': ["Why are you surprised? You should have seen this coming!"],
            },
            'tired': {
                'motivational': ["Rest is revolutionary. Recharge and return stronger..."],
                'calm': ["Fatigue is natural. Honor your need for rest..."],
                'energetic': ["DON'T QUIT! PUSH THROUGH! YOU'RE ALMOST THERE!!"],
                'angry': ["Tired? That's pathetic! Winners never rest!"],
            },
        }

    def detect_emotion(self, text):
        """Return a template key for *text*'s dominant emotion.

        Keyword overrides (disgust/tired) take precedence over the model
        because the classifier has no 'tired' class at all.  Falls back
        to 'neutral' on any classifier failure.
        """
        try:
            lowered = text.lower()

            # Manual keyword checks first — 'tired' is not a model label.
            if any(kw in lowered for kw in ('disgusting', 'gross', 'revolting')):
                return 'disgust'
            if any(kw in lowered for kw in ('exhausted', 'tired', 'sleepy')):
                return 'tired'

            result = self.emotion_classifier(text)[0][0]
            label = result['label'].lower()
            # BUGFIX: remap model labels (joy/sadness/anger) onto our
            # template keys (happy/sad/angry); previously these fell
            # through to 'neutral' in generate_response.
            return self._LABEL_MAP.get(label, label)
        except Exception as e:
            print(f"Emotion detection error: {e}")
            return 'neutral'

    def generate_response(self, text, emotion, style):
        """Pick a canned response for (emotion, style), with safe fallbacks."""
        try:
            if emotion not in self.response_templates:
                emotion = 'neutral'
            if style not in self.response_templates[emotion]:
                style = 'motivational'
            # stdlib random.choice is the idiomatic pick from a list of strings.
            return random.choice(self.response_templates[emotion][style])
        except Exception as e:
            print(f"Response generation error: {e}")
            return "I appreciate you sharing this with me."

    def text_to_speech(self, text, style="motivational"):
        """Render *text* to an mp3 file via gTTS; return its path or None.

        Returns:
            Path of a temp .mp3 file (caller is responsible for deleting
            it), or None if synthesis failed.
        """
        try:
            # BUGFIX: gTTS selects accents via the `tld` parameter; pseudo
            # language codes like 'en-uk'/'en-au' raise ValueError in
            # current gTTS, which made these styles always return None.
            voice_params = {
                'motivational': {'lang': 'en', 'slow': False},
                'calm': {'lang': 'en', 'slow': True},
                'energetic': {'lang': 'en', 'tld': 'co.uk', 'slow': False},
                'angry': {'lang': 'en', 'tld': 'com.au', 'slow': False},
            }.get(style, {'lang': 'en', 'slow': False})

            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
                tts = gTTS(text=text, **voice_params)
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            print(f"TTS error: {e}")
            return None

    def process_audio(self, audio_path, style):
        """Full pipeline: transcribe, detect emotion, respond, speak.

        Args:
            audio_path: path to an audio file (any librosa-readable format),
                or None/empty when no file was uploaded.
            style: one of 'motivational', 'calm', 'energetic', 'angry'.

        Returns:
            dict with keys 'transcription', 'emotion', 'response', 'audio'
            (path to response mp3, or None).
        """
        # Guard: Gradio passes None when Process is clicked with no upload.
        if not audio_path:
            return {
                "transcription": "No audio provided",
                "emotion": "neutral",
                "response": "Please upload an audio file first.",
                "audio": None,
            }
        try:
            # Transcribe: resample to Whisper's expected 16 kHz.
            waveform, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(
                waveform, sampling_rate=16000, return_tensors="pt"
            ).input_features.to(DEVICE)
            predicted_ids = self.model.generate(input_features, max_length=200)
            transcription = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )[0]

            emotion = self.detect_emotion(transcription)
            response = self.generate_response(transcription, emotion, style)
            audio_output = self.text_to_speech(response, style)

            return {
                "transcription": transcription,
                "emotion": emotion,
                "response": response,
                "audio": audio_output,
            }
        except Exception as e:
            print(f"Processing error: {e}")
            return {
                "transcription": "Error processing audio",
                "emotion": "neutral",
                "response": "Sorry, something went wrong",
                "audio": None,
            }


# Initialize the transcriber first (downloads/loads models at import time).
transcriber = EmotionAwareTranscriber()

# Path of the previous response mp3, deleted on the next request.
last_audio_file = None


def process_audio_wrapper(audio_path, style):
    """Gradio callback: run the pipeline and clean up the previous mp3."""
    global last_audio_file
    result = transcriber.process_audio(audio_path, style)

    # Best-effort cleanup of the previous temp file.
    if last_audio_file and os.path.exists(last_audio_file):
        try:
            os.unlink(last_audio_file)
        except Exception as e:
            print(f"Error cleaning up audio file: {e}")

    last_audio_file = result["audio"]
    return (
        result["transcription"],
        result["emotion"].upper(),
        result["response"],
        result["audio"] if result["audio"] else None,
    )


# Gradio interface
with gr.Blocks(title="Emotion-Aware Audio Transcriber") as demo:
    gr.Markdown("# 🎤 Emotion-Aware Audio Transcriber")
    gr.Markdown("Upload an audio file to get a transcription with emotional analysis and response")

    with gr.Row():
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        style_selector = gr.Radio(
            ["motivational", "calm", "energetic", "angry"],
            label="Response Style",
            value="motivational",
        )

    submit_btn = gr.Button("Process", variant="primary")

    with gr.Column():
        transcription_output = gr.Textbox(label="Transcription")
        emotion_output = gr.Textbox(label="Detected Emotion")
        response_output = gr.Textbox(label="Generated Response")
        audio_output = gr.Audio(label="Spoken Response")

    submit_btn.click(
        fn=process_audio_wrapper,
        inputs=[audio_input, style_selector],
        outputs=[transcription_output, emotion_output, response_output, audio_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
else:
    # This part is crucial for HuggingFace Spaces deployment
    app = demo