Hugging Face Spaces app (Space status: Sleeping)
| import gradio as gr | |
| import torch | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline | |
| import librosa | |
| from gtts import gTTS | |
| import numpy as np | |
| import tempfile | |
| import os | |
# Pick the compute device once at import time; all models are moved to it.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")
class EmotionAwareTranscriber:
    """Transcribe speech with Whisper, classify the emotion of the text,
    and generate a styled text + spoken (gTTS) response.

    The heavy model loading happens once in ``__init__``; per-request work
    goes through :meth:`process_audio`.
    """

    # BUG FIX: the j-hartmann model emits 'joy'/'sadness'/'anger', but the
    # response templates are keyed 'happy'/'sad'/'angry'. Without this map
    # those three emotions always fell through to 'neutral'.
    _LABEL_MAP = {'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}

    def __init__(self, model_size="base"):
        """Load Whisper (size: tiny/base/small/...) and the emotion classifier."""
        print("Initializing models...")
        # Initialize Whisper
        self.processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
        self.model = WhisperForConditionalGeneration.from_pretrained(
            f"openai/whisper-{model_size}"
        ).to(DEVICE)
        # Initialize emotion classifier (top_k=1 -> [[{'label', 'score'}]])
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=1
        )
        # Canned responses, keyed by detected emotion then by response style.
        self.response_templates = {
            'happy': {
                'motivational': ["Your joy is absolutely contagious! This kind of positive energy is what makes life worth living..."],
                'calm': ["I can feel the warmth of your happiness radiating through your words..."],
                'energetic': ["OH MY GOODNESS THIS IS INCREDIBLE NEWS!!! LET'S CELEBRATE!!!"],
                'angry': ["How can you be happy when there's so much suffering in the world?"]
            },
            'sad': {
                'motivational': ["I hear the sadness in your voice, and I want you to know that this feeling won't last forever..."],
                'calm': ["I sense your heavy heart, and I'm here with you in this moment..."],
                'energetic': ["HEY! I know you're feeling DOWN right now, but LISTEN UP! YOU'VE GOT THIS!!"],
                'angry': ["Stop wallowing and do something productive!"]
            },
            'angry': {
                'motivational': ["That passion can fuel positive change! Let's channel this energy constructively..."],
                'calm': ["I sense your anger. Let's take a deep breath and find solutions..."],
                'energetic': ["YEAH! I FEEL THAT TOO! NOW LET'S DO SOMETHING ABOUT IT!!"],
                'angry': ["You think YOU'RE angry? The whole system is broken!"]
            },
            'disgust': {
                'motivational': ["I can tell you're feeling repulsed. Sometimes disgust protects us from harmful things..."],
                'calm': ["I sense your strong distaste. Let's remove ourselves from this situation..."],
                'energetic': ["EW EW EW!!! GROSS ALERT! LET'S GET AWAY FROM THAT RIGHT NOW!!"],
                'angry': ["This is ABSOLUTELY DISGUSTING and UNACCEPTABLE!"]
            },
            'fear': {
                'motivational': ["It's understandable to feel scared. Remember you've survived tough times before..."],
                'calm': ["Fear is a natural response. Let's assess the situation calmly..."],
                'energetic': ["DANGER ALERT! BUT WAIT - LET'S MAKE A SAFETY PLAN!!"],
                'angry': ["Stop being such a coward!"]
            },
            'neutral': {
                'motivational': ["I appreciate you sharing this with me. Every day brings new opportunities..."],
                'calm': ["Thank you for expressing yourself. I'm here to listen..."],
                'energetic': ["LET'S FIND SOMETHING EXCITING TO DO RIGHT NOW!!"],
                'angry': ["Is that all? How utterly boring."]
            },
            'surprise': {
                'motivational': ["Unexpected moments can be life's greatest gifts!..."],
                'calm': ["I sense your surprise. Let's observe what unfolds..."],
                'energetic': ["WHOA! NO WAY! THAT'S INCREDIBLE!!!"],
                'angry': ["Why are you surprised? You should have seen this coming!"]
            },
            'tired': {
                'motivational': ["Rest is revolutionary. Recharge and return stronger..."],
                'calm': ["Fatigue is natural. Honor your need for rest..."],
                'energetic': ["DON'T QUIT! PUSH THROUGH! YOU'RE ALMOST THERE!!"],
                'angry': ["Tired? That's pathetic! Winners never rest!"]
            }
        }

    def detect_emotion(self, text):
        """Return a template key for *text*; 'neutral' on any failure."""
        try:
            lowered = text.lower()
            # Keyword overrides run FIRST (they discard the classifier result
            # anyway, so running the model beforehand was wasted work).
            # 'tired' is not a label the model can emit at all.
            if any(kw in lowered for kw in ('disgusting', 'gross', 'revolting')):
                return 'disgust'
            if any(kw in lowered for kw in ('exhausted', 'tired', 'sleepy')):
                return 'tired'
            result = self.emotion_classifier(text)[0][0]
            label = result['label'].lower()
            # Translate model labels (joy/sadness/anger) to template keys.
            return self._LABEL_MAP.get(label, label)
        except Exception as e:
            print(f"Emotion detection error: {e}")
            return 'neutral'

    def generate_response(self, text, emotion, style):
        """Pick a canned response for (emotion, style), with safe fallbacks.

        Unknown emotions fall back to 'neutral'; unknown styles to
        'motivational'. Returns a generic line if anything goes wrong.
        """
        try:
            if emotion not in self.response_templates:
                emotion = 'neutral'
            if style not in self.response_templates[emotion]:
                style = 'motivational'
            return np.random.choice(self.response_templates[emotion][style])
        except Exception as e:
            print(f"Response generation error: {e}")
            return "I appreciate you sharing this with me."

    def text_to_speech(self, text, style="motivational"):
        """Render *text* to a temp .mp3 via gTTS; return its path or None.

        The caller (process_audio_wrapper) is responsible for deleting the
        file later — it is created with delete=False on purpose.
        """
        try:
            # BUG FIX: gTTS rejects locale codes like 'en-uk'/'en-au';
            # accents are selected with the `tld` parameter instead.
            voice_params = {
                'motivational': {'lang': 'en', 'slow': False},
                'calm': {'lang': 'en', 'slow': True},
                'energetic': {'lang': 'en', 'tld': 'co.uk', 'slow': False},
                'angry': {'lang': 'en', 'tld': 'com.au', 'slow': False}
            }.get(style, {'lang': 'en'})
            # Reserve a unique path, then write to it after the handle is
            # closed (gTTS.save reopens the path itself).
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
                out_path = fp.name
            gTTS(text=text, **voice_params).save(out_path)
            return out_path
        except Exception as e:
            print(f"TTS error: {e}")
            return None

    def process_audio(self, audio_path, style):
        """Full pipeline: transcribe -> detect emotion -> respond -> TTS.

        Returns a dict with keys 'transcription', 'emotion', 'response',
        'audio' (temp mp3 path or None). Never raises; errors yield a
        placeholder result.
        """
        try:
            # Transcribe (Whisper expects 16 kHz mono input)
            waveform, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(
                waveform, sampling_rate=16000, return_tensors="pt"
            ).input_features.to(DEVICE)
            predicted_ids = self.model.generate(input_features, max_length=200)
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            # Detect emotion
            emotion = self.detect_emotion(transcription)
            # Generate response
            response = self.generate_response(transcription, emotion, style)
            # Convert to speech
            audio_output = self.text_to_speech(response, style)
            return {
                "transcription": transcription,
                "emotion": emotion,
                "response": response,
                "audio": audio_output
            }
        except Exception as e:
            print(f"Processing error: {e}")
            return {
                "transcription": "Error processing audio",
                "emotion": "neutral",
                "response": "Sorry, something went wrong",
                "audio": None
            }
# Instantiate the transcriber once at import time so the Whisper and
# emotion models are downloaded/loaded a single time and shared by all
# Gradio requests.
transcriber = EmotionAwareTranscriber()
# Path of the most recently generated TTS mp3; the next request deletes it
# so temp files don't accumulate (see process_audio_wrapper).
last_audio_file = None
| # Define the process_audio_wrapper function AFTER initializing the variable | |
| def process_audio_wrapper(audio_path, style): | |
| global last_audio_file | |
| result = transcriber.process_audio(audio_path, style) | |
| # Clean up previous audio files | |
| if last_audio_file and os.path.exists(last_audio_file): | |
| try: | |
| os.unlink(last_audio_file) | |
| except Exception as e: | |
| print(f"Error cleaning up audio file: {e}") | |
| last_audio_file = result["audio"] | |
| return ( | |
| result["transcription"], | |
| result["emotion"].upper(), | |
| result["response"], | |
| result["audio"] if result["audio"] else None | |
| ) | |
# Gradio interface: inputs (audio + style) on top, outputs stacked below.
with gr.Blocks(title="Emotion-Aware Audio Transcriber") as demo:
    gr.Markdown("# 🎤 Emotion-Aware Audio Transcriber")
    gr.Markdown("Upload an audio file to get a transcription with emotional analysis and response")
    with gr.Row():
        # type="filepath" hands process_audio_wrapper a path on disk,
        # which librosa.load expects.
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        style_selector = gr.Radio(
            ["motivational", "calm", "energetic", "angry"],
            label="Response Style",
            value="motivational"
        )
    submit_btn = gr.Button("Process", variant="primary")
    with gr.Column():
        transcription_output = gr.Textbox(label="Transcription")
        emotion_output = gr.Textbox(label="Detected Emotion")
        response_output = gr.Textbox(label="Generated Response")
        audio_output = gr.Audio(label="Spoken Response")
    # Wire the button to the processing callback; output order must match
    # the 4-tuple returned by process_audio_wrapper.
    submit_btn.click(
        fn=process_audio_wrapper,
        inputs=[audio_input, style_selector],
        outputs=[transcription_output, emotion_output, response_output, audio_output]
    )
# Launch the app
if __name__ == "__main__":
    # Run as a local script: start the Gradio server directly.
    demo.launch()
else:
    # When imported (e.g. by the HuggingFace Spaces runtime), expose the
    # Blocks object under the conventional `app` name instead of launching.
    app = demo