# Hugging Face Space: KavyaBansal/ToneRewriter (status: Sleeping; file size: 9,280 bytes)

import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import librosa
from gtts import gTTS
import numpy as np
import tempfile
import os

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

class EmotionAwareTranscriber:
    """Transcribe speech, classify its emotion, and answer in a chosen tone.

    The pipeline is: Whisper transcription -> text emotion classification
    (with keyword overrides) -> canned response lookup -> gTTS speech
    synthesis. ``process_audio`` runs all four steps.
    """

    # The emotion classifier's labels (anger, disgust, fear, joy, neutral,
    # sadness, surprise) do not all match the template keys below; without
    # this mapping joy/sadness/anger would silently fall back to 'neutral'.
    _LABEL_ALIASES = {'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}

    # Keyword overrides. Tuples so they can be passed to ``str.startswith``.
    _DISGUST_KEYWORDS = ('disgusting', 'gross', 'revolting')
    _TIRED_KEYWORDS = ('exhausted', 'tired', 'sleepy')

    def __init__(self, model_size="base"):
        """Load the Whisper and emotion models and build the response table.

        Args:
            model_size: Suffix of the ``openai/whisper-*`` checkpoint to load.
        """
        print("Initializing models...")

        # Initialize Whisper
        self.processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
        self.model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{model_size}").to(DEVICE)

        # Initialize emotion classifier
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=1
        )

        # Response templates, indexed as [detected emotion][response style].
        self.response_templates = {
            'happy': {
                'motivational': ["Your joy is absolutely contagious! This kind of positive energy is what makes life worth living..."],
                'calm': ["I can feel the warmth of your happiness radiating through your words..."],
                'energetic': ["OH MY GOODNESS THIS IS INCREDIBLE NEWS!!! LET'S CELEBRATE!!!"],
                'angry': ["How can you be happy when there's so much suffering in the world?"]
            },
            'sad': {
                'motivational': ["I hear the sadness in your voice, and I want you to know that this feeling won't last forever..."],
                'calm': ["I sense your heavy heart, and I'm here with you in this moment..."],
                'energetic': ["HEY! I know you're feeling DOWN right now, but LISTEN UP! YOU'VE GOT THIS!!"],
                'angry': ["Stop wallowing and do something productive!"]
            },
            'angry': {
                'motivational': ["That passion can fuel positive change! Let's channel this energy constructively..."],
                'calm': ["I sense your anger. Let's take a deep breath and find solutions..."],
                'energetic': ["YEAH! I FEEL THAT TOO! NOW LET'S DO SOMETHING ABOUT IT!!"],
                'angry': ["You think YOU'RE angry? The whole system is broken!"]
            },
            'disgust': {
                'motivational': ["I can tell you're feeling repulsed. Sometimes disgust protects us from harmful things..."],
                'calm': ["I sense your strong distaste. Let's remove ourselves from this situation..."],
                'energetic': ["EW EW EW!!! GROSS ALERT! LET'S GET AWAY FROM THAT RIGHT NOW!!"],
                'angry': ["This is ABSOLUTELY DISGUSTING and UNACCEPTABLE!"]
            },
            'fear': {
                'motivational': ["It's understandable to feel scared. Remember you've survived tough times before..."],
                'calm': ["Fear is a natural response. Let's assess the situation calmly..."],
                'energetic': ["DANGER ALERT! BUT WAIT - LET'S MAKE A SAFETY PLAN!!"],
                'angry': ["Stop being such a coward!"]
            },
            'neutral': {
                'motivational': ["I appreciate you sharing this with me. Every day brings new opportunities..."],
                'calm': ["Thank you for expressing yourself. I'm here to listen..."],
                'energetic': ["LET'S FIND SOMETHING EXCITING TO DO RIGHT NOW!!"],
                'angry': ["Is that all? How utterly boring."]
            },
            'surprise': {
                'motivational': ["Unexpected moments can be life's greatest gifts!..."],
                'calm': ["I sense your surprise. Let's observe what unfolds..."],
                'energetic': ["WHOA! NO WAY! THAT'S INCREDIBLE!!!"],
                'angry': ["Why are you surprised? You should have seen this coming!"]
            },
            'tired': {
                'motivational': ["Rest is revolutionary. Recharge and return stronger..."],
                'calm': ["Fatigue is natural. Honor your need for rest..."],
                'energetic': ["DON'T QUIT! PUSH THROUGH! YOU'RE ALMOST THERE!!"],
                'angry': ["Tired? That's pathetic! Winners never rest!"]
            }
        }

    def detect_emotion(self, text):
        """Return the emotion key that best describes ``text``.

        Keyword overrides ('disgust', 'tired') are checked first so they
        still apply when the classifier fails. A keyword must start a word,
        so "retired" no longer counts as "tired". Otherwise the classifier's
        top label is used, translated to the template vocabulary.

        Args:
            text: Transcribed utterance.

        Returns:
            A lower-case emotion name; 'neutral' for blank input or when the
            classifier raises.
        """
        if not text or not text.strip():
            return 'neutral'

        # Split on anything non-alphanumeric so punctuation never hides a keyword.
        words = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()

        if any(word.startswith(self._DISGUST_KEYWORDS) for word in words):
            return 'disgust'
        if any(word.startswith(self._TIRED_KEYWORDS) for word in words):
            return 'tired'

        try:
            # With top_k=1 the pipeline result is indexed as [input][rank].
            result = self.emotion_classifier(text)[0][0]
            label = result['label'].lower()
        except Exception as e:
            print(f"Emotion detection error: {e}")
            return 'neutral'

        return self._LABEL_ALIASES.get(label, label)

    def generate_response(self, text, emotion, style):
        """Pick a canned response for the given emotion and style.

        Args:
            text: The transcription (currently unused; kept for the interface).
            emotion: Emotion key; unknown values fall back to 'neutral'.
            style: Response style; unknown values fall back to 'motivational'.

        Returns:
            The chosen response as a plain ``str``, or a generic sentence if
            the lookup fails.
        """
        try:
            if emotion not in self.response_templates:
                emotion = 'neutral'
            if style not in self.response_templates[emotion]:
                style = 'motivational'
            # np.random.choice yields numpy.str_; hand callers a builtin str.
            return str(np.random.choice(self.response_templates[emotion][style]))
        except Exception as e:
            print(f"Response generation error: {e}")
            return "I appreciate you sharing this with me."

    def text_to_speech(self, text, style="motivational"):
        """Synthesize ``text`` to an MP3 file and return its path.

        Args:
            text: Sentence to speak.
            style: Response style used to choose the gTTS voice parameters.

        Returns:
            Path of the generated temporary ``.mp3`` file, or ``None`` when
            synthesis fails. The caller owns the file and must delete it.
        """
        output_path = None
        try:
            # NOTE(review): 'en-uk' and 'en-au' look like legacy gTTS language
            # codes; newer gTTS releases presumably expect lang='en' plus a
            # `tld` -- confirm against the installed version.
            voice_params = {
                'motivational': {'lang': 'en', 'slow': False},
                'calm': {'lang': 'en', 'slow': True},
                'energetic': {'lang': 'en-uk', 'slow': False},
                'angry': {'lang': 'en-au', 'slow': False}
            }.get(style, {'lang': 'en'})

            # Reserve a unique path, then close the handle before gTTS writes
            # to it (writing to a still-open temp file fails on some platforms).
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
                output_path = fp.name

            gTTS(text=text, **voice_params).save(output_path)
            return output_path
        except Exception as e:
            print(f"TTS error: {e}")
            # Do not leave an empty or partial file behind on failure.
            if output_path and os.path.exists(output_path):
                try:
                    os.unlink(output_path)
                except OSError as cleanup_error:
                    print(f"TTS cleanup error: {cleanup_error}")
            return None

    def process_audio(self, audio_path, style):
        """Run transcription, emotion detection, response and speech synthesis.

        Args:
            audio_path: Path of the audio file to analyse; may be ``None``
                when nothing was uploaded.
            style: Response style passed to the response and speech steps.

        Returns:
            A dict with keys 'transcription', 'emotion', 'response' and
            'audio' (path or ``None``). On any failure the same keys are
            returned with placeholder values.
        """
        failure = {
            "transcription": "Error processing audio",
            "emotion": "neutral",
            "response": "Sorry, something went wrong",
            "audio": None
        }

        if not audio_path:
            # Nothing was uploaded; skip model inference entirely.
            print("Processing error: no audio file provided")
            return failure

        try:
            # Transcribe (audio is resampled to 16 kHz on load)
            waveform, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(waveform, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)
            predicted_ids = self.model.generate(input_features, max_length=200)
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

            # Detect emotion
            emotion = self.detect_emotion(transcription)

            # Generate response
            response = self.generate_response(transcription, emotion, style)

            # Convert to speech
            audio_output = self.text_to_speech(response, style)

            return {
                "transcription": transcription,
                "emotion": emotion,
                "response": response,
                "audio": audio_output
            }
        except Exception as e:
            print(f"Processing error: {e}")
            return failure

# Path of the most recently generated speech file; the request handler
# deletes it once a newer response has been produced.
last_audio_file = None

# Single shared transcriber instance; constructing it loads every model once
# at import time.
transcriber = EmotionAwareTranscriber()

def process_audio_wrapper(audio_path, style):
    """Gradio callback: analyse one upload and return the four UI outputs.

    Runs the shared transcriber, deletes the speech file produced by the
    previous call, and remembers the new one for the next clean-up.

    Returns:
        Tuple of (transcription, upper-cased emotion, response text,
        speech file path or None).
    """
    global last_audio_file

    outcome = transcriber.process_audio(audio_path, style)

    # Remove the file generated by the previous request, if it still exists.
    stale_path = last_audio_file
    if stale_path and os.path.exists(stale_path):
        try:
            os.unlink(stale_path)
        except Exception as e:
            print(f"Error cleaning up audio file: {e}")

    spoken_path = outcome["audio"]
    last_audio_file = spoken_path

    transcription = outcome["transcription"]
    emotion_label = outcome["emotion"].upper()
    reply = outcome["response"]
    return (transcription, emotion_label, reply, spoken_path or None)

# Gradio interface: an input row (audio upload, style choice, submit button)
# followed by a column with the four outputs produced by process_audio_wrapper.
with gr.Blocks(title="Emotion-Aware Audio Transcriber") as demo:
    gr.Markdown("# 🎤 Emotion-Aware Audio Transcriber")
    gr.Markdown("Upload an audio file to get a transcription with emotional analysis and response")

    # Inputs. type="filepath" makes Gradio hand the callback a path string,
    # which is what process_audio_wrapper passes on to the transcriber.
    with gr.Row():
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        style_selector = gr.Radio(
            ["motivational", "calm", "energetic", "angry"],
            label="Response Style",
            value="motivational"
        )
        submit_btn = gr.Button("Process", variant="primary")

    # Outputs, in the same order as the tuple returned by process_audio_wrapper.
    with gr.Column():
        transcription_output = gr.Textbox(label="Transcription")
        emotion_output = gr.Textbox(label="Detected Emotion")
        response_output = gr.Textbox(label="Generated Response")
        audio_output = gr.Audio(label="Spoken Response")

    # Wire the button: two inputs in, four outputs out.
    submit_btn.click(
        fn=process_audio_wrapper,
        inputs=[audio_input, style_selector],
        outputs=[transcription_output, emotion_output, response_output, audio_output]
    )

# Launch the app when executed as a script.
if __name__ == "__main__":
    demo.launch()
else:
    # When imported instead, expose the Blocks object under the name `app`.
    # The original author marks this as required for HuggingFace Spaces
    # deployment -- NOTE(review): not verifiable from this file; confirm.
    app = demo