ToneRewriter / app.py
KavyaBansal's picture
Update app.py
5faa186 verified
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import librosa
from gtts import gTTS
import numpy as np
import tempfile
import os
# Device configuration: use the GPU when torch reports CUDA support, else the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")
class EmotionAwareTranscriber:
    """Transcribe speech, classify its emotion, and answer in a chosen tone.

    The pipeline is: Whisper transcription -> text emotion classification ->
    canned response lookup (by emotion and style) -> gTTS speech synthesis.

    Attributes:
        processor: Whisper feature extractor / tokenizer.
        model: Whisper generation model, moved to ``DEVICE``.
        emotion_classifier: text-classification pipeline returning the top label.
        response_templates: ``{emotion: {style: [candidate replies]}}``.
    """

    # The classifier's label vocabulary (per its model card: anger, disgust,
    # fear, joy, neutral, sadness, surprise) does not match the template keys
    # for these three emotions; without this mapping they would always fall
    # back to the 'neutral' replies.
    _LABEL_TO_EMOTION = {
        'joy': 'happy',
        'sadness': 'sad',
        'anger': 'angry',
    }

    # Keyword overrides, checked in order; the first hit wins.
    # NOTE: these are substring matches, so e.g. "retired" triggers 'tired'.
    _KEYWORD_OVERRIDES = (
        ('disgust', ('disgusting', 'gross', 'revolting')),
        ('tired', ('exhausted', 'tired', 'sleepy')),
    )

    # gTTS selects regional accents through the Google Translate top-level
    # domain (``tld``); the old 'en-uk' / 'en-au' language codes are deprecated.
    _VOICE_PARAMS = {
        'motivational': {'lang': 'en', 'slow': False},
        'calm': {'lang': 'en', 'slow': True},
        'energetic': {'lang': 'en', 'tld': 'co.uk', 'slow': False},
        'angry': {'lang': 'en', 'tld': 'com.au', 'slow': False},
    }

    def __init__(self, model_size="base"):
        """Load the Whisper checkpoint and the emotion classifier.

        Args:
            model_size: suffix of the ``openai/whisper-*`` checkpoint to load.
        """
        print("Initializing models...")
        # Initialize Whisper
        checkpoint = f"openai/whisper-{model_size}"
        self.processor = WhisperProcessor.from_pretrained(checkpoint)
        self.model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to(DEVICE)
        # Initialize emotion classifier on the same device as Whisper
        # (pipeline convention: 0 = first GPU, -1 = CPU).
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=1,
            device=0 if DEVICE == "cuda" else -1,
        )
        # Response templates (a fresh dict per instance)
        self.response_templates = self._build_response_templates()

    @staticmethod
    def _build_response_templates():
        """Return a new ``{emotion: {style: [replies]}}`` lookup table."""
        return {
            'happy': {
                'motivational': ["Your joy is absolutely contagious! This kind of positive energy is what makes life worth living..."],
                'calm': ["I can feel the warmth of your happiness radiating through your words..."],
                'energetic': ["OH MY GOODNESS THIS IS INCREDIBLE NEWS!!! LET'S CELEBRATE!!!"],
                'angry': ["How can you be happy when there's so much suffering in the world?"],
            },
            'sad': {
                'motivational': ["I hear the sadness in your voice, and I want you to know that this feeling won't last forever..."],
                'calm': ["I sense your heavy heart, and I'm here with you in this moment..."],
                'energetic': ["HEY! I know you're feeling DOWN right now, but LISTEN UP! YOU'VE GOT THIS!!"],
                'angry': ["Stop wallowing and do something productive!"],
            },
            'angry': {
                'motivational': ["That passion can fuel positive change! Let's channel this energy constructively..."],
                'calm': ["I sense your anger. Let's take a deep breath and find solutions..."],
                'energetic': ["YEAH! I FEEL THAT TOO! NOW LET'S DO SOMETHING ABOUT IT!!"],
                'angry': ["You think YOU'RE angry? The whole system is broken!"],
            },
            'disgust': {
                'motivational': ["I can tell you're feeling repulsed. Sometimes disgust protects us from harmful things..."],
                'calm': ["I sense your strong distaste. Let's remove ourselves from this situation..."],
                'energetic': ["EW EW EW!!! GROSS ALERT! LET'S GET AWAY FROM THAT RIGHT NOW!!"],
                'angry': ["This is ABSOLUTELY DISGUSTING and UNACCEPTABLE!"],
            },
            'fear': {
                'motivational': ["It's understandable to feel scared. Remember you've survived tough times before..."],
                'calm': ["Fear is a natural response. Let's assess the situation calmly..."],
                'energetic': ["DANGER ALERT! BUT WAIT - LET'S MAKE A SAFETY PLAN!!"],
                'angry': ["Stop being such a coward!"],
            },
            'neutral': {
                'motivational': ["I appreciate you sharing this with me. Every day brings new opportunities..."],
                'calm': ["Thank you for expressing yourself. I'm here to listen..."],
                'energetic': ["LET'S FIND SOMETHING EXCITING TO DO RIGHT NOW!!"],
                'angry': ["Is that all? How utterly boring."],
            },
            'surprise': {
                'motivational': ["Unexpected moments can be life's greatest gifts!..."],
                'calm': ["I sense your surprise. Let's observe what unfolds..."],
                'energetic': ["WHOA! NO WAY! THAT'S INCREDIBLE!!!"],
                'angry': ["Why are you surprised? You should have seen this coming!"],
            },
            'tired': {
                'motivational': ["Rest is revolutionary. Recharge and return stronger..."],
                'calm': ["Fatigue is natural. Honor your need for rest..."],
                'energetic': ["DON'T QUIT! PUSH THROUGH! YOU'RE ALMOST THERE!!"],
                'angry': ["Tired? That's pathetic! Winners never rest!"],
            },
        }

    @staticmethod
    def _error_result():
        """Return the payload reported when audio processing fails."""
        return {
            "transcription": "Error processing audio",
            "emotion": "neutral",
            "response": "Sorry, something went wrong",
            "audio": None,
        }

    def detect_emotion(self, text):
        """Return the emotion key for ``text``.

        The classifier's top label is translated to the template vocabulary
        (joy -> happy, sadness -> sad, anger -> angry). Keyword overrides for
        'disgust' and 'tired' take precedence over the classifier label.

        Args:
            text: transcription to classify.

        Returns:
            An emotion name; 'neutral' if classification raises.
        """
        try:
            prediction = self.emotion_classifier(text)
            # The pipeline may return [[{...}]] or [{...}] depending on how
            # top_k was supplied; peel list layers until the top result dict.
            while isinstance(prediction, (list, tuple)):
                prediction = prediction[0]
            label = prediction['label'].lower()

            # Manual checks
            lowered = text.lower()
            for emotion, keywords in self._KEYWORD_OVERRIDES:
                if any(kw in lowered for kw in keywords):
                    return emotion

            return self._LABEL_TO_EMOTION.get(label, label)
        except Exception as e:
            print(f"Emotion detection error: {e}")
            return 'neutral'

    def generate_response(self, text, emotion, style):
        """Pick a canned reply for ``emotion`` in the requested ``style``.

        Args:
            text: the transcription (currently unused; kept for the interface).
            emotion: emotion key; unknown values fall back to 'neutral'.
            style: reply style; unknown values fall back to 'motivational'.

        Returns:
            The chosen reply as a plain ``str``.
        """
        try:
            if emotion not in self.response_templates:
                emotion = 'neutral'
            if style not in self.response_templates[emotion]:
                style = 'motivational'
            candidates = self.response_templates[emotion][style]
            # np.random.choice yields numpy.str_; hand callers a builtin str.
            return str(np.random.choice(candidates))
        except Exception as e:
            print(f"Response generation error: {e}")
            return "I appreciate you sharing this with me."

    def text_to_speech(self, text, style="motivational"):
        """Synthesize ``text`` to a temporary MP3 file with gTTS.

        Args:
            text: the sentence to speak.
            style: selects the voice parameters; unknown styles use plain 'en'.

        Returns:
            Path of the MP3 file (the caller owns and must delete it), or
            ``None`` if synthesis failed. No file is left behind on failure.
        """
        voice_params = self._VOICE_PARAMS.get(style, {'lang': 'en'})
        audio_path = None
        try:
            # Reserve a unique file name, then close the handle so gTTS can
            # reopen the path for writing on every platform.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
                audio_path = fp.name
            gTTS(text=text, **voice_params).save(audio_path)
            return audio_path
        except Exception as e:
            print(f"TTS error: {e}")
            # delete=False means nobody else will remove the orphaned file.
            if audio_path and os.path.exists(audio_path):
                try:
                    os.unlink(audio_path)
                except OSError as cleanup_error:
                    print(f"TTS cleanup error: {cleanup_error}")
            return None

    def process_audio(self, audio_path, style):
        """Run the full pipeline on one audio file.

        Args:
            audio_path: path of the audio file to transcribe.
            style: response style forwarded to reply selection and TTS.

        Returns:
            Dict with keys 'transcription', 'emotion', 'response' and 'audio'
            (path of the spoken reply or ``None``). On any failure, including
            a missing ``audio_path``, a fixed error payload is returned.
        """
        if not audio_path:
            print("Processing error: no audio file provided")
            return self._error_result()
        try:
            # Transcribe (audio is resampled to 16 kHz on load)
            waveform, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(
                waveform, sampling_rate=16000, return_tensors="pt"
            ).input_features.to(DEVICE)
            predicted_ids = self.model.generate(input_features, max_length=200)
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            # Detect emotion
            emotion = self.detect_emotion(transcription)
            # Generate response
            response = self.generate_response(transcription, emotion, style)
            # Convert to speech
            audio_output = self.text_to_speech(response, style)
            return {
                "transcription": transcription,
                "emotion": emotion,
                "response": response,
                "audio": audio_output,
            }
        except Exception as e:
            print(f"Processing error: {e}")
            return self._error_result()
# Path of the most recently generated spoken reply; nothing has been produced yet.
last_audio_file = None
# Single module-level transcriber instance, built once at import time.
transcriber = EmotionAwareTranscriber()
# Gradio callback; relies on the module-level `transcriber` and `last_audio_file`.
def process_audio_wrapper(audio_path, style):
    """Process one audio file and return the values for the four output widgets.

    Runs the transcriber, deletes the spoken reply left over from the previous
    call, and records the new reply's path in ``last_audio_file``.

    Returns:
        Tuple of (transcription, upper-cased emotion, response text,
        path of the spoken reply or ``None``).
    """
    global last_audio_file
    outcome = transcriber.process_audio(audio_path, style)

    # Clean up the file produced by the previous request, if it still exists.
    stale_path = last_audio_file
    if stale_path and os.path.exists(stale_path):
        try:
            os.unlink(stale_path)
        except Exception as e:
            print(f"Error cleaning up audio file: {e}")

    spoken_path = outcome["audio"]
    last_audio_file = spoken_path
    return (
        outcome["transcription"],
        outcome["emotion"].upper(),
        outcome["response"],
        spoken_path or None,
    )
# Gradio interface
# Declares the page as a gr.Blocks app bound to `demo`: two Markdown headers,
# an audio input, a response-style radio selector, a "Process" button and four
# output widgets (three text boxes and one audio player).
# NOTE(review): leading indentation is missing in this copy of the file, so
# which widgets belong to gr.Row() and which to gr.Column() cannot be
# determined from here — confirm against the deployed file before editing.
with gr.Blocks(title="Emotion-Aware Audio Transcriber") as demo:
gr.Markdown("# 🎤 Emotion-Aware Audio Transcriber")
gr.Markdown("Upload an audio file to get a transcription with emotional analysis and response")
with gr.Row():
# type="filepath": presumably the callback receives a path on disk rather
# than raw samples — confirm against the Gradio Audio documentation.
audio_input = gr.Audio(label="Upload Audio", type="filepath")
# The four choices match the style keys used by the response templates;
# "motivational" is preselected.
style_selector = gr.Radio(
["motivational", "calm", "energetic", "angry"],
label="Response Style",
value="motivational"
)
submit_btn = gr.Button("Process", variant="primary")
with gr.Column():
transcription_output = gr.Textbox(label="Transcription")
emotion_output = gr.Textbox(label="Detected Emotion")
response_output = gr.Textbox(label="Generated Response")
audio_output = gr.Audio(label="Spoken Response")
# Wire the button: process_audio_wrapper is called with the audio input and the
# selected style, and its four return values go to the outputs listed below.
submit_btn.click(
fn=process_audio_wrapper,
inputs=[audio_input, style_selector],
outputs=[transcription_output, emotion_output, response_output, audio_output]
)
# Entry point: start the server when run as a script, otherwise export the UI.
if __name__ != "__main__":
    # Imported as a module: expose the Blocks object under the name `app`
    # (the original author notes this matters for HuggingFace Spaces deployment).
    app = demo
else:
    demo.launch()