Hugging Face Spaces app (Space status: Sleeping)
| import gradio as gr | |
| import torch | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline | |
| import librosa | |
| from gtts import gTTS | |
| import numpy as np | |
| import tempfile | |
| import os | |
# Pick the compute device once at import time; all models are moved to it.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")
class EmotionAwareTranscriber:
    """Transcribe speech with Whisper, classify the emotion of the text,
    and generate a styled text + spoken (gTTS) response.

    The heavy model loading happens once in ``__init__``; per-request work
    goes through :meth:`process_audio`.
    """

    # BUG FIX: the j-hartmann model emits 'joy'/'sadness'/'anger', but the
    # response templates are keyed 'happy'/'sad'/'angry'. Without this map
    # those three emotions always fell through to 'neutral'.
    _LABEL_MAP = {'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}

    def __init__(self, model_size="base"):
        """Load Whisper (size: tiny/base/small/...) and the emotion classifier."""
        print("Initializing models...")
        # Initialize Whisper
        self.processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
        self.model = WhisperForConditionalGeneration.from_pretrained(
            f"openai/whisper-{model_size}"
        ).to(DEVICE)
        # Initialize emotion classifier (top_k=1 -> [[{'label', 'score'}]])
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=1
        )
        # Canned responses, keyed by detected emotion then by response style.
        self.response_templates = {
            'happy': {
                'motivational': ["Your joy is absolutely contagious! This kind of positive energy is what makes life worth living..."],
                'calm': ["I can feel the warmth of your happiness radiating through your words..."],
                'energetic': ["OH MY GOODNESS THIS IS INCREDIBLE NEWS!!! LET'S CELEBRATE!!!"],
                'angry': ["How can you be happy when there's so much suffering in the world?"]
            },
            'sad': {
                'motivational': ["I hear the sadness in your voice, and I want you to know that this feeling won't last forever..."],
                'calm': ["I sense your heavy heart, and I'm here with you in this moment..."],
                'energetic': ["HEY! I know you're feeling DOWN right now, but LISTEN UP! YOU'VE GOT THIS!!"],
                'angry': ["Stop wallowing and do something productive!"]
            },
            'angry': {
                'motivational': ["That passion can fuel positive change! Let's channel this energy constructively..."],
                'calm': ["I sense your anger. Let's take a deep breath and find solutions..."],
                'energetic': ["YEAH! I FEEL THAT TOO! NOW LET'S DO SOMETHING ABOUT IT!!"],
                'angry': ["You think YOU'RE angry? The whole system is broken!"]
            },
            'disgust': {
                'motivational': ["I can tell you're feeling repulsed. Sometimes disgust protects us from harmful things..."],
                'calm': ["I sense your strong distaste. Let's remove ourselves from this situation..."],
                'energetic': ["EW EW EW!!! GROSS ALERT! LET'S GET AWAY FROM THAT RIGHT NOW!!"],
                'angry': ["This is ABSOLUTELY DISGUSTING and UNACCEPTABLE!"]
            },
            'fear': {
                'motivational': ["It's understandable to feel scared. Remember you've survived tough times before..."],
                'calm': ["Fear is a natural response. Let's assess the situation calmly..."],
                'energetic': ["DANGER ALERT! BUT WAIT - LET'S MAKE A SAFETY PLAN!!"],
                'angry': ["Stop being such a coward!"]
            },
            'neutral': {
                'motivational': ["I appreciate you sharing this with me. Every day brings new opportunities..."],
                'calm': ["Thank you for expressing yourself. I'm here to listen..."],
                'energetic': ["LET'S FIND SOMETHING EXCITING TO DO RIGHT NOW!!"],
                'angry': ["Is that all? How utterly boring."]
            },
            'surprise': {
                'motivational': ["Unexpected moments can be life's greatest gifts!..."],
                'calm': ["I sense your surprise. Let's observe what unfolds..."],
                'energetic': ["WHOA! NO WAY! THAT'S INCREDIBLE!!!"],
                'angry': ["Why are you surprised? You should have seen this coming!"]
            },
            'tired': {
                'motivational': ["Rest is revolutionary. Recharge and return stronger..."],
                'calm': ["Fatigue is natural. Honor your need for rest..."],
                'energetic': ["DON'T QUIT! PUSH THROUGH! YOU'RE ALMOST THERE!!"],
                'angry': ["Tired? That's pathetic! Winners never rest!"]
            }
        }

    def detect_emotion(self, text):
        """Return a template key for *text*; 'neutral' on any failure."""
        try:
            lowered = text.lower()
            # Keyword overrides run FIRST (they discard the classifier result
            # anyway, so running the model beforehand was wasted work).
            # 'tired' is not a label the model can emit at all.
            if any(kw in lowered for kw in ('disgusting', 'gross', 'revolting')):
                return 'disgust'
            if any(kw in lowered for kw in ('exhausted', 'tired', 'sleepy')):
                return 'tired'
            result = self.emotion_classifier(text)[0][0]
            label = result['label'].lower()
            # Translate model labels (joy/sadness/anger) to template keys.
            return self._LABEL_MAP.get(label, label)
        except Exception as e:
            print(f"Emotion detection error: {e}")
            return 'neutral'

    def generate_response(self, text, emotion, style):
        """Pick a canned response for (emotion, style), with safe fallbacks.

        Unknown emotions fall back to 'neutral'; unknown styles to
        'motivational'. Returns a generic line if anything goes wrong.
        """
        try:
            if emotion not in self.response_templates:
                emotion = 'neutral'
            if style not in self.response_templates[emotion]:
                style = 'motivational'
            return np.random.choice(self.response_templates[emotion][style])
        except Exception as e:
            print(f"Response generation error: {e}")
            return "I appreciate you sharing this with me."

    def text_to_speech(self, text, style="motivational"):
        """Render *text* to a temp .mp3 via gTTS; return its path or None.

        The caller (process_audio_wrapper) is responsible for deleting the
        file later — it is created with delete=False on purpose.
        """
        try:
            # BUG FIX: gTTS rejects locale codes like 'en-uk'/'en-au';
            # accents are selected with the `tld` parameter instead.
            voice_params = {
                'motivational': {'lang': 'en', 'slow': False},
                'calm': {'lang': 'en', 'slow': True},
                'energetic': {'lang': 'en', 'tld': 'co.uk', 'slow': False},
                'angry': {'lang': 'en', 'tld': 'com.au', 'slow': False}
            }.get(style, {'lang': 'en'})
            # Reserve a unique path, then write to it after the handle is
            # closed (gTTS.save reopens the path itself).
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
                out_path = fp.name
            gTTS(text=text, **voice_params).save(out_path)
            return out_path
        except Exception as e:
            print(f"TTS error: {e}")
            return None

    def process_audio(self, audio_path, style):
        """Full pipeline: transcribe -> detect emotion -> respond -> TTS.

        Returns a dict with keys 'transcription', 'emotion', 'response',
        'audio' (temp mp3 path or None). Never raises; errors yield a
        placeholder result.
        """
        try:
            # Transcribe (Whisper expects 16 kHz mono input)
            waveform, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(
                waveform, sampling_rate=16000, return_tensors="pt"
            ).input_features.to(DEVICE)
            predicted_ids = self.model.generate(input_features, max_length=200)
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            # Detect emotion
            emotion = self.detect_emotion(transcription)
            # Generate response
            response = self.generate_response(transcription, emotion, style)
            # Convert to speech
            audio_output = self.text_to_speech(response, style)
            return {
                "transcription": transcription,
                "emotion": emotion,
                "response": response,
                "audio": audio_output
            }
        except Exception as e:
            print(f"Processing error: {e}")
            return {
                "transcription": "Error processing audio",
                "emotion": "neutral",
                "response": "Sorry, something went wrong",
                "audio": None
            }
# Instantiate the transcriber once at import time so the Whisper and
# emotion models are downloaded/loaded a single time and shared by all
# Gradio requests.
transcriber = EmotionAwareTranscriber()
# Path of the most recently generated TTS mp3; the next request deletes it
# so temp files don't accumulate (see process_audio_wrapper).
last_audio_file = None
| # Define the process_audio_wrapper function AFTER initializing the variable | |
| def process_audio_wrapper(audio_path, style): | |
| global last_audio_file | |
| result = transcriber.process_audio(audio_path, style) | |
| # Clean up previous audio files | |
| if last_audio_file and os.path.exists(last_audio_file): | |
| try: | |
| os.unlink(last_audio_file) | |
| except Exception as e: | |
| print(f"Error cleaning up audio file: {e}") | |
| last_audio_file = result["audio"] | |
| return ( | |
| result["transcription"], | |
| result["emotion"].upper(), | |
| result["response"], | |
| result["audio"] if result["audio"] else None | |
| ) | |
# Gradio interface: inputs (audio + style) on top, outputs stacked below.
with gr.Blocks(title="Emotion-Aware Audio Transcriber") as demo:
    gr.Markdown("# 🎤 Emotion-Aware Audio Transcriber")
    gr.Markdown("Upload an audio file to get a transcription with emotional analysis and response")
    with gr.Row():
        # type="filepath" hands process_audio_wrapper a path on disk,
        # which librosa.load expects.
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        style_selector = gr.Radio(
            ["motivational", "calm", "energetic", "angry"],
            label="Response Style",
            value="motivational"
        )
    submit_btn = gr.Button("Process", variant="primary")
    with gr.Column():
        transcription_output = gr.Textbox(label="Transcription")
        emotion_output = gr.Textbox(label="Detected Emotion")
        response_output = gr.Textbox(label="Generated Response")
        audio_output = gr.Audio(label="Spoken Response")
    # Wire the button to the processing callback; output order must match
    # the 4-tuple returned by process_audio_wrapper.
    submit_btn.click(
        fn=process_audio_wrapper,
        inputs=[audio_input, style_selector],
        outputs=[transcription_output, emotion_output, response_output, audio_output]
    )
# Launch the app
if __name__ == "__main__":
    # Run as a local script: start the Gradio server directly.
    demo.launch()
else:
    # When imported (e.g. by the HuggingFace Spaces runtime), expose the
    # Blocks object under the conventional `app` name instead of launching.
    app = demo