Hugging Face Space — status: Sleeping
| import os | |
| import gradio as gr | |
| import asyncio | |
| import tempfile | |
| import edge_tts | |
| import requests | |
| from langdetect import detect, LangDetectException | |
| from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer | |
# ----------------------------
# 1. SPEECH TO TEXT (Whisper)
# ----------------------------
# Loaded once at import time; downloads the model weights on first run.
stt_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")
def transcribe(audio):
    """Transcribe a recorded audio file to text with Whisper.

    Returns None when no recording was supplied.
    """
    if audio is None:
        return None
    return stt_pipeline(audio)["text"]
# ----------------------------
# 2. TRANSLATION (M2M100)
# ----------------------------
# Many-to-many multilingual translation model, loaded once at import time.
m2m_model_name = "facebook/m2m100_418M"
m2m_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)
m2m_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name)
# Maps the Gradio dropdown labels to M2M100 language codes.
LANG_UI_TO_CODE = {"English": "en", "Spanish": "es", "French": "fr"}
def translate_text(user_text, target_lang_ui):
    """Translate *user_text* into the UI-selected target language via M2M100.

    Args:
        user_text: source text; returned as "" when blank.
        target_lang_ui: dropdown label ("English"/"Spanish"/"French");
            unknown labels fall back to English.

    Returns:
        The translated string, or the input unchanged when the detected
        source language already matches the target.
    """
    if not user_text.strip():
        return ""
    target_code = LANG_UI_TO_CODE.get(target_lang_ui, "en")
    try:
        src_code = detect(user_text)
    except LangDetectException:
        # Detection fails on very short / ambiguous input; assume English.
        src_code = "en"
    # langdetect can return region-tagged codes (e.g. "zh-cn"), but
    # M2M100 expects the bare language code — strip the region suffix.
    src_code = src_code.split("-")[0]
    if src_code == target_code:
        return user_text
    m2m_tokenizer.src_lang = src_code
    encoded = m2m_tokenizer(user_text, return_tensors="pt")
    generated = m2m_model.generate(
        **encoded,
        forced_bos_token_id=m2m_tokenizer.get_lang_id(target_code),
    )
    return m2m_tokenizer.decode(generated[0], skip_special_tokens=True)
# ----------------------------
# 3. EMOTION DETECTION (Groq API)
# ----------------------------
# Read from the environment (set as a Space secret); may be None, in which
# case the request fails and detect_emotion_tone falls back to "neutral".
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# NOTE(review): Groq's public API is served from api.groq.com with an
# OpenAI-compatible schema; this "api.groq.ai/v1/text/analyze" endpoint
# looks unverified — confirm it actually exists.
API_URL = "https://api.groq.ai/v1/text/analyze"
def detect_emotion_tone(text):
    """Return the dominant emotion label for *text* via the Groq API.

    Falls back to "neutral" on blank input, network failure, a malformed
    response, or a missing/empty "emotion" mapping in the reply.
    """
    if not text.strip():
        return "neutral"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {"text": text, "features": ["emotion"]}
    try:
        # Timeout keeps a stalled API call from hanging the whole
        # Gradio request handler.
        r = requests.post(API_URL, headers=headers, json=payload, timeout=10)
        r.raise_for_status()
        emotions = r.json().get("emotion", {})
        if not emotions:
            return "neutral"
        # Pick the emotion with the highest score.
        return max(emotions, key=emotions.get)
    except Exception:
        # Best-effort by design: the emotion only selects the TTS voice,
        # so any failure degrades gracefully to the neutral voice.
        return "neutral"
# ----------------------------
# 4. TEXT TO SPEECH (Edge TTS)
# ----------------------------
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file with Edge TTS and return its path.

    Args:
        text: text to speak; blank text returns None.
        voice: "ShortName - Locale" string; only the ShortName is used.
        rate: integer speaking-rate offset in percent.
        pitch: integer pitch offset in Hz.

    Returns:
        Path to the generated MP3, or None for blank input. The file is
        not deleted here — Gradio serves it to the client afterwards.
    """
    if not text.strip():
        return None
    voice_short_name = voice.split(" - ")[0]
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=f"{rate:+d}%", pitch=f"{pitch:+d}Hz"
    )
    # Reserve a temp path and CLOSE the handle before edge-tts writes to
    # it — writing to the path while NamedTemporaryFile still holds it
    # open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp_path = tmp.name
    await communicate.save(tmp_path)
    return tmp_path
def tts_sync(text, voice, rate, pitch):
    """Blocking wrapper around the async Edge-TTS helper."""
    coroutine = text_to_speech(text, voice, rate, pitch)
    return asyncio.run(coroutine)
# ----------------------------
# 5. PIPELINE FUNCTION
# ----------------------------
async def full_pipeline(audio, target_lang):
    """Full STT -> translate -> emotion -> TTS pipeline for the Gradio UI.

    Args:
        audio: filepath of the recorded audio (or None).
        target_lang: dropdown label of the target language.

    Returns:
        Path to the synthesized MP3, or None when nothing was recorded,
        nothing was transcribed, or no TTS voices are available.
    """
    # Step 1: speech to text.
    text = transcribe(audio)
    if not text:
        return None
    # Step 2: translate into the selected target language.
    translated = translate_text(text, target_lang)
    # Step 3: emotion is detected on the ORIGINAL utterance, pre-translation.
    emotion = detect_emotion_tone(text)
    # Step 4: map the emotion onto a specific Edge-TTS voice.
    voices = await edge_tts.list_voices()
    if not voices:
        return None  # no voices available — cannot synthesize
    emotion_to_voice = {
        "happy": "en-US-AriaNeural",
        "sad": "en-US-JennyNeural",
        "angry": "en-US-GuyNeural",
    }
    wanted = emotion_to_voice.get(emotion)
    voice_choice = [v for v in voices if wanted and wanted in v["ShortName"]]
    if not voice_choice:
        # Unknown emotion, or the preferred voice is not in the catalog:
        # fall back to the first available voice instead of raising
        # IndexError on an empty filter result.
        voice_choice = [voices[0]]
    voice_final = f"{voice_choice[0]['ShortName']} - {voice_choice[0]['Locale']}"
    # Step 5: synthesize the translated text with default rate/pitch.
    return await text_to_speech(translated, voice_final, 0, 0)
# ----------------------------
# 6. GRADIO UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# π€ Speech Translator with Emotions")
    with gr.Row():
        mic_input = gr.Audio(
            sources=["microphone"], type="filepath", label="Record Speech"
        )
        language_dropdown = gr.Dropdown(
            choices=["English", "Spanish", "French"],
            value="English",
            label="Target Language",
        )
    output_audio = gr.Audio(label="π Final Speech", type="filepath")
    translate_btn = gr.Button("π Translate & Speak")
    translate_btn.click(
        fn=full_pipeline,
        inputs=[mic_input, language_dropdown],
        outputs=[output_audio],
    )

if __name__ == "__main__":
    demo.launch()