import os
import gradio as gr
import asyncio
import tempfile
import edge_tts
import requests
from langdetect import detect, LangDetectException
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer

# ----------------------------
# 1. SPEECH TO TEXT (Whisper)
# ----------------------------
stt_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")


def transcribe(audio):
    """Transcribe an audio file to text with Whisper.

    Args:
        audio: Path to the recorded audio file, or None when nothing
            was recorded.

    Returns:
        The transcribed text, or None when no audio was provided.
    """
    if audio is None:
        return None
    result = stt_pipeline(audio)
    return result["text"]


# ----------------------------
# 2. TRANSLATION (M2M100)
# ----------------------------
m2m_model_name = "facebook/m2m100_418M"
m2m_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)
m2m_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name)

# UI language label -> M2M100 language code.
LANG_UI_TO_CODE = {"English": "en", "Spanish": "es", "French": "fr"}


def translate_text(user_text, target_lang_ui):
    """Translate *user_text* into the UI-selected target language via M2M100.

    Args:
        user_text: Source text in any language (auto-detected).
        target_lang_ui: UI label of the target language (e.g. "Spanish");
            unknown labels fall back to English.

    Returns:
        The translated string; the original text unchanged when source and
        target languages match; "" for blank input.
    """
    if not user_text.strip():
        return ""
    target_code = LANG_UI_TO_CODE.get(target_lang_ui, "en")
    try:
        src_code = detect(user_text)
    except LangDetectException:
        # detect() raises on very short / ambiguous input; assume English.
        src_code = "en"
    # BUG FIX: langdetect can emit region-tagged codes (e.g. "zh-cn",
    # "pt-br") that M2M100 does not know, which would make get_lang_id /
    # src_lang blow up. Fall back to the bare language, then to English.
    if src_code not in m2m_tokenizer.lang_code_to_id:
        src_code = src_code.split("-")[0]
        if src_code not in m2m_tokenizer.lang_code_to_id:
            src_code = "en"
    if src_code == target_code:
        return user_text
    m2m_tokenizer.src_lang = src_code
    encoded = m2m_tokenizer(user_text, return_tensors="pt")
    generated = m2m_model.generate(
        **encoded, forced_bos_token_id=m2m_tokenizer.get_lang_id(target_code)
    )
    return m2m_tokenizer.decode(generated[0], skip_special_tokens=True)


# ----------------------------
# 3.
# EMOTION DETECTION (Groq API)
# ----------------------------
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
API_URL = "https://api.groq.ai/v1/text/analyze"


def detect_emotion_tone(text):
    """Classify the dominant emotion of *text* via the Groq analyze endpoint.

    Args:
        text: Text to analyze.

    Returns:
        The highest-scoring emotion label reported by the API, or
        "neutral" for blank input, request failures, or an empty result.
    """
    if not text.strip():
        return "neutral"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {"text": text, "features": ["emotion"]}
    try:
        # BUG FIX: a timeout so a stalled API call cannot hang the whole
        # pipeline indefinitely (requests has no default timeout).
        r = requests.post(API_URL, headers=headers, json=payload, timeout=15)
        r.raise_for_status()
        result = r.json()
        emotions = result.get("emotion", {})
        if not emotions:
            return "neutral"
        # Pick the label with the highest score.
        return max(emotions, key=emotions.get)
    except Exception:
        # Emotion detection is best-effort; degrade to neutral on any failure.
        return "neutral"


# ----------------------------
# 4. TEXT TO SPEECH (Edge TTS)
# ----------------------------
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file with Edge TTS.

    Args:
        text: Text to speak.
        voice: Voice label, either a bare short name ("en-US-AriaNeural")
            or "ShortName - Locale"; only the short name is used.
        rate: Speaking-rate delta in percent (coerced to int).
        pitch: Pitch delta in Hz (coerced to int).

    Returns:
        Path to the generated MP3 file, or None for blank input.
    """
    if not text.strip():
        return None
    voice_short_name = voice.split(" - ")[0]
    # BUG FIX: coerce to int — Gradio sliders deliver floats, and the
    # "+d" format spec raises ValueError on a float.
    communicate = edge_tts.Communicate(
        text,
        voice_short_name,
        rate=f"{int(rate):+d}%",
        pitch=f"{int(pitch):+d}Hz",
    )
    # BUG FIX: create the temp file, then close the handle BEFORE edge_tts
    # writes to the path — on Windows an open NamedTemporaryFile cannot be
    # reopened by another writer.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.close()
    await communicate.save(tmp.name)
    return tmp.name


def tts_sync(text, voice, rate, pitch):
    """Blocking wrapper around text_to_speech for non-async callers."""
    return asyncio.run(text_to_speech(text, voice, rate, pitch))


# ----------------------------
# 5.
# PIPELINE FUNCTION
# ----------------------------
# Detected emotion -> preferred Edge TTS voice short name.
EMOTION_TO_VOICE = {
    "happy": "en-US-AriaNeural",
    "sad": "en-US-JennyNeural",
    "angry": "en-US-GuyNeural",
}


async def full_pipeline(audio, target_lang):
    """Run the full STT -> translate -> emotion -> TTS pipeline.

    Args:
        audio: Path to the recorded audio file, or None.
        target_lang: UI label of the target language.

    Returns:
        Path to the synthesized MP3 file, or None when transcription
        produced no text.
    """
    # Step 1: Speech-to-text.
    text = transcribe(audio)
    if not text:
        return None
    # Step 2: Translate into the requested language.
    translated = translate_text(text, target_lang)
    # Step 3: Emotion is detected on the ORIGINAL (pre-translation) text.
    emotion = detect_emotion_tone(text)
    # Step 4: Pick a voice for the detected emotion.
    voices = await edge_tts.list_voices()
    wanted = EMOTION_TO_VOICE.get(emotion)
    # BUG FIX: the original indexed a filtered list that could be empty
    # (voice not present in list_voices()), raising IndexError. Fall back
    # to the first available voice in that case.
    voice_choice = [v for v in voices if wanted and wanted in v["ShortName"]]
    if not voice_choice:
        voice_choice = [voices[0]]
    voice_final = f"{voice_choice[0]['ShortName']} - {voice_choice[0]['Locale']}"
    # Step 5: Generate the final audio.
    return await text_to_speech(translated, voice_final, 0, 0)


# ----------------------------
# 6. GRADIO UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Speech Translator with Emotions")
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Record Speech")
        target_lang = gr.Dropdown(
            choices=["English", "Spanish", "French"],
            value="English",
            label="Target Language",
        )
    final_speech = gr.Audio(label="🔊 Final Speech", type="filepath")
    run_btn = gr.Button("🚀 Translate & Speak")
    # Gradio supports async callbacks, so full_pipeline is passed directly.
    run_btn.click(fn=full_pipeline, inputs=[audio_in, target_lang], outputs=[final_speech])

if __name__ == "__main__":
    demo.launch()