Hugging Face Space — status: Sleeping
| import os | |
| import gradio as gr | |
| import asyncio | |
| import tempfile | |
| import edge_tts | |
| import requests | |
| from langdetect import detect, LangDetectException | |
| from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer | |
# ----------------------------
# 1. SPEECH TO TEXT (Whisper)
# ----------------------------
# Loaded once at import time; downloads the model weights on first run.
stt_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")
def transcribe(audio):
    """Transcribe a recorded audio file to text with Whisper.

    Returns None when no recording was supplied.
    """
    if audio is None:
        return None
    return stt_pipeline(audio)["text"]
# ----------------------------
# 2. TRANSLATION (M2M100)
# ----------------------------
# Many-to-many multilingual translation model, loaded once at import time.
m2m_model_name = "facebook/m2m100_418M"
m2m_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)
m2m_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name)
# Maps the Gradio dropdown labels to M2M100 language codes.
LANG_UI_TO_CODE = {"English": "en", "Spanish": "es", "French": "fr"}
def translate_text(user_text, target_lang_ui):
    """Translate *user_text* into the UI-selected target language via M2M100.

    Args:
        user_text: source text; returned as "" when blank.
        target_lang_ui: dropdown label ("English"/"Spanish"/"French");
            unknown labels fall back to English.

    Returns:
        The translated string, or the input unchanged when the detected
        source language already matches the target.
    """
    if not user_text.strip():
        return ""
    target_code = LANG_UI_TO_CODE.get(target_lang_ui, "en")
    try:
        src_code = detect(user_text)
    except LangDetectException:
        # Detection fails on very short / ambiguous input; assume English.
        src_code = "en"
    # langdetect can return region-tagged codes (e.g. "zh-cn"), but
    # M2M100 expects the bare language code — strip the region suffix.
    src_code = src_code.split("-")[0]
    if src_code == target_code:
        return user_text
    m2m_tokenizer.src_lang = src_code
    encoded = m2m_tokenizer(user_text, return_tensors="pt")
    generated = m2m_model.generate(
        **encoded,
        forced_bos_token_id=m2m_tokenizer.get_lang_id(target_code),
    )
    return m2m_tokenizer.decode(generated[0], skip_special_tokens=True)
# ----------------------------
# 3. EMOTION DETECTION (Groq API)
# ----------------------------
# Read from the environment (set as a Space secret); may be None, in which
# case the request fails and detect_emotion_tone falls back to "neutral".
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# NOTE(review): Groq's public API is served from api.groq.com with an
# OpenAI-compatible schema; this "api.groq.ai/v1/text/analyze" endpoint
# looks unverified — confirm it actually exists.
API_URL = "https://api.groq.ai/v1/text/analyze"
def detect_emotion_tone(text):
    """Return the dominant emotion label for *text* via the Groq API.

    Falls back to "neutral" on blank input, network failure, a malformed
    response, or a missing/empty "emotion" mapping in the reply.
    """
    if not text.strip():
        return "neutral"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {"text": text, "features": ["emotion"]}
    try:
        # Timeout keeps a stalled API call from hanging the whole
        # Gradio request handler.
        r = requests.post(API_URL, headers=headers, json=payload, timeout=10)
        r.raise_for_status()
        emotions = r.json().get("emotion", {})
        if not emotions:
            return "neutral"
        # Pick the emotion with the highest score.
        return max(emotions, key=emotions.get)
    except Exception:
        # Best-effort by design: the emotion only selects the TTS voice,
        # so any failure degrades gracefully to the neutral voice.
        return "neutral"
# ----------------------------
# 4. TEXT TO SPEECH (Edge TTS)
# ----------------------------
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file with Edge TTS and return its path.

    Args:
        text: text to speak; blank text returns None.
        voice: "ShortName - Locale" string; only the ShortName is used.
        rate: integer speaking-rate offset in percent.
        pitch: integer pitch offset in Hz.

    Returns:
        Path to the generated MP3, or None for blank input. The file is
        not deleted here — Gradio serves it to the client afterwards.
    """
    if not text.strip():
        return None
    voice_short_name = voice.split(" - ")[0]
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=f"{rate:+d}%", pitch=f"{pitch:+d}Hz"
    )
    # Reserve a temp path and CLOSE the handle before edge-tts writes to
    # it — writing to the path while NamedTemporaryFile still holds it
    # open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp_path = tmp.name
    await communicate.save(tmp_path)
    return tmp_path
def tts_sync(text, voice, rate, pitch):
    """Blocking wrapper around the async Edge-TTS helper."""
    coroutine = text_to_speech(text, voice, rate, pitch)
    return asyncio.run(coroutine)
# ----------------------------
# 5. PIPELINE FUNCTION
# ----------------------------
async def full_pipeline(audio, target_lang):
    """Full STT -> translate -> emotion -> TTS pipeline for the Gradio UI.

    Args:
        audio: filepath of the recorded audio (or None).
        target_lang: dropdown label of the target language.

    Returns:
        Path to the synthesized MP3, or None when nothing was recorded,
        nothing was transcribed, or no TTS voices are available.
    """
    # Step 1: speech to text.
    text = transcribe(audio)
    if not text:
        return None
    # Step 2: translate into the selected target language.
    translated = translate_text(text, target_lang)
    # Step 3: emotion is detected on the ORIGINAL utterance, pre-translation.
    emotion = detect_emotion_tone(text)
    # Step 4: map the emotion onto a specific Edge-TTS voice.
    voices = await edge_tts.list_voices()
    if not voices:
        return None  # no voices available — cannot synthesize
    emotion_to_voice = {
        "happy": "en-US-AriaNeural",
        "sad": "en-US-JennyNeural",
        "angry": "en-US-GuyNeural",
    }
    wanted = emotion_to_voice.get(emotion)
    voice_choice = [v for v in voices if wanted and wanted in v["ShortName"]]
    if not voice_choice:
        # Unknown emotion, or the preferred voice is not in the catalog:
        # fall back to the first available voice instead of raising
        # IndexError on an empty filter result.
        voice_choice = [voices[0]]
    voice_final = f"{voice_choice[0]['ShortName']} - {voice_choice[0]['Locale']}"
    # Step 5: synthesize the translated text with default rate/pitch.
    return await text_to_speech(translated, voice_final, 0, 0)
# ----------------------------
# 6. GRADIO UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# π€ Speech Translator with Emotions")
    with gr.Row():
        mic_input = gr.Audio(
            sources=["microphone"], type="filepath", label="Record Speech"
        )
        language_dropdown = gr.Dropdown(
            choices=["English", "Spanish", "French"],
            value="English",
            label="Target Language",
        )
    output_audio = gr.Audio(label="π Final Speech", type="filepath")
    translate_btn = gr.Button("π Translate & Speak")
    translate_btn.click(
        fn=full_pipeline,
        inputs=[mic_input, language_dropdown],
        outputs=[output_audio],
    )

if __name__ == "__main__":
    demo.launch()