Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import subprocess | |
| import tempfile | |
| import os | |
| import shutil | |
| import librosa | |
| from typing import Tuple, Optional | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| from fastapi import FastAPI, File, UploadFile, HTTPException, Query | |
| from fastapi.responses import FileResponse | |
| import uvicorn | |
| # ============================================================================= | |
| # Audio Language Translator - Gradio UI + REST API | |
| # ============================================================================= | |
| # Pipeline: Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis) | |
| # | |
| # Interfaces: | |
| # - Gradio UI: Interactive web interface for users | |
| # - REST API: Programmatic access for developers | |
| # | |
| # Research Foundation: | |
| # - Radford et al. (2022) "Robust Speech Recognition via Large-Scale Weak Supervision" | |
| # - Costa-jussà et al. (2022) "No Language Left Behind" | |
| # ============================================================================= | |
# ----- Device Setup -----
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# ----- Load Whisper -----
# Models are loaded once at import time and shared by every request.
print("Loading Whisper...")
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
whisper_model = whisper_model.to(device)
whisper_model.eval()  # inference only: switch off training-mode layers
print("✓ Whisper loaded")

# ----- Load NLLB -----
print("Loading NLLB...")
nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
nllb_model = nllb_model.to(device)
nllb_model.eval()  # inference only: switch off training-mode layers
print("✓ NLLB loaded")
# ----- Language Configuration -----
# One row per supported language: (ISO 639-1 code, display name, NLLB tag).
# The two lookup dicts below are derived from it so they cannot drift apart.
_LANGUAGE_TABLE = (
    ("en", "English", "eng_Latn"),
    ("es", "Spanish", "spa_Latn"),
    ("fr", "French", "fra_Latn"),
    ("de", "German", "deu_Latn"),
    ("zh", "Chinese", "zho_Hans"),
    ("ar", "Arabic", "arb_Arab"),
    ("hi", "Hindi", "hin_Deva"),
    ("ja", "Japanese", "jpn_Jpan"),
    ("ko", "Korean", "kor_Hang"),
    ("pt", "Portuguese", "por_Latn"),
    ("ru", "Russian", "rus_Cyrl"),
    ("it", "Italian", "ita_Latn"),
    ("nl", "Dutch", "nld_Latn"),
    ("pl", "Polish", "pol_Latn"),
    ("tr", "Turkish", "tur_Latn"),
)

# Language code -> human-readable name.
SUPPORTED_LANGUAGES = {code: name for code, name, _ in _LANGUAGE_TABLE}

# Language code -> NLLB language tag.
LANG_TO_NLLB = {code: tag for code, _, tag in _LANGUAGE_TABLE}


def _voice_entry(*voices):
    """Build one TTS_VOICES entry; the first (id, label) pair is the default."""
    return {"voices": list(voices), "default": voices[0][0]}


# Language code -> {"voices": [(voice id, display label), ...], "default": voice id}.
TTS_VOICES = {
    "en": _voice_entry(
        ("en-US-JennyNeural", "Jenny (US, Female)"),
        ("en-US-GuyNeural", "Guy (US, Male)"),
        ("en-GB-SoniaNeural", "Sonia (UK, Female)"),
    ),
    "es": _voice_entry(
        ("es-ES-ElviraNeural", "Elvira (Spain, Female)"),
        ("es-MX-DaliaNeural", "Dalia (Mexico, Female)"),
        ("es-ES-AlvaroNeural", "Alvaro (Spain, Male)"),
    ),
    "fr": _voice_entry(
        ("fr-FR-DeniseNeural", "Denise (France, Female)"),
        ("fr-FR-HenriNeural", "Henri (France, Male)"),
        ("fr-CA-SylvieNeural", "Sylvie (Canada, Female)"),
    ),
    "de": _voice_entry(
        ("de-DE-KatjaNeural", "Katja (Female)"),
        ("de-DE-ConradNeural", "Conrad (Male)"),
        ("de-AT-IngridNeural", "Ingrid (Austria, Female)"),
    ),
    "zh": _voice_entry(
        ("zh-CN-XiaoxiaoNeural", "Xiaoxiao (Female)"),
        ("zh-CN-YunxiNeural", "Yunxi (Male)"),
        ("zh-CN-XiaoyiNeural", "Xiaoyi (Female)"),
    ),
    "ar": _voice_entry(("ar-SA-ZariyahNeural", "Zariyah (Female)")),
    "hi": _voice_entry(("hi-IN-SwaraNeural", "Swara (Female)")),
    "ja": _voice_entry(("ja-JP-NanamiNeural", "Nanami (Female)")),
    "ko": _voice_entry(("ko-KR-SunHiNeural", "SunHi (Female)")),
    "pt": _voice_entry(("pt-BR-FranciscaNeural", "Francisca (Brazil, Female)")),
    "ru": _voice_entry(("ru-RU-SvetlanaNeural", "Svetlana (Female)")),
    "it": _voice_entry(("it-IT-ElsaNeural", "Elsa (Female)")),
    "nl": _voice_entry(("nl-NL-ColetteNeural", "Colette (Female)")),
    "pl": _voice_entry(("pl-PL-AgnieszkaNeural", "Agnieszka (Female)")),
    "tr": _voice_entry(("tr-TR-EmelNeural", "Emel (Female)")),
}
| # ============================================================================= | |
| # CORE FUNCTIONS (Shared by Gradio and API) | |
| # ============================================================================= | |
def text_to_speech(text: str, lang_code: str, voice: Optional[str] = None) -> str:
    """Synthesize ``text`` to an MP3 file with the ``edge-tts`` CLI.

    Args:
        text: Text to speak.
        lang_code: Key into ``TTS_VOICES``; used to pick the default voice.
        voice: edge-tts voice id. When ``None`` the language default is used.

    Returns:
        Path of a newly created temporary ``.mp3`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        ValueError: If ``lang_code`` is not a key of ``TTS_VOICES``.
        RuntimeError: If ``edge-tts`` exits with a non-zero status.
    """
    if lang_code not in TTS_VOICES:
        raise ValueError(f"Unsupported language: {lang_code}")
    if voice is None:
        voice = TTS_VOICES[lang_code]["default"]

    # Reserve a unique output path; edge-tts writes to it by name, so the
    # handle is closed immediately and the file is kept (delete=False).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_path = temp_file.name

    # Argument list (no shell), so the text is passed through verbatim.
    cmd = ["edge-tts", "--voice", voice, "--text", text, "--write-media", temp_path]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"TTS failed: {result.stderr}")
    except BaseException:
        # Do not leave an orphaned temp file behind when synthesis fails
        # (non-zero exit, missing executable, interruption, ...).
        if os.path.exists(temp_path):
            os.unlink(temp_path)
        raise
    return temp_path
def transcribe_audio(audio_path: str) -> Tuple[str, str]:
    """Detect the spoken language of an audio file and transcribe it with Whisper.

    Args:
        audio_path: Path of the audio file; it is loaded resampled to 16 kHz.

    Returns:
        ``(transcription, language_code)`` where ``language_code`` is one of
        the keys of the candidate table below.
    """
    # Candidate languages and the vocabulary ids of their Whisper language tags.
    # NOTE(review): ids are hard-coded for the multilingual Whisper vocabulary
    # used by "openai/whisper-small" — confirm if the checkpoint changes.
    candidate_token_ids = {
        "en": 50259, "zh": 50260, "de": 50261, "es": 50262, "ru": 50263,
        "ko": 50264, "fr": 50265, "ja": 50266, "pt": 50267, "tr": 50268,
        "pl": 50269, "nl": 50271, "ar": 50272, "it": 50274, "hi": 50276
    }

    waveform, _ = librosa.load(audio_path, sr=16000)
    processed = whisper_processor(waveform, sampling_rate=16000, return_tensors="pt")
    input_features = processed.input_features.to(device)

    # Single decoder step: feed only token 50258 (presumably the
    # start-of-transcript id) and read the logits for the next position.
    with torch.no_grad():
        start_ids = torch.tensor([[50258]]).to(device)
        first_step = whisper_model(
            input_features,
            decoder_input_ids=start_ids,
            return_dict=True
        )
    next_token_logits = first_step.logits[0, 0]

    # Highest-scoring candidate wins; ties resolve to the earliest table entry.
    detected_lang = max(
        candidate_token_ids,
        key=lambda code: next_token_logits[candidate_token_ids[code]].item(),
    )

    with torch.no_grad():
        generated = whisper_model.generate(
            input_features,
            language=detected_lang,
            task="transcribe",
            max_new_tokens=440,
        )
    decoded = whisper_processor.batch_decode(generated, skip_special_tokens=True)
    return decoded[0].strip(), detected_lang
def translate_text(text: str, source_lang: str, target_lang: str) -> str:
    """Translate ``text`` from ``source_lang`` to ``target_lang`` using NLLB.

    Args:
        text: Text to translate.
        source_lang: Language code of ``text`` (a key of ``LANG_TO_NLLB``).
        target_lang: Language code to translate into (a key of ``LANG_TO_NLLB``).

    Returns:
        The translated text. ``text`` is returned unchanged when both
        languages are equal or when ``text`` is empty or whitespace only.

    Raises:
        ValueError: If either language code has no NLLB mapping.
    """
    # Nothing to do: same language, or no content to translate.
    if source_lang == target_lang or not text.strip():
        return text

    src_nllb = LANG_TO_NLLB.get(source_lang)
    tgt_nllb = LANG_TO_NLLB.get(target_lang)
    # Fail loudly instead of handing None to the tokenizer.
    if src_nllb is None:
        raise ValueError(f"Unsupported source language: {source_lang}")
    if tgt_nllb is None:
        raise ValueError(f"Unsupported target language: {target_lang}")

    nllb_tokenizer.src_lang = src_nllb
    # Inputs longer than 512 tokens are truncated, so very long transcripts
    # are only partially translated.
    inputs = nllb_tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        translated_ids = nllb_model.generate(
            **inputs,
            # Force the first generated token to be the target-language tag.
            forced_bos_token_id=nllb_tokenizer.convert_tokens_to_ids(tgt_nllb),
            max_new_tokens=512,
            num_beams=5,
            early_stopping=True,
        )
    return nllb_tokenizer.batch_decode(translated_ids, skip_special_tokens=True)[0]
def full_pipeline(
    audio_path: str, target_lang: str, voice: Optional[str] = None
) -> Tuple[str, str, str, Optional[str], str]:
    """Run the complete pipeline: transcribe, translate, then synthesize speech.

    Args:
        audio_path: Path of the input audio file.
        target_lang: Language code to translate into.
        voice: TTS voice id, or ``None`` for the target language's default.

    Returns:
        ``(detected_language_name, transcription, translated_text,
        output_audio_path, status)``. ``output_audio_path`` is ``None`` when
        no speech was detected or when any step failed; in the failure case
        the first element is ``"Error"`` and ``status`` carries the message.
        This function does not raise: every exception is caught and reported
        through the returned status.
    """
    try:
        transcription, detected_lang = transcribe_audio(audio_path)
        detected_lang_name = SUPPORTED_LANGUAGES.get(detected_lang, detected_lang)
        if not transcription.strip():
            return detected_lang_name, "(No speech detected)", "", None, "⚠️ No speech detected"

        target_lang_name = SUPPORTED_LANGUAGES.get(target_lang, target_lang)
        # translate_text returns its input unchanged when source and target
        # languages match, so no separate same-language branch is needed.
        translated_text = translate_text(transcription, detected_lang, target_lang)
        output_audio = text_to_speech(translated_text, target_lang, voice)
        status = f"✓ Detected: {detected_lang_name} → Output: {target_lang_name}"
        return detected_lang_name, transcription, translated_text, output_audio, status
    except Exception as e:
        # Boundary handler: log the traceback server-side and surface the
        # message to the UI/API instead of propagating the exception.
        import traceback
        traceback.print_exc()
        return "Error", "", "", None, f"❌ Error: {str(e)}"
| # ============================================================================= | |
| # REST API ENDPOINTS | |
| # ============================================================================= | |
| # Create FastAPI app for API endpoints | |
# FastAPI application that hosts the REST endpoints. The description is
# Markdown; blank lines separate paragraphs and lists.
api_app = FastAPI(
    title="Audio Language Translator API",
    description="""
REST API for translating spoken audio between 15 languages.

**Pipeline:** Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)

**Endpoints:**

- `GET /api/languages` - List supported languages
- `GET /api/voices/{lang}` - Get available voices for a language
- `POST /api/transcribe` - Transcribe audio (no translation)
- `POST /api/translate` - Full translation pipeline (returns JSON)
- `POST /api/translate/audio` - Full translation pipeline (returns audio file)
- `GET /api/health` - Health check

**Research Foundation:**

- [Whisper](https://arxiv.org/abs/2212.04356) (Radford et al., 2022)
- [NLLB](https://arxiv.org/abs/2207.04672) (Costa-jussà et al., 2022)
""",
    version="1.0.0",
)
@api_app.get("/api/health")
def health_check():
    """Report API health, the compute device in use, and model status.

    Returns:
        A dict with ``status``, ``device`` and ``models_loaded``. The values
        for ``status`` and ``models_loaded`` are constants: the models are
        loaded at import time, so a running server implies they are present.
    """
    return {
        "status": "healthy",
        "device": str(device),
        "models_loaded": True,
    }
@api_app.get("/api/languages")
def get_languages():
    """List the supported languages.

    Returns:
        A dict with ``languages`` (a list of ``{"code", "name"}`` dicts built
        from ``SUPPORTED_LANGUAGES``) and ``total`` (their count).
    """
    return {
        "languages": [
            {"code": code, "name": name}
            for code, name in SUPPORTED_LANGUAGES.items()
        ],
        "total": len(SUPPORTED_LANGUAGES),
    }
# The path parameter must be named after the function argument to be bound.
@api_app.get("/api/voices/{lang_code}")
def get_voices(lang_code: str):
    """List the available TTS voices for one language.

    Args:
        lang_code: Language code; must be a key of ``TTS_VOICES``.

    Returns:
        A dict with the language code and name, the list of voices as
        ``{"id", "name"}`` dicts, and the default voice id.

    Raises:
        HTTPException: 404 when ``lang_code`` has no voice catalogue.
    """
    if lang_code not in TTS_VOICES:
        raise HTTPException(status_code=404, detail=f"Language '{lang_code}' not supported")
    voices = TTS_VOICES[lang_code]
    return {
        "language": lang_code,
        "language_name": SUPPORTED_LANGUAGES.get(lang_code, lang_code),
        "voices": [{"id": v[0], "name": v[1]} for v in voices["voices"]],
        "default": voices["default"],
    }
@api_app.post("/api/transcribe")
async def api_transcribe(file: UploadFile = File(...)):
    """Transcribe uploaded audio and detect its language (no translation).

    Args:
        file: Uploaded audio file (multipart form field ``file``).

    Returns:
        A dict with ``transcription``, ``detected_language`` (code) and
        ``detected_language_name``.

    Raises:
        HTTPException: 500 when transcription fails.
    """
    # Spool the upload to disk because transcribe_audio takes a file path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name
    try:
        transcription, detected_lang = transcribe_audio(tmp_path)
        return {
            "transcription": transcription,
            "detected_language": detected_lang,
            "detected_language_name": SUPPORTED_LANGUAGES.get(detected_lang, detected_lang),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always remove the spooled upload.
        os.unlink(tmp_path)
@api_app.post("/api/translate")
async def api_translate(
    file: UploadFile = File(...),
    target_language: str = Query(..., description="Target language code (e.g., 'es', 'fr', 'de')"),
    voice: Optional[str] = Query(None, description="TTS voice ID (optional)")
):
    """
    Full translation pipeline: transcribe → translate → text-to-speech.

    Returns JSON with text results. Use /api/translate/audio to get audio file.

    Note that ``target_language`` and ``voice`` are query-string parameters,
    while the audio is sent as the multipart form field ``file``. Pipeline
    failures are reported through the ``status`` field of the response.
    """
    if target_language not in SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported target language: {target_language}. Supported: {list(SUPPORTED_LANGUAGES.keys())}"
        )
    # Spool the upload to disk because the pipeline takes a file path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        shutil.copyfileobj(file.file, tmp)
        input_path = tmp.name

    output_audio = None
    try:
        detected_lang_name, transcription, translated_text, output_audio, status = full_pipeline(
            input_path, target_language, voice
        )
        return {
            "original_text": transcription,
            "detected_language": detected_lang_name,
            "translated_text": translated_text,
            "target_language": SUPPORTED_LANGUAGES.get(target_language, target_language),
            "target_language_code": target_language,
            "audio_generated": output_audio is not None,
            "status": status,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        os.unlink(input_path)
        # This endpoint returns text only, so the synthesized audio file is
        # never served; delete it rather than leaking one file per request.
        if output_audio is not None and os.path.exists(output_audio):
            os.unlink(output_audio)
@api_app.post("/api/translate/audio")
async def api_translate_audio(
    file: UploadFile = File(...),
    target_language: str = Query(..., description="Target language code"),
    voice: Optional[str] = Query(None, description="TTS voice ID (optional)")
):
    """Full translation pipeline - returns audio file directly.

    Args:
        file: Uploaded audio file (multipart form field ``file``).
        target_language: Query parameter; must be a key of ``SUPPORTED_LANGUAGES``.
        voice: Optional query parameter with the TTS voice id.

    Returns:
        A ``FileResponse`` streaming the synthesized MP3.

    Raises:
        HTTPException: 400 for an unsupported language; 500 when the pipeline
            produced no audio.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import BackgroundTasks

    if target_language not in SUPPORTED_LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {target_language}")
    # Spool the upload to disk because the pipeline takes a file path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        shutil.copyfileobj(file.file, tmp)
        input_path = tmp.name
    try:
        _, _, _, output_audio, _ = full_pipeline(input_path, target_language, voice)
    finally:
        os.unlink(input_path)

    if output_audio is None:
        raise HTTPException(status_code=500, detail="Failed to generate audio")

    # Delete the generated MP3 once the response has been sent, so served
    # files do not accumulate in the temp directory.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.unlink, output_audio)
    return FileResponse(
        output_audio,
        media_type="audio/mpeg",
        filename=f"translated_{target_language}.mp3",
        background=cleanup,
    )
| # ============================================================================= | |
| # GRADIO INTERFACE | |
| # ============================================================================= | |
def get_voice_id(lang_code: str, voice_name: str) -> str:
    """Map a voice display label to its voice id for ``lang_code``.

    Falls back to the language's default voice id when the label is not in
    the catalogue. An unknown ``lang_code`` raises ``KeyError`` from the
    fallback lookup.
    """
    if lang_code in TTS_VOICES:
        catalogue = TTS_VOICES[lang_code]["voices"]
        # First entry whose label matches, if any.
        matched_id = next((vid for vid, label in catalogue if label == voice_name), None)
        if matched_id is not None:
            return matched_id
    return TTS_VOICES[lang_code]["default"]
def update_voices(lang: str):
    """Return a voice dropdown populated for ``lang`` with its first voice selected."""
    labels = [label for _, label in TTS_VOICES[lang]["voices"]]
    first_label = labels[0]
    return gr.Dropdown(choices=labels, value=first_label)
def process(audio, target_lang, voice_name):
    """Gradio click handler: run the pipeline and format the four UI outputs.

    Args:
        audio: Path of the uploaded or recorded audio, or ``None`` if absent.
        target_lang: Target language code selected in the dropdown.
        voice_name: Display label of the selected voice.

    Returns:
        ``(status_markdown, original_text, translated_text, audio_path)``.
    """
    if audio is None:
        return "⚠️ Upload or record audio first.", "", "", None
    # The dropdown holds display labels; the pipeline needs the voice id.
    voice_id = get_voice_id(target_lang, voice_name)
    detected, original, translated, output_audio, status = full_pipeline(audio, target_lang, voice_id)
    return f"**Detected:** {detected}\n\n**Status:** {status}", original, translated, output_audio
# (label, value) pairs for the target-language dropdown.
lang_choices = [(name, code) for code, name in SUPPORTED_LANGUAGES.items()]

# Create Gradio interface
# Markdown bodies are written flush-left with blank lines between paragraphs,
# tables and code fences so they render as intended.
with gr.Blocks(title="Audio Language Translator") as demo:
    gr.Markdown("""
# 🌐 Audio Language Translator

Translate spoken audio between 15 languages using AI.

**Pipeline:** Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)

---

**🔌 REST API Available!** [View API Documentation](/docs)

---
""")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎤 Input")
            audio_in = gr.Audio(label="Upload or Record", type="filepath", sources=["upload", "microphone"])
            target = gr.Dropdown(label="Target Language", choices=lang_choices, value="es")
            # Initial voice list matches the initial target language ("es").
            voice = gr.Dropdown(
                label="Voice",
                choices=[v[1] for v in TTS_VOICES["es"]["voices"]],
                value=TTS_VOICES["es"]["voices"][0][1],
            )
            btn = gr.Button("🔄 Translate", variant="primary")
        with gr.Column():
            gr.Markdown("### 📝 Output")
            status_out = gr.Markdown()
            original_out = gr.Textbox(label="Original Transcription", lines=3)
            translated_out = gr.Textbox(label="Translated Text", lines=3)
            audio_out = gr.Audio(label="Translated Audio", type="filepath")

    # Refresh the voice list whenever the target language changes.
    target.change(update_voices, target, voice)
    btn.click(process, [audio_in, target, voice], [status_out, original_out, translated_out, audio_out])

    with gr.Accordion("📖 REST API Documentation", open=False):
        # target_language is a query parameter of the endpoint, so both
        # examples pass it in the URL/params rather than as a form field.
        gr.Markdown("""
### API Endpoints

Access the interactive API documentation at **`/docs`**

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/health` | GET | Health check |
| `/api/languages` | GET | List supported languages |
| `/api/voices/{lang}` | GET | Get voices for a language |
| `/api/transcribe` | POST | Transcribe audio only |
| `/api/translate` | POST | Full translation (returns JSON) |
| `/api/translate/audio` | POST | Full translation (returns audio file) |

### Example Usage (Python)

```python
import requests

# Translate audio file
with open("input.wav", "rb") as f:
    response = requests.post(
        "https://your-space.hf.space/api/translate",
        files={"file": f},
        params={"target_language": "es"}
    )
print(response.json())
```

### Example Usage (cURL)

```bash
curl -X POST "https://your-space.hf.space/api/translate?target_language=es" \\
  -F "file=@input.wav"
```
""")
    with gr.Accordion("🌍 Supported Languages & Voices", open=False):
        gr.Markdown("""
**Tier 1 (Multiple Voices):** English (3), Spanish (3), French (3), German (3), Chinese (3)

**Tier 2 (Single Voice):** Arabic, Hindi, Japanese, Korean, Portuguese, Russian, Italian, Dutch, Polish, Turkish

**Total:** 15 languages, 25 voices
""")
    with gr.Accordion("🔧 Technical Details", open=False):
        gr.Markdown("""
| Component | Model | Parameters | Purpose |
|-----------|-------|------------|---------|
| ASR | openai/whisper-small | 244M | Speech-to-text with language detection |
| Translation | facebook/nllb-200-distilled-600M | 615M | Multilingual translation (200 languages) |
| TTS | Microsoft Edge-TTS | API | Neural text-to-speech (25 voices) |

**GPU Memory:** ~3.5 GB (Whisper + NLLB)
""")
# Mount the Gradio UI onto the FastAPI app at the root path; `app` is the
# combined ASGI application exposed at module level.
app = gr.mount_gradio_app(api_app, demo, path="/")

# When executed as a script, serve `app` with uvicorn on port 7860. When the
# module is imported instead, nothing more is needed: the importing server
# picks up the module-level `app` defined above.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)