Spaces:

Nav772
/

audio-language-translator

Sleeping

App Files Files Community

Nav772 committed on Feb 19

Commit

2342a6c

verified ·

1 Parent(s): 0cac47e

Update app.py

Browse files

Files changed (1) hide show

app.py +217 -13

app.py CHANGED Viewed

@@ -3,21 +3,27 @@ import torch
 import subprocess
 import tempfile
 import os
 import librosa
 from typing import Tuple, Optional
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 # =============================================================================
-# Audio Language Translator
 # =============================================================================
 # Pipeline: Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)
 #
 # Research Foundation:
 # - Radford et al. (2022) "Robust Speech Recognition via Large-Scale Weak Supervision"
-#   https://arxiv.org/abs/2212.04356
 # - Costa-jussà et al. (2022) "No Language Left Behind"
-#   https://arxiv.org/abs/2207.04672
 # =============================================================================
 # ----- Device Setup -----
@@ -108,7 +114,10 @@ TTS_VOICES = {
     "tr": {"voices": [("tr-TR-EmelNeural", "Emel (Female)")], "default": "tr-TR-EmelNeural"},
 }
-# ----- Core Functions -----
 def text_to_speech(text: str, lang_code: str, voice: str = None) -> str:
     """Convert text to speech using edge-tts CLI."""
     if lang_code not in TTS_VOICES:
@@ -217,7 +226,161 @@ def full_pipeline(audio_path: str, target_lang: str, voice: str = None) -> Tuple
         return "Error", "", "", None, f"❌ Error: {str(e)}"
-# ----- Gradio Interface -----
 def get_voice_id(lang_code: str, voice_name: str) -> str:
     if lang_code in TTS_VOICES:
         for vid, vname in TTS_VOICES[lang_code]["voices"]:
@@ -240,19 +403,20 @@ def process(audio, target_lang, voice_name):
 lang_choices = [(name, code) for code, name in SUPPORTED_LANGUAGES.items()]
-demo = gr.Blocks()
-with demo:
     gr.Markdown("""
     # 🌍 Audio Language Translator
     Translate spoken audio between 15 languages using AI.
     **Pipeline:** Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)
-    **Research Foundation:**
-    - [Whisper: Robust Speech Recognition](https://arxiv.org/abs/2212.04356) (Radford et al., 2022)
-    - [NLLB: No Language Left Behind](https://arxiv.org/abs/2207.04672) (Costa-jussà et al., 2022)
     """)
     with gr.Row():
@@ -273,6 +437,43 @@ with demo:
     target.change(update_voices, target, voice)
     btn.click(process, [audio_in, target, voice], [status_out, original_out, translated_out, audio_out])
     with gr.Accordion("📚 Supported Languages & Voices", open=False):
         gr.Markdown("""
         **Tier 1 (Multiple Voices):** English (3), Spanish (3), French (3), German (3), Chinese (3)
@@ -293,5 +494,8 @@ with demo:
         **GPU Memory:** ~3.5 GB (Whisper + NLLB)
         """)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import subprocess
 import tempfile
 import os
+import shutil
 import librosa
 from typing import Tuple, Optional
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from fastapi import FastAPI, File, UploadFile, HTTPException, Query
+from fastapi.responses import FileResponse
+import uvicorn
 # =============================================================================
+# Audio Language Translator - Gradio UI + REST API
 # =============================================================================
 # Pipeline: Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)
 #
+# Interfaces:
+# - Gradio UI: Interactive web interface for users
+# - REST API: Programmatic access for developers
+#
 # Research Foundation:
 # - Radford et al. (2022) "Robust Speech Recognition via Large-Scale Weak Supervision"
 # - Costa-jussà et al. (2022) "No Language Left Behind"
 # =============================================================================
 # ----- Device Setup -----
     "tr": {"voices": [("tr-TR-EmelNeural", "Emel (Female)")], "default": "tr-TR-EmelNeural"},
 }
+# =============================================================================
+# CORE FUNCTIONS (Shared by Gradio and API)
+# =============================================================================
 def text_to_speech(text: str, lang_code: str, voice: str = None) -> str:
     """Convert text to speech using edge-tts CLI."""
     if lang_code not in TTS_VOICES:
         return "Error", "", "", None, f"❌ Error: {str(e)}"
+# =============================================================================
+# REST API ENDPOINTS
+# =============================================================================
+# Create FastAPI app for API endpoints
+api_app = FastAPI(
+    title="Audio Language Translator API",
+    description="""
+    REST API for translating spoken audio between 15 languages.
+    **Pipeline:** Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)
+    **Endpoints:**
+    - `GET /api/languages` - List supported languages
+    - `GET /api/voices/{lang}` - Get available voices for a language
+    - `POST /api/transcribe` - Transcribe audio (no translation)
+    - `POST /api/translate` - Full translation pipeline
+    - `GET /api/health` - Health check
+    **Research Foundation:**
+    - [Whisper](https://arxiv.org/abs/2212.04356) (Radford et al., 2022)
+    - [NLLB](https://arxiv.org/abs/2207.04672) (Costa-jussà et al., 2022)
+    """,
+    version="1.0.0"
+)
+@api_app.get("/api/health")
+def health_check():
+    """Check API health and model status."""
+    return {
+        "status": "healthy",
+        "device": str(device),
+        "models_loaded": True
+    }
+@api_app.get("/api/languages")
+def get_languages():
+    """Get list of supported languages."""
+    return {
+        "languages": [
+            {"code": code, "name": name}
+            for code, name in SUPPORTED_LANGUAGES.items()
+        ],
+        "total": len(SUPPORTED_LANGUAGES)
+    }
+@api_app.get("/api/voices/{lang_code}")
+def get_voices(lang_code: str):
+    """Get available TTS voices for a language."""
+    if lang_code not in TTS_VOICES:
+        raise HTTPException(status_code=404, detail=f"Language '{lang_code}' not supported")
+    voices = TTS_VOICES[lang_code]
+    return {
+        "language": lang_code,
+        "language_name": SUPPORTED_LANGUAGES.get(lang_code, lang_code),
+        "voices": [{"id": v[0], "name": v[1]} for v in voices["voices"]],
+        "default": voices["default"]
+    }
+@api_app.post("/api/transcribe")
+async def api_transcribe(file: UploadFile = File(...)):
+    """Transcribe audio and detect language (no translation)."""
+    # Save uploaded file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        shutil.copyfileobj(file.file, tmp)
+        tmp_path = tmp.name
+    try:
+        transcription, detected_lang = transcribe_audio(tmp_path)
+        return {
+            "transcription": transcription,
+            "detected_language": detected_lang,
+            "detected_language_name": SUPPORTED_LANGUAGES.get(detected_lang, detected_lang)
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        os.unlink(tmp_path)
+@api_app.post("/api/translate")
+async def api_translate(
+    file: UploadFile = File(...),
+    target_language: str = Query(..., description="Target language code (e.g., 'es', 'fr', 'de')"),
+    voice: Optional[str] = Query(None, description="TTS voice ID (optional)")
+):
+    """
+    Full translation pipeline: transcribe → translate → text-to-speech.
+    Returns JSON with text results. Use /api/translate/audio to get audio file.
+    """
+    if target_language not in SUPPORTED_LANGUAGES:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported target language: {target_language}. Supported: {list(SUPPORTED_LANGUAGES.keys())}"
+        )
+    # Save uploaded file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        shutil.copyfileobj(file.file, tmp)
+        input_path = tmp.name
+    try:
+        # Run pipeline
+        detected_lang_name, transcription, translated_text, output_audio, status = full_pipeline(
+            input_path, target_language, voice
+        )
+        return {
+            "original_text": transcription,
+            "detected_language": detected_lang_name,
+            "translated_text": translated_text,
+            "target_language": SUPPORTED_LANGUAGES.get(target_language, target_language),
+            "target_language_code": target_language,
+            "audio_generated": output_audio is not None,
+            "status": status
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        os.unlink(input_path)
+@api_app.post("/api/translate/audio")
+async def api_translate_audio(
+    file: UploadFile = File(...),
+    target_language: str = Query(..., description="Target language code"),
+    voice: Optional[str] = Query(None, description="TTS voice ID (optional)")
+):
+    """Full translation pipeline - returns audio file directly."""
+    if target_language not in SUPPORTED_LANGUAGES:
+        raise HTTPException(status_code=400, detail=f"Unsupported language: {target_language}")
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        shutil.copyfileobj(file.file, tmp)
+        input_path = tmp.name
+    try:
+        _, _, _, output_audio, _ = full_pipeline(input_path, target_language, voice)
+        if output_audio is None:
+            raise HTTPException(status_code=500, detail="Failed to generate audio")
+        return FileResponse(
+            output_audio,
+            media_type="audio/mpeg",
+            filename=f"translated_{target_language}.mp3"
+        )
+    finally:
+        os.unlink(input_path)
+# =============================================================================
+# GRADIO INTERFACE
+# =============================================================================
 def get_voice_id(lang_code: str, voice_name: str) -> str:
     if lang_code in TTS_VOICES:
         for vid, vname in TTS_VOICES[lang_code]["voices"]:
 lang_choices = [(name, code) for code, name in SUPPORTED_LANGUAGES.items()]
+# Create Gradio interface
+with gr.Blocks(title="Audio Language Translator") as demo:
     gr.Markdown("""
     # 🌍 Audio Language Translator
     Translate spoken audio between 15 languages using AI.
     **Pipeline:** Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)
+    ---
+    **🔌 REST API Available!** Access this translator programmatically at `/api/docs`
+    ---
     """)
     with gr.Row():
     target.change(update_voices, target, voice)
     btn.click(process, [audio_in, target, voice], [status_out, original_out, translated_out, audio_out])
+    with gr.Accordion("🔌 REST API Documentation", open=False):
+        gr.Markdown("""
+        ### API Endpoints
+        Access the interactive API documentation at **`/api/docs`**
+        | Endpoint | Method | Description |
+        |----------|--------|-------------|
+        | `/api/health` | GET | Health check |
+        | `/api/languages` | GET | List supported languages |
+        | `/api/voices/{lang}` | GET | Get voices for a language |
+        | `/api/transcribe` | POST | Transcribe audio only |
+        | `/api/translate` | POST | Full translation (returns JSON) |
+        | `/api/translate/audio` | POST | Full translation (returns audio file) |
+        ### Example Usage (Python)
+```python
+        import requests
+        # Translate audio file
+        with open("input.wav", "rb") as f:
+            response = requests.post(
+                "https://your-space.hf.space/api/translate",
+                files={"file": f},
+                params={"target_language": "es"}
+            )
+        print(response.json())
+```
+        ### Example Usage (cURL)
+```bash
+        curl -X POST "https://your-space.hf.space/api/translate?target_language=es" \
+             -F "file=@input.wav"
+```
+        """)
     with gr.Accordion("📚 Supported Languages & Voices", open=False):
         gr.Markdown("""
         **Tier 1 (Multiple Voices):** English (3), Spanish (3), French (3), German (3), Chinese (3)
         **GPU Memory:** ~3.5 GB (Whisper + NLLB)
         """)
+# Mount the Gradio UI onto the FastAPI app at the root path, so a single ASGI
+# app serves both the web interface and the REST routes declared on api_app.
+app = gr.mount_gradio_app(api_app, demo, path="/")
 if __name__ == "__main__":
+    # Serve the combined app with uvicorn on all interfaces, port 7860.
+    uvicorn.run(app, host="0.0.0.0", port=7860)