Nav772 committed on
Commit
2342a6c
·
verified ·
1 Parent(s): 0cac47e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -13
app.py CHANGED
@@ -3,21 +3,27 @@ import torch
3
  import subprocess
4
  import tempfile
5
  import os
 
6
  import librosa
7
  from typing import Tuple, Optional
8
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
9
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 
 
10
 
11
  # =============================================================================
12
- # Audio Language Translator
13
  # =============================================================================
14
  # Pipeline: Whisper (ASR) → NLLB (Translation) → Edge-TTS (Speech Synthesis)
15
  #
 
 
 
 
16
  # Research Foundation:
17
  # - Radford et al. (2022) "Robust Speech Recognition via Large-Scale Weak Supervision"
18
- # https://arxiv.org/abs/2212.04356
19
  # - Costa-jussà et al. (2022) "No Language Left Behind"
20
- # https://arxiv.org/abs/2207.04672
21
  # =============================================================================
22
 
23
  # ----- Device Setup -----
@@ -108,7 +114,10 @@ TTS_VOICES = {
108
  "tr": {"voices": [("tr-TR-EmelNeural", "Emel (Female)")], "default": "tr-TR-EmelNeural"},
109
  }
110
 
111
- # ----- Core Functions -----
 
 
 
112
  def text_to_speech(text: str, lang_code: str, voice: str = None) -> str:
113
  """Convert text to speech using edge-tts CLI."""
114
  if lang_code not in TTS_VOICES:
@@ -217,7 +226,161 @@ def full_pipeline(audio_path: str, target_lang: str, voice: str = None) -> Tuple
217
  return "Error", "", "", None, f"❌ Error: {str(e)}"
218
 
219
 
220
- # ----- Gradio Interface -----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  def get_voice_id(lang_code: str, voice_name: str) -> str:
222
  if lang_code in TTS_VOICES:
223
  for vid, vname in TTS_VOICES[lang_code]["voices"]:
@@ -240,19 +403,20 @@ def process(audio, target_lang, voice_name):
240
 
241
  lang_choices = [(name, code) for code, name in SUPPORTED_LANGUAGES.items()]
242
 
243
- demo = gr.Blocks()
244
-
245
- with demo:
246
  gr.Markdown("""
247
  # 🌍 Audio Language Translator
248
 
249
  Translate spoken audio between 15 languages using AI.
250
 
251
  **Pipeline:** Whisper (ASR) β†’ NLLB (Translation) β†’ Edge-TTS (Speech Synthesis)
252
-
253
- **Research Foundation:**
254
- - [Whisper: Robust Speech Recognition](https://arxiv.org/abs/2212.04356) (Radford et al., 2022)
255
- - [NLLB: No Language Left Behind](https://arxiv.org/abs/2207.04672) (Costa-jussΓ  et al., 2022)
 
 
256
  """)
257
 
258
  with gr.Row():
@@ -273,6 +437,43 @@ with demo:
273
  target.change(update_voices, target, voice)
274
  btn.click(process, [audio_in, target, voice], [status_out, original_out, translated_out, audio_out])
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  with gr.Accordion("πŸ“š Supported Languages & Voices", open=False):
277
  gr.Markdown("""
278
  **Tier 1 (Multiple Voices):** English (3), Spanish (3), French (3), German (3), Chinese (3)
@@ -293,5 +494,8 @@ with demo:
293
  **GPU Memory:** ~3.5 GB (Whisper + NLLB)
294
  """)
295
 
 
 
 
296
  if __name__ == "__main__":
297
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import subprocess
4
  import tempfile
5
  import os
6
+ import shutil
7
  import librosa
8
  from typing import Tuple, Optional
9
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
10
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
11
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Query
12
+ from fastapi.responses import FileResponse
13
+ import uvicorn
14
 
15
  # =============================================================================
16
+ # Audio Language Translator - Gradio UI + REST API
17
  # =============================================================================
18
  # Pipeline: Whisper (ASR) β†’ NLLB (Translation) β†’ Edge-TTS (Speech Synthesis)
19
  #
20
+ # Interfaces:
21
+ # - Gradio UI: Interactive web interface for users
22
+ # - REST API: Programmatic access for developers
23
+ #
24
  # Research Foundation:
25
  # - Radford et al. (2022) "Robust Speech Recognition via Large-Scale Weak Supervision"
 
26
  # - Costa-jussΓ  et al. (2022) "No Language Left Behind"
 
27
  # =============================================================================
28
 
29
  # ----- Device Setup -----
 
114
  "tr": {"voices": [("tr-TR-EmelNeural", "Emel (Female)")], "default": "tr-TR-EmelNeural"},
115
  }
116
 
117
+ # =============================================================================
118
+ # CORE FUNCTIONS (Shared by Gradio and API)
119
+ # =============================================================================
120
+
121
  def text_to_speech(text: str, lang_code: str, voice: str = None) -> str:
122
  """Convert text to speech using edge-tts CLI."""
123
  if lang_code not in TTS_VOICES:
 
226
  return "Error", "", "", None, f"❌ Error: {str(e)}"
227
 
228
 
229
+ # =============================================================================
230
+ # REST API ENDPOINTS
231
+ # =============================================================================
232
+
233
+ # Create FastAPI app for API endpoints
234
+ api_app = FastAPI(
235
+ title="Audio Language Translator API",
236
+ description="""
237
+ REST API for translating spoken audio between 15 languages.
238
+
239
+ **Pipeline:** Whisper (ASR) β†’ NLLB (Translation) β†’ Edge-TTS (Speech Synthesis)
240
+
241
+ **Endpoints:**
242
+ - `GET /api/languages` - List supported languages
243
+ - `GET /api/voices/{lang}` - Get available voices for a language
244
+ - `POST /api/transcribe` - Transcribe audio (no translation)
245
+ - `POST /api/translate` - Full translation pipeline
246
+ - `GET /api/health` - Health check
247
+
248
+ **Research Foundation:**
249
+ - [Whisper](https://arxiv.org/abs/2212.04356) (Radford et al., 2022)
250
+ - [NLLB](https://arxiv.org/abs/2207.04672) (Costa-jussΓ  et al., 2022)
251
+ """,
252
+ version="1.0.0"
253
+ )
254
+
255
+ @api_app.get("/api/health")
256
+ def health_check():
257
+ """Check API health and model status."""
258
+ return {
259
+ "status": "healthy",
260
+ "device": str(device),
261
+ "models_loaded": True
262
+ }
263
+
264
+ @api_app.get("/api/languages")
265
+ def get_languages():
266
+ """Get list of supported languages."""
267
+ return {
268
+ "languages": [
269
+ {"code": code, "name": name}
270
+ for code, name in SUPPORTED_LANGUAGES.items()
271
+ ],
272
+ "total": len(SUPPORTED_LANGUAGES)
273
+ }
274
+
275
+ @api_app.get("/api/voices/{lang_code}")
276
+ def get_voices(lang_code: str):
277
+ """Get available TTS voices for a language."""
278
+ if lang_code not in TTS_VOICES:
279
+ raise HTTPException(status_code=404, detail=f"Language '{lang_code}' not supported")
280
+
281
+ voices = TTS_VOICES[lang_code]
282
+ return {
283
+ "language": lang_code,
284
+ "language_name": SUPPORTED_LANGUAGES.get(lang_code, lang_code),
285
+ "voices": [{"id": v[0], "name": v[1]} for v in voices["voices"]],
286
+ "default": voices["default"]
287
+ }
288
+
289
+ @api_app.post("/api/transcribe")
290
+ async def api_transcribe(file: UploadFile = File(...)):
291
+ """Transcribe audio and detect language (no translation)."""
292
+ # Save uploaded file
293
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
294
+ shutil.copyfileobj(file.file, tmp)
295
+ tmp_path = tmp.name
296
+
297
+ try:
298
+ transcription, detected_lang = transcribe_audio(tmp_path)
299
+ return {
300
+ "transcription": transcription,
301
+ "detected_language": detected_lang,
302
+ "detected_language_name": SUPPORTED_LANGUAGES.get(detected_lang, detected_lang)
303
+ }
304
+ except Exception as e:
305
+ raise HTTPException(status_code=500, detail=str(e))
306
+ finally:
307
+ os.unlink(tmp_path)
308
+
309
+ @api_app.post("/api/translate")
310
+ async def api_translate(
311
+ file: UploadFile = File(...),
312
+ target_language: str = Query(..., description="Target language code (e.g., 'es', 'fr', 'de')"),
313
+ voice: Optional[str] = Query(None, description="TTS voice ID (optional)")
314
+ ):
315
+ """
316
+ Full translation pipeline: transcribe β†’ translate β†’ text-to-speech.
317
+
318
+ Returns JSON with text results. Use /api/translate/audio to get audio file.
319
+ """
320
+ if target_language not in SUPPORTED_LANGUAGES:
321
+ raise HTTPException(
322
+ status_code=400,
323
+ detail=f"Unsupported target language: {target_language}. Supported: {list(SUPPORTED_LANGUAGES.keys())}"
324
+ )
325
+
326
+ # Save uploaded file
327
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
328
+ shutil.copyfileobj(file.file, tmp)
329
+ input_path = tmp.name
330
+
331
+ try:
332
+ # Run pipeline
333
+ detected_lang_name, transcription, translated_text, output_audio, status = full_pipeline(
334
+ input_path, target_language, voice
335
+ )
336
+
337
+ return {
338
+ "original_text": transcription,
339
+ "detected_language": detected_lang_name,
340
+ "translated_text": translated_text,
341
+ "target_language": SUPPORTED_LANGUAGES.get(target_language, target_language),
342
+ "target_language_code": target_language,
343
+ "audio_generated": output_audio is not None,
344
+ "status": status
345
+ }
346
+ except Exception as e:
347
+ raise HTTPException(status_code=500, detail=str(e))
348
+ finally:
349
+ os.unlink(input_path)
350
+
351
+ @api_app.post("/api/translate/audio")
352
+ async def api_translate_audio(
353
+ file: UploadFile = File(...),
354
+ target_language: str = Query(..., description="Target language code"),
355
+ voice: Optional[str] = Query(None, description="TTS voice ID (optional)")
356
+ ):
357
+ """Full translation pipeline - returns audio file directly."""
358
+ if target_language not in SUPPORTED_LANGUAGES:
359
+ raise HTTPException(status_code=400, detail=f"Unsupported language: {target_language}")
360
+
361
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
362
+ shutil.copyfileobj(file.file, tmp)
363
+ input_path = tmp.name
364
+
365
+ try:
366
+ _, _, _, output_audio, _ = full_pipeline(input_path, target_language, voice)
367
+
368
+ if output_audio is None:
369
+ raise HTTPException(status_code=500, detail="Failed to generate audio")
370
+
371
+ return FileResponse(
372
+ output_audio,
373
+ media_type="audio/mpeg",
374
+ filename=f"translated_{target_language}.mp3"
375
+ )
376
+ finally:
377
+ os.unlink(input_path)
378
+
379
+
380
+ # =============================================================================
381
+ # GRADIO INTERFACE
382
+ # =============================================================================
383
+
384
  def get_voice_id(lang_code: str, voice_name: str) -> str:
385
  if lang_code in TTS_VOICES:
386
  for vid, vname in TTS_VOICES[lang_code]["voices"]:
 
403
 
404
  lang_choices = [(name, code) for code, name in SUPPORTED_LANGUAGES.items()]
405
 
406
+ # Create Gradio interface
407
+ with gr.Blocks(title="Audio Language Translator") as demo:
 
408
  gr.Markdown("""
409
  # 🌍 Audio Language Translator
410
 
411
  Translate spoken audio between 15 languages using AI.
412
 
413
  **Pipeline:** Whisper (ASR) β†’ NLLB (Translation) β†’ Edge-TTS (Speech Synthesis)
414
+
415
+ ---
416
+
417
+ **🔌 REST API Available!** Access this translator programmatically at `/docs`
418
+
419
+ ---
420
  """)
421
 
422
  with gr.Row():
 
437
  target.change(update_voices, target, voice)
438
  btn.click(process, [audio_in, target, voice], [status_out, original_out, translated_out, audio_out])
439
 
440
+ with gr.Accordion("πŸ”Œ REST API Documentation", open=False):
441
+ gr.Markdown("""
442
+ ### API Endpoints
443
+
444
+ Access the interactive API documentation at **`/docs`**
445
+
446
+ | Endpoint | Method | Description |
447
+ |----------|--------|-------------|
448
+ | `/api/health` | GET | Health check |
449
+ | `/api/languages` | GET | List supported languages |
450
+ | `/api/voices/{lang}` | GET | Get voices for a language |
451
+ | `/api/transcribe` | POST | Transcribe audio only |
452
+ | `/api/translate` | POST | Full translation (returns JSON) |
453
+ | `/api/translate/audio` | POST | Full translation (returns audio file) |
454
+
455
+ ### Example Usage (Python)
456
+ ```python
457
+ import requests
458
+
459
+ # Translate audio file
460
+ with open("input.wav", "rb") as f:
461
+ response = requests.post(
462
+ "https://your-space.hf.space/api/translate",
463
+ files={"file": f},
464
+ params={"target_language": "es"}
465
+ )
466
+ print(response.json())
467
+ ```
468
+
469
+ ### Example Usage (cURL)
470
+ ```bash
471
+ curl -X POST "https://your-space.hf.space/api/translate" \
472
+ -F "file=@input.wav" \
473
+ -F "target_language=es"
474
+ ```
475
+ """)
476
+
477
  with gr.Accordion("πŸ“š Supported Languages & Voices", open=False):
478
  gr.Markdown("""
479
  **Tier 1 (Multiple Voices):** English (3), Spanish (3), French (3), German (3), Chinese (3)
 
494
  **GPU Memory:** ~3.5 GB (Whisper + NLLB)
495
  """)
496
 
497
+ # Mount FastAPI to Gradio
498
+ app = gr.mount_gradio_app(api_app, demo, path="/")
499
+
500
  if __name__ == "__main__":
501
+ uvicorn.run(app, host="0.0.0.0", port=7860)