Spaces:
Sleeping
Sleeping
Fix Whisper language handling for Tagalog in translate-audio endpoint
Browse files- Added WHISPER_LANGUAGE_MAPPING to convert ISO codes (eng, tgl) to Whisper-compatible language names (english, tagalog)
- Updated /translate-audio endpoint to use correct language names for Whisper model
- Removed forced_decoder_ids to resolve conflict with language parameter
- Ensured default fallback to English for unmapped languages
- Addresses error: "Unsupported language: tgl" and forced_decoder_ids warning
app.py
CHANGED
|
@@ -17,7 +17,7 @@ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTa
|
|
| 17 |
from fastapi.responses import JSONResponse
|
| 18 |
from fastapi.staticfiles import StaticFiles
|
| 19 |
from typing import Dict, Any, Optional, Tuple, List
|
| 20 |
-
from
|
| 21 |
|
| 22 |
# Configure logging
|
| 23 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -62,6 +62,12 @@ LANGUAGE_MAPPING = {
|
|
| 62 |
"Pangasinan": "pag"
|
| 63 |
}
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
NLLB_LANGUAGE_CODES = {
|
| 66 |
"eng": "eng_Latn",
|
| 67 |
"tgl": "tgl_Latn",
|
|
@@ -294,7 +300,7 @@ def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optio
|
|
| 294 |
return None, "Failed to load TTS model for the target language"
|
| 295 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 296 |
try:
|
| 297 |
-
inputs = tts_tokenizer(text, return_tensors="pt").
|
| 298 |
with torch.no_grad():
|
| 299 |
output = tts_model(**inputs)
|
| 300 |
speech = output.waveform.cpu().numpy().squeeze()
|
|
@@ -463,9 +469,10 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
|
|
| 463 |
|
| 464 |
if use_whisper:
|
| 465 |
logger.info("Using Whisper model for transcription")
|
|
|
|
| 466 |
inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
|
| 467 |
with torch.no_grad():
|
| 468 |
-
generated_ids = stt_model_whisper.generate(**inputs, language=
|
| 469 |
transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 470 |
else:
|
| 471 |
logger.info("Using MMS model for transcription")
|
|
|
|
| 17 |
from fastapi.responses import JSONResponse
|
| 18 |
from fastapi.staticfiles import StaticFiles
|
| 19 |
from typing import Dict, Any, Optional, Tuple, List
|
| 20 |
+
from datetime import datetime, timedelta
|
| 21 |
|
| 22 |
# Configure logging
|
| 23 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 62 |
"Pangasinan": "pag"
|
| 63 |
}
|
| 64 |
|
| 65 |
+
# Mapping for Whisper language names
|
| 66 |
+
WHISPER_LANGUAGE_MAPPING = {
|
| 67 |
+
"eng": "english",
|
| 68 |
+
"tgl": "tagalog"
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
NLLB_LANGUAGE_CODES = {
|
| 72 |
"eng": "eng_Latn",
|
| 73 |
"tgl": "tgl_Latn",
|
|
|
|
| 300 |
return None, "Failed to load TTS model for the target language"
|
| 301 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 302 |
try:
|
| 303 |
+
inputs = tts_tokenizer(text, return_tensors="pt").to(device)
|
| 304 |
with torch.no_grad():
|
| 305 |
output = tts_model(**inputs)
|
| 306 |
speech = output.waveform.cpu().numpy().squeeze()
|
|
|
|
| 469 |
|
| 470 |
if use_whisper:
|
| 471 |
logger.info("Using Whisper model for transcription")
|
| 472 |
+
whisper_lang = WHISPER_LANGUAGE_MAPPING.get(source_code, "english") # Default to English if not mapped
|
| 473 |
inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
|
| 474 |
with torch.no_grad():
|
| 475 |
+
generated_ids = stt_model_whisper.generate(**inputs, language=whisper_lang)
|
| 476 |
transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 477 |
else:
|
| 478 |
logger.info("Using MMS model for transcription")
|