Spaces:

Jerich
/

TalklasApp2

Sleeping

App Files Files Community

Jerich committed on Apr 29, 2025

Commit

06e8ec6

verified ·

1 Parent(s): 28e1a88

Updated the code to use the Whisper model if the source language is English or Tagalog; otherwise, it will use MMS. Additionally, the link to the synthesized speech has been updated to match the current space.

Browse files

Files changed (1) hide show

app.py +70 -138

app.py CHANGED Viewed

@@ -43,8 +43,10 @@ error_message = None
 current_tts_language = "tgl"  # Track the current TTS language
 # Model instances
-stt_processor = None
-stt_model = None
 mt_model = None
 mt_tokenizer = None
 tts_model = None
@@ -85,60 +87,39 @@ def check_inappropriate_content(text: str) -> bool:
     Check if the text contains inappropriate content.
     Returns True if inappropriate content is detected, False otherwise.
     """
-    # Convert to lowercase for case-insensitive matching
     text_lower = text.lower()
-    # Check for inappropriate words
     for word in INAPPROPRIATE_WORDS:
-        # Use word boundary matching to avoid false positives
         pattern = r'\b' + re.escape(word) + r'\b'
         if re.search(pattern, text_lower):
             logger.warning(f"Inappropriate content detected: {word}")
             return True
     return False
 # Function to save PCM data as a WAV file
 def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
-    # Convert pcm_data to a NumPy array of 16-bit integers
     pcm_array = np.array(pcm_data, dtype=np.int16)
     with wave.open(output_path, 'wb') as wav_file:
-        # Set WAV parameters: 1 channel (mono), 2 bytes per sample (16-bit), sample rate
         wav_file.setnchannels(1)
-        wav_file.setsampwidth(2)  # 16-bit audio
         wav_file.setframerate(sample_rate)
-        # Write the 16-bit PCM data as bytes (little-endian)
         wav_file.writeframes(pcm_array.tobytes())
 # Function to detect speech using an energy-based approach
 def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
-    """
-    Detects if the audio contains speech using an energy-based approach.
-    Returns True if speech is detected, False otherwise.
-    """
-    # Convert waveform to numpy array
     waveform_np = waveform.numpy()
     if waveform_np.ndim > 1:
-        waveform_np = waveform_np.mean(axis=0)  # Convert stereo to mono
-    # Compute RMS energy
     rms = np.sqrt(np.mean(waveform_np**2))
     logger.info(f"RMS energy: {rms}")
-    # Check if RMS energy exceeds the threshold
     if rms < threshold:
         logger.info("No speech detected: RMS energy below threshold")
         return False
-    # Optionally, check for minimum speech duration (requires more sophisticated VAD)
-    # For now, we assume if RMS is above threshold, there is speech
     return True
 # Function to clean up old audio files
 def cleanup_old_audio_files():
     logger.info("Starting cleanup of old audio files...")
-    expiration_time = datetime.now() - timedelta(minutes=10)  # Files older than 10 minutes
     for filename in os.listdir(AUDIO_DIR):
         file_path = os.path.join(AUDIO_DIR, filename)
         if os.path.isfile(file_path):
@@ -154,42 +135,48 @@ def cleanup_old_audio_files():
 def schedule_cleanup():
     while True:
         cleanup_old_audio_files()
-        time.sleep(300)  # Run every 5 minutes (300 seconds)
 # Function to load models in background
 def load_models_task():
     global models_loaded, loading_in_progress, model_status, error_message
-    global stt_processor, stt_model, mt_model, mt_tokenizer, tts_model, tts_tokenizer
     try:
         loading_in_progress = True
-        # Load STT model (MMS with fallback to Whisper)
-        logger.info("Starting to load STT model...")
         from transformers import AutoProcessor, AutoModelForCTC, WhisperProcessor, WhisperForConditionalGeneration
         try:
-            logger.info("Loading MMS STT model...")
             model_status["stt"] = "loading"
-            stt_processor = AutoProcessor.from_pretrained("facebook/mms-1b-all")
-            stt_model = AutoModelForCTC.from_pretrained("facebook/mms-1b-all")
             device = "cuda" if torch.cuda.is_available() else "cpu"
-            stt_model.to(device)
             logger.info("MMS STT model loaded successfully")
-            model_status["stt"] = "loaded_mms"
-        except Exception as mms_error:
-            logger.error(f"Failed to load MMS STT model: {str(mms_error)}")
-            logger.info("Falling back to Whisper STT model...")
-            try:
-                stt_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-                stt_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-                stt_model.to(device)
-                logger.info("Whisper STT model loaded successfully as fallback")
-                model_status["stt"] = "loaded_whisper"
-            except Exception as whisper_error:
-                logger.error(f"Failed to load Whisper STT model: {str(whisper_error)}")
                 model_status["stt"] = "failed"
-                error_message = f"STT model loading failed: MMS error: {str(mms_error)}, Whisper error: {str(whisper_error)}"
                 return
         # Load MT model
@@ -210,7 +197,7 @@ def load_models_task():
             error_message = f"MT model loading failed: {str(e)}"
             return
-        # Load TTS model (default to Tagalog, will be updated dynamically)
         logger.info("Starting to load TTS model...")
         from transformers import VitsModel, AutoTokenizer
@@ -224,7 +211,6 @@ def load_models_task():
             model_status["tts"] = "loaded"
         except Exception as e:
             logger.error(f"Failed to load TTS model for Tagalog: {str(e)}")
-            # Fallback to English TTS if the target language fails
             try:
                 logger.info("Falling back to MMS-TTS English model...")
                 tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
@@ -265,21 +251,13 @@ def start_cleanup_task():
 # Function to load or update TTS model for a specific language
 def load_tts_model_for_language(target_code: str) -> bool:
-    """
-    Load or update the TTS model for the specified language.
-    Returns True if successful, False otherwise.
-    """
     global tts_model, tts_tokenizer, current_tts_language, model_status
     if target_code not in LANGUAGE_MAPPING.values():
         logger.error(f"Invalid language code: {target_code}")
         return False
-    # Skip if the model is already loaded for the target language
     if current_tts_language == target_code and model_status["tts"].startswith("loaded"):
         logger.info(f"TTS model for {target_code} is already loaded.")
         return True
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         logger.info(f"Loading MMS-TTS model for {target_code}...")
@@ -309,19 +287,11 @@ def load_tts_model_for_language(target_code: str) -> bool:
 # Function to synthesize speech from text
 def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optional[str]]:
-    """
-    Convert text to speech for the specified language.
-    Returns a tuple of (output_path, error_message).
-    """
     global tts_model, tts_tokenizer
     request_id = str(uuid.uuid4())
     output_path = os.path.join(AUDIO_DIR, f"{request_id}.wav")
-    # Make sure the TTS model is loaded for the target language
     if not load_tts_model_for_language(target_code):
         return None, "Failed to load TTS model for the target language"
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         inputs = tts_tokenizer(text, return_tensors="pt").to(device)
@@ -330,11 +300,8 @@ def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optio
         speech = output.waveform.cpu().numpy().squeeze()
         speech = (speech * 32767).astype(np.int16)
         sample_rate = tts_model.config.sampling_rate
-        # Save the audio as a WAV file
         save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
         logger.info(f"Saved synthesized audio to {output_path}")
         return output_path, None
     except Exception as e:
         error_msg = f"Error during TTS conversion: {str(e)}"
@@ -350,14 +317,11 @@ async def startup_event():
 @app.get("/")
 async def root():
-    """Root endpoint for default health check"""
     logger.info("Root endpoint requested")
     return {"status": "healthy"}
 @app.get("/health")
 async def health_check():
-    """Health check endpoint that always returns successfully"""
-    global models_loaded, loading_in_progress, model_status, error_message
     logger.info("Health check requested")
     return {
         "status": "healthy",
@@ -369,22 +333,16 @@ async def health_check():
 @app.post("/translate-text")
 async def translate_text(text: str = Form(...), source_lang: str = Form(...), target_lang: str = Form(...)):
-    """Endpoint to translate text and convert to speech"""
     global mt_model, mt_tokenizer, tts_model, tts_tokenizer, current_tts_language
     if not text:
         raise HTTPException(status_code=400, detail="No text provided")
     if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
         raise HTTPException(status_code=400, detail="Invalid language selected")
     logger.info(f"Translate-text requested: {text} from {source_lang} to {target_lang}")
     request_id = str(uuid.uuid4())
-    # Translate the text
     source_code = LANGUAGE_MAPPING[source_lang]
     target_code = LANGUAGE_MAPPING[target_lang]
     translated_text = "Translation not available"
     if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
         try:
             source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
@@ -405,26 +363,20 @@ async def translate_text(text: str = Form(...), source_lang: str = Form(...), ta
             translated_text = f"Translation failed: {str(e)}"
     else:
         logger.warning("MT model not loaded, skipping translation")
-    # Check for inappropriate content in the source text and translated text
     is_inappropriate = check_inappropriate_content(text) or check_inappropriate_content(translated_text)
     if is_inappropriate:
         logger.warning("Inappropriate content detected in translation request")
-    # Convert translated text to speech
     output_audio_url = None
     if model_status["tts"].startswith("loaded"):
-        # Load or update TTS model for the target language
         if load_tts_model_for_language(target_code):
             try:
                 output_path, error = synthesize_speech(translated_text, target_code)
                 if output_path:
                     output_filename = os.path.basename(output_path)
-                    output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
                     logger.info("TTS conversion completed")
             except Exception as e:
                 logger.error(f"Error during TTS conversion: {str(e)}")
     return {
         "request_id": request_id,
         "status": "completed",
@@ -437,8 +389,8 @@ async def translate_text(text: str = Form(...), source_lang: str = Form(...), ta
 @app.post("/translate-audio")
 async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
-    """Endpoint to transcribe, translate, and convert audio to speech"""
-    global stt_processor, stt_model, mt_model, mt_tokenizer, tts_model, tts_tokenizer, current_tts_language
     if not audio:
         raise HTTPException(status_code=400, detail="No audio file provided")
@@ -448,20 +400,33 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
     logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} to {target_lang}")
     request_id = str(uuid.uuid4())
-    # Check if STT models are loaded
-    if model_status["stt"] not in ["loaded_mms", "loaded_mms_default", "loaded_whisper"] or stt_processor is None or stt_model is None:
-        logger.warning("STT model not loaded, returning placeholder response")
         return {
             "request_id": request_id,
             "status": "processing",
-            "message": "STT model not loaded yet. Please try again later.",
             "source_text": "Transcription not available",
             "translated_text": "Translation not available",
             "output_audio": None,
             "is_inappropriate": False
         }
-    # Save the uploaded audio to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
         temp_file.write(await audio.read())
         temp_path = temp_file.name
@@ -472,19 +437,16 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
     is_inappropriate = False
     try:
-        # Step 1: Load and resample the audio using torchaudio
         logger.info(f"Reading audio file: {temp_path}")
         waveform, sample_rate = torchaudio.load(temp_path)
         logger.info(f"Audio loaded: sample_rate={sample_rate}, waveform_shape={waveform.shape}")
-        # Resample to 16 kHz if needed (required by Whisper and MMS models)
         if sample_rate != 16000:
             logger.info(f"Resampling audio from {sample_rate} Hz to 16000 Hz")
             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
             waveform = resampler(waveform)
             sample_rate = 16000
-        # Step 2: Detect speech
         if not detect_speech(waveform, sample_rate):
             return {
                 "request_id": request_id,
@@ -496,49 +458,25 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
                 "is_inappropriate": False
             }
-        # Step 3: Transcribe the audio (STT)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device}")
-        # Determine which model to use based on source language
-        source_code = LANGUAGE_MAPPING[source_lang]
-        use_whisper = source_code in ["eng", "tgl"]  # Use Whisper for English and Tagalog
-        use_mms = not use_whisper  # Use MMS for other Philippine languages
-        logger.info(f"Source language: {source_lang} ({source_code}), Using Whisper: {use_whisper}, Using MMS: {use_mms}")
-        # Process with appropriate model
-        inputs = stt_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
-        logger.info("Audio processed, generating transcription...")
-        with torch.no_grad():
-            if use_whisper and model_status["stt"] == "loaded_whisper":
-                # Whisper model for English and Tagalog
-                logger.info(f"Using Whisper model for {source_lang}")
-                generated_ids = stt_model.generate(**inputs, language="en" if source_code == "eng" else "tl")
-                transcription = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            elif model_status["stt"] in ["loaded_mms", "loaded_mms_default"]:
-                # MMS model for other Philippine languages
-                logger.info(f"Using MMS model for {source_lang}")
-                logits = stt_model(**inputs).logits
                 predicted_ids = torch.argmax(logits, dim=-1)
-                transcription = stt_processor.batch_decode(predicted_ids)[0]
-            else:
-                # Fallback to any available model
-                logger.info(f"Preferred model not available, using fallback model")
-                if model_status["stt"] == "loaded_whisper":
-                    generated_ids = stt_model.generate(**inputs, language="en")
-                    transcription = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-                else:
-                    logits = stt_model(**inputs).logits
-                    predicted_ids = torch.argmax(logits, dim=-1)
-                    transcription = stt_processor.batch_decode(predicted_ids)[0]
         logger.info(f"Transcription completed: {transcription}")
-        # Step 4: Translate the transcribed text (MT)
         target_code = LANGUAGE_MAPPING[target_lang]
         if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
             try:
                 source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
@@ -559,18 +497,16 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
         else:
             logger.warning("MT model not loaded, skipping translation")
-        # Step 5: Check for inappropriate content
         is_inappropriate = check_inappropriate_content(transcription) or check_inappropriate_content(translated_text)
         if is_inappropriate:
             logger.warning("Inappropriate content detected in audio transcription or translation")
-        # Step 6: Convert translated text to speech (TTS)
         if load_tts_model_for_language(target_code):
             try:
                 output_path, error = synthesize_speech(translated_text, target_code)
                 if output_path:
                     output_filename = os.path.basename(output_path)
-                    output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
                     logger.info("TTS conversion completed")
             except Exception as e:
                 logger.error(f"Error during TTS conversion: {str(e)}")
@@ -601,7 +537,6 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
 @app.post("/text-to-speech")
 async def text_to_speech(text: str = Form(...), target_lang: str = Form(...)):
-    """Endpoint to convert text to speech in the specified language"""
     if not text:
         raise HTTPException(status_code=400, detail="No text provided")
     if target_lang not in LANGUAGE_MAPPING:
@@ -611,20 +546,17 @@ async def text_to_speech(text: str = Form(...), target_lang: str = Form(...)):
     request_id = str(uuid.uuid4())
     target_code = LANGUAGE_MAPPING[target_lang]
-    # Check for inappropriate content
     is_inappropriate = check_inappropriate_content(text)
     if is_inappropriate:
         logger.warning("Inappropriate content detected in text-to-speech request")
-    # Synthesize speech
     output_audio_url = None
     if model_status["tts"].startswith("loaded") or load_tts_model_for_language(target_code):
         try:
             output_path, error = synthesize_speech(text, target_code)
             if output_path:
                 output_filename = os.path.basename(output_path)
-                output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
                 logger.info("TTS conversion completed")
             else:
                 logger.error(f"TTS conversion failed: {error}")

 current_tts_language = "tgl"  # Track the current TTS language
 # Model instances
+stt_processor_whisper = None
+stt_model_whisper = None
+stt_processor_mms = None
+stt_model_mms = None
 mt_model = None
 mt_tokenizer = None
 tts_model = None
     Check if the text contains inappropriate content.
     Returns True if inappropriate content is detected, False otherwise.
     """
     text_lower = text.lower()
     for word in INAPPROPRIATE_WORDS:
         pattern = r'\b' + re.escape(word) + r'\b'
         if re.search(pattern, text_lower):
             logger.warning(f"Inappropriate content detected: {word}")
             return True
     return False
 # Function to save PCM data as a WAV file
 def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
     pcm_array = np.array(pcm_data, dtype=np.int16)
     with wave.open(output_path, 'wb') as wav_file:
         wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)
         wav_file.setframerate(sample_rate)
         wav_file.writeframes(pcm_array.tobytes())
 # Function to detect speech using an energy-based approach
 def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
     waveform_np = waveform.numpy()
     if waveform_np.ndim > 1:
+        waveform_np = waveform_np.mean(axis=0)
     rms = np.sqrt(np.mean(waveform_np**2))
     logger.info(f"RMS energy: {rms}")
     if rms < threshold:
         logger.info("No speech detected: RMS energy below threshold")
         return False
     return True
 # Function to clean up old audio files
 def cleanup_old_audio_files():
     logger.info("Starting cleanup of old audio files...")
+    expiration_time = datetime.now() - timedelta(minutes=10)
     for filename in os.listdir(AUDIO_DIR):
         file_path = os.path.join(AUDIO_DIR, filename)
         if os.path.isfile(file_path):
 def schedule_cleanup():
     while True:
         cleanup_old_audio_files()
+        time.sleep(300)
 # Function to load models in background
 def load_models_task():
     global models_loaded, loading_in_progress, model_status, error_message
+    global stt_processor_whisper, stt_model_whisper, stt_processor_mms, stt_model_mms
+    global mt_model, mt_tokenizer, tts_model, tts_tokenizer
     try:
         loading_in_progress = True
+        # Load STT models
+        logger.info("Starting to load STT models...")
         from transformers import AutoProcessor, AutoModelForCTC, WhisperProcessor, WhisperForConditionalGeneration
         try:
+            logger.info("Loading Whisper STT model...")
             model_status["stt"] = "loading"
+            stt_processor_whisper = WhisperProcessor.from_pretrained("openai/whisper-tiny")
+            stt_model_whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
             device = "cuda" if torch.cuda.is_available() else "cpu"
+            stt_model_whisper.to(device)
+            logger.info("Whisper STT model loaded successfully")
+            model_status["stt"] = "loaded_whisper"
+        except Exception as e:
+            logger.error(f"Failed to load Whisper STT model: {str(e)}")
+            model_status["stt"] = "failed"
+            error_message = f"Whisper STT model loading failed: {str(e)}"
+            return
+        try:
+            logger.info("Loading MMS STT model...")
+            stt_processor_mms = AutoProcessor.from_pretrained("facebook/mms-1b-all")
+            stt_model_mms = AutoModelForCTC.from_pretrained("facebook/mms-1b-all")
+            stt_model_mms.to(device)
             logger.info("MMS STT model loaded successfully")
+            model_status["stt"] = "loaded_both" if model_status["stt"] == "loaded_whisper" else "loaded_mms"
+        except Exception as e:
+            logger.error(f"Failed to load MMS STT model: {str(e)}")
+            if model_status["stt"] != "loaded_whisper":
                 model_status["stt"] = "failed"
+                error_message = f"MMS STT model loading failed: {str(e)}"
                 return
         # Load MT model
             error_message = f"MT model loading failed: {str(e)}"
             return
+        # Load TTS model (default to Tagalog)
         logger.info("Starting to load TTS model...")
         from transformers import VitsModel, AutoTokenizer
             model_status["tts"] = "loaded"
         except Exception as e:
             logger.error(f"Failed to load TTS model for Tagalog: {str(e)}")
             try:
                 logger.info("Falling back to MMS-TTS English model...")
                 tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 # Function to load or update TTS model for a specific language
 def load_tts_model_for_language(target_code: str) -> bool:
     global tts_model, tts_tokenizer, current_tts_language, model_status
     if target_code not in LANGUAGE_MAPPING.values():
         logger.error(f"Invalid language code: {target_code}")
         return False
     if current_tts_language == target_code and model_status["tts"].startswith("loaded"):
         logger.info(f"TTS model for {target_code} is already loaded.")
         return True
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         logger.info(f"Loading MMS-TTS model for {target_code}...")
 # Function to synthesize speech from text
 def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optional[str]]:
     global tts_model, tts_tokenizer
     request_id = str(uuid.uuid4())
     output_path = os.path.join(AUDIO_DIR, f"{request_id}.wav")
     if not load_tts_model_for_language(target_code):
         return None, "Failed to load TTS model for the target language"
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         inputs = tts_tokenizer(text, return_tensors="pt").to(device)
         speech = output.waveform.cpu().numpy().squeeze()
         speech = (speech * 32767).astype(np.int16)
         sample_rate = tts_model.config.sampling_rate
         save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
         logger.info(f"Saved synthesized audio to {output_path}")
         return output_path, None
     except Exception as e:
         error_msg = f"Error during TTS conversion: {str(e)}"
 @app.get("/")
 async def root():
     logger.info("Root endpoint requested")
     return {"status": "healthy"}
 @app.get("/health")
 async def health_check():
     logger.info("Health check requested")
     return {
         "status": "healthy",
 @app.post("/translate-text")
 async def translate_text(text: str = Form(...), source_lang: str = Form(...), target_lang: str = Form(...)):
     global mt_model, mt_tokenizer, tts_model, tts_tokenizer, current_tts_language
     if not text:
         raise HTTPException(status_code=400, detail="No text provided")
     if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
         raise HTTPException(status_code=400, detail="Invalid language selected")
     logger.info(f"Translate-text requested: {text} from {source_lang} to {target_lang}")
     request_id = str(uuid.uuid4())
     source_code = LANGUAGE_MAPPING[source_lang]
     target_code = LANGUAGE_MAPPING[target_lang]
     translated_text = "Translation not available"
     if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
         try:
             source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
             translated_text = f"Translation failed: {str(e)}"
     else:
         logger.warning("MT model not loaded, skipping translation")
     is_inappropriate = check_inappropriate_content(text) or check_inappropriate_content(translated_text)
     if is_inappropriate:
         logger.warning("Inappropriate content detected in translation request")
     output_audio_url = None
     if model_status["tts"].startswith("loaded"):
         if load_tts_model_for_language(target_code):
             try:
                 output_path, error = synthesize_speech(translated_text, target_code)
                 if output_path:
                     output_filename = os.path.basename(output_path)
+                    output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
                     logger.info("TTS conversion completed")
             except Exception as e:
                 logger.error(f"Error during TTS conversion: {str(e)}")
     return {
         "request_id": request_id,
         "status": "completed",
 @app.post("/translate-audio")
 async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
+    global stt_processor_whisper, stt_model_whisper, stt_processor_mms, stt_model_mms
+    global mt_model, mt_tokenizer, tts_model, tts_tokenizer, current_tts_language
     if not audio:
         raise HTTPException(status_code=400, detail="No audio file provided")
     logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} to {target_lang}")
     request_id = str(uuid.uuid4())
+    source_code = LANGUAGE_MAPPING[source_lang]
+    use_whisper = source_code in ["eng", "tgl"]
+    # Check if appropriate STT model is loaded
+    if use_whisper and (stt_processor_whisper is None or stt_model_whisper is None):
+        logger.warning("Whisper STT model not loaded, returning placeholder response")
+        return {
+            "request_id": request_id,
+            "status": "processing",
+            "message": "Whisper STT model not loaded yet. Please try again later.",
+            "source_text": "Transcription not available",
+            "translated_text": "Translation not available",
+            "output_audio": None,
+            "is_inappropriate": False
+        }
+    elif not use_whisper and (stt_processor_mms is None or stt_model_mms is None):
+        logger.warning("MMS STT model not loaded, returning placeholder response")
         return {
             "request_id": request_id,
             "status": "processing",
+            "message": "MMS STT model not loaded yet. Please try again later.",
             "source_text": "Transcription not available",
             "translated_text": "Translation not available",
             "output_audio": None,
             "is_inappropriate": False
         }
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
         temp_file.write(await audio.read())
         temp_path = temp_file.name
     is_inappropriate = False
     try:
         logger.info(f"Reading audio file: {temp_path}")
         waveform, sample_rate = torchaudio.load(temp_path)
         logger.info(f"Audio loaded: sample_rate={sample_rate}, waveform_shape={waveform.shape}")
         if sample_rate != 16000:
             logger.info(f"Resampling audio from {sample_rate} Hz to 16000 Hz")
             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
             waveform = resampler(waveform)
             sample_rate = 16000
         if not detect_speech(waveform, sample_rate):
             return {
                 "request_id": request_id,
                 "is_inappropriate": False
             }
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device}")
+        if use_whisper:
+            logger.info("Using Whisper model for transcription")
+            inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
+            with torch.no_grad():
+                generated_ids = stt_model_whisper.generate(**inputs, language=source_code)
+                transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        else:
+            logger.info("Using MMS model for transcription")
+            inputs = stt_processor_mms(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
+            with torch.no_grad():
+                logits = stt_model_mms(**inputs).logits
                 predicted_ids = torch.argmax(logits, dim=-1)
+                transcription = stt_processor_mms.batch_decode(predicted_ids)[0]
         logger.info(f"Transcription completed: {transcription}")
         target_code = LANGUAGE_MAPPING[target_lang]
         if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
             try:
                 source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
         else:
             logger.warning("MT model not loaded, skipping translation")
         is_inappropriate = check_inappropriate_content(transcription) or check_inappropriate_content(translated_text)
         if is_inappropriate:
             logger.warning("Inappropriate content detected in audio transcription or translation")
         if load_tts_model_for_language(target_code):
             try:
                 output_path, error = synthesize_speech(translated_text, target_code)
                 if output_path:
                     output_filename = os.path.basename(output_path)
+                    output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
                     logger.info("TTS conversion completed")
             except Exception as e:
                 logger.error(f"Error during TTS conversion: {str(e)}")
 @app.post("/text-to-speech")
 async def text_to_speech(text: str = Form(...), target_lang: str = Form(...)):
     if not text:
         raise HTTPException(status_code=400, detail="No text provided")
     if target_lang not in LANGUAGE_MAPPING:
     request_id = str(uuid.uuid4())
     target_code = LANGUAGE_MAPPING[target_lang]
     is_inappropriate = check_inappropriate_content(text)
     if is_inappropriate:
         logger.warning("Inappropriate content detected in text-to-speech request")
     output_audio_url = None
     if model_status["tts"].startswith("loaded") or load_tts_model_for_language(target_code):
         try:
             output_path, error = synthesize_speech(text, target_code)
             if output_path:
                 output_filename = os.path.basename(output_path)
+                output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
                 logger.info("TTS conversion completed")
             else:
                 logger.error(f"TTS conversion failed: {error}")