Spaces:
Sleeping
Updated the code to use the Whisper model when the source language is English or Tagalog; otherwise, it falls back to MMS. Additionally, the link to the synthesized speech has been updated to match the current Space.
Browse files
app.py
CHANGED
|
@@ -420,7 +420,7 @@ async def translate_text(text: str = Form(...), source_lang: str = Form(...), ta
|
|
| 420 |
output_path, error = synthesize_speech(translated_text, target_code)
|
| 421 |
if output_path:
|
| 422 |
output_filename = os.path.basename(output_path)
|
| 423 |
-
output_audio_url = f"https://jerich-
|
| 424 |
logger.info("TTS conversion completed")
|
| 425 |
except Exception as e:
|
| 426 |
logger.error(f"Error during TTS conversion: {str(e)}")
|
|
@@ -448,7 +448,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
|
|
| 448 |
logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} to {target_lang}")
|
| 449 |
request_id = str(uuid.uuid4())
|
| 450 |
|
| 451 |
-
# Check if STT
|
| 452 |
if model_status["stt"] not in ["loaded_mms", "loaded_mms_default", "loaded_whisper"] or stt_processor is None or stt_model is None:
|
| 453 |
logger.warning("STT model not loaded, returning placeholder response")
|
| 454 |
return {
|
|
@@ -499,23 +499,44 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
|
|
| 499 |
# Step 3: Transcribe the audio (STT)
|
| 500 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 501 |
logger.info(f"Using device: {device}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
inputs = stt_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
|
| 503 |
logger.info("Audio processed, generating transcription...")
|
| 504 |
|
| 505 |
with torch.no_grad():
|
| 506 |
-
if model_status["stt"] == "loaded_whisper":
|
| 507 |
-
# Whisper model
|
| 508 |
-
|
|
|
|
| 509 |
transcription = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 510 |
-
|
| 511 |
-
# MMS model
|
|
|
|
| 512 |
logits = stt_model(**inputs).logits
|
| 513 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 514 |
transcription = stt_processor.batch_decode(predicted_ids)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
logger.info(f"Transcription completed: {transcription}")
|
| 516 |
|
| 517 |
# Step 4: Translate the transcribed text (MT)
|
| 518 |
-
source_code = LANGUAGE_MAPPING[source_lang]
|
| 519 |
target_code = LANGUAGE_MAPPING[target_lang]
|
| 520 |
|
| 521 |
if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
|
|
@@ -549,7 +570,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
|
|
| 549 |
output_path, error = synthesize_speech(translated_text, target_code)
|
| 550 |
if output_path:
|
| 551 |
output_filename = os.path.basename(output_path)
|
| 552 |
-
output_audio_url = f"https://jerich-
|
| 553 |
logger.info("TTS conversion completed")
|
| 554 |
except Exception as e:
|
| 555 |
logger.error(f"Error during TTS conversion: {str(e)}")
|
|
@@ -603,7 +624,7 @@ async def text_to_speech(text: str = Form(...), target_lang: str = Form(...)):
|
|
| 603 |
output_path, error = synthesize_speech(text, target_code)
|
| 604 |
if output_path:
|
| 605 |
output_filename = os.path.basename(output_path)
|
| 606 |
-
output_audio_url = f"https://jerich-
|
| 607 |
logger.info("TTS conversion completed")
|
| 608 |
else:
|
| 609 |
logger.error(f"TTS conversion failed: {error}")
|
|
|
|
| 420 |
output_path, error = synthesize_speech(translated_text, target_code)
|
| 421 |
if output_path:
|
| 422 |
output_filename = os.path.basename(output_path)
|
| 423 |
+
output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
|
| 424 |
logger.info("TTS conversion completed")
|
| 425 |
except Exception as e:
|
| 426 |
logger.error(f"Error during TTS conversion: {str(e)}")
|
|
|
|
| 448 |
logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} to {target_lang}")
|
| 449 |
request_id = str(uuid.uuid4())
|
| 450 |
|
| 451 |
+
# Check if STT models are loaded
|
| 452 |
if model_status["stt"] not in ["loaded_mms", "loaded_mms_default", "loaded_whisper"] or stt_processor is None or stt_model is None:
|
| 453 |
logger.warning("STT model not loaded, returning placeholder response")
|
| 454 |
return {
|
|
|
|
| 499 |
# Step 3: Transcribe the audio (STT)
|
| 500 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 501 |
logger.info(f"Using device: {device}")
|
| 502 |
+
|
| 503 |
+
# Determine which model to use based on source language
|
| 504 |
+
source_code = LANGUAGE_MAPPING[source_lang]
|
| 505 |
+
use_whisper = source_code in ["eng", "tgl"] # Use Whisper for English and Tagalog
|
| 506 |
+
use_mms = not use_whisper # Use MMS for other Philippine languages
|
| 507 |
+
|
| 508 |
+
logger.info(f"Source language: {source_lang} ({source_code}), Using Whisper: {use_whisper}, Using MMS: {use_mms}")
|
| 509 |
+
|
| 510 |
+
# Process with appropriate model
|
| 511 |
inputs = stt_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
|
| 512 |
logger.info("Audio processed, generating transcription...")
|
| 513 |
|
| 514 |
with torch.no_grad():
|
| 515 |
+
if use_whisper and model_status["stt"] == "loaded_whisper":
|
| 516 |
+
# Whisper model for English and Tagalog
|
| 517 |
+
logger.info(f"Using Whisper model for {source_lang}")
|
| 518 |
+
generated_ids = stt_model.generate(**inputs, language="en" if source_code == "eng" else "tl")
|
| 519 |
transcription = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 520 |
+
elif model_status["stt"] in ["loaded_mms", "loaded_mms_default"]:
|
| 521 |
+
# MMS model for other Philippine languages
|
| 522 |
+
logger.info(f"Using MMS model for {source_lang}")
|
| 523 |
logits = stt_model(**inputs).logits
|
| 524 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 525 |
transcription = stt_processor.batch_decode(predicted_ids)[0]
|
| 526 |
+
else:
|
| 527 |
+
# Fallback to any available model
|
| 528 |
+
logger.info(f"Preferred model not available, using fallback model")
|
| 529 |
+
if model_status["stt"] == "loaded_whisper":
|
| 530 |
+
generated_ids = stt_model.generate(**inputs, language="en")
|
| 531 |
+
transcription = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 532 |
+
else:
|
| 533 |
+
logits = stt_model(**inputs).logits
|
| 534 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
| 535 |
+
transcription = stt_processor.batch_decode(predicted_ids)[0]
|
| 536 |
+
|
| 537 |
logger.info(f"Transcription completed: {transcription}")
|
| 538 |
|
| 539 |
# Step 4: Translate the transcribed text (MT)
|
|
|
|
| 540 |
target_code = LANGUAGE_MAPPING[target_lang]
|
| 541 |
|
| 542 |
if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
|
|
|
|
| 570 |
output_path, error = synthesize_speech(translated_text, target_code)
|
| 571 |
if output_path:
|
| 572 |
output_filename = os.path.basename(output_path)
|
| 573 |
+
output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
|
| 574 |
logger.info("TTS conversion completed")
|
| 575 |
except Exception as e:
|
| 576 |
logger.error(f"Error during TTS conversion: {str(e)}")
|
|
|
|
| 624 |
output_path, error = synthesize_speech(text, target_code)
|
| 625 |
if output_path:
|
| 626 |
output_filename = os.path.basename(output_path)
|
| 627 |
+
output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
|
| 628 |
logger.info("TTS conversion completed")
|
| 629 |
else:
|
| 630 |
logger.error(f"TTS conversion failed: {error}")
|