Spaces:

Jerich
/

TalklasApp

Paused

App Files Files Community

Jerich committed on Apr 14, 2025

Commit

b0c2331

verified ·

1 Parent(s): 8157595

Replaced librosa with torchaudio for audio loading and resampling. Added speech detection (energy-based or webrtcvad for accuracy). Improved /translate-audio endpoint to handle silent audio gracefully.

Browse files

Files changed (1) hide show

app.py +48 -8

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import uuid
 import torch
 import numpy as np
 import soundfile as sf
-import librosa
 import wave
 import time
 from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
@@ -80,6 +80,30 @@ def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
         # Write the 16-bit PCM data as bytes (little-endian)
         wav_file.writeframes(pcm_array.tobytes())
 # Function to clean up old audio files
 def cleanup_old_audio_files():
     logger.info("Starting cleanup of old audio files...")
@@ -417,17 +441,33 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
     output_audio_url = None
     try:
-        # Step 1: Transcribe the audio (STT)
         logger.info(f"Reading audio file: {temp_path}")
-        waveform, sample_rate = sf.read(temp_path)
         logger.info(f"Audio loaded: sample_rate={sample_rate}, waveform_shape={waveform.shape}")
         if sample_rate != 16000:
             logger.info(f"Resampling audio from {sample_rate} Hz to 16000 Hz")
-            waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device}")
-        inputs = stt_processor(waveform, sampling_rate=16000, return_tensors="pt").to(device)
         logger.info("Audio processed, generating transcription...")
         with torch.no_grad():
@@ -442,7 +482,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
                 transcription = stt_processor.batch_decode(predicted_ids)[0]
         logger.info(f"Transcription completed: {transcription}")
-        # Step 2: Translate the transcribed text (MT)
         source_code = LANGUAGE_MAPPING[source_lang]
         target_code = LANGUAGE_MAPPING[target_lang]
@@ -466,7 +506,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
         else:
             logger.warning("MT model not loaded, skipping translation")
-        # Step 3: Convert translated text to speech (TTS)
         if model_status["tts"].startswith("loaded") and tts_model is not None and tts_tokenizer is not None:
             try:
                 inputs = tts_tokenizer(translated_text, return_tensors="pt").to(device)

 import torch
 import numpy as np
 import soundfile as sf
+import torchaudio
 import wave
 import time
 from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
         # Write the 16-bit PCM data as bytes (little-endian)
         wav_file.writeframes(pcm_array.tobytes())
+# Function to detect speech using an energy-based approach
def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
    """
    Detects if the audio contains speech using an energy-based approach.

    The whole clip is reduced to a single RMS (root-mean-square) energy value,
    which is compared against ``threshold``. This is a coarse loudness gate,
    not a real voice-activity detector: any sufficiently loud signal passes.

    Args:
        waveform: Audio samples as a tensor. A tensor with more than one
            dimension is averaged over axis 0 before the energy is computed
            (assumes a channels-first layout -- TODO confirm against callers).
        sample_rate: Sampling rate in Hz. Accepted for interface stability;
            the current energy check does not use it.
        threshold: Minimum RMS energy for the clip to count as speech.
        min_speech_duration: Reserved for a future duration-aware check;
            currently unused.

    Returns:
        True if the RMS energy is finite and at least ``threshold``;
        False otherwise, including for empty audio or non-finite samples.
    """
    # detach() + cpu() make the conversion safe for tensors that live on a GPU
    # or are attached to an autograd graph; a bare .numpy() raises for both.
    samples = waveform.detach().cpu().numpy()

    # Empty audio cannot contain speech. Bail out before np.mean, which would
    # otherwise yield NaN (and a RuntimeWarning) on an empty array.
    if samples.size == 0:
        logger.info("No speech detected: audio is empty")
        return False

    if samples.ndim > 1:
        samples = samples.mean(axis=0)  # Convert stereo to mono

    # Compute RMS energy
    rms = np.sqrt(np.mean(samples**2))
    logger.info(f"RMS energy: {rms}")

    # NaN/inf compare False against the threshold, so without this guard
    # corrupt samples would be reported as speech.
    if not np.isfinite(rms):
        logger.info("No speech detected: RMS energy is not finite")
        return False

    # Check if RMS energy exceeds the threshold
    if rms < threshold:
        logger.info("No speech detected: RMS energy below threshold")
        return False

    # A minimum-duration check would need a frame-level VAD; for now any clip
    # whose overall RMS clears the threshold is treated as speech.
    return True
 # Function to clean up old audio files
 def cleanup_old_audio_files():
     logger.info("Starting cleanup of old audio files...")
     output_audio_url = None
     try:
+        # Step 1: Load and resample the audio using torchaudio
         logger.info(f"Reading audio file: {temp_path}")
+        waveform, sample_rate = torchaudio.load(temp_path)
         logger.info(f"Audio loaded: sample_rate={sample_rate}, waveform_shape={waveform.shape}")
+        # Resample to 16 kHz if needed (required by Whisper and MMS models)
         if sample_rate != 16000:
             logger.info(f"Resampling audio from {sample_rate} Hz to 16000 Hz")
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            waveform = resampler(waveform)
+            sample_rate = 16000
+        # Step 2: Detect speech
+        if not detect_speech(waveform, sample_rate):
+            return {
+                "request_id": request_id,
+                "status": "failed",
+                "message": "No speech detected in the audio.",
+                "source_text": "No speech detected",
+                "translated_text": "No translation available",
+                "output_audio": None
+            }
+        # Step 3: Transcribe the audio (STT)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device}")
+        inputs = stt_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
         logger.info("Audio processed, generating transcription...")
         with torch.no_grad():
                 transcription = stt_processor.batch_decode(predicted_ids)[0]
         logger.info(f"Transcription completed: {transcription}")
+        # Step 4: Translate the transcribed text (MT)
         source_code = LANGUAGE_MAPPING[source_lang]
         target_code = LANGUAGE_MAPPING[target_lang]
         else:
             logger.warning("MT model not loaded, skipping translation")
+        # Step 5: Convert translated text to speech (TTS)
         if model_status["tts"].startswith("loaded") and tts_model is not None and tts_tokenizer is not None:
             try:
                 inputs = tts_tokenizer(translated_text, return_tensors="pt").to(device)