sathishkumarbsk committed: "latest changes"
Commit: a9a4266 · Parent(s): 9ed494b

Files changed:
- app/.DS_Store                       +0   -0
- app/main.py                         +33  -6
- app/services/asr.py                 +113 -19
- app/services/audio_preprocessor.py  +156 -10
- app/services/ffmpeg.py              +1   -1
- requirements.txt                    +1   -0
app/.DS_Store CHANGED

Binary files a/app/.DS_Store and b/app/.DS_Store differ
app/main.py CHANGED

@@ -44,6 +44,7 @@ class TranscribeRequest(BaseModel):
     output_language: Optional[str] = None  # "en" for English/romanized, "auto" or None for auto-detect
     asr_model: Optional[str] = "whisper"  # "whisper" or "pingala"
     preprocess: Optional[bool] = False  # Enable audio preprocessing pipeline
+    noise_method: Optional[str] = "noisereduce"  # "noisereduce" or "deepfilternet"


 # HTML Test Page (inline for simplicity)

@@ -195,10 +196,17 @@ HTML_PAGE = """
         </div>
         <div class="language-selector">
             <label style="display:inline;font-weight:normal;cursor:pointer;">
-                <input type="checkbox" id="preprocess" style="margin-right:6px;">
+                <input type="checkbox" id="preprocess" style="margin-right:6px;" onchange="toggleNoiseMethod()">
                 Enable Audio Preprocessing (noise reduction, silence trimming, loudness normalization)
             </label>
         </div>
+        <div class="language-selector" id="noise_method_container" style="display:none;">
+            <label for="noise_method">Noise Reduction Method:</label>
+            <select id="noise_method">
+                <option value="noisereduce">Spectral Gating (lightweight, stationary noise)</option>
+                <option value="deepfilternet">DeepFilterNet3 (neural, all noise types)</option>
+            </select>
+        </div>
         <div class="tabs">
             <button class="tab active" onclick="showTab('upload')">File Upload</button>
             <button class="tab" onclick="showTab('media')">Media URL</button>

@@ -268,6 +276,15 @@ HTML_PAGE = """
             return document.getElementById('preprocess').checked;
         }

+        function getNoiseMethod() {
+            return document.getElementById('noise_method').value;
+        }
+
+        function toggleNoiseMethod() {
+            const container = document.getElementById('noise_method_container');
+            container.style.display = document.getElementById('preprocess').checked ? 'block' : 'none';
+        }
+
         async function submitFile() {
             const file = document.getElementById('file').files[0];
             if (!file) {

@@ -281,6 +298,7 @@ HTML_PAGE = """
             formData.append('output_language', getSelectedLanguage());
             formData.append('asr_model', getSelectedModel());
             formData.append('preprocess', getPreprocess());
+            formData.append('noise_method', getNoiseMethod());

             try {
                 const response = await fetch('/transcribe', {

@@ -311,7 +329,7 @@ HTML_PAGE = """
             }

             setStatus('loading', 'Downloading and processing...');
-            const requestBody = { media_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess() };
+            const requestBody = { media_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess(), noise_method: getNoiseMethod() };

             try {
                 const response = await fetch('/transcribe', {

@@ -343,7 +361,7 @@ HTML_PAGE = """
             }

             setStatus('loading', 'Downloading YouTube audio and processing...');
-            const requestBody = { youtube_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess() };
+            const requestBody = { youtube_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess(), noise_method: getNoiseMethod() };

             try {
                 const response = await fetch('/transcribe', {

@@ -462,6 +480,7 @@ async def transcribe(
     output_language: Optional[str] = None  # None means auto-detect
     asr_model: str = "whisper"  # Default to Whisper
     preprocess: bool = False  # Audio preprocessing toggle
+    noise_method: str = "noisereduce"  # Noise reduction method

     # Check if it's a file upload (multipart form)
     content_type = request.headers.get("content-type", "")

@@ -484,6 +503,10 @@ async def transcribe(
         form_preprocess = form.get("preprocess")
         if form_preprocess and form_preprocess.lower() == "true":
             preprocess = True
+        # Get noise method
+        form_noise = form.get("noise_method")
+        if form_noise and form_noise in ("noisereduce", "deepfilternet"):
+            noise_method = form_noise

     # Check if it's a JSON request
     elif "application/json" in content_type:

@@ -509,6 +532,10 @@ async def transcribe(
         if req.preprocess:
             preprocess = True

+        # Get noise method
+        if req.noise_method and req.noise_method in ("noisereduce", "deepfilternet"):
+            noise_method = req.noise_method
+
         if req.youtube_url:
             logger.info(f"Processing YouTube URL: {req.youtube_url}")
             input_path = await ingest_youtube(req.youtube_url, request_id, temp_dir)

@@ -517,7 +544,7 @@ async def transcribe(
         logger.info(f"Processing media URL: {req.media_url}")
         input_path = await ingest_media_url(req.media_url, request_id, temp_dir)

-    logger.info(f"ASR model: {asr_model}, Output language: {output_language or 'auto-detect'}, Preprocess: {preprocess}")
+    logger.info(f"ASR model: {asr_model}, Output language: {output_language or 'auto-detect'}, Preprocess: {preprocess}, Noise: {noise_method}")

     # Validate we have input
     if input_path is None:

@@ -537,7 +564,7 @@ async def transcribe(
     # Audio preprocessing (if enabled)
     if preprocess:
         logger.info("Audio preprocessing enabled, running pipeline...")
-        normalized_path = await preprocess_audio(normalized_path, request_id, temp_dir)
+        normalized_path = await preprocess_audio(normalized_path, request_id, temp_dir, noise_method)

     # Transcribe with selected ASR model
     logger.info(f"Starting transcription with {asr_model}...")

@@ -546,7 +573,7 @@ async def transcribe(
     elif asr_model == "ai4bharat":
         transcript = await transcribe_audio_ai4bharat(normalized_path, request_id, output_language)
     else:
-        transcript = await transcribe_audio(normalized_path, request_id, output_language)
+        transcript = await transcribe_audio(normalized_path, request_id, output_language, use_chunking=preprocess)

     logger.info("Transcription complete")
     return PlainTextResponse(
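For reference, a JSON request to the updated /transcribe endpoint now carries the new field. The sketch below is illustrative only: the host/port and the httpx client are assumptions, and the media URL is made up; only the field names mirror TranscribeRequest above.

# Minimal sketch of a JSON call to the updated endpoint.
# Assumptions: service reachable at localhost:7860, httpx installed.
import httpx

payload = {
    "media_url": "https://example.com/talk.mp3",  # illustrative URL
    "output_language": None,                      # auto-detect
    "asr_model": "whisper",
    "preprocess": True,                           # enables the preprocessing pipeline
    "noise_method": "deepfilternet",              # or "noisereduce"
}

resp = httpx.post("http://localhost:7860/transcribe", json=payload, timeout=600)
print(resp.text)  # plain-text transcript (PlainTextResponse)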
app/services/asr.py CHANGED

@@ -1,11 +1,16 @@
 """
 ASR (Automatic Speech Recognition) service using official OpenAI Whisper.
 Thread-safe model loading with singleton pattern.
+Supports smart chunking via Silero VAD for long audio.
 """
 import asyncio
+import re
 import threading
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Tuple
+
+import numpy as np
+import soundfile as sf

 from app.core.config import settings
 from app.core.logging import get_request_logger

@@ -61,7 +66,8 @@ class ASRService:
         self,
         audio_path: Path,
         request_id: str,
-        language: Optional[str] = None
+        language: Optional[str] = None,
+        use_chunking: bool = False
     ) -> str:
         """
         Transcribe audio file to plain text.

@@ -71,9 +77,10 @@ class ASRService:
             audio_path: Path to the audio file
             request_id: Unique request ID for logging
             language: Language code (e.g., "en" for English/romanized output, None for auto-detect)
+            use_chunking: If True, use VAD-based smart chunking for long audio
         """
         logger = get_request_logger(request_id)
-        logger.info(f"Starting transcription: {audio_path}, language: {language or 'auto-detect'}")
+        logger.info(f"Starting transcription: {audio_path}, language: {language or 'auto-detect'}, chunking: {use_chunking}")

         # Ensure model is loaded
         self._load_model()

@@ -83,39 +90,63 @@ class ASRService:

         try:
             result = await asyncio.wait_for(
-                loop.run_in_executor(
+                loop.run_in_executor(
+                    None, self._transcribe_sync, audio_path, request_id, language, use_chunking
+                ),
                 timeout=settings.ASR_TIMEOUT
             )
             return result
         except asyncio.TimeoutError:
             raise ASRError(f"Transcription timeout after {settings.ASR_TIMEOUT}s")

+    def _get_transcribe_options(self, language: Optional[str] = None) -> dict:
+        """Build Whisper transcription options."""
+        options = {
+            "task": "transcribe",
+            "verbose": False,
+            "beam_size": 5,
+            "best_of": 5,
+            "temperature": 0,
+            "condition_on_previous_text": True,
+            "initial_prompt": (
+                "This is a clear, well-structured transcription with proper punctuation, "
+                "capitalization, and natural sentence breaks."
+            ),
+            "compression_ratio_threshold": 2.4,
+            "logprob_threshold": -1.0,
+            "no_speech_threshold": 0.6,
+        }
+        if language:
+            options["language"] = language
+        return options
+
+    def _transcribe_sync(
+        self, audio_path: Path, request_id: str,
+        language: Optional[str] = None, use_chunking: bool = False
+    ) -> str:
         """Synchronous transcription (runs in thread pool)."""
         logger = get_request_logger(request_id)

         try:
+            # Check audio duration to decide chunking
+            audio_data, sr = sf.read(audio_path, dtype='float32')
+            duration = len(audio_data) / sr
+            logger.info(f"Audio duration: {duration:.1f}s")
+
+            if use_chunking and duration > 30.0:
+                return self._transcribe_chunked(audio_data, sr, audio_path, request_id, language)

+            # Standard single-pass transcription
             if language:
-                transcribe_options["language"] = language
                 logger.info(f"Using specified language: {language}")

+            options = self._get_transcribe_options(language)
+            result = self._model.transcribe(str(audio_path), **options)

             detected_lang = result.get("language", "unknown")
             logger.info(f"Detected/used language: {detected_lang}")

-            # Get the full text
             full_text = result.get("text", "").strip()
-
-            # Clean up extra whitespace
-            import re
             full_text = re.sub(r'\s+', ' ', full_text).strip()

             logger.info(f"Transcription complete: {len(full_text)} characters")

@@ -124,11 +155,74 @@ class ASRService:
         except Exception as e:
             raise ASRError(f"Transcription failed: {e}")

+    def _transcribe_chunked(
+        self, audio_data: np.ndarray, sr: int,
+        audio_path: Path, request_id: str, language: Optional[str] = None
+    ) -> str:
+        """Transcribe long audio using VAD-based smart chunking."""
+        logger = get_request_logger(request_id)
+        logger.info("Using smart chunking for long audio...")
+
+        from app.services.audio_preprocessor import get_speech_chunks
+
+        # Get speech-boundary chunks
+        chunks = get_speech_chunks(audio_path, request_id)
+        logger.info(f"Transcribing {len(chunks)} chunks...")
+
+        if language:
+            logger.info(f"Using specified language: {language}")
+
+        options = self._get_transcribe_options(language)
+        transcripts = []
+
+        for i, (start_sample, end_sample) in enumerate(chunks):
+            chunk_audio = audio_data[start_sample:end_sample]
+            chunk_duration = len(chunk_audio) / sr
+            logger.info(f"Transcribing chunk {i+1}/{len(chunks)}: {chunk_duration:.1f}s")
+
+            # Write chunk to temp file for Whisper
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                sf.write(tmp.name, chunk_audio, sr, subtype='PCM_16')
+                tmp_path = tmp.name
+
+            try:
+                # Use previous chunk's text as context for continuity
+                if transcripts:
+                    # Use last 200 chars of previous transcription as context
+                    prev_text = transcripts[-1][-200:]
+                    options["initial_prompt"] = prev_text
+
+                result = self._model.transcribe(tmp_path, **options)
+                chunk_text = result.get("text", "").strip()
+                chunk_text = re.sub(r'\s+', ' ', chunk_text).strip()
+
+                if chunk_text:
+                    transcripts.append(chunk_text)
+                    logger.info(f"Chunk {i+1}: {len(chunk_text)} chars")
+                else:
+                    logger.info(f"Chunk {i+1}: empty (no speech)")
+            finally:
+                import os
+                os.unlink(tmp_path)
+
+        full_text = " ".join(transcripts)
+        full_text = re.sub(r'\s+', ' ', full_text).strip()
+
+        logger.info(
+            f"Chunked transcription complete: {len(chunks)} chunks, "
+            f"{len(full_text)} characters"
+        )
+        return full_text
+

 # Global ASR service instance
 asr_service = ASRService()


+async def transcribe_audio(
+    audio_path: Path, request_id: str,
+    language: Optional[str] = None, use_chunking: bool = False
+) -> str:
     """Convenience function to transcribe audio."""
-    return await asr_service.transcribe(audio_path, request_id, language)
+    return await asr_service.transcribe(audio_path, request_id, language, use_chunking)
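A quick usage sketch of the updated convenience function. The signature comes from the diff above; the file path and request_id below are made-up placeholders.

# Minimal usage sketch; the WAV path and request_id are hypothetical.
import asyncio
from pathlib import Path

from app.services.asr import transcribe_audio

async def demo() -> None:
    text = await transcribe_audio(
        Path("/tmp/demo_16k_mono.wav"),  # hypothetical preprocessed 16kHz mono WAV
        request_id="demo-req-1",
        language=None,        # auto-detect
        use_chunking=True,    # chunked path only kicks in for audio longer than 30 s
    )
    print(text)

asyncio.run(demo())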
app/services/audio_preprocessor.py CHANGED

@@ -54,15 +54,28 @@ def _load_vad_model():
     return _vad_model, _vad_utils


-def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path, method: str = "noisereduce") -> Path:
     """
+    Apply noise reduction using the selected method.
+
+    Methods:
+    - "noisereduce": Spectral gating (lightweight, good for stationary noise)
+    - "deepfilternet": Neural speech enhancement (handles all noise types, reverb)
     """
+    logger = get_request_logger(request_id)
+
+    if method == "deepfilternet":
+        return _reduce_noise_deepfilter(wav_path, request_id, temp_dir)
+    else:
+        return _reduce_noise_spectral(wav_path, request_id, temp_dir)
+
+
+def _reduce_noise_spectral(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+    """Apply spectral gating noise reduction via noisereduce."""
     import noisereduce as nr

     logger = get_request_logger(request_id)
-    logger.info("Preprocessing step 1: Noise reduction...")
+    logger.info("Preprocessing step 1: Noise reduction (spectral gating)...")

     audio, sr = sf.read(wav_path, dtype='float32')
     original_size = len(audio)

@@ -70,7 +83,7 @@ def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     reduced = nr.reduce_noise(
         y=audio,
         sr=sr,
-        prop_decrease=0.8,
+        prop_decrease=0.8,
         n_fft=2048,
         hop_length=512,
     )

@@ -78,7 +91,72 @@ def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_denoised.wav"
     sf.write(str(output_path), reduced, sr, subtype='PCM_16')

+    logger.info(f"Spectral noise reduction complete: {original_size} samples")
+    return output_path
+
+
+# DeepFilterNet singleton
+_df_model = None
+_df_state = None
+_df_lock = threading.Lock()
+
+
+def _load_deepfilter_model():
+    """Load DeepFilterNet3 model (cached singleton)."""
+    global _df_model, _df_state
+    if _df_model is not None:
+        return _df_model, _df_state
+
+    with _df_lock:
+        if _df_model is not None:
+            return _df_model, _df_state
+
+        import logging
+        logger = logging.getLogger("transcription")
+        logger.info("Loading DeepFilterNet3 model...")
+
+        from df.enhance import init_df
+        _df_model, _df_state, _ = init_df()
+
+        logger.info("DeepFilterNet3 model loaded")
+        return _df_model, _df_state
+
+
+def _reduce_noise_deepfilter(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+    """Apply neural speech enhancement via DeepFilterNet3."""
+    from df.enhance import enhance, load_audio, save_audio
+
+    logger = get_request_logger(request_id)
+    logger.info("Preprocessing step 1: Noise reduction (DeepFilterNet3)...")
+
+    model, df_state = _load_deepfilter_model()
+    model_sr = df_state.sr()  # 48000
+
+    # Load and resample to model's sample rate
+    audio_tensor, _ = load_audio(str(wav_path), sr=model_sr)
+    logger.info(f"Loaded audio for DeepFilterNet: {audio_tensor.shape}")
+
+    # Enhance
+    enhanced = enhance(model, df_state, audio_tensor)
+
+    # Save at 48kHz then resample back to 16kHz via soundfile
+    temp_48k = temp_dir / f"{uuid.uuid4().hex[:8]}_df_48k.wav"
+    save_audio(str(temp_48k), enhanced, model_sr)
+
+    # Read back and resample to 16kHz
+    import torchaudio
+    waveform, orig_sr = torchaudio.load(str(temp_48k))
+    if orig_sr != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=16000)
+        waveform = resampler(waveform)
+
+    output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_denoised.wav"
+    sf.write(str(output_path), waveform.squeeze().numpy(), 16000, subtype='PCM_16')
+
+    # Clean up temp 48k file
+    temp_48k.unlink(missing_ok=True)
+
+    logger.info(f"DeepFilterNet3 noise reduction complete")
     return output_path

@@ -183,18 +261,86 @@ async def loudnorm_compress(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     return output_path


+def get_speech_chunks(wav_path: Path, request_id: str, max_chunk_s: float = 28.0):
+    """
+    Use Silero VAD to detect speech segments and group them into optimal chunks
+    for Whisper transcription. Splits at natural pauses, never mid-speech.
+
+    Returns list of (start_sample, end_sample) tuples.
+    Each chunk is <= max_chunk_s seconds and aligned to speech boundaries.
+    """
+    logger = get_request_logger(request_id)
+
+    model, utils = _load_vad_model()
+    (get_speech_timestamps, _, _, _, _) = utils
+
+    audio, sr = sf.read(wav_path, dtype='float32')
+    wav_tensor = torch.from_numpy(audio)
+    total_duration = len(audio) / sr
+
+    speech_timestamps = get_speech_timestamps(
+        wav_tensor,
+        model,
+        sampling_rate=sr,
+        threshold=0.5,
+        min_speech_duration_ms=250,
+        min_silence_duration_ms=500,
+        speech_pad_ms=200,
+    )
+
+    if not speech_timestamps:
+        logger.warning("VAD detected no speech for chunking, returning whole audio")
+        return [(0, len(audio))]
+
+    # Group speech segments into chunks of <= max_chunk_s
+    chunks = []
+    current_start = speech_timestamps[0]['start']
+    current_end = speech_timestamps[0]['end']
+
+    for ts in speech_timestamps[1:]:
+        # Would adding this segment exceed max chunk duration?
+        potential_duration = (ts['end'] - current_start) / sr
+
+        if potential_duration <= max_chunk_s:
+            # Extend current chunk to include this segment
+            current_end = ts['end']
+        else:
+            # Save current chunk, start a new one
+            chunks.append((current_start, current_end))
+            current_start = ts['start']
+            current_end = ts['end']
+
+    # Don't forget the last chunk
+    chunks.append((current_start, current_end))
+
+    logger.info(
+        f"Smart chunking: {total_duration:.1f}s audio -> {len(chunks)} chunks "
+        f"(from {len(speech_timestamps)} speech segments)"
+    )
+    for i, (s, e) in enumerate(chunks):
+        logger.info(f"  Chunk {i+1}: {s/sr:.1f}s - {e/sr:.1f}s ({(e-s)/sr:.1f}s)")
+
+    return chunks
+
+
-async def preprocess_audio(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+async def preprocess_audio(
+    wav_path: Path, request_id: str, temp_dir: Path,
+    noise_method: str = "noisereduce"
+) -> Path:
     """
     Run the full audio preprocessing pipeline:
-    1. Noise reduction (spectral gating)
+    1. Noise reduction (spectral gating or DeepFilterNet3)
     2. VAD silence trimming
     3. Loudness normalization + dynamic range compression

+    Args:
+        noise_method: "noisereduce" (spectral gating) or "deepfilternet" (neural)
+
     Input: 16kHz mono WAV (from normalize_audio)
     Output: Preprocessed 16kHz mono WAV ready for Whisper
     """
     logger = get_request_logger(request_id)
-    logger.info("Starting audio preprocessing pipeline...")
+    logger.info(f"Starting audio preprocessing pipeline (noise: {noise_method})...")

     original_size = wav_path.stat().st_size

@@ -202,7 +348,7 @@ async def preprocess_audio(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     # Step 1: Noise reduction (CPU-bound, run in thread)
     loop = asyncio.get_event_loop()
     denoised_path = await loop.run_in_executor(
-        None, reduce_noise, wav_path, request_id, temp_dir
+        None, reduce_noise, wav_path, request_id, temp_dir, noise_method
    )

     # Step 2: VAD silence trimming (CPU-bound, run in thread)
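To make the grouping rule in get_speech_chunks concrete, here is a small self-contained sketch that applies the same greedy merge to made-up VAD timestamps (all sample values below are invented for illustration; Silero VAD would normally produce them).

# Toy illustration of the greedy chunk grouping used by get_speech_chunks.
sr = 16000
max_chunk_s = 28.0
speech_timestamps = [
    {"start": 0 * sr,  "end": 12 * sr},   # 0-12 s of speech
    {"start": 14 * sr, "end": 26 * sr},   # 14-26 s
    {"start": 30 * sr, "end": 45 * sr},   # 30-45 s
]

chunks = []
current_start = speech_timestamps[0]["start"]
current_end = speech_timestamps[0]["end"]

for ts in speech_timestamps[1:]:
    # Merge the segment if the chunk would still fit within max_chunk_s.
    if (ts["end"] - current_start) / sr <= max_chunk_s:
        current_end = ts["end"]
    else:
        chunks.append((current_start, current_end))
        current_start, current_end = ts["start"], ts["end"]

chunks.append((current_start, current_end))
print([(s / sr, e / sr) for s, e in chunks])  # [(0.0, 26.0), (30.0, 45.0)]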
app/services/ffmpeg.py CHANGED

@@ -32,7 +32,7 @@ async def normalize_audio(
     cmd = [
         "ffmpeg",
         "-i", str(input_path),
-        "-af", "highpass=f=80",
+        "-af", "highpass=f=80,lowpass=f=8000",  # Bandpass: speech band 80Hz-8kHz
         "-ar", "16000",          # 16kHz sample rate
         "-ac", "1",              # Mono
         "-c:a", "pcm_s16le",     # 16-bit PCM
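The same bandpass-plus-downmix chain can be exercised on its own. This is a minimal sketch built around a hypothetical input/output pair: the "-y" overwrite flag and the file names are assumptions, while the filter, sample-rate, channel, and codec arguments mirror normalize_audio above.

# Standalone sketch of the normalization command (paths and "-y" are illustrative).
import subprocess

cmd = [
    "ffmpeg", "-y",
    "-i", "input.mp4",                       # hypothetical source media
    "-af", "highpass=f=80,lowpass=f=8000",   # keep the 80 Hz - 8 kHz speech band
    "-ar", "16000",                          # 16 kHz sample rate
    "-ac", "1",                              # mono
    "-c:a", "pcm_s16le",                     # 16-bit PCM
    "normalized.wav",                        # hypothetical output
]
subprocess.run(cmd, check=True)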
requirements.txt CHANGED

@@ -18,6 +18,7 @@ aksharamukha>=2.0

 # Audio preprocessing
 noisereduce>=3.0
+deepfilternet>=0.5

 # Pin NumPy to 1.x for compatibility with Whisper dependencies
 numpy<2