Commit a9a4266 by sathishkumarbsk (parent: 9ed494b)

latest changes

app/.DS_Store CHANGED
Binary files a/app/.DS_Store and b/app/.DS_Store differ
 
app/main.py CHANGED
@@ -44,6 +44,7 @@ class TranscribeRequest(BaseModel):
     output_language: Optional[str] = None  # "en" for English/romanized, "auto" or None for auto-detect
     asr_model: Optional[str] = "whisper"  # "whisper" or "pingala"
     preprocess: Optional[bool] = False  # Enable audio preprocessing pipeline
+    noise_method: Optional[str] = "noisereduce"  # "noisereduce" or "deepfilternet"
 
 
 # HTML Test Page (inline for simplicity)
@@ -195,10 +196,17 @@ HTML_PAGE = """
         </div>
         <div class="language-selector">
             <label style="display:inline;font-weight:normal;cursor:pointer;">
-                <input type="checkbox" id="preprocess" style="margin-right:6px;">
+                <input type="checkbox" id="preprocess" style="margin-right:6px;" onchange="toggleNoiseMethod()">
                 Enable Audio Preprocessing (noise reduction, silence trimming, loudness normalization)
             </label>
         </div>
+        <div class="language-selector" id="noise_method_container" style="display:none;">
+            <label for="noise_method">Noise Reduction Method:</label>
+            <select id="noise_method">
+                <option value="noisereduce">Spectral Gating (lightweight, stationary noise)</option>
+                <option value="deepfilternet">DeepFilterNet3 (neural, all noise types)</option>
+            </select>
+        </div>
         <div class="tabs">
             <button class="tab active" onclick="showTab('upload')">File Upload</button>
             <button class="tab" onclick="showTab('media')">Media URL</button>
@@ -268,6 +276,15 @@ HTML_PAGE = """
            return document.getElementById('preprocess').checked;
        }
 
+        function getNoiseMethod() {
+            return document.getElementById('noise_method').value;
+        }
+
+        function toggleNoiseMethod() {
+            const container = document.getElementById('noise_method_container');
+            container.style.display = document.getElementById('preprocess').checked ? 'block' : 'none';
+        }
+
         async function submitFile() {
             const file = document.getElementById('file').files[0];
             if (!file) {
@@ -281,6 +298,7 @@ HTML_PAGE = """
             formData.append('output_language', getSelectedLanguage());
             formData.append('asr_model', getSelectedModel());
             formData.append('preprocess', getPreprocess());
+            formData.append('noise_method', getNoiseMethod());
 
             try {
                 const response = await fetch('/transcribe', {
@@ -311,7 +329,7 @@ HTML_PAGE = """
             }
 
             setStatus('loading', 'Downloading and processing...');
-            const requestBody = { media_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess() };
+            const requestBody = { media_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess(), noise_method: getNoiseMethod() };
 
             try {
                 const response = await fetch('/transcribe', {
@@ -343,7 +361,7 @@ HTML_PAGE = """
             }
 
             setStatus('loading', 'Downloading YouTube audio and processing...');
-            const requestBody = { youtube_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess() };
+            const requestBody = { youtube_url: url, output_language: getSelectedLanguage(), asr_model: getSelectedModel(), preprocess: getPreprocess(), noise_method: getNoiseMethod() };
 
             try {
                 const response = await fetch('/transcribe', {
@@ -462,6 +480,7 @@ async def transcribe(
     output_language: Optional[str] = None  # None means auto-detect
     asr_model: str = "whisper"  # Default to Whisper
     preprocess: bool = False  # Audio preprocessing toggle
+    noise_method: str = "noisereduce"  # Noise reduction method
 
     # Check if it's a file upload (multipart form)
     content_type = request.headers.get("content-type", "")
@@ -484,6 +503,10 @@ async def transcribe(
         form_preprocess = form.get("preprocess")
         if form_preprocess and form_preprocess.lower() == "true":
             preprocess = True
+        # Get noise method
+        form_noise = form.get("noise_method")
+        if form_noise and form_noise in ("noisereduce", "deepfilternet"):
+            noise_method = form_noise
 
     # Check if it's a JSON request
     elif "application/json" in content_type:
@@ -509,6 +532,10 @@ async def transcribe(
         if req.preprocess:
             preprocess = True
 
+        # Get noise method
+        if req.noise_method and req.noise_method in ("noisereduce", "deepfilternet"):
+            noise_method = req.noise_method
+
         if req.youtube_url:
             logger.info(f"Processing YouTube URL: {req.youtube_url}")
             input_path = await ingest_youtube(req.youtube_url, request_id, temp_dir)
@@ -517,7 +544,7 @@ async def transcribe(
             logger.info(f"Processing media URL: {req.media_url}")
             input_path = await ingest_media_url(req.media_url, request_id, temp_dir)
 
-    logger.info(f"ASR model: {asr_model}, Output language: {output_language or 'auto-detect'}, Preprocess: {preprocess}")
+    logger.info(f"ASR model: {asr_model}, Output language: {output_language or 'auto-detect'}, Preprocess: {preprocess}, Noise: {noise_method}")
 
     # Validate we have input
     if input_path is None:
@@ -537,7 +564,7 @@ async def transcribe(
     # Audio preprocessing (if enabled)
     if preprocess:
         logger.info("Audio preprocessing enabled, running pipeline...")
-        normalized_path = await preprocess_audio(normalized_path, request_id, temp_dir)
+        normalized_path = await preprocess_audio(normalized_path, request_id, temp_dir, noise_method)
 
     # Transcribe with selected ASR model
     logger.info(f"Starting transcription with {asr_model}...")
@@ -546,7 +573,7 @@ async def transcribe(
     elif asr_model == "ai4bharat":
         transcript = await transcribe_audio_ai4bharat(normalized_path, request_id, output_language)
     else:
-        transcript = await transcribe_audio(normalized_path, request_id, output_language)
+        transcript = await transcribe_audio(normalized_path, request_id, output_language, use_chunking=preprocess)
 
     logger.info("Transcription complete")
     return PlainTextResponse(
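
Reviewer note: a minimal sketch of how the extended endpoint could be exercised end to end. The server address, media URL, and file names are assumptions; the field names, defaults, and plain-text response all come from the diff above.

# Sketch only; assumes a local dev server and the requests package.
import requests

# JSON request, mirroring the page's requestBody for the Media URL tab
resp = requests.post(
    "http://localhost:8000/transcribe",  # assumed address
    json={
        "media_url": "https://example.com/talk.mp3",  # placeholder URL
        "output_language": "en",
        "asr_model": "whisper",
        "preprocess": True,
        "noise_method": "deepfilternet",  # or the default, "noisereduce"
    },
)
print(resp.text)  # the endpoint returns a PlainTextResponse

# Multipart form upload with the same new field; the file part's
# name ("file") is an assumption, the other field names match the JS.
with open("sample.mp3", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/transcribe",
        files={"file": f},
        data={"preprocess": "true", "noise_method": "noisereduce"},
    )
print(resp.text)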
app/services/asr.py CHANGED
@@ -1,11 +1,16 @@
 """
 ASR (Automatic Speech Recognition) service using official OpenAI Whisper.
 Thread-safe model loading with singleton pattern.
+Supports smart chunking via Silero VAD for long audio.
 """
 import asyncio
+import re
 import threading
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Tuple
+
+import numpy as np
+import soundfile as sf
 
 from app.core.config import settings
 from app.core.logging import get_request_logger
@@ -61,7 +66,8 @@ class ASRService:
         self,
         audio_path: Path,
         request_id: str,
-        language: Optional[str] = None
+        language: Optional[str] = None,
+        use_chunking: bool = False
     ) -> str:
         """
         Transcribe audio file to plain text.
@@ -71,9 +77,10 @@ class ASRService:
             audio_path: Path to the audio file
             request_id: Unique request ID for logging
             language: Language code (e.g., "en" for English/romanized output, None for auto-detect)
+            use_chunking: If True, use VAD-based smart chunking for long audio
         """
         logger = get_request_logger(request_id)
-        logger.info(f"Starting transcription: {audio_path}, language: {language or 'auto-detect'}")
+        logger.info(f"Starting transcription: {audio_path}, language: {language or 'auto-detect'}, chunking: {use_chunking}")
 
         # Ensure model is loaded
         self._load_model()
@@ -83,39 +90,63 @@ class ASRService:
 
         try:
             result = await asyncio.wait_for(
-                loop.run_in_executor(None, self._transcribe_sync, audio_path, request_id, language),
+                loop.run_in_executor(
+                    None, self._transcribe_sync, audio_path, request_id, language, use_chunking
+                ),
                 timeout=settings.ASR_TIMEOUT
             )
             return result
         except asyncio.TimeoutError:
             raise ASRError(f"Transcription timeout after {settings.ASR_TIMEOUT}s")
 
-    def _transcribe_sync(self, audio_path: Path, request_id: str, language: Optional[str] = None) -> str:
+    def _get_transcribe_options(self, language: Optional[str] = None) -> dict:
+        """Build Whisper transcription options."""
+        options = {
+            "task": "transcribe",
+            "verbose": False,
+            "beam_size": 5,
+            "best_of": 5,
+            "temperature": 0,
+            "condition_on_previous_text": True,
+            "initial_prompt": (
+                "This is a clear, well-structured transcription with proper punctuation, "
+                "capitalization, and natural sentence breaks."
+            ),
+            "compression_ratio_threshold": 2.4,
+            "logprob_threshold": -1.0,
+            "no_speech_threshold": 0.6,
+        }
+        if language:
+            options["language"] = language
+        return options
+
+    def _transcribe_sync(
+        self, audio_path: Path, request_id: str,
+        language: Optional[str] = None, use_chunking: bool = False
+    ) -> str:
         """Synchronous transcription (runs in thread pool)."""
         logger = get_request_logger(request_id)
 
         try:
-            # Build transcription options
-            transcribe_options = {
-                "task": "transcribe",
-                "verbose": False,
-            }
+            # Check audio duration to decide chunking
+            audio_data, sr = sf.read(audio_path, dtype='float32')
+            duration = len(audio_data) / sr
+            logger.info(f"Audio duration: {duration:.1f}s")
+
+            if use_chunking and duration > 30.0:
+                return self._transcribe_chunked(audio_data, sr, audio_path, request_id, language)
 
-            # If language is specified, use it; otherwise auto-detect
+            # Standard single-pass transcription
             if language:
-                transcribe_options["language"] = language
                 logger.info(f"Using specified language: {language}")
 
-            result = self._model.transcribe(str(audio_path), **transcribe_options)
+            options = self._get_transcribe_options(language)
+            result = self._model.transcribe(str(audio_path), **options)
 
             detected_lang = result.get("language", "unknown")
             logger.info(f"Detected/used language: {detected_lang}")
 
-            # Get the full text
             full_text = result.get("text", "").strip()
-
-            # Clean up extra whitespace
-            import re
             full_text = re.sub(r'\s+', ' ', full_text).strip()
 
             logger.info(f"Transcription complete: {len(full_text)} characters")
@@ -124,11 +155,74 @@ class ASRService:
         except Exception as e:
             raise ASRError(f"Transcription failed: {e}")
 
+    def _transcribe_chunked(
+        self, audio_data: np.ndarray, sr: int,
+        audio_path: Path, request_id: str, language: Optional[str] = None
+    ) -> str:
+        """Transcribe long audio using VAD-based smart chunking."""
+        logger = get_request_logger(request_id)
+        logger.info("Using smart chunking for long audio...")
+
+        from app.services.audio_preprocessor import get_speech_chunks
+
+        # Get speech-boundary chunks
+        chunks = get_speech_chunks(audio_path, request_id)
+        logger.info(f"Transcribing {len(chunks)} chunks...")
+
+        if language:
+            logger.info(f"Using specified language: {language}")
+
+        options = self._get_transcribe_options(language)
+        transcripts = []
+
+        for i, (start_sample, end_sample) in enumerate(chunks):
+            chunk_audio = audio_data[start_sample:end_sample]
+            chunk_duration = len(chunk_audio) / sr
+            logger.info(f"Transcribing chunk {i+1}/{len(chunks)}: {chunk_duration:.1f}s")
+
+            # Write chunk to temp file for Whisper
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                sf.write(tmp.name, chunk_audio, sr, subtype='PCM_16')
+                tmp_path = tmp.name
+
+            try:
+                # Use previous chunk's text as context for continuity
+                if transcripts:
+                    # Use last 200 chars of previous transcription as context
+                    prev_text = transcripts[-1][-200:]
+                    options["initial_prompt"] = prev_text
+
+                result = self._model.transcribe(tmp_path, **options)
+                chunk_text = result.get("text", "").strip()
+                chunk_text = re.sub(r'\s+', ' ', chunk_text).strip()
+
+                if chunk_text:
+                    transcripts.append(chunk_text)
+                    logger.info(f"Chunk {i+1}: {len(chunk_text)} chars")
+                else:
+                    logger.info(f"Chunk {i+1}: empty (no speech)")
+            finally:
+                import os
+                os.unlink(tmp_path)
+
+        full_text = " ".join(transcripts)
+        full_text = re.sub(r'\s+', ' ', full_text).strip()
+
+        logger.info(
+            f"Chunked transcription complete: {len(chunks)} chunks, "
+            f"{len(full_text)} characters"
+        )
+        return full_text
+
 
 # Global ASR service instance
 asr_service = ASRService()
 
 
-async def transcribe_audio(audio_path: Path, request_id: str, language: Optional[str] = None) -> str:
+async def transcribe_audio(
+    audio_path: Path, request_id: str,
+    language: Optional[str] = None, use_chunking: bool = False
+) -> str:
     """Convenience function to transcribe audio."""
-    return await asr_service.transcribe(audio_path, request_id, language)
+    return await asr_service.transcribe(audio_path, request_id, language, use_chunking)
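
Reviewer note: a small usage sketch of the widened convenience function; the WAV path and request ID below are placeholders. use_chunking only changes behavior for audio longer than 30 s; shorter clips still take the single-pass path.

# Sketch only; path and request ID are placeholders.
import asyncio
from pathlib import Path

from app.services.asr import transcribe_audio

async def main():
    text = await transcribe_audio(
        Path("speech_16k_mono.wav"),  # placeholder: 16kHz mono WAV
        request_id="demo-0001",       # placeholder request ID
        language=None,                # None -> Whisper auto-detects
        use_chunking=True,            # VAD chunking engages only above 30s
    )
    print(text)

asyncio.run(main())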
app/services/audio_preprocessor.py CHANGED
@@ -54,15 +54,28 @@ def _load_vad_model():
     return _vad_model, _vad_utils
 
 
-def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path, method: str = "noisereduce") -> Path:
     """
-    Apply spectral gating noise reduction.
-    Estimates noise profile and subtracts it from the signal.
+    Apply noise reduction using the selected method.
+
+    Methods:
+    - "noisereduce": Spectral gating (lightweight, good for stationary noise)
+    - "deepfilternet": Neural speech enhancement (handles all noise types, reverb)
     """
+    logger = get_request_logger(request_id)
+
+    if method == "deepfilternet":
+        return _reduce_noise_deepfilter(wav_path, request_id, temp_dir)
+    else:
+        return _reduce_noise_spectral(wav_path, request_id, temp_dir)
+
+
+def _reduce_noise_spectral(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+    """Apply spectral gating noise reduction via noisereduce."""
     import noisereduce as nr
 
     logger = get_request_logger(request_id)
-    logger.info("Preprocessing step 1: Noise reduction...")
+    logger.info("Preprocessing step 1: Noise reduction (spectral gating)...")
 
     audio, sr = sf.read(wav_path, dtype='float32')
     original_size = len(audio)
@@ -70,7 +83,7 @@ def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     reduced = nr.reduce_noise(
         y=audio,
         sr=sr,
-        prop_decrease=0.8,  # Reduce noise by 80%
+        prop_decrease=0.8,
         n_fft=2048,
         hop_length=512,
     )
@@ -78,7 +91,72 @@ def reduce_noise(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_denoised.wav"
     sf.write(str(output_path), reduced, sr, subtype='PCM_16')
 
-    logger.info(f"Noise reduction complete: {original_size} samples, saved to {output_path}")
+    logger.info(f"Spectral noise reduction complete: {original_size} samples")
+    return output_path
+
+
+# DeepFilterNet singleton
+_df_model = None
+_df_state = None
+_df_lock = threading.Lock()
+
+
+def _load_deepfilter_model():
+    """Load DeepFilterNet3 model (cached singleton)."""
+    global _df_model, _df_state
+    if _df_model is not None:
+        return _df_model, _df_state
+
+    with _df_lock:
+        if _df_model is not None:
+            return _df_model, _df_state
+
+        import logging
+        logger = logging.getLogger("transcription")
+        logger.info("Loading DeepFilterNet3 model...")
+
+        from df.enhance import init_df
+        _df_model, _df_state, _ = init_df()
+
+        logger.info("DeepFilterNet3 model loaded")
+        return _df_model, _df_state
+
+
+def _reduce_noise_deepfilter(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+    """Apply neural speech enhancement via DeepFilterNet3."""
+    from df.enhance import enhance, load_audio, save_audio
+
+    logger = get_request_logger(request_id)
+    logger.info("Preprocessing step 1: Noise reduction (DeepFilterNet3)...")
+
+    model, df_state = _load_deepfilter_model()
+    model_sr = df_state.sr()  # 48000
+
+    # Load and resample to model's sample rate
+    audio_tensor, _ = load_audio(str(wav_path), sr=model_sr)
+    logger.info(f"Loaded audio for DeepFilterNet: {audio_tensor.shape}")
+
+    # Enhance
+    enhanced = enhance(model, df_state, audio_tensor)
+
+    # Save at 48kHz then resample back to 16kHz via soundfile
+    temp_48k = temp_dir / f"{uuid.uuid4().hex[:8]}_df_48k.wav"
+    save_audio(str(temp_48k), enhanced, model_sr)
+
+    # Read back and resample to 16kHz
+    import torchaudio
+    waveform, orig_sr = torchaudio.load(str(temp_48k))
+    if orig_sr != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=16000)
+        waveform = resampler(waveform)
+
+    output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_denoised.wav"
+    sf.write(str(output_path), waveform.squeeze().numpy(), 16000, subtype='PCM_16')
+
+    # Clean up temp 48k file
+    temp_48k.unlink(missing_ok=True)
+
+    logger.info("DeepFilterNet3 noise reduction complete")
     return output_path
 
 
@@ -183,18 +261,86 @@ async def loudnorm_compress(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     return output_path
 
 
-async def preprocess_audio(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
+def get_speech_chunks(wav_path: Path, request_id: str, max_chunk_s: float = 28.0):
+    """
+    Use Silero VAD to detect speech segments and group them into optimal chunks
+    for Whisper transcription. Splits at natural pauses, never mid-speech.
+
+    Returns list of (start_sample, end_sample) tuples.
+    Each chunk is <= max_chunk_s seconds and aligned to speech boundaries.
+    """
+    logger = get_request_logger(request_id)
+
+    model, utils = _load_vad_model()
+    (get_speech_timestamps, _, _, _, _) = utils
+
+    audio, sr = sf.read(wav_path, dtype='float32')
+    wav_tensor = torch.from_numpy(audio)
+    total_duration = len(audio) / sr
+
+    speech_timestamps = get_speech_timestamps(
+        wav_tensor,
+        model,
+        sampling_rate=sr,
+        threshold=0.5,
+        min_speech_duration_ms=250,
+        min_silence_duration_ms=500,
+        speech_pad_ms=200,
+    )
+
+    if not speech_timestamps:
+        logger.warning("VAD detected no speech for chunking, returning whole audio")
+        return [(0, len(audio))]
+
+    # Group speech segments into chunks of <= max_chunk_s
+    chunks = []
+    current_start = speech_timestamps[0]['start']
+    current_end = speech_timestamps[0]['end']
+
+    for ts in speech_timestamps[1:]:
+        # Would adding this segment exceed max chunk duration?
+        potential_duration = (ts['end'] - current_start) / sr
+
+        if potential_duration <= max_chunk_s:
+            # Extend current chunk to include this segment
+            current_end = ts['end']
+        else:
+            # Save current chunk, start a new one
+            chunks.append((current_start, current_end))
+            current_start = ts['start']
+            current_end = ts['end']
+
+    # Don't forget the last chunk
+    chunks.append((current_start, current_end))
+
+    logger.info(
+        f"Smart chunking: {total_duration:.1f}s audio -> {len(chunks)} chunks "
+        f"(from {len(speech_timestamps)} speech segments)"
+    )
+    for i, (s, e) in enumerate(chunks):
+        logger.info(f"  Chunk {i+1}: {s/sr:.1f}s - {e/sr:.1f}s ({(e-s)/sr:.1f}s)")
+
+    return chunks
+
+
+async def preprocess_audio(
+    wav_path: Path, request_id: str, temp_dir: Path,
+    noise_method: str = "noisereduce"
+) -> Path:
     """
     Run the full audio preprocessing pipeline:
-    1. Noise reduction (spectral gating)
+    1. Noise reduction (spectral gating or DeepFilterNet3)
     2. VAD silence trimming
     3. Loudness normalization + dynamic range compression
 
+    Args:
+        noise_method: "noisereduce" (spectral gating) or "deepfilternet" (neural)
+
     Input: 16kHz mono WAV (from normalize_audio)
     Output: Preprocessed 16kHz mono WAV ready for Whisper
     """
     logger = get_request_logger(request_id)
-    logger.info("Starting audio preprocessing pipeline...")
+    logger.info(f"Starting audio preprocessing pipeline (noise: {noise_method})...")
 
     original_size = wav_path.stat().st_size
 
@@ -202,7 +348,7 @@ async def preprocess_audio(wav_path: Path, request_id: str, temp_dir: Path) -> Path:
     # Step 1: Noise reduction (CPU-bound, run in thread)
     loop = asyncio.get_event_loop()
     denoised_path = await loop.run_in_executor(
-        None, reduce_noise, wav_path, request_id, temp_dir
+        None, reduce_noise, wav_path, request_id, temp_dir, noise_method
     )
 
     # Step 2: VAD silence trimming (CPU-bound, run in thread)
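
Reviewer note: the grouping rule in get_speech_chunks is easiest to verify on synthetic VAD output. The timestamps below are invented for illustration; the loop is the one added above.

# Sketch only: chunk grouping on made-up Silero VAD timestamps.
sr = 16000
max_chunk_s = 28.0
speech_timestamps = [
    {'start': 0 * sr,  'end': 10 * sr},   # speech at 0-10s
    {'start': 12 * sr, 'end': 25 * sr},   # speech at 12-25s
    {'start': 27 * sr, 'end': 45 * sr},   # speech at 27-45s
]

chunks = []
current_start = speech_timestamps[0]['start']
current_end = speech_timestamps[0]['end']
for ts in speech_timestamps[1:]:
    if (ts['end'] - current_start) / sr <= max_chunk_s:
        current_end = ts['end']  # still fits: extend the current chunk
    else:
        chunks.append((current_start, current_end))  # close it, start a new one
        current_start, current_end = ts['start'], ts['end']
chunks.append((current_start, current_end))

print([(s / sr, e / sr) for s, e in chunks])
# [(0.0, 25.0), (27.0, 45.0)] -- split at the 25-27s pause, never mid-speech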
app/services/ffmpeg.py CHANGED
@@ -32,7 +32,7 @@ async def normalize_audio(
     cmd = [
         "ffmpeg",
         "-i", str(input_path),
-        "-af", "highpass=f=80",  # Remove sub-80Hz rumble (no speech content)
+        "-af", "highpass=f=80,lowpass=f=8000",  # Bandpass: speech band 80Hz-8kHz
         "-ar", "16000",  # 16kHz sample rate
         "-ac", "1",  # Mono
         "-c:a", "pcm_s16le",  # 16-bit PCM
requirements.txt CHANGED
@@ -18,6 +18,7 @@ aksharamukha>=2.0
 
 # Audio preprocessing
 noisereduce>=3.0
+deepfilternet>=0.5
 
 # Pin NumPy to 1.x for compatibility with Whisper dependencies
 numpy<2