habulaj committed on
Commit
c13f0ec
·
verified ·
1 Parent(s): 1056927

Update srt_utils.py

Browse files
Files changed (1) hide show
  1. srt_utils.py +68 -54
srt_utils.py CHANGED
@@ -300,72 +300,86 @@ import os
300
 
301
  def process_audio_for_transcription(input_file: str) -> str:
302
  """
303
- Process audio to maximize speech clarity for accurate transcription.
304
- Applies:
305
- 1. Mono conversion
306
- 2. Resampling to 16kHz (Whisper optimal)
307
- 3. Highpass (200Hz) to remove rumble
308
- 4. Lowpass (3000Hz) to focus on speech band (telephone quality is sufficient for text)
309
- 5. AFFTDN (FFT-based noise reduction)
310
- 6. Dynaudnorm (Dynamic Audio Normalizer) to boost quiet speech
311
- 7. Compression (generic)
312
 
313
- Returns path to processed .mp3 file
314
  """
315
 
316
- # Check if ffmpeg exists
317
- ffmpeg_cmd = shutil.which("ffmpeg")
318
- if not ffmpeg_cmd:
319
- print("⚠️ FFmpeg não encontrado. Pulando processamento de áudio.")
320
- return input_file
321
-
322
- output_file = input_file + ".processed.mp3"
323
-
324
- # Complex filter chain
325
- # 1. afftdn: Denoise using FFT (requires noise profile, but default 'nr' often works blindly) - wait, rnnoise is better if available, but afftdn is standard.
326
- # Actually, simplistic filters are safer to avoid artifacts.
327
- # highpass=f=200, lowpass=f=3000, afftdn=nf=-25, dynaudnorm=f=150:g=15
328
- # afftdn might not be available in all builds. Let's stick to safe filters first.
329
-
330
- # Safe Filter Chain:
331
- # 1. silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-90dB (optional, maybe skip)
332
- # 2. highpass=f=200, lowpass=f=3000 (Bandpass)
333
- # 3. dynaudnorm (Normalize loudness dynamically)
334
- # 4. volume=1.5 (Boost a bit globally)
335
-
336
- # "Retire qualquer música de fundo" -> Extremely hard without AI like Spleeter.
337
- # To reduce music impact without destroying voice, we use a Gentler Vocal EQ.
338
- # We essentially attenuate frequencies where music dominates (Sub-bass, huge highs)
339
- # and normalize volume using EBU R128 (loudnorm) which is more natural than dynaudnorm.
340
-
341
- # Filter Chain Strategy (Simplified):
342
- # 1. Highpass (200Hz) - Cut rumble/bass lines
343
- # 2. Lowpass (8000Hz) - Keep up to 8kHz for clarity (s, t, p sounds), cut cymbal shimmer.
344
- # 3. Loudnorm - Standardize volume without aggressive pumping.
345
 
346
- filters = "highpass=f=200,lowpass=f=8000,loudnorm"
 
 
 
 
347
 
 
 
 
 
 
 
348
  try:
 
 
 
 
349
  command = [
350
- ffmpeg_cmd,
351
- "-y", # Overwrite
352
- "-i", input_file,
353
- "-vn", # No video
354
- "-ar", "16000", # 16kHz
355
- "-ac", "1", # Mono
356
- "-af", filters,
357
- "-c:a", "libmp3lame",
358
- "-q:a", "2", # High quality VBR
359
- output_file
360
  ]
361
 
362
- print(f"🔊 Processando áudio com FFmpeg (Gentle EQ + Loudnorm): {' '.join(command)}")
 
363
  subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
364
 
365
- if os.path.exists(output_file):
366
- return output_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
  except Exception as e:
369
- print(f"⚠️ Falha ao processar áudio: {e}")
 
 
370
 
 
371
  return input_file
 
300
 
301
  def process_audio_for_transcription(input_file: str) -> str:
302
  """
303
+ Process audio to maximize speech clarity using Demucs (AI Source Separation).
304
+ It separates the audio into stems (vocals, drums, bass, other) and returns ONLY the vocals.
 
 
 
 
 
 
 
305
 
306
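+ Returns path to the processed .mp3 file (vocals); falls back to the vocals .wav when FFmpeg is unavailable, or to the original input file if Demucs fails.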
307
  """
308
 
309
+ print(f"🔊 [Demucs] Iniciando isolamento de voz via AI...")
310
+
311
+ # Output directory for demucs
312
+ output_dir = os.path.join("static", "separated")
313
+ os.makedirs(output_dir, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
+ # Demucs works best with CLI.
316
+ # Command: demucs --two-stems=vocals -n htdemucs_ft "input_file" -o "output_dir"
317
+ # --two-stems=vocals -> Saves time by only separating vocals/other
318
+ # -n htdemucs_ft -> High quality model (might be slow, maybe use htdemucs if too slow)
319
+ # Let's use `htdemucs`, which is a good balance of quality and speed.
320
 
321
+ # Check if demucs is installed (it should be via requirements.txt)
322
+ demucs_cmd = shutil.which("demucs")
323
+ if not demucs_cmd:
324
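+ # Fallback: try the bare "demucs" command name anyway (NOTE: this is not `python -m demucs`; if it cannot be launched, the except below returns the original file)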
325
+ demucs_cmd = "demucs"
326
+
327
  try:
328
+ # Run Demucs
329
+ # NOTE: First run will download model (~100MB+).
330
+ model = "htdemucs" # Good default
331
+
332
  command = [
333
+ demucs_cmd,
334
+ "--two-stems=vocals",
335
+ "-n", model,
336
+ input_file,
337
+ "-o", output_dir
 
 
 
 
 
338
  ]
339
 
340
+ print(f"🔊 Executando Demucs: {' '.join(command)}")
341
+ # This can take time.
342
  subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
343
 
344
+ # Demucs output structure: output_dir / model_name / input_filename_no_ext / vocals.wav
345
+ input_filename = os.path.basename(input_file)
346
+ input_stem = os.path.splitext(input_filename)[0]
347
+
348
+ vocals_path = os.path.join(output_dir, model, input_stem, "vocals.wav")
349
+
350
+ if os.path.exists(vocals_path):
351
+ print(f"✅ Demucs sucesso: {vocals_path}")
352
+
353
+ # Convert Wav to MP3 to save space/bandwidth if needed,
354
+ # OR just return the wav if Groq supports it (Groq supports wav).
355
+ # Let's convert to MP3 16kHz mono to optimize upload to Groq
356
+
357
+ final_output = input_file + ".vocals.mp3"
358
+
359
+ ffmpeg_cmd = shutil.which("ffmpeg")
360
+ if ffmpeg_cmd:
361
+ # Compress to mono mp3
362
+ cmd_convert = [
363
+ ffmpeg_cmd, "-y",
364
+ "-i", vocals_path,
365
+ "-ac", "1", "-ar", "16000",
366
+ "-c:a", "libmp3lame", "-q:a", "2",
367
+ final_output
368
+ ]
369
+ subprocess.run(cmd_convert, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
370
+
371
+ # Clean up the Demucs output folder? It could be kept as a cache, but it is better to save space.
372
+ try: shutil.rmtree(os.path.join(output_dir, model, input_stem))
373
+ except: pass
374
+
375
+ return final_output
376
+
377
+ return vocals_path
378
 
379
  except Exception as e:
380
+ print(f"⚠️ Falha no Demucs: {e}")
381
+ import traceback
382
+ traceback.print_exc()
383
 
384
+ print("⚠️ Retornando arquivo original (fallback)")
385
  return input_file