habulaj committed on
Commit
fdb9ed2
·
verified ·
1 Parent(s): 4aa5e54

Update srt_utils.py

Browse files
Files changed (1) hide show
  1. srt_utils.py +94 -90
srt_utils.py CHANGED
@@ -298,107 +298,111 @@ import subprocess
298
  import shutil
299
  import os
300
 
301
def process_audio_for_transcription(input_file: str) -> str:
    """
    Isolate the vocal track of an audio file with Demucs for transcription.

    Demucs is run on CPU in two-stem mode (vocals / everything else) with the
    ``htdemucs`` model and writes MP3 stems under ``static/separated``. When
    FFmpeg is available the vocals stem is then converted to a mono 16 kHz MP3
    through a voice-enhancement filter chain, and the Demucs working folder
    for this track is removed.

    Args:
        input_file: Path to the input audio file.

    Returns:
        ``<input_file>.vocals.mp3`` when separation and enhancement both
        succeed; the raw Demucs ``vocals.mp3`` when FFmpeg is missing or
        fails; ``input_file`` unchanged when Demucs itself fails.
    """
    print("🔊 [Demucs] Iniciando isolamento de voz via AI...")

    # Output directory for demucs
    output_dir = os.path.join("static", "separated")
    os.makedirs(output_dir, exist_ok=True)

    # Prefer the resolved executable; otherwise let the OS look up "demucs".
    demucs_cmd = shutil.which("demucs") or "demucs"

    # NOTE: First run will download model (~100MB+).
    model = "htdemucs"

    command = [
        demucs_cmd,
        "--two-stems=vocals",  # only split vocals from the rest
        "-n", model,
        "-d", "cpu",
        "--mp3",  # output as MP3 directly
        "--mp3-bitrate", "128",
        input_file,
        "-o", output_dir,
    ]

    # Demucs output structure: output_dir / model / <input name without ext> / vocals.mp3
    input_stem = os.path.splitext(os.path.basename(input_file))[0]
    track_dir = os.path.join(output_dir, model, input_stem)
    expected_vocals = os.path.join(track_dir, "vocals.mp3")

    vocals_path = None
    try:
        print(f"🔊 Executando Demucs: {' '.join(command)}")
        # Capture output for debugging
        result = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        # Best-effort boundary: any launch failure falls back to the input file.
        print(f"⚠️ Falha no Demucs: {e}")
        import traceback
        traceback.print_exc()
    else:
        if result.returncode != 0:
            print(f"⚠️ Erro no Demucs (Code {result.returncode}):")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")
        elif not os.path.exists(expected_vocals):
            print(f"⚠️ Demucs terminou sem erro, mas o arquivo não foi encontrado: {expected_vocals}")
        else:
            print(f"✅ Demucs sucesso: {expected_vocals}")
            vocals_path = expected_vocals

    if vocals_path is None:
        print("⚠️ Retornando arquivo original (fallback)")
        return input_file

    ffmpeg_cmd = shutil.which("ffmpeg")
    if not ffmpeg_cmd:
        # Without FFmpeg the raw Demucs stem is the best available result.
        return vocals_path

    final_output = input_file + ".vocals.mp3"

    # Compress to mono mp3 16k with aggressive voice enhancement.
    # Filters include highpass, noise reduction, compression, EQ, and normalization.
    filter_chain = (
        "highpass=f=100,"
        "afftdn=nr=10:nf=-50:tn=1,"
        "compand=attacks=0:points=-80/-90|-45/-25|-27/-9|0/-7:gain=5,"
        "equalizer=f=3000:width_type=h:width=1000:g=5,"
        "loudnorm"
    )

    cmd_convert = [
        ffmpeg_cmd, "-y",
        "-i", vocals_path,
        "-ac", "1", "-ar", "16000",
        "-af", filter_chain,
        "-c:a", "libmp3lame", "-q:a", "2",
        final_output,
    ]

    try:
        subprocess.run(
            cmd_convert,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Keep the separated vocals instead of discarding the Demucs work.
        print(f"⚠️ Erro no FFmpeg: {e}")
        return vocals_path

    # The stem has been converted, so the Demucs folder is no longer needed.
    shutil.rmtree(track_dir, ignore_errors=True)
    return final_output
 
 
 
 
 
 
 
 
 
 
 
 
298
  import shutil
299
  import os
300
 
301
def process_audio_for_transcription(input_file: str, has_bg_music: bool = False) -> str:
    """
    Process audio to maximize speech clarity.

    Optionally isolates the vocals with Demucs, then always runs an FFmpeg
    voice-enhancement filter chain that writes a mono 16 kHz MP3 to
    ``static/processed/<input name without ext>.processed.mp3``.

    Args:
        input_file: Path to input audio.
        has_bg_music: If True, uses Demucs to remove background music (slow).
            If False, skips Demucs but applies voice enhancement filters (fast).

    Returns:
        Path to the processed .mp3 file on success. Falls back to
        ``input_file`` when FFmpeg is not installed, and to the FFmpeg input
        (the Demucs vocals if separation succeeded, otherwise ``input_file``)
        when the FFmpeg conversion fails.
    """
    # Output directory for processed files
    output_dir = os.path.join("static", "processed")
    os.makedirs(output_dir, exist_ok=True)

    input_stem = os.path.splitext(os.path.basename(input_file))[0]
    final_output = os.path.join(output_dir, f"{input_stem}.processed.mp3")

    ffmpeg_cmd = shutil.which("ffmpeg")
    if not ffmpeg_cmd:
        print("⚠️ FFmpeg não encontrado!")
        return input_file

    vocals_path = input_file
    # Set only when Demucs really produced a stem, so cleanup can never
    # touch a directory that belongs to the caller's input file.
    demucs_track_dir = None

    # 1. Background Music Removal (Demucs) - OPTIONAL
    if has_bg_music:
        print("🔊 [Demucs] Iniciando isolamento de voz via AI (has_bg_music=True)...")
        demucs_output_dir = os.path.join("static", "separated")
        os.makedirs(demucs_output_dir, exist_ok=True)

        # Prefer the resolved executable; otherwise let the OS look up "demucs".
        demucs_cmd = shutil.which("demucs") or "demucs"
        model = "htdemucs"
        command = [
            demucs_cmd,
            "--two-stems=vocals",
            "-n", model,
            "-d", "cpu",
            "--mp3",
            "--mp3-bitrate", "128",
            input_file,
            "-o", demucs_output_dir,
        ]

        try:
            print("🔊 Executando Demucs...")
            result = subprocess.run(
                command,
                check=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except Exception as e:
            # Best-effort boundary: enhancement still runs on the original audio.
            print(f"⚠️ Falha no Demucs: {e}")
        else:
            if result.returncode != 0:
                print(f"⚠️ Erro no Demucs (Code {result.returncode}), continuando com audio original.")
                if result.stderr:
                    print(f"STDERR: {result.stderr}")
            else:
                # Path: output_dir / model_name / input_filename_no_ext / vocals.mp3
                track_dir = os.path.join(demucs_output_dir, model, input_stem)
                demucs_vocals = os.path.join(track_dir, "vocals.mp3")
                if os.path.exists(demucs_vocals):
                    print(f"✅ Demucs sucesso: {demucs_vocals}")
                    vocals_path = demucs_vocals
                    demucs_track_dir = track_dir
                else:
                    print(f"⚠️ Demucs terminou sem erro, mas o arquivo não foi encontrado: {demucs_vocals}")
    else:
        print(" [Demucs] Pulando remoção de música (has_bg_music=False).")

    # 2. Voice Enhancement (FFmpeg Filters) - ALWAYS RUN
    print("🔊 [FFmpeg] Aplicando filtros de melhoria de voz...")

    # Compress to mono mp3 16k with aggressive voice enhancement.
    # Filters include highpass, noise reduction, compression, EQ, and normalization.
    filter_chain = (
        "highpass=f=100,"
        "afftdn=nr=10:nf=-50:tn=1,"
        "compand=attacks=0:points=-80/-90|-45/-25|-27/-9|0/-7:gain=5,"
        "equalizer=f=3000:width_type=h:width=1000:g=5,"
        "loudnorm"
    )

    cmd_convert = [
        ffmpeg_cmd, "-y",
        "-i", vocals_path,
        "-ac", "1", "-ar", "16000",
        "-af", filter_chain,
        "-c:a", "libmp3lame", "-q:a", "2",
        final_output,
    ]

    try:
        subprocess.run(
            cmd_convert,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception as e:
        # Hand back the FFmpeg input; the Demucs folder is kept because the
        # returned path may live inside it.
        print(f"⚠️ Erro no FFmpeg: {e}")
        return vocals_path

    # Cleanup demucs folder if it was used
    if demucs_track_dir is not None:
        shutil.rmtree(demucs_track_dir, ignore_errors=True)

    return final_output