Update srt_utils.py
Browse files- srt_utils.py +94 -90
srt_utils.py
CHANGED
|
@@ -298,107 +298,111 @@ import subprocess
|
|
| 298 |
import shutil
|
| 299 |
import os
|
| 300 |
|
| 301 |
-
def process_audio_for_transcription(input_file: str) -> str:
|
| 302 |
"""
|
| 303 |
-
Process audio to maximize speech clarity
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
Returns path to processed .mp3 file (vocals)
|
| 307 |
"""
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
# Output directory for demucs
|
| 312 |
-
output_dir = os.path.join("static", "separated")
|
| 313 |
os.makedirs(output_dir, exist_ok=True)
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
try:
|
| 328 |
-
# Run Demucs
|
| 329 |
-
# NOTE: First run will download model (~100MB+).
|
| 330 |
-
model = "htdemucs" # Good default
|
| 331 |
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
"
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
print(f"STDOUT: {result.stdout}")
|
| 350 |
-
print(f"STDERR: {result.stderr}")
|
| 351 |
-
# Fallback will trigger below
|
| 352 |
-
else:
|
| 353 |
-
# Demucs output structure: output_dir / model_name / input_filename_no_ext / vocals.mp3 (NOTE: .mp3 now)
|
| 354 |
-
input_filename = os.path.basename(input_file)
|
| 355 |
-
input_stem = os.path.splitext(input_filename)[0]
|
| 356 |
|
| 357 |
-
|
| 358 |
-
|
| 359 |
|
| 360 |
-
if
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
if ffmpeg_cmd:
|
| 370 |
-
# Compress to mono mp3 16k with aggressive voice enhancement
|
| 371 |
-
# Filters include highpass, noise reduction, compression, EQ, and normalization
|
| 372 |
-
filter_chain = (
|
| 373 |
-
"highpass=f=100,"
|
| 374 |
-
"afftdn=nr=10:nf=-50:tn=1,"
|
| 375 |
-
"compand=attacks=0:points=-80/-90|-45/-25|-27/-9|0/-7:gain=5,"
|
| 376 |
-
"equalizer=f=3000:width_type=h:width=1000:g=5,"
|
| 377 |
-
"loudnorm"
|
| 378 |
-
)
|
| 379 |
-
|
| 380 |
-
cmd_convert = [
|
| 381 |
-
ffmpeg_cmd, "-y",
|
| 382 |
-
"-i", vocals_path,
|
| 383 |
-
"-ac", "1", "-ar", "16000",
|
| 384 |
-
"-af", filter_chain,
|
| 385 |
-
"-c:a", "libmp3lame", "-q:a", "2",
|
| 386 |
-
final_output
|
| 387 |
-
]
|
| 388 |
-
subprocess.run(cmd_convert, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
| 389 |
-
|
| 390 |
-
# Cleanup demucs folder
|
| 391 |
-
try: shutil.rmtree(os.path.join(output_dir, model, input_stem))
|
| 392 |
-
except: pass
|
| 393 |
-
|
| 394 |
-
return final_output
|
| 395 |
|
| 396 |
-
|
|
|
|
| 397 |
|
| 398 |
-
|
| 399 |
-
print(f"
|
| 400 |
-
|
| 401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
-
|
| 404 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
import shutil
|
| 299 |
import os
|
| 300 |
|
| 301 |
+
def _isolate_vocals_with_demucs(input_file: str, input_stem: str) -> "str | None":
    """Run Demucs on ``input_file`` and return the path of the isolated vocals.

    This step is best-effort: every Demucs failure (executable missing,
    non-zero exit status, expected output file absent) is reported on stdout
    and signalled by returning ``None`` so the caller can carry on with the
    original audio.

    Args:
        input_file: Path to the audio file handed to Demucs.
        input_stem: File name of ``input_file`` without its extension; used
            to build the path where the vocals track is expected.

    Returns:
        Path to ``vocals.mp3`` inside the Demucs output tree, or ``None`` if
        no vocals track was produced.
    """
    print("🔊 [Demucs] Iniciando isolamento de voz via AI (has_bg_music=True)...")
    demucs_output_dir = os.path.join("static", "separated")
    os.makedirs(demucs_output_dir, exist_ok=True)

    # If the executable is not resolvable, still try the bare name; a missing
    # binary then surfaces as an OSError that is handled below.
    demucs_cmd = shutil.which("demucs") or "demucs"

    model = "htdemucs"
    command = [
        demucs_cmd,
        "--two-stems=vocals",
        "-n", model,
        "-d", "cpu",
        "--mp3",
        "--mp3-bitrate", "128",
        input_file,
        "-o", demucs_output_dir,
    ]

    print("🔊 Executando Demucs...")
    try:
        result = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # OSError: executable not found / not runnable.
        # ValueError: invalid argument (e.g. NUL byte) or undecodable output.
        print(f"⚠️ Falha no Demucs: {e}")
        return None

    if result.returncode != 0:
        print(f"⚠️ Erro no Demucs (Code {result.returncode}), continuando com audio original.")
        # The output was captured above; show it so the failure is diagnosable.
        if result.stderr:
            print(f"STDERR: {result.stderr}")
        return None

    # Expected layout: output_dir / model_name / input_filename_no_ext / vocals.mp3
    demucs_vocals = os.path.join(demucs_output_dir, model, input_stem, "vocals.mp3")
    if not os.path.exists(demucs_vocals):
        print(
            f"⚠️ Demucs terminou sem erro, mas {demucs_vocals} não foi encontrado; "
            "continuando com audio original."
        )
        return None

    print(f"✅ Demucs sucesso: {demucs_vocals}")
    return demucs_vocals


def process_audio_for_transcription(input_file: str, has_bg_music: bool = False) -> str:
    """
    Process audio to maximize speech clarity.

    Args:
        input_file: Path to input audio.
        has_bg_music: If True, uses Demucs to remove background music (slow).
            If False, skips Demucs but applies voice enhancement filters (fast).

    Returns:
        Path to the processed mono 16 kHz .mp3 file on success. If FFmpeg is
        not found on PATH, ``input_file`` is returned unchanged. If the FFmpeg
        run fails, the best audio obtained so far is returned: the Demucs
        vocals track when one was produced, otherwise ``input_file``.
    """
    # Output directory for processed files
    output_dir = os.path.join("static", "processed")
    os.makedirs(output_dir, exist_ok=True)

    input_filename = os.path.basename(input_file)
    input_stem = os.path.splitext(input_filename)[0]
    final_output = os.path.join(output_dir, f"{input_stem}.processed.mp3")

    ffmpeg_cmd = shutil.which("ffmpeg")
    if not ffmpeg_cmd:
        print("⚠️ FFmpeg não encontrado!")
        return input_file

    # 1. Background Music Removal (Demucs) - OPTIONAL
    demucs_vocals = None
    if has_bg_music:
        demucs_vocals = _isolate_vocals_with_demucs(input_file, input_stem)
    else:
        print("⏩ [Demucs] Pulando remoção de música (has_bg_music=False).")

    # Fall back to the untouched input whenever Demucs was skipped or failed.
    vocals_path = demucs_vocals if demucs_vocals is not None else input_file

    # 2. Voice Enhancement (FFmpeg Filters) - ALWAYS RUN
    print("🔊 [FFmpeg] Aplicando filtros de melhoria de voz...")

    # Compress to mono mp3 16k with aggressive voice enhancement.
    # Filters include highpass, noise reduction, compression, EQ, and normalization.
    filter_chain = (
        "highpass=f=100,"
        "afftdn=nr=10:nf=-50:tn=1,"
        "compand=attacks=0:points=-80/-90|-45/-25|-27/-9|0/-7:gain=5,"
        "equalizer=f=3000:width_type=h:width=1000:g=5,"
        "loudnorm"
    )

    cmd_convert = [
        ffmpeg_cmd, "-y",
        "-i", vocals_path,
        "-ac", "1", "-ar", "16000",
        "-af", filter_chain,
        "-c:a", "libmp3lame", "-q:a", "2",
        final_output,
    ]

    try:
        subprocess.run(
            cmd_convert,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        print(f"⚠️ Erro no FFmpeg: {e}")
        # Keep any Demucs output on disk: it is the value being returned.
        return vocals_path

    # Cleanup: remove only the folder Demucs itself created for this track.
    # The decision is keyed on the path Demucs returned, never on a substring
    # of vocals_path, because vocals_path may be the caller's own input file
    # (whose directory must not be deleted).
    if demucs_vocals is not None:
        shutil.rmtree(os.path.dirname(demucs_vocals), ignore_errors=True)

    return final_output
|