Update srt_utils.py
Browse files- srt_utils.py +68 -54
srt_utils.py
CHANGED
|
@@ -300,72 +300,86 @@ import os
|
|
| 300 |
|
| 301 |
def process_audio_for_transcription(input_file: str) -> str:
|
| 302 |
"""
|
| 303 |
-
Process audio to maximize speech clarity
|
| 304 |
-
|
| 305 |
-
1. Mono conversion
|
| 306 |
-
2. Resampling to 16kHz (Whisper optimal)
|
| 307 |
-
3. Highpass (200Hz) to remove rumble
|
| 308 |
-
4. Lowpass (3000Hz) to focus on speech band (telephone quality is sufficient for text)
|
| 309 |
-
5. AFFTDN (FFT-based noise reduction)
|
| 310 |
-
6. Dynaudnorm (Dynamic Audio Normalizer) to boost quiet speech
|
| 311 |
-
7. Compression (generic)
|
| 312 |
|
| 313 |
-
Returns path to processed .mp3 file
|
| 314 |
"""
|
| 315 |
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
output_file = input_file + ".processed.mp3"
|
| 323 |
-
|
| 324 |
-
# Complex filter chain
|
| 325 |
-
# 1. afftdn: Denoise using FFT (requires noise profile, but default 'nr' often works blindly) - wait, rnnoise is better if available, but afftdn is standard.
|
| 326 |
-
# Actually, simplistic filters are safer to avoid artifacts.
|
| 327 |
-
# highpass=f=200, lowpass=f=3000, afftdn=nf=-25, dynaudnorm=f=150:g=15
|
| 328 |
-
# afftdn might not be available in all builds. Let's stick to safe filters first.
|
| 329 |
-
|
| 330 |
-
# Safe Filter Chain:
|
| 331 |
-
# 1. silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-90dB (optional, maybe skip)
|
| 332 |
-
# 2. highpass=f=200, lowpass=f=3000 (Bandpass)
|
| 333 |
-
# 3. dynaudnorm (Normalize loudness dynamically)
|
| 334 |
-
# 4. volume=1.5 (Boost a bit globally)
|
| 335 |
-
|
| 336 |
-
# "Retire qualquer música de fundo" -> Extremely hard without AI like Spleeter.
|
| 337 |
-
# To reduce music impact without destroying voice, we use a Gentler Vocal EQ.
|
| 338 |
-
# We essentially attenuate frequencies where music dominates (Sub-bass, huge highs)
|
| 339 |
-
# and normalize volume using EBU R128 (loudnorm) which is more natural than dynaudnorm.
|
| 340 |
-
|
| 341 |
-
# Filter Chain Strategy (Simplified):
|
| 342 |
-
# 1. Highpass (200Hz) - Cut rumble/bass lines
|
| 343 |
-
# 2. Lowpass (8000Hz) - Keep up to 8kHz for clarity (s, t, p sounds), cut cymbal shimmer.
|
| 344 |
-
# 3. Loudnorm - Standardize volume without aggressive pumping.
|
| 345 |
|
| 346 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
command = [
|
| 350 |
-
|
| 351 |
-
"-
|
| 352 |
-
"-
|
| 353 |
-
|
| 354 |
-
"-
|
| 355 |
-
"-ac", "1", # Mono
|
| 356 |
-
"-af", filters,
|
| 357 |
-
"-c:a", "libmp3lame",
|
| 358 |
-
"-q:a", "2", # High quality VBR
|
| 359 |
-
output_file
|
| 360 |
]
|
| 361 |
|
| 362 |
-
print(f"🔊
|
|
|
|
| 363 |
subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
except Exception as e:
|
| 369 |
-
print(f"⚠️ Falha
|
|
|
|
|
|
|
| 370 |
|
|
|
|
| 371 |
return input_file
|
|
|
|
| 300 |
|
| 301 |
def process_audio_for_transcription(input_file: str) -> str:
    """
    Isolate the vocal track of an audio file with Demucs before transcription.

    Demucs is run as a subprocess in two-stem mode (vocals / everything else)
    with the ``htdemucs`` model, writing into ``static/separated``. When the
    vocals stem is produced and ffmpeg is on PATH, the stem is re-encoded to a
    16 kHz mono MP3 next to the input file and the Demucs working folder for
    this input is removed.

    Args:
        input_file: Path to the audio file to process.

    Returns:
        One of, in order of preference:
        - ``input_file + ".vocals.mp3"`` when separation and MP3 conversion
          both succeed;
        - the path to Demucs' ``vocals.wav`` when ffmpeg is not on PATH;
        - ``input_file`` unchanged when anything fails (best-effort fallback),
          including the case where Demucs exits cleanly but the expected
          ``vocals.wav`` is missing.

    Raises:
        OSError: if the output directory cannot be created. Failures of the
            Demucs / ffmpeg steps are caught and reported, never raised.
    """
    # Local imports: only this function needs them, and the rest of the
    # file's import block is not visible from here.
    import sys
    import traceback

    print(f"🔊 [Demucs] Iniciando isolamento de voz via AI...")

    # Output directory for Demucs stems.
    output_dir = os.path.join("static", "separated")
    os.makedirs(output_dir, exist_ok=True)

    # `htdemucs` is the balanced default; `htdemucs_ft` is higher quality but
    # slower. NOTE: the first run downloads the model weights.
    model = "htdemucs"

    # Prefer the `demucs` console script; if it is not on PATH, run the module
    # with the current interpreter instead of retrying the missing executable.
    demucs_exe = shutil.which("demucs")
    launcher = [demucs_exe] if demucs_exe else [sys.executable, "-m", "demucs"]

    try:
        # --two-stems=vocals saves time by only splitting vocals from the rest.
        command = launcher + [
            "--two-stems=vocals",
            "-n", model,
            input_file,
            "-o", output_dir,
        ]

        print(f"🔊 Executando Demucs: {' '.join(command)}")
        # Separation can take a while on long inputs.
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Demucs output layout: output_dir / model / <input name, no ext> / vocals.wav
        input_stem = os.path.splitext(os.path.basename(input_file))[0]
        stem_dir = os.path.join(output_dir, model, input_stem)
        vocals_path = os.path.join(stem_dir, "vocals.wav")

        if os.path.exists(vocals_path):
            print(f"✅ Demucs sucesso: {vocals_path}")

            ffmpeg_cmd = shutil.which("ffmpeg")
            if not ffmpeg_cmd:
                # No ffmpeg available: hand back the WAV stem as-is. The stem
                # folder is kept because the returned file lives inside it.
                return vocals_path

            # Re-encode to 16 kHz mono MP3 to shrink the upload.
            final_output = input_file + ".vocals.mp3"
            cmd_convert = [
                ffmpeg_cmd, "-y",
                "-i", vocals_path,
                "-ac", "1", "-ar", "16000",
                "-c:a", "libmp3lame", "-q:a", "2",
                final_output,
            ]
            subprocess.run(
                cmd_convert,
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

            # Best-effort cleanup of the stems to save disk space; a failure
            # here must not discard the MP3 that was just produced.
            shutil.rmtree(stem_dir, ignore_errors=True)

            return final_output

        # Demucs exited with status 0 but the stem is not where we expect it.
        print(f"⚠️ Demucs não gerou o arquivo esperado: {vocals_path}")

    except Exception as e:
        # Deliberate best-effort boundary: transcription can still proceed on
        # the unprocessed audio, so report the failure and fall through.
        print(f"⚠️ Falha no Demucs: {e}")
        traceback.print_exc()

    print("⚠️ Retornando arquivo original (fallback)")
    return input_file
|