Spaces:

habulaj
/

subapi

Running

App Files Files Community

habulaj committed 18 days ago

Commit

c13f0ec

verified ·

1 Parent(s): 1056927

Update srt_utils.py

Browse files

Files changed (1) hide show

srt_utils.py +68 -54

srt_utils.py CHANGED Viewed

@@ -300,72 +300,86 @@ import os
 def process_audio_for_transcription(input_file: str) -> str:
     """
     Clean up an audio file with FFmpeg so speech is easier to transcribe.

     The input is re-encoded as a 16 kHz mono MP3 (video streams are dropped)
     after passing through this filter chain:

     1. Highpass (200 Hz) - cuts rumble and bass lines.
     2. Lowpass (8000 Hz) - keeps consonant detail (s, t, p sounds) while
        cutting cymbal shimmer.
     3. Loudnorm - standardizes volume without aggressive pumping.

     This only attenuates the bands where music tends to dominate; it does
     not actually remove background music.

     Args:
         input_file: Path to the source media file.

     Returns:
         Path to ``<input_file>.processed.mp3`` when FFmpeg succeeds.
         ``input_file`` itself is returned unchanged when FFmpeg is not
         installed or the conversion fails, so callers always get a usable
         path back.
     """
     ffmpeg_cmd = shutil.which("ffmpeg")
     if not ffmpeg_cmd:
         print("⚠️ FFmpeg não encontrado. Pulando processamento de áudio.")
         return input_file

     output_file = input_file + ".processed.mp3"
     filters = "highpass=f=200,lowpass=f=8000,loudnorm"

     try:
         command = [
             ffmpeg_cmd,
             "-y",  # overwrite an output left over from a previous run
             "-i", input_file,
             "-vn",  # drop any video stream
             "-ar", "16000",  # 16 kHz sample rate
             "-ac", "1",  # mono
             "-af", filters,
             "-c:a", "libmp3lame",
             "-q:a", "2",  # high-quality VBR
             output_file,
         ]
         print(f"🔊 Processando áudio com FFmpeg (Gentle EQ + Loudnorm): {' '.join(command)}")
         subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
         if os.path.exists(output_file):
             return output_file
     except (subprocess.SubprocessError, OSError, ValueError) as e:
         # Best effort: any FFmpeg/launch failure falls back to the raw input.
         print(f"⚠️ Falha ao processar áudio: {e}")

     return input_file

 def process_audio_for_transcription(input_file: str) -> str:
     """
     Isolate the vocal track of an audio file with Demucs before transcription.

     Demucs is run in two-stem mode (vocals / everything else) with the
     ``htdemucs`` model, writing into ``static/separated``. When FFmpeg is
     available the isolated vocals are then compressed to a 16 kHz mono MP3
     and the intermediate Demucs folder for this track is deleted.

     NOTE: according to the original author's notes, the first Demucs run
     downloads the model, so it can be slow.

     Args:
         input_file: Path to the source media file.

     Returns:
         - ``<input_file>.vocals.mp3`` when separation and MP3 conversion
           both succeed.
         - The Demucs ``vocals.wav`` path when separation succeeds but FFmpeg
           is missing or the MP3 conversion fails.
         - ``input_file`` unchanged when the file does not exist, Demucs
           cannot be run, or Demucs produces no vocals file.
     """
     import sys
     import traceback

     # Nothing to separate: skip launching Demucs for a missing file.
     if not os.path.isfile(input_file):
         print(f"⚠️ Arquivo de entrada não encontrado: {input_file}")
         print("⚠️ Retornando arquivo original (fallback)")
         return input_file

     print("🔊 [Demucs] Iniciando isolamento de voz via AI...")

     output_dir = os.path.join("static", "separated")
     model = "htdemucs"

     # Prefer the console script; if it is not on PATH, run the module with
     # the current interpreter instead of retrying the same missing command.
     demucs_path = shutil.which("demucs")
     demucs_cmd = [demucs_path] if demucs_path else [sys.executable, "-m", "demucs"]
     command = demucs_cmd + [
         "--two-stems=vocals",  # only split vocals vs. the rest
         "-n", model,
         input_file,
         "-o", output_dir,
     ]

     try:
         os.makedirs(output_dir, exist_ok=True)
         print(f"🔊 Executando Demucs: {' '.join(command)}")
         subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     except (subprocess.SubprocessError, OSError, ValueError) as e:
         print(f"⚠️ Falha no Demucs: {e}")
         traceback.print_exc()
         print("⚠️ Retornando arquivo original (fallback)")
         return input_file

     # Demucs output layout: output_dir / model / <input name without ext> / vocals.wav
     input_stem = os.path.splitext(os.path.basename(input_file))[0]
     track_dir = os.path.join(output_dir, model, input_stem)
     vocals_path = os.path.join(track_dir, "vocals.wav")
     if not os.path.exists(vocals_path):
         print("⚠️ Retornando arquivo original (fallback)")
         return input_file

     print(f"✅ Demucs sucesso: {vocals_path}")

     ffmpeg_cmd = shutil.which("ffmpeg")
     if not ffmpeg_cmd:
         # Without FFmpeg the WAV itself is handed back (and therefore kept).
         return vocals_path

     # Compress to a 16 kHz mono MP3 to keep the upload small.
     final_output = input_file + ".vocals.mp3"
     cmd_convert = [
         ffmpeg_cmd, "-y",
         "-i", vocals_path,
         "-ac", "1", "-ar", "16000",
         "-c:a", "libmp3lame", "-q:a", "2",
         final_output,
     ]
     try:
         subprocess.run(cmd_convert, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     except (subprocess.SubprocessError, OSError, ValueError) as e:
         # The separation already succeeded; don't throw the vocals away
         # just because the optional compression step failed.
         print(f"⚠️ Falha ao converter vocais para MP3: {e}")
         return vocals_path

     # The MP3 replaces the stems, so free the disk space they use.
     shutil.rmtree(track_dir, ignore_errors=True)
     return final_output