Spaces:

ssasio
/

VOX33

Running

App Files Files Community

ssasio committed 23 days ago

Commit

8c2f37b

verified ·

1 Parent(s): 19a2e3d

Upload audio_enhance.py

Browse files

Files changed (1) hide show

audio_enhance.py +141 -0

audio_enhance.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""audio_enhance.py — Voice cloning audio preprocessor
+=======================================================
+Прилага 3 стъпки преди CODEC encode при клониране на глас:
+  1. Noise reduction   — премахва фонов шум (spectral gating)
+  2. De-essing         — намалява съскането (5–10 kHz notch)
+  3. Warming           — леко усилва ниските средни (200–800 Hz)
+Всичко работи само с numpy + scipy (без външни зависимости).
+noisereduce е опционален — ако не е инсталиран, се пропуска стъпка 1.
+"""
+import numpy as np
+from scipy import signal as sig
+# ── 1. Noise reduction ────────────────────────────────────────
def reduce_noise(audio: np.ndarray, sr: int, prop_decrease: float = 0.75) -> np.ndarray:
    """Suppress background noise with spectral gating.

    The work is delegated to the optional ``noisereduce`` package, run in
    non-stationary mode so the noise estimate is tracked over time (better
    for recordings whose noise floor varies). When the package is not
    installed, a notice is printed and the signal is passed through
    unprocessed.

    Args:
        audio: Input samples.
        sr: Sample rate in Hz.
        prop_decrease: Amount of noise to remove; 0.0 means no effect and
            1.0 means full suppression. The conservative default (0.75)
            keeps the voice sounding natural.

    Returns:
        The processed signal (or the unprocessed input when
        ``noisereduce`` is unavailable), always as float32.
    """
    # Only the import is guarded: an ImportError raised while actually
    # processing must not be reported as "package not found".
    try:
        import noisereduce as nr
    except ImportError:
        print("  [enhance] noisereduce не е намерен — пропускане на noise reduction")
        # Keep the return dtype identical to the success path.
        return np.asarray(audio, dtype=np.float32)

    reduced = nr.reduce_noise(
        y=audio,
        sr=sr,
        prop_decrease=prop_decrease,
        stationary=False,
        freq_mask_smooth_hz=500,
        time_mask_smooth_ms=50,
    )
    return reduced.astype(np.float32)
+# ── 2. De-esser (намаляване на съскане) ──────────────────────
def de_ess(audio: np.ndarray, sr: int,
           freq_low: float = 5000.0,
           freq_high: float = 10000.0,
           reduction_db: float = 6.0) -> np.ndarray:
    """Attenuate sibilance (harsh "s" sounds) inside a frequency band.

    A zero-phase 2nd-order Butterworth band-stop filter splits the signal
    into "everything except the band" and "the band only"; the band is
    then mixed back in at reduced level, so it is turned down rather than
    removed.

    The signal is returned unchanged (as a float32 copy) when the band
    cannot be filtered meaningfully: when ``freq_low`` is at or above the
    Nyquist frequency, or when the input is too short for ``filtfilt``'s
    edge padding.

    Args:
        audio: Input samples; filtering runs along the last axis.
        sr: Sample rate in Hz.
        freq_low: Lower edge of the sibilant band in Hz.
        freq_high: Upper edge of the sibilant band in Hz (clamped just
            below Nyquist when necessary).
        reduction_db: Attenuation applied to the band, in dB
            (6 dB is roughly half the amplitude).

    Returns:
        The de-essed signal as float32, same shape as the input.
    """
    audio = np.asarray(audio)
    nyq = sr / 2.0

    # The whole band lies above what this sample rate can represent.
    # Clamping it would attenuate an unrelated band just below Nyquist.
    if freq_low >= nyq:
        return audio.astype(np.float32)

    # Normalised band edges, clamped to the range butter() accepts.
    low = max(0.01, min(freq_low / nyq, 0.98))
    high = max(low + 0.01, min(freq_high / nyq, 0.99))
    b, a = sig.butter(2, [low, high], btype='bandstop')

    # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by
    # default and raises ValueError when the signal is not longer than that.
    n_samples = audio.shape[-1] if audio.ndim else 0
    if n_samples <= 3 * max(len(a), len(b)):
        return audio.astype(np.float32)

    filtered = sig.filtfilt(b, a, audio).astype(np.float32)

    # Linear amplitude factor for the suppressed band.
    gain = 10 ** (-reduction_db / 20.0)
    # filtfilt is zero-phase, so the difference is exactly the band content.
    sibilant = audio - filtered
    result = filtered + sibilant * gain
    return result.astype(np.float32)
+# ── 3. Warming (топлина на гласа) ────────────────────────────
def warm_voice(audio: np.ndarray, sr: int,
               freq_low: float = 200.0,
               freq_high: float = 800.0,
               boost_db: float = 2.5) -> np.ndarray:
    """Gently boost the low-mid range to make a voice sound warmer.

    The band is isolated with a zero-phase 2nd-order Butterworth band-pass
    filter and added back on top of the dry signal, scaled so that the
    centre of the band ends up ``boost_db`` louder.

    The signal is returned unchanged (as a float32 copy) when
    ``freq_low`` is at or above the Nyquist frequency, or when the input
    is too short for ``filtfilt``'s edge padding.

    Args:
        audio: Input samples; filtering runs along the last axis.
        sr: Sample rate in Hz.
        freq_low: Lower edge of the boosted band in Hz.
        freq_high: Upper edge of the boosted band in Hz.
        boost_db: Boost in dB (2-3 dB sounds natural, above 4 dB is
            too much).

    Returns:
        The warmed signal as float32, same shape as the input.
    """
    audio = np.asarray(audio)
    nyq = sr / 2.0

    # The band is not representable at this sample rate.
    if freq_low >= nyq:
        return audio.astype(np.float32)

    # Guards only keep the edges strictly inside (0, 1) and in order.
    # They are deliberately tiny: a floor of 0.01 (in units of Nyquist)
    # would move the 200 Hz edge to 240 Hz at 48 kHz, and further at
    # higher sample rates.
    min_norm = 1e-4
    low = min(max(freq_low / nyq, min_norm), 0.98)
    high = min(max(freq_high / nyq, low + min_norm), 0.99)

    b, a = sig.butter(2, [low, high], btype='bandpass')

    # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by
    # default and raises ValueError when the signal is not longer than that.
    n_samples = audio.shape[-1] if audio.ndim else 0
    if n_samples <= 3 * max(len(a), len(b)):
        return audio.astype(np.float32)

    warm_band = sig.filtfilt(b, a, audio).astype(np.float32)

    # The dry signal already contributes unity gain, so only the excess
    # (linear gain - 1) of the band is added.
    gain = 10 ** (boost_db / 20.0) - 1.0
    result = audio + warm_band * gain
    return result.astype(np.float32)
+# ── 4. Нормализация ───────────────────────────────────────────
def normalize(audio: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
    """Peak-normalise a signal so the preceding EQ steps cannot clip.

    Args:
        audio: Input samples.
        target_peak: Absolute peak value the output is scaled to.

    Returns:
        The scaled signal as float32. Empty input and (near-)silent input
        with a peak of 1e-6 or less are returned unscaled.
    """
    audio = np.asarray(audio)
    # np.max raises ValueError on an empty array; there is nothing to scale.
    if audio.size == 0:
        return audio.astype(np.float32)

    peak = np.max(np.abs(audio))
    # Skip scaling for silence to avoid dividing by ~0 and amplifying
    # numerical noise.
    if peak > 1e-6:
        audio = audio / peak * target_peak
    return audio.astype(np.float32)
+# ── 5. Главна функция ─────────────────────────────────────────
def enhance_voice_for_cloning(
    audio: np.ndarray,
    sr: int,
    do_denoise: bool = True,
    do_deess: bool = True,
    do_warm: bool = True,
    denoise_strength: float = 0.75,   # 0.0-1.0
    deess_reduction_db: float = 6.0,  # dB of sibilance reduction
    warm_boost_db: float = 2.5,       # dB of low-mid boost
) -> np.ndarray:
    """Run the full clean-up pipeline on a reference voice before cloning.

    The input is converted to float32 mono, then the enabled steps run in
    a fixed order (noise reduction, de-essing, warming), followed by peak
    normalisation, which always runs. Progress is printed for each step.

    Args:
        audio: Input samples, mono or multi-channel.
        sr: Sample rate in Hz.
        do_denoise: Run the noise-reduction step.
        do_deess: Run the de-essing step.
        do_warm: Run the warming step.
        denoise_strength: Passed to ``reduce_noise`` as ``prop_decrease``.
        deess_reduction_db: Passed to ``de_ess`` as ``reduction_db``.
        warm_boost_db: Passed to ``warm_voice`` as ``boost_db``.

    Returns:
        The cleaned mono signal as float32.
    """
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        # Down-mix to mono. Loaders disagree on layout: some return
        # (channels, samples), others (samples, channels). For 2-D input
        # the shorter axis is taken to be the channel axis; ties and
        # higher-rank input keep the previous behaviour (axis 0).
        channel_axis = 0
        if audio.ndim == 2 and audio.shape[1] < audio.shape[0]:
            channel_axis = 1
        audio = audio.mean(axis=channel_axis)
    print(f"  [enhance] Входен сигнал: {len(audio)/sr:.1f}s @ {sr}Hz")

    if do_denoise:
        print(f"  [enhance] Noise reduction (сила={denoise_strength:.0%})...")
        audio = reduce_noise(audio, sr, prop_decrease=denoise_strength)
    if do_deess:
        print(f"  [enhance] De-essing ({deess_reduction_db:.0f}dB)...")
        audio = de_ess(audio, sr, reduction_db=deess_reduction_db)
    if do_warm:
        print(f"  [enhance] Warming (+{warm_boost_db:.1f}dB @ 200–800Hz)...")
        audio = warm_voice(audio, sr, boost_db=warm_boost_db)

    # Always normalise last so the EQ boost cannot push peaks past full scale.
    audio = normalize(audio)
    print("  [enhance] Готово ✓")
    return audio