Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

crackuser committed on Sep 10, 2025

Commit

0d1b7fe

verified ·

1 Parent(s): 7fb0a37

Create audio_processor.py

Browse files

Files changed (1) hide show

audio_processor.py +226 -0

audio_processor.py ADDED Viewed

	@@ -0,0 +1,226 @@

+import numpy as np
+import librosa
+import soundfile as sf
+import noisereduce as nr
+from scipy import signal
+from scipy.signal import butter, filtfilt
+import tempfile
+import os
+from typing import Tuple, Optional
+import io
+class AudioProcessor:
+    """Advanced audio processing for voice cloning"""
+    def __init__(self):
+        self.target_sr = 22050
+    def preprocess_audio(self, audio: np.ndarray, sr: int) -> np.ndarray:
+        """Comprehensive audio preprocessing"""
+        # Resample to target sample rate
+        if sr != self.target_sr:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)
+        # Normalize amplitude
+        audio = self.normalize_audio(audio)
+        # Trim silence
+        audio = self.trim_silence(audio)
+        # Apply noise reduction
+        audio = self.reduce_noise(audio)
+        # Apply pre-emphasis filter
+        audio = self.apply_preemphasis(audio)
+        return audio
+    def normalize_audio(self, audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
+        """Normalize audio to target dB level"""
+        # Calculate RMS
+        rms = np.sqrt(np.mean(audio**2))
+        if rms > 0:
+            # Convert target dB to linear scale
+            target_rms = 10**(target_db / 20)
+            # Apply normalization
+            audio = audio * (target_rms / rms)
+            # Prevent clipping
+            max_val = np.max(np.abs(audio))
+            if max_val > 0.95:
+                audio = audio * (0.95 / max_val)
+        return audio
+    def trim_silence(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
+        """Trim silence from beginning and end"""
+        # Use librosa's trim function
+        trimmed_audio, _ = librosa.effects.trim(
+            audio,
+            top_db=-threshold_db,
+            frame_length=2048,
+            hop_length=512
+        )
+        return trimmed_audio
+    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
+        """Apply noise reduction"""
+        try:
+            # Use noisereduce library
+            reduced_noise = nr.reduce_noise(y=audio, sr=self.target_sr)
+            return reduced_noise
+        except:
+            # Fallback: simple high-pass filter
+            return self.apply_highpass_filter(audio, cutoff=80)
+    def apply_preemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
+        """Apply pre-emphasis filter"""
+        return signal.lfilter([1, -coeff], [1], audio)
+    def apply_deemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
+        """Apply de-emphasis filter"""
+        return signal.lfilter([1], [1, -coeff], audio)
+    def apply_highpass_filter(self, audio: np.ndarray, cutoff: float = 80) -> np.ndarray:
+        """Apply high-pass filter"""
+        nyquist = self.target_sr * 0.5
+        normal_cutoff = cutoff / nyquist
+        b, a = butter(5, normal_cutoff, btype='high', analog=False)
+        return filtfilt(b, a, audio)
+    def apply_lowpass_filter(self, audio: np.ndarray, cutoff: float = 8000) -> np.ndarray:
+        """Apply low-pass filter"""
+        nyquist = self.target_sr * 0.5
+        normal_cutoff = cutoff / nyquist
+        b, a = butter(5, normal_cutoff, btype='low', analog=False)
+        return filtfilt(b, a, audio)
+    def apply_fade(self, audio: np.ndarray, fade_duration: float = 0.01) -> np.ndarray:
+        """Apply fade in/out"""
+        fade_samples = int(fade_duration * self.target_sr)
+        if len(audio) > 2 * fade_samples:
+            # Fade in
+            fade_in = np.linspace(0, 1, fade_samples)
+            audio[:fade_samples] *= fade_in
+            # Fade out
+            fade_out = np.linspace(1, 0, fade_samples)
+            audio[-fade_samples:] *= fade_out
+        return audio
+    def enhance_audio(self, audio: np.ndarray) -> np.ndarray:
+        """Enhance audio quality"""
+        # Apply noise reduction
+        enhanced = self.reduce_noise(audio)
+        # Apply gentle compression
+        enhanced = self.apply_compression(enhanced)
+        # Apply EQ boost for clarity
+        enhanced = self.apply_eq_boost(enhanced)
+        # Final normalization
+        enhanced = self.normalize_audio(enhanced)
+        # Apply fade
+        enhanced = self.apply_fade(enhanced)
+        return enhanced
+    def apply_compression(self, audio: np.ndarray, threshold: float = 0.5, ratio: float = 4.0) -> np.ndarray:
+        """Apply dynamic range compression"""
+        # Simple compression algorithm
+        compressed = audio.copy()
+        # Find samples above threshold
+        above_threshold = np.abs(compressed) > threshold
+        # Apply compression to samples above threshold
+        compressed[above_threshold] = np.sign(compressed[above_threshold]) * (
+            threshold + (np.abs(compressed[above_threshold]) - threshold) / ratio
+        )
+        return compressed
+    def apply_eq_boost(self, audio: np.ndarray) -> np.ndarray:
+        """Apply EQ boost for vocal clarity"""
+        # Boost frequencies important for speech (1-4 kHz)
+        # This is a simplified EQ - would use more sophisticated filtering in practice
+        # High-pass filter to remove low frequency noise
+        audio = self.apply_highpass_filter(audio, cutoff=85)
+        # Gentle low-pass to prevent harsh highs
+        audio = self.apply_lowpass_filter(audio, cutoff=7500)
+        return audio
+    def pitch_shift(self, audio: np.ndarray, semitones: float) -> np.ndarray:
+        """Shift pitch by semitones"""
+        return librosa.effects.pitch_shift(audio, sr=self.target_sr, n_steps=semitones)
+    def time_stretch(self, audio: np.ndarray, rate: float) -> np.ndarray:
+        """Change playback speed without affecting pitch"""
+        return librosa.effects.time_stretch(audio, rate=rate)
+    def detect_voice_activity(self, audio: np.ndarray, frame_duration: float = 0.025) -> np.ndarray:
+        """Detect voice activity in audio"""
+        frame_length = int(frame_duration * self.target_sr)
+        hop_length = frame_length // 2
+        # Calculate energy for each frame
+        energy = []
+        for i in range(0, len(audio) - frame_length + 1, hop_length):
+            frame = audio[i:i + frame_length]
+            frame_energy = np.sum(frame ** 2)
+            energy.append(frame_energy)
+        energy = np.array(energy)
+        # Simple threshold-based VAD
+        threshold = np.mean(energy) * 0.1
+        voice_activity = energy > threshold
+        return voice_activity
+    @staticmethod
+    def audio_to_bytes(audio: np.ndarray, sample_rate: int) -> bytes:
+        """Convert audio array to bytes for streaming"""
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+            sf.write(tmp_file.name, audio, sample_rate)
+            with open(tmp_file.name, 'rb') as f:
+                audio_bytes = f.read()
+            # Clean up
+            os.unlink(tmp_file.name)
+            return audio_bytes
+    @staticmethod
+    def bytes_to_audio(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
+        """Convert bytes to audio array"""
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+            tmp_file.write(audio_bytes)
+            tmp_file.flush()
+            audio, sr = librosa.load(tmp_file.name, sr=None)
+            # Clean up
+            os.unlink(tmp_file.name)
+            return audio, sr