Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Community

crackuser committed on Sep 10, 2025

Commit

4ec8b62

verified ·

1 Parent(s): 02e1276

Delete audio_processor.py

Browse files

Files changed (1) hide show

audio_processor.py +0 -226

audio_processor.py DELETED Viewed

@@ -1,226 +0,0 @@
-import numpy as np
-import librosa
-import soundfile as sf
-import noisereduce as nr
-from scipy import signal
-from scipy.signal import butter, filtfilt
-import tempfile
-import os
-from typing import Tuple, Optional
-import io
-class AudioProcessor:
-    """Advanced audio processing for voice cloning"""
-    def __init__(self):
-        self.target_sr = 22050
-    def preprocess_audio(self, audio: np.ndarray, sr: int) -> np.ndarray:
-        """Comprehensive audio preprocessing"""
-        # Resample to target sample rate
-        if sr != self.target_sr:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)
-        # Normalize amplitude
-        audio = self.normalize_audio(audio)
-        # Trim silence
-        audio = self.trim_silence(audio)
-        # Apply noise reduction
-        audio = self.reduce_noise(audio)
-        # Apply pre-emphasis filter
-        audio = self.apply_preemphasis(audio)
-        return audio
-    def normalize_audio(self, audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
-        """Normalize audio to target dB level"""
-        # Calculate RMS
-        rms = np.sqrt(np.mean(audio**2))
-        if rms > 0:
-            # Convert target dB to linear scale
-            target_rms = 10**(target_db / 20)
-            # Apply normalization
-            audio = audio * (target_rms / rms)
-            # Prevent clipping
-            max_val = np.max(np.abs(audio))
-            if max_val > 0.95:
-                audio = audio * (0.95 / max_val)
-        return audio
-    def trim_silence(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
-        """Trim silence from beginning and end"""
-        # Use librosa's trim function
-        trimmed_audio, _ = librosa.effects.trim(
-            audio,
-            top_db=-threshold_db,
-            frame_length=2048,
-            hop_length=512
-        )
-        return trimmed_audio
-    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
-        """Apply noise reduction"""
-        try:
-            # Use noisereduce library
-            reduced_noise = nr.reduce_noise(y=audio, sr=self.target_sr)
-            return reduced_noise
-        except:
-            # Fallback: simple high-pass filter
-            return self.apply_highpass_filter(audio, cutoff=80)
-    def apply_preemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
-        """Apply pre-emphasis filter"""
-        return signal.lfilter([1, -coeff], [1], audio)
-    def apply_deemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
-        """Apply de-emphasis filter"""
-        return signal.lfilter([1], [1, -coeff], audio)
-    def apply_highpass_filter(self, audio: np.ndarray, cutoff: float = 80) -> np.ndarray:
-        """Apply high-pass filter"""
-        nyquist = self.target_sr * 0.5
-        normal_cutoff = cutoff / nyquist
-        b, a = butter(5, normal_cutoff, btype='high', analog=False)
-        return filtfilt(b, a, audio)
-    def apply_lowpass_filter(self, audio: np.ndarray, cutoff: float = 8000) -> np.ndarray:
-        """Apply low-pass filter"""
-        nyquist = self.target_sr * 0.5
-        normal_cutoff = cutoff / nyquist
-        b, a = butter(5, normal_cutoff, btype='low', analog=False)
-        return filtfilt(b, a, audio)
-    def apply_fade(self, audio: np.ndarray, fade_duration: float = 0.01) -> np.ndarray:
-        """Apply fade in/out"""
-        fade_samples = int(fade_duration * self.target_sr)
-        if len(audio) > 2 * fade_samples:
-            # Fade in
-            fade_in = np.linspace(0, 1, fade_samples)
-            audio[:fade_samples] *= fade_in
-            # Fade out
-            fade_out = np.linspace(1, 0, fade_samples)
-            audio[-fade_samples:] *= fade_out
-        return audio
-    def enhance_audio(self, audio: np.ndarray) -> np.ndarray:
-        """Enhance audio quality"""
-        # Apply noise reduction
-        enhanced = self.reduce_noise(audio)
-        # Apply gentle compression
-        enhanced = self.apply_compression(enhanced)
-        # Apply EQ boost for clarity
-        enhanced = self.apply_eq_boost(enhanced)
-        # Final normalization
-        enhanced = self.normalize_audio(enhanced)
-        # Apply fade
-        enhanced = self.apply_fade(enhanced)
-        return enhanced
-    def apply_compression(self, audio: np.ndarray, threshold: float = 0.5, ratio: float = 4.0) -> np.ndarray:
-        """Apply dynamic range compression"""
-        # Simple compression algorithm
-        compressed = audio.copy()
-        # Find samples above threshold
-        above_threshold = np.abs(compressed) > threshold
-        # Apply compression to samples above threshold
-        compressed[above_threshold] = np.sign(compressed[above_threshold]) * (
-            threshold + (np.abs(compressed[above_threshold]) - threshold) / ratio
-        )
-        return compressed
-    def apply_eq_boost(self, audio: np.ndarray) -> np.ndarray:
-        """Apply EQ boost for vocal clarity"""
-        # Boost frequencies important for speech (1-4 kHz)
-        # This is a simplified EQ - would use more sophisticated filtering in practice
-        # High-pass filter to remove low frequency noise
-        audio = self.apply_highpass_filter(audio, cutoff=85)
-        # Gentle low-pass to prevent harsh highs
-        audio = self.apply_lowpass_filter(audio, cutoff=7500)
-        return audio
-    def pitch_shift(self, audio: np.ndarray, semitones: float) -> np.ndarray:
-        """Shift pitch by semitones"""
-        return librosa.effects.pitch_shift(audio, sr=self.target_sr, n_steps=semitones)
-    def time_stretch(self, audio: np.ndarray, rate: float) -> np.ndarray:
-        """Change playback speed without affecting pitch"""
-        return librosa.effects.time_stretch(audio, rate=rate)
-    def detect_voice_activity(self, audio: np.ndarray, frame_duration: float = 0.025) -> np.ndarray:
-        """Detect voice activity in audio"""
-        frame_length = int(frame_duration * self.target_sr)
-        hop_length = frame_length // 2
-        # Calculate energy for each frame
-        energy = []
-        for i in range(0, len(audio) - frame_length + 1, hop_length):
-            frame = audio[i:i + frame_length]
-            frame_energy = np.sum(frame ** 2)
-            energy.append(frame_energy)
-        energy = np.array(energy)
-        # Simple threshold-based VAD
-        threshold = np.mean(energy) * 0.1
-        voice_activity = energy > threshold
-        return voice_activity
-    @staticmethod
-    def audio_to_bytes(audio: np.ndarray, sample_rate: int) -> bytes:
-        """Convert audio array to bytes for streaming"""
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
-            sf.write(tmp_file.name, audio, sample_rate)
-            with open(tmp_file.name, 'rb') as f:
-                audio_bytes = f.read()
-            # Clean up
-            os.unlink(tmp_file.name)
-            return audio_bytes
-    @staticmethod
-    def bytes_to_audio(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
-        """Convert bytes to audio array"""
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
-            tmp_file.write(audio_bytes)
-            tmp_file.flush()
-            audio, sr = librosa.load(tmp_file.name, sr=None)
-            # Clean up
-            os.unlink(tmp_file.name)
-            return audio, sr