Spaces:

Pratik333
/

Speech_recognition

Build error

File size: 3,544 Bytes

4935b2c

"""
Lightweight audio preprocessing for speaker recognition.
MINIMAL processing to preserve voice characteristics.
"""

import numpy as np
import librosa
from scipy import signal
import warnings
warnings.filterwarnings('ignore')

# noisereduce is treated as an optional dependency: HAS_NOISEREDUCE records
# whether the import succeeded, and a hint is printed when it did not.
try:
    import noisereduce as nr
    HAS_NOISEREDUCE = True
except ImportError:
    HAS_NOISEREDUCE = False
    print("⚠️ Install noisereduce for better results: pip install noisereduce")


class AudioPreprocessor:
    """Lightweight preprocessing - preserves voice characteristics.

    The pipeline is intentionally minimal: mono down-mix, resampling to the
    target rate, DC-offset removal, peak normalization and (in 'standard'
    mode only) a mild stationary noise reduction.
    """

    def __init__(self, sample_rate=16000):
        """Store the target sample rate (Hz) that ``process`` resamples to."""
        self.sample_rate = sample_rate

    def process(self, audio, sr=None, mode='light'):
        """
        Minimal preprocessing pipeline.

        Args:
            audio: audio samples as a numpy array (or array-like). 1-D mono,
                or 2-D multichannel in either (samples, channels) or
                (channels, samples) layout. Integer/bool input is converted
                to float64.
            sr: sample rate of ``audio``; defaults to the target sample rate
            mode: 'light' for enrollment, 'standard' for identification
                (adds light noise reduction when noisereduce is available)

        Returns:
            preprocessed mono audio (numpy array), peak-normalized to
            [-1, 1]. Empty input yields an empty 1-D array.
        """
        if sr is None:
            sr = self.sample_rate

        audio = np.asarray(audio)

        # Integer PCM / bool input: work in floating point so that resampling
        # and the in-place arithmetic below behave consistently.
        if audio.dtype.kind in 'biu':
            audio = audio.astype(np.float64)

        # Nothing to process; also avoids max()/mean() on an empty array.
        if audio.size == 0:
            return audio.reshape(0)

        # Step 1: Convert to mono if multichannel.
        # This must happen BEFORE resampling: resampling operates along the
        # last axis, which for (samples, channels) data is the channel axis.
        if audio.ndim > 1:
            audio = self._to_mono(audio)

        # Step 2: Resample if needed
        if sr != self.sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)
            sr = self.sample_rate

        # Step 3: Remove DC offset
        audio = audio - np.mean(audio)

        # Step 4: Normalize amplitude
        audio = self._normalize(audio)

        # Step 5: Light noise reduction ONLY if mode is standard, the optional
        # dependency is present, and the clip is longer than half a second.
        if mode == 'standard' and HAS_NOISEREDUCE and len(audio) > sr * 0.5:
            audio = self._reduce_noise_light(audio, sr)

        # Step 6: Final normalization (noise reduction may change the peak)
        audio = self._normalize(audio)

        return audio

    def _to_mono(self, audio):
        """Average the channels of a multichannel array.

        For 2-D input the shorter axis is taken to be the channel axis, so
        both (samples, channels) and (channels, samples) layouts work. In all
        other cases axis 1 is averaged, as before.
        """
        if audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
            channel_axis = 0
        else:
            channel_axis = 1
        return audio.mean(axis=channel_axis)

    def _normalize(self, audio):
        """Normalize audio to [-1, 1] range.

        Empty and all-zero signals are returned unchanged.
        """
        if audio.size == 0:
            return audio
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak
        return audio

    def _reduce_noise_light(self, audio, sr):
        """LIGHT noise reduction - preserves voice characteristics.

        Best-effort: if noise reduction fails for any reason, a message is
        printed and the input is returned unchanged.
        """
        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.5,  # Only 50% reduction (1.0 would be 100%)
                freq_mask_smooth_hz=1000,
                time_mask_smooth_ms=100
            )
            return reduced
        except Exception as e:
            # Deliberately broad: a failure here must not abort preprocessing.
            print(f"⚠️ Noise reduction skipped: {e}")
            return audio


# Module-wide AudioPreprocessor, created lazily (None until first requested)
_preprocessor = None

def get_preprocessor():
    """Return the shared preprocessor, building it on first use."""
    global _preprocessor
    if _preprocessor is not None:
        return _preprocessor
    _preprocessor = AudioPreprocessor()
    return _preprocessor


def preprocess_audio(audio, sr=16000, for_enrollment=False):
    """
    Run audio through the shared preprocessor.

    Args:
        audio: numpy array
        sr: sample rate
        for_enrollment: if True, use 'light' mode (preserves voice);
            otherwise use 'standard' mode (light noise reduction)

    Returns:
        preprocessed audio
    """
    # Enrollment keeps processing minimal; identification uses standard mode.
    selected_mode = 'light' if for_enrollment else 'standard'
    return get_preprocessor().process(audio, sr, mode=selected_mode)