Spaces:

aryan083
/

Speech-To-Text

Sleeping

File size: 7,978 Bytes

312e168

import librosa
import numpy as np
import soundfile as sf
from typing import Union, Tuple, Optional
import logging
import os
from pathlib import Path

class AudioProcessor:
    """
    Audio processing utilities for speech-to-text preprocessing.

    Loads, cleans, validates, converts and splits audio before it is handed
    to a speech recognition model. Decoding and DSP go through ``librosa``;
    writing goes through ``soundfile``.

    Attributes:
        target_sr: Sample rate (Hz) that audio is loaded at / resampled to.
        logger: Logger used for warnings and errors raised while processing.
    """

    # Largest file size accepted by validate_audio, in MB (1 MB = 1024 * 1024 bytes).
    MAX_FILE_SIZE_MB = 100
    # Length of the all-zero placeholder returned when no usable audio exists.
    FALLBACK_SAMPLES = 1024

    def __init__(self, target_sr: int = 16000):
        """
        Create a processor.

        Args:
            target_sr: Sample rate in Hz used for loading and resampling
        """
        self.target_sr = target_sr
        self.logger = logging.getLogger(__name__)

    def preprocess_audio(self, audio_input: Union[str, os.PathLike, np.ndarray],
                        normalize: bool = True,
                        trim_silence: bool = True,
                        noise_reduction: bool = False,
                        input_sr: Optional[int] = None) -> np.ndarray:
        """
        Preprocess audio for optimal speech recognition.

        Steps run in this order: load/resample, amplitude normalization,
        silence trimming (``top_db=20``), then optional noise reduction.

        Args:
            audio_input: Path to audio file (``str`` or path-like) or numpy array
            normalize: Whether to normalize audio amplitude
            trim_silence: Whether to trim silence from beginning/end
            noise_reduction: Whether to apply basic noise reduction
            input_sr: Sample rate of ``audio_input`` when it is an array.
                ``None`` (default) means the array is already at
                ``target_sr``. Ignored for file paths, which are resampled
                while loading.

        Returns:
            Preprocessed audio as numpy array. Never empty: if the audio is
            empty before or after processing, or a file cannot be processed,
            an all-zero array of ``FALLBACK_SAMPLES`` samples is returned.
            If processing an array input fails, the input array is returned
            unchanged.
        """
        try:
            # Load audio if it's a file path (str or pathlib.Path alike)
            if isinstance(audio_input, (str, os.PathLike)):
                audio, sr = librosa.load(audio_input, sr=self.target_sr)
            else:
                audio = audio_input
                sr = self.target_sr if input_sr is None else input_sr

            # Guard first: the processing steps below are not guaranteed to
            # accept zero-length input, and a failure there would hand the
            # empty array straight back to the caller.
            if np.size(audio) == 0:
                self.logger.warning("Audio input is empty")
                return np.zeros(self.FALLBACK_SAMPLES)

            # Resample if needed (only possible for arrays with input_sr set)
            if sr != self.target_sr:
                audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)

            # Normalize audio
            if normalize:
                audio = librosa.util.normalize(audio)

            # Trim silence
            if trim_silence:
                audio, _ = librosa.effects.trim(audio, top_db=20)

            # Basic noise reduction using spectral gating
            if noise_reduction:
                audio = self._reduce_noise(audio)

            # Ensure audio is not empty (np.size also catches (channels, 0))
            if np.size(audio) == 0:
                self.logger.warning("Audio is empty after preprocessing")
                return np.zeros(self.FALLBACK_SAMPLES)  # Return minimal audio

            return audio

        except Exception as e:
            # Deliberate best-effort: never raise to the caller.
            self.logger.error("Audio preprocessing error: %s", e)
            # Return original audio or minimal fallback
            if isinstance(audio_input, np.ndarray):
                return audio_input
            return np.zeros(self.FALLBACK_SAMPLES)

    def _reduce_noise(self, audio: np.ndarray, noise_factor: float = 0.1) -> np.ndarray:
        """
        Simple noise reduction using spectral subtraction.

        The noise spectrum is estimated as the mean magnitude of the first
        few STFT frames, scaled by ``noise_factor`` and subtracted from every
        frame. The result is floored at 10% of the original magnitude.

        Args:
            audio: Input audio signal
            noise_factor: Factor for noise reduction (0.0 to 1.0)

        Returns:
            Noise-reduced audio with the same number of samples as the input,
            or the input unchanged if noise reduction fails.
        """
        try:
            # Compute STFT
            stft = librosa.stft(audio)
            magnitude = np.abs(stft)
            phase = np.angle(stft)

            n_frames = magnitude.shape[-1]
            if n_frames == 0:
                return audio

            # Estimate noise from first few frames. Always use at least one
            # frame: averaging an empty slice yields NaN, which would turn
            # the whole output into NaN for very short clips.
            noise_frames = max(1, min(10, n_frames // 4))
            noise_profile = np.mean(magnitude[..., :noise_frames], axis=-1, keepdims=True)

            # Spectral subtraction, floored so no bin is attenuated below 10%
            clean_magnitude = magnitude - noise_factor * noise_profile
            clean_magnitude = np.maximum(clean_magnitude, 0.1 * magnitude)

            # Reconstruct audio with the original phase; pin the length so
            # the sample count matches the input.
            clean_stft = clean_magnitude * np.exp(1j * phase)
            clean_audio = librosa.istft(clean_stft, length=audio.shape[-1])

            return clean_audio

        except Exception as e:
            self.logger.warning("Noise reduction failed: %s", e)
            return audio

    def validate_audio(self, audio_path: str) -> Tuple[bool, str]:
        """
        Validate audio file for processing.

        Checks, in order: the path exists, it is a regular file, it is not
        empty, it does not exceed ``MAX_FILE_SIZE_MB``, and its first second
        can be decoded.

        Args:
            audio_path: Path to audio file

        Returns:
            Tuple of (is_valid, message)
        """
        try:
            if not os.path.exists(audio_path):
                return False, "Audio file does not exist"

            # A directory "exists" too; reject it with a clear message
            # instead of a confusing decoder error further down.
            if not os.path.isfile(audio_path):
                return False, "Audio path is not a file"

            # Check file size
            file_size = os.path.getsize(audio_path)
            if file_size == 0:
                return False, "Audio file is empty"

            if file_size > self.MAX_FILE_SIZE_MB * 1024 * 1024:
                return False, f"Audio file too large (>{self.MAX_FILE_SIZE_MB}MB)"

            # Try to load audio
            try:
                # Load first second only; sr=None skips resampling, which is
                # irrelevant for a decodability check.
                audio, _ = librosa.load(audio_path, sr=None, duration=1.0)
                if len(audio) == 0:
                    return False, "Audio file contains no audio data"
            except Exception as e:
                return False, f"Cannot load audio file: {str(e)}"

            return True, "Audio file is valid"

        except Exception as e:
            return False, f"Audio validation error: {str(e)}"

    def get_audio_info(self, audio_path: str) -> dict:
        """
        Get information about audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with keys ``file_path``, ``file_size_mb``,
            ``duration_seconds``, ``sample_rate``, ``channels``, ``samples``
            (per channel) and ``format`` (lower-cased file suffix). On any
            failure, a dictionary with a single ``error`` key instead.
        """
        try:
            # Get file info
            file_size = os.path.getsize(audio_path)

            # Load audio to get properties. mono=False keeps the channel
            # axis; the default down-mix would always report one channel.
            audio, sr = librosa.load(audio_path, sr=None, mono=False)
            n_samples = audio.shape[-1]

            return {
                "file_path": audio_path,
                "file_size_mb": file_size / (1024 * 1024),
                "duration_seconds": n_samples / sr,
                "sample_rate": sr,
                "channels": 1 if audio.ndim == 1 else audio.shape[0],
                "samples": n_samples,
                "format": Path(audio_path).suffix.lower()
            }

        except Exception as e:
            return {
                "error": f"Cannot get audio info: {str(e)}"
            }

    def convert_audio_format(self, input_path: str, output_path: str,
                           target_format: str = "wav") -> bool:
        """
        Convert audio to different format.

        The audio is loaded through ``librosa.load`` at ``target_sr`` before
        being written, so the output is at ``target_sr``.

        Args:
            input_path: Input audio file path
            output_path: Output audio file path
            target_format: Target format (wav, mp3, flac, etc.). Case,
                surrounding whitespace and a leading dot are ignored.

        Returns:
            Success status
        """
        try:
            # Load audio
            audio, sr = librosa.load(input_path, sr=self.target_sr)

            # Accept "wav", ".wav" and " WAV " alike
            fmt = target_format.strip().lstrip(".").upper()

            # Save in target format
            sf.write(output_path, audio, sr, format=fmt)

            return True

        except Exception as e:
            self.logger.error("Audio conversion error: %s", e)
            return False

    def split_audio(self, audio_path: str, chunk_duration: int = 30,
                    min_chunk_duration: float = 1.0) -> list:
        """
        Split long audio into chunks for processing.

        Args:
            audio_path: Path to audio file
            chunk_duration: Duration of each chunk in seconds (int or float,
                must be positive)
            min_chunk_duration: Chunks that are not strictly longer than
                this many seconds are dropped (default 1.0)

        Returns:
            List of audio chunks as numpy arrays; empty list on any error
        """
        # Reject a bad duration up front instead of loading the whole file
        # only to fail (or loop zero times) afterwards.
        if chunk_duration <= 0:
            self.logger.error(
                "Audio splitting error: chunk_duration must be positive, got %s",
                chunk_duration,
            )
            return []

        try:
            # Load full audio
            audio, sr = librosa.load(audio_path, sr=self.target_sr)

            # Calculate chunk size in samples; int() because range() rejects
            # the float produced by a fractional chunk_duration.
            chunk_samples = int(chunk_duration * sr)
            if chunk_samples < 1:
                self.logger.error(
                    "Audio splitting error: chunk_duration %s is shorter than one sample",
                    chunk_duration,
                )
                return []

            min_samples = min_chunk_duration * sr

            # Split audio into chunks, keeping only those long enough
            pieces = (
                audio[start:start + chunk_samples]
                for start in range(0, len(audio), chunk_samples)
            )
            return [piece for piece in pieces if len(piece) > min_samples]

        except Exception as e:
            self.logger.error("Audio splitting error: %s", e)
            return []