# Speech-To-Text/src/utils/audio_utils.py
# aryan083 - "added basic ENGLISH STT" (commit 312e168)
import librosa
import numpy as np
import soundfile as sf
from typing import Union, Tuple, Optional
import logging
import os
from pathlib import Path
class AudioProcessor:
    """
    Audio processing utilities for speech-to-text preprocessing.

    Loads, cleans, inspects, converts and chunks audio before it is handed to
    a speech recognizer. Decoding and signal processing go through
    ``librosa``; writing goes through ``soundfile``.

    Every public method is best-effort: failures are logged or reported in the
    return value instead of being raised.
    """

    # Length of the all-zero placeholder returned when no usable audio exists.
    _FALLBACK_SAMPLES = 1024
    # Largest file accepted by validate_audio (100MB).
    _MAX_FILE_BYTES = 100 * 1024 * 1024

    def __init__(self, target_sr: int = 16000):
        """
        Create a processor.

        Args:
            target_sr: Sample rate (Hz) that audio is loaded at / resampled to.
        """
        self.target_sr = target_sr
        self.logger = logging.getLogger(__name__)

    def preprocess_audio(self, audio_input: Union[str, os.PathLike, np.ndarray],
                         normalize: bool = True,
                         trim_silence: bool = True,
                         noise_reduction: bool = False,
                         input_sr: Optional[int] = None) -> np.ndarray:
        """
        Preprocess audio for optimal speech recognition.

        Steps run in this order: load/resample, normalize, trim silence,
        noise reduction.

        Args:
            audio_input: Path to audio file (``str`` or path-like) or numpy array
            normalize: Whether to normalize audio amplitude
            trim_silence: Whether to trim silence from beginning/end
            noise_reduction: Whether to apply basic noise reduction
            input_sr: Sample rate of ``audio_input`` when it is an array.
                ``None`` (default) means the array is already at
                ``target_sr``. Ignored for file paths, which are resampled
                while loading.

        Returns:
            Preprocessed audio as numpy array. If the result is empty, or a
            file could not be processed, an all-zero array of 1024 samples is
            returned; if processing an array fails, the array is returned
            unchanged.
        """
        try:
            if isinstance(audio_input, (str, os.PathLike)):
                # librosa resamples to target_sr while decoding.
                audio, _ = librosa.load(audio_input, sr=self.target_sr)
            else:
                audio = audio_input
                # Arrays carry no sample rate of their own, so only resample
                # when the caller states a rate that differs from the target.
                if input_sr is not None and input_sr != self.target_sr:
                    audio = librosa.resample(
                        audio, orig_sr=input_sr, target_sr=self.target_sr
                    )

            if normalize:
                audio = librosa.util.normalize(audio)

            if trim_silence:
                audio, _ = librosa.effects.trim(audio, top_db=20)

            if noise_reduction:
                audio = self._reduce_noise(audio)

            if len(audio) == 0:
                self.logger.warning("Audio is empty after preprocessing")
                return np.zeros(self._FALLBACK_SAMPLES)  # Return minimal audio

            return audio
        except Exception as e:
            # Deliberate best-effort boundary: never let preprocessing crash
            # the caller; hand back the untouched input or a silent stub.
            self.logger.error("Audio preprocessing error: %s", e)
            if isinstance(audio_input, np.ndarray):
                return audio_input
            return np.zeros(self._FALLBACK_SAMPLES)

    def _reduce_noise(self, audio: np.ndarray, noise_factor: float = 0.1) -> np.ndarray:
        """
        Simple noise reduction using spectral subtraction.

        The noise spectrum is estimated as the mean magnitude of the first
        frames of the signal (up to 10, at most a quarter of all frames, but
        always at least one) and a scaled copy is subtracted from every frame.

        Args:
            audio: Input audio signal
            noise_factor: Factor for noise reduction (0.0 to 1.0)

        Returns:
            Noise-reduced audio with the same number of samples as ``audio``,
            or ``audio`` itself if the reduction fails.
        """
        try:
            stft = librosa.stft(audio)
            magnitude = np.abs(stft)
            phase = np.angle(stft)

            # Use at least one frame: with fewer than 4 frames the quarter
            # rule yields 0, and the mean of an empty slice is NaN, which
            # would turn the whole output into NaN.
            noise_frames = max(1, min(10, magnitude.shape[1] // 4))
            noise_profile = np.mean(magnitude[:, :noise_frames], axis=1, keepdims=True)

            # Spectral subtraction, floored at 10% of the original magnitude
            # so no bin is driven to zero or below.
            clean_magnitude = magnitude - noise_factor * noise_profile
            clean_magnitude = np.maximum(clean_magnitude, 0.1 * magnitude)

            # Rebuild the signal with the original phase; pin the length so
            # the output lines up sample-for-sample with the input.
            clean_stft = clean_magnitude * np.exp(1j * phase)
            return librosa.istft(clean_stft, length=len(audio))
        except Exception as e:
            self.logger.warning("Noise reduction failed: %s", e)
            return audio

    def validate_audio(self, audio_path: str) -> Tuple[bool, str]:
        """
        Validate audio file for processing.

        Checks that the path exists, that the file is neither empty nor
        larger than 100MB, and that its first second can be decoded.

        Args:
            audio_path: Path to audio file

        Returns:
            Tuple of (is_valid, message)
        """
        try:
            if not os.path.exists(audio_path):
                return False, "Audio file does not exist"

            file_size = os.path.getsize(audio_path)
            if file_size == 0:
                return False, "Audio file is empty"
            if file_size > self._MAX_FILE_BYTES:
                return False, "Audio file too large (>100MB)"
        except Exception as e:
            return False, f"Audio validation error: {e}"

        try:
            # Decode only the first second, at the file's native rate: this is
            # a readability probe, so resampling would be wasted work.
            audio, _ = librosa.load(audio_path, sr=None, duration=1.0)
        except Exception as e:
            return False, f"Cannot load audio file: {e}"

        if len(audio) == 0:
            return False, "Audio file contains no audio data"
        return True, "Audio file is valid"

    def get_audio_info(self, audio_path: str) -> dict:
        """
        Get information about audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with keys ``file_path``, ``file_size_mb``,
            ``duration_seconds``, ``sample_rate``, ``channels``, ``samples``
            (per channel) and ``format`` (lower-cased file suffix). On
            failure, a dictionary with a single ``error`` key.
        """
        try:
            file_size = os.path.getsize(audio_path)

            # mono=False keeps the channel axis; the default would down-mix
            # to mono and make every file report a single channel.
            audio, sr = librosa.load(audio_path, sr=None, mono=False)
            channels = 1 if audio.ndim == 1 else audio.shape[0]
            # Samples live on the last axis for both (n,) and (channels, n).
            samples = audio.shape[-1]

            return {
                "file_path": audio_path,
                "file_size_mb": file_size / (1024 * 1024),
                "duration_seconds": samples / sr,
                "sample_rate": sr,
                "channels": channels,
                "samples": samples,
                "format": Path(audio_path).suffix.lower(),
            }
        except Exception as e:
            return {
                "error": f"Cannot get audio info: {e}"
            }

    def convert_audio_format(self, input_path: str, output_path: str,
                             target_format: str = "wav") -> bool:
        """
        Convert audio to different format.

        The audio is loaded (and thereby resampled) at ``target_sr`` before
        being written out.

        Args:
            input_path: Input audio file path
            output_path: Output audio file path
            target_format: Target format (wav, mp3, flac, etc.); upper-cased
                and passed to ``soundfile.write`` as its ``format``.

        Returns:
            Success status
        """
        try:
            audio, sr = librosa.load(input_path, sr=self.target_sr)
            sf.write(output_path, audio, sr, format=target_format.upper())
            return True
        except Exception as e:
            self.logger.error("Audio conversion error: %s", e)
            return False

    def split_audio(self, audio_path: str, chunk_duration: int = 30) -> list:
        """
        Split long audio into chunks for processing.

        Args:
            audio_path: Path to audio file
            chunk_duration: Duration of each chunk in seconds (fractional
                values are accepted and truncated to whole samples)

        Returns:
            List of audio chunks as numpy arrays. Chunks of one second or
            less are dropped. An empty list is returned on any error or when
            ``chunk_duration`` is not positive.
        """
        try:
            audio, sr = librosa.load(audio_path, sr=self.target_sr)

            # range() needs an integer step; cast so float durations work.
            chunk_samples = int(chunk_duration * sr)
            if chunk_samples <= 0:
                self.logger.error(
                    "Audio splitting error: chunk_duration must be positive"
                )
                return []

            pieces = (
                audio[start:start + chunk_samples]
                for start in range(0, len(audio), chunk_samples)
            )
            # Only include chunks longer than 1 second
            return [piece for piece in pieces if len(piece) > sr]
        except Exception as e:
            self.logger.error("Audio splitting error: %s", e)
            return []