"""
Audio processing module for zero-shot keyword spotting.
Handles audio loading, preprocessing, and feature extraction.
"""
import librosa
import numpy as np
import torch
from typing import Union, Tuple
import warnings
warnings.filterwarnings("ignore")
class AudioProcessor:
    """Handles audio preprocessing for the keyword spotting model.

    Pipeline: load -> downmix to mono -> resample -> trim/pad to a fixed
    length -> RMS-normalize -> torch tensor.
    """

    def __init__(self, target_sample_rate: int = 48000, max_duration: float = 30.0):
        """
        Initialize the audio processor.

        Args:
            target_sample_rate: Target sampling rate for audio processing
            max_duration: Maximum audio duration in seconds
        """
        self.target_sample_rate = target_sample_rate
        self.max_duration = max_duration
        # Fixed model input length in samples: shorter clips are zero-padded,
        # longer clips are truncated to exactly this many samples.
        self.max_samples = int(target_sample_rate * max_duration)

    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load audio file and return waveform and sample rate.

        Args:
            audio_path: Path to the audio file

        Returns:
            Tuple of (waveform, sample_rate)

        Raises:
            ValueError: If the file cannot be read or decoded.
        """
        try:
            # sr=None preserves the file's native sample rate; resampling is
            # deferred to preprocess_audio so it happens exactly once.
            waveform, sr = librosa.load(audio_path, sr=None)
            return waveform, sr
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in the traceback instead of being swallowed.
            raise ValueError(f"Error loading audio file: {str(e)}") from e

    def preprocess_audio(self, waveform: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Preprocess audio waveform for model input.

        Args:
            waveform: Audio waveform as numpy array. Either 1-D mono, or 2-D
                multi-channel in (channels, samples) layout (librosa style) or
                (samples, channels) layout (e.g. Gradio microphone input).
            sample_rate: Original sample rate

        Returns:
            Preprocessed mono float tensor of length ``self.max_samples``.
        """
        # Convert to float32 if needed (e.g. int16 PCM from a microphone);
        # absolute scale does not matter because RMS normalization follows.
        if waveform.dtype != np.float32:
            waveform = waveform.astype(np.float32)

        # Downmix to mono BEFORE resampling: resampling one channel is
        # cheaper, and it avoids feeding the resampler a 2-D array with an
        # ambiguous channel axis.
        if waveform.ndim > 1:
            # Gradio delivers (samples, channels) while librosa convention is
            # (channels, samples); assume the longer axis is time and put it
            # last before averaging the channel axis.
            if waveform.shape[0] > waveform.shape[1]:
                waveform = waveform.T
            waveform = waveform.mean(axis=0)

        # Resample if necessary
        if sample_rate != self.target_sample_rate:
            waveform = librosa.resample(
                waveform,
                orig_sr=sample_rate,
                target_sr=self.target_sample_rate
            )

        # Trim or zero-pad to the fixed model input length.
        if len(waveform) > self.max_samples:
            waveform = waveform[:self.max_samples]
        elif len(waveform) < self.max_samples:
            padding = self.max_samples - len(waveform)
            waveform = np.pad(waveform, (0, padding), mode='constant', constant_values=0)

        # Normalize audio
        waveform = self._normalize_audio(waveform)

        # Convert to tensor
        audio_tensor = torch.from_numpy(waveform).float()
        return audio_tensor

    def _normalize_audio(self, waveform: np.ndarray) -> np.ndarray:
        """
        Normalize audio waveform to a target RMS level.

        Args:
            waveform: Input waveform

        Returns:
            Normalized waveform clipped to [-1, 1]. Silent (all-zero) input
            is returned unchanged to avoid division by zero.
        """
        rms = np.sqrt(np.mean(waveform**2))
        if rms > 0:
            # Dividing by rms alone would set RMS to 1.0; the extra factor of
            # 10 targets RMS 0.1, leaving headroom against clipping.
            waveform = waveform / (rms * 10)
        # Clip to [-1, 1] range
        waveform = np.clip(waveform, -1.0, 1.0)
        return waveform

    def process_audio_file(self, audio_path: str) -> torch.Tensor:
        """
        Complete audio processing pipeline from file to tensor.

        Args:
            audio_path: Path to audio file

        Returns:
            Preprocessed audio tensor ready for model input
        """
        waveform, sample_rate = self.load_audio(audio_path)
        processed_audio = self.preprocess_audio(waveform, sample_rate)
        return processed_audio

    def process_audio_array(self, audio_array: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Process audio from numpy array (e.g., from Gradio microphone input).

        Args:
            audio_array: Audio data as numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Preprocessed audio tensor
        """
        return self.preprocess_audio(audio_array, sample_rate)