File size: 4,486 Bytes
e685c03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""

Audio processing module for zero-shot keyword spotting.

Handles audio loading, preprocessing, and feature extraction.

"""

import librosa
import numpy as np
import torch
from typing import Union, Tuple
import warnings

# Install a blanket "ignore" filter: once this module is imported, warnings of
# every category are silenced for the whole process, not just for this module.
# NOTE(review): presumably meant to quiet noisy audio-library warnings —
# confirm, and consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")


class AudioProcessor:
    """Load and condition audio for the keyword spotting model.

    Every waveform goes through the same pipeline: cast to float32, mix down
    to mono, resample to ``target_sample_rate``, trim or zero-pad to exactly
    ``max_samples`` samples, RMS-normalize, and convert to a 1-D
    ``torch.Tensor``.
    """

    def __init__(self, target_sample_rate: int = 48000, max_duration: float = 30.0):
        """
        Initialize the audio processor.

        Args:
            target_sample_rate: Target sampling rate for audio processing
            max_duration: Maximum audio duration in seconds
        """
        self.target_sample_rate = target_sample_rate
        self.max_duration = max_duration
        # Fixed output length (in samples) that every clip is trimmed/padded to.
        self.max_samples = int(target_sample_rate * max_duration)

    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load audio file and return waveform and sample rate.

        Args:
            audio_path: Path to the audio file

        Returns:
            Tuple of (waveform, sample_rate)

        Raises:
            ValueError: If the file cannot be loaded; the underlying
                exception is attached as ``__cause__``.
        """
        try:
            # sr=None keeps the file's native sample rate; resampling is done
            # later in preprocess_audio.
            waveform, sr = librosa.load(audio_path, sr=None)
        except Exception as e:
            # Chain the original error so the real cause stays in the traceback.
            raise ValueError(f"Error loading audio file: {e}") from e
        return waveform, sr

    @staticmethod
    def _to_mono(waveform: np.ndarray) -> np.ndarray:
        """
        Average a multi-channel waveform down to a single channel.

        A 2-D array with more rows than columns is treated as
        ``(samples, channels)``; any other layout is treated as
        channels-first and averaged over every axis except the last.

        Args:
            waveform: Array with two or more dimensions

        Returns:
            1-D mono waveform
        """
        if waveform.size == 0:
            # Nothing to average; avoids a NaN result from the mean of an
            # empty axis.
            return np.zeros(0, dtype=waveform.dtype)
        if waveform.ndim == 2 and waveform.shape[0] > waveform.shape[1]:
            # Samples-first layout: the short trailing axis holds the channels.
            return waveform.mean(axis=1)
        # Channels-first layout: collapse all leading axes, keep the time axis.
        return waveform.mean(axis=tuple(range(waveform.ndim - 1)))

    def preprocess_audio(self, waveform: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Preprocess audio waveform for model input.

        Args:
            waveform: Audio waveform as numpy array. May be 1-D, or
                multi-channel in either ``(channels, samples)`` or
                ``(samples, channels)`` layout.
            sample_rate: Original sample rate

        Returns:
            Preprocessed audio tensor: 1-D float32 of length ``max_samples``
        """
        # Convert to float32 (no copy is made when it already is float32)
        waveform = np.asarray(waveform, dtype=np.float32)

        # Mix down to mono BEFORE resampling: resampling then always runs
        # along the time axis regardless of the input's channel layout, and
        # only one channel has to be resampled.
        if waveform.ndim > 1:
            waveform = self._to_mono(waveform)

        # Resample if necessary (an empty signal has nothing to resample)
        if sample_rate != self.target_sample_rate and waveform.size:
            waveform = librosa.resample(
                waveform,
                orig_sr=sample_rate,
                target_sr=self.target_sample_rate,
            )

        # Trim or zero-pad to exactly max_samples
        n_samples = len(waveform)
        if n_samples > self.max_samples:
            waveform = waveform[:self.max_samples]
        elif n_samples < self.max_samples:
            padding = self.max_samples - n_samples
            waveform = np.pad(waveform, (0, padding), mode='constant', constant_values=0)

        # Normalization runs after padding, so the RMS is measured over the
        # whole fixed-length window, including any zero padding.
        waveform = self._normalize_audio(waveform)

        # Convert to tensor
        return torch.from_numpy(waveform).float()

    def _normalize_audio(self, waveform: np.ndarray) -> np.ndarray:
        """
        Normalize audio waveform.

        Scales the signal so its RMS becomes 0.1, then clips to [-1, 1].
        A signal whose RMS is not greater than zero (e.g. all zeros) is only
        clipped.

        Args:
            waveform: Input waveform

        Returns:
            Normalized waveform
        """
        # RMS normalization
        rms = np.sqrt(np.mean(waveform**2))
        if rms > 0:
            # Dividing by 10x the RMS leaves headroom before the clip below.
            waveform = waveform / (rms * 10)

        # Clip to [-1, 1] range
        return np.clip(waveform, -1.0, 1.0)

    def process_audio_file(self, audio_path: str) -> torch.Tensor:
        """
        Complete audio processing pipeline from file to tensor.

        Args:
            audio_path: Path to audio file

        Returns:
            Preprocessed audio tensor ready for model input

        Raises:
            ValueError: If the file cannot be loaded.
        """
        waveform, sample_rate = self.load_audio(audio_path)
        return self.preprocess_audio(waveform, sample_rate)

    def process_audio_array(self, audio_array: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Process audio from numpy array (e.g., from Gradio microphone input).

        Args:
            audio_array: Audio data as numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Preprocessed audio tensor
        """
        return self.preprocess_audio(audio_array, sample_rate)