Kalpokoch committed on
Commit
d23e88c
·
1 Parent(s): 00826e1

audio_preprocessing.py added

Browse files
Files changed (1) hide show
  1. audio_preprocessing.py +65 -27
audio_preprocessing.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- Audio Preprocessing Module for Respiratory Analysis
3
- Matches the exact preprocessing used during training
4
  """
5
 
6
  import librosa
@@ -14,7 +14,7 @@ warnings.filterwarnings('ignore')
14
 
15
  class RespiratoryAudioPreprocessor:
16
  """
17
- Audio preprocessor that matches training pipeline exactly
18
  Converts raw audio files to mel-spectrograms for model inference
19
  """
20
 
@@ -30,7 +30,7 @@ class RespiratoryAudioPreprocessor:
30
  power: float = 2.0,
31
  duration: float = 3.0): # 3 seconds max duration
32
  """
33
- Initialize preprocessing parameters to match training
34
  """
35
  self.target_sr = target_sr
36
  self.n_mels = n_mels
@@ -44,23 +44,23 @@ class RespiratoryAudioPreprocessor:
44
  self.duration = duration
45
  self.target_length = int(target_sr * duration) # 3 seconds worth of samples
46
 
47
- # Expected output shape for your model
48
  self.expected_shape = (1, 1, 128, 251) # (batch, channels, height, width)
49
 
50
  def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
51
  """
52
- Load audio file and normalize
53
  """
54
  try:
55
  # Handle different input types
56
  if isinstance(audio_input, str):
57
- # File path
58
  audio_data, sr = librosa.load(audio_input, sr=self.target_sr, duration=self.duration)
59
  elif isinstance(audio_input, tuple):
60
- # (sample_rate, audio_array) from Gradio
61
  sr, audio_data = audio_input
62
 
63
- # Convert to float if needed
64
  if audio_data.dtype != np.float32:
65
  if audio_data.dtype == np.int16:
66
  audio_data = audio_data.astype(np.float32) / 32767.0
@@ -85,7 +85,7 @@ class RespiratoryAudioPreprocessor:
85
  else:
86
  raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
87
 
88
- # Pad if too short
89
  if len(audio_data) < self.target_length:
90
  audio_data = np.pad(audio_data, (0, self.target_length - len(audio_data)),
91
  mode='constant', constant_values=0)
@@ -97,10 +97,10 @@ class RespiratoryAudioPreprocessor:
97
 
98
  def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
99
  """
100
- Extract mel spectrogram features matching training preprocessing
101
  """
102
  try:
103
- # Extract mel spectrogram
104
  mel_spec = librosa.feature.melspectrogram(
105
  y=audio_data,
106
  sr=self.target_sr,
@@ -114,7 +114,7 @@ class RespiratoryAudioPreprocessor:
114
  power=self.power
115
  )
116
 
117
- # Convert to log scale (dB)
118
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
119
 
120
  return mel_spec_db
@@ -124,33 +124,33 @@ class RespiratoryAudioPreprocessor:
124
 
125
  def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
126
  """
127
- Normalize mel spectrogram to match training normalization
128
- This matches the normalization used in your training pipeline
129
  """
130
- # Mean and std normalization
131
  mean = np.mean(mel_spec)
132
  std = np.std(mel_spec)
133
 
134
  if std == 0:
135
  normalized = mel_spec - mean
136
  else:
137
- normalized = (mel_spec - mean) / std
138
 
139
- # Clamp values to reasonable range (matching training)
140
  normalized = np.clip(normalized, -5.0, 5.0)
141
 
142
  return normalized
143
 
144
  def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
145
  """
146
- Resize spectrogram to target dimensions
147
  """
148
  current_height, current_width = mel_spec.shape
149
 
150
  if current_width == target_width:
151
  return mel_spec
152
 
153
- # Use librosa's time stretching for width adjustment
154
  if current_width < target_width:
155
  # Pad if too narrow
156
  pad_width = target_width - current_width
@@ -164,6 +164,7 @@ class RespiratoryAudioPreprocessor:
164
  def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
165
  """
166
  Complete preprocessing pipeline from audio to model input tensor
 
167
  """
168
  try:
169
  # Step 1: Load and normalize audio
@@ -172,17 +173,17 @@ class RespiratoryAudioPreprocessor:
172
  # Step 2: Extract mel spectrogram
173
  mel_spec = self.extract_mel_spectrogram(audio_data)
174
 
175
- # Step 3: Normalize spectrogram
176
  mel_spec_norm = self.normalize_spectrogram(mel_spec)
177
 
178
- # Step 4: Resize to target dimensions
179
  mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
180
 
181
  # Step 5: Convert to tensor and add batch + channel dimensions
182
  tensor_input = torch.FloatTensor(mel_spec_resized)
183
  tensor_input = tensor_input.unsqueeze(0).unsqueeze(0) # Add batch and channel dims
184
 
185
- # Verify output shape
186
  if tensor_input.shape != self.expected_shape:
187
  raise RuntimeError(f"Output shape {tensor_input.shape} doesn't match expected {self.expected_shape}")
188
 
@@ -193,7 +194,7 @@ class RespiratoryAudioPreprocessor:
193
 
194
  def get_preprocessing_info(self) -> Dict:
195
  """
196
- Get preprocessing configuration info
197
  """
198
  return {
199
  'target_sr': self.target_sr,
@@ -201,13 +202,50 @@ class RespiratoryAudioPreprocessor:
201
  'n_fft': self.n_fft,
202
  'hop_length': self.hop_length,
203
  'duration': self.duration,
204
- 'output_shape': self.expected_shape
 
205
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  # Example usage and testing
208
  if __name__ == "__main__":
209
- # Initialize preprocessor
210
- preprocessor = RespiratoryAudioPreprocessor()
 
 
 
 
211
 
212
  # Test with dummy audio data
213
  dummy_audio = np.random.randn(22050 * 2) # 2 seconds of audio
 
1
  """
2
+ Audio Preprocessing Module for Respiratory Symptom Analysis
3
+ Matches the exact preprocessing used during training in your Coswara notebook
4
  """
5
 
6
  import librosa
 
14
 
15
  class RespiratoryAudioPreprocessor:
16
  """
17
+ Audio preprocessor that matches your training pipeline exactly
18
  Converts raw audio files to mel-spectrograms for model inference
19
  """
20
 
 
30
  power: float = 2.0,
31
  duration: float = 3.0): # 3 seconds max duration
32
  """
33
+ Initialize preprocessing parameters to match your training
34
  """
35
  self.target_sr = target_sr
36
  self.n_mels = n_mels
 
44
  self.duration = duration
45
  self.target_length = int(target_sr * duration) # 3 seconds worth of samples
46
 
47
+ # Expected output shape for your model (from your notebook: 128x251)
48
  self.expected_shape = (1, 1, 128, 251) # (batch, channels, height, width)
49
 
50
  def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
51
  """
52
+ Load audio file and normalize - matches your training data loading
53
  """
54
  try:
55
  # Handle different input types
56
  if isinstance(audio_input, str):
57
+ # File path - most common case for API
58
  audio_data, sr = librosa.load(audio_input, sr=self.target_sr, duration=self.duration)
59
  elif isinstance(audio_input, tuple):
60
+ # (sample_rate, audio_array) from web uploads
61
  sr, audio_data = audio_input
62
 
63
+ # Convert to float32 if needed
64
  if audio_data.dtype != np.float32:
65
  if audio_data.dtype == np.int16:
66
  audio_data = audio_data.astype(np.float32) / 32767.0
 
85
  else:
86
  raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
87
 
88
+ # Pad if too short (matching your training approach)
89
  if len(audio_data) < self.target_length:
90
  audio_data = np.pad(audio_data, (0, self.target_length - len(audio_data)),
91
  mode='constant', constant_values=0)
 
97
 
98
  def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
99
  """
100
+ Extract mel spectrogram features - exactly matching your training preprocessing
101
  """
102
  try:
103
+ # Extract mel spectrogram (matching your notebook parameters)
104
  mel_spec = librosa.feature.melspectrogram(
105
  y=audio_data,
106
  sr=self.target_sr,
 
114
  power=self.power
115
  )
116
 
117
+ # Convert to log scale (dB) - matching your training
118
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
119
 
120
  return mel_spec_db
 
124
 
125
  def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
126
  """
127
+ Normalize mel spectrogram to match your training normalization
128
+ This matches the normalization used in your MultiSymptomDataset
129
  """
130
+ # Mean and std normalization (matching your training pipeline)
131
  mean = np.mean(mel_spec)
132
  std = np.std(mel_spec)
133
 
134
  if std == 0:
135
  normalized = mel_spec - mean
136
  else:
137
+ normalized = (mel_spec - mean) / (std + 1e-8) # Adding small epsilon
138
 
139
+ # Clamp values to reasonable range (matching your training)
140
  normalized = np.clip(normalized, -5.0, 5.0)
141
 
142
  return normalized
143
 
144
  def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
145
  """
146
+ Resize spectrogram to target dimensions (matching your model input: 128x251)
147
  """
148
  current_height, current_width = mel_spec.shape
149
 
150
  if current_width == target_width:
151
  return mel_spec
152
 
153
+ # Resize to match your training data dimensions
154
  if current_width < target_width:
155
  # Pad if too narrow
156
  pad_width = target_width - current_width
 
164
  def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
165
  """
166
  Complete preprocessing pipeline from audio to model input tensor
167
+ Matches exactly what your MultiSymptomDataset does in training
168
  """
169
  try:
170
  # Step 1: Load and normalize audio
 
173
  # Step 2: Extract mel spectrogram
174
  mel_spec = self.extract_mel_spectrogram(audio_data)
175
 
176
+ # Step 3: Normalize spectrogram (matching training)
177
  mel_spec_norm = self.normalize_spectrogram(mel_spec)
178
 
179
+ # Step 4: Resize to target dimensions (128x251)
180
  mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
181
 
182
  # Step 5: Convert to tensor and add batch + channel dimensions
183
  tensor_input = torch.FloatTensor(mel_spec_resized)
184
  tensor_input = tensor_input.unsqueeze(0).unsqueeze(0) # Add batch and channel dims
185
 
186
+ # Verify output shape matches your model expectations
187
  if tensor_input.shape != self.expected_shape:
188
  raise RuntimeError(f"Output shape {tensor_input.shape} doesn't match expected {self.expected_shape}")
189
 
 
194
 
195
  def get_preprocessing_info(self) -> Dict:
196
  """
197
+ Get preprocessing configuration info for API endpoints
198
  """
199
  return {
200
  'target_sr': self.target_sr,
 
202
  'n_fft': self.n_fft,
203
  'hop_length': self.hop_length,
204
  'duration': self.duration,
205
+ 'output_shape': self.expected_shape,
206
+ 'target_symptoms': ['fever', 'cold', 'sorethroat', 'lossofsmell', 'fatigue', 'cough']
207
  }
208
+
209
+ def validate_audio_file(self, audio_path: str) -> Tuple[bool, str]:
210
+ """
211
+ Validate if audio file is suitable for processing
212
+ """
213
+ try:
214
+ # Check file existence
215
+ if not audio_path or not isinstance(audio_path, str):
216
+ return False, "No audio file provided"
217
+
218
+ # Try to load audio
219
+ audio, sr = librosa.load(audio_path, sr=None, duration=0.1) # Load just 0.1s for validation
220
+
221
+ # Check if audio is not empty
222
+ if len(audio) == 0:
223
+ return False, "Audio file is empty or corrupted"
224
+
225
+ # Check duration (load full file for duration check)
226
+ try:
227
+ duration = librosa.get_duration(path=audio_path)
228
+ if duration < 0.5: # Minimum 0.5 seconds
229
+ return False, f"Audio too short ({duration:.1f}s). Minimum 0.5 seconds required."
230
+ if duration > 30.0: # Maximum 30 seconds
231
+ return False, f"Audio too long ({duration:.1f}s). Maximum 30 seconds allowed."
232
+ except:
233
+ # If duration check fails, proceed
234
+ pass
235
+
236
+ return True, "Audio file is valid"
237
+
238
+ except Exception as e:
239
+ return False, f"Error validating audio: {str(e)}"
240
 
241
  # Example usage and testing
242
  if __name__ == "__main__":
243
+ # Initialize preprocessor with your exact training parameters
244
+ preprocessor = RespiratoryAudioPreprocessor(
245
+ target_sr=22050, # Matching your training
246
+ n_mels=128, # Matching your model input
247
+ duration=3.0 # 3 seconds as used in training
248
+ )
249
 
250
  # Test with dummy audio data
251
  dummy_audio = np.random.randn(22050 * 2) # 2 seconds of audio