Kalpokoch committed on
Commit
ed9b2d0
·
verified ·
1 Parent(s): e4fd245

Update audio_preprocessing.py

Browse files
Files changed (1) hide show
  1. audio_preprocessing.py +92 -128
audio_preprocessing.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Audio Preprocessing Module for Respiratory Symptom Analysis
3
- Fixed for Docker container deployment and Numba caching issues
4
  """
5
 
6
  import librosa
@@ -10,20 +10,19 @@ import warnings
10
  from typing import Union, Tuple, Dict
11
  import soundfile as sf
12
  import os
 
13
 
14
  # Fix for Numba caching issues in Docker containers
15
  os.environ['NUMBA_CACHE_DIR'] = '/tmp'
16
- os.environ['NUMBA_DISABLE_JIT'] = '0' # Keep JIT enabled but fix caching
17
 
18
- # Disable specific warnings that occur in containers
19
- warnings.filterwarnings('ignore', category=UserWarning, module='librosa')
20
- warnings.filterwarnings('ignore', category=FutureWarning, module='librosa')
21
  warnings.filterwarnings('ignore')
22
 
23
  class RespiratoryAudioPreprocessor:
24
  """
25
- Audio preprocessor that matches your training pipeline exactly
26
- Fixed for Docker container deployment
27
  """
28
 
29
  def __init__(self,
@@ -37,9 +36,7 @@ class RespiratoryAudioPreprocessor:
37
  fmax: float = None,
38
  power: float = 2.0,
39
  duration: float = 3.0):
40
- """
41
- Initialize preprocessing parameters to match your training
42
- """
43
  self.target_sr = target_sr
44
  self.n_mels = n_mels
45
  self.n_fft = n_fft
@@ -52,78 +49,90 @@ class RespiratoryAudioPreprocessor:
52
  self.duration = duration
53
  self.target_length = int(target_sr * duration)
54
 
55
- # Expected output shape for your model
56
  self.expected_shape = (1, 1, 128, 251)
57
 
58
- # Pre-compile librosa functions to avoid runtime caching issues
59
  self._warmup_librosa()
60
 
61
  def _warmup_librosa(self):
62
- """
63
- Pre-compile librosa functions with dummy data to avoid caching issues
64
- """
65
  try:
66
- # Create small dummy audio for warming up librosa/numba
67
  dummy_audio = np.random.randn(1024).astype(np.float32)
68
-
69
- # Warm up librosa functions
70
  _ = librosa.feature.melspectrogram(
71
  y=dummy_audio,
72
  sr=self.target_sr,
73
- n_mels=32, # Smaller for warmup
74
- n_fft=512, # Smaller for warmup
75
  hop_length=256
76
  )
77
-
78
  print("✅ Librosa functions warmed up successfully")
79
-
80
  except Exception as e:
81
  print(f"⚠️ Librosa warmup warning: {str(e)}")
82
- # Continue anyway - this is just optimization
83
 
84
- def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
85
  """
86
- Load audio file and normalize - with enhanced error handling
87
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  try:
89
- # Handle different input types
90
  if isinstance(audio_input, str):
91
- # File path - most common case for API
92
  try:
93
- # Use soundfile directly first (more reliable in containers)
94
  audio_data, sr = sf.read(audio_input)
95
 
96
  # Convert to mono if stereo
97
  if len(audio_data.shape) > 1:
98
  audio_data = np.mean(audio_data, axis=1)
99
 
100
- # Resample if needed
101
  if sr != self.target_sr:
102
- audio_data = librosa.resample(
103
- audio_data,
104
- orig_sr=sr,
105
- target_sr=self.target_sr,
106
- res_type='kaiser_fast' # Faster resampling
107
- )
108
 
109
  except Exception as sf_error:
110
- # Fallback to librosa if soundfile fails
111
  try:
112
- audio_data, sr = librosa.load(
113
- audio_input,
114
- sr=self.target_sr,
115
- duration=self.duration,
116
- res_type='kaiser_fast'
117
- )
 
 
 
 
 
 
 
 
 
118
  except Exception as librosa_error:
119
- raise RuntimeError(f"Failed to load audio with both soundfile and librosa. "
120
- f"SoundFile error: {sf_error}. Librosa error: {librosa_error}")
121
 
122
  elif isinstance(audio_input, tuple):
123
- # (sample_rate, audio_array) from web uploads
124
  sr, audio_data = audio_input
125
 
126
- # Convert to float32 if needed
127
  if audio_data.dtype != np.float32:
128
  if audio_data.dtype == np.int16:
129
  audio_data = audio_data.astype(np.float32) / 32767.0
@@ -136,21 +145,16 @@ class RespiratoryAudioPreprocessor:
136
  if len(audio_data.shape) > 1:
137
  audio_data = np.mean(audio_data, axis=1)
138
 
139
- # Resample if needed
140
  if sr != self.target_sr:
141
- audio_data = librosa.resample(
142
- audio_data,
143
- orig_sr=sr,
144
- target_sr=self.target_sr,
145
- res_type='kaiser_fast'
146
- )
147
 
148
- # Trim to duration
149
  if len(audio_data) > self.target_length:
150
  audio_data = audio_data[:self.target_length]
151
 
152
  elif isinstance(audio_input, np.ndarray):
153
- # Raw audio array
154
  audio_data = audio_input.astype(np.float32)
155
 
156
  # Convert to mono if stereo
@@ -162,7 +166,7 @@ class RespiratoryAudioPreprocessor:
162
  else:
163
  raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
164
 
165
- # Ensure audio_data is 1D
166
  if len(audio_data.shape) > 1:
167
  audio_data = audio_data.flatten()
168
 
@@ -186,16 +190,14 @@ class RespiratoryAudioPreprocessor:
186
  raise RuntimeError(f"Failed to load audio: {str(e)}")
187
 
188
  def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
189
- """
190
- Extract mel spectrogram with enhanced error handling
191
- """
192
  try:
193
- # Ensure audio is float32 and 1D
194
  audio_data = np.asarray(audio_data, dtype=np.float32)
195
  if len(audio_data.shape) > 1:
196
  audio_data = audio_data.flatten()
197
 
198
- # Extract mel spectrogram with error handling
199
  try:
200
  mel_spec = librosa.feature.melspectrogram(
201
  y=audio_data,
@@ -208,22 +210,19 @@ class RespiratoryAudioPreprocessor:
208
  fmin=self.fmin,
209
  fmax=self.fmax,
210
  power=self.power,
211
- center=True, # Ensure consistent behavior
212
- pad_mode='constant' # Avoid edge effects
213
  )
214
  except Exception as mel_error:
215
- # Fallback with simpler parameters
216
- print(f"⚠️ Mel spectrogram extraction failed, trying fallback: {mel_error}")
217
  mel_spec = librosa.feature.melspectrogram(
218
  y=audio_data,
219
  sr=self.target_sr,
220
- n_mels=self.n_mels,
221
- n_fft=min(self.n_fft, len(audio_data)),
222
- hop_length=self.hop_length
223
  )
224
 
225
- # Convert to log scale (dB)
226
- # Use np.maximum to avoid log(0) issues
227
  mel_spec = np.maximum(mel_spec, 1e-10)
228
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
229
 
@@ -233,11 +232,8 @@ class RespiratoryAudioPreprocessor:
233
  raise RuntimeError(f"Failed to extract mel spectrogram: {str(e)}")
234
 
235
  def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
236
- """
237
- Normalize mel spectrogram
238
- """
239
  try:
240
- # Mean and std normalization
241
  mean = np.mean(mel_spec)
242
  std = np.std(mel_spec)
243
 
@@ -246,18 +242,14 @@ class RespiratoryAudioPreprocessor:
246
  else:
247
  normalized = (mel_spec - mean) / (std + 1e-8)
248
 
249
- # Clamp values to reasonable range
250
  normalized = np.clip(normalized, -5.0, 5.0)
251
-
252
  return normalized
253
 
254
  except Exception as e:
255
  raise RuntimeError(f"Failed to normalize spectrogram: {str(e)}")
256
 
257
  def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
258
- """
259
- Resize spectrogram to target dimensions
260
- """
261
  try:
262
  current_height, current_width = mel_spec.shape
263
 
@@ -265,7 +257,6 @@ class RespiratoryAudioPreprocessor:
265
  return mel_spec
266
 
267
  if current_width < target_width:
268
- # Pad if too narrow
269
  pad_width = target_width - current_width
270
  mel_spec = np.pad(
271
  mel_spec,
@@ -274,7 +265,6 @@ class RespiratoryAudioPreprocessor:
274
  constant_values=0
275
  )
276
  else:
277
- # Truncate if too wide
278
  mel_spec = mel_spec[:, :target_width]
279
 
280
  return mel_spec
@@ -283,38 +273,32 @@ class RespiratoryAudioPreprocessor:
283
  raise RuntimeError(f"Failed to resize spectrogram: {str(e)}")
284
 
285
  def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
286
- """
287
- Complete preprocessing pipeline with comprehensive error handling
288
- """
289
  try:
290
- # Step 1: Load and normalize audio
291
  audio_data = self.load_and_normalize_audio(audio_input)
292
 
293
- # Step 2: Extract mel spectrogram
294
  mel_spec = self.extract_mel_spectrogram(audio_data)
295
 
296
- # Step 3: Normalize spectrogram
297
  mel_spec_norm = self.normalize_spectrogram(mel_spec)
298
 
299
- # Step 4: Resize to target dimensions
300
  mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
301
 
302
- # Step 5: Convert to tensor
303
  tensor_input = torch.FloatTensor(mel_spec_resized)
304
- tensor_input = tensor_input.unsqueeze(0).unsqueeze(0) # Add batch and channel dims
305
 
306
- # Verify output shape
307
  if tensor_input.shape != self.expected_shape:
308
- print(f"⚠️ Output shape {tensor_input.shape} doesn't match expected {self.expected_shape}")
309
- # Try to fix common shape issues
310
- if tensor_input.shape[2:] != self.expected_shape[2:]:
311
- # Resize to correct dimensions
312
- tensor_input = torch.nn.functional.interpolate(
313
- tensor_input,
314
- size=self.expected_shape[2:],
315
- mode='bilinear',
316
- align_corners=False
317
- )
318
 
319
  return tensor_input
320
 
@@ -322,7 +306,7 @@ class RespiratoryAudioPreprocessor:
322
  raise RuntimeError(f"Preprocessing failed: {str(e)}")
323
 
324
  def get_preprocessing_info(self) -> Dict:
325
- """Get preprocessing configuration info"""
326
  return {
327
  'target_sr': self.target_sr,
328
  'n_mels': self.n_mels,
@@ -330,24 +314,23 @@ class RespiratoryAudioPreprocessor:
330
  'hop_length': self.hop_length,
331
  'duration': self.duration,
332
  'output_shape': self.expected_shape,
333
- 'target_symptoms': ['fever', 'cold', 'sorethroat', 'lossofsmell', 'fatigue', 'cough']
334
  }
335
 
336
  def validate_audio_file(self, audio_path: str) -> Tuple[bool, str]:
337
- """Validate if audio file is suitable for processing"""
338
  try:
339
- if not audio_path or not isinstance(audio_path, str):
340
  return False, "No audio file provided"
341
 
342
- # Try to get basic file info
343
  try:
344
  info = sf.info(audio_path)
345
  duration = info.duration
346
 
347
  if duration < 0.5:
348
- return False, f"Audio too short ({duration:.1f}s). Minimum 0.5 seconds required."
349
  if duration > 30.0:
350
- return False, f"Audio too long ({duration:.1f}s). Maximum 30 seconds allowed."
351
 
352
  return True, "Audio file is valid"
353
 
@@ -355,23 +338,4 @@ class RespiratoryAudioPreprocessor:
355
  return False, f"Error validating audio: {str(e)}"
356
 
357
  except Exception as e:
358
- return False, f"Error validating audio: {str(e)}"
359
-
360
- # Example usage and testing
361
- if __name__ == "__main__":
362
- try:
363
- # Initialize preprocessor
364
- preprocessor = RespiratoryAudioPreprocessor()
365
-
366
- # Test with dummy audio data
367
- dummy_audio = np.random.randn(22050 * 2).astype(np.float32)
368
-
369
- # Preprocess
370
- tensor_output = preprocessor.preprocess_audio(dummy_audio)
371
- print(f"✅ Preprocessing successful!")
372
- print(f"Output shape: {tensor_output.shape}")
373
- print(f"Output dtype: {tensor_output.dtype}")
374
- print(f"Output range: [{tensor_output.min():.3f}, {tensor_output.max():.3f}]")
375
-
376
- except Exception as e:
377
- print(f"❌ Preprocessing test failed: {e}")
 
1
  """
2
  Audio Preprocessing Module for Respiratory Symptom Analysis
3
+ Resampy-free version: resampling is done with scipy.signal instead of resampy
4
  """
5
 
6
  import librosa
 
10
  from typing import Union, Tuple, Dict
11
  import soundfile as sf
12
  import os
13
+ from scipy import signal
14
 
15
  # Fix for Numba caching issues in Docker containers
16
  os.environ['NUMBA_CACHE_DIR'] = '/tmp'
17
+ os.environ['NUMBA_DISABLE_JIT'] = '0'
18
 
19
+ # Disable warnings
 
 
20
  warnings.filterwarnings('ignore')
21
 
22
  class RespiratoryAudioPreprocessor:
23
  """
24
+ Audio preprocessor without external resampling dependencies
25
+ Uses scipy.signal for resampling instead of resampy
26
  """
27
 
28
  def __init__(self,
 
36
  fmax: float = None,
37
  power: float = 2.0,
38
  duration: float = 3.0):
39
+ """Initialize preprocessing parameters"""
 
 
40
  self.target_sr = target_sr
41
  self.n_mels = n_mels
42
  self.n_fft = n_fft
 
49
  self.duration = duration
50
  self.target_length = int(target_sr * duration)
51
 
52
+ # Expected output shape
53
  self.expected_shape = (1, 1, 128, 251)
54
 
55
+ # Pre-warm librosa
56
  self._warmup_librosa()
57
 
58
  def _warmup_librosa(self):
59
+ """Pre-compile librosa functions"""
 
 
60
  try:
 
61
  dummy_audio = np.random.randn(1024).astype(np.float32)
 
 
62
  _ = librosa.feature.melspectrogram(
63
  y=dummy_audio,
64
  sr=self.target_sr,
65
+ n_mels=32,
66
+ n_fft=512,
67
  hop_length=256
68
  )
 
69
  print("✅ Librosa functions warmed up successfully")
 
70
  except Exception as e:
71
  print(f"⚠️ Librosa warmup warning: {str(e)}")
 
72
 
73
+ def scipy_resample(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
74
  """
75
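+ Resample audio_data from orig_sr to target_sr with scipy.signal.resample instead of resampy; if resampling fails, a warning is printed and the original (un-resampled) audio is returned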
76
  """
77
+ if orig_sr == target_sr:
78
+ return audio_data
79
+
80
+ try:
81
+ # Calculate resampling ratio
82
+ resample_ratio = target_sr / orig_sr
83
+
84
+ # Use scipy.signal.resample for resampling
85
+ target_length = int(len(audio_data) * resample_ratio)
86
+ resampled_audio = signal.resample(audio_data, target_length)
87
+
88
+ return resampled_audio.astype(np.float32)
89
+
90
+ except Exception as e:
91
+ print(f"⚠️ Scipy resampling failed: {e}, using original audio")
92
+ return audio_data
93
+
94
+ def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
95
+ """Load audio from a file path, a (sample_rate, array) tuple, or a raw array without the resampy dependency"""
96
  try:
 
97
  if isinstance(audio_input, str):
98
+ # Load with soundfile first
99
  try:
 
100
  audio_data, sr = sf.read(audio_input)
101
 
102
  # Convert to mono if stereo
103
  if len(audio_data.shape) > 1:
104
  audio_data = np.mean(audio_data, axis=1)
105
 
106
+ # Resample using scipy if needed
107
  if sr != self.target_sr:
108
+ audio_data = self.scipy_resample(audio_data, sr, self.target_sr)
 
 
 
 
 
109
 
110
  except Exception as sf_error:
111
+ # Fallback: load with librosa at the file's native sample rate (no librosa resampling)
112
  try:
113
+ # Load with original sample rate first
114
+ audio_data, sr = librosa.load(audio_input, sr=None)
115
+
116
+ # Convert to mono if stereo
117
+ if len(audio_data.shape) > 1:
118
+ audio_data = np.mean(audio_data, axis=1)
119
+
120
+ # Manual resampling with scipy
121
+ if sr != self.target_sr:
122
+ audio_data = self.scipy_resample(audio_data, sr, self.target_sr)
123
+
124
+ # Limit duration manually
125
+ if len(audio_data) > self.target_length:
126
+ audio_data = audio_data[:self.target_length]
127
+
128
  except Exception as librosa_error:
129
+ raise RuntimeError(f"Failed to load audio. SoundFile: {sf_error}. Librosa: {librosa_error}")
 
130
 
131
  elif isinstance(audio_input, tuple):
132
+ # (sample_rate, audio_array) from uploads
133
  sr, audio_data = audio_input
134
 
135
+ # Convert to float32
136
  if audio_data.dtype != np.float32:
137
  if audio_data.dtype == np.int16:
138
  audio_data = audio_data.astype(np.float32) / 32767.0
 
145
  if len(audio_data.shape) > 1:
146
  audio_data = np.mean(audio_data, axis=1)
147
 
148
+ # Resample using scipy
149
  if sr != self.target_sr:
150
+ audio_data = self.scipy_resample(audio_data, sr, self.target_sr)
 
 
 
 
 
151
 
152
+ # Trim duration
153
  if len(audio_data) > self.target_length:
154
  audio_data = audio_data[:self.target_length]
155
 
156
  elif isinstance(audio_input, np.ndarray):
157
+ # Raw audio array (assume target_sr)
158
  audio_data = audio_input.astype(np.float32)
159
 
160
  # Convert to mono if stereo
 
166
  else:
167
  raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
168
 
169
+ # Ensure 1D
170
  if len(audio_data.shape) > 1:
171
  audio_data = audio_data.flatten()
172
 
 
190
  raise RuntimeError(f"Failed to load audio: {str(e)}")
191
 
192
  def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
193
+ """Extract mel spectrogram without resampling dependencies"""
 
 
194
  try:
195
+ # Ensure proper format
196
  audio_data = np.asarray(audio_data, dtype=np.float32)
197
  if len(audio_data.shape) > 1:
198
  audio_data = audio_data.flatten()
199
 
200
+ # Extract mel spectrogram
201
  try:
202
  mel_spec = librosa.feature.melspectrogram(
203
  y=audio_data,
 
210
  fmin=self.fmin,
211
  fmax=self.fmax,
212
  power=self.power,
213
+ center=True,
214
+ pad_mode='constant'
215
  )
216
  except Exception as mel_error:
217
+ # Simplified fallback
218
+ print(f"⚠️ Using simplified mel spectrogram extraction: {mel_error}")
219
  mel_spec = librosa.feature.melspectrogram(
220
  y=audio_data,
221
  sr=self.target_sr,
222
+ n_mels=self.n_mels
 
 
223
  )
224
 
225
+ # Convert to dB
 
226
  mel_spec = np.maximum(mel_spec, 1e-10)
227
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
228
 
 
232
  raise RuntimeError(f"Failed to extract mel spectrogram: {str(e)}")
233
 
234
  def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
235
+ """Normalize spectrogram"""
 
 
236
  try:
 
237
  mean = np.mean(mel_spec)
238
  std = np.std(mel_spec)
239
 
 
242
  else:
243
  normalized = (mel_spec - mean) / (std + 1e-8)
244
 
 
245
  normalized = np.clip(normalized, -5.0, 5.0)
 
246
  return normalized
247
 
248
  except Exception as e:
249
  raise RuntimeError(f"Failed to normalize spectrogram: {str(e)}")
250
 
251
  def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
252
+ """Resize spectrogram to target dimensions"""
 
 
253
  try:
254
  current_height, current_width = mel_spec.shape
255
 
 
257
  return mel_spec
258
 
259
  if current_width < target_width:
 
260
  pad_width = target_width - current_width
261
  mel_spec = np.pad(
262
  mel_spec,
 
265
  constant_values=0
266
  )
267
  else:
 
268
  mel_spec = mel_spec[:, :target_width]
269
 
270
  return mel_spec
 
273
  raise RuntimeError(f"Failed to resize spectrogram: {str(e)}")
274
 
275
  def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
276
+ """Complete preprocessing pipeline"""
 
 
277
  try:
278
+ # Load audio
279
  audio_data = self.load_and_normalize_audio(audio_input)
280
 
281
+ # Extract features
282
  mel_spec = self.extract_mel_spectrogram(audio_data)
283
 
284
+ # Normalize
285
  mel_spec_norm = self.normalize_spectrogram(mel_spec)
286
 
287
+ # Resize
288
  mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
289
 
290
+ # Convert to tensor
291
  tensor_input = torch.FloatTensor(mel_spec_resized)
292
+ tensor_input = tensor_input.unsqueeze(0).unsqueeze(0)
293
 
294
+ # Fix shape if needed
295
  if tensor_input.shape != self.expected_shape:
296
+ tensor_input = torch.nn.functional.interpolate(
297
+ tensor_input,
298
+ size=self.expected_shape[2:],
299
+ mode='bilinear',
300
+ align_corners=False
301
+ )
 
 
 
 
302
 
303
  return tensor_input
304
 
 
306
  raise RuntimeError(f"Preprocessing failed: {str(e)}")
307
 
308
  def get_preprocessing_info(self) -> Dict:
309
+ """Get preprocessing info"""
310
  return {
311
  'target_sr': self.target_sr,
312
  'n_mels': self.n_mels,
 
314
  'hop_length': self.hop_length,
315
  'duration': self.duration,
316
  'output_shape': self.expected_shape,
317
+ 'resampling_method': 'scipy.signal'
318
  }
319
 
320
  def validate_audio_file(self, audio_path: str) -> Tuple[bool, str]:
321
+ """Validate audio file"""
322
  try:
323
+ if not audio_path:
324
  return False, "No audio file provided"
325
 
 
326
  try:
327
  info = sf.info(audio_path)
328
  duration = info.duration
329
 
330
  if duration < 0.5:
331
+ return False, f"Audio too short ({duration:.1f}s)"
332
  if duration > 30.0:
333
+ return False, f"Audio too long ({duration:.1f}s)"
334
 
335
  return True, "Audio file is valid"
336
 
 
338
  return False, f"Error validating audio: {str(e)}"
339
 
340
  except Exception as e:
341
+ return False, f"Validation error: {str(e)}"