Spaces:

Kalpokoch
/

respiratory-symptom-api

Sleeping

App Files Community

Kalpokoch committed on Sep 29, 2025

Commit

ed9b2d0

verified ·

1 Parent(s): e4fd245

Update audio_preprocessing.py

Browse files

Files changed (1) hide show

audio_preprocessing.py +92 -128

audio_preprocessing.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Audio Preprocessing Module for Respiratory Symptom Analysis
-Fixed for Docker container deployment and Numba caching issues
 """
 import librosa
@@ -10,20 +10,19 @@ import warnings
 from typing import Union, Tuple, Dict
 import soundfile as sf
 import os
 # Fix for Numba caching issues in Docker containers
 os.environ['NUMBA_CACHE_DIR'] = '/tmp'
-os.environ['NUMBA_DISABLE_JIT'] = '0'  # Keep JIT enabled but fix caching
-# Disable specific warnings that occur in containers
-warnings.filterwarnings('ignore', category=UserWarning, module='librosa')
-warnings.filterwarnings('ignore', category=FutureWarning, module='librosa')
 warnings.filterwarnings('ignore')
 class RespiratoryAudioPreprocessor:
     """
-    Audio preprocessor that matches your training pipeline exactly
-    Fixed for Docker container deployment
     """
     def __init__(self,
@@ -37,9 +36,7 @@ class RespiratoryAudioPreprocessor:
                  fmax: float = None,
                  power: float = 2.0,
                  duration: float = 3.0):
-        """
-        Initialize preprocessing parameters to match your training
-        """
         self.target_sr = target_sr
         self.n_mels = n_mels
         self.n_fft = n_fft
@@ -52,78 +49,90 @@ class RespiratoryAudioPreprocessor:
         self.duration = duration
         self.target_length = int(target_sr * duration)
-        # Expected output shape for your model
         self.expected_shape = (1, 1, 128, 251)
-        # Pre-compile librosa functions to avoid runtime caching issues
         self._warmup_librosa()
     def _warmup_librosa(self):
-        """
-        Pre-compile librosa functions with dummy data to avoid caching issues
-        """
         try:
-            # Create small dummy audio for warming up librosa/numba
             dummy_audio = np.random.randn(1024).astype(np.float32)
-            # Warm up librosa functions
             _ = librosa.feature.melspectrogram(
                 y=dummy_audio,
                 sr=self.target_sr,
-                n_mels=32,  # Smaller for warmup
-                n_fft=512,  # Smaller for warmup
                 hop_length=256
             )
             print("✅ Librosa functions warmed up successfully")
         except Exception as e:
             print(f"⚠️ Librosa warmup warning: {str(e)}")
-            # Continue anyway - this is just optimization
-    def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
         """
-        Load audio file and normalize - with enhanced error handling
         """
         try:
-            # Handle different input types
             if isinstance(audio_input, str):
-                # File path - most common case for API
                 try:
-                    # Use soundfile directly first (more reliable in containers)
                     audio_data, sr = sf.read(audio_input)
                     # Convert to mono if stereo
                     if len(audio_data.shape) > 1:
                         audio_data = np.mean(audio_data, axis=1)
-                    # Resample if needed
                     if sr != self.target_sr:
-                        audio_data = librosa.resample(
-                            audio_data,
-                            orig_sr=sr,
-                            target_sr=self.target_sr,
-                            res_type='kaiser_fast'  # Faster resampling
-                        )
                 except Exception as sf_error:
-                    # Fallback to librosa if soundfile fails
                     try:
-                        audio_data, sr = librosa.load(
-                            audio_input,
-                            sr=self.target_sr,
-                            duration=self.duration,
-                            res_type='kaiser_fast'
-                        )
                     except Exception as librosa_error:
-                        raise RuntimeError(f"Failed to load audio with both soundfile and librosa. "
-                                         f"SoundFile error: {sf_error}. Librosa error: {librosa_error}")
             elif isinstance(audio_input, tuple):
-                # (sample_rate, audio_array) from web uploads
                 sr, audio_data = audio_input
-                # Convert to float32 if needed
                 if audio_data.dtype != np.float32:
                     if audio_data.dtype == np.int16:
                         audio_data = audio_data.astype(np.float32) / 32767.0
@@ -136,21 +145,16 @@ class RespiratoryAudioPreprocessor:
                 if len(audio_data.shape) > 1:
                     audio_data = np.mean(audio_data, axis=1)
-                # Resample if needed
                 if sr != self.target_sr:
-                    audio_data = librosa.resample(
-                        audio_data,
-                        orig_sr=sr,
-                        target_sr=self.target_sr,
-                        res_type='kaiser_fast'
-                    )
-                # Trim to duration
                 if len(audio_data) > self.target_length:
                     audio_data = audio_data[:self.target_length]
             elif isinstance(audio_input, np.ndarray):
-                # Raw audio array
                 audio_data = audio_input.astype(np.float32)
                 # Convert to mono if stereo
@@ -162,7 +166,7 @@ class RespiratoryAudioPreprocessor:
             else:
                 raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
-            # Ensure audio_data is 1D
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.flatten()
@@ -186,16 +190,14 @@ class RespiratoryAudioPreprocessor:
             raise RuntimeError(f"Failed to load audio: {str(e)}")
     def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
-        """
-        Extract mel spectrogram with enhanced error handling
-        """
         try:
-            # Ensure audio is float32 and 1D
             audio_data = np.asarray(audio_data, dtype=np.float32)
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.flatten()
-            # Extract mel spectrogram with error handling
             try:
                 mel_spec = librosa.feature.melspectrogram(
                     y=audio_data,
@@ -208,22 +210,19 @@ class RespiratoryAudioPreprocessor:
                     fmin=self.fmin,
                     fmax=self.fmax,
                     power=self.power,
-                    center=True,  # Ensure consistent behavior
-                    pad_mode='constant'  # Avoid edge effects
                 )
             except Exception as mel_error:
-                # Fallback with simpler parameters
-                print(f"⚠️ Mel spectrogram extraction failed, trying fallback: {mel_error}")
                 mel_spec = librosa.feature.melspectrogram(
                     y=audio_data,
                     sr=self.target_sr,
-                    n_mels=self.n_mels,
-                    n_fft=min(self.n_fft, len(audio_data)),
-                    hop_length=self.hop_length
                 )
-            # Convert to log scale (dB)
-            # Use np.maximum to avoid log(0) issues
             mel_spec = np.maximum(mel_spec, 1e-10)
             mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
@@ -233,11 +232,8 @@ class RespiratoryAudioPreprocessor:
             raise RuntimeError(f"Failed to extract mel spectrogram: {str(e)}")
     def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
-        """
-        Normalize mel spectrogram
-        """
         try:
-            # Mean and std normalization
             mean = np.mean(mel_spec)
             std = np.std(mel_spec)
@@ -246,18 +242,14 @@ class RespiratoryAudioPreprocessor:
             else:
                 normalized = (mel_spec - mean) / (std + 1e-8)
-            # Clamp values to reasonable range
             normalized = np.clip(normalized, -5.0, 5.0)
             return normalized
         except Exception as e:
             raise RuntimeError(f"Failed to normalize spectrogram: {str(e)}")
     def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
-        """
-        Resize spectrogram to target dimensions
-        """
         try:
             current_height, current_width = mel_spec.shape
@@ -265,7 +257,6 @@ class RespiratoryAudioPreprocessor:
                 return mel_spec
             if current_width < target_width:
-                # Pad if too narrow
                 pad_width = target_width - current_width
                 mel_spec = np.pad(
                     mel_spec,
@@ -274,7 +265,6 @@ class RespiratoryAudioPreprocessor:
                     constant_values=0
                 )
             else:
-                # Truncate if too wide
                 mel_spec = mel_spec[:, :target_width]
             return mel_spec
@@ -283,38 +273,32 @@ class RespiratoryAudioPreprocessor:
             raise RuntimeError(f"Failed to resize spectrogram: {str(e)}")
     def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
-        """
-        Complete preprocessing pipeline with comprehensive error handling
-        """
         try:
-            # Step 1: Load and normalize audio
             audio_data = self.load_and_normalize_audio(audio_input)
-            # Step 2: Extract mel spectrogram
             mel_spec = self.extract_mel_spectrogram(audio_data)
-            # Step 3: Normalize spectrogram
             mel_spec_norm = self.normalize_spectrogram(mel_spec)
-            # Step 4: Resize to target dimensions
             mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
-            # Step 5: Convert to tensor
             tensor_input = torch.FloatTensor(mel_spec_resized)
-            tensor_input = tensor_input.unsqueeze(0).unsqueeze(0)  # Add batch and channel dims
-            # Verify output shape
             if tensor_input.shape != self.expected_shape:
-                print(f"⚠️ Output shape {tensor_input.shape} doesn't match expected {self.expected_shape}")
-                # Try to fix common shape issues
-                if tensor_input.shape[2:] != self.expected_shape[2:]:
-                    # Resize to correct dimensions
-                    tensor_input = torch.nn.functional.interpolate(
-                        tensor_input,
-                        size=self.expected_shape[2:],
-                        mode='bilinear',
-                        align_corners=False
-                    )
             return tensor_input
@@ -322,7 +306,7 @@ class RespiratoryAudioPreprocessor:
             raise RuntimeError(f"Preprocessing failed: {str(e)}")
     def get_preprocessing_info(self) -> Dict:
-        """Get preprocessing configuration info"""
         return {
             'target_sr': self.target_sr,
             'n_mels': self.n_mels,
@@ -330,24 +314,23 @@ class RespiratoryAudioPreprocessor:
             'hop_length': self.hop_length,
             'duration': self.duration,
             'output_shape': self.expected_shape,
-            'target_symptoms': ['fever', 'cold', 'sorethroat', 'lossofsmell', 'fatigue', 'cough']
         }
     def validate_audio_file(self, audio_path: str) -> Tuple[bool, str]:
-        """Validate if audio file is suitable for processing"""
         try:
-            if not audio_path or not isinstance(audio_path, str):
                 return False, "No audio file provided"
-            # Try to get basic file info
             try:
                 info = sf.info(audio_path)
                 duration = info.duration
                 if duration < 0.5:
-                    return False, f"Audio too short ({duration:.1f}s). Minimum 0.5 seconds required."
                 if duration > 30.0:
-                    return False, f"Audio too long ({duration:.1f}s). Maximum 30 seconds allowed."
                 return True, "Audio file is valid"
@@ -355,23 +338,4 @@ class RespiratoryAudioPreprocessor:
                 return False, f"Error validating audio: {str(e)}"
         except Exception as e:
-            return False, f"Error validating audio: {str(e)}"
-# Example usage and testing
-if __name__ == "__main__":
-    try:
-        # Initialize preprocessor
-        preprocessor = RespiratoryAudioPreprocessor()
-        # Test with dummy audio data
-        dummy_audio = np.random.randn(22050 * 2).astype(np.float32)
-        # Preprocess
-        tensor_output = preprocessor.preprocess_audio(dummy_audio)
-        print(f"✅ Preprocessing successful!")
-        print(f"Output shape: {tensor_output.shape}")
-        print(f"Output dtype: {tensor_output.dtype}")
-        print(f"Output range: [{tensor_output.min():.3f}, {tensor_output.max():.3f}]")
-    except Exception as e:
-        print(f"❌ Preprocessing test failed: {e}")

 """
 Audio Preprocessing Module for Respiratory Symptom Analysis
+Version without external resampling dependencies (resampy-free)
 """
 import librosa
 from typing import Union, Tuple, Dict
 import soundfile as sf
 import os
+from scipy import signal
 # Fix for Numba caching issues in Docker containers
 os.environ['NUMBA_CACHE_DIR'] = '/tmp'
+os.environ['NUMBA_DISABLE_JIT'] = '0'
+# Disable warnings
 warnings.filterwarnings('ignore')
 class RespiratoryAudioPreprocessor:
     """
+    Audio preprocessor without external resampling dependencies
+    Uses scipy.signal for resampling instead of resampy
     """
     def __init__(self,
                  fmax: float = None,
                  power: float = 2.0,
                  duration: float = 3.0):
+        """Initialize preprocessing parameters"""
         self.target_sr = target_sr
         self.n_mels = n_mels
         self.n_fft = n_fft
         self.duration = duration
         self.target_length = int(target_sr * duration)
+        # Expected output shape
         self.expected_shape = (1, 1, 128, 251)
+        # Pre-warm librosa
         self._warmup_librosa()
     def _warmup_librosa(self):
+        """Pre-compile librosa functions"""
         try:
             dummy_audio = np.random.randn(1024).astype(np.float32)
             _ = librosa.feature.melspectrogram(
                 y=dummy_audio,
                 sr=self.target_sr,
+                n_mels=32,
+                n_fft=512,
                 hop_length=256
             )
             print("✅ Librosa functions warmed up successfully")
         except Exception as e:
             print(f"⚠️ Librosa warmup warning: {str(e)}")
+    def scipy_resample(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
         """
+        Custom resampling using scipy.signal instead of resampy
         """
+        if orig_sr == target_sr:
+            return audio_data
+        try:
+            # Calculate resampling ratio
+            resample_ratio = target_sr / orig_sr
+            # Use scipy.signal.resample for resampling
+            target_length = int(len(audio_data) * resample_ratio)
+            resampled_audio = signal.resample(audio_data, target_length)
+            return resampled_audio.astype(np.float32)
+        except Exception as e:
+            print(f"⚠️ Scipy resampling failed: {e}, using original audio")
+            return audio_data
+    def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
+        """Load audio file without resampy dependency"""
         try:
             if isinstance(audio_input, str):
+                # Load with soundfile first
                 try:
                     audio_data, sr = sf.read(audio_input)
                     # Convert to mono if stereo
                     if len(audio_data.shape) > 1:
                         audio_data = np.mean(audio_data, axis=1)
+                    # Resample using scipy if needed
                     if sr != self.target_sr:
+                        audio_data = self.scipy_resample(audio_data, sr, self.target_sr)
                 except Exception as sf_error:
+                    # Fallback: try loading without librosa resampling
                     try:
+                        # Load with original sample rate first
+                        audio_data, sr = librosa.load(audio_input, sr=None)
+                        # Convert to mono if stereo
+                        if len(audio_data.shape) > 1:
+                            audio_data = np.mean(audio_data, axis=1)
+                        # Manual resampling with scipy
+                        if sr != self.target_sr:
+                            audio_data = self.scipy_resample(audio_data, sr, self.target_sr)
+                        # Limit duration manually
+                        if len(audio_data) > self.target_length:
+                            audio_data = audio_data[:self.target_length]
                     except Exception as librosa_error:
+                        raise RuntimeError(f"Failed to load audio. SoundFile: {sf_error}. Librosa: {librosa_error}")
             elif isinstance(audio_input, tuple):
+                # (sample_rate, audio_array) from uploads
                 sr, audio_data = audio_input
+                # Convert to float32
                 if audio_data.dtype != np.float32:
                     if audio_data.dtype == np.int16:
                         audio_data = audio_data.astype(np.float32) / 32767.0
                 if len(audio_data.shape) > 1:
                     audio_data = np.mean(audio_data, axis=1)
+                # Resample using scipy
                 if sr != self.target_sr:
+                    audio_data = self.scipy_resample(audio_data, sr, self.target_sr)
+                # Trim duration
                 if len(audio_data) > self.target_length:
                     audio_data = audio_data[:self.target_length]
             elif isinstance(audio_input, np.ndarray):
+                # Raw audio array (assume target_sr)
                 audio_data = audio_input.astype(np.float32)
                 # Convert to mono if stereo
             else:
                 raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
+            # Ensure 1D
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.flatten()
             raise RuntimeError(f"Failed to load audio: {str(e)}")
     def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
+        """Extract mel spectrogram without resampling dependencies"""
         try:
+            # Ensure proper format
             audio_data = np.asarray(audio_data, dtype=np.float32)
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.flatten()
+            # Extract mel spectrogram
             try:
                 mel_spec = librosa.feature.melspectrogram(
                     y=audio_data,
                     fmin=self.fmin,
                     fmax=self.fmax,
                     power=self.power,
+                    center=True,
+                    pad_mode='constant'
                 )
             except Exception as mel_error:
+                # Simplified fallback
+                print(f"⚠️ Using simplified mel spectrogram extraction: {mel_error}")
                 mel_spec = librosa.feature.melspectrogram(
                     y=audio_data,
                     sr=self.target_sr,
+                    n_mels=self.n_mels
                 )
+            # Convert to dB
             mel_spec = np.maximum(mel_spec, 1e-10)
             mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
             raise RuntimeError(f"Failed to extract mel spectrogram: {str(e)}")
     def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
+        """Normalize spectrogram"""
         try:
             mean = np.mean(mel_spec)
             std = np.std(mel_spec)
             else:
                 normalized = (mel_spec - mean) / (std + 1e-8)
             normalized = np.clip(normalized, -5.0, 5.0)
             return normalized
         except Exception as e:
             raise RuntimeError(f"Failed to normalize spectrogram: {str(e)}")
     def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
+        """Resize spectrogram to target dimensions"""
         try:
             current_height, current_width = mel_spec.shape
                 return mel_spec
             if current_width < target_width:
                 pad_width = target_width - current_width
                 mel_spec = np.pad(
                     mel_spec,
                     constant_values=0
                 )
             else:
                 mel_spec = mel_spec[:, :target_width]
             return mel_spec
             raise RuntimeError(f"Failed to resize spectrogram: {str(e)}")
     def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
+        """Complete preprocessing pipeline"""
         try:
+            # Load audio
             audio_data = self.load_and_normalize_audio(audio_input)
+            # Extract features
             mel_spec = self.extract_mel_spectrogram(audio_data)
+            # Normalize
             mel_spec_norm = self.normalize_spectrogram(mel_spec)
+            # Resize
             mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
+            # Convert to tensor
             tensor_input = torch.FloatTensor(mel_spec_resized)
+            tensor_input = tensor_input.unsqueeze(0).unsqueeze(0)
+            # Fix shape if needed
             if tensor_input.shape != self.expected_shape:
+                tensor_input = torch.nn.functional.interpolate(
+                    tensor_input,
+                    size=self.expected_shape[2:],
+                    mode='bilinear',
+                    align_corners=False
+                )
             return tensor_input
             raise RuntimeError(f"Preprocessing failed: {str(e)}")
     def get_preprocessing_info(self) -> Dict:
+        """Get preprocessing info"""
         return {
             'target_sr': self.target_sr,
             'n_mels': self.n_mels,
             'hop_length': self.hop_length,
             'duration': self.duration,
             'output_shape': self.expected_shape,
+            'resampling_method': 'scipy.signal'
         }
     def validate_audio_file(self, audio_path: str) -> Tuple[bool, str]:
+        """Validate audio file"""
         try:
+            if not audio_path:
                 return False, "No audio file provided"
             try:
                 info = sf.info(audio_path)
                 duration = info.duration
                 if duration < 0.5:
+                    return False, f"Audio too short ({duration:.1f}s)"
                 if duration > 30.0:
+                    return False, f"Audio too long ({duration:.1f}s)"
                 return True, "Audio file is valid"
                 return False, f"Error validating audio: {str(e)}"
         except Exception as e:
+            return False, f"Validation error: {str(e)}"