audio_preprocessing.py added

audio_preprocessing.py CHANGED (+65 -27)
@@ -1,6 +1,6 @@
 """
-Audio Preprocessing Module for Respiratory Analysis
-Matches the exact preprocessing used during training
+Audio Preprocessing Module for Respiratory Symptom Analysis
+Matches the exact preprocessing used during training in your Coswara notebook
 """
 
 import librosa
@@ -14,7 +14,7 @@ warnings.filterwarnings('ignore')
 
 class RespiratoryAudioPreprocessor:
     """
-    Audio preprocessor that matches training pipeline exactly
+    Audio preprocessor that matches your training pipeline exactly
     Converts raw audio files to mel-spectrograms for model inference
     """
 
@@ -30,7 +30,7 @@ class RespiratoryAudioPreprocessor:
                  power: float = 2.0,
                  duration: float = 3.0):  # 3 seconds max duration
         """
-        Initialize preprocessing parameters to match training
+        Initialize preprocessing parameters to match your training
         """
         self.target_sr = target_sr
         self.n_mels = n_mels
@@ -44,23 +44,23 @@ class RespiratoryAudioPreprocessor:
         self.duration = duration
         self.target_length = int(target_sr * duration)  # 3 seconds worth of samples
 
-        # Expected output shape for your model
+        # Expected output shape for your model (from your notebook: 128x251)
         self.expected_shape = (1, 1, 128, 251)  # (batch, channels, height, width)
 
     def load_and_normalize_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> np.ndarray:
         """
-        Load audio file and normalize
+        Load audio file and normalize - matches your training data loading
         """
         try:
             # Handle different input types
             if isinstance(audio_input, str):
-                # File path
+                # File path - most common case for API
                 audio_data, sr = librosa.load(audio_input, sr=self.target_sr, duration=self.duration)
             elif isinstance(audio_input, tuple):
-                # (sample_rate, audio_array) from
+                # (sample_rate, audio_array) from web uploads
                 sr, audio_data = audio_input
 
-                # Convert to
+                # Convert to float32 if needed
                 if audio_data.dtype != np.float32:
                     if audio_data.dtype == np.int16:
                         audio_data = audio_data.astype(np.float32) / 32767.0
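Tuple inputs from web uploads usually carry 16-bit PCM samples, which the branch above rescales into the [-1, 1] float range librosa works in. A minimal, self-contained sketch of that scaling (the helper name is mine, not part of the file):

```python
import numpy as np

def pcm16_to_float32(audio: np.ndarray) -> np.ndarray:
    # int16 spans [-32768, 32767]; dividing by 32767.0 mirrors the diff above
    return audio.astype(np.float32) / 32767.0

samples = np.array([-32768, 0, 16384, 32767], dtype=np.int16)
print(pcm16_to_float32(samples))  # approximately [-1.0, 0.0, 0.5, 1.0]
```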
@@ -85,7 +85,7 @@ class RespiratoryAudioPreprocessor:
             else:
                 raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
 
-            # Pad if too short
+            # Pad if too short (matching your training approach)
             if len(audio_data) < self.target_length:
                 audio_data = np.pad(audio_data, (0, self.target_length - len(audio_data)),
                                     mode='constant', constant_values=0)
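Clips shorter than the 3-second window are right-padded with zeros so every input reaches exactly target_length samples (int(22050 * 3.0) = 66150). The same np.pad call in isolation:

```python
import numpy as np

target_length = 66150                      # 3 s at 22050 Hz, per the constructor defaults
audio = np.zeros(44100, dtype=np.float32)  # a 2-second clip

padded = np.pad(audio, (0, target_length - len(audio)),
                mode='constant', constant_values=0)
print(padded.shape)  # (66150,)
```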
@@ -97,10 +97,10 @@ class RespiratoryAudioPreprocessor:
 
     def extract_mel_spectrogram(self, audio_data: np.ndarray) -> np.ndarray:
         """
-        Extract mel spectrogram features matching training preprocessing
+        Extract mel spectrogram features - exactly matching your training preprocessing
         """
         try:
-            # Extract mel spectrogram
+            # Extract mel spectrogram (matching your notebook parameters)
             mel_spec = librosa.feature.melspectrogram(
                 y=audio_data,
                 sr=self.target_sr,
@@ -114,7 +114,7 @@ class RespiratoryAudioPreprocessor:
                 power=self.power
             )
 
-            # Convert to log scale (dB)
+            # Convert to log scale (dB) - matching your training
             mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
 
             return mel_spec_db
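The spectrogram width produced here depends on hop_length, which is configured in a part of the file this diff does not show. With librosa's default center=True, the frame count is 1 + floor(n_samples / hop_length), which is why the separate resize step below is needed to hit the fixed 251-frame width. A sketch under the assumption hop_length = 264 (chosen only because it yields 251 frames for a 3-second clip; the real value may differ):

```python
import numpy as np
import librosa

sr, duration = 22050, 3.0
n_samples = int(sr * duration)          # 66150
hop_length = 264                        # assumption, not taken from the file

print(1 + n_samples // hop_length)      # 251 frames with center=True

mel = librosa.feature.melspectrogram(
    y=np.random.randn(n_samples), sr=sr,
    n_mels=128, hop_length=hop_length)
mel_db = librosa.power_to_db(mel, ref=np.max)  # log scale; loudest bin sits at 0 dB
print(mel_db.shape)                     # (128, 251)
```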
@@ -124,33 +124,33 @@ class RespiratoryAudioPreprocessor:
 
     def normalize_spectrogram(self, mel_spec: np.ndarray) -> np.ndarray:
         """
-        Normalize mel spectrogram to match training normalization
-        This matches the normalization used in your
+        Normalize mel spectrogram to match your training normalization
+        This matches the normalization used in your MultiSymptomDataset
         """
-        # Mean and std normalization
+        # Mean and std normalization (matching your training pipeline)
         mean = np.mean(mel_spec)
         std = np.std(mel_spec)
 
         if std == 0:
             normalized = mel_spec - mean
         else:
-            normalized = (mel_spec - mean) / std
+            normalized = (mel_spec - mean) / (std + 1e-8)  # Adding small epsilon
 
-        # Clamp values to reasonable range (matching training)
+        # Clamp values to reasonable range (matching your training)
         normalized = np.clip(normalized, -5.0, 5.0)
 
         return normalized
 
     def resize_spectrogram(self, mel_spec: np.ndarray, target_width: int = 251) -> np.ndarray:
         """
-        Resize spectrogram to target dimensions
+        Resize spectrogram to target dimensions (matching your model input: 128x251)
         """
         current_height, current_width = mel_spec.shape
 
         if current_width == target_width:
             return mel_spec
 
-        #
+        # Resize to match your training data dimensions
         if current_width < target_width:
             # Pad if too narrow
             pad_width = target_width - current_width
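The epsilon introduced here is belt-and-braces: the std == 0 branch already handles silent clips, and 1e-8 additionally guards against near-zero deviations. The per-clip z-score plus clamping, as a standalone sketch:

```python
import numpy as np

def normalize(mel_spec: np.ndarray) -> np.ndarray:
    # Per-clip z-score with an epsilon guard, then clamp to [-5, 5]
    mean, std = np.mean(mel_spec), np.std(mel_spec)
    normalized = mel_spec - mean if std == 0 else (mel_spec - mean) / (std + 1e-8)
    return np.clip(normalized, -5.0, 5.0)

spec = np.random.randn(128, 251) * 20 - 40  # roughly dB-scaled values
out = normalize(spec)
print(round(float(out.mean()), 4), round(float(out.std()), 3))  # ~0.0, ~1.0
```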
@@ -164,6 +164,7 @@ class RespiratoryAudioPreprocessor:
     def preprocess_audio(self, audio_input: Union[str, np.ndarray, tuple]) -> torch.Tensor:
         """
         Complete preprocessing pipeline from audio to model input tensor
+        Matches exactly what your MultiSymptomDataset does in training
         """
         try:
             # Step 1: Load and normalize audio
@@ -172,17 +173,17 @@ class RespiratoryAudioPreprocessor:
             # Step 2: Extract mel spectrogram
             mel_spec = self.extract_mel_spectrogram(audio_data)
 
-            # Step 3: Normalize spectrogram
+            # Step 3: Normalize spectrogram (matching training)
             mel_spec_norm = self.normalize_spectrogram(mel_spec)
 
-            # Step 4: Resize to target dimensions
+            # Step 4: Resize to target dimensions (128x251)
             mel_spec_resized = self.resize_spectrogram(mel_spec_norm)
 
             # Step 5: Convert to tensor and add batch + channel dimensions
             tensor_input = torch.FloatTensor(mel_spec_resized)
             tensor_input = tensor_input.unsqueeze(0).unsqueeze(0)  # Add batch and channel dims
 
-            # Verify output shape
+            # Verify output shape matches your model expectations
             if tensor_input.shape != self.expected_shape:
                 raise RuntimeError(f"Output shape {tensor_input.shape} doesn't match expected {self.expected_shape}")
 
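The two unsqueeze calls lift the 2-D spectrogram into the NCHW layout a convolutional model consumes; traced step by step:

```python
import numpy as np
import torch

mel = np.zeros((128, 251), dtype=np.float32)  # (n_mels, frames)
t = torch.FloatTensor(mel)                    # torch.Size([128, 251])
t = t.unsqueeze(0).unsqueeze(0)               # torch.Size([1, 1, 128, 251])
assert t.shape == (1, 1, 128, 251)            # matches self.expected_shape
```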
@@ -193,7 +194,7 @@ class RespiratoryAudioPreprocessor:
 
     def get_preprocessing_info(self) -> Dict:
         """
-        Get preprocessing configuration info
+        Get preprocessing configuration info for API endpoints
         """
         return {
             'target_sr': self.target_sr,
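The new docstring says this dict is meant for API endpoints; one way that could be wired up, sketched with FastAPI purely as an assumption (the diff does not name a framework, and the route path is mine):

```python
# Hypothetical wiring; FastAPI and the route path are my assumptions.
from fastapi import FastAPI
from audio_preprocessing import RespiratoryAudioPreprocessor

app = FastAPI()
preprocessor = RespiratoryAudioPreprocessor()

@app.get("/preprocessing-info")
def preprocessing_info():
    info = preprocessor.get_preprocessing_info()
    info["output_shape"] = list(info["output_shape"])  # serialize the tuple explicitly
    return info
```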
@@ -201,13 +202,50 @@ class RespiratoryAudioPreprocessor:
             'n_fft': self.n_fft,
             'hop_length': self.hop_length,
             'duration': self.duration,
-            'output_shape': self.expected_shape
+            'output_shape': self.expected_shape,
+            'target_symptoms': ['fever', 'cold', 'sorethroat', 'lossofsmell', 'fatigue', 'cough']
         }
+
+    def validate_audio_file(self, audio_path: str) -> Tuple[bool, str]:
+        """
+        Validate if audio file is suitable for processing
+        """
+        try:
+            # Check file existence
+            if not audio_path or not isinstance(audio_path, str):
+                return False, "No audio file provided"
+
+            # Try to load audio
+            audio, sr = librosa.load(audio_path, sr=None, duration=0.1)  # Load just 0.1s for validation
+
+            # Check if audio is not empty
+            if len(audio) == 0:
+                return False, "Audio file is empty or corrupted"
+
+            # Check duration (load full file for duration check)
+            try:
+                duration = librosa.get_duration(path=audio_path)
+                if duration < 0.5:  # Minimum 0.5 seconds
+                    return False, f"Audio too short ({duration:.1f}s). Minimum 0.5 seconds required."
+                if duration > 30.0:  # Maximum 30 seconds
+                    return False, f"Audio too long ({duration:.1f}s). Maximum 30 seconds allowed."
+            except:
+                # If duration check fails, proceed
+                pass
+
+            return True, "Audio file is valid"
+
+        except Exception as e:
+            return False, f"Error validating audio: {str(e)}"
 
 # Example usage and testing
 if __name__ == "__main__":
-    # Initialize preprocessor
-    preprocessor = RespiratoryAudioPreprocessor(
+    # Initialize preprocessor with your exact training parameters
+    preprocessor = RespiratoryAudioPreprocessor(
+        target_sr=22050,    # Matching your training
+        n_mels=128,         # Matching your model input
+        duration=3.0        # 3 seconds as used in training
+    )
 
     # Test with dummy audio data
     dummy_audio = np.random.randn(22050 * 2)  # 2 seconds of audio
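Putting the new validate_audio_file gate in front of the existing pipeline gives the intended inference-time flow; a usage sketch (the file path is a placeholder, not a file from the repo):

```python
from audio_preprocessing import RespiratoryAudioPreprocessor

preprocessor = RespiratoryAudioPreprocessor(target_sr=22050, n_mels=128, duration=3.0)

ok, message = preprocessor.validate_audio_file("breath_sample.wav")  # placeholder path
if ok:
    tensor = preprocessor.preprocess_audio("breath_sample.wav")
    print(tensor.shape)  # torch.Size([1, 1, 128, 251]), ready for model(tensor)
else:
    print(f"Rejected: {message}")
```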