Spaces:
Runtime error
Runtime error
| """ | |
| Audio preprocessing and feature extraction module for respiratory disease detection. | |
| """ | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
| from pathlib import Path | |
| from typing import Tuple, Dict, Optional | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
class AudioPreprocessor:
    """Handles audio loading, length/amplitude normalization, and feature extraction.

    Every clip returned by ``load_audio`` is resampled to ``sample_rate`` and
    padded or trimmed to exactly ``target_length`` samples.
    """

    def __init__(self, sample_rate: int = 16000, duration: float = 5.0):
        """
        Initialize the audio preprocessor.

        Args:
            sample_rate: Target sample rate for all audio files
            duration: Target duration in seconds (will pad/trim)
        """
        self.sample_rate = sample_rate
        self.duration = duration
        # Number of samples every clip is padded/trimmed to.
        self.target_length = int(sample_rate * duration)

    def load_audio(self, file_path: str) -> np.ndarray:
        """
        Load an audio file as a fixed-length, amplitude-normalized mono signal.

        Args:
            file_path: Path to audio file

        Returns:
            Array of ``target_length`` samples. If loading or processing
            fails, the error is printed and an all-zero float32 array of the
            same length is returned instead of raising.
        """
        try:
            audio, _ = librosa.load(file_path, sr=self.sample_rate, mono=True)
            audio = self._normalize_length(audio)
            return librosa.util.normalize(audio)
        except Exception as e:
            # Best-effort by design: one unreadable file must not abort a
            # batch run. The placeholder uses float32 so it has the same
            # dtype as librosa's default output on the success path.
            print(f"Error loading {file_path}: {e}")
            return np.zeros(self.target_length, dtype=np.float32)

    def _normalize_length(self, audio: np.ndarray) -> np.ndarray:
        """Zero-pad at the end, or truncate, so the result has ``target_length`` samples."""
        missing = self.target_length - len(audio)
        if missing > 0:
            return np.pad(audio, (0, missing))
        return audio[:self.target_length]

    def extract_mfcc(self, audio: np.ndarray, n_mfcc: int = 40) -> np.ndarray:
        """
        Extract MFCC features from audio.

        Args:
            audio: Audio signal
            n_mfcc: Number of MFCCs to extract

        Returns:
            MFCC features (n_mfcc, time_steps)
        """
        return librosa.feature.mfcc(
            y=audio,
            sr=self.sample_rate,
            n_mfcc=n_mfcc,
            n_fft=2048,
            hop_length=512,
        )

    def extract_mel_spectrogram(self, audio: np.ndarray, n_mels: int = 128) -> np.ndarray:
        """
        Extract a log-scaled mel spectrogram from audio.

        Args:
            audio: Audio signal
            n_mels: Number of mel bands

        Returns:
            Mel spectrogram converted to dB, referenced to its own maximum
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sample_rate,
            n_mels=n_mels,
            n_fft=2048,
            hop_length=512,
        )
        # Convert power to a log (dB) scale relative to the loudest bin.
        return librosa.power_to_db(mel_spec, ref=np.max)

    def extract_spectral_features(self, audio: np.ndarray) -> Dict[str, np.ndarray]:
        """
        Extract various spectral features.

        Args:
            audio: Audio signal

        Returns:
            Dictionary with keys ``spectral_centroid``, ``spectral_rolloff``,
            ``zero_crossing_rate`` (first row of each librosa result) and
            ``chroma``, ``spectral_contrast`` (full librosa results).
        """
        sr = self.sample_rate
        return {
            # [0] keeps only the first row of the librosa result.
            'spectral_centroid': librosa.feature.spectral_centroid(y=audio, sr=sr)[0],
            'spectral_rolloff': librosa.feature.spectral_rolloff(y=audio, sr=sr)[0],
            'zero_crossing_rate': librosa.feature.zero_crossing_rate(audio)[0],
            'chroma': librosa.feature.chroma_stft(y=audio, sr=sr),
            'spectral_contrast': librosa.feature.spectral_contrast(y=audio, sr=sr),
        }

    def extract_all_features(self, audio: np.ndarray) -> Dict[str, np.ndarray]:
        """
        Extract all audio features.

        Args:
            audio: Audio signal

        Returns:
            Dictionary containing ``mfcc``, ``mel_spectrogram`` and every key
            produced by ``extract_spectral_features``
        """
        features = {
            'mfcc': self.extract_mfcc(audio),
            'mel_spectrogram': self.extract_mel_spectrogram(audio),
        }
        features.update(self.extract_spectral_features(audio))
        return features

    def compute_statistics(self, feature_array: np.ndarray) -> np.ndarray:
        """
        Compute statistical features (mean, std, min, max) over the time axis.

        Args:
            feature_array: 2D feature array (features, time). A 1D array is
                treated as a single feature row, so the per-frame vectors
                returned by ``extract_spectral_features`` are accepted too.

        Returns:
            Flattened statistics: all means, then all stds, mins and maxes
        """
        # Promote 1D input to (1, time) so the axis=1 reductions are valid.
        feature_array = np.atleast_2d(feature_array)
        return np.concatenate([
            feature_array.mean(axis=1),
            feature_array.std(axis=1),
            feature_array.min(axis=1),
            feature_array.max(axis=1),
        ])
class AudioAugmenter:
    """Augments audio data for better model generalization.

    The individual transforms are static methods, so they can be called on
    the class or on an instance; ``augment`` applies a random subset of them.
    """

    @staticmethod
    def add_noise(audio: np.ndarray, noise_level: float = 0.005) -> np.ndarray:
        """Return ``audio`` plus standard-normal noise scaled by ``noise_level``."""
        noise = np.random.randn(len(audio))
        return audio + noise_level * noise

    @staticmethod
    def time_stretch(audio: np.ndarray, rate: float = 1.2) -> np.ndarray:
        """Time stretch audio by ``rate`` using librosa."""
        return librosa.effects.time_stretch(audio, rate=rate)

    @staticmethod
    def pitch_shift(audio: np.ndarray, sr: int, n_steps: int = 2) -> np.ndarray:
        """Shift pitch of audio by ``n_steps`` using librosa."""
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

    @staticmethod
    def random_gain(audio: np.ndarray, min_gain: float = 0.8, max_gain: float = 1.2) -> np.ndarray:
        """Scale audio by a gain drawn uniformly from ``[min_gain, max_gain)``."""
        gain = np.random.uniform(min_gain, max_gain)
        return audio * gain

    def augment(self, audio: np.ndarray, sr: int, techniques: Optional[list] = None) -> np.ndarray:
        """
        Apply random augmentation techniques.

        Each requested technique is applied independently when a fresh
        uniform draw exceeds 0.5. Unrecognized technique names are ignored.

        Args:
            audio: Audio signal (not modified; a copy is augmented)
            sr: Sample rate
            techniques: Names to consider, any of ``'noise'``, ``'pitch'``,
                ``'stretch'``, ``'gain'``. Defaults to ``['noise', 'gain']``.

        Returns:
            Augmented audio
        """
        if techniques is None:
            techniques = ['noise', 'gain']

        augmented = audio.copy()
        for technique in techniques:
            if technique == 'noise' and np.random.rand() > 0.5:
                augmented = self.add_noise(augmented)
            elif technique == 'pitch' and np.random.rand() > 0.5:
                # Whole-number shift drawn from -2..2 inclusive.
                n_steps = np.random.randint(-2, 3)
                augmented = self.pitch_shift(augmented, sr, n_steps)
            elif technique == 'stretch' and np.random.rand() > 0.5:
                rate = np.random.uniform(0.9, 1.1)
                augmented = self.time_stretch(augmented, rate)
            elif technique == 'gain' and np.random.rand() > 0.5:
                augmented = self.random_gain(augmented)
        return augmented
def process_dataset(data_dir: str, output_dir: str, preprocessor: AudioPreprocessor):
    """
    Extract and save features for every audio file under a directory tree.

    All ``*.wav`` files, then all ``*.mp3`` files, found recursively under
    ``data_dir`` are loaded and run through the preprocessor. Each feature
    dictionary is written as a compressed ``.npz`` at the same relative path
    under ``output_dir``. A failure on one file is printed and the loop
    moves on to the next file.

    Args:
        data_dir: Directory containing raw audio files
        output_dir: Directory to save processed features
        preprocessor: AudioPreprocessor instance
    """
    source_root = Path(data_dir)
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    audio_files = [
        found
        for pattern in ('*.wav', '*.mp3')
        for found in source_root.rglob(pattern)
    ]
    print(f"Found {len(audio_files)} audio files")

    for audio_file in audio_files:
        try:
            features = preprocessor.extract_all_features(
                preprocessor.load_audio(str(audio_file))
            )
            # Mirror the input layout, swapping the audio suffix for .npz.
            mirrored = audio_file.relative_to(source_root).with_suffix('.npz')
            destination = target_root / mirrored
            destination.parent.mkdir(parents=True, exist_ok=True)
            np.savez_compressed(destination, **features)
        except Exception as e:
            print(f"Error processing {audio_file}: {e}")

    print(f"Processing complete. Features saved to {output_dir}")
if __name__ == "__main__":
    # Script entry point: build a preprocessor for 5-second clips at 16 kHz.
    preprocessor = AudioPreprocessor(
        sample_rate=16000,
        duration=5.0,
    )

    # Single-file example (uncomment and point at a real file to try it):
    # audio = preprocessor.load_audio("path/to/audio.wav")
    # features = preprocessor.extract_all_features(audio)
    # print("MFCC shape:", features['mfcc'].shape)
    # print("Mel spectrogram shape:", features['mel_spectrogram'].shape)