"""
Audio preprocessing and feature extraction module for respiratory disease detection.
"""

import warnings
from pathlib import Path
from typing import Dict, Optional, Tuple

import librosa
import numpy as np
import soundfile as sf

# NOTE: this silences *every* warning process-wide for any importer of this
# module. Kept as-is because callers may rely on the quiet output.
warnings.filterwarnings('ignore')


class AudioPreprocessor:
    """Handles audio loading, normalization, and feature extraction."""

    def __init__(self, sample_rate: int = 16000, duration: float = 5.0):
        """
        Initialize the audio preprocessor.

        Args:
            sample_rate: Target sample rate for all audio files
            duration: Target duration in seconds (will pad/trim)
        """
        self.sample_rate = sample_rate
        self.duration = duration
        # Number of samples every loaded clip is padded/trimmed to.
        self.target_length = int(sample_rate * duration)

    def load_audio(self, file_path: str) -> np.ndarray:
        """
        Load an audio file, fix its length, and peak-normalize it.

        The file is loaded through ``librosa.load`` with ``sr=self.sample_rate``
        and ``mono=True``, padded/trimmed to ``self.target_length`` samples,
        then passed through ``librosa.util.normalize``.

        Loading is best-effort: on any exception the error is printed and an
        all-zero array of ``self.target_length`` samples is returned instead
        of raising, so callers cannot distinguish a failed load from silence
        by the return value alone.

        Args:
            file_path: Path to audio file

        Returns:
            Normalized audio array of length ``self.target_length``
        """
        try:
            # The returned sample rate is not needed: librosa was asked to
            # load at self.sample_rate.
            audio, _ = librosa.load(file_path, sr=self.sample_rate, mono=True)

            # Fix the length first so normalization sees the final signal.
            audio = self._normalize_length(audio)

            # Normalize amplitude
            audio = librosa.util.normalize(audio)

            return audio
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return np.zeros(self.target_length)

    def _normalize_length(self, audio: np.ndarray) -> np.ndarray:
        """Pad with trailing zeros or trim the end so len(audio) == target_length."""
        if len(audio) < self.target_length:
            # Pad with zeros (at the end only)
            audio = np.pad(audio, (0, self.target_length - len(audio)))
        else:
            # Trim to target length
            audio = audio[:self.target_length]
        return audio

    def extract_mfcc(self, audio: np.ndarray, n_mfcc: int = 40) -> np.ndarray:
        """
        Extract MFCC features from audio.

        Args:
            audio: Audio signal
            n_mfcc: Number of MFCCs to extract

        Returns:
            MFCC features (n_mfcc, time_steps)
        """
        mfcc = librosa.feature.mfcc(
            y=audio,
            sr=self.sample_rate,
            n_mfcc=n_mfcc,
            n_fft=2048,
            hop_length=512
        )
        return mfcc

    def extract_mel_spectrogram(self, audio: np.ndarray, n_mels: int = 128) -> np.ndarray:
        """
        Extract a log-scaled (dB) mel spectrogram from audio.

        Args:
            audio: Audio signal
            n_mels: Number of mel bands

        Returns:
            Mel spectrogram in dB, referenced to the spectrogram's own maximum
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sample_rate,
            n_mels=n_mels,
            n_fft=2048,
            hop_length=512
        )
        # Convert to log scale
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        return mel_spec_db

    def extract_spectral_features(self, audio: np.ndarray) -> Dict[str, np.ndarray]:
        """
        Extract various spectral features.

        ``spectral_centroid``, ``spectral_rolloff`` and ``zero_crossing_rate``
        are stored as their first row only (indexed with ``[0]``); ``chroma``
        and ``spectral_contrast`` are stored as returned by librosa.

        Args:
            audio: Audio signal

        Returns:
            Dictionary of spectral features
        """
        features = {}

        # Spectral centroid
        features['spectral_centroid'] = librosa.feature.spectral_centroid(
            y=audio, sr=self.sample_rate
        )[0]

        # Spectral rolloff
        features['spectral_rolloff'] = librosa.feature.spectral_rolloff(
            y=audio, sr=self.sample_rate
        )[0]

        # Zero crossing rate
        features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(audio)[0]

        # Chroma features
        features['chroma'] = librosa.feature.chroma_stft(
            y=audio, sr=self.sample_rate
        )

        # Spectral contrast
        features['spectral_contrast'] = librosa.feature.spectral_contrast(
            y=audio, sr=self.sample_rate
        )

        return features

    def extract_all_features(self, audio: np.ndarray) -> Dict[str, np.ndarray]:
        """
        Extract all audio features.

        Combines MFCCs and the mel spectrogram (both with their default
        sizes) with everything from ``extract_spectral_features``.

        Args:
            audio: Audio signal

        Returns:
            Dictionary containing all features
        """
        features = {
            'mfcc': self.extract_mfcc(audio),
            'mel_spectrogram': self.extract_mel_spectrogram(audio),
        }
        features.update(self.extract_spectral_features(audio))
        return features

    def compute_statistics(self, feature_array: np.ndarray) -> np.ndarray:
        """
        Compute statistical features (mean, std, min, max) from feature array.

        Statistics are taken along axis 1 and concatenated in the order
        all means, all stds, all mins, all maxes.

        Args:
            feature_array: 2D feature array (features, time). A 1D array
                (such as the per-frame features returned by
                ``extract_spectral_features``) is treated as a single
                feature row.

        Returns:
            Flattened statistical features
        """
        # Promote 1-D input to (1, time); the axis=1 reductions below would
        # otherwise raise on it. 2-D input passes through unchanged.
        feature_array = np.atleast_2d(feature_array)
        return np.concatenate([
            np.mean(feature_array, axis=1),
            np.std(feature_array, axis=1),
            np.min(feature_array, axis=1),
            np.max(feature_array, axis=1),
        ])


class AudioAugmenter:
    """Augments audio data for better model generalization."""

    @staticmethod
    def add_noise(audio: np.ndarray, noise_level: float = 0.005) -> np.ndarray:
        """Add standard-normal noise scaled by noise_level to audio."""
        noise = np.random.randn(len(audio))
        return audio + noise_level * noise

    @staticmethod
    def time_stretch(audio: np.ndarray, rate: float = 1.2) -> np.ndarray:
        """Time stretch audio via librosa.effects.time_stretch."""
        return librosa.effects.time_stretch(audio, rate=rate)

    @staticmethod
    def pitch_shift(audio: np.ndarray, sr: int, n_steps: int = 2) -> np.ndarray:
        """Shift pitch of audio by n_steps via librosa.effects.pitch_shift."""
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

    @staticmethod
    def random_gain(audio: np.ndarray, min_gain: float = 0.8, max_gain: float = 1.2) -> np.ndarray:
        """Scale audio by a gain drawn uniformly from [min_gain, max_gain)."""
        gain = np.random.uniform(min_gain, max_gain)
        return audio * gain

    def augment(self, audio: np.ndarray, sr: int, techniques: Optional[list] = None) -> np.ndarray:
        """
        Apply random augmentation techniques.

        Techniques are visited in the order given; each recognized one is
        applied only when an independent ``np.random.rand()`` draw exceeds
        0.5. Names other than 'noise', 'pitch', 'stretch' and 'gain' are
        ignored. The input array is copied, not modified in place.

        Args:
            audio: Audio signal
            sr: Sample rate
            techniques: List of augmentation techniques to apply
                (defaults to ['noise', 'gain'])

        Returns:
            Augmented audio. NOTE(review): 'stretch' is not followed by any
            pad/trim here, so the result is presumably not the input length
            when it fires -- confirm callers re-fix the length if needed.
        """
        if techniques is None:
            techniques = ['noise', 'gain']

        augmented = audio.copy()

        for technique in techniques:
            # The random draw sits behind the name check so that exactly one
            # draw is consumed per recognized technique.
            if technique == 'noise' and np.random.rand() > 0.5:
                augmented = self.add_noise(augmented)
            elif technique == 'pitch' and np.random.rand() > 0.5:
                n_steps = np.random.randint(-2, 3)  # integer in [-2, 2]
                augmented = self.pitch_shift(augmented, sr, n_steps)
            elif technique == 'stretch' and np.random.rand() > 0.5:
                rate = np.random.uniform(0.9, 1.1)
                augmented = self.time_stretch(augmented, rate)
            elif technique == 'gain' and np.random.rand() > 0.5:
                augmented = self.random_gain(augmented)

        return augmented


def process_dataset(data_dir: str, output_dir: str, preprocessor: AudioPreprocessor):
    """
    Process all audio files in a dataset directory.

    Recursively finds ``*.wav`` and ``*.mp3`` files under ``data_dir``,
    extracts all features for each, and writes them as a compressed ``.npz``
    at the same relative path under ``output_dir``. Errors on individual
    files are printed and do not stop the run.

    Caveats visible in the code:
      * ``preprocessor.load_audio`` returns silence instead of raising when
        a file cannot be read, so features are still written for such files.
      * ``a.wav`` and ``a.mp3`` in the same directory map to the same
        ``a.npz``; the one processed later overwrites the earlier one.

    Args:
        data_dir: Directory containing raw audio files
        output_dir: Directory to save processed features
        preprocessor: AudioPreprocessor instance
    """
    data_path = Path(data_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    audio_files = list(data_path.rglob('*.wav')) + list(data_path.rglob('*.mp3'))

    print(f"Found {len(audio_files)} audio files")

    for audio_file in audio_files:
        try:
            # Load and preprocess audio
            audio = preprocessor.load_audio(str(audio_file))

            # Extract features
            features = preprocessor.extract_all_features(audio)

            # Save features, mirroring the input directory layout
            relative_path = audio_file.relative_to(data_path)
            output_file = output_path / relative_path.with_suffix('.npz')
            output_file.parent.mkdir(parents=True, exist_ok=True)

            np.savez_compressed(output_file, **features)

        except Exception as e:
            print(f"Error processing {audio_file}: {e}")

    print(f"Processing complete. Features saved to {output_dir}")


if __name__ == "__main__":
    # Example usage
    preprocessor = AudioPreprocessor(sample_rate=16000, duration=5.0)

    # Process a single file (example)
    # audio = preprocessor.load_audio("path/to/audio.wav")
    # features = preprocessor.extract_all_features(audio)
    # print("MFCC shape:", features['mfcc'].shape)
    # print("Mel spectrogram shape:", features['mel_spectrogram'].shape)