""" Utility functions for preprocessing tabular and audio data """ import numpy as np import pandas as pd import librosa from sklearn.impute import SimpleImputer import warnings warnings.filterwarnings('ignore') class TabularPreprocessor: """Preprocessor for tabular medical data""" def __init__(self): self.feature_cols = [ 'age', 'gender', 'tbContactHistory', 'wheezingHistory', 'phlegmCough', 'familyAsthmaHistory', 'feverHistory', 'coldPresent', 'packYears' ] self.imputer = SimpleImputer(strategy='most_frequent') self.is_fitted = False def fit(self, X_df): """Fit the imputer on training data""" X_train = X_df[self.feature_cols].copy() self.imputer.fit(X_train) self.is_fitted = True return self def transform(self, data_dict): """ Transform a single input dictionary to model-ready format Args: data_dict: Dictionary with keys matching feature_cols e.g., {'age': 43, 'gender': 1, ...} Returns: numpy array of shape (1, n_features) """ if not self.is_fitted: # If not fitted, create a simple imputer with most_frequent strategy # This handles the case where we load a pre-trained model self.imputer = SimpleImputer(strategy='most_frequent') # Create a dummy dataframe to fit dummy_df = pd.DataFrame([data_dict]) self.imputer.fit(dummy_df[self.feature_cols]) self.is_fitted = True # Create dataframe from input df = pd.DataFrame([data_dict]) # Ensure all feature columns exist for col in self.feature_cols: if col not in df.columns: df[col] = np.nan # Select and order features X = df[self.feature_cols].copy() # Impute missing values X_imputed = self.imputer.transform(X) # Convert to float32 X_imputed = X_imputed.astype(np.float32) return X_imputed class AudioPreprocessor: """Preprocessor for audio data (cough and vowel sounds)""" def __init__(self, sample_rate=16000, duration=1.0, n_mfcc=20, n_fft=2048, hop_length=512, n_mels=64): """ Initialize audio preprocessor with same parameters as training Args: sample_rate: Target sample rate (Hz) duration: Target duration in seconds n_mfcc: Number of MFCC coefficients n_fft: FFT window size hop_length: Hop length for STFT n_mels: Number of mel bands """ self.sample_rate = sample_rate self.duration = duration self.n_mfcc = n_mfcc self.n_fft = n_fft self.hop_length = hop_length self.n_mels = n_mels self.input_size = n_mfcc * 3 # MFCC + Delta + Delta-Delta def load_and_extract_features(self, audio_path_or_array): """ Load audio file and extract MFCC features Args: audio_path_or_array: Path to audio file or numpy array Returns: numpy array of shape (n_frames, n_mfcc*3) """ try: # Load audio if isinstance(audio_path_or_array, str): audio, sr = librosa.load(audio_path_or_array, sr=self.sample_rate) else: # Assume it's already an array audio = audio_path_or_array sr = self.sample_rate # Ensure audio is mono if len(audio.shape) > 1: audio = np.mean(audio, axis=0) # Segment to target duration (1 second) target_samples = int(self.sample_rate * self.duration) if len(audio) > target_samples: audio = audio[:target_samples] else: audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant') # Extract MFCC mfcc = librosa.feature.mfcc( y=audio, sr=self.sample_rate, n_mfcc=self.n_mfcc, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels ) # Extract delta and delta-delta delta = librosa.feature.delta(mfcc) delta_delta = librosa.feature.delta(mfcc, order=2) # Combine features (n_mfcc*3, n_frames) features = np.vstack([mfcc, delta, delta_delta]) # Transpose to (n_frames, n_mfcc*3) features = features.T # Handle NaN and Inf values features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0) # Clip extreme values features = np.clip(features, -1e6, 1e6) return features except Exception as e: print(f"Error processing audio: {str(e)}") # Return zero features on error expected_frames = int((self.sample_rate * self.duration) / self.hop_length) + 1 return np.zeros((expected_frames, self.input_size)) def extract_from_both_audios(self, cough_audio, vowel_audio, combine_mode="concat"): """ Extract features from both cough and vowel audio Args: cough_audio: Path to cough audio or numpy array vowel_audio: Path to vowel audio or numpy array combine_mode: How to combine features ("concat" or "average") Returns: Combined features as numpy array """ cough_features = self.load_and_extract_features(cough_audio) vowel_features = self.load_and_extract_features(vowel_audio) if combine_mode == "concat": # Concatenate along feature dimension combined = np.concatenate([cough_features, vowel_features], axis=1) elif combine_mode == "average": # Average the features combined = (cough_features + vowel_features) / 2.0 else: raise ValueError(f"Unknown combine_mode: {combine_mode}") return combined def get_disease_name(prediction): """Convert disease prediction to readable name""" disease_map = { 0: "Healthy", 1: "COPD (Chronic Obstructive Pulmonary Disease)", 2: "Asthma" } return disease_map.get(int(prediction), "Unknown") def get_disease_info(prediction): """Get detailed information about the predicted disease""" info_map = { 0: { "name": "Healthy", "description": "No respiratory disease detected. Lung function appears normal.", "recommendations": [ "Maintain regular exercise and healthy lifestyle", "Avoid smoking and secondhand smoke", "Get regular health check-ups" ] }, 1: { "name": "COPD (Chronic Obstructive Pulmonary Disease)", "description": "A chronic inflammatory lung disease that causes obstructed airflow from the lungs.", "recommendations": [ "Consult with a pulmonologist for proper diagnosis", "Consider pulmonary rehabilitation program", "Quit smoking if applicable", "Use prescribed medications as directed", "Get vaccinated against flu and pneumonia" ] }, 2: { "name": "Asthma", "description": "A condition in which airways narrow and swell, producing extra mucus.", "recommendations": [ "Consult with an allergist or pulmonologist", "Identify and avoid asthma triggers", "Use prescribed inhalers as directed", "Monitor breathing with a peak flow meter", "Have an asthma action plan" ] } } return info_map.get(int(prediction), info_map[0])