Spaces:
Build error
Build error
| """ | |
| Utility functions for preprocessing tabular and audio data | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| import librosa | |
| from sklearn.impute import SimpleImputer | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
class TabularPreprocessor:
    """Preprocessor for tabular medical data.

    Selects the columns named in ``feature_cols`` (in that order) and fills
    missing values with a most-frequent-value ``SimpleImputer``.
    """

    def __init__(self):
        # The order of this list is the column order of transform()'s output.
        self.feature_cols = [
            'age', 'gender', 'tbContactHistory', 'wheezingHistory',
            'phlegmCough', 'familyAsthmaHistory', 'feverHistory',
            'coldPresent', 'packYears',
        ]
        self.imputer = SimpleImputer(strategy='most_frequent')
        self.is_fitted = False

    def fit(self, X_df):
        """Fit the imputer on training data.

        Args:
            X_df: DataFrame containing at least every column in
                ``feature_cols``.

        Returns:
            self, so calls can be chained.
        """
        self.imputer.fit(X_df[self.feature_cols].copy())
        self.is_fitted = True
        return self

    def transform(self, data_dict):
        """Transform a single input dictionary to model-ready format.

        Args:
            data_dict: Dictionary with keys matching feature_cols,
                e.g. {'age': 43, 'gender': 1, ...}. Keys that are absent are
                treated as missing values; keys not in feature_cols are
                ignored.

        Returns:
            numpy float32 array of shape (1, n_features)
        """
        # Align to the expected columns *before* anything else so that absent
        # keys become NaN columns instead of raising KeyError further down.
        X = pd.DataFrame([data_dict]).reindex(columns=self.feature_cols)

        if not self.is_fitted:
            # Fallback for the case where fit() was never called (e.g. only a
            # pre-trained model was loaded): fit the imputer on this record.
            # Missing entries are filled with 0 for the fit only, so every
            # column gets an imputation statistic and the output keeps all
            # n_features columns (by default SimpleImputer discards columns
            # that were entirely missing at fit time).
            # NOTE(review): after this fallback, later calls are imputed from
            # this first record's values -- call fit() with real training data
            # when it is available.
            self.imputer.fit(X.fillna(0))
            self.is_fitted = True

        X_imputed = self.imputer.transform(X)
        return X_imputed.astype(np.float32)
class AudioPreprocessor:
    """Preprocessor for audio data (cough and vowel sounds)."""

    def __init__(self, sample_rate=16000, duration=1.0, n_mfcc=20,
                 n_fft=2048, hop_length=512, n_mels=64):
        """
        Initialize audio preprocessor with same parameters as training

        Args:
            sample_rate: Target sample rate (Hz)
            duration: Target duration in seconds
            n_mfcc: Number of MFCC coefficients
            n_fft: FFT window size
            hop_length: Hop length for STFT
            n_mels: Number of mel bands
        """
        self.sample_rate = sample_rate
        self.duration = duration
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.input_size = n_mfcc * 3  # MFCC + Delta + Delta-Delta

    def load_and_extract_features(self, audio_path_or_array):
        """
        Load audio and extract MFCC, delta and delta-delta features

        Args:
            audio_path_or_array: Path to an audio file (str or os.PathLike),
                or an array-like of samples. Array input is NOT resampled; it
                is taken to already be at ``sample_rate``.

        Returns:
            numpy array of shape (n_frames, n_mfcc*3). If anything fails, the
            error is printed and an all-zero array of the expected shape is
            returned instead of raising.
        """
        import os  # local import: only needed for the PathLike check below

        try:
            if isinstance(audio_path_or_array, (str, os.PathLike)):
                # pathlib.Path and friends are files too, not sample arrays.
                audio, _ = librosa.load(audio_path_or_array, sr=self.sample_rate)
            else:
                # Accept lists/tuples as well as ndarrays.
                audio = np.asarray(audio_path_or_array)
                if not np.issubdtype(audio.dtype, np.floating):
                    # librosa's feature extraction expects floating-point
                    # samples. NOTE(review): integer PCM is cast, not rescaled
                    # to [-1, 1] -- confirm what callers actually pass.
                    audio = audio.astype(np.float32)

            # Ensure audio is mono.
            # NOTE(review): averaging over axis 0 assumes a
            # (channels, samples) layout -- confirm for array inputs.
            if audio.ndim > 1:
                audio = np.mean(audio, axis=0)

            # Force exactly `duration` seconds: truncate or zero-pad the tail.
            target_samples = int(self.sample_rate * self.duration)
            if len(audio) > target_samples:
                audio = audio[:target_samples]
            else:
                audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')

            mfcc = librosa.feature.mfcc(
                y=audio,
                sr=self.sample_rate,
                n_mfcc=self.n_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                n_mels=self.n_mels,
            )
            delta = librosa.feature.delta(mfcc)
            delta_delta = librosa.feature.delta(mfcc, order=2)

            # Stack to (n_mfcc*3, n_frames), then transpose to
            # (n_frames, n_mfcc*3).
            features = np.vstack([mfcc, delta, delta_delta]).T

            # Replace NaN/Inf with 0 and clip extreme values.
            features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
            return np.clip(features, -1e6, 1e6)
        except Exception as e:
            # Deliberate best-effort: a bad clip yields zero features rather
            # than an exception for the caller.
            print(f"Error processing audio: {str(e)}")
            expected_frames = int((self.sample_rate * self.duration) / self.hop_length) + 1
            return np.zeros((expected_frames, self.input_size))

    def extract_from_both_audios(self, cough_audio, vowel_audio, combine_mode="concat"):
        """
        Extract features from both cough and vowel audio

        Args:
            cough_audio: Path to cough audio or numpy array
            vowel_audio: Path to vowel audio or numpy array
            combine_mode: How to combine features ("concat" or "average")

        Returns:
            Combined features as numpy array: "concat" joins the two feature
            matrices along the feature axis, "average" takes their
            element-wise mean.

        Raises:
            ValueError: If combine_mode is not "concat" or "average".
        """
        # Reject a bad mode up front instead of after two feature extractions.
        if combine_mode not in ("concat", "average"):
            raise ValueError(f"Unknown combine_mode: {combine_mode}")

        cough_features = self.load_and_extract_features(cough_audio)
        vowel_features = self.load_and_extract_features(vowel_audio)

        if combine_mode == "concat":
            return np.concatenate([cough_features, vowel_features], axis=1)
        return (cough_features + vowel_features) / 2.0
def get_disease_name(prediction) -> str:
    """Convert disease prediction to readable name.

    Args:
        prediction: Class index (0, 1 or 2), or anything ``int()`` can
            convert to one.

    Returns:
        The display name for the class, or "Unknown" when the value is not a
        known class or cannot be converted to an integer (None, NaN,
        infinity, non-numeric strings).
    """
    disease_map = {
        0: "Healthy",
        1: "COPD (Chronic Obstructive Pulmonary Disease)",
        2: "Asthma",
    }
    try:
        class_index = int(prediction)
    except (TypeError, ValueError, OverflowError):
        # Unconvertible input gets the same fallback as an unknown class
        # instead of crashing the caller.
        return "Unknown"
    return disease_map.get(class_index, "Unknown")
def get_disease_info(prediction) -> dict:
    """Get detailed information about the predicted disease.

    Args:
        prediction: Class index (0, 1 or 2), or anything ``int()`` can
            convert to one.

    Returns:
        Dict with keys "name", "description" and "recommendations". Values
        that are not a known class, or cannot be converted to an integer,
        yield an "Unknown" entry with the same keys.
    """
    info_map = {
        0: {
            "name": "Healthy",
            "description": "No respiratory disease detected. Lung function appears normal.",
            "recommendations": [
                "Maintain regular exercise and healthy lifestyle",
                "Avoid smoking and secondhand smoke",
                "Get regular health check-ups",
            ],
        },
        1: {
            "name": "COPD (Chronic Obstructive Pulmonary Disease)",
            "description": "A chronic inflammatory lung disease that causes obstructed airflow from the lungs.",
            "recommendations": [
                "Consult with a pulmonologist for proper diagnosis",
                "Consider pulmonary rehabilitation program",
                "Quit smoking if applicable",
                "Use prescribed medications as directed",
                "Get vaccinated against flu and pneumonia",
            ],
        },
        2: {
            "name": "Asthma",
            "description": "A condition in which airways narrow and swell, producing extra mucus.",
            "recommendations": [
                "Consult with an allergist or pulmonologist",
                "Identify and avoid asthma triggers",
                "Use prescribed inhalers as directed",
                "Monitor breathing with a peak flow meter",
                "Have an asthma action plan",
            ],
        },
    }
    # An unrecognised class must not be presented as "Healthy"; this mirrors
    # the "Unknown" fallback used for display names.
    unknown_info = {
        "name": "Unknown",
        "description": "The prediction does not correspond to a known condition.",
        "recommendations": [
            "Consult a healthcare professional for a proper evaluation",
        ],
    }
    try:
        class_index = int(prediction)
    except (TypeError, ValueError, OverflowError):
        return unknown_info
    return info_map.get(class_index, unknown_info)