# src/data_preprocessing.py - Convert patient data to numerical features
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


def preprocess_patient_data(csv_file="data/patient_data.csv"):
    """
    Convert patient CSV data to numerical features for VAE training.

    Reads the columns ``Age``, ``Gender``, ``Diagnosis``, ``BloodType``,
    ``AdmissionDate`` and ``DischargeDate`` from ``csv_file`` and derives
    ten numerical feature columns from them.

    Side effects (paths are relative to the current working directory):
      * writes the fitted label encoders to ``models/encoders.pkl``
      * writes the feature table to ``data/processed_patient_data.csv``

    Args:
        csv_file: Path of the raw patient CSV file.

    Returns:
        A ``(feature_df, encoders)`` tuple: the numerical feature DataFrame
        and a dict with the fitted ``LabelEncoder`` objects under the keys
        ``'gender'``, ``'diagnosis'`` and ``'blood_type'``.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If one of the expected columns is missing.
    """
    print("Loading and preprocessing patient data...")

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Original data shape: {df.shape}")

    # Create numerical features
    features = {}

    # 1. Age (already numerical)
    features['age'] = df['Age'].values

    # 2. Gender (encode categorical). LabelEncoder numbers the classes in
    #    sorted order, so e.g. "Female" -> 0 and "Male" -> 1; the exact
    #    mapping is available afterwards in gender_encoder.classes_.
    gender_encoder = LabelEncoder()
    features['gender'] = gender_encoder.fit_transform(df['Gender'])

    # 3. Diagnosis (encode categorical)
    diagnosis_encoder = LabelEncoder()
    features['diagnosis'] = diagnosis_encoder.fit_transform(df['Diagnosis'])

    # 4. Blood Type (encode categorical)
    blood_encoder = LabelEncoder()
    features['blood_type'] = blood_encoder.fit_transform(df['BloodType'])

    # 5. Length of stay (calculate from admission/discharge dates)
    df['AdmissionDate'] = pd.to_datetime(df['AdmissionDate'])
    df['DischargeDate'] = pd.to_datetime(df['DischargeDate'])
    features['length_of_stay'] = (df['DischargeDate'] - df['AdmissionDate']).dt.days

    # 6. Age group (create age categories)
    #    Bins are right-closed: [0, 18], (18, 35], (35, 50], (50, 65], (65, inf).
    #    The top bin is open-ended so ages above 100 land in group 4 instead
    #    of becoming NaN, which would make the integer cast below raise.
    age_bins = [0, 18, 35, 50, 65, np.inf]
    age_labels = [0, 1, 2, 3, 4]
    # Missing ages are imputed with the mean *before* binning, for the same
    # reason; this matches the mean-fill applied to the 'age' column below.
    known_age = df['Age'].fillna(df['Age'].mean())
    features['age_group'] = pd.cut(
        known_age, bins=age_bins, labels=age_labels, include_lowest=True
    ).astype(int)

    # 7. Season of admission (extract from admission date)
    features['admission_season'] = df['AdmissionDate'].dt.quarter - 1  # 0=Q1, 1=Q2, 2=Q3, 3=Q4

    # 8. Day of week admission (0=Monday, 6=Sunday)
    features['admission_day'] = df['AdmissionDate'].dt.dayofweek

    # 9. Month of admission (0-11)
    features['admission_month'] = df['AdmissionDate'].dt.month - 1

    # 10. Year of admission (offset so that 2020 maps to 0)
    features['admission_year'] = df['AdmissionDate'].dt.year - 2020

    # Convert to DataFrame
    feature_df = pd.DataFrame(features)

    # Handle any missing values (column-wise mean imputation)
    feature_df = feature_df.fillna(feature_df.mean())

    print(f"Processed features shape: {feature_df.shape}")
    print("Feature columns:", list(feature_df.columns))

    # Save encoders for later use
    encoders = {
        'gender': gender_encoder,
        'diagnosis': diagnosis_encoder,
        'blood_type': blood_encoder,
    }

    os.makedirs("models", exist_ok=True)
    joblib.dump(encoders, 'models/encoders.pkl')

    # Save processed data
    os.makedirs("data", exist_ok=True)
    feature_df.to_csv('data/processed_patient_data.csv', index=False)

    print("Data preprocessing completed!")
    print(f"Number of features: {feature_df.shape[1]}")

    return feature_df, encoders


def create_sample_data_for_training():
    """
    Create a sample dataset if the original data is not available.

    Generates 1000 synthetic patients with a fixed random seed and writes
    them to ``data/patient_data.csv`` (relative to the current working
    directory), replacing any file already at that path. Seeding is done
    through ``np.random.seed``, so NumPy's global random state is reset.

    NOTE: the generated columns (``age``, ``gender``, ``bmi``,
    ``blood_pressure``, ``diabetes``, ``cholesterol``, ``heart_rate``) are
    already numerical and are *not* the columns that
    ``preprocess_patient_data`` reads (``Age``, ``Gender``, ``Diagnosis``,
    ...), so this file cannot be passed through that function.

    Returns:
        The generated sample DataFrame.
    """
    print("Creating sample patient data for training...")

    np.random.seed(42)
    n_samples = 1000

    # Generate realistic patient data; out-of-range draws are clipped.
    data = {
        'age': np.random.normal(50, 20, n_samples).clip(1, 100),
        'gender': np.random.choice([0, 1], n_samples),
        'bmi': np.random.normal(25, 5, n_samples).clip(15, 50),
        'blood_pressure': np.random.normal(120, 20, n_samples).clip(80, 200),
        'diabetes': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'cholesterol': np.random.normal(200, 40, n_samples).clip(100, 300),
        'heart_rate': np.random.normal(75, 15, n_samples).clip(40, 120),
    }

    df = pd.DataFrame(data)

    os.makedirs("data", exist_ok=True)
    df.to_csv('data/patient_data.csv', index=False)

    print(f"Sample data created with {n_samples} patients")
    return df


if __name__ == "__main__":
    try:
        # Try to preprocess the real data
        feature_df, encoders = preprocess_patient_data()
    except FileNotFoundError as e:
        # Only fall back to synthetic data when the real file is absent:
        # the generator writes to the same path, so running it after any
        # other failure would overwrite the real patient data.
        print(f"Error processing real data: {e}")
        print("Creating sample data instead...")
        create_sample_data_for_training()
    except Exception as e:
        print(f"Error processing real data: {e}")
        print("Existing data file left untouched; sample data was not created.")
    else:
        print("Successfully processed real patient data!")