Spaces:
Sleeping
Sleeping
| # src/data_preprocessing.py - Convert patient data to numerical features | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from datetime import datetime | |
| import joblib | |
| import os | |
def preprocess_patient_data(csv_file="data/patient_data.csv",
                            output_csv="data/processed_patient_data.csv",
                            encoders_path="models/encoders.pkl"):
    """
    Convert patient CSV data to numerical features for VAE training.

    Reads ``csv_file`` and builds ten numeric columns from its ``Age``,
    ``Gender``, ``Diagnosis``, ``BloodType``, ``AdmissionDate`` and
    ``DischargeDate`` columns: age, gender, diagnosis, blood_type,
    length_of_stay, age_group, admission_season, admission_day,
    admission_month and admission_year. Missing values in the resulting
    table are replaced by the column mean.

    Side effects: writes the feature table to ``output_csv`` and the fitted
    label encoders to ``encoders_path`` (creating parent directories as
    needed), and prints progress messages.

    Args:
        csv_file: Path of the raw patient CSV to read.
        output_csv: Where the processed feature table is written.
        encoders_path: Where the dict of fitted encoders is dumped with joblib.

    Returns:
        A ``(feature_df, encoders)`` tuple: the processed feature DataFrame
        and a dict with the fitted encoders under the keys ``'gender'``,
        ``'diagnosis'`` and ``'blood_type'``.

    Raises:
        ValueError: if no age group can be derived because every ``Age``
            value is missing.
    """
    print("Loading and preprocessing patient data...")
    # Load data
    df = pd.read_csv(csv_file)
    print(f"Original data shape: {df.shape}")

    # Create numerical features
    features = {}

    # 1. Age (already numerical)
    age = df['Age']
    features['age'] = age.values

    # 2. Gender. LabelEncoder numbers classes in sorted order, so for the
    #    labels 'Female'/'Male' the mapping is Female=0, Male=1.
    gender_encoder = LabelEncoder()
    features['gender'] = gender_encoder.fit_transform(df['Gender'])

    # 3. Diagnosis (encode categorical)
    diagnosis_encoder = LabelEncoder()
    features['diagnosis'] = diagnosis_encoder.fit_transform(df['Diagnosis'])

    # 4. Blood Type (encode categorical)
    blood_encoder = LabelEncoder()
    features['blood_type'] = blood_encoder.fit_transform(df['BloodType'])

    # 5. Length of stay in whole days (discharge date minus admission date)
    admitted = pd.to_datetime(df['AdmissionDate'])
    discharged = pd.to_datetime(df['DischargeDate'])
    features['length_of_stay'] = (discharged - admitted).dt.days

    # 6. Age group (create age categories)
    age_bins = [0, 18, 35, 50, 65, 100]
    age_labels = [0, 1, 2, 3, 4]
    # pd.cut yields NaN for missing ages and for ages outside the bin range,
    # and NaN cannot be cast to int. Impute missing ages with the mean (the
    # same value the 'age' column receives below) and clamp into the bin
    # range so every row gets a group. The 'age' feature itself is not clamped.
    age_for_bins = age.fillna(age.mean()).clip(lower=age_bins[0], upper=age_bins[-1])
    if age_for_bins.isna().any():
        raise ValueError("Cannot derive age groups: every Age value is missing")
    features['age_group'] = pd.cut(
        age_for_bins, bins=age_bins, labels=age_labels, include_lowest=True
    ).astype(int)

    # 7. "Season" of admission, taken as the calendar quarter of the date
    features['admission_season'] = admitted.dt.quarter - 1  # 0=Q1, 1=Q2, 2=Q3, 3=Q4

    # 8. Day of week admission (0=Monday, 6=Sunday)
    features['admission_day'] = admitted.dt.dayofweek

    # 9. Month of admission (0-11)
    features['admission_month'] = admitted.dt.month - 1

    # 10. Year of admission as an offset from 2020
    features['admission_year'] = admitted.dt.year - 2020

    # Convert to DataFrame
    feature_df = pd.DataFrame(features)

    # Handle any missing values (e.g. missing ages or unparseable dates)
    feature_df = feature_df.fillna(feature_df.mean())
    print(f"Processed features shape: {feature_df.shape}")
    print("Feature columns:", list(feature_df.columns))

    # Save encoders for later use
    encoders = {
        'gender': gender_encoder,
        'diagnosis': diagnosis_encoder,
        'blood_type': blood_encoder,
    }
    encoders_dir = os.path.dirname(encoders_path)
    if encoders_dir:
        os.makedirs(encoders_dir, exist_ok=True)
    joblib.dump(encoders, encoders_path)

    # Save processed data
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    feature_df.to_csv(output_csv, index=False)

    print("Data preprocessing completed!")
    print(f"Number of features: {feature_df.shape[1]}")
    return feature_df, encoders
def create_sample_data_for_training(n_samples=1000, seed=42,
                                    output_path='data/patient_data.csv'):
    """
    Create a sample dataset if the original data is not available.

    Draws ``n_samples`` synthetic rows with the columns ``age``, ``gender``,
    ``bmi``, ``blood_pressure``, ``diabetes``, ``cholesterol`` and
    ``heart_rate``, writes them to ``output_path`` as CSV (creating the parent
    directory if needed) and returns them.

    NOTE(review): these column names differ from the ones
    preprocess_patient_data reads (``Age``, ``Gender``, ``Diagnosis``,
    ``BloodType``, ``AdmissionDate``, ``DischargeDate``), so the generated
    file cannot be passed to that function as-is. The default ``output_path``
    is also that function's default input path, so any file already there is
    overwritten.

    Args:
        n_samples: Number of synthetic patients to generate.
        seed: Seed passed to ``np.random.seed``; this reseeds NumPy's global
            random state so repeated calls produce the same data.
        output_path: Destination CSV path.

    Returns:
        The generated DataFrame, one row per synthetic patient.
    """
    print("Creating sample patient data for training...")
    np.random.seed(seed)

    # The draws below run in a fixed order so that a given seed always yields
    # the same table. Continuous values are clipped to the stated ranges.
    data = {
        'age': np.random.normal(50, 20, n_samples).clip(1, 100),
        'gender': np.random.choice([0, 1], n_samples),
        'bmi': np.random.normal(25, 5, n_samples).clip(15, 50),
        'blood_pressure': np.random.normal(120, 20, n_samples).clip(80, 200),
        'diabetes': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'cholesterol': np.random.normal(200, 40, n_samples).clip(100, 300),
        'heart_rate': np.random.normal(75, 15, n_samples).clip(40, 120),
    }
    df = pd.DataFrame(data)

    # Only create a directory when the path actually has a directory part.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

    print(f"Sample data created with {n_samples} patients")
    return df
if __name__ == "__main__":
    # Default input path of preprocess_patient_data, and also the path
    # create_sample_data_for_training writes to by default.
    raw_csv = "data/patient_data.csv"
    try:
        # Try to preprocess the real data
        feature_df, encoders = preprocess_patient_data()
        print("Successfully processed real patient data!")
    except Exception as e:
        print(f"Error processing real data: {e}")
        if os.path.exists(raw_csv):
            # The sample generator would overwrite this file. Never replace
            # existing data just because it failed to process; report the
            # failure through the exit status instead.
            print(f"Existing file {raw_csv} was left untouched; fix the error above and re-run.")
            raise SystemExit(1)
        print("Creating sample data instead...")
        create_sample_data_for_training()