File size: 4,346 Bytes
902fa1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# src/data_preprocessing.py - Convert patient data to numerical features
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime
import joblib
import os

def preprocess_patient_data(csv_file="data/patient_data.csv",
                            output_csv="data/processed_patient_data.csv",
                            encoders_path="models/encoders.pkl"):
    """
    Convert patient CSV data to numerical features for VAE training.

    Args:
        csv_file: Path of the raw patient CSV. It must contain the columns
            Age, Gender, Diagnosis, BloodType, AdmissionDate and
            DischargeDate.
        output_csv: Path the processed feature table is written to.
        encoders_path: Path the fitted label encoders are dumped to (joblib).

    Returns:
        A ``(feature_df, encoders)`` tuple: the numeric feature DataFrame
        (one row per input row, ten columns) and a dict holding the fitted
        LabelEncoder objects under the keys 'gender', 'diagnosis' and
        'blood_type'.

    Raises:
        KeyError: If any required column is missing from the CSV.
    """
    print("Loading and preprocessing patient data...")

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Original data shape: {df.shape}")

    # Fail early with the complete list of missing columns instead of a bare
    # KeyError on whichever column happens to be accessed first.
    required_columns = (
        'Age', 'Gender', 'Diagnosis', 'BloodType',
        'AdmissionDate', 'DischargeDate',
    )
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Missing required column(s) in {csv_file}: {missing_columns}"
        )

    # Create numerical features (insertion order defines the column order)
    features = {}

    # 1. Age (already numerical)
    features['age'] = df['Age'].values

    # 2. Gender. LabelEncoder numbers the classes in sorted order, so with
    #    the labels 'Female'/'Male' the codes are Female=0, Male=1.
    gender_encoder = LabelEncoder()
    features['gender'] = gender_encoder.fit_transform(df['Gender'])

    # 3. Diagnosis (encode categorical)
    diagnosis_encoder = LabelEncoder()
    features['diagnosis'] = diagnosis_encoder.fit_transform(df['Diagnosis'])

    # 4. Blood Type (encode categorical)
    blood_encoder = LabelEncoder()
    features['blood_type'] = blood_encoder.fit_transform(df['BloodType'])

    # 5. Length of stay in whole days (discharge minus admission)
    df['AdmissionDate'] = pd.to_datetime(df['AdmissionDate'])
    df['DischargeDate'] = pd.to_datetime(df['DischargeDate'])
    admission = df['AdmissionDate']
    features['length_of_stay'] = (df['DischargeDate'] - admission).dt.days

    # 6. Age group: 0=[0,18], 1=(18,35], 2=(35,50], 3=(50,65], 4=(65,inf).
    #    labels=False yields the bin index directly and leaves NaN for a
    #    missing or negative age, so those rows reach the mean-fill step
    #    below instead of crashing an int cast. The open upper edge keeps
    #    ages above 100 in the oldest group.
    age_bins = [0, 18, 35, 50, 65, np.inf]
    features['age_group'] = pd.cut(
        df['Age'], bins=age_bins, labels=False, include_lowest=True
    )

    # 7. Season of admission, approximated by calendar quarter
    features['admission_season'] = admission.dt.quarter - 1  # 0=Q1, 1=Q2, 2=Q3, 3=Q4

    # 8. Day of week admission (0=Monday, 6=Sunday)
    features['admission_day'] = admission.dt.dayofweek

    # 9. Month of admission (0-11)
    features['admission_month'] = admission.dt.month - 1

    # 10. Year of admission as an offset from 2020
    features['admission_year'] = admission.dt.year - 2020

    # Convert to DataFrame
    feature_df = pd.DataFrame(features)

    # Handle any missing values by replacing them with the column mean
    feature_df = feature_df.fillna(feature_df.mean())

    print(f"Processed features shape: {feature_df.shape}")
    print("Feature columns:", list(feature_df.columns))

    # Save encoders for later use
    encoders = {
        'gender': gender_encoder,
        'diagnosis': diagnosis_encoder,
        'blood_type': blood_encoder
    }

    # os.makedirs('') raises, so only create a directory when the path has one.
    encoders_dir = os.path.dirname(encoders_path)
    if encoders_dir:
        os.makedirs(encoders_dir, exist_ok=True)
    joblib.dump(encoders, encoders_path)

    # Save processed data
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    feature_df.to_csv(output_csv, index=False)

    print("Data preprocessing completed!")
    print(f"Number of features: {feature_df.shape[1]}")

    return feature_df, encoders

def create_sample_data_for_training(n_samples=1000, seed=42,
                                    output_path='data/patient_data.csv'):
    """
    Create a synthetic sample dataset if the original data is not available.

    The generated table has the columns age, gender, bmi, blood_pressure,
    diabetes, cholesterol and heart_rate. Note that this is NOT the schema
    read by preprocess_patient_data() (Age, Gender, Diagnosis, BloodType,
    AdmissionDate, DischargeDate), so the written file cannot be fed back
    into that function.

    Args:
        n_samples: Number of synthetic patients (rows) to generate.
        seed: Seed passed to np.random.seed(). This reseeds NumPy's global
            random state, which also affects later np.random draws made by
            the calling process.
        output_path: CSV file the samples are written to; an existing file
            at that path is overwritten.

    Returns:
        The generated pandas DataFrame with n_samples rows.
    """
    print("Creating sample patient data for training...")

    np.random.seed(seed)

    # Generate patient-like data. The order of the draws below determines
    # which random numbers each column receives, so keep it stable to keep
    # the output reproducible for a given seed.
    data = {
        'age': np.random.normal(50, 20, n_samples).clip(1, 100),
        'gender': np.random.choice([0, 1], n_samples),
        'bmi': np.random.normal(25, 5, n_samples).clip(15, 50),
        'blood_pressure': np.random.normal(120, 20, n_samples).clip(80, 200),
        'diabetes': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'cholesterol': np.random.normal(200, 40, n_samples).clip(100, 300),
        'heart_rate': np.random.normal(75, 15, n_samples).clip(40, 120),
    }

    df = pd.DataFrame(data)

    # os.makedirs('') raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

    print(f"Sample data created with {n_samples} patients")
    return df

if __name__ == "__main__":
    # Script entry point: preprocess the real data set, and generate a
    # synthetic one only when no input file exists.
    try:
        # Try to preprocess the real data
        feature_df, encoders = preprocess_patient_data()
    except FileNotFoundError as e:
        # No input CSV on disk, so writing a synthetic one destroys nothing.
        print(f"Error processing real data: {e}")
        print("Creating sample data instead...")
        create_sample_data_for_training()
    except Exception as e:
        # The input exists but could not be processed. Do not fall back to
        # sample generation here: create_sample_data_for_training() writes
        # to data/patient_data.csv, the same path preprocess_patient_data()
        # reads by default, and would overwrite the real file.
        print(f"Error processing real data: {e}")
        raise SystemExit(1)
    else:
        print("Successfully processed real patient data!")