# Healthmodels / src/data_preprocessing.py
# Repository page header (scrape residue): theaniketgiri · "first" · 902fa1b
# src/data_preprocessing.py - Convert patient data to numerical features
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime
import joblib
import os
def preprocess_patient_data(csv_file="data/patient_data.csv"):
    """
    Convert patient CSV data to numerical features for VAE training.

    Reads ``csv_file`` and derives ten numerical columns from its ``Age``,
    ``Gender``, ``Diagnosis``, ``BloodType``, ``AdmissionDate`` and
    ``DischargeDate`` columns: age, gender, diagnosis, blood_type,
    length_of_stay, age_group, admission_season, admission_day,
    admission_month and admission_year.

    Side effects (paths are relative to the current working directory):
        * the fitted label encoders are dumped to ``models/encoders.pkl``;
        * the feature table is written to ``data/processed_patient_data.csv``.

    Args:
        csv_file: Path of the raw patient CSV file.

    Returns:
        A ``(feature_df, encoders)`` tuple: the feature DataFrame and a dict
        with the fitted encoders under the keys ``'gender'``, ``'diagnosis'``
        and ``'blood_type'``.

    Raises:
        ValueError: If an age group cannot be assigned to some row (negative
            ages, or an ``Age`` column with no usable value at all).
    """
    print("Loading and preprocessing patient data...")

    # Load data
    df = pd.read_csv(csv_file)
    print(f"Original data shape: {df.shape}")

    # Create numerical features
    features = {}

    # 1. Age (already numerical). Missing ages are imputed with the column
    #    mean right away -- the same value the final fillna() would pick --
    #    so that the age group below can be derived from the imputed age
    #    instead of failing on NaN.
    age = df['Age'].fillna(df['Age'].mean())
    features['age'] = age.values

    # 2. Gender. LabelEncoder numbers the distinct labels in sorted order,
    #    so with 'Female'/'Male' labels the result is Female=0, Male=1.
    gender_encoder = LabelEncoder()
    features['gender'] = gender_encoder.fit_transform(df['Gender'])

    # 3. Diagnosis (encode categorical)
    diagnosis_encoder = LabelEncoder()
    features['diagnosis'] = diagnosis_encoder.fit_transform(df['Diagnosis'])

    # 4. Blood Type (encode categorical)
    blood_encoder = LabelEncoder()
    features['blood_type'] = blood_encoder.fit_transform(df['BloodType'])

    # 5. Length of stay in whole days (discharge date minus admission date)
    admission = pd.to_datetime(df['AdmissionDate'])
    discharge = pd.to_datetime(df['DischargeDate'])
    features['length_of_stay'] = (discharge - admission).dt.days

    # 6. Age group. The last bin is open-ended so that ages above 100 fall
    #    into the oldest group instead of becoming NaN, which would make the
    #    integer cast below raise.
    age_bins = [0, 18, 35, 50, 65, np.inf]
    age_labels = [0, 1, 2, 3, 4]
    age_group = pd.cut(age, bins=age_bins, labels=age_labels, include_lowest=True)
    if age_group.isna().any():
        raise ValueError(
            "Cannot assign an age group: 'Age' contains negative values "
            "or no usable values at all"
        )
    features['age_group'] = age_group.astype(int)

    # 7. Quarter of admission, zero-based (0=Q1, 1=Q2, 2=Q3, 3=Q4)
    features['admission_season'] = admission.dt.quarter - 1

    # 8. Day of week admission (0=Monday, 6=Sunday)
    features['admission_day'] = admission.dt.dayofweek

    # 9. Month of admission (0-11)
    features['admission_month'] = admission.dt.month - 1

    # 10. Year of admission as an offset from 2020
    features['admission_year'] = admission.dt.year - 2020

    # Convert to DataFrame
    feature_df = pd.DataFrame(features)

    # Fill any remaining gaps (e.g. from missing dates) with the column mean
    feature_df = feature_df.fillna(feature_df.mean())

    print(f"Processed features shape: {feature_df.shape}")
    print("Feature columns:", list(feature_df.columns))

    # Save encoders for later use
    encoders = {
        'gender': gender_encoder,
        'diagnosis': diagnosis_encoder,
        'blood_type': blood_encoder,
    }
    os.makedirs("models", exist_ok=True)
    joblib.dump(encoders, 'models/encoders.pkl')

    # Save processed data
    os.makedirs("data", exist_ok=True)
    feature_df.to_csv('data/processed_patient_data.csv', index=False)

    print("Data preprocessing completed!")
    print(f"Number of features: {feature_df.shape[1]}")
    return feature_df, encoders
def create_sample_data_for_training():
    """
    Generate a synthetic patient table and store it as data/patient_data.csv.

    The NumPy global random state is seeded with 42, so every call yields
    the same 1000 rows. The columns are age, gender, bmi, blood_pressure,
    diabetes, cholesterol and heart_rate.

    NOTE: these column names differ from the ones preprocess_patient_data
    reads (Age, Gender, Diagnosis, BloodType, AdmissionDate, DischargeDate).

    Returns:
        The generated DataFrame (also written to disk without the index).
    """
    print("Creating sample patient data for training...")

    patient_count = 1000
    np.random.seed(42)

    def clipped_normal(mean, std, low, high):
        # One normally distributed value per patient, limited to [low, high].
        return np.random.normal(mean, std, patient_count).clip(low, high)

    # Samplers run in list order; the order determines which random numbers
    # each column receives, so it must not be changed.
    column_samplers = [
        ('age', lambda: clipped_normal(50, 20, 1, 100)),
        ('gender', lambda: np.random.choice([0, 1], patient_count)),
        ('bmi', lambda: clipped_normal(25, 5, 15, 50)),
        ('blood_pressure', lambda: clipped_normal(120, 20, 80, 200)),
        ('diabetes', lambda: np.random.choice([0, 1], patient_count, p=[0.8, 0.2])),
        ('cholesterol', lambda: clipped_normal(200, 40, 100, 300)),
        ('heart_rate', lambda: clipped_normal(75, 15, 40, 120)),
    ]
    sample_df = pd.DataFrame({name: draw() for name, draw in column_samplers})

    os.makedirs("data", exist_ok=True)
    sample_df.to_csv('data/patient_data.csv', index=False)

    print(f"Sample data created with {patient_count} patients")
    return sample_df
if __name__ == "__main__":
    # Script entry point: preprocess the default CSV; if that fails and no
    # input file exists, generate a synthetic one instead.
    try:
        # Try to preprocess the real data
        feature_df, encoders = preprocess_patient_data()
        print("Successfully processed real patient data!")
    except Exception as e:
        print(f"Error processing real data: {e}")
        # create_sample_data_for_training() writes to data/patient_data.csv,
        # the same path the preprocessing reads. Only fall back to it when
        # that file is absent, so a failure unrelated to a missing file never
        # overwrites real patient data with synthetic rows.
        if os.path.exists("data/patient_data.csv"):
            print("Existing data/patient_data.csv was left untouched.")
        else:
            print("Creating sample data instead...")
            create_sample_data_for_training()