#!/usr/bin/env python
"""
PRESAGE Model Training Pipeline (v2)
Trains 3 disease risk prediction models with proper feature scaling
Saves both models AND scalers for consistent prediction
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import warnings
# Suppress every warning category for the rest of the run so that the
# console output consists only of the progress lines printed below.
warnings.filterwarnings('ignore')

# Opening banner: rule, title, rule -- one item per output line.
print(
    "=" * 70,
    "PRESAGE MODEL TRAINING PIPELINE v2 (with Feature Scaling)",
    "=" * 70,
    sep="\n",
)
# ============================================================================
# MODEL 1: DIABETES RISK PREDICTION
# ============================================================================
# Reads diabetes_data.csv (target column 'Diabetes'), fits a StandardScaler
# and a GradientBoostingClassifier on a stratified 80/20 split, prints the
# test-set metrics, and writes the model and the scaler to data/risk_models/.
# Any exception is printed with its traceback and the script carries on with
# the next model instead of aborting.
import os

print("\n[1/3] TRAINING DIABETES RISK MODEL...")
print("-" * 70)
try:
    # joblib.dump does not create missing parent directories. Create the
    # output directory first so that a finished training run is not lost to
    # a FileNotFoundError at save time.
    os.makedirs('data/risk_models', exist_ok=True)

    df_diabetes = pd.read_csv('diabetes_data.csv')
    print(f"β Loaded diabetes dataset: {df_diabetes.shape}")

    # Prepare features and target: every column except 'Diabetes' is a feature.
    X_diabetes = df_diabetes.drop('Diabetes', axis=1)
    y_diabetes = df_diabetes['Diabetes']

    # Split data (stratified so both splits keep the class balance).
    X_train, X_test, y_train, y_test = train_test_split(
        X_diabetes, y_diabetes, test_size=0.2, random_state=42, stratify=y_diabetes
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split, so no test-set statistics leak into training.
    scaler_diabetes = StandardScaler()
    X_train_scaled = scaler_diabetes.fit_transform(X_train)
    X_test_scaled = scaler_diabetes.transform(X_test)

    # Train Gradient Boosting (marked "best performer" by the original author).
    model_diabetes = GradientBoostingClassifier(n_estimators=100, random_state=42)
    model_diabetes.fit(X_train_scaled, y_train)

    # Evaluate on the held-out split.
    # NOTE(review): the default average='binary' assumes 'Diabetes' is a
    # two-class label -- confirm against the dataset.
    y_pred = model_diabetes.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print("\n Gradient Boosting Classifier:")
    print(f" Accuracy: {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall: {recall:.4f}")
    print(f" F1-Score: {f1:.4f}")

    # Save model and scaler together; prediction code needs both.
    joblib.dump(model_diabetes, 'data/risk_models/diabetes_model.pkl')
    joblib.dump(scaler_diabetes, 'data/risk_models/diabetes_scaler.pkl')
    print("\nβ Diabetes Model & Scaler Saved")
    print(f" Accuracy: {accuracy:.4f}")
except Exception as e:
    # Best-effort pipeline: report the failure and continue with the next model.
    print(f"β Error training diabetes model: {e}")
    import traceback
    traceback.print_exc()
# ============================================================================
# MODEL 2: CARDIOVASCULAR RISK PREDICTION
# ============================================================================
# Reads the semicolon-delimited cardio_train.csv (target column 'cardio'),
# fits a StandardScaler and a GradientBoostingClassifier on a stratified
# 80/20 split, prints the test-set metrics, and writes the model and the
# scaler to data/risk_models/. Any exception is printed with its traceback
# and the script carries on with the next model instead of aborting.
import os

print("\n[2/3] TRAINING CARDIOVASCULAR RISK MODEL...")
print("-" * 70)
try:
    # joblib.dump does not create missing parent directories. Create the
    # output directory first so that a finished training run is not lost to
    # a FileNotFoundError at save time.
    os.makedirs('data/risk_models', exist_ok=True)

    # Read and parse cardiovascular data (semicolon-delimited).
    df_cardio = pd.read_csv('cardio_train.csv', sep=';')
    print(f"β Loaded cardiovascular dataset: {df_cardio.shape}")

    # Target is the 'cardio' column; every other column becomes a feature.
    # NOTE(review): if the CSV carries a row-identifier column it is used as a
    # feature too -- confirm that is intended before changing it, because the
    # saved scaler and model are tied to this exact column set.
    X_cardio = df_cardio.drop('cardio', axis=1)
    y_cardio = df_cardio['cardio']

    # Split data (stratified so both splits keep the class balance).
    X_train, X_test, y_train, y_test = train_test_split(
        X_cardio, y_cardio, test_size=0.2, random_state=42, stratify=y_cardio
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split, so no test-set statistics leak into training.
    scaler_cardio = StandardScaler()
    X_train_scaled = scaler_cardio.fit_transform(X_train)
    X_test_scaled = scaler_cardio.transform(X_test)

    # Train Gradient Boosting.
    model_cardio = GradientBoostingClassifier(n_estimators=100, random_state=42)
    model_cardio.fit(X_train_scaled, y_train)

    # Evaluate on the held-out split.
    # NOTE(review): the default average='binary' assumes 'cardio' is a
    # two-class label -- confirm against the dataset.
    y_pred = model_cardio.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print("\n Gradient Boosting Classifier:")
    print(f" Accuracy: {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall: {recall:.4f}")
    print(f" F1-Score: {f1:.4f}")

    # Save model and scaler together; prediction code needs both.
    joblib.dump(model_cardio, 'data/risk_models/cardiovascular_model.pkl')
    joblib.dump(scaler_cardio, 'data/risk_models/cardiovascular_scaler.pkl')
    print("\nβ Cardiovascular Model & Scaler Saved")
    print(f" Accuracy: {accuracy:.4f}")
except Exception as e:
    # Best-effort pipeline: report the failure and continue with the next model.
    print(f"β Error training cardiovascular model: {e}")
    import traceback
    traceback.print_exc()
# ============================================================================
# MODEL 3: HYPERTENSION RISK PREDICTION
# ============================================================================
# Reads chronic_disease_prediction_dataset.csv, drops 'Patient_ID',
# label-encodes the object-typed columns, and uses 'HasChronicDisease' as the
# target. Fits a StandardScaler and a GradientBoostingClassifier on a
# stratified 80/20 split, prints the test-set metrics, and writes the model,
# the scaler and the fitted label encoders to data/risk_models/. Any exception
# is printed with its traceback instead of aborting the script.
import os

print("\n[3/3] TRAINING HYPERTENSION RISK MODEL...")
print("-" * 70)
try:
    # joblib.dump does not create missing parent directories. Create the
    # output directory first so that a finished training run is not lost to
    # a FileNotFoundError at save time.
    os.makedirs('data/risk_models', exist_ok=True)

    df_chronic = pd.read_csv('chronic_disease_prediction_dataset.csv')
    print(f"β Loaded chronic disease dataset: {df_chronic.shape}")

    # Drop Patient_ID so it is not used as a feature.
    df_chronic = df_chronic.drop('Patient_ID', axis=1)

    # Encode categorical variables. One encoder is kept per column so the same
    # string-to-integer mapping can be replayed later.
    le_dict = {}
    categorical_cols = df_chronic.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col != 'HasChronicDisease':
            le = LabelEncoder()
            df_chronic[col] = le.fit_transform(df_chronic[col])
            le_dict[col] = le

    # Target: HasChronicDisease (as proxy for hypertension).
    le_target = LabelEncoder()
    y_hypertension = le_target.fit_transform(df_chronic['HasChronicDisease'])
    X_hypertension = df_chronic.drop('HasChronicDisease', axis=1)

    # Split data (stratified so both splits keep the class balance).
    X_train, X_test, y_train, y_test = train_test_split(
        X_hypertension, y_hypertension, test_size=0.2, random_state=42, stratify=y_hypertension
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split, so no test-set statistics leak into training.
    scaler_hyper = StandardScaler()
    X_train_scaled = scaler_hyper.fit_transform(X_train)
    X_test_scaled = scaler_hyper.transform(X_test)

    # Train Gradient Boosting.
    model_hyper = GradientBoostingClassifier(n_estimators=100, random_state=42)
    model_hyper.fit(X_train_scaled, y_train)

    # Evaluate on the held-out split.
    # NOTE(review): the default average='binary' assumes the target has
    # exactly two classes -- confirm against the dataset.
    y_pred = model_hyper.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print("\n Gradient Boosting Classifier:")
    print(f" Accuracy: {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall: {recall:.4f}")
    print(f" F1-Score: {f1:.4f}")

    # Save model and scaler together; prediction code needs both.
    joblib.dump(model_hyper, 'data/risk_models/hypertension_model.pkl')
    joblib.dump(scaler_hyper, 'data/risk_models/hypertension_scaler.pkl')
    # Persist the fitted encoders as well: without them the categorical inputs
    # cannot be mapped to the same integer codes at prediction time, and the
    # predicted class index cannot be mapped back to its original label.
    joblib.dump(
        {'features': le_dict, 'target': le_target},
        'data/risk_models/hypertension_encoders.pkl',
    )
    print("\nβ Hypertension Model & Scaler Saved")
    print(f" Accuracy: {accuracy:.4f}")
except Exception as e:
    # Best-effort pipeline: report the failure and keep going to the summary.
    print(f"β Error training hypertension model: {e}")
    import traceback
    traceback.print_exc()
# ============================================================================
# SUMMARY
# ============================================================================
# Prints the closing banner. Each training section above swallows its own
# exceptions, so success cannot be assumed here: the expected artifact files
# are checked on disk and any that are absent are reported instead of
# claiming that everything was saved.
# NOTE: this is an existence check only -- files left over from an earlier
# run count as present.
import os

expected_artifacts = [
    f"{disease}_{kind}.pkl"
    for disease in ('diabetes', 'cardiovascular', 'hypertension')
    for kind in ('model', 'scaler')
]
missing_artifacts = [
    name
    for name in expected_artifacts
    if not os.path.isfile(os.path.join('data/risk_models', name))
]

print("\n" + "=" * 70)
if missing_artifacts:
    print("MODEL TRAINING FINISHED WITH ERRORS")
else:
    print("MODEL TRAINING COMPLETE β")
print("=" * 70)

if missing_artifacts:
    print("\nExpected files missing from data/risk_models/ (see errors above):")
    for name in missing_artifacts:
        print(f" - {name}")
else:
    print("\nAll models and scalers saved to: data/risk_models/")
    print(" - diabetes_model.pkl + diabetes_scaler.pkl")
    print(" - cardiovascular_model.pkl + cardiovascular_scaler.pkl")
    print(" - hypertension_model.pkl + hypertension_scaler.pkl")
print("\n" + "=" * 70)