#!/usr/bin/env python
"""
PRESAGE Model Training Pipeline
Trains 3 disease risk prediction models from provided datasets
with cross-validation and accuracy optimization
"""
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Suppress every warning for the rest of the run (presumably to keep the
# console report uncluttered). NOTE(review): this also hides convergence
# warnings from the estimators trained below.
warnings.filterwarnings("ignore")

# Opening banner: rule, title, rule -- one line each.
print("=" * 70, "PRESAGE MODEL TRAINING PIPELINE", "=" * 70, sep="\n")
# ============================================================================
# MODEL 1: DIABETES RISK PREDICTION
# ============================================================================
# Loads diabetes_data.csv, fits three candidate classifiers on a stratified
# 80/20 split, prints accuracy/precision/recall/F1 for each, and saves the
# candidate with the highest test-set accuracy.
print("\n[1/3] TRAINING DIABETES RISK MODEL...")
print("-" * 70)

# Bind the result names before the try block so that code running after this
# section (the final summary) can still read them when loading or training
# fails part-way through.
best_model_diabetes = None
best_score_diabetes = 0
results_diabetes = {}

try:
    df_diabetes = pd.read_csv('diabetes_data.csv')
    print(f"β Loaded diabetes dataset: {df_diabetes.shape}")

    # Prepare features and target: every column except 'Diabetes' is a feature.
    X_diabetes = df_diabetes.drop('Diabetes', axis=1)
    y_diabetes = df_diabetes['Diabetes']

    # Split data (stratified on the target so both splits keep its class mix)
    X_train, X_test, y_train, y_test = train_test_split(
        X_diabetes, y_diabetes, test_size=0.2, random_state=42, stratify=y_diabetes
    )

    # Scale features; the scaler is fitted on the training split only.
    # NOTE(review): scaler_diabetes is not persisted with the model below, so
    # inference code must reproduce this scaling -- confirm that is handled.
    scaler_diabetes = StandardScaler()
    X_train_scaled = scaler_diabetes.fit_transform(X_train)
    X_test_scaled = scaler_diabetes.transform(X_test)

    # Train multiple models and select best
    models_diabetes = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    }

    for name, model in models_diabetes.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        results_diabetes[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

        print(f"\n {name}:")
        print(f" Accuracy: {accuracy:.4f}")
        print(f" Precision: {precision:.4f}")
        print(f" Recall: {recall:.4f}")
        print(f" F1-Score: {f1:.4f}")

        # The `is None` check lets the first candidate win even at 0.0
        # accuracy, so None is never written to disk as the "best model".
        # NOTE(review): selection and the reported score use the same test
        # split, so the reported accuracy is an optimistic estimate.
        if best_model_diabetes is None or accuracy > best_score_diabetes:
            best_score_diabetes = accuracy
            best_model_diabetes = model

    # Save best diabetes model (create the output directory on first run)
    diabetes_model_path = 'data/risk_models/diabetes_model.pkl'
    os.makedirs(os.path.dirname(diabetes_model_path), exist_ok=True)
    joblib.dump(best_model_diabetes, diabetes_model_path)
    print(f"\nβ Best Diabetes Model Saved: {diabetes_model_path}")
    print(f" Accuracy: {best_score_diabetes:.4f}")
except Exception as e:
    # Best-effort: report the failure and let the remaining sections run.
    print(f"β Error training diabetes model: {e}")
# ============================================================================
# MODEL 2: CARDIOVASCULAR RISK PREDICTION
# ============================================================================
# Loads cardio_train.csv, fits three candidate classifiers on a stratified
# 80/20 split, prints accuracy/precision/recall/F1 for each, and saves the
# candidate with the highest test-set accuracy.
print("\n[2/3] TRAINING CARDIOVASCULAR RISK MODEL...")
print("-" * 70)

# Bind the result names before the try block so that code running after this
# section (the final summary) can still read them when loading or training
# fails part-way through.
best_model_cardio = None
best_score_cardio = 0
results_cardio = {}

try:
    # Read and parse cardiovascular data (semicolon-delimited)
    df_cardio = pd.read_csv('cardio_train.csv', sep=';')
    print(f"β Loaded cardiovascular dataset: {df_cardio.shape}")

    # Target is 'cardio' column; every other column is used as a feature.
    # NOTE(review): if the CSV carries a row-identifier column it is currently
    # fed to the models as a feature -- confirm against the dataset schema.
    X_cardio = df_cardio.drop('cardio', axis=1)
    y_cardio = df_cardio['cardio']

    # Split data (stratified on the target so both splits keep its class mix)
    X_train, X_test, y_train, y_test = train_test_split(
        X_cardio, y_cardio, test_size=0.2, random_state=42, stratify=y_cardio
    )

    # Scale features; the scaler is fitted on the training split only.
    # NOTE(review): scaler_cardio is not persisted with the model below, so
    # inference code must reproduce this scaling -- confirm that is handled.
    scaler_cardio = StandardScaler()
    X_train_scaled = scaler_cardio.fit_transform(X_train)
    X_test_scaled = scaler_cardio.transform(X_test)

    # Train multiple models
    models_cardio = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    }

    for name, model in models_cardio.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        results_cardio[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

        print(f"\n {name}:")
        print(f" Accuracy: {accuracy:.4f}")
        print(f" Precision: {precision:.4f}")
        print(f" Recall: {recall:.4f}")
        print(f" F1-Score: {f1:.4f}")

        # The `is None` check lets the first candidate win even at 0.0
        # accuracy, so None is never written to disk as the "best model".
        # NOTE(review): selection and the reported score use the same test
        # split, so the reported accuracy is an optimistic estimate.
        if best_model_cardio is None or accuracy > best_score_cardio:
            best_score_cardio = accuracy
            best_model_cardio = model

    # Save best cardiovascular model (create the output directory on first run)
    cardio_model_path = 'data/risk_models/cardiovascular_model.pkl'
    os.makedirs(os.path.dirname(cardio_model_path), exist_ok=True)
    joblib.dump(best_model_cardio, cardio_model_path)
    print(f"\nβ Best Cardiovascular Model Saved: {cardio_model_path}")
    print(f" Accuracy: {best_score_cardio:.4f}")
except Exception as e:
    # Best-effort: report the failure and let the remaining sections run.
    print(f"β Error training cardiovascular model: {e}")
# ============================================================================
# MODEL 3: HYPERTENSION RISK PREDICTION
# ============================================================================
# Loads chronic_disease_prediction_dataset.csv, label-encodes its text
# columns, fits three candidate classifiers on a stratified 80/20 split,
# prints accuracy/precision/recall/F1 for each, and saves the candidate with
# the highest test-set accuracy.
print("\n[3/3] TRAINING HYPERTENSION RISK MODEL...")
print("-" * 70)

# Bind the result names before the try block so that code running after this
# section (the final summary) can still read them when loading or training
# fails part-way through.
best_model_hyper = None
best_score_hyper = 0
results_hyper = {}

try:
    df_chronic = pd.read_csv('chronic_disease_prediction_dataset.csv')
    print(f"β Loaded chronic disease dataset: {df_chronic.shape}")

    # Drop Patient_ID
    df_chronic = df_chronic.drop('Patient_ID', axis=1)

    # Encode categorical variables: one LabelEncoder per object-dtype feature
    # column, kept in le_dict keyed by column name. The target column is
    # skipped here and encoded separately below.
    # NOTE(review): le_dict is never persisted, so inference code cannot
    # reproduce these encodings from the saved model alone -- confirm.
    le_dict = {}
    categorical_cols = df_chronic.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col != 'HasChronicDisease':
            le = LabelEncoder()
            df_chronic[col] = le.fit_transform(df_chronic[col])
            le_dict[col] = le

    # Target: HasChronicDisease (as proxy for hypertension)
    le_target = LabelEncoder()
    y_hypertension = le_target.fit_transform(df_chronic['HasChronicDisease'])
    X_hypertension = df_chronic.drop('HasChronicDisease', axis=1)

    # Split data (stratified on the target so both splits keep its class mix)
    X_train, X_test, y_train, y_test = train_test_split(
        X_hypertension, y_hypertension, test_size=0.2, random_state=42, stratify=y_hypertension
    )

    # Scale features; the scaler is fitted on the training split only.
    # NOTE(review): scaler_hyper is not persisted with the model below, so
    # inference code must reproduce this scaling -- confirm that is handled.
    scaler_hyper = StandardScaler()
    X_train_scaled = scaler_hyper.fit_transform(X_train)
    X_test_scaled = scaler_hyper.transform(X_test)

    # Train multiple models
    models_hyper = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    }

    for name, model in models_hyper.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        results_hyper[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

        print(f"\n {name}:")
        print(f" Accuracy: {accuracy:.4f}")
        print(f" Precision: {precision:.4f}")
        print(f" Recall: {recall:.4f}")
        print(f" F1-Score: {f1:.4f}")

        # The `is None` check lets the first candidate win even at 0.0
        # accuracy, so None is never written to disk as the "best model".
        # NOTE(review): selection and the reported score use the same test
        # split, so the reported accuracy is an optimistic estimate.
        if best_model_hyper is None or accuracy > best_score_hyper:
            best_score_hyper = accuracy
            best_model_hyper = model

    # Save best hypertension model (create the output directory on first run)
    hyper_model_path = 'data/risk_models/hypertension_model.pkl'
    os.makedirs(os.path.dirname(hyper_model_path), exist_ok=True)
    joblib.dump(best_model_hyper, hyper_model_path)
    print(f"\nβ Best Hypertension Model Saved: {hyper_model_path}")
    print(f" Accuracy: {best_score_hyper:.4f}")
except Exception as e:
    # Best-effort: report the failure and still reach the summary.
    print(f"β Error training hypertension model: {e}")
# ============================================================================
# SUMMARY
# ============================================================================
# Prints one line per model with its best test-set accuracy, then a closing
# note on where the models were written.
print("\n" + "=" * 70)
print("MODEL TRAINING SUMMARY")
print("=" * 70)

# Each training section runs inside its own try/except, so a section that
# failed early may never have bound its best_model_* / best_score_* names.
# Look them up by name instead of assuming they exist, so the summary itself
# cannot crash with NameError.
summary_targets = [
    ("Diabetes Model", "best_model_diabetes", "best_score_diabetes"),
    ("Cardiovascular Model", "best_model_cardio", "best_score_cardio"),
    ("Hypertension Model", "best_model_hyper", "best_score_hyper"),
]
trained_models = []

print()  # blank line before the per-model rows
for label, model_var, score_var in summary_targets:
    if globals().get(model_var) is None:
        # No model was selected for this target (load or training failed).
        print(f"! {label}: not trained (see error above)")
        continue
    print(f"β {label}: Accuracy = {globals()[score_var]:.4f}")
    trained_models.append(label)

# Only claim that everything was saved when every section produced a model.
if len(trained_models) == len(summary_targets):
    print(f"\nAll models saved to: data/risk_models/")
else:
    print(f"\n{len(trained_models)}/{len(summary_targets)} models saved to: data/risk_models/")
print("\n" + "=" * 70)
|