"""
Batch Prediction on Heart Attack Dataset

Loads the dataset, predicts risk for EVERY row, and saves the results.
"""
import os

# Must be set BEFORE TensorFlow is imported to silence its info/warning logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
import joblib
from tensorflow.keras.models import load_model

# ── Config ────────────────────────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "Heart Attack Data Set.csv")
MODEL_DIR = os.path.join(BASE_DIR, "saved_model")
OUTPUT_PATH = os.path.join(BASE_DIR, "heart_attack_with_predictions.csv")
# Decision threshold: probabilities strictly above this are "high risk".
# Single source of truth so the integer label and the text level can
# never disagree (the original hard-coded 0.5 in two separate places).
RISK_THRESHOLD = 0.5

# ── Load Resources ────────────────────────────────────────────────────
print(f"📂 Loading dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

print("Start Loading model and scaler...")
model = load_model(os.path.join(MODEL_DIR, "heart_attack_model.keras"))
scaler = joblib.load(os.path.join(MODEL_DIR, "scaler.pkl"))

# ── Preprocess Features ───────────────────────────────────────────────
# We need to ensure we use the exact same columns as training (excluding
# the target), so auto-detect the target column again and drop it.
target_candidates = ['target', 'output', 'label', 'class', 'result']
target_col = None
for col in df.columns:
    if col.strip().lower() in target_candidates:
        target_col = col
        break

if target_col:
    print(f"Target column detected: '{target_col}' (Dropping for prediction)")
    X = df.drop(columns=[target_col])
else:
    X = df.copy()

# Handle missing values (same logic as training):
# numeric columns → column median, categorical → most frequent value.
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

cat_cols = X.select_dtypes(exclude=[np.number]).columns
if len(cat_cols) > 0:
    # simple fill for batch script
    # NOTE(review): scaler.transform expects purely numeric input — this
    # assumes the training pipeline encoded any categorical columns the
    # same way; confirm if the dataset ever gains non-numeric features.
    X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

# Scale features with the scaler fitted at training time.
X_scaled = scaler.transform(X)

# ── Make Predictions ──────────────────────────────────────────────────
print(f"🔮 Predicting on {len(df)} patients...")
predictions = model.predict(X_scaled, verbose=1)

# Add results to dataframe
df['Predicted_Probability'] = predictions.flatten()
df['Predicted_Risk_Label'] = (
    df['Predicted_Probability'] > RISK_THRESHOLD
).astype(int)
# Derive the text level from the already-computed label so the two
# columns are guaranteed consistent.
df['Risk_Level'] = np.where(
    df['Predicted_Risk_Label'] == 1, "High Risk", "Low Risk"
)

# ── Save & Show ───────────────────────────────────────────────────────
df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Predictions saved to: {OUTPUT_PATH}")

print("\n── Sample Results (First 5 Rows) ─────────────────────────────")
# Show relevant columns + predictions
cols_to_show = ['age', 'sex', 'cp', 'chol', 'target',
                'Predicted_Risk_Label', 'Risk_Level', 'Predicted_Probability']
# Filter columns that actually exist
cols_to_show = [c for c in cols_to_show if c in df.columns]
# BUG FIX: the header announces 5 rows but the original printed head(10).
print(df[cols_to_show].head(5).to_string(index=False))
print("────────────────────────────────────────────────────────────────")

# Calculate accuracy on this full dataset (since we have labels)
if target_col:
    total = len(df)
    if total > 0:  # guard: an empty CSV would raise ZeroDivisionError
        correct = (df[target_col] == df['Predicted_Risk_Label']).sum()
        print(f"\nOverall Accuracy on Full Dataset: "
              f"{correct}/{total} ({correct/total:.2%})")