"""
Batch Prediction on Heart Attack Dataset

Loads the dataset, predicts risk for EVERY row, and saves the results.
"""
import os

# Must be set BEFORE TensorFlow is imported to silence its info/warning logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
import joblib
from tensorflow.keras.models import load_model

# ── Config ────────────────────────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "Heart Attack Data Set.csv")
MODEL_DIR = os.path.join(BASE_DIR, "saved_model")
OUTPUT_PATH = os.path.join(BASE_DIR, "heart_attack_with_predictions.csv")
# Decision threshold: probabilities strictly above this are "high risk".
# Single source of truth so the integer label and the text level can
# never disagree (the original hard-coded 0.5 in two separate places).
RISK_THRESHOLD = 0.5

# ── Load Resources ────────────────────────────────────────────────────
print(f"📂 Loading dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

print("Start Loading model and scaler...")
model = load_model(os.path.join(MODEL_DIR, "heart_attack_model.keras"))
scaler = joblib.load(os.path.join(MODEL_DIR, "scaler.pkl"))

# ── Preprocess Features ───────────────────────────────────────────────
# We need to ensure we use the exact same columns as training (excluding
# the target), so auto-detect the target column again and drop it.
target_candidates = ['target', 'output', 'label', 'class', 'result']
target_col = None
for col in df.columns:
    if col.strip().lower() in target_candidates:
        target_col = col
        break

if target_col:
    print(f"Target column detected: '{target_col}' (Dropping for prediction)")
    X = df.drop(columns=[target_col])
else:
    X = df.copy()

# Handle missing values (same logic as training):
# numeric columns → column median, categorical → most frequent value.
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

cat_cols = X.select_dtypes(exclude=[np.number]).columns
if len(cat_cols) > 0:
    # simple fill for batch script
    # NOTE(review): scaler.transform expects purely numeric input — this
    # assumes the training pipeline encoded any categorical columns the
    # same way; confirm if the dataset ever gains non-numeric features.
    X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

# Scale features with the scaler fitted at training time.
X_scaled = scaler.transform(X)

# ── Make Predictions ──────────────────────────────────────────────────
print(f"🔮 Predicting on {len(df)} patients...")
predictions = model.predict(X_scaled, verbose=1)

# Add results to dataframe
df['Predicted_Probability'] = predictions.flatten()
df['Predicted_Risk_Label'] = (
    df['Predicted_Probability'] > RISK_THRESHOLD
).astype(int)
# Derive the text level from the already-computed label so the two
# columns are guaranteed consistent.
df['Risk_Level'] = np.where(
    df['Predicted_Risk_Label'] == 1, "High Risk", "Low Risk"
)

# ── Save & Show ───────────────────────────────────────────────────────
df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Predictions saved to: {OUTPUT_PATH}")

print("\n── Sample Results (First 5 Rows) ─────────────────────────────")
# Show relevant columns + predictions
cols_to_show = ['age', 'sex', 'cp', 'chol', 'target',
                'Predicted_Risk_Label', 'Risk_Level', 'Predicted_Probability']
# Filter columns that actually exist
cols_to_show = [c for c in cols_to_show if c in df.columns]
# BUG FIX: the header announces 5 rows but the original printed head(10).
print(df[cols_to_show].head(5).to_string(index=False))
print("────────────────────────────────────────────────────────────────")

# Calculate accuracy on this full dataset (since we have labels)
if target_col:
    total = len(df)
    if total > 0:  # guard: an empty CSV would raise ZeroDivisionError
        correct = (df[target_col] == df['Predicted_Risk_Label']).sum()
        print(f"\nOverall Accuracy on Full Dataset: "
              f"{correct}/{total} ({correct/total:.2%})")