# File size: 38,098 Bytes

# 82dec99

import streamlit as st
import pandas as pd
import numpy as np
import re
import io
import os
import joblib
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import (
    accuracy_score, confusion_matrix, silhouette_score,
    classification_report, f1_score, precision_score, recall_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import resample

# ==========================================================
# PAGE CONFIG
# ==========================================================
st.set_page_config(page_title="AI AutoML Platform", page_icon="🤖", layout="wide")

# ==========================================================
# SESSION STATE
# ==========================================================
# Seed each key only when it is absent so values stored by an earlier run of
# the script are kept:
#   history / model_results    -> one entry per model run (fresh list each)
#   last_model_name/last_score -> most recent run, None until a model is applied
#   selected_target            -> target column chosen in the UI
#   cleaned_df                 -> cleaned DataFrame kept for report generation
for _state_key, _make_default in (
    ("history", list),
    ("last_model_name", lambda: None),
    ("last_score", lambda: None),
    ("model_results", list),
    ("selected_target", lambda: None),
    ("cleaned_df", lambda: None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()

# ==========================================================
# THEME CSS
# ==========================================================
# Inject one global <style> block for the dark theme. The rules cover the app
# background, the custom classes used by the st.markdown() calls in this file
# (.big-title, .sub-title, .section, .model-result-box) and a set of selectors
# aimed at Streamlit widgets (buttons, file uploader, select boxes, plots).
# unsafe_allow_html=True is passed so the raw HTML/CSS is not escaped.
# NOTE(review): the .st* widget selectors depend on Streamlit's generated DOM;
# confirm they still match on the Streamlit version in use.
st.markdown("""

<style>

.stApp {

    background: linear-gradient(135deg,#0f172a,#111827,#020617);

    color: white;

}

.big-title {

    font-size: 42px;

    font-weight: 800;

    color: #38bdf8;

    text-align:center;

    padding:15px;

}

.sub-title {

    text-align:center;

    color:#cbd5e1;

    font-size:18px;

    margin-bottom:25px;

}

.section {

    background:#0f172a;

    padding:12px;

    border-radius:12px;

    color:#38bdf8;

    font-weight:700;

    font-size:24px;

    margin-top:20px;

}

.stButton>button {

    background:#38bdf8;

    color:black;

    border:none;

    border-radius:10px;

    font-weight:700;

}

.stButton>button:hover {

    background:#0ea5e9;

    color:white;

}

div[data-baseweb="select"] > div {

    background:#1e293b !important;

    color:white !important;

}

.model-result-box {

    background:#1e293b;

    padding:20px;

    border-radius:12px;

    border:2px solid #38bdf8;

    margin:15px 0;

}

/* File Uploader Button */

.stFileUploader>div>div>button {

    background:#38bdf8 !important;

    color:black !important;

    border:none !important;

    border-radius:10px !important;

    font-weight:700 !important;

}

.stFileUploader>div>div>button:hover {

    background:#0ea5e9 !important;

    color:white !important;

}

/* File Uploader Button Alternative Selectors */

.stFileUploader button {

    background:#38bdf8 !important;

    color:black !important;

    border:none !important;

    border-radius:10px !important;

    font-weight:700 !important;

}

.stFileUploader button:hover {

    background:#0ea5e9 !important;

    color:white !important;

}

/* Download Buttons */

.stDownloadButton>button {

    background:#38bdf8 !important;

    color:black !important;

    border:none !important;

    border-radius:10px !important;

    font-weight:700 !important;

}

.stDownloadButton>button:hover {

    background:#0ea5e9 !important;

    color:white !important;

}

/* File Uploader Label */

.stFileUploader label {

    color:#38bdf8 !important;

    font-size:16px !important;

    font-weight:700 !important;

}

/* Selectbox Labels */

.stSelectbox label {

    color:#38bdf8 !important;

    font-size:16px !important;

    font-weight:700 !important;

}

/* Text and Write Styling */

p {

    color:#cbd5e1;

}

.stWrite {

    color:#cbd5e1;

}

/* Center pyplot figures and add lateral padding */

.stPlotlyChart, .stPyplot {

    display: flex;

    justify-content: center;

}

.stPyplot {

    padding: 0 50px;

}

.stPlotlyChart {

    padding: 0 50px;

}

/* Centered containers */

.stContainer {

    max-width: 95%;

    margin-left: auto;

    margin-right: auto;

}

/* Classification Report Text */

.stText {

    color: white !important;

}

.stText pre {

    color: white !important;

}

.stText * {

    color: white !important;

}

</style>

""", unsafe_allow_html=True)

# ==========================================================
# HEADER
# ==========================================================
# Page title and subtitle, rendered as raw HTML so they pick up the
# .big-title / .sub-title rules defined in the style block above.
st.markdown('<div class="big-title">🤖 AI AutoML Platform</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-title">upload csv select model download trained model</div>', unsafe_allow_html=True)

# ==========================================================
# HELPERS
# ==========================================================
def smart_clean(df):
    """Return a de-duplicated copy of ``df`` with missing values filled.

    Object-dtype columns are filled with their most frequent value (the first
    mode); every other column is filled with its median, which is less
    sensitive to outliers than the mean.

    An object column that has no non-null value at all (for example every
    column of a header-only CSV) has no mode; such a column is left untouched
    instead of raising.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame without duplicate rows and with gaps filled.
    """
    df = df.drop_duplicates().copy()

    for col in df.columns:
        series = df[col]

        if series.dtype == "object":
            modes = series.mode()
            if modes.empty:
                # Nothing to impute from: the column is empty or all-null.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = series.median()

        df[col] = series.fillna(fill_value)

    return df


def convert_units(value):
    """Convert a length written as text (e.g. ``"5 km"``) to metres.

    The first number found in the text is used. A unit is detected by
    substring: ``km`` multiplies by 1000, ``cm`` divides by 100, ``mm`` divides
    by 1000; anything else (including a plain ``m``) returns the number as is.
    A sign directly in front of the number is honoured when it is not glued to
    a preceding word, so ``"-5 km"`` becomes ``-5000.0`` while the hyphen in
    ``"route-5km"`` is treated as a separator.

    Args:
        value: Any value; it is converted with ``str()`` before parsing.

    Returns:
        The converted ``float``, or ``value`` unchanged when no parsable
        number is present (e.g. ``"abc"`` or ``"1.2.3 km"``).
    """
    txt = str(value).lower().strip()

    found = re.search(r'(?:(?<![\w.])[-+])?[\d.]+', txt)
    if found is None:
        return value

    try:
        num = float(found.group())
    except ValueError:
        # Digit/dot runs such as "1.2.3" or "." are not numbers.
        return value

    # "km", "cm" and "mm" are tested before falling back to metres because
    # each of them also contains the letter "m".
    if "km" in txt:
        return num * 1000
    if "cm" in txt:
        return num / 100
    if "mm" in txt:
        return num / 1000
    return num


def detect_unit_columns(df):
    """Convert object columns that look like lengths with units to numbers.

    Only the first row of each object column is inspected: when its lowercase
    text contains ``"km"``, ``"cm"``, ``"mm"`` or ``" m"``, every value of the
    column is passed through ``convert_units``. Columns without rows are
    skipped, so an empty frame is returned as an unchanged copy instead of
    raising ``IndexError``.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A copy of ``df`` with the detected columns converted.
    """
    # NOTE(review): the markers are matched as substrings, so ordinary text
    # such as "5 miles" also triggers a conversion - confirm this is intended.
    unit_markers = ("km", "cm", "mm", " m")
    df = df.copy()

    for col in df.columns:
        column = df[col]
        if column.dtype != "object" or column.empty:
            continue

        sample = str(column.iloc[0]).lower()
        if any(marker in sample for marker in unit_markers):
            df[col] = column.apply(convert_units)

    return df


def detect_best_target(df):
    """Score every column as a candidate classification target.

    Scoring rules (added up per column):
        * +6  when the column has between 2 and 15 distinct values
        * +3  when the column has object dtype
        * -10 when more than 90% of the rows hold distinct values
        * -5  when there are more than 50 distinct values

    Args:
        df: DataFrame whose columns are ranked.

    Returns:
        ``(best, top5)`` where ``best`` is the name of the highest scoring
        column (the earliest column wins ties) and ``top5`` is a list of up to
        five ``(column, score)`` tuples in descending score order. A frame
        without columns yields ``(None, [])``.
    """
    n_rows = len(df)
    scores = {}

    for col in df.columns:
        score = 0
        unique = df[col].nunique()
        # A frame without rows has no meaningful uniqueness ratio; use 0 so
        # the division cannot fail.
        ratio = unique / n_rows if n_rows else 0.0

        if 2 <= unique <= 15:
            score += 6

        if df[col].dtype == "object":
            score += 3

        if ratio > 0.9:
            score -= 10

        if unique > 50:
            score -= 5

        scores[col] = score

    if not scores:
        return None, []

    # sorted() is stable, so with reverse=True the first column among equal
    # scores stays first - the same winner max() would pick.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    return ranked[0][0], ranked[:5]


def prepare_for_supervised(df, target):
    """Label-encode object columns and split the frame into features/target.

    Every object-dtype column (the target included) is cast to ``str`` and
    replaced by integer codes from its own ``LabelEncoder``.

    Args:
        df: Source DataFrame. It is not modified.
        target: Name of the target column.

    Returns:
        ``(X, y, encoded)``: the encoded frame without the target column, the
        encoded target column, and the full encoded frame.
    """
    encoded = df.copy()

    text_columns = [name for name in encoded.columns if encoded[name].dtype == "object"]
    for name in text_columns:
        encoded[name] = LabelEncoder().fit_transform(encoded[name].astype(str))

    features = encoded.drop(columns=[target])
    labels = encoded[target]
    return features, labels, encoded


# --- ACCURACY HELPER FUNCTIONS ---

def clip_outliers_iqr(df):
    """Clamp numeric outliers to the 1.5 * IQR fences; no rows are removed.

    Args:
        df: Source DataFrame. It is not modified.

    Returns:
        ``(clipped, report)``: a copy of ``df`` with numeric columns clamped,
        and a dict mapping each changed column to the number of values that
        lay outside its fences.
    """
    clipped = df.copy()
    report = {}

    numeric_columns = clipped.select_dtypes(include=[np.number]).columns
    for name in numeric_columns:
        series = clipped[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        outside = series.lt(low_fence) | series.gt(high_fence)
        outlier_count = outside.sum()
        if outlier_count <= 0:
            # Nothing beyond the fences: leave the column untouched.
            continue

        clipped[name] = series.clip(lower=low_fence, upper=high_fence)
        report[name] = outlier_count

    return clipped, report


def remove_low_variance(X, threshold=0.01):
    """Drop the columns whose variance is below ``threshold``.

    Returns:
        ``(X, dropped)``: the reduced frame (the input object itself when
        nothing qualifies) and the list of dropped column names.
    """
    column_variance = X.var()
    dropped = [name for name, value in column_variance.items() if value < threshold]

    if not dropped:
        return X, dropped
    return X.drop(columns=dropped), dropped


def remove_high_correlation(X, threshold=0.95):
    """Drop the later column of every pair correlated above ``threshold``.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so a column is dropped when it correlates too strongly with any
    column that precedes it.

    Returns:
        ``(X, dropped)``: the reduced frame and the dropped column names.
    """
    abs_corr = X.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    dropped = [
        name for name in pairwise.columns
        if (pairwise[name] > threshold).any()
    ]

    if dropped:
        X = X.drop(columns=dropped)
    return X, dropped


def balance_classes(X, y):
    """Oversample smaller classes until each matches the largest class.

    Nothing is changed when ``y`` holds fewer than two classes or when the
    largest class is less than twice the size of the smallest one.

    Returns:
        ``(X, y, balanced)``. When ``balanced`` is ``False`` the inputs are
        returned as they are; otherwise copies with the resampled rows
        appended (the appended rows keep their original index labels).
    """
    labels, sizes = np.unique(y, return_counts=True)
    if len(labels) < 2:
        return X, y, False

    largest = sizes.max()
    if largest / sizes.min() < 2:
        return X, y, False

    X_balanced = X.copy()
    y_balanced = y.copy()

    for label, size in zip(labels, sizes):
        if size >= largest:
            continue

        deficit = largest - size
        members = y[y == label].index
        # Draw with replacement from this class's rows to make up the deficit.
        drawn = resample(X.loc[members], replace=True, n_samples=deficit, random_state=42)
        drawn_labels = pd.Series([label] * deficit, index=drawn.index)
        X_balanced = pd.concat([X_balanced, drawn])
        y_balanced = pd.concat([y_balanced, drawn_labels])

    return X_balanced, y_balanced, True


def select_top_features(X, y, max_features=20):
    """Keep at most ``max_features`` columns, ranked by mutual information.

    When ``X`` already has ``max_features`` columns or fewer it is returned
    unchanged and no scores are computed.

    Returns:
        ``(X, kept)``: the (possibly reduced) frame and the kept column names.
    """
    if X.shape[1] <= max_features:
        return X, list(X.columns)

    relevance = pd.Series(
        mutual_info_classif(X, y, random_state=42),
        index=X.columns,
    )
    kept = relevance.sort_values(ascending=False).head(max_features).index.tolist()
    return X[kept], kept


def preprocess_for_model(df, target):
    """Run the full preprocessing pipeline used before fitting a classifier.

    Steps, in order: label-encode object columns, clip feature outliers (IQR),
    drop near-constant features, drop highly correlated features, oversample
    minority classes, and keep the top features by mutual information.

    Outlier clipping is applied to the feature columns only. The encoded
    target holds class codes, not measurements; clipping it could merge
    classes or move labels to fractional fence values (with a strongly
    imbalanced binary target the IQR is 0 and every label would collapse to
    one class).

    Args:
        df: Cleaned source DataFrame. It is not modified.
        target: Name of the target column.

    Returns:
        ``(X, y, transformed, info)`` where ``X``/``y`` are the model-ready
        features and target, ``transformed`` is the label-encoded frame before
        any further step, and ``info`` describes what each step changed
        (``outliers_clipped``, ``low_var_removed``, ``high_corr_removed``,
        ``class_balanced``, ``features_used``).
    """
    X, y, transformed = prepare_for_supervised(df, target)

    # Clip outliers in the features only; the target keeps its class codes.
    X, outlier_info = clip_outliers_iqr(X)

    # Remove low variance
    X, low_var = remove_low_variance(X)

    # Remove high correlation
    X, high_corr = remove_high_correlation(X)

    # Balance classes
    # NOTE(review): oversampling happens before the caller's train/test split,
    # so copies of one row can land on both sides of the split and inflate the
    # reported scores. Consider balancing the training part only.
    X, y, balanced = balance_classes(X, y)

    # Feature selection
    X, _selected = select_top_features(X, y)

    return X, y, transformed, {
        "outliers_clipped": outlier_info,
        "low_var_removed": low_var,
        "high_corr_removed": high_corr,
        "class_balanced": balanced,
        "features_used": list(X.columns),
    }


def show_confusion(y_true, y_pred, title):
    """Render a confusion-matrix heatmap in the centre of the page.

    The plot is drawn on the figure's own axes rather than through pyplot's
    implicit "current figure", and the figure is closed once Streamlit has
    rendered it so that figures do not pile up in pyplot's registry across
    script reruns.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        title: Plot title.

    Returns:
        The rendered matplotlib figure (already closed in pyplot).
    """
    fig, ax = plt.subplots(figsize=(5, 4))

    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="Blues",
        linewidths=1,
        ax=ax,
    )

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    # The middle column of a 1:2:1 layout keeps the figure centred.
    _, center, _ = st.columns([1, 2, 1])
    with center:
        st.pyplot(fig)

    plt.close(fig)
    return fig


def compact_bar(labels, values, title):
    """Render a small bar chart in the centre of the page.

    The chart is drawn on the figure's own axes rather than through pyplot's
    implicit "current figure", and the figure is closed once Streamlit has
    rendered it so that figures do not pile up in pyplot's registry across
    script reruns.

    Args:
        labels: Category labels for the x axis.
        values: Bar heights, aligned with ``labels``.
        title: Plot title.

    Returns:
        The rendered matplotlib figure (already closed in pyplot).
    """
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.barplot(x=labels, y=values, ax=ax)

    # Slightly rotated tick labels stay readable for long feature names.
    ax.tick_params(axis="x", labelrotation=20)
    ax.set_title(title)

    # The middle column of a 1:2:1 layout keeps the figure centred.
    _, center, _ = st.columns([1, 2, 1])
    with center:
        st.pyplot(fig)

    plt.close(fig)
    return fig


def save_result(name, score, target_col, features_used, extra_info=None):
    """Record one model run in the Streamlit session state.

    Updates ``last_model_name`` / ``last_score`` and appends the same record
    to both ``history`` and ``model_results``.

    Args:
        name: Model name.
        score: Score of the run.
        target_col: Target column the model was trained for.
        features_used: Description of the features used.
        extra_info: Optional dict of extra fields merged into the record;
            its keys override the standard ones on a clash.
    """
    state = st.session_state
    state.last_model_name = name
    state.last_score = score

    record = {
        "Model": name,
        "Score": score,
        "Target": target_col,
        "Features": features_used,
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        **(extra_info or {}),
    }

    for log in (state.history, state.model_results):
        log.append(record)


# --- REPORT GENERATORS ---

def generate_text_report(df, target, model_results):
    """Build the plain-text report covering the dataset and all model runs.

    Args:
        df: Dataset described in the summary and column sections.
        target: Target column name; it must be a column of ``df``.
        model_results: List of run records (dicts). ``Model`` and ``Score``
            are required in each record; other fields are printed when
            present.

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def section(title):
        # Section heading framed by two light rules.
        return [light_rule, f"  {title}", light_rule]

    # Optional per-run fields, printed in this order when the key is present.
    optional_fields = (
        ("Precision", "  Precision: {:.2f}%"),
        ("Recall", "  Recall: {:.2f}%"),
        ("F1Score", "  F1 Score: {:.2f}%"),
        ("BestParams", "  Best Hyperparameters: {}"),
        ("OutliersClipped", "  Outliers Clipped: {} columns"),
        ("LowVarRemoved", "  Low Variance Features Removed: {}"),
        ("HighCorrRemoved", "  High Correlation Features Removed: {}"),
        ("ClassBalanced", "  Class Balancing Applied: {}"),
        ("BestK", "  Optimal Clusters (k): {}"),
    )

    top_run = None
    if model_results:
        top_run = max(model_results, key=lambda run: run["Score"])

    report = [
        heavy_rule,
        "  DARK AI AUTOML PLATFORM - FULL REPORT",
        heavy_rule,
        f"  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]

    report += section("DATASET SUMMARY")
    report += [
        f"  Rows: {df.shape[0]}",
        f"  Columns: {df.shape[1]}",
        f"  Target Column: {target}",
        f"  Target Unique Values: {df[target].nunique()}",
        "",
    ]

    report += section("COLUMN DETAILS")
    for col in df.columns:
        column = df[col]
        kind = str(column.dtype)
        distinct = column.nunique()
        gaps = column.isnull().sum()
        report.append(f"  {col}: type={kind}, unique={distinct}, missing={gaps}")
    report.append("")

    report += section("MODEL RESULTS (ALL RUNS)")
    for run_no, run in enumerate(model_results, 1):
        report += [
            "",
            f"  Run #{run_no}",
            f"  Model: {run['Model']}",
            f"  Accuracy/Score: {run['Score']:.2f}%",
            f"  Target Feature: {run.get('Target', 'N/A')}",
            f"  Features Used: {run.get('Features', 'N/A')}",
            f"  Timestamp: {run.get('Timestamp', 'N/A')}",
        ]
        report += [
            template.format(run[key])
            for key, template in optional_fields
            if key in run
        ]

    if top_run:
        report.append("")
        report += section("BEST MODEL")
        report += [
            f"  Model: {top_run['Model']}",
            f"  Score: {top_run['Score']:.2f}%",
            f"  Target: {top_run.get('Target', 'N/A')}",
        ]

    report.append("")
    report += section("PREPROCESSING PIPELINE")
    report += [
        "  - Duplicate removal",
        "  - Missing values handled (median for numeric, mode for categorical)",
        "  - Unit conversion (km/cm/mm -> m)",
        "  - Categorical encoding (LabelEncoder)",
        "  - Outlier clipping (IQR method)",
        "  - Low variance feature removal",
        "  - High correlation feature removal",
        "  - Class imbalance handling (oversampling)",
        "  - Feature selection (mutual information, top 20)",
        "  - Scaling where required (StandardScaler / RobustScaler)",
        "  - Hyperparameter tuning (GridSearchCV)",
        "  - Stratified cross-validation (5-fold)",
        "",
        heavy_rule,
        "  END OF REPORT",
        heavy_rule,
    ]

    return "\n".join(report)


def generate_xlsx_report(df, target, model_results):
    """Build the multi-sheet XLSX report in memory.

    Sheets: "Dataset Summary", "Column Details", "Model Results" and, when at
    least one run exists, "Best Model" (the run with the highest ``Score``).

    Args:
        df: Dataset described in the first two sheets.
        target: Target column name; it must be a column of ``df``.
        model_results: List of run records (dicts).

    Returns:
        An ``io.BytesIO`` holding the workbook, positioned at the start.
    """
    buffer = io.BytesIO()

    summary_rows = [
        ("Rows", df.shape[0]),
        ("Columns", df.shape[1]),
        ("Target Column", target),
        ("Target Unique Values", df[target].nunique()),
    ]

    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        # Sheet 1: dataset summary as property/value pairs
        summary = pd.DataFrame({
            "Property": [label for label, _ in summary_rows],
            "Value": [value for _, value in summary_rows],
        })
        summary.to_excel(writer, sheet_name="Dataset Summary", index=False)

        # Sheet 2: one row per column of the dataset
        column_rows = [
            {
                "Column": col,
                "Type": str(df[col].dtype),
                "Unique Values": df[col].nunique(),
                "Missing Values": df[col].isnull().sum(),
            }
            for col in df.columns
        ]
        pd.DataFrame(column_rows).to_excel(writer, sheet_name="Column Details", index=False)

        # Sheet 3: every recorded run
        pd.DataFrame(model_results).to_excel(writer, sheet_name="Model Results", index=False)

        # Sheet 4: highest scoring run, only when there is one
        if model_results:
            top_run = max(model_results, key=lambda run: run["Score"])
            pd.DataFrame([top_run]).to_excel(writer, sheet_name="Best Model", index=False)

    buffer.seek(0)
    return buffer


# ==========================================================
# UPLOAD
# ==========================================================
st.markdown('<div class="section">📁 Upload Dataset</div>', unsafe_allow_html=True)

file = st.file_uploader("Upload CSV File", type=["csv"])

# ==========================================================
# MAIN APP
# ==========================================================

# Per-classifier settings. The supervised models share one training/reporting
# flow (see _run_classifier); only the entries below differ between them:
#   make_estimator   -> callable returning a fresh, unfitted estimator
#   param_grid       -> grid searched by GridSearchCV
#   make_scaler      -> scaler class applied to the features, or None
#   artifact         -> file the fitted model is written to
#   show_frames      -> show the raw and encoded data side by side
#   importance       -> callable extracting per-feature weights, or None
#   importance_title -> title of the weight chart
_CLASSIFIER_SPECS = {
    "Random Forest": {
        "make_estimator": RandomForestClassifier,
        "param_grid": {
            "n_estimators": [100, 200, 300],
            "max_depth": [5, 10, 15, None],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
        },
        "make_scaler": None,
        "artifact": "random_forest.pkl",
        "show_frames": True,
        "importance": lambda est: est.feature_importances_,
        "importance_title": "Feature Importance",
    },
    "SVM": {
        "make_estimator": SVC,
        "param_grid": {
            "C": [0.1, 1, 10, 100],
            "kernel": ["rbf", "linear", "poly"],
            "gamma": ["scale", "auto"],
        },
        # RobustScaler for SVM (handles outliers better)
        "make_scaler": RobustScaler,
        "artifact": "svm.pkl",
        "show_frames": False,
        "importance": None,
        "importance_title": None,
    },
    "Logistic Regression": {
        "make_estimator": lambda: LogisticRegression(max_iter=5000, solver="liblinear"),
        "param_grid": {
            "C": [0.01, 0.1, 1, 10, 100],
            "penalty": ["l1", "l2"],
        },
        "make_scaler": StandardScaler,
        "artifact": "logistic.pkl",
        "show_frames": False,
        # Coefficient magnitudes of the first coefficient row only.
        "importance": lambda est: np.abs(est.coef_[0]),
        "importance_title": "Feature Coefficients (Absolute)",
    },
    "Decision Tree": {
        "make_estimator": DecisionTreeClassifier,
        "param_grid": {
            "max_depth": [3, 5, 10, 15, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "criterion": ["gini", "entropy"],
        },
        "make_scaler": None,
        "artifact": "decision_tree.pkl",
        "show_frames": False,
        "importance": lambda est: est.feature_importances_,
        "importance_title": "Feature Importance",
    },
}


def _show_centered(fig):
    """Render ``fig`` in the middle column of a 1:2:1 layout, then close it."""
    _, middle, _ = st.columns([1, 2, 1])
    with middle:
        st.pyplot(fig)
    # Closing keeps figures from accumulating across Streamlit reruns.
    plt.close(fig)


def _run_classifier(model_name, spec, df, raw, target):
    """Train, evaluate, display, persist and record one supervised model.

    Args:
        model_name: Display name; also used for headings and the saved record.
        spec: The model's entry from ``_CLASSIFIER_SPECS``.
        df: Cleaned dataset.
        raw: Dataset as uploaded (only shown when ``spec["show_frames"]``).
        target: Target column name.
    """
    X, y, transformed, pp_info = preprocess_for_model(df, target)
    features_used = pp_info["features_used"]

    with st.container():
        st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
        st.markdown(f"### {model_name} Results (Target: {target})")

        if spec["show_frames"]:
            col1, col2 = st.columns(2)

            with col1:
                st.write("Original")
                st.dataframe(raw.head())

            with col2:
                st.write("Processed")
                st.dataframe(transformed.head())

        scaler = spec["make_scaler"]() if spec["make_scaler"] else None

        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            if scaler is not None:
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            model = GridSearchCV(
                spec["make_estimator"](),
                spec["param_grid"],
                cv=cv,
                n_jobs=-1
            )

            model.fit(X_train, y_train)
        except ValueError as err:
            # Typical causes: a class with too few rows for a stratified
            # split, or a continuous column chosen as the target.
            st.error(f"Could not train {model_name} on target '{target}': {err}")
            st.markdown('</div>', unsafe_allow_html=True)
            return

        pred = model.predict(X_test)

        acc = accuracy_score(y_test, pred)*100
        prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
        rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
        f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100

        st.success(f"Accuracy: {acc:.2f}%")
        st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")

        show_confusion(y_test, pred, f"{model_name} Matrix")

        if spec["importance"] is not None:
            weights = pd.Series(
                spec["importance"](model.best_estimator_),
                index=X.columns
            ).sort_values(ascending=False).head(8)

            compact_bar(weights.index, weights.values, spec["importance_title"])

        st.write("**Classification Report:**")
        st.text(classification_report(y_test, pred, zero_division=0))

        st.markdown('</div>', unsafe_allow_html=True)

    artifact = model.best_estimator_
    if scaler is not None:
        # The estimator was fitted on scaled features; persist it together
        # with its fitted scaler so the saved model accepts unscaled input.
        from sklearn.pipeline import Pipeline
        artifact = Pipeline([("scaler", scaler), ("model", model.best_estimator_)])

    joblib.dump(artifact, spec["artifact"])

    save_result(model_name, acc, target, ", ".join(features_used), {
        "Precision": prec,
        "Recall": rec,
        "F1Score": f1,
        "BestParams": str(model.best_params_),
        "OutliersClipped": len(pp_info["outliers_clipped"]),
        "LowVarRemoved": str(pp_info["low_var_removed"]),
        "HighCorrRemoved": str(pp_info["high_corr_removed"]),
        "ClassBalanced": pp_info["class_balanced"],
    })


def _run_kmeans(df, target):
    """Cluster the non-target columns with KMeans and report the result.

    Args:
        df: Cleaned dataset.
        target: Target column name; it is excluded from the clustering input.
    """
    temp = df.copy()

    for col in temp.columns:
        if temp[col].dtype == "object":
            temp[col] = LabelEncoder().fit_transform(temp[col].astype(str))

    # Clip outliers for clustering too
    temp_clipped, outlier_info = clip_outliers_iqr(temp)
    X_clipped = temp_clipped.drop(columns=[target])

    n_samples, n_features = X_clipped.shape
    if n_features == 0 or n_samples < 3:
        # The silhouette score needs 2 <= k <= n_samples - 1 clusters.
        st.warning("KMeans needs at least one feature column and three rows.")
        return

    Xs = StandardScaler().fit_transform(X_clipped)

    # Find optimal k using elbow method
    k_values = list(range(2, min(11, len(df) // 10 + 1)))
    inertias = []
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        km.fit(Xs)
        inertias.append(km.inertia_)

    best_k = 3
    if len(inertias) >= 3:
        # Take the k right after the largest drop in inertia.
        drops = [before - after for before, after in zip(inertias, inertias[1:])]
        best_k = k_values[int(np.argmax(drops)) + 1]
        best_k = max(2, min(best_k, 10))

    # Never ask for more clusters than the silhouette score can evaluate.
    best_k = max(2, min(best_k, n_samples - 1))

    with st.container():
        st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
        st.markdown(f"### KMeans Clustering Results (Target: {target})")

        model = KMeans(n_clusters=best_k, random_state=42, n_init=10)

        try:
            cluster = model.fit_predict(Xs)
            score = silhouette_score(Xs, cluster)*100
        except ValueError as err:
            # e.g. all rows identical, so only one distinct cluster is found
            st.error(f"Could not cluster the data: {err}")
            st.markdown('</div>', unsafe_allow_html=True)
            return

        st.success(f"Cluster Quality Score: {score:.2f}% (k={best_k})")

        # Scatter of the first two scaled features; with a single feature the
        # points are laid out along a flat line.
        second_axis = Xs[:, 1] if n_features > 1 else np.zeros(n_samples)
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.scatter(Xs[:, 0], second_axis, c=cluster, cmap="viridis")
        ax.set_title(f"Clusters (k={best_k})")
        _show_centered(fig)

        # Elbow plot
        fig2, ax2 = plt.subplots(figsize=(6, 3))
        ax2.plot(k_values, inertias, "bo-")
        ax2.set_xlabel("Number of Clusters (k)")
        ax2.set_ylabel("Inertia")
        ax2.set_title("Elbow Method")
        _show_centered(fig2)

        # Cluster distribution
        cluster_counts = pd.Series(cluster).value_counts().sort_index()
        fig3, ax3 = plt.subplots(figsize=(6, 3))
        sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax3)
        ax3.set_xlabel("Cluster")
        ax3.set_ylabel("Count")
        ax3.set_title("Cluster Distribution")
        _show_centered(fig3)

        st.markdown('</div>', unsafe_allow_html=True)

    joblib.dump(model, "kmeans.pkl")

    save_result("KMeans Clustering", score, target, ", ".join(X_clipped.columns), {
        "BestK": best_k,
        "OutliersClipped": len(outlier_info),
    })


if file:

    raw = pd.read_csv(file)

    st.markdown('<div class="section">📌 Dataset Preview</div>', unsafe_allow_html=True)
    st.dataframe(raw.head(), use_container_width=True)

    df = smart_clean(raw)
    df = detect_unit_columns(df)

    # Kept in session state so report generation can use the cleaned data.
    st.session_state.cleaned_df = df

    # ------------------------------------------------------
    # TARGET DETECTION
    # ------------------------------------------------------
    st.markdown('<div class="section">🎯 AI Target Detection</div>', unsafe_allow_html=True)

    best_target, top5 = detect_best_target(df)

    st.success(f"Recommended Target Column: {best_target}")

    st.write("Top Suggestions:")

    for n, s in top5:
        st.write(f"• {n} (score: {s})")

    # Dropdown with AI recommendation pre-selected, user can override
    target = st.selectbox(
        "Choose Target Column (AI recommended is pre-selected - change if needed)",
        [best_target] + [c for c in df.columns if c != best_target]
    )

    st.session_state.selected_target = target

    # ------------------------------------------------------
    # MODEL SELECT
    # ------------------------------------------------------
    st.markdown('<div class="section">🤖 Choose Model</div>', unsafe_allow_html=True)

    model_choice = st.selectbox(
        "Select One Model",
        [
            "Random Forest",
            "SVM",
            "Logistic Regression",
            "Decision Tree",
            "KMeans Clustering"
        ]
    )

    # ------------------------------------------------------
    # APPLY MODEL
    # ------------------------------------------------------
    if st.button("🚀 Apply Model"):

        # Each run renders into its own container.
        if model_choice == "KMeans Clustering":
            _run_kmeans(df, target)
        else:
            _run_classifier(model_choice, _CLASSIFIER_SPECS[model_choice], df, raw, target)

# ==========================================================
# DOWNLOAD SECTION
# ==========================================================
if st.session_state.last_model_name:

    st.markdown('<div class="section">⬇ Downloads</div>', unsafe_allow_html=True)

    # Model display name -> pickle file name on disk.
    model_name = st.session_state.last_model_name
    artifact_by_model = dict([
        ("Random Forest", "random_forest.pkl"),
        ("SVM", "svm.pkl"),
        ("Logistic Regression", "logistic.pkl"),
        ("Decision Tree", "decision_tree.pkl"),
        ("KMeans Clustering", "kmeans.pkl"),
    ])
    artifact_path = artifact_by_model[model_name]

    # Offer the download only when the pickle is actually present on disk.
    if os.path.exists(artifact_path):
        with open(artifact_path, "rb") as handle:
            st.download_button(
                label=f"Download {model_name} (Deploy Ready)",
                data=handle,
                file_name=artifact_path,
                mime="application/octet-stream",
            )

# ==========================================================
# HISTORY + REPORTS
# ==========================================================
if st.session_state.history:

    st.markdown('<div class="section">📊 History</div>', unsafe_allow_html=True)

    hist = pd.DataFrame(st.session_state.history)

    st.dataframe(hist, use_container_width=True)

    # Bar chart of Score per Model. Draw on the explicit Axes rather than
    # the global pyplot state, and close the figure once rendered so figures
    # do not pile up across script reruns.
    fig, ax = plt.subplots(figsize=(6,3))
    sns.barplot(data=hist, x="Model", y="Score", ax=ax)
    ax.tick_params(axis="x", labelrotation=20)
    ax.set_title("All Applied Models")
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.pyplot(fig)
    plt.close(fig)

    # CSV: to_csv returns the text directly when no target is given.
    st.download_button(
        "Download Results CSV",
        hist.to_csv(index=False),
        "results.csv"
    )

    # TXT and XLSX reports need both the cleaned frame and per-run details.
    if st.session_state.cleaned_df is not None and st.session_state.model_results:
        report_text = generate_text_report(
            st.session_state.cleaned_df,
            st.session_state.selected_target or "unknown",
            st.session_state.model_results
        )

        st.download_button(
            "Download Full Report (TXT)",
            report_text,
            "full_report.txt",
            mime="text/plain"
        )

        # XLSX report: best-effort. On failure the other downloads remain
        # available and the reason is shown instead of being swallowed.
        try:
            xlsx_data = generate_xlsx_report(
                st.session_state.cleaned_df,
                st.session_state.selected_target or "unknown",
                st.session_state.model_results
            )
        except Exception as exc:
            st.warning(f"Excel report could not be generated: {exc}")
        else:
            st.download_button(
                "Download Full Report (XLSX)",
                data=xlsx_data.getvalue(),
                file_name="full_report.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )

# ==========================================================
# RESET
# ==========================================================
st.markdown('<div class="section">♻ Reset</div>', unsafe_allow_html=True)


def _clear_history():
    """Reset every run-related session-state entry to its initial value.

    Used as the button's ``on_click`` callback so the state is cleared
    before the script reruns. Clearing inside the ``if st.button(...)``
    body instead would happen after the Downloads and History sections had
    already rendered, leaving stale results on screen.
    """
    st.session_state.history = []
    st.session_state.last_model_name = None
    st.session_state.last_score = None
    st.session_state.model_results = []
    st.session_state.selected_target = None
    st.session_state.cleaned_df = None


if st.button("Clear History", on_click=_clear_history):

    # The button returns True on the rerun triggered by the click, so the
    # confirmation still appears in this section.
    st.success("History Cleared")