import streamlit as st
import pandas as pd
import numpy as np
import re
import io
import os
import joblib
import matplotlib

matplotlib.use("Agg")  # non-interactive backend so figures render inside Streamlit

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import (
    accuracy_score, confusion_matrix, silhouette_score,
    classification_report, f1_score, precision_score, recall_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import resample

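# Basic Streamlit page configuration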
st.set_page_config(
    page_title="AI AutoML Platform",
    page_icon="🤖",
    layout="wide"
)

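# Session-state defaults so history and results survive Streamlit reruns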
| if "history" not in st.session_state:
|
| st.session_state.history = []
|
|
|
| if "last_model_name" not in st.session_state:
|
| st.session_state.last_model_name = None
|
|
|
| if "last_score" not in st.session_state:
|
| st.session_state.last_score = None
|
|
|
| if "model_results" not in st.session_state:
|
| st.session_state.model_results = []
|
|
|
|
|
| if "selected_target" not in st.session_state:
|
| st.session_state.selected_target = None
|
|
|
|
|
| if "cleaned_df" not in st.session_state:
|
| st.session_state.cleaned_df = None
|
|
|
|
|
|
|
|
|
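# Dark-theme styling for the whole app, injected as raw CSS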
st.markdown("""
<style>
.stApp {
    background: linear-gradient(135deg, #0f172a, #111827, #020617);
    color: white;
}
.big-title {
    font-size: 42px;
    font-weight: 800;
    color: #38bdf8;
    text-align: center;
    padding: 15px;
}
.sub-title {
    text-align: center;
    color: #cbd5e1;
    font-size: 18px;
    margin-bottom: 25px;
}
.section {
    background: #0f172a;
    padding: 12px;
    border-radius: 12px;
    color: #38bdf8;
    font-weight: 700;
    font-size: 24px;
    margin-top: 20px;
}
.stButton>button {
    background: #38bdf8;
    color: black;
    border: none;
    border-radius: 10px;
    font-weight: 700;
}
.stButton>button:hover {
    background: #0ea5e9;
    color: white;
}
div[data-baseweb="select"] > div {
    background: #1e293b !important;
    color: white !important;
}
.model-result-box {
    background: #1e293b;
    padding: 20px;
    border-radius: 12px;
    border: 2px solid #38bdf8;
    margin: 15px 0;
}
/* File uploader buttons (two selectors to cover different Streamlit versions) */
.stFileUploader>div>div>button,
.stFileUploader button {
    background: #38bdf8 !important;
    color: black !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: 700 !important;
}
.stFileUploader>div>div>button:hover,
.stFileUploader button:hover {
    background: #0ea5e9 !important;
    color: white !important;
}
/* Download buttons */
.stDownloadButton>button {
    background: #38bdf8 !important;
    color: black !important;
    border: none !important;
    border-radius: 10px !important;
    font-weight: 700 !important;
}
.stDownloadButton>button:hover {
    background: #0ea5e9 !important;
    color: white !important;
}
/* File uploader and selectbox labels */
.stFileUploader label,
.stSelectbox label {
    color: #38bdf8 !important;
    font-size: 16px !important;
    font-weight: 700 !important;
}
/* Text and write styling */
p {
    color: #cbd5e1;
}
.stWrite {
    color: #cbd5e1;
}
/* Center pyplot figures and add lateral padding */
.stPlotlyChart, .stPyplot {
    display: flex;
    justify-content: center;
    padding: 0 50px;
}
/* Centered containers */
.stContainer {
    max-width: 95%;
    margin-left: auto;
    margin-right: auto;
}
/* Classification report text */
.stText, .stText pre, .stText * {
    color: white !important;
}
</style>
""", unsafe_allow_html=True)

| st.markdown('<div class="big-title">🤖 AI AutoML Platform</div>', unsafe_allow_html=True)
|
| st.markdown('<div class="sub-title">upload csv select model download trained model</div>', unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
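# Data-cleaning, unit-conversion, target-detection and preprocessing helpers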
def smart_clean(df):
    """Drop duplicate rows and fill missing values (mode for text, median for numbers)."""
    df = df.copy()
    df = df.drop_duplicates()

    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].median())

    return df

def convert_units(value):
    """Convert strings such as '5 km' or '120 cm' into a number of metres."""
    try:
        txt = str(value).lower().strip()

        nums = re.findall(r'[\d.]+', txt)
        if not nums:
            return value

        num = float(nums[0])

        if "km" in txt:
            return num * 1000
        elif "cm" in txt:
            return num / 100
        elif "mm" in txt:
            return num / 1000
        # plain metres or unit-less values pass through as the parsed number
        return num
    except (ValueError, TypeError):
        return value

def detect_unit_columns(df):
    """Apply convert_units to text columns whose first value mentions km/cm/mm/m."""
    df = df.copy()

    for col in df.columns:
        if df[col].dtype == "object":
            sample = str(df[col].iloc[0]).lower()

            if any(x in sample for x in ["km", "cm", "mm", " m"]):
                df[col] = df[col].apply(convert_units)

    return df

def detect_best_target(df):
    """Score every column as a candidate classification target and return the best plus the top five."""
    scores = {}

    for col in df.columns:
        score = 0
        unique = df[col].nunique()
        ratio = unique / len(df)

        if 2 <= unique <= 15:
            score += 6

        if df[col].dtype == "object":
            score += 3

        if ratio > 0.9:
            score -= 10

        if unique > 50:
            score -= 5

        scores[col] = score

    best = max(scores, key=scores.get)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    return best, ranked[:5]

def prepare_for_supervised(df, target):
    """Label-encode every text column and split the frame into features X and target y."""
    data = df.copy()

    for col in data.columns:
        if data[col].dtype == "object":
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))

    X = data.drop(columns=[target])
    y = data[target]

    return X, y, data

def clip_outliers_iqr(df):
    """Clip outliers using the IQR method instead of removing rows."""
    df = df.copy()
    info = {}
    for col in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        n_out = ((df[col] < lower) | (df[col] > upper)).sum()
        if n_out > 0:
            df[col] = df[col].clip(lower=lower, upper=upper)
            info[col] = n_out
    return df, info

def remove_low_variance(X, threshold=0.01):
    """Remove features with near-zero variance."""
    variances = X.var()
    low = variances[variances < threshold].index.tolist()
    if low:
        X = X.drop(columns=low)
    return X, low

def remove_high_correlation(X, threshold=0.95):
    """Remove one of each pair of highly correlated features."""
    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [c for c in upper.columns if any(upper[c] > threshold)]
    if to_drop:
        X = X.drop(columns=to_drop)
    return X, to_drop

def balance_classes(X, y):
    """Oversample minority classes to match the majority count."""
    classes, counts = np.unique(y, return_counts=True)
    if len(classes) < 2:
        return X, y, False

    max_count = counts.max()
    ratio = max_count / counts.min()
    if ratio < 2:
        # classes are already roughly balanced; leave the data untouched
        return X, y, False

    X_out = X.copy()
    y_out = y.copy()

    for cls, cnt in zip(classes, counts):
        if cnt < max_count:
            idx = y[y == cls].index
            extra = resample(X.loc[idx], replace=True, n_samples=max_count - cnt, random_state=42)
            y_extra = pd.Series([cls] * (max_count - cnt), index=extra.index)
            X_out = pd.concat([X_out, extra])
            y_out = pd.concat([y_out, y_extra])

    return X_out, y_out, True

def select_top_features(X, y, max_features=20):
    """Select top features by mutual information."""
    if X.shape[1] <= max_features:
        return X, list(X.columns)

    mi = mutual_info_classif(X, y, random_state=42)
    top = pd.Series(mi, index=X.columns).sort_values(ascending=False).head(max_features).index.tolist()
    return X[top], top

def preprocess_for_model(df, target):
    """Full accuracy-boosting preprocessing pipeline."""
    # encode categoricals, then work on the fully numeric frame
    _, _, transformed = prepare_for_supervised(df, target)

    transformed_clipped, outlier_info = clip_outliers_iqr(transformed)
    X = transformed_clipped.drop(columns=[target])
    y = transformed_clipped[target]

    X, low_var = remove_low_variance(X)
    X, high_corr = remove_high_correlation(X)
    X, y, balanced = balance_classes(X, y)
    X, selected = select_top_features(X, y)

    return X, y, transformed, {
        "outliers_clipped": outlier_info,
        "low_var_removed": low_var,
        "high_corr_removed": high_corr,
        "class_balanced": balanced,
        "features_used": list(X.columns),
    }

def show_confusion(y_true, y_pred, title):
    """Render a centred confusion-matrix heatmap."""
    fig, ax = plt.subplots(figsize=(5, 4))
    cm = confusion_matrix(y_true, y_pred)

    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        linewidths=1
    )

    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")

    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.pyplot(fig)
    return fig

def compact_bar(labels, values, title):
    """Render a small centred bar chart."""
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.barplot(x=labels, y=values)

    plt.xticks(rotation=20)
    plt.title(title)

    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.pyplot(fig)
    return fig

def save_result(name, score, target_col, features_used, extra_info=None):
    """Store one run's details in session state for the history table and reports."""
    st.session_state.last_model_name = name
    st.session_state.last_score = score

    entry = {
        "Model": name,
        "Score": score,
        "Target": target_col,
        "Features": features_used,
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    if extra_info:
        entry.update(extra_info)

    st.session_state.history.append(entry)
    st.session_state.model_results.append(entry)

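# Report generation (plain-text and multi-sheet XLSX)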
def generate_text_report(df, target, model_results):
    """Generate a comprehensive TXT report covering the dataset and every model run."""
    best = max(model_results, key=lambda x: x["Score"]) if model_results else None

    lines = []
    lines.append("=" * 70)
    lines.append(" AI AUTOML PLATFORM - FULL REPORT")
    lines.append("=" * 70)
    lines.append(f" Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")
    lines.append("-" * 70)
    lines.append(" DATASET SUMMARY")
    lines.append("-" * 70)
    lines.append(f" Rows: {df.shape[0]}")
    lines.append(f" Columns: {df.shape[1]}")
    lines.append(f" Target Column: {target}")
    lines.append(f" Target Unique Values: {df[target].nunique()}")
    lines.append("")

    lines.append("-" * 70)
    lines.append(" COLUMN DETAILS")
    lines.append("-" * 70)
    for col in df.columns:
        dtype = str(df[col].dtype)
        nunique = df[col].nunique()
        missing = df[col].isnull().sum()
        lines.append(f" {col}: type={dtype}, unique={nunique}, missing={missing}")
    lines.append("")

    lines.append("-" * 70)
    lines.append(" MODEL RESULTS (ALL RUNS)")
    lines.append("-" * 70)
    for i, r in enumerate(model_results, 1):
        lines.append("")
        lines.append(f" Run #{i}")
        lines.append(f" Model: {r['Model']}")
        lines.append(f" Accuracy/Score: {r['Score']:.2f}%")
        lines.append(f" Target Feature: {r.get('Target', 'N/A')}")
        lines.append(f" Features Used: {r.get('Features', 'N/A')}")
        lines.append(f" Timestamp: {r.get('Timestamp', 'N/A')}")
        if "Precision" in r:
            lines.append(f" Precision: {r['Precision']:.2f}%")
        if "Recall" in r:
            lines.append(f" Recall: {r['Recall']:.2f}%")
        if "F1Score" in r:
            lines.append(f" F1 Score: {r['F1Score']:.2f}%")
        if "BestParams" in r:
            lines.append(f" Best Hyperparameters: {r['BestParams']}")
        if "OutliersClipped" in r:
            lines.append(f" Outliers Clipped: {r['OutliersClipped']} columns")
        if "LowVarRemoved" in r:
            lines.append(f" Low Variance Features Removed: {r['LowVarRemoved']}")
        if "HighCorrRemoved" in r:
            lines.append(f" High Correlation Features Removed: {r['HighCorrRemoved']}")
        if "ClassBalanced" in r:
            lines.append(f" Class Balancing Applied: {r['ClassBalanced']}")
        if "BestK" in r:
            lines.append(f" Optimal Clusters (k): {r['BestK']}")

    if best:
        lines.append("")
        lines.append("-" * 70)
        lines.append(" BEST MODEL")
        lines.append("-" * 70)
        lines.append(f" Model: {best['Model']}")
        lines.append(f" Score: {best['Score']:.2f}%")
        lines.append(f" Target: {best.get('Target', 'N/A')}")

    lines.append("")
    lines.append("-" * 70)
    lines.append(" PREPROCESSING PIPELINE")
    lines.append("-" * 70)
    lines.append(" - Duplicate removal")
    lines.append(" - Missing values handled (median for numeric, mode for categorical)")
    lines.append(" - Unit conversion (km/cm/mm -> m)")
    lines.append(" - Categorical encoding (LabelEncoder)")
    lines.append(" - Outlier clipping (IQR method)")
    lines.append(" - Low variance feature removal")
    lines.append(" - High correlation feature removal")
    lines.append(" - Class imbalance handling (oversampling)")
    lines.append(" - Feature selection (mutual information, top 20)")
    lines.append(" - Scaling where required (StandardScaler / RobustScaler)")
    lines.append(" - Hyperparameter tuning (GridSearchCV)")
    lines.append(" - Stratified cross-validation (5-fold)")
    lines.append("")
    lines.append("=" * 70)
    lines.append(" END OF REPORT")
    lines.append("=" * 70)

    return "\n".join(lines)

def generate_xlsx_report(df, target, model_results):
    """Generate a multi-sheet XLSX report with every detail (requires openpyxl)."""
    output = io.BytesIO()

    with pd.ExcelWriter(output, engine="openpyxl") as writer:

        summary = pd.DataFrame({
            "Property": ["Rows", "Columns", "Target Column", "Target Unique Values"],
            "Value": [df.shape[0], df.shape[1], target, df[target].nunique()]
        })
        summary.to_excel(writer, sheet_name="Dataset Summary", index=False)

        col_details = []
        for col in df.columns:
            col_details.append({
                "Column": col,
                "Type": str(df[col].dtype),
                "Unique Values": df[col].nunique(),
                "Missing Values": df[col].isnull().sum(),
            })
        pd.DataFrame(col_details).to_excel(writer, sheet_name="Column Details", index=False)

        results_df = pd.DataFrame(model_results)
        results_df.to_excel(writer, sheet_name="Model Results", index=False)

        if model_results:
            best = max(model_results, key=lambda x: x["Score"])
            pd.DataFrame([best]).to_excel(writer, sheet_name="Best Model", index=False)

    output.seek(0)
    return output

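# Main app flow: upload a CSV, preview and clean it, pick a target and a model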
| st.markdown('<div class="section">📁 Upload Dataset</div>', unsafe_allow_html=True)
|
|
|
| file = st.file_uploader("Upload CSV File", type=["csv"])
|
|
|
|
|
|
|
|
|
if file:

    raw = pd.read_csv(file)

    st.markdown('<div class="section">📌 Dataset Preview</div>', unsafe_allow_html=True)
    st.dataframe(raw.head(), use_container_width=True)

    # clean the data and convert any unit-tagged text columns to numbers
    df = smart_clean(raw)
    df = detect_unit_columns(df)

    st.session_state.cleaned_df = df

    st.markdown('<div class="section">🎯 AI Target Detection</div>', unsafe_allow_html=True)

    best_target, top5 = detect_best_target(df)

    st.success(f"Recommended Target Column: {best_target}")

    st.write("Top Suggestions:")
    for n, s in top5:
        st.write(f"• {n} (score: {s})")

    target = st.selectbox(
        "Choose Target Column (AI recommended is pre-selected - change if needed)",
        [best_target] + [c for c in df.columns if c != best_target]
    )

    st.session_state.selected_target = target

| st.markdown('<div class="section">🤖 Choose Model</div>', unsafe_allow_html=True)
|
|
|
| model_choice = st.selectbox(
|
| "Select One Model",
|
| [
|
| "Random Forest",
|
| "SVM",
|
| "Logistic Regression",
|
| "Decision Tree",
|
| "KMeans Clustering"
|
| ]
|
| )
|
|
|
|
|
|
|
|
|
    if st.button("🚀 Apply Model"):

| if model_choice == "Random Forest":
|
|
|
| X, y, transformed, pp_info = preprocess_for_model(df, target)
|
| features_used = pp_info["features_used"]
|
|
|
| result_box = st.container()
|
| with result_box:
|
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
|
| st.markdown(f"### Random Forest Results (Target: {target})")
|
|
|
| col1, col2 = st.columns(2)
|
|
|
| with col1:
|
| st.write("Original")
|
| st.dataframe(raw.head())
|
|
|
| with col2:
|
| st.write("Processed")
|
| st.dataframe(transformed.head())
|
|
|
| X_train, X_test, y_train, y_test = train_test_split(
|
| X, y, test_size=0.2, random_state=42, stratify=y
|
| )
|
|
|
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
| model = GridSearchCV(
|
| RandomForestClassifier(),
|
| {
|
| "n_estimators":[100,200,300],
|
| "max_depth":[5,10,15,None],
|
| "min_samples_split":[2,5],
|
| "min_samples_leaf":[1,2]
|
| },
|
| cv=cv,
|
| n_jobs=-1
|
| )
|
|
|
| model.fit(X_train, y_train)
|
|
|
| pred = model.predict(X_test)
|
|
|
| acc = accuracy_score(y_test, pred)*100
|
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
|
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
|
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
|
|
|
| st.success(f"Accuracy: {acc:.2f}%")
|
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
|
|
|
| show_confusion(y_test, pred, "Random Forest Matrix")
|
|
|
| imp = pd.Series(
|
| model.best_estimator_.feature_importances_,
|
| index=X.columns
|
| ).sort_values(ascending=False).head(8)
|
|
|
| compact_bar(imp.index, imp.values, "Feature Importance")
|
|
|
| st.write("**Classification Report:**")
|
| st.text(classification_report(y_test, pred, zero_division=0))
|
|
|
| st.markdown('</div>', unsafe_allow_html=True)
|
|
|
| joblib.dump(model.best_estimator_, "random_forest.pkl")
|
|
|
| save_result("Random Forest", acc, target, ", ".join(features_used), {
|
| "Precision": prec,
|
| "Recall": rec,
|
| "F1Score": f1,
|
| "BestParams": str(model.best_params_),
|
| "OutliersClipped": len(pp_info["outliers_clipped"]),
|
| "LowVarRemoved": str(pp_info["low_var_removed"]),
|
| "HighCorrRemoved": str(pp_info["high_corr_removed"]),
|
| "ClassBalanced": pp_info["class_balanced"],
|
| })
|
|
|
|
|
| elif model_choice == "SVM":
|
|
|
| X, y, transformed, pp_info = preprocess_for_model(df, target)
|
| features_used = pp_info["features_used"]
|
|
|
| result_box = st.container()
|
| with result_box:
|
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
|
| st.markdown(f"### SVM Results (Target: {target})")
|
|
|
| X_train, X_test, y_train, y_test = train_test_split(
|
| X, y, test_size=0.2, random_state=42, stratify=y
|
| )
|
|
|
|
|
| sc = RobustScaler()
|
|
|
| X_train = sc.fit_transform(X_train)
|
| X_test = sc.transform(X_test)
|
|
|
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
| model = GridSearchCV(
|
| SVC(),
|
| {
|
| "C":[0.1,1,10,100],
|
| "kernel":["rbf","linear","poly"],
|
| "gamma":["scale","auto"]
|
| },
|
| cv=cv,
|
| n_jobs=-1
|
| )
|
|
|
| model.fit(X_train, y_train)
|
|
|
| pred = model.predict(X_test)
|
|
|
| acc = accuracy_score(y_test, pred)*100
|
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
|
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
|
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
|
|
|
| st.success(f"Accuracy: {acc:.2f}%")
|
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
|
|
|
| show_confusion(y_test, pred, "SVM Matrix")
|
|
|
| st.write("**Classification Report:**")
|
| st.text(classification_report(y_test, pred, zero_division=0))
|
|
|
| st.markdown('</div>', unsafe_allow_html=True)
|
|
|
| joblib.dump(model.best_estimator_, "svm.pkl")
|
|
|
| save_result("SVM", acc, target, ", ".join(features_used), {
|
| "Precision": prec,
|
| "Recall": rec,
|
| "F1Score": f1,
|
| "BestParams": str(model.best_params_),
|
| "OutliersClipped": len(pp_info["outliers_clipped"]),
|
| "LowVarRemoved": str(pp_info["low_var_removed"]),
|
| "HighCorrRemoved": str(pp_info["high_corr_removed"]),
|
| "ClassBalanced": pp_info["class_balanced"],
|
| })
|
|
|
|
|
| elif model_choice == "Logistic Regression":
|
|
|
| X, y, transformed, pp_info = preprocess_for_model(df, target)
|
| features_used = pp_info["features_used"]
|
|
|
| result_box = st.container()
|
| with result_box:
|
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
|
| st.markdown(f"### Logistic Regression Results (Target: {target})")
|
|
|
| X_train, X_test, y_train, y_test = train_test_split(
|
| X, y, test_size=0.2, random_state=42, stratify=y
|
| )
|
|
|
| sc = StandardScaler()
|
|
|
| X_train = sc.fit_transform(X_train)
|
| X_test = sc.transform(X_test)
|
|
|
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
| model = GridSearchCV(
|
| LogisticRegression(max_iter=5000, solver="liblinear"),
|
| {
|
| "C":[0.01,0.1,1,10,100],
|
| "penalty":["l1","l2"]
|
| },
|
| cv=cv,
|
| n_jobs=-1
|
| )
|
|
|
| model.fit(X_train, y_train)
|
|
|
| pred = model.predict(X_test)
|
|
|
| acc = accuracy_score(y_test, pred)*100
|
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
|
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
|
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
|
|
|
| st.success(f"Accuracy: {acc:.2f}%")
|
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
|
|
|
| show_confusion(y_test, pred, "Logistic Regression Matrix")
|
|
|
|
|
| if hasattr(model.best_estimator_, "coef_"):
|
| coef = pd.Series(
|
| np.abs(model.best_estimator_.coef_[0]),
|
| index=X.columns
|
| ).sort_values(ascending=False).head(8)
|
| compact_bar(coef.index, coef.values, "Feature Coefficients (Absolute)")
|
|
|
| st.write("**Classification Report:**")
|
| st.text(classification_report(y_test, pred, zero_division=0))
|
|
|
| st.markdown('</div>', unsafe_allow_html=True)
|
|
|
| joblib.dump(model.best_estimator_, "logistic.pkl")
|
|
|
| save_result("Logistic Regression", acc, target, ", ".join(features_used), {
|
| "Precision": prec,
|
| "Recall": rec,
|
| "F1Score": f1,
|
| "BestParams": str(model.best_params_),
|
| "OutliersClipped": len(pp_info["outliers_clipped"]),
|
| "LowVarRemoved": str(pp_info["low_var_removed"]),
|
| "HighCorrRemoved": str(pp_info["high_corr_removed"]),
|
| "ClassBalanced": pp_info["class_balanced"],
|
| })
|
|
|
|
|
| elif model_choice == "Decision Tree":
|
|
|
| X, y, transformed, pp_info = preprocess_for_model(df, target)
|
| features_used = pp_info["features_used"]
|
|
|
| result_box = st.container()
|
| with result_box:
|
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
|
| st.markdown(f"### Decision Tree Results (Target: {target})")
|
|
|
| X_train, X_test, y_train, y_test = train_test_split(
|
| X, y, test_size=0.2, random_state=42, stratify=y
|
| )
|
|
|
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
| model = GridSearchCV(
|
| DecisionTreeClassifier(),
|
| {
|
| "max_depth":[3,5,10,15,None],
|
| "min_samples_split":[2,5,10],
|
| "min_samples_leaf":[1,2,4],
|
| "criterion":["gini","entropy"]
|
| },
|
| cv=cv,
|
| n_jobs=-1
|
| )
|
|
|
| model.fit(X_train, y_train)
|
|
|
| pred = model.predict(X_test)
|
|
|
| acc = accuracy_score(y_test, pred)*100
|
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
|
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
|
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
|
|
|
| st.success(f"Accuracy: {acc:.2f}%")
|
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
|
|
|
| show_confusion(y_test, pred, "Decision Tree Matrix")
|
|
|
|
|
| imp = pd.Series(
|
| model.best_estimator_.feature_importances_,
|
| index=X.columns
|
| ).sort_values(ascending=False).head(8)
|
| compact_bar(imp.index, imp.values, "Feature Importance")
|
|
|
| st.write("**Classification Report:**")
|
| st.text(classification_report(y_test, pred, zero_division=0))
|
|
|
| st.markdown('</div>', unsafe_allow_html=True)
|
|
|
| joblib.dump(model.best_estimator_, "decision_tree.pkl")
|
|
|
| save_result("Decision Tree", acc, target, ", ".join(features_used), {
|
| "Precision": prec,
|
| "Recall": rec,
|
| "F1Score": f1,
|
| "BestParams": str(model.best_params_),
|
| "OutliersClipped": len(pp_info["outliers_clipped"]),
|
| "LowVarRemoved": str(pp_info["low_var_removed"]),
|
| "HighCorrRemoved": str(pp_info["high_corr_removed"]),
|
| "ClassBalanced": pp_info["class_balanced"],
|
| })
|
|
|
|
|
| elif model_choice == "KMeans Clustering":
|
|
|
| temp = df.copy()
|
|
|
| for col in temp.columns:
|
| if temp[col].dtype == "object":
|
| le = LabelEncoder()
|
| temp[col] = le.fit_transform(temp[col].astype(str))
|
|
|
| X = temp.drop(columns=[target])
|
|
|
|
|
| temp_clipped, outlier_info = clip_outliers_iqr(temp)
|
| X_clipped = temp_clipped.drop(columns=[target])
|
|
|
| sc = StandardScaler()
|
| Xs = sc.fit_transform(X_clipped)
|
|
|
|
|
| inertias = []
|
| K_range = range(2, min(11, len(df) // 10 + 1))
|
| for k in K_range:
|
| km = KMeans(n_clusters=k, random_state=42, n_init=10)
|
| km.fit(Xs)
|
| inertias.append(km.inertia_)
|
|
|
| best_k = 3
|
| if len(inertias) >= 3:
|
| diffs = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)]
|
| if diffs:
|
| elbow_idx = np.argmax(diffs) + 1
|
| best_k = list(K_range)[elbow_idx] if elbow_idx < len(list(K_range)) else 3
|
| best_k = max(2, min(best_k, 10))
|
|
|
| result_box = st.container()
|
| with result_box:
|
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
|
| st.markdown(f"### KMeans Clustering Results (Target: {target})")
|
|
|
| model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
|
|
|
| cluster = model.fit_predict(Xs)
|
|
|
| score = silhouette_score(Xs, cluster)*100
|
|
|
| st.success(f"Cluster Quality Score: {score:.2f}% (k={best_k})")
|
|
|
| fig, ax = plt.subplots(figsize=(6,4))
|
| plt.scatter(Xs[:,0], Xs[:,1], c=cluster, cmap="viridis")
|
| plt.title(f"Clusters (k={best_k})")
|
| col1, col2, col3 = st.columns([1, 2, 1])
|
| with col2:
|
| st.pyplot(fig)
|
|
|
|
|
| fig2, ax2 = plt.subplots(figsize=(6,3))
|
| plt.plot(list(K_range), inertias, "bo-")
|
| plt.xlabel("Number of Clusters (k)")
|
| plt.ylabel("Inertia")
|
| plt.title("Elbow Method")
|
| col1, col2, col3 = st.columns([1, 2, 1])
|
| with col2:
|
| st.pyplot(fig2)
|
|
|
|
|
| cluster_counts = pd.Series(cluster).value_counts().sort_index()
|
| fig3, ax3 = plt.subplots(figsize=(6,3))
|
| sns.barplot(x=cluster_counts.index, y=cluster_counts.values)
|
| plt.xlabel("Cluster")
|
| plt.ylabel("Count")
|
| plt.title("Cluster Distribution")
|
| col1, col2, col3 = st.columns([1, 2, 1])
|
| with col2:
|
| st.pyplot(fig3)
|
|
|
| st.markdown('</div>', unsafe_allow_html=True)
|
|
|
| joblib.dump(model, "kmeans.pkl")
|
|
|
| save_result("KMeans Clustering", score, target, ", ".join(X_clipped.columns), {
|
| "BestK": best_k,
|
| "OutliersClipped": len(outlier_info),
|
| })
|
|
|
|
|
|
|
|
|
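# Offer the most recently trained model for download
# (the .pkl files are written by the model branches above)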
if st.session_state.last_model_name:

    st.markdown('<div class="section">⬇ Downloads</div>', unsafe_allow_html=True)

    file_map = {
        "Random Forest": "random_forest.pkl",
        "SVM": "svm.pkl",
        "Logistic Regression": "logistic.pkl",
        "Decision Tree": "decision_tree.pkl",
        "KMeans Clustering": "kmeans.pkl"
    }

    current = file_map[st.session_state.last_model_name]

    if os.path.exists(current):
        with open(current, "rb") as f:
            st.download_button(
                label=f"Download {st.session_state.last_model_name} (Deploy Ready)",
                data=f,
                file_name=current,
                mime="application/octet-stream"
            )

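# A minimal sketch (not part of the app) of reusing a downloaded model elsewhere,
# assuming new_data carries the same feature columns and preprocessing as training:
#
#     import joblib
#     model = joblib.load("random_forest.pkl")
#     predictions = model.predict(new_data)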
# History of every run in this session, plus CSV and report downloads
if len(st.session_state.history) > 0:

    st.markdown('<div class="section">📊 History</div>', unsafe_allow_html=True)

    hist = pd.DataFrame(st.session_state.history)
    st.dataframe(hist, use_container_width=True)

    fig, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(data=hist, x="Model", y="Score")
    plt.xticks(rotation=20)
    plt.title("All Applied Models")
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.pyplot(fig)

    csv_buffer = io.StringIO()
    hist.to_csv(csv_buffer, index=False)

    st.download_button(
        "Download Results CSV",
        csv_buffer.getvalue(),
        "results.csv"
    )

    if st.session_state.cleaned_df is not None and len(st.session_state.model_results) > 0:
        report_text = generate_text_report(
            st.session_state.cleaned_df,
            st.session_state.selected_target or "unknown",
            st.session_state.model_results
        )

        st.download_button(
            "Download Full Report (TXT)",
            report_text,
            "full_report.txt",
            mime="text/plain"
        )

        # the XLSX report needs openpyxl; skip the button silently if it is unavailable
        try:
            xlsx_data = generate_xlsx_report(
                st.session_state.cleaned_df,
                st.session_state.selected_target or "unknown",
                st.session_state.model_results
            )
            st.download_button(
                "Download Full Report (XLSX)",
                data=xlsx_data.getvalue(),
                file_name="full_report.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        except Exception:
            pass

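# Reset controls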
| st.markdown('<div class="section">♻ Reset</div>', unsafe_allow_html=True)
|
|
|
| if st.button("Clear History"):
|
|
|
| st.session_state.history = []
|
| st.session_state.last_model_name = None
|
| st.session_state.last_score = None
|
| st.session_state.model_results = []
|
| st.session_state.selected_target = None
|
| st.session_state.cleaned_df = None
|
|
|
| st.success("History Cleared")
|
|
|