import streamlit as st
import pandas as pd
import numpy as np
import re
import io
import os
import joblib
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import (
accuracy_score, confusion_matrix, silhouette_score,
classification_report, f1_score, precision_score, recall_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import resample
# ==========================================================
# PAGE CONFIG
# ==========================================================
st.set_page_config(
    page_title="AI AutoML Platform",
    page_icon="🤖",
    layout="wide",
)
# ==========================================================
# SESSION STATE
# ==========================================================
# One place for every session-state key the app relies on:
#   history          - all model runs shown in the History table
#   last_model_name  - most recent model (drives the Downloads section)
#   last_score       - score of the most recent run
#   model_results    - detailed per-run entries used by the reports
#   selected_target  - target column chosen by the user
#   cleaned_df       - cleaned dataframe reference for report generation
_SESSION_DEFAULTS = {
    "history": [],
    "last_model_name": None,
    "last_score": None,
    "model_results": [],
    "selected_target": None,
    "cleaned_df": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# ==========================================================
# THEME CSS
# ==========================================================
st.markdown("""
""", unsafe_allow_html=True)
# ==========================================================
# HEADER
# ==========================================================
st.markdown('
🤖 AI AutoML Platform
', unsafe_allow_html=True)
st.markdown('upload csv select model download trained model
', unsafe_allow_html=True)
# ==========================================================
# HELPERS
# ==========================================================
def smart_clean(df):
    """Basic cleaning: drop duplicate rows and impute missing values.

    Object columns are filled with their mode (most frequent value);
    numeric columns with the median, which is more robust to outliers
    than the mean.
    """
    df = df.copy()
    df = df.drop_duplicates()
    for col in df.columns:
        if df[col].dtype == "object":
            mode = df[col].mode()
            # mode() is empty when the column is entirely NaN; the
            # original mode[0] would raise IndexError in that case.
            if not mode.empty:
                df[col] = df[col].fillna(mode[0])
        else:
            df[col] = df[col].fillna(df[col].median())
    return df
def convert_units(value):
    """Convert strings like "5km" / "10cm" / "3mm" to metres (float).

    Values containing no digits are returned unchanged; plain numbers
    (or metre values) are returned as floats.
    """
    try:
        txt = str(value).lower().strip()
        nums = re.findall(r'[\d.]+', txt)
        if not nums:
            return value
        num = float(nums[0])
        if "km" in txt:
            return num * 1000
        elif "cm" in txt:
            return num / 100
        elif "mm" in txt:
            return num / 1000
        # Metre values and unit-less numbers fall through unchanged
        # (the original had two branches that both returned num).
        return num
    except ValueError:
        # float() failed (e.g. "1.2.3"): keep the raw value rather than
        # swallowing every exception with a bare except.
        return value
def detect_unit_columns(df):
    """Auto-convert object columns whose sample value mentions a length unit.

    Detection inspects the first non-null value of each object column; if
    it contains "km", "cm", "mm" or " m" the whole column is run through
    convert_units.
    """
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == "object":
            non_null = df[col].dropna()
            # Original used iloc[0], which raises IndexError on an empty
            # column and misses unit columns whose first value is NaN.
            if non_null.empty:
                continue
            sample = str(non_null.iloc[0]).lower()
            if any(x in sample for x in ["km", "cm", "mm", " m"]):
                df[col] = df[col].apply(convert_units)
    return df
def detect_best_target(df):
    """Heuristically rank columns by suitability as a supervised target.

    Returns (best_column, top_five) where top_five is a list of
    (column, score) pairs sorted best-first.
    """
    n_rows = len(df)

    def _score(col):
        # Favour low-cardinality (2-15 unique) and categorical columns;
        # penalise ID-like columns (mostly unique or very high cardinality).
        uniq = df[col].nunique()
        s = 0
        if 2 <= uniq <= 15:
            s += 6
        if df[col].dtype == "object":
            s += 3
        if uniq / n_rows > 0.9:
            s -= 10
        if uniq > 50:
            s -= 5
        return s

    scores = {col: _score(col) for col in df.columns}
    best = max(scores, key=scores.get)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return best, ranked[:5]
def prepare_for_supervised(df, target):
    """Label-encode object columns, then split into features X and target y.

    Returns (X, y, encoded_frame) where encoded_frame still contains the
    target column.
    """
    data = df.copy()
    object_cols = [c for c in data.columns if data[c].dtype == "object"]
    for col in object_cols:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))
    return data.drop(columns=[target]), data[target], data
# --- ACCURACY HELPER FUNCTIONS ---
def clip_outliers_iqr(df):
    """Clip numeric outliers to the 1.5*IQR fences instead of dropping rows.

    Returns (clipped_frame, info) where info maps each touched column to
    the number of values that were clipped.
    """
    clipped = df.copy()
    info = {}
    for col in clipped.select_dtypes(include=[np.number]).columns:
        q1, q3 = clipped[col].quantile([0.25, 0.75])
        spread = q3 - q1
        lo = q1 - 1.5 * spread
        hi = q3 + 1.5 * spread
        outliers = ((clipped[col] < lo) | (clipped[col] > hi)).sum()
        if outliers > 0:
            clipped[col] = clipped[col].clip(lower=lo, upper=hi)
            info[col] = outliers
    return clipped, info
def remove_low_variance(X, threshold=0.01):
    """Drop features whose variance falls below ``threshold``.

    Returns (X_filtered, dropped_column_names).
    """
    dropped = [c for c, v in X.var().items() if v < threshold]
    if dropped:
        X = X.drop(columns=dropped)
    return X, dropped
def remove_high_correlation(X, threshold=0.95):
    """Drop one column from every pair correlated above ``threshold``.

    Only the upper triangle of the |correlation| matrix is scanned, so
    each pair is considered once and the earlier column is the one kept.
    Returns (X_filtered, dropped_column_names).
    """
    abs_corr = X.corr().abs()
    triangle = abs_corr.where(np.triu(np.ones(abs_corr.shape), k=1).astype(bool))
    dropped = [col for col in triangle.columns if (triangle[col] > threshold).any()]
    if dropped:
        X = X.drop(columns=dropped)
    return X, dropped
def balance_classes(X, y):
    """Random-oversample minority classes up to the majority class count.

    Balancing only activates when the majority/minority ratio is >= 2.
    Returns (X, y, balanced_flag); when no balancing happens the inputs
    are returned untouched with the flag False.
    """
    classes, counts = np.unique(y, return_counts=True)
    if len(classes) < 2:
        return X, y, False
    majority = counts.max()
    if majority / counts.min() < 2:
        return X, y, False
    # Collect oversampled rows per minority class, then concat once.
    X_parts, y_parts = [X.copy()], [y.copy()]
    for cls, cnt in zip(classes, counts):
        if cnt >= majority:
            continue
        deficit = majority - cnt
        rows = resample(
            X.loc[y[y == cls].index],
            replace=True,
            n_samples=deficit,
            random_state=42,
        )
        X_parts.append(rows)
        y_parts.append(pd.Series([cls] * deficit, index=rows.index))
    return pd.concat(X_parts), pd.concat(y_parts), True
def select_top_features(X, y, max_features=20):
    """Keep at most ``max_features`` columns, ranked by mutual information.

    Returns (X_selected, selected_column_names). If X already has few
    enough columns it is returned unchanged.
    """
    if X.shape[1] <= max_features:
        return X, list(X.columns)
    mi_scores = pd.Series(mutual_info_classif(X, y, random_state=42), index=X.columns)
    keep = mi_scores.sort_values(ascending=False).head(max_features).index.tolist()
    return X[keep], keep
def preprocess_for_model(df, target):
"""Full accuracy-boosting preprocessing pipeline."""
X, y, transformed = prepare_for_supervised(df, target)
# Clip outliers
transformed_clipped, outlier_info = clip_outliers_iqr(transformed)
X = transformed_clipped.drop(columns=[target])
y = transformed_clipped[target]
# Remove low variance
X, low_var = remove_low_variance(X)
# Remove high correlation
X, high_corr = remove_high_correlation(X)
# Balance classes
X, y, balanced = balance_classes(X, y)
# Feature selection
X, selected = select_top_features(X, y)
return X, y, transformed, {
"outliers_clipped": outlier_info,
"low_var_removed": low_var,
"high_corr_removed": high_corr,
"class_balanced": balanced,
"features_used": list(X.columns),
}
def show_confusion(y_true, y_pred, title):
    """Render a centred confusion-matrix heatmap and return the figure."""
    fig, _ = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="Blues",
        linewidths=1,
    )
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    # Centre the plot in the middle column of a 1:2:1 layout.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        st.pyplot(fig)
    return fig
def compact_bar(labels, values, title):
    """Render a small centred bar chart and return the figure."""
    fig, _ = plt.subplots(figsize=(6, 3))
    sns.barplot(x=labels, y=values)
    plt.xticks(rotation=20)
    plt.title(title)
    # Centre the plot in the middle column of a 1:2:1 layout.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        st.pyplot(fig)
    return fig
def save_result(name, score, target_col, features_used, extra_info=None):
    """Record a model run in session state for the history and report views.

    ``extra_info`` (optional dict) is merged into the entry so callers can
    attach per-model metrics such as Precision/Recall/BestParams.
    """
    st.session_state.last_model_name = name
    st.session_state.last_score = score
    entry = dict(
        Model=name,
        Score=score,
        Target=target_col,
        Features=features_used,
        Timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    )
    entry.update(extra_info or {})
    st.session_state.history.append(entry)
    st.session_state.model_results.append(entry)
# --- REPORT GENERATORS ---
def generate_text_report(df, target, model_results):
    """Generate a comprehensive TXT report with every detail.

    Builds a plain-text report covering the dataset summary, per-column
    stats, every recorded model run, the best run, and the preprocessing
    pipeline. Returns the report as a single newline-joined string.
    """
    # Highest-scoring run, used for the BEST MODEL section below.
    best = max(model_results, key=lambda x: x["Score"]) if model_results else None
    lines = []
    lines.append("=" * 70)
    lines.append(" DARK AI AUTOML PLATFORM - FULL REPORT")
    lines.append("=" * 70)
    lines.append(f" Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")
    lines.append("-" * 70)
    lines.append(" DATASET SUMMARY")
    lines.append("-" * 70)
    lines.append(f" Rows: {df.shape[0]}")
    lines.append(f" Columns: {df.shape[1]}")
    lines.append(f" Target Column: {target}")
    lines.append(f" Target Unique Values: {df[target].nunique()}")
    lines.append("")
    lines.append("-" * 70)
    lines.append(" COLUMN DETAILS")
    lines.append("-" * 70)
    # Per-column dtype / cardinality / missing-value summary.
    for col in df.columns:
        dtype = str(df[col].dtype)
        nunique = df[col].nunique()
        missing = df[col].isnull().sum()
        lines.append(f" {col}: type={dtype}, unique={nunique}, missing={missing}")
    lines.append("")
    lines.append("-" * 70)
    lines.append(" MODEL RESULTS (ALL RUNS)")
    lines.append("-" * 70)
    # One section per recorded run; optional keys (metrics, params,
    # preprocessing flags) are printed only when the run recorded them.
    for i, r in enumerate(model_results, 1):
        lines.append("")
        lines.append(f" Run #{i}")
        lines.append(f" Model: {r['Model']}")
        lines.append(f" Accuracy/Score: {r['Score']:.2f}%")
        lines.append(f" Target Feature: {r.get('Target', 'N/A')}")
        lines.append(f" Features Used: {r.get('Features', 'N/A')}")
        lines.append(f" Timestamp: {r.get('Timestamp', 'N/A')}")
        if "Precision" in r:
            lines.append(f" Precision: {r['Precision']:.2f}%")
        if "Recall" in r:
            lines.append(f" Recall: {r['Recall']:.2f}%")
        if "F1Score" in r:
            lines.append(f" F1 Score: {r['F1Score']:.2f}%")
        if "BestParams" in r:
            lines.append(f" Best Hyperparameters: {r['BestParams']}")
        if "OutliersClipped" in r:
            lines.append(f" Outliers Clipped: {r['OutliersClipped']} columns")
        if "LowVarRemoved" in r:
            lines.append(f" Low Variance Features Removed: {r['LowVarRemoved']}")
        if "HighCorrRemoved" in r:
            lines.append(f" High Correlation Features Removed: {r['HighCorrRemoved']}")
        if "ClassBalanced" in r:
            lines.append(f" Class Balancing Applied: {r['ClassBalanced']}")
        if "BestK" in r:
            lines.append(f" Optimal Clusters (k): {r['BestK']}")
    if best:
        lines.append("")
        lines.append("-" * 70)
        lines.append(" BEST MODEL")
        lines.append("-" * 70)
        lines.append(f" Model: {best['Model']}")
        lines.append(f" Score: {best['Score']:.2f}%")
        lines.append(f" Target: {best.get('Target', 'N/A')}")
    lines.append("")
    lines.append("-" * 70)
    lines.append(" PREPROCESSING PIPELINE")
    lines.append("-" * 70)
    # Static description of the pipeline implemented elsewhere in this file.
    lines.append(" - Duplicate removal")
    lines.append(" - Missing values handled (median for numeric, mode for categorical)")
    lines.append(" - Unit conversion (km/cm/mm -> m)")
    lines.append(" - Categorical encoding (LabelEncoder)")
    lines.append(" - Outlier clipping (IQR method)")
    lines.append(" - Low variance feature removal")
    lines.append(" - High correlation feature removal")
    lines.append(" - Class imbalance handling (oversampling)")
    lines.append(" - Feature selection (mutual information, top 20)")
    lines.append(" - Scaling where required (StandardScaler / RobustScaler)")
    lines.append(" - Hyperparameter tuning (GridSearchCV)")
    lines.append(" - Stratified cross-validation (5-fold)")
    lines.append("")
    lines.append("=" * 70)
    lines.append(" END OF REPORT")
    lines.append("=" * 70)
    return "\n".join(lines)
def generate_xlsx_report(df, target, model_results):
    """Build an in-memory multi-sheet XLSX report and return it as BytesIO."""
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        # Sheet 1: high-level dataset facts.
        pd.DataFrame({
            "Property": ["Rows", "Columns", "Target Column", "Target Unique Values"],
            "Value": [df.shape[0], df.shape[1], target, df[target].nunique()],
        }).to_excel(writer, sheet_name="Dataset Summary", index=False)
        # Sheet 2: per-column type / cardinality / missing-value stats.
        details = [
            {
                "Column": col,
                "Type": str(df[col].dtype),
                "Unique Values": df[col].nunique(),
                "Missing Values": df[col].isnull().sum(),
            }
            for col in df.columns
        ]
        pd.DataFrame(details).to_excel(writer, sheet_name="Column Details", index=False)
        # Sheet 3: every recorded model run.
        pd.DataFrame(model_results).to_excel(writer, sheet_name="Model Results", index=False)
        # Sheet 4: the single best run by score.
        if model_results:
            top = max(model_results, key=lambda r: r["Score"])
            pd.DataFrame([top]).to_excel(writer, sheet_name="Best Model", index=False)
    buffer.seek(0)
    return buffer
# ==========================================================
# UPLOAD
# ==========================================================
# NOTE(review): the header string was an unterminated single-quoted
# literal spanning two lines (a SyntaxError); repaired to one line.
st.markdown("📁 Upload Dataset", unsafe_allow_html=True)
file = st.file_uploader("Upload CSV File", type=["csv"])
# ==========================================================
# MAIN APP
# ==========================================================
if file:
    raw = pd.read_csv(file)
    # NOTE(review): section-header strings in this span were unterminated
    # single-quoted literals (SyntaxErrors); repaired to one-line strings.
    st.markdown("📌 Dataset Preview", unsafe_allow_html=True)
    st.dataframe(raw.head(), use_container_width=True)

    # Clean + unit-normalise, and keep the cleaned frame for reports.
    df = smart_clean(raw)
    df = detect_unit_columns(df)
    st.session_state.cleaned_df = df

    # ------------------------------------------------------
    # TARGET DETECTION
    # ------------------------------------------------------
    st.markdown("🎯 AI Target Detection", unsafe_allow_html=True)
    best_target, top5 = detect_best_target(df)
    st.success(f"Recommended Target Column: {best_target}")
    st.write("Top Suggestions:")
    for n, s in top5:
        st.write(f"• {n} (score: {s})")
    # Dropdown with AI recommendation pre-selected; the user can override.
    target = st.selectbox(
        "Choose Target Column (AI recommended is pre-selected - change if needed)",
        [best_target] + [c for c in df.columns if c != best_target]
    )
    st.session_state.selected_target = target
# ------------------------------------------------------
# MODEL SELECT
# ------------------------------------------------------
st.markdown('🤖 Choose Model
', unsafe_allow_html=True)
model_choice = st.selectbox(
"Select One Model",
[
"Random Forest",
"SVM",
"Logistic Regression",
"Decision Tree",
"KMeans Clustering"
]
)
# ------------------------------------------------------
# APPLY MODEL
# ------------------------------------------------------
if st.button("🚀 Apply Model"):
# Each model result is in its own container so
# applying a second model shows results separately beneath the first
# RANDOM FOREST
if model_choice == "Random Forest":
X, y, transformed, pp_info = preprocess_for_model(df, target)
features_used = pp_info["features_used"]
result_box = st.container()
with result_box:
st.markdown('', unsafe_allow_html=True)
st.markdown(f"### Random Forest Results (Target: {target})")
col1, col2 = st.columns(2)
with col1:
st.write("Original")
st.dataframe(raw.head())
with col2:
st.write("Processed")
st.dataframe(transformed.head())
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = GridSearchCV(
RandomForestClassifier(),
{
"n_estimators":[100,200,300],
"max_depth":[5,10,15,None],
"min_samples_split":[2,5],
"min_samples_leaf":[1,2]
},
cv=cv,
n_jobs=-1
)
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)*100
prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
st.success(f"Accuracy: {acc:.2f}%")
st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
show_confusion(y_test, pred, "Random Forest Matrix")
imp = pd.Series(
model.best_estimator_.feature_importances_,
index=X.columns
).sort_values(ascending=False).head(8)
compact_bar(imp.index, imp.values, "Feature Importance")
st.write("**Classification Report:**")
st.text(classification_report(y_test, pred, zero_division=0))
st.markdown('
', unsafe_allow_html=True)
joblib.dump(model.best_estimator_, "random_forest.pkl")
save_result("Random Forest", acc, target, ", ".join(features_used), {
"Precision": prec,
"Recall": rec,
"F1Score": f1,
"BestParams": str(model.best_params_),
"OutliersClipped": len(pp_info["outliers_clipped"]),
"LowVarRemoved": str(pp_info["low_var_removed"]),
"HighCorrRemoved": str(pp_info["high_corr_removed"]),
"ClassBalanced": pp_info["class_balanced"],
})
# SVM
elif model_choice == "SVM":
X, y, transformed, pp_info = preprocess_for_model(df, target)
features_used = pp_info["features_used"]
result_box = st.container()
with result_box:
st.markdown('', unsafe_allow_html=True)
st.markdown(f"### SVM Results (Target: {target})")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# RobustScaler for SVM (handles outliers better)
sc = RobustScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = GridSearchCV(
SVC(),
{
"C":[0.1,1,10,100],
"kernel":["rbf","linear","poly"],
"gamma":["scale","auto"]
},
cv=cv,
n_jobs=-1
)
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)*100
prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
st.success(f"Accuracy: {acc:.2f}%")
st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
show_confusion(y_test, pred, "SVM Matrix")
st.write("**Classification Report:**")
st.text(classification_report(y_test, pred, zero_division=0))
st.markdown('
', unsafe_allow_html=True)
joblib.dump(model.best_estimator_, "svm.pkl")
save_result("SVM", acc, target, ", ".join(features_used), {
"Precision": prec,
"Recall": rec,
"F1Score": f1,
"BestParams": str(model.best_params_),
"OutliersClipped": len(pp_info["outliers_clipped"]),
"LowVarRemoved": str(pp_info["low_var_removed"]),
"HighCorrRemoved": str(pp_info["high_corr_removed"]),
"ClassBalanced": pp_info["class_balanced"],
})
# LOGISTIC
elif model_choice == "Logistic Regression":
X, y, transformed, pp_info = preprocess_for_model(df, target)
features_used = pp_info["features_used"]
result_box = st.container()
with result_box:
st.markdown('', unsafe_allow_html=True)
st.markdown(f"### Logistic Regression Results (Target: {target})")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = GridSearchCV(
LogisticRegression(max_iter=5000, solver="liblinear"),
{
"C":[0.01,0.1,1,10,100],
"penalty":["l1","l2"]
},
cv=cv,
n_jobs=-1
)
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)*100
prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
st.success(f"Accuracy: {acc:.2f}%")
st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
show_confusion(y_test, pred, "Logistic Regression Matrix")
# Show coefficient magnitudes for logistic regression
if hasattr(model.best_estimator_, "coef_"):
coef = pd.Series(
np.abs(model.best_estimator_.coef_[0]),
index=X.columns
).sort_values(ascending=False).head(8)
compact_bar(coef.index, coef.values, "Feature Coefficients (Absolute)")
st.write("**Classification Report:**")
st.text(classification_report(y_test, pred, zero_division=0))
st.markdown('
', unsafe_allow_html=True)
joblib.dump(model.best_estimator_, "logistic.pkl")
save_result("Logistic Regression", acc, target, ", ".join(features_used), {
"Precision": prec,
"Recall": rec,
"F1Score": f1,
"BestParams": str(model.best_params_),
"OutliersClipped": len(pp_info["outliers_clipped"]),
"LowVarRemoved": str(pp_info["low_var_removed"]),
"HighCorrRemoved": str(pp_info["high_corr_removed"]),
"ClassBalanced": pp_info["class_balanced"],
})
# DECISION TREE
elif model_choice == "Decision Tree":
X, y, transformed, pp_info = preprocess_for_model(df, target)
features_used = pp_info["features_used"]
result_box = st.container()
with result_box:
st.markdown('', unsafe_allow_html=True)
st.markdown(f"### Decision Tree Results (Target: {target})")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = GridSearchCV(
DecisionTreeClassifier(),
{
"max_depth":[3,5,10,15,None],
"min_samples_split":[2,5,10],
"min_samples_leaf":[1,2,4],
"criterion":["gini","entropy"]
},
cv=cv,
n_jobs=-1
)
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)*100
prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
st.success(f"Accuracy: {acc:.2f}%")
st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
show_confusion(y_test, pred, "Decision Tree Matrix")
# Feature importance for decision tree
imp = pd.Series(
model.best_estimator_.feature_importances_,
index=X.columns
).sort_values(ascending=False).head(8)
compact_bar(imp.index, imp.values, "Feature Importance")
st.write("**Classification Report:**")
st.text(classification_report(y_test, pred, zero_division=0))
st.markdown('
', unsafe_allow_html=True)
joblib.dump(model.best_estimator_, "decision_tree.pkl")
save_result("Decision Tree", acc, target, ", ".join(features_used), {
"Precision": prec,
"Recall": rec,
"F1Score": f1,
"BestParams": str(model.best_params_),
"OutliersClipped": len(pp_info["outliers_clipped"]),
"LowVarRemoved": str(pp_info["low_var_removed"]),
"HighCorrRemoved": str(pp_info["high_corr_removed"]),
"ClassBalanced": pp_info["class_balanced"],
})
# KMEANS
elif model_choice == "KMeans Clustering":
temp = df.copy()
for col in temp.columns:
if temp[col].dtype == "object":
le = LabelEncoder()
temp[col] = le.fit_transform(temp[col].astype(str))
X = temp.drop(columns=[target])
# Clip outliers for clustering too
temp_clipped, outlier_info = clip_outliers_iqr(temp)
X_clipped = temp_clipped.drop(columns=[target])
sc = StandardScaler()
Xs = sc.fit_transform(X_clipped)
# Find optimal k using elbow method
inertias = []
K_range = range(2, min(11, len(df) // 10 + 1))
for k in K_range:
km = KMeans(n_clusters=k, random_state=42, n_init=10)
km.fit(Xs)
inertias.append(km.inertia_)
best_k = 3
if len(inertias) >= 3:
diffs = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)]
if diffs:
elbow_idx = np.argmax(diffs) + 1
best_k = list(K_range)[elbow_idx] if elbow_idx < len(list(K_range)) else 3
best_k = max(2, min(best_k, 10))
result_box = st.container()
with result_box:
st.markdown('', unsafe_allow_html=True)
st.markdown(f"### KMeans Clustering Results (Target: {target})")
model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster = model.fit_predict(Xs)
score = silhouette_score(Xs, cluster)*100
st.success(f"Cluster Quality Score: {score:.2f}% (k={best_k})")
fig, ax = plt.subplots(figsize=(6,4))
plt.scatter(Xs[:,0], Xs[:,1], c=cluster, cmap="viridis")
plt.title(f"Clusters (k={best_k})")
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
st.pyplot(fig)
# Elbow plot
fig2, ax2 = plt.subplots(figsize=(6,3))
plt.plot(list(K_range), inertias, "bo-")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Method")
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
st.pyplot(fig2)
# Cluster distribution
cluster_counts = pd.Series(cluster).value_counts().sort_index()
fig3, ax3 = plt.subplots(figsize=(6,3))
sns.barplot(x=cluster_counts.index, y=cluster_counts.values)
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.title("Cluster Distribution")
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
st.pyplot(fig3)
st.markdown('
', unsafe_allow_html=True)
joblib.dump(model, "kmeans.pkl")
save_result("KMeans Clustering", score, target, ", ".join(X_clipped.columns), {
"BestK": best_k,
"OutliersClipped": len(outlier_info),
})
# ==========================================================
# DOWNLOAD SECTION
# ==========================================================
if st.session_state.last_model_name:
    # NOTE(review): the header string was an unterminated single-quoted
    # literal spanning two lines (a SyntaxError); repaired to one line.
    st.markdown("⬇ Downloads", unsafe_allow_html=True)
    # Map each model name to the pickle written when it was trained.
    file_map = {
        "Random Forest": "random_forest.pkl",
        "SVM": "svm.pkl",
        "Logistic Regression": "logistic.pkl",
        "Decision Tree": "decision_tree.pkl",
        "KMeans Clustering": "kmeans.pkl",
    }
    current = file_map[st.session_state.last_model_name]
    if os.path.exists(current):
        with open(current, "rb") as f:
            st.download_button(
                label=f"Download {st.session_state.last_model_name} (Deploy Ready)",
                data=f,
                file_name=current,
                mime="application/octet-stream",
            )
# ==========================================================
# HISTORY + REPORTS
# ==========================================================
if len(st.session_state.history) > 0:
    # NOTE(review): the header string was an unterminated single-quoted
    # literal spanning two lines (a SyntaxError); repaired to one line.
    st.markdown("📊 History", unsafe_allow_html=True)
    hist = pd.DataFrame(st.session_state.history)
    st.dataframe(hist, use_container_width=True)
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(data=hist, x="Model", y="Score")
    plt.xticks(rotation=20)
    plt.title("All Applied Models")
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.pyplot(fig)
    # CSV download of the raw history table.
    csv_buffer = io.StringIO()
    hist.to_csv(csv_buffer, index=False)
    st.download_button(
        "Download Results CSV",
        csv_buffer.getvalue(),
        "results.csv"
    )
    # Detailed reports require the cleaned frame and at least one run.
    if st.session_state.cleaned_df is not None and len(st.session_state.model_results) > 0:
        # TXT report.
        report_text = generate_text_report(
            st.session_state.cleaned_df,
            st.session_state.selected_target or "unknown",
            st.session_state.model_results
        )
        st.download_button(
            "Download Full Report (TXT)",
            report_text,
            "full_report.txt",
            mime="text/plain"
        )
        # XLSX report — deliberately best-effort: the button simply does
        # not appear when generation fails (e.g. openpyxl not installed).
        try:
            xlsx_data = generate_xlsx_report(
                st.session_state.cleaned_df,
                st.session_state.selected_target or "unknown",
                st.session_state.model_results
            )
            st.download_button(
                "Download Full Report (XLSX)",
                data=xlsx_data.getvalue(),
                file_name="full_report.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        except Exception:
            pass
# ==========================================================
# RESET
# ==========================================================
# NOTE(review): the header string was an unterminated single-quoted
# literal spanning two lines (a SyntaxError); repaired to one line.
st.markdown("♻ Reset", unsafe_allow_html=True)
if st.button("Clear History"):
    # Restore every session-state key to its initial value.
    st.session_state.history = []
    st.session_state.last_model_name = None
    st.session_state.last_score = None
    st.session_state.model_results = []
    st.session_state.selected_target = None
    st.session_state.cleaned_df = None
    st.success("History Cleared")