# analysis_web / app.py
# shimaa22's picture
# Update app.py
# d4b1c85 verified
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
confusion_matrix
)
from imblearn.over_sampling import SMOTE
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet
# =========================
# GLOBALS
# =========================
# Module-level state shared between the Gradio callbacks below.
# Everything starts as None until the corresponding callback has run.

# Cleaned DataFrame from the most recent upload (assigned by upload_and_clean).
df_global = None
# Display name of the best-scoring model (assigned by run_ml).
best_model_name = None
# Model object whose feature importances are plotted by feature_importance().
best_model_obj = None
# Metrics table for the runs without any imbalance handling (assigned by run_ml).
no_global = None
# Metrics table for the class-weight runs (assigned by run_ml).
cw_global = None
# Metrics table for the SMOTE runs (assigned by run_ml).
smote_global = None
# Mapping of run key -> confusion-matrix image path (assigned by run_ml).
cm_global = None
# =========================
# UPLOAD
# =========================
def upload_and_clean(file):
    """Load an uploaded CSV, drop duplicate rows and fill missing values.

    Numeric columns are filled with their median; all other columns are
    filled with their most frequent value. The cleaned frame is stored in
    the module-level ``df_global``.

    Args:
        file: Value delivered by the ``gr.File`` input. Either an object
            exposing the temporary path as ``.name`` or a plain path
            string is accepted; ``None`` means nothing was uploaded.

    Returns:
        tuple: ``(status message, preview of the first rows,
        dropdown update, dropdown update)`` -- four values, matching the
        four output slots this callback is wired to.
    """
    global df_global

    if file is None:
        # Nothing uploaded yet: report it instead of crashing on `file.name`.
        return (
            "Please upload a CSV file first",
            None,
            gr.update(),
            gr.update(),
        )

    # Accept both file-like objects (with .name) and plain path strings.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)
    df = df.drop_duplicates()

    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        else:
            modes = df[col].mode()
            # mode() is empty when the column has no non-null value
            # (e.g. a header-only CSV); there is nothing to fill with then.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    df_global = df
    columns = list(df.columns)
    return (
        "Data Loaded Successfully",
        df.head(),
        # Reset the selection so a column name from a previously loaded
        # dataset cannot be submitted against the new one.
        gr.update(choices=columns, value=None),
        gr.update(choices=columns, value=None),
    )
# =========================
# ANALYSIS VISUALIZATION
# =========================
def analyze_data(target):
    """Render bar and pie charts of value frequencies for up to six columns.

    For each of the first six columns of ``df_global`` other than
    ``target``, values are converted to strings and counted; the ten most
    frequent are drawn as a bar chart and the six most frequent as a pie
    chart, side by side in one PNG under ``/tmp``.

    Args:
        target: Column name to exclude from the charts.

    Returns:
        list[str]: Paths of the images written, in column order. Empty
        when no dataset has been loaded.
    """
    if df_global is None:
        return []

    df = df_global  # read-only use, so no copy is needed
    images = []
    feature_cols = [c for c in df.columns if c != target]

    for idx, col in enumerate(feature_cols[:6]):
        counts = df[col].astype(str).value_counts()
        if counts.empty:
            # Nothing to plot for a column without rows.
            continue

        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        try:
            counts.head(10).plot(kind="bar", ax=axes[0])
            axes[0].set_title(f"Bar - {col}")
            axes[0].tick_params(axis='x', rotation=45)

            counts.head(6).plot(kind="pie", ax=axes[1], autopct="%1.1f%%")
            axes[1].set_title(f"Pie - {col}")
            axes[1].set_ylabel("")

            fig.tight_layout()

            # Column names may contain "/" or other characters that are not
            # valid in a file name; the index prefix keeps names unique
            # after sanitising.
            safe_name = "".join(
                ch if ch.isalnum() or ch in "-_" else "_" for ch in str(col)
            )
            path = f"/tmp/{idx}_{safe_name}.png"
            fig.savefig(path)
        finally:
            # Always release the figure, even if plotting failed.
            plt.close(fig)

        images.append(path)

    return images
# =========================
# CONFUSION MATRIX
# =========================
def plot_cm(y_true, y_pred, title):
    """Draw a confusion matrix as an annotated heat map and save it as a PNG.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        title: Plot title; also used as the file name under ``/tmp``.

    Returns:
        str: Path of the saved image, ``/tmp/<title>.png``.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(matrix, cmap="Blues")
    ax.set_title(title)

    # Write each count in the centre of its cell (x = column, y = row).
    for (row, col), count in np.ndenumerate(matrix):
        ax.text(col, row, count, ha="center", va="center")

    out_path = f"/tmp/{title}.png"
    fig.savefig(out_path)
    plt.close(fig)
    return out_path
# =========================
# ML (NO / CW / SMOTE)
# =========================
def run_ml(target):
    """Train and compare three classifiers under three imbalance strategies.

    Works on a copy of the module-level ``df_global``. Non-numeric columns
    are label-encoded, the target is re-encoded to contiguous integers and
    the data is split 80/20 with ``random_state=42``. Decision Tree, Random
    Forest and XGBoost are each fitted three times:

    * on the plain training set,
    * with balanced class weighting (``class_weight="balanced"`` for the
      scikit-learn models, equivalent per-sample weights for XGBoost),
    * on a SMOTE-resampled training set, when the data is imbalanced
      (smallest/largest class ratio below 0.5); otherwise on the plain
      training set.

    Side effects: stores the three metric tables in ``no_global``,
    ``cw_global`` and ``smote_global``, the confusion-matrix image paths in
    ``cm_global``, and the most accurate fitted model in
    ``best_model_name`` / ``best_model_obj``.

    Args:
        target: Name of the column of ``df_global`` to predict.

    Returns:
        tuple: ``(status text, no-sampling table, class-weight table,
        SMOTE table, list of confusion-matrix image paths)``.

    Raises:
        gr.Error: If no dataset is loaded or ``target`` is not one of its
            columns.
    """
    global best_model_name, best_model_obj
    global no_global, cw_global, smote_global, cm_global

    if df_global is None:
        raise gr.Error("Upload a dataset before running the analysis.")
    if target is None or target not in df_global.columns:
        raise gr.Error("Select a valid target column.")

    df = df_global.copy()

    # encode
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    # Re-encode the target to 0..k-1 even when it is already numeric:
    # XGBoost requires contiguous class labels, and gaps such as {1, 2}
    # would otherwise show up as empty classes in the imbalance check.
    df[target] = LabelEncoder().fit_transform(df[target])

    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # imbalance check
    class_counts = y.value_counts()
    imbalance = bool(class_counts.min() / class_counts.max() < 0.5)

    # Per-sample weights reproducing class_weight="balanced":
    # n_samples / (n_classes * count_of_class).
    train_counts = y_train.value_counts()
    balanced_weights = y_train.map(
        len(y_train) / (len(train_counts) * train_counts)
    ).to_numpy()

    no_rows, cw_rows, smote_rows = [], [], []
    cm_images = {}
    candidates = []  # (accuracy, display name, fitted model)

    # =========================
    # NO SAMPLING
    # =========================
    for name in _MODEL_NAMES:
        model = _new_model(name)
        row, cm_images[f"{name}_no"] = _fit_and_score(
            model, name, f"{name}_NO", X_train, y_train, X_test, y_test
        )
        no_rows.append(row)
        candidates.append((row["Accuracy"], name + " (No)", model))

    # =========================
    # CLASS WEIGHT
    # =========================
    for name in _MODEL_NAMES:
        model = _new_model(name, class_weight="balanced")
        fit_kwargs = {}
        if name == "XGBoost":
            # XGBClassifier has no class_weight option; weight samples instead.
            fit_kwargs["sample_weight"] = balanced_weights
        row, cm_images[f"{name}_cw"] = _fit_and_score(
            model, name, f"{name}_CW", X_train, y_train, X_test, y_test,
            **fit_kwargs
        )
        cw_rows.append(row)
        candidates.append((row["Accuracy"], name + " (Class Weight)", model))

    # =========================
    # SMOTE
    # =========================
    X_res, y_res = X_train, y_train
    smote_applied = False
    if imbalance:
        # SMOTE needs more minority samples than neighbours; shrink
        # k_neighbors for tiny classes instead of letting it raise.
        k_neighbors = min(5, int(train_counts.min()) - 1)
        if k_neighbors >= 1:
            sm = SMOTE(random_state=42, k_neighbors=k_neighbors)
            X_res, y_res = sm.fit_resample(X_train, y_train)
            smote_applied = True

    for name in _MODEL_NAMES:
        model = _new_model(name)  # fresh estimator; never refit a stored one
        row, cm_images[f"{name}_smote"] = _fit_and_score(
            model, name, f"{name}_SMOTE", X_res, y_res, X_test, y_test
        )
        smote_rows.append(row)
        if smote_applied:
            # Without resampling these runs merely repeat the first strategy.
            candidates.append((row["Accuracy"], name + " (SMOTE)", model))

    # max() keeps the first of equally accurate candidates.
    _, best_model_name, best_model_obj = max(candidates, key=lambda c: c[0])

    # store globally
    no_global = pd.DataFrame(no_rows)
    cw_global = pd.DataFrame(cw_rows)
    smote_global = pd.DataFrame(smote_rows)
    cm_global = cm_images

    status = f"Imbalance: {imbalance}"
    if imbalance and not smote_applied:
        status += " (SMOTE skipped: minority class too small)"

    return (
        status,
        no_global,
        cw_global,
        smote_global,
        list(cm_images.values())
    )


# Display names of the compared models, in evaluation order.
_MODEL_NAMES = ("Decision Tree", "Random Forest", "XGBoost")


def _new_model(name, class_weight=None):
    """Return a fresh, unfitted classifier for one of ``_MODEL_NAMES``."""
    if name == "Decision Tree":
        return DecisionTreeClassifier(class_weight=class_weight)
    if name == "Random Forest":
        return RandomForestClassifier(class_weight=class_weight)
    return XGBClassifier(eval_metric="logloss")


def _fit_and_score(model, name, title, X_fit, y_fit, X_test, y_test, **fit_kwargs):
    """Fit ``model``, then return its test-set metrics row and confusion-matrix image path."""
    model.fit(X_fit, y_fit, **fit_kwargs)
    pred = model.predict(X_test)
    row = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, pred),
        "Precision": precision_score(y_test, pred, average="weighted", zero_division=0),
        "Recall": recall_score(y_test, pred, average="weighted", zero_division=0),
        "F1": f1_score(y_test, pred, average="weighted", zero_division=0),
    }
    return row, plot_cm(y_test, pred, title)
# =========================
# FEATURE IMPORTANCE
# =========================
def feature_importance():
    """Plot the feature importances of the stored best model.

    Reads the module-level ``best_model_obj``. When that object exposes
    ``feature_importances_``, a horizontal bar chart is written to
    ``/tmp/feat.png``; bars are labelled with the model's
    ``feature_names_in_`` when that attribute is available and matches
    the number of importances.

    Returns:
        str | None: Path of the saved chart, or ``None`` when there is no
        stored model or it does not provide feature importances.
    """
    model = best_model_obj
    importances = getattr(model, "feature_importances_", None)
    if importances is None:
        return None

    positions = list(range(len(importances)))
    feature_names = getattr(model, "feature_names_in_", None)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.barh(positions, importances)
    # Without tick labels the bars are identified only by position.
    if feature_names is not None and len(feature_names) == len(positions):
        ax.set_yticks(positions)
        ax.set_yticklabels([str(n) for n in feature_names])
    ax.set_xlabel("Importance")
    ax.set_title("Feature Importance")
    fig.tight_layout()

    path = "/tmp/feat.png"
    fig.savefig(path)
    plt.close(fig)
    return path
# =========================
# PDF REPORT
# =========================
def generate_pdf():
    """Build a PDF report from the stored analysis results.

    The report written to ``/tmp/report.pdf`` contains the best model
    name, one metrics table per strategy (``no_global``, ``cw_global``,
    ``smote_global``) and every confusion-matrix image listed in
    ``cm_global``.

    Returns:
        str: Path of the generated PDF.

    Raises:
        gr.Error: If any of the result tables is still ``None``, i.e. no
            analysis results are available yet.
    """
    tables = (
        (no_global, "No Sampling"),
        (cw_global, "Class Weight"),
        (smote_global, "SMOTE"),
    )
    if any(df is None for df, _ in tables):
        raise gr.Error("Run the analysis before generating the PDF report.")

    path = "/tmp/report.pdf"
    doc = SimpleDocTemplate(path)
    styles = getSampleStyleSheet()

    elements = [
        Paragraph("AutoML Full Report", styles["Title"]),
        Spacer(1, 10),
        Paragraph(f"Best Model: {best_model_name}", styles["Heading2"]),
    ]

    def add_table(df, title):
        """Append a titled, gridded table of ``df`` to the report."""
        elements.append(Spacer(1, 10))
        elements.append(Paragraph(title, styles["Heading3"]))
        header = [str(c) for c in df.columns]
        # Round metrics to four decimals so the columns stay narrow enough
        # to fit on the page.
        body = [
            [f"{v:.4f}" if isinstance(v, float) else str(v) for v in row]
            for row in df.values.tolist()
        ]
        table = Table([header] + body)
        table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.grey),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
            ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
        ]))
        elements.append(table)

    for df, title in tables:
        add_table(df, title)

    elements.append(Spacer(1, 10))
    elements.append(Paragraph("Confusion Matrices", styles["Heading2"]))
    # cm_global may still be None; treat that as "no images".
    for name, img in (cm_global or {}).items():
        elements.append(Paragraph(name, styles["Normal"]))
        elements.append(Image(img, width=200, height=200))

    doc.build(elements)
    return path
# =========================
# ANALYSIS
# =========================
def full_analysis(target):
    """Run the ML comparison for ``target`` and pass its five results through.

    Args:
        target: Name of the target column, forwarded to ``run_ml``.

    Returns:
        tuple: ``(status text, no-sampling table, class-weight table,
        SMOTE table, confusion-matrix image paths)`` as produced by
        ``run_ml``.
    """
    (
        status_text,
        plain_table,
        weighted_table,
        resampled_table,
        matrix_images,
    ) = run_ml(target)
    return (
        status_text,
        plain_table,
        weighted_table,
        resampled_table,
        matrix_images,
    )
# =========================
# UI
# =========================
# Components are created in display order; events are wired afterwards.
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Advanced AutoML System")

    # --- Dataset upload and preview ---
    file = gr.File()
    upload_btn = gr.Button("Upload")
    status = gr.Textbox()
    preview = gr.Dataframe()

    # --- Target selection and model comparison ---
    target = gr.Dropdown(label="Target")
    run_btn = gr.Button("Run Full Analysis")
    ml_status = gr.Textbox()
    no_table = gr.Dataframe()
    cw_table = gr.Dataframe()
    smote_table = gr.Dataframe()
    gallery = gr.Gallery(columns=2)

    # --- Feature importance ---
    feat_btn = gr.Button("Feature Importance")
    feat_img = gr.Image()

    # --- PDF report ---
    pdf_btn = gr.Button("Download PDF")
    pdf_file = gr.File()

    # upload_and_clean returns four values, so `target` is listed twice
    # to receive both dropdown updates.
    upload_btn.click(
        fn=upload_and_clean,
        inputs=file,
        outputs=[status, preview, target, target],
    )
    run_btn.click(
        fn=full_analysis,
        inputs=target,
        outputs=[ml_status, no_table, cw_table, smote_table, gallery],
    )
    feat_btn.click(fn=feature_importance, inputs=None, outputs=feat_img)
    pdf_btn.click(fn=generate_pdf, inputs=None, outputs=pdf_file)

demo.launch(share=True)