"""Gradio AutoML demo.

Workflow: upload a CSV, clean it, train three tree-based classifiers under
three imbalance-handling strategies (none, class weights, SMOTE), show the
metrics and confusion matrices, and export everything as a PDF report.

State is shared between Gradio callbacks through module-level globals.
"""

import os
import re
import tempfile

import gradio as gr
import matplotlib

# Plots are only ever written to files, so force the non-interactive backend.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

# =========================
# GLOBALS
# =========================
df_global = None              # cleaned DataFrame from the last upload
best_model_name = None        # "<model> (<strategy>)" of the most accurate run
best_model_obj = None         # fitted estimator behind best_model_name
feature_names_global = None   # feature column names used to train the models
no_global = None              # metrics table: no sampling
cw_global = None              # metrics table: class weights
smote_global = None           # metrics table: SMOTE
cm_global = None              # {"<model>_<strategy>": confusion-matrix image path}

OUTPUT_DIR = tempfile.gettempdir()
MODEL_NAMES = ("Decision Tree", "Random Forest", "XGBoost")
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_IMPORTANCE_BARS = 20


# =========================
# HELPERS
# =========================
def _output_path(name, ext):
    """Return a path in OUTPUT_DIR whose file name is safe on any filesystem."""
    safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", str(name)).strip("_") or "plot"
    return os.path.join(OUTPUT_DIR, f"{safe}.{ext}")


def _require_data():
    """Return a copy of the uploaded data, or raise a user-visible error."""
    if df_global is None:
        raise gr.Error("Upload a CSV file before running this step.")
    return df_global.copy()


def _make_model(name, weighted=False):
    """Build a fresh, unfitted estimator so runs never share fitted state."""
    class_weight = "balanced" if weighted else None
    if name == "Decision Tree":
        return DecisionTreeClassifier(class_weight=class_weight)
    if name == "Random Forest":
        return RandomForestClassifier(class_weight=class_weight)
    # XGBClassifier has no class_weight option; weighting is applied at fit
    # time through sample_weight instead (see run_ml).
    return XGBClassifier(eval_metric="logloss")


def _score_row(name, y_true, y_pred):
    """Return one metrics-table row for a model's test-set predictions."""
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, **weighted),
        "Recall": recall_score(y_true, y_pred, **weighted),
        "F1": f1_score(y_true, y_pred, **weighted),
    }


def _smote_resample(X_train, y_train):
    """Oversample minority classes with SMOTE, or return the data unchanged.

    SMOTE needs more samples in the smallest class than ``k_neighbors``, so
    the neighbour count is shrunk for tiny classes and resampling is skipped
    entirely when a class has a single training sample.
    """
    counts = np.bincount(y_train)
    smallest = int(counts[counts > 0].min())
    if smallest < 2:
        return X_train, y_train
    sampler = SMOTE(random_state=RANDOM_STATE, k_neighbors=min(5, smallest - 1))
    return sampler.fit_resample(X_train, y_train)


# =========================
# UPLOAD
# =========================
def upload_and_clean(file):
    """Load the uploaded CSV, clean it, and publish it as ``df_global``.

    Cleaning drops duplicate rows and fully empty columns, then fills missing
    values with the column median (numeric) or most frequent value (other).

    Returns:
        A status message, the first rows of the cleaned frame, and two
        dropdown updates listing the column names.
    """
    global df_global

    if file is None:
        raise gr.Error("Please choose a CSV file first.")

    # Accept both a plain path string and an object exposing the path as
    # ``.name``, which is what the original code expected.
    path = file if isinstance(file, str) else file.name

    df = pd.read_csv(path)
    df = df.drop_duplicates()
    # A column with no values at all cannot be imputed and would leak NaNs
    # into model training.
    df = df.dropna(axis=1, how="all")

    for col in df.columns:
        if not df[col].isna().any():
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        else:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    df_global = df
    columns = list(df.columns)

    return (
        "Data Loaded Successfully",
        df.head(),
        gr.update(choices=columns),
        gr.update(choices=columns),
    )


# =========================
# ANALYSIS VISUALIZATION
# =========================
def analyze_data(target):
    """Plot bar and pie charts of value counts for up to six feature columns.

    Returns:
        A list of PNG file paths, one per plotted column.
    """
    df = _require_data()

    images = []
    feature_cols = [c for c in df.columns if c != target]

    for idx, col in enumerate(feature_cols[:6]):
        counts = df[col].astype(str).value_counts()

        fig, axes = plt.subplots(1, 2, figsize=(12, 4))

        counts.head(10).plot(kind="bar", ax=axes[0])
        axes[0].set_title(f"Bar - {col}")
        axes[0].tick_params(axis="x", rotation=45)

        counts.head(6).plot(kind="pie", ax=axes[1], autopct="%1.1f%%")
        axes[1].set_title(f"Pie - {col}")
        axes[1].set_ylabel("")

        fig.tight_layout()

        # The index keeps file names unique when two column names sanitize
        # to the same text.
        path = _output_path(f"analysis_{idx}_{col}", "png")
        fig.savefig(path)
        plt.close(fig)

        images.append(path)

    return images


# =========================
# CONFUSION MATRIX
# =========================
def plot_cm(y_true, y_pred, title):
    """Save a confusion-matrix heat map as a PNG and return its path."""
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(cm, cmap="Blues")
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    for (i, j), value in np.ndenumerate(cm):
        ax.text(j, i, str(value), ha="center", va="center")

    path = _output_path(title, "png")
    fig.savefig(path)
    plt.close(fig)

    return path


# =========================
# ML (NO / CW / SMOTE)
# =========================
def run_ml(target):
    """Train every model under every imbalance strategy and record results.

    Side effects: updates the metrics tables, confusion-matrix paths, best
    model name/object and feature names stored in the module globals.

    Returns:
        A status string, the three metrics DataFrames (no sampling, class
        weight, SMOTE) and the list of confusion-matrix image paths.
    """
    global best_model_name, best_model_obj, feature_names_global
    global no_global, cw_global, smote_global, cm_global

    df = _require_data()
    if not target or target not in df.columns:
        raise gr.Error("Please select a target column.")

    X = df.drop(columns=[target])
    if X.shape[1] == 0:
        raise gr.Error("The dataset needs at least one feature column.")

    # Encode non-numeric features as integer codes.
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # Always map the target to 0..k-1: XGBoost requires contiguous labels and
    # np.bincount requires non-negative integers.
    target_values = df[target]
    if not pd.api.types.is_numeric_dtype(target_values):
        target_values = target_values.astype(str)
    y = LabelEncoder().fit_transform(target_values)

    # Imbalance check: every encoded label is present, so no count is zero.
    class_counts = np.bincount(y)
    n_classes = len(class_counts)
    if n_classes < 2:
        raise gr.Error("The target column needs at least two classes.")
    imbalance = bool(class_counts.min() / class_counts.max() < 0.5)

    # Stratify when possible so rare classes appear in both train and test.
    n_test = int(np.ceil(TEST_SIZE * len(y)))
    can_stratify = (
        class_counts.min() >= 2
        and n_test >= n_classes
        and len(y) - n_test >= n_classes
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y if can_stratify else None,
    )

    if imbalance:
        X_res, y_res = _smote_resample(X_train, y_train)
    else:
        X_res, y_res = X_train, y_train

    # Stand-in for class_weight="balanced" on XGBoost.
    xgb_weights = compute_sample_weight("balanced", y_train)

    # (result key, plot-title tag, display label, weighted?, train X, train y)
    strategies = (
        ("no", "NO", "No", False, X_train, y_train),
        ("cw", "CW", "Class Weight", True, X_train, y_train),
        ("smote", "SMOTE", "SMOTE", False, X_res, y_res),
    )

    rows = {key: [] for key, *_ in strategies}
    cm_images = {}

    top_score = -1.0
    top_name = None
    top_model = None

    for key, tag, label, weighted, X_fit, y_fit in strategies:
        for name in MODEL_NAMES:
            model = _make_model(name, weighted)

            fit_kwargs = {}
            if weighted and name == "XGBoost":
                fit_kwargs["sample_weight"] = xgb_weights

            model.fit(X_fit, y_fit, **fit_kwargs)
            pred = model.predict(X_test)

            row = _score_row(name, y_test, pred)
            rows[key].append(row)
            cm_images[f"{name}_{key}"] = plot_cm(y_test, pred, f"{name}_{tag}")

            # Track the most accurate run across all strategies and keep the
            # fitted estimator for the feature-importance plot.
            if row["Accuracy"] > top_score:
                top_score = row["Accuracy"]
                top_name = f"{name} ({label})"
                top_model = model

    # store globally
    best_model_name = top_name
    best_model_obj = top_model
    feature_names_global = [str(c) for c in X.columns]
    no_global = pd.DataFrame(rows["no"])
    cw_global = pd.DataFrame(rows["cw"])
    smote_global = pd.DataFrame(rows["smote"])
    cm_global = cm_images

    return (
        f"Imbalance: {imbalance}",
        no_global,
        cw_global,
        smote_global,
        list(cm_images.values()),
    )


# =========================
# FEATURE IMPORTANCE
# =========================
def feature_importance():
    """Plot the best model's feature importances; return the PNG path.

    Returns ``None`` when no model has been trained yet or the model does not
    expose ``feature_importances_``.
    """
    model = best_model_obj
    if model is None or not hasattr(model, "feature_importances_"):
        return None

    importances = np.asarray(model.feature_importances_)
    names = feature_names_global
    if not names or len(names) != len(importances):
        names = [str(i) for i in range(len(importances))]

    # Ascending order puts the most important feature at the top of the
    # horizontal bar chart; only the strongest features are shown.
    order = np.argsort(importances)[-MAX_IMPORTANCE_BARS:]
    positions = range(len(order))

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.barh(positions, importances[order])
    ax.set_yticks(list(positions))
    ax.set_yticklabels([names[i] for i in order])
    ax.set_title(f"Feature Importance - {best_model_name}")
    fig.tight_layout()

    path = _output_path("feat", "png")
    fig.savefig(path)
    plt.close(fig)

    return path


# =========================
# PDF REPORT
# =========================
def generate_pdf():
    """Build the PDF report from the last analysis and return its path."""
    tables = (
        (no_global, "No Sampling"),
        (cw_global, "Class Weight"),
        (smote_global, "SMOTE"),
    )
    if cm_global is None or any(frame is None for frame, _ in tables):
        raise gr.Error("Run the analysis before generating the report.")

    path = _output_path("report", "pdf")
    doc = SimpleDocTemplate(path)

    styles = getSampleStyleSheet()
    elements = []

    elements.append(Paragraph("AutoML Full Report", styles["Title"]))
    elements.append(Spacer(1, 10))

    elements.append(Paragraph(f"Best Model: {best_model_name}", styles["Heading2"]))

    def format_cell(value):
        # Round metrics so five columns fit comfortably on the page.
        return f"{value:.4f}" if isinstance(value, float) else str(value)

    def add_table(df, title):
        elements.append(Spacer(1, 10))
        elements.append(Paragraph(title, styles["Heading3"]))

        header = [str(c) for c in df.columns]
        body = [[format_cell(v) for v in row] for row in df.values.tolist()]

        table = Table([header] + body)
        table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.grey),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
            ("GRID", (0, 0), (-1, -1), 0.5, colors.black),
        ]))

        elements.append(table)

    for frame, title in tables:
        add_table(frame, title)

    elements.append(Spacer(1, 10))
    elements.append(Paragraph("Confusion Matrices", styles["Heading2"]))

    for name, img in cm_global.items():
        # Skip images that have been removed from the temp directory rather
        # than failing the whole report.
        if not os.path.exists(img):
            continue
        elements.append(Paragraph(name, styles["Normal"]))
        elements.append(Image(img, width=200, height=200))

    doc.build(elements)
    return path


# =========================
# ANALYSIS
# =========================
def full_analysis(target):
    """Gradio callback for the "Run Full Analysis" button; delegates to run_ml."""
    ml_status, no_df, cw_df, smote_df, imgs = run_ml(target)
    return ml_status, no_df, cw_df, smote_df, imgs


# =========================
# UI
# =========================
with gr.Blocks() as demo:

    gr.Markdown("# 🚀 Advanced AutoML System")

    file = gr.File()
    upload_btn = gr.Button("Upload")

    status = gr.Textbox()
    preview = gr.Dataframe()

    target = gr.Dropdown(label="Target")

    run_btn = gr.Button("Run Full Analysis")

    ml_status = gr.Textbox()

    no_table = gr.Dataframe()
    cw_table = gr.Dataframe()
    smote_table = gr.Dataframe()

    gallery = gr.Gallery(columns=2)

    feat_btn = gr.Button("Feature Importance")
    feat_img = gr.Image()

    pdf_btn = gr.Button("Download PDF")
    pdf_file = gr.File()

    # upload_and_clean returns two dropdown updates, so the target dropdown
    # is listed twice to match the four returned values.
    upload_btn.click(upload_and_clean, file, [status, preview, target, target])

    run_btn.click(
        full_analysis,
        target,
        [ml_status, no_table, cw_table, smote_table, gallery],
    )

    feat_btn.click(feature_importance, None, feat_img)

    pdf_btn.click(generate_pdf, None, pdf_file)


if __name__ == "__main__":
    # Guarded so importing this module does not start a public server.
    demo.launch(share=True)