Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,188 +1,179 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from sklearn.model_selection import train_test_split
|
| 4 |
from sklearn.ensemble import RandomForestClassifier
|
| 5 |
-
from sklearn.
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def preprocess_dataframe(df, quantile_binning=False, count_words=False):
|
|
|
|
| 14 |
df = df.copy()
|
| 15 |
|
| 16 |
-
#
|
| 17 |
if quantile_binning:
|
| 18 |
-
|
|
|
|
| 19 |
try:
|
| 20 |
-
df[col + "_qbin"] = pd.qcut(df[col], q=4, labels=
|
| 21 |
-
except Exception:
|
| 22 |
-
|
| 23 |
|
| 24 |
-
#
|
| 25 |
if count_words:
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
return df
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
return None, [], pd.DataFrame(), "", ""
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
if
|
| 39 |
-
|
| 40 |
-
df = pd.read_csv(filepath, encoding='utf-8')
|
| 41 |
-
except UnicodeDecodeError:
|
| 42 |
-
df = pd.read_csv(filepath, encoding='latin1')
|
| 43 |
else:
|
| 44 |
-
df =
|
| 45 |
-
|
| 46 |
-
columns = list(df.columns)
|
| 47 |
-
preview = df.head(100)
|
| 48 |
-
missing = df.isnull().sum()
|
| 49 |
-
desc = df.describe(include='all').T
|
| 50 |
-
|
| 51 |
-
# Summary markdown table
|
| 52 |
-
summary_md = "### Data Summary\n\n| Column | Missing | Min | Max | Mean | Median | Unique |\n|---|---|---|---|---|---|---|\n"
|
| 53 |
-
for col in df.columns:
|
| 54 |
-
miss = missing[col]
|
| 55 |
-
min_val = desc.loc[col, 'min'] if 'min' in desc.columns and col in desc.index else "-"
|
| 56 |
-
max_val = desc.loc[col, 'max'] if 'max' in desc.columns and col in desc.index else "-"
|
| 57 |
-
mean_val = desc.loc[col, 'mean'] if 'mean' in desc.columns and col in desc.index else "-"
|
| 58 |
-
median_val = df[col].median() if pd.api.types.is_numeric_dtype(df[col]) else "-"
|
| 59 |
-
unique_val = df[col].nunique()
|
| 60 |
-
summary_md += f"| {col} | {miss} | {min_val} | {max_val} | {mean_val} | {median_val} | {unique_val} |\n"
|
| 61 |
-
|
| 62 |
-
return df, columns, preview, summary_md, ""
|
| 63 |
-
except Exception as e:
|
| 64 |
-
return None, [], pd.DataFrame(), "", f"β Error loading file: {e}"
|
| 65 |
-
|
| 66 |
-
# Step 3: Train RandomForest model on selected columns
|
| 67 |
-
def train_model(df, target_col, feature_cols):
|
| 68 |
-
if df is None or df.empty:
|
| 69 |
-
return "Please upload a valid dataset first.", None, ""
|
| 70 |
-
if target_col not in df.columns:
|
| 71 |
-
return "Target column not found.", None, ""
|
| 72 |
-
if not feature_cols:
|
| 73 |
-
return "Select at least one feature column.", None, ""
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
return "No data left after removing missing values.", None, ""
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
except ValueError as e:
|
| 89 |
-
return f"Error splitting data: {e}", None, ""
|
| 90 |
|
| 91 |
-
|
|
|
|
| 92 |
model.fit(X_train, y_train)
|
| 93 |
-
y_pred = model.predict(X_test)
|
| 94 |
|
|
|
|
|
|
|
| 95 |
report = classification_report(y_test, y_pred)
|
| 96 |
|
| 97 |
-
|
| 98 |
-
cm = confusion_matrix(y_test, y_pred)
|
| 99 |
-
fig, ax = plt.subplots(figsize=(6, 5))
|
| 100 |
-
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
|
| 101 |
-
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
|
| 102 |
-
plt.tight_layout()
|
| 103 |
|
| 104 |
-
|
| 105 |
-
plt.savefig(buf, format="png")
|
| 106 |
-
plt.close(fig)
|
| 107 |
-
img_html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" alt="Confusion Matrix"/>'
|
| 108 |
|
| 109 |
-
help_text = generate_help_text(report)
|
| 110 |
-
return report, img_html, help_text
|
| 111 |
-
|
| 112 |
-
# Step 4: Auto-generate explanation of metrics
|
| 113 |
-
def generate_help_text(report_text):
|
| 114 |
-
try:
|
| 115 |
-
macro = re.search(r'macro avg\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)', report_text)
|
| 116 |
-
if macro:
|
| 117 |
-
precision = float(macro.group(1))
|
| 118 |
-
recall = float(macro.group(2))
|
| 119 |
-
f1 = float(macro.group(3))
|
| 120 |
-
text = (
|
| 121 |
-
f"### Performance Insights\n"
|
| 122 |
-
f"- **Precision (~{precision:.2f})**: Accuracy of positive predictions.\n"
|
| 123 |
-
f"- **Recall (~{recall:.2f})**: Coverage of actual positives.\n"
|
| 124 |
-
f"- **F1-score (~{f1:.2f})**: Balance between precision and recall.\n\n"
|
| 125 |
-
)
|
| 126 |
-
if precision < 0.5: text += "β οΈ Low precision: many false positives.\n"
|
| 127 |
-
if recall < 0.5: text += "β οΈ Low recall: many false negatives.\n"
|
| 128 |
-
if precision > 0.8 and recall > 0.8: text += "β
Strong performance across both metrics.\n"
|
| 129 |
-
return text + "\nReview the confusion matrix for misclassifications."
|
| 130 |
-
except Exception:
|
| 131 |
-
pass
|
| 132 |
-
return "Help will appear after training."
|
| 133 |
-
|
| 134 |
-
# Step 5: When file is uploaded, load, preprocess and update all UI elements
|
| 135 |
-
def on_file_change(file, quantile_binning, count_words):
|
| 136 |
-
df, columns, preview, summary_md, error = load_data(file)
|
| 137 |
-
if df is None:
|
| 138 |
-
return None, gr.update(choices=[], value=None), gr.update(choices=[], value=[]), pd.DataFrame(), "", "", "", error
|
| 139 |
-
df_processed = preprocess_dataframe(df, quantile_binning, count_words)
|
| 140 |
-
return (
|
| 141 |
-
df_processed, # Store processed dataframe in state
|
| 142 |
-
gr.update(choices=list(df_processed.columns)), # Update target dropdown
|
| 143 |
-
gr.update(choices=list(df_processed.columns)), # Update feature checkboxes
|
| 144 |
-
preview, # Show raw preview
|
| 145 |
-
summary_md, # Show summary
|
| 146 |
-
df_processed.head(100), # Show processed preview
|
| 147 |
-
"", # Clear classification report
|
| 148 |
-
"", # Clear help text
|
| 149 |
-
)
|
| 150 |
-
|
| 151 |
-
# Step 6: Build the Gradio interface
|
| 152 |
with gr.Blocks() as demo:
|
| 153 |
-
gr.Markdown("#
|
| 154 |
|
| 155 |
-
|
| 156 |
df_state = gr.State(None)
|
| 157 |
|
| 158 |
-
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
gr.
|
| 161 |
-
|
| 162 |
-
|
| 163 |
|
| 164 |
-
gr.Markdown("### Step 3: Preview the original and processed data")
|
| 165 |
with gr.Row():
|
| 166 |
-
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
|
| 173 |
with gr.Row():
|
| 174 |
-
|
| 175 |
-
feature_cols = gr.CheckboxGroup(label="Select Feature Columns (used to make predictions)")
|
| 176 |
|
| 177 |
-
gr.
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
# Trigger when file is uploaded or options changed
|
| 186 |
file_input.change(
|
| 187 |
fn=on_file_change,
|
| 188 |
inputs=[file_input, quantile_option, wordcount_option],
|
|
@@ -194,17 +185,32 @@ with gr.Blocks() as demo:
|
|
| 194 |
data_summary,
|
| 195 |
processed_preview,
|
| 196 |
output,
|
| 197 |
-
help_box
|
| 198 |
]
|
| 199 |
)
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
fn=train_model,
|
| 204 |
inputs=[df_state, target_col, feature_cols],
|
| 205 |
-
outputs=
|
| 206 |
)
|
| 207 |
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
if __name__ == "__main__":
|
| 210 |
demo.launch(share=True)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
from sklearn.model_selection import train_test_split
|
| 5 |
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.preprocessing import LabelEncoder
|
| 7 |
+
from sklearn.metrics import classification_report
|
| 8 |
+
|
| 9 |
+
# ----------- Helper Functions -----------
|
| 10 |
+
|
| 11 |
+
def load_data(file):
    """Load an uploaded CSV or Excel file and build a quick summary.

    Parameters
    ----------
    file : object
        An uploaded-file handle exposing a ``.name`` path (e.g. a
        gradio ``File`` value).

    Returns
    -------
    tuple
        ``(df, column_names, preview, summary_markdown, error_message)``.
        On failure ``df`` is ``None``, the data fields are empty, and
        ``error_message`` describes the problem.
    """
    try:
        # Case-insensitive extension check so "DATA.CSV" is not
        # mistakenly routed to the Excel reader.
        if file.name.lower().endswith(".csv"):
            df = pd.read_csv(file.name)
        else:
            df = pd.read_excel(file.name)

        # First 5 rows are enough for a visual sanity check in the UI.
        preview = df.head(5)

        # Per-column dtype and missing-value percentage summary.
        summary = pd.DataFrame({
            "Column": df.columns,
            "Data Type": [df[col].dtype for col in df.columns],
            "Missing (%)": [df[col].isnull().mean() * 100 for col in df.columns]
        })

        return df, df.columns.tolist(), preview, summary.to_markdown(), ""
    except Exception as e:
        # Broad catch is deliberate: this is the UI error boundary and
        # any failure is surfaced as a message instead of a traceback.
        return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
|
| 31 |
+
|
| 32 |
def preprocess_dataframe(df, quantile_binning=False, count_words=False):
    """Derive optional helper columns from *df* without mutating it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; never modified in place.
    quantile_binning : bool
        When True, add a ``<col>_qbin`` quartile label ("Q1".."Q4") for
        every int64/float64 column.
    count_words : bool
        When True, add a ``<col>_wordcount`` whitespace word count for
        every object (text) column.

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with any derived columns appended.
    """
    result = df.copy()

    if quantile_binning:
        # Quartile-bin each numeric column; pd.qcut raises when a
        # column has too few distinct values, which we only report.
        for column in result.select_dtypes(include=["int64", "float64"]).columns:
            try:
                result[column + "_qbin"] = pd.qcut(
                    result[column], q=4, labels=["Q1", "Q2", "Q3", "Q4"]
                )
            except Exception as err:
                print(f"Warning (qbin failed for {column}):", err)

    if count_words:
        # Whitespace-split word counts for each text column.
        for column in result.select_dtypes(include=["object"]).columns:
            try:
                result[column + "_wordcount"] = (
                    result[column].astype(str).apply(lambda cell: len(cell.split()))
                )
            except Exception as err:
                print(f"Warning (wordcount failed for {column}):", err)

    return result
|
| 55 |
|
| 56 |
+
def train_model(df, target_column, feature_columns):
    """Train a RandomForest classifier and return its classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        The (already preprocessed) dataset.
    target_column : str
        Name of the column to predict.
    feature_columns : list[str]
        Names of the columns used as predictors.

    Returns
    -------
    str
        A scikit-learn ``classification_report``, or a human-readable
        message when no trainable rows remain.
    """
    # Drop rows whose target is missing; we cannot learn from
    # unlabeled rows.
    df = df.dropna(subset=[target_column])
    if df.empty:
        return "No rows left after dropping missing target values."

    # Work on an explicit copy: df[feature_columns] is a slice, and
    # assigning into it triggers pandas' SettingWithCopyWarning (and
    # is silently ineffective under copy-on-write semantics).
    X = df[feature_columns].copy()
    y = df[target_column]

    # Fill missing feature values: a sentinel for text columns, the
    # median for numeric ones.
    for col in feature_columns:
        if X[col].dtype == "O":
            X[col] = X[col].fillna("missing")
        else:
            X[col] = X[col].fillna(X[col].median())

    # Encode categorical features as integer codes.
    for col in X.select_dtypes(include=["object"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Encode the target too if it is categorical.
    if y.dtype == "O":
        y = LabelEncoder().fit_transform(y)

    # Fixed random_state keeps both the split and the forest
    # reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return classification_report(y_test, y_pred)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
+
# ----------- Gradio Interface Setup -----------
|
|
|
|
|
|
|
|
|
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
with gr.Blocks() as demo:
|
| 94 |
+
gr.Markdown("# π§ CSV/XLSX Classifier with Auto Summary and Visualization")
|
| 95 |
|
| 96 |
+
# Store the DataFrame in memory
|
| 97 |
df_state = gr.State(None)
|
| 98 |
|
| 99 |
+
# Upload section
|
| 100 |
+
with gr.Row():
|
| 101 |
+
file_input = gr.File(label="π Upload CSV or Excel File", file_types=[".csv", ".xlsx", ".xls"])
|
| 102 |
|
| 103 |
+
with gr.Row():
|
| 104 |
+
quantile_option = gr.Checkbox(label="π Discretize Numeric Columns into Quartiles")
|
| 105 |
+
wordcount_option = gr.Checkbox(label="π Count Words in Text Columns")
|
| 106 |
|
|
|
|
| 107 |
with gr.Row():
|
| 108 |
+
target_col = gr.Dropdown(label="π― Target Column (What you want to predict)", choices=[])
|
| 109 |
+
feature_cols = gr.CheckboxGroup(label="π§Ύ Feature Columns (Used to predict target)", choices=[])
|
| 110 |
|
| 111 |
+
# Buttons
|
| 112 |
+
with gr.Row():
|
| 113 |
+
train_button = gr.Button("π Train Model")
|
| 114 |
+
clear_button = gr.Button("π Clear All")
|
| 115 |
|
| 116 |
+
# Outputs
|
| 117 |
with gr.Row():
|
| 118 |
+
output = gr.Textbox(label="π Model Output (Classification Report)", lines=10)
|
|
|
|
| 119 |
|
| 120 |
+
with gr.Row():
|
| 121 |
+
data_summary = gr.Textbox(label="π Data Summary", lines=10)
|
| 122 |
+
help_box = gr.Textbox(label="π‘ Help", lines=5, value="βοΈ Upload a dataset, choose preprocessing options, then train.")
|
| 123 |
|
| 124 |
+
# Data Previews
|
| 125 |
+
with gr.Row():
|
| 126 |
+
table_preview = gr.DataFrame(label="π Original Data Preview")
|
| 127 |
+
processed_preview = gr.DataFrame(label="π§ͺ Processed Data Preview (with new columns)")
|
| 128 |
+
|
| 129 |
+
# ----------- Define App Logic -----------
|
| 130 |
+
|
| 131 |
+
# Handle file upload and update column options
|
| 132 |
+
def on_file_change(file, quantile_binning, count_words):
|
| 133 |
+
df, _, preview, summary_md, error = load_data(file)
|
| 134 |
+
if df is None:
|
| 135 |
+
return None, gr.update(choices=[], value=None), gr.update(choices=[], value=[]), pd.DataFrame(), "", "", "", error
|
| 136 |
+
|
| 137 |
+
# Apply preprocessing
|
| 138 |
+
df_processed = preprocess_dataframe(df, quantile_binning, count_words)
|
| 139 |
+
|
| 140 |
+
# Update selectors with new columns from the processed DataFrame
|
| 141 |
+
columns = list(df_processed.columns)
|
| 142 |
+
|
| 143 |
+
return (
|
| 144 |
+
df_processed, # Store processed df in state
|
| 145 |
+
gr.update(choices=columns, value=None),
|
| 146 |
+
gr.update(choices=columns, value=[]),
|
| 147 |
+
preview,
|
| 148 |
+
summary_md,
|
| 149 |
+
df_processed.head(100), # Show processed preview
|
| 150 |
+
"", # Clear model output
|
| 151 |
+
"" # Clear help
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
# Handle training the model
|
| 155 |
+
def on_train(df, target, features):
|
| 156 |
+
if df is None:
|
| 157 |
+
return "β οΈ Please upload a file first."
|
| 158 |
+
if target is None or not features:
|
| 159 |
+
return "β οΈ Please select target and feature columns."
|
| 160 |
+
return train_model(df, target, features)
|
| 161 |
+
|
| 162 |
+
# Clear all interface elements
|
| 163 |
+
def on_clear():
|
| 164 |
+
return (
|
| 165 |
+
None, # df_state
|
| 166 |
+
None, # target_col
|
| 167 |
+
[], # feature_cols
|
| 168 |
+
pd.DataFrame(),
|
| 169 |
+
"",
|
| 170 |
+
pd.DataFrame(),
|
| 171 |
+
"",
|
| 172 |
+
"βοΈ Upload a dataset, choose preprocessing options, then train."
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
# ----------- Connect Actions to Widgets -----------
|
| 176 |
|
|
|
|
| 177 |
file_input.change(
|
| 178 |
fn=on_file_change,
|
| 179 |
inputs=[file_input, quantile_option, wordcount_option],
|
|
|
|
| 185 |
data_summary,
|
| 186 |
processed_preview,
|
| 187 |
output,
|
| 188 |
+
help_box
|
| 189 |
]
|
| 190 |
)
|
| 191 |
|
| 192 |
+
train_button.click(
|
| 193 |
+
fn=on_train,
|
|
|
|
| 194 |
inputs=[df_state, target_col, feature_cols],
|
| 195 |
+
outputs=output
|
| 196 |
)
|
| 197 |
|
| 198 |
+
clear_button.click(
|
| 199 |
+
fn=on_clear,
|
| 200 |
+
inputs=[],
|
| 201 |
+
outputs=[
|
| 202 |
+
df_state,
|
| 203 |
+
target_col,
|
| 204 |
+
feature_cols,
|
| 205 |
+
table_preview,
|
| 206 |
+
data_summary,
|
| 207 |
+
processed_preview,
|
| 208 |
+
output,
|
| 209 |
+
help_box
|
| 210 |
+
]
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
# ----------- Launch the App -----------
|
| 214 |
+
|
| 215 |
if __name__ == "__main__":
|
| 216 |
demo.launch(share=True)
|