Spaces:

clementBE
/

smart_xlsx

Sleeping

App Files Files Community

clementBE committed on Jul 11, 2025

Commit

624ddf1

verified ·

1 Parent(s): 76e3b5f

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -27

app.py CHANGED Viewed

@@ -40,7 +40,20 @@ def load_data(file):
     except Exception as e:
         return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
-def train_model(df, target_col, feature_cols):
     if df is None or df.empty:
         return "Please upload a valid dataset first.", None, ""
     if target_col not in df.columns:
@@ -52,37 +65,37 @@ def train_model(df, target_col, feature_cols):
     if df_clean.empty:
         return "No data left after removing missing values.", None, ""
-    X = pd.get_dummies(df_clean[feature_cols])
-    y = df_clean[target_col]
-    if y.nunique() < 2:
-        return "Target must have at least two classes.", None, ""
-    try:
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-    except ValueError as e:
-        return f"Error splitting data: {e}", None, ""
-    model = RandomForestClassifier(random_state=42)
-    model.fit(X_train, y_train)
-    y_pred = model.predict(X_test)
-    report = classification_report(y_test, y_pred)
-    # Plot confusion matrix
-    cm = confusion_matrix(y_test, y_pred)
-    fig, ax = plt.subplots(figsize=(6, 5))
-    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
-    ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
-    plt.tight_layout()
-    buf = io.BytesIO()
-    plt.savefig(buf, format="png")
-    plt.close(fig)
-    img_html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" alt="Confusion Matrix"/>'
-    help_text = generate_help_text(report)
-    return report, img_html, help_text
 def generate_help_text(report_text):
     try:
@@ -126,9 +139,13 @@ with gr.Blocks() as demo:
         data_summary = gr.Markdown()
     with gr.Row():
-        target_col = gr.Dropdown(label="🎯 Target Column")
         feature_cols = gr.CheckboxGroup(label="📊 Feature Columns")
     train_btn = gr.Button("🚀 Train Model")
     output = gr.Textbox(label="📋 Classification Report", lines=10)
@@ -143,8 +160,9 @@ with gr.Blocks() as demo:
     train_btn.click(
         fn=train_model,
-        inputs=[df_state, target_col, feature_cols],
         outputs=[output, confusion_plot, help_box]
     )
-demo.launch()

     except Exception as e:
         return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
+def preprocess_features(df, feature_cols, recategorize_quartiles=False, count_words=False):
+    processed_df = df.copy()
+    for col in feature_cols:
+        if recategorize_quartiles and pd.api.types.is_numeric_dtype(processed_df[col]):
+            processed_df[col] = pd.qcut(processed_df[col], q=4, duplicates='drop').astype(str)
+        if count_words and processed_df[col].dtype == object:
+            processed_df[col] = processed_df[col].fillna("").apply(lambda x: len(str(x).split()))
+    X = pd.get_dummies(processed_df[feature_cols])
+    return X
+def train_model(df, target_col, feature_cols, recategorize_quartiles=False, count_words=False):
     if df is None or df.empty:
         return "Please upload a valid dataset first.", None, ""
     if target_col not in df.columns:
     if df_clean.empty:
         return "No data left after removing missing values.", None, ""
+    try:
+        X = preprocess_features(df_clean, feature_cols, recategorize_quartiles, count_words)
+        y = df_clean[target_col]
+        if y.nunique() < 2:
+            return "Target must have at least two classes.", None, ""
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        model = RandomForestClassifier(random_state=42)
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_test)
+        report = classification_report(y_test, y_pred)
+        cm = confusion_matrix(y_test, y_pred)
+        fig, ax = plt.subplots(figsize=(6, 5))
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
+        ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
+        plt.tight_layout()
+        buf = io.BytesIO()
+        plt.savefig(buf, format="png")
+        plt.close(fig)
+        img_html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" alt="Confusion Matrix"/>'
+        help_text = generate_help_text(report)
+        return report, img_html, help_text
+    except Exception as e:
+        return f"Error during training: {e}", None, ""
 def generate_help_text(report_text):
     try:
         data_summary = gr.Markdown()
     with gr.Row():
+        target_col = gr.Dropdown(label="🌟 Target Column")
         feature_cols = gr.CheckboxGroup(label="📊 Feature Columns")
+    with gr.Row():
+        recategorize_quartiles = gr.Checkbox(label="Discretize Numeric Columns into Quartiles")
+        count_words = gr.Checkbox(label="Count Words in Text Columns")
     train_btn = gr.Button("🚀 Train Model")
     output = gr.Textbox(label="📋 Classification Report", lines=10)
     train_btn.click(
         fn=train_model,
+        inputs=[df_state, target_col, feature_cols, recategorize_quartiles, count_words],
         outputs=[output, confusion_plot, help_box]
     )
+if __name__ == "__main__":
+    demo.launch()