Spaces:

clementBE
/

smart_xlsx

Sleeping

App Files Community

clementBE committed on Jul 17, 2025

Commit

3750790

verified ·

1 Parent(s): 8b22417

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -100

app.py CHANGED Viewed

@@ -5,77 +5,66 @@ from sklearn.model_selection import train_test_split
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import classification_report
-# ✅ Load data from CSV or Excel
 def load_data(file):
     try:
-        # 🔄 Use file object directly to avoid FileNotFoundError
-        if file.name.endswith(".csv"):
-            df = pd.read_csv(file)
         else:
-            df = pd.read_excel(file)
-        preview = df.head(5)
-        summary = pd.DataFrame({
-            "Column": df.columns,
-            "Data Type": [df[col].dtype for col in df.columns],
-            "Missing (%)": [df[col].isnull().mean() * 100 for col in df.columns]
-        })
-        return df, df.columns.tolist(), preview, summary.to_markdown(), ""
     except Exception as e:
-        return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
-# ✅ Preprocess DataFrame
-def preprocess_dataframe(df, quantile_binning=False, count_words=False):
-    df = df.copy()
-    # ➕ Add _qbin columns for numeric columns
-    if quantile_binning:
-        numeric_cols = df.select_dtypes(include=np.number).columns
-        for col in numeric_cols:
-            try:
-                df[col + "_qbin"] = pd.qcut(df[col], q=4, labels=False, duplicates='drop')
-            except:
-                pass  # Some columns can't be binned (e.g., constant values)
-    # ➕ Add _wordcount columns for text columns
-    if count_words:
-        text_cols = df.select_dtypes(include="object").columns
-        for col in text_cols:
-            df[col + "_wordcount"] = df[col].astype(str).apply(lambda x: len(x.split()))
-    return df
-# ✅ Handle file input and update UI
-def on_file_change(file, quantile_binning, count_words):
-    df, _, preview, summary_md, error = load_data(file)
-    if df is None:
-        return None, gr.update(choices=[], value=None), gr.update(choices=[], value=[]), pd.DataFrame(), "", "", "", error
-    # 🔄 Preprocess data and get new columns
-    df_processed = preprocess_dataframe(df, quantile_binning, count_words)
-    columns = list(df_processed.columns)
     return (
-        df_processed,                               # Save in state
-        gr.update(choices=columns, value=None),     # Update target dropdown
-        gr.update(choices=columns, value=[]),       # Update feature selector
-        preview,                                    # Show original preview
-        summary_md,                                 # Show summary table
-        df_processed.head(100),                     # Show processed data
-        "", "",                                     # Clear output and help box
     )
-# ✅ Train model
-def train_model(df, target, features):
-    if df is None or target is None or not features:
-        return "⚠️ Please upload data, select a target column and features.", ""
     try:
-        X = df[features]
-        y = df[target]
-        # Handle categorical features
         X = pd.get_dummies(X)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
@@ -83,61 +72,54 @@ def train_model(df, target, features):
         clf.fit(X_train, y_train)
         y_pred = clf.predict(X_test)
-        report = classification_report(y_test, y_pred)
-        help_text = "✅ Model trained successfully. You can review the metrics below."
-        return report, help_text
     except Exception as e:
-        return f"❌ Error training model: {e}", ""
-# ✅ Gradio App Interface
-with gr.Blocks(title="📊 ML Model Trainer with Quartiles and Word Counts") as app:
-    gr.Markdown("## 🧠 Train a Machine Learning Model on Your Dataset")
-    with gr.Row():
-        file_input = gr.File(label="📁 Upload CSV or Excel", file_types=[".csv", ".xls", ".xlsx"])
-        quantile_option = gr.Checkbox(label="Discretize into Quartiles", value=True)
-        wordcount_option = gr.Checkbox(label="Count Words in Text Columns", value=True)
     with gr.Row():
-        target_col = gr.Dropdown(label="🎯 Select Target Column")
-        feature_cols = gr.CheckboxGroup(label="🧠 Select Feature Columns")
-    with gr.Row():
-        df_state = gr.State()
-    with gr.Row():
-        table_preview = gr.DataFrame(label="📋 Data Preview")
-        processed_preview = gr.DataFrame(label="🔍 Processed Data (100 rows)")
-    data_summary = gr.Markdown()
-    with gr.Row():
-        train_button = gr.Button("🚀 Train Model")
-        output = gr.Textbox(label="📊 Classification Report", lines=10)
-        help_box = gr.Textbox(label="ℹ️ Status", interactive=False)
-    # 🔄 Events
     file_input.change(
-        fn=on_file_change,
-        inputs=[file_input, quantile_option, wordcount_option],
-        outputs=[
-            df_state,
-            target_col,
-            feature_cols,
-            table_preview,
-            data_summary,
-            processed_preview,
-            output,
-            help_box,
-        ],
     )
     train_button.click(
         fn=train_model,
-        inputs=[df_state, target_col, feature_cols],
-        outputs=[output, help_box],
     )
-# 🔁 Launch the app
 app.launch()

 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import classification_report
+# Global states for original and processed data
+original_df = None
+processed_df = None
+# ✅ STEP 1: Load file
 def load_data(file):
+    global original_df
     try:
+        if file.name.endswith('.csv'):
+            original_df = pd.read_csv(file)
         else:
+            original_df = pd.read_excel(file)
+        return original_df.head(10), "✅ File loaded successfully."
     except Exception as e:
+        return pd.DataFrame(), f"❌ Error: {e}"
+# ✅ STEP 2: Process data (discretize + word count)
+def process_data():
+    global original_df, processed_df
+    if original_df is None:
+        return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[]), "⚠️ Please load a dataset first."
+    df = original_df.copy()
+    # Discretize numeric columns into quartiles
+    for col in df.select_dtypes(include=np.number).columns:
+        try:
+            df[col + "_qbin"] = pd.qcut(df[col], 4, labels=False, duplicates='drop')
+        except:
+            pass
+    # Add word count for text columns
+    for col in df.select_dtypes(include='object').columns:
+        df[col + "_wordcount"] = df[col].astype(str).apply(lambda x: len(x.split()))
+    processed_df = df.copy()
     return (
+        df.head(10),
+        gr.update(choices=df.columns.tolist()),
+        gr.update(choices=df.columns.tolist()),
+        "✅ Data processed: discretized and word counts added."
     )
+# ✅ STEP 3: Train model
+def train_model(target_col, feature_cols):
+    global processed_df
+    if processed_df is None:
+        return "⚠️ Please process your data first."
+    if not target_col or not feature_cols:
+        return "⚠️ Please select target and at least one feature."
     try:
+        X = processed_df[feature_cols]
+        y = processed_df[target_col]
+        # Handle categorical variables
         X = pd.get_dummies(X)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
         clf.fit(X_train, y_train)
         y_pred = clf.predict(X_test)
+        return classification_report(y_test, y_pred)
     except Exception as e:
+        return f"❌ Model training failed: {e}"
+# ✅ Gradio UI
+with gr.Blocks(title="Step-by-Step Model Trainer") as app:
+    gr.Markdown("## 🧠 Step-by-Step Model Trainer with Discretization and Word Count")
+    # Step 1: Load file
     with gr.Row():
+        file_input = gr.File(label="📁 Upload CSV or Excel")
+        load_output = gr.Textbox(label="ℹ️ File Load Status", interactive=False)
+    original_preview = gr.DataFrame(label="🔍 Original Data (First 10 Rows)")
+    # Step 2: Process Data
+    process_button = gr.Button("⚙️ Apply Discretization + Word Count")
+    processed_preview = gr.DataFrame(label="🔬 Processed Data (First 10 Rows)")
+    process_status = gr.Textbox(label="ℹ️ Process Status", interactive=False)
+    # Step 3: Select Columns
+    target_selector = gr.Dropdown(label="🎯 Target Column")
+    feature_selector = gr.CheckboxGroup(label="📊 Feature Columns")
+    # Step 4: Train
+    train_button = gr.Button("🚀 Train Model")
+    train_output = gr.Textbox(label="📈 Classification Report", lines=10)
+    # Step 1: File input event
     file_input.change(
+        fn=load_data,
+        inputs=[file_input],
+        outputs=[original_preview, load_output]
+    )
+    # Step 2: Process data event
+    process_button.click(
+        fn=process_data,
+        inputs=[],
+        outputs=[processed_preview, target_selector, feature_selector, process_status]
     )
+    # Step 3 + 4: Train model event
     train_button.click(
         fn=train_model,
+        inputs=[target_selector, feature_selector],
+        outputs=[train_output]
     )
+# Launch the app
 app.launch()