clementBE committed on
Commit
36ca2d6
·
verified ·
1 Parent(s): ac3986c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -151
app.py CHANGED
@@ -1,188 +1,179 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  from sklearn.model_selection import train_test_split
4
  from sklearn.ensemble import RandomForestClassifier
5
- from sklearn.metrics import classification_report, confusion_matrix
6
- import matplotlib.pyplot as plt
7
- import seaborn as sns
8
- import io
9
- import base64
10
- import re
11
-
12
- # Step 1: Preprocessing function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def preprocess_dataframe(df, quantile_binning=False, count_words=False):
 
14
  df = df.copy()
15
 
16
- # Discretize numeric columns into quartiles (adds new columns ending in '_qbin')
17
  if quantile_binning:
18
- for col in df.select_dtypes(include='number').columns:
 
19
  try:
20
- df[col + "_qbin"] = pd.qcut(df[col], q=4, labels=False, duplicates='drop')
21
- except Exception:
22
- continue
23
 
24
- # Count number of words in text columns (adds new columns ending in '_wordcount')
25
  if count_words:
26
- for col in df.select_dtypes(include='object').columns:
27
- df[col + "_wordcount"] = df[col].apply(lambda x: len(str(x).split()))
 
 
 
 
28
 
29
  return df
30
 
31
- # Step 2: Load data from uploaded file
32
- def load_data(file):
33
- if file is None:
34
- return None, [], pd.DataFrame(), "", ""
35
 
36
- try:
37
- filepath = file.name if hasattr(file, "name") else file
38
- if filepath.endswith(".csv"):
39
- try:
40
- df = pd.read_csv(filepath, encoding='utf-8')
41
- except UnicodeDecodeError:
42
- df = pd.read_csv(filepath, encoding='latin1')
43
  else:
44
- df = pd.read_excel(filepath)
45
-
46
- columns = list(df.columns)
47
- preview = df.head(100)
48
- missing = df.isnull().sum()
49
- desc = df.describe(include='all').T
50
-
51
- # Summary markdown table
52
- summary_md = "### Data Summary\n\n| Column | Missing | Min | Max | Mean | Median | Unique |\n|---|---|---|---|---|---|---|\n"
53
- for col in df.columns:
54
- miss = missing[col]
55
- min_val = desc.loc[col, 'min'] if 'min' in desc.columns and col in desc.index else "-"
56
- max_val = desc.loc[col, 'max'] if 'max' in desc.columns and col in desc.index else "-"
57
- mean_val = desc.loc[col, 'mean'] if 'mean' in desc.columns and col in desc.index else "-"
58
- median_val = df[col].median() if pd.api.types.is_numeric_dtype(df[col]) else "-"
59
- unique_val = df[col].nunique()
60
- summary_md += f"| {col} | {miss} | {min_val} | {max_val} | {mean_val} | {median_val} | {unique_val} |\n"
61
-
62
- return df, columns, preview, summary_md, ""
63
- except Exception as e:
64
- return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
65
-
66
- # Step 3: Train RandomForest model on selected columns
67
- def train_model(df, target_col, feature_cols):
68
- if df is None or df.empty:
69
- return "Please upload a valid dataset first.", None, ""
70
- if target_col not in df.columns:
71
- return "Target column not found.", None, ""
72
- if not feature_cols:
73
- return "Select at least one feature column.", None, ""
74
 
75
- df_clean = df[[target_col] + feature_cols].dropna()
76
- if df_clean.empty:
77
- return "No data left after removing missing values.", None, ""
78
 
79
- # Convert categorical columns into dummy variables
80
- X = pd.get_dummies(df_clean[feature_cols])
81
- y = df_clean[target_col]
82
 
83
- if y.nunique() < 2:
84
- return "Target must have at least two classes.", None, ""
 
85
 
86
- try:
87
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
88
- except ValueError as e:
89
- return f"Error splitting data: {e}", None, ""
90
 
91
- model = RandomForestClassifier(random_state=42)
 
92
  model.fit(X_train, y_train)
93
- y_pred = model.predict(X_test)
94
 
 
 
95
  report = classification_report(y_test, y_pred)
96
 
97
- # Plot confusion matrix
98
- cm = confusion_matrix(y_test, y_pred)
99
- fig, ax = plt.subplots(figsize=(6, 5))
100
- sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
101
- ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
102
- plt.tight_layout()
103
 
104
- buf = io.BytesIO()
105
- plt.savefig(buf, format="png")
106
- plt.close(fig)
107
- img_html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" alt="Confusion Matrix"/>'
108
 
109
- help_text = generate_help_text(report)
110
- return report, img_html, help_text
111
-
112
- # Step 4: Auto-generate explanation of metrics
113
- def generate_help_text(report_text):
114
- try:
115
- macro = re.search(r'macro avg\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)', report_text)
116
- if macro:
117
- precision = float(macro.group(1))
118
- recall = float(macro.group(2))
119
- f1 = float(macro.group(3))
120
- text = (
121
- f"### Performance Insights\n"
122
- f"- **Precision (~{precision:.2f})**: Accuracy of positive predictions.\n"
123
- f"- **Recall (~{recall:.2f})**: Coverage of actual positives.\n"
124
- f"- **F1-score (~{f1:.2f})**: Balance between precision and recall.\n\n"
125
- )
126
- if precision < 0.5: text += "⚠️ Low precision: many false positives.\n"
127
- if recall < 0.5: text += "⚠️ Low recall: many false negatives.\n"
128
- if precision > 0.8 and recall > 0.8: text += "βœ… Strong performance across both metrics.\n"
129
- return text + "\nReview the confusion matrix for misclassifications."
130
- except Exception:
131
- pass
132
- return "Help will appear after training."
133
-
134
- # Step 5: When file is uploaded, load, preprocess and update all UI elements
135
- def on_file_change(file, quantile_binning, count_words):
136
- df, columns, preview, summary_md, error = load_data(file)
137
- if df is None:
138
- return None, gr.update(choices=[], value=None), gr.update(choices=[], value=[]), pd.DataFrame(), "", "", "", error
139
- df_processed = preprocess_dataframe(df, quantile_binning, count_words)
140
- return (
141
- df_processed, # Store processed dataframe in state
142
- gr.update(choices=list(df_processed.columns)), # Update target dropdown
143
- gr.update(choices=list(df_processed.columns)), # Update feature checkboxes
144
- preview, # Show raw preview
145
- summary_md, # Show summary
146
- df_processed.head(100), # Show processed preview
147
- "", # Clear classification report
148
- "", # Clear help text
149
- )
150
-
151
- # Step 6: Build the Gradio interface
152
  with gr.Blocks() as demo:
153
- gr.Markdown("# πŸ“Š Easy ML Classifier for CSV/XLSX Files")
154
 
155
- gr.Markdown("### Step 1: Upload your file (CSV or Excel)")
156
  df_state = gr.State(None)
157
 
158
- file_input = gr.File(label="Upload CSV or Excel File", file_types=[".csv", ".xlsx", ".xls"])
 
 
159
 
160
- gr.Markdown("### Step 2: Choose preprocessing options (optional)")
161
- quantile_option = gr.Checkbox(label="Discretize Numeric Columns into Quartiles (adds '_qbin')")
162
- wordcount_option = gr.Checkbox(label="Count Words in Text Columns (adds '_wordcount')")
163
 
164
- gr.Markdown("### Step 3: Preview the original and processed data")
165
  with gr.Row():
166
- table_preview = gr.DataFrame(label="Original Data Preview")
167
- processed_preview = gr.DataFrame(label="Processed Data (with new columns)")
168
 
169
- gr.Markdown("### Step 4: Explore data summary")
170
- data_summary = gr.Markdown()
 
 
171
 
172
- gr.Markdown("### Step 5: Select your target and features")
173
  with gr.Row():
174
- target_col = gr.Dropdown(label="Select Target Column (what you want to predict)")
175
- feature_cols = gr.CheckboxGroup(label="Select Feature Columns (used to make predictions)")
176
 
177
- gr.Markdown("### Step 6: Train the classifier")
178
- train_btn = gr.Button("πŸš€ Train Model")
 
179
 
180
- gr.Markdown("### Step 7: Results")
181
- output = gr.Textbox(label="Classification Report", lines=10)
182
- confusion_plot = gr.HTML()
183
- help_box = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- # Trigger when file is uploaded or options changed
186
  file_input.change(
187
  fn=on_file_change,
188
  inputs=[file_input, quantile_option, wordcount_option],
@@ -194,17 +185,32 @@ with gr.Blocks() as demo:
194
  data_summary,
195
  processed_preview,
196
  output,
197
- help_box,
198
  ]
199
  )
200
 
201
- # Train model when button is clicked
202
- train_btn.click(
203
- fn=train_model,
204
  inputs=[df_state, target_col, feature_cols],
205
- outputs=[output, confusion_plot, help_box]
206
  )
207
 
208
- # Step 7: Launch app with public URL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  if __name__ == "__main__":
210
  demo.launch(share=True)
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import numpy as np
4
  from sklearn.model_selection import train_test_split
5
  from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from sklearn.metrics import classification_report
8
+
9
+ # ----------- Helper Functions -----------
10
+
11
def load_data(file):
    """Load a CSV/Excel file and build a preview plus a per-column summary.

    Parameters
    ----------
    file : file-like, str, or None
        Uploaded file object (Gradio file objects expose a ``.name`` path
        attribute) or a plain filesystem path.

    Returns
    -------
    tuple
        ``(df, column_names, preview_df, summary_markdown, error_message)``.
        On any failure ``df`` is ``None`` and ``error_message`` is populated.
    """
    if file is None:
        # Nothing uploaded yet -- return the empty-shape tuple with a clear
        # message instead of letting an AttributeError produce a cryptic one.
        return None, [], pd.DataFrame(), "", "❌ No file provided."

    try:
        # Accept both Gradio file objects (which expose .name) and raw paths.
        filepath = file.name if hasattr(file, "name") else file
        if str(filepath).endswith(".csv"):
            try:
                df = pd.read_csv(filepath, encoding="utf-8")
            except UnicodeDecodeError:
                # Fall back for files saved with a legacy single-byte encoding.
                df = pd.read_csv(filepath, encoding="latin1")
        else:
            df = pd.read_excel(filepath)

        # Show first 5 rows of the uploaded file
        preview = df.head(5)

        # Create a short summary with column types and missing values
        summary = pd.DataFrame({
            "Column": df.columns,
            "Data Type": [df[col].dtype for col in df.columns],
            "Missing (%)": [df[col].isnull().mean() * 100 for col in df.columns],
        })

        return df, df.columns.tolist(), preview, summary.to_markdown(), ""
    except Exception as e:
        return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
31
+
32
def preprocess_dataframe(df, quantile_binning=False, count_words=False):
    """Optionally derive quartile-bin and word-count columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; never mutated (a copy is taken).
    quantile_binning : bool
        If True, add a ``<col>_qbin`` column (labels Q1..Q4) for each
        numeric column that can be split into 4 quantile bins.
    count_words : bool
        If True, add a ``<col>_wordcount`` column for each text column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with any derived columns appended.
    """
    # Copy the original DataFrame to avoid overwriting
    df = df.copy()

    # If user selects quantile binning
    if quantile_binning:
        # 'number' covers every numeric dtype (int32, float32, ...),
        # not only the default int64/float64 pair.
        numeric_cols = df.select_dtypes(include="number").columns
        for col in numeric_cols:
            try:
                df[col + "_qbin"] = pd.qcut(df[col], q=4, labels=["Q1", "Q2", "Q3", "Q4"])
            except Exception as e:
                # e.g. too few distinct values to form 4 bins -- skip the column.
                print(f"Warning (qbin failed for {col}):", e)

    # If user selects count_words for text columns
    if count_words:
        text_cols = df.select_dtypes(include=["object"]).columns
        for col in text_cols:
            try:
                df[col + "_wordcount"] = df[col].astype(str).apply(lambda x: len(x.split()))
            except Exception as e:
                print(f"Warning (wordcount failed for {col}):", e)

    return df
55
 
56
def train_model(df, target_column, feature_columns):
    """Train a RandomForest classifier and return its classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the target and the feature columns.
    target_column : str
        Name of the column to predict.
    feature_columns : list[str]
        Names of the predictor columns.

    Returns
    -------
    str
        The sklearn classification report, or a warning message when the
        data cannot be trained on.
    """
    # Remove rows with missing target values. dropna returns a new frame,
    # and the explicit .copy() guarantees the imputation below never writes
    # back into the caller's DataFrame (avoids SettingWithCopy surprises).
    df = df.dropna(subset=[target_column]).copy()

    # Guard: classification needs at least two distinct target classes.
    if df[target_column].nunique() < 2:
        return "⚠️ Target must have at least two classes."

    # Fill missing values in feature columns: constant for text, median for numbers.
    for col in feature_columns:
        if df[col].dtype == "O":
            df[col] = df[col].fillna("missing")
        else:
            df[col] = df[col].fillna(df[col].median())

    # .copy() so the label-encoding below operates on an independent frame
    # rather than a view of df.
    X = df[feature_columns].copy()
    y = df[target_column]

    # Encode categorical features
    for col in X.select_dtypes(include=["object"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Encode target if it's categorical
    if y.dtype == "O":
        y = LabelEncoder().fit_transform(y)

    # Split into train and test sets; seeding both the split and the model
    # keeps results reproducible across runs (the previous revision seeded
    # only the split).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a simple Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict and show classification report
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)

    return report
 
 
 
 
 
90
 
91
+ # ----------- Gradio Interface Setup -----------
 
 
 
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  with gr.Blocks() as demo:
94
+ gr.Markdown("# 🧠 CSV/XLSX Classifier with Auto Summary and Visualization")
95
 
96
+ # Store the DataFrame in memory
97
  df_state = gr.State(None)
98
 
99
+ # Upload section
100
+ with gr.Row():
101
+ file_input = gr.File(label="πŸ“ Upload CSV or Excel File", file_types=[".csv", ".xlsx", ".xls"])
102
 
103
+ with gr.Row():
104
+ quantile_option = gr.Checkbox(label="πŸ“Š Discretize Numeric Columns into Quartiles")
105
+ wordcount_option = gr.Checkbox(label="πŸ“ Count Words in Text Columns")
106
 
 
107
  with gr.Row():
108
+ target_col = gr.Dropdown(label="🎯 Target Column (What you want to predict)", choices=[])
109
+ feature_cols = gr.CheckboxGroup(label="🧾 Feature Columns (Used to predict target)", choices=[])
110
 
111
+ # Buttons
112
+ with gr.Row():
113
+ train_button = gr.Button("πŸš€ Train Model")
114
+ clear_button = gr.Button("πŸ”„ Clear All")
115
 
116
+ # Outputs
117
  with gr.Row():
118
+ output = gr.Textbox(label="πŸ“‹ Model Output (Classification Report)", lines=10)
 
119
 
120
+ with gr.Row():
121
+ data_summary = gr.Textbox(label="πŸ“Š Data Summary", lines=10)
122
+ help_box = gr.Textbox(label="πŸ’‘ Help", lines=5, value="βœ”οΈ Upload a dataset, choose preprocessing options, then train.")
123
 
124
+ # Data Previews
125
+ with gr.Row():
126
+ table_preview = gr.DataFrame(label="πŸ” Original Data Preview")
127
+ processed_preview = gr.DataFrame(label="πŸ§ͺ Processed Data Preview (with new columns)")
128
+
129
+ # ----------- Define App Logic -----------
130
+
131
+ # Handle file upload and update column options
132
def on_file_change(file, quantile_binning, count_words):
    """Load an uploaded file, preprocess it, and refresh every widget.

    Returns 8 values in widget-binding order: state df, target dropdown
    update, feature-checkbox update, raw preview, summary markdown,
    processed preview, model output (cleared), help text (cleared /
    error message on failure).
    """
    raw_df, _, raw_preview, summary_text, load_error = load_data(file)

    # Loading failed: blank out every selector and surface the error
    # in the help-box slot.
    if raw_df is None:
        empty_dropdown = gr.update(choices=[], value=None)
        empty_checkboxes = gr.update(choices=[], value=[])
        return None, empty_dropdown, empty_checkboxes, pd.DataFrame(), "", "", "", load_error

    # Derive optional columns, then offer every resulting column as a
    # candidate target/feature.
    processed = preprocess_dataframe(raw_df, quantile_binning, count_words)
    column_names = list(processed.columns)

    return (
        processed,                                   # keep processed df in state
        gr.update(choices=column_names, value=None), # refresh target dropdown
        gr.update(choices=column_names, value=[]),   # refresh feature checkboxes
        raw_preview,
        summary_text,
        processed.head(100),                         # processed-data preview
        "",                                          # clear model output
        "",                                          # clear help text
    )
153
+
154
+ # Handle training the model
155
def on_train(df, target, features):
    """Validate the UI selections, then delegate training to train_model."""
    # Gate on upload first, then on column selection, before training.
    if df is None:
        return "⚠️ Please upload a file first."
    selection_missing = target is None or not features
    if selection_missing:
        return "⚠️ Please select target and feature columns."
    return train_model(df, target, features)
161
+
162
+ # Clear all interface elements
163
def on_clear():
    """Reset every widget back to its initial (empty) state.

    Returns 8 values in widget-binding order: state df, target selection,
    feature selection, raw preview, summary text, processed preview,
    model output, and the default help message.
    """
    return (
        None,            # df_state: forget the loaded data
        None,            # target_col: no selection
        [],              # feature_cols: no selection
        pd.DataFrame(),  # original preview: empty table
        "",              # data summary: empty
        pd.DataFrame(),  # processed preview: empty table
        "",              # model output: empty
        "βœ”οΈ Upload a dataset, choose preprocessing options, then train.",
    )
174
+
175
+ # ----------- Connect Actions to Widgets -----------
176
 
 
177
  file_input.change(
178
  fn=on_file_change,
179
  inputs=[file_input, quantile_option, wordcount_option],
 
185
  data_summary,
186
  processed_preview,
187
  output,
188
+ help_box
189
  ]
190
  )
191
 
192
+ train_button.click(
193
+ fn=on_train,
 
194
  inputs=[df_state, target_col, feature_cols],
195
+ outputs=output
196
  )
197
 
198
+ clear_button.click(
199
+ fn=on_clear,
200
+ inputs=[],
201
+ outputs=[
202
+ df_state,
203
+ target_col,
204
+ feature_cols,
205
+ table_preview,
206
+ data_summary,
207
+ processed_preview,
208
+ output,
209
+ help_box
210
+ ]
211
+ )
212
+
213
+ # ----------- Launch the App -----------
214
+
215
  if __name__ == "__main__":
216
  demo.launch(share=True)