clementBE committed on
Commit
d38505c
·
verified ·
1 Parent(s): 23782e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -61
app.py CHANGED
@@ -1,15 +1,18 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
  from sklearn.model_selection import train_test_split
5
  from sklearn.ensemble import RandomForestClassifier
6
  from sklearn.metrics import classification_report
7
 
8
- # Global variables to store data
9
  original_df = None
10
  processed_df = None
 
 
11
 
12
- # STEP 1: Load data from file (CSV or Excel)
13
  def load_data(file):
14
  global original_df
15
  try:
@@ -17,128 +20,126 @@ def load_data(file):
17
  original_df = pd.read_csv(file)
18
  else:
19
  original_df = pd.read_excel(file)
20
- return original_df.head(10), "βœ… File loaded successfully."
 
 
 
 
21
  except Exception as e:
22
- return pd.DataFrame(), f"❌ Error loading file: {e}"
23
 
24
- # STEP 2: Process data
25
- # - Discretize numeric columns into quartiles (4 bins) and deciles (10 bins)
26
- # - Count words in text columns
27
  def process_data():
28
  global original_df, processed_df
29
-
30
  if original_df is None:
31
- return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[]), "⚠️ Please load a dataset first."
32
-
33
  df = original_df.copy()
34
-
35
- # Discretize numeric columns into quartiles
36
  for col in df.select_dtypes(include=np.number).columns:
37
  try:
38
  df[col + "_qbin"] = pd.qcut(df[col], 4, labels=False, duplicates='drop')
39
  except Exception:
40
- pass # skip if not suitable for binning
41
-
42
- # Discretize numeric columns into deciles
43
  for col in df.select_dtypes(include=np.number).columns:
44
  try:
45
  df[col + "_decil"] = pd.qcut(df[col], 10, labels=False, duplicates='drop')
46
  except Exception:
47
- pass # skip if not suitable for binning
48
-
49
- # Add word count for text/object columns
50
  for col in df.select_dtypes(include='object').columns:
51
  df[col + "_wordcount"] = df[col].astype(str).apply(lambda x: len(x.split()))
52
-
53
  processed_df = df.copy()
54
-
55
- # Update dropdown choices with all columns including new ones
56
  all_columns = df.columns.tolist()
57
-
58
- return (
59
- df.head(10),
60
- gr.update(choices=all_columns),
61
- gr.update(choices=all_columns),
62
- "βœ… Data processed: quartiles, deciles, and word counts added."
63
  )
 
64
 
65
- # STEP 3: Train model
66
- # - Select target and features from dropdown and checkbox group
67
- # - Train RandomForestClassifier and show classification report
68
  def train_model(target_col, feature_cols):
69
- global processed_df
70
-
71
  if processed_df is None:
72
- return "⚠️ Please process your data first."
73
-
74
  if not target_col or not feature_cols:
75
- return "⚠️ Please select a target column and at least one feature."
76
-
77
  try:
78
  X = processed_df[feature_cols]
79
  y = processed_df[target_col]
80
-
81
- # Convert categorical variables into dummy/indicator variables
82
  X = pd.get_dummies(X)
83
-
84
- # Split data into train and test sets
85
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
86
-
87
- # Train Random Forest classifier
88
  clf = RandomForestClassifier(random_state=42)
89
  clf.fit(X_train, y_train)
90
-
91
- # Predict on test set
92
  y_pred = clf.predict(X_test)
93
-
94
- # Generate classification report
95
  report = classification_report(y_test, y_pred)
96
- return report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  except Exception as e:
98
- return f"❌ Model training failed: {e}"
99
 
100
- # Build Gradio Interface
101
- with gr.Blocks(title="Step-by-Step Model Trainer with Deciles & Word Count") as app:
102
- gr.Markdown("## 🧠 Step-by-Step Model Trainer\nUpload your data, process it (discretize & count words), then train a model.")
103
 
104
- # Step 1: File upload
105
  with gr.Row():
106
  file_input = gr.File(label="πŸ“ Upload CSV or Excel file")
107
  load_status = gr.Textbox(label="ℹ️ File Load Status", interactive=False)
108
 
109
  original_preview = gr.DataFrame(label="πŸ” Original Data Preview (first 10 rows)")
 
110
 
111
- # Step 2: Process data
112
- process_button = gr.Button("βš™οΈ Process Data (Discretize & Word Count)")
113
  processed_preview = gr.DataFrame(label="πŸ”¬ Processed Data Preview (first 10 rows)")
114
  process_status = gr.Textbox(label="ℹ️ Process Status", interactive=False)
 
115
 
116
- # Step 3: Select target and features for model training
117
  target_selector = gr.Dropdown(label="🎯 Select Target Column", choices=[])
118
  feature_selector = gr.CheckboxGroup(label="πŸ“Š Select Feature Columns", choices=[])
119
 
120
- # Step 4: Train model
121
  train_button = gr.Button("πŸš€ Train Model")
122
  train_output = gr.Textbox(label="πŸ“ˆ Classification Report", lines=10)
 
 
123
 
124
- # Events & callbacks
125
  file_input.change(
126
  fn=load_data,
127
  inputs=[file_input],
128
- outputs=[original_preview, load_status]
129
  )
130
 
131
  process_button.click(
132
  fn=process_data,
133
  inputs=[],
134
- outputs=[processed_preview, target_selector, feature_selector, process_status]
135
  )
136
 
137
  train_button.click(
138
  fn=train_model,
139
  inputs=[target_selector, feature_selector],
140
- outputs=[train_output]
141
  )
142
 
143
- # Launch app
144
  app.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ from io import BytesIO
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.ensemble import RandomForestClassifier
9
  from sklearn.metrics import classification_report
10
 
 
11
  original_df = None
12
  processed_df = None
13
+ trained_model = None
14
+ processed_X_columns = None # Keep processed features list for importances
15
 
 
16
  def load_data(file):
17
  global original_df
18
  try:
 
20
  original_df = pd.read_csv(file)
21
  else:
22
  original_df = pd.read_excel(file)
23
+ help_text = (
24
+ "Step 1: Data loaded successfully! Here you see a preview of the first 10 rows.\n"
25
+ "Next, click 'Process Data' to discretize numeric columns and add word counts."
26
+ )
27
+ return original_df.head(10), "βœ… File loaded successfully.", help_text
28
  except Exception as e:
29
+ return pd.DataFrame(), f"❌ Error loading file: {e}", "Please upload a valid CSV or Excel file."
30
 
 
 
 
31
  def process_data():
32
  global original_df, processed_df
 
33
  if original_df is None:
34
+ return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[]), "⚠️ Please load a dataset first.", ""
 
35
  df = original_df.copy()
36
+ # Quartiles
 
37
  for col in df.select_dtypes(include=np.number).columns:
38
  try:
39
  df[col + "_qbin"] = pd.qcut(df[col], 4, labels=False, duplicates='drop')
40
  except Exception:
41
+ pass
42
+ # Deciles
 
43
  for col in df.select_dtypes(include=np.number).columns:
44
  try:
45
  df[col + "_decil"] = pd.qcut(df[col], 10, labels=False, duplicates='drop')
46
  except Exception:
47
+ pass
48
+ # Word counts
 
49
  for col in df.select_dtypes(include='object').columns:
50
  df[col + "_wordcount"] = df[col].astype(str).apply(lambda x: len(x.split()))
 
51
  processed_df = df.copy()
 
 
52
  all_columns = df.columns.tolist()
53
+ help_text = (
54
+ "Step 2: Data processed!\n"
55
+ "- Numeric columns discretized into quartiles and deciles.\n"
56
+ "- Word counts added for text columns.\n"
57
+ "You can now select your target and feature columns."
 
58
  )
59
+ return df.head(10), gr.update(choices=all_columns), gr.update(choices=all_columns), "βœ… Data processed.", help_text
60
 
 
 
 
61
  def train_model(target_col, feature_cols):
62
+ global processed_df, trained_model, processed_X_columns
 
63
  if processed_df is None:
64
+ return "⚠️ Please process your data first.", None, ""
 
65
  if not target_col or not feature_cols:
66
+ return "⚠️ Please select a target and at least one feature.", None, ""
 
67
  try:
68
  X = processed_df[feature_cols]
69
  y = processed_df[target_col]
 
 
70
  X = pd.get_dummies(X)
71
+ processed_X_columns = X.columns.tolist()
 
72
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
 
73
  clf = RandomForestClassifier(random_state=42)
74
  clf.fit(X_train, y_train)
75
+ trained_model = clf
 
76
  y_pred = clf.predict(X_test)
 
 
77
  report = classification_report(y_test, y_pred)
78
+
79
+ help_text = (
80
+ "Step 3: Model trained!\n"
81
+ "- Classification report shows precision, recall, f1-score per class.\n"
82
+ "- Below is a heatmap of feature importances to help interpret what features influenced the model most."
83
+ )
84
+ # Create heatmap plot and return as image
85
+ fi = clf.feature_importances_
86
+ fi_df = pd.DataFrame({'Feature': processed_X_columns, 'Importance': fi})
87
+ fi_df = fi_df.sort_values(by='Importance', ascending=False).head(20) # Top 20 features for clarity
88
+
89
+ plt.figure(figsize=(8,6))
90
+ sns.heatmap(fi_df.set_index('Feature').T, annot=True, cmap="YlGnBu", cbar_kws={'label': 'Feature Importance'})
91
+ plt.title("Feature Importances Heatmap (Top 20)")
92
+ plt.tight_layout()
93
+
94
+ buf = BytesIO()
95
+ plt.savefig(buf, format="png")
96
+ plt.close()
97
+ buf.seek(0)
98
+
99
+ return report, buf.read(), help_text
100
  except Exception as e:
101
+ return f"❌ Model training failed: {e}", None, ""
102
 
103
# User interface: one section per workflow step, each with its own help box.
with gr.Blocks(title="Step-by-Step Model Trainer with Help and Heatmap") as app:
    gr.Markdown("## 🧠 Step-by-Step Model Trainer\nUpload your data, process it, train a model, and get help at each step!")

    # --- Step 1: upload ---
    with gr.Row():
        file_input = gr.File(label="πŸ“ Upload CSV or Excel file")
        load_status = gr.Textbox(label="ℹ️ File Load Status", interactive=False)

    original_preview = gr.DataFrame(label="πŸ” Original Data Preview (first 10 rows)")
    load_help = gr.Textbox(label="πŸ“– Step 1 Help", interactive=False)

    # --- Step 2: derive columns ---
    process_button = gr.Button("βš™οΈ Process Data")
    processed_preview = gr.DataFrame(label="πŸ”¬ Processed Data Preview (first 10 rows)")
    process_status = gr.Textbox(label="ℹ️ Process Status", interactive=False)
    process_help = gr.Textbox(label="πŸ“– Step 2 Help", interactive=False)

    # --- Step 3: choose columns and train ---
    target_selector = gr.Dropdown(label="🎯 Select Target Column", choices=[])
    feature_selector = gr.CheckboxGroup(label="πŸ“Š Select Feature Columns", choices=[])

    train_button = gr.Button("πŸš€ Train Model")
    train_output = gr.Textbox(label="πŸ“ˆ Classification Report", lines=10)
    train_help = gr.Textbox(label="πŸ“– Step 3 Help", interactive=False)
    heatmap_img = gr.Image(label="πŸ”₯ Feature Importances Heatmap")

    # Event wiring: the order of each outputs list must match the order of
    # the values returned by the corresponding callback.
    file_input.change(
        load_data,
        inputs=[file_input],
        outputs=[original_preview, load_status, load_help],
    )
    process_button.click(
        process_data,
        inputs=[],
        outputs=[processed_preview, target_selector, feature_selector, process_status, process_help],
    )
    train_button.click(
        train_model,
        inputs=[target_selector, feature_selector],
        outputs=[train_output, heatmap_img, train_help],
    )

app.launch()