abersbail committed on
Commit
ca86eea
·
verified ·
1 Parent(s): 0d0ff9b

Add predictive ML workbench Space

Browse files
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: Predictive Ml Workbench
3
- emoji: 🦀
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.10.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
  ---
2
+ title: Predictive ML Workbench
3
+ colorFrom: green
4
+ colorTo: blue
 
5
  sdk: gradio
 
6
  app_file: app.py
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # Predictive ML Workbench
12
+
13
+ An advanced scikit-learn project that runs on free CPU hardware and covers regression, classification, clustering,
14
+ dimensionality reduction, preprocessing, model selection, and evaluation.
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from predictive_ml_workbench.service import PredictiveMLWorkbenchService
4
+
5
+
6
+ service = PredictiveMLWorkbenchService()
7
+
8
+
9
def run_workbench(file_obj, workflow, target_column, test_size, cv_folds, max_clusters):
    """Forward the form values to the shared workbench service.

    ``file_obj`` may be an object exposing a ``name`` attribute or any other
    value (for example a plain path or ``None``); when the attribute exists it
    is used as the CSV path, otherwise the value itself is passed through.
    The numeric controls are coerced to ``float``/``int`` before the call.

    Returns whatever ``service.run`` returns.
    """
    csv_location = getattr(file_obj, "name", file_obj)
    split_fraction = float(test_size)
    fold_count = int(cv_folds)
    cluster_cap = int(max_clusters)
    return service.run(
        csv_path=csv_location,
        workflow=workflow,
        target_column=target_column,
        test_size=split_fraction,
        cv_folds=fold_count,
        max_clusters=cluster_cap,
    )
19
+
20
+
21
# Build the single-page UI. Components are created in the same order as before:
# intro text, scope accordion, upload, workflow controls, run button, outputs.
demo = gr.Blocks(
    title="Predictive ML Workbench",
    theme=gr.themes.Soft(primary_hue="green", secondary_hue="blue"),
)

with demo:
    # Headline and short description of the app.
    gr.Markdown(
        """
        # Predictive ML Workbench
        Upload a CSV dataset and run end-to-end machine learning workflows for regression,
        classification, clustering, dimensionality reduction, preprocessing, model selection,
        and evaluation.
        """
    )

    # Collapsed-by-default list of the covered topics.
    with gr.Accordion("What this project covers", open=False):
        gr.Markdown(
            """
            - Regression
            - Classification
            - Clustering
            - Dimensionality reduction
            - Preprocessing with numeric and categorical pipelines
            - Model selection with cross-validation
            - Evaluation with workflow-specific metrics and plots
            """
        )

    dataset_input = gr.File(label="CSV Dataset", file_types=[".csv"])

    # Workflow choice and (optional for unsupervised workflows) target column.
    with gr.Row():
        workflow_input = gr.Dropdown(
            choices=["Classification", "Regression", "Clustering", "Dimensionality Reduction"],
            value="Classification",
            label="Workflow",
        )
        target_input = gr.Textbox(
            label="Target Column",
            placeholder="Required for regression/classification; optional otherwise",
        )

    # Numeric knobs passed straight to the service.
    with gr.Row():
        test_size_input = gr.Slider(minimum=0.1, maximum=0.4, value=0.2, step=0.05, label="Test Split")
        cv_input = gr.Slider(minimum=2, maximum=5, value=3, step=1, label="CV Folds")
        cluster_input = gr.Slider(minimum=3, maximum=8, value=6, step=1, label="Max Clusters")

    run_button = gr.Button("Run Workflow", variant="primary")

    # Output widgets, in the same order as the tuple returned by run_workbench.
    model_output = gr.Textbox(label="Selected Model / Method", lines=2)
    metrics_output = gr.Textbox(label="Metrics", lines=10)
    preview_output = gr.Textbox(label="Data Preview", lines=10)
    plot_output = gr.Plot(label="Visualization")
    status_output = gr.Textbox(label="Status", lines=3)

    form_inputs = [
        dataset_input,
        workflow_input,
        target_input,
        test_size_input,
        cv_input,
        cluster_input,
    ]
    result_outputs = [
        model_output,
        metrics_output,
        preview_output,
        plot_output,
        status_output,
    ]
    run_button.click(fn=run_workbench, inputs=form_inputs, outputs=result_outputs)


if __name__ == "__main__":
    demo.launch()
predictive_ml_workbench/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .service import PredictiveMLWorkbenchService
2
+
3
+ __all__ = ["PredictiveMLWorkbenchService"]
predictive_ml_workbench/service.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.base import clone
7
+ from sklearn.cluster import AgglomerativeClustering, KMeans
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
11
+ from sklearn.impute import SimpleImputer
12
+ from sklearn.linear_model import LogisticRegression, Ridge
13
+ from sklearn.metrics import (
14
+ accuracy_score,
15
+ confusion_matrix,
16
+ f1_score,
17
+ mean_absolute_error,
18
+ mean_squared_error,
19
+ r2_score,
20
+ silhouette_score,
21
+ )
22
+ from sklearn.model_selection import GridSearchCV, train_test_split
23
+ from sklearn.pipeline import Pipeline
24
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
25
+
26
+
27
+ os.environ.setdefault("LOKY_MAX_CPU_COUNT", "2")
28
+
29
+
30
+ class PredictiveMLWorkbenchService:
31
def run(self, csv_path, workflow, target_column, test_size, cv_folds, max_clusters):
    """Load a CSV file and execute the requested workflow.

    Args:
        csv_path: Path of the CSV file to read; a falsy value means nothing
            was uploaded.
        workflow: "Classification", "Regression" or "Clustering"; any other
            value runs the dimensionality-reduction workflow.
        target_column: Name of the target column (may be empty for the
            unsupervised workflows).
        test_size: Hold-out fraction for the supervised workflows.
        cv_folds: Number of cross-validation folds for model selection.
        max_clusters: Largest cluster count tried by the clustering workflow.

    Returns:
        A 5-tuple ``(model name, metrics text, preview text, figure or None,
        status message)``. Failures never raise: they are reported through the
        status message with the other fields left empty.
    """
    if not csv_path:
        return "", "", "", None, "Upload a CSV file first."

    try:
        df = pd.read_csv(csv_path)
    except Exception as exc:
        return "", "", "", None, f"Could not read CSV: {type(exc).__name__}: {exc}"

    if df.empty:
        return "", "", "", None, "Dataset is empty."

    # Forgive whitespace typed around the column name, but only when the raw
    # text is not itself an existing column (so odd-but-real names still work).
    if isinstance(target_column, str) and target_column not in df.columns:
        target_column = target_column.strip()

    try:
        if workflow == "Classification":
            result = self._run_classification(df, target_column, test_size, cv_folds)
        elif workflow == "Regression":
            result = self._run_regression(df, target_column, test_size, cv_folds)
        elif workflow == "Clustering":
            result = self._run_clustering(df, target_column, max_clusters)
        else:
            result = self._run_dimensionality_reduction(df, target_column)
    except Exception as exc:
        # Top-level boundary: surface any workflow error in the status field.
        return "", "", "", None, f"Workflow failed: {type(exc).__name__}: {exc}"

    # The plot helpers create figures through pyplot, which keeps each one
    # registered until it is closed. Deregister it here so repeated runs do
    # not accumulate figures; the Figure object itself stays usable.
    figure = result[3]
    if figure is not None:
        plt.close(figure)
    return result
53
+
54
def _run_classification(self, df, target_column, test_size, cv_folds):
    """Select and evaluate a classifier for ``target_column``.

    Three candidate models are grid-searched (macro-F1 scoring) on a training
    split; the winner is scored on the held-out split.

    Args:
        df: Full dataset including the target column.
        target_column: Name of the column to predict.
        test_size: Hold-out fraction passed to ``train_test_split``.
        cv_folds: Number of cross-validation folds used by the grid search.

    Returns:
        ``(best model name, metrics text, feature preview text, confusion
        matrix figure, status message)``.

    Raises:
        ValueError: Propagated from ``_supervised_split`` when the target
            column is missing or invalid.
    """
    x, y = self._supervised_split(df, target_column)
    preprocessor = self._build_preprocessor(x)

    candidates = [
        (
            "LogisticRegression",
            LogisticRegression(max_iter=600),
            {"model__C": [0.5, 1.0, 2.0]},
        ),
        (
            "RandomForestClassifier",
            RandomForestClassifier(random_state=42),
            {"model__n_estimators": [120, 220], "model__max_depth": [None, 8]},
        ),
        (
            "GradientBoostingClassifier",
            GradientBoostingClassifier(random_state=42),
            {"model__n_estimators": [80, 140], "model__learning_rate": [0.05, 0.1]},
        ),
    ]

    # A stratified split needs at least two rows of every class; with a
    # singleton class train_test_split raises, so fall back to a plain
    # shuffled split instead of failing the whole workflow.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=42,
        stratify=y if can_stratify else None,
    )

    best_name, best_search = self._select_model(
        candidates=candidates,
        preprocessor=preprocessor,
        x_train=x_train,
        y_train=y_train,
        cv_folds=cv_folds,
        scoring="f1_macro",
    )

    preds = best_search.best_estimator_.predict(x_test)
    acc = accuracy_score(y_test, preds)
    macro_f1 = f1_score(y_test, preds, average="macro")
    metrics = "\n".join(
        [
            f"Accuracy: {acc:.4f}",
            f"Macro F1: {macro_f1:.4f}",
            f"CV Best Score: {best_search.best_score_:.4f}",
            f"Train Rows: {len(x_train)}",
            f"Test Rows: {len(x_test)}",
            f"Classes: {y.nunique()}",
            f"Best Params: {best_search.best_params_}",
        ]
    )

    fig = self._plot_confusion_matrix(y_test, preds)
    preview = x.head(8).to_string(index=False)
    status = "Completed end-to-end classification workflow with preprocessing, model selection, and evaluation."
    return best_name, metrics, preview, fig, status
112
+
113
def _run_regression(self, df, target_column, test_size, cv_folds):
    """Select and evaluate a regressor for a numeric ``target_column``.

    Grid-searches three candidate models (R2 scoring) on a training split and
    reports R2, MAE and RMSE of the winner on the held-out split.

    Returns:
        ``(best model name, metrics text, feature preview text, actual-vs-
        predicted figure, status message)``.

    Raises:
        ValueError: If the target column is not numeric, or propagated from
            ``_supervised_split``.
    """
    features, target = self._supervised_split(df, target_column)
    if not pd.api.types.is_numeric_dtype(target):
        raise ValueError("Regression target column must be numeric.")

    preprocessor = self._build_preprocessor(features)

    ridge_grid = {"model__alpha": [0.5, 1.0, 2.0, 5.0]}
    forest_grid = {"model__n_estimators": [120, 220], "model__max_depth": [None, 8]}
    boosting_grid = {"model__n_estimators": [80, 140], "model__learning_rate": [0.05, 0.1]}
    contenders = [
        ("Ridge", Ridge(), ridge_grid),
        ("RandomForestRegressor", RandomForestRegressor(random_state=42), forest_grid),
        ("GradientBoostingRegressor", GradientBoostingRegressor(random_state=42), boosting_grid),
    ]

    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=42,
    )

    winner_name, winner = self._select_model(
        candidates=contenders,
        preprocessor=preprocessor,
        x_train=train_x,
        y_train=train_y,
        cv_folds=cv_folds,
        scoring="r2",
    )

    predicted = winner.best_estimator_.predict(test_x)
    r2_value = r2_score(test_y, predicted)
    mae_value = mean_absolute_error(test_y, predicted)
    rmse_value = float(np.sqrt(mean_squared_error(test_y, predicted)))

    report_lines = [
        f"R2: {r2_value:.4f}",
        f"MAE: {mae_value:.4f}",
        f"RMSE: {rmse_value:.4f}",
        f"CV Best Score: {winner.best_score_:.4f}",
        f"Train Rows: {len(train_x)}",
        f"Test Rows: {len(test_x)}",
        f"Best Params: {winner.best_params_}",
    ]
    metrics = "\n".join(report_lines)

    figure = self._plot_regression_scatter(test_y, predicted)
    preview = features.head(8).to_string(index=False)
    status = "Completed end-to-end regression workflow with preprocessing, model selection, and evaluation."
    return winner_name, metrics, preview, figure, status
173
+
174
def _run_clustering(self, df, target_column, max_clusters):
    """Pick the best clustering (algorithm and cluster count) by silhouette score.

    KMeans and agglomerative clustering are tried for every cluster count from
    2 up to ``max_clusters`` on the preprocessed features (target column
    excluded when given and present).

    Args:
        df: Full dataset.
        target_column: Optional column to leave out of the features.
        max_clusters: Largest cluster count to try.

    Returns:
        ``(algorithm name, metrics text, feature preview text, 2-D scatter
        figure, status message)``.

    Raises:
        ValueError: If there is no feature column, or no cluster count
            produced a valid result (for example fewer than three rows).
    """
    x = df.copy()
    if target_column and target_column in x.columns:
        x = x.drop(columns=[target_column])
    if x.shape[1] == 0:
        raise ValueError("Dataset needs at least one feature column.")

    preprocessor = self._build_preprocessor(x)
    transformed = np.asarray(preprocessor.fit_transform(x))

    # Cap the work on large files. NOTE: this keeps the first rows only, so a
    # file sorted by some attribute is clustered on that slice.
    max_rows = 1200
    sample = transformed[:max_rows] if transformed.shape[0] > max_rows else transformed

    # The silhouette score is only defined for 2 .. n_samples - 1 distinct
    # labels, and neither algorithm can form more clusters than rows.
    upper = min(max_clusters, sample.shape[0] - 1)

    best = None
    best_labels = None
    for n_clusters in range(2, upper + 1):
        for name, estimator in [
            ("KMeans", KMeans(n_clusters=n_clusters, random_state=42, n_init=10)),
            ("AgglomerativeClustering", AgglomerativeClustering(n_clusters=n_clusters)),
        ]:
            labels = estimator.fit_predict(sample)
            n_found = len(np.unique(labels))
            # Skip degenerate results the silhouette score cannot rate.
            if n_found < 2 or n_found > sample.shape[0] - 1:
                continue
            score = silhouette_score(sample, labels)
            if best is None or score > best["score"]:
                best = {"name": name, "clusters": n_clusters, "score": score}
                best_labels = labels

    if best is None:
        raise ValueError("Could not produce a valid clustering result.")

    # Project to at most two components for plotting; with a single feature
    # pad a zero second axis so the scatter helper still gets two columns.
    n_components = min(2, sample.shape[1])
    reduced = PCA(n_components=n_components, random_state=42).fit_transform(sample)
    if reduced.shape[1] == 1:
        reduced = np.column_stack([reduced[:, 0], np.zeros(reduced.shape[0])])

    metrics = "\n".join(
        [
            f"Algorithm: {best['name']}",
            f"Clusters: {best['clusters']}",
            f"Silhouette Score: {best['score']:.4f}",
            f"Rows Used: {sample.shape[0]}",
            f"Features After Preprocessing: {sample.shape[1]}",
        ]
    )
    fig = self._plot_cluster_scatter(reduced, best_labels, title=f"{best['name']} clustering")
    preview = x.head(8).to_string(index=False)
    status = "Completed clustering workflow with preprocessing, model selection across algorithms, and evaluation."
    return best["name"], metrics, preview, fig, status
219
+
220
def _run_dimensionality_reduction(self, df, target_column):
    """Project the preprocessed features with PCA and summarise the result.

    When ``target_column`` is given and present it is removed from the
    features and its string values are used to label the scatter plot. Two
    components are computed unless preprocessing yields fewer than two
    features, in which case one is used.

    Returns:
        ``("PCA", metrics text, preview of the projected rows, scatter figure,
        status message)``.
    """
    has_target = bool(target_column) and target_column in df.columns
    if has_target:
        labels = df[target_column].astype(str)
        x = df.drop(columns=[target_column])
    else:
        labels = None
        x = df.copy()

    transformed = np.asarray(self._build_preprocessor(x).fit_transform(x))
    row_count, feature_count = transformed.shape[0], transformed.shape[1]

    n_components = 1 if feature_count < 2 else 2
    pca = PCA(n_components=n_components, random_state=42)
    reduced = pca.fit_transform(transformed)

    explained = pca.explained_variance_ratio_
    per_component = ", ".join(f"{v:.4f}" for v in explained)
    report_lines = [
        "Method: PCA",
        f"Components: {n_components}",
        f"Explained Variance: {per_component}",
        f"Cumulative Variance: {explained.sum():.4f}",
        f"Rows: {row_count}",
        f"Features After Preprocessing: {feature_count}",
    ]
    metrics = "\n".join(report_lines)

    fig = self._plot_pca_scatter(reduced, labels)
    component_names = [f"PC{number}" for number in range(1, n_components + 1)]
    preview = pd.DataFrame(reduced, columns=component_names).head(8).to_string(index=False)
    status = "Completed dimensionality reduction workflow with preprocessing and PCA evaluation."
    return "PCA", metrics, preview, fig, status
252
+
253
def _build_preprocessor(self, x):
    """Return an unfitted ``ColumnTransformer`` for the columns of ``x``.

    Columns of numeric dtype are median-imputed and standardised; every other
    column is imputed with its most frequent value and one-hot encoded
    (unknown categories ignored, dense output).
    """
    numeric_cols = list(x.select_dtypes(include=["number"]).columns)
    categorical_cols = [name for name in x.columns if name not in numeric_cols]

    impute_and_scale = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    impute_and_encode = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )

    column_steps = [
        ("num", impute_and_scale, numeric_cols),
        ("cat", impute_and_encode, categorical_cols),
    ]
    return ColumnTransformer(transformers=column_steps, remainder="drop")
278
+
279
def _supervised_split(self, df, target_column):
    """Split ``df`` into a feature frame and a target series.

    Rows whose target value is missing are dropped, because they cannot be
    used for training or scoring.

    Args:
        df: Full dataset.
        target_column: Name of the column to predict.

    Returns:
        ``(x, y)``: the remaining columns and the target column, restricted
        to rows with a non-missing target (original index labels are kept).

    Raises:
        ValueError: If no target column is given, it is not in ``df``, there
            is no feature column, or every target value is missing.
    """
    if not target_column:
        raise ValueError("Target column is required for supervised workflows.")
    if target_column not in df.columns:
        raise ValueError(f"Target column `{target_column}` was not found.")
    if df.drop(columns=[target_column]).shape[1] == 0:
        raise ValueError("Dataset needs at least one feature column.")

    # Keep only rows with a usable target value.
    labelled = df.loc[df[target_column].notna()]
    if labelled.shape[0] == 0:
        raise ValueError(f"Target column `{target_column}` has no non-missing values.")

    x = labelled.drop(columns=[target_column])
    y = labelled[target_column]
    return x, y
290
+
291
def _select_model(self, candidates, preprocessor, x_train, y_train, cv_folds, scoring):
    """Grid-search every candidate and return the best-scoring search.

    Each candidate is a ``(name, estimator, param_grid)`` triple. The
    estimator is wrapped in a pipeline behind a fresh clone of
    ``preprocessor`` and tuned with ``GridSearchCV``. The first candidate
    always becomes the incumbent; a later one replaces it only with a
    strictly higher ``best_score_``.

    Returns:
        ``(name, fitted GridSearchCV)`` of the winner, or ``(None, None)``
        when ``candidates`` is empty.
    """
    winner_name, winner = None, None

    for label, model, grid in candidates:
        search = GridSearchCV(
            estimator=Pipeline(
                steps=[
                    ("preprocessor", clone(preprocessor)),
                    ("model", model),
                ]
            ),
            param_grid=grid,
            cv=cv_folds,
            scoring=scoring,
            n_jobs=1,
        )
        search.fit(x_train, y_train)

        # Keep the incumbent unless this search strictly beats it.
        if winner is not None and not (search.best_score_ > winner.best_score_):
            continue
        winner_name, winner = label, search

    return winner_name, winner
315
+
316
def _plot_confusion_matrix(self, y_true, y_pred):
    """Draw an annotated confusion matrix and return the figure.

    The axis labels are the sorted union of the values in ``y_true`` and
    ``y_pred``. Each cell shows its count; the text is white on cells above
    half of the largest count and black otherwise, so it stays readable on
    both ends of the "Blues" colormap.
    """
    fig, ax = plt.subplots(figsize=(5.5, 4.5))
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    im = ax.imshow(matrix, cmap="Blues")
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)

    # Larger counts are drawn with darker blue; switch the text colour there.
    threshold = matrix.max() / 2.0 if matrix.size else 0.0
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            text_color = "white" if matrix[i, j] > threshold else "black"
            ax.text(j, i, str(matrix[i, j]), ha="center", va="center", color=text_color)

    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    return fig
334
+
335
def _plot_regression_scatter(self, y_true, y_pred):
    """Return an actual-vs-predicted scatter figure with a dashed identity line."""
    fig, ax = plt.subplots(figsize=(5.5, 4.5))
    ax.scatter(y_true, y_pred, alpha=0.75)

    # The reference diagonal spans the combined range of both series.
    lowest = min(np.min(y_true), np.min(y_pred))
    highest = max(np.max(y_true), np.max(y_pred))
    diagonal = [lowest, highest]
    ax.plot(diagonal, diagonal, linestyle="--", color="red")

    ax.set_title("Actual vs Predicted")
    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    fig.tight_layout()
    return fig
346
+
347
def _plot_cluster_scatter(self, reduced, labels, title):
    """Return a scatter figure of the first two columns of ``reduced``.

    Points are coloured by ``labels`` using the "tab10" colormap, and a
    colorbar for those values is attached to the axes.
    """
    fig, ax = plt.subplots(figsize=(5.5, 4.5))
    first_axis = reduced[:, 0]
    second_axis = reduced[:, 1]
    points = ax.scatter(first_axis, second_axis, c=labels, cmap="tab10", alpha=0.85)
    ax.set_title(title)
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    fig.colorbar(points, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    return fig
356
+
357
def _plot_pca_scatter(self, reduced, labels):
    """Return a scatter figure of the PCA projection.

    Args:
        reduced: Projected data with one or two columns.
        labels: Optional per-row labels (same length and order as
            ``reduced``), or ``None``.

    With a single component the points are drawn along a zero axis and
    ``labels`` is not used. With two components and labels, the first eight
    labels (sorted as strings) each get their own legend entry; rows with any
    other label are drawn in light grey under a single "other" entry instead
    of being left out of the plot.
    """
    max_legend_entries = 8
    fig, ax = plt.subplots(figsize=(5.5, 4.5))
    if reduced.shape[1] == 1:
        ax.scatter(reduced[:, 0], np.zeros_like(reduced[:, 0]), alpha=0.75)
        ax.set_ylabel("Zero Axis")
    else:
        if labels is not None:
            # Work on a plain array so masks are positional, not index-based.
            label_values = pd.Series(labels).astype(str).to_numpy()
            shown = sorted(set(label_values))[:max_legend_entries]
            for label in shown:
                mask = label_values == label
                ax.scatter(reduced[mask, 0], reduced[mask, 1], alpha=0.75, label=label)
            remainder = ~np.isin(label_values, shown)
            if remainder.any():
                ax.scatter(
                    reduced[remainder, 0],
                    reduced[remainder, 1],
                    alpha=0.4,
                    color="lightgray",
                    label="other",
                )
            ax.legend(loc="best", fontsize=8)
        else:
            ax.scatter(reduced[:, 0], reduced[:, 1], alpha=0.75)
        ax.set_ylabel("PC2")
    ax.set_title("PCA Projection")
    ax.set_xlabel("PC1")
    fig.tight_layout()
    return fig
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=5.23.0
2
+ matplotlib>=3.10.1
3
+ numpy>=2.1.0
4
+ pandas>=2.2.3
5
+ scikit-learn>=1.6.1