nodronm committed on
Commit
2e6ad36
·
verified ·
1 Parent(s): ce4f969

Update aap.py

Browse files
Files changed (1) hide show
  1. aap.py +64 -433
aap.py CHANGED
@@ -1,440 +1,71 @@
1
- # -*- coding: utf-8 -*-
2
- """AAP.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1rxnN6J5ojM0HFXh5HxHo9AF4oOfq_fwM
8
- """
9
-
10
- import pandas as pd
11
  import numpy as np
12
- from sklearn.impute import SimpleImputer
13
- from sklearn.model_selection import train_test_split
14
- from sklearn.compose import ColumnTransformer
15
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
16
- from sklearn.pipeline import Pipeline
17
- from xgboost import XGBRegressor
18
-
19
- try:
20
- # Google Colab: upload via picker
21
- from google.colab import files
22
- uploaded = files.upload() # select minimal_messy_task_performance.csv
23
- import io
24
- df = pd.read_csv(io.BytesIO(uploaded['dataset.csv']))
25
- except ModuleNotFoundError:
26
- df = pd.read_csv('dataset.csv')
27
-
28
- df.shape
29
-
30
- df.head()
31
-
32
- df['Team'] = df['Team'].str.lower().fillna('team_unknown')
33
-
34
- imp = SimpleImputer(strategy='median')
35
- df['ErrorRate'] = imp.fit_transform(df[['ErrorRate']])
36
-
37
- df = df[df['ProductivityScore'] > 0].reset_index(drop=True)
38
- print("Remaining rows:", df.shape[0])
39
-
40
- df.head()
41
-
42
- df['ThroughputRate'] = df['OrderQuantity'] / df['AvgTaskTime_Minutes']
43
- df['TimePressure'] = df['OrderQuantity'] / (df['DeadlineDays'].replace(0, 1) * df['AvgTaskTime_Minutes'])
44
- priority_map = {'High': 3, 'Medium': 2, 'Low': 1}
45
- df['PriorityLevel'] = (df['Priority'].str.capitalize()
46
- .map(priority_map).fillna(1).astype(int))
47
- df.drop('Priority', axis=1, inplace=True)
48
-
49
- df.head(10)
50
-
51
- X = df.drop('ProductivityScore', axis=1)
52
- y = df['ProductivityScore']
53
- X_train, X_test, y_train, y_test = train_test_split(
54
- X, y, test_size=0.2, random_state=42
55
- )
56
-
57
- cat_cols = ['Team','ProductType','TaskType']
58
- num_cols = ['OrderQuantity','DeadlineDays','ExperienceYears','AvgTaskTime_Minutes',
59
- 'ErrorRate','TrainingHours','DayNumber','ThroughputRate','TimePressure','PriorityLevel']
60
-
61
- preprocessor = ColumnTransformer([
62
- ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
63
- ('num', StandardScaler(), num_cols)
64
- ])
65
-
66
- pipeline = Pipeline([
67
- ('preprocessor', preprocessor),
68
- ('regressor', XGBRegressor(
69
- objective='reg:squarederror',
70
- random_state=42,
71
- tree_method='hist'
72
- ))
73
- ])
74
-
75
- pipeline.fit(X_train, y_train)
76
-
77
- # Step 7: Evaluate on test set
78
- from sklearn.metrics import r2_score, mean_squared_error
79
- y_pred = pipeline.predict(X_test)
80
- print(f"Test R²: {r2_score(y_test, y_pred):.4f}")
81
- print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
82
-
83
- # Step 1: Define hyperparameter search space
84
- from scipy.stats import randint, uniform, loguniform
85
-
86
- param_dist = {
87
- 'regressor__n_estimators': randint(100, 1000),
88
- 'regressor__max_depth': randint(3, 15),
89
- 'regressor__learning_rate': uniform(0.01, 0.29),
90
- 'regressor__subsample': uniform(0.5, 0.5),
91
- 'regressor__colsample_bytree': uniform(0.5, 0.5),
92
- 'regressor__gamma': uniform(0, 0.5),
93
- 'regressor__reg_alpha': loguniform(1e-3, 1e2),
94
- 'regressor__reg_lambda': loguniform(1e-3, 1e2),
95
- 'regressor__min_child_weight': randint(1, 10),
96
- }
97
-
98
- # Step 2: Set up RandomizedSearchCV
99
- from sklearn.model_selection import RandomizedSearchCV
100
-
101
- search = RandomizedSearchCV(
102
- estimator=pipeline,
103
- param_distributions=param_dist,
104
- n_iter=50, # number of parameter settings to sample
105
- scoring='r2',
106
- cv=3,
107
- n_jobs=-1,
108
- verbose=1,
109
- random_state=42
110
- )
111
-
112
- # Step 3: Run the hyperparameter search
113
- search.fit(X_train, y_train)
114
-
115
- # Step 4: Inspect the best parameters & CV score
116
- print("🔍 Best parameters:", search.best_params_)
117
- print(f"Best CV R²: {search.best_score_:.4f}")
118
-
119
- # Step 5: Evaluate the tuned model on the test set
120
- best_model = search.best_estimator_
121
- from sklearn.metrics import r2_score, mean_squared_error
122
-
123
- y_pred = best_model.predict(X_test)
124
- print(f"Test R²: {r2_score(y_test, y_pred):.4f}")
125
- print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
126
-
127
- from sklearn.model_selection import GridSearchCV
128
- from sklearn.metrics import r2_score, mean_squared_error
129
  import joblib
130
 
131
- # 1) Extract your best random‐search parameters
132
- best = search.best_params_
133
-
134
- # 2) Create a tight grid around them
135
- param_grid = {
136
- 'regressor__n_estimators': [
137
- max(100, best['regressor__n_estimators'] - 100),
138
- best['regressor__n_estimators'],
139
- best['regressor__n_estimators'] + 100
140
- ],
141
- 'regressor__max_depth': [
142
- max(3, best['regressor__max_depth'] - 2),
143
- best['regressor__max_depth'],
144
- best['regressor__max_depth'] + 2
145
- ],
146
- 'regressor__learning_rate': [
147
- best['regressor__learning_rate'] * 0.5,
148
- best['regressor__learning_rate'],
149
- best['regressor__learning_rate'] * 1.5
150
- ],
 
 
 
 
 
 
151
  }
152
 
153
- # 3) Set up GridSearchCV
154
- grid_search = GridSearchCV(
155
- estimator=pipeline,
156
- param_grid=param_grid,
157
- scoring='r2',
158
- cv=3, # 3-fold CV
159
- n_jobs=-1,
160
- verbose=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  )
162
 
163
- # 4) Run grid search on training set
164
- grid_search.fit(X_train, y_train)
165
-
166
- # 5) Evaluate on test set
167
- y_pred = grid_search.predict(X_test)
168
- print("Grid Search Best R²:", r2_score(y_test, y_pred))
169
- print("Grid Search MSE: ", mean_squared_error(y_test, y_pred))
170
-
171
- # 6) Save the final, tuned model
172
- joblib.dump(grid_search.best_estimator_, 'task_distribution_model_grid_tuned.joblib')
173
-
174
- !pip install optuna
175
- import optuna
176
-
177
- def objective(trial):
178
- params = {
179
- 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
180
- 'max_depth': trial.suggest_int('max_depth', 3, 15),
181
- 'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
182
- 'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
183
- 'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
184
- 'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10),
185
- 'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10),
186
- }
187
- model = Pipeline([
188
- ('preprocessor', preprocessor),
189
- ('regressor', XGBRegressor(**params, tree_method='hist', random_state=42))
190
- ])
191
- from sklearn.model_selection import cross_val_score
192
- score = cross_val_score(model, X_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
193
- return score
194
-
195
- study = optuna.create_study(direction='maximize')
196
- study.optimize(objective, n_trials=50)
197
- print("Optuna best R²:", study.best_value)
198
- print(" Best params:", study.best_params)
199
-
200
- from sklearn.pipeline import Pipeline
201
- from lightgbm import LGBMRegressor
202
- from sklearn.model_selection import RandomizedSearchCV
203
- from scipy.stats import randint, uniform
204
-
205
- # 1a) Build a LightGBM pipeline
206
- lgb_pipeline = Pipeline([
207
- ('preprocessor', preprocessor),
208
- ('regressor', LGBMRegressor(random_state=42))
209
- ])
210
-
211
- # 1b) Define a random search space
212
- param_dist_lgb = {
213
- 'regressor__n_estimators': randint(100, 1000),
214
- 'regressor__max_depth': randint(3, 15),
215
- 'regressor__learning_rate':uniform(0.01, 0.29),
216
- 'regressor__subsample': uniform(0.5, 0.5),
217
- 'regressor__colsample_bytree': uniform(0.5, 0.5),
218
- 'regressor__reg_alpha': uniform(0, 1),
219
- 'regressor__reg_lambda': uniform(0, 1),
220
- }
221
-
222
- search_lgb = RandomizedSearchCV(
223
- lgb_pipeline,
224
- param_distributions=param_dist_lgb,
225
- n_iter=50,
226
- scoring='r2',
227
- cv=3,
228
- n_jobs=-1,
229
- random_state=42,
230
- verbose=1
231
- )
232
-
233
- search_lgb.fit(X_train, y_train)
234
- print("LightGBM Best CV R²:", search_lgb.best_score_)
235
- # Evaluate on test
236
- y_pred = search_lgb.predict(X_test)
237
- from sklearn.metrics import r2_score, mean_squared_error
238
- print(" LightGBM Test R²:", r2_score(y_test, y_pred))
239
- print(" LightGBM Test MSE:", mean_squared_error(y_test, y_pred))
240
-
241
- import optuna
242
- from sklearn.pipeline import Pipeline
243
- from sklearn.model_selection import cross_val_score
244
- from lightgbm import LGBMRegressor
245
- from sklearn.metrics import r2_score, mean_squared_error
246
- import joblib
247
-
248
- # Optuna objective function for LightGBM
249
- def objective_lgb(trial):
250
- params = {
251
- "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
252
- "max_depth": trial.suggest_int("max_depth", 3, 12),
253
- "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
254
- "num_leaves": trial.suggest_int("num_leaves", 20, 200),
255
- "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
256
- "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
257
- "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
258
- "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
259
- "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
260
- "min_split_gain": trial.suggest_uniform("min_split_gain", 0, 1.0),
261
- "random_state": 42
262
- }
263
- # Build pipeline with current params
264
- pipeline_lgb = Pipeline([
265
- ("preprocessor", preprocessor),
266
- ("regressor", LGBMRegressor(**params))
267
- ])
268
- # 3-fold CV on training set
269
- scores = cross_val_score(pipeline_lgb, X_train, y_train,
270
- scoring="r2", cv=3, n_jobs=-1)
271
- return scores.mean()
272
-
273
- # Create and run the study
274
- study_lgb = optuna.create_study(direction="maximize")
275
- study_lgb.optimize(objective_lgb, n_trials=50)
276
-
277
- print("🔍 Optuna LightGBM best R²:", study_lgb.best_value)
278
- print("✨ Best hyperparameters:", study_lgb.best_params)
279
-
280
- # Retrain final model on full training data
281
- best_params = study_lgb.best_params
282
- lgb_final = Pipeline([
283
- ("preprocessor", preprocessor),
284
- ("regressor", LGBMRegressor(**best_params))
285
- ])
286
-
287
- !pip install optuna
288
- import optuna
289
- from sklearn.pipeline import Pipeline
290
- from sklearn.model_selection import KFold, cross_val_score
291
- from sklearn.metrics import r2_score, mean_squared_error
292
- from lightgbm import LGBMRegressor
293
- import numpy as np
294
- import joblib
295
-
296
- # Enhanced Optuna objective function with pruning
297
- def objective_lgb_pruned(trial):
298
- params = {
299
- "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
300
- "max_depth": trial.suggest_int("max_depth", 3, 12),
301
- "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
302
- "num_leaves": trial.suggest_int("num_leaves", 20, 200),
303
- "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
304
- "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
305
- "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
306
- "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
307
- "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
308
- "min_split_gain": trial.suggest_uniform("min_split_gain", 0, 1.0),
309
- "random_state": 42,
310
- "verbose": -1 # Suppress LightGBM warnings
311
- }
312
-
313
- # Use KFold for manual cross-validation with pruning
314
- kf = KFold(n_splits=3, shuffle=True, random_state=42)
315
- scores = []
316
-
317
- for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
318
- X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
319
- y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
320
-
321
- # Build pipeline
322
- pipeline_lgb = Pipeline([
323
- ("preprocessor", preprocessor),
324
- ("regressor", LGBMRegressor(**params))
325
- ])
326
-
327
- # Fit and predict
328
- pipeline_lgb.fit(X_fold_train, y_fold_train)
329
- y_pred = pipeline_lgb.predict(X_fold_val)
330
- score = r2_score(y_fold_val, y_pred)
331
- scores.append(score)
332
-
333
- # Report intermediate value for pruning
334
- trial.report(score, fold)
335
-
336
- # Check if trial should be pruned
337
- if trial.should_prune():
338
- raise optuna.TrialPruned()
339
-
340
- return np.mean(scores)
341
-
342
- # Create study with pruning
343
- study_lgb_pruned = optuna.create_study(
344
- direction="maximize",
345
- pruner=optuna.pruners.MedianPruner(
346
- n_startup_trials=10, # Number of trials before pruning starts
347
- n_warmup_steps=5, # Number of steps before considering pruning
348
- interval_steps=1 # Interval between pruning checks
349
- ),
350
- sampler=optuna.samplers.TPESampler(
351
- n_startup_trials=20,
352
- n_ei_candidates=24,
353
- seed=42
354
- )
355
- )
356
-
357
- # Optimize with more trials since pruning makes it faster
358
- study_lgb_pruned.optimize(objective_lgb_pruned, n_trials=100)
359
-
360
- print("Optuna LightGBM (with pruning) R²:", study_lgb_pruned.best_value)
361
- print("Best hyperparameters:", study_lgb_pruned.best_params)
362
- print("Number of pruned trials:", len([t for t in study_lgb_pruned.trials if t.state == optuna.trial.TrialState.PRUNED]))
363
-
364
- # Train final model
365
- best_params = study_lgb_pruned.best_params
366
- lgb_final_pruned = Pipeline([
367
- ("preprocessor", preprocessor),
368
- ("regressor", LGBMRegressor(**best_params))
369
- ])
370
-
371
- lgb_final_pruned.fit(X_train, y_train)
372
-
373
- # Evaluate on test set
374
- y_pred_test = lgb_final_pruned.predict(X_test)
375
- test_r2 = r2_score(y_test, y_pred_test)
376
- test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
377
-
378
- import joblib
379
-
380
- # 1) Save the pruned LightGBM pipeline
381
- model_filename = 'model.joblib'
382
- joblib.dump(lgb_final_pruned, model_filename)
383
- print(f"Model exported to {model_filename}")
384
-
385
- # 2) (Optional) In Colab, download directly:
386
- from google.colab import files
387
- files.download(model_filename)
388
-
389
- # 👇 Paste this after your training cell 👇
390
-
391
- import numpy as np
392
- import matplotlib.pyplot as plt
393
- from IPython.display import display
394
-
395
- # 1) Recover your teams & specialties from df
396
- teams = sorted(df['Team'].unique())
397
- specialty_map = dict(zip(df['Team'], df['Specialty']))
398
-
399
- # 2) Define the example task you want to test
400
- example_task = {
401
- 'ProductType': 'Mothball',
402
- 'TaskType': 'Packaging',
403
- 'OrderQuantity': 120,
404
- 'DeadlineDays': 1,
405
- 'ExperienceYears': 6,
406
- 'AvgTaskTime_Minutes': 28.0,
407
- 'ErrorRate': 0.05,
408
- 'TrainingHours': 20.0,
409
- 'DayNumber': 2,
410
- 'ThroughputRate': 120 / 28.0,
411
- 'TimePressure': 120 / (4 * 28.0),
412
- 'PriorityLevel': 3
413
- }
414
-
415
- # 3) Build a DataFrame with one row per team
416
- rows = []
417
- for team in teams:
418
- r = example_task.copy()
419
- r['Team'] = team
420
- r['Specialty'] = specialty_map[team]
421
- rows.append(r)
422
- test_df = pd.DataFrame(rows)
423
-
424
- # 4) Predict & rank
425
- test_df['PredictedProductivity'] = pipeline.predict(test_df)
426
- ranked = test_df.sort_values('PredictedProductivity', ascending=False).reset_index(drop=True)
427
-
428
- # 5) Display the table
429
- print("🏆 Team Productivity Rankings:")
430
- display(ranked[['Team','PredictedProductivity']])
431
-
432
- # 6) Optional: plot a horizontal bar chart
433
- plt.figure(figsize=(8,5))
434
- plt.barh(ranked['Team'], ranked['PredictedProductivity'], color='steelblue')
435
- plt.gca().invert_yaxis()
436
- plt.xlabel('Predicted Productivity')
437
- plt.title('Team Ranking for Example Task')
438
- plt.grid(axis='x', linestyle='--', alpha=0.5)
439
- plt.tight_layout()
440
- plt.show()
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
 
 
 
 
 
 
 
 
3
  import numpy as np
4
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import joblib
6
 
7
# Load the serialized model pipeline and the dataset (used only to enumerate teams).
model = joblib.load("xgb_model.joblib")
df = pd.read_csv("worker_productivity.csv")  # Make sure this is uploaded to Hugging Face Space

# Get unique teams
teams = sorted(df['team'].unique())

# Define a base task to simulate a prediction input.
# NOTE(review): these keys must match the feature columns the saved pipeline
# was trained on — verify against the training script.
base_task = {
    'quarter': 'Q2',
    'department': 'sewing',
    'day': 'Monday',
    'no_of_workers': 48,
    'incentive': 2.5,
    'idle_time': 0.3,
    'idle_men': 4,
    'smv': 30.0,
    'month': 5,
    'day_of_week': 0,  # presumably 0 = Monday, consistent with 'day' above — confirm encoding
    'is_weekend': 0,
    'smv_per_worker': 30.0 / 48,  # derived feature: smv / no_of_workers
    'effort_index': 30.0 + 2.5 + 1.0 - 0.3,  # smv + incentive + 1.0 - idle_time; the 1.0 term is unexplained — TODO confirm against training code
    'log_wip': np.log1p(50),  # log1p of an assumed work-in-progress of 50 — confirm
    'log_overtime': np.log1p(1.0),
    'no_of_style_change': 0,
    'targeted_productivity': 0.75
}
34
 
35
# Prediction function

def predict():
    """Predict productivity for every team on the fixed base task.

    Reads the module-level ``model``, ``teams``, and ``base_task``.

    Returns:
        matplotlib.figure.Figure: horizontal bar chart ranking teams by
        predicted productivity, best first (consumed by ``gr.Plot``).
    """
    # Build one row per team and predict in a single batched call:
    # one pass through the model pipeline instead of len(teams) passes.
    batch_df = pd.DataFrame([{**base_task, 'team': team} for team in teams])
    preds = model.predict(batch_df)

    # Rank teams by predicted productivity, highest first.
    team_scores_df = pd.DataFrame(
        {"Team": teams, "Predicted Productivity": preds}
    ).sort_values(by="Predicted Productivity", ascending=False)

    # Plot with the object-oriented API and call tight_layout on this
    # figure only — avoids pyplot global state, which could interfere
    # across concurrent Gradio requests.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(
        team_scores_df["Team"].astype(str),
        team_scores_df["Predicted Productivity"],
        color='skyblue',
    )
    ax.set_xlabel("Predicted Productivity")
    ax.set_title("Predicted Productivity by Team for Custom Task")
    ax.invert_yaxis()  # best-ranked team at the top of the chart
    fig.tight_layout()

    return fig
60
+
61
# Gradio UI: a zero-input interface — each run re-computes the ranking
# plot for the fixed task hard-coded in base_task above.
demo = gr.Interface(
    fn=predict,
    inputs=[],  # no user-controlled inputs; the task is fixed
    outputs=[gr.Plot(label="Team Productivity Rankings")],
    live=False,  # recompute only on explicit submit, not on change
    title="Worker Productivity Predictor",
    description="Generates predicted productivity scores for each team on a fixed custom task."
)

# Start the Gradio server (blocking call; entry point of this script).
demo.launch()