nodronm committed on
Commit
03f3f30
·
verified ·
1 Parent(s): 299940a

Upload aap.py

Browse files
Files changed (1) hide show
  1. aap.py +440 -0
aap.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """AAP.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1rxnN6J5ojM0HFXh5HxHo9AF4oOfq_fwM
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ from sklearn.impute import SimpleImputer
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.compose import ColumnTransformer
15
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
16
+ from sklearn.pipeline import Pipeline
17
+ from xgboost import XGBRegressor
18
+
19
# Load the dataset into `df`: through the Colab upload picker when running in
# Colab, otherwise from 'dataset.csv' in the working directory.
import io

try:
    # google.colab is only importable inside a Colab runtime.
    from google.colab import files
except ModuleNotFoundError:
    df = pd.read_csv('dataset.csv')
else:
    uploaded = files.upload()  # opens the browser file picker
    if not uploaded:
        raise FileNotFoundError("No file was uploaded; a CSV dataset is required.")
    # files.upload() returns {filename: bytes}. The picked file is not
    # necessarily named 'dataset.csv', so fall back to the first upload
    # instead of failing with a KeyError.
    csv_name = 'dataset.csv' if 'dataset.csv' in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
27
+
28
# Notebook-style peeks at the raw frame (no visible effect when run as a script).
df.shape

df.head()

# Lower-case the team labels; rows without a team get a placeholder label.
team_lower = df['Team'].str.lower()
df['Team'] = team_lower.fillna('team_unknown')

# Replace missing ErrorRate values with the column median.
imp = SimpleImputer(strategy='median')
df['ErrorRate'] = imp.fit_transform(df[['ErrorRate']])

# Keep only rows whose target is strictly positive.
has_positive_score = df['ProductivityScore'] > 0
df = df[has_positive_score].reset_index(drop=True)
print("Remaining rows:", df.shape[0])

df.head()

# Derived features: units per minute of task time, and the same quantity
# additionally divided by the deadline (a 0-day deadline is treated as 1 day).
deadline_days = df['DeadlineDays'].replace(0, 1)
df['ThroughputRate'] = df['OrderQuantity'] / df['AvgTaskTime_Minutes']
df['TimePressure'] = df['OrderQuantity'] / (deadline_days * df['AvgTaskTime_Minutes'])

# Encode priority as an ordinal; unknown or missing labels fall back to 1 (Low).
priority_map = {'High': 3, 'Medium': 2, 'Low': 1}
priority_codes = df['Priority'].str.capitalize().map(priority_map)
df['PriorityLevel'] = priority_codes.fillna(1).astype(int)
df.drop(columns=['Priority'], inplace=True)

df.head(10)
50
+
51
from sklearn.metrics import r2_score, mean_squared_error

# Separate the target from the predictors and hold out 20% of rows for testing.
y = df['ProductivityScore']
X = df.drop(columns='ProductivityScore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cat_cols = ['Team', 'ProductType', 'TaskType']
num_cols = [
    'OrderQuantity', 'DeadlineDays', 'ExperienceYears', 'AvgTaskTime_Minutes',
    'ErrorRate', 'TrainingHours', 'DayNumber', 'ThroughputRate', 'TimePressure',
    'PriorityLevel',
]

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at predict time) and standardise the numeric ones.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
numeric_step = ('num', StandardScaler(), num_cols)
preprocessor = ColumnTransformer([categorical_step, numeric_step])

# Baseline model: XGBoost with default hyperparameters.
baseline_regressor = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    tree_method='hist',
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', baseline_regressor),
])

pipeline.fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred = pipeline.predict(X_test)
baseline_r2 = r2_score(y_test, y_pred)
baseline_mse = mean_squared_error(y_test, y_pred)
print(f"Test R²: {baseline_r2:.4f}")
print(f"Test MSE: {baseline_mse:.4f}")
82
+
83
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# Search space for the XGBoost step. scipy's uniform(loc, scale) samples from
# [loc, loc + scale]; randint(low, high) excludes `high`.
xgb_space = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.29),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 0.5),
    'reg_alpha': loguniform(1e-3, 1e2),
    'reg_lambda': loguniform(1e-3, 1e2),
    'min_child_weight': randint(1, 10),
}
# Pipeline parameters are addressed as "<step name>__<parameter>".
param_dist = {f'regressor__{name}': dist for name, dist in xgb_space.items()}

# 50 random configurations, each scored by 3-fold cross-validated R².
search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=50,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
search.fit(X_train, y_train)

print("🔍 Best parameters:", search.best_params_)
print(f"Best CV R²: {search.best_score_:.4f}")

# Evaluate the refitted best configuration on the held-out test split.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
tuned_r2 = r2_score(y_test, y_pred)
tuned_mse = mean_squared_error(y_test, y_pred)
print(f"Test R²: {tuned_r2:.4f}")
print(f"Test MSE: {tuned_mse:.4f}")
126
+
127
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# Refine the random-search result with a small grid around its best values.
best = search.best_params_
best_n_estimators = int(best['regressor__n_estimators'])
best_max_depth = int(best['regressor__max_depth'])
best_learning_rate = float(best['regressor__learning_rate'])

# Sets remove the duplicates that appear when the lower neighbour is clamped
# onto the centre value (e.g. best n_estimators == 100), so no configuration
# is cross-validated twice.
param_grid = {
    'regressor__n_estimators': sorted(
        {max(100, best_n_estimators - 100), best_n_estimators, best_n_estimators + 100}
    ),
    'regressor__max_depth': sorted(
        {max(3, best_max_depth - 2), best_max_depth, best_max_depth + 2}
    ),
    'regressor__learning_rate': [
        best_learning_rate * 0.5,
        best_learning_rate,
        best_learning_rate * 1.5,
    ],
}

# Start from the random-search winner rather than the untuned `pipeline`:
# GridSearchCV clones its estimator, so the remaining tuned parameters
# (subsample, colsample_bytree, gamma, reg_alpha, reg_lambda,
# min_child_weight) are kept while only the three grid parameters vary.
grid_search = GridSearchCV(
    estimator=search.best_estimator_,
    param_grid=param_grid,
    scoring='r2',
    cv=3,  # 3-fold CV
    n_jobs=-1,
    verbose=1,
)

# Run the grid search on the training split only.
grid_search.fit(X_train, y_train)

# Evaluate the refitted best grid configuration on the held-out test split.
y_pred = grid_search.predict(X_test)
print("Grid Search Best R²:", r2_score(y_test, y_pred))
print("Grid Search MSE: ", mean_squared_error(y_test, y_pred))

# Persist the tuned pipeline (preprocessing + regressor).
joblib.dump(grid_search.best_estimator_, 'task_distribution_model_grid_tuned.joblib')
173
+
174
# "!pip install optuna" is IPython shell syntax and a SyntaxError in a plain
# .py file, so install through pip's module entry point, and only when the
# package is actually missing.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("optuna") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "optuna"])
import optuna
176
+
177
def objective(trial):
    """Score one Optuna trial of the XGBoost pipeline.

    Samples a hyperparameter set from ``trial``, builds a preprocessing +
    XGBRegressor pipeline and returns its mean 3-fold cross-validated R² on
    the training split (higher is better).
    """
    from sklearn.model_selection import cross_val_score

    # suggest_float replaces suggest_uniform / suggest_loguniform, which are
    # deprecated since Optuna 3.0; log=True keeps the log-uniform sampling.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10, log=True),
    }
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**params, tree_method='hist', random_state=42)),
    ])
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=3, scoring='r2', n_jobs=-1
    )
    return fold_scores.mean()
194
+
195
# Maximise cross-validated R² over 50 trials. The sampler is seeded so the
# search is repeatable, like the other seeded components in this script.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
print("Optuna best R²:", study.best_value)
print(" Best params:", study.best_params)
199
+
200
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import randint, uniform

# LightGBM pipeline sharing the preprocessing definition used for XGBoost.
# subsample_freq=1 is required for the `subsample` values sampled below to
# have any effect: LightGBM skips row bagging while the bagging frequency is
# 0, which is the default.
lgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(random_state=42, subsample_freq=1)),
])

# Random search space. scipy's uniform(loc, scale) samples from
# [loc, loc + scale]; randint(low, high) excludes `high`.
param_dist_lgb = {
    'regressor__n_estimators': randint(100, 1000),
    'regressor__max_depth': randint(3, 15),
    'regressor__learning_rate': uniform(0.01, 0.29),
    'regressor__subsample': uniform(0.5, 0.5),
    'regressor__colsample_bytree': uniform(0.5, 0.5),
    'regressor__reg_alpha': uniform(0, 1),
    'regressor__reg_lambda': uniform(0, 1),
}

# 50 random configurations, each scored by 3-fold cross-validated R².
search_lgb = RandomizedSearchCV(
    lgb_pipeline,
    param_distributions=param_dist_lgb,
    n_iter=50,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

search_lgb.fit(X_train, y_train)
print("LightGBM Best CV R²:", search_lgb.best_score_)

# Evaluate the refitted best configuration on the held-out test split.
y_pred = search_lgb.predict(X_test)
print(" LightGBM Test R²:", r2_score(y_test, y_pred))
print(" LightGBM Test MSE:", mean_squared_error(y_test, y_pred))
240
+
241
+ import optuna
242
+ from sklearn.pipeline import Pipeline
243
+ from sklearn.model_selection import cross_val_score
244
+ from lightgbm import LGBMRegressor
245
+ from sklearn.metrics import r2_score, mean_squared_error
246
+ import joblib
247
+
248
+ # Optuna objective function for LightGBM
249
def objective_lgb(trial):
    """Score one Optuna trial of the LightGBM pipeline.

    Samples a hyperparameter set from ``trial``, builds a preprocessing +
    LGBMRegressor pipeline and returns its mean 3-fold cross-validated R² on
    the training split (higher is better).
    """
    # suggest_float replaces suggest_uniform / suggest_loguniform, which are
    # deprecated since Optuna 3.0; log=True keeps the log-uniform sampling.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # LightGBM only applies `subsample` when the bagging frequency is
        # non-zero. Suggesting it (instead of hard-coding it) also puts it in
        # study.best_params, so a model rebuilt from best_params keeps it.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 1.0),
        "random_state": 42,
        "verbose": -1,  # keep LightGBM quiet across trials and folds
    }
    pipeline_lgb = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", LGBMRegressor(**params)),
    ])
    # cross_val_score clones the pipeline for every fold.
    scores = cross_val_score(
        pipeline_lgb, X_train, y_train, scoring="r2", cv=3, n_jobs=-1
    )
    return scores.mean()
272
+
273
# Create and run the study: maximise cross-validated R² over 50 trials.
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=50)

print("🔍 Optuna LightGBM best R²:", study_lgb.best_value)
print("✨ Best hyperparameters:", study_lgb.best_params)

# Retrain the final model on the full training split.
# study.best_params only holds values chosen through trial.suggest_*, so the
# fixed random_state the objective trained with must be added back; otherwise
# the final model is not the configuration that was scored.
best_params = study_lgb.best_params
final_params = {**best_params, "random_state": 42}
lgb_final = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LGBMRegressor(**final_params)),
])
lgb_final.fit(X_train, y_train)
285
+ ])
286
+
287
# "!pip install optuna" is IPython shell syntax and a SyntaxError in a plain
# .py file, so install through pip's module entry point, and only when the
# package is actually missing.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("optuna") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "optuna"])
import optuna
289
+ from sklearn.pipeline import Pipeline
290
+ from sklearn.model_selection import KFold, cross_val_score
291
+ from sklearn.metrics import r2_score, mean_squared_error
292
+ from lightgbm import LGBMRegressor
293
+ import numpy as np
294
+ import joblib
295
+
296
+ # Enhanced Optuna objective function with pruning
297
def objective_lgb_pruned(trial):
    """Score one Optuna trial of the LightGBM pipeline, with fold-level pruning.

    Runs a manual shuffled 3-fold cross-validation on the training split and
    reports each fold's R² to ``trial`` so the study's pruner can stop
    unpromising trials early.

    Returns the mean R² over the folds.
    Raises optuna.TrialPruned when the pruner asks to stop the trial.
    """
    from sklearn.base import clone

    # suggest_float replaces suggest_uniform / suggest_loguniform, which are
    # deprecated since Optuna 3.0; log=True keeps the log-uniform sampling.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # LightGBM only applies `subsample` when the bagging frequency is
        # non-zero. Suggesting it (instead of hard-coding it) also puts it in
        # study.best_params, so a model rebuilt from best_params keeps it.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 1.0),
        "random_state": 42,
        "verbose": -1,  # Suppress LightGBM warnings
    }

    # Fixed seed: every trial is scored on the same three folds.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Pipeline does not copy its steps, so fitting with the shared
        # `preprocessor` would refit, on fold data, the very object embedded
        # in other already-fitted pipelines. Fit a clone instead.
        pipeline_lgb = Pipeline([
            ("preprocessor", clone(preprocessor)),
            ("regressor", LGBMRegressor(**params)),
        ])
        pipeline_lgb.fit(X_fold_train, y_fold_train)
        score = r2_score(y_fold_val, pipeline_lgb.predict(X_fold_val))
        scores.append(score)

        # Report the fold score; the fold index is the pruning step.
        trial.report(score, fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(scores)
341
+
342
# Create the study with median pruning and a seeded TPE sampler.
study_lgb_pruned = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=10,  # complete this many trials before pruning starts
        # The objective reports one value per CV fold (steps 0, 1, 2). The
        # pruner never acts on steps below n_warmup_steps, so the previous
        # value of 5 disabled pruning entirely. With 1, a trial can be
        # stopped from the second fold onwards.
        n_warmup_steps=1,
        interval_steps=1,  # check at every reported step after warm-up
    ),
    sampler=optuna.samplers.TPESampler(
        n_startup_trials=20,
        n_ei_candidates=24,
        seed=42,
    ),
)

# More trials than the unpruned studies, since pruned trials stop early.
study_lgb_pruned.optimize(objective_lgb_pruned, n_trials=100)

pruned_count = sum(
    t.state == optuna.trial.TrialState.PRUNED for t in study_lgb_pruned.trials
)
print("Optuna LightGBM (with pruning) R²:", study_lgb_pruned.best_value)
print("Best hyperparameters:", study_lgb_pruned.best_params)
print("Number of pruned trials:", pruned_count)

# Train the final model on the full training split.
# study.best_params only holds values chosen through trial.suggest_*, so the
# fixed settings the objective trained with (random_state, verbose) must be
# added back; otherwise the final model is not the configuration that was
# scored.
best_params = study_lgb_pruned.best_params
final_params = {**best_params, "random_state": 42, "verbose": -1}
lgb_final_pruned = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LGBMRegressor(**final_params)),
])

lgb_final_pruned.fit(X_train, y_train)

# Evaluate on the held-out test split and report the metrics, which were
# previously computed but never shown.
y_pred_test = lgb_final_pruned.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f"Test R²: {test_r2:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
377
+
378
import joblib

# 1) Save the pruned LightGBM pipeline (preprocessing + regressor).
model_filename = 'model.joblib'
joblib.dump(lgb_final_pruned, model_filename)
print(f"Model exported to {model_filename}")

# 2) In Colab, also trigger a browser download. google.colab does not exist
#    elsewhere, so guard the import the same way the data-loading step does
#    instead of crashing after the model has been saved.
try:
    from google.colab import files
except ModuleNotFoundError:
    pass  # not running in Colab; the file is already on local disk
else:
    files.download(model_filename)
388
+
389
# Rank every team by predicted productivity for one example task.
# Run this after the training steps.

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

# 1) Recover the teams and each team's specialty from df.
#    If a team appears with several specialties, the last row wins.
teams = sorted(df['Team'].unique())
specialty_map = dict(zip(df['Team'], df['Specialty']))

# 2) Define the example task. The derived features are computed from the raw
#    fields with the same formulas used for training, so they cannot drift
#    apart (TimePressure was previously computed with 4 days although
#    DeadlineDays is 1).
order_quantity = 120
deadline_days = 1
avg_task_minutes = 28.0
example_task = {
    'ProductType': 'Mothball',
    'TaskType': 'Packaging',
    'OrderQuantity': order_quantity,
    'DeadlineDays': deadline_days,
    'ExperienceYears': 6,
    'AvgTaskTime_Minutes': avg_task_minutes,
    'ErrorRate': 0.05,
    'TrainingHours': 20.0,
    'DayNumber': 2,
    'ThroughputRate': order_quantity / avg_task_minutes,
    # A 0-day deadline counts as 1 day, as in the training features.
    'TimePressure': order_quantity / ((deadline_days or 1) * avg_task_minutes),
    'PriorityLevel': 3,
}

# 3) Build a DataFrame with one row per team.
rows = [
    {**example_task, 'Team': team, 'Specialty': specialty_map[team]}
    for team in teams
]
test_df = pd.DataFrame(rows)

# 4) Predict and rank with the tuned model that was exported, not the
#    untuned baseline `pipeline`.
ranking_model = lgb_final_pruned
test_df['PredictedProductivity'] = ranking_model.predict(test_df)
ranked = test_df.sort_values('PredictedProductivity', ascending=False).reset_index(drop=True)

# 5) Display the table.
print("🏆 Team Productivity Rankings:")
display(ranked[['Team', 'PredictedProductivity']])

# 6) Optional: plot a horizontal bar chart, best team at the top.
plt.figure(figsize=(8, 5))
plt.barh(ranked['Team'], ranked['PredictedProductivity'], color='steelblue')
plt.gca().invert_yaxis()
plt.xlabel('Predicted Productivity')
plt.title('Team Ranking for Example Task')
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()