Spaces:

nodronm
/

APP2

Sleeping

App Files Files Community

nodronm committed on Jul 23, 2025

Commit

03f3f30

verified ·

1 Parent(s): 299940a

Upload aap.py

Browse files

Files changed (1) hide show

aap.py +440 -0

aap.py ADDED Viewed

	@@ -0,0 +1,440 @@

+# -*- coding: utf-8 -*-
+"""AAP.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1rxnN6J5ojM0HFXh5HxHo9AF4oOfq_fwM
+"""
+import pandas as pd
+import numpy as np
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.pipeline import Pipeline
+from xgboost import XGBRegressor
+try:
+    # Google Colab: upload via picker
+    from google.colab import files
+    uploaded = files.upload()  # select minimal_messy_task_performance.csv
+    import io
+    df = pd.read_csv(io.BytesIO(uploaded['dataset.csv']))
+except ModuleNotFoundError:
+    df = pd.read_csv('dataset.csv')
+df.shape
+df.head()
+df['Team'] = df['Team'].str.lower().fillna('team_unknown')
+imp = SimpleImputer(strategy='median')
+df['ErrorRate'] = imp.fit_transform(df[['ErrorRate']])
+df = df[df['ProductivityScore'] > 0].reset_index(drop=True)
+print("Remaining rows:", df.shape[0])
+df.head()
+df['ThroughputRate'] = df['OrderQuantity'] / df['AvgTaskTime_Minutes']
+df['TimePressure'] = df['OrderQuantity'] / (df['DeadlineDays'].replace(0, 1) * df['AvgTaskTime_Minutes'])
+priority_map = {'High': 3, 'Medium': 2, 'Low': 1}
+df['PriorityLevel'] = (df['Priority'].str.capitalize()
+                       .map(priority_map).fillna(1).astype(int))
+df.drop('Priority', axis=1, inplace=True)
+df.head(10)
+X = df.drop('ProductivityScore', axis=1)
+y = df['ProductivityScore']
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+cat_cols = ['Team','ProductType','TaskType']
+num_cols = ['OrderQuantity','DeadlineDays','ExperienceYears','AvgTaskTime_Minutes',
+            'ErrorRate','TrainingHours','DayNumber','ThroughputRate','TimePressure','PriorityLevel']
+preprocessor = ColumnTransformer([
+    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
+    ('num', StandardScaler(), num_cols)
+])
+pipeline = Pipeline([
+    ('preprocessor', preprocessor),
+    ('regressor', XGBRegressor(
+        objective='reg:squarederror',
+        random_state=42,
+        tree_method='hist'
+    ))
+])
+pipeline.fit(X_train, y_train)
+# Step 7: Evaluate on test set
+from sklearn.metrics import r2_score, mean_squared_error
+y_pred = pipeline.predict(X_test)
+print(f"Test R²:  {r2_score(y_test, y_pred):.4f}")
+print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
+# Step 1: Define hyperparameter search space
+from scipy.stats import randint, uniform, loguniform
+param_dist = {
+    'regressor__n_estimators':      randint(100, 1000),
+    'regressor__max_depth':         randint(3, 15),
+    'regressor__learning_rate':     uniform(0.01, 0.29),
+    'regressor__subsample':         uniform(0.5, 0.5),
+    'regressor__colsample_bytree':  uniform(0.5, 0.5),
+    'regressor__gamma':             uniform(0, 0.5),
+    'regressor__reg_alpha':         loguniform(1e-3, 1e2),
+    'regressor__reg_lambda':        loguniform(1e-3, 1e2),
+    'regressor__min_child_weight':  randint(1, 10),
+}
+# Step 2: Set up RandomizedSearchCV
+from sklearn.model_selection import RandomizedSearchCV
+search = RandomizedSearchCV(
+    estimator=pipeline,
+    param_distributions=param_dist,
+    n_iter=50,             # number of parameter settings to sample
+    scoring='r2',
+    cv=3,
+    n_jobs=-1,
+    verbose=1,
+    random_state=42
+)
+# Step 3: Run the hyperparameter search
+search.fit(X_train, y_train)
+# Step 4: Inspect the best parameters & CV score
+print("🔍 Best parameters:", search.best_params_)
+print(f"Best CV R²: {search.best_score_:.4f}")
+# Step 5: Evaluate the tuned model on the test set
+best_model = search.best_estimator_
+from sklearn.metrics import r2_score, mean_squared_error
+y_pred = best_model.predict(X_test)
+print(f"Test R²:  {r2_score(y_test, y_pred):.4f}")
+print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import r2_score, mean_squared_error
+import joblib
+# 1) Extract your best random‐search parameters
+best = search.best_params_
+# 2) Create a tight grid around them
+param_grid = {
+    'regressor__n_estimators': [
+        max(100, best['regressor__n_estimators'] - 100),
+        best['regressor__n_estimators'],
+        best['regressor__n_estimators'] + 100
+    ],
+    'regressor__max_depth': [
+        max(3, best['regressor__max_depth'] - 2),
+        best['regressor__max_depth'],
+        best['regressor__max_depth'] + 2
+    ],
+    'regressor__learning_rate': [
+        best['regressor__learning_rate'] * 0.5,
+        best['regressor__learning_rate'],
+        best['regressor__learning_rate'] * 1.5
+    ],
+}
+# 3) Set up GridSearchCV
+grid_search = GridSearchCV(
+    estimator=pipeline,
+    param_grid=param_grid,
+    scoring='r2',
+    cv=3,            # 3-fold CV
+    n_jobs=-1,
+    verbose=1
+)
+# 4) Run grid search on training set
+grid_search.fit(X_train, y_train)
+# 5) Evaluate on test set
+y_pred = grid_search.predict(X_test)
+print("Grid Search Best R²:", r2_score(y_test, y_pred))
+print("Grid Search MSE: ", mean_squared_error(y_test, y_pred))
+# 6) Save the final, tuned model
+joblib.dump(grid_search.best_estimator_, 'task_distribution_model_grid_tuned.joblib')
+!pip install optuna
+import optuna
+def objective(trial):
+    params = {
+      'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
+      'max_depth': trial.suggest_int('max_depth', 3, 15),
+      'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
+      'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
+      'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
+      'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10),
+      'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10),
+    }
+    model = Pipeline([
+      ('preprocessor', preprocessor),
+      ('regressor', XGBRegressor(**params, tree_method='hist', random_state=42))
+    ])
+    from sklearn.model_selection import cross_val_score
+    score = cross_val_score(model, X_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
+    return score
+study = optuna.create_study(direction='maximize')
+study.optimize(objective, n_trials=50)
+print("Optuna best R²:", study.best_value)
+print("  Best params:", study.best_params)
+from sklearn.pipeline import Pipeline
+from lightgbm import LGBMRegressor
+from sklearn.model_selection import RandomizedSearchCV
+from scipy.stats import randint, uniform
+# 1a) Build a LightGBM pipeline
+lgb_pipeline = Pipeline([
+    ('preprocessor', preprocessor),
+    ('regressor', LGBMRegressor(random_state=42))
+])
+# 1b) Define a random search space
+param_dist_lgb = {
+    'regressor__n_estimators': randint(100, 1000),
+    'regressor__max_depth':    randint(3, 15),
+    'regressor__learning_rate':uniform(0.01, 0.29),
+    'regressor__subsample':     uniform(0.5, 0.5),
+    'regressor__colsample_bytree': uniform(0.5, 0.5),
+    'regressor__reg_alpha':     uniform(0, 1),
+    'regressor__reg_lambda':    uniform(0, 1),
+}
+search_lgb = RandomizedSearchCV(
+    lgb_pipeline,
+    param_distributions=param_dist_lgb,
+    n_iter=50,
+    scoring='r2',
+    cv=3,
+    n_jobs=-1,
+    random_state=42,
+    verbose=1
+)
+search_lgb.fit(X_train, y_train)
+print("LightGBM Best CV R²:", search_lgb.best_score_)
+# Evaluate on test
+y_pred = search_lgb.predict(X_test)
+from sklearn.metrics import r2_score, mean_squared_error
+print(" LightGBM Test R²:", r2_score(y_test, y_pred))
+print(" LightGBM Test MSE:", mean_squared_error(y_test, y_pred))
+import optuna
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import cross_val_score
+from lightgbm import LGBMRegressor
+from sklearn.metrics import r2_score, mean_squared_error
+import joblib
+# Optuna objective function for LightGBM
+def objective_lgb(trial):
+    params = {
+        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
+        "max_depth": trial.suggest_int("max_depth", 3, 12),
+        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
+        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
+        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
+        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
+        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
+        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
+        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
+        "min_split_gain": trial.suggest_uniform("min_split_gain", 0, 1.0),
+        "random_state": 42
+    }
+    # Build pipeline with current params
+    pipeline_lgb = Pipeline([
+        ("preprocessor", preprocessor),
+        ("regressor", LGBMRegressor(**params))
+    ])
+    # 3-fold CV on training set
+    scores = cross_val_score(pipeline_lgb, X_train, y_train,
+                             scoring="r2", cv=3, n_jobs=-1)
+    return scores.mean()
+# Create and run the study
+study_lgb = optuna.create_study(direction="maximize")
+study_lgb.optimize(objective_lgb, n_trials=50)
+print("🔍 Optuna LightGBM best R²:", study_lgb.best_value)
+print("✨ Best hyperparameters:", study_lgb.best_params)
+# Retrain final model on full training data
+best_params = study_lgb.best_params
+lgb_final = Pipeline([
+    ("preprocessor", preprocessor),
+    ("regressor", LGBMRegressor(**best_params))
+])
+!pip install optuna
+import optuna
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import KFold, cross_val_score
+from sklearn.metrics import r2_score, mean_squared_error
+from lightgbm import LGBMRegressor
+import numpy as np
+import joblib
+# Enhanced Optuna objective function with pruning
+def objective_lgb_pruned(trial):
+    params = {
+        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
+        "max_depth": trial.suggest_int("max_depth", 3, 12),
+        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
+        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
+        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
+        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
+        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
+        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
+        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
+        "min_split_gain": trial.suggest_uniform("min_split_gain", 0, 1.0),
+        "random_state": 42,
+        "verbose": -1  # Suppress LightGBM warnings
+    }
+    # Use KFold for manual cross-validation with pruning
+    kf = KFold(n_splits=3, shuffle=True, random_state=42)
+    scores = []
+    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
+        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
+        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
+        # Build pipeline
+        pipeline_lgb = Pipeline([
+            ("preprocessor", preprocessor),
+            ("regressor", LGBMRegressor(**params))
+        ])
+        # Fit and predict
+        pipeline_lgb.fit(X_fold_train, y_fold_train)
+        y_pred = pipeline_lgb.predict(X_fold_val)
+        score = r2_score(y_fold_val, y_pred)
+        scores.append(score)
+        # Report intermediate value for pruning
+        trial.report(score, fold)
+        # Check if trial should be pruned
+        if trial.should_prune():
+            raise optuna.TrialPruned()
+    return np.mean(scores)
+# Create study with pruning
+study_lgb_pruned = optuna.create_study(
+    direction="maximize",
+    pruner=optuna.pruners.MedianPruner(
+        n_startup_trials=10,  # Number of trials before pruning starts
+        n_warmup_steps=5,     # Number of steps before considering pruning
+        interval_steps=1      # Interval between pruning checks
+    ),
+    sampler=optuna.samplers.TPESampler(
+        n_startup_trials=20,
+        n_ei_candidates=24,
+        seed=42
+    )
+)
+# Optimize with more trials since pruning makes it faster
+study_lgb_pruned.optimize(objective_lgb_pruned, n_trials=100)
+print("Optuna LightGBM (with pruning) R²:", study_lgb_pruned.best_value)
+print("Best hyperparameters:", study_lgb_pruned.best_params)
+print("Number of pruned trials:", len([t for t in study_lgb_pruned.trials if t.state == optuna.trial.TrialState.PRUNED]))
+# Train final model
+best_params = study_lgb_pruned.best_params
+lgb_final_pruned = Pipeline([
+    ("preprocessor", preprocessor),
+    ("regressor", LGBMRegressor(**best_params))
+])
+lgb_final_pruned.fit(X_train, y_train)
+# Evaluate on test set
+y_pred_test = lgb_final_pruned.predict(X_test)
+test_r2 = r2_score(y_test, y_pred_test)
+test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+import joblib
+# 1) Save the pruned LightGBM pipeline
+model_filename = 'model.joblib'
+joblib.dump(lgb_final_pruned, model_filename)
+print(f"Model exported to {model_filename}")
+# 2) (Optional) In Colab, download directly:
+from google.colab import files
+files.download(model_filename)
+# 👇 Paste this after your training cell 👇
+import numpy as np
+import matplotlib.pyplot as plt
+from IPython.display import display
+# 1) Recover your teams & specialties from df
+teams = sorted(df['Team'].unique())
+specialty_map = dict(zip(df['Team'], df['Specialty']))
+# 2) Define the example task you want to test
+example_task = {
+    'ProductType':         'Mothball',
+    'TaskType':            'Packaging',
+    'OrderQuantity':       120,
+    'DeadlineDays':        1,
+    'ExperienceYears':     6,
+    'AvgTaskTime_Minutes': 28.0,
+    'ErrorRate':           0.05,
+    'TrainingHours':       20.0,
+    'DayNumber':           2,
+    'ThroughputRate':      120 / 28.0,
+    'TimePressure':        120 / (4 * 28.0),
+    'PriorityLevel':       3
+}
+# 3) Build a DataFrame with one row per team
+rows = []
+for team in teams:
+    r = example_task.copy()
+    r['Team'] = team
+    r['Specialty'] = specialty_map[team]
+    rows.append(r)
+test_df = pd.DataFrame(rows)
+# 4) Predict & rank
+test_df['PredictedProductivity'] = pipeline.predict(test_df)
+ranked = test_df.sort_values('PredictedProductivity', ascending=False).reset_index(drop=True)
+# 5) Display the table
+print("🏆 Team Productivity Rankings:")
+display(ranked[['Team','PredictedProductivity']])
+# 6) Optional: plot a horizontal bar chart
+plt.figure(figsize=(8,5))
+plt.barh(ranked['Team'], ranked['PredictedProductivity'], color='steelblue')
+plt.gca().invert_yaxis()
+plt.xlabel('Predicted Productivity')
+plt.title('Team Ranking for Example Task')
+plt.grid(axis='x', linestyle='--', alpha=0.5)
+plt.tight_layout()
+plt.show()