nodronm committed on
Commit
2e6ad36
·
verified ·
1 Parent(s): ce4f969

Update aap.py

Browse files
Files changed (1) hide show
  1. aap.py +64 -433
aap.py CHANGED
@@ -1,440 +1,71 @@
1
- # -*- coding: utf-8 -*-
2
- """AAP.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1rxnN6J5ojM0HFXh5HxHo9AF4oOfq_fwM
8
- """
9
-
10
- import pandas as pd
11
  import numpy as np
12
- from sklearn.impute import SimpleImputer
13
- from sklearn.model_selection import train_test_split
14
- from sklearn.compose import ColumnTransformer
15
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
16
- from sklearn.pipeline import Pipeline
17
- from xgboost import XGBRegressor
18
-
19
- try:
20
- # Google Colab: upload via picker
21
- from google.colab import files
22
- uploaded = files.upload() # select minimal_messy_task_performance.csv
23
- import io
24
- df = pd.read_csv(io.BytesIO(uploaded['dataset.csv']))
25
- except ModuleNotFoundError:
26
- df = pd.read_csv('dataset.csv')
27
-
28
- df.shape
29
-
30
- df.head()
31
-
32
- df['Team'] = df['Team'].str.lower().fillna('team_unknown')
33
-
34
- imp = SimpleImputer(strategy='median')
35
- df['ErrorRate'] = imp.fit_transform(df[['ErrorRate']])
36
-
37
- df = df[df['ProductivityScore'] > 0].reset_index(drop=True)
38
- print("Remaining rows:", df.shape[0])
39
-
40
- df.head()
41
-
42
- df['ThroughputRate'] = df['OrderQuantity'] / df['AvgTaskTime_Minutes']
43
- df['TimePressure'] = df['OrderQuantity'] / (df['DeadlineDays'].replace(0, 1) * df['AvgTaskTime_Minutes'])
44
- priority_map = {'High': 3, 'Medium': 2, 'Low': 1}
45
- df['PriorityLevel'] = (df['Priority'].str.capitalize()
46
- .map(priority_map).fillna(1).astype(int))
47
- df.drop('Priority', axis=1, inplace=True)
48
-
49
- df.head(10)
50
-
51
- X = df.drop('ProductivityScore', axis=1)
52
- y = df['ProductivityScore']
53
- X_train, X_test, y_train, y_test = train_test_split(
54
- X, y, test_size=0.2, random_state=42
55
- )
56
-
57
- cat_cols = ['Team','ProductType','TaskType']
58
- num_cols = ['OrderQuantity','DeadlineDays','ExperienceYears','AvgTaskTime_Minutes',
59
- 'ErrorRate','TrainingHours','DayNumber','ThroughputRate','TimePressure','PriorityLevel']
60
-
61
- preprocessor = ColumnTransformer([
62
- ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
63
- ('num', StandardScaler(), num_cols)
64
- ])
65
-
66
- pipeline = Pipeline([
67
- ('preprocessor', preprocessor),
68
- ('regressor', XGBRegressor(
69
- objective='reg:squarederror',
70
- random_state=42,
71
- tree_method='hist'
72
- ))
73
- ])
74
-
75
- pipeline.fit(X_train, y_train)
76
-
77
- # Step 7: Evaluate on test set
78
- from sklearn.metrics import r2_score, mean_squared_error
79
- y_pred = pipeline.predict(X_test)
80
- print(f"Test R²: {r2_score(y_test, y_pred):.4f}")
81
- print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
82
-
83
- # Step 1: Define hyperparameter search space
84
- from scipy.stats import randint, uniform, loguniform
85
-
86
- param_dist = {
87
- 'regressor__n_estimators': randint(100, 1000),
88
- 'regressor__max_depth': randint(3, 15),
89
- 'regressor__learning_rate': uniform(0.01, 0.29),
90
- 'regressor__subsample': uniform(0.5, 0.5),
91
- 'regressor__colsample_bytree': uniform(0.5, 0.5),
92
- 'regressor__gamma': uniform(0, 0.5),
93
- 'regressor__reg_alpha': loguniform(1e-3, 1e2),
94
- 'regressor__reg_lambda': loguniform(1e-3, 1e2),
95
- 'regressor__min_child_weight': randint(1, 10),
96
- }
97
-
98
- # Step 2: Set up RandomizedSearchCV
99
- from sklearn.model_selection import RandomizedSearchCV
100
-
101
- search = RandomizedSearchCV(
102
- estimator=pipeline,
103
- param_distributions=param_dist,
104
- n_iter=50, # number of parameter settings to sample
105
- scoring='r2',
106
- cv=3,
107
- n_jobs=-1,
108
- verbose=1,
109
- random_state=42
110
- )
111
-
112
- # Step 3: Run the hyperparameter search
113
- search.fit(X_train, y_train)
114
-
115
- # Step 4: Inspect the best parameters & CV score
116
- print("🔍 Best parameters:", search.best_params_)
117
- print(f"Best CV R²: {search.best_score_:.4f}")
118
-
119
- # Step 5: Evaluate the tuned model on the test set
120
- best_model = search.best_estimator_
121
- from sklearn.metrics import r2_score, mean_squared_error
122
-
123
- y_pred = best_model.predict(X_test)
124
- print(f"Test R²: {r2_score(y_test, y_pred):.4f}")
125
- print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
126
-
127
- from sklearn.model_selection import GridSearchCV
128
- from sklearn.metrics import r2_score, mean_squared_error
129
  import joblib
130
 
131
- # 1) Extract your best random‐search parameters
132
- best = search.best_params_
133
-
134
- # 2) Create a tight grid around them
135
- param_grid = {
136
- 'regressor__n_estimators': [
137
- max(100, best['regressor__n_estimators'] - 100),
138
- best['regressor__n_estimators'],
139
- best['regressor__n_estimators'] + 100
140
- ],
141
- 'regressor__max_depth': [
142
- max(3, best['regressor__max_depth'] - 2),
143
- best['regressor__max_depth'],
144
- best['regressor__max_depth'] + 2
145
- ],
146
- 'regressor__learning_rate': [
147
- best['regressor__learning_rate'] * 0.5,
148
- best['regressor__learning_rate'],
149
- best['regressor__learning_rate'] * 1.5
150
- ],
 
 
 
 
 
 
151
  }
152
 
153
- # 3) Set up GridSearchCV
154
- grid_search = GridSearchCV(
155
- estimator=pipeline,
156
- param_grid=param_grid,
157
- scoring='r2',
158
- cv=3, # 3-fold CV
159
- n_jobs=-1,
160
- verbose=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  )
162
 
163
- # 4) Run grid search on training set
164
- grid_search.fit(X_train, y_train)
165
-
166
- # 5) Evaluate on test set
167
- y_pred = grid_search.predict(X_test)
168
- print("Grid Search Best R²:", r2_score(y_test, y_pred))
169
- print("Grid Search MSE: ", mean_squared_error(y_test, y_pred))
170
-
171
- # 6) Save the final, tuned model
172
- joblib.dump(grid_search.best_estimator_, 'task_distribution_model_grid_tuned.joblib')
173
-
174
- !pip install optuna
175
- import optuna
176
-
177
- def objective(trial):
178
- params = {
179
- 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
180
- 'max_depth': trial.suggest_int('max_depth', 3, 15),
181
- 'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
182
- 'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
183
- 'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
184
- 'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10),
185
- 'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10),
186
- }
187
- model = Pipeline([
188
- ('preprocessor', preprocessor),
189
- ('regressor', XGBRegressor(**params, tree_method='hist', random_state=42))
190
- ])
191
- from sklearn.model_selection import cross_val_score
192
- score = cross_val_score(model, X_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
193
- return score
194
-
195
- study = optuna.create_study(direction='maximize')
196
- study.optimize(objective, n_trials=50)
197
- print("Optuna best R²:", study.best_value)
198
- print(" Best params:", study.best_params)
199
-
200
- from sklearn.pipeline import Pipeline
201
- from lightgbm import LGBMRegressor
202
- from sklearn.model_selection import RandomizedSearchCV
203
- from scipy.stats import randint, uniform
204
-
205
- # 1a) Build a LightGBM pipeline
206
- lgb_pipeline = Pipeline([
207
- ('preprocessor', preprocessor),
208
- ('regressor', LGBMRegressor(random_state=42))
209
- ])
210
-
211
- # 1b) Define a random search space
212
- param_dist_lgb = {
213
- 'regressor__n_estimators': randint(100, 1000),
214
- 'regressor__max_depth': randint(3, 15),
215
- 'regressor__learning_rate':uniform(0.01, 0.29),
216
- 'regressor__subsample': uniform(0.5, 0.5),
217
- 'regressor__colsample_bytree': uniform(0.5, 0.5),
218
- 'regressor__reg_alpha': uniform(0, 1),
219
- 'regressor__reg_lambda': uniform(0, 1),
220
- }
221
-
222
- search_lgb = RandomizedSearchCV(
223
- lgb_pipeline,
224
- param_distributions=param_dist_lgb,
225
- n_iter=50,
226
- scoring='r2',
227
- cv=3,
228
- n_jobs=-1,
229
- random_state=42,
230
- verbose=1
231
- )
232
-
233
- search_lgb.fit(X_train, y_train)
234
- print("LightGBM Best CV R²:", search_lgb.best_score_)
235
- # Evaluate on test
236
- y_pred = search_lgb.predict(X_test)
237
- from sklearn.metrics import r2_score, mean_squared_error
238
- print(" LightGBM Test R²:", r2_score(y_test, y_pred))
239
- print(" LightGBM Test MSE:", mean_squared_error(y_test, y_pred))
240
-
241
- import optuna
242
- from sklearn.pipeline import Pipeline
243
- from sklearn.model_selection import cross_val_score
244
- from lightgbm import LGBMRegressor
245
- from sklearn.metrics import r2_score, mean_squared_error
246
- import joblib
247
-
248
- # Optuna objective function for LightGBM
249
- def objective_lgb(trial):
250
- params = {
251
- "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
252
- "max_depth": trial.suggest_int("max_depth", 3, 12),
253
- "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
254
- "num_leaves": trial.suggest_int("num_leaves", 20, 200),
255
- "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
256
- "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
257
- "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
258
- "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
259
- "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
260
- "min_split_gain": trial.suggest_uniform("min_split_gain", 0, 1.0),
261
- "random_state": 42
262
- }
263
- # Build pipeline with current params
264
- pipeline_lgb = Pipeline([
265
- ("preprocessor", preprocessor),
266
- ("regressor", LGBMRegressor(**params))
267
- ])
268
- # 3-fold CV on training set
269
- scores = cross_val_score(pipeline_lgb, X_train, y_train,
270
- scoring="r2", cv=3, n_jobs=-1)
271
- return scores.mean()
272
-
273
- # Create and run the study
274
- study_lgb = optuna.create_study(direction="maximize")
275
- study_lgb.optimize(objective_lgb, n_trials=50)
276
-
277
- print("🔍 Optuna LightGBM best R²:", study_lgb.best_value)
278
- print("✨ Best hyperparameters:", study_lgb.best_params)
279
-
280
- # Retrain final model on full training data
281
- best_params = study_lgb.best_params
282
- lgb_final = Pipeline([
283
- ("preprocessor", preprocessor),
284
- ("regressor", LGBMRegressor(**best_params))
285
- ])
286
-
287
- !pip install optuna
288
- import optuna
289
- from sklearn.pipeline import Pipeline
290
- from sklearn.model_selection import KFold, cross_val_score
291
- from sklearn.metrics import r2_score, mean_squared_error
292
- from lightgbm import LGBMRegressor
293
- import numpy as np
294
- import joblib
295
-
296
- # Enhanced Optuna objective function with pruning
297
- def objective_lgb_pruned(trial):
298
- params = {
299
- "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
300
- "max_depth": trial.suggest_int("max_depth", 3, 12),
301
- "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
302
- "num_leaves": trial.suggest_int("num_leaves", 20, 200),
303
- "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
304
- "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
305
- "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
306
- "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
307
- "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
308
- "min_split_gain": trial.suggest_uniform("min_split_gain", 0, 1.0),
309
- "random_state": 42,
310
- "verbose": -1 # Suppress LightGBM warnings
311
- }
312
-
313
- # Use KFold for manual cross-validation with pruning
314
- kf = KFold(n_splits=3, shuffle=True, random_state=42)
315
- scores = []
316
-
317
- for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
318
- X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
319
- y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
320
-
321
- # Build pipeline
322
- pipeline_lgb = Pipeline([
323
- ("preprocessor", preprocessor),
324
- ("regressor", LGBMRegressor(**params))
325
- ])
326
-
327
- # Fit and predict
328
- pipeline_lgb.fit(X_fold_train, y_fold_train)
329
- y_pred = pipeline_lgb.predict(X_fold_val)
330
- score = r2_score(y_fold_val, y_pred)
331
- scores.append(score)
332
-
333
- # Report intermediate value for pruning
334
- trial.report(score, fold)
335
-
336
- # Check if trial should be pruned
337
- if trial.should_prune():
338
- raise optuna.TrialPruned()
339
-
340
- return np.mean(scores)
341
-
342
- # Create study with pruning
343
- study_lgb_pruned = optuna.create_study(
344
- direction="maximize",
345
- pruner=optuna.pruners.MedianPruner(
346
- n_startup_trials=10, # Number of trials before pruning starts
347
- n_warmup_steps=5, # Number of steps before considering pruning
348
- interval_steps=1 # Interval between pruning checks
349
- ),
350
- sampler=optuna.samplers.TPESampler(
351
- n_startup_trials=20,
352
- n_ei_candidates=24,
353
- seed=42
354
- )
355
- )
356
-
357
- # Optimize with more trials since pruning makes it faster
358
- study_lgb_pruned.optimize(objective_lgb_pruned, n_trials=100)
359
-
360
- print("Optuna LightGBM (with pruning) R²:", study_lgb_pruned.best_value)
361
- print("Best hyperparameters:", study_lgb_pruned.best_params)
362
- print("Number of pruned trials:", len([t for t in study_lgb_pruned.trials if t.state == optuna.trial.TrialState.PRUNED]))
363
-
364
- # Train final model
365
- best_params = study_lgb_pruned.best_params
366
- lgb_final_pruned = Pipeline([
367
- ("preprocessor", preprocessor),
368
- ("regressor", LGBMRegressor(**best_params))
369
- ])
370
-
371
- lgb_final_pruned.fit(X_train, y_train)
372
-
373
- # Evaluate on test set
374
- y_pred_test = lgb_final_pruned.predict(X_test)
375
- test_r2 = r2_score(y_test, y_pred_test)
376
- test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
377
-
378
- import joblib
379
-
380
- # 1) Save the pruned LightGBM pipeline
381
- model_filename = 'model.joblib'
382
- joblib.dump(lgb_final_pruned, model_filename)
383
- print(f"Model exported to {model_filename}")
384
-
385
- # 2) (Optional) In Colab, download directly:
386
- from google.colab import files
387
- files.download(model_filename)
388
-
389
- # 👇 Paste this after your training cell 👇
390
-
391
- import numpy as np
392
- import matplotlib.pyplot as plt
393
- from IPython.display import display
394
-
395
- # 1) Recover your teams & specialties from df
396
- teams = sorted(df['Team'].unique())
397
- specialty_map = dict(zip(df['Team'], df['Specialty']))
398
-
399
- # 2) Define the example task you want to test
400
- example_task = {
401
- 'ProductType': 'Mothball',
402
- 'TaskType': 'Packaging',
403
- 'OrderQuantity': 120,
404
- 'DeadlineDays': 1,
405
- 'ExperienceYears': 6,
406
- 'AvgTaskTime_Minutes': 28.0,
407
- 'ErrorRate': 0.05,
408
- 'TrainingHours': 20.0,
409
- 'DayNumber': 2,
410
- 'ThroughputRate': 120 / 28.0,
411
- 'TimePressure': 120 / (4 * 28.0),
412
- 'PriorityLevel': 3
413
- }
414
-
415
- # 3) Build a DataFrame with one row per team
416
- rows = []
417
- for team in teams:
418
- r = example_task.copy()
419
- r['Team'] = team
420
- r['Specialty'] = specialty_map[team]
421
- rows.append(r)
422
- test_df = pd.DataFrame(rows)
423
-
424
- # 4) Predict & rank
425
- test_df['PredictedProductivity'] = pipeline.predict(test_df)
426
- ranked = test_df.sort_values('PredictedProductivity', ascending=False).reset_index(drop=True)
427
-
428
- # 5) Display the table
429
- print("🏆 Team Productivity Rankings:")
430
- display(ranked[['Team','PredictedProductivity']])
431
-
432
- # 6) Optional: plot a horizontal bar chart
433
- plt.figure(figsize=(8,5))
434
- plt.barh(ranked['Team'], ranked['PredictedProductivity'], color='steelblue')
435
- plt.gca().invert_yaxis()
436
- plt.xlabel('Predicted Productivity')
437
- plt.title('Team Ranking for Example Task')
438
- plt.grid(axis='x', linestyle='--', alpha=0.5)
439
- plt.tight_layout()
440
- plt.show()
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
 
 
 
 
 
 
 
 
3
  import numpy as np
4
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import joblib
6
 
7
# Load the serialized model pipeline and the dataset (used only to enumerate teams).
model = joblib.load("xgb_model.joblib")
df = pd.read_csv("worker_productivity.csv")  # Make sure this is uploaded to Hugging Face Space

# Get unique teams
teams = sorted(df['team'].unique())

# Define a base task to simulate a prediction input.
# NOTE(review): these keys must match the feature columns the saved pipeline
# was trained on — verify against the training script.
base_task = {
    'quarter': 'Q2',
    'department': 'sewing',
    'day': 'Monday',
    'no_of_workers': 48,
    'incentive': 2.5,
    'idle_time': 0.3,
    'idle_men': 4,
    'smv': 30.0,
    'month': 5,
    'day_of_week': 0,  # presumably 0 = Monday, consistent with 'day' above — confirm encoding
    'is_weekend': 0,
    'smv_per_worker': 30.0 / 48,  # derived feature: smv / no_of_workers
    'effort_index': 30.0 + 2.5 + 1.0 - 0.3,  # smv + incentive + 1.0 - idle_time; the 1.0 term is unexplained — TODO confirm against training code
    'log_wip': np.log1p(50),  # log1p of an assumed work-in-progress of 50 — confirm
    'log_overtime': np.log1p(1.0),
    'no_of_style_change': 0,
    'targeted_productivity': 0.75
}
34
 
35
# Prediction function

def predict():
    """Predict productivity for every team on the fixed base task.

    Reads the module-level ``model``, ``teams``, and ``base_task``.

    Returns:
        matplotlib.figure.Figure: horizontal bar chart ranking teams by
        predicted productivity, best first (consumed by ``gr.Plot``).
    """
    # Build one row per team and predict in a single batched call:
    # one pass through the model pipeline instead of len(teams) passes.
    batch_df = pd.DataFrame([{**base_task, 'team': team} for team in teams])
    preds = model.predict(batch_df)

    # Rank teams by predicted productivity, highest first.
    team_scores_df = pd.DataFrame(
        {"Team": teams, "Predicted Productivity": preds}
    ).sort_values(by="Predicted Productivity", ascending=False)

    # Plot with the object-oriented API and call tight_layout on this
    # figure only — avoids pyplot global state, which could interfere
    # across concurrent Gradio requests.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(
        team_scores_df["Team"].astype(str),
        team_scores_df["Predicted Productivity"],
        color='skyblue',
    )
    ax.set_xlabel("Predicted Productivity")
    ax.set_title("Predicted Productivity by Team for Custom Task")
    ax.invert_yaxis()  # best-ranked team at the top of the chart
    fig.tight_layout()

    return fig
60
+
61
# Gradio UI: a zero-input interface — each run re-computes the ranking
# plot for the fixed task hard-coded in base_task above.
demo = gr.Interface(
    fn=predict,
    inputs=[],  # no user-controlled inputs; the task is fixed
    outputs=[gr.Plot(label="Team Productivity Rankings")],
    live=False,  # recompute only on explicit submit, not on change
    title="Worker Productivity Predictor",
    description="Generates predicted productivity scores for each team on a fixed custom task."
)

# Start the Gradio server (blocking call; entry point of this script).
demo.launch()