"""Generate synthetic workday data and train a workload classifier on it.

Run as a script, this module fits a random forest on the synthetic data,
prints the held-out accuracy and saves the fitted model to ``MODEL_PATH``.
"""

from typing import Optional, Tuple

import numpy as np
import pandas as pd

# Load-score cut points: strictly above LOW_THRESHOLD is "medium" (1),
# strictly above HIGH_THRESHOLD is "high" (2), everything else is "low" (0).
LOW_THRESHOLD = 0.33
HIGH_THRESHOLD = 0.66

# File the script entry point writes the fitted classifier to.
MODEL_PATH = "workload_model.joblib"


def generate_synthetic_data(
    n_samples: int = 5000,
    random_state: Optional[int] = 42,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Create a synthetic workday feature table with workload labels.

    Each row describes one simulated workday. The label is derived from a
    heuristic load score that grows with meetings, meeting hours, context
    switches and day length, and shrinks with deep-work blocks and breaks.

    Args:
        n_samples: Number of workdays (rows) to generate.
        random_state: Seed passed to ``np.random.RandomState``; the same
            seed always yields the same data.

    Returns:
        A ``(features, labels)`` pair. ``features`` is a DataFrame with the
        columns ``meetings_count``, ``total_meeting_hours``,
        ``context_switches``, ``deep_work_blocks``, ``break_minutes``,
        ``day_start_hour`` and ``day_end_hour``. ``labels`` is an integer
        array of length ``n_samples`` with 0 = low, 1 = medium, 2 = high.
    """
    rng = np.random.RandomState(random_state)

    # NOTE: the order of these draws determines the generated values for a
    # given seed; reordering them changes the data set.
    # ``randint`` excludes its upper bound.
    meetings_count = rng.randint(0, 12, size=n_samples)  # count, 0-11
    total_meeting_hours = rng.uniform(0, 9, size=n_samples)  # hours
    context_switches = rng.randint(0, 20, size=n_samples)  # count, 0-19
    deep_work_blocks = rng.randint(0, 5, size=n_samples)  # count, 0-4
    break_minutes = rng.randint(0, 120, size=n_samples)  # minutes, 0-119
    day_start_hour = rng.randint(7, 11, size=n_samples)  # hour of day, 7-10
    day_end_hour = rng.randint(14, 21, size=n_samples)  # hour of day, 14-20

    df = pd.DataFrame(
        {
            "meetings_count": meetings_count,
            "total_meeting_hours": total_meeting_hours,
            "context_switches": context_switches,
            "deep_work_blocks": deep_work_blocks,
            "break_minutes": break_minutes,
            "day_start_hour": day_start_hour,
            "day_end_hour": day_end_hour,
        }
    )

    # Heuristic "actual workload": a weighted sum of roughly normalised
    # features plus Gaussian noise, clipped into [0, 1] below.
    day_length = day_end_hour - day_start_hour
    load_score = (
        0.3 * (meetings_count / 10)
        + 0.25 * (total_meeting_hours / 8)
        + 0.2 * (context_switches / 20)
        + 0.15 * (day_length / 12)
        - 0.15 * (deep_work_blocks / 4)
        - 0.1 * (break_minutes / 120)
        + rng.normal(0, 0.05, size=n_samples)
    )
    load_score = np.clip(load_score, 0, 1)

    # Discretise into classes: each threshold that is exceeded adds one, so
    # the result is 0 (low), 1 (medium) or 2 (high).
    labels = (load_score > LOW_THRESHOLD).astype(int) + (
        load_score > HIGH_THRESHOLD
    ).astype(int)

    return df, labels


def main() -> None:
    """Train a random forest on the synthetic data and save it to disk."""
    # Imported here rather than at module level so that
    # ``generate_synthetic_data`` stays importable when only numpy and
    # pandas are installed; only this training step needs these packages.
    import joblib
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    X, y = generate_synthetic_data()

    # Stratify so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    clf = RandomForestClassifier(
        n_estimators=150,
        max_depth=8,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    acc = clf.score(X_test, y_test)
    print(f"Test accuracy: {acc:.3f}")

    joblib.dump(clf, MODEL_PATH)
    print(f"Saved model to {MODEL_PATH}")


if __name__ == "__main__":
    main()