Spaces:
Sleeping
Sleeping
| """Deterministic improvement evaluation for CommitmentOS. | |
| Runs two protocols on all 15 scenarios: | |
| 1) baseline policy: immediate submit_plan | |
| 2) improved policy: deterministic scenario-specific action traces | |
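| Outputs: | |
| - artifacts/evals/baseline_eval.json | |
| - artifacts/evals/improved_eval.json | |
| - artifacts/evals/trained_eval.json (same content as improved_eval.json) | |
| - artifacts/evals/eval_protocol.json | |
| - artifacts/evals/comparison.csv | |
| - artifacts/evals/summary.json | |
| - artifacts/evals/case_study_hard_011.md | |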
| """ | |
| from __future__ import annotations | |
| import csv | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from statistics import mean, median | |
| from typing import Any | |
# Directory one level above the folder containing this file. It is put at the
# front of sys.path (once) so the project-local imports that follow
# (`models`, `server.*`) resolve when this file is executed directly.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
| from models import CommitmentAction | |
| from server.environment import CommitmentEnvironment | |
| from server.tasks import get_all_scenarios | |
# Destination directory for every artifact this script writes. The path is
# relative, so it resolves against the current working directory rather than
# PROJECT_ROOT.
ARTIFACT_DIR = Path("artifacts/evals")
# Created at import time (including parents); a pre-existing directory is fine.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
# Seed handed to `env.reset` for every episode and recorded in eval_protocol.json.
EVAL_SEED = 42
# Step budget: `run_episode` only appends its final `submit_plan` while the
# trace is shorter than this; the value is also recorded in eval_protocol.json.
MAX_STEPS = 12
def _action(**fields: Any) -> CommitmentAction:
    """Build a ``CommitmentAction`` from keyword arguments.

    Every keyword is forwarded unchanged to the ``CommitmentAction``
    constructor; this exists purely as a short alias for that call.
    """
    built = CommitmentAction(**fields)
    return built
# Scripted action traces for the "improved" policy, keyed by scenario id.
# Each value is the ordered list of actions replayed for that scenario; the
# actions are constructed once, when this module is imported. Scenario ids
# without an entry fall back to an empty trace at lookup time.
IMPROVED_ACTIONS: dict[str, list[CommitmentAction]] = {
    # --- easy scenarios ---
    "easy_001": [
        _action(action_type="reschedule_event", event_id="evt_2", new_time="15:00"),
        _action(
            action_type="send_email",
            to="Team",
            subject="Standup rescheduled",
            body="Hi team, rescheduling standup to 3:00 PM to avoid conflict with VP 1-on-1.",
        ),
    ],
    "easy_002": [
        _action(action_type="book_restaurant", restaurant_name="Bella Italia"),
    ],
    "easy_003": [
        _action(
            action_type="send_email",
            to="Client_Jones",
            subject="Available meeting slots",
            body="Available slots on 2026-04-25: 09:00, 11:00, and 16:00. Please choose one.",
        ),
    ],
    "easy_004": [
        _action(action_type="cancel_event", event_id="evt_30"),
        _action(
            action_type="send_email",
            to="Team",
            subject="Weekly sync cancelled",
            body="Sorry team, cancelling today's sync due to a personal appointment conflict.",
        ),
    ],
    "easy_005": [
        _action(
            action_type="send_email",
            to="VP_Chen",
            subject="Q3 board numbers",
            body="Sharing Q3 numbers for board deck. I will send the full table shortly.",
        ),
        _action(
            action_type="send_email",
            to="Client_Jones",
            subject="Contract review update",
            body="I reviewed the contract and will send comments by end of day.",
        ),
    ],
    # --- medium scenarios ---
    "med_006": [
        _action(action_type="reschedule_event", event_id="evt_40", new_time="15:00"),
        _action(action_type="reschedule_event", event_id="evt_41", new_time="13:00"),
        _action(
            action_type="send_email",
            to="Team",
            subject="Sprint planning rescheduled",
            body="Sprint planning moved to 1:00 PM due to cascading schedule changes.",
        ),
    ],
    "med_007": [
        _action(action_type="book_restaurant", restaurant_name="Green Garden"),
        _action(
            action_type="send_email",
            to="Team",
            subject="Dinner reservation confirmed",
            body="Booked Green Garden for tonight. Vegan and nut-free options available.",
        ),
    ],
    "med_008": [
        _action(
            action_type="send_email",
            to="VP_Chen",
            subject="Q3 numbers ETA",
            body="I am currently in a client call until 3:15 PM. I will send Q3 numbers right after the call.",
        ),
    ],
    "med_009": [
        _action(
            action_type="send_email",
            to="Bob",
            subject="Retrospective moved to next week",
            body="Let's reschedule the retrospective to next week. Thursday works for me.",
        ),
    ],
    "med_010": [
        _action(
            action_type="schedule_meeting",
            title="Client Demo",
            date="2026-04-26",
            time="10:00",
            participants=["Client_Jones"],
            duration_min=60,
            location="Room A",
        ),
        _action(action_type="book_restaurant", restaurant_name="Garden Bistro"),
        _action(
            action_type="send_email",
            to="Client_Jones",
            subject="Visit itinerary",
            body="Itinerary: 10am demo in Room A, then vegetarian lunch at Garden Bistro.",
        ),
    ],
    # --- hard scenarios ---
    "hard_011": [
        _action(action_type="cancel_event", event_id="evt_90"),
        _action(action_type="book_restaurant", restaurant_name="Sky Lounge"),
        _action(
            action_type="send_email",
            to="Team",
            subject="Happy hour reschedule",
            body="Sorry team, rescheduling happy hour due to urgent investor dinner tonight.",
        ),
        _action(
            action_type="send_email",
            to="VP_Chen",
            subject="Investor dinner booked",
            body="Booked Sky Lounge near airport with vegetarian options for Investor_Park.",
        ),
    ],
    "hard_012": [
        _action(action_type="reschedule_event", event_id="evt_101", new_time="15:00"),
        _action(action_type="reschedule_event", event_id="evt_102", new_time="16:00"),
        _action(
            action_type="send_email",
            to="VP_Lee",
            subject="Room conflict update",
            body="Moving your client demo to 3:00 PM due to Alpha room prioritization.",
        ),
        _action(
            action_type="send_email",
            to="VP_Kumar",
            subject="Room conflict update",
            body="Moving your team retro to 4:00 PM due to board prep priority in Alpha.",
        ),
    ],
    "hard_013": [
        _action(action_type="reschedule_event", event_id="evt_111", new_time="14:00"),
        _action(action_type="reschedule_event", event_id="evt_112", new_time="11:00"),
        _action(action_type="book_restaurant", restaurant_name="Sakura Garden"),
        _action(
            action_type="send_email",
            to="Client_Jones",
            subject="Lunch moved",
            body="Sorry, moving lunch to 11:00 due to board prep schedule changes.",
        ),
        _action(
            action_type="send_email",
            to="VP_Chen",
            subject="Board prep confirmed",
            body="Confirmed board prep at 2 PM tomorrow.",
        ),
    ],
    "hard_014": [
        # NOTE(review): 2026-04-24 is a Friday in the Gregorian calendar, while
        # both emails below announce "Thursday" — confirm against the scenario
        # definition which of the two the grader expects.
        _action(
            action_type="schedule_meeting",
            title="Client_Jones sync with VP_Chen",
            date="2026-04-24",
            time="15:00",
            participants=["Client_Jones", "VP_Chen"],
            duration_min=30,
            location="Room C",
        ),
        _action(
            action_type="send_email",
            to="VP_Chen",
            subject="Proposed slots",
            body="Client_Jones is available Thursday/Friday. Scheduled Thursday 3:00 PM.",
        ),
        _action(
            action_type="send_email",
            to="Client_Jones",
            subject="Meeting confirmation",
            body="Confirmed meeting Thursday at 3:00 PM with VP_Chen.",
        ),
    ],
    "hard_015": [
        _action(action_type="cancel_event", event_id="evt_130"),
        _action(
            action_type="send_email",
            to="Team",
            subject="Lunch cancelled due to incident",
            body="Cancelling lunch due to production incident in payment service (503 errors).",
        ),
        _action(
            action_type="send_email",
            to="Client_Jones",
            subject="Demo reschedule request",
            body="Apologies, need to reschedule demo due to production incident response.",
        ),
        _action(
            action_type="send_email",
            to="VP_Chen",
            subject="Incident update and 1-on-1",
            body="On-call for payment incident; may need to reschedule 1-on-1 depending on mitigation time.",
        ),
    ],
}
def run_episode(task_id: str, actions: list[CommitmentAction]) -> dict[str, Any]:
    """Replay a scripted action list against a fresh environment.

    A new ``CommitmentEnvironment`` is reset on ``task_id`` with ``EVAL_SEED``.
    The given actions are stepped in order until the list is exhausted or an
    observation reports ``done``. If the episode is still open and fewer than
    ``MAX_STEPS`` steps were recorded, one ``submit_plan`` action is appended.

    Args:
        task_id: Scenario identifier passed to ``env.reset``.
        actions: Actions to replay; an empty list means "submit immediately".

    Returns:
        A dict with the final observation's reward, breakdown, feedback and
        difficulty, the environment state's step/commitment/violation counts,
        a ``success`` flag (final reward >= 0.6) and the per-step ``trace``.
    """

    def _entry(number: int, payload: Any, result: Any) -> dict[str, Any]:
        # One trace record: what was sent and what the environment answered.
        return {
            "step": number,
            "action": payload,
            "reward": result.reward,
            "done": result.done,
            "tool_result": result.tool_result,
        }

    environment = CommitmentEnvironment()
    observation = environment.reset(task_id=task_id, seed=EVAL_SEED)
    steps: list[dict[str, Any]] = []

    for scripted in actions:
        observation = environment.step(scripted)
        steps.append(_entry(len(steps) + 1, scripted.model_dump(), observation))
        if observation.done:
            break

    # Close the episode ourselves when the script did not and budget remains.
    still_open = not observation.done
    if still_open and len(steps) < MAX_STEPS:
        observation = environment.step(CommitmentAction(action_type="submit_plan"))
        steps.append(
            _entry(len(steps) + 1, {"action_type": "submit_plan"}, observation)
        )

    final_state = environment.state
    report: dict[str, Any] = {"task_id": task_id}
    report["difficulty"] = observation.difficulty
    report["final_reward"] = observation.reward
    report["reward_breakdown"] = observation.reward_breakdown
    report["feedback"] = observation.feedback
    report["steps_used"] = final_state.step_count
    report["commitment_count"] = final_state.commitment_count
    report["violation_count"] = final_state.violation_count
    report["success"] = observation.reward >= 0.6
    report["trace"] = steps
    return report
def evaluate_all() -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Run the baseline and improved protocols on every known scenario.

    Scenario ids come from ``get_all_scenarios()`` and are visited in sorted
    order. For each id the baseline episode (empty action list, i.e. immediate
    submit) runs first, followed by the improved episode using the matching
    ``IMPROVED_ACTIONS`` entry (or an empty list when there is none).

    Returns:
        ``(baseline_results, improved_results)``, both ordered by scenario id.
    """
    ordered_ids = sorted(get_all_scenarios().keys())
    # Tuple elements evaluate left to right, so baseline still runs first.
    paired = [
        (
            run_episode(scenario_id, []),
            run_episode(scenario_id, IMPROVED_ACTIONS.get(scenario_id, [])),
        )
        for scenario_id in ordered_ids
    ]
    baseline_runs = [baseline for baseline, _ in paired]
    improved_runs = [improved for _, improved in paired]
    return baseline_runs, improved_runs
# Column order of comparison.csv; fixed here so the header can be written even
# when there are no result rows.
_COMPARISON_FIELDS = (
    "task_id",
    "difficulty",
    "baseline_reward",
    "improved_reward",
    "reward_delta",
    "baseline_steps",
    "improved_steps",
    "step_delta",
    "baseline_violations",
    "improved_violations",
    "violation_delta",
    "baseline_success",
    "improved_success",
)


def _mean_or_none(values: list[Any]) -> Any:
    """Return ``mean(values)``, or ``None`` when *values* is empty."""
    return mean(values) if values else None


def _rounded(value: Any) -> Any:
    """Round to 4 decimals, passing ``None`` through unchanged."""
    return None if value is None else round(value, 4)


def _rounded_gap(after: Any, before: Any) -> Any:
    """Return ``round(after - before, 4)``, or ``None`` if either side is missing."""
    if after is None or before is None:
        return None
    return round(after - before, 4)


def _comparison_rows(
    baseline_results: list[dict[str, Any]],
    improved_results: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Join baseline and improved results on ``task_id`` into flat CSV rows."""
    improved_by_task = {row["task_id"]: row for row in improved_results}
    rows: list[dict[str, Any]] = []
    for base in baseline_results:
        imp = improved_by_task[base["task_id"]]
        rows.append(
            {
                "task_id": base["task_id"],
                "difficulty": base["difficulty"],
                "baseline_reward": round(base["final_reward"], 4),
                "improved_reward": round(imp["final_reward"], 4),
                "reward_delta": round(imp["final_reward"] - base["final_reward"], 4),
                "baseline_steps": base["steps_used"],
                "improved_steps": imp["steps_used"],
                "step_delta": imp["steps_used"] - base["steps_used"],
                "baseline_violations": base["violation_count"],
                "improved_violations": imp["violation_count"],
                "violation_delta": imp["violation_count"] - base["violation_count"],
                "baseline_success": int(base["success"]),
                "improved_success": int(imp["success"]),
            }
        )
    return rows


def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate comparison rows into the summary.json payload.

    Aggregates over an empty set of rows (overall, or for one difficulty
    bucket) are reported as ``None`` instead of raising ``StatisticsError``.
    """

    def pair(subset: list[dict[str, Any]], base_key: str, imp_key: str) -> tuple[Any, Any, Any]:
        # (rounded baseline mean, rounded improved mean, rounded difference)
        base_mean = _mean_or_none([r[base_key] for r in subset])
        imp_mean = _mean_or_none([r[imp_key] for r in subset])
        return _rounded(base_mean), _rounded(imp_mean), _rounded_gap(imp_mean, base_mean)

    reward_deltas = [r["reward_delta"] for r in rows]
    base_reward, imp_reward, reward_gap = pair(rows, "baseline_reward", "improved_reward")
    base_success, imp_success, success_gap = pair(rows, "baseline_success", "improved_success")
    base_viol, imp_viol, viol_gap = pair(rows, "baseline_violations", "improved_violations")
    base_steps, imp_steps, steps_gap = pair(rows, "baseline_steps", "improved_steps")

    summary: dict[str, Any] = {
        "task_count": len(rows),
        "baseline_mean_reward": base_reward,
        "improved_mean_reward": imp_reward,
        "mean_reward_delta": reward_gap,
        "median_reward_delta": _rounded(median(reward_deltas) if reward_deltas else None),
        "baseline_success_rate": base_success,
        "improved_success_rate": imp_success,
        "success_rate_delta": success_gap,
        "baseline_mean_violations": base_viol,
        "improved_mean_violations": imp_viol,
        "violation_delta": viol_gap,
        "baseline_mean_steps": base_steps,
        "improved_mean_steps": imp_steps,
        "step_delta": steps_gap,
        "tasks_with_positive_reward_delta": sum(1 for v in reward_deltas if v > 0),
        "tasks_with_no_reward_delta": sum(1 for v in reward_deltas if v == 0),
        "per_difficulty": {},
    }

    for difficulty in ("easy", "medium", "hard"):
        subset = [r for r in rows if r["difficulty"] == difficulty]
        sub_base_reward, sub_imp_reward, sub_reward_gap = pair(
            subset, "baseline_reward", "improved_reward"
        )
        sub_base_steps, sub_imp_steps, sub_steps_gap = pair(
            subset, "baseline_steps", "improved_steps"
        )
        summary["per_difficulty"][difficulty] = {
            "count": len(subset),
            "baseline_mean_reward": sub_base_reward,
            "improved_mean_reward": sub_imp_reward,
            "reward_delta": sub_reward_gap,
            "baseline_mean_steps": sub_base_steps,
            "improved_mean_steps": sub_imp_steps,
            "step_delta": sub_steps_gap,
        }
    return summary


def _render_case_study(base_hard: dict[str, Any], imp_hard: dict[str, Any]) -> str:
    """Render the hard_011 baseline-vs-improved markdown report."""
    lines = [
        "# Case Study: hard_011 (Investor Dinner Cascade)",
        "## Baseline (immediate submit)",
        f"- Reward: {base_hard['final_reward']:.4f}",
        f"- Steps: {base_hard['steps_used']}",
        f"- Violations: {base_hard['violation_count']}",
        f"- Feedback: {base_hard['feedback']}",
        "## Improved policy",
        f"- Reward: {imp_hard['final_reward']:.4f}",
        f"- Steps: {imp_hard['steps_used']}",
        f"- Violations: {imp_hard['violation_count']}",
        f"- Feedback: {imp_hard['feedback']}",
        "## Why improved policy scores higher",
        "- Resolves lower-priority personal conflict (`cancel_event evt_90`)",
        "- Preserves high-priority investor objective (`book_restaurant Sky Lounge`)",
        "- Renegotiates existing social commitment via communication (`send_email Team`)",
        "- Confirms delivery to executive stakeholder (`send_email VP_Chen`)",
    ]
    return "\n".join(lines) + "\n"


def write_artifacts(
    baseline_results: list[dict[str, Any]],
    improved_results: list[dict[str, Any]],
) -> None:
    """Write every evaluation artifact into ``ARTIFACT_DIR``.

    Files written, in order: ``baseline_eval.json``, ``improved_eval.json``,
    ``trained_eval.json``, ``eval_protocol.json``, ``comparison.csv``,
    ``summary.json`` and, when both result lists contain ``hard_011``,
    ``case_study_hard_011.md``. All files are written as UTF-8.

    Args:
        baseline_results: Per-task result dicts for the baseline policy, as
            produced by ``run_episode``.
        improved_results: Per-task result dicts for the improved policy; must
            contain every ``task_id`` present in ``baseline_results``.

    Raises:
        KeyError: If a baseline task has no matching improved result.
    """
    # Cheap and idempotent; protects against the directory having been removed
    # since import time.
    ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

    def dump_json(filename: str, payload: Any) -> None:
        (ARTIFACT_DIR / filename).write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )

    dump_json("baseline_eval.json", baseline_results)
    dump_json("improved_eval.json", improved_results)
    # NOTE(review): trained_eval.json receives the *improved* results verbatim;
    # presumably a downstream consumer expects that filename — confirm.
    dump_json("trained_eval.json", improved_results)
    dump_json(
        "eval_protocol.json",
        {
            "task_set": "easy_001..hard_015",
            "seed": EVAL_SEED,
            "max_steps": MAX_STEPS,
            "decode_config": {
                "temperature": 0.0,
                "top_p": 1.0,
                "max_new_tokens": 256,
            },
            "action_parser": "CommitmentAction pydantic schema",
        },
    )

    rows = _comparison_rows(baseline_results, improved_results)
    comparison_path = ARTIFACT_DIR / "comparison.csv"
    with comparison_path.open("w", newline="", encoding="utf-8") as f:
        # Explicit field list: the header is still emitted when `rows` is empty.
        writer = csv.DictWriter(f, fieldnames=list(_COMPARISON_FIELDS))
        writer.writeheader()
        writer.writerows(rows)

    dump_json("summary.json", _summarize(rows))

    base_hard = next((r for r in baseline_results if r["task_id"] == "hard_011"), None)
    imp_hard = next((r for r in improved_results if r["task_id"] == "hard_011"), None)
    if base_hard is None or imp_hard is None:
        # Nothing to compare when hard_011 was not evaluated; the remaining
        # artifacts above are still valid.
        return
    (ARTIFACT_DIR / "case_study_hard_011.md").write_text(
        _render_case_study(base_hard, imp_hard), encoding="utf-8"
    )
def main() -> None:
    """Run both evaluation protocols and persist their artifacts."""
    results = evaluate_all()
    baseline_runs, improved_runs = results
    write_artifacts(baseline_runs, improved_runs)
    print("Wrote evaluation artifacts to", ARTIFACT_DIR)


if __name__ == "__main__":
    main()