| """ |
| evaluate.py — Compare baseline vs trained RL policy |
| ===================================================== |
| Run after train.py completes. |
| """ |
| from __future__ import annotations |
| import os |
| import json |
| import requests |
| from openai import OpenAI |
| from dotenv import load_dotenv |
|
|
# Runs before the lookups below so that any values it loads are visible to
# them (presumably from a local .env file -- see python-dotenv).
load_dotenv()


# --- Endpoints and credentials: each can be overridden via the environment ---

# Base URL of the environment server that exposes /health, /reset and /step.
ENV_URL = os.environ.get(
    "ENV_BASE_URL", "https://sejal-k-ai-sprint-manager.hf.space"
)
# Model name sent to the chat-completions endpoint for the baseline agent.
BASELINE_MODEL = os.environ.get(
    "MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct"
)
# Base URL handed to the OpenAI client.
API_BASE_URL = os.environ.get(
    "API_BASE_URL", "https://router.huggingface.co/v1"
)
# API key handed to the OpenAI client; falls back to a placeholder value.
API_KEY = os.environ.get("HF_TOKEN", "dummy")

# --- Fixed evaluation settings ---

# JSON file holding the trained policy weights.
POLICY_PATH = "./results/best_policy.json"
# Task names passed to the server's /reset endpoint, one evaluation row each.
TASKS = ["easy_sprint", "medium_sprint", "hard_sprint"]
# Number of seeded episodes averaged per task.
N_SEEDS = 5


# System prompt for the baseline LLM: demands a bare JSON action object.
SYSTEM = """You are a Tech Lead. Output ONLY JSON.
{"action_type": "assign", "task_id": "T1", "dev_id": "dev1", "new_priority": null}
Only assign backlog tasks to available skill-matched developers."""
|
|
|
|
| |
|
|
def env_reset(task_name, seed=42):
    """Start a fresh episode on the environment server.

    POSTs ``{"task_name": ..., "seed": ...}`` to ``{ENV_URL}/reset`` with a
    30-second timeout.

    Args:
        task_name: Name of the task to load.
        seed: Seed forwarded to the server (default 42).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server replies with an error status.
    """
    payload = {"task_name": task_name, "seed": seed}
    response = requests.post(f"{ENV_URL}/reset", json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
|
|
def env_step(action):
    """Send one action to the environment server and return its reply.

    Only the four known fields are forwarded; anything else in *action* is
    dropped. A missing ``action_type`` becomes ``"skip"`` and the other
    missing fields become ``None``.

    Args:
        action: Mapping describing the action (must support ``.get``).

    Returns:
        The decoded JSON body of the ``{ENV_URL}/step`` response.

    Raises:
        requests.HTTPError: If the server replies with an error status.
    """
    optional_fields = ("task_id", "dev_id", "new_priority")
    body = {"action_type": action.get("action_type", "skip")}
    body.update((field, action.get(field)) for field in optional_fields)

    response = requests.post(
        f"{ENV_URL}/step", json={"action": body}, timeout=30
    )
    response.raise_for_status()
    return response.json()
|
|
def run_episode(act_fn, task_name, seed=42, max_steps=12):
    """Play one episode with *act_fn* and return its clamped final score.

    The episode is reset via ``env_reset`` and then advanced with ``env_step``
    until the observation (or the step response itself) reports ``done`` or
    *max_steps* actions have been taken.

    Args:
        act_fn: Callable mapping an observation dict to an action dict.
        task_name: Task name forwarded to ``env_reset``.
        seed: Seed forwarded to ``env_reset``.
        max_steps: Upper bound on the number of actions taken (default 12,
            the previously hard-coded limit).

    Returns:
        ``info["final_score"]`` of the last step response, clamped to
        ``[0.01, 0.99]``. A missing, null, non-numeric or NaN score yields
        ``0.01``.
    """
    obs = env_reset(task_name, seed)
    result = {"info": {}}
    for _ in range(max_steps):
        if obs.get("done"):
            break
        result = env_step(act_fn(obs))
        obs = result["observation"]
        # NOTE(review): some servers flag termination on the step response
        # rather than inside the observation; honour either so a finished
        # episode is never stepped again.
        if result.get("done"):
            break

    # "info" may be absent or null, and so may the score itself.
    raw_score = (result.get("info") or {}).get("final_score")
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        return 0.01
    if score != score:  # NaN would otherwise slip through min() as 0.99
        return 0.01
    return max(0.01, min(0.99, score))
|
|
def evaluate(act_fn, name):
    """Score *act_fn* on every task in ``TASKS`` and print a small report.

    Each task is run for ``N_SEEDS`` episodes (seeds ``42, 55, 68, ...``) and
    the episode scores are averaged. An episode that raises is counted as the
    minimum score ``0.01``, and the failure is printed so that a broken
    policy or an unreachable server cannot pass for a merely weak result.

    Args:
        act_fn: Callable mapping an observation dict to an action dict.
        name: Heading printed above the per-task rows.

    Returns:
        Dict mapping each task name to its mean score (rounded to 4 places),
        plus an ``"average"`` key holding the mean of those rounded values.
    """
    print(f"\n {name}")
    print(f" {'─'*45}")
    scores = {}
    for task in TASKS:
        vals = []
        for seed in range(N_SEEDS):
            episode_seed = seed * 13 + 42
            try:
                vals.append(run_episode(act_fn, task, seed=episode_seed))
            except Exception as exc:
                # Boundary handler: one failed episode must not abort the
                # whole sweep, but it must leave a visible trace.
                print(f" [warn] {task} seed={episode_seed} failed: "
                      f"{type(exc).__name__}: {exc}")
                vals.append(0.01)
        avg = sum(vals) / len(vals)
        scores[task] = round(avg, 4)
        bar = "█" * int(avg * 20)
        print(f" {task:<20} {avg:.4f} {bar}")
    overall = sum(scores.values()) / len(scores)
    scores["average"] = round(overall, 4)
    print(f" {'AVERAGE':<20} {overall:.4f}")
    return scores
|
|
|
|
| |
|
|
class TrainedPolicy:
    """Greedy task-to-developer assignment driven by four learned weights.

    The weights are read from a JSON file with the keys ``priority_weight``,
    ``deadline_weight``, ``skill_weight`` and ``load_weight``.
    """

    def __init__(self, path):
        """Load the four weights from the JSON file at *path*.

        Raises:
            KeyError: If any of the four weight keys is missing.
        """
        with open(path) as handle:
            weights = json.load(handle)
        for key in ("priority_weight", "deadline_weight",
                    "skill_weight", "load_weight"):
            setattr(self, key, weights[key])

    def act(self, obs):
        """Pick the best-scoring (backlog task, free developer) pair.

        Args:
            obs: Observation dict with ``tasks`` and ``developers`` lists and
                an optional ``current_day`` (defaults to 1).

        Returns:
            An ``assign`` action for the highest-scoring pair (the first one
            found wins ties), or a ``skip`` action when there is no backlog
            task, no free developer, or no pair scores above ``-inf``.
        """
        noop = {"action_type": "skip", "task_id": None,
                "dev_id": None, "new_priority": None}

        today = obs.get("current_day", 1)
        open_tasks = [task for task in obs["tasks"]
                      if task["status"] == "backlog"]
        free_devs = [dev for dev in obs["developers"]
                     if dev["is_available"]
                     and dev["current_load"] < dev["capacity"]]
        if not (open_tasks and free_devs):
            return noop

        top_score = float("-inf")
        choice = None
        for task in open_tasks:
            # Task-only part of the score: lower priority numbers and nearer
            # deadlines score higher (days left is floored at 1).
            days_left = max(1, task["deadline"] - today)
            task_score = (self.priority_weight * (6 - task["priority"])
                          + self.deadline_weight * (10 / days_left))
            for dev in free_devs:
                score = task_score
                # Exact skill match beats a fullstack dev; a mismatch is
                # penalised.
                if dev["skill"] == task["required_skill"]:
                    score += self.skill_weight * 3
                elif dev["skill"] == "fullstack":
                    score += self.skill_weight * 2
                else:
                    score -= self.skill_weight * 2
                # Penalise developers in proportion to how loaded they are.
                utilisation = dev["current_load"] / max(dev["capacity"], 1)
                score -= self.load_weight * utilisation * 2
                if score > top_score:
                    top_score, choice = score, (task, dev)

        if choice is None:
            return noop
        picked_task, picked_dev = choice
        return {"action_type": "assign", "task_id": picked_task["id"],
                "dev_id": picked_dev["id"], "new_priority": None}
|
|
|
|
| |
|
|
def _extract_action(text):
    """Return the first JSON object embedded in *text*, or ``None``.

    Chat models often wrap their answer in prose or Markdown code fences, so
    the reply is scanned for the first ``{`` from which a JSON object decodes.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return parsed
    return None


def make_baseline_act():
    """Build the baseline policy: an LLM prompted to emit a JSON action.

    Returns:
        A function ``act(obs) -> dict``. It sends a short summary of the
        observation (day, up to four backlog tasks sorted by priority then
        deadline, and the free developers) to ``BASELINE_MODEL`` and returns
        the parsed reply when its ``action_type`` is one of ``assign``,
        ``reassign``, ``skip`` or ``unblock``. Any API failure, unparseable
        reply or unsupported action type falls back to a ``skip`` action;
        each distinct failure reason is printed once so that a broken setup
        (for example a missing token) is not mistaken for a weak baseline.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    valid_types = ("assign", "reassign", "skip", "unblock")
    reported = set()

    def fallback(reason):
        # Report each distinct reason only once to keep the output readable.
        if reason not in reported:
            reported.add(reason)
            print(f" [warn] baseline fell back to skip: {reason}")
        return {"action_type": "skip", "task_id": None,
                "dev_id": None, "new_priority": None}

    def act(obs):
        backlog = sorted(
            (t for t in obs["tasks"] if t["status"] == "backlog"),
            key=lambda t: (t["priority"], t["deadline"]),
        )
        avail = [d for d in obs["developers"]
                 if d["is_available"] and d["current_load"] < d["capacity"]]
        prompt = (
            f"Day {obs['current_day']}/{obs['sprint_length']}\n"
            f"Backlog: {[t['id']+':'+t['required_skill'] for t in backlog[:4]]}\n"
            f"Devs: {[d['id']+':'+d['skill'] for d in avail]}\n"
            "JSON action:"
        )
        try:
            resp = client.chat.completions.create(
                model=BASELINE_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1, max_tokens=80,
            )
            text = resp.choices[0].message.content or ""
        except Exception as exc:
            # Boundary around the remote call: stay best-effort, but say why.
            return fallback(f"{type(exc).__name__}: {exc}")

        action = _extract_action(text)
        if action is None:
            return fallback("reply contained no JSON object")
        if action.get("action_type") not in valid_types:
            return fallback("reply had an unsupported action_type")
        return action

    return act
|
|
|
|
| |
|
|
def main():
    """Evaluate the baseline LLM and, if present, the trained policy.

    Steps:
      1. Probe ``{ENV_URL}/health``; abort with a hint if it cannot be
         reached or does not return JSON.
      2. Evaluate the baseline LLM agent.
      3. If ``POLICY_PATH`` exists, evaluate the trained policy and print a
         per-task comparison table; otherwise print a hint to train first.
      4. Write all collected scores to ``results/evaluation.json``.
    """
    try:
        health = requests.get(f"{ENV_URL}/health", timeout=10)
        print(f"Server: {health.json()}")
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on older requests versions.
        print("ERROR: Start server first: python ui.py")
        print(f" ({type(exc).__name__}: {exc})")
        return

    print("\n" + "="*55)
    print(" EVALUATION: Baseline vs Trained RL Policy")
    print("="*55)

    results = {}

    results["baseline"] = evaluate(
        make_baseline_act(), f"Baseline LLM ({BASELINE_MODEL})"
    )

    if os.path.exists(POLICY_PATH):
        policy = TrainedPolicy(POLICY_PATH)
        results["trained"] = evaluate(
            policy.act, f"Trained RL Policy ({POLICY_PATH})"
        )

        print(f"\n{'='*55}")
        print(" IMPROVEMENT SUMMARY")
        print(f"{'='*55}")
        print(f" {'Task':<20} {'Baseline':>10} {'Trained':>10} {'Delta':>10}")
        print(f" {'─'*48}")
        for task in TASKS + ["average"]:
            b = results["baseline"].get(task, 0)
            t = results["trained"].get(task, 0)
            delta = t - b
            # The "+" flag keeps the sign attached to the number and gives
            # positive and negative rows the same 10-character width.
            print(f" {task:<20} {b:>10.4f} {t:>10.4f} {delta:>+10.4f}")
    else:
        print(f"\n No trained policy at {POLICY_PATH}")
        print(" Run python train.py first.")

    os.makedirs("results", exist_ok=True)
    with open("results/evaluation.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("\n Saved → results/evaluation.json")


if __name__ == "__main__":
    main()