trial1 / evaluate.py

Upload trained LoRA adapter from Kaggle

35e9f20 verified 2 months ago

7.83 kB

	"""
	evaluate.py — Compare baseline vs trained RL policy
	=====================================================
	Run after train.py completes.
	"""
	from __future__ import annotations
	import os
	import json
	import requests
	from openai import OpenAI
	from dotenv import load_dotenv

	# Load variables from a .env file (python-dotenv) before the os.getenv reads below.
	load_dotenv()

	# Base URL of the environment server; /reset, /step and /health are requested under it.
	ENV_URL = os.getenv("ENV_BASE_URL", "https://sejal-k-ai-sprint-manager.hf.space")
	# Model name sent to the chat-completions endpoint by the baseline agent.
	BASELINE_MODEL = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
	# Endpoint and credential handed to the OpenAI client for the baseline agent.
	API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
	# NOTE(review): falls back to the placeholder "dummy" when HF_TOKEN is unset —
	# presumably every baseline call then fails authentication; confirm.
	API_KEY = os.getenv("HF_TOKEN", "dummy")
	# JSON file holding the weights read by TrainedPolicy.
	POLICY_PATH = "./results/best_policy.json"
	# Task names passed to /reset; each one is evaluated separately.
	TASKS = ["easy_sprint", "medium_sprint", "hard_sprint"]
	# Number of episodes (distinct seeds) run per task in evaluate().
	N_SEEDS = 5

	# System prompt sent with every baseline LLM request; shows the expected JSON action shape.
	SYSTEM = """You are a Tech Lead. Output ONLY JSON.
	{"action_type": "assign", "task_id": "T1", "dev_id": "dev1", "new_priority": null}
	Only assign backlog tasks to available skill-matched developers."""


	# ── Shared helpers ────────────────────────────────────────────────────────────

	def env_reset(task_name, seed=42):
	    """Start a fresh episode on the environment server.

	    POSTs ``task_name`` and ``seed`` to ``{ENV_URL}/reset`` and returns the
	    decoded JSON reply. Raises ``requests.HTTPError`` on a non-2xx status.
	    """
	    payload = {"task_name": task_name, "seed": seed}
	    response = requests.post(f"{ENV_URL}/reset", json=payload, timeout=30)
	    response.raise_for_status()
	    return response.json()

	def env_step(action):
	    """Send one action to the environment server and return its JSON reply.

	    Only the four known action fields are forwarded, wrapped under an
	    ``"action"`` key and POSTed to ``{ENV_URL}/step``. A missing
	    ``action_type`` becomes ``"skip"``; the other fields default to ``None``.
	    Raises ``requests.HTTPError`` on a non-2xx status.
	    """
	    body = {"action_type": action.get("action_type", "skip")}
	    # Same key order as the wire format: task_id, dev_id, new_priority.
	    for field in ("task_id", "dev_id", "new_priority"):
	        body[field] = action.get(field)
	    response = requests.post(
	        f"{ENV_URL}/step", json={"action": body}, timeout=30
	    )
	    response.raise_for_status()
	    return response.json()

	def run_episode(act_fn, task_name, seed=42, max_steps=12):
	    """Play one episode with ``act_fn`` and return its clamped final score.

	    Args:
	        act_fn: Callable mapping an observation dict to an action dict.
	        task_name: Task name forwarded to ``env_reset``.
	        seed: Seed forwarded to ``env_reset``.
	        max_steps: Upper bound on the number of ``env_step`` calls
	            (defaults to 12, the previously hard-coded budget).

	    Returns:
	        ``info["final_score"]`` from the last step reply, clamped to
	        ``[0.01, 0.99]``. If no step was taken, or the reply carries no
	        usable ``final_score``, the floor value 0.01 is returned.

	    Raises:
	        Whatever ``env_reset`` / ``env_step`` / ``act_fn`` raise, and
	        ``KeyError`` if a step reply has no ``"observation"``.
	    """
	    obs = env_reset(task_name, seed)
	    result = {"info": {}}
	    for _ in range(max_steps):
	        if obs.get("done"):
	            break
	        result = env_step(act_fn(obs))
	        obs = result["observation"]

	    # Tolerate a reply whose "info" or "final_score" is missing or null
	    # instead of failing inside min()/max().
	    info = result.get("info") or {}
	    score = info.get("final_score")
	    if score is None:
	        score = 0.01
	    return max(0.01, min(0.99, score))

	def evaluate(act_fn, name):
	    """Run ``act_fn`` on every task/seed pair and print a score table.

	    For each task in ``TASKS``, ``N_SEEDS`` episodes are played with seeds
	    ``i * 13 + 42``. An episode that raises is reported on stdout and
	    counted as the floor score 0.01 so one failure does not abort the run.

	    Args:
	        act_fn: Callable mapping an observation dict to an action dict.
	        name: Heading printed above the table.

	    Returns:
	        Dict mapping each task name to its mean score rounded to 4 places,
	        plus an ``"average"`` entry: the mean of those rounded task scores.
	    """
	    print(f"\n {name}")
	    print(f" {'─'*45}")
	    scores = {}
	    for task in TASKS:
	        vals = []
	        for i in range(N_SEEDS):
	            seed = i * 13 + 42
	            try:
	                vals.append(run_episode(act_fn, task, seed=seed))
	            except Exception as exc:
	                # Best-effort: keep evaluating, but make the failure visible
	                # instead of silently recording the floor score.
	                print(f" [warn] {task} seed={seed} failed: {exc!r}")
	                vals.append(0.01)
	        # Guard against N_SEEDS == 0.
	        avg = sum(vals) / len(vals) if vals else 0.01
	        scores[task] = round(avg, 4)
	        bar = "█" * int(avg * 20)
	        print(f" {task:<20} {avg:.4f} {bar}")
	    # Guard against an empty TASKS list.
	    overall = sum(scores.values()) / len(scores) if scores else 0.01
	    scores["average"] = round(overall, 4)
	    print(f" {'AVERAGE':<20} {overall:.4f}")
	    return scores


	# ── Trained policy agent ──────────────────────────────────────────────────────

	class TrainedPolicy:
	    """Greedy assignment policy driven by four weights loaded from JSON.

	    Every (backlog task, available developer) pair gets a linear score built
	    from task priority, deadline urgency, skill match and developer load;
	    the highest-scoring pair is assigned.
	    """

	    def __init__(self, path):
	        """Load the weights from the JSON file at ``path``.

	        Raises:
	            OSError: if the file cannot be opened.
	            KeyError: if any of ``priority_weight``, ``deadline_weight``,
	                ``skill_weight`` or ``load_weight`` is missing.
	        """
	        with open(path, encoding="utf-8") as f:
	            w = json.load(f)
	        self.priority_weight = w["priority_weight"]
	        self.deadline_weight = w["deadline_weight"]
	        self.skill_weight = w["skill_weight"]
	        self.load_weight = w["load_weight"]

	    def act(self, obs: dict) -> dict:
	        """Return the best ``assign`` action for ``obs``, or a ``skip`` action.

	        Args:
	            obs: Observation with ``"tasks"`` and ``"developers"`` lists and
	                an optional ``"current_day"`` (defaults to 1).

	        Returns:
	            An action dict with keys ``action_type``, ``task_id``,
	            ``dev_id`` and ``new_priority``. ``skip`` is returned when
	            there is no backlog task or no developer with spare capacity.
	        """
	        skip = {"action_type": "skip", "task_id": None,
	                "dev_id": None, "new_priority": None}
	        day = obs.get("current_day", 1)
	        backlog = [t for t in obs["tasks"] if t["status"] == "backlog"]
	        avail = [d for d in obs["developers"]
	                 if d["is_available"] and d["current_load"] < d["capacity"]]
	        if not backlog or not avail:
	            return skip

	        best_score, best_task, best_dev = float("-inf"), None, None
	        for task in backlog:
	            # Task-only terms do not depend on the developer: compute once.
	            # (6 - priority) makes a numerically smaller priority score higher.
	            task_score = self.priority_weight * (6 - task["priority"])
	            # max(1, ...) avoids division by zero and caps urgency at 10
	            # for tasks that are due now or overdue.
	            task_score += self.deadline_weight * (
	                10 / max(1, task["deadline"] - day)
	            )
	            for dev in avail:
	                score = task_score
	                if dev["skill"] == task["required_skill"]:
	                    score += self.skill_weight * 3
	                elif dev["skill"] == "fullstack":
	                    score += self.skill_weight * 2
	                else:
	                    score -= self.skill_weight * 2
	                # Penalise developers in proportion to how loaded they are.
	                load_ratio = dev["current_load"] / max(dev["capacity"], 1)
	                score -= self.load_weight * load_ratio * 2
	                # Strict '>' keeps the first pair seen on ties.
	                if score > best_score:
	                    best_score, best_task, best_dev = score, task, dev

	        if best_task is not None and best_dev is not None:
	            return {"action_type": "assign", "task_id": best_task["id"],
	                    "dev_id": best_dev["id"], "new_priority": None}
	        return skip


	# ── Baseline LLM agent ────────────────────────────────────────────────────────

	_VALID_ACTION_TYPES = ("assign", "reassign", "skip", "unblock")


	def _parse_action(text):
	    """Extract an action dict from raw model output; return None if unusable."""
	    # Models often wrap JSON in code fences or prose, so parse only the
	    # span from the first '{' to the last '}'.
	    start, end = text.find("{"), text.rfind("}")
	    if start == -1 or end <= start:
	        return None
	    try:
	        action = json.loads(text[start:end + 1])
	    except ValueError:  # json.JSONDecodeError is a ValueError
	        return None
	    if isinstance(action, dict) and action.get("action_type") in _VALID_ACTION_TYPES:
	        return action
	    return None


	def make_baseline_act():
	    """Build the baseline agent: an ``act(obs)`` callable backed by an LLM.

	    The returned function prompts ``BASELINE_MODEL`` with the current day,
	    up to four backlog tasks (sorted by priority, then deadline) and the
	    developers with spare capacity, then parses the reply as a JSON action.
	    Any API failure or unparseable reply yields a ``skip`` action; the first
	    API failure is reported once on stdout so a bad token or endpoint does
	    not go unnoticed.
	    """
	    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
	    api_failure_reported = False

	    def skip_action():
	        return {"action_type": "skip", "task_id": None,
	                "dev_id": None, "new_priority": None}

	    def act(obs):
	        nonlocal api_failure_reported
	        backlog = sorted(
	            (t for t in obs["tasks"] if t["status"] == "backlog"),
	            key=lambda t: (t["priority"], t["deadline"]),
	        )
	        avail = [d for d in obs["developers"]
	                 if d["is_available"] and d["current_load"] < d["capacity"]]
	        prompt = (
	            f"Day {obs['current_day']}/{obs['sprint_length']}\n"
	            f"Backlog: {[t['id']+':'+t['required_skill'] for t in backlog[:4]]}\n"
	            f"Devs: {[d['id']+':'+d['skill'] for d in avail]}\n"
	            f"JSON action:"
	        )
	        try:
	            resp = client.chat.completions.create(
	                model=BASELINE_MODEL,
	                messages=[
	                    {"role": "system", "content": SYSTEM},
	                    {"role": "user", "content": prompt},
	                ],
	                temperature=0.1, max_tokens=80,
	            )
	            text = resp.choices[0].message.content or ""
	        except Exception as exc:
	            # Deliberate best-effort: a failed call becomes a skip, but say
	            # so once instead of silently scoring an all-skip baseline.
	            if not api_failure_reported:
	                print(f" [warn] baseline LLM call failed, using skip: {exc!r}")
	                api_failure_reported = True
	            return skip_action()
	        return _parse_action(text) or skip_action()

	    return act


	# ── Main ──────────────────────────────────────────────────────────────────────

	def main():
	    """Evaluate the baseline LLM and, if present, the trained policy.

	    Checks the environment server's ``/health`` endpoint, evaluates the
	    baseline agent, evaluates the policy stored at ``POLICY_PATH`` when that
	    file exists (printing a per-task comparison table), and writes all
	    scores to ``results/evaluation.json``. Returns early, without writing
	    anything, when the server health check fails.
	    """
	    try:
	        r = requests.get(f"{ENV_URL}/health", timeout=10)
	        # A non-2xx reply means the server is not healthy.
	        r.raise_for_status()
	        health = r.json()
	    except (requests.RequestException, ValueError) as exc:
	        print(f"ERROR: health check failed for {ENV_URL}: {exc}")
	        print("ERROR: Start server first: python ui.py")
	        return
	    print(f"Server: {health}")

	    print("\n" + "="*55)
	    print(" EVALUATION: Baseline vs Trained RL Policy")
	    print("="*55)

	    results = {}

	    # Baseline
	    results["baseline"] = evaluate(make_baseline_act(), f"Baseline LLM ({BASELINE_MODEL})")

	    # Trained policy
	    if os.path.exists(POLICY_PATH):
	        policy = TrainedPolicy(POLICY_PATH)
	        results["trained"] = evaluate(policy.act, f"Trained RL Policy ({POLICY_PATH})")

	        # Comparison table
	        print(f"\n{'='*55}")
	        print(" IMPROVEMENT SUMMARY")
	        print(f"{'='*55}")
	        print(f" {'Task':<20} {'Baseline':>10} {'Trained':>10} {'Delta':>10}")
	        print(f" {'─'*48}")
	        for task in TASKS + ["average"]:
	            b = results["baseline"].get(task, 0)
	            t = results["trained"].get(task, 0)
	            delta = t - b
	            # The '+' flag keeps the sign attached to the number and gives
	            # positive and negative rows the same 10-character width.
	            print(f" {task:<20} {b:>10.4f} {t:>10.4f} {delta:>+10.4f}")
	    else:
	        print(f"\n No trained policy at {POLICY_PATH}")
	        print(" Run python train.py first.")

	    # Save
	    os.makedirs("results", exist_ok=True)
	    with open("results/evaluation.json", "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=2)
	    print("\n Saved → results/evaluation.json")


	if __name__ == "__main__":
	    main()