trial1 / evaluate.py
priyaaaaaasharmaaaaa's picture
Upload trained LoRA adapter from Kaggle
35e9f20 verified
Raw
History Blame Contribute Delete
7.83 kB
"""
evaluate.py — Compare baseline vs trained RL policy
=====================================================
Run after train.py completes.
"""
from __future__ import annotations
import os
import json
import requests
from openai import OpenAI
from dotenv import load_dotenv
# Load settings from a .env file (python-dotenv) before the os.getenv() calls below.
load_dotenv()
# Base URL of the environment server; /health, /reset and /step are requested under it.
ENV_URL = os.getenv("ENV_BASE_URL", "https://sejal-k-ai-sprint-manager.hf.space")
# Model name passed to the chat-completions call of the baseline agent.
BASELINE_MODEL = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
# Base URL handed to the OpenAI client used by the baseline agent.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# API key handed to the OpenAI client; read from HF_TOKEN, placeholder "dummy" when unset.
API_KEY = os.getenv("HF_TOKEN", "dummy")
# JSON file with the trained policy weights; main() skips the trained run if it is missing.
POLICY_PATH = "./results/best_policy.json"
# Task names sent to /reset as "task_name"; evaluate() reports one score per entry.
TASKS = ["easy_sprint", "medium_sprint", "hard_sprint"]
# Number of episodes (distinct seeds) evaluate() runs per task.
N_SEEDS = 5
# System prompt for the baseline LLM: asks for a single JSON action object.
SYSTEM = """You are a Tech Lead. Output ONLY JSON.
{"action_type": "assign", "task_id": "T1", "dev_id": "dev1", "new_priority": null}
Only assign backlog tasks to available skill-matched developers."""
# ── Shared helpers ────────────────────────────────────────────────────────────
def env_reset(task_name, seed=42):
    """Start a fresh episode on the environment server.

    POSTs ``task_name`` and ``seed`` as JSON to ``{ENV_URL}/reset`` with a
    30 second timeout and returns the decoded JSON body of the reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    payload = {"task_name": task_name, "seed": seed}
    response = requests.post(f"{ENV_URL}/reset", json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def env_step(action):
    """Send one action to the environment server and return its JSON reply.

    Only the four known fields are forwarded.  A missing ``action_type``
    becomes ``"skip"``; any other missing field is sent as ``null``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    optional_fields = ("task_id", "dev_id", "new_priority")
    normalized = {"action_type": action.get("action_type", "skip")}
    for field in optional_fields:
        normalized[field] = action.get(field)

    response = requests.post(
        f"{ENV_URL}/step",
        json={"action": normalized},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
def run_episode(act_fn, task_name, seed=42, max_steps=12):
    """Play one episode with ``act_fn`` and return its clamped final score.

    The environment is reset for ``task_name``/``seed``; then ``act_fn`` is
    asked for an action and that action is stepped until the observation
    reports ``done`` or ``max_steps`` actions have been taken.

    Args:
        act_fn: Callable mapping an observation dict to an action dict.
        task_name: Task identifier forwarded to ``env_reset``.
        seed: Seed forwarded to ``env_reset``.
        max_steps: Upper bound on the number of actions taken (default 12,
            the previously hard-coded limit).

    Returns:
        ``info["final_score"]`` of the last step, clamped to
        ``[0.01, 0.99]``.  ``0.01`` is returned when no step was taken or
        when the last step carries no score.
    """
    obs = env_reset(task_name, seed)
    result = {"info": {}}
    for _ in range(max_steps):
        if obs.get("done"):
            break
        result = env_step(act_fn(obs))
        obs = result["observation"]

    # A JSON null for "info" or "final_score" is treated like a missing
    # value instead of raising inside .get()/min().
    info = result.get("info") or {}
    score = info.get("final_score")
    if score is None:
        score = 0.01
    return max(0.01, min(0.99, score))
def evaluate(act_fn, name):
    """Score an agent on every task in ``TASKS`` and print a small report.

    Each task is played for ``N_SEEDS`` episodes with the seeds
    ``42, 55, 68, ...`` (``i * 13 + 42``).  An episode that raises is
    reported on stdout and counted with the floor score ``0.01`` so that a
    single failure does not abort the whole evaluation.

    Args:
        act_fn: Callable mapping an observation dict to an action dict.
        name: Heading printed above the report.

    Returns:
        Dict mapping each task name to its mean score rounded to four
        decimals, plus an ``"average"`` entry holding the rounded mean of
        those per-task values.
    """
    print(f"\n {name}")
    print(f" {'─'*45}")
    scores = {}
    for task in TASKS:
        vals = []
        for i in range(N_SEEDS):
            seed = i * 13 + 42
            try:
                vals.append(run_episode(act_fn, task, seed=seed))
            except Exception as exc:
                # Best effort: keep evaluating, but make the failure visible
                # instead of letting it masquerade as a genuinely bad score.
                print(f" [warn] {task} seed={seed} failed: {exc!r}")
                vals.append(0.01)
        # Guard against N_SEEDS == 0.
        avg = sum(vals) / len(vals) if vals else 0.0
        scores[task] = round(avg, 4)
        bar = "█" * int(avg * 20)
        print(f" {task:<20} {avg:.4f} {bar}")
    # Guard against an empty TASKS list.
    overall = sum(scores.values()) / len(scores) if scores else 0.0
    scores["average"] = round(overall, 4)
    print(f" {'AVERAGE':<20} {overall:.4f}")
    return scores
# ── Trained policy agent ──────────────────────────────────────────────────────
class TrainedPolicy:
    """Greedy assignment policy driven by four weights read from a JSON file.

    For every (backlog task, free developer) pair a linear score is computed
    from task priority, deadline proximity, skill match and developer load;
    the first pair with the highest score is assigned.
    """

    def __init__(self, path):
        """Load the four weights from the JSON file at ``path``."""
        with open(path) as handle:
            weights = json.load(handle)
        self.priority_weight = weights["priority_weight"]
        self.deadline_weight = weights["deadline_weight"]
        self.skill_weight = weights["skill_weight"]
        self.load_weight = weights["load_weight"]

    @staticmethod
    def _skip():
        """Return a fresh "skip" action."""
        return {"action_type": "skip", "task_id": None,
                "dev_id": None, "new_priority": None}

    def _score(self, task, dev, day):
        """Return the weighted desirability of giving ``task`` to ``dev``."""
        score = self.priority_weight * (6 - task["priority"])
        score += self.deadline_weight * (10 / max(1, task["deadline"] - day))

        # Exact skill match beats a fullstack developer; a mismatch is penalised.
        if dev["skill"] == task["required_skill"]:
            score += self.skill_weight * 3
        elif dev["skill"] == "fullstack":
            score += self.skill_weight * 2
        else:
            score -= self.skill_weight * 2

        utilisation = dev["current_load"] / max(dev["capacity"], 1)
        score -= self.load_weight * utilisation * 2
        return score

    def act(self, obs):
        """Choose the next action for the observation ``obs``.

        Returns an ``assign`` action for the best-scoring pair, or a ``skip``
        action when there is no backlog task or no free developer.
        """
        today = obs.get("current_day", 1)
        open_tasks = [task for task in obs["tasks"]
                      if task["status"] == "backlog"]
        free_devs = [dev for dev in obs["developers"]
                     if dev["is_available"]
                     and dev["current_load"] < dev["capacity"]]
        if not (open_tasks and free_devs):
            return self._skip()

        top_score, top_task, top_dev = float("-inf"), None, None
        pairs = ((task, dev) for task in open_tasks for dev in free_devs)
        for task, dev in pairs:
            candidate = self._score(task, dev, today)
            # Strict comparison: on ties the earliest pair is kept.
            if candidate > top_score:
                top_score, top_task, top_dev = candidate, task, dev

        if not (top_task and top_dev):
            return self._skip()
        return {"action_type": "assign", "task_id": top_task["id"],
                "dev_id": top_dev["id"], "new_priority": None}
# ── Baseline LLM agent ────────────────────────────────────────────────────────
def _extract_json_object(text):
    """Return the first JSON object embedded in ``text``, or ``None``.

    Chat models frequently wrap the requested JSON in code fences or add a
    sentence around it, so every ``{`` is tried as the start of a JSON
    document instead of requiring the whole reply to be valid JSON.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text[start:])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            start = text.find("{", start + 1)
        else:
            return parsed
    return None


def make_baseline_act():
    """Build the baseline agent: an ``act(obs)`` function backed by an LLM.

    The returned function sends the day, up to four backlog tasks (sorted by
    priority, then deadline) and the free developers to ``BASELINE_MODEL``
    and returns the JSON action found in the reply.  It falls back to a
    ``skip`` action when the request fails or the reply contains no action
    with a known ``action_type``.

    Returns:
        Callable mapping an observation dict to an action dict.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    valid_types = ("assign", "reassign", "skip", "unblock")
    warned = False

    def act(obs):
        nonlocal warned
        backlog = sorted([t for t in obs["tasks"] if t["status"] == "backlog"],
                         key=lambda t: (t["priority"], t["deadline"]))
        avail = [d for d in obs["developers"]
                 if d["is_available"] and d["current_load"] < d["capacity"]]
        prompt = (
            f"Day {obs['current_day']}/{obs['sprint_length']}\n"
            f"Backlog: {[t['id']+':'+t['required_skill'] for t in backlog[:4]]}\n"
            f"Devs: {[d['id']+':'+d['skill'] for d in avail]}\n"
            f"JSON action:"
        )
        skip = {"action_type": "skip", "task_id": None,
                "dev_id": None, "new_priority": None}
        try:
            resp = client.chat.completions.create(
                model=BASELINE_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1, max_tokens=80,
            )
            text = resp.choices[0].message.content or ""
        except Exception as exc:
            # Best effort: a failed request must not abort the episode, but a
            # silently skipping baseline would make the comparison
            # meaningless, so report the first failure once.
            if not warned:
                warned = True
                print(f" [warn] baseline LLM request failed, skipping: {exc!r}")
            return skip

        action = _extract_json_object(text)
        if action is not None and action.get("action_type") in valid_types:
            return action
        return skip

    return act
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Evaluate the baseline LLM and, if present, the trained policy.

    Steps:
      1. Query ``{ENV_URL}/health``; abort with an error message when the
         request or the decoding of its JSON body fails.
      2. Evaluate the baseline agent.
      3. If ``POLICY_PATH`` exists, evaluate the trained policy and print a
         per-task comparison table; otherwise print a hint to train first.
      4. Write all collected scores to ``results/evaluation.json``.
    """
    try:
        r = requests.get(f"{ENV_URL}/health", timeout=10)
        print(f"Server: {r.json()}")
    except Exception as exc:
        print("ERROR: Start server first: python ui.py")
        # Show what actually went wrong: ENV_URL may point at a remote host,
        # in which case the hint above is not the real cause.
        print(f" ({ENV_URL}/health check failed: {exc!r})")
        return
    print("\n" + "="*55)
    print(" EVALUATION: Baseline vs Trained RL Policy")
    print("="*55)
    results = {}
    # Baseline
    results["baseline"] = evaluate(make_baseline_act(), f"Baseline LLM ({BASELINE_MODEL})")
    # Trained policy
    if os.path.exists(POLICY_PATH):
        policy = TrainedPolicy(POLICY_PATH)
        results["trained"] = evaluate(policy.act, f"Trained RL Policy ({POLICY_PATH})")
        # Comparison table
        print(f"\n{'='*55}")
        print(" IMPROVEMENT SUMMARY")
        print(f"{'='*55}")
        print(f" {'Task':<20} {'Baseline':>10} {'Trained':>10} {'Delta':>10}")
        print(f" {'─'*48}")
        for task in TASKS + ["average"]:
            b = results["baseline"].get(task, 0)
            t = results["trained"].get(task, 0)
            delta = t - b
            sign = "+" if delta >= 0 else ""
            print(f" {task:<20} {b:>10.4f} {t:>10.4f} {sign}{delta:>9.4f}")
    else:
        print(f"\n No trained policy at {POLICY_PATH}")
        print(" Run python train.py first.")
    # Save
    os.makedirs("results", exist_ok=True)
    # Explicit encoding so the output does not depend on the platform locale.
    with open("results/evaluation.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("\n Saved → results/evaluation.json")
if __name__ == "__main__":
main()