# commitment-os/server/graders.py
# jayantaggarwal-sketch
# Sync latest code and non-binary artifacts
# af8810b
"""Deterministic grading — 5-component reward for CommitmentOS.

Components (weight in parentheses):
    constraint_satisfaction (0.35) — binary pass/fail per scenario constraint
    conflict_resolution     (0.20) — final calendar free of overlaps
    commitment_coherence    (0.20) — ledger violations penalised
    communication_quality   (0.15) — keyword matching on sent emails
    step_efficiency         (0.10) — fewer steps = higher score
"""
from __future__ import annotations
from typing import Any, Dict, List, Tuple
from server.domain import ScenarioDef
from server.world import WorldState, _time_to_min
# Relative weight of each grading component in the final reward.
# The five weights sum to 1.0.
WEIGHTS: Dict[str, float] = {
    "constraint_satisfaction": 0.35,  # share of scenario constraints met
    "conflict_resolution": 0.20,      # calendar free of overlapping events
    "commitment_coherence": 0.20,     # commitments not silently broken
    "communication_quality": 0.15,    # required emails sent with keywords
    "step_efficiency": 0.10,          # steps taken versus the optimal count
}
def _keyword_score(text: str, keywords: List[str], min_matches: int = 2) -> Tuple[float, List[str]]:
"""0 hits -> 0.0, < min_matches -> 0.5 (partial), >= min_matches -> 1.0."""
text_lower = text.lower()
matched = [kw for kw in keywords if kw.lower() in text_lower]
if len(matched) == 0:
return 0.0, matched
if len(matched) < min_matches:
return 0.5, matched
return 1.0, matched
def _check_constraint(constraint, world: WorldState) -> bool:
"""Evaluate a single ConstraintDef against the world state."""
ct = constraint.check_type
params = constraint.check_params
if ct == "calendar_no_conflict":
return _calendar_has_no_overlaps(world)
elif ct == "event_exists":
eid = params.get("event_id", "")
return eid in world.calendar
elif ct == "event_cancelled":
eid = params.get("event_id", "")
return eid not in world.calendar
elif ct == "email_sent":
to = params.get("to", "").lower()
keywords = params.get("keywords", [])
for em in world.emails_sent:
if to in em.get("to", "").lower():
if keywords:
score, _ = _keyword_score(em.get("body", ""), keywords, min_matches=1)
if score > 0:
return True
else:
return True
return False
elif ct == "restaurant_match":
name = params.get("name", "")
if name:
return world.booked_restaurant.lower() == name.lower()
criteria = params.get("criteria", {})
if not world.booked_restaurant:
return False
r = world.restaurants.get(world.booked_restaurant)
if r is None:
return False
if "dietary" in criteria and criteria["dietary"].lower() not in [d.lower() for d in r.dietary_options]:
return False
if "max_price" in criteria and r.price_per_person > criteria["max_price"]:
return False
if "max_distance" in criteria and r.distance_miles > criteria["max_distance"]:
return False
if "near_airport" in criteria and criteria["near_airport"] and not r.near_airport:
return False
return True
elif ct == "priority_order":
higher = params.get("higher", "").lower()
lower = params.get("lower", "").lower()
higher_kept = any(
ev.title.lower() == higher or higher in ev.title.lower()
for ev in world.calendar.values()
)
lower_moved = not any(
ev.title.lower() == lower or lower in ev.title.lower()
for ev in world.calendar.values()
) or any(
em.get("to", "").lower() == lower or lower in em.get("body", "").lower()
for em in world.emails_sent
)
return higher_kept and lower_moved
return False
def _calendar_has_no_overlaps(world: WorldState) -> bool:
events = list(world.calendar.values())
for i, a in enumerate(events):
for b in events[i + 1:]:
if a.date != b.date:
continue
a_start = _time_to_min(a.time)
a_end = a_start + a.duration_min
b_start = _time_to_min(b.time)
b_end = b_start + b.duration_min
if a_start < b_end and b_start < a_end:
return False
return True
def _score_constraint_satisfaction(scenario: ScenarioDef, world: WorldState) -> Tuple[float, str]:
if not scenario.constraints:
return 1.0, "No constraints defined"
met = sum(1 for c in scenario.constraints if _check_constraint(c, world))
total = len(scenario.constraints)
score = met / total
return score, f"{met}/{total} constraints met"
def _score_conflict_resolution(world: WorldState) -> Tuple[float, str]:
    """Return 1.0 if the calendar has no overlapping events, else 0.0.

    The second tuple element is a short human-readable explanation.
    """
    if _calendar_has_no_overlaps(world):
        return 1.0, "No calendar conflicts"
    return 0.0, "Calendar has overlapping events"
def _score_commitment_coherence(world: WorldState) -> Tuple[float, str]:
total = len(world.commitment_ledger)
if total == 0:
return 1.0, "No commitments created"
violations = world.get_silent_violations()
silent_count = len(violations)
renegotiated = sum(1 for c in world.commitment_ledger if c.renegotiated_at is not None)
honored = total - silent_count - renegotiated
score = (total - silent_count) / total
parts = []
if honored > 0:
parts.append(f"{honored} honored")
if renegotiated > 0:
parts.append(f"{renegotiated} renegotiated")
if silent_count > 0:
parts.append(f"{silent_count} SILENTLY BROKEN")
return score, " | ".join(parts) if parts else "OK"
def _score_communication(scenario: ScenarioDef, world: WorldState) -> Tuple[float, str]:
reqs = scenario.communication_requirements
if not reqs:
return 1.0, "No communication requirements"
total_score = 0.0
feedback_parts: List[str] = []
for req in reqs:
to_lower = req.to.lower()
matching_emails = [
em for em in world.emails_sent
if to_lower in em.get("to", "").lower()
]
if not matching_emails:
feedback_parts.append(f"MISSING email to {req.to}")
continue
best_score = 0.0
for em in matching_emails:
body = em.get("body", "") + " " + em.get("subject", "")
if req.required_keywords:
ks, matched = _keyword_score(body, req.required_keywords, min_matches=1)
best_score = max(best_score, ks)
else:
best_score = 1.0
total_score += best_score
if best_score >= 1.0:
feedback_parts.append(f"Email to {req.to}: full credit")
elif best_score > 0:
feedback_parts.append(f"Email to {req.to}: partial ({best_score:.1f})")
else:
feedback_parts.append(f"Email to {req.to}: missing keywords")
score = total_score / len(reqs) if reqs else 1.0
return score, " | ".join(feedback_parts)
def _score_step_efficiency(scenario: ScenarioDef, world: WorldState) -> Tuple[float, str]:
optimal = scenario.optimal_steps
actual = world.step_count
if actual <= optimal:
return 1.0, f"{actual} steps (optimal: {optimal})"
penalty = (actual - optimal) * 0.1
score = max(0.0, 1.0 - penalty)
return score, f"{actual} steps (optimal: {optimal}, penalty: -{penalty:.1f})"
def grade_scenario(
    scenario: ScenarioDef,
    world: WorldState,
) -> Tuple[float, Dict[str, float], str]:
    """Grade a finished scenario and return ``(total_reward, breakdown, feedback)``.

    ``breakdown`` maps each component name to its weighted contribution
    (raw score times its ``WEIGHTS`` entry, rounded to 4 decimals).
    ``total_reward`` is the rounded sum of those contributions, clamped to
    the range [0.01, 0.99]. ``feedback`` joins the per-component messages
    with ``" | "``.
    """
    # Run the component scorers in a fixed order.
    constraint_raw, constraint_note = _score_constraint_satisfaction(scenario, world)
    conflict_raw, conflict_note = _score_conflict_resolution(world)
    commitment_raw, commitment_note = _score_commitment_coherence(world)
    communication_raw, communication_note = _score_communication(scenario, world)
    efficiency_raw, efficiency_note = _score_step_efficiency(scenario, world)

    breakdown: Dict[str, float] = {
        "constraint_satisfaction": round(constraint_raw * WEIGHTS["constraint_satisfaction"], 4),
        "conflict_resolution": round(conflict_raw * WEIGHTS["conflict_resolution"], 4),
        "commitment_coherence": round(commitment_raw * WEIGHTS["commitment_coherence"], 4),
        "communication_quality": round(communication_raw * WEIGHTS["communication_quality"], 4),
        "step_efficiency": round(efficiency_raw * WEIGHTS["step_efficiency"], 4),
    }

    feedback = " | ".join(
        [
            f"[constraints] {constraint_note}",
            f"[conflicts] {conflict_note}",
            f"[commitments] {commitment_note}",
            f"[communication] {communication_note}",
            f"[efficiency] {efficiency_note}",
        ]
    )

    # The reward is kept strictly inside (0, 1): never below 0.01 or above 0.99.
    summed = round(sum(breakdown.values()), 4)
    total_reward = max(0.01, min(0.99, summed))
    return total_reward, breakdown, feedback