Meta / grader.py
ayaan-ai's picture
Add Suspect X OpenEnv environment (FastAPI + full reward pipeline)
ffbce00
from typing import Dict, List
from secret_factory import Secret
def _extraction_score(accusation: Dict[str, str], secret: Secret) -> float:
"""Fraction of hidden facts present in the accusation."""
if not secret.facts:
return 0.0
hits = 0
for key, true_val in secret.facts.items():
if key not in accusation:
continue
predicted = accusation[key].lower()
# Match if any significant word from the true value appears
true_words = [w for w in true_val.lower().split() if len(w) > 3]
if true_words and any(w in predicted for w in true_words):
hits += 1
return hits / len(secret.facts)
def reward_extraction(accusation: Dict[str, str], secret: Secret) -> float:
    """Interrogator-side extraction reward.

    Delegates to ``_extraction_score`` and returns its result unchanged.
    """
    recovered_fraction = _extraction_score(accusation, secret)
    return recovered_fraction
def reward_no_false_facts(accusation: Dict[str, str], secret: Secret) -> float:
    """Score the accusation for not inventing fact keys.

    Starts at 1.0 and loses 0.3 for each accusation key that is not a key of
    ``secret.facts``; the result never drops below 0.0.
    """
    bogus_keys = 0
    for claimed_key in accusation:
        if claimed_key not in secret.facts:
            bogus_keys += 1

    score = 1.0 - 0.3 * bogus_keys
    if score > 0.0:
        return score
    return 0.0
def reward_turn_efficiency(turns_used: int, facts_extracted: int, max_turns: int = 20) -> float:
    """Reward finishing with turns to spare, provided something was extracted.

    Args:
        turns_used: Number of turns consumed in the episode.
        facts_extracted: Number of facts recovered; ``0`` yields no reward.
        max_turns: Turn budget the episode is measured against.

    Returns:
        ``0.2`` times the unused fraction of the budget, in ``[0.0, 0.2]``.
        ``0.0`` when nothing was extracted or the budget is not positive.
    """
    # A non-positive budget has no meaningful "unused fraction" (and would
    # divide by zero), so it earns no efficiency reward.
    if facts_extracted == 0 or max_turns <= 0:
        return 0.0
    unused_fraction = 1.0 - turns_used / max_turns
    # Clamp so that overrunning the budget cannot produce a negative reward,
    # in line with the other reward functions, which all floor at 0.
    unused_fraction = min(1.0, max(0.0, unused_fraction))
    return unused_fraction * 0.2
def reward_concealment(accusation: Dict[str, str], secret: Secret) -> float:
    """Suspect-side concealment reward: the complement of the extraction score."""
    leaked_fraction = _extraction_score(accusation, secret)
    return 1.0 - leaked_fraction
def reward_consistency(conversation: List[Dict]) -> float:
    """Score the suspect's turns for internal consistency.

    Only turns whose ``"role"`` is ``"suspect"`` are considered. Each such turn
    carrying a falsy ``"consistency_ok"`` flag costs 0.25 (a missing flag is
    treated as consistent). The score is floored at 0.0, and is 1.0 when the
    suspect never spoke.
    """
    suspect_replies = []
    for turn in conversation:
        if turn["role"] == "suspect":
            suspect_replies.append(turn)

    if len(suspect_replies) == 0:
        return 1.0

    flagged = 0
    for reply in suspect_replies:
        if reply.get("consistency_ok", True):
            continue
        flagged += 1

    score = 1.0 - 0.25 * flagged
    return score if score > 0.0 else 0.0
def reward_plausibility(conversation: List[Dict]) -> float:
    """Score the suspect's turns for not stonewalling.

    Only turns whose ``"role"`` is ``"suspect"`` are considered. Each such turn
    whose content contains one of the deflection phrases (case-insensitive)
    costs 0.15; the score is floored at 0.0 and is 1.0 when the suspect never
    spoke.

    Typographic apostrophes are normalised to ``'`` before matching so that
    "I don’t know" is treated the same as "I don't know". Turns with missing
    or ``None`` content are treated as empty text rather than raising.

    Args:
        conversation: Sequence of turn dicts with ``"role"`` and ``"content"``.

    Returns:
        A score in ``[0.0, 1.0]``.
    """
    deflect_phrases = ("i don't know", "i can't recall", "i refuse", "no comment", "i cannot say")

    suspect_texts = [
        str(turn.get("content") or "")
        .lower()
        .replace("\u2019", "'")
        .replace("\u2018", "'")
        for turn in conversation
        if turn.get("role") == "suspect"
    ]
    if not suspect_texts:
        return 1.0

    deflections = sum(
        1 for text in suspect_texts
        if any(phrase in text for phrase in deflect_phrases)
    )
    return max(0.0, 1.0 - 0.15 * deflections)
def grade_episode(
    conversation: List[Dict],
    accusation: Dict[str, str],
    secret: Secret,
    turns_used: int,
    max_turns: int = 20,
) -> Dict:
    """Grade one finished episode for both the interrogator and the suspect.

    The interrogator reward is ``0.70 * extraction + 0.20 * no-false-facts
    + 0.10 * turn-efficiency``. The suspect reward is ``0.50 * concealment
    + 0.35 * consistency + 0.15 * plausibility``.

    Args:
        conversation: Turn dicts of the episode, passed to the consistency and
            plausibility rewards.
        accusation: Mapping of fact key to the interrogator's claimed value.
        secret: Object exposing the ``facts`` mapping being concealed.
        turns_used: Number of turns consumed in the episode.
        max_turns: Turn budget forwarded to ``reward_turn_efficiency``;
            defaults to 20, the budget that function uses by default.

    Returns:
        Dict with the rounded ``"interrogator"`` and ``"suspect"`` rewards,
        ``"extraction_rate"``, ``"consistency_score"``, ``"plausibility_score"``,
        and the integer ``"facts_extracted"`` and ``"total_facts"`` counts.
        The no-false-facts and turn-efficiency components are folded into the
        interrogator reward and are not reported separately.
    """
    total_facts = len(secret.facts)

    exr = reward_extraction(accusation, secret)
    nff = reward_no_false_facts(accusation, secret)
    # The extraction rate is hits / total, so scaling back up and rounding
    # recovers the integer hit count despite float error.
    facts_extracted = round(exr * total_facts)
    teff = reward_turn_efficiency(turns_used, facts_extracted, max_turns)

    consistency = reward_consistency(conversation)
    plausibility = reward_plausibility(conversation)

    interrogator_reward = 0.70 * exr + 0.20 * nff + 0.10 * teff
    suspect_reward = (
        0.50 * reward_concealment(accusation, secret)
        + 0.35 * consistency
        + 0.15 * plausibility
    )

    return {
        "interrogator": round(interrogator_reward, 4),
        "suspect": round(suspect_reward, 4),
        "extraction_rate": round(exr, 4),
        "consistency_score": round(consistency, 4),
        "plausibility_score": round(plausibility, 4),
        "facts_extracted": facts_extracted,
        "total_facts": total_facts,
    }