| from typing import Dict, List |
| from secret_factory import Secret |
|
|
|
|
def _extraction_score(accusation: Dict[str, str], secret: Secret) -> float:
    """Return the fraction of hidden facts that the accusation recovers.

    A fact counts as recovered when the accusation has an entry under the
    same key and that entry contains (case-insensitively, as a substring) at
    least one word of the true value that is longer than three characters.
    When the true value has no such word (for example "NYC" or "7 pm"), the
    whole whitespace-normalised true value must appear in the entry instead,
    so that short facts remain extractable.

    Args:
        accusation: Mapping of fact key to the value the accuser claims.
        secret: Object whose ``facts`` mapping holds the true values.

    Returns:
        ``hits / len(secret.facts)``, or 0.0 when there are no facts.
    """
    if not secret.facts:
        return 0.0
    hits = 0
    for key, true_val in secret.facts.items():
        if key not in accusation:
            continue
        # Coerce to text so a non-string entry (number, None) is scored
        # rather than raising; collapse whitespace so the multi-word
        # fallback below is not defeated by irregular spacing.
        predicted = " ".join(str(accusation[key]).lower().split())

        words = true_val.lower().split()
        # Short words are skipped so that filler such as "the" or "of"
        # cannot produce a match on its own.
        needles = [w for w in words if len(w) > 3]
        if not needles and words:
            # Every word is short: require the full value instead of
            # declaring the fact impossible to extract.
            needles = [" ".join(words)]
        if any(needle in predicted for needle in needles):
            hits += 1
    return hits / len(secret.facts)
|
|
|
|
def reward_extraction(accusation: Dict[str, str], secret: Secret) -> float:
    """Return the extraction score of ``accusation`` against ``secret``.

    This is a direct pass-through to ``_extraction_score``.
    """
    recovered_fraction = _extraction_score(accusation, secret)
    return recovered_fraction
|
|
|
|
def reward_no_false_facts(accusation: Dict[str, str], secret: Secret) -> float:
    """Score the accusation for sticking to keys that exist in the secret.

    Starts at 1.0 and loses 0.3 for each accusation key that is not a key of
    ``secret.facts``; the result never drops below 0.0.
    """
    unknown_keys = [key for key in accusation if key not in secret.facts]
    score = 1.0 - 0.3 * len(unknown_keys)
    if score > 0.0:
        return score
    return 0.0
|
|
|
|
def reward_turn_efficiency(turns_used: int, facts_extracted: int, max_turns: int = 20) -> float:
    """Return a small bonus for extracting facts in few turns.

    The bonus is the unused share of the turn budget scaled by 0.2, so it
    lies in [0.0, 0.2]. Using more turns than the budget yields 0.0 rather
    than a negative value, matching the other reward functions in this
    module, which all floor at 0.0.

    Args:
        turns_used: Number of turns consumed in the episode.
        facts_extracted: Number of facts recovered; no bonus is given when
            this is zero or negative.
        max_turns: Turn budget against which ``turns_used`` is measured.

    Returns:
        A float in [0.0, 0.2].

    Raises:
        ValueError: If ``max_turns`` is not positive while at least one fact
            was extracted.
    """
    # No efficiency credit without any extracted fact.
    if facts_extracted <= 0:
        return 0.0
    if max_turns <= 0:
        raise ValueError(f"max_turns must be positive, got {max_turns}")
    unused_share = 1.0 - turns_used / max_turns
    # Clamp so overrunning the budget (or a negative turn count) cannot push
    # the bonus outside its intended range.
    unused_share = min(1.0, max(0.0, unused_share))
    return unused_share * 0.2
|
|
|
|
def reward_concealment(accusation: Dict[str, str], secret: Secret) -> float:
    """Return the complement of the extraction score.

    Computed as ``1.0 - _extraction_score(accusation, secret)``, so the
    value rises as fewer facts are matched by the accusation.
    """
    leaked_fraction = _extraction_score(accusation, secret)
    return 1.0 - leaked_fraction
|
|
|
|
def reward_consistency(conversation: List[Dict]) -> float:
    """Score the suspect's turns for flagged contradictions.

    Each turn whose ``"role"`` is ``"suspect"`` and whose
    ``"consistency_ok"`` entry is present and falsy costs 0.25; a missing
    entry counts as consistent. The score starts at 1.0 (also returned when
    there are no suspect turns) and never drops below 0.0.
    """
    flagged = 0
    for turn in conversation:
        if turn["role"] != "suspect":
            continue
        if not turn.get("consistency_ok", True):
            flagged += 1
    score = 1.0 - 0.25 * flagged
    if score > 0.0:
        return score
    return 0.0
|
|
|
|
def reward_plausibility(conversation: List[Dict]) -> float:
    """Score the suspect's turns for avoiding stock deflection phrases.

    Each turn whose ``"role"`` is ``"suspect"`` and whose ``"content"``
    contains one of the deflection phrases (case-insensitive substring
    match) costs 0.15. The score starts at 1.0 (also returned when there are
    no suspect turns) and never drops below 0.0.

    Typographic apostrophes are folded to ``'`` before matching, so
    "I don’t know" is detected the same way as "I don't know". A suspect
    turn with missing or ``None`` content is treated as empty text.

    Args:
        conversation: Sequence of turn dicts with at least a ``"role"`` key.

    Returns:
        A float in [0.0, 1.0].
    """
    deflect_phrases = ("i don't know", "i can't recall", "i refuse", "no comment", "i cannot say")
    # Generated text frequently uses curly apostrophes, which would otherwise
    # slip past the ASCII phrases above.
    apostrophe_map = str.maketrans({"\u2019": "'", "\u2018": "'", "\u02bc": "'"})

    deflections = 0
    for turn in conversation:
        if turn["role"] != "suspect":
            continue
        text = (turn.get("content") or "").translate(apostrophe_map).lower()
        if any(phrase in text for phrase in deflect_phrases):
            deflections += 1
    return max(0.0, 1.0 - 0.15 * deflections)
|
|
|
|
def grade_episode(
    conversation: List[Dict],
    accusation: Dict[str, str],
    secret: Secret,
    turns_used: int,
) -> Dict:
    """Compute both players' rewards and summary metrics for one episode.

    The interrogator reward is ``0.70 * extraction + 0.20 * no-false-facts +
    0.10 * turn-efficiency``; the suspect reward is ``0.50 * concealment +
    0.35 * consistency + 0.15 * plausibility``. Turn efficiency is evaluated
    with the default turn budget of ``reward_turn_efficiency``.

    Returns:
        A dict with the keys ``"interrogator"``, ``"suspect"``,
        ``"extraction_rate"``, ``"consistency_score"``,
        ``"plausibility_score"`` (each rounded to 4 decimals), plus the
        integer ``"facts_extracted"`` and ``"total_facts"``.
    """
    # Interrogator-side components.
    extraction = reward_extraction(accusation, secret)
    no_false_facts = reward_no_false_facts(accusation, secret)
    total_facts = len(secret.facts)
    # The extraction reward is a fraction; turn it back into a fact count.
    extracted_count = round(extraction * total_facts)
    efficiency = reward_turn_efficiency(turns_used, extracted_count)

    # Suspect-side components.
    consistency = reward_consistency(conversation)
    plausibility = reward_plausibility(conversation)
    concealment = reward_concealment(accusation, secret)

    interrogator_total = 0.70 * extraction + 0.20 * no_false_facts + 0.10 * efficiency
    suspect_total = 0.50 * concealment + 0.35 * consistency + 0.15 * plausibility

    report = {
        "interrogator": round(interrogator_total, 4),
        "suspect": round(suspect_total, 4),
        "extraction_rate": round(extraction, 4),
        "consistency_score": round(consistency, 4),
        "plausibility_score": round(plausibility, 4),
        "facts_extracted": extracted_count,
        "total_facts": total_facts,
    }
    return report
|
|