File size: 3,172 Bytes
ffbce00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from typing import Dict, List
from secret_factory import Secret


def _extraction_score(accusation: Dict[str, str], secret: Secret) -> float:
    """Return the fraction of hidden facts that the accusation recovers.

    A fact counts as recovered when the accusation has an entry under the
    same key and that entry matches the true value (case-insensitively):

    * If the true value has at least one word longer than three characters,
      it is enough for one such word to occur anywhere in the predicted
      text (substring match).
    * Otherwise (values such as "Bob" or "9 pm") every word of the true
      value must appear as a whole whitespace-separated token of the
      prediction, ignoring surrounding punctuation. Without this fallback
      such facts could never be matched and the score could never reach 1.0.

    Args:
        accusation: Mapping of fact key to the predicted value.
        secret: Object whose ``facts`` mapping holds the true values.

    Returns:
        ``hits / len(secret.facts)``, or 0.0 when ``secret.facts`` is empty.
    """
    if not secret.facts:
        return 0.0

    strip_chars = ".,;:!?\"'()[]"
    hits = 0
    for key, true_val in secret.facts.items():
        if key not in accusation:
            continue
        # str() keeps a non-string value from raising AttributeError on .lower().
        predicted = str(accusation[key]).lower()
        words = str(true_val).lower().split()
        significant = [w for w in words if len(w) > 3]
        if significant:
            matched = any(w in predicted for w in significant)
        else:
            # Short words are too ambiguous for substring matching
            # ("at" is inside "cat"), so compare whole tokens instead.
            short = [w.strip(strip_chars) for w in words]
            short = [w for w in short if w]
            predicted_tokens = {t.strip(strip_chars) for t in predicted.split()}
            matched = bool(short) and all(w in predicted_tokens for w in short)
        if matched:
            hits += 1
    return hits / len(secret.facts)


def reward_extraction(accusation: Dict[str, str], secret: Secret) -> float:
    """Return the extraction score for the accusation, unchanged."""
    return _extraction_score(accusation, secret)


def reward_no_false_facts(accusation: Dict[str, str], secret: Secret) -> float:
    """Score how free the accusation is of invented fact keys.

    Starts at 1.0 and loses 0.3 for every accusation key that is not a key
    of ``secret.facts``; the result never drops below 0.0.
    """
    known_keys = secret.facts
    bogus = [key for key in accusation if key not in known_keys]
    return max(0.0, 1.0 - 0.3 * len(bogus))


def reward_turn_efficiency(turns_used: int, facts_extracted: int, max_turns: int = 20) -> float:
    """Return a small bonus for finishing with turns to spare.

    The bonus is the unused share of the turn budget scaled by 0.2, so it
    lies in ``[0.0, 0.2]``.

    Args:
        turns_used: Number of turns consumed in the episode.
        facts_extracted: Number of facts recovered; zero yields no bonus.
        max_turns: Turn budget the usage is measured against.

    Returns:
        0.0 when nothing was extracted or the budget is not positive,
        otherwise ``(1 - turns_used / max_turns) * 0.2`` with the unused
        share clamped to ``[0, 1]``.
    """
    # A non-positive budget would divide by zero (or flip the sign).
    if facts_extracted == 0 or max_turns <= 0:
        return 0.0
    unused_share = 1.0 - turns_used / max_turns
    # Overrunning the budget must not turn the bonus into a penalty.
    unused_share = min(1.0, max(0.0, unused_share))
    return unused_share * 0.2


def reward_concealment(accusation: Dict[str, str], secret: Secret) -> float:
    """Return the complement of the extraction score (1.0 minus it)."""
    extracted_share = _extraction_score(accusation, secret)
    return 1.0 - extracted_share


def reward_consistency(conversation: List[Dict]) -> float:
    """Score the suspect's turns for flagged contradictions.

    Only turns whose ``"role"`` is ``"suspect"`` are inspected. Each such
    turn with a falsy ``"consistency_ok"`` entry (a missing entry counts as
    consistent) costs 0.25, starting from 1.0 and never going below 0.0.
    A conversation with no suspect turns therefore scores 1.0.
    """
    flagged = 0
    for turn in conversation:
        if turn["role"] != "suspect":
            continue
        if not turn.get("consistency_ok", True):
            flagged += 1
    return max(0.0, 1.0 - 0.25 * flagged)


def reward_plausibility(conversation: List[Dict]) -> float:
    """Score the suspect's turns for stock deflection phrases.

    Only turns whose ``"role"`` is ``"suspect"`` are inspected. A turn is a
    deflection when its lower-cased ``"content"`` contains any of the stock
    phrases; each deflecting turn costs 0.15 (counted once per turn),
    starting from 1.0 and never going below 0.0.

    Typographic apostrophes (U+2019) are normalised to ``'`` so that
    "don\u2019t" matches "don't". A missing or ``None`` content is treated
    as empty text rather than raising.

    Args:
        conversation: Turn dicts with a ``"role"`` key and, for suspect
            turns, normally a ``"content"`` string.

    Returns:
        A score in ``[0.0, 1.0]``; 1.0 when there are no suspect turns.
    """
    deflect_phrases = ("i don't know", "i can't recall", "i refuse", "no comment", "i cannot say")
    deflections = 0
    for turn in conversation:
        if turn["role"] != "suspect":
            continue
        # A turn without text cannot contain a deflection phrase.
        content = turn.get("content") or ""
        text = str(content).lower().replace("\u2019", "'")
        if any(phrase in text for phrase in deflect_phrases):
            deflections += 1
    return max(0.0, 1.0 - 0.15 * deflections)


def grade_episode(
    conversation: List[Dict],
    accusation: Dict[str, str],
    secret: Secret,
    turns_used: int,
) -> Dict:
    """Combine the individual reward terms into one episode report.

    The interrogator total is 0.70 * extraction + 0.20 * no-false-facts
    + 0.10 * turn efficiency; the suspect total is 0.50 * concealment
    + 0.35 * consistency + 0.15 * plausibility. Float entries in the
    returned dict are rounded to four decimal places.

    Returns:
        Dict with keys ``interrogator``, ``suspect``, ``extraction_rate``,
        ``consistency_score``, ``plausibility_score``, ``facts_extracted``
        and ``total_facts``.
    """
    total_facts = len(secret.facts)

    # Interrogator-side terms.
    extraction = reward_extraction(accusation, secret)
    no_false = reward_no_false_facts(accusation, secret)
    # Turn the extraction rate back into a whole number of facts.
    recovered = round(extraction * total_facts)
    efficiency = reward_turn_efficiency(turns_used, recovered)

    # Suspect-side terms.
    consistency = reward_consistency(conversation)
    plausibility = reward_plausibility(conversation)
    concealment = reward_concealment(accusation, secret)

    interrogator_total = 0.70 * extraction + 0.20 * no_false + 0.10 * efficiency
    suspect_total = 0.50 * concealment + 0.35 * consistency + 0.15 * plausibility

    report = {
        "interrogator": round(interrogator_total, 4),
        "suspect": round(suspect_total, 4),
        "extraction_rate": round(extraction, 4),
        "consistency_score": round(consistency, 4),
        "plausibility_score": round(plausibility, 4),
        "facts_extracted": recovered,
        "total_facts": total_facts,
    }
    return report