File size: 4,297 Bytes
989cfe7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Unit tests for src.evaluation."""

from __future__ import annotations

import json

import pytest

from src.evaluation import (
    EvaluationParseError,
    ParsedTurn,
    format_evaluation,
    format_score,
    parse_model_output,
)


VALID_EVAL = {
    "competent": 5,
    "likeable": 4,
    "considerate": 6,
    "polite": 7,
    "formal": 3,
    "demanding": 2,
}


def _valid_payload(**overrides) -> str:
    """Build a JSON string representing a well-formed model output."""
    payload = {"response": "Hi there!", "evaluation": dict(VALID_EVAL)}
    payload.update(overrides)
    return json.dumps(payload)


# ---- format_score ----

def test_format_score_basic():
    """Spot-check the rendered text for a low, a middle and the top score."""
    expected_by_call = {
        ("polite", 2): "barely polite (2/7)",
        ("competent", 5): "quite competent (5/7)",
        ("demanding", 7): "extremely demanding (7/7)",
    }
    for (attribute, score), rendered in expected_by_call.items():
        assert format_score(attribute, score) == rendered


def test_format_score_all_levels_render():
    """Every score from 1 through 7 renders text containing ``(n/7)``."""
    for level in (1, 2, 3, 4, 5, 6, 7):
        assert f"({level}/7)" in format_score("attr", level)


@pytest.mark.parametrize("bad_score", (0, 8, -1, 100))
def test_format_score_rejects_out_of_range(bad_score: int):
    """Each parametrized score lies outside 1..7 and must raise ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        format_score("polite", bad_score)


# ---- format_evaluation ----

def test_format_evaluation_returns_ordered_list():
    """Default formatting yields six entries, the first ending in ``(5/7)``."""
    rendered = format_evaluation(VALID_EVAL)
    # Check the length before indexing so an empty result fails on the count.
    assert len(rendered) == 6
    head = rendered[0]
    assert head.endswith("(5/7)")  # competent first by default


def test_format_evaluation_custom_order():
    """An explicit ``attributes`` list selects and orders the rendered entries."""
    wanted = ("polite", "formal")
    rendered = format_evaluation(VALID_EVAL, attributes=list(wanted))
    assert len(rendered) == 2
    for position, attribute in enumerate(wanted):
        assert attribute in rendered[position]


# ---- parse_model_output: happy paths ----

def test_parse_model_output_returns_parsed_turn():
    """A well-formed payload parses into a ``ParsedTurn`` with both fields set."""
    payload = _valid_payload()
    parsed = parse_model_output(payload)
    assert isinstance(parsed, ParsedTurn)
    assert parsed.response == "Hi there!"
    assert parsed.evaluation == VALID_EVAL


def test_parse_model_output_strips_json_fences():
    """A payload wrapped in a ```json fence still yields the evaluation."""
    fenced = "```json\n" + _valid_payload() + "\n```"
    assert parse_model_output(fenced).evaluation == VALID_EVAL


def test_parse_model_output_strips_plain_fences():
    """A payload wrapped in an unlabelled ``` fence still yields the evaluation."""
    fenced = "\n".join(["```", _valid_payload(), "```"])
    assert parse_model_output(fenced).evaluation == VALID_EVAL


def test_parse_model_output_ignores_extra_attributes():
    """An extra evaluation key in the input is absent from the parsed result."""
    evaluation = {**VALID_EVAL, "unexpected": 5}
    payload = json.dumps({"response": "hi", "evaluation": evaluation})
    parsed = parse_model_output(payload)
    assert "unexpected" not in parsed.evaluation


# ---- parse_model_output: error paths ----

def test_parse_model_output_rejects_invalid_json():
    """Non-JSON text raises ``EvaluationParseError`` matching "parse JSON"."""
    garbage = "not even close to json"
    with pytest.raises(EvaluationParseError, match="parse JSON"):
        parse_model_output(garbage)


def test_parse_model_output_rejects_missing_response():
    """A payload without ``response`` raises an error matching "response"."""
    without_response = {"evaluation": VALID_EVAL}
    payload = json.dumps(without_response)
    with pytest.raises(EvaluationParseError, match="response"):
        parse_model_output(payload)


def test_parse_model_output_rejects_missing_evaluation():
    """A payload without ``evaluation`` raises an error matching "evaluation"."""
    without_evaluation = {"response": "hi"}
    payload = json.dumps(without_evaluation)
    with pytest.raises(EvaluationParseError, match="evaluation"):
        parse_model_output(payload)


def test_parse_model_output_rejects_missing_attribute():
    """An evaluation lacking ``polite`` raises an error matching "polite"."""
    partial = {
        name: score for name, score in VALID_EVAL.items() if name != "polite"
    }
    payload = json.dumps({"response": "hi", "evaluation": partial})
    with pytest.raises(EvaluationParseError, match="polite"):
        parse_model_output(payload)


def test_parse_model_output_rejects_out_of_range_score():
    """A score of 9 raises an error matching "out of range"."""
    too_high = {**VALID_EVAL, "polite": 9}
    payload = json.dumps({"response": "hi", "evaluation": too_high})
    with pytest.raises(EvaluationParseError, match="out of range"):
        parse_model_output(payload)


def test_parse_model_output_rejects_bool_score():
    """A boolean score raises an error matching "not an integer"."""
    # bool is a subclass of int, so isinstance(True, int) is True; the parser
    # is still expected to reject it as a score.
    with_bool = {**VALID_EVAL, "polite": True}
    payload = json.dumps({"response": "hi", "evaluation": with_bool})
    with pytest.raises(EvaluationParseError, match="not an integer"):
        parse_model_output(payload)


def test_parse_model_output_rejects_float_score():
    """A fractional score (5.5) raises an error matching "not an integer"."""
    with_float = {**VALID_EVAL, "polite": 5.5}
    payload = json.dumps({"response": "hi", "evaluation": with_float})
    with pytest.raises(EvaluationParseError, match="not an integer"):
        parse_model_output(payload)