"""Unit tests for src.evaluation."""
from __future__ import annotations
import json
import pytest
from src.evaluation import (
EvaluationParseError,
ParsedTurn,
format_evaluation,
format_score,
parse_model_output,
)
# Baseline evaluation shared by the tests below: one integer score per
# attribute, each inside the 1-7 range that the format_score tests accept.
# Insertion order matters to the default-ordering test ("competent" is first).
VALID_EVAL = {
    "competent": 5,
    "likeable": 4,
    "considerate": 6,
    "polite": 7,
    "formal": 3,
    "demanding": 2,
}
def _valid_payload(**overrides) -> str:
    """Serialize a well-formed model output to a JSON string.

    The base payload holds a fixed response text and a fresh copy of
    ``VALID_EVAL``; any keyword arguments replace or add top-level keys
    before the payload is serialized.
    """
    base = {"response": "Hi there!", "evaluation": dict(VALID_EVAL)}
    return json.dumps({**base, **overrides})
# ---- format_score ----
def test_format_score_basic():
    """Spot-check the rendered label for a few attribute/score pairs."""
    expected_by_input = {
        ("polite", 2): "barely polite (2/7)",
        ("competent", 5): "quite competent (5/7)",
        ("demanding", 7): "extremely demanding (7/7)",
    }
    for (attribute, score), rendered in expected_by_input.items():
        assert format_score(attribute, score) == rendered
def test_format_score_all_levels_render():
    """Each score from 1 through 7 shows up as ``(score/7)`` in the output."""
    for level in (1, 2, 3, 4, 5, 6, 7):
        marker = f"({level}/7)"
        assert marker in format_score("attr", level)
@pytest.mark.parametrize("bad_score", (0, 8, -1, 100))
def test_format_score_rejects_out_of_range(bad_score: int):
    """Scores just outside and far outside the valid range raise ValueError."""
    attribute = "polite"
    with pytest.raises(ValueError):
        format_score(attribute, bad_score)
# ---- format_evaluation ----
def test_format_evaluation_returns_ordered_list():
    """Default formatting yields six entries, led by the score-5 attribute."""
    entries = format_evaluation(VALID_EVAL)
    assert len(entries) == 6
    # "competent" holds score 5 in VALID_EVAL and is expected to come first.
    leading = entries[0]
    assert leading.endswith("(5/7)")
def test_format_evaluation_custom_order():
    """An explicit ``attributes`` list selects and orders the rendered entries."""
    rendered = format_evaluation(VALID_EVAL, attributes=["polite", "formal"])
    assert len(rendered) == 2
    first, second = rendered[0], rendered[1]
    assert "polite" in first
    assert "formal" in second
# ---- parse_model_output: happy paths ----
def test_parse_model_output_returns_parsed_turn():
    """A well-formed payload parses into a ParsedTurn with both fields intact."""
    raw = _valid_payload()
    parsed = parse_model_output(raw)
    assert isinstance(parsed, ParsedTurn)
    assert parsed.response == "Hi there!"
    assert parsed.evaluation == VALID_EVAL
def test_parse_model_output_strips_json_fences():
    """A payload wrapped in a ```json fenced block still parses."""
    fenced = "\n".join(["```json", _valid_payload(), "```"])
    parsed = parse_model_output(fenced)
    assert parsed.evaluation == VALID_EVAL
def test_parse_model_output_strips_plain_fences():
    """A payload wrapped in a bare ``` fenced block still parses."""
    fenced = "\n".join(["```", _valid_payload(), "```"])
    parsed = parse_model_output(fenced)
    assert parsed.evaluation == VALID_EVAL
def test_parse_model_output_ignores_extra_attributes():
    """An unknown attribute in the evaluation is absent from the parsed result."""
    evaluation = {**VALID_EVAL, "unexpected": 5}
    raw = json.dumps({"response": "hi", "evaluation": evaluation})
    parsed = parse_model_output(raw)
    assert "unexpected" not in parsed.evaluation
# ---- parse_model_output: error paths ----
def test_parse_model_output_rejects_invalid_json():
    """Non-JSON text raises EvaluationParseError mentioning "parse JSON"."""
    garbage = "not even close to json"
    with pytest.raises(EvaluationParseError, match="parse JSON"):
        parse_model_output(garbage)
def test_parse_model_output_rejects_missing_response():
    """A payload without a "response" key raises an error naming that key."""
    payload = {"evaluation": VALID_EVAL}
    serialized = json.dumps(payload)
    with pytest.raises(EvaluationParseError, match="response"):
        parse_model_output(serialized)
def test_parse_model_output_rejects_missing_evaluation():
    """A payload without an "evaluation" key raises an error naming that key."""
    payload = {"response": "hi"}
    serialized = json.dumps(payload)
    with pytest.raises(EvaluationParseError, match="evaluation"):
        parse_model_output(serialized)
def test_parse_model_output_rejects_missing_attribute():
    """Dropping "polite" from the evaluation raises an error naming it."""
    without_polite = {
        name: score for name, score in VALID_EVAL.items() if name != "polite"
    }
    serialized = json.dumps({"response": "hi", "evaluation": without_polite})
    with pytest.raises(EvaluationParseError, match="polite"):
        parse_model_output(serialized)
def test_parse_model_output_rejects_out_of_range_score():
    """A score of 9 raises EvaluationParseError mentioning "out of range"."""
    too_high = {**VALID_EVAL, "polite": 9}
    serialized = json.dumps({"response": "hi", "evaluation": too_high})
    with pytest.raises(EvaluationParseError, match="out of range"):
        parse_model_output(serialized)
def test_parse_model_output_rejects_bool_score():
    """A boolean score raises EvaluationParseError mentioning "not an integer"."""
    # bool is a subclass of int, so isinstance(True, int) is True; this test
    # pins that the parser still refuses it as a score.
    with_bool = {**VALID_EVAL, "polite": True}
    serialized = json.dumps({"response": "hi", "evaluation": with_bool})
    with pytest.raises(EvaluationParseError, match="not an integer"):
        parse_model_output(serialized)
def test_parse_model_output_rejects_float_score():
    """A fractional score raises EvaluationParseError mentioning "not an integer"."""
    with_float = {**VALID_EVAL, "polite": 5.5}
    serialized = json.dumps({"response": "hi", "evaluation": with_float})
    with pytest.raises(EvaluationParseError, match="not an integer"):
        parse_model_output(serialized)