# prisma-chatbot / tests/test_evaluation.py
# Last commit by RolandM:
#   Add evaluation module with parsing, validation, and formatting
#   (989cfe7)
"""Unit tests for src.evaluation."""
from __future__ import annotations
import json
import pytest
from src.evaluation import (
EvaluationParseError,
ParsedTurn,
format_evaluation,
format_score,
parse_model_output,
)
# Reference evaluation shared by the tests below: one integer score per
# attribute, all of them distinct so a test can tell which attribute a
# rendered "(n/7)" marker belongs to.
VALID_EVAL: dict[str, int] = {
    "competent": 5,
    "likeable": 4,
    "considerate": 6,
    "polite": 7,
    "formal": 3,
    "demanding": 2,
}
def _valid_payload(**overrides) -> str:
    """Serialize a well-formed model output to a JSON string.

    The default payload carries a ``response`` string and an ``evaluation``
    mapping copied from ``VALID_EVAL``. Keyword arguments replace or extend
    those top-level keys.
    """
    defaults = {"response": "Hi there!", "evaluation": dict(VALID_EVAL)}
    return json.dumps({**defaults, **overrides})
# ---- format_score ----
def test_format_score_basic():
    """Check the rendered text for a few representative scores."""
    expected = {
        ("polite", 2): "barely polite (2/7)",
        ("competent", 5): "quite competent (5/7)",
        ("demanding", 7): "extremely demanding (7/7)",
    }
    for (attribute, score), rendered in expected.items():
        assert format_score(attribute, score) == rendered
def test_format_score_all_levels_render():
    """Every score from 1 through 7 renders and includes its ``(n/7)`` marker."""
    markers = {level: f"({level}/7)" for level in range(1, 8)}
    for level, marker in markers.items():
        assert marker in format_score("attr", level)
@pytest.mark.parametrize(
    "bad_score",
    (0, 8, -1, 100),
)
def test_format_score_rejects_out_of_range(bad_score: int):
    """Each listed out-of-range score must make ``format_score`` raise."""
    with pytest.raises(ValueError):
        format_score("polite", bad_score)
# ---- format_evaluation ----
def test_format_evaluation_returns_ordered_list():
    """Default formatting yields six entries, with competent first."""
    formatted = format_evaluation(VALID_EVAL)
    assert len(formatted) == 6
    # VALID_EVAL scores "competent" at 5, so the leading entry ends in (5/7).
    leading = formatted[0]
    assert leading.endswith("(5/7)")
def test_format_evaluation_custom_order():
    """An explicit ``attributes`` list selects and orders the output."""
    formatted = format_evaluation(VALID_EVAL, attributes=["polite", "formal"])
    assert len(formatted) == 2
    # Entries must appear in the requested order, not the default one.
    for name, entry in zip(("polite", "formal"), formatted):
        assert name in entry
# ---- parse_model_output: happy paths ----
def test_parse_model_output_returns_parsed_turn():
    """A well-formed payload parses into a ``ParsedTurn`` with both fields."""
    parsed = parse_model_output(_valid_payload())
    assert isinstance(parsed, ParsedTurn)
    assert parsed.response == "Hi there!"
    assert parsed.evaluation == VALID_EVAL
def test_parse_model_output_strips_json_fences():
    """A payload wrapped in a ```json code fence still parses."""
    fenced = "\n".join(["```json", _valid_payload(), "```"])
    parsed = parse_model_output(fenced)
    assert parsed.evaluation == VALID_EVAL
def test_parse_model_output_strips_plain_fences():
    """A payload wrapped in an unlabelled code fence still parses."""
    fenced = "\n".join(["```", _valid_payload(), "```"])
    parsed = parse_model_output(fenced)
    assert parsed.evaluation == VALID_EVAL
def test_parse_model_output_ignores_extra_attributes():
    """An attribute not in the expected set is dropped from the result."""
    with_extra = {**VALID_EVAL, "unexpected": 5}
    payload = json.dumps({"response": "hi", "evaluation": with_extra})
    parsed = parse_model_output(payload)
    assert "unexpected" not in parsed.evaluation
# ---- parse_model_output: error paths ----
def test_parse_model_output_rejects_invalid_json():
    """Text that is not JSON is reported as a JSON parse failure."""
    garbage = "not even close to json"
    with pytest.raises(EvaluationParseError, match="parse JSON"):
        parse_model_output(garbage)
def test_parse_model_output_rejects_missing_response():
    """A payload without ``response`` fails with an error naming that key."""
    payload = json.dumps({"evaluation": VALID_EVAL})
    with pytest.raises(EvaluationParseError, match="response"):
        parse_model_output(payload)
def test_parse_model_output_rejects_missing_evaluation():
    """A payload without ``evaluation`` fails with an error naming that key."""
    payload = json.dumps({"response": "hi"})
    with pytest.raises(EvaluationParseError, match="evaluation"):
        parse_model_output(payload)
def test_parse_model_output_rejects_missing_attribute():
    """Omitting one attribute fails with an error naming that attribute."""
    without_polite = {
        name: score for name, score in VALID_EVAL.items() if name != "polite"
    }
    payload = json.dumps({"response": "hi", "evaluation": without_polite})
    with pytest.raises(EvaluationParseError, match="polite"):
        parse_model_output(payload)
def test_parse_model_output_rejects_out_of_range_score():
    """A score of 9 is rejected as out of range."""
    too_high = {**VALID_EVAL, "polite": 9}
    payload = json.dumps({"response": "hi", "evaluation": too_high})
    with pytest.raises(EvaluationParseError, match="out of range"):
        parse_model_output(payload)
def test_parse_model_output_rejects_bool_score():
    """A boolean score is rejected as not an integer."""
    # bool is a subclass of int in Python, so isinstance(True, int) holds;
    # the parser is still expected to refuse it as a score.
    with_bool = {**VALID_EVAL, "polite": True}
    payload = json.dumps({"response": "hi", "evaluation": with_bool})
    with pytest.raises(EvaluationParseError, match="not an integer"):
        parse_model_output(payload)
def test_parse_model_output_rejects_float_score():
    """A fractional score is rejected as not an integer."""
    with_float = {**VALID_EVAL, "polite": 5.5}
    payload = json.dumps({"response": "hi", "evaluation": with_float})
    with pytest.raises(EvaluationParseError, match="not an integer"):
        parse_model_output(payload)