Spaces:
Running
Running
| """Unit tests for src.evaluation.""" | |
| from __future__ import annotations | |
| import json | |
| import pytest | |
| from src.evaluation import ( | |
| EvaluationParseError, | |
| ParsedTurn, | |
| format_evaluation, | |
| format_score, | |
| parse_model_output, | |
| ) | |
| VALID_EVAL = { | |
| "competent": 5, | |
| "likeable": 4, | |
| "considerate": 6, | |
| "polite": 7, | |
| "formal": 3, | |
| "demanding": 2, | |
| } | |
| def _valid_payload(**overrides) -> str: | |
| """Build a JSON string representing a well-formed model output.""" | |
| payload = {"response": "Hi there!", "evaluation": dict(VALID_EVAL)} | |
| payload.update(overrides) | |
| return json.dumps(payload) | |
| # ---- format_score ---- | |
| def test_format_score_basic(): | |
| assert format_score("polite", 2) == "barely polite (2/7)" | |
| assert format_score("competent", 5) == "quite competent (5/7)" | |
| assert format_score("demanding", 7) == "extremely demanding (7/7)" | |
| def test_format_score_all_levels_render(): | |
| for score in range(1, 8): | |
| result = format_score("attr", score) | |
| assert f"({score}/7)" in result | |
| def test_format_score_rejects_out_of_range(bad_score: int): | |
| with pytest.raises(ValueError): | |
| format_score("polite", bad_score) | |
| # ---- format_evaluation ---- | |
| def test_format_evaluation_returns_ordered_list(): | |
| result = format_evaluation(VALID_EVAL) | |
| assert len(result) == 6 | |
| assert result[0].endswith("(5/7)") # competent first by default | |
| def test_format_evaluation_custom_order(): | |
| result = format_evaluation(VALID_EVAL, attributes=["polite", "formal"]) | |
| assert len(result) == 2 | |
| assert "polite" in result[0] | |
| assert "formal" in result[1] | |
| # ---- parse_model_output: happy paths ---- | |
| def test_parse_model_output_returns_parsed_turn(): | |
| turn = parse_model_output(_valid_payload()) | |
| assert isinstance(turn, ParsedTurn) | |
| assert turn.response == "Hi there!" | |
| assert turn.evaluation == VALID_EVAL | |
| def test_parse_model_output_strips_json_fences(): | |
| raw = f"```json\n{_valid_payload()}\n```" | |
| turn = parse_model_output(raw) | |
| assert turn.evaluation == VALID_EVAL | |
| def test_parse_model_output_strips_plain_fences(): | |
| raw = f"```\n{_valid_payload()}\n```" | |
| turn = parse_model_output(raw) | |
| assert turn.evaluation == VALID_EVAL | |
| def test_parse_model_output_ignores_extra_attributes(): | |
| extra = dict(VALID_EVAL) | |
| extra["unexpected"] = 5 | |
| raw = json.dumps({"response": "hi", "evaluation": extra}) | |
| turn = parse_model_output(raw) | |
| assert "unexpected" not in turn.evaluation | |
| # ---- parse_model_output: error paths ---- | |
| def test_parse_model_output_rejects_invalid_json(): | |
| with pytest.raises(EvaluationParseError, match="parse JSON"): | |
| parse_model_output("not even close to json") | |
| def test_parse_model_output_rejects_missing_response(): | |
| raw = json.dumps({"evaluation": VALID_EVAL}) | |
| with pytest.raises(EvaluationParseError, match="response"): | |
| parse_model_output(raw) | |
| def test_parse_model_output_rejects_missing_evaluation(): | |
| raw = json.dumps({"response": "hi"}) | |
| with pytest.raises(EvaluationParseError, match="evaluation"): | |
| parse_model_output(raw) | |
| def test_parse_model_output_rejects_missing_attribute(): | |
| incomplete = dict(VALID_EVAL) | |
| del incomplete["polite"] | |
| raw = json.dumps({"response": "hi", "evaluation": incomplete}) | |
| with pytest.raises(EvaluationParseError, match="polite"): | |
| parse_model_output(raw) | |
| def test_parse_model_output_rejects_out_of_range_score(): | |
| bad = dict(VALID_EVAL) | |
| bad["polite"] = 9 | |
| raw = json.dumps({"response": "hi", "evaluation": bad}) | |
| with pytest.raises(EvaluationParseError, match="out of range"): | |
| parse_model_output(raw) | |
| def test_parse_model_output_rejects_bool_score(): | |
| bad = dict(VALID_EVAL) | |
| bad["polite"] = True # noqa: not an int despite isinstance(True, int) | |
| raw = json.dumps({"response": "hi", "evaluation": bad}) | |
| with pytest.raises(EvaluationParseError, match="not an integer"): | |
| parse_model_output(raw) | |
| def test_parse_model_output_rejects_float_score(): | |
| bad = dict(VALID_EVAL) | |
| bad["polite"] = 5.5 | |
| raw = json.dumps({"response": "hi", "evaluation": bad}) | |
| with pytest.raises(EvaluationParseError, match="not an integer"): | |
| parse_model_output(raw) |