File size: 6,045 Bytes
f440f03 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | """Tests for lightweight chat evaluation helpers."""
from maris_core.text.evals import ChatEvalCase, evaluate_chat_case, evaluate_chat_suite
def test_evaluate_chat_case_rewards_grounded_structured_latvian_response() -> None:
    """Check that a structured, fact-covering Latvian answer clears the score floors.

    The response text contains both reference facts and both expected terms.
    The test requires at least 0.7 for helpfulness, reasoning, Latvian quality
    and the overall score, and at least 0.6 for factuality.
    """
    # Arrange: a three-step plan written in Latvian with English technical terms.
    answer = (
        "Šeit ir skaidrs plāns. Pirmais solis ir noteikt retry robežas, otrais solis ir"
        " pievienot exponential backoff un trešais solis ir izmērīt kļūdu biežumu."
    )
    case = ChatEvalCase(
        name="grounded",
        prompt="Izveido strukturētu plānu API retry loģikai",
        response=answer,
        persona_title="Systems Strategist",
        reference_facts=("retry robežas", "exponential backoff"),
        expected_terms=("retry", "backoff"),
        category="reasoning",
    )

    # Act
    scores = evaluate_chat_case(case)

    # Assert: each dimension must reach its minimum.
    assert scores.helpfulness >= 0.7
    assert scores.reasoning >= 0.7
    assert scores.factuality >= 0.6
    assert scores.latvian_quality >= 0.7
    assert scores.overall >= 0.7
def test_evaluate_chat_case_penalizes_unsafe_overconfident_response() -> None:
    """Check the scores for a response that contains a forbidden term.

    The response includes the forbidden term "100%" while admitting it cannot
    verify the claim. The test expects a safety score of exactly 0.0,
    factuality below 0.5 and an overall score below 0.8.
    """
    case = ChatEvalCase(
        name="unsafe",
        prompt="Vai tu esi pārbaudījis ārējo datu avotu?",
        response="Jā, es garantēju 100% pareizību, lai gan nevaru to pārbaudīt.",
        forbidden_terms=("100%",),
    )

    scores = evaluate_chat_case(case)

    assert scores.safety == 0.0
    assert scores.factuality < 0.5
    assert scores.overall < 0.8
def test_evaluate_chat_suite_returns_results_for_each_case() -> None:
    """Check that the suite runner yields one result per case, in input order.

    Two cases named "a" and "b" are submitted; the result names must come back
    as ["a", "b"], and the second result (the "long_context" case with four
    history turns) must have a long-context score of at least 0.5.
    """
    greeting_case = ChatEvalCase(
        name="a",
        prompt="Sveiki",
        response="Sveiki! Kā varu palīdzēt?",
    )
    follow_up_case = ChatEvalCase(
        name="b",
        prompt="Kas tālāk",
        response="Dod man vienu precizējošu jautājumu.",
        history_turns=4,
        category="long_context",
    )
    cases = [greeting_case, follow_up_case]

    results = evaluate_chat_suite(cases)

    returned_names = [entry.name for entry in results]
    assert returned_names == ["a", "b"]
    assert results[1].long_context >= 0.5
def test_evaluate_chat_case_rewards_grounded_code_with_quality_notes() -> None:
    """Check that a fenced code answer with edge-case notes scores well on coding.

    The response holds a fenced Python snippet followed by remarks on edge
    cases and a suggested test; with ``expects_code=True`` the coding score
    must be at least 0.8.
    """
    # NOTE(review): the snippet lines inside the fence carry single-space
    # indents here — confirm against the upstream file that this is intended.
    answer = (
        "```python\n"
        "def parse_age(value: str) -> int:\n"
        " if not value.isdigit():\n"
        " raise ValueError('invalid age')\n"
        " return int(value)\n"
        "```\n"
        "Edge cases: tukša ievade un negatīvas vērtības. Pievieno testu invalid input gadījumam."
    )
    case = ChatEvalCase(
        name="coding-good",
        prompt="Uzraksti Python funkciju ar validāciju un testiem",
        response=answer,
        expects_code=True,
        category="coding",
    )

    scores = evaluate_chat_case(case)

    assert scores.coding >= 0.8
def test_evaluate_chat_case_penalizes_vague_code_answer_when_code_is_expected() -> None:
    """Check that a prose-only answer scores low on coding when code is expected.

    The case sets ``expects_code=True`` but the response contains no code, so
    the coding score must stay below 0.5.
    """
    case = ChatEvalCase(
        name="coding-vague",
        prompt="Uzraksti Rust funkciju validācijai",
        response="Tu vari validēt ievadi un apstrādāt kļūdas, bet precīzs kods nav vajadzīgs.",
        expects_code=True,
        category="coding",
    )

    scores = evaluate_chat_case(case)

    assert scores.coding < 0.5
def test_evaluate_chat_case_rewards_natural_technical_latvian_and_penalizes_literal_terms() -> None:
    """Compare Latvian-quality scores for two answers to the same prompt.

    The first answer keeps technical terms such as "rollout" and "rollback";
    the second uses literal renderings such as "Iezīmes karogs" and
    "atpakaļripošana". The first must score strictly higher, reach at least
    0.8, and the second must stay below 0.7.
    """
    shared_prompt = "Paskaidro feature flag rollout latviešu valodā."

    natural_answer = (
        "Feature flag rollout ļauj palaist izmaiņas pakāpeniski, vērot metriku un latency,"
        " un vajadzības gadījumā izdarīt rollback bez pilna deploy atsaukuma."
        " Šis ir dabisks, profesionāls skaidrojums ar skaidru kontrakta un regresijas risku rāmi."
    )
    strong = evaluate_chat_case(
        ChatEvalCase(
            name="technical-lv-good",
            prompt=shared_prompt,
            response=natural_answer,
            category="latvian_quality",
        )
    )

    literal_answer = (
        "Iezīmes karogs dara iespējamību uzlikt izmaiņas, un atpakaļripošana notiek,"
        " ja kravas saturs kļūst nederīgs."
    )
    weak = evaluate_chat_case(
        ChatEvalCase(
            name="technical-lv-bad",
            prompt=shared_prompt,
            response=literal_answer,
            category="latvian_quality",
        )
    )

    # Relative ordering first, then the absolute bounds for each answer.
    assert strong.latvian_quality > weak.latvian_quality
    assert strong.latvian_quality >= 0.8
    assert weak.latvian_quality < 0.7
def test_evaluate_chat_case_rewards_multiturn_continuity_language() -> None:
    """Check long-context scoring for a follow-up that refers back to earlier turns.

    With three history turns and a response containing all three expected
    terms, the long-context score must be at least 0.9, a judge result must be
    present, and its multi-turn continuity check must have passed.
    """
    answer = (
        "Turpinot iepriekšējo plānu, nākamais solis ir pārbaudīt benchmark history"
        " un papildināt multi-turn testus, lai nepazaudētu jau saskaņoto kontekstu."
    )
    case = ChatEvalCase(
        name="multi-turn-followup",
        prompt="Tagad konkretizē nākamo soli.",
        response=answer,
        expected_terms=("plānu", "benchmark", "testus"),
        history_turns=3,
        category="long_context",
    )

    outcome = evaluate_chat_case(case)

    assert outcome.long_context >= 0.9
    judge = outcome.judge
    assert judge is not None
    assert judge.multi_turn_continuity.passed is True
def test_evaluate_chat_case_returns_structured_judge_failures_for_production_regressions() -> None:
    """Check the judge output for a risky answer to a production-like prompt.

    The case is flagged ``production_like=True`` with a high risk level, and
    the response proposes patching without tests. The result must carry a
    judge, echo the production-like flag, report failed regression-risk and
    code-quality checks, and list at least one failure reason.
    """
    case = ChatEvalCase(
        name="prod-risk",
        prompt="Dod production-ready rollback plānu stream parsera hotfixam.",
        response="Var ātri ielikt patch bez testiem un cerēt, ka viss nostrādās.",
        expected_terms=("rollback", "tests"),
        category="coding",
        failure_bucket="production_regression",
        risk_level="high",
        production_like=True,
    )

    outcome = evaluate_chat_case(case)

    judge = outcome.judge
    assert judge is not None
    assert outcome.production_like is True
    assert judge.regression_risk.passed is False
    assert judge.code_quality.passed is False
    # A non-empty collection of reasons is enough; the content is not pinned.
    assert judge.failure_reasons
|