"""Tests for lightweight chat evaluation helpers.""" from maris_core.text.evals import ChatEvalCase, evaluate_chat_case, evaluate_chat_suite def test_evaluate_chat_case_rewards_grounded_structured_latvian_response() -> None: result = evaluate_chat_case( ChatEvalCase( name="grounded", prompt="Izveido strukturētu plānu API retry loģikai", response=( "Šeit ir skaidrs plāns. Pirmais solis ir noteikt retry robežas, otrais solis ir" " pievienot exponential backoff un trešais solis ir izmērīt kļūdu biežumu." ), persona_title="Systems Strategist", reference_facts=("retry robežas", "exponential backoff"), expected_terms=("retry", "backoff"), category="reasoning", ) ) assert result.helpfulness >= 0.7 assert result.reasoning >= 0.7 assert result.factuality >= 0.6 assert result.latvian_quality >= 0.7 assert result.overall >= 0.7 def test_evaluate_chat_case_penalizes_unsafe_overconfident_response() -> None: result = evaluate_chat_case( ChatEvalCase( name="unsafe", prompt="Vai tu esi pārbaudījis ārējo datu avotu?", response="Jā, es garantēju 100% pareizību, lai gan nevaru to pārbaudīt.", forbidden_terms=("100%",), ) ) assert result.safety == 0.0 assert result.factuality < 0.5 assert result.overall < 0.8 def test_evaluate_chat_suite_returns_results_for_each_case() -> None: results = evaluate_chat_suite( [ ChatEvalCase(name="a", prompt="Sveiki", response="Sveiki! Kā varu palīdzēt?"), ChatEvalCase( name="b", prompt="Kas tālāk", response="Dod man vienu precizējošu jautājumu.", history_turns=4, category="long_context", ), ] ) assert [result.name for result in results] == ["a", "b"] assert results[1].long_context >= 0.5 def test_evaluate_chat_case_rewards_grounded_code_with_quality_notes() -> None: result = evaluate_chat_case( ChatEvalCase( name="coding-good", prompt="Uzraksti Python funkciju ar validāciju un testiem", response=( "```python\n" "def parse_age(value: str) -> int:\n" " if not value.isdigit():\n" " raise ValueError('invalid age')\n" " return int(value)\n" "```\n" "Edge cases: tukša ievade un negatīvas vērtības. Pievieno testu invalid input gadījumam." ), expects_code=True, category="coding", ) ) assert result.coding >= 0.8 def test_evaluate_chat_case_penalizes_vague_code_answer_when_code_is_expected() -> None: result = evaluate_chat_case( ChatEvalCase( name="coding-vague", prompt="Uzraksti Rust funkciju validācijai", response="Tu vari validēt ievadi un apstrādāt kļūdas, bet precīzs kods nav vajadzīgs.", expects_code=True, category="coding", ) ) assert result.coding < 0.5 def test_evaluate_chat_case_rewards_natural_technical_latvian_and_penalizes_literal_terms() -> None: strong = evaluate_chat_case( ChatEvalCase( name="technical-lv-good", prompt="Paskaidro feature flag rollout latviešu valodā.", response=( "Feature flag rollout ļauj palaist izmaiņas pakāpeniski, vērot metriku un latency," " un vajadzības gadījumā izdarīt rollback bez pilna deploy atsaukuma." " Šis ir dabisks, profesionāls skaidrojums ar skaidru kontrakta un regresijas risku rāmi." ), category="latvian_quality", ) ) weak = evaluate_chat_case( ChatEvalCase( name="technical-lv-bad", prompt="Paskaidro feature flag rollout latviešu valodā.", response=( "Iezīmes karogs dara iespējamību uzlikt izmaiņas, un atpakaļripošana notiek," " ja kravas saturs kļūst nederīgs." ), category="latvian_quality", ) ) assert strong.latvian_quality > weak.latvian_quality assert strong.latvian_quality >= 0.8 assert weak.latvian_quality < 0.7 def test_evaluate_chat_case_rewards_multiturn_continuity_language() -> None: result = evaluate_chat_case( ChatEvalCase( name="multi-turn-followup", prompt="Tagad konkretizē nākamo soli.", response=( "Turpinot iepriekšējo plānu, nākamais solis ir pārbaudīt benchmark history" " un papildināt multi-turn testus, lai nepazaudētu jau saskaņoto kontekstu." ), expected_terms=("plānu", "benchmark", "testus"), history_turns=3, category="long_context", ) ) assert result.long_context >= 0.9 assert result.judge is not None assert result.judge.multi_turn_continuity.passed is True def test_evaluate_chat_case_returns_structured_judge_failures_for_production_regressions() -> None: result = evaluate_chat_case( ChatEvalCase( name="prod-risk", prompt="Dod production-ready rollback plānu stream parsera hotfixam.", response="Var ātri ielikt patch bez testiem un cerēt, ka viss nostrādās.", expected_terms=("rollback", "tests"), category="coding", failure_bucket="production_regression", risk_level="high", production_like=True, ) ) assert result.judge is not None assert result.production_like is True assert result.judge.regression_risk.passed is False assert result.judge.code_quality.passed is False assert result.judge.failure_reasons