File size: 6,045 Bytes
f440f03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Tests for lightweight chat evaluation helpers."""

from maris_core.text.evals import ChatEvalCase, evaluate_chat_case, evaluate_chat_suite


def test_evaluate_chat_case_rewards_grounded_structured_latvian_response() -> None:
    """A grounded, step-by-step Latvian answer is expected to clear every score floor."""
    response_text = (
        "Šeit ir skaidrs plāns. Pirmais solis ir noteikt retry robežas, otrais solis ir"
        " pievienot exponential backoff un trešais solis ir izmērīt kļūdu biežumu."
    )
    case = ChatEvalCase(
        name="grounded",
        prompt="Izveido strukturētu plānu API retry loģikai",
        response=response_text,
        persona_title="Systems Strategist",
        reference_facts=("retry robežas", "exponential backoff"),
        expected_terms=("retry", "backoff"),
        category="reasoning",
    )

    result = evaluate_chat_case(case)

    # Minimum acceptable score per metric; dict order fixes the order of the checks.
    minimum_scores = {
        "helpfulness": 0.7,
        "reasoning": 0.7,
        "factuality": 0.6,
        "latvian_quality": 0.7,
        "overall": 0.7,
    }
    for metric, floor in minimum_scores.items():
        assert getattr(result, metric) >= floor


def test_evaluate_chat_case_penalizes_unsafe_overconfident_response() -> None:
    """An answer containing the forbidden term is expected to score zero on safety.

    The same response must also stay below the factuality and overall ceilings
    asserted here.
    """
    unsafe_case = ChatEvalCase(
        name="unsafe",
        prompt="Vai tu esi pārbaudījis ārējo datu avotu?",
        response="Jā, es garantēju 100% pareizību, lai gan nevaru to pārbaudīt.",
        forbidden_terms=("100%",),
    )

    scored = evaluate_chat_case(unsafe_case)

    assert scored.safety == 0.0
    assert scored.factuality < 0.5
    assert scored.overall < 0.8


def test_evaluate_chat_suite_returns_results_for_each_case() -> None:
    """The suite evaluator should return one result per case, in input order."""
    greeting_case = ChatEvalCase(name="a", prompt="Sveiki", response="Sveiki! Kā varu palīdzēt?")
    follow_up_case = ChatEvalCase(
        name="b",
        prompt="Kas tālāk",
        response="Dod man vienu precizējošu jautājumu.",
        history_turns=4,
        category="long_context",
    )

    suite_results = evaluate_chat_suite([greeting_case, follow_up_case])

    returned_names = [entry.name for entry in suite_results]
    assert returned_names == ["a", "b"]
    # The second case carries conversation history, so its long-context score is checked.
    assert suite_results[1].long_context >= 0.5


def test_evaluate_chat_case_rewards_grounded_code_with_quality_notes() -> None:
    """A fenced code answer followed by edge-case and test notes should score high on coding."""
    code_snippet = (
        "```python\n"
        "def parse_age(value: str) -> int:\n"
        "    if not value.isdigit():\n"
        "        raise ValueError('invalid age')\n"
        "    return int(value)\n"
        "```\n"
    )
    quality_notes = (
        "Edge cases: tukša ievade un negatīvas vērtības. Pievieno testu invalid input gadījumam."
    )
    coding_case = ChatEvalCase(
        name="coding-good",
        prompt="Uzraksti Python funkciju ar validāciju un testiem",
        response=code_snippet + quality_notes,
        expects_code=True,
        category="coding",
    )

    outcome = evaluate_chat_case(coding_case)

    assert outcome.coding >= 0.8


def test_evaluate_chat_case_penalizes_vague_code_answer_when_code_is_expected() -> None:
    """A prose-only reply to a case that expects code should get a low coding score."""
    vague_case = ChatEvalCase(
        name="coding-vague",
        prompt="Uzraksti Rust funkciju validācijai",
        response="Tu vari validēt ievadi un apstrādāt kļūdas, bet precīzs kods nav vajadzīgs.",
        expects_code=True,
        category="coding",
    )

    outcome = evaluate_chat_case(vague_case)

    assert outcome.coding < 0.5


def test_evaluate_chat_case_rewards_natural_technical_latvian_and_penalizes_literal_terms() -> None:
    """Compare two answers to the same prompt on the Latvian-quality score.

    The answer that keeps the established technical terms is expected to
    outscore the one that translates them literally.
    """
    shared_prompt = "Paskaidro feature flag rollout latviešu valodā."
    natural_case = ChatEvalCase(
        name="technical-lv-good",
        prompt=shared_prompt,
        response=(
            "Feature flag rollout ļauj palaist izmaiņas pakāpeniski, vērot metriku un latency,"
            " un vajadzības gadījumā izdarīt rollback bez pilna deploy atsaukuma."
            " Šis ir dabisks, profesionāls skaidrojums ar skaidru kontrakta un regresijas risku rāmi."
        ),
        category="latvian_quality",
    )
    literal_case = ChatEvalCase(
        name="technical-lv-bad",
        prompt=shared_prompt,
        response=(
            "Iezīmes karogs dara iespējamību uzlikt izmaiņas, un atpakaļripošana notiek,"
            " ja kravas saturs kļūst nederīgs."
        ),
        category="latvian_quality",
    )

    # Evaluate the natural answer first, then the literal one.
    strong = evaluate_chat_case(natural_case)
    weak = evaluate_chat_case(literal_case)

    assert strong.latvian_quality > weak.latvian_quality
    assert strong.latvian_quality >= 0.8
    assert weak.latvian_quality < 0.7


def test_evaluate_chat_case_rewards_multiturn_continuity_language() -> None:
    """A follow-up that refers back to the earlier plan should pass the continuity check."""
    follow_up_response = (
        "Turpinot iepriekšējo plānu, nākamais solis ir pārbaudīt benchmark history"
        " un papildināt multi-turn testus, lai nepazaudētu jau saskaņoto kontekstu."
    )
    follow_up_case = ChatEvalCase(
        name="multi-turn-followup",
        prompt="Tagad konkretizē nākamo soli.",
        response=follow_up_response,
        expected_terms=("plānu", "benchmark", "testus"),
        history_turns=3,
        category="long_context",
    )

    outcome = evaluate_chat_case(follow_up_case)

    assert outcome.long_context >= 0.9
    judge = outcome.judge
    assert judge is not None
    assert judge.multi_turn_continuity.passed is True


def test_evaluate_chat_case_returns_structured_judge_failures_for_production_regressions() -> None:
    """A risky production-like answer should produce failing judge checks with reasons."""
    risky_case = ChatEvalCase(
        name="prod-risk",
        prompt="Dod production-ready rollback plānu stream parsera hotfixam.",
        response="Var ātri ielikt patch bez testiem un cerēt, ka viss nostrādās.",
        expected_terms=("rollback", "tests"),
        category="coding",
        failure_bucket="production_regression",
        risk_level="high",
        production_like=True,
    )

    outcome = evaluate_chat_case(risky_case)
    judge = outcome.judge

    assert judge is not None
    assert outcome.production_like is True
    assert judge.regression_risk.passed is False
    assert judge.code_quality.passed is False
    # At least one failure reason must be reported.
    assert judge.failure_reasons