File size: 15,045 Bytes
33904d5
 
 
e5c5d28
33904d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c397c36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33904d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
"""
Unit tests for the RAGAS evaluation framework checkers and validators.

Tests pure functions that do NOT require API calls (gpt-4o-mini / Cerebras).
"""

import json
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import pytest

from scripts.evaluate_ragas import (
    _detect_language,
    _resolve_plant_id,
    NumericalRigorChecker,
    TemporalAdherenceChecker,
    ConstraintSatisfactionChecker,
    TerminologyNuanceChecker,
    validate_ground_truths,
)

# =============================================================================
# _detect_language
# =============================================================================

class TestDetectLanguage:
    """Pins the language code `_detect_language` returns for sample questions."""

    def test_id_question_suhu(self):
        """An Indonesian question mentioning "suhu" is reported as "id"."""
        question = "Berapa suhu ideal untuk selada?"
        detected = _detect_language(question)
        assert detected == "id"

    def test_id_question_berapa(self):
        """An Indonesian question opening with "Berapa" is reported as "id"."""
        question = "Berapa kelembaban untuk cabai?"
        detected = _detect_language(question)
        assert detected == "id"

    def test_id_question_fase(self):
        """An Indonesian question mentioning "fase" is reported as "id"."""
        question = "Parameter pada fase vegetatif?"
        detected = _detect_language(question)
        assert detected == "id"

    def test_en_question_what(self):
        """A plain English "What ..." question is reported as "en"."""
        question = "What is the optimal temperature?"
        detected = _detect_language(question)
        assert detected == "en"

    def test_en_question_how(self):
        """A plain English "How ..." question is reported as "en"."""
        question = "How much light does tomato need?"
        detected = _detect_language(question)
        assert detected == "en"

    def test_en_question_mixed(self):
        """An English sentence containing Indonesian words is still "en"."""
        question = "What is suhu ideal for lettuce?"
        detected = _detect_language(question)
        assert detected == "en"

    def test_en_empty(self):
        """The empty string is reported as "en"."""
        detected = _detect_language("")
        assert detected == "en"

    def test_en_numbers_only(self):
        """Digits-only input is reported as "en"."""
        question = "123 456 789"
        detected = _detect_language(question)
        assert detected == "en"


# =============================================================================
# _resolve_plant_id
# =============================================================================

class TestResolvePlantId:
    """Pins the identifier `_resolve_plant_id` returns for ids and aliases."""

    def test_direct_id(self):
        """"lettuce" comes back unchanged."""
        resolved = _resolve_plant_id("lettuce")
        assert resolved == "lettuce"

    def test_alias_bok_choy(self):
        """"bok_choy" maps to "pak_choy"."""
        resolved = _resolve_plant_id("bok_choy")
        assert resolved == "pak_choy"

    def test_alias_amaranth(self):
        """"amaranth" maps to "spinach_amaranth"."""
        resolved = _resolve_plant_id("amaranth")
        assert resolved == "spinach_amaranth"

    def test_alias_spinach(self):
        """"spinach" maps to "spinach_amaranth"."""
        resolved = _resolve_plant_id("spinach")
        assert resolved == "spinach_amaranth"

    def test_unknown_id_passthrough(self):
        """"green_beans" is returned as given."""
        resolved = _resolve_plant_id("green_beans")
        assert resolved == "green_beans"


# =============================================================================
# NumericalRigorChecker
# =============================================================================

class TestNumericalRigorChecker:
    """Exercises `NumericalRigorChecker.evaluate_answer` against a fixed ground truth."""

    @pytest.fixture
    def gt(self):
        """Ground truth with value/min/max entries for temperature, humidity and light."""
        temperature = {"value": 20.0, "min": 16.0, "max": 24.0}
        humidity = {"value": 73.0, "min": 60.0, "max": 85.0}
        light = {"value": 18000, "min": 14000, "max": 22000}
        return {"temperature": temperature, "humidity": humidity, "light": light}

    def test_requested_param_subset_skips_unasked_fields(self, gt):
        """A humidity-only request must not fail over the absent temperature/light."""
        # The request is handed over as a set; the report is expected to echo
        # it back as a list so results stay deterministic.
        report = NumericalRigorChecker.evaluate_answer(
            "Kelembaban ideal 73% RH.",
            gt,
            requested_params={"humidity"},
        )
        assert report["overall_pass"] is True
        assert report["requested_params"] == ["humidity"]
        per_param = report["param_results"]
        assert per_param["humidity"]["status"] == "PASS"
        assert "temperature" not in per_param
        assert "light" not in per_param

    def test_unicode_thin_space_lux_parses(self, gt):
        """A lux value with a U+202F thousands separator must parse as 19500, not 500."""
        # "\u202f" is NARROW NO-BREAK SPACE, sitting between "19" and "500".
        thin_space_answer = "Cahaya optimal 19\u202f500 lux."
        report = NumericalRigorChecker.evaluate_answer(
            thin_space_answer,
            gt,
            requested_params={"light"},
        )
        assert report["overall_pass"] is True
        light_status = report["param_results"]["light"]["status"]
        assert light_status == "PASS"

    def test_pass_exact_optimal(self, gt):
        """An answer quoting the exact ground-truth values passes."""
        report = NumericalRigorChecker.evaluate_answer(
            "Suhu optimal 20°C dengan kelembaban 73% dan cahaya 18000 lux.", gt
        )
        assert report["overall_pass"] is True

    def test_pass_within_tolerance(self, gt):
        """A temperature close to the optimal value (20.4 vs 20.0) still passes."""
        report = NumericalRigorChecker.evaluate_answer(
            "Suhu optimal 20.4°C, kelembaban 73% RH, cahaya 18000 lux.", gt
        )
        assert report["overall_pass"] is True

    def test_pass_within_range(self, gt):
        """Values inside each [min, max] range pass for all three parameters."""
        report = NumericalRigorChecker.evaluate_answer(
            "Suhu 18°C, kelembaban 70% RH, cahaya 15000 lux.", gt
        )
        assert report["overall_pass"] is True

    def test_fail_outside_range(self, gt):
        """A temperature of 30°C, above the ground-truth max of 24.0, fails."""
        report = NumericalRigorChecker.evaluate_answer("Suhu optimal 30°C.", gt)
        assert report["overall_pass"] is False

    def test_pass_range_answer(self, gt):
        """Answers given as ranges that contain the optimal values pass."""
        report = NumericalRigorChecker.evaluate_answer(
            "Suhu 18-24°C, kelembaban 70-75% RH, cahaya 15000-20000 lux.", gt
        )
        assert report["overall_pass"] is True

    def test_pass_all_params(self, gt):
        """All three parameters stated correctly yields an overall pass."""
        report = NumericalRigorChecker.evaluate_answer(
            "Suhu 20°C, kelembaban 73% RH, cahaya 18000 lux.", gt
        )
        assert report["overall_pass"] is True

    def test_fail_one_param(self, gt):
        """A single wrong parameter (light at 50000 lux) fails the whole answer."""
        report = NumericalRigorChecker.evaluate_answer(
            "Suhu 20°C, kelembaban 73%, cahaya 50000 lux.", gt
        )
        assert report["overall_pass"] is False
        light_status = report["param_results"]["light"]["status"]
        assert light_status == "FAIL"

    def test_no_ground_truth_skips(self):
        """With an empty ground-truth dict the answer passes."""
        empty_truth = {}
        report = NumericalRigorChecker.evaluate_answer("any answer", empty_truth)
        assert report["overall_pass"] is True

    def test_partial_ground_truth(self):
        """Ground truth holding only humidity still lets a humidity answer pass."""
        humidity_only = {"humidity": {"value": 73.0, "min": 60.0, "max": 85.0}}
        report = NumericalRigorChecker.evaluate_answer("kelembaban 73%", humidity_only)
        assert report["overall_pass"] is True


# =============================================================================
# TemporalAdherenceChecker
# =============================================================================

class TestTemporalAdherenceChecker:
    """Exercises `TemporalAdherenceChecker.check` for day/night phase handling."""

    def test_general_phase_not_applicable(self):
        """Phase "general" passes and is flagged as not applicable."""
        outcome = TemporalAdherenceChecker.check("Some text", "general")
        assert outcome["pass"] is True
        assert outcome["applicable"] is False

    def test_none_phase_not_applicable(self):
        """Phase None passes and is flagged as not applicable."""
        outcome = TemporalAdherenceChecker.check("Some text", None)
        assert outcome["pass"] is True
        assert outcome["applicable"] is False

    def test_day_phase_correct_id(self):
        """An Indonesian answer naming the day cycle passes for phase "day"."""
        answer = "Pada siklus siang, suhu 24°C"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is True

    def test_day_phase_correct_en(self):
        """An English answer naming the day cycle passes for phase "day"."""
        answer = "During day cycle, temp is 24°C"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is True

    def test_night_phase_correct_id(self):
        """An Indonesian answer naming the night cycle passes for phase "night"."""
        answer = "Pada siklus malam, suhu 18°C"
        outcome = TemporalAdherenceChecker.check(answer, "night")
        assert outcome["pass"] is True

    def test_night_phase_correct_en(self):
        """An English answer naming the night cycle passes for phase "night"."""
        answer = "During night cycle, temp is 18°C"
        outcome = TemporalAdherenceChecker.check(answer, "night")
        assert outcome["pass"] is True

    def test_day_phase_wrong(self):
        """An answer that names the night cycle fails when "day" is expected."""
        answer = "Pada siklus malam"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is False

    def test_night_phase_wrong(self):
        """An answer that names the day cycle fails when "night" is expected."""
        answer = "Pada siklus siang"
        outcome = TemporalAdherenceChecker.check(answer, "night")
        assert outcome["pass"] is False

    def test_no_live_data_skip(self):
        """An "not currently online" answer passes with status NO_LIVE_DATA."""
        answer = "Chamber is not currently online"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is True
        assert outcome.get("status") == "NO_LIVE_DATA"


# =============================================================================
# ConstraintSatisfactionChecker
# =============================================================================

class TestConstraintSatisfactionChecker:
    """Exercises `ConstraintSatisfactionChecker.check` across its reported modes."""

    def test_state_a_qualitative_always_pass(self):
        """With the third argument False the check passes as "qualitative_quoted"."""
        answer = "pH ideal untuk tanaman adalah 6.5"
        query = "any query"
        verdict = ConstraintSatisfactionChecker.check(answer, query, False)
        assert verdict["pass"] is True
        assert verdict["mode"] == "qualitative_quoted"

    def test_state_b_guarded_clean(self):
        """A temperature-only answer to a temperature query passes as "guarded"."""
        answer = "Suhu ideal 20°C"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "guarded"

    def test_state_b_guarded_with_forbidden(self):
        """An unrequested pH value in the answer fails as "unprompted_hallucination"."""
        answer = "Suhu ideal 20°C dan pH 6.5"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "unprompted_hallucination"

    def test_state_b_breadcrumb_ignored(self):
        """The "Parameter terverifikasi PGC ..." breadcrumb sentence does not fail the check."""
        answer = "Suhu 20°C. Parameter terverifikasi PGC hanya mencakup suhu, kelembaban, dan cahaya"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True

    def test_state_c_explicit_request_with_warning(self):
        """A pH query answered with the ⚠️ warning passes as "explicit_request_warned"."""
        answer = "pH ideal 6.5. ⚠️ Parameter pH di luar kendali otomatis PGC."
        query = "berapa pH untuk tomat"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "explicit_request_warned"

    def test_state_c_explicit_request_no_warning(self):
        """A pH query answered without the warning fails as "explicit_request_unwarned"."""
        answer = "pH ideal 6.5."
        query = "berapa pH"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "explicit_request_unwarned"


# =============================================================================
# TerminologyNuanceChecker
# =============================================================================

class TestTerminologyNuanceChecker:
    """Exercises the per-term `check_*` methods of `TerminologyNuanceChecker`."""

    def test_kecambah_tunas_correct(self):
        """An answer naming "mung bean sprouts" passes the kecambah/tunas check."""
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(
            "mung bean sprouts memerlukan kelembaban tinggi pada fase vegetatif"
        )
        assert verdict["pass"] is True

    def test_kecambah_tunas_with_toge(self):
        """An answer using "toge" passes the kecambah/tunas check."""
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(
            "toge memerlukan kelembaban tinggi untuk pertumbuhan vegetatif"
        )
        assert verdict["pass"] is True

    def test_kecambah_tunas_missing_mung(self):
        """An answer with neither "mung bean" nor "toge" fails the check."""
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(
            "tanaman memerlukan cahaya pada fase vegetatif"
        )
        assert verdict["pass"] is False

    def test_layu_fusarium_correct(self):
        """An answer attributing wilt to Fusarium passes the layu check."""
        verdict = TerminologyNuanceChecker.check_layu_fusarium(
            "Fusarium menyebabkan layu pada pembuluh batang"
        )
        assert verdict["pass"] is True

    def test_layu_fusarium_missing(self):
        """An answer about wilt that never mentions Fusarium fails the layu check."""
        verdict = TerminologyNuanceChecker.check_layu_fusarium(
            "tanaman layu karena kekeringan"
        )
        assert verdict["pass"] is False

    def test_busuk_akar_pythium_correct(self):
        """An answer naming Pythium passes the busuk-akar check."""
        verdict = TerminologyNuanceChecker.check_busuk_akar_pythium(
            "Pythium adalah patogen yang disebabkan oleh jamur air"
        )
        assert verdict["pass"] is True

    def test_kacang_hijau_correct(self):
        """Translating "kacang hijau" as "mung bean" passes."""
        verdict = TerminologyNuanceChecker.check_kacang_hijau(
            "kacang hijau adalah mung bean"
        )
        assert verdict["pass"] is True

    def test_kacang_hijau_wrong(self):
        """Translating "kacang hijau" as "green bean" fails."""
        verdict = TerminologyNuanceChecker.check_kacang_hijau(
            "kacang hijau adalah green bean"
        )
        assert verdict["pass"] is False

    def test_baginda_f1_correct(self):
        """An answer describing Baginda F1 as a watermelon variety passes."""
        verdict = TerminologyNuanceChecker.check_baginda_f1(
            "Baginda F1 adalah varietas semangka dengan suhu optimal 24°C"
        )
        assert verdict["pass"] is True


# =============================================================================
# validate_ground_truths
# =============================================================================

class TestValidateGroundTruths:
    """Exercises `validate_ground_truths` on small hand-built case lists."""

    def test_plant_not_in_db_skips(self):
        """A case lacking `expected_plant` is handled without error or warning."""
        bare_case = {"case_id": "test", "ground_truth": "some text"}
        emitted = validate_ground_truths([bare_case])
        assert len(emitted) == 0

    def test_negative_test_skipped(self):
        """A case flagged `is_negative_test` yields no warning."""
        negative_case = {
            "case_id": "qty_green_beans_temp",
            "expected_plant": "green_beans",
            "expected_stage": "vegetative",
            "ground_truth": "some AI estimate",
            "is_negative_test": True,
        }
        emitted = validate_ground_truths([negative_case])
        assert len(emitted) == 0

    def test_alias_resolved(self):
        """A case keyed by the alias `bok_choy` yields no warning for this ground truth."""
        alias_case = {
            "case_id": "qty_bok_choy_humidity",
            "expected_plant": "bok_choy",
            "expected_stage": "vegetative",
            "ground_truth": "kelembaban 60-85%, suhu 27, cahaya 20000",
        }
        emitted = validate_ground_truths([alias_case])
        assert len(emitted) == 0

    def test_stale_ground_truth_warns(self):
        """Ground-truth text without the expected value triggers one "may be stale" warning."""
        stale_case = {
            "case_id": "test_stale",
            "expected_plant": "lettuce",
            "expected_stage": "vegetative",
            "ground_truth": "some text that does not contain the optimal value",
        }
        emitted = validate_ground_truths([stale_case])
        assert len(emitted) == 1
        message = emitted[0]
        # The warning must identify the offending case and say why it was raised.
        assert "test_stale" in message
        assert "may be stale" in message