""" Unit tests for the RAGAS evaluation framework checkers and validators. Tests pure functions that do NOT require API calls (gpt-4o-mini / Cerebras). """ import json import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) import pytest from scripts.evaluate_ragas import ( _detect_language, _resolve_plant_id, NumericalRigorChecker, TemporalAdherenceChecker, ConstraintSatisfactionChecker, TerminologyNuanceChecker, validate_ground_truths, ) # ============================================================================= # _detect_language # ============================================================================= class TestDetectLanguage: def test_id_question_suhu(self): assert _detect_language("Berapa suhu ideal untuk selada?") == "id" def test_id_question_berapa(self): assert _detect_language("Berapa kelembaban untuk cabai?") == "id" def test_id_question_fase(self): assert _detect_language("Parameter pada fase vegetatif?") == "id" def test_en_question_what(self): assert _detect_language("What is the optimal temperature?") == "en" def test_en_question_how(self): assert _detect_language("How much light does tomato need?") == "en" def test_en_question_mixed(self): assert _detect_language("What is suhu ideal for lettuce?") == "en" def test_en_empty(self): assert _detect_language("") == "en" def test_en_numbers_only(self): assert _detect_language("123 456 789") == "en" # ============================================================================= # _resolve_plant_id # ============================================================================= class TestResolvePlantId: def test_direct_id(self): assert _resolve_plant_id("lettuce") == "lettuce" def test_alias_bok_choy(self): assert _resolve_plant_id("bok_choy") == "pak_choy" def test_alias_amaranth(self): assert _resolve_plant_id("amaranth") == "spinach_amaranth" def test_alias_spinach(self): assert _resolve_plant_id("spinach") == "spinach_amaranth" def test_unknown_id_passthrough(self): assert _resolve_plant_id("green_beans") == "green_beans" # ============================================================================= # NumericalRigorChecker # ============================================================================= class TestNumericalRigorChecker: def test_requested_param_subset_skips_unasked_fields(self, gt): """Humidity-only queries should not fail because temperature/light are absent.""" answer = "Kelembaban ideal 73% RH." # Accepts set-like input, but output should be a stable list for deterministic results result = NumericalRigorChecker.evaluate_answer( answer, gt, requested_params={"humidity"}, # input can be set-like ) assert result["overall_pass"] is True assert result["requested_params"] == ["humidity"] # output must be a stable list assert result["param_results"]["humidity"]["status"] == "PASS" assert "temperature" not in result["param_results"] assert "light" not in result["param_results"] def test_unicode_thin_space_lux_parses(self, gt): """Lux values formatted as 19500 must parse as 19500, not 500.""" answer = "Cahaya optimal 19\u202f500 lux." # U+202F = NARROW NO-BREAK SPACE (thin space thousands separator) result = NumericalRigorChecker.evaluate_answer( answer, gt, requested_params={"light"}, ) assert result["overall_pass"] is True assert result["param_results"]["light"]["status"] == "PASS" @pytest.fixture def gt(self): return { "temperature": {"value": 20.0, "min": 16.0, "max": 24.0}, "humidity": {"value": 73.0, "min": 60.0, "max": 85.0}, "light": {"value": 18000, "min": 14000, "max": 22000}, } def test_pass_exact_optimal(self, gt): """Answer with the exact optimal temperature passes.""" answer = "Suhu optimal 20°C dengan kelembaban 73% dan cahaya 18000 lux." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is True def test_pass_within_tolerance(self, gt): """Answer within ±0.5 of optimal passes for all params.""" answer = "Suhu optimal 20.4°C, kelembaban 73% RH, cahaya 18000 lux." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is True def test_pass_within_range(self, gt): """Answer within the safety range [min, max] passes for all params.""" answer = "Suhu 18°C, kelembaban 70% RH, cahaya 15000 lux." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is True def test_fail_outside_range(self, gt): """Answer outside both tolerance and safety range fails.""" answer = "Suhu optimal 30°C." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is False def test_pass_range_answer(self, gt): """Range answer that contains the optimal value passes for all params.""" answer = "Suhu 18-24°C, kelembaban 70-75% RH, cahaya 15000-20000 lux." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is True def test_pass_all_params(self, gt): """All three parameters correct passes.""" answer = "Suhu 20°C, kelembaban 73% RH, cahaya 18000 lux." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is True def test_fail_one_param(self, gt): """One wrong parameter causes overall fail.""" answer = "Suhu 20°C, kelembaban 73%, cahaya 50000 lux." result = NumericalRigorChecker.evaluate_answer(answer, gt) assert result["overall_pass"] is False assert result["param_results"]["light"]["status"] == "FAIL" def test_no_ground_truth_skips(self): """Empty ground truth dict should skip all checks and pass.""" result = NumericalRigorChecker.evaluate_answer("any answer", {}) assert result["overall_pass"] is True def test_partial_ground_truth(self): """Only humidity provided, temperature/light should skip.""" gt = {"humidity": {"value": 73.0, "min": 60.0, "max": 85.0}} result = NumericalRigorChecker.evaluate_answer("kelembaban 73%", gt) assert result["overall_pass"] is True # ============================================================================= # TemporalAdherenceChecker # ============================================================================= class TestTemporalAdherenceChecker: def test_general_phase_not_applicable(self): result = TemporalAdherenceChecker.check("Some text", "general") assert result["pass"] is True assert result["applicable"] is False def test_none_phase_not_applicable(self): result = TemporalAdherenceChecker.check("Some text", None) assert result["pass"] is True assert result["applicable"] is False def test_day_phase_correct_id(self): result = TemporalAdherenceChecker.check("Pada siklus siang, suhu 24°C", "day") assert result["pass"] is True def test_day_phase_correct_en(self): result = TemporalAdherenceChecker.check("During day cycle, temp is 24°C", "day") assert result["pass"] is True def test_night_phase_correct_id(self): result = TemporalAdherenceChecker.check("Pada siklus malam, suhu 18°C", "night") assert result["pass"] is True def test_night_phase_correct_en(self): result = TemporalAdherenceChecker.check("During night cycle, temp is 18°C", "night") assert result["pass"] is True def test_day_phase_wrong(self): """Answer references night but expected day — should fail.""" result = TemporalAdherenceChecker.check("Pada siklus malam", "day") assert result["pass"] is False def test_night_phase_wrong(self): result = TemporalAdherenceChecker.check("Pada siklus siang", "night") assert result["pass"] is False def test_no_live_data_skip(self): result = TemporalAdherenceChecker.check( "Chamber is not currently online", "day" ) assert result["pass"] is True assert result.get("status") == "NO_LIVE_DATA" # ============================================================================= # ConstraintSatisfactionChecker # ============================================================================= class TestConstraintSatisfactionChecker: def test_state_a_qualitative_always_pass(self): """State A (use_structured_params=False) always passes.""" result = ConstraintSatisfactionChecker.check( "pH ideal untuk tanaman adalah 6.5", "any query", False ) assert result["pass"] is True assert result["mode"] == "qualitative_quoted" def test_state_b_guarded_clean(self): """State B (guarded) with no forbidden terms passes.""" result = ConstraintSatisfactionChecker.check( "Suhu ideal 20°C", "suhu selada", True ) assert result["pass"] is True assert result["mode"] == "guarded" def test_state_b_guarded_with_forbidden(self): """State B with forbidden term outside breadcrumb fails.""" result = ConstraintSatisfactionChecker.check( "Suhu ideal 20°C dan pH 6.5", "suhu selada", True ) assert result["pass"] is False assert result["mode"] == "unprompted_hallucination" def test_state_b_breadcrumb_ignored(self): """Forbidden terms inside system breadcrumb are allowed.""" result = ConstraintSatisfactionChecker.check( "Suhu 20°C. Parameter terverifikasi PGC hanya mencakup suhu, kelembaban, dan cahaya", "suhu selada", True, ) assert result["pass"] is True def test_state_c_explicit_request_with_warning(self): """State C (explicit request) passes with bifurcation warning.""" result = ConstraintSatisfactionChecker.check( "pH ideal 6.5. ⚠️ Parameter pH di luar kendali otomatis PGC.", "berapa pH untuk tomat", True, ) assert result["pass"] is True assert result["mode"] == "explicit_request_warned" def test_state_c_explicit_request_no_warning(self): """State C fails when warning is missing.""" result = ConstraintSatisfactionChecker.check( "pH ideal 6.5.", "berapa pH", True, ) assert result["pass"] is False assert result["mode"] == "explicit_request_unwarned" # ============================================================================= # TerminologyNuanceChecker # ============================================================================= class TestTerminologyNuanceChecker: def test_kecambah_tunas_correct(self): answer = "mung bean sprouts memerlukan kelembaban tinggi pada fase vegetatif" result = TerminologyNuanceChecker.check_kecambah_tunas(answer) assert result["pass"] is True def test_kecambah_tunas_with_toge(self): answer = "toge memerlukan kelembaban tinggi untuk pertumbuhan vegetatif" result = TerminologyNuanceChecker.check_kecambah_tunas(answer) assert result["pass"] is True def test_kecambah_tunas_missing_mung(self): answer = "tanaman memerlukan cahaya pada fase vegetatif" result = TerminologyNuanceChecker.check_kecambah_tunas(answer) assert result["pass"] is False def test_layu_fusarium_correct(self): answer = "Fusarium menyebabkan layu pada pembuluh batang" result = TerminologyNuanceChecker.check_layu_fusarium(answer) assert result["pass"] is True def test_layu_fusarium_missing(self): answer = "tanaman layu karena kekeringan" result = TerminologyNuanceChecker.check_layu_fusarium(answer) assert result["pass"] is False def test_busuk_akar_pythium_correct(self): answer = "Pythium adalah patogen yang disebabkan oleh jamur air" result = TerminologyNuanceChecker.check_busuk_akar_pythium(answer) assert result["pass"] is True def test_kacang_hijau_correct(self): answer = "kacang hijau adalah mung bean" result = TerminologyNuanceChecker.check_kacang_hijau(answer) assert result["pass"] is True def test_kacang_hijau_wrong(self): answer = "kacang hijau adalah green bean" result = TerminologyNuanceChecker.check_kacang_hijau(answer) assert result["pass"] is False def test_baginda_f1_correct(self): answer = "Baginda F1 adalah varietas semangka dengan suhu optimal 24°C" result = TerminologyNuanceChecker.check_baginda_f1(answer) assert result["pass"] is True # ============================================================================= # validate_ground_truths # ============================================================================= class TestValidateGroundTruths: def test_plant_not_in_db_skips(self): """Case with no expected_plant should not crash.""" cases = [{"case_id": "test", "ground_truth": "some text"}] warnings = validate_ground_truths(cases) assert len(warnings) == 0 def test_negative_test_skipped(self): """Negative tests should be skipped without warning.""" cases = [{ "case_id": "qty_green_beans_temp", "expected_plant": "green_beans", "expected_stage": "vegetative", "ground_truth": "some AI estimate", "is_negative_test": True, }] warnings = validate_ground_truths(cases) assert len(warnings) == 0 def test_alias_resolved(self): """Alias like bok_choy should resolve to pak_choy (no warning for valid GT).""" cases = [{ "case_id": "qty_bok_choy_humidity", "expected_plant": "bok_choy", "expected_stage": "vegetative", "ground_truth": "kelembaban 60-85%, suhu 27, cahaya 20000", }] warnings = validate_ground_truths(cases) assert len(warnings) == 0 def test_stale_ground_truth_warns(self): """If optimal temp not in ground truth text, emit warning.""" cases = [{ "case_id": "test_stale", "expected_plant": "lettuce", "expected_stage": "vegetative", "ground_truth": "some text that does not contain the optimal value", }] warnings = validate_ground_truths(cases) assert len(warnings) == 1 assert "test_stale" in warnings[0] assert "may be stale" in warnings[0]