Spaces:
Running
Running
| """ | |
| Unit tests for the RAGAS evaluation framework checkers and validators. | |
| Tests pure functions that do NOT require API calls (gpt-4o-mini / Cerebras). | |
| """ | |
| import json | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| import pytest | |
| from scripts.evaluate_ragas import ( | |
| _detect_language, | |
| _resolve_plant_id, | |
| NumericalRigorChecker, | |
| TemporalAdherenceChecker, | |
| ConstraintSatisfactionChecker, | |
| TerminologyNuanceChecker, | |
| validate_ground_truths, | |
| ) | |
| # ============================================================================= | |
| # _detect_language | |
| # ============================================================================= | |
class TestDetectLanguage:
    """Checks on the language code (``"id"`` / ``"en"``) returned by ``_detect_language``."""

    def test_id_question_suhu(self):
        """A question containing 'suhu' is expected to be tagged ``id``."""
        question = "Berapa suhu ideal untuk selada?"
        detected = _detect_language(question)
        assert detected == "id"

    def test_id_question_berapa(self):
        """A question starting with 'Berapa' is expected to be tagged ``id``."""
        question = "Berapa kelembaban untuk cabai?"
        detected = _detect_language(question)
        assert detected == "id"

    def test_id_question_fase(self):
        """A question containing 'fase' is expected to be tagged ``id``."""
        question = "Parameter pada fase vegetatif?"
        detected = _detect_language(question)
        assert detected == "id"

    def test_en_question_what(self):
        """A 'What ...' question is expected to be tagged ``en``."""
        question = "What is the optimal temperature?"
        detected = _detect_language(question)
        assert detected == "en"

    def test_en_question_how(self):
        """A 'How ...' question is expected to be tagged ``en``."""
        question = "How much light does tomato need?"
        detected = _detect_language(question)
        assert detected == "en"

    def test_en_question_mixed(self):
        """A question mixing English and Indonesian words is expected to be tagged ``en``."""
        question = "What is suhu ideal for lettuce?"
        detected = _detect_language(question)
        assert detected == "en"

    def test_en_empty(self):
        """The empty string is expected to be tagged ``en``."""
        detected = _detect_language("")
        assert detected == "en"

    def test_en_numbers_only(self):
        """Digit-only input is expected to be tagged ``en``."""
        question = "123 456 789"
        detected = _detect_language(question)
        assert detected == "en"
| # ============================================================================= | |
| # _resolve_plant_id | |
| # ============================================================================= | |
class TestResolvePlantId:
    """Checks on the plant identifier returned by ``_resolve_plant_id``."""

    def test_direct_id(self):
        """``lettuce`` is returned unchanged."""
        resolved = _resolve_plant_id("lettuce")
        assert resolved == "lettuce"

    def test_alias_bok_choy(self):
        """``bok_choy`` is expected to map to ``pak_choy``."""
        resolved = _resolve_plant_id("bok_choy")
        assert resolved == "pak_choy"

    def test_alias_amaranth(self):
        """``amaranth`` is expected to map to ``spinach_amaranth``."""
        resolved = _resolve_plant_id("amaranth")
        assert resolved == "spinach_amaranth"

    def test_alias_spinach(self):
        """``spinach`` is expected to map to ``spinach_amaranth``."""
        resolved = _resolve_plant_id("spinach")
        assert resolved == "spinach_amaranth"

    def test_unknown_id_passthrough(self):
        """``green_beans`` is expected to come back unchanged."""
        resolved = _resolve_plant_id("green_beans")
        assert resolved == "green_beans"
| # ============================================================================= | |
| # NumericalRigorChecker | |
| # ============================================================================= | |
class TestNumericalRigorChecker:
    """Tests for ``NumericalRigorChecker.evaluate_answer``.

    Most tests take the shared ``gt`` fixture, a ground-truth dict with
    ``value`` / ``min`` / ``max`` entries for temperature, humidity and light.
    """

    @pytest.fixture
    def gt(self):
        """Return the ground-truth parameter dict shared by the tests below.

        Must be registered as a pytest fixture: the tests request it by
        argument name, and a plain method would make pytest fail them with
        "fixture 'gt' not found".
        """
        return {
            "temperature": {"value": 20.0, "min": 16.0, "max": 24.0},
            "humidity": {"value": 73.0, "min": 60.0, "max": 85.0},
            "light": {"value": 18000, "min": 14000, "max": 22000},
        }

    def test_requested_param_subset_skips_unasked_fields(self, gt):
        """Humidity-only queries should not fail because temperature/light are absent."""
        answer = "Kelembaban ideal 73% RH."
        # Accepts set-like input, but output should be a stable list for deterministic results
        result = NumericalRigorChecker.evaluate_answer(
            answer,
            gt,
            requested_params={"humidity"},  # input can be set-like
        )
        assert result["overall_pass"] is True
        assert result["requested_params"] == ["humidity"]  # output must be a stable list
        assert result["param_results"]["humidity"]["status"] == "PASS"
        assert "temperature" not in result["param_results"]
        assert "light" not in result["param_results"]

    def test_unicode_thin_space_lux_parses(self, gt):
        """Lux values written as 19<U+202F>500 must parse as 19500, not 500."""
        # U+202F = NARROW NO-BREAK SPACE (thin space thousands separator)
        answer = "Cahaya optimal 19\u202f500 lux."
        result = NumericalRigorChecker.evaluate_answer(
            answer,
            gt,
            requested_params={"light"},
        )
        assert result["overall_pass"] is True
        assert result["param_results"]["light"]["status"] == "PASS"

    def test_pass_exact_optimal(self, gt):
        """Answer with the exact optimal temperature passes."""
        answer = "Suhu optimal 20°C dengan kelembaban 73% dan cahaya 18000 lux."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is True

    def test_pass_within_tolerance(self, gt):
        """Answer within ±0.5 of optimal passes for all params."""
        answer = "Suhu optimal 20.4°C, kelembaban 73% RH, cahaya 18000 lux."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is True

    def test_pass_within_range(self, gt):
        """Answer within the safety range [min, max] passes for all params."""
        answer = "Suhu 18°C, kelembaban 70% RH, cahaya 15000 lux."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is True

    def test_fail_outside_range(self, gt):
        """Answer outside both tolerance and safety range fails."""
        answer = "Suhu optimal 30°C."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is False

    def test_pass_range_answer(self, gt):
        """Range answer that contains the optimal value passes for all params."""
        answer = "Suhu 18-24°C, kelembaban 70-75% RH, cahaya 15000-20000 lux."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is True

    def test_pass_all_params(self, gt):
        """All three parameters correct passes."""
        answer = "Suhu 20°C, kelembaban 73% RH, cahaya 18000 lux."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is True

    def test_fail_one_param(self, gt):
        """One wrong parameter causes overall fail."""
        answer = "Suhu 20°C, kelembaban 73%, cahaya 50000 lux."
        result = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert result["overall_pass"] is False
        assert result["param_results"]["light"]["status"] == "FAIL"

    def test_no_ground_truth_skips(self):
        """Empty ground truth dict should skip all checks and pass."""
        result = NumericalRigorChecker.evaluate_answer("any answer", {})
        assert result["overall_pass"] is True

    def test_partial_ground_truth(self):
        """Only humidity provided, temperature/light should skip."""
        # Local dict on purpose: this test does not use the shared fixture.
        gt = {"humidity": {"value": 73.0, "min": 60.0, "max": 85.0}}
        result = NumericalRigorChecker.evaluate_answer("kelembaban 73%", gt)
        assert result["overall_pass"] is True
| # ============================================================================= | |
| # TemporalAdherenceChecker | |
| # ============================================================================= | |
class TestTemporalAdherenceChecker:
    """Tests for ``TemporalAdherenceChecker.check(answer, expected_phase)``."""

    def test_general_phase_not_applicable(self):
        """Phase ``"general"`` passes and is reported as not applicable."""
        outcome = TemporalAdherenceChecker.check("Some text", "general")
        assert outcome["pass"] is True
        assert outcome["applicable"] is False

    def test_none_phase_not_applicable(self):
        """Phase ``None`` passes and is reported as not applicable."""
        outcome = TemporalAdherenceChecker.check("Some text", None)
        assert outcome["pass"] is True
        assert outcome["applicable"] is False

    def test_day_phase_correct_id(self):
        """An answer mentioning 'siklus siang' passes for expected phase ``day``."""
        answer = "Pada siklus siang, suhu 24°C"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is True

    def test_day_phase_correct_en(self):
        """An answer mentioning 'day cycle' passes for expected phase ``day``."""
        answer = "During day cycle, temp is 24°C"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is True

    def test_night_phase_correct_id(self):
        """An answer mentioning 'siklus malam' passes for expected phase ``night``."""
        answer = "Pada siklus malam, suhu 18°C"
        outcome = TemporalAdherenceChecker.check(answer, "night")
        assert outcome["pass"] is True

    def test_night_phase_correct_en(self):
        """An answer mentioning 'night cycle' passes for expected phase ``night``."""
        answer = "During night cycle, temp is 18°C"
        outcome = TemporalAdherenceChecker.check(answer, "night")
        assert outcome["pass"] is True

    def test_day_phase_wrong(self):
        """Answer references night but expected day — should fail."""
        answer = "Pada siklus malam"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is False

    def test_night_phase_wrong(self):
        """Answer references day but expected night — should fail."""
        answer = "Pada siklus siang"
        outcome = TemporalAdherenceChecker.check(answer, "night")
        assert outcome["pass"] is False

    def test_no_live_data_skip(self):
        """An 'not currently online' answer passes with status ``NO_LIVE_DATA``."""
        answer = "Chamber is not currently online"
        outcome = TemporalAdherenceChecker.check(answer, "day")
        assert outcome["pass"] is True
        assert outcome.get("status") == "NO_LIVE_DATA"
| # ============================================================================= | |
| # ConstraintSatisfactionChecker | |
| # ============================================================================= | |
class TestConstraintSatisfactionChecker:
    """Tests for ``ConstraintSatisfactionChecker.check``.

    The checker is called positionally as ``check(answer, query, flag)``; per
    the first test's docstring the third argument is ``use_structured_params``.
    """

    def test_state_a_qualitative_always_pass(self):
        """State A (use_structured_params=False) always passes."""
        answer = "pH ideal untuk tanaman adalah 6.5"
        query = "any query"
        verdict = ConstraintSatisfactionChecker.check(answer, query, False)
        assert verdict["pass"] is True
        assert verdict["mode"] == "qualitative_quoted"

    def test_state_b_guarded_clean(self):
        """State B (guarded) with no forbidden terms passes."""
        answer = "Suhu ideal 20°C"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "guarded"

    def test_state_b_guarded_with_forbidden(self):
        """State B with forbidden term outside breadcrumb fails."""
        answer = "Suhu ideal 20°C dan pH 6.5"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "unprompted_hallucination"

    def test_state_b_breadcrumb_ignored(self):
        """Forbidden terms inside system breadcrumb are allowed."""
        answer = (
            "Suhu 20°C. Parameter terverifikasi PGC hanya mencakup suhu, kelembaban, dan cahaya"
        )
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True

    def test_state_c_explicit_request_with_warning(self):
        """State C (explicit request) passes with bifurcation warning."""
        answer = "pH ideal 6.5. ⚠️ Parameter pH di luar kendali otomatis PGC."
        query = "berapa pH untuk tomat"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "explicit_request_warned"

    def test_state_c_explicit_request_no_warning(self):
        """State C fails when warning is missing."""
        answer = "pH ideal 6.5."
        query = "berapa pH"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "explicit_request_unwarned"
| # ============================================================================= | |
| # TerminologyNuanceChecker | |
| # ============================================================================= | |
class TestTerminologyNuanceChecker:
    """Tests for the per-term ``check_*`` methods of ``TerminologyNuanceChecker``."""

    def test_kecambah_tunas_correct(self):
        """An answer naming 'mung bean sprouts' is expected to pass."""
        text = "mung bean sprouts memerlukan kelembaban tinggi pada fase vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is True

    def test_kecambah_tunas_with_toge(self):
        """An answer naming 'toge' is expected to pass."""
        text = "toge memerlukan kelembaban tinggi untuk pertumbuhan vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is True

    def test_kecambah_tunas_missing_mung(self):
        """An answer naming neither 'mung bean sprouts' nor 'toge' is expected to fail."""
        text = "tanaman memerlukan cahaya pada fase vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is False

    def test_layu_fusarium_correct(self):
        """An answer that names Fusarium is expected to pass."""
        text = "Fusarium menyebabkan layu pada pembuluh batang"
        verdict = TerminologyNuanceChecker.check_layu_fusarium(text)
        assert verdict["pass"] is True

    def test_layu_fusarium_missing(self):
        """An answer about wilting that never names Fusarium is expected to fail."""
        text = "tanaman layu karena kekeringan"
        verdict = TerminologyNuanceChecker.check_layu_fusarium(text)
        assert verdict["pass"] is False

    def test_busuk_akar_pythium_correct(self):
        """An answer that names Pythium is expected to pass."""
        text = "Pythium adalah patogen yang disebabkan oleh jamur air"
        verdict = TerminologyNuanceChecker.check_busuk_akar_pythium(text)
        assert verdict["pass"] is True

    def test_kacang_hijau_correct(self):
        """'kacang hijau' glossed as 'mung bean' is expected to pass."""
        text = "kacang hijau adalah mung bean"
        verdict = TerminologyNuanceChecker.check_kacang_hijau(text)
        assert verdict["pass"] is True

    def test_kacang_hijau_wrong(self):
        """'kacang hijau' glossed as 'green bean' is expected to fail."""
        text = "kacang hijau adalah green bean"
        verdict = TerminologyNuanceChecker.check_kacang_hijau(text)
        assert verdict["pass"] is False

    def test_baginda_f1_correct(self):
        """An answer describing Baginda F1 as a 'semangka' variety is expected to pass."""
        text = "Baginda F1 adalah varietas semangka dengan suhu optimal 24°C"
        verdict = TerminologyNuanceChecker.check_baginda_f1(text)
        assert verdict["pass"] is True
| # ============================================================================= | |
| # validate_ground_truths | |
| # ============================================================================= | |
class TestValidateGroundTruths:
    """Tests for ``validate_ground_truths``, which returns a sequence of warning strings."""

    def test_plant_not_in_db_skips(self):
        """A case without an ``expected_plant`` key yields no warnings and does not crash."""
        case = {"case_id": "test", "ground_truth": "some text"}
        issued = validate_ground_truths([case])
        assert len(issued) == 0

    def test_negative_test_skipped(self):
        """Negative tests should be skipped without warning."""
        case = {
            "case_id": "qty_green_beans_temp",
            "expected_plant": "green_beans",
            "expected_stage": "vegetative",
            "ground_truth": "some AI estimate",
            "is_negative_test": True,
        }
        issued = validate_ground_truths([case])
        assert len(issued) == 0

    def test_alias_resolved(self):
        """Alias like bok_choy should resolve to pak_choy (no warning for valid GT)."""
        case = {
            "case_id": "qty_bok_choy_humidity",
            "expected_plant": "bok_choy",
            "expected_stage": "vegetative",
            "ground_truth": "kelembaban 60-85%, suhu 27, cahaya 20000",
        }
        issued = validate_ground_truths([case])
        assert len(issued) == 0

    def test_stale_ground_truth_warns(self):
        """If optimal temp not in ground truth text, emit warning."""
        case = {
            "case_id": "test_stale",
            "expected_plant": "lettuce",
            "expected_stage": "vegetative",
            "ground_truth": "some text that does not contain the optimal value",
        }
        issued = validate_ground_truths([case])
        assert len(issued) == 1
        # The single warning must name the case and flag it as possibly stale.
        only_warning = issued[0]
        assert "test_stale" in only_warning
        assert "may be stale" in only_warning