# Spaces: Jacooo/PGC-AI-Chatbot (Running)
# File size: 15,045 Bytes

"""
Unit tests for the RAGAS evaluation framework checkers and validators.

Tests pure functions that do NOT require API calls (gpt-4o-mini / Cerebras).
"""

import json
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import pytest

from scripts.evaluate_ragas import (
    _detect_language,
    _resolve_plant_id,
    NumericalRigorChecker,
    TemporalAdherenceChecker,
    ConstraintSatisfactionChecker,
    TerminologyNuanceChecker,
    validate_ground_truths,
)

# =============================================================================
# _detect_language
# =============================================================================

class TestDetectLanguage:
    """Expectations for `_detect_language`: which inputs map to "id" and which to "en"."""

    def test_id_question_suhu(self):
        """A "Berapa suhu ..." question is expected to be tagged "id"."""
        detected = _detect_language("Berapa suhu ideal untuk selada?")
        assert detected == "id"

    def test_id_question_berapa(self):
        """A "Berapa kelembaban ..." question is expected to be tagged "id"."""
        detected = _detect_language("Berapa kelembaban untuk cabai?")
        assert detected == "id"

    def test_id_question_fase(self):
        """A question mentioning "fase vegetatif" is expected to be tagged "id"."""
        detected = _detect_language("Parameter pada fase vegetatif?")
        assert detected == "id"

    def test_en_question_what(self):
        """A "What is ..." question is expected to be tagged "en"."""
        detected = _detect_language("What is the optimal temperature?")
        assert detected == "en"

    def test_en_question_how(self):
        """A "How much ..." question is expected to be tagged "en"."""
        detected = _detect_language("How much light does tomato need?")
        assert detected == "en"

    def test_en_question_mixed(self):
        """An English sentence containing the word "suhu" is still expected to be "en"."""
        detected = _detect_language("What is suhu ideal for lettuce?")
        assert detected == "en"

    def test_en_empty(self):
        """The empty string is expected to be tagged "en"."""
        detected = _detect_language("")
        assert detected == "en"

    def test_en_numbers_only(self):
        """Digits-only input is expected to be tagged "en"."""
        detected = _detect_language("123 456 789")
        assert detected == "en"


# =============================================================================
# _resolve_plant_id
# =============================================================================

class TestResolvePlantId:
    """Expectations for `_resolve_plant_id`: direct ids, aliases, and unknown ids."""

    def test_direct_id(self):
        """An id that is not an alias comes back unchanged."""
        resolved = _resolve_plant_id("lettuce")
        assert resolved == "lettuce"

    def test_alias_bok_choy(self):
        """"bok_choy" is expected to resolve to "pak_choy"."""
        resolved = _resolve_plant_id("bok_choy")
        assert resolved == "pak_choy"

    def test_alias_amaranth(self):
        """"amaranth" is expected to resolve to "spinach_amaranth"."""
        resolved = _resolve_plant_id("amaranth")
        assert resolved == "spinach_amaranth"

    def test_alias_spinach(self):
        """"spinach" is expected to resolve to "spinach_amaranth"."""
        resolved = _resolve_plant_id("spinach")
        assert resolved == "spinach_amaranth"

    def test_unknown_id_passthrough(self):
        """An id the test treats as unknown is expected to pass through untouched."""
        resolved = _resolve_plant_id("green_beans")
        assert resolved == "green_beans"


# =============================================================================
# NumericalRigorChecker
# =============================================================================

class TestNumericalRigorChecker:
    """Expectations for `NumericalRigorChecker.evaluate_answer` against a fixed ground truth."""

    @pytest.fixture
    def gt(self):
        """Ground truth with value/min/max entries for temperature, humidity, and light."""
        ground_truth = {
            "temperature": {"value": 20.0, "min": 16.0, "max": 24.0},
            "humidity": {"value": 73.0, "min": 60.0, "max": 85.0},
            "light": {"value": 18000, "min": 14000, "max": 22000},
        }
        return ground_truth

    def test_requested_param_subset_skips_unasked_fields(self, gt):
        """A humidity-only request must not fail just because temperature/light are missing."""
        reply = "Kelembaban ideal 73% RH."
        # The request is passed as a set; the reported value is expected to be a list.
        outcome = NumericalRigorChecker.evaluate_answer(
            reply,
            gt,
            requested_params={"humidity"},
        )
        assert outcome["overall_pass"] is True
        assert outcome["requested_params"] == ["humidity"]
        per_param = outcome["param_results"]
        assert per_param["humidity"]["status"] == "PASS"
        assert "temperature" not in per_param
        assert "light" not in per_param

    def test_unicode_thin_space_lux_parses(self, gt):
        """A lux value using U+202F as thousands separator must read as 19500, not 500."""
        # U+202F is NARROW NO-BREAK SPACE, sitting between "19" and "500" below.
        reply = "Cahaya optimal 19\u202f500 lux."
        outcome = NumericalRigorChecker.evaluate_answer(
            reply,
            gt,
            requested_params={"light"},
        )
        assert outcome["overall_pass"] is True
        assert outcome["param_results"]["light"]["status"] == "PASS"

    def test_pass_exact_optimal(self, gt):
        """An answer quoting exactly the optimal values passes."""
        reply = "Suhu optimal 20°C dengan kelembaban 73% dan cahaya 18000 lux."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is True

    def test_pass_within_tolerance(self, gt):
        """A temperature slightly off the optimal (20.4 vs 20.0) still passes."""
        reply = "Suhu optimal 20.4°C, kelembaban 73% RH, cahaya 18000 lux."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is True

    def test_pass_within_range(self, gt):
        """Values that sit inside each [min, max] interval pass."""
        reply = "Suhu 18°C, kelembaban 70% RH, cahaya 15000 lux."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is True

    def test_fail_outside_range(self, gt):
        """A temperature of 30, above the fixture's max of 24, fails overall."""
        reply = "Suhu optimal 30°C."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is False

    def test_pass_range_answer(self, gt):
        """An answer given as ranges that enclose the optimal values passes."""
        reply = "Suhu 18-24°C, kelembaban 70-75% RH, cahaya 15000-20000 lux."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is True

    def test_pass_all_params(self, gt):
        """Three correct parameters in one answer pass."""
        reply = "Suhu 20°C, kelembaban 73% RH, cahaya 18000 lux."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is True

    def test_fail_one_param(self, gt):
        """A single wrong parameter (light at 50000) sinks the overall result."""
        reply = "Suhu 20°C, kelembaban 73%, cahaya 50000 lux."
        outcome = NumericalRigorChecker.evaluate_answer(reply, gt)
        assert outcome["overall_pass"] is False
        assert outcome["param_results"]["light"]["status"] == "FAIL"

    def test_no_ground_truth_skips(self):
        """With an empty ground-truth dict the overall result is a pass."""
        outcome = NumericalRigorChecker.evaluate_answer("any answer", {})
        assert outcome["overall_pass"] is True

    def test_partial_ground_truth(self):
        """Ground truth holding only humidity still yields an overall pass."""
        humidity_only = {"humidity": {"value": 73.0, "min": 60.0, "max": 85.0}}
        outcome = NumericalRigorChecker.evaluate_answer("kelembaban 73%", humidity_only)
        assert outcome["overall_pass"] is True


# =============================================================================
# TemporalAdherenceChecker
# =============================================================================

class TestTemporalAdherenceChecker:
    """Expectations for `TemporalAdherenceChecker.check` across day/night/general phases."""

    def test_general_phase_not_applicable(self):
        """The "general" phase passes and is reported as not applicable."""
        verdict = TemporalAdherenceChecker.check("Some text", "general")
        assert verdict["pass"] is True
        assert verdict["applicable"] is False

    def test_none_phase_not_applicable(self):
        """A phase of None passes and is reported as not applicable."""
        verdict = TemporalAdherenceChecker.check("Some text", None)
        assert verdict["pass"] is True
        assert verdict["applicable"] is False

    def test_day_phase_correct_id(self):
        """Text mentioning "siang" passes when the expected phase is "day"."""
        text = "Pada siklus siang, suhu 24°C"
        verdict = TemporalAdherenceChecker.check(text, "day")
        assert verdict["pass"] is True

    def test_day_phase_correct_en(self):
        """Text mentioning "day cycle" passes when the expected phase is "day"."""
        text = "During day cycle, temp is 24°C"
        verdict = TemporalAdherenceChecker.check(text, "day")
        assert verdict["pass"] is True

    def test_night_phase_correct_id(self):
        """Text mentioning "malam" passes when the expected phase is "night"."""
        text = "Pada siklus malam, suhu 18°C"
        verdict = TemporalAdherenceChecker.check(text, "night")
        assert verdict["pass"] is True

    def test_night_phase_correct_en(self):
        """Text mentioning "night cycle" passes when the expected phase is "night"."""
        text = "During night cycle, temp is 18°C"
        verdict = TemporalAdherenceChecker.check(text, "night")
        assert verdict["pass"] is True

    def test_day_phase_wrong(self):
        """Text that refers to "malam" fails when "day" was expected."""
        verdict = TemporalAdherenceChecker.check("Pada siklus malam", "day")
        assert verdict["pass"] is False

    def test_night_phase_wrong(self):
        """Text that refers to "siang" fails when "night" was expected."""
        verdict = TemporalAdherenceChecker.check("Pada siklus siang", "night")
        assert verdict["pass"] is False

    def test_no_live_data_skip(self):
        """An offline-chamber message passes with status "NO_LIVE_DATA"."""
        text = "Chamber is not currently online"
        verdict = TemporalAdherenceChecker.check(text, "day")
        assert verdict["pass"] is True
        assert verdict.get("status") == "NO_LIVE_DATA"


# =============================================================================
# ConstraintSatisfactionChecker
# =============================================================================

class TestConstraintSatisfactionChecker:
    """Expectations for `ConstraintSatisfactionChecker.check` in states A, B, and C."""

    def test_state_a_qualitative_always_pass(self):
        """State A (use_structured_params=False): passes in mode "qualitative_quoted"."""
        reply = "pH ideal untuk tanaman adalah 6.5"
        query = "any query"
        verdict = ConstraintSatisfactionChecker.check(reply, query, False)
        assert verdict["pass"] is True
        assert verdict["mode"] == "qualitative_quoted"

    def test_state_b_guarded_clean(self):
        """State B: an answer without forbidden terms passes in mode "guarded"."""
        reply = "Suhu ideal 20°C"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "guarded"

    def test_state_b_guarded_with_forbidden(self):
        """State B: a forbidden term outside the breadcrumb fails the check."""
        reply = "Suhu ideal 20°C dan pH 6.5"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "unprompted_hallucination"

    def test_state_b_breadcrumb_ignored(self):
        """State B: the system breadcrumb sentence alone does not fail the check."""
        reply = (
            "Suhu 20°C. Parameter terverifikasi PGC hanya mencakup suhu, kelembaban, dan cahaya"
        )
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
        assert verdict["pass"] is True

    def test_state_c_explicit_request_with_warning(self):
        """State C: an explicit pH request answered with the warning passes."""
        reply = "pH ideal 6.5. ⚠️ Parameter pH di luar kendali otomatis PGC."
        query = "berapa pH untuk tomat"
        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "explicit_request_warned"

    def test_state_c_explicit_request_no_warning(self):
        """State C: an explicit pH request answered without the warning fails."""
        reply = "pH ideal 6.5."
        query = "berapa pH"
        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "explicit_request_unwarned"


# =============================================================================
# TerminologyNuanceChecker
# =============================================================================

class TestTerminologyNuanceChecker:
    """Expectations for the per-term checks exposed by `TerminologyNuanceChecker`."""

    def test_kecambah_tunas_correct(self):
        """An answer naming "mung bean sprouts" passes the kecambah/tunas check."""
        text = "mung bean sprouts memerlukan kelembaban tinggi pada fase vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is True

    def test_kecambah_tunas_with_toge(self):
        """An answer naming "toge" passes the kecambah/tunas check."""
        text = "toge memerlukan kelembaban tinggi untuk pertumbuhan vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is True

    def test_kecambah_tunas_missing_mung(self):
        """An answer that mentions neither "mung bean" nor "toge" fails the check."""
        text = "tanaman memerlukan cahaya pada fase vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is False

    def test_layu_fusarium_correct(self):
        """An answer that names Fusarium passes the layu/fusarium check."""
        text = "Fusarium menyebabkan layu pada pembuluh batang"
        verdict = TerminologyNuanceChecker.check_layu_fusarium(text)
        assert verdict["pass"] is True

    def test_layu_fusarium_missing(self):
        """An answer about "layu" that never names Fusarium fails the check."""
        text = "tanaman layu karena kekeringan"
        verdict = TerminologyNuanceChecker.check_layu_fusarium(text)
        assert verdict["pass"] is False

    def test_busuk_akar_pythium_correct(self):
        """An answer that names Pythium passes the busuk-akar/pythium check."""
        text = "Pythium adalah patogen yang disebabkan oleh jamur air"
        verdict = TerminologyNuanceChecker.check_busuk_akar_pythium(text)
        assert verdict["pass"] is True

    def test_kacang_hijau_correct(self):
        """Equating "kacang hijau" with "mung bean" passes."""
        text = "kacang hijau adalah mung bean"
        verdict = TerminologyNuanceChecker.check_kacang_hijau(text)
        assert verdict["pass"] is True

    def test_kacang_hijau_wrong(self):
        """Equating "kacang hijau" with "green bean" fails."""
        text = "kacang hijau adalah green bean"
        verdict = TerminologyNuanceChecker.check_kacang_hijau(text)
        assert verdict["pass"] is False

    def test_baginda_f1_correct(self):
        """An answer describing Baginda F1 as a semangka variety passes."""
        text = "Baginda F1 adalah varietas semangka dengan suhu optimal 24°C"
        verdict = TerminologyNuanceChecker.check_baginda_f1(text)
        assert verdict["pass"] is True


# =============================================================================
# validate_ground_truths
# =============================================================================

class TestValidateGroundTruths:
    """Expectations for the warning list returned by `validate_ground_truths`."""

    def test_plant_not_in_db_skips(self):
        """A case lacking "expected_plant" must not crash and yields no warnings."""
        bare_case = {"case_id": "test", "ground_truth": "some text"}
        issued = validate_ground_truths([bare_case])
        assert len(issued) == 0

    def test_negative_test_skipped(self):
        """A case flagged "is_negative_test" is skipped without a warning."""
        negative_case = {
            "case_id": "qty_green_beans_temp",
            "expected_plant": "green_beans",
            "expected_stage": "vegetative",
            "ground_truth": "some AI estimate",
            "is_negative_test": True,
        }
        issued = validate_ground_truths([negative_case])
        assert len(issued) == 0

    def test_alias_resolved(self):
        """The "bok_choy" alias with this ground-truth text yields no warning."""
        alias_case = {
            "case_id": "qty_bok_choy_humidity",
            "expected_plant": "bok_choy",
            "expected_stage": "vegetative",
            "ground_truth": "kelembaban 60-85%, suhu 27, cahaya 20000",
        }
        issued = validate_ground_truths([alias_case])
        assert len(issued) == 0

    def test_stale_ground_truth_warns(self):
        """Ground-truth text lacking the optimal value produces one "may be stale" warning."""
        stale_case = {
            "case_id": "test_stale",
            "expected_plant": "lettuce",
            "expected_stage": "vegetative",
            "ground_truth": "some text that does not contain the optimal value",
        }
        issued = validate_ground_truths([stale_case])
        assert len(issued) == 1
        only_warning = issued[0]
        assert "test_stale" in only_warning
        assert "may be stale" in only_warning