# Source: PGC-AI-Chatbot / tests/test_evaluation_framework.py
# Repository page header captured with the file (not part of the module):
#   "Jacooo's picture" / "Deploy from GitHub: 991875b" / "e5c5d28 verified"
"""
Unit tests for the RAGAS evaluation framework checkers and validators.
Tests pure functions that do NOT require API calls (gpt-4o-mini / Cerebras).
"""
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import pytest
from scripts.evaluate_ragas import (
_detect_language,
_resolve_plant_id,
NumericalRigorChecker,
TemporalAdherenceChecker,
ConstraintSatisfactionChecker,
TerminologyNuanceChecker,
validate_ground_truths,
)
# =============================================================================
# _detect_language
# =============================================================================
class TestDetectLanguage:
    """Pin the language code `_detect_language` returns for sample questions."""

    def test_id_question_suhu(self):
        """An Indonesian temperature question is reported as "id"."""
        question = "Berapa suhu ideal untuk selada?"
        assert _detect_language(question) == "id"

    def test_id_question_berapa(self):
        """An Indonesian humidity question is reported as "id"."""
        question = "Berapa kelembaban untuk cabai?"
        assert _detect_language(question) == "id"

    def test_id_question_fase(self):
        """An Indonesian growth-phase question is reported as "id"."""
        question = "Parameter pada fase vegetatif?"
        assert _detect_language(question) == "id"

    def test_en_question_what(self):
        """An English "What ..." question is reported as "en"."""
        question = "What is the optimal temperature?"
        assert _detect_language(question) == "en"

    def test_en_question_how(self):
        """An English "How ..." question is reported as "en"."""
        question = "How much light does tomato need?"
        assert _detect_language(question) == "en"

    def test_en_question_mixed(self):
        """An English sentence containing Indonesian words is still "en"."""
        question = "What is suhu ideal for lettuce?"
        assert _detect_language(question) == "en"

    def test_en_empty(self):
        """The empty string is reported as "en"."""
        question = ""
        assert _detect_language(question) == "en"

    def test_en_numbers_only(self):
        """Digits-only input is reported as "en"."""
        question = "123 456 789"
        assert _detect_language(question) == "en"
# =============================================================================
# _resolve_plant_id
# =============================================================================
class TestResolvePlantId:
    """Pin the identifier `_resolve_plant_id` returns for ids and aliases."""

    def test_direct_id(self):
        """"lettuce" is returned unchanged."""
        resolved = _resolve_plant_id("lettuce")
        assert resolved == "lettuce"

    def test_alias_bok_choy(self):
        """"bok_choy" resolves to "pak_choy"."""
        resolved = _resolve_plant_id("bok_choy")
        assert resolved == "pak_choy"

    def test_alias_amaranth(self):
        """"amaranth" resolves to "spinach_amaranth"."""
        resolved = _resolve_plant_id("amaranth")
        assert resolved == "spinach_amaranth"

    def test_alias_spinach(self):
        """"spinach" resolves to "spinach_amaranth"."""
        resolved = _resolve_plant_id("spinach")
        assert resolved == "spinach_amaranth"

    def test_unknown_id_passthrough(self):
        """"green_beans" is expected to come back unchanged."""
        resolved = _resolve_plant_id("green_beans")
        assert resolved == "green_beans"
# =============================================================================
# NumericalRigorChecker
# =============================================================================
class TestNumericalRigorChecker:
    """Tests for `NumericalRigorChecker.evaluate_answer`.

    Most tests share the `gt` fixture, a ground-truth dict with
    "temperature", "humidity" and "light" entries, each carrying
    "value", "min" and "max".
    """

    @pytest.fixture
    def gt(self):
        """Return the shared ground-truth dict used by the tests below."""
        temperature = {"value": 20.0, "min": 16.0, "max": 24.0}
        humidity = {"value": 73.0, "min": 60.0, "max": 85.0}
        light = {"value": 18000, "min": 14000, "max": 22000}
        return {
            "temperature": temperature,
            "humidity": humidity,
            "light": light,
        }

    def test_requested_param_subset_skips_unasked_fields(self, gt):
        """Humidity-only queries should not fail because temperature/light are absent."""
        answer = "Kelembaban ideal 73% RH."
        # Accepts set-like input, but output should be a stable list for deterministic results
        report = NumericalRigorChecker.evaluate_answer(
            answer,
            gt,
            requested_params={"humidity"},  # input can be set-like
        )
        assert report["overall_pass"] is True
        assert report["requested_params"] == ["humidity"]  # output must be a stable list
        per_param = report["param_results"]
        assert per_param["humidity"]["status"] == "PASS"
        assert "temperature" not in per_param
        assert "light" not in per_param

    def test_unicode_thin_space_lux_parses(self, gt):
        """A lux value split by a U+202F thousands separator must parse as 19500, not 500."""
        # U+202F = NARROW NO-BREAK SPACE (thin space thousands separator)
        answer = "Cahaya optimal 19\u202f500 lux."
        report = NumericalRigorChecker.evaluate_answer(
            answer,
            gt,
            requested_params={"light"},
        )
        assert report["overall_pass"] is True
        assert report["param_results"]["light"]["status"] == "PASS"

    def test_pass_exact_optimal(self, gt):
        """Answer with the exact optimal temperature passes."""
        answer = "Suhu optimal 20°C dengan kelembaban 73% dan cahaya 18000 lux."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is True

    def test_pass_within_tolerance(self, gt):
        """Answer within ±0.5 of optimal passes for all params."""
        answer = "Suhu optimal 20.4°C, kelembaban 73% RH, cahaya 18000 lux."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is True

    def test_pass_within_range(self, gt):
        """Answer within the safety range [min, max] passes for all params."""
        answer = "Suhu 18°C, kelembaban 70% RH, cahaya 15000 lux."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is True

    def test_fail_outside_range(self, gt):
        """Answer outside both tolerance and safety range fails."""
        answer = "Suhu optimal 30°C."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is False

    def test_pass_range_answer(self, gt):
        """Range answer that contains the optimal value passes for all params."""
        answer = "Suhu 18-24°C, kelembaban 70-75% RH, cahaya 15000-20000 lux."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is True

    def test_pass_all_params(self, gt):
        """All three parameters correct passes."""
        answer = "Suhu 20°C, kelembaban 73% RH, cahaya 18000 lux."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is True

    def test_fail_one_param(self, gt):
        """One wrong parameter causes overall fail."""
        answer = "Suhu 20°C, kelembaban 73%, cahaya 50000 lux."
        report = NumericalRigorChecker.evaluate_answer(answer, gt)
        assert report["overall_pass"] is False
        assert report["param_results"]["light"]["status"] == "FAIL"

    def test_no_ground_truth_skips(self):
        """Empty ground truth dict should skip all checks and pass."""
        report = NumericalRigorChecker.evaluate_answer("any answer", {})
        assert report["overall_pass"] is True

    def test_partial_ground_truth(self):
        """Only humidity provided, temperature/light should skip."""
        # Deliberately a local dict, not the fixture: only humidity is defined.
        gt = {"humidity": {"value": 73.0, "min": 60.0, "max": 85.0}}
        report = NumericalRigorChecker.evaluate_answer("kelembaban 73%", gt)
        assert report["overall_pass"] is True
# =============================================================================
# TemporalAdherenceChecker
# =============================================================================
class TestTemporalAdherenceChecker:
    """Tests for `TemporalAdherenceChecker.check`, called with an answer text and an expected phase."""

    def test_general_phase_not_applicable(self):
        """Phase "general" passes and is flagged as not applicable."""
        verdict = TemporalAdherenceChecker.check("Some text", "general")
        assert verdict["pass"] is True
        assert verdict["applicable"] is False

    def test_none_phase_not_applicable(self):
        """Phase None passes and is flagged as not applicable."""
        verdict = TemporalAdherenceChecker.check("Some text", None)
        assert verdict["pass"] is True
        assert verdict["applicable"] is False

    def test_day_phase_correct_id(self):
        """Indonesian answer mentioning the day cycle passes for phase "day"."""
        answer = "Pada siklus siang, suhu 24°C"
        verdict = TemporalAdherenceChecker.check(answer, "day")
        assert verdict["pass"] is True

    def test_day_phase_correct_en(self):
        """English answer mentioning the day cycle passes for phase "day"."""
        answer = "During day cycle, temp is 24°C"
        verdict = TemporalAdherenceChecker.check(answer, "day")
        assert verdict["pass"] is True

    def test_night_phase_correct_id(self):
        """Indonesian answer mentioning the night cycle passes for phase "night"."""
        answer = "Pada siklus malam, suhu 18°C"
        verdict = TemporalAdherenceChecker.check(answer, "night")
        assert verdict["pass"] is True

    def test_night_phase_correct_en(self):
        """English answer mentioning the night cycle passes for phase "night"."""
        answer = "During night cycle, temp is 18°C"
        verdict = TemporalAdherenceChecker.check(answer, "night")
        assert verdict["pass"] is True

    def test_day_phase_wrong(self):
        """Answer references night but expected day — should fail."""
        verdict = TemporalAdherenceChecker.check("Pada siklus malam", "day")
        assert verdict["pass"] is False

    def test_night_phase_wrong(self):
        """Answer references day but expected night — should fail."""
        verdict = TemporalAdherenceChecker.check("Pada siklus siang", "night")
        assert verdict["pass"] is False

    def test_no_live_data_skip(self):
        """An offline-chamber answer passes with status "NO_LIVE_DATA"."""
        answer = "Chamber is not currently online"
        verdict = TemporalAdherenceChecker.check(answer, "day")
        assert verdict["pass"] is True
        assert verdict.get("status") == "NO_LIVE_DATA"
# =============================================================================
# ConstraintSatisfactionChecker
# =============================================================================
class TestConstraintSatisfactionChecker:
    """Tests for `ConstraintSatisfactionChecker.check`.

    Each call passes an answer, a query and a boolean flag positionally;
    the tests pin the resulting "pass" value and, where asserted, "mode".
    """

    def test_state_a_qualitative_always_pass(self):
        """State A (use_structured_params=False) always passes."""
        answer = "pH ideal untuk tanaman adalah 6.5"
        query = "any query"
        verdict = ConstraintSatisfactionChecker.check(answer, query, False)
        assert verdict["pass"] is True
        assert verdict["mode"] == "qualitative_quoted"

    def test_state_b_guarded_clean(self):
        """State B (guarded) with no forbidden terms passes."""
        answer = "Suhu ideal 20°C"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "guarded"

    def test_state_b_guarded_with_forbidden(self):
        """State B with forbidden term outside breadcrumb fails."""
        answer = "Suhu ideal 20°C dan pH 6.5"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "unprompted_hallucination"

    def test_state_b_breadcrumb_ignored(self):
        """Forbidden terms inside system breadcrumb are allowed."""
        answer = "Suhu 20°C. Parameter terverifikasi PGC hanya mencakup suhu, kelembaban, dan cahaya"
        query = "suhu selada"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True

    def test_state_c_explicit_request_with_warning(self):
        """State C (explicit request) passes with bifurcation warning."""
        answer = "pH ideal 6.5. ⚠️ Parameter pH di luar kendali otomatis PGC."
        query = "berapa pH untuk tomat"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is True
        assert verdict["mode"] == "explicit_request_warned"

    def test_state_c_explicit_request_no_warning(self):
        """State C fails when warning is missing."""
        answer = "pH ideal 6.5."
        query = "berapa pH"
        verdict = ConstraintSatisfactionChecker.check(answer, query, True)
        assert verdict["pass"] is False
        assert verdict["mode"] == "explicit_request_unwarned"
# =============================================================================
# TerminologyNuanceChecker
# =============================================================================
class TestTerminologyNuanceChecker:
    """Tests for the per-term `check_*` methods of `TerminologyNuanceChecker`."""

    def test_kecambah_tunas_correct(self):
        """An answer naming "mung bean sprouts" passes."""
        text = "mung bean sprouts memerlukan kelembaban tinggi pada fase vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is True

    def test_kecambah_tunas_with_toge(self):
        """An answer using "toge" passes."""
        text = "toge memerlukan kelembaban tinggi untuk pertumbuhan vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is True

    def test_kecambah_tunas_missing_mung(self):
        """An answer with neither "mung bean" nor "toge" fails."""
        text = "tanaman memerlukan cahaya pada fase vegetatif"
        verdict = TerminologyNuanceChecker.check_kecambah_tunas(text)
        assert verdict["pass"] is False

    def test_layu_fusarium_correct(self):
        """An answer that names Fusarium passes."""
        text = "Fusarium menyebabkan layu pada pembuluh batang"
        verdict = TerminologyNuanceChecker.check_layu_fusarium(text)
        assert verdict["pass"] is True

    def test_layu_fusarium_missing(self):
        """A wilting answer that never names Fusarium fails."""
        text = "tanaman layu karena kekeringan"
        verdict = TerminologyNuanceChecker.check_layu_fusarium(text)
        assert verdict["pass"] is False

    def test_busuk_akar_pythium_correct(self):
        """An answer that names Pythium passes."""
        text = "Pythium adalah patogen yang disebabkan oleh jamur air"
        verdict = TerminologyNuanceChecker.check_busuk_akar_pythium(text)
        assert verdict["pass"] is True

    def test_kacang_hijau_correct(self):
        """"kacang hijau" described as "mung bean" passes."""
        text = "kacang hijau adalah mung bean"
        verdict = TerminologyNuanceChecker.check_kacang_hijau(text)
        assert verdict["pass"] is True

    def test_kacang_hijau_wrong(self):
        """"kacang hijau" described as "green bean" fails."""
        text = "kacang hijau adalah green bean"
        verdict = TerminologyNuanceChecker.check_kacang_hijau(text)
        assert verdict["pass"] is False

    def test_baginda_f1_correct(self):
        """An answer describing Baginda F1 as a "semangka" variety passes."""
        text = "Baginda F1 adalah varietas semangka dengan suhu optimal 24°C"
        verdict = TerminologyNuanceChecker.check_baginda_f1(text)
        assert verdict["pass"] is True
# =============================================================================
# validate_ground_truths
# =============================================================================
class TestValidateGroundTruths:
    """Tests for `validate_ground_truths`, which takes a list of case dicts and returns warnings.

    NOTE(review): the alias and stale-ground-truth cases presumably depend on
    the plant data the evaluated module loads — confirm against that module.
    """

    def test_plant_not_in_db_skips(self):
        """Case with no expected_plant should not crash."""
        case = {"case_id": "test", "ground_truth": "some text"}
        emitted = validate_ground_truths([case])
        assert len(emitted) == 0

    def test_negative_test_skipped(self):
        """Negative tests should be skipped without warning."""
        case = {
            "case_id": "qty_green_beans_temp",
            "expected_plant": "green_beans",
            "expected_stage": "vegetative",
            "ground_truth": "some AI estimate",
            "is_negative_test": True,
        }
        emitted = validate_ground_truths([case])
        assert len(emitted) == 0

    def test_alias_resolved(self):
        """Alias like bok_choy should resolve to pak_choy (no warning for valid GT)."""
        case = {
            "case_id": "qty_bok_choy_humidity",
            "expected_plant": "bok_choy",
            "expected_stage": "vegetative",
            "ground_truth": "kelembaban 60-85%, suhu 27, cahaya 20000",
        }
        emitted = validate_ground_truths([case])
        assert len(emitted) == 0

    def test_stale_ground_truth_warns(self):
        """If optimal temp not in ground truth text, emit warning."""
        case = {
            "case_id": "test_stale",
            "expected_plant": "lettuce",
            "expected_stage": "vegetative",
            "ground_truth": "some text that does not contain the optimal value",
        }
        emitted = validate_ground_truths([case])
        assert len(emitted) == 1
        # The single warning must identify the case and flag it as stale.
        only_warning = emitted[0]
        assert "test_stale" in only_warning
        assert "may be stale" in only_warning