Spaces:

Jacooo
/

PGC-AI-Chatbot

Running

App Files Files Community

PGC-AI-Chatbot / tests /test_evaluation_framework.py

Jacooo

Deploy from GitHub: 991875b

e5c5d28 verified 7 days ago

raw

history blame contribute delete

15 kB

	"""
	Unit tests for the RAGAS evaluation framework checkers and validators.

	Tests pure functions that do NOT require API calls (gpt-4o-mini / Cerebras).
	"""

	import json
	import sys
	from pathlib import Path

	# Put the directory one level above this file's folder at the front of
	# sys.path so that the `scripts.evaluate_ragas` import below can be resolved.
	sys.path.insert(0, str(Path(__file__).parent.parent))

	import pytest

	from scripts.evaluate_ragas import (
	_detect_language,
	_resolve_plant_id,
	NumericalRigorChecker,
	TemporalAdherenceChecker,
	ConstraintSatisfactionChecker,
	TerminologyNuanceChecker,
	validate_ground_truths,
	)

	# =============================================================================
	# _detect_language
	# =============================================================================

	class TestDetectLanguage:
	    """Expectations for the language code returned by ``_detect_language``."""

	    def test_id_question_suhu(self):
	        """An Indonesian question about "suhu" is expected to yield "id"."""
	        question = "Berapa suhu ideal untuk selada?"
	        detected = _detect_language(question)
	        assert detected == "id"

	    def test_id_question_berapa(self):
	        """An Indonesian question starting with "Berapa" is expected to yield "id"."""
	        question = "Berapa kelembaban untuk cabai?"
	        detected = _detect_language(question)
	        assert detected == "id"

	    def test_id_question_fase(self):
	        """An Indonesian question mentioning "fase" is expected to yield "id"."""
	        question = "Parameter pada fase vegetatif?"
	        detected = _detect_language(question)
	        assert detected == "id"

	    def test_en_question_what(self):
	        """An English "What ..." question is expected to yield "en"."""
	        question = "What is the optimal temperature?"
	        detected = _detect_language(question)
	        assert detected == "en"

	    def test_en_question_how(self):
	        """An English "How ..." question is expected to yield "en"."""
	        question = "How much light does tomato need?"
	        detected = _detect_language(question)
	        assert detected == "en"

	    def test_en_question_mixed(self):
	        """An English question containing Indonesian words is expected to yield "en"."""
	        question = "What is suhu ideal for lettuce?"
	        detected = _detect_language(question)
	        assert detected == "en"

	    def test_en_empty(self):
	        """The empty string is expected to yield "en"."""
	        detected = _detect_language("")
	        assert detected == "en"

	    def test_en_numbers_only(self):
	        """A digits-only string is expected to yield "en"."""
	        question = "123 456 789"
	        detected = _detect_language(question)
	        assert detected == "en"


	# =============================================================================
	# _resolve_plant_id
	# =============================================================================

	class TestResolvePlantId:
	    """Expectations for the plant id returned by ``_resolve_plant_id``."""

	    def test_direct_id(self):
	        """The id "lettuce" is expected to come back unchanged."""
	        resolved = _resolve_plant_id("lettuce")
	        assert resolved == "lettuce"

	    def test_alias_bok_choy(self):
	        """The alias "bok_choy" is expected to resolve to "pak_choy"."""
	        resolved = _resolve_plant_id("bok_choy")
	        assert resolved == "pak_choy"

	    def test_alias_amaranth(self):
	        """The alias "amaranth" is expected to resolve to "spinach_amaranth"."""
	        resolved = _resolve_plant_id("amaranth")
	        assert resolved == "spinach_amaranth"

	    def test_alias_spinach(self):
	        """The alias "spinach" is expected to resolve to "spinach_amaranth"."""
	        resolved = _resolve_plant_id("spinach")
	        assert resolved == "spinach_amaranth"

	    def test_unknown_id_passthrough(self):
	        """The id "green_beans" is expected to pass through unchanged."""
	        resolved = _resolve_plant_id("green_beans")
	        assert resolved == "green_beans"


	# =============================================================================
	# NumericalRigorChecker
	# =============================================================================

	class TestNumericalRigorChecker:
	    """Expectations for ``NumericalRigorChecker.evaluate_answer``."""

	    @pytest.fixture
	    def gt(self):
	        """Ground truth with value/min/max for temperature, humidity and light."""
	        temperature = {"value": 20.0, "min": 16.0, "max": 24.0}
	        humidity = {"value": 73.0, "min": 60.0, "max": 85.0}
	        light = {"value": 18000, "min": 14000, "max": 22000}
	        return {
	            "temperature": temperature,
	            "humidity": humidity,
	            "light": light,
	        }

	    def test_requested_param_subset_skips_unasked_fields(self, gt):
	        """A humidity-only query must not fail just because temperature/light are absent."""
	        reply = "Kelembaban ideal 73% RH."
	        # The requested parameters are handed over as a set here ...
	        report = NumericalRigorChecker.evaluate_answer(
	            reply,
	            gt,
	            requested_params={"humidity"},
	        )
	        assert report["overall_pass"] is True
	        # ... while the report is expected to echo them back as a list.
	        assert report["requested_params"] == ["humidity"]
	        per_param = report["param_results"]
	        assert per_param["humidity"]["status"] == "PASS"
	        assert "temperature" not in per_param
	        assert "light" not in per_param

	    def test_unicode_thin_space_lux_parses(self, gt):
	        """A lux value written with a U+202F thousands separator must parse as 19500, not 500."""
	        # \u202f is NARROW NO-BREAK SPACE, used here as the thousands separator.
	        reply = "Cahaya optimal 19\u202f500 lux."
	        report = NumericalRigorChecker.evaluate_answer(
	            reply,
	            gt,
	            requested_params={"light"},
	        )
	        assert report["overall_pass"] is True
	        assert report["param_results"]["light"]["status"] == "PASS"

	    def test_pass_exact_optimal(self, gt):
	        """An answer quoting the exact optimal values is expected to pass."""
	        reply = "Suhu optimal 20°C dengan kelembaban 73% dan cahaya 18000 lux."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is True

	    def test_pass_within_tolerance(self, gt):
	        """An answer within ±0.5 of the optimal value is expected to pass for all params."""
	        reply = "Suhu optimal 20.4°C, kelembaban 73% RH, cahaya 18000 lux."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is True

	    def test_pass_within_range(self, gt):
	        """An answer inside the safety range [min, max] is expected to pass for all params."""
	        reply = "Suhu 18°C, kelembaban 70% RH, cahaya 15000 lux."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is True

	    def test_fail_outside_range(self, gt):
	        """An answer outside both the tolerance and the safety range is expected to fail."""
	        reply = "Suhu optimal 30°C."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is False

	    def test_pass_range_answer(self, gt):
	        """A range answer that contains the optimal value is expected to pass for all params."""
	        reply = "Suhu 18-24°C, kelembaban 70-75% RH, cahaya 15000-20000 lux."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is True

	    def test_pass_all_params(self, gt):
	        """An answer with all three parameters correct is expected to pass."""
	        reply = "Suhu 20°C, kelembaban 73% RH, cahaya 18000 lux."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is True

	    def test_fail_one_param(self, gt):
	        """A single wrong parameter is expected to make the overall result fail."""
	        reply = "Suhu 20°C, kelembaban 73%, cahaya 50000 lux."
	        report = NumericalRigorChecker.evaluate_answer(reply, gt)
	        assert report["overall_pass"] is False
	        assert report["param_results"]["light"]["status"] == "FAIL"

	    def test_no_ground_truth_skips(self):
	        """An empty ground-truth dict is expected to skip every check and pass."""
	        report = NumericalRigorChecker.evaluate_answer("any answer", {})
	        assert report["overall_pass"] is True

	    def test_partial_ground_truth(self):
	        """With only humidity in the ground truth, temperature/light are expected to be skipped."""
	        humidity_only = {"humidity": {"value": 73.0, "min": 60.0, "max": 85.0}}
	        report = NumericalRigorChecker.evaluate_answer("kelembaban 73%", humidity_only)
	        assert report["overall_pass"] is True


	# =============================================================================
	# TemporalAdherenceChecker
	# =============================================================================

	class TestTemporalAdherenceChecker:
	    """Expectations for ``TemporalAdherenceChecker.check(answer, phase)``."""

	    def test_general_phase_not_applicable(self):
	        """The "general" phase is expected to pass and be flagged as not applicable."""
	        verdict = TemporalAdherenceChecker.check("Some text", "general")
	        assert verdict["pass"] is True
	        assert verdict["applicable"] is False

	    def test_none_phase_not_applicable(self):
	        """A ``None`` phase is expected to pass and be flagged as not applicable."""
	        verdict = TemporalAdherenceChecker.check("Some text", None)
	        assert verdict["pass"] is True
	        assert verdict["applicable"] is False

	    def test_day_phase_correct_id(self):
	        """An Indonesian answer referencing the day cycle is expected to pass for "day"."""
	        reply = "Pada siklus siang, suhu 24°C"
	        verdict = TemporalAdherenceChecker.check(reply, "day")
	        assert verdict["pass"] is True

	    def test_day_phase_correct_en(self):
	        """An English answer referencing the day cycle is expected to pass for "day"."""
	        reply = "During day cycle, temp is 24°C"
	        verdict = TemporalAdherenceChecker.check(reply, "day")
	        assert verdict["pass"] is True

	    def test_night_phase_correct_id(self):
	        """An Indonesian answer referencing the night cycle is expected to pass for "night"."""
	        reply = "Pada siklus malam, suhu 18°C"
	        verdict = TemporalAdherenceChecker.check(reply, "night")
	        assert verdict["pass"] is True

	    def test_night_phase_correct_en(self):
	        """An English answer referencing the night cycle is expected to pass for "night"."""
	        reply = "During night cycle, temp is 18°C"
	        verdict = TemporalAdherenceChecker.check(reply, "night")
	        assert verdict["pass"] is True

	    def test_day_phase_wrong(self):
	        """An answer that references night while day is expected should fail."""
	        reply = "Pada siklus malam"
	        verdict = TemporalAdherenceChecker.check(reply, "day")
	        assert verdict["pass"] is False

	    def test_night_phase_wrong(self):
	        """An answer that references day while night is expected should fail."""
	        reply = "Pada siklus siang"
	        verdict = TemporalAdherenceChecker.check(reply, "night")
	        assert verdict["pass"] is False

	    def test_no_live_data_skip(self):
	        """An answer saying the chamber is offline is expected to pass with status NO_LIVE_DATA."""
	        reply = "Chamber is not currently online"
	        verdict = TemporalAdherenceChecker.check(reply, "day")
	        assert verdict["pass"] is True
	        assert verdict.get("status") == "NO_LIVE_DATA"


	# =============================================================================
	# ConstraintSatisfactionChecker
	# =============================================================================

	class TestConstraintSatisfactionChecker:
	    """Expectations for ``ConstraintSatisfactionChecker.check`` across states A, B and C."""

	    def test_state_a_qualitative_always_pass(self):
	        """State A (use_structured_params=False) is expected to pass unconditionally."""
	        reply = "pH ideal untuk tanaman adalah 6.5"
	        verdict = ConstraintSatisfactionChecker.check(reply, "any query", False)
	        assert verdict["pass"] is True
	        assert verdict["mode"] == "qualitative_quoted"

	    def test_state_b_guarded_clean(self):
	        """State B (guarded) without forbidden terms is expected to pass."""
	        reply = "Suhu ideal 20°C"
	        verdict = ConstraintSatisfactionChecker.check(reply, "suhu selada", True)
	        assert verdict["pass"] is True
	        assert verdict["mode"] == "guarded"

	    def test_state_b_guarded_with_forbidden(self):
	        """State B with a forbidden term outside the breadcrumb is expected to fail."""
	        reply = "Suhu ideal 20°C dan pH 6.5"
	        verdict = ConstraintSatisfactionChecker.check(reply, "suhu selada", True)
	        assert verdict["pass"] is False
	        assert verdict["mode"] == "unprompted_hallucination"

	    def test_state_b_breadcrumb_ignored(self):
	        """Forbidden terms that appear inside the system breadcrumb are tolerated."""
	        reply = "Suhu 20°C. Parameter terverifikasi PGC hanya mencakup suhu, kelembaban, dan cahaya"
	        verdict = ConstraintSatisfactionChecker.check(reply, "suhu selada", True)
	        assert verdict["pass"] is True

	    def test_state_c_explicit_request_with_warning(self):
	        """State C (explicit request) is expected to pass when the bifurcation warning is present."""
	        reply = "pH ideal 6.5. ⚠️ Parameter pH di luar kendali otomatis PGC."
	        query = "berapa pH untuk tomat"
	        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
	        assert verdict["pass"] is True
	        assert verdict["mode"] == "explicit_request_warned"

	    def test_state_c_explicit_request_no_warning(self):
	        """State C is expected to fail when the warning is missing."""
	        reply = "pH ideal 6.5."
	        query = "berapa pH"
	        verdict = ConstraintSatisfactionChecker.check(reply, query, True)
	        assert verdict["pass"] is False
	        assert verdict["mode"] == "explicit_request_unwarned"


	# =============================================================================
	# TerminologyNuanceChecker
	# =============================================================================

	class TestTerminologyNuanceChecker:
	    """Expectations for the per-term checks exposed by ``TerminologyNuanceChecker``."""

	    def test_kecambah_tunas_correct(self):
	        """An answer naming "mung bean sprouts" is expected to pass the kecambah/tunas check."""
	        reply = "mung bean sprouts memerlukan kelembaban tinggi pada fase vegetatif"
	        verdict = TerminologyNuanceChecker.check_kecambah_tunas(reply)
	        assert verdict["pass"] is True

	    def test_kecambah_tunas_with_toge(self):
	        """An answer naming "toge" is expected to pass the kecambah/tunas check."""
	        reply = "toge memerlukan kelembaban tinggi untuk pertumbuhan vegetatif"
	        verdict = TerminologyNuanceChecker.check_kecambah_tunas(reply)
	        assert verdict["pass"] is True

	    def test_kecambah_tunas_missing_mung(self):
	        """An answer naming neither mung bean sprouts nor toge is expected to fail."""
	        reply = "tanaman memerlukan cahaya pada fase vegetatif"
	        verdict = TerminologyNuanceChecker.check_kecambah_tunas(reply)
	        assert verdict["pass"] is False

	    def test_layu_fusarium_correct(self):
	        """An answer that mentions Fusarium is expected to pass the layu check."""
	        reply = "Fusarium menyebabkan layu pada pembuluh batang"
	        verdict = TerminologyNuanceChecker.check_layu_fusarium(reply)
	        assert verdict["pass"] is True

	    def test_layu_fusarium_missing(self):
	        """An answer about wilting that never mentions Fusarium is expected to fail."""
	        reply = "tanaman layu karena kekeringan"
	        verdict = TerminologyNuanceChecker.check_layu_fusarium(reply)
	        assert verdict["pass"] is False

	    def test_busuk_akar_pythium_correct(self):
	        """An answer that mentions Pythium is expected to pass the busuk-akar check."""
	        reply = "Pythium adalah patogen yang disebabkan oleh jamur air"
	        verdict = TerminologyNuanceChecker.check_busuk_akar_pythium(reply)
	        assert verdict["pass"] is True

	    def test_kacang_hijau_correct(self):
	        """Rendering "kacang hijau" as "mung bean" is expected to pass."""
	        reply = "kacang hijau adalah mung bean"
	        verdict = TerminologyNuanceChecker.check_kacang_hijau(reply)
	        assert verdict["pass"] is True

	    def test_kacang_hijau_wrong(self):
	        """Rendering "kacang hijau" as "green bean" is expected to fail."""
	        reply = "kacang hijau adalah green bean"
	        verdict = TerminologyNuanceChecker.check_kacang_hijau(reply)
	        assert verdict["pass"] is False

	    def test_baginda_f1_correct(self):
	        """An answer describing Baginda F1 as a semangka variety is expected to pass."""
	        reply = "Baginda F1 adalah varietas semangka dengan suhu optimal 24°C"
	        verdict = TerminologyNuanceChecker.check_baginda_f1(reply)
	        assert verdict["pass"] is True


	# =============================================================================
	# validate_ground_truths
	# =============================================================================

	class TestValidateGroundTruths:
	    """Expectations for the warnings returned by ``validate_ground_truths``."""

	    def test_plant_not_in_db_skips(self):
	        """A case without an ``expected_plant`` key must not crash and yields no warnings."""
	        case = {"case_id": "test", "ground_truth": "some text"}
	        issued = validate_ground_truths([case])
	        assert len(issued) == 0

	    def test_negative_test_skipped(self):
	        """A case flagged ``is_negative_test`` is expected to be skipped without a warning."""
	        case = {
	            "case_id": "qty_green_beans_temp",
	            "expected_plant": "green_beans",
	            "expected_stage": "vegetative",
	            "ground_truth": "some AI estimate",
	            "is_negative_test": True,
	        }
	        issued = validate_ground_truths([case])
	        assert len(issued) == 0

	    def test_alias_resolved(self):
	        """The alias bok_choy should resolve to pak_choy, so a valid ground truth yields no warning."""
	        case = {
	            "case_id": "qty_bok_choy_humidity",
	            "expected_plant": "bok_choy",
	            "expected_stage": "vegetative",
	            "ground_truth": "kelembaban 60-85%, suhu 27, cahaya 20000",
	        }
	        issued = validate_ground_truths([case])
	        assert len(issued) == 0

	    def test_stale_ground_truth_warns(self):
	        """A ground-truth text lacking the optimal temperature is expected to produce one warning."""
	        case = {
	            "case_id": "test_stale",
	            "expected_plant": "lettuce",
	            "expected_stage": "vegetative",
	            "ground_truth": "some text that does not contain the optimal value",
	        }
	        issued = validate_ground_truths([case])
	        assert len(issued) == 1
	        message = issued[0]
	        assert "test_stale" in message
	        assert "may be stale" in message