packetcourt / scripts /evaluate.py
DIV-45's picture
feat: integrate MiniCPM-V label extraction
59446fc verified
Raw
History Blame Contribute Delete
2.41 kB
from __future__ import annotations
import json
import sys
from pathlib import Path
# Parent of the directory that holds this script (resolved to an absolute path).
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT/src at the front of the module search path; this must run before
# the `packetcourt` import that follows.
sys.path.insert(0, str(ROOT / "src"))
from packetcourt import audit_packet
def load_cases(path: Path) -> list[dict]:
    """Read golden cases from a JSON Lines file.

    Every non-blank line of *path* is parsed as one JSON document; empty and
    whitespace-only lines are skipped.

    Args:
        path: Location of the JSON Lines file to read.

    Returns:
        The parsed documents, in file order. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which varies by OS/locale and can garble non-ASCII text.
    with path.open(encoding="utf-8") as handle:
        # Iterating the handle breaks records only on real newlines
        # (\n, \r\n, \r). str.splitlines() would also break on characters
        # such as U+2028/U+2029, which may legally appear unescaped inside a
        # JSON string and would cut a valid record in half.
        return [json.loads(line) for line in handle if line.strip()]
def evaluate_case(case: dict) -> tuple[int, int, list[str]]:
    """Run the packet audit for one golden case and score it.

    The case's ``front_text`` and ``back_text`` are passed to
    ``audit_packet``; the resulting audit is then compared with whichever
    ``expected_*`` entries the case provides. ``expected_claims`` is
    required; every other expectation is checked only when present.

    Args:
        case: One golden-case record (a parsed JSON object).

    Returns:
        A ``(passed, total, failures)`` triple: the number of checks that
        passed, the number of checks performed, and the labels of the checks
        that failed, in the order they were performed.
    """
    audit = audit_packet(case["front_text"], case["back_text"])

    failures: list[str] = []
    total = 0

    def record(ok: bool, label: str) -> None:
        # Count every check; remember the label only when it did not pass.
        nonlocal total
        total += 1
        if not ok:
            failures.append(label)

    # The set of claims found must match the expected set exactly.
    found_claims = {item.claim for item in audit.claims}
    record(found_claims == set(case["expected_claims"]), "claim set")

    # Per-claim verdicts; a claim absent from the audit compares as None.
    verdict_by_claim = {item.claim: item.verdict.value for item in audit.claims}
    for claim_name, wanted in case.get("expected_verdicts", {}).items():
        record(verdict_by_claim.get(claim_name) == wanted, f"{claim_name} verdict")

    # Case-insensitive substring search over all persuasion-gap finding text.
    gap_fragments = [
        f"{finding.headline} {finding.front_impression} {finding.quiet_context}"
        for finding in audit.persuasion_gap
    ]
    gap_text = " ".join(gap_fragments).lower()
    for term in case.get("expected_gap_terms", []):
        record(term.lower() in gap_text, f"gap contains {term}")

    # Optional exact-match expectations. The getters are lazy so an audit
    # attribute is only touched when the case actually asks for it.
    optional_checks = (
        ("expected_best_before", lambda: audit.expiry.best_before, "best-before date"),
        (
            "expected_after_opening",
            lambda: audit.expiry.after_opening_instruction,
            "after-opening instruction",
        ),
        (
            "expected_sugar_teaspoons",
            lambda: audit.whole_packet.sugar_teaspoons,
            "sugar teaspoons",
        ),
    )
    for key, read_actual, label in optional_checks:
        if key in case:
            record(read_actual() == case[key], label)

    return total - len(failures), total, failures
def main() -> int:
    """Evaluate every golden case and print a per-case and overall report.

    Cases are loaded from ``data/golden_cases.jsonl`` under ``ROOT``. Each
    case gets a PASS/FAIL line followed by one line per failed check, and a
    summary line is printed at the end.

    Returns:
        ``0`` when every check passed, ``1`` otherwise.
    """
    golden_path = ROOT / "data" / "golden_cases.jsonl"
    cases = load_cases(golden_path)

    passed = 0
    total = 0
    for case in cases:
        ok_count, check_count, failed_labels = evaluate_case(case)
        passed += ok_count
        total += check_count

        status = "FAIL" if failed_labels else "PASS"
        print(f"{status} {case['id']} {case['title']}: {ok_count}/{check_count}")
        for label in failed_labels:
            print(f" - {label}")

    print(f"\nPacketCourt golden evaluation: {passed}/{total} checks passed across {len(cases)} cases.")
    # Exit status: non-zero as soon as any single check failed.
    return 1 if passed != total else 0
# Script entry point: run the evaluation and use its result as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())