Spaces:
Sleeping
Sleeping
| # training/parser_eval.py | |
| # ------------------------------------------------------------ | |
| # Parser Evaluation (Stage 10A) | |
| # | |
| # This version ONLY evaluates: | |
| # - Rule parser | |
| # - Extended parser | |
| # | |
| # The LLM parser is intentionally disabled at this stage | |
| # because alias maps and schema are not trained yet. | |
| # | |
| # This makes Stage 10A FAST and stable (< 3 seconds). | |
| # ------------------------------------------------------------ | |
| import json | |
| import os | |
| from typing import Dict, Any | |
| from engine.parser_rules import parse_text_rules | |
| from engine.parser_ext import parse_text_extended | |
# Path to the gold tests (a JSON file). The path is relative, so it is
# resolved against the process's current working directory when opened.
GOLD_PATH = "training/gold_tests.json"
def _fold_case(value: Any) -> Any:
    """Lower-case strings for comparison; return any other value unchanged."""
    return value.lower() if isinstance(value, str) else value


def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate one gold test with rules + extended parsers.

    Args:
        test: One gold-test record. Reads "input" (the text to parse,
            default "") and "expected" (mapping of field name to expected
            value, default empty).

    Returns:
        A dict with:
          - "correct":  number of scored fields whose parsed value matched.
          - "total":    number of scored fields. Fields that are expected to
                        be "Unknown" and were parsed as "Unknown" are neutral
                        and are excluded, so they neither raise nor lower
                        the accuracy.
          - "accuracy": correct / total, or 0 when nothing was scored.
          - "wrong":    field -> {"expected": ..., "got": ...} for mismatches.
          - "merged":   the merged parser output that was scored.
    """
    text = test.get("input", "")
    # `or {}` also covers an explicit `"expected": null` in the gold file.
    expected = test.get("expected") or {}

    # Run deterministic parsers
    rule_out = parse_text_rules(text).get("parsed_fields", {})
    ext_out = parse_text_extended(text).get("parsed_fields", {})

    # Merge rule + extended: an extended value replaces the rule value
    # unless the extended parser reported "Unknown" for that field.
    merged = dict(rule_out)
    merged.update({k: v for k, v in ext_out.items() if v != "Unknown"})

    correct = 0
    neutral = 0
    wrong = {}
    for field, exp_val in expected.items():
        got = merged.get(field, "Unknown")
        # Case-insensitive for strings; non-string values (numbers, None,
        # lists) are compared as-is instead of crashing on `.lower()`.
        if _fold_case(got) != _fold_case(exp_val):
            wrong[field] = {"expected": exp_val, "got": got}
        elif _fold_case(exp_val) == "unknown":
            # Unknown is neutral: not counted as correct and not counted
            # in the denominator either.
            neutral += 1
        else:
            correct += 1

    total = len(expected) - neutral
    return {
        "correct": correct,
        "total": total,
        "accuracy": correct / total if total else 0,
        "wrong": wrong,
        "merged": merged,
    }
def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
    """
    Evaluate ALL gold tests using rules + extended parsing only.

    Args:
        mode: Accepted for call compatibility but not consulted; the
            summary always reports "rules+extended".

    Returns:
        On success, a summary dict with "mode", "tests" (number of gold
        tests), "total_correct", "total_fields", "overall_accuracy"
        (0 when no fields were scored) and "wrong_cases" (one entry per
        test that had at least one mismatching field).
        If the gold file does not exist, {"error": "<message>"} instead.

    Raises:
        json.JSONDecodeError: if the gold file is not valid JSON.
    """
    # Open directly and handle the failure rather than checking for
    # existence first; the file could disappear between check and open.
    try:
        with open(GOLD_PATH, "r", encoding="utf-8") as f:
            gold = json.load(f)
    except FileNotFoundError:
        return {"error": f"Gold test file not found at {GOLD_PATH}"}

    wrong_cases = []
    total_correct = 0
    total_fields = 0

    for test in gold:
        out = evaluate_single_test(test)
        total_correct += out["correct"]
        total_fields += out["total"]

        # Only tests with at least one mismatch are reported in detail.
        if out["wrong"]:
            wrong_cases.append({
                "name": test.get("name", "Unnamed"),
                "wrong": out["wrong"],
                "parsed": out["merged"],
                "expected": test.get("expected", {}),
            })

    return {
        "mode": "rules+extended",
        "tests": len(gold),
        "total_correct": total_correct,
        "total_fields": total_fields,
        "overall_accuracy": total_correct / total_fields if total_fields else 0,
        "wrong_cases": wrong_cases,
    }