# training/parser_eval.py
# ------------------------------------------------------------
# Parser Evaluation (Stage 10A)
#
# This version ONLY evaluates:
#   - Rule parser
#   - Extended parser
#
# The LLM parser is intentionally disabled at this stage
# because alias maps and schema are not trained yet.
#
# This makes Stage 10A FAST and stable (< 3 seconds).
# ------------------------------------------------------------

import json
import os
from typing import Dict, Any

from engine.parser_rules import parse_text_rules
from engine.parser_ext import parse_text_extended


# Path to the gold tests
GOLD_PATH = "training/gold_tests.json"
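
# Assumed shape of gold_tests.json (inferred from the fields read
# below -- "name", "input", "expected" -- a sketch, not a committed
# schema): a JSON list of test objects, e.g.
#
#   [
#     {
#       "name": "example case",
#       "input": "raw text to parse ...",
#       "expected": {"field": "value", "other_field": "Unknown"}
#     }
#   ]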


def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate one gold test with rules + extended parsers.
    """
    text = test.get("input", "")
    expected = test.get("expected", {})

    # Run deterministic parsers
    rule_out = parse_text_rules(text).get("parsed_fields", {})
    ext_out = parse_text_extended(text).get("parsed_fields", {})

    # Merge rule + extended (extended overwrites rules,
    # except where the extended parser returned "Unknown")
    merged = dict(rule_out)
    for k, v in ext_out.items():
        if v != "Unknown":
            merged[k] = v

    total = 0
    correct = 0
    wrong = {}

    for field, exp_val in expected.items():
        if exp_val == "Unknown":
            continue  # "Unknown" expectations are neutral: excluded from scoring
        total += 1
        got = merged.get(field, "Unknown")
        if str(got).lower() == str(exp_val).lower():
            correct += 1
        else:
            wrong[field] = {"expected": exp_val, "got": got}

    return {
        "correct": correct,
        "total": total,
        "accuracy": correct / total if total else 0,
        "wrong": wrong,
        "merged": merged,
    }


def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
    """
    Evaluate ALL gold tests using rules + extended parsing only.

    `mode` is accepted for forward compatibility with later stages,
    but it is ignored here: Stage 10A always runs the deterministic
    rules + extended parsers and never the LLM parser.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold test file not found at {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    results = []
    wrong_cases = []

    total_correct = 0
    total_fields = 0

    for test in gold:
        out = evaluate_single_test(test)
        results.append(out)

        total_correct += out["correct"]
        total_fields += out["total"]

        if out["wrong"]:
            wrong_cases.append({
                "name": test.get("name", "Unnamed"),
                "wrong": out["wrong"],
                "parsed": out["merged"],
                "expected": test.get("expected", {})
            })

    summary = {
        "mode": "rules+extended",
        "tests": len(gold),
        "total_correct": total_correct,
        "total_fields": total_fields,
        "overall_accuracy": total_correct / total_fields if total_fields else 0,
        "wrong_cases": wrong_cases,
    }

    return summary
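

# ------------------------------------------------------------
# Convenience entry point (a sketch, not part of the original
# module): run the Stage 10A eval from the CLI and print the
# summary as JSON.
# ------------------------------------------------------------
if __name__ == "__main__":
    print(json.dumps(run_parser_eval(), indent=2))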