Spaces:

EphAsad
/

BactKing

Sleeping

App Files Files Community

BactKing / training /parser_eval.py

EphAsad

Upload 23 files

1168cd6 verified 3 months ago

raw

history blame contribute delete

2.86 kB

	# training/parser_eval.py
	# ------------------------------------------------------------
	# Parser Evaluation (Stage 10A)
	#
	# This version ONLY evaluates:
	# - Rule parser
	# - Extended parser
	#
	# The LLM parser is intentionally disabled at this stage
	# because alias maps and schema are not trained yet.
	#
	# This makes Stage 10A FAST and stable (< 3 seconds).
	# ------------------------------------------------------------

	import json
	import os
	from typing import Dict, Any

	from engine.parser_rules import parse_text_rules
	from engine.parser_ext import parse_text_extended


	# Path to the gold test JSON file loaded by run_parser_eval().
	# NOTE(review): this is a relative path, so it resolves against the
	# process's current working directory — presumably the repository root;
	# confirm against how the app is launched.
	GOLD_PATH = "training/gold_tests.json"


	def _fold_case(value: Any) -> Any:
	    """Lower-case strings for case-insensitive comparison; pass other types through."""
	    return value.lower() if isinstance(value, str) else value


	def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
	    """
	    Evaluate one gold test with the rule and extended parsers.

	    The rule parser's "parsed_fields" are used as the base; every
	    extended-parser value other than the literal "Unknown" overwrites the
	    corresponding rule value. Each expected field is then compared against
	    the merged output (case-insensitively for strings, by plain equality
	    for any other type).

	    Scoring per expected field:
	    - match on a real value            -> counted in "correct" and "total"
	    - match where expected is Unknown  -> neutral, counted in neither
	    - mismatch                         -> counted in "total", listed in "wrong"

	    Args:
	        test: Gold test dict. Reads "input" (text to parse, default "")
	            and "expected" (mapping of field name to expected value;
	            missing or null is treated as empty).

	    Returns:
	        Dict with keys:
	        - "correct": number of scored fields that matched
	        - "total": number of scored fields (neutral Unknown matches excluded)
	        - "accuracy": correct / total, or 0 when nothing was scored
	        - "wrong": field -> {"expected": ..., "got": ...} for mismatches
	        - "merged": the merged rule + extended parser output
	    """
	    text = test.get("input", "")
	    # `or {}` also covers an explicit JSON null for "expected".
	    expected = test.get("expected") or {}

	    # Run deterministic parsers
	    rule_out = parse_text_rules(text).get("parsed_fields", {})
	    ext_out = parse_text_extended(text).get("parsed_fields", {})

	    # Merge rule + extended (extended overwrites rules unless it is Unknown)
	    merged = dict(rule_out)
	    merged.update((k, v) for k, v in ext_out.items() if v != "Unknown")

	    correct = 0
	    total = 0
	    wrong = {}

	    for field, exp_val in expected.items():
	        got = merged.get(field, "Unknown")
	        exp_key = _fold_case(exp_val)

	        if _fold_case(got) != exp_key:
	            total += 1
	            wrong[field] = {"expected": exp_val, "got": got}
	        elif exp_key != "unknown":
	            total += 1
	            correct += 1
	        # else: expected Unknown and got Unknown -> neutral. It must stay out
	        # of the denominator too, otherwise it drags accuracy down exactly
	        # like a wrong answer would.

	    return {
	        "correct": correct,
	        "total": total,
	        "accuracy": correct / total if total else 0,
	        "wrong": wrong,
	        "merged": merged,
	    }


	def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
	    """
	    Evaluate ALL gold tests using rules + extended parsing only.

	    Args:
	        mode: Accepted for interface compatibility but not consulted; the
	            summary always reports "rules+extended".

	    Returns:
	        {"error": <message>} when the gold file is missing or its top-level
	        JSON value is not a list. Otherwise a summary dict with keys:
	        - "mode": always "rules+extended"
	        - "tests": number of gold tests
	        - "total_correct" / "total_fields": sums of each test's
	          "correct" / "total"
	        - "overall_accuracy": total_correct / total_fields, or 0 when
	          total_fields is 0
	        - "wrong_cases": one entry per test with at least one mismatch,
	          holding "name", "wrong", "parsed" and "expected"

	    Raises:
	        json.JSONDecodeError: if the gold file is not valid JSON.
	    """
	    # Open directly instead of checking os.path.exists() first: this avoids
	    # the window in which the file could vanish between check and open.
	    try:
	        with open(GOLD_PATH, "r", encoding="utf-8") as f:
	            gold = json.load(f)
	    except FileNotFoundError:
	        return {"error": f"Gold test file not found at {GOLD_PATH}"}

	    # Iterating a non-list (e.g. a dict) would hand bare keys to
	    # evaluate_single_test and fail there with an unrelated AttributeError.
	    if not isinstance(gold, list):
	        return {"error": f"Gold test file at {GOLD_PATH} must contain a JSON list of tests"}

	    wrong_cases = []
	    total_correct = 0
	    total_fields = 0

	    for test in gold:
	        out = evaluate_single_test(test)

	        total_correct += out["correct"]
	        total_fields += out["total"]

	        if out["wrong"]:
	            wrong_cases.append({
	                "name": test.get("name", "Unnamed"),
	                "wrong": out["wrong"],
	                "parsed": out["merged"],
	                "expected": test.get("expected", {}),
	            })

	    return {
	        "mode": "rules+extended",
	        "tests": len(gold),
	        "total_correct": total_correct,
	        "total_fields": total_fields,
	        "overall_accuracy": total_correct / total_fields if total_fields else 0,
	        "wrong_cases": wrong_cases,
	    }