Spaces:

cmboulanger
/

tei-annotator

Runtime error

App Files Files Community

tei-annotator / tei_annotator /evaluation /evaluator.py

cmboulanger

Add human-inspectable to evaluate-llm.py output via `--show-annotations`

406ca65 about 1 month ago

raw

history blame contribute delete

8.63 kB

	"""
	evaluator.py — High-level evaluation entry points.

	evaluate_element(gold_element, schema, endpoint, ...)
	Evaluate annotation of a single XML element against its gold standard.

	evaluate_file(gold_xml_path, schema, endpoint, ...)
	Evaluate annotation of every child element inside a container element.
	Returns per-record results and corpus-level aggregated metrics.

	Both functions follow the same pipeline:
	1. Extract gold spans from the gold element (character offsets in plain text).
	2. Strip all tags → plain text (same text the annotator will see).
	3. Run annotate() on the plain text.
	4. Wrap the annotated XML in a synthetic root, parse it, extract spans.
	5. Match predicted spans against gold spans.
	6. Return an EvaluationResult with P/R/F1.
	"""

	from __future__ import annotations

	import re
	import warnings
	from pathlib import Path

	from lxml import etree

	# Matches a well-formed XML tag (open/close/self-closing) or comment.
	# Used to identify tag boundaries when escaping non-schema angle-brackets.
	_XML_TAG_RE = re.compile(
	r"<(/?)([a-zA-Z_][\w:.-])(\s[^<>\"'](?:(?:\"[^\"]\"\|'[^']')[^<>\"']))?/?>\|<!--.*?-->",
	re.DOTALL,
	)


	def _escape_nonschema_brackets(fragment: str, allowed_tags: frozenset[str]) -> str:
	"""
	Escape ``&``, ``<``, and ``>`` in the text portions of fragment that are
	not part of a valid schema tag.

	- ``&`` → ``&`` (bare ampersands are invalid XML; must be escaped first)
	- ``<`` / ``>`` whose element name is NOT in allowed_tags → ``<`` / ``>``

	Tags with names in allowed_tags (injected by the annotator) and XML comments
	are left untouched. This handles literal text like ``A & B`` or ``<italic>``
	that originates from decoded XML entities in gold-standard files.
	"""
	def _escape_text(t: str) -> str:
	# & must be replaced before < / > to avoid double-encoding
	return t.replace("&", "&").replace("<", "<").replace(">", ">")

	parts: list[str] = []
	last = 0
	for m in _XML_TAG_RE.finditer(fragment):
	parts.append(_escape_text(fragment[last : m.start()]))
	# m.group(2) is the tag name; None for comments
	tag_name = m.group(2)
	if tag_name is None or tag_name in allowed_tags:
	parts.append(m.group()) # keep it as real XML
	else:
	parts.append(_escape_text(m.group()))
	last = m.end()
	parts.append(_escape_text(fragment[last:]))
	return "".join(parts)

	from ..inference.endpoint import EndpointConfig
	from ..models.schema import TEISchema
	from ..pipeline import annotate
	from .extractor import extract_spans
	from .metrics import EvaluationResult, MatchMode, aggregate, compute_metrics

	# TEI namespace URI, used in documents like blbl-examples.tei.xml.
	# evaluate_file() looks elements up under this namespace first and falls
	# back to un-namespaced names when that finds nothing.
	_TEI_NS = "http://www.tei-c.org/ns/1.0"


	def evaluate_element(
	    gold_element: etree._Element,
	    schema: TEISchema,
	    endpoint: EndpointConfig,
	    gliner_model: str | None = None,
	    match_mode: MatchMode = MatchMode.TEXT,
	    overlap_threshold: float = 0.5,
	    chunk_size: int = 1500,
	    chunk_overlap: int = 200,
	) -> EvaluationResult:
	    """
	    Evaluate annotation quality for a single XML element.

	    The element's plain text is re-annotated with :func:`annotate`, the
	    output is parsed back into spans, and those spans are scored against the
	    spans extracted from *gold_element*.

	    Parameters
	    ----------
	    gold_element :
	        An lxml element with manually annotated child tags (the gold standard).
	    schema :
	        TEISchema describing the elements that the annotator should produce.
	    endpoint :
	        Injected inference dependency passed unchanged to :func:`annotate`.
	    gliner_model :
	        GLiNER model ID for the optional pre-detection pass.
	        Defaults to ``None`` (disabled) — enable for real-world runs.
	    match_mode :
	        How to decide whether a predicted span matches a gold span.
	    overlap_threshold :
	        IoU threshold when match_mode is OVERLAP.
	    chunk_size, chunk_overlap :
	        Chunking parameters forwarded to :func:`annotate`.

	    Returns
	    -------
	    :class:`~tei_annotator.evaluation.metrics.EvaluationResult`
	        The result's ``annotation_xml`` attribute is set to the raw annotator
	        output whenever the annotator was run. If the element contains only
	        whitespace the annotator is not called and the result of
	        ``compute_metrics([], [])`` is returned as-is.

	    Warns
	    -----
	    A warning is issued (and the prediction treated as empty) when the
	    annotator output cannot be parsed as XML.
	    """
	    # Step 1 — extract gold spans (and the plain text they are anchored to)
	    plain_text, gold_spans = extract_spans(gold_element)

	    # Nothing to annotate: skip the (potentially expensive) inference call.
	    if not plain_text.strip():
	        return compute_metrics([], [])

	    # Step 2 — annotate the plain text
	    result = annotate(
	        text=plain_text,
	        schema=schema,
	        endpoint=endpoint,
	        gliner_model=gliner_model,
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )

	    # Step 3 — parse the annotated XML output
	    # annotate() returns a fragment (no root element), so we wrap it.
	    # Escape any '<'/'>' whose tag name is not in the schema — these are
	    # literal text characters that lxml would otherwise parse as elements
	    # (e.g. <italic> in gold-standard elements becomes raw '<italic>').
	    allowed_tags = frozenset(e.tag for e in schema.elements)
	    safe_xml = _escape_nonschema_brackets(result.xml, allowed_tags)
	    try:
	        pred_root = etree.fromstring(f"<_root>{safe_xml}</_root>".encode())
	    except etree.XMLSyntaxError as exc:
	        warnings.warn(
	            f"Could not parse annotator output as XML; treating as empty: {exc}",
	            stacklevel=2,
	        )
	        # Score as "no predictions", but keep the raw output attached so the
	        # unparseable annotation can still be inspected by the caller.
	        failed_result = compute_metrics(gold_spans, [])
	        failed_result.annotation_xml = result.xml
	        return failed_result

	    # Step 4 — extract predicted spans from the parsed output
	    _, pred_spans = extract_spans(pred_root)

	    # Step 5 — match and compute metrics
	    eval_result = compute_metrics(
	        gold_spans,
	        pred_spans,
	        mode=match_mode,
	        overlap_threshold=overlap_threshold,
	    )
	    # Attach the original (un-escaped) annotator output for human inspection.
	    eval_result.annotation_xml = result.xml
	    return eval_result


	def evaluate_file(
	    gold_xml_path: str | Path,
	    schema: TEISchema,
	    endpoint: EndpointConfig,
	    root_element: str = "listBibl",
	    child_element: str = "bibl",
	    gliner_model: str | None = None,
	    match_mode: MatchMode = MatchMode.TEXT,
	    overlap_threshold: float = 0.5,
	    chunk_size: int = 1500,
	    chunk_overlap: int = 200,
	    max_items: int | None = None,
	) -> tuple[list[EvaluationResult], EvaluationResult]:
	    """
	    Evaluate annotation quality against a gold-standard TEI XML file.

	    Finds every ``<child_element>`` that is a direct child of a
	    ``<root_element>``, and runs :func:`evaluate_element` on each one, which
	    re-annotates the element's plain text and compares the result to the
	    original annotation.

	    Parameters
	    ----------
	    gold_xml_path :
	        Path to a TEI XML file (e.g. ``tests/fixtures/blbl-examples.tei.xml``).
	    schema :
	        TEISchema to use for annotation.
	    endpoint :
	        Inference endpoint configuration.
	    root_element :
	        Container element name to search for (default: ``"listBibl"``).
	    child_element :
	        Individual record element name to annotate (default: ``"bibl"``).
	    gliner_model :
	        GLiNER model ID, or ``None`` to disable.
	    match_mode :
	        Span matching criterion.
	    overlap_threshold :
	        IoU threshold for OVERLAP mode.
	    chunk_size, chunk_overlap :
	        Chunking parameters forwarded to :func:`annotate`.
	    max_items :
	        If set, only the first max_items child elements are evaluated.
	        Useful for quick smoke runs.

	    Returns
	    -------
	    (per_record_results, aggregated_result)
	        per_record_results — one :class:`EvaluationResult` per child element.
	        aggregated_result — corpus-level metrics (TP/FP/FN summed across
	        all records, then P/R/F1 computed from those totals).
	    """
	    tree = etree.parse(str(gold_xml_path))

	    def _find(tag: str) -> list[etree._Element]:
	        """Search with TEI namespace first, then without."""
	        elems = tree.findall(f".//{{{_TEI_NS}}}{tag}")
	        return elems or tree.findall(f".//{tag}")

	    # Collect the records of every container, in document order. The
	    # namespace fallback is decided per container, mirroring _find().
	    all_children: list[etree._Element] = []
	    for container in _find(root_element):
	        children = container.findall(f"{{{_TEI_NS}}}{child_element}")
	        if not children:
	            children = container.findall(child_element)
	        all_children.extend(children)

	    if max_items is not None:
	        all_children = all_children[:max_items]

	    per_record = [
	        evaluate_element(
	            gold_element=element,
	            schema=schema,
	            endpoint=endpoint,
	            gliner_model=gliner_model,
	            match_mode=match_mode,
	            overlap_threshold=overlap_threshold,
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	        )
	        for element in all_children
	    ]

	    aggregated = aggregate(per_record)
	    return per_record, aggregated