# Last commit 406ca65 (cmboulanger):
# Add human-inspectable annotations to evaluate-llm.py output via `--show-annotations`
"""
evaluator.py — High-level evaluation entry points.

evaluate_element(gold_element, schema, endpoint, ...)
    Evaluate annotation of a single XML element against its gold standard.

evaluate_file(gold_xml_path, schema, endpoint, ...)
    Evaluate annotation of every child element inside a container element.
    Returns per-record results and corpus-level aggregated metrics.

Both functions follow the same pipeline:

1. Extract gold spans from the gold element (character offsets in plain text).
2. Strip all tags → plain text (same text the annotator will see).
3. Run annotate() on the plain text.
4. Wrap the annotated XML in a synthetic root, parse it, extract spans.
5. Match predicted spans against gold spans.
6. Return an EvaluationResult with P/R/F1.
"""
from __future__ import annotations
import re
import warnings
from pathlib import Path
from lxml import etree
# Matches a well-formed XML tag (open/close/self-closing) or comment.
# Used to identify tag boundaries when escaping non-schema angle-brackets.
_XML_TAG_RE = re.compile(
r"<(/?)([a-zA-Z_][\w:.-]*)(\s[^<>\"']*(?:(?:\"[^\"]*\"|'[^']*')[^<>\"']*)*)?/?>|<!--.*?-->",
re.DOTALL,
)
def _escape_nonschema_brackets(fragment: str, allowed_tags: frozenset[str]) -> str:
"""
Escape ``&``, ``<``, and ``>`` in the text portions of *fragment* that are
not part of a valid schema tag.
- ``&`` β†’ ``&amp;`` (bare ampersands are invalid XML; must be escaped first)
- ``<`` / ``>`` whose element name is NOT in *allowed_tags* β†’ ``&lt;`` / ``&gt;``
Tags with names in *allowed_tags* (injected by the annotator) and XML comments
are left untouched. This handles literal text like ``A & B`` or ``<italic>``
that originates from decoded XML entities in gold-standard files.
"""
def _escape_text(t: str) -> str:
# & must be replaced before < / > to avoid double-encoding
return t.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
parts: list[str] = []
last = 0
for m in _XML_TAG_RE.finditer(fragment):
parts.append(_escape_text(fragment[last : m.start()]))
# m.group(2) is the tag name; None for comments
tag_name = m.group(2)
if tag_name is None or tag_name in allowed_tags:
parts.append(m.group()) # keep it as real XML
else:
parts.append(_escape_text(m.group()))
last = m.end()
parts.append(_escape_text(fragment[last:]))
return "".join(parts)
from ..inference.endpoint import EndpointConfig
from ..models.schema import TEISchema
from ..pipeline import annotate
from .extractor import extract_spans
from .metrics import EvaluationResult, MatchMode, aggregate, compute_metrics
# TEI namespace URI.  Element lookups in this module try the namespaced name
# first (documents like blbl-examples.tei.xml) before the bare tag name.
_TEI_NS = "http://www.tei-c.org/ns/1.0"
def evaluate_element(
    gold_element: etree._Element,
    schema: TEISchema,
    endpoint: EndpointConfig,
    gliner_model: str | None = None,
    match_mode: MatchMode = MatchMode.TEXT,
    overlap_threshold: float = 0.5,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
) -> EvaluationResult:
    """
    Evaluate annotation quality for a single XML element.

    Parameters
    ----------
    gold_element :
        An lxml element with manually annotated child tags (the gold standard).
    schema :
        TEISchema describing the elements that the annotator should produce.
    endpoint :
        Injected inference dependency passed unchanged to :func:`annotate`.
    gliner_model :
        GLiNER model ID for the optional pre-detection pass.
        Defaults to ``None`` (disabled) — enable for real-world runs.
    match_mode :
        How to decide whether a predicted span matches a gold span.
    overlap_threshold :
        IoU threshold when *match_mode* is OVERLAP.
    chunk_size, chunk_overlap :
        Chunking parameters forwarded to :func:`annotate`.

    Returns
    -------
    :class:`~tei_annotator.evaluation.metrics.EvaluationResult`
        Whenever the annotator was run, the raw annotator output is attached
        as ``annotation_xml`` — including when that output could not be
        parsed, so that the failing output can be inspected.

    Warns
    -----
    UserWarning
        If the annotator output is not well-formed XML.  The record is then
        scored as if the annotator had predicted no spans.
    """
    # Step 1 — extract gold spans (and the plain text they are anchored to)
    plain_text, gold_spans = extract_spans(gold_element)
    if not plain_text.strip():
        # Nothing to annotate: skip the annotator call entirely.
        return compute_metrics([], [])

    # Step 2 — annotate the plain text
    result = annotate(
        text=plain_text,
        schema=schema,
        endpoint=endpoint,
        gliner_model=gliner_model,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Step 3 — parse the annotated XML output
    # annotate() returns a fragment (no root element), so we wrap it.
    # Escape any '<'/'>' whose tag name is not in the schema — these are
    # literal text characters that lxml would otherwise parse as elements
    # (e.g. &lt;italic&gt; in gold-standard elements becomes raw '<italic>').
    allowed_tags = frozenset(e.tag for e in schema.elements)
    safe_xml = _escape_nonschema_brackets(result.xml, allowed_tags)
    try:
        pred_root = etree.fromstring(f"<_root>{safe_xml}</_root>".encode())
    except etree.XMLSyntaxError as exc:
        warnings.warn(
            f"Could not parse annotator output as XML; treating as empty: {exc}",
            stacklevel=2,
        )
        # Unparseable output counts as "no predictions"; fall through so the
        # result is built with the same settings as the success path.
        pred_spans = []
    else:
        # Step 4 — extract predicted spans from the parsed output
        _, pred_spans = extract_spans(pred_root)

    # Step 5 — match and compute metrics
    eval_result = compute_metrics(
        gold_spans,
        pred_spans,
        mode=match_mode,
        overlap_threshold=overlap_threshold,
    )
    # Keep the raw annotator output for human inspection, even when it
    # failed to parse (that is when it is most useful to look at).
    eval_result.annotation_xml = result.xml
    return eval_result
def evaluate_file(
    gold_xml_path: str | Path,
    schema: TEISchema,
    endpoint: EndpointConfig,
    root_element: str = "listBibl",
    child_element: str = "bibl",
    gliner_model: str | None = None,
    match_mode: MatchMode = MatchMode.TEXT,
    overlap_threshold: float = 0.5,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
    max_items: int | None = None,
) -> tuple[list[EvaluationResult], EvaluationResult]:
    """
    Evaluate annotation quality against a gold-standard TEI XML file.

    Finds every ``<child_element>`` that is a direct child of a
    ``<root_element>`` and passes each one to :func:`evaluate_element`.
    Containers are looked up anywhere in the document, *including the
    document element itself*, so a file whose root is ``<listBibl>`` works
    as well as one where the container is nested deeper.

    Parameters
    ----------
    gold_xml_path :
        Path to a TEI XML file (e.g. ``tests/fixtures/blbl-examples.tei.xml``).
    schema :
        TEISchema to use for annotation.
    endpoint :
        Inference endpoint configuration.
    root_element :
        Container element name to search for (default: ``"listBibl"``).
    child_element :
        Individual record element name to annotate (default: ``"bibl"``).
    gliner_model :
        GLiNER model ID, or ``None`` to disable.
    match_mode :
        Span matching criterion.
    overlap_threshold :
        IoU threshold for OVERLAP mode.
    chunk_size, chunk_overlap :
        Chunking parameters forwarded to :func:`annotate`.
    max_items :
        If set, only the first *max_items* child elements are evaluated.
        Useful for quick smoke runs.

    Returns
    -------
    (per_record_results, aggregated_result)
        *per_record_results* — one :class:`EvaluationResult` per child element.
        *aggregated_result* — the value returned by :func:`aggregate` for
        those per-record results (corpus-level metrics).
    """
    root = etree.parse(str(gold_xml_path)).getroot()

    def _find(tag: str) -> list[etree._Element]:
        """Return all elements named *tag*; TEI namespace first, then bare."""
        # iter() walks the root itself plus all descendants in document
        # order.  A ".//tag" path would skip the root, silently producing
        # zero records when the document element is the container.
        namespaced = list(root.iter(f"{{{_TEI_NS}}}{tag}"))
        return namespaced or list(root.iter(tag))

    all_children: list[etree._Element] = []
    for container in _find(root_element):
        # Only direct children count as records; same namespace fallback.
        children = container.findall(f"{{{_TEI_NS}}}{child_element}")
        if not children:
            children = container.findall(child_element)
        all_children.extend(children)

    if max_items is not None:
        all_children = all_children[:max_items]

    per_record: list[EvaluationResult] = [
        evaluate_element(
            gold_element=element,
            schema=schema,
            endpoint=endpoint,
            gliner_model=gliner_model,
            match_mode=match_mode,
            overlap_threshold=overlap_threshold,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        for element in all_children
    ]

    aggregated = aggregate(per_record)
    return per_record, aggregated