""" evaluator.py — High-level evaluation entry points. evaluate_element(gold_element, schema, endpoint, ...) Evaluate annotation of a single XML element against its gold standard. evaluate_file(gold_xml_path, schema, endpoint, ...) Evaluate annotation of every child element inside a container element. Returns per-record results and corpus-level aggregated metrics. Both functions follow the same pipeline: 1. Extract gold spans from the gold element (character offsets in plain text). 2. Strip all tags → plain text (same text the annotator will see). 3. Run annotate() on the plain text. 4. Wrap the annotated XML in a synthetic root, parse it, extract spans. 5. Match predicted spans against gold spans. 6. Return an EvaluationResult with P/R/F1. """ from __future__ import annotations import re import warnings from pathlib import Path from lxml import etree # Matches a well-formed XML tag (open/close/self-closing) or comment. # Used to identify tag boundaries when escaping non-schema angle-brackets. _XML_TAG_RE = re.compile( r"<(/?)([a-zA-Z_][\w:.-]*)(\s[^<>\"']*(?:(?:\"[^\"]*\"|'[^']*')[^<>\"']*)*)?/?>|", re.DOTALL, ) def _escape_nonschema_brackets(fragment: str, allowed_tags: frozenset[str]) -> str: """ Escape ``&``, ``<``, and ``>`` in the text portions of *fragment* that are not part of a valid schema tag. - ``&`` → ``&`` (bare ampersands are invalid XML; must be escaped first) - ``<`` / ``>`` whose element name is NOT in *allowed_tags* → ``<`` / ``>`` Tags with names in *allowed_tags* (injected by the annotator) and XML comments are left untouched. This handles literal text like ``A & B`` or ```` that originates from decoded XML entities in gold-standard files. """ def _escape_text(t: str) -> str: # & must be replaced before < / > to avoid double-encoding return t.replace("&", "&").replace("<", "<").replace(">", ">") parts: list[str] = [] last = 0 for m in _XML_TAG_RE.finditer(fragment): parts.append(_escape_text(fragment[last : m.start()])) # m.group(2) is the tag name; None for comments tag_name = m.group(2) if tag_name is None or tag_name in allowed_tags: parts.append(m.group()) # keep it as real XML else: parts.append(_escape_text(m.group())) last = m.end() parts.append(_escape_text(fragment[last:])) return "".join(parts) from ..inference.endpoint import EndpointConfig from ..models.schema import TEISchema from ..pipeline import annotate from .extractor import extract_spans from .metrics import EvaluationResult, MatchMode, aggregate, compute_metrics # TEI namespace used in documents like blbl-examples.tei.xml _TEI_NS = "http://www.tei-c.org/ns/1.0" def evaluate_element( gold_element: etree._Element, schema: TEISchema, endpoint: EndpointConfig, gliner_model: str | None = None, match_mode: MatchMode = MatchMode.TEXT, overlap_threshold: float = 0.5, chunk_size: int = 1500, chunk_overlap: int = 200, ) -> EvaluationResult: """ Evaluate annotation quality for a single XML element. Parameters ---------- gold_element : An lxml element with manually annotated child tags (the gold standard). schema : TEISchema describing the elements that the annotator should produce. endpoint : Injected inference dependency passed unchanged to :func:`annotate`. gliner_model : GLiNER model ID for the optional pre-detection pass. Defaults to ``None`` (disabled) — enable for real-world runs. match_mode : How to decide whether a predicted span matches a gold span. overlap_threshold : IoU threshold when *match_mode* is OVERLAP. chunk_size, chunk_overlap : Chunking parameters forwarded to :func:`annotate`. Returns ------- :class:`~tei_annotator.evaluation.metrics.EvaluationResult` """ # Step 1 — extract gold spans (and the plain text they are anchored to) plain_text, gold_spans = extract_spans(gold_element) if not plain_text.strip(): return compute_metrics([], []) # Step 2 — annotate the plain text result = annotate( text=plain_text, schema=schema, endpoint=endpoint, gliner_model=gliner_model, chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) # Step 3 — parse the annotated XML output # annotate() returns a fragment (no root element), so we wrap it. # Escape any '<'/'>' whose tag name is not in the schema — these are # literal text characters that lxml would otherwise parse as elements # (e.g. <italic> in gold-standard elements becomes raw ''). allowed_tags = frozenset(e.tag for e in schema.elements) safe_xml = _escape_nonschema_brackets(result.xml, allowed_tags) try: pred_root = etree.fromstring(f"<_root>{safe_xml}".encode()) except etree.XMLSyntaxError as exc: warnings.warn( f"Could not parse annotator output as XML; treating as empty: {exc}", stacklevel=2, ) return compute_metrics(gold_spans, []) # Step 4 — extract predicted spans from the parsed output _, pred_spans = extract_spans(pred_root) # Step 5 — match and compute metrics eval_result = compute_metrics( gold_spans, pred_spans, mode=match_mode, overlap_threshold=overlap_threshold, ) eval_result.annotation_xml = result.xml return eval_result def evaluate_file( gold_xml_path: str | Path, schema: TEISchema, endpoint: EndpointConfig, root_element: str = "listBibl", child_element: str = "bibl", gliner_model: str | None = None, match_mode: MatchMode = MatchMode.TEXT, overlap_threshold: float = 0.5, chunk_size: int = 1500, chunk_overlap: int = 200, max_items: int | None = None, ) -> tuple[list[EvaluationResult], EvaluationResult]: """ Evaluate annotation quality against a gold-standard TEI XML file. Finds every ```` inside ````, strips its tags to obtain plain text, runs :func:`annotate`, and compares the result to the original annotation. Parameters ---------- gold_xml_path : Path to a TEI XML file (e.g. ``tests/fixtures/blbl-examples.tei.xml``). schema : TEISchema to use for annotation. endpoint : Inference endpoint configuration. root_element : Container element name to search for (default: ``"listBibl"``). child_element : Individual record element name to annotate (default: ``"bibl"``). gliner_model : GLiNER model ID, or ``None`` to disable. match_mode : Span matching criterion. overlap_threshold : IoU threshold for OVERLAP mode. chunk_size, chunk_overlap : Chunking parameters forwarded to :func:`annotate`. max_items : If set, only the first *max_items* child elements are evaluated. Useful for quick smoke runs. Returns ------- (per_record_results, aggregated_result) *per_record_results* — one :class:`EvaluationResult` per child element. *aggregated_result* — corpus-level metrics (TP/FP/FN summed across all records, then P/R/F1 computed from those totals). """ tree = etree.parse(str(gold_xml_path)) def _find(tag: str) -> list[etree._Element]: """Search with TEI namespace first, then without.""" elems = tree.findall(f".//{{{_TEI_NS}}}{tag}") return elems or tree.findall(f".//{tag}") containers = _find(root_element) all_children: list[etree._Element] = [] for container in containers: children = container.findall(f"{{{_TEI_NS}}}{child_element}") if not children: children = container.findall(child_element) all_children.extend(children) if max_items is not None: all_children = all_children[:max_items] per_record: list[EvaluationResult] = [] for element in all_children: result = evaluate_element( gold_element=element, schema=schema, endpoint=endpoint, gliner_model=gliner_model, match_mode=match_mode, overlap_threshold=overlap_threshold, chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) per_record.append(result) aggregated = aggregate(per_record) return per_record, aggregated