Spaces:
Runtime error
Runtime error
| """ | |
| evaluator.py — High-level evaluation entry points. | |
| evaluate_element(gold_element, schema, endpoint, ...) | |
| Evaluate annotation of a single XML element against its gold standard. | |
| evaluate_file(gold_xml_path, schema, endpoint, ...) | |
| Evaluate annotation of every child element inside a container element. | |
| Returns per-record results and corpus-level aggregated metrics. | |
| Both functions follow the same pipeline: | |
| 1. Extract gold spans from the gold element (character offsets in plain text). | |
| 2. Strip all tags → plain text (same text the annotator will see). | |
| 3. Run annotate() on the plain text. | |
| 4. Wrap the annotated XML in a synthetic root, parse it, extract spans. | |
| 5. Match predicted spans against gold spans. | |
| 6. Return an EvaluationResult with P/R/F1. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import warnings | |
| from pathlib import Path | |
| from lxml import etree | |
# Matches a well-formed XML tag (open/close/self-closing) or an XML comment.
# Group 1 is the optional closing slash, group 2 the tag name (``None`` when
# the match is a comment), group 3 the optional attribute list.
# Used to identify tag boundaries when escaping non-schema angle-brackets.
_XML_TAG_RE = re.compile(
    r"<(/?)([a-zA-Z_][\w:.-]*)(\s[^<>\"']*(?:(?:\"[^\"]*\"|'[^']*')[^<>\"']*)*)?/?>|<!--.*?-->",
    re.DOTALL,
)


def _escape_nonschema_brackets(fragment: str, allowed_tags: frozenset[str]) -> str:
    """
    Escape ``&``, ``<``, and ``>`` in the parts of *fragment* that are not a
    valid schema tag, so the result can be parsed as XML.

    - ``&`` → ``&amp;`` (bare ampersands are invalid XML; escaped first)
    - ``<`` → ``&lt;`` and ``>`` → ``&gt;`` in plain text and in any tag whose
      element name is NOT in *allowed_tags*

    Tags with names in *allowed_tags* and XML comments are copied through
    unchanged. This handles literal text like ``A & B`` or ``<italic>`` that
    would otherwise be rejected or parsed as an element.

    Parameters
    ----------
    fragment :
        XML fragment (no root element required) mixing real tags and text.
    allowed_tags :
        Element names that must remain real XML tags.

    Returns
    -------
    str
        The fragment with all non-schema markup characters entity-escaped.
    """

    def _escape_text(t: str) -> str:
        # & must be replaced before < / > to avoid double-encoding the
        # ampersands introduced by the &lt; / &gt; replacements.
        return t.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    parts: list[str] = []
    last = 0
    for m in _XML_TAG_RE.finditer(fragment):
        # Plain text between the previous match and this one.
        parts.append(_escape_text(fragment[last : m.start()]))
        # m.group(2) is the tag name; None for comments
        tag_name = m.group(2)
        if tag_name is None or tag_name in allowed_tags:
            parts.append(m.group())  # keep it as real XML
        else:
            # Tag-shaped text that is not part of the schema: make it literal.
            parts.append(_escape_text(m.group()))
        last = m.end()
    # Trailing text after the final match (or the whole fragment if no match).
    parts.append(_escape_text(fragment[last:]))
    return "".join(parts)
| from ..inference.endpoint import EndpointConfig | |
| from ..models.schema import TEISchema | |
| from ..pipeline import annotate | |
| from .extractor import extract_spans | |
| from .metrics import EvaluationResult, MatchMode, aggregate, compute_metrics | |
# TEI namespace URI used in documents like blbl-examples.tei.xml.
# evaluate_file() looks for elements qualified with this namespace first and
# falls back to un-namespaced names when none are found.
_TEI_NS = "http://www.tei-c.org/ns/1.0"
def evaluate_element(
    gold_element: etree._Element,
    schema: TEISchema,
    endpoint: EndpointConfig,
    gliner_model: str | None = None,
    match_mode: MatchMode = MatchMode.TEXT,
    overlap_threshold: float = 0.5,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
) -> EvaluationResult:
    """
    Score the annotator's output for one XML element against its gold markup.

    The gold element is reduced to plain text plus gold spans, the plain text
    is re-annotated via :func:`annotate`, and the predicted spans are compared
    with the gold spans.

    Parameters
    ----------
    gold_element :
        An lxml element with manually annotated child tags (the gold standard).
    schema :
        TEISchema describing the elements that the annotator should produce.
        Its element tags are the only tags kept as real XML when the
        annotator output is parsed.
    endpoint :
        Inference dependency passed unchanged to :func:`annotate`.
    gliner_model :
        GLiNER model ID for the optional pre-detection pass.
        Defaults to ``None`` (disabled) — enable for real-world runs.
    match_mode :
        How to decide whether a predicted span matches a gold span.
    overlap_threshold :
        IoU threshold when *match_mode* is OVERLAP.
    chunk_size, chunk_overlap :
        Chunking parameters forwarded to :func:`annotate`.

    Returns
    -------
    :class:`~tei_annotator.evaluation.metrics.EvaluationResult`
        Metrics for this element. When the gold text is blank, metrics over
        two empty span lists are returned without calling the annotator.
        When the annotator output cannot be parsed, a warning is issued and
        the gold spans are scored against an empty prediction list.
    """
    # Gold spans and the plain text they are anchored to.
    source_text, expected_spans = extract_spans(gold_element)
    if not source_text.strip():
        return compute_metrics([], [])

    # Re-annotate the tag-free text.
    annotation = annotate(
        text=source_text,
        schema=schema,
        endpoint=endpoint,
        gliner_model=gliner_model,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # annotate() returns a fragment (no root element), so it is wrapped in a
    # synthetic root before parsing. Any '<'/'>' whose tag name is not in the
    # schema is literal text and is escaped first so lxml does not treat it
    # as an element.
    schema_tags = frozenset(element.tag for element in schema.elements)
    escaped_xml = _escape_nonschema_brackets(annotation.xml, schema_tags)
    try:
        predicted_root = etree.fromstring(f"<_root>{escaped_xml}</_root>".encode())
    except etree.XMLSyntaxError as exc:
        warnings.warn(
            f"Could not parse annotator output as XML; treating as empty: {exc}",
            stacklevel=2,
        )
        return compute_metrics(expected_spans, [])

    # Only the spans are needed from the prediction; its text is discarded.
    predicted_spans = extract_spans(predicted_root)[1]

    metrics = compute_metrics(
        expected_spans,
        predicted_spans,
        mode=match_mode,
        overlap_threshold=overlap_threshold,
    )
    # Keep the raw annotator output alongside the scores.
    metrics.annotation_xml = annotation.xml
    return metrics
def evaluate_file(
    gold_xml_path: str | Path,
    schema: TEISchema,
    endpoint: EndpointConfig,
    root_element: str = "listBibl",
    child_element: str = "bibl",
    gliner_model: str | None = None,
    match_mode: MatchMode = MatchMode.TEXT,
    overlap_threshold: float = 0.5,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
    max_items: int | None = None,
) -> tuple[list[EvaluationResult], EvaluationResult]:
    """
    Evaluate annotation quality against a gold-standard TEI XML file.

    Collects every ``<child_element>`` that is a direct child of a
    ``<root_element>`` found anywhere in the document, evaluates each one
    with :func:`evaluate_element`, and aggregates the per-record results.

    Parameters
    ----------
    gold_xml_path :
        Path to a TEI XML file (e.g. ``tests/fixtures/blbl-examples.tei.xml``).
    schema :
        TEISchema to use for annotation.
    endpoint :
        Inference endpoint configuration.
    root_element :
        Container element name to search for (default: ``"listBibl"``).
    child_element :
        Individual record element name to annotate (default: ``"bibl"``).
    gliner_model :
        GLiNER model ID, or ``None`` to disable.
    match_mode :
        Span matching criterion.
    overlap_threshold :
        IoU threshold for OVERLAP mode.
    chunk_size, chunk_overlap :
        Chunking parameters forwarded to :func:`annotate`.
    max_items :
        If set, only the first *max_items* child elements are evaluated.
        Useful for quick smoke runs.

    Returns
    -------
    (per_record_results, aggregated_result)
        *per_record_results* — one :class:`EvaluationResult` per child element.
        *aggregated_result* — the output of :func:`aggregate` over all
        per-record results.
    """
    document = etree.parse(str(gold_xml_path))

    # Prefer TEI-namespaced containers; fall back to un-namespaced ones only
    # when the namespaced search finds nothing.
    containers = document.findall(
        f".//{{{_TEI_NS}}}{root_element}"
    ) or document.findall(f".//{root_element}")

    # Same namespaced-then-plain lookup, per container, for direct children.
    records: list[etree._Element] = []
    for container in containers:
        records.extend(
            container.findall(f"{{{_TEI_NS}}}{child_element}")
            or container.findall(child_element)
        )

    if max_items is not None:
        records = records[:max_items]

    per_record: list[EvaluationResult] = [
        evaluate_element(
            gold_element=record,
            schema=schema,
            endpoint=endpoint,
            gliner_model=gliner_model,
            match_mode=match_mode,
            overlap_threshold=overlap_threshold,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        for record in records
    ]
    return per_record, aggregate(per_record)