Commit · 194050b
Parent(s): 8a7ede1

Rename function to be more generic
scripts/evaluate_llm.py
CHANGED
@@ -239,7 +239,7 @@ def run_evaluation(
 import warnings
 from lxml import etree

-from tei_annotator.evaluation import evaluate_bibl, aggregate, MatchMode
+from tei_annotator.evaluation import evaluate_element, aggregate, MatchMode
 from tei_annotator.evaluation.extractor import extract_spans
 from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig

@@ -290,7 +290,7 @@ def run_evaluation(
 "ignore",
 message="Output XML validation failed",
 )
-result = evaluate_bibl(
+result = evaluate_element(
 gold_element=bibl,
 schema=schema,
 endpoint=endpoint,
tei_annotator/__init__.py
CHANGED
@@ -15,7 +15,7 @@ from .evaluation import (
 SpanMatch,
 aggregate,
 compute_metrics,
-evaluate_bibl,
+evaluate_element,
 evaluate_file,
 extract_spans,
 match_spans,
@@ -41,7 +41,7 @@ __all__ = [
 "SpanMatch",
 "aggregate",
 "compute_metrics",
-"evaluate_bibl",
+"evaluate_element",
 "evaluate_file",
 "extract_spans",
 "match_spans",
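Because `evaluate_element` now sits in the package-level `__all__`, downstream code imports the renamed entry point straight from the package root; a minimal sketch (the old `evaluate_bibl` name is gone after this commit):

# Sketch: the renamed function is re-exported at the package root.
from tei_annotator import evaluate_element, evaluate_file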
tei_annotator/evaluation/README.md
CHANGED
@@ -107,7 +107,7 @@ Merges a list of `EvaluationResult` objects (one per document record) into a sin

 ## High-level API (`evaluator.py`)

-### `evaluate_bibl(…)`
+### `evaluate_element(element, schema, endpoint, match_mode)`

 Evaluates annotation of a single lxml `_Element`. Handles:

@@ -120,7 +120,7 @@ Evaluates an entire TEI XML file:

 1. Parses the XML file with lxml.
 2. Finds all first-level child elements of the root.
-3. Calls `evaluate_bibl()` on each, up to `max_items`.
+3. Calls `evaluate_element()` on each, up to `max_items`.
 4. Returns `(list[EvaluationResult], EvaluationResult)` – individual results per record and the corpus-level aggregate.

 ---
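For orientation, a minimal usage sketch of the high-level API described in this README, using only the signatures visible in this commit; the schema, endpoint, file path, and sample record are placeholders rather than values from the repository:

# Usage sketch based on the signatures shown in this commit. Schema and
# endpoint construction is elided because their constructors are not part of
# this diff; the tests pass gliner_model=None, but its exact role is not shown.
from lxml import etree

from tei_annotator.evaluation import MatchMode, evaluate_element, evaluate_file

schema = ...     # a TEISchema listing the allowed elements (placeholder)
endpoint = ...   # an EndpointConfig for the annotation model (placeholder)

# Evaluate a single gold-annotated record (an lxml _Element).
record = etree.fromstring("<bibl>Smith, 2020.</bibl>")
result = evaluate_element(record, schema, endpoint, gliner_model=None,
                          match_mode=MatchMode.TEXT)
print(result.micro_f1)

# Evaluate every first-level child of a gold TEI file and aggregate.
per_record, overall = evaluate_file("gold.xml", schema, endpoint)
print(overall.report())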
tei_annotator/evaluation/__init__.py
CHANGED
@@ -14,7 +14,7 @@ Typical usage::
 print(overall.report())
 """

-from .evaluator import evaluate_bibl, evaluate_file
+from .evaluator import evaluate_element, evaluate_file
 from .extractor import EvaluationSpan, extract_spans, spans_from_xml_string
 from .metrics import (
 ElementMetrics,
@@ -40,6 +40,6 @@ __all__ = [
 "compute_metrics",
 "aggregate",
 # Evaluator
-"evaluate_bibl",
+"evaluate_element",
 "evaluate_file",
 ]
tei_annotator/evaluation/evaluator.py
CHANGED
@@ -1,11 +1,11 @@
 """
 evaluator.py – High-level evaluation entry points.

-evaluate_bibl(gold_element, schema, endpoint, ...)
+evaluate_element(gold_element, schema, endpoint, ...)
 Evaluate annotation of a single XML element against its gold standard.

 evaluate_file(gold_xml_path, schema, endpoint, ...)
-Evaluate annotation of every …
+Evaluate annotation of every child element inside a container element.
 Returns per-record results and corpus-level aggregated metrics.

 Both functions follow the same pipeline:
@@ -69,7 +69,7 @@ from .metrics import EvaluationResult, MatchMode, aggregate, compute_metrics
 _TEI_NS = "http://www.tei-c.org/ns/1.0"


-def evaluate_bibl(
+def evaluate_element(
 gold_element: etree._Element,
 schema: TEISchema,
 endpoint: EndpointConfig,
@@ -124,7 +124,7 @@ def evaluate_bibl(
 # annotate() returns a fragment (no root element), so we wrap it.
 # Escape any '<'/'>' whose tag name is not in the schema – these are
 # literal text characters that lxml would otherwise parse as elements
-# (e.g. <italic> in gold-standard …
+# (e.g. <italic> in gold-standard elements becomes raw '<italic>').
 allowed_tags = frozenset(e.tag for e in schema.elements)
 safe_xml = _escape_nonschema_brackets(result.xml, allowed_tags)
 try:
@@ -219,7 +219,7 @@ def evaluate_file(

 per_record: list[EvaluationResult] = []
 for element in all_children:
-result = evaluate_bibl(
+result = evaluate_element(
 gold_element=element,
 schema=schema,
 endpoint=endpoint,
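Note that the rename drops the old public name entirely, so any out-of-tree caller of `evaluate_bibl` fails at import time. If a transition period were wanted, a deprecation alias could be appended to `evaluator.py`; a minimal sketch of that standard pattern, not part of this commit:

import warnings

def evaluate_bibl(*args, **kwargs):
    """Deprecated alias kept for old callers; delegates to evaluate_element()."""
    warnings.warn(
        "evaluate_bibl() has been renamed to evaluate_element()",
        DeprecationWarning,
        stacklevel=2,
    )
    return evaluate_element(*args, **kwargs)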
tests/test_evaluation.py
CHANGED
@@ -22,7 +22,7 @@ from tei_annotator.evaluation.metrics import (
 compute_metrics,
 match_spans,
 )
-from tei_annotator.evaluation.evaluator import evaluate_bibl
+from tei_annotator.evaluation.evaluator import evaluate_element
 from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
 from tei_annotator.models.schema import TEIElement, TEISchema

@@ -326,10 +326,10 @@ class TestAggregate:


 # ---------------------------------------------------------------------------
-# evaluator – evaluate_bibl (mocked endpoint)
+# evaluator – evaluate_element (mocked endpoint)
 # ---------------------------------------------------------------------------

-class TestEvaluateBibl:
+class TestEvaluateElement:
 def _schema(self, *tags):
 return _schema(*tags)

@@ -342,7 +342,7 @@ class TestEvaluateBibl:
 {"element": "author", "text": "Smith", "context": "Smith, 2020.", "attrs": {}},
 {"element": "date", "text": "2020", "context": "Smith, 2020.", "attrs": {}},
 ])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
+result = evaluate_element(root, schema, endpoint, gliner_model=None)
 assert result.micro_f1 == pytest.approx(1.0)
 assert result.micro_precision == 1.0
 assert result.micro_recall == 1.0
@@ -355,7 +355,7 @@ class TestEvaluateBibl:
 endpoint = _mock_endpoint([
 {"element": "author", "text": "Smith", "context": "Smith, 2020.", "attrs": {}},
 ])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
+result = evaluate_element(root, schema, endpoint, gliner_model=None)
 assert result.micro_precision == 1.0
 assert result.micro_recall == pytest.approx(0.5)

@@ -368,7 +368,7 @@ class TestEvaluateBibl:
 {"element": "author", "text": "Smith", "context": "Smith.", "attrs": {}},
 {"element": "date", "text": "Smith", "context": "Smith.", "attrs": {}},
 ])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
+result = evaluate_element(root, schema, endpoint, gliner_model=None)
 assert result.micro_recall == 1.0
 assert result.micro_precision == pytest.approx(0.5)

@@ -377,7 +377,7 @@ class TestEvaluateBibl:
 root = _parse(gold_xml)
 schema = self._schema("author")
 endpoint = _mock_endpoint([])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
+result = evaluate_element(root, schema, endpoint, gliner_model=None)
 assert result.micro_precision == 0.0
 assert result.micro_recall == 0.0
 assert result.micro_f1 == 0.0
@@ -390,7 +390,7 @@ class TestEvaluateBibl:
 endpoint = _mock_endpoint([
 {"element": "editor", "text": "Smith", "context": "Smith.", "attrs": {}},
 ])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
+result = evaluate_element(root, schema, endpoint, gliner_model=None)
 assert result.micro_f1 == 0.0

 def test_attributes_not_required_for_text_match(self):
@@ -402,7 +402,7 @@ class TestEvaluateBibl:
 endpoint = _mock_endpoint([
 {"element": "title", "text": "My Title", "context": "My Title.", "attrs": {}},
 ])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.TEXT)
+result = evaluate_element(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.TEXT)
 assert result.micro_f1 == pytest.approx(1.0)

 def test_exact_match_mode(self):
@@ -414,7 +414,7 @@ class TestEvaluateBibl:
 endpoint = _mock_endpoint([
 {"element": "author", "text": "Smith", "context": "Smith.", "attrs": {}},
 ])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.EXACT)
+result = evaluate_element(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.EXACT)
 # Offsets align (same plain text) – should match
 assert result.micro_f1 == pytest.approx(1.0)

@@ -424,7 +424,7 @@ class TestEvaluateBibl:
 root = _parse(gold_xml)
 schema = self._schema("author")
 endpoint = _mock_endpoint([])
-result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
+result = evaluate_element(root, schema, endpoint, gliner_model=None)
 # No gold spans, no pred spans – vacuously perfect
 assert result.micro_tp == 0
 assert result.micro_fp == 0