cmboulanger committed on
Commit
194050b
·
1 Parent(s): 8a7ede1

Rename function to be more generic

Browse files
scripts/evaluate_llm.py CHANGED
@@ -239,7 +239,7 @@ def run_evaluation(
239
  import warnings
240
  from lxml import etree
241
 
242
- from tei_annotator.evaluation import evaluate_bibl, aggregate, MatchMode
243
  from tei_annotator.evaluation.extractor import extract_spans
244
  from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
245
 
@@ -290,7 +290,7 @@ def run_evaluation(
290
  "ignore",
291
  message="Output XML validation failed",
292
  )
293
- result = evaluate_bibl(
294
  gold_element=bibl,
295
  schema=schema,
296
  endpoint=endpoint,
 
239
  import warnings
240
  from lxml import etree
241
 
242
+ from tei_annotator.evaluation import evaluate_element, aggregate, MatchMode
243
  from tei_annotator.evaluation.extractor import extract_spans
244
  from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
245
 
 
290
  "ignore",
291
  message="Output XML validation failed",
292
  )
293
+ result = evaluate_element(
294
  gold_element=bibl,
295
  schema=schema,
296
  endpoint=endpoint,
tei_annotator/__init__.py CHANGED
@@ -15,7 +15,7 @@ from .evaluation import (
15
  SpanMatch,
16
  aggregate,
17
  compute_metrics,
18
- evaluate_bibl,
19
  evaluate_file,
20
  extract_spans,
21
  match_spans,
@@ -41,7 +41,7 @@ __all__ = [
41
  "SpanMatch",
42
  "aggregate",
43
  "compute_metrics",
44
- "evaluate_bibl",
45
  "evaluate_file",
46
  "extract_spans",
47
  "match_spans",
 
15
  SpanMatch,
16
  aggregate,
17
  compute_metrics,
18
+ evaluate_element,
19
  evaluate_file,
20
  extract_spans,
21
  match_spans,
 
41
  "SpanMatch",
42
  "aggregate",
43
  "compute_metrics",
44
+ "evaluate_element",
45
  "evaluate_file",
46
  "extract_spans",
47
  "match_spans",
tei_annotator/evaluation/README.md CHANGED
@@ -107,7 +107,7 @@ Merges a list of `EvaluationResult` objects (one per document record) into a sin
107
 
108
  ## High-level API (`evaluator.py`)
109
 
110
- ### `evaluate_bibl(element, schema, endpoint, match_mode)`
111
 
112
  Evaluates annotation of a single lxml `_Element`. Handles:
113
 
@@ -120,7 +120,7 @@ Evaluates an entire TEI XML file:
120
 
121
  1. Parses the XML file with lxml.
122
  2. Finds all first-level child elements of the root.
123
- 3. Calls `evaluate_bibl()` on each, up to `max_items`.
124
  4. Returns `(list[EvaluationResult], EvaluationResult)` β€” individual results per record and the corpus-level aggregate.
125
 
126
  ---
 
107
 
108
  ## High-level API (`evaluator.py`)
109
 
110
+ ### `evaluate_element(element, schema, endpoint, match_mode)`
111
 
112
  Evaluates annotation of a single lxml `_Element`. Handles:
113
 
 
120
 
121
  1. Parses the XML file with lxml.
122
  2. Finds all first-level child elements of the root.
123
+ 3. Calls `evaluate_element()` on each, up to `max_items`.
124
  4. Returns `(list[EvaluationResult], EvaluationResult)` β€” individual results per record and the corpus-level aggregate.
125
 
126
  ---
tei_annotator/evaluation/__init__.py CHANGED
@@ -14,7 +14,7 @@ Typical usage::
14
  print(overall.report())
15
  """
16
 
17
- from .evaluator import evaluate_bibl, evaluate_file
18
  from .extractor import EvaluationSpan, extract_spans, spans_from_xml_string
19
  from .metrics import (
20
  ElementMetrics,
@@ -40,6 +40,6 @@ __all__ = [
40
  "compute_metrics",
41
  "aggregate",
42
  # Evaluator
43
- "evaluate_bibl",
44
  "evaluate_file",
45
  ]
 
14
  print(overall.report())
15
  """
16
 
17
+ from .evaluator import evaluate_element, evaluate_file
18
  from .extractor import EvaluationSpan, extract_spans, spans_from_xml_string
19
  from .metrics import (
20
  ElementMetrics,
 
40
  "compute_metrics",
41
  "aggregate",
42
  # Evaluator
43
+ "evaluate_element",
44
  "evaluate_file",
45
  ]
tei_annotator/evaluation/evaluator.py CHANGED
@@ -1,11 +1,11 @@
1
  """
2
  evaluator.py β€” High-level evaluation entry points.
3
 
4
- evaluate_bibl(gold_element, schema, endpoint, ...)
5
  Evaluate annotation of a single XML element against its gold standard.
6
 
7
  evaluate_file(gold_xml_path, schema, endpoint, ...)
8
- Evaluate annotation of every <bibl> in a TEI file's <listBibl>.
9
  Returns per-record results and corpus-level aggregated metrics.
10
 
11
  Both functions follow the same pipeline:
@@ -69,7 +69,7 @@ from .metrics import EvaluationResult, MatchMode, aggregate, compute_metrics
69
  _TEI_NS = "http://www.tei-c.org/ns/1.0"
70
 
71
 
72
- def evaluate_bibl(
73
  gold_element: etree._Element,
74
  schema: TEISchema,
75
  endpoint: EndpointConfig,
@@ -124,7 +124,7 @@ def evaluate_bibl(
124
  # annotate() returns a fragment (no root element), so we wrap it.
125
  # Escape any '<'/'>' whose tag name is not in the schema β€” these are
126
  # literal text characters that lxml would otherwise parse as elements
127
- # (e.g. &lt;italic&gt; in gold-standard bibls becomes raw '<italic>').
128
  allowed_tags = frozenset(e.tag for e in schema.elements)
129
  safe_xml = _escape_nonschema_brackets(result.xml, allowed_tags)
130
  try:
@@ -219,7 +219,7 @@ def evaluate_file(
219
 
220
  per_record: list[EvaluationResult] = []
221
  for element in all_children:
222
- result = evaluate_bibl(
223
  gold_element=element,
224
  schema=schema,
225
  endpoint=endpoint,
 
1
  """
2
  evaluator.py β€” High-level evaluation entry points.
3
 
4
+ evaluate_element(gold_element, schema, endpoint, ...)
5
  Evaluate annotation of a single XML element against its gold standard.
6
 
7
  evaluate_file(gold_xml_path, schema, endpoint, ...)
8
+ Evaluate annotation of every child element inside a container element.
9
  Returns per-record results and corpus-level aggregated metrics.
10
 
11
  Both functions follow the same pipeline:
 
69
  _TEI_NS = "http://www.tei-c.org/ns/1.0"
70
 
71
 
72
+ def evaluate_element(
73
  gold_element: etree._Element,
74
  schema: TEISchema,
75
  endpoint: EndpointConfig,
 
124
  # annotate() returns a fragment (no root element), so we wrap it.
125
  # Escape any '<'/'>' whose tag name is not in the schema β€” these are
126
  # literal text characters that lxml would otherwise parse as elements
127
+ # (e.g. &lt;italic&gt; in gold-standard elements becomes raw '<italic>').
128
  allowed_tags = frozenset(e.tag for e in schema.elements)
129
  safe_xml = _escape_nonschema_brackets(result.xml, allowed_tags)
130
  try:
 
219
 
220
  per_record: list[EvaluationResult] = []
221
  for element in all_children:
222
+ result = evaluate_element(
223
  gold_element=element,
224
  schema=schema,
225
  endpoint=endpoint,
tests/test_evaluation.py CHANGED
@@ -22,7 +22,7 @@ from tei_annotator.evaluation.metrics import (
22
  compute_metrics,
23
  match_spans,
24
  )
25
- from tei_annotator.evaluation.evaluator import evaluate_bibl
26
  from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
27
  from tei_annotator.models.schema import TEIElement, TEISchema
28
 
@@ -326,10 +326,10 @@ class TestAggregate:
326
 
327
 
328
  # ---------------------------------------------------------------------------
329
- # evaluator β€” evaluate_bibl (mocked endpoint)
330
  # ---------------------------------------------------------------------------
331
 
332
- class TestEvaluateBibl:
333
  def _schema(self, *tags):
334
  return _schema(*tags)
335
 
@@ -342,7 +342,7 @@ class TestEvaluateBibl:
342
  {"element": "author", "text": "Smith", "context": "Smith, 2020.", "attrs": {}},
343
  {"element": "date", "text": "2020", "context": "Smith, 2020.", "attrs": {}},
344
  ])
345
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
346
  assert result.micro_f1 == pytest.approx(1.0)
347
  assert result.micro_precision == 1.0
348
  assert result.micro_recall == 1.0
@@ -355,7 +355,7 @@ class TestEvaluateBibl:
355
  endpoint = _mock_endpoint([
356
  {"element": "author", "text": "Smith", "context": "Smith, 2020.", "attrs": {}},
357
  ])
358
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
359
  assert result.micro_precision == 1.0
360
  assert result.micro_recall == pytest.approx(0.5)
361
 
@@ -368,7 +368,7 @@ class TestEvaluateBibl:
368
  {"element": "author", "text": "Smith", "context": "Smith.", "attrs": {}},
369
  {"element": "date", "text": "Smith", "context": "Smith.", "attrs": {}},
370
  ])
371
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
372
  assert result.micro_recall == 1.0
373
  assert result.micro_precision == pytest.approx(0.5)
374
 
@@ -377,7 +377,7 @@ class TestEvaluateBibl:
377
  root = _parse(gold_xml)
378
  schema = self._schema("author")
379
  endpoint = _mock_endpoint([])
380
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
381
  assert result.micro_precision == 0.0
382
  assert result.micro_recall == 0.0
383
  assert result.micro_f1 == 0.0
@@ -390,7 +390,7 @@ class TestEvaluateBibl:
390
  endpoint = _mock_endpoint([
391
  {"element": "editor", "text": "Smith", "context": "Smith.", "attrs": {}},
392
  ])
393
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
394
  assert result.micro_f1 == 0.0
395
 
396
  def test_attributes_not_required_for_text_match(self):
@@ -402,7 +402,7 @@ class TestEvaluateBibl:
402
  endpoint = _mock_endpoint([
403
  {"element": "title", "text": "My Title", "context": "My Title.", "attrs": {}},
404
  ])
405
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.TEXT)
406
  assert result.micro_f1 == pytest.approx(1.0)
407
 
408
  def test_exact_match_mode(self):
@@ -414,7 +414,7 @@ class TestEvaluateBibl:
414
  endpoint = _mock_endpoint([
415
  {"element": "author", "text": "Smith", "context": "Smith.", "attrs": {}},
416
  ])
417
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.EXACT)
418
  # Offsets align (same plain text) β†’ should match
419
  assert result.micro_f1 == pytest.approx(1.0)
420
 
@@ -424,7 +424,7 @@ class TestEvaluateBibl:
424
  root = _parse(gold_xml)
425
  schema = self._schema("author")
426
  endpoint = _mock_endpoint([])
427
- result = evaluate_bibl(root, schema, endpoint, gliner_model=None)
428
  # No gold spans, no pred spans β†’ vacuously perfect
429
  assert result.micro_tp == 0
430
  assert result.micro_fp == 0
 
22
  compute_metrics,
23
  match_spans,
24
  )
25
+ from tei_annotator.evaluation.evaluator import evaluate_element
26
  from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
27
  from tei_annotator.models.schema import TEIElement, TEISchema
28
 
 
326
 
327
 
328
  # ---------------------------------------------------------------------------
329
+ # evaluator β€” evaluate_element (mocked endpoint)
330
  # ---------------------------------------------------------------------------
331
 
332
+ class TestEvaluateElement:
333
  def _schema(self, *tags):
334
  return _schema(*tags)
335
 
 
342
  {"element": "author", "text": "Smith", "context": "Smith, 2020.", "attrs": {}},
343
  {"element": "date", "text": "2020", "context": "Smith, 2020.", "attrs": {}},
344
  ])
345
+ result = evaluate_element(root, schema, endpoint, gliner_model=None)
346
  assert result.micro_f1 == pytest.approx(1.0)
347
  assert result.micro_precision == 1.0
348
  assert result.micro_recall == 1.0
 
355
  endpoint = _mock_endpoint([
356
  {"element": "author", "text": "Smith", "context": "Smith, 2020.", "attrs": {}},
357
  ])
358
+ result = evaluate_element(root, schema, endpoint, gliner_model=None)
359
  assert result.micro_precision == 1.0
360
  assert result.micro_recall == pytest.approx(0.5)
361
 
 
368
  {"element": "author", "text": "Smith", "context": "Smith.", "attrs": {}},
369
  {"element": "date", "text": "Smith", "context": "Smith.", "attrs": {}},
370
  ])
371
+ result = evaluate_element(root, schema, endpoint, gliner_model=None)
372
  assert result.micro_recall == 1.0
373
  assert result.micro_precision == pytest.approx(0.5)
374
 
 
377
  root = _parse(gold_xml)
378
  schema = self._schema("author")
379
  endpoint = _mock_endpoint([])
380
+ result = evaluate_element(root, schema, endpoint, gliner_model=None)
381
  assert result.micro_precision == 0.0
382
  assert result.micro_recall == 0.0
383
  assert result.micro_f1 == 0.0
 
390
  endpoint = _mock_endpoint([
391
  {"element": "editor", "text": "Smith", "context": "Smith.", "attrs": {}},
392
  ])
393
+ result = evaluate_element(root, schema, endpoint, gliner_model=None)
394
  assert result.micro_f1 == 0.0
395
 
396
  def test_attributes_not_required_for_text_match(self):
 
402
  endpoint = _mock_endpoint([
403
  {"element": "title", "text": "My Title", "context": "My Title.", "attrs": {}},
404
  ])
405
+ result = evaluate_element(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.TEXT)
406
  assert result.micro_f1 == pytest.approx(1.0)
407
 
408
  def test_exact_match_mode(self):
 
414
  endpoint = _mock_endpoint([
415
  {"element": "author", "text": "Smith", "context": "Smith.", "attrs": {}},
416
  ])
417
+ result = evaluate_element(root, schema, endpoint, gliner_model=None, match_mode=MatchMode.EXACT)
418
  # Offsets align (same plain text) β†’ should match
419
  assert result.micro_f1 == pytest.approx(1.0)
420
 
 
424
  root = _parse(gold_xml)
425
  schema = self._schema("author")
426
  endpoint = _mock_endpoint([])
427
+ result = evaluate_element(root, schema, endpoint, gliner_model=None)
428
  # No gold spans, no pred spans β†’ vacuously perfect
429
  assert result.micro_tp == 0
430
  assert result.micro_fp == 0