tei-annotator / tests /integration /test_pipeline_e2e.py
cmboulanger's picture
full implementation
37eaffd
"""
End-to-end integration tests: real GLiNER model + mocked call_fn.
Tests that only use mocked call_fn (gliner_model=None) are also here because
they exercise the full pipeline with non-trivial context resolution scenarios.
Run with: pytest -m integration
"""
from __future__ import annotations
import json
import re
import pytest
pytestmark = pytest.mark.integration
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _strip_tags(xml: str) -> str:
return re.sub(r"<[^>]+>", "", xml)
def _schema(*tags: tuple[str, str]):
"""Build a TEISchema from (tag, description) pairs."""
from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
elements = []
for tag, desc in tags:
if tag == "persName":
elements.append(
TEIElement(
tag="persName",
description=desc,
attributes=[
TEIAttribute(name="ref", description="URI reference"),
TEIAttribute(name="cert", description="certainty", allowed_values=["high", "low"]),
],
)
)
else:
elements.append(TEIElement(tag=tag, description=desc, attributes=[]))
return TEISchema(elements=elements)
def _endpoint(call_fn, capability="json_enforced"):
from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
cap = {
"json_enforced": EndpointCapability.JSON_ENFORCED,
"text_generation": EndpointCapability.TEXT_GENERATION,
}[capability]
return EndpointConfig(capability=cap, call_fn=call_fn)
def _annotate(text, schema, call_fn, capability="json_enforced", gliner_model=None, **kw):
from tei_annotator.pipeline import annotate
return annotate(
text=text,
schema=schema,
endpoint=_endpoint(call_fn, capability),
gliner_model=gliner_model,
**kw,
)
# ---------------------------------------------------------------------------
# 1. Exact context longer than span text
# ---------------------------------------------------------------------------
def test_context_longer_than_span_text():
"""Resolver must locate span.text inside a longer context window."""
source = "The treaty was signed by Cardinal Richelieu in Paris."
schema = _schema(("persName", "a person's name"), ("placeName", "a place name"))
def call_fn(_):
return json.dumps([
{
"element": "persName",
"text": "Cardinal Richelieu",
"context": "was signed by Cardinal Richelieu in Paris",
"attrs": {},
},
{
"element": "placeName",
"text": "Paris",
"context": "Cardinal Richelieu in Paris.",
"attrs": {},
},
])
result = _annotate(source, schema, call_fn)
assert "<persName>Cardinal Richelieu</persName>" in result.xml
assert "<placeName>Paris</placeName>" in result.xml
assert _strip_tags(result.xml) == source
# ---------------------------------------------------------------------------
# 2. Same span text appears twice — context disambiguates
# ---------------------------------------------------------------------------
def test_multiple_occurrences_disambiguated_by_context():
"""
'John Smith' appears twice. LLM returns two spans with distinct contexts
pointing at each occurrence. Both must be annotated at the correct offset.
"""
source = "John Smith arrived early. Later, John Smith left."
schema = _schema(("persName", "a person's name"))
def call_fn(_):
return json.dumps([
{
"element": "persName",
"text": "John Smith",
"context": "John Smith arrived early.",
"attrs": {},
},
{
"element": "persName",
"text": "John Smith",
"context": "Later, John Smith left.",
"attrs": {},
},
])
result = _annotate(source, schema, call_fn)
assert result.xml.count("<persName>") == 2
assert result.xml.count("John Smith") == 2
assert _strip_tags(result.xml) == source
# ---------------------------------------------------------------------------
# 3. Long text requiring chunking — global offset calculation
# ---------------------------------------------------------------------------
def test_long_text_entity_in_second_chunk():
"""
Entity is far into a long text; its LLM context is relative to a later chunk.
Offset must be shifted by chunk.start_offset to land at the correct global position.
"""
# Build a ~2500-char text; entity sits well past the first 1500-char chunk
filler = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 25 # ~1425 chars
target_sentence = "Napoleon Bonaparte was exiled to Saint Helena."
source = filler + target_sentence
schema = _schema(
("persName", "a person's name"),
("placeName", "a place name"),
)
def call_fn(prompt):
# The LLM sees either a filler chunk (returns []) or the chunk containing
# the target sentence and returns the two spans.
if "Napoleon" not in prompt:
return "[]"
return json.dumps([
{
"element": "persName",
"text": "Napoleon Bonaparte",
"context": "Napoleon Bonaparte was exiled to Saint Helena.",
"attrs": {},
},
{
"element": "placeName",
"text": "Saint Helena",
"context": "was exiled to Saint Helena.",
"attrs": {},
},
])
result = _annotate(source, schema, call_fn, chunk_size=1500, chunk_overlap=200)
assert "<persName>Napoleon Bonaparte</persName>" in result.xml
assert "<placeName>Saint Helena</placeName>" in result.xml
assert _strip_tags(result.xml) == source
# Verify the annotated positions are truly within the target sentence
napoleon_start = result.xml.index("Napoleon Bonaparte")
assert napoleon_start > 1400, (
f"Napoleon offset {napoleon_start} is too early — chunk offset was not applied"
)
# ---------------------------------------------------------------------------
# 4. Nested spans resolved end-to-end
# ---------------------------------------------------------------------------
def test_nested_spans_end_to_end():
"""
LLM emits an outer persName and inner forename / surname spans.
Both are resolved separately and then nested by the injector.
"""
source = "He met John Smith today."
schema = _schema(
("persName", "a person's full name"),
("forename", "a forename"),
("surname", "a surname"),
)
def call_fn(_):
return json.dumps([
{"element": "persName", "text": "John Smith", "context": "met John Smith today.", "attrs": {}},
{"element": "forename", "text": "John", "context": "met John Smith today.", "attrs": {}},
{"element": "surname", "text": "Smith", "context": "John Smith today.", "attrs": {}},
])
result = _annotate(source, schema, call_fn)
assert "<persName>" in result.xml
assert "<forename>" in result.xml
assert "<surname>" in result.xml
# forename and surname must be inside persName
p_open = result.xml.index("<persName>")
p_close = result.xml.index("</persName>")
fn_open = result.xml.index("<forename>")
sn_close = result.xml.index("</surname>")
assert p_open < fn_open < sn_close < p_close
assert _strip_tags(result.xml) == source
# ---------------------------------------------------------------------------
# 5. Pre-existing XML preserved after annotation
# ---------------------------------------------------------------------------
def test_preexisting_xml_preserved():
"""
Source already has markup (<note> tags). After annotation the original
markup must still be present alongside the new TEI annotations.
"""
source = "He met <note>allegedly</note> John Smith yesterday."
schema = _schema(("persName", "a person's name"))
def call_fn(_):
# The LLM sees stripped plain text: "He met allegedly John Smith yesterday."
return json.dumps([
{
"element": "persName",
"text": "John Smith",
"context": "allegedly John Smith yesterday.",
"attrs": {},
}
])
result = _annotate(source, schema, call_fn)
assert "<note>" in result.xml
assert "</note>" in result.xml
assert "<persName>John Smith</persName>" in result.xml
# Plain text must be unchanged
assert _strip_tags(result.xml) == _strip_tags(source)
# ---------------------------------------------------------------------------
# 6. Attributes preserved end-to-end
# ---------------------------------------------------------------------------
def test_attributes_preserved_end_to_end():
"""Attribute values returned by the LLM must appear verbatim in the output tag."""
source = "The emperor Napoleon was defeated at Waterloo."
schema = _schema(("persName", "a person's name"))
def call_fn(_):
return json.dumps([
{
"element": "persName",
"text": "Napoleon",
"context": "emperor Napoleon was defeated",
"attrs": {"ref": "http://viaf.org/viaf/106964661", "cert": "high"},
}
])
result = _annotate(source, schema, call_fn)
assert 'ref="http://viaf.org/viaf/106964661"' in result.xml
assert 'cert="high"' in result.xml
assert _strip_tags(result.xml) == source
# ---------------------------------------------------------------------------
# 7. Hallucinated context → span silently rejected
# ---------------------------------------------------------------------------
def test_hallucinated_context_span_rejected():
"""
LLM returns a plausible-looking but non-existent context.
The resolver must reject the span; the source text is returned unmodified.
"""
source = "Marie Curie discovered polonium."
schema = _schema(("persName", "a person's name"))
def call_fn(_):
return json.dumps([
{
"element": "persName",
"text": "Marie Curie",
"context": "Dr. Marie Curie discovered polonium", # "Dr. " not in source
"attrs": {},
}
])
result = _annotate(source, schema, call_fn)
assert "<persName>" not in result.xml
assert result.xml == source
# ---------------------------------------------------------------------------
# 8. Fuzzy context match → span annotated and flagged
# ---------------------------------------------------------------------------
def test_fuzzy_context_match_flags_span():
"""
A context with a single-character typo should still resolve via fuzzy
matching (score > 0.92) and be included with fuzzy_match=True.
"""
source = "Galileo Galilei observed the moons of Jupiter."
schema = _schema(("persName", "a person's name"))
def call_fn(_):
return json.dumps([
{
"element": "persName",
"text": "Galileo Galilei",
# One character different from the source — should trigger fuzzy
"context": "Galileo Galilei observd the moons of Jupiter.",
"attrs": {},
}
])
result = _annotate(source, schema, call_fn)
# The span should still be annotated
assert "<persName>Galileo Galilei</persName>" in result.xml
# And flagged as fuzzy
assert len(result.fuzzy_spans) == 1
assert result.fuzzy_spans[0].element == "persName"
assert _strip_tags(result.xml) == source
# ---------------------------------------------------------------------------
# 9. Source text never modified (plain-text invariant)
# ---------------------------------------------------------------------------
def test_plain_text_invariant_with_multiple_entities():
"""Stripping all tags from the output must yield exactly the input text."""
source = (
"Leonardo da Vinci was born in Vinci, Tuscany, "
"and later worked in Milan and Florence."
)
schema = _schema(
("persName", "a person's name"),
("placeName", "a place name"),
)
def call_fn(_):
return json.dumps([
{"element": "persName", "text": "Leonardo da Vinci",
"context": "Leonardo da Vinci was born in Vinci", "attrs": {}},
{"element": "placeName", "text": "Vinci",
"context": "born in Vinci, Tuscany", "attrs": {}},
{"element": "placeName", "text": "Tuscany",
"context": "Vinci, Tuscany, and later", "attrs": {}},
{"element": "placeName", "text": "Milan",
"context": "later worked in Milan and Florence", "attrs": {}},
{"element": "placeName", "text": "Florence",
"context": "in Milan and Florence.", "attrs": {}},
])
result = _annotate(source, schema, call_fn)
assert _strip_tags(result.xml) == source
assert result.xml.count("<placeName>") == 4
assert "<persName>Leonardo da Vinci</persName>" in result.xml
# ---------------------------------------------------------------------------
# 10. Real GLiNER model (requires HuggingFace download)
# ---------------------------------------------------------------------------
def test_pipeline_with_real_gliner():
"""Full pipeline: real GLiNER pre-detection + mocked LLM call_fn."""
schema = _schema(("persName", "a person's name"))
def mock_llm(_: str) -> str:
return json.dumps([
{
"element": "persName",
"text": "Albert Einstein",
"context": "Albert Einstein was born",
"attrs": {},
}
])
result = _annotate(
"Albert Einstein was born in Ulm in 1879.",
schema,
mock_llm,
gliner_model="numind/NuNER_Zero",
)
assert "persName" in result.xml
assert "Albert Einstein" in result.xml
assert result.xml.count("Albert Einstein") == 1