Spaces:

cmboulanger
/

tei-annotator

Runtime error

App Files Files Community

tei-annotator / tests /integration /test_pipeline_e2e.py

cmboulanger

full implementation

37eaffd about 2 months ago

raw

history blame contribute delete

14.6 kB

	"""
	End-to-end integration tests: real GLiNER model + mocked call_fn.

	Tests that only use mocked call_fn (gliner_model=None) are also here because
	they exercise the full pipeline with non-trivial context resolution scenarios.

	Run with: pytest -m integration
	"""

	from __future__ import annotations

	import json
	import re

	import pytest

	# Apply the "integration" marker to every test in this module, so the whole
	# file is selected by `pytest -m integration`.
	pytestmark = pytest.mark.integration

	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------


	def _strip_tags(xml: str) -> str:
	return re.sub(r"<[^>]+>", "", xml)


	def _schema(*tags: tuple[str, str]):
	    """Assemble a TEISchema from ``(tag, description)`` pairs.

	    A ``persName`` element is given a ``ref`` attribute and a ``cert``
	    attribute (allowed values ``high`` / ``low``); every other tag is created
	    with an empty attribute list.
	    """
	    from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema

	    def _attributes_for(tag_name):
	        # Only persName carries attributes in these tests.
	        if tag_name != "persName":
	            return []
	        return [
	            TEIAttribute(name="ref", description="URI reference"),
	            TEIAttribute(name="cert", description="certainty", allowed_values=["high", "low"]),
	        ]

	    return TEISchema(
	        elements=[
	            TEIElement(tag=tag_name, description=description, attributes=_attributes_for(tag_name))
	            for tag_name, description in tags
	        ]
	    )


	def _endpoint(call_fn, capability="json_enforced"):
	    """Wrap *call_fn* in an EndpointConfig for the named capability.

	    *capability* is ``"json_enforced"`` or ``"text_generation"``; any other
	    value raises ``KeyError`` from the lookup below.
	    """
	    from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig

	    capabilities_by_name = {
	        "json_enforced": EndpointCapability.JSON_ENFORCED,
	        "text_generation": EndpointCapability.TEXT_GENERATION,
	    }
	    return EndpointConfig(capability=capabilities_by_name[capability], call_fn=call_fn)


	def _annotate(text, schema, call_fn, capability="json_enforced", gliner_model=None, **kw):
	    """Call the pipeline's ``annotate`` on *text* with an endpoint built from *call_fn*.

	    Extra keyword arguments in *kw* are forwarded to ``annotate`` unchanged.
	    """
	    from tei_annotator.pipeline import annotate

	    endpoint_config = _endpoint(call_fn, capability)
	    return annotate(
	        text=text,
	        schema=schema,
	        endpoint=endpoint_config,
	        gliner_model=gliner_model,
	        **kw,
	    )


	# ---------------------------------------------------------------------------
	# 1. Exact context longer than span text
	# ---------------------------------------------------------------------------


	def test_context_longer_than_span_text():
	    """Spans whose context window is wider than their text must still be annotated."""
	    text = "The treaty was signed by Cardinal Richelieu in Paris."
	    tei_schema = _schema(("persName", "a person's name"), ("placeName", "a place name"))
	    llm_spans = [
	        {
	            "element": "persName",
	            "text": "Cardinal Richelieu",
	            "context": "was signed by Cardinal Richelieu in Paris",
	            "attrs": {},
	        },
	        {
	            "element": "placeName",
	            "text": "Paris",
	            "context": "Cardinal Richelieu in Paris.",
	            "attrs": {},
	        },
	    ]

	    def fake_llm(_):
	        # The prompt is ignored; the canned spans are always returned.
	        return json.dumps(llm_spans)

	    xml = _annotate(text, tei_schema, fake_llm).xml
	    for wrapped in (
	        "<persName>Cardinal Richelieu</persName>",
	        "<placeName>Paris</placeName>",
	    ):
	        assert wrapped in xml
	    # Removing the tags must give back the untouched input.
	    assert _strip_tags(xml) == text


	# ---------------------------------------------------------------------------
	# 2. Same span text appears twice — context disambiguates
	# ---------------------------------------------------------------------------


	def test_multiple_occurrences_disambiguated_by_context():
	    """
	    'John Smith' appears twice. LLM returns two spans with distinct contexts
	    pointing at each occurrence. Both must be annotated at the correct offset.
	    """
	    source = "John Smith arrived early. Later, John Smith left."
	    schema = _schema(("persName", "a person's name"))

	    def call_fn(_):
	        # Same span text twice; only the context tells the occurrences apart.
	        return json.dumps([
	            {
	                "element": "persName",
	                "text": "John Smith",
	                "context": "John Smith arrived early.",
	                "attrs": {},
	            },
	            {
	                "element": "persName",
	                "text": "John Smith",
	                "context": "Later, John Smith left.",
	                "attrs": {},
	            },
	        ])

	    result = _annotate(source, schema, call_fn)
	    assert result.xml.count("<persName>") == 2
	    assert result.xml.count("John Smith") == 2
	    # The two counts above would also hold if both spans were resolved to the
	    # SAME occurrence (two nested tags around one name, the other left bare).
	    # Require each occurrence to be wrapped on its own.
	    assert result.xml.count("<persName>John Smith</persName>") == 2
	    assert _strip_tags(result.xml) == source


	# ---------------------------------------------------------------------------
	# 3. Long text requiring chunking — global offset calculation
	# ---------------------------------------------------------------------------


	def test_long_text_entity_in_second_chunk():
	    """
	    Entity is far into a long text; its LLM context is relative to a later chunk.
	    Offset must be shifted by chunk.start_offset to land at the correct global position.
	    """
	    chunk_size = 1500
	    # Build a ~2500-char text: the filler sentence is 57 chars, so 43 copies
	    # give 2451 chars and the entity sits well past the first chunk.
	    # (25 copies would only give 1425 chars; with the 46-char target sentence
	    # the whole text would fit inside a single 1500-char chunk.)
	    filler = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 43
	    target_sentence = "Napoleon Bonaparte was exiled to Saint Helena."
	    source = filler + target_sentence
	    # Guard the premise of the test: the target must start beyond the first chunk.
	    assert len(filler) > chunk_size

	    schema = _schema(
	        ("persName", "a person's name"),
	        ("placeName", "a place name"),
	    )

	    def call_fn(prompt):
	        # The LLM sees either a filler chunk (returns []) or the chunk containing
	        # the target sentence and returns the two spans.
	        if "Napoleon" not in prompt:
	            return "[]"
	        return json.dumps([
	            {
	                "element": "persName",
	                "text": "Napoleon Bonaparte",
	                "context": "Napoleon Bonaparte was exiled to Saint Helena.",
	                "attrs": {},
	            },
	            {
	                "element": "placeName",
	                "text": "Saint Helena",
	                "context": "was exiled to Saint Helena.",
	                "attrs": {},
	            },
	        ])

	    result = _annotate(source, schema, call_fn, chunk_size=chunk_size, chunk_overlap=200)

	    assert "<persName>Napoleon Bonaparte</persName>" in result.xml
	    assert "<placeName>Saint Helena</placeName>" in result.xml
	    assert _strip_tags(result.xml) == source

	    # Verify the annotated positions are truly within the target sentence:
	    # everything before the name in the output is at least the full filler.
	    napoleon_start = result.xml.index("Napoleon Bonaparte")
	    assert napoleon_start >= len(filler), (
	        f"Napoleon offset {napoleon_start} is too early — chunk offset was not applied"
	    )


	# ---------------------------------------------------------------------------
	# 4. Nested spans resolved end-to-end
	# ---------------------------------------------------------------------------


	def test_nested_spans_end_to_end():
	    """
	    The mocked LLM reports a persName plus the forename and surname inside it.
	    In the output, the forename and surname elements must lie within the
	    persName element.
	    """
	    text = "He met John Smith today."
	    tei_schema = _schema(
	        ("persName", "a person's full name"),
	        ("forename", "a forename"),
	        ("surname", "a surname"),
	    )
	    llm_spans = [
	        {"element": "persName", "text": "John Smith", "context": "met John Smith today.", "attrs": {}},
	        {"element": "forename", "text": "John", "context": "met John Smith today.", "attrs": {}},
	        {"element": "surname", "text": "Smith", "context": "John Smith today.", "attrs": {}},
	    ]

	    def fake_llm(_):
	        return json.dumps(llm_spans)

	    xml = _annotate(text, tei_schema, fake_llm).xml

	    for opening_tag in ("<persName>", "<forename>", "<surname>"):
	        assert opening_tag in xml
	    # The inner elements must open after and close before the outer one.
	    outer_open = xml.index("<persName>")
	    outer_close = xml.index("</persName>")
	    inner_first_open = xml.index("<forename>")
	    inner_last_close = xml.index("</surname>")
	    assert outer_open < inner_first_open < inner_last_close < outer_close
	    assert _strip_tags(xml) == text


	# ---------------------------------------------------------------------------
	# 5. Pre-existing XML preserved after annotation
	# ---------------------------------------------------------------------------


	def test_preexisting_xml_preserved():
	    """
	    The input already contains <note> markup. The output must keep those
	    tags and add the new persName annotation without altering the text.
	    """
	    marked_up = "He met <note>allegedly</note> John Smith yesterday."
	    tei_schema = _schema(("persName", "a person's name"))
	    # The context is written against the tag-free text:
	    # "He met allegedly John Smith yesterday."
	    llm_spans = [
	        {
	            "element": "persName",
	            "text": "John Smith",
	            "context": "allegedly John Smith yesterday.",
	            "attrs": {},
	        }
	    ]

	    def fake_llm(_):
	        return json.dumps(llm_spans)

	    xml = _annotate(marked_up, tei_schema, fake_llm).xml

	    for fragment in ("<note>", "</note>", "<persName>John Smith</persName>"):
	        assert fragment in xml
	    # With all tags removed, output and input must read the same.
	    assert _strip_tags(xml) == _strip_tags(marked_up)


	# ---------------------------------------------------------------------------
	# 6. Attributes preserved end-to-end
	# ---------------------------------------------------------------------------


	def test_attributes_preserved_end_to_end():
	    """Attribute values supplied by the mocked LLM must show up unchanged in the output."""
	    text = "The emperor Napoleon was defeated at Waterloo."
	    tei_schema = _schema(("persName", "a person's name"))
	    llm_spans = [
	        {
	            "element": "persName",
	            "text": "Napoleon",
	            "context": "emperor Napoleon was defeated",
	            "attrs": {"ref": "http://viaf.org/viaf/106964661", "cert": "high"},
	        }
	    ]

	    def fake_llm(_):
	        return json.dumps(llm_spans)

	    xml = _annotate(text, tei_schema, fake_llm).xml
	    for rendered_attr in ('ref="http://viaf.org/viaf/106964661"', 'cert="high"'):
	        assert rendered_attr in xml
	    assert _strip_tags(xml) == text


	# ---------------------------------------------------------------------------
	# 7. Hallucinated context → span silently rejected
	# ---------------------------------------------------------------------------


	def test_hallucinated_context_span_rejected():
	    """
	    The mocked LLM supplies a context that does not occur in the source.
	    No persName tag may be emitted and the output must equal the input.
	    """
	    text = "Marie Curie discovered polonium."
	    tei_schema = _schema(("persName", "a person's name"))
	    llm_spans = [
	        {
	            "element": "persName",
	            "text": "Marie Curie",
	            "context": "Dr. Marie Curie discovered polonium",  # "Dr. " not in source
	            "attrs": {},
	        }
	    ]

	    def fake_llm(_):
	        return json.dumps(llm_spans)

	    xml = _annotate(text, tei_schema, fake_llm).xml
	    assert "<persName>" not in xml
	    assert xml == text


	# ---------------------------------------------------------------------------
	# 8. Fuzzy context match → span annotated and flagged
	# ---------------------------------------------------------------------------


	def test_fuzzy_context_match_flags_span():
	    """
	    The context has one character missing compared with the source. The span
	    must still be annotated, and it must be reported in ``fuzzy_spans``.
	    """
	    text = "Galileo Galilei observed the moons of Jupiter."
	    tei_schema = _schema(("persName", "a person's name"))
	    llm_spans = [
	        {
	            "element": "persName",
	            "text": "Galileo Galilei",
	            # "observd" instead of "observed": a near miss, not an exact match.
	            "context": "Galileo Galilei observd the moons of Jupiter.",
	            "attrs": {},
	        }
	    ]

	    def fake_llm(_):
	        return json.dumps(llm_spans)

	    outcome = _annotate(text, tei_schema, fake_llm)
	    # Annotated despite the typo ...
	    assert "<persName>Galileo Galilei</persName>" in outcome.xml
	    # ... and listed exactly once among the fuzzy spans.
	    assert len(outcome.fuzzy_spans) == 1
	    only_fuzzy = outcome.fuzzy_spans[0]
	    assert only_fuzzy.element == "persName"
	    assert _strip_tags(outcome.xml) == text


	# ---------------------------------------------------------------------------
	# 9. Source text never modified (plain-text invariant)
	# ---------------------------------------------------------------------------


	def test_plain_text_invariant_with_multiple_entities():
	    """With five annotated entities, the tag-free output must equal the input exactly."""
	    text = (
	        "Leonardo da Vinci was born in Vinci, Tuscany, "
	        "and later worked in Milan and Florence."
	    )
	    tei_schema = _schema(
	        ("persName", "a person's name"),
	        ("placeName", "a place name"),
	    )
	    # (element, span text, context) for each entity the mocked LLM reports.
	    entities = (
	        ("persName", "Leonardo da Vinci", "Leonardo da Vinci was born in Vinci"),
	        ("placeName", "Vinci", "born in Vinci, Tuscany"),
	        ("placeName", "Tuscany", "Vinci, Tuscany, and later"),
	        ("placeName", "Milan", "later worked in Milan and Florence"),
	        ("placeName", "Florence", "in Milan and Florence."),
	    )
	    llm_spans = [
	        {"element": element, "text": span_text, "context": context, "attrs": {}}
	        for element, span_text, context in entities
	    ]

	    def fake_llm(_):
	        return json.dumps(llm_spans)

	    xml = _annotate(text, tei_schema, fake_llm).xml
	    assert _strip_tags(xml) == text
	    assert xml.count("<placeName>") == 4
	    assert "<persName>Leonardo da Vinci</persName>" in xml


	# ---------------------------------------------------------------------------
	# 10. Real GLiNER model (requires HuggingFace download)
	# ---------------------------------------------------------------------------


	def test_pipeline_with_real_gliner():
	    """Run the pipeline with the ``numind/NuNER_Zero`` GLiNER model and a mocked LLM."""
	    tei_schema = _schema(("persName", "a person's name"))
	    llm_spans = [
	        {
	            "element": "persName",
	            "text": "Albert Einstein",
	            "context": "Albert Einstein was born",
	            "attrs": {},
	        }
	    ]

	    def mock_llm(_: str) -> str:
	        return json.dumps(llm_spans)

	    xml = _annotate(
	        "Albert Einstein was born in Ulm in 1879.",
	        tei_schema,
	        mock_llm,
	        gliner_model="numind/NuNER_Zero",
	    ).xml
	    for expected in ("persName", "Albert Einstein"):
	        assert expected in xml
	    # The name must not be duplicated in the output.
	    assert xml.count("Albert Einstein") == 1