Spaces:

wjbmattingly
/

scripture-detector

Running

William Mattingly

Add scripture detector app

a9a9428 4 days ago

12.2 kB

	"""TEI-XML export and import for Scripture Detector.

	Export schema
	─────────────
	TEI
	├── teiHeader / fileDesc, encodingDesc
	├── text / body / ab ← source text with inline <seg> for annotated spans
	└── standOff / listAnnotation ← one <annotation> per quote

	Import
	──────
	Reads a TEI file produced by this module and reconstructs source name,
	full text, and annotations (with character-offset spans).
	"""

	from __future__ import annotations

	import re
	from datetime import date

	from lxml import etree

	# Namespace URIs used by the TEI export/import code below.
	TEI_NS = "http://www.tei-c.org/ns/1.0"
	XML_NS = "http://www.w3.org/XML/1998/namespace"

	# "{uri}" prefixes, ready to be concatenated with a local element/attribute name.
	_T = "{" + TEI_NS + "}"  # TEI elements, e.g. f"{_T}seg"
	_X = "{" + XML_NS + "}"  # xml:* attributes, e.g. f"{_X}id"


	# ── helpers ───────────────────────────────────────────────────────────────────

	def _compute_segments(text: str, annotations: list[dict]) -> list[dict]:
	    """Split *text* into consecutive segments at annotation boundaries.

	    Every annotation that has both ``span_start`` and ``span_end`` set
	    contributes two cut points.  Annotations missing either offset are
	    ignored entirely.  Cut points are clamped to ``[0, len(text)]`` so that
	    out-of-range offsets never yield segments lying outside the text.

	    Returns a list of dicts, in text order, with keys:

	    * ``text``: the slice ``text[start:end]``
	    * ``start`` / ``end``: character offsets of the slice
	    * ``annotation_ids``: indices into *annotations* of every annotation
	      whose span fully covers the segment (empty for plain text)
	    """
	    text_len = len(text)

	    # Resolve the usable spans once; the same list drives both the boundary
	    # set and the coverage test, so the two can never disagree about which
	    # annotations count (a half-specified span used to raise TypeError).
	    spans = [
	        (idx, a["span_start"], a["span_end"])
	        for idx, a in enumerate(annotations)
	        if a.get("span_start") is not None and a.get("span_end") is not None
	    ]

	    boundaries: set[int] = {0, text_len}
	    for _, span_start, span_end in spans:
	        boundaries.add(min(max(span_start, 0), text_len))
	        boundaries.add(min(max(span_end, 0), text_len))

	    ordered = sorted(boundaries)
	    segments = []
	    for start, end in zip(ordered, ordered[1:]):
	        covering = [idx for idx, s, e in spans if s <= start and end <= e]
	        segments.append({
	            "text": text[start:end],
	            "start": start,
	            "end": end,
	            "annotation_ids": covering,
	        })
	    return segments


	def _ref_label(ref: str, book_names: dict[str, str]) -> str:
	    """Turn a reference code into a human-readable label.

	    'gen_1:5' → 'Genesis 1:5' and 'gen_1' → 'Genesis 1'.

	    The reference is stripped and lower-cased before matching.  The book
	    name is looked up in *book_names*; an unknown book code falls back to
	    the capitalised code.  Input that does not look like
	    ``<book>_<chapter>[:<verse>]`` is returned normalised but otherwise
	    unchanged.
	    """
	    ref = ref.strip().lower()
	    # One pattern covers both forms; the verse group is simply absent for
	    # chapter-only references.
	    m = re.fullmatch(r"([a-z0-9]+)_(\d+)(?::(\d+))?", ref)
	    if m is None:
	        return ref
	    book_code, chapter, verse = m.groups()
	    book = book_names.get(book_code, book_code.capitalize())
	    if verse is None:
	        return f"{book} {chapter}"
	    return f"{book} {chapter}:{verse}"


	# ── export ────────────────────────────────────────────────────────────────────

	def source_to_tei(
	    source: dict,
	    annotations: list[dict],
	    book_names: dict[str, str] | None = None,
	) -> bytes:
	    """
	    Serialise source + annotations as UTF-8 TEI XML bytes.

	    source: dict; only the keys ``name`` (document title) and ``text``
	        (full source text) are read here
	    annotations: list of dicts with keys id, span_start, span_end,
	        quote_text, quote_type, refs.  A missing or empty ``quote_type``
	        is exported as "allusion", both on the inline <seg> and on the
	        stand-off <annotation>.
	    book_names: {book_code: human_name} — used for human-readable <ref> labels

	    Returns the pretty-printed document, including the XML declaration.
	    """
	    book_names = book_names or {}

	    root = etree.Element(f"{_T}TEI", nsmap={None: TEI_NS})
	    _append_tei_header(root, source["name"])
	    _append_tei_text(root, source["text"], annotations)
	    _append_tei_standoff(root, annotations, book_names)

	    # NOTE(review): with pretty_print=True the serialiser may insert
	    # indentation whitespace inside <ab> when it holds only <seg> children
	    # (i.e. no plain text at all); such whitespace would be read back as
	    # text on import — confirm with a fully-annotated source.
	    return etree.tostring(
	        root,
	        pretty_print=True,
	        xml_declaration=True,
	        encoding="UTF-8",
	    )


	def _quote_type_of(annotation: dict) -> str:
	    """Return the annotation's quote type, defaulting to "allusion"."""
	    return annotation.get("quote_type") or "allusion"


	def _append_tei_header(root, title: str) -> None:
	    """Append the <teiHeader> (fileDesc + encodingDesc) to *root*."""
	    header = etree.SubElement(root, f"{_T}teiHeader")

	    # ── fileDesc ─────────────────────────────────────────────────────────
	    file_desc = etree.SubElement(header, f"{_T}fileDesc")
	    title_stmt = etree.SubElement(file_desc, f"{_T}titleStmt")
	    etree.SubElement(title_stmt, f"{_T}title").text = title
	    resp_stmt = etree.SubElement(title_stmt, f"{_T}respStmt")
	    etree.SubElement(resp_stmt, f"{_T}resp").text = "Analyzed by"
	    etree.SubElement(resp_stmt, f"{_T}name").text = (
	        "Scripture Detector (Dr. William J.B. Mattingly, Yale University)"
	    )

	    pub_stmt = etree.SubElement(file_desc, f"{_T}publicationStmt")
	    etree.SubElement(pub_stmt, f"{_T}p").text = (
	        f"Exported from Scripture Detector on {date.today().isoformat()}. "
	        "Scripture Detector is developed by Dr. William J.B. Mattingly, "
	        "Cultural Heritage Data Scientist, Yale University."
	    )

	    src_desc = etree.SubElement(file_desc, f"{_T}sourceDesc")
	    etree.SubElement(src_desc, f"{_T}p").text = (
	        "AI-assisted detection of biblical quotations, paraphrases, and allusions."
	    )

	    # ── encodingDesc ─────────────────────────────────────────────────────
	    enc_desc = etree.SubElement(header, f"{_T}encodingDesc")
	    proj_desc = etree.SubElement(enc_desc, f"{_T}projectDesc")
	    etree.SubElement(proj_desc, f"{_T}p").text = (
	        "Scripture Detector uses Google Gemini to identify and classify biblical "
	        "references in historical texts. Reference types follow a four-level taxonomy."
	    )

	    class_decl = etree.SubElement(enc_desc, f"{_T}classDecl")
	    taxonomy = etree.SubElement(class_decl, f"{_T}taxonomy")
	    taxonomy.set(f"{_X}id", "sd-types")
	    # The category ids are the targets of the "#sd-<quote_type>" pointers
	    # written on each stand-off <annotation>.
	    for cat_id, desc in [
	        ("sd-full", "Full quotation: verbatim or near-verbatim citation of a biblical verse"),
	        ("sd-partial", "Partial quotation: a recognisable portion of a verse"),
	        ("sd-paraphrase", "Paraphrase: biblical content restated in different words"),
	        ("sd-allusion", "Allusion: brief thematic or verbal echo of a scriptural passage"),
	    ]:
	        cat = etree.SubElement(taxonomy, f"{_T}category")
	        cat.set(f"{_X}id", cat_id)
	        etree.SubElement(cat, f"{_T}catDesc").text = desc


	def _append_tei_text(root, text: str, annotations: list[dict]) -> None:
	    """Append <text>/<body>/<ab> with one inline <seg> per annotated span."""
	    text_el = etree.SubElement(root, f"{_T}text")
	    body = etree.SubElement(text_el, f"{_T}body")
	    ab = etree.SubElement(body, f"{_T}ab")
	    ab.set(f"{_X}id", "source-text")

	    last_el = None  # most-recently appended <seg>, if any
	    for seg in _compute_segments(text, annotations):
	        covering = [annotations[i] for i in seg["annotation_ids"]]

	        if not covering:
	            # Plain text lives in <ab>.text before the first <seg>, and in
	            # the .tail of the preceding <seg> afterwards.
	            if last_el is None:
	                ab.text = (ab.text or "") + seg["text"]
	            else:
	                last_el.tail = (last_el.tail or "") + seg["text"]
	            continue

	        # A segment covered by annotations of differing types is "mixed".
	        subtypes = {_quote_type_of(a) for a in covering}
	        subtype = subtypes.pop() if len(subtypes) == 1 else "mixed"

	        seg_el = etree.SubElement(ab, f"{_T}seg")
	        seg_el.set(f"{_X}id", f"seg{seg['start']}x{seg['end']}")
	        seg_el.set("ana", " ".join(f"#ann{a['id']}" for a in covering))
	        seg_el.set("type", "biblical-reference")
	        seg_el.set("subtype", subtype)
	        seg_el.text = seg["text"]
	        last_el = seg_el


	def _append_tei_standoff(root, annotations: list[dict], book_names: dict[str, str]) -> None:
	    """Append <standOff>/<listAnnotation> with one <annotation> per quote."""
	    stand_off = etree.SubElement(root, f"{_T}standOff")
	    list_ann = etree.SubElement(stand_off, f"{_T}listAnnotation")

	    for a in annotations:
	        quote_type = _quote_type_of(a)

	        ann_el = etree.SubElement(list_ann, f"{_T}annotation")
	        ann_el.set(f"{_X}id", f"ann{a['id']}")
	        ann_el.set("type", "biblical-reference")
	        ann_el.set("subtype", quote_type)
	        ann_el.set("ana", f"#sd-{quote_type}")

	        note_el = etree.SubElement(ann_el, f"{_T}note")
	        note_el.set("type", "quotedText")
	        note_el.text = a.get("quote_text", "")

	        refs_el = etree.SubElement(ann_el, f"{_T}listRef")
	        for ref in a.get("refs") or []:
	            ref_clean = ref.strip().lower()
	            ref_el = etree.SubElement(refs_el, f"{_T}ref")
	            ref_el.set("target", f"bible:{ref_clean}")
	            ref_el.text = _ref_label(ref_clean, book_names)


	# ── import ────────────────────────────────────────────────────────────────────

	def tei_to_source_data(xml_bytes: bytes) -> dict:
	    """
	    Parse a TEI file produced by :func:`source_to_tei`.

	    The source text is rebuilt by concatenating all text found inside the
	    first <ab> under <body> (or <body> itself when there is no <ab>).  Each
	    annotation's character span is the envelope (smallest start, largest
	    end) of every <seg> whose ``ana`` attribute points at the annotation.

	    Returns a dict::

	        {
	            "name": str,
	            "text": str,
	            "annotations": [
	                {
	                    "quote_text": str,
	                    "quote_type": str,
	                    "refs": [str, ...],
	                    "span_start": int | None,
	                    "span_end": int | None,
	                },
	                ...
	            ]
	        }

	    ``span_start``/``span_end`` are None when no <seg> references the
	    annotation.  Malformed XML raises whatever ``etree.fromstring`` raises.
	    """
	    # The bytes may come from outside the application, so parse with entity
	    # resolution and network access switched off.
	    parser = etree.XMLParser(resolve_entities=False, no_network=True)
	    root = etree.fromstring(xml_bytes, parser=parser)

	    # ── source name ──────────────────────────────────────────────────────────
	    title_el = root.find(f".//{_T}teiHeader//{_T}titleStmt/{_T}title")
	    name = "Untitled"
	    if title_el is not None:
	        # A missing, empty, or whitespace-only title all fall back to "Untitled".
	        name = (title_el.text or "").strip() or "Untitled"

	    # ── reconstruct plain text + per-annotation <seg> offsets ────────────────
	    ab = root.find(f".//{_T}body//{_T}ab")
	    if ab is None:
	        ab = root.find(f".//{_T}body")

	    seg_tag = f"{_T}seg"
	    text_parts: list[str] = []
	    # Maps an ``ana`` token (e.g. "#ann3") → [(start_char, end_char), ...]
	    # for every <seg> carrying that token, offsets relative to the joined text.
	    spans_by_ana: dict[str, list[tuple[int, int]]] = {}

	    def _walk(el: etree._Element, pos: int) -> int:
	        """Collect text below *el*; *pos* is the running offset. Returns the new offset."""
	        if el.text:
	            text_parts.append(el.text)
	            pos += len(el.text)
	        for child in el:
	            # Comments, processing instructions and entity nodes have a
	            # non-string tag; their own content is not document text, but
	            # the tail that follows them is.
	            if isinstance(child.tag, str):
	                child_start = pos
	                pos = _walk(child, pos)
	                if child.tag == seg_tag:
	                    for token in child.get("ana", "").split():
	                        spans_by_ana.setdefault(token, []).append((child_start, pos))
	            if child.tail:
	                text_parts.append(child.tail)
	                pos += len(child.tail)
	        return pos

	    if ab is not None:
	        _walk(ab, 0)

	    full_text = "".join(text_parts)

	    # ── parse standOff annotations ────────────────────────────────────────────
	    annotations: list[dict] = []

	    for ann_el in root.findall(f".//{_T}standOff//{_T}annotation"):
	        ann_xml_id = ann_el.get(f"{_X}id", "")
	        subtype = ann_el.get("subtype", "allusion")

	        note_el = ann_el.find(f"{_T}note[@type='quotedText']")
	        quote_text = (note_el.text or "").strip() if note_el is not None else ""

	        # Only targets in the "bible:" scheme are kept, with the scheme removed.
	        refs: list[str] = []
	        for ref_el in ann_el.findall(f".//{_T}ref"):
	            target = ref_el.get("target", "")
	            if target.startswith("bible:"):
	                refs.append(target[6:])

	        # An annotation may be split over several <seg>s (overlaps); its span
	        # is the envelope of all of them.
	        span_start = span_end = None
	        seg_offsets = spans_by_ana.get(f"#{ann_xml_id}") if ann_xml_id else None
	        if seg_offsets:
	            span_start = min(s for s, _ in seg_offsets)
	            span_end = max(e for _, e in seg_offsets)

	        annotations.append({
	            "quote_text": quote_text,
	            "quote_type": subtype,
	            "refs": refs,
	            "span_start": span_start,
	            "span_end": span_end,
	        })

	    return {"name": name, "text": full_text, "annotations": annotations}