Spaces:
Running
Running
| """TEI-XML export and import for Scripture Detector. | |
| Export schema | |
| ───────────── | |
| TEI | |
| ├── teiHeader / fileDesc, encodingDesc | |
| ├── text / body / ab — source text with inline <seg> for annotated spans | |
| └── standOff / listAnnotation — one <annotation> per quote | |
| Import | |
| ────── | |
| Reads a TEI file produced by this module and reconstructs source name, | |
| full text, and annotations (with character-offset spans). | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from datetime import date | |
| from lxml import etree | |
| TEI_NS = "http://www.tei-c.org/ns/1.0" | |
| XML_NS = "http://www.w3.org/XML/1998/namespace" | |
| _T = f"{{{TEI_NS}}}" # prefix shortcut | |
| _X = f"{{{XML_NS}}}" # xml: namespace prefix | |
| # ── helpers ─────────────────────────────────────────────────────────────────── | |
| def _compute_segments(text: str, annotations: list[dict]) -> list[dict]: | |
| """Split *text* at annotation boundaries (same logic as app.compute_segments).""" | |
| boundaries: set[int] = {0, len(text)} | |
| for a in annotations: | |
| if a.get("span_start") is not None and a.get("span_end") is not None: | |
| boundaries.add(a["span_start"]) | |
| boundaries.add(a["span_end"]) | |
| ordered = sorted(boundaries) | |
| segments = [] | |
| for i in range(len(ordered) - 1): | |
| start, end = ordered[i], ordered[i + 1] | |
| ann_ids = [ | |
| j for j, a in enumerate(annotations) | |
| if a.get("span_start") is not None | |
| and a["span_start"] <= start and end <= a["span_end"] | |
| ] | |
| segments.append({"text": text[start:end], "start": start, "end": end, | |
| "annotation_ids": ann_ids}) | |
| return segments | |
| def _ref_label(ref: str, book_names: dict[str, str]) -> str: | |
| """'gen_1:5' β 'Genesis 1:5'""" | |
| ref = ref.strip().lower() | |
| m = re.match(r"^([a-z0-9]+)_(\d+):(\d+)$", ref) | |
| if m: | |
| book_code, ch, vs = m.groups() | |
| book = book_names.get(book_code, book_code.capitalize()) | |
| return f"{book} {ch}:{vs}" | |
| m2 = re.match(r"^([a-z0-9]+)_(\d+)$", ref) | |
| if m2: | |
| book_code, ch = m2.groups() | |
| book = book_names.get(book_code, book_code.capitalize()) | |
| return f"{book} {ch}" | |
| return ref | |
| # ── export ──────────────────────────────────────────────────────────────────── | |
def _annotation_quote_type(annotation: dict) -> str:
    """Return *annotation*'s quote type, falling back to ``"allusion"``.

    A missing, ``None`` or empty ``quote_type`` all map to the fallback so
    the inline ``<seg>`` and the stand-off ``<annotation>`` always agree.
    """
    return annotation.get("quote_type") or "allusion"


def _append_tei_header(root: etree._Element, title: str) -> None:
    """Append ``<teiHeader>`` (fileDesc + encodingDesc) to *root*."""
    header = etree.SubElement(root, f"{_T}teiHeader")

    # fileDesc: title, responsibility, publication and source statements.
    file_desc = etree.SubElement(header, f"{_T}fileDesc")
    title_stmt = etree.SubElement(file_desc, f"{_T}titleStmt")
    title_el = etree.SubElement(title_stmt, f"{_T}title")
    title_el.text = title
    resp_stmt = etree.SubElement(title_stmt, f"{_T}respStmt")
    resp_el = etree.SubElement(resp_stmt, f"{_T}resp")
    resp_el.text = "Analyzed by"
    resp_name = etree.SubElement(resp_stmt, f"{_T}name")
    resp_name.text = "Scripture Detector (Dr. William J.B. Mattingly, Yale University)"

    pub_stmt = etree.SubElement(file_desc, f"{_T}publicationStmt")
    pub_p = etree.SubElement(pub_stmt, f"{_T}p")
    pub_p.text = (
        f"Exported from Scripture Detector on {date.today().isoformat()}. "
        "Scripture Detector is developed by Dr. William J.B. Mattingly, "
        "Cultural Heritage Data Scientist, Yale University."
    )

    source_desc = etree.SubElement(file_desc, f"{_T}sourceDesc")
    source_p = etree.SubElement(source_desc, f"{_T}p")
    source_p.text = "AI-assisted detection of biblical quotations, paraphrases, and allusions."

    # encodingDesc: project description and the quote-type taxonomy that the
    # stand-off annotations point at through their ``ana`` attribute.
    enc_desc = etree.SubElement(header, f"{_T}encodingDesc")
    project_desc = etree.SubElement(enc_desc, f"{_T}projectDesc")
    project_p = etree.SubElement(project_desc, f"{_T}p")
    project_p.text = (
        "Scripture Detector uses Google Gemini to identify and classify biblical "
        "references in historical texts. Reference types follow a four-level taxonomy."
    )

    class_decl = etree.SubElement(enc_desc, f"{_T}classDecl")
    taxonomy = etree.SubElement(class_decl, f"{_T}taxonomy")
    taxonomy.set(f"{_X}id", "sd-types")
    for cat_id, desc in [
        ("sd-full", "Full quotation: verbatim or near-verbatim citation of a biblical verse"),
        ("sd-partial", "Partial quotation: a recognisable portion of a verse"),
        ("sd-paraphrase", "Paraphrase: biblical content restated in different words"),
        ("sd-allusion", "Allusion: brief thematic or verbal echo of a scriptural passage"),
    ]:
        category = etree.SubElement(taxonomy, f"{_T}category")
        category.set(f"{_X}id", cat_id)
        cat_desc = etree.SubElement(category, f"{_T}catDesc")
        cat_desc.text = desc


def _append_tei_body(root: etree._Element, text: str, annotations: list[dict]) -> None:
    """Append ``<text>/<body>/<ab>`` holding *text* with inline ``<seg>`` spans."""
    text_el = etree.SubElement(root, f"{_T}text")
    body = etree.SubElement(text_el, f"{_T}body")
    ab = etree.SubElement(body, f"{_T}ab")
    ab.set(f"{_X}id", "source-text")

    # Always give <ab> a (possibly empty) leading text node. The pretty
    # printer only leaves an element's children untouched when the element
    # has a text-node child; without one (text fully covered by annotations)
    # it would insert indentation between the <seg> elements, and the
    # importer would read that whitespace back as part of the source text.
    ab.text = ""

    last_el = None  # most recently appended <seg>; plain text goes on its tail
    for seg in _compute_segments(text, annotations):
        raw = seg["text"]
        ann_ids = seg["annotation_ids"]

        if not ann_ids:
            # Un-annotated text: extend <ab>.text or the previous <seg>'s tail.
            if last_el is None:
                ab.text = (ab.text or "") + raw
            else:
                last_el.tail = (last_el.tail or "") + raw
            continue

        ann_refs = " ".join(f"#ann{annotations[i]['id']}" for i in ann_ids)
        # Use the same fallback as the stand-off section so a missing
        # ``quote_type`` no longer raises KeyError here.
        subtypes = {_annotation_quote_type(annotations[i]) for i in ann_ids}
        subtype = next(iter(subtypes)) if len(subtypes) == 1 else "mixed"

        seg_el = etree.SubElement(ab, f"{_T}seg")
        seg_el.set(f"{_X}id", f"seg{seg['start']}x{seg['end']}")
        seg_el.set("ana", ann_refs)
        seg_el.set("type", "biblical-reference")
        seg_el.set("subtype", subtype)
        seg_el.text = raw
        last_el = seg_el


def _append_stand_off(
    root: etree._Element,
    annotations: list[dict],
    book_names: dict[str, str],
) -> None:
    """Append ``<standOff>/<listAnnotation>`` with one ``<annotation>`` per quote."""
    stand_off = etree.SubElement(root, f"{_T}standOff")
    list_ann = etree.SubElement(stand_off, f"{_T}listAnnotation")

    for ann in annotations:
        quote_type = _annotation_quote_type(ann)

        ann_el = etree.SubElement(list_ann, f"{_T}annotation")
        ann_el.set(f"{_X}id", f"ann{ann['id']}")
        ann_el.set("type", "biblical-reference")
        ann_el.set("subtype", quote_type)
        ann_el.set("ana", f"#sd-{quote_type}")

        note_el = etree.SubElement(ann_el, f"{_T}note")
        note_el.set("type", "quotedText")
        note_el.text = ann.get("quote_text", "")

        refs_el = etree.SubElement(ann_el, f"{_T}listRef")
        for ref in ann.get("refs") or []:
            ref_clean = ref.strip().lower()
            ref_el = etree.SubElement(refs_el, f"{_T}ref")
            ref_el.set("target", f"bible:{ref_clean}")
            ref_el.text = _ref_label(ref_clean, book_names)


def source_to_tei(
    source: dict,
    annotations: list[dict],
    book_names: dict[str, str] | None = None,
) -> bytes:
    """Serialise *source* + *annotations* as UTF-8 TEI XML bytes.

    Args:
        source: Dict with at least the keys ``name`` and ``text``.
        annotations: List of dicts with the keys ``id``, ``span_start``,
            ``span_end``, ``quote_text``, ``quote_type`` and ``refs``.
            ``id`` is required; a missing or empty ``quote_type`` is
            exported as ``"allusion"``; annotations without a complete span
            appear only in the stand-off section.
        book_names: ``{book_code: human_name}`` used for the human-readable
            ``<ref>`` labels. Defaults to an empty mapping.

    Returns:
        The pretty-printed document, including the XML declaration.

    Raises:
        KeyError: If ``source`` lacks ``name``/``text`` or an annotation
            lacks ``id``.
    """
    book_names = book_names or {}
    root = etree.Element(f"{_T}TEI", nsmap={None: TEI_NS})

    _append_tei_header(root, source["name"])
    _append_tei_body(root, source["text"], annotations)
    _append_stand_off(root, annotations, book_names)

    return etree.tostring(
        root,
        pretty_print=True,
        xml_declaration=True,
        encoding="UTF-8",
    )
| # ── import ──────────────────────────────────────────────────────────────────── | |
def tei_to_source_data(xml_bytes: bytes) -> dict:
    """Parse a TEI file produced by :func:`source_to_tei`.

    Args:
        xml_bytes: The serialised TEI document.

    Returns:
        A dict::

            {
                "name": str,
                "text": str,
                "annotations": [
                    {
                        "quote_text": str,
                        "quote_type": str,
                        "refs": [str, ...],
                        "span_start": int | None,
                        "span_end": int | None,
                    },
                    ...
                ]
            }

        ``span_start``/``span_end`` are character offsets into ``text``,
        taken from the ``<seg>`` elements whose ``ana`` attribute points at
        the annotation; they are ``None`` when no such ``<seg>`` exists.

    Raises:
        lxml.etree.XMLSyntaxError: If *xml_bytes* is not well-formed XML.
    """
    # Imported files are untrusted input: do not expand entities or fetch
    # anything over the network while parsing (XXE hardening).
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    root = etree.fromstring(xml_bytes, parser)

    # -- source name ---------------------------------------------------------
    title_el = root.find(f".//{_T}teiHeader//{_T}titleStmt/{_T}title")
    title_text = (title_el.text or "").strip() if title_el is not None else ""
    name = title_text or "Untitled"

    # -- reconstruct plain text + character spans of every <seg> -------------
    ab = root.find(f".//{_T}body//{_T}ab")
    if ab is None:
        ab = root.find(f".//{_T}body")

    text_parts: list[str] = []
    # Maps an ``ana`` pointer (e.g. "#ann3") to the (start, end) offsets of
    # every <seg> that carries it.
    spans_by_ref: dict[str, list[tuple[int, int]]] = {}
    seg_tag = f"{_T}seg"
    position = 0  # running length of the text collected so far

    def _walk(el: etree._Element) -> None:
        nonlocal position
        if el.text:
            text_parts.append(el.text)
            position += len(el.text)
        for child in el:
            # Comments, processing instructions and unexpanded entities have
            # a non-string tag; their own content is not source text, but the
            # tail that follows them is.
            if isinstance(child.tag, str):
                child_start = position
                _walk(child)
                if child.tag == seg_tag:
                    for pointer in child.get("ana", "").split():
                        spans_by_ref.setdefault(pointer, []).append(
                            (child_start, position)
                        )
            if child.tail:
                text_parts.append(child.tail)
                position += len(child.tail)

    if ab is not None:
        _walk(ab)
    full_text = "".join(text_parts)

    # -- parse standOff annotations ------------------------------------------
    annotations: list[dict] = []
    for ann_el in root.findall(f".//{_T}standOff//{_T}annotation"):
        ann_xml_id = ann_el.get(f"{_X}id", "")
        subtype = ann_el.get("subtype", "allusion")

        note_el = ann_el.find(f"{_T}note[@type='quotedText']")
        quote_text = (note_el.text or "").strip() if note_el is not None else ""

        refs: list[str] = []
        for ref_el in ann_el.findall(f".//{_T}ref"):
            target = ref_el.get("target", "")
            if target.startswith("bible:"):
                refs.append(target[6:])

        # An annotation may be split over several <seg> elements (overlaps);
        # its span is the envelope of all of them.
        span_start = span_end = None
        seg_offsets = spans_by_ref.get(f"#{ann_xml_id}", []) if ann_xml_id else []
        if seg_offsets:
            span_start = min(start for start, _ in seg_offsets)
            span_end = max(end for _, end in seg_offsets)

        annotations.append({
            "quote_text": quote_text,
            "quote_type": subtype,
            "refs": refs,
            "span_start": span_start,
            "span_end": span_end,
        })

    return {"name": name, "text": full_text, "annotations": annotations}