Spaces:

cmboulanger
/

tei-annotator

Runtime error

@@ -7,11 +7,17 @@ requires-python = ">=3.12"
 dependencies = [
     "jinja2>=3.1",
     "lxml>=5.0",
     "rapidfuzz>=3.0",
 ]
 [project.optional-dependencies]
 gliner = ["gliner>=0.2"]
 [tool.pytest.ini_options]
 addopts = "-m 'not integration'"

 dependencies = [
     "jinja2>=3.1",
     "lxml>=5.0",
+    "python-dotenv>=1.2.2",
     "rapidfuzz>=3.0",
 ]
 [project.optional-dependencies]
 gliner = ["gliner>=0.2"]
+webservice = [
+    "fastapi>=0.115",
+    "uvicorn[standard]>=0.30",
+    "python-multipart>=0.0.9",
+]
 [tool.pytest.ini_options]
 addopts = "-m 'not integration'"

scripts/evaluate_llm.py CHANGED Viewed

@@ -31,6 +31,8 @@ import urllib.error
 import urllib.request
 from pathlib import Path
 # ---------------------------------------------------------------------------
 # Paths
 # ---------------------------------------------------------------------------
@@ -43,27 +45,7 @@ GOLD_FILE = _REPO / "tests" / "fixtures" / "blbl-examples.tei.xml"
 # text characters, so this string is guaranteed to survive the annotation pass.
 _BATCH_SEP = "\n---RECORD|||SEP|||BOUNDARY---\n"
-# ---------------------------------------------------------------------------
-# .env loader (stdlib-only, no python-dotenv needed)
-# ---------------------------------------------------------------------------
-def _load_env(path: str = ".env") -> None:
-    try:
-        with open(path) as fh:
-            for line in fh:
-                line = line.strip()
-                if not line or line.startswith("#") or "=" not in line:
-                    continue
-                key, _, value = line.partition("=")
-                value = value.strip().strip('"').strip("'")
-                os.environ.setdefault(key.strip(), value)
-    except FileNotFoundError:
-        pass
-_load_env(_REPO / ".env")
 # ---------------------------------------------------------------------------
 # HTTP helper (stdlib urllib)
@@ -132,201 +114,10 @@ def make_kisski_call_fn(
 # ---------------------------------------------------------------------------
-# Schema — focused on the elements that appear in blbl-examples.tei.xml
 # ---------------------------------------------------------------------------
-def _build_schema():
-    from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
-    def attr(name: str, desc: str, allowed: list[str] | None = None) -> TEIAttribute:
-        return TEIAttribute(name=name, description=desc, allowed_values=allowed)
-    return TEISchema(
-        rules=[
-            "For each person's name, emit an 'author' or 'editor' span covering the full name "
-            "AND separate 'surname', 'forename', or 'orgName' spans for the individual name "
-            "parts within that span.",
-            "Never emit 'surname', 'forename', or 'orgName' without a corresponding enclosing "
-            "'author' or 'editor' span.",
-            "When an organisation acts as author or editor, emit BOTH an 'orgName' span AND an "
-            "enclosing 'author' (or 'editor') span. The 'author'/'editor' span MUST enclose the "
-            "'orgName' span — NEVER put an 'author' or 'editor' span inside an 'orgName' span.",
-            "CRITICAL: All name parts for all contiguous authors MUST always be placed inside a "
-            "SINGLE 'author' (or 'editor') span — conjunctions ('and', '&', 'et') and commas "
-            "between names do NOT create separate spans. Emit a new 'author' span only when "
-            "the authors are separated by a title, date, or other non-name bibliographic field.",
-            "In a bibliography, a dash or underscore may stand for a repeated author or editor "
-            "name — tag it as 'author' or 'editor' accordingly.",
-            "CRITICAL: When a parenthesised location appears immediately after a title "
-            "(e.g. 'Title (City, Region)'), end the 'title' span BEFORE the opening parenthesis "
-            "and emit a separate 'pubPlace' span covering only 'City, Region' (not the parentheses). "
-            "Never include a parenthesised location inside a 'title' span.",
-        ],
-        elements=[
-            TEIElement(
-                tag="label",
-                description=(
-                    "A numeric or alphanumeric reference label appearing at the very start of a "
-                    "bibliographic entry, before any author or title. Typical forms: a plain number "
-                    "('17'), a number with a trailing period ('17.'), a number in square brackets "
-                    "('[77]', '[ACL30]'), or a compound number ('5,6'). The separator that follows "
-                    "the label (period, dash, or space) is NOT part of the label. "
-                    "A label is always a number or short code — never a word or name. "
-                    "An ALL-CAPS word at the start of an entry is an author surname, not a label."
-                ),
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="author",
-                description=(
-                    "Name(s) of the author(s) of the cited work. "
-                    "Names appearing at the start of a bibliographic entry before the title and "
-                    "date are authors."
-                ),
-                allowed_children=['surname', 'forename', 'orgName'],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="editor",
-                description=(
-                    "Name of an editor of the cited work. "
-                    "An editor's name typically follows keywords such as 'in', 'ed.', 'éd.', "
-                    "'Hrsg.', 'dir.', '(ed.)', '(eds.)'. "
-                    "CRITICAL: A person's name (or surname alone) that follows 'in' is an editor — "
-                    "emit an 'editor' span (plus name-part spans), never a 'title' span."
-                ),
-                allowed_children=['surname', 'forename', 'orgName'],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="surname",
-                description="The inherited (family) name of a person.",
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="forename",
-                description="The given (first) name or initials of a person.",
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="orgName",
-                description=(
-                    "Name of an organisation that acts as author or editor. "
-                    "Do NOT emit an 'orgName' span inside a 'publisher' span — "
-                    "when an organisation is the publisher, use 'publisher' alone."
-                ),
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="title",
-                description=(
-                    "Title of the cited work. "
-                    "Do NOT split a title at an internal period or subtitle separator — "
-                    "e.g. 'Classical Literary Criticism. Oxford World Classics' is ONE title span; "
-                    "a city name embedded in a subtitle (e.g. 'Oxford' in 'Oxford World Classics') "
-                    "is NOT a pubPlace — do not interrupt the title span with a pubPlace span. "
-                    "CRITICAL: The title span ends BEFORE any parenthesised location — "
-                    "e.g. in 'Title (City, Region)', only 'Title' is the title span; "
-                    "'City, Region' is a separate pubPlace span. "
-                    "A journal or series title may appear after keywords such as 'in', 'dans', 'in:' — "
-                    "emit a 'title' span for it; do NOT tag it as 'note'."
-                ),
-                allowed_children=[],
-                attributes=[
-                    attr(
-                        "level",
-                        "Publication level: 'a'=article/chapter, 'm'=monograph/book, "
-                        "'j'=journal, 's'=series.",
-                        ["a", "m", "j", "s"],
-                    )
-                ],
-            ),
-            TEIElement(
-                tag="date",
-                description=(
-                    "Publication date or year. "
-                    "When two dates appear in sequence — e.g. '1989 [1972]' (reprint year and "
-                    "original year) — emit a SEPARATE 'date' span for each individual date."
-                ),
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="publisher",
-                description=(
-                    "Name of the publisher. "
-                    "When multiple publishers are connected by 'and', emit a SINGLE 'publisher' "
-                    "span covering the full text (e.g. 'Cambridge University Press and the Russell "
-                    "Sage Foundation' is one span). Do NOT nest 'orgName' inside 'publisher'."
-                ),
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="pubPlace",
-                description=(
-                    "Place of publication. "
-                    "CRITICAL: When a location appears in parentheses immediately after the title "
-                    "(e.g. 'Title (City, Region)'), the parenthesised location is the pubPlace — "
-                    "emit a 'pubPlace' span covering only 'City, Region' (without parentheses), "
-                    "and end the 'title' span BEFORE the opening parenthesis. "
-                    "Only tag a city name as pubPlace when it appears OUTSIDE and AFTER the title, "
-                    "typically before a colon and publisher name (e.g. 'Oxford: Oxford UP'). "
-                    "A city name that is part of a subtitle or series name within a title is NOT a pubPlace."
-                ),
-                allowed_children=[],
-                attributes=[],
-            ),
-            TEIElement(
-                tag="biblScope",
-                description=(
-                    "Scope reference within the cited item (page range, volume, issue). "
-                    "Emit a separate 'biblScope' span for volume and for issue. "
-                    "The span text contains ONLY the bare number — do not include labels "
-                    "('Vol.', 'No.', 'n°', 't.') or surrounding punctuation/parentheses. "
-                    "E.g. for 'Vol. 12(3)', emit '12' as unit='volume' and '3' as unit='issue'. "
-                    "E.g. for 'n°198', emit '198' as unit='volume'. "
-                    "Do NOT absorb a volume or issue number into a preceding title span."
-                ),
-                allowed_children=[],
-                attributes=[
-                    attr(
-                        "unit",
-                        "Unit of the scope reference.",
-                        ["page", "volume", "issue"],
-                    )
-                ],
-            ),
-            TEIElement(
-                tag="idno",
-                description="Bibliographic identifier such as DOI, ISBN, or ISSN.",
-                allowed_children=[],
-                attributes=[attr("type", "Identifier type, e.g. DOI, ISBN, ISSN.")],
-            ),
-            TEIElement(
-                tag="note",
-                description=(
-                    "Editorial note or annotation about the cited item. "
-                    "Institutional or series report designations — such as 'Amok Internal Report', "
-                    "'USGS Open-File Report 97-123', or 'Technical Report No. 5' — must be tagged "
-                    "as 'note' with type='report', NOT as 'orgName' or 'title'."
-                ),
-                allowed_children=[],
-                attributes=[attr("type", "Type of note, e.g. 'report'.")],
-            ),
-            TEIElement(
-                tag="ptr",
-                description="Pointer to an external resource such as a URL.",
-                allowed_children=[],
-                attributes=[attr("type", "Type of pointer, e.g. 'web'.")],
-            ),
-        ]
-    )
 # ---------------------------------------------------------------------------
@@ -443,6 +234,40 @@ def _evaluate_batch(
     return results
 # ---------------------------------------------------------------------------
 # Evaluation runner
 # ---------------------------------------------------------------------------
@@ -452,12 +277,10 @@ def run_evaluation(
     provider_name: str,
     call_fn,
     match_mode_str: str,
-    max_items: int | None,
     gliner_model: str | None = None,
     verbose: bool = False,
     output_file: Path | None = None,
-    grep: str | None = None,
-    inverse_grep: str | None = None,
     batch_size: int = 1,
 ) -> bool:
     """
@@ -483,8 +306,6 @@ def run_evaluation(
     except ImportError:
         _tqdm = None
-    _TEI_NS = "http://www.tei-c.org/ns/1.0"
     mode_map = {
         "text": MatchMode.TEXT,
         "exact": MatchMode.EXACT,
@@ -498,21 +319,6 @@ def run_evaluation(
     )
     schema = _build_schema()
-    # --- load gold records --------------------------------------------------
-    tree = etree.parse(str(GOLD_FILE))
-    containers = tree.findall(f".//{{{_TEI_NS}}}listBibl") or tree.findall(".//listBibl")
-    records: list[etree._Element] = []
-    for c in containers:
-        children = c.findall(f"{{{_TEI_NS}}}bibl") or c.findall("bibl")
-        records.extend(children)
-    if grep:
-        _grep_re = re.compile(grep)
-        records = [r for r in records if _grep_re.search("".join(r.itertext()))]
-    if inverse_grep:
-        _igrep_re = re.compile(inverse_grep)
-        records = [r for r in records if not _igrep_re.search("".join(r.itertext()))]
-    if max_items is not None:
-        records = records[:max_items]
     n_total = len(records)
     # --- output destination and progress display ----------------------------
@@ -747,6 +553,12 @@ def _parse_args() -> argparse.Namespace:
             "(e.g. --timeout 600 --batch-size 10 for KISSKI Llama)."
         ),
     )
     return p.parse_args()
@@ -783,18 +595,23 @@ def main() -> int:
     if args.output_file:
         Path(args.output_file).write_text("", encoding="utf-8")
     results: list[bool] = []
     for name, fn in providers:
         ok = run_evaluation(
             provider_name=name,
             call_fn=fn,
             match_mode_str=args.match_mode,
-            max_items=args.max_items,
             gliner_model=args.gliner_model,
             verbose=args.verbose,
             output_file=Path(args.output_file) if args.output_file else None,
-            grep=args.grep,
-            inverse_grep=args.inverse_grep,
             batch_size=args.batch_size,
         )
         results.append(ok)

 import urllib.request
 from pathlib import Path
+from dotenv import load_dotenv
 # ---------------------------------------------------------------------------
 # Paths
 # ---------------------------------------------------------------------------
 # text characters, so this string is guaranteed to survive the annotation pass.
 _BATCH_SEP = "\n---RECORD|||SEP|||BOUNDARY---\n"
+load_dotenv(_REPO / ".env")
 # ---------------------------------------------------------------------------
 # HTTP helper (stdlib urllib)
 # ---------------------------------------------------------------------------
+# Schema
 # ---------------------------------------------------------------------------
+from tei_annotator.schemas.blbl import build_blbl_schema as _build_schema  # noqa: E402
 # ---------------------------------------------------------------------------
     return results
+# ---------------------------------------------------------------------------
+# Record loader
+# ---------------------------------------------------------------------------
+def _load_records(
+    grep: str | None = None,
+    inverse_grep: str | None = None,
+    shuffle: bool = False,
+    max_items: int | None = None,
+):
+    """Return the ``bibl`` elements of GOLD_FILE, filtered, shuffled and truncated.
+    Direct ``bibl`` children of every ``listBibl`` are collected (TEI-namespaced
+    lookup first, un-namespaced as fallback).  ``grep`` keeps records whose
+    text matches the regex, ``inverse_grep`` drops those that match,
+    ``shuffle`` randomises the order, and ``max_items`` is applied last.
+    """
+    import random
+    from lxml import etree
+    tei_ns = "http://www.tei-c.org/ns/1.0"
+    document = etree.parse(str(GOLD_FILE))
+    list_nodes = document.findall(f".//{{{tei_ns}}}listBibl")
+    if not list_nodes:
+        list_nodes = document.findall(".//listBibl")
+    records = []
+    for list_node in list_nodes:
+        bibls = list_node.findall(f"{{{tei_ns}}}bibl")
+        if not bibls:
+            bibls = list_node.findall("bibl")
+        records += bibls
+    def _plain_text(record) -> str:
+        return "".join(record.itertext())
+    if grep:
+        keep = re.compile(grep)
+        records = [rec for rec in records if keep.search(_plain_text(rec))]
+    if inverse_grep:
+        drop = re.compile(inverse_grep)
+        records = [rec for rec in records if drop.search(_plain_text(rec)) is None]
+    if shuffle:
+        random.shuffle(records)
+    if max_items is not None:
+        records = records[:max_items]
+    return records
 # ---------------------------------------------------------------------------
 # Evaluation runner
 # ---------------------------------------------------------------------------
     provider_name: str,
     call_fn,
     match_mode_str: str,
+    records: list,
     gliner_model: str | None = None,
     verbose: bool = False,
     output_file: Path | None = None,
     batch_size: int = 1,
 ) -> bool:
     """
     except ImportError:
         _tqdm = None
     mode_map = {
         "text": MatchMode.TEXT,
         "exact": MatchMode.EXACT,
     )
     schema = _build_schema()
     n_total = len(records)
     # --- output destination and progress display ----------------------------
             "(e.g. --timeout 600 --batch-size 10 for KISSKI Llama)."
         ),
     )
+    p.add_argument(
+        "--shuffle",
+        action="store_true",
+        default=False,
+        help="Randomly shuffle the evaluation set before applying --max-items.",
+    )
     return p.parse_args()
     if args.output_file:
         Path(args.output_file).write_text("", encoding="utf-8")
+    records = _load_records(
+        grep=args.grep,
+        inverse_grep=args.inverse_grep,
+        shuffle=args.shuffle,
+        max_items=args.max_items,
+    )
     results: list[bool] = []
     for name, fn in providers:
         ok = run_evaluation(
             provider_name=name,
             call_fn=fn,
             match_mode_str=args.match_mode,
+            records=records,
             gliner_model=args.gliner_model,
             verbose=args.verbose,
             output_file=Path(args.output_file) if args.output_file else None,
             batch_size=args.batch_size,
         )
         results.append(ok)

scripts/smoke_test_llm.py CHANGED Viewed

@@ -23,26 +23,9 @@ import urllib.error
 import urllib.request
 from pathlib import Path
-# ---------------------------------------------------------------------------
-# .env loader (stdlib-only, no python-dotenv needed)
-# ---------------------------------------------------------------------------
-def _load_env(path: str = ".env") -> None:
-    try:
-        with open(path) as fh:
-            for line in fh:
-                line = line.strip()
-                if not line or line.startswith("#") or "=" not in line:
-                    continue
-                key, _, value = line.partition("=")
-                value = value.strip().strip('"').strip("'")
-                os.environ.setdefault(key.strip(), value)
-    except FileNotFoundError:
-        pass
-_load_env(Path(__file__).parent.parent / ".env")
 # ---------------------------------------------------------------------------
@@ -191,7 +174,7 @@ def run_smoke_test(provider_name: str, call_fn) -> bool:
     print(f"  ✓ PASSED")
     print(f"  Tags found : {', '.join(tags_found)}")
     if result.fuzzy_spans:
-        print(f"  Fuzzy spans: {[s.text for s in result.fuzzy_spans]}")
     print(f"  Output XML :")
     for line in textwrap.wrap(result.xml, width=72, subsequent_indent="    "):
         print(f"    {line}")

 import urllib.request
 from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv(Path(__file__).parent.parent / ".env")
 # ---------------------------------------------------------------------------
     print(f"  ✓ PASSED")
     print(f"  Tags found : {', '.join(tags_found)}")
     if result.fuzzy_spans:
+        print(f"  Fuzzy spans: {[TEST_TEXT[s.start:s.end] for s in result.fuzzy_spans]}")
     print(f"  Output XML :")
     for line in textwrap.wrap(result.xml, width=72, subsequent_indent="    "):
         print(f"    {line}")

scripts/smoke_test_webservice.py ADDED Viewed

	@@ -0,0 +1,237 @@

+#!/usr/bin/env python
+"""
+Smoke test for the TEI Annotator webservice.
+Assumes a locally running instance.  Start it first:
+    cd webservice && uvicorn main:app --reload
+The base URL is derived from HOST and PORT in webservice/.env.
+If that file does not exist, the script exits with instructions.
+Usage:
+    python scripts/smoke_test_webservice.py [--base-url URL]
+    uv run scripts/smoke_test_webservice.py
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+# ---------------------------------------------------------------------------
+# Resolve base URL from webservice/.env
+# ---------------------------------------------------------------------------
+_REPO = Path(__file__).parent.parent  # repository root (this script lives one directory below it)
+_ENV_FILE = _REPO / "webservice" / ".env"  # read for HOST and PORT when building the default base URL
+_ENV_TEMPLATE = _REPO / "webservice" / ".env.template"  # only named in the "file not found" instructions
+def _load_webservice_env() -> dict[str, str]:
+    """Parse webservice/.env and return key→value pairs (no shell expansion).
+    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.  A
+    leading ``export `` on the key and a trailing ``# comment`` after an
+    unquoted value are dropped; surrounding quote characters are stripped.
+    Prints setup instructions to stderr and exits with status 1 when the file
+    does not exist.
+    """
+    if not _ENV_FILE.exists():
+        print(
+            f"ERROR: {_ENV_FILE} not found.\n"
+            f"       Create it from the template first:\n"
+            f"         cp {_ENV_TEMPLATE} {_ENV_FILE}\n"
+            f"       Then fill in at least one API key.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    env: dict[str, str] = {}
+    for line in _ENV_FILE.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, _, value = line.partition("=")
+        # Accept the shell-style "export KEY=value" form.
+        key = key.strip().removeprefix("export ").strip()
+        value = value.strip()
+        if not value.startswith(('"', "'")):
+            # Unquoted value: "PORT=8000  # dev" must yield "8000"; otherwise
+            # the comment text would end up inside the base URL.
+            value = re.sub(r"\s+#.*", "", value)
+        env[key] = value.strip('"').strip("'")
+    return env
+def _default_base_url() -> str:
+    """Build ``http://HOST:PORT`` from webservice/.env.
+    Missing or empty HOST / PORT fall back to ``localhost`` / ``8000``.
+    Wildcard bind addresses are replaced by ``localhost`` and bare IPv6
+    literals are wrapped in brackets so the result is a valid URL.
+    """
+    env = _load_webservice_env()
+    host = env.get("HOST") or "localhost"
+    if host in ("0.0.0.0", "::", "[::]"):
+        # "Listen on all interfaces" is a bind address, not a connect target.
+        host = "localhost"
+    elif ":" in host and not host.startswith("["):
+        # An IPv6 literal must be bracketed to keep it apart from the port.
+        host = f"[{host}]"
+    port = env.get("PORT") or "8000"
+    return f"http://{host}:{port}"
+# Sample passage sent to /api/annotate; the test compares the tag-stripped
+# response against it verbatim.
+_TEST_TEXT = (
+    "Marie Curie was born in Warsaw, Poland, and later conducted her research "
+    "in Paris, France. Together with her husband Pierre Curie, she discovered "
+    "polonium and radium."
+)
+# Two-element schema posted together with _TEST_TEXT (person and place names,
+# no extra rules).
+_MINIMAL_SCHEMA = {
+    "elements": [
+        {"tag": "persName", "description": "a person's name"},
+        {"tag": "placeName", "description": "a geographical place name"},
+    ],
+    "rules": [],
+}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _get(url: str) -> tuple[int, str]:
+    """GET *url* and return ``(status, body_text)``.
+    HTTP error responses are returned as ``(code, body_text)`` rather than
+    raised.  If the server cannot be reached, a hint is printed to stderr and
+    the process exits with status 2.
+    """
+    try:
+        with urllib.request.urlopen(url, timeout=30) as response:
+            code = response.status
+            text = response.read().decode(errors="replace")
+        return code, text
+    except urllib.error.HTTPError as http_err:
+        return http_err.code, http_err.read().decode(errors="replace")
+    except urllib.error.URLError as url_err:
+        message = (
+            f"\nERROR: Cannot reach the webservice at {url}\n"
+            f"       {url_err.reason}\n"
+            f"       Is the server running?  Start it with:\n"
+            f"         cd webservice && uvicorn main:app --reload"
+        )
+        print(message, file=sys.stderr)
+        sys.exit(2)
+def _post_json(url: str, payload: dict) -> tuple[int, dict | str]:
+    """POST *payload* as JSON to *url* and return ``(status, body)``.
+    ``body`` is the decoded JSON document, or the raw response text when the
+    server answers with an HTTP error or with something that is not JSON.
+    If the server cannot be reached, a hint is printed to stderr and the
+    process exits with status 2.  A timeout or dropped connection while
+    waiting for the response is reported as status ``0`` with a description,
+    so the calling check fails instead of the script crashing.
+    """
+    body = json.dumps(payload).encode()
+    req = urllib.request.Request(
+        url, data=body, headers={"Content-Type": "application/json"}, method="POST"
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            status = resp.status
+            raw = resp.read()
+    except urllib.error.HTTPError as exc:
+        return exc.code, exc.read().decode(errors="replace")
+    except urllib.error.URLError as exc:
+        print(
+            f"\nERROR: Cannot reach the webservice at {url}\n"
+            f"       {exc.reason}\n"
+            f"       Is the server running?  Start it with:\n"
+            f"         cd webservice && uvicorn main:app --reload",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+    except OSError as exc:
+        # urlopen only wraps errors from sending the request in URLError; a
+        # read timeout or reset while awaiting the reply arrives here unwrapped.
+        return 0, f"{type(exc).__name__}: {exc}"
+    try:
+        return status, json.loads(raw)
+    except ValueError:
+        # Not JSON (or not decodable): hand back the text so the caller can
+        # show it in its failure detail.
+        return status, raw.decode(errors="replace")
+def _check(name: str, ok: bool, detail: str = "") -> bool:
+    """Print one ``[PASS]``/``[FAIL]`` line for *name* and return *ok* unchanged.
+    A non-empty *detail* is printed on an indented second line.
+    """
+    label = "FAIL"
+    if ok:
+        label = "PASS"
+    lines = [f"  [{label}] {name}"]
+    if detail:
+        lines.append(f"         {detail}")
+    print("\n".join(lines))
+    return ok
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+def test_html_ui(base_url: str) -> bool:
+    """Check that the root page answers 200 and contains a ``<form`` tag."""
+    code, page = _get(f"{base_url}/")
+    has_form = code == 200 and "<form" in page
+    return _check("GET / returns HTTP 200 with HTML form", has_form, f"status={code}")
+def test_api_annotate(base_url: str) -> bool:
+    """Annotate _TEST_TEXT with _MINIMAL_SCHEMA and validate the response.
+    Stops after the status check when the call does not return 200; otherwise
+    verifies that ``xml`` is non-empty, that stripping its tags gives back the
+    input text, and that ``fuzzy_spans`` is a list.
+    """
+    status, data = _post_json(
+        f"{base_url}/api/annotate",
+        {"text": _TEST_TEXT, "schema": _MINIMAL_SCHEMA},
+    )
+    got_200 = _check(
+        "POST /api/annotate returns HTTP 200",
+        status == 200,
+        f"status={status}  body={str(data)[:200]}",
+    )
+    if not got_200:
+        return False
+    # Treat any non-object body as an empty object so the lookups stay uniform.
+    fields = data if isinstance(data, dict) else {}
+    xml = fields.get("xml", "")
+    plain = re.sub(r"<[^>]+>", "", xml)
+    outcomes = [
+        _check("Response xml is non-empty", bool(xml.strip())),
+        _check(
+            "Plain text preserved after stripping tags",
+            plain == _TEST_TEXT,
+            f"expected: {_TEST_TEXT!r}\n         got:      {plain!r}",
+        ),
+        _check(
+            "Response contains fuzzy_spans list",
+            isinstance(fields.get("fuzzy_spans"), list),
+        ),
+    ]
+    return all(outcomes)
+def test_api_no_schema(base_url: str) -> bool:
+    """Omitting schema should fall back to the built-in BLBL schema."""
+    citation = "Curie, Marie. 1898. Sur une nouvelle substance radioactive. Paris."
+    status, data = _post_json(f"{base_url}/api/annotate", {"text": citation})
+    succeeded = status == 200 and isinstance(data, dict) and bool(data.get("xml"))
+    return _check(
+        "POST /api/annotate without schema uses BLBL default (HTTP 200)",
+        succeeded,
+        f"status={status}",
+    )
+def test_api_unknown_provider(base_url: str) -> bool:
+    """Check that an unrecognised ``provider`` value is rejected with HTTP 400."""
+    request_body = {"text": "hello", "provider": "nonexistent"}
+    status, _ = _post_json(f"{base_url}/api/annotate", request_body)
+    rejected = status == 400
+    return _check(
+        "POST /api/annotate with unknown provider returns 400",
+        rejected,
+        f"status={status}",
+    )
+def test_openapi_docs(base_url: str) -> bool:
+    """Check that the OpenAPI UI at ``/docs`` answers with HTTP 200."""
+    status, _ = _get(f"{base_url}/docs")
+    # Report the status like the other checks so a failure is diagnosable.
+    return _check(
+        "GET /docs (OpenAPI UI) returns HTTP 200",
+        status == 200,
+        f"status={status}",
+    )
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> int:
+    """Run every smoke check against the server and return the exit code.
+    Returns 0 when all checks pass and 1 otherwise.  webservice/.env is only
+    consulted when ``--base-url`` is not given, so an explicit URL (and
+    ``--help``) work without that file.
+    """
+    parser = argparse.ArgumentParser(description="Smoke-test the TEI Annotator webservice.")
+    parser.add_argument(
+        "--base-url",
+        default=None,
+        help="Base URL of the running server (default: derived from HOST and PORT in webservice/.env)",
+    )
+    args = parser.parse_args()
+    # Resolve the default lazily: _default_base_url() exits when the .env file
+    # is missing, which must not block a run with an explicit --base-url.
+    base_url = args.base_url if args.base_url is not None else _default_base_url()
+    base = base_url.rstrip("/")
+    print(f"Smoke-testing {base}\n")
+    # Every check runs regardless of earlier failures so the report is complete.
+    results = [
+        test_html_ui(base),
+        test_openapi_docs(base),
+        test_api_unknown_provider(base),
+        test_api_no_schema(base),
+        test_api_annotate(base),
+    ]
+    passed = sum(results)
+    total = len(results)
+    print(f"\n{'═' * 50}")
+    print(f"  {passed}/{total} checks passed")
+    print(f"{'═' * 50}")
+    return 0 if all(results) else 1
+if __name__ == "__main__":
+    sys.exit(main())

tei_annotator/schemas/__init__.py ADDED Viewed

File without changes

tei_annotator/schemas/blbl.py ADDED Viewed

	@@ -0,0 +1,201 @@

+"""
+BLBL bibliographic schema for TEI annotation.
+This schema covers the elements that appear in blbl-examples.tei.xml and is
+shared between the evaluation script and the webservice.
+"""
+from __future__ import annotations
+def build_blbl_schema():
+    from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
+    def attr(name: str, desc: str, allowed: list[str] | None = None) -> TEIAttribute:
+        return TEIAttribute(name=name, description=desc, allowed_values=allowed)
+    return TEISchema(
+        rules=[
+            "For each person's name, emit an 'author' or 'editor' span covering the full name "
+            "AND separate 'surname', 'forename', or 'orgName' spans for the individual name "
+            "parts within that span.",
+            "Never emit 'surname', 'forename', or 'orgName' without a corresponding enclosing "
+            "'author' or 'editor' span.",
+            "When an organisation acts as author or editor, emit BOTH an 'orgName' span AND an "
+            "enclosing 'author' (or 'editor') span. The 'author'/'editor' span MUST enclose the "
+            "'orgName' span — NEVER put an 'author' or 'editor' span inside an 'orgName' span.",
+            "CRITICAL: All name parts for all contiguous authors MUST always be placed inside a "
+            "SINGLE 'author' (or 'editor') span — conjunctions ('and', '&', 'et') and commas "
+            "between names do NOT create separate spans. Emit a new 'author' span only when "
+            "the authors are separated by a title, date, or other non-name bibliographic field.",
+            "In a bibliography, a dash or underscore may stand for a repeated author or editor "
+            "name — tag it as 'author' or 'editor' accordingly.",
+            "CRITICAL: When a parenthesised location appears immediately after a title "
+            "(e.g. 'Title (City, Region)'), end the 'title' span BEFORE the opening parenthesis "
+            "and emit a separate 'pubPlace' span covering only 'City, Region' (not the parentheses). "
+            "Never include a parenthesised location inside a 'title' span.",
+        ],
+        elements=[
+            TEIElement(
+                tag="label",
+                description=(
+                    "A numeric or alphanumeric reference label appearing at the very start of a "
+                    "bibliographic entry, before any author or title. Typical forms: a plain number "
+                    "('17'), a number with a trailing period ('17.'), a number in square brackets "
+                    "('[77]', '[ACL30]'), or a compound number ('5,6'). The separator that follows "
+                    "the label (period, dash, or space) is NOT part of the label. "
+                    "A label is always a number or short code — never a word or name. "
+                    "An ALL-CAPS word at the start of an entry is an author surname, not a label."
+                ),
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="author",
+                description=(
+                    "Name(s) of the author(s) of the cited work. "
+                    "Names appearing at the start of a bibliographic entry before the title and "
+                    "date are authors."
+                ),
+                allowed_children=['surname', 'forename', 'orgName'],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="editor",
+                description=(
+                    "Name of an editor of the cited work. "
+                    "An editor's name typically follows keywords such as 'in', 'ed.', 'éd.', "
+                    "'Hrsg.', 'dir.', '(ed.)', '(eds.)'. "
+                    "CRITICAL: A person's name (or surname alone) that follows 'in' is an editor — "
+                    "emit an 'editor' span (plus name-part spans), never a 'title' span."
+                ),
+                allowed_children=['surname', 'forename', 'orgName'],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="surname",
+                description="The inherited (family) name of a person.",
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="forename",
+                description="The given (first) name or initials of a person.",
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="orgName",
+                description=(
+                    "Name of an organisation that acts as author or editor. "
+                    "Do NOT emit an 'orgName' span inside a 'publisher' span — "
+                    "when an organisation is the publisher, use 'publisher' alone."
+                ),
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="title",
+                description=(
+                    "Title of the cited work. "
+                    "Do NOT split a title at an internal period or subtitle separator — "
+                    "e.g. 'Classical Literary Criticism. Oxford World Classics' is ONE title span; "
+                    "a city name embedded in a subtitle (e.g. 'Oxford' in 'Oxford World Classics') "
+                    "is NOT a pubPlace — do not interrupt the title span with a pubPlace span. "
+                    "CRITICAL: The title span ends BEFORE any parenthesised location — "
+                    "e.g. in 'Title (City, Region)', only 'Title' is the title span; "
+                    "'City, Region' is a separate pubPlace span. "
+                    "A journal or series title may appear after keywords such as 'in', 'dans', 'in:' — "
+                    "emit a 'title' span for it; do NOT tag it as 'note'."
+                ),
+                allowed_children=[],
+                attributes=[
+                    attr(
+                        "level",
+                        "Publication level: 'a'=article/chapter, 'm'=monograph/book, "
+                        "'j'=journal, 's'=series.",
+                        ["a", "m", "j", "s"],
+                    )
+                ],
+            ),
+            TEIElement(
+                tag="date",
+                description=(
+                    "Publication date or year. "
+                    "When two dates appear in sequence — e.g. '1989 [1972]' (reprint year and "
+                    "original year) — emit a SEPARATE 'date' span for each individual date."
+                ),
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="publisher",
+                description=(
+                    "Name of the publisher. "
+                    "When multiple publishers are connected by 'and', emit a SINGLE 'publisher' "
+                    "span covering the full text (e.g. 'Cambridge University Press and the Russell "
+                    "Sage Foundation' is one span). Do NOT nest 'orgName' inside 'publisher'."
+                ),
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="pubPlace",
+                description=(
+                    "Place of publication. "
+                    "CRITICAL: When a location appears in parentheses immediately after the title "
+                    "(e.g. 'Title (City, Region)'), the parenthesised location is the pubPlace — "
+                    "emit a 'pubPlace' span covering only 'City, Region' (without parentheses), "
+                    "and end the 'title' span BEFORE the opening parenthesis. "
+                    "Only tag a city name as pubPlace when it appears OUTSIDE and AFTER the title, "
+                    "typically before a colon and publisher name (e.g. 'Oxford: Oxford UP'). "
+                    "A city name that is part of a subtitle or series name within a title is NOT a pubPlace."
+                ),
+                allowed_children=[],
+                attributes=[],
+            ),
+            TEIElement(
+                tag="biblScope",
+                description=(
+                    "Scope reference within the cited item (page range, volume, issue). "
+                    "Emit a separate 'biblScope' span for volume and for issue. "
+                    "The span text contains ONLY the bare number — do not include labels "
+                    "('Vol.', 'No.', 'n°', 't.') or surrounding punctuation/parentheses. "
+                    "E.g. for 'Vol. 12(3)', emit '12' as unit='volume' and '3' as unit='issue'. "
+                    "E.g. for 'n°198', emit '198' as unit='volume'. "
+                    "Do NOT absorb a volume or issue number into a preceding title span."
+                ),
+                allowed_children=[],
+                attributes=[
+                    attr(
+                        "unit",
+                        "Unit of the scope reference.",
+                        ["page", "volume", "issue"],
+                    )
+                ],
+            ),
+            TEIElement(
+                tag="idno",
+                description="Bibliographic identifier such as DOI, ISBN, or ISSN.",
+                allowed_children=[],
+                attributes=[attr("type", "Identifier type, e.g. DOI, ISBN, ISSN.")],
+            ),
+            TEIElement(
+                tag="note",
+                description=(
+                    "Editorial note or annotation about the cited item. "
+                    "Institutional or series report designations — such as 'Amok Internal Report', "
+                    "'USGS Open-File Report 97-123', or 'Technical Report No. 5' — must be tagged "
+                    "as 'note' with type='report', NOT as 'orgName' or 'title'."
+                ),
+                allowed_children=[],
+                attributes=[attr("type", "Type of note, e.g. 'report'.")],
+            ),
+            TEIElement(
+                tag="ptr",
+                description="Pointer to an external resource such as a URL.",
+                allowed_children=[],
+                attributes=[attr("type", "Type of pointer, e.g. 'web'.")],
+            ),
+        ]
+    )

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

webservice/.env.template ADDED Viewed

	@@ -0,0 +1,39 @@

+# ============================================================
+# TEI Annotator webservice — environment configuration
+# ============================================================
+# Copy this file to .env and fill in at least one API key.
+# The server will advertise only the providers whose key is set.
+# ============================================================
+# ------------------------------------------------------------
+# LLM provider keys  (at least one required)
+# ------------------------------------------------------------
+# Google Gemini (https://aistudio.google.com/app/apikey)
+GEMINI_API_KEY=
+# KISSKI OpenAI-compatible endpoint (https://kisski.gwdg.de)
+KISSKI_API_KEY=
+# ------------------------------------------------------------
+# Default provider used by the HTML UI and the JSON API when
+# the caller does not specify one.  Must match a key that is set.
+# Allowed values: gemini | kisski
+# ------------------------------------------------------------
+DEFAULT_PROVIDER=gemini
+# ------------------------------------------------------------
+# Optional: GLiNER model for a pre-detection pass before the LLM.
+# Leave empty to disable.  Requires the [gliner] extra:
+#   uv pip install -e ".[gliner,webservice]"
+# Example:
+#   GLINER_MODEL=numind/NuNER_Zero
+# ------------------------------------------------------------
+GLINER_MODEL=
+# ------------------------------------------------------------
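+# Server bind settings (used only when running via `python main.py`).
+# When using uvicorn directly, pass --host / --port on the CLI instead.
+# NOTE: main.py falls back to port 8000 when PORT is unset, and the Dockerfile
+# exposes/publishes 8000 — keep PORT in sync with whatever port you publish.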
+# ------------------------------------------------------------
+HOST=0.0.0.0
+PORT=8099

webservice/Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+# ============================================================
+# TEI Annotator webservice
+# ============================================================
+# Build (run from the repository root — the build context must contain
+# pyproject.toml, because the image installs the package from the copied tree):
+#   docker build -f webservice/Dockerfile -t tei-annotator-webservice .
+#
+# Run (with an .env file in the webservice/ directory):
+#   docker run --env-file webservice/.env -p 8000:8000 tei-annotator-webservice
+#   NOTE: the server binds to $PORT (default 8000). If the .env file sets a
+#   different PORT, publish that port instead (e.g. -p 8099:8099).
+#
+# Run (passing individual env vars):
+#   docker run -e GEMINI_API_KEY=... -p 8000:8000 tei-annotator-webservice
+# ============================================================
+FROM python:3.12-slim
+# Install the package with webservice extras
+WORKDIR /app
+COPY . .
+RUN pip install --no-cache-dir -e ".[webservice]"
+# Working directory for the server so that relative paths
+# (templates/, .env) resolve correctly
+WORKDIR /app/webservice
+# Documents the default port only; the actual bind port comes from $PORT.
+EXPOSE 8000
+CMD ["python", "main.py"]

webservice/main.py ADDED Viewed

	@@ -0,0 +1,329 @@

+"""
+TEI Annotator webservice.
+Start with:
+    python main.py           # reads HOST/PORT from .env
+    python main.py --reload  # development mode with auto-reload
+Do NOT start with `uvicorn main:app` directly: uvicorn parses its --port from
+the CLI before importing this module, so load_dotenv() would run too late to
+affect the port binding.
+Configuration is read from a .env file in this directory (or from environment
+variables).  Copy .env.template to .env and fill in at least one API key.
+"""
+from __future__ import annotations
+import json
+import os
+import urllib.error
+import urllib.request
+from pathlib import Path
+from dotenv import load_dotenv
+from fastapi import FastAPI, Form, HTTPException, Request
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+from pydantic import BaseModel, Field
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+load_dotenv(Path(__file__).parent / ".env")
+_GEMINI_KEY = os.environ.get("GEMINI_API_KEY", "")
+_KISSKI_KEY = os.environ.get("KISSKI_API_KEY", "")
+_DEFAULT_PROVIDER = os.environ.get("DEFAULT_PROVIDER", "gemini")
+_GLINER_MODEL = os.environ.get("GLINER_MODEL", "") or None
+_AVAILABLE_PROVIDERS: list[str] = []
+if _GEMINI_KEY:
+    _AVAILABLE_PROVIDERS.append("gemini")
+if _KISSKI_KEY:
+    _AVAILABLE_PROVIDERS.append("kisski")
+# ---------------------------------------------------------------------------
+# HTTP helper
+# ---------------------------------------------------------------------------
+def _post_json(url: str, payload: dict, headers: dict, timeout: int = 120) -> dict:
+    body = json.dumps(payload).encode()
+    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read())
+    except urllib.error.HTTPError as exc:
+        detail = exc.read().decode(errors="replace")
+        raise RuntimeError(f"HTTP {exc.code}: {detail}") from exc
+# ---------------------------------------------------------------------------
+# Provider call_fn factories
+# ---------------------------------------------------------------------------
+def _make_gemini_call_fn(api_key: str, model: str = "gemini-2.0-flash", timeout: int = 120):
+    url = (
+        f"https://generativelanguage.googleapis.com/v1beta/models"
+        f"/{model}:generateContent?key={api_key}"
+    )
+    def call_fn(prompt: str) -> str:
+        payload = {
+            "contents": [{"parts": [{"text": prompt}]}],
+            "generationConfig": {"temperature": 0.1},
+        }
+        result = _post_json(url, payload, {"Content-Type": "application/json"}, timeout)
+        return result["candidates"][0]["content"]["parts"][0]["text"]
+    call_fn.__name__ = f"gemini/{model}"
+    return call_fn
+def _make_kisski_call_fn(
+    api_key: str,
+    base_url: str = "https://chat-ai.academiccloud.de/v1",
+    model: str = "llama-3.3-70b-instruct",
+    timeout: int = 120,
+):
+    url = f"{base_url}/chat/completions"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    def call_fn(prompt: str) -> str:
+        payload = {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.1,
+        }
+        result = _post_json(url, payload, headers, timeout)
+        return result["choices"][0]["message"]["content"]
+    call_fn.__name__ = f"kisski/{model}"
+    return call_fn
+def _get_call_fn(provider: str):
+    if provider == "gemini":
+        if not _GEMINI_KEY:
+            raise HTTPException(status_code=503, detail="GEMINI_API_KEY not configured")
+        return _make_gemini_call_fn(_GEMINI_KEY)
+    if provider == "kisski":
+        if not _KISSKI_KEY:
+            raise HTTPException(status_code=503, detail="KISSKI_API_KEY not configured")
+        return _make_kisski_call_fn(_KISSKI_KEY)
+    raise HTTPException(status_code=400, detail=f"Unknown provider: {provider!r}")
+# ---------------------------------------------------------------------------
+# Schema helpers
+# ---------------------------------------------------------------------------
+def _schema_from_dict(data: dict):
+    """Build a TEISchema from a plain dict (as received from the JSON API)."""
+    from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
+    elements = []
+    for e in data.get("elements", []):
+        attrs = [
+            TEIAttribute(
+                name=a["name"],
+                description=a.get("description", ""),
+                allowed_values=a.get("allowed_values"),
+            )
+            for a in e.get("attributes", [])
+        ]
+        elements.append(
+            TEIElement(
+                tag=e["tag"],
+                description=e.get("description", ""),
+                allowed_children=e.get("allowed_children", []),
+                attributes=attrs,
+            )
+        )
+    return TEISchema(elements=elements, rules=data.get("rules", []))
+# ---------------------------------------------------------------------------
+# FastAPI app
+# ---------------------------------------------------------------------------
+app = FastAPI(
+    title="TEI Annotator",
+    description="Annotate plain text with TEI XML tags using an LLM backend.",
+    version="0.1.0",
+)
+_TEMPLATES_DIR = Path(__file__).parent / "templates"
+templates = Jinja2Templates(directory=str(_TEMPLATES_DIR))
+# ---------------------------------------------------------------------------
+# HTML endpoints
+# ---------------------------------------------------------------------------
+@app.get("/", response_class=HTMLResponse)
+async def index(request: Request):
+    return templates.TemplateResponse(
+        "index.html",
+        {
+            "request": request,
+            "providers": _AVAILABLE_PROVIDERS,
+            "default_provider": _DEFAULT_PROVIDER if _DEFAULT_PROVIDER in _AVAILABLE_PROVIDERS else (_AVAILABLE_PROVIDERS[0] if _AVAILABLE_PROVIDERS else ""),
+            "result": None,
+            "error": None,
+            "input_text": "",
+        },
+    )
+@app.post("/annotate", response_class=HTMLResponse)
+async def annotate_html(
+    request: Request,
+    text: str = Form(...),
+    provider: str = Form(...),
+):
+    from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
+    from tei_annotator.pipeline import annotate
+    from tei_annotator.schemas.blbl import build_blbl_schema
+    result_xml = None
+    error = None
+    try:
+        call_fn = _get_call_fn(provider)
+        endpoint = EndpointConfig(
+            capability=EndpointCapability.TEXT_GENERATION,
+            call_fn=call_fn,
+        )
+        result = annotate(
+            text=text,
+            schema=build_blbl_schema(),
+            endpoint=endpoint,
+            gliner_model=_GLINER_MODEL,
+        )
+        result_xml = result.xml
+    except HTTPException as exc:
+        error = exc.detail
+    except Exception as exc:
+        error = str(exc)
+    return templates.TemplateResponse(
+        "index.html",
+        {
+            "request": request,
+            "providers": _AVAILABLE_PROVIDERS,
+            "default_provider": provider,
+            "result": result_xml,
+            "error": error,
+            "input_text": text,
+        },
+    )
+# ---------------------------------------------------------------------------
+# JSON API
+# ---------------------------------------------------------------------------
+# Request model for one attribute of a schema element. Field names match the
+# keys that _schema_from_dict reads when building a TEIAttribute.
+class AttributeSchema(BaseModel):
+    name: str
+    description: str = ""
+    # None when the request gives no list of allowed values.
+    allowed_values: list[str] | None = None
+# Request model for one schema element. Field names match the keys that
+# _schema_from_dict reads when building a TEIElement.
+class ElementSchema(BaseModel):
+    tag: str
+    description: str = ""
+    # Tag names permitted as children of this element (empty by default).
+    allowed_children: list[str] = []
+    attributes: list[AttributeSchema] = []
+# Request model for a complete caller-supplied schema: the element
+# definitions plus optional free-text rules.
+class SchemaInput(BaseModel):
+    elements: list[ElementSchema]
+    rules: list[str] = []
+# Request body of POST /api/annotate.
+class AnnotateRequest(BaseModel):
+    # Accept the field under its Python name (tei_schema) as well as its alias.
+    model_config = {"populate_by_name": True}
+    text: str
+    # None means "use DEFAULT_PROVIDER" (resolved in annotate_api).
+    provider: str | None = None
+    # Exposed to clients as "schema"; presumably renamed internally to avoid
+    # clashing with BaseModel's own `schema` attribute — TODO confirm.
+    tei_schema: SchemaInput | None = Field(None, alias="schema")
+# One entry of AnnotateResponse.fuzzy_spans, copied field by field from the
+# pipeline result's fuzzy_spans.
+class FuzzySpan(BaseModel):
+    element: str
+    # NOTE(review): start/end look like offsets into the text — confirm the
+    # exact semantics (inclusive/exclusive, which text) against the pipeline.
+    start: int
+    end: int
+# Response body of POST /api/annotate.
+class AnnotateResponse(BaseModel):
+    # Annotated XML produced by the pipeline (result.xml).
+    xml: str
+    fuzzy_spans: list[FuzzySpan]
+@app.post("/api/annotate", response_model=AnnotateResponse)
+async def annotate_api(body: AnnotateRequest):
+    """
+    Annotate *text* and return the XML result.
+    - **text**: plain text to annotate.
+    - **provider**: `"gemini"` or `"kisski"` (default: `DEFAULT_PROVIDER` from env).
+    - **schema**: TEI schema definition. Omit to use the built-in BLBL bibliographic schema.
+    """
+    from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
+    from tei_annotator.pipeline import annotate
+    from tei_annotator.schemas.blbl import build_blbl_schema
+    provider = body.provider or _DEFAULT_PROVIDER
+    call_fn = _get_call_fn(provider)
+    if body.tei_schema is not None:
+        schema = _schema_from_dict(body.tei_schema.model_dump())
+    else:
+        schema = build_blbl_schema()
+    endpoint = EndpointConfig(
+        capability=EndpointCapability.TEXT_GENERATION,
+        call_fn=call_fn,
+    )
+    try:
+        result = annotate(
+            text=body.text,
+            schema=schema,
+            endpoint=endpoint,
+            gliner_model=_GLINER_MODEL,
+        )
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    return AnnotateResponse(
+        xml=result.xml,
+        fuzzy_spans=[
+            FuzzySpan(element=s.element, start=s.start, end=s.end)
+            for s in result.fuzzy_spans
+        ],
+    )
+# ---------------------------------------------------------------------------
+# Entry point for direct execution
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import argparse
+    import uvicorn
+    _parser = argparse.ArgumentParser()
+    _parser.add_argument("--reload", action="store_true", default=False,
+                         help="Enable auto-reload on code changes (development only).")
+    _args = _parser.parse_args()
+    host = os.environ.get("HOST", "0.0.0.0")
+    port = int(os.environ.get("PORT", "8000"))
+    uvicorn.run("main:app", host=host, port=port, reload=_args.reload)

webservice/templates/index.html ADDED Viewed

	@@ -0,0 +1,125 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>TEI Annotator</title>
+  <style>
+    *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+    body {
+      font-family: system-ui, sans-serif;
+      background: #f5f5f5;
+      color: #222;
+      padding: 2rem;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    h1 { font-size: 1.5rem; margin-bottom: 0.25rem; }
+    p.subtitle { color: #555; font-size: 0.9rem; margin-bottom: 1.5rem; }
+    label { display: block; font-weight: 600; margin-bottom: 0.3rem; font-size: 0.9rem; }
+    textarea {
+      width: 100%;
+      height: 180px;
+      padding: 0.6rem;
+      font-family: monospace;
+      font-size: 0.85rem;
+      border: 1px solid #ccc;
+      border-radius: 4px;
+      resize: vertical;
+    }
+    select {
+      padding: 0.4rem 0.6rem;
+      border: 1px solid #ccc;
+      border-radius: 4px;
+      font-size: 0.9rem;
+    }
+    .row { display: flex; align-items: center; gap: 1rem; margin-top: 0.75rem; }
+    button {
+      padding: 0.45rem 1.2rem;
+      background: #2563eb;
+      color: #fff;
+      border: none;
+      border-radius: 4px;
+      font-size: 0.9rem;
+      cursor: pointer;
+    }
+    button:hover { background: #1d4ed8; }
+    .result-box {
+      margin-top: 1.5rem;
+      background: #fff;
+      border: 1px solid #d1d5db;
+      border-radius: 4px;
+      padding: 1rem;
+    }
+    .result-box h2 { font-size: 1rem; margin-bottom: 0.5rem; color: #374151; }
+    pre {
+      white-space: pre-wrap;
+      word-break: break-word;
+      font-family: monospace;
+      font-size: 0.82rem;
+      line-height: 1.5;
+      background: #f9fafb;
+      padding: 0.75rem;
+      border-radius: 3px;
+    }
+    .error {
+      margin-top: 1.5rem;
+      background: #fef2f2;
+      border: 1px solid #fca5a5;
+      border-radius: 4px;
+      padding: 0.75rem 1rem;
+      color: #b91c1c;
+      font-size: 0.9rem;
+    }
+    .no-providers {
+      margin-top: 1rem;
+      padding: 0.75rem 1rem;
+      background: #fffbeb;
+      border: 1px solid #fcd34d;
+      border-radius: 4px;
+      font-size: 0.9rem;
+      color: #92400e;
+    }
+  </style>
+</head>
+<body>
+  <h1>TEI Annotator</h1>
+  <p class="subtitle">Paste bibliographic text below to annotate it with TEI XML tags.</p>
+  {% if providers %}
+  <form method="post" action="/annotate">
+    <label for="text">Input text</label>
+    <textarea id="text" name="text" placeholder="Paste a bibliographic entry here…" required>{{ input_text }}</textarea>
+    <div class="row">
+      <div>
+        <label for="provider" style="display:inline; margin-right:0.4rem;">Provider</label>
+        <select id="provider" name="provider">
+          {% for p in providers %}
+          <option value="{{ p }}" {% if p == default_provider %}selected{% endif %}>{{ p }}</option>
+          {% endfor %}
+        </select>
+      </div>
+      <button type="submit">Annotate</button>
+    </div>
+  </form>
+  {% if error %}
+  <div class="error"><strong>Error:</strong> {{ error }}</div>
+  {% endif %}
+  {% if result is not none %}
+  <div class="result-box">
+    <h2>Annotated XML</h2>
+    <pre>{{ result }}</pre>
+  </div>
+  {% endif %}
+  {% else %}
+  <div class="no-providers">
+    No LLM providers are configured. Set <code>GEMINI_API_KEY</code> or
+    <code>KISSKI_API_KEY</code> in <code>webservice/.env</code> and restart the server.
+  </div>
+  {% endif %}
+</body>
+</html>