cmboulanger committed on
Commit
c3f33b6
·
1 Parent(s): dd4de5c

feat: Add webservice for demonstration

Browse files
.claude/settings.local.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(xargs grep -l \"webservice\\\\|fastapi\\\\|flask\")"
5
+ ]
6
+ }
7
+ }
.gitignore CHANGED
@@ -8,7 +8,7 @@ wheels/
8
 
9
  # Virtual environments
10
  .venv
11
- .env*
12
  .DS_Store
13
 
14
  # Local files
 
8
 
9
  # Virtual environments
10
  .venv
11
+ .env
12
  .DS_Store
13
 
14
  # Local files
implementation-plan.md → docs/implementation-plan.md RENAMED
File without changes
pyproject.toml CHANGED
@@ -7,11 +7,17 @@ requires-python = ">=3.12"
7
  dependencies = [
8
  "jinja2>=3.1",
9
  "lxml>=5.0",
 
10
  "rapidfuzz>=3.0",
11
  ]
12
 
13
  [project.optional-dependencies]
14
  gliner = ["gliner>=0.2"]
 
 
 
 
 
15
 
16
  [tool.pytest.ini_options]
17
  addopts = "-m 'not integration'"
 
7
  dependencies = [
8
  "jinja2>=3.1",
9
  "lxml>=5.0",
10
+ "python-dotenv>=1.2.2",
11
  "rapidfuzz>=3.0",
12
  ]
13
 
14
  [project.optional-dependencies]
15
  gliner = ["gliner>=0.2"]
16
+ webservice = [
17
+ "fastapi>=0.115",
18
+ "uvicorn[standard]>=0.30",
19
+ "python-multipart>=0.0.9",
20
+ ]
21
 
22
  [tool.pytest.ini_options]
23
  addopts = "-m 'not integration'"
scripts/evaluate_llm.py CHANGED
@@ -31,6 +31,8 @@ import urllib.error
31
  import urllib.request
32
  from pathlib import Path
33
 
 
 
34
  # ---------------------------------------------------------------------------
35
  # Paths
36
  # ---------------------------------------------------------------------------
@@ -43,27 +45,7 @@ GOLD_FILE = _REPO / "tests" / "fixtures" / "blbl-examples.tei.xml"
43
  # text characters, so this string is guaranteed to survive the annotation pass.
44
  _BATCH_SEP = "\n---RECORD|||SEP|||BOUNDARY---\n"
45
 
46
- # ---------------------------------------------------------------------------
47
- # .env loader (stdlib-only, no python-dotenv needed)
48
- # ---------------------------------------------------------------------------
49
-
50
-
51
- def _load_env(path: str = ".env") -> None:
52
- try:
53
- with open(path) as fh:
54
- for line in fh:
55
- line = line.strip()
56
- if not line or line.startswith("#") or "=" not in line:
57
- continue
58
- key, _, value = line.partition("=")
59
- value = value.strip().strip('"').strip("'")
60
- os.environ.setdefault(key.strip(), value)
61
- except FileNotFoundError:
62
- pass
63
-
64
-
65
- _load_env(_REPO / ".env")
66
-
67
 
68
  # ---------------------------------------------------------------------------
69
  # HTTP helper (stdlib urllib)
@@ -132,201 +114,10 @@ def make_kisski_call_fn(
132
 
133
 
134
  # ---------------------------------------------------------------------------
135
- # Schema — focused on the elements that appear in blbl-examples.tei.xml
136
  # ---------------------------------------------------------------------------
137
 
138
-
139
- def _build_schema():
140
- from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
141
-
142
- def attr(name: str, desc: str, allowed: list[str] | None = None) -> TEIAttribute:
143
- return TEIAttribute(name=name, description=desc, allowed_values=allowed)
144
-
145
- return TEISchema(
146
- rules=[
147
- "For each person's name, emit an 'author' or 'editor' span covering the full name "
148
- "AND separate 'surname', 'forename', or 'orgName' spans for the individual name "
149
- "parts within that span.",
150
- "Never emit 'surname', 'forename', or 'orgName' without a corresponding enclosing "
151
- "'author' or 'editor' span.",
152
- "When an organisation acts as author or editor, emit BOTH an 'orgName' span AND an "
153
- "enclosing 'author' (or 'editor') span. The 'author'/'editor' span MUST enclose the "
154
- "'orgName' span — NEVER put an 'author' or 'editor' span inside an 'orgName' span.",
155
- "CRITICAL: All name parts for all contiguous authors MUST always be placed inside a "
156
- "SINGLE 'author' (or 'editor') span — conjunctions ('and', '&', 'et') and commas "
157
- "between names do NOT create separate spans. Emit a new 'author' span only when "
158
- "the authors are separated by a title, date, or other non-name bibliographic field.",
159
- "In a bibliography, a dash or underscore may stand for a repeated author or editor "
160
- "name — tag it as 'author' or 'editor' accordingly.",
161
- "CRITICAL: When a parenthesised location appears immediately after a title "
162
- "(e.g. 'Title (City, Region)'), end the 'title' span BEFORE the opening parenthesis "
163
- "and emit a separate 'pubPlace' span covering only 'City, Region' (not the parentheses). "
164
- "Never include a parenthesised location inside a 'title' span.",
165
- ],
166
- elements=[
167
- TEIElement(
168
- tag="label",
169
- description=(
170
- "A numeric or alphanumeric reference label appearing at the very start of a "
171
- "bibliographic entry, before any author or title. Typical forms: a plain number "
172
- "('17'), a number with a trailing period ('17.'), a number in square brackets "
173
- "('[77]', '[ACL30]'), or a compound number ('5,6'). The separator that follows "
174
- "the label (period, dash, or space) is NOT part of the label. "
175
- "A label is always a number or short code — never a word or name. "
176
- "An ALL-CAPS word at the start of an entry is an author surname, not a label."
177
- ),
178
- allowed_children=[],
179
- attributes=[],
180
- ),
181
- TEIElement(
182
- tag="author",
183
- description=(
184
- "Name(s) of the author(s) of the cited work. "
185
- "Names appearing at the start of a bibliographic entry before the title and "
186
- "date are authors."
187
- ),
188
- allowed_children=['surname', 'forename', 'orgName'],
189
- attributes=[],
190
- ),
191
- TEIElement(
192
- tag="editor",
193
- description=(
194
- "Name of an editor of the cited work. "
195
- "An editor's name typically follows keywords such as 'in', 'ed.', 'éd.', "
196
- "'Hrsg.', 'dir.', '(ed.)', '(eds.)'. "
197
- "CRITICAL: A person's name (or surname alone) that follows 'in' is an editor — "
198
- "emit an 'editor' span (plus name-part spans), never a 'title' span."
199
- ),
200
- allowed_children=['surname', 'forename', 'orgName'],
201
- attributes=[],
202
- ),
203
- TEIElement(
204
- tag="surname",
205
- description="The inherited (family) name of a person.",
206
- allowed_children=[],
207
- attributes=[],
208
- ),
209
- TEIElement(
210
- tag="forename",
211
- description="The given (first) name or initials of a person.",
212
- allowed_children=[],
213
- attributes=[],
214
- ),
215
- TEIElement(
216
- tag="orgName",
217
- description=(
218
- "Name of an organisation that acts as author or editor. "
219
- "Do NOT emit an 'orgName' span inside a 'publisher' span — "
220
- "when an organisation is the publisher, use 'publisher' alone."
221
- ),
222
- allowed_children=[],
223
- attributes=[],
224
- ),
225
- TEIElement(
226
- tag="title",
227
- description=(
228
- "Title of the cited work. "
229
- "Do NOT split a title at an internal period or subtitle separator — "
230
- "e.g. 'Classical Literary Criticism. Oxford World Classics' is ONE title span; "
231
- "a city name embedded in a subtitle (e.g. 'Oxford' in 'Oxford World Classics') "
232
- "is NOT a pubPlace — do not interrupt the title span with a pubPlace span. "
233
- "CRITICAL: The title span ends BEFORE any parenthesised location — "
234
- "e.g. in 'Title (City, Region)', only 'Title' is the title span; "
235
- "'City, Region' is a separate pubPlace span. "
236
- "A journal or series title may appear after keywords such as 'in', 'dans', 'in:' — "
237
- "emit a 'title' span for it; do NOT tag it as 'note'."
238
- ),
239
- allowed_children=[],
240
- attributes=[
241
- attr(
242
- "level",
243
- "Publication level: 'a'=article/chapter, 'm'=monograph/book, "
244
- "'j'=journal, 's'=series.",
245
- ["a", "m", "j", "s"],
246
- )
247
- ],
248
- ),
249
- TEIElement(
250
- tag="date",
251
- description=(
252
- "Publication date or year. "
253
- "When two dates appear in sequence — e.g. '1989 [1972]' (reprint year and "
254
- "original year) — emit a SEPARATE 'date' span for each individual date."
255
- ),
256
- allowed_children=[],
257
- attributes=[],
258
- ),
259
- TEIElement(
260
- tag="publisher",
261
- description=(
262
- "Name of the publisher. "
263
- "When multiple publishers are connected by 'and', emit a SINGLE 'publisher' "
264
- "span covering the full text (e.g. 'Cambridge University Press and the Russell "
265
- "Sage Foundation' is one span). Do NOT nest 'orgName' inside 'publisher'."
266
- ),
267
- allowed_children=[],
268
- attributes=[],
269
- ),
270
- TEIElement(
271
- tag="pubPlace",
272
- description=(
273
- "Place of publication. "
274
- "CRITICAL: When a location appears in parentheses immediately after the title "
275
- "(e.g. 'Title (City, Region)'), the parenthesised location is the pubPlace — "
276
- "emit a 'pubPlace' span covering only 'City, Region' (without parentheses), "
277
- "and end the 'title' span BEFORE the opening parenthesis. "
278
- "Only tag a city name as pubPlace when it appears OUTSIDE and AFTER the title, "
279
- "typically before a colon and publisher name (e.g. 'Oxford: Oxford UP'). "
280
- "A city name that is part of a subtitle or series name within a title is NOT a pubPlace."
281
- ),
282
- allowed_children=[],
283
- attributes=[],
284
- ),
285
- TEIElement(
286
- tag="biblScope",
287
- description=(
288
- "Scope reference within the cited item (page range, volume, issue). "
289
- "Emit a separate 'biblScope' span for volume and for issue. "
290
- "The span text contains ONLY the bare number — do not include labels "
291
- "('Vol.', 'No.', 'n°', 't.') or surrounding punctuation/parentheses. "
292
- "E.g. for 'Vol. 12(3)', emit '12' as unit='volume' and '3' as unit='issue'. "
293
- "E.g. for 'n°198', emit '198' as unit='volume'. "
294
- "Do NOT absorb a volume or issue number into a preceding title span."
295
- ),
296
- allowed_children=[],
297
- attributes=[
298
- attr(
299
- "unit",
300
- "Unit of the scope reference.",
301
- ["page", "volume", "issue"],
302
- )
303
- ],
304
- ),
305
- TEIElement(
306
- tag="idno",
307
- description="Bibliographic identifier such as DOI, ISBN, or ISSN.",
308
- allowed_children=[],
309
- attributes=[attr("type", "Identifier type, e.g. DOI, ISBN, ISSN.")],
310
- ),
311
- TEIElement(
312
- tag="note",
313
- description=(
314
- "Editorial note or annotation about the cited item. "
315
- "Institutional or series report designations — such as 'Amok Internal Report', "
316
- "'USGS Open-File Report 97-123', or 'Technical Report No. 5' — must be tagged "
317
- "as 'note' with type='report', NOT as 'orgName' or 'title'."
318
- ),
319
- allowed_children=[],
320
- attributes=[attr("type", "Type of note, e.g. 'report'.")],
321
- ),
322
- TEIElement(
323
- tag="ptr",
324
- description="Pointer to an external resource such as a URL.",
325
- allowed_children=[],
326
- attributes=[attr("type", "Type of pointer, e.g. 'web'.")],
327
- ),
328
- ]
329
- )
330
 
331
 
332
  # ---------------------------------------------------------------------------
@@ -443,6 +234,40 @@ def _evaluate_batch(
443
  return results
444
 
445
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  # ---------------------------------------------------------------------------
447
  # Evaluation runner
448
  # ---------------------------------------------------------------------------
@@ -452,12 +277,10 @@ def run_evaluation(
452
  provider_name: str,
453
  call_fn,
454
  match_mode_str: str,
455
- max_items: int | None,
456
  gliner_model: str | None = None,
457
  verbose: bool = False,
458
  output_file: Path | None = None,
459
- grep: str | None = None,
460
- inverse_grep: str | None = None,
461
  batch_size: int = 1,
462
  ) -> bool:
463
  """
@@ -483,8 +306,6 @@ def run_evaluation(
483
  except ImportError:
484
  _tqdm = None
485
 
486
- _TEI_NS = "http://www.tei-c.org/ns/1.0"
487
-
488
  mode_map = {
489
  "text": MatchMode.TEXT,
490
  "exact": MatchMode.EXACT,
@@ -498,21 +319,6 @@ def run_evaluation(
498
  )
499
  schema = _build_schema()
500
 
501
- # --- load gold records --------------------------------------------------
502
- tree = etree.parse(str(GOLD_FILE))
503
- containers = tree.findall(f".//{{{_TEI_NS}}}listBibl") or tree.findall(".//listBibl")
504
- records: list[etree._Element] = []
505
- for c in containers:
506
- children = c.findall(f"{{{_TEI_NS}}}bibl") or c.findall("bibl")
507
- records.extend(children)
508
- if grep:
509
- _grep_re = re.compile(grep)
510
- records = [r for r in records if _grep_re.search("".join(r.itertext()))]
511
- if inverse_grep:
512
- _igrep_re = re.compile(inverse_grep)
513
- records = [r for r in records if not _igrep_re.search("".join(r.itertext()))]
514
- if max_items is not None:
515
- records = records[:max_items]
516
  n_total = len(records)
517
 
518
  # --- output destination and progress display ----------------------------
@@ -747,6 +553,12 @@ def _parse_args() -> argparse.Namespace:
747
  "(e.g. --timeout 600 --batch-size 10 for KISSKI Llama)."
748
  ),
749
  )
 
 
 
 
 
 
750
  return p.parse_args()
751
 
752
 
@@ -783,18 +595,23 @@ def main() -> int:
783
  if args.output_file:
784
  Path(args.output_file).write_text("", encoding="utf-8")
785
 
 
 
 
 
 
 
 
786
  results: list[bool] = []
787
  for name, fn in providers:
788
  ok = run_evaluation(
789
  provider_name=name,
790
  call_fn=fn,
791
  match_mode_str=args.match_mode,
792
- max_items=args.max_items,
793
  gliner_model=args.gliner_model,
794
  verbose=args.verbose,
795
  output_file=Path(args.output_file) if args.output_file else None,
796
- grep=args.grep,
797
- inverse_grep=args.inverse_grep,
798
  batch_size=args.batch_size,
799
  )
800
  results.append(ok)
 
31
  import urllib.request
32
  from pathlib import Path
33
 
34
+ from dotenv import load_dotenv
35
+
36
  # ---------------------------------------------------------------------------
37
  # Paths
38
  # ---------------------------------------------------------------------------
 
45
  # text characters, so this string is guaranteed to survive the annotation pass.
46
  _BATCH_SEP = "\n---RECORD|||SEP|||BOUNDARY---\n"
47
 
48
+ load_dotenv(_REPO / ".env")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # ---------------------------------------------------------------------------
51
  # HTTP helper (stdlib urllib)
 
114
 
115
 
116
  # ---------------------------------------------------------------------------
117
+ # Schema
118
  # ---------------------------------------------------------------------------
119
 
120
+ from tei_annotator.schemas.blbl import build_blbl_schema as _build_schema # noqa: E402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
 
123
  # ---------------------------------------------------------------------------
 
234
  return results
235
 
236
 
237
+ # ---------------------------------------------------------------------------
238
+ # Record loader
239
+ # ---------------------------------------------------------------------------
240
+
241
+
242
def _load_records(
    grep: str | None = None,
    inverse_grep: str | None = None,
    shuffle: bool = False,
    max_items: int | None = None,
    seed: int | None = None,
) -> list:
    """Load the gold ``bibl`` records from ``GOLD_FILE`` and select a subset.

    The selection steps run in a fixed order: ``grep`` (keep matches),
    ``inverse_grep`` (drop matches), optional shuffling, and finally
    truncation to ``max_items``.  Both patterns are searched against the
    concatenated text content of each record.

    Args:
        grep: Regular expression; only records whose text matches are kept.
        inverse_grep: Regular expression; records whose text matches are
            dropped.
        shuffle: Shuffle the remaining records before truncating.
        max_items: Keep at most this many records; ``None`` keeps them all.
        seed: Seed for the shuffle.  With ``None`` (the default) the global
            ``random`` state is used, exactly as before; an integer makes the
            shuffled selection reproducible across runs.

    Returns:
        The selected record elements, in evaluation order.

    Raises:
        re.error: If ``grep`` or ``inverse_grep`` is not a valid pattern.
    """
    import random
    from lxml import etree

    tei_ns = "http://www.tei-c.org/ns/1.0"
    tree = etree.parse(str(GOLD_FILE))

    # Prefer namespaced TEI elements; fall back to un-namespaced ones.
    containers = tree.findall(f".//{{{tei_ns}}}listBibl") or tree.findall(".//listBibl")
    records: list[etree._Element] = []
    for container in containers:
        records.extend(container.findall(f"{{{tei_ns}}}bibl") or container.findall("bibl"))

    if grep:
        grep_re = re.compile(grep)
        records = [r for r in records if grep_re.search("".join(r.itertext()))]
    if inverse_grep:
        inverse_re = re.compile(inverse_grep)
        records = [r for r in records if not inverse_re.search("".join(r.itertext()))]
    if shuffle:
        # A dedicated generator keeps a seeded shuffle independent of any
        # other use of the global random state.
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(records)
    if max_items is not None:
        records = records[:max_items]
    return records
269
+
270
+
271
  # ---------------------------------------------------------------------------
272
  # Evaluation runner
273
  # ---------------------------------------------------------------------------
 
277
  provider_name: str,
278
  call_fn,
279
  match_mode_str: str,
280
+ records: list,
281
  gliner_model: str | None = None,
282
  verbose: bool = False,
283
  output_file: Path | None = None,
 
 
284
  batch_size: int = 1,
285
  ) -> bool:
286
  """
 
306
  except ImportError:
307
  _tqdm = None
308
 
 
 
309
  mode_map = {
310
  "text": MatchMode.TEXT,
311
  "exact": MatchMode.EXACT,
 
319
  )
320
  schema = _build_schema()
321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  n_total = len(records)
323
 
324
  # --- output destination and progress display ----------------------------
 
553
  "(e.g. --timeout 600 --batch-size 10 for KISSKI Llama)."
554
  ),
555
  )
556
+ p.add_argument(
557
+ "--shuffle",
558
+ action="store_true",
559
+ default=False,
560
+ help="Randomly shuffle the evaluation set before applying --max-items.",
561
+ )
562
  return p.parse_args()
563
 
564
 
 
595
  if args.output_file:
596
  Path(args.output_file).write_text("", encoding="utf-8")
597
 
598
+ records = _load_records(
599
+ grep=args.grep,
600
+ inverse_grep=args.inverse_grep,
601
+ shuffle=args.shuffle,
602
+ max_items=args.max_items,
603
+ )
604
+
605
  results: list[bool] = []
606
  for name, fn in providers:
607
  ok = run_evaluation(
608
  provider_name=name,
609
  call_fn=fn,
610
  match_mode_str=args.match_mode,
611
+ records=records,
612
  gliner_model=args.gliner_model,
613
  verbose=args.verbose,
614
  output_file=Path(args.output_file) if args.output_file else None,
 
 
615
  batch_size=args.batch_size,
616
  )
617
  results.append(ok)
scripts/smoke_test_llm.py CHANGED
@@ -23,26 +23,9 @@ import urllib.error
23
  import urllib.request
24
  from pathlib import Path
25
 
 
26
 
27
- # ---------------------------------------------------------------------------
28
- # .env loader (stdlib-only, no python-dotenv needed)
29
- # ---------------------------------------------------------------------------
30
-
31
- def _load_env(path: str = ".env") -> None:
32
- try:
33
- with open(path) as fh:
34
- for line in fh:
35
- line = line.strip()
36
- if not line or line.startswith("#") or "=" not in line:
37
- continue
38
- key, _, value = line.partition("=")
39
- value = value.strip().strip('"').strip("'")
40
- os.environ.setdefault(key.strip(), value)
41
- except FileNotFoundError:
42
- pass
43
-
44
-
45
- _load_env(Path(__file__).parent.parent / ".env")
46
 
47
 
48
  # ---------------------------------------------------------------------------
@@ -191,7 +174,7 @@ def run_smoke_test(provider_name: str, call_fn) -> bool:
191
  print(f" ✓ PASSED")
192
  print(f" Tags found : {', '.join(tags_found)}")
193
  if result.fuzzy_spans:
194
- print(f" Fuzzy spans: {[s.text for s in result.fuzzy_spans]}")
195
  print(f" Output XML :")
196
  for line in textwrap.wrap(result.xml, width=72, subsequent_indent=" "):
197
  print(f" {line}")
 
23
  import urllib.request
24
  from pathlib import Path
25
 
26
+ from dotenv import load_dotenv
27
 
28
+ load_dotenv(Path(__file__).parent.parent / ".env")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
 
31
  # ---------------------------------------------------------------------------
 
174
  print(f" ✓ PASSED")
175
  print(f" Tags found : {', '.join(tags_found)}")
176
  if result.fuzzy_spans:
177
+ print(f" Fuzzy spans: {[TEST_TEXT[s.start:s.end] for s in result.fuzzy_spans]}")
178
  print(f" Output XML :")
179
  for line in textwrap.wrap(result.xml, width=72, subsequent_indent=" "):
180
  print(f" {line}")
scripts/smoke_test_webservice.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Smoke test for the TEI Annotator webservice.
4
+
5
+ Assumes a locally running instance. Start it first:
6
+ cd webservice && uvicorn main:app --reload
7
+
8
+ The base URL is derived from HOST and PORT in webservice/.env.
9
+ If that file does not exist, the script exits with instructions.
10
+
11
+ Usage:
12
+ python scripts/smoke_test_webservice.py [--base-url URL]
13
+ uv run scripts/smoke_test_webservice.py
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import json
20
+ import re
21
+ import sys
22
+ import urllib.error
23
+ import urllib.request
24
+ from pathlib import Path
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Resolve base URL from webservice/.env
28
+ # ---------------------------------------------------------------------------
29
+
30
+ _REPO = Path(__file__).parent.parent
31
+ _ENV_FILE = _REPO / "webservice" / ".env"
32
+ _ENV_TEMPLATE = _REPO / "webservice" / ".env.template"
33
+
34
+
35
+ def _load_webservice_env() -> dict[str, str]:
36
+ """Parse webservice/.env and return key→value pairs (no shell expansion)."""
37
+ if not _ENV_FILE.exists():
38
+ print(
39
+ f"ERROR: {_ENV_FILE} not found.\n"
40
+ f" Create it from the template first:\n"
41
+ f" cp {_ENV_TEMPLATE} {_ENV_FILE}\n"
42
+ f" Then fill in at least one API key.",
43
+ file=sys.stderr,
44
+ )
45
+ sys.exit(1)
46
+ env: dict[str, str] = {}
47
+ for line in _ENV_FILE.read_text(encoding="utf-8").splitlines():
48
+ line = line.strip()
49
+ if not line or line.startswith("#") or "=" not in line:
50
+ continue
51
+ key, _, value = line.partition("=")
52
+ env[key.strip()] = value.strip().strip('"').strip("'")
53
+ return env
54
+
55
+
56
+ def _default_base_url() -> str:
57
+ env = _load_webservice_env()
58
+ host = env.get("HOST", "localhost") or "localhost"
59
+ # When HOST is 0.0.0.0 (listen on all interfaces), connect via localhost
60
+ if host in ("0.0.0.0", ""):
61
+ host = "localhost"
62
+ port = env.get("PORT", "8000") or "8000"
63
+ return f"http://{host}:{port}"
64
+
65
+ _TEST_TEXT = (
66
+ "Marie Curie was born in Warsaw, Poland, and later conducted her research "
67
+ "in Paris, France. Together with her husband Pierre Curie, she discovered "
68
+ "polonium and radium."
69
+ )
70
+
71
+ _MINIMAL_SCHEMA = {
72
+ "elements": [
73
+ {"tag": "persName", "description": "a person's name"},
74
+ {"tag": "placeName", "description": "a geographical place name"},
75
+ ],
76
+ "rules": [],
77
+ }
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Helpers
82
+ # ---------------------------------------------------------------------------
83
+
84
+
85
def _get(url: str) -> tuple[int, str]:
    """Issue a GET request and return ``(status, body_text)``.

    HTTP error responses are returned like any other response, using the
    error's status code and body.  Transport-level failures (server not
    reachable, connection reset, timeout) print a hint to stderr and
    terminate the process with exit status 2.
    """
    try:
        with urllib.request.urlopen(url, timeout=30) as resp:
            return resp.status, resp.read().decode(errors="replace")
    except urllib.error.HTTPError as exc:
        return exc.code, exc.read().decode(errors="replace")
    except OSError as exc:
        # URLError is an OSError subclass; catching OSError also covers
        # timeouts and resets raised while waiting for or reading the
        # response, which urllib does not wrap in URLError.
        reason = getattr(exc, "reason", exc)
        print(
            f"\nERROR: Cannot reach the webservice at {url}\n"
            f" {reason}\n"
            f" Is the server running? Start it with:\n"
            f" cd webservice && uvicorn main:app --reload",
            file=sys.stderr,
        )
        sys.exit(2)
100
+
101
+
102
def _post_json(url: str, payload: dict) -> tuple[int, dict | str]:
    """POST *payload* as JSON and return ``(status, parsed_body)``.

    The body is returned as decoded JSON when it parses, otherwise as text.
    HTTP error responses yield the error status and the raw body text.
    Transport-level failures (server not reachable, connection reset,
    timeout) print a hint to stderr and terminate the process with exit
    status 2.
    """
    body = json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=body, headers={"Content-Type": "application/json"}, method="POST"
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            status, raw = resp.status, resp.read()
    except urllib.error.HTTPError as exc:
        return exc.code, exc.read().decode(errors="replace")
    except OSError as exc:
        # URLError is an OSError subclass; catching OSError also covers
        # timeouts and resets raised while waiting for or reading the
        # response, which urllib does not wrap in URLError.
        reason = getattr(exc, "reason", exc)
        print(
            f"\nERROR: Cannot reach the webservice at {url}\n"
            f" {reason}\n"
            f" Is the server running? Start it with:\n"
            f" cd webservice && uvicorn main:app --reload",
            file=sys.stderr,
        )
        sys.exit(2)
    try:
        return status, json.loads(raw)
    except ValueError:
        # A successful status with a non-JSON body should surface as a
        # failed check in the caller, not as a traceback here.
        return status, raw.decode(errors="replace")
121
+
122
+
123
def _check(name: str, ok: bool, detail: str = "") -> bool:
    """Print one PASS/FAIL line for a check and return its outcome.

    Args:
        name: Human-readable description of the check.
        ok: Outcome of the check; any truthy value counts as a pass.
        detail: Optional extra text printed on a second line.

    Returns:
        The outcome as a strict ``bool``, so callers can safely sum results.
    """
    passed = bool(ok)
    status = "PASS" if passed else "FAIL"
    msg = f" [{status}] {name}"
    if detail:
        msg += f"\n {detail}"
    print(msg)
    return passed
130
+
131
+
132
+ # ---------------------------------------------------------------------------
133
+ # Tests
134
+ # ---------------------------------------------------------------------------
135
+
136
+
137
def test_html_ui(base_url: str) -> bool:
    """Check that the root page answers with HTTP 200 and contains a form tag."""
    status, body = _get(f"{base_url}/")
    serves_form = status == 200 and "<form" in body
    return _check(
        "GET / returns HTTP 200 with HTML form",
        serves_form,
        f"status={status}",
    )
144
+
145
+
146
def test_api_annotate(base_url: str) -> bool:
    """Annotate the sample text with the minimal schema and validate the reply.

    Posts ``_TEST_TEXT`` together with ``_MINIMAL_SCHEMA`` to
    ``/api/annotate``.  After a 200 response, three checks are reported:
    the ``xml`` field is non-empty, stripping its tags yields the input text
    unchanged, and ``fuzzy_spans`` is a list.

    Returns:
        ``True`` only when the request succeeds and all three checks pass.
    """
    status, data = _post_json(
        f"{base_url}/api/annotate",
        {"text": _TEST_TEXT, "schema": _MINIMAL_SCHEMA},
    )
    if not _check(
        "POST /api/annotate returns HTTP 200",
        status == 200,
        f"status={status} body={str(data)[:200]}",
    ):
        return False

    # A non-dict body (plain text) is treated as an empty reply.
    reply = data if isinstance(data, dict) else {}
    xml = reply.get("xml", "")
    if not isinstance(xml, str):
        # e.g. JSON null: report failing checks instead of raising TypeError.
        xml = ""
    plain = re.sub(r"<[^>]+>", "", xml)

    # Run every check so that all results are printed, then combine them.
    ok_nonempty = _check("Response xml is non-empty", bool(xml.strip()))
    ok_plain = _check(
        "Plain text preserved after stripping tags",
        plain == _TEST_TEXT,
        f"expected: {_TEST_TEXT!r}\n got: {plain!r}",
    )
    ok_fuzzy = _check(
        "Response contains fuzzy_spans list",
        isinstance(reply.get("fuzzy_spans"), list),
    )
    return ok_nonempty and ok_plain and ok_fuzzy
172
+
173
+
174
def test_api_no_schema(base_url: str) -> bool:
    """Post a bibliographic string without a schema and expect annotated XML.

    The service is expected to fall back to its built-in BLBL schema when the
    request carries no ``schema`` key; the check passes on HTTP 200 with a
    non-empty ``xml`` field.
    """
    bibl = "Curie, Marie. 1898. Sur une nouvelle substance radioactive. Paris."
    status, data = _post_json(f"{base_url}/api/annotate", {"text": bibl})
    got_xml = status == 200 and isinstance(data, dict) and bool(data.get("xml"))
    return _check(
        "POST /api/annotate without schema uses BLBL default (HTTP 200)",
        got_xml,
        f"status={status}",
    )
183
+
184
+
185
def test_api_unknown_provider(base_url: str) -> bool:
    """Check that naming a provider the service does not know yields HTTP 400."""
    request_body = {"text": "hello", "provider": "nonexistent"}
    status, _ = _post_json(f"{base_url}/api/annotate", request_body)
    rejected = status == 400
    return _check(
        "POST /api/annotate with unknown provider returns 400",
        rejected,
        f"status={status}",
    )
195
+
196
+
197
def test_openapi_docs(base_url: str) -> bool:
    """Check that the interactive API documentation page is reachable."""
    docs_url = f"{base_url}/docs"
    status, _ = _get(docs_url)
    reachable = status == 200
    return _check("GET /docs (OpenAPI UI) returns HTTP 200", reachable)
200
+
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # Main
204
+ # ---------------------------------------------------------------------------
205
+
206
+
207
def main() -> int:
    """Run all smoke checks against the webservice and print a summary.

    The base URL comes from ``--base-url`` when given; only otherwise is it
    derived from webservice/.env.  Arguments are parsed first so that an
    explicit URL (and ``--help``) work even when that file does not exist.

    Returns:
        Process exit status: 0 when every check passed, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Smoke-test the TEI Annotator webservice.")
    parser.add_argument(
        "--base-url",
        default=None,
        help="Base URL of the running server (default: derived from HOST and PORT in webservice/.env)",
    )
    args = parser.parse_args()
    # Consult webservice/.env only when no URL was supplied explicitly.
    base = (args.base_url or _default_base_url()).rstrip("/")

    print(f"Smoke-testing {base}\n")

    # Cheap endpoint checks run first; the LLM-backed annotation runs last.
    results = [
        test_html_ui(base),
        test_openapi_docs(base),
        test_api_unknown_provider(base),
        test_api_no_schema(base),
        test_api_annotate(base),
    ]

    passed = sum(results)
    total = len(results)
    print(f"\n{'═' * 50}")
    print(f" {passed}/{total} checks passed")
    print(f"{'═' * 50}")
    return 0 if all(results) else 1
234
+
235
+
236
+ if __name__ == "__main__":
237
+ sys.exit(main())
tei_annotator/schemas/__init__.py ADDED
File without changes
tei_annotator/schemas/blbl.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BLBL bibliographic schema for TEI annotation.
3
+
4
+ This schema covers the elements that appear in blbl-examples.tei.xml and is
5
+ shared between the evaluation script and the webservice.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+
11
+ def build_blbl_schema():
12
+ from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
13
+
14
+ def attr(name: str, desc: str, allowed: list[str] | None = None) -> TEIAttribute:
15
+ return TEIAttribute(name=name, description=desc, allowed_values=allowed)
16
+
17
+ return TEISchema(
18
+ rules=[
19
+ "For each person's name, emit an 'author' or 'editor' span covering the full name "
20
+ "AND separate 'surname', 'forename', or 'orgName' spans for the individual name "
21
+ "parts within that span.",
22
+ "Never emit 'surname', 'forename', or 'orgName' without a corresponding enclosing "
23
+ "'author' or 'editor' span.",
24
+ "When an organisation acts as author or editor, emit BOTH an 'orgName' span AND an "
25
+ "enclosing 'author' (or 'editor') span. The 'author'/'editor' span MUST enclose the "
26
+ "'orgName' span — NEVER put an 'author' or 'editor' span inside an 'orgName' span.",
27
+ "CRITICAL: All name parts for all contiguous authors MUST always be placed inside a "
28
+ "SINGLE 'author' (or 'editor') span — conjunctions ('and', '&', 'et') and commas "
29
+ "between names do NOT create separate spans. Emit a new 'author' span only when "
30
+ "the authors are separated by a title, date, or other non-name bibliographic field.",
31
+ "In a bibliography, a dash or underscore may stand for a repeated author or editor "
32
+ "name — tag it as 'author' or 'editor' accordingly.",
33
+ "CRITICAL: When a parenthesised location appears immediately after a title "
34
+ "(e.g. 'Title (City, Region)'), end the 'title' span BEFORE the opening parenthesis "
35
+ "and emit a separate 'pubPlace' span covering only 'City, Region' (not the parentheses). "
36
+ "Never include a parenthesised location inside a 'title' span.",
37
+ ],
38
+ elements=[
39
+ TEIElement(
40
+ tag="label",
41
+ description=(
42
+ "A numeric or alphanumeric reference label appearing at the very start of a "
43
+ "bibliographic entry, before any author or title. Typical forms: a plain number "
44
+ "('17'), a number with a trailing period ('17.'), a number in square brackets "
45
+ "('[77]', '[ACL30]'), or a compound number ('5,6'). The separator that follows "
46
+ "the label (period, dash, or space) is NOT part of the label. "
47
+ "A label is always a number or short code — never a word or name. "
48
+ "An ALL-CAPS word at the start of an entry is an author surname, not a label."
49
+ ),
50
+ allowed_children=[],
51
+ attributes=[],
52
+ ),
53
+ TEIElement(
54
+ tag="author",
55
+ description=(
56
+ "Name(s) of the author(s) of the cited work. "
57
+ "Names appearing at the start of a bibliographic entry before the title and "
58
+ "date are authors."
59
+ ),
60
+ allowed_children=['surname', 'forename', 'orgName'],
61
+ attributes=[],
62
+ ),
63
+ TEIElement(
64
+ tag="editor",
65
+ description=(
66
+ "Name of an editor of the cited work. "
67
+ "An editor's name typically follows keywords such as 'in', 'ed.', 'éd.', "
68
+ "'Hrsg.', 'dir.', '(ed.)', '(eds.)'. "
69
+ "CRITICAL: A person's name (or surname alone) that follows 'in' is an editor — "
70
+ "emit an 'editor' span (plus name-part spans), never a 'title' span."
71
+ ),
72
+ allowed_children=['surname', 'forename', 'orgName'],
73
+ attributes=[],
74
+ ),
75
+ TEIElement(
76
+ tag="surname",
77
+ description="The inherited (family) name of a person.",
78
+ allowed_children=[],
79
+ attributes=[],
80
+ ),
81
+ TEIElement(
82
+ tag="forename",
83
+ description="The given (first) name or initials of a person.",
84
+ allowed_children=[],
85
+ attributes=[],
86
+ ),
87
+ TEIElement(
88
+ tag="orgName",
89
+ description=(
90
+ "Name of an organisation that acts as author or editor. "
91
+ "Do NOT emit an 'orgName' span inside a 'publisher' span — "
92
+ "when an organisation is the publisher, use 'publisher' alone."
93
+ ),
94
+ allowed_children=[],
95
+ attributes=[],
96
+ ),
97
+ TEIElement(
98
+ tag="title",
99
+ description=(
100
+ "Title of the cited work. "
101
+ "Do NOT split a title at an internal period or subtitle separator — "
102
+ "e.g. 'Classical Literary Criticism. Oxford World Classics' is ONE title span; "
103
+ "a city name embedded in a subtitle (e.g. 'Oxford' in 'Oxford World Classics') "
104
+ "is NOT a pubPlace — do not interrupt the title span with a pubPlace span. "
105
+ "CRITICAL: The title span ends BEFORE any parenthesised location — "
106
+ "e.g. in 'Title (City, Region)', only 'Title' is the title span; "
107
+ "'City, Region' is a separate pubPlace span. "
108
+ "A journal or series title may appear after keywords such as 'in', 'dans', 'in:' — "
109
+ "emit a 'title' span for it; do NOT tag it as 'note'."
110
+ ),
111
+ allowed_children=[],
112
+ attributes=[
113
+ attr(
114
+ "level",
115
+ "Publication level: 'a'=article/chapter, 'm'=monograph/book, "
116
+ "'j'=journal, 's'=series.",
117
+ ["a", "m", "j", "s"],
118
+ )
119
+ ],
120
+ ),
121
+ TEIElement(
122
+ tag="date",
123
+ description=(
124
+ "Publication date or year. "
125
+ "When two dates appear in sequence — e.g. '1989 [1972]' (reprint year and "
126
+ "original year) — emit a SEPARATE 'date' span for each individual date."
127
+ ),
128
+ allowed_children=[],
129
+ attributes=[],
130
+ ),
131
+ TEIElement(
132
+ tag="publisher",
133
+ description=(
134
+ "Name of the publisher. "
135
+ "When multiple publishers are connected by 'and', emit a SINGLE 'publisher' "
136
+ "span covering the full text (e.g. 'Cambridge University Press and the Russell "
137
+ "Sage Foundation' is one span). Do NOT nest 'orgName' inside 'publisher'."
138
+ ),
139
+ allowed_children=[],
140
+ attributes=[],
141
+ ),
142
+ TEIElement(
143
+ tag="pubPlace",
144
+ description=(
145
+ "Place of publication. "
146
+ "CRITICAL: When a location appears in parentheses immediately after the title "
147
+ "(e.g. 'Title (City, Region)'), the parenthesised location is the pubPlace — "
148
+ "emit a 'pubPlace' span covering only 'City, Region' (without parentheses), "
149
+ "and end the 'title' span BEFORE the opening parenthesis. "
150
+ "Only tag a city name as pubPlace when it appears OUTSIDE and AFTER the title, "
151
+ "typically before a colon and publisher name (e.g. 'Oxford: Oxford UP'). "
152
+ "A city name that is part of a subtitle or series name within a title is NOT a pubPlace."
153
+ ),
154
+ allowed_children=[],
155
+ attributes=[],
156
+ ),
157
+ TEIElement(
158
+ tag="biblScope",
159
+ description=(
160
+ "Scope reference within the cited item (page range, volume, issue). "
161
+ "Emit a separate 'biblScope' span for volume and for issue. "
162
+ "The span text contains ONLY the bare number — do not include labels "
163
+ "('Vol.', 'No.', 'n°', 't.') or surrounding punctuation/parentheses. "
164
+ "E.g. for 'Vol. 12(3)', emit '12' as unit='volume' and '3' as unit='issue'. "
165
+ "E.g. for 'n°198', emit '198' as unit='volume'. "
166
+ "Do NOT absorb a volume or issue number into a preceding title span."
167
+ ),
168
+ allowed_children=[],
169
+ attributes=[
170
+ attr(
171
+ "unit",
172
+ "Unit of the scope reference.",
173
+ ["page", "volume", "issue"],
174
+ )
175
+ ],
176
+ ),
177
+ TEIElement(
178
+ tag="idno",
179
+ description="Bibliographic identifier such as DOI, ISBN, or ISSN.",
180
+ allowed_children=[],
181
+ attributes=[attr("type", "Identifier type, e.g. DOI, ISBN, ISSN.")],
182
+ ),
183
+ TEIElement(
184
+ tag="note",
185
+ description=(
186
+ "Editorial note or annotation about the cited item. "
187
+ "Institutional or series report designations — such as 'Amok Internal Report', "
188
+ "'USGS Open-File Report 97-123', or 'Technical Report No. 5' — must be tagged "
189
+ "as 'note' with type='report', NOT as 'orgName' or 'title'."
190
+ ),
191
+ allowed_children=[],
192
+ attributes=[attr("type", "Type of note, e.g. 'report'.")],
193
+ ),
194
+ TEIElement(
195
+ tag="ptr",
196
+ description="Pointer to an external resource such as a URL.",
197
+ allowed_children=[],
198
+ attributes=[attr("type", "Type of pointer, e.g. 'web'.")],
199
+ ),
200
+ ]
201
+ )
uv.lock CHANGED
The diff for this file is too large to render. See raw diff
 
webservice/.env.template ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # TEI Annotator webservice — environment configuration
3
+ # ============================================================
4
+ # Copy this file to .env and fill in at least one API key.
5
+ # The server will advertise only the providers whose key is set.
6
+ # ============================================================
7
+
8
+ # ------------------------------------------------------------
9
+ # LLM provider keys (at least one required)
10
+ # ------------------------------------------------------------
11
+
12
+ # Google Gemini (https://aistudio.google.com/app/apikey)
13
+ GEMINI_API_KEY=
14
+
15
+ # KISSKI OpenAI-compatible endpoint (https://kisski.gwdg.de)
16
+ KISSKI_API_KEY=
17
+
18
+ # ------------------------------------------------------------
19
+ # Default provider used by the HTML UI and the JSON API when
20
+ # the caller does not specify one. Must name a provider whose API key is set above.
21
+ # Allowed values: gemini | kisski
22
+ # ------------------------------------------------------------
23
+ DEFAULT_PROVIDER=gemini
24
+
25
+ # ------------------------------------------------------------
26
+ # Optional: GLiNER model for a pre-detection pass before the LLM.
27
+ # Leave empty to disable. Requires the [gliner] extra:
28
+ # uv pip install -e ".[gliner,webservice]"
29
+ # Example:
30
+ # GLINER_MODEL=numind/NuNER_Zero
31
+ # ------------------------------------------------------------
32
+ GLINER_MODEL=
33
+
34
+ # ------------------------------------------------------------
35
+ # Server bind settings (used only when running via `python main.py`)
36
+ # If you start uvicorn directly (discouraged, see main.py), pass --host / --port on the CLI.
37
+ # ------------------------------------------------------------
38
+ HOST=0.0.0.0
39
+ PORT=8099
webservice/Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # TEI Annotator webservice
3
+ # ============================================================
4
+ # Build (from the repository root, so pyproject.toml is in the build context):
5
+ # docker build -f webservice/Dockerfile -t tei-annotator-webservice .
6
+ #
7
+ # Run (with an .env file in the webservice/ directory; its PORT must be 8000 to match -p below):
8
+ # docker run --env-file webservice/.env -p 8000:8000 tei-annotator-webservice
9
+ #
10
+ # Run (passing individual env vars):
11
+ # docker run -e GEMINI_API_KEY=... -p 8000:8000 tei-annotator-webservice
12
+ # ============================================================
13
+
14
+ FROM python:3.12-slim
15
+
16
+ # Install the package with webservice extras
17
+ WORKDIR /app
18
+ COPY . .
19
+ RUN pip install --no-cache-dir -e ".[webservice]"
20
+
21
+ # Working directory for the server so that the relative `main.py` in CMD
22
+ # resolves (main.py finds templates/ and .env relative to its own file)
23
+ WORKDIR /app/webservice
24
+
25
+ EXPOSE 8000
26
+
27
+ CMD ["python", "main.py"]
webservice/main.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TEI Annotator webservice.
3
+
4
+ Start with:
5
+ python main.py # reads HOST/PORT from .env
6
+ python main.py --reload # development mode with auto-reload
7
+
8
+ Do NOT start with `uvicorn main:app` directly: uvicorn parses its --port from
9
+ the CLI before importing this module, so load_dotenv() would run too late to
10
+ affect the port binding.
11
+
12
+ Configuration is read from a .env file in this directory (or from environment
13
+ variables). Copy .env.template to .env and fill in at least one API key.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import os
20
+ import urllib.error
21
+ import urllib.request
22
+ from pathlib import Path
23
+
24
+ from dotenv import load_dotenv
25
+ from fastapi import FastAPI, Form, HTTPException, Request
26
+ from fastapi.responses import HTMLResponse
27
+ from fastapi.templating import Jinja2Templates
28
+ from pydantic import BaseModel, Field
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Config
32
+ # ---------------------------------------------------------------------------
33
+
34
+ load_dotenv(Path(__file__).parent / ".env")
35
+
36
+ _GEMINI_KEY = os.environ.get("GEMINI_API_KEY", "")
37
+ _KISSKI_KEY = os.environ.get("KISSKI_API_KEY", "")
38
+ _DEFAULT_PROVIDER = os.environ.get("DEFAULT_PROVIDER", "gemini")
39
+ _GLINER_MODEL = os.environ.get("GLINER_MODEL", "") or None
40
+
41
+ _AVAILABLE_PROVIDERS: list[str] = []
42
+ if _GEMINI_KEY:
43
+ _AVAILABLE_PROVIDERS.append("gemini")
44
+ if _KISSKI_KEY:
45
+ _AVAILABLE_PROVIDERS.append("kisski")
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # HTTP helper
49
+ # ---------------------------------------------------------------------------
50
+
51
+
52
+ def _post_json(url: str, payload: dict, headers: dict, timeout: int = 120) -> dict:
53
+ body = json.dumps(payload).encode()
54
+ req = urllib.request.Request(url, data=body, headers=headers, method="POST")
55
+ try:
56
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
57
+ return json.loads(resp.read())
58
+ except urllib.error.HTTPError as exc:
59
+ detail = exc.read().decode(errors="replace")
60
+ raise RuntimeError(f"HTTP {exc.code}: {detail}") from exc
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Provider call_fn factories
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
+ def _make_gemini_call_fn(api_key: str, model: str = "gemini-2.0-flash", timeout: int = 120):
69
+ url = (
70
+ f"https://generativelanguage.googleapis.com/v1beta/models"
71
+ f"/{model}:generateContent?key={api_key}"
72
+ )
73
+
74
+ def call_fn(prompt: str) -> str:
75
+ payload = {
76
+ "contents": [{"parts": [{"text": prompt}]}],
77
+ "generationConfig": {"temperature": 0.1},
78
+ }
79
+ result = _post_json(url, payload, {"Content-Type": "application/json"}, timeout)
80
+ return result["candidates"][0]["content"]["parts"][0]["text"]
81
+
82
+ call_fn.__name__ = f"gemini/{model}"
83
+ return call_fn
84
+
85
+
86
+ def _make_kisski_call_fn(
87
+ api_key: str,
88
+ base_url: str = "https://chat-ai.academiccloud.de/v1",
89
+ model: str = "llama-3.3-70b-instruct",
90
+ timeout: int = 120,
91
+ ):
92
+ url = f"{base_url}/chat/completions"
93
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
94
+
95
+ def call_fn(prompt: str) -> str:
96
+ payload = {
97
+ "model": model,
98
+ "messages": [{"role": "user", "content": prompt}],
99
+ "temperature": 0.1,
100
+ }
101
+ result = _post_json(url, payload, headers, timeout)
102
+ return result["choices"][0]["message"]["content"]
103
+
104
+ call_fn.__name__ = f"kisski/{model}"
105
+ return call_fn
106
+
107
+
108
+ def _get_call_fn(provider: str):
109
+ if provider == "gemini":
110
+ if not _GEMINI_KEY:
111
+ raise HTTPException(status_code=503, detail="GEMINI_API_KEY not configured")
112
+ return _make_gemini_call_fn(_GEMINI_KEY)
113
+ if provider == "kisski":
114
+ if not _KISSKI_KEY:
115
+ raise HTTPException(status_code=503, detail="KISSKI_API_KEY not configured")
116
+ return _make_kisski_call_fn(_KISSKI_KEY)
117
+ raise HTTPException(status_code=400, detail=f"Unknown provider: {provider!r}")
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Schema helpers
122
+ # ---------------------------------------------------------------------------
123
+
124
+
125
+ def _schema_from_dict(data: dict):
126
+ """Build a TEISchema from a plain dict (as received from the JSON API)."""
127
+ from tei_annotator.models.schema import TEIAttribute, TEIElement, TEISchema
128
+
129
+ elements = []
130
+ for e in data.get("elements", []):
131
+ attrs = [
132
+ TEIAttribute(
133
+ name=a["name"],
134
+ description=a.get("description", ""),
135
+ allowed_values=a.get("allowed_values"),
136
+ )
137
+ for a in e.get("attributes", [])
138
+ ]
139
+ elements.append(
140
+ TEIElement(
141
+ tag=e["tag"],
142
+ description=e.get("description", ""),
143
+ allowed_children=e.get("allowed_children", []),
144
+ attributes=attrs,
145
+ )
146
+ )
147
+ return TEISchema(elements=elements, rules=data.get("rules", []))
148
+
149
+
150
+ # ---------------------------------------------------------------------------
151
+ # FastAPI app
152
+ # ---------------------------------------------------------------------------
153
+
154
+ app = FastAPI(
155
+ title="TEI Annotator",
156
+ description="Annotate plain text with TEI XML tags using an LLM backend.",
157
+ version="0.1.0",
158
+ )
159
+
160
+ _TEMPLATES_DIR = Path(__file__).parent / "templates"
161
+ templates = Jinja2Templates(directory=str(_TEMPLATES_DIR))
162
+
163
+
164
+ # ---------------------------------------------------------------------------
165
+ # HTML endpoints
166
+ # ---------------------------------------------------------------------------
167
+
168
+
169
+ @app.get("/", response_class=HTMLResponse)
170
+ async def index(request: Request):
171
+ return templates.TemplateResponse(
172
+ "index.html",
173
+ {
174
+ "request": request,
175
+ "providers": _AVAILABLE_PROVIDERS,
176
+ "default_provider": _DEFAULT_PROVIDER if _DEFAULT_PROVIDER in _AVAILABLE_PROVIDERS else (_AVAILABLE_PROVIDERS[0] if _AVAILABLE_PROVIDERS else ""),
177
+ "result": None,
178
+ "error": None,
179
+ "input_text": "",
180
+ },
181
+ )
182
+
183
+
184
+ @app.post("/annotate", response_class=HTMLResponse)
185
+ async def annotate_html(
186
+ request: Request,
187
+ text: str = Form(...),
188
+ provider: str = Form(...),
189
+ ):
190
+ from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
191
+ from tei_annotator.pipeline import annotate
192
+ from tei_annotator.schemas.blbl import build_blbl_schema
193
+
194
+ result_xml = None
195
+ error = None
196
+ try:
197
+ call_fn = _get_call_fn(provider)
198
+ endpoint = EndpointConfig(
199
+ capability=EndpointCapability.TEXT_GENERATION,
200
+ call_fn=call_fn,
201
+ )
202
+ result = annotate(
203
+ text=text,
204
+ schema=build_blbl_schema(),
205
+ endpoint=endpoint,
206
+ gliner_model=_GLINER_MODEL,
207
+ )
208
+ result_xml = result.xml
209
+ except HTTPException as exc:
210
+ error = exc.detail
211
+ except Exception as exc:
212
+ error = str(exc)
213
+
214
+ return templates.TemplateResponse(
215
+ "index.html",
216
+ {
217
+ "request": request,
218
+ "providers": _AVAILABLE_PROVIDERS,
219
+ "default_provider": provider,
220
+ "result": result_xml,
221
+ "error": error,
222
+ "input_text": text,
223
+ },
224
+ )
225
+
226
+
227
+ # ---------------------------------------------------------------------------
228
+ # JSON API
229
+ # ---------------------------------------------------------------------------
230
+
231
+
232
+ class AttributeSchema(BaseModel):
233
+ name: str
234
+ description: str = ""
235
+ allowed_values: list[str] | None = None
236
+
237
+
238
+ class ElementSchema(BaseModel):
239
+ tag: str
240
+ description: str = ""
241
+ allowed_children: list[str] = []
242
+ attributes: list[AttributeSchema] = []
243
+
244
+
245
+ class SchemaInput(BaseModel):
246
+ elements: list[ElementSchema]
247
+ rules: list[str] = []
248
+
249
+
250
+ class AnnotateRequest(BaseModel):
251
+ model_config = {"populate_by_name": True}
252
+
253
+ text: str
254
+ provider: str | None = None
255
+ tei_schema: SchemaInput | None = Field(None, alias="schema")
256
+
257
+
258
+ class FuzzySpan(BaseModel):
259
+ element: str
260
+ start: int
261
+ end: int
262
+
263
+
264
+ class AnnotateResponse(BaseModel):
265
+ xml: str
266
+ fuzzy_spans: list[FuzzySpan]
267
+
268
+
269
+ @app.post("/api/annotate", response_model=AnnotateResponse)
270
+ async def annotate_api(body: AnnotateRequest):
271
+ """
272
+ Annotate *text* and return the XML result.
273
+
274
+ - **text**: plain text to annotate.
275
+ - **provider**: `"gemini"` or `"kisski"` (default: `DEFAULT_PROVIDER` from env).
276
+ - **schema**: TEI schema definition. Omit to use the built-in BLBL bibliographic schema.
277
+ """
278
+ from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
279
+ from tei_annotator.pipeline import annotate
280
+ from tei_annotator.schemas.blbl import build_blbl_schema
281
+
282
+ provider = body.provider or _DEFAULT_PROVIDER
283
+ call_fn = _get_call_fn(provider)
284
+
285
+ if body.tei_schema is not None:
286
+ schema = _schema_from_dict(body.tei_schema.model_dump())
287
+ else:
288
+ schema = build_blbl_schema()
289
+
290
+ endpoint = EndpointConfig(
291
+ capability=EndpointCapability.TEXT_GENERATION,
292
+ call_fn=call_fn,
293
+ )
294
+
295
+ try:
296
+ result = annotate(
297
+ text=body.text,
298
+ schema=schema,
299
+ endpoint=endpoint,
300
+ gliner_model=_GLINER_MODEL,
301
+ )
302
+ except Exception as exc:
303
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
304
+
305
+ return AnnotateResponse(
306
+ xml=result.xml,
307
+ fuzzy_spans=[
308
+ FuzzySpan(element=s.element, start=s.start, end=s.end)
309
+ for s in result.fuzzy_spans
310
+ ],
311
+ )
312
+
313
+
314
+ # ---------------------------------------------------------------------------
315
+ # Entry point for direct execution
316
+ # ---------------------------------------------------------------------------
317
+
318
+ if __name__ == "__main__":
319
+ import argparse
320
+ import uvicorn
321
+
322
+ _parser = argparse.ArgumentParser()
323
+ _parser.add_argument("--reload", action="store_true", default=False,
324
+ help="Enable auto-reload on code changes (development only).")
325
+ _args = _parser.parse_args()
326
+
327
+ host = os.environ.get("HOST", "0.0.0.0")
328
+ port = int(os.environ.get("PORT", "8000"))
329
+ uvicorn.run("main:app", host=host, port=port, reload=_args.reload)
webservice/templates/index.html ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>TEI Annotator</title>
7
+ <style>
8
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
9
+ body {
10
+ font-family: system-ui, sans-serif;
11
+ background: #f5f5f5;
12
+ color: #222;
13
+ padding: 2rem;
14
+ max-width: 900px;
15
+ margin: 0 auto;
16
+ }
17
+ h1 { font-size: 1.5rem; margin-bottom: 0.25rem; }
18
+ p.subtitle { color: #555; font-size: 0.9rem; margin-bottom: 1.5rem; }
19
+ label { display: block; font-weight: 600; margin-bottom: 0.3rem; font-size: 0.9rem; }
20
+ textarea {
21
+ width: 100%;
22
+ height: 180px;
23
+ padding: 0.6rem;
24
+ font-family: monospace;
25
+ font-size: 0.85rem;
26
+ border: 1px solid #ccc;
27
+ border-radius: 4px;
28
+ resize: vertical;
29
+ }
30
+ select {
31
+ padding: 0.4rem 0.6rem;
32
+ border: 1px solid #ccc;
33
+ border-radius: 4px;
34
+ font-size: 0.9rem;
35
+ }
36
+ .row { display: flex; align-items: center; gap: 1rem; margin-top: 0.75rem; }
37
+ button {
38
+ padding: 0.45rem 1.2rem;
39
+ background: #2563eb;
40
+ color: #fff;
41
+ border: none;
42
+ border-radius: 4px;
43
+ font-size: 0.9rem;
44
+ cursor: pointer;
45
+ }
46
+ button:hover { background: #1d4ed8; }
47
+ .result-box {
48
+ margin-top: 1.5rem;
49
+ background: #fff;
50
+ border: 1px solid #d1d5db;
51
+ border-radius: 4px;
52
+ padding: 1rem;
53
+ }
54
+ .result-box h2 { font-size: 1rem; margin-bottom: 0.5rem; color: #374151; }
55
+ pre {
56
+ white-space: pre-wrap;
57
+ word-break: break-word;
58
+ font-family: monospace;
59
+ font-size: 0.82rem;
60
+ line-height: 1.5;
61
+ background: #f9fafb;
62
+ padding: 0.75rem;
63
+ border-radius: 3px;
64
+ }
65
+ .error {
66
+ margin-top: 1.5rem;
67
+ background: #fef2f2;
68
+ border: 1px solid #fca5a5;
69
+ border-radius: 4px;
70
+ padding: 0.75rem 1rem;
71
+ color: #b91c1c;
72
+ font-size: 0.9rem;
73
+ }
74
+ .no-providers {
75
+ margin-top: 1rem;
76
+ padding: 0.75rem 1rem;
77
+ background: #fffbeb;
78
+ border: 1px solid #fcd34d;
79
+ border-radius: 4px;
80
+ font-size: 0.9rem;
81
+ color: #92400e;
82
+ }
83
+ </style>
84
+ </head>
85
+ <body>
86
+ <h1>TEI Annotator</h1>
87
+ <p class="subtitle">Paste bibliographic text below to annotate it with TEI XML tags.</p>
88
+
89
+ {% if providers %}
90
+ <form method="post" action="/annotate">
91
+ <label for="text">Input text</label>
92
+ <textarea id="text" name="text" placeholder="Paste a bibliographic entry here…" required>{{ input_text }}</textarea>
93
+
94
+ <div class="row">
95
+ <div>
96
+ <label for="provider" style="display:inline; margin-right:0.4rem;">Provider</label>
97
+ <select id="provider" name="provider">
98
+ {% for p in providers %}
99
+ <option value="{{ p }}" {% if p == default_provider %}selected{% endif %}>{{ p }}</option>
100
+ {% endfor %}
101
+ </select>
102
+ </div>
103
+ <button type="submit">Annotate</button>
104
+ </div>
105
+ </form>
106
+
107
+ {% if error %}
108
+ <div class="error"><strong>Error:</strong> {{ error }}</div>
109
+ {% endif %}
110
+
111
+ {% if result is not none %}
112
+ <div class="result-box">
113
+ <h2>Annotated XML</h2>
114
+ <pre>{{ result }}</pre>
115
+ </div>
116
+ {% endif %}
117
+
118
+ {% else %}
119
+ <div class="no-providers">
120
+ No LLM providers are configured. Set <code>GEMINI_API_KEY</code> or
121
+ <code>KISSKI_API_KEY</code> in <code>webservice/.env</code> and restart the server.
122
+ </div>
123
+ {% endif %}
124
+ </body>
125
+ </html>