Spaces:

cmboulanger
/

tei-annotator

Sleeping

App Files Files Community

tei-annotator / app.py

cmboulanger

fix: sync batch size with sample size in gradio app

1659d98 24 days ago

raw

history blame contribute delete

11.2 kB

	"""
	TEI Annotator — Gradio demo for HuggingFace Spaces.

	Set HF_TOKEN as a Space secret. All inference calls use that token;
	visitors use the app without any login or token input.

	Deploy on HF Spaces (SDK: gradio). Required secret: HF_TOKEN.
	"""

	from __future__ import annotations

	import json
	import os
	import random
	import time
	import urllib.error
	import urllib.request
	from pathlib import Path

	import gradio as gr

	# ---------------------------------------------------------------------------
	# Config (mirrors webservice/main.py — keep in sync if you change models)
	# ---------------------------------------------------------------------------

	_HF_TOKEN = os.environ.get("HF_TOKEN", "")

	_HF_MODELS = [
	"meta-llama/Llama-3.3-70B-Instruct:nscale",
	"meta-llama/Llama-3.1-70B-Instruct:scaleway",
	"Qwen/Qwen3-32B:nscale",
	"deepseek-ai/DeepSeek-R1-Distill-Llama-70B:nscale",
	"Qwen/Qwen2.5-Coder-32B-Instruct:nscale",
	"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B:nscale",
	"meta-llama/Llama-3.1-8B-Instruct:nscale",
	"Qwen/Qwen3-14B:nscale",
	"openai/gpt-oss-20b:nscale",
	"Qwen/QwQ-32B:nscale",
	]

	_HF_BASE_URL = "https://router.huggingface.co/v1"
	_FIXTURE_PATH = Path(__file__).parent / "tests" / "fixtures" / "blbl-examples.tei.xml"

	# Build the schema once at startup — it's immutable and expensive-ish to rebuild per request.
	from tei_annotator.schemas.blbl import build_blbl_schema as _build_blbl_schema
	_SCHEMA = _build_blbl_schema()

	# ---------------------------------------------------------------------------
	# HTTP helper
	# ---------------------------------------------------------------------------


	def _post_json(url: str, payload: dict, headers: dict, timeout: int = 300) -> dict:
	body = json.dumps(payload).encode()
	req = urllib.request.Request(url, data=body, headers=headers, method="POST")
	try:
	with urllib.request.urlopen(req, timeout=timeout) as resp:
	return json.loads(resp.read())
	except urllib.error.HTTPError as exc:
	detail = exc.read().decode(errors="replace")
	raise RuntimeError(f"HTTP {exc.code}: {detail}") from exc


	def _make_call_fn(model: str, timeout: int = 300):
	url = f"{_HF_BASE_URL}/chat/completions"
	headers = {"Content-Type": "application/json", "Authorization": f"Bearer {_HF_TOKEN}"}

	def call_fn(prompt: str) -> str:
	payload = {
	"model": model,
	"messages": [{"role": "user", "content": prompt}],
	"temperature": 0.1,
	}
	result = _post_json(url, payload, headers, timeout)
	return result["choices"][0]["message"]["content"]

	call_fn.__name__ = f"hf/{model}"
	return call_fn


	# ---------------------------------------------------------------------------
	# Action functions
	# ---------------------------------------------------------------------------


	def do_annotate(text: str, model: str):
	if not _HF_TOKEN:
	return "", "HF_TOKEN is not set. Add it as a Space secret."
	if not text.strip():
	return "", "Please enter some text to annotate."

	from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
	from tei_annotator.pipeline import annotate

	endpoint = EndpointConfig(
	capability=EndpointCapability.TEXT_GENERATION,
	call_fn=_make_call_fn(model),
	)
	schema = _SCHEMA
	t0 = time.monotonic()
	try:
	result = annotate(text=text, schema=schema, endpoint=endpoint, gliner_model=None)
	except Exception as exc:
	return "", f"Error: {exc}"
	elapsed = round(time.monotonic() - t0, 1)
	return result.xml, f"Done in {elapsed}s"


	def do_load_samples(n: int):
	from lxml import etree

	if not _FIXTURE_PATH.exists():
	return "Fixture file not found. Make sure tests/fixtures/ is present."
	tree = etree.parse(str(_FIXTURE_PATH))
	bibls = tree.findall(".//{http://www.tei-c.org/ns/1.0}bibl")
	samples = random.sample(bibls, min(int(n), len(bibls)))

	from tei_annotator.evaluation.extractor import extract_spans

	return "\n\n".join(extract_spans(el)[0] for el in samples)


	_BATCH_SEP = "\n---RECORD\|\|\|SEP\|\|\|BOUNDARY---\n"


	def do_evaluate(model: str, n: int, batch_size: int = 1):
	if not _HF_TOKEN:
	return None, "HF_TOKEN is not set. Add it as a Space secret."

	from lxml import etree

	from tei_annotator.evaluation.extractor import extract_spans
	from tei_annotator.evaluation.metrics import MatchMode, aggregate, compute_metrics
	from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
	from tei_annotator.pipeline import annotate

	if not _FIXTURE_PATH.exists():
	return None, "Fixture file not found. Make sure tests/fixtures/ is present."

	tree = etree.parse(str(_FIXTURE_PATH))
	bibls = tree.findall(".//{http://www.tei-c.org/ns/1.0}bibl")
	samples = random.sample(bibls, min(int(n), len(bibls)))
	schema = _SCHEMA
	endpoint = EndpointConfig(
	capability=EndpointCapability.TEXT_GENERATION,
	call_fn=_make_call_fn(model),
	)
	batch_size = max(1, int(batch_size))
	parser = etree.XMLParser(recover=True)

	def _annotate_one(el):
	plain_text, gold_spans = extract_spans(el)
	try:
	ann_result = annotate(plain_text, schema, endpoint, gliner_model=None)
	pred_el = etree.fromstring(f"<bibl>{ann_result.xml}</bibl>".encode(), parser)
	except Exception:
	pred_el = etree.Element("bibl")
	_, pred_spans = extract_spans(pred_el)
	return compute_metrics(gold_spans, pred_spans, mode=MatchMode.TEXT)

	def _annotate_batch(batch_els):
	plain_texts = []
	gold_spans_list = []
	for el in batch_els:
	pt, gs = extract_spans(el)
	plain_texts.append(pt)
	gold_spans_list.append(gs)

	if any(_BATCH_SEP in t for t in plain_texts):
	return [_annotate_one(el) for el in batch_els]

	combined = _BATCH_SEP.join(plain_texts)
	try:
	ann_result = annotate(combined, schema, endpoint, gliner_model=None)
	except Exception as exc:
	raise RuntimeError(f"Error during batch annotation: {exc}") from exc

	pieces = ann_result.xml.split(_BATCH_SEP)
	if len(pieces) != len(batch_els):
	return [
	compute_metrics(gs, [], mode=MatchMode.TEXT)
	for gs in gold_spans_list
	]

	results = []
	for piece, gs in zip(pieces, gold_spans_list):
	try:
	pred_el = etree.fromstring(f"<bibl>{piece}</bibl>".encode(), parser)
	except Exception:
	pred_el = etree.Element("bibl")
	_, pred_spans = extract_spans(pred_el)
	results.append(compute_metrics(gs, pred_spans, mode=MatchMode.TEXT))
	return results

	per_result = []
	t0 = time.monotonic()
	try:
	if batch_size <= 1:
	for el in samples:
	per_result.append(_annotate_one(el))
	else:
	for start in range(0, len(samples), batch_size):
	per_result.extend(_annotate_batch(samples[start : start + batch_size]))
	except Exception as exc:
	return None, f"Error during annotation: {exc}"

	elapsed = round(time.monotonic() - t0, 1)
	agg = aggregate(per_result)

	rows = [
	[
	tag,
	round(m.precision, 3),
	round(m.recall, 3),
	round(m.f1, 3),
	m.true_positives,
	m.false_positives,
	m.false_negatives,
	]
	for tag, m in sorted(agg.per_element.items(), key=lambda kv: -kv[1].f1)
	]
	summary = (
	f"n={len(samples)} samples \| "
	f"micro P={agg.micro_precision:.3f} R={agg.micro_recall:.3f} F1={agg.micro_f1:.3f} \| "
	f"{elapsed}s"
	)
	return rows, summary


	# ---------------------------------------------------------------------------
	# Gradio UI
	# ---------------------------------------------------------------------------

	with gr.Blocks(title="TEI Annotator") as demo:
	gr.Markdown(
	"# TEI Annotator\n"
	"[GitHub Repo](https://github.com/cboulanger/tei-annotator)\n"
	"This demo annotates bibliographic plain text with TEI XML tags (tei:author, tei:title etc.) "
	"using open LLMs via the HuggingFace Inference Router."
	)
	if not _HF_TOKEN:
	gr.Markdown("> Setup required: Set `HF_TOKEN` as a Space secret and restart.")

	with gr.Tabs():
	# ── Annotation tab ────────────────────────────────────────────────
	with gr.Tab("Annotate"):
	model_dd = gr.Dropdown(
	choices=_HF_MODELS, value=_HF_MODELS[0], label="Model"
	)
	text_in = gr.Textbox(
	lines=5,
	label="Input text",
	placeholder="Paste a bibliographic reference here…",
	)
	annotate_btn = gr.Button("Annotate", variant="primary")
	xml_out = gr.Code(label="XML output", language="html", interactive=False)
	ann_status = gr.Textbox(label="Status", interactive=False, max_lines=1)

	annotate_btn.click(
	do_annotate,
	inputs=[text_in, model_dd],
	outputs=[xml_out, ann_status],
	)

	# ── Evaluation tab ────────────────────────────────────────────────
	with gr.Tab("Evaluate"):
	eval_model_dd = gr.Dropdown(
	choices=_HF_MODELS, value=_HF_MODELS[0], label="Model"
	)
	n_slider = gr.Slider(1, 20, value=5, step=1, label="Number of samples")
	batch_size_slider = gr.Slider(
	1, 20, value=min(n_slider.value, 5), step=1, label="Batch size",
	info="Records per LLM call. Values > 1 reduce latency but may reduce quality.",
	)
	with gr.Row():
	sample_btn = gr.Button("Load sample texts")
	eval_btn = gr.Button("Run evaluation", variant="primary")
	sample_out = gr.Textbox(
	lines=8,
	label="Sampled texts (from gold standard)",
	interactive=False,
	)
	eval_table = gr.Dataframe(
	headers=["Element", "Precision", "Recall", "F1", "TP", "FP", "FN"],
	label="Per-element metrics (sorted by F1)",
	)
	eval_status = gr.Textbox(label="Summary", interactive=False, max_lines=2)

	n_slider.change(
	lambda n: min(int(n), 5),
	inputs=[n_slider],
	outputs=[batch_size_slider],
	)
	sample_btn.click(do_load_samples, inputs=[n_slider], outputs=[sample_out])
	eval_btn.click(
	do_evaluate,
	inputs=[eval_model_dd, n_slider, batch_size_slider],
	outputs=[eval_table, eval_status],
	)

	if __name__ == "__main__":
	demo.launch(ssr_mode=False)