multi-span / potato /codebook /schema_bridge.py
davidjurgens's picture
Deploy Potato demo: Potato — Multi-Field Spans
dc4d6fd verified
Raw
History Blame Contribute Delete
3.59 kB
"""
Schema-loader codebook bridge.
When an annotation scheme opts in with ``codebook: true``, its label
list is sourced from the project's mutable codebook instead of (only)
the static YAML ``labels``. Applied once at server start, before
front-end generation, so every downstream generator
(radio/multiselect/span/hierarchical_multiselect) keeps reading
``scheme["labels"]`` unchanged.
Legacy preservation: a config's existing YAML ``labels`` seed the
codebook the first time (so old configs keep working and the codebook
starts populated); thereafter the database is the source of truth.
"""
from __future__ import annotations
import logging
from typing import Any, Dict, List
from potato.codebook import create_code
from potato.codebook.codebook import Codebook
from potato.codebook.service import DuplicateCodeError
logger = logging.getLogger(__name__)
def _label_name(entry: Any) -> str:
if isinstance(entry, str):
return entry
if isinstance(entry, dict):
return str(entry.get("name") or entry.get("label") or "").strip()
return str(entry).strip()
def _project_of(config: Dict[str, Any]) -> str:
return config.get("annotation_task_name") or "default"
def _seed_from_yaml(
task_dir: str, project: str, yaml_labels: List[Any]
) -> None:
for entry in yaml_labels or []:
name = _label_name(entry)
if not name:
continue
try:
create_code(
task_dir, project=project, name=name,
created_by="config")
except DuplicateCodeError:
pass # idempotent: re-seeding an existing code is fine
def apply_codebook_to_schemes(config: Dict[str, Any]) -> None:
    """Point every codebook-backed scheme's ``labels`` at the codebook.

    Walks ``config['annotation_schemes']`` and, for each dict scheme
    whose ``codebook`` value is truthy, loads the project codebook
    (``task_dir`` from the config, defaulting to ``"."``; project from
    ``_project_of``). An empty codebook is first seeded from that
    scheme's own YAML ``labels`` and then reloaded. If the codebook
    yields any label names they replace ``scheme['labels']``; otherwise
    the scheme's labels are left untouched.

    The config is mutated in place; nothing is returned.
    """
    task_dir = config.get("task_dir", ".")
    project = _project_of(config)

    # Lazy filter: non-dict entries and schemes that did not opt in are
    # skipped one at a time, as each scheme is reached.
    opted_in = (
        candidate
        for candidate in (config.get("annotation_schemes") or [])
        if isinstance(candidate, dict) and candidate.get("codebook")
    )

    for scheme in opted_in:
        codebook = Codebook.load(task_dir, project)
        if codebook.is_empty():
            # NOTE(review): this function is also re-run by the codebook
            # change listener, after ``scheme['labels']`` may already
            # hold codebook names. If the codebook can become empty
            # again later, this would re-seed it from those names --
            # confirm that is intended.
            _seed_from_yaml(task_dir, project, scheme.get("labels"))
            codebook = Codebook.load(task_dir, project)

        names = codebook.labels()
        if not names:
            # Nothing in the codebook: keep the scheme's own labels.
            continue

        scheme["labels"] = names
        logger.info(
            "Codebook bridge: scheme %r now sources %d label(s) "
            "from the project codebook",
            scheme.get("name"), len(names))
def _icl_sync_listener(task_dir: str, project: str) -> None:
    """Codebook change listener: refresh the *live* server config's
    scheme labels so ICL prompts (built fresh from ``schema['labels']``
    each call) are restricted to the codebook's current set. Refreshing
    the source the prompt is built from *is* the prompt-cache
    invalidation -- there is no separate persistent ICL prompt cache.

    Best effort: if the live server config cannot be reached, is empty,
    or belongs to a different project, the call is a no-op.

    Args:
        task_dir: Part of the listener signature; not used here -- the
            refresh reads ``task_dir`` from the live config instead.
        project: Project whose codebook changed. Only a live config that
            resolves to the same project (via ``_project_of``) is
            refreshed.
    """
    try:
        from potato.server_utils import config_module
        cfg = config_module.config
    except Exception:
        # Deliberately broad and non-fatal: a codebook change must not
        # fail just because no server config is importable. Leave a
        # debug trace so the skipped sync is diagnosable.
        logger.debug(
            "Codebook ICL sync skipped: live server config unavailable",
            exc_info=True)
        return
    if not cfg:
        return
    # Use the shared helper so project resolution cannot drift from
    # the one used when the schemes were first bridged.
    if _project_of(cfg) != project:
        return
    apply_codebook_to_schemes(cfg)
def install_codebook_icl_sync() -> None:
    """Hook ``_icl_sync_listener`` into codebook change notifications.

    Intended to be called at server init alongside the other mode
    initializers. Described as idempotent -- presumably
    ``register_change_listener`` ignores repeat registrations of the
    same callable; confirm against the codebook service.
    """
    # Resolved at call time rather than at module import.
    from potato.codebook.service import (
        register_change_listener as _register_listener,
    )

    _register_listener(_icl_sync_listener)