codebook / potato /codebook_cli.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
3.98 kB
#!/usr/bin/env python
"""
`potato codebook <config.yaml>` — initialise / migrate a project's
codebook from its YAML config.
For every annotation scheme with ``codebook: true`` it ensures a code
exists for each YAML label. Codes get **deterministic** ids
(``uuid5`` over ``project | parent_id | name``) so re-running is a
no-op and the same config always yields the same ids across machines
(important when annotation rows carry a parallel ``code_id``).
Idempotent: existing codes are left untouched; only missing ones are
created. Safe to run repeatedly and in CI.
Usage:
potato codebook path/to/config.yaml
potato codebook path/to/config.yaml --dry-run
"""
from __future__ import annotations
import argparse
import os
import sys
import uuid
from typing import Any, Dict, List
import yaml
from potato.codebook import create_code
from potato.codebook.codebook import Codebook
from potato.codebook.service import DuplicateCodeError
from potato.codebook.store import ROOT
# Stable namespace so ids are reproducible across machines/runs.
_NS = uuid.UUID("6b9b1f6e-1c2d-5a4b-9e3f-c0deb00c0de5")
def deterministic_code_id(project: str, parent_id: str, name: str) -> str:
return uuid.uuid5(_NS, f"{project}\x1f{parent_id}\x1f{name}").hex
def _label_name(entry: Any) -> str:
if isinstance(entry, str):
return entry
if isinstance(entry, dict):
return str(entry.get("name") or entry.get("label") or "").strip()
return str(entry).strip()
def _resolve_task_dir(config_file: str, config: Dict[str, Any]) -> str:
base = os.path.dirname(os.path.abspath(config_file))
return os.path.normpath(os.path.join(base, config.get("task_dir", ".")))
def init_codebook(config_file: str, *, dry_run: bool = False) -> Dict[str, int]:
"""Seed missing codes for every codebook-enabled scheme.
Returns {"created": n, "existing": m}.
"""
with open(config_file, "rt", encoding="utf-8") as fh:
config = yaml.safe_load(fh) or {}
task_dir = _resolve_task_dir(config_file, config)
project = config.get("annotation_task_name") or "default"
schemes: List[Dict[str, Any]] = config.get("annotation_schemes") or []
created = existing = 0
for scheme in schemes:
if not isinstance(scheme, dict) or not scheme.get("codebook"):
continue
cb = Codebook.load(task_dir, project)
present = set(cb.labels())
for entry in scheme.get("labels") or []:
name = _label_name(entry)
if not name:
continue
if name in present:
existing += 1
continue
if dry_run:
created += 1
present.add(name)
continue
cid = deterministic_code_id(project, ROOT, name)
try:
create_code(task_dir, project=project, name=name,
created_by="codebook-cli", code_id=cid)
created += 1
present.add(name)
except DuplicateCodeError:
existing += 1
return {"created": created, "existing": existing}
def main(argv=None) -> int:
parser = argparse.ArgumentParser(
prog="potato codebook",
description="Initialise a project codebook from its YAML config.")
parser.add_argument("config_file", help="Path to the project config.yaml")
parser.add_argument(
"--dry-run", action="store_true",
help="Report what would be created without writing.")
args = parser.parse_args(argv)
if not os.path.isfile(args.config_file):
print(f"Config not found: {args.config_file}", file=sys.stderr)
return 2
result = init_codebook(args.config_file, dry_run=args.dry_run)
verb = "Would create" if args.dry_run else "Created"
print(f"{verb} {result['created']} code(s); "
f"{result['existing']} already present.")
return 0
if __name__ == "__main__":
sys.exit(main())