#!/usr/bin/env python """ `potato codebook ` — initialise / migrate a project's codebook from its YAML config. For every annotation scheme with ``codebook: true`` it ensures a code exists for each YAML label. Codes get **deterministic** ids (``uuid5`` over ``project | parent_id | name``) so re-running is a no-op and the same config always yields the same ids across machines (important when annotation rows carry a parallel ``code_id``). Idempotent: existing codes are left untouched; only missing ones are created. Safe to run repeatedly and in CI. Usage: potato codebook path/to/config.yaml potato codebook path/to/config.yaml --dry-run """ from __future__ import annotations import argparse import os import sys import uuid from typing import Any, Dict, List import yaml from potato.codebook import create_code from potato.codebook.codebook import Codebook from potato.codebook.service import DuplicateCodeError from potato.codebook.store import ROOT # Stable namespace so ids are reproducible across machines/runs. _NS = uuid.UUID("6b9b1f6e-1c2d-5a4b-9e3f-c0deb00c0de5") def deterministic_code_id(project: str, parent_id: str, name: str) -> str: return uuid.uuid5(_NS, f"{project}\x1f{parent_id}\x1f{name}").hex def _label_name(entry: Any) -> str: if isinstance(entry, str): return entry if isinstance(entry, dict): return str(entry.get("name") or entry.get("label") or "").strip() return str(entry).strip() def _resolve_task_dir(config_file: str, config: Dict[str, Any]) -> str: base = os.path.dirname(os.path.abspath(config_file)) return os.path.normpath(os.path.join(base, config.get("task_dir", "."))) def init_codebook(config_file: str, *, dry_run: bool = False) -> Dict[str, int]: """Seed missing codes for every codebook-enabled scheme. Returns {"created": n, "existing": m}. """ with open(config_file, "rt", encoding="utf-8") as fh: config = yaml.safe_load(fh) or {} task_dir = _resolve_task_dir(config_file, config) project = config.get("annotation_task_name") or "default" schemes: List[Dict[str, Any]] = config.get("annotation_schemes") or [] created = existing = 0 for scheme in schemes: if not isinstance(scheme, dict) or not scheme.get("codebook"): continue cb = Codebook.load(task_dir, project) present = set(cb.labels()) for entry in scheme.get("labels") or []: name = _label_name(entry) if not name: continue if name in present: existing += 1 continue if dry_run: created += 1 present.add(name) continue cid = deterministic_code_id(project, ROOT, name) try: create_code(task_dir, project=project, name=name, created_by="codebook-cli", code_id=cid) created += 1 present.add(name) except DuplicateCodeError: existing += 1 return {"created": created, "existing": existing} def main(argv=None) -> int: parser = argparse.ArgumentParser( prog="potato codebook", description="Initialise a project codebook from its YAML config.") parser.add_argument("config_file", help="Path to the project config.yaml") parser.add_argument( "--dry-run", action="store_true", help="Report what would be created without writing.") args = parser.parse_args(argv) if not os.path.isfile(args.config_file): print(f"Config not found: {args.config_file}", file=sys.stderr) return 2 result = init_codebook(args.config_file, dry_run=args.dry_run) verb = "Would create" if args.dry_run else "Created" print(f"{verb} {result['created']} code(s); " f"{result['existing']} already present.") return 0 if __name__ == "__main__": sys.exit(main())