| |
| """ |
| `potato codebook <config.yaml>` — initialise / migrate a project's |
| codebook from its YAML config. |
| |
| For every annotation scheme with ``codebook: true`` it ensures a code |
| exists for each YAML label. Codes get **deterministic** ids |
| (``uuid5`` over ``project | parent_id | name``) so re-running is a |
| no-op and the same config always yields the same ids across machines |
| (important when annotation rows carry a parallel ``code_id``). |
| |
| Idempotent: existing codes are left untouched; only missing ones are |
| created. Safe to run repeatedly and in CI. |
| |
| Usage: |
| potato codebook path/to/config.yaml |
| potato codebook path/to/config.yaml --dry-run |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
| import sys |
| import uuid |
| from typing import Any, Dict, List |
|
|
| import yaml |
|
|
| from potato.codebook import create_code |
| from potato.codebook.codebook import Codebook |
| from potato.codebook.service import DuplicateCodeError |
| from potato.codebook.store import ROOT |
|
|
| |
# Fixed namespace under which every code id is derived; the ids produced
# below are a pure function of this value and the hashed key.
_NS = uuid.UUID("6b9b1f6e-1c2d-5a4b-9e3f-c0deb00c0de5")


def deterministic_code_id(project: str, parent_id: str, name: str) -> str:
    """Derive a reproducible code id from project, parent id and name.

    The three fields are joined with the ASCII unit separator (0x1F) and
    hashed with ``uuid5`` under the module namespace ``_NS``, so the same
    inputs always produce the same id.

    Returns:
        The 32-character hexadecimal form of the derived UUID.
    """
    hashed_key = f"{project}\x1f{parent_id}\x1f{name}"
    derived = uuid.uuid5(_NS, hashed_key)
    return derived.hex
|
|
|
|
| def _label_name(entry: Any) -> str: |
| if isinstance(entry, str): |
| return entry |
| if isinstance(entry, dict): |
| return str(entry.get("name") or entry.get("label") or "").strip() |
| return str(entry).strip() |
|
|
|
|
| def _resolve_task_dir(config_file: str, config: Dict[str, Any]) -> str: |
| base = os.path.dirname(os.path.abspath(config_file)) |
| return os.path.normpath(os.path.join(base, config.get("task_dir", "."))) |
|
|
|
|
def init_codebook(config_file: str, *, dry_run: bool = False) -> Dict[str, int]:
    """Create any codes missing from the codebook of a project config.

    The YAML config is parsed, and for each entry of
    ``annotation_schemes`` that is a mapping with a truthy ``codebook``
    key, the codebook is loaded and every label name not already among
    its labels is created with a deterministic id under ``ROOT``.

    Args:
        config_file: Path to the project's YAML config.
        dry_run: When true, ``create_code`` is never called; labels that
            would be created are only counted.

    Returns:
        ``{"created": n, "existing": m}`` — labels created (or that would
        be, under ``dry_run``) and labels found already present.
    """
    with open(config_file, "rt", encoding="utf-8") as stream:
        config = yaml.safe_load(stream) or {}

    task_dir = _resolve_task_dir(config_file, config)
    project = config.get("annotation_task_name") or "default"
    schemes: List[Dict[str, Any]] = config.get("annotation_schemes") or []

    tally = {"created": 0, "existing": 0}
    for scheme in schemes:
        if not (isinstance(scheme, dict) and scheme.get("codebook")):
            continue

        # Names already in the codebook, plus those handled in this pass.
        known = set(Codebook.load(task_dir, project).labels())
        for entry in scheme.get("labels") or []:
            name = _label_name(entry)
            if not name:
                continue
            if name in known:
                tally["existing"] += 1
                continue

            if not dry_run:
                code_id = deterministic_code_id(project, ROOT, name)
                try:
                    create_code(
                        task_dir,
                        project=project,
                        name=name,
                        created_by="codebook-cli",
                        code_id=code_id,
                    )
                except DuplicateCodeError:
                    # The service reports the code as already stored.
                    tally["existing"] += 1
                    continue

            tally["created"] += 1
            known.add(name)

    return tally
|
|
|
|
def main(argv=None) -> int:
    """Command-line entry point for ``potato codebook``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` after a run (a summary line is printed to stdout), or ``2``
        when the config path is not an existing file (a message is
        printed to stderr).
    """
    cli = argparse.ArgumentParser(
        prog="potato codebook",
        description="Initialise a project codebook from its YAML config.",
    )
    cli.add_argument("config_file", help="Path to the project config.yaml")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Report what would be created without writing.",
    )
    options = cli.parse_args(argv)
    config_path = options.config_file

    if not os.path.isfile(config_path):
        print(f"Config not found: {config_path}", file=sys.stderr)
        return 2

    counts = init_codebook(config_path, dry_run=options.dry_run)
    verb = "Created"
    if options.dry_run:
        verb = "Would create"
    n_created = counts["created"]
    n_existing = counts["existing"]
    print(f"{verb} {n_created} code(s); {n_existing} already present.")
    return 0
|
|
|
|
# Script entry: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|