File size: 3,983 Bytes
aceb1b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
"""
`potato codebook <config.yaml>` — initialise / migrate a project's
codebook from its YAML config.

For every annotation scheme with ``codebook: true`` it ensures a code
exists for each YAML label. Codes get **deterministic** ids
(``uuid5`` over ``project | parent_id | name``) so re-running is a
no-op and the same config always yields the same ids across machines
(important when annotation rows carry a parallel ``code_id``).

Idempotent: existing codes are left untouched; only missing ones are
created. Safe to run repeatedly and in CI.

Usage:
    potato codebook path/to/config.yaml
    potato codebook path/to/config.yaml --dry-run
"""

from __future__ import annotations

import argparse
import os
import sys
import uuid
from typing import Any, Dict, List

import yaml

from potato.codebook import create_code
from potato.codebook.codebook import Codebook
from potato.codebook.service import DuplicateCodeError
from potato.codebook.store import ROOT

# Stable namespace so ids are reproducible across machines/runs.
_NS = uuid.UUID("6b9b1f6e-1c2d-5a4b-9e3f-c0deb00c0de5")


def deterministic_code_id(project: str, parent_id: str, name: str) -> str:
    return uuid.uuid5(_NS, f"{project}\x1f{parent_id}\x1f{name}").hex


def _label_name(entry: Any) -> str:
    if isinstance(entry, str):
        return entry
    if isinstance(entry, dict):
        return str(entry.get("name") or entry.get("label") or "").strip()
    return str(entry).strip()


def _resolve_task_dir(config_file: str, config: Dict[str, Any]) -> str:
    base = os.path.dirname(os.path.abspath(config_file))
    return os.path.normpath(os.path.join(base, config.get("task_dir", ".")))


def init_codebook(config_file: str, *, dry_run: bool = False) -> Dict[str, int]:
    """Seed missing codes for every codebook-enabled scheme.

    Returns {"created": n, "existing": m}.
    """
    with open(config_file, "rt", encoding="utf-8") as fh:
        config = yaml.safe_load(fh) or {}

    task_dir = _resolve_task_dir(config_file, config)
    project = config.get("annotation_task_name") or "default"
    schemes: List[Dict[str, Any]] = config.get("annotation_schemes") or []

    created = existing = 0
    for scheme in schemes:
        if not isinstance(scheme, dict) or not scheme.get("codebook"):
            continue
        cb = Codebook.load(task_dir, project)
        present = set(cb.labels())
        for entry in scheme.get("labels") or []:
            name = _label_name(entry)
            if not name:
                continue
            if name in present:
                existing += 1
                continue
            if dry_run:
                created += 1
                present.add(name)
                continue
            cid = deterministic_code_id(project, ROOT, name)
            try:
                create_code(task_dir, project=project, name=name,
                            created_by="codebook-cli", code_id=cid)
                created += 1
                present.add(name)
            except DuplicateCodeError:
                existing += 1

    return {"created": created, "existing": existing}


def main(argv=None) -> int:
    parser = argparse.ArgumentParser(
        prog="potato codebook",
        description="Initialise a project codebook from its YAML config.")
    parser.add_argument("config_file", help="Path to the project config.yaml")
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Report what would be created without writing.")
    args = parser.parse_args(argv)

    if not os.path.isfile(args.config_file):
        print(f"Config not found: {args.config_file}", file=sys.stderr)
        return 2

    result = init_codebook(args.config_file, dry_run=args.dry_run)
    verb = "Would create" if args.dry_run else "Created"
    print(f"{verb} {result['created']} code(s); "
          f"{result['existing']} already present.")
    return 0


if __name__ == "__main__":
    sys.exit(main())