File size: 3,402 Bytes
fe3046d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
import os
import time
import urllib.error
import urllib.request
from pathlib import Path


# Prefix prepended to every API path passed to request().
BASE_URL = "http://127.0.0.1:7860"
# Directory that main() reads label_config.xml and data/tasks_annotator_*.json from.
ROOT = Path("/opt/rca_label_studio")
# Marker file written by main() after bootstrapping; when it exists, project
# creation is skipped. Lives under LABEL_STUDIO_BASE_DATA_DIR (default "/data").
MARKER = Path(os.getenv("LABEL_STUDIO_BASE_DATA_DIR", "/data")) / ".rca_projects_bootstrapped"


def request(method: str, path: str, payload: dict | list | None = None) -> tuple[int, dict | list | str]:
    """Send an authenticated HTTP request to ``BASE_URL`` + ``path``.

    The API token is read from the ``LABEL_STUDIO_USER_TOKEN`` environment
    variable (default ``"rca-admin-token"``) and sent in a ``Token``
    authorization header. A non-``None`` ``payload`` is serialized to JSON and
    sent as the request body with a JSON content type.

    Args:
        method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
        path: Path (and optional query string) appended to ``BASE_URL``.
        payload: Optional JSON-serializable request body.

    Returns:
        A ``(status, body)`` pair. ``body`` is the parsed JSON document, the
        raw response text when it is not valid JSON, or ``""`` when the
        response is empty.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` propagates to the
        caller unchanged; nothing is caught here except JSON decode errors.
    """
    api_token = os.getenv("LABEL_STUDIO_USER_TOKEN", "rca-admin-token")
    request_headers = {"Authorization": f"Token {api_token}"}

    encoded_body = None
    if payload is not None:
        encoded_body = json.dumps(payload).encode("utf-8")
        request_headers["Content-Type"] = "application/json"

    http_request = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=encoded_body,
        headers=request_headers,
        method=method,
    )
    # 30-second timeout per request, as in every call made by this script.
    with urllib.request.urlopen(http_request, timeout=30) as response:
        text = response.read().decode("utf-8")
        status = response.status

    if not text:
        return status, ""
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back instead of failing.
        return status, text
    return status, parsed


def wait_for_server() -> None:
    """Block until the Label Studio API answers a request.

    Issues ``GET /api/projects/`` repeatedly, sleeping 2 seconds after each
    failed attempt, and returns as soon as one attempt completes without
    raising.

    Raises:
        RuntimeError: If no attempt succeeds within 180 seconds. The last
            request failure is attached as the exception's ``__cause__`` so
            the reason the server was unreachable is visible in the traceback.
    """
    timeout_seconds = 180
    poll_interval_seconds = 2

    # Use the monotonic clock so wall-clock adjustments (NTP sync, manual
    # changes) cannot shorten or extend the wait.
    deadline = time.monotonic() + timeout_seconds
    last_error: Exception | None = None

    while time.monotonic() < deadline:
        try:
            request("GET", "/api/projects/")
        except Exception as exc:
            # Any failure is treated as "not ready yet"; remember it for the
            # final error report and retry after a short pause.
            last_error = exc
            time.sleep(poll_interval_seconds)
        else:
            return

    raise RuntimeError("Label Studio did not become ready within 180 seconds") from last_error


def existing_project_titles() -> set[str]:
    """Return the titles of the projects listed by ``GET /api/projects/``.

    The response body may be a dict, in which case its ``"results"`` entry is
    used (empty if absent), or a bare list of projects. Any other body yields
    an empty set. Entries that are not dicts are ignored; a project without a
    ``"title"`` contributes the empty string.
    """
    _, body = request("GET", "/api/projects/")

    if isinstance(body, list):
        entries = body
    elif isinstance(body, dict):
        entries = body.get("results", [])
    else:
        entries = []

    titles: set[str] = set()
    for entry in entries:
        if isinstance(entry, dict):
            titles.add(str(entry.get("title", "")))
    return titles


def create_project(title: str, tasks_path: Path, label_config: str) -> None:
    """Create one Label Studio project and import its tasks.

    The tasks are loaded from the JSON file at ``tasks_path``, the project is
    created with ``POST /api/projects/``, and the tasks are then posted to the
    project's import endpoint.

    If the import fails, the freshly created project is deleted (best effort)
    before the error is re-raised. Without this, an empty project with this
    title would remain, and a later bootstrap run that skips existing titles
    would never import its tasks.

    Args:
        title: Title of the project to create.
        tasks_path: Path to a JSON file containing the tasks to import.
        label_config: Labeling configuration sent as ``label_config``.

    Raises:
        RuntimeError: If the create response is not a JSON object with an
            ``"id"`` field.
        Exception: Any error from reading ``tasks_path`` or from the HTTP
            requests propagates to the caller.
    """
    with tasks_path.open("r", encoding="utf-8") as f:
        tasks = json.load(f)

    description = (
        "Validate the machine RCA for generated legal judgments. "
        "Choose the earliest meaningful FIRAC stage, major error category, "
        "and one exact minor error category."
    )
    _, project = request(
        "POST",
        "/api/projects/",
        {
            "title": title,
            "description": description,
            "label_config": label_config,
            "sampling": "Sequential sampling",
            "show_instruction": True,
            "show_skip_button": False,
            "enable_empty_annotation": False,
        },
    )
    # request() may return a list or raw text; fail with a clear message
    # instead of an opaque TypeError/KeyError when indexing.
    if not isinstance(project, dict) or "id" not in project:
        raise RuntimeError(f"Unexpected response when creating project {title!r}: {project!r}")
    project_id = project["id"]

    try:
        request(
            "POST",
            f"/api/projects/{project_id}/import?return_task_ids=true",
            tasks,
        )
    except Exception:
        # Roll back the half-created project so a retry starts clean.
        try:
            request("DELETE", f"/api/projects/{project_id}/")
        except Exception as cleanup_error:
            # Cleanup is best effort; report it and surface the original error.
            print(
                f"Failed to remove partially created project {title}: {cleanup_error}",
                flush=True,
            )
        raise

    print(f"Bootstrapped {title} with {len(tasks)} tasks", flush=True)


def main() -> None:
    """Bootstrap the three RCA validation projects exactly once.

    Waits for the server, then exits early if the marker file already exists.
    Otherwise reads ``label_config.xml`` from ``ROOT``, creates the project
    for each of annotators 1-3 whose title is not already present, and finally
    writes the current timestamp to the marker file.
    """
    wait_for_server()
    if MARKER.exists():
        print("RCA Label Studio bootstrap marker exists; skipping project creation.", flush=True)
        return

    config_path = ROOT / "label_config.xml"
    label_config = config_path.read_text(encoding="utf-8")
    known_titles = existing_project_titles()

    for annotator in (1, 2, 3):
        project_title = f"RCA Validation - Annotator {annotator}"
        if project_title not in known_titles:
            tasks_file = ROOT / "data" / f"tasks_annotator_{annotator}.json"
            create_project(project_title, tasks_file, label_config)
        else:
            print(f"{project_title} already exists; skipping.", flush=True)

    # Record completion so later runs skip project creation.
    MARKER.write_text(str(time.time()), encoding="utf-8")


# Run the bootstrap only when executed as a script, not when imported.
if __name__ == "__main__":
    main()