File size: 14,179 Bytes
31e282f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
#!/usr/bin/env python3
"""Sync the current git tree to Hugging Face with HF-only card metadata."""

from __future__ import annotations

import argparse
import hashlib
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Any, Callable

# Fallback Hugging Face target; main() uses these when the HF_REPO_ID /
# HF_REPO_TYPE environment variables are unset and no CLI flag is given.
DEFAULT_REPO_ID = "Stevesolun/ctx"
DEFAULT_REPO_TYPE = "model"

# YAML front matter (plus one blank separator line) that
# with_hf_repo_card_metadata() places in front of the README text.
HF_CARD_METADATA = """---
license: mit
pretty_name: ctx
tags:
  - agents
  - mcp
  - skills
  - knowledge-graph
  - llm-wiki
  - recommendation-system
  - harness
  - codex
  - claude-code
---

"""

# Leading bytes used to recognise a Git LFS pointer file (as opposed to the
# hydrated artifact it stands in for).
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"
# Repo-relative artifact path -> minimum acceptable size in bytes.
# _assert_hydrated_artifacts() enforces the thresholds and
# _copy_hydrated_artifacts() copies exactly these paths into the export.
HYDRATED_ARTIFACT_MIN_BYTES = {
    Path("graph/wiki-graph.tar.gz"): 100_000_000,
    Path("graph/wiki-graph-runtime.tar.gz"): 10_000_000,
    Path("graph/skills-sh-catalog.json.gz"): 1_000_000,
}
# Command-line flag -> keyword-argument name. _graph_validation_kwargs() uses
# this table to translate value-taking flags, converting each value with int().
GRAPH_VALIDATOR_INT_FLAGS = {
    "--min-nodes": "min_nodes",
    "--min-edges": "min_edges",
    "--min-skills-sh-nodes": "min_skills_sh_nodes",
    "--min-semantic-edges": "min_semantic_edges",
    "--expected-nodes": "expected_nodes",
    "--expected-edges": "expected_edges",
    "--expected-semantic-edges": "expected_semantic_edges",
    "--expected-harness-nodes": "expected_harness_nodes",
    "--expected-skills-sh-nodes": "expected_skills_sh_nodes",
    "--expected-skills-sh-catalog-entries": "expected_skills_sh_catalog_entries",
    "--expected-skills-sh-converted": "expected_skills_sh_converted",
    "--expected-skill-pages": "expected_skill_pages",
    "--expected-agent-pages": "expected_agent_pages",
    "--expected-mcp-pages": "expected_mcp_pages",
    "--expected-harness-pages": "expected_harness_pages",
    "--line-threshold": "line_threshold",
    "--max-stage-lines": "max_stage_lines",
}


def with_hf_repo_card_metadata(readme_text: str) -> str:
    """Build the Hugging Face variant of a README.

    The text is first passed through ``_strip_leading_yaml_frontmatter`` and
    the fixed ``HF_CARD_METADATA`` block is then placed in front of the result.
    """
    body = _strip_leading_yaml_frontmatter(readme_text)
    return "".join((HF_CARD_METADATA, body))


def _strip_leading_yaml_frontmatter(text: str) -> str:
    if not text.startswith("---\n"):
        return text
    end = text.find("\n---\n", 4)
    if end == -1:
        return text
    return text[end + len("\n---\n") :].lstrip("\n")


def _git(repo: Path, *args: str) -> str:
    """Run ``git`` with *args* inside *repo* and return its stripped text output."""
    command = ["git", *args]
    stdout = subprocess.check_output(command, cwd=repo, text=True)
    return stdout.strip()


def _git_bytes(repo: Path, *args: str) -> bytes:
    """Run ``git`` with *args* inside *repo* and return its raw, unmodified stdout."""
    command = ["git", *args]
    raw_stdout = subprocess.check_output(command, cwd=repo)
    return raw_stdout


def _iter_tracked_files(repo: Path) -> list[Path]:
    """List the paths reported by ``git ls-files -z`` in *repo*.

    Entries are decoded as UTF-8 and returned as relative ``Path`` objects in
    the order git reports them.

    Raises:
        ValueError: an entry is absolute or contains a ``..`` component.
    """
    tracked: list[Path] = []
    listing = _git_bytes(repo, "ls-files", "-z")
    # NUL-separated output leaves an empty trailing field; filter() drops it.
    for entry in filter(None, listing.split(b"\0")):
        candidate = Path(entry.decode("utf-8"))
        escapes_checkout = candidate.is_absolute() or ".." in candidate.parts
        if escapes_checkout:
            raise ValueError(f"unsafe git path: {candidate}")
        tracked.append(candidate)
    return tracked


def _assert_hydrated_artifacts(repo: Path) -> None:
    """Verify that every required graph artifact in *repo* is real, hydrated data.

    For each path in ``HYDRATED_ARTIFACT_MIN_BYTES`` the file must exist, must
    not be a Git LFS pointer stub, must be at least the configured size, and
    must pass ``_assert_matches_lfs_pointer``. After all artifacts pass, the
    graph artifact validator is run.

    Raises:
        FileNotFoundError: a required artifact is missing.
        RuntimeError: an artifact is an LFS pointer, is smaller than its
            threshold, disagrees with its pointer, or graph validation fails.
    """
    for rel, min_bytes in HYDRATED_ARTIFACT_MIN_BYTES.items():
        artifact = repo / rel
        if not artifact.is_file():
            raise FileNotFoundError(
                f"{rel.as_posix()} is required before Hugging Face sync"
            )
        # Detect an un-hydrated pointer before looking at the size: a pointer
        # stub is far below every threshold, so checking size first would hide
        # the more specific diagnosis behind a generic "too small" error.
        with artifact.open("rb") as fh:
            prefix = fh.read(len(LFS_POINTER_PREFIX))
        if prefix == LFS_POINTER_PREFIX:
            raise RuntimeError(
                f"{rel.as_posix()} is a Git LFS pointer, not the hydrated artifact"
            )
        size = artifact.stat().st_size
        if size < min_bytes:
            raise RuntimeError(
                f"{rel.as_posix()} is {size:,} bytes; expected at least "
                f"{min_bytes:,}. Download or rebuild graph release artifacts "
                "before publishing."
            )
        _assert_matches_lfs_pointer(repo, rel, artifact)
    _validate_graph_artifact_integrity(repo)


def _parse_lfs_pointer(text: str) -> tuple[str, int] | None:
    """Extract ``(sha256 hex digest, size)`` from Git LFS pointer text.

    Returns ``None`` when *text* does not start with the LFS pointer prefix,
    or when it lacks a non-empty ``oid sha256:`` value or an integer ``size``.
    """
    if not text.startswith(LFS_POINTER_PREFIX.decode("ascii")):
        return None
    digest: str | None = None
    byte_count: int | None = None
    for entry in text.splitlines():
        if entry.startswith("oid sha256:"):
            digest = entry.partition(":")[2].strip()
        elif entry.startswith("size "):
            raw_size = entry.partition(" ")[2].strip()
            try:
                byte_count = int(raw_size)
            except ValueError:
                # An unparsable size line invalidates any earlier value.
                byte_count = None
    if digest and byte_count is not None:
        return digest, byte_count
    return None


def _assert_matches_lfs_pointer(repo: Path, rel: Path, artifact: Path) -> None:
    """Check *artifact* against the LFS pointer stored for *rel* at ``HEAD``.

    The check is skipped silently when ``git show HEAD:<rel>`` fails or when
    the blob it returns does not parse as an LFS pointer.

    Raises:
        RuntimeError: the file's SHA-256 digest or byte size differs from the
            values recorded in the pointer.
    """
    try:
        head_blob = _git_bytes(repo, "show", f"HEAD:{rel.as_posix()}")
    except subprocess.CalledProcessError:
        return
    parsed = _parse_lfs_pointer(head_blob.decode("utf-8", errors="replace"))
    if parsed is None:
        return
    expected_oid, expected_size = parsed
    actual_size = artifact.stat().st_size
    digest = hashlib.sha256()
    with artifact.open("rb") as stream:
        # Hash in 1 MiB pieces so the artifact is never held in memory whole.
        while block := stream.read(1024 * 1024):
            digest.update(block)
    actual_oid = digest.hexdigest()
    if (actual_oid, actual_size) == (expected_oid, expected_size):
        return
    raise RuntimeError(
        f"{rel.as_posix()} does not match HEAD LFS pointer: "
        f"sha256:{actual_oid} size:{actual_size}; expected "
        f"sha256:{expected_oid} size:{expected_size}"
    )


def _validate_graph_artifact_integrity(repo: Path) -> None:
    """Run the project's graph artifact validator against ``<repo>/graph``.

    Raises:
        RuntimeError: building the validator arguments or running the
            validator raised; the original exception is chained as the cause.
    """
    validator = _load_graph_artifact_validator(repo)
    try:
        options = _graph_validation_kwargs(repo)
        validator(repo / "graph", **options)
    except Exception as exc:  # noqa: BLE001
        raise RuntimeError(
            "graph artifact integrity validation failed before Hugging Face sync: "
            f"{exc}"
        ) from exc


def _load_graph_artifact_validator(repo: Path) -> Callable[..., object]:
    """Import and return ``validate_graph_artifacts`` from ``<repo>/src``.

    ``<repo>/src`` is put at the front of ``sys.path`` if it is not already
    listed, so the import resolves against this checkout.
    """
    src_entry = str(repo / "src")
    if src_entry not in sys.path:
        sys.path.insert(0, src_entry)
    from validate_graph_artifacts import validate_graph_artifacts as validator

    return validator


def _graph_validation_kwargs(repo: Path) -> dict[str, object]:
    scripts_dir = repo / "scripts"
    if str(scripts_dir) not in sys.path:
        sys.path.insert(0, str(scripts_dir))
    from ci_preflight import GRAPH_VALIDATE_ARGS

    args = list(GRAPH_VALIDATE_ARGS[1:])
    kwargs: dict[str, object] = {}
    i = 0
    while i < len(args):
        token = args[i]
        if token == "--graph-dir":
            if i + 1 >= len(args):
                raise RuntimeError("GRAPH_VALIDATE_ARGS has --graph-dir without a value")
            if args[i + 1] != "graph":
                raise RuntimeError(
                    "Hugging Face graph validation expects GRAPH_VALIDATE_ARGS "
                    f"to use --graph-dir graph, got {args[i + 1]!r}"
                )
            i += 2
            continue
        if token == "--deep":
            kwargs["deep"] = True
            i += 1
            continue
        field_name = GRAPH_VALIDATOR_INT_FLAGS.get(token)
        if field_name is None:
            raise RuntimeError(
                f"Hugging Face sync does not understand graph validator flag {token!r}"
            )
        if i + 1 >= len(args):
            raise RuntimeError(f"GRAPH_VALIDATE_ARGS has {token} without a value")
        kwargs[field_name] = int(args[i + 1])
        i += 2
    return kwargs


def _assert_repo_stats_current(repo: Path) -> None:
    updater = repo / "src" / "update_repo_stats.py"
    if not updater.is_file():
        raise FileNotFoundError("src/update_repo_stats.py is required before Hugging Face sync")
    subprocess.run(
        [sys.executable, str(updater), "--check"],
        cwd=repo,
        check=True,
    )


def _export_tracked_tree(repo: Path, export_dir: Path) -> None:
    """Copy the git-tracked working-tree files of *repo* into *export_dir*.

    The repo-stats check and the hydrated-artifact checks run first. Every
    path from ``_iter_tracked_files`` is then copied with ``shutil.copy2``,
    preserving its relative location; tracked paths that are not regular files
    on disk are skipped. Finally the required graph artifacts are copied via
    ``_copy_hydrated_artifacts``.

    Raises:
        ValueError: a tracked path resolves outside *repo*, is a symlink, or
            would be written outside *export_dir*.
    """
    _assert_repo_stats_current(repo)
    _assert_hydrated_artifacts(repo)
    repo_root = repo.resolve()
    export_root = export_dir.resolve()
    for rel in _iter_tracked_files(repo):
        checkout_path = repo_root / rel
        source = checkout_path.resolve()
        if source != repo_root and not source.is_relative_to(repo_root):
            raise ValueError(f"unsafe source path: {rel}")
        # resolve() follows links, so the resolved path never reports as a
        # symlink; the check has to look at the unresolved checkout path.
        if checkout_path.is_symlink():
            raise ValueError(f"refusing to follow symlink during HF sync: {rel}")
        if not source.is_file():
            continue
        target = (export_root / rel).resolve()
        if target != export_root and not target.is_relative_to(export_root):
            raise ValueError(f"unsafe export path: {rel}")
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
    _copy_hydrated_artifacts(repo_root, export_root)


def _copy_hydrated_artifacts(repo_root: Path, export_root: Path) -> None:
    """Mirror each path in ``HYDRATED_ARTIFACT_MIN_BYTES`` into the export tree.

    This runs independently of the tracked-file listing, so the artifacts are
    exported even if git does not track them.

    Raises:
        ValueError: an artifact resolves outside *repo_root*, or its
            destination resolves outside *export_root*.
    """

    def _contained(candidate: Path, root: Path) -> bool:
        # True when candidate is root itself or lives somewhere beneath it.
        return candidate == root or candidate.is_relative_to(root)

    for artifact_rel in HYDRATED_ARTIFACT_MIN_BYTES:
        origin = (repo_root / artifact_rel).resolve()
        if not _contained(origin, repo_root):
            raise ValueError(f"unsafe artifact source path: {artifact_rel}")
        destination = (export_root / artifact_rel).resolve()
        if not _contained(destination, export_root):
            raise ValueError(f"unsafe artifact export path: {artifact_rel}")
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(origin, destination)


def _patch_export_readme(export_dir: Path) -> None:
    """Rewrite ``README.md`` in *export_dir* in place with the HF card metadata.

    The file is read and written as UTF-8, with ``\\n`` line endings on output.
    """
    readme_path = export_dir / "README.md"
    current_text = readme_path.read_text(encoding="utf-8")
    patched_text = with_hf_repo_card_metadata(current_text)
    readme_path.write_text(patched_text, encoding="utf-8", newline="\n")


def _hf_status_code(exc: BaseException) -> int | None:
    response = getattr(exc, "response", None)
    status = getattr(response, "status_code", None)
    if isinstance(status, int):
        return status
    match = re.search(r"\b(4\d\d|5\d\d)\b", str(exc))
    if match:
        return int(match.group(1))
    return None


def _ensure_hf_repo_exists(*, api: Any, repo_id: str, repo_type: str) -> None:
    """Make sure the target Hugging Face repo exists, creating it if needed.

    ``api.repo_info`` is tried first. Only a 404 from that lookup leads to
    ``api.create_repo``; any other lookup failure is re-raised. If creation
    fails with a 429, the lookup is retried once and its outcome stands; any
    other creation failure is re-raised.
    """
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
    except Exception as info_exc:  # noqa: BLE001
        if _hf_status_code(info_exc) != 404:
            raise
    else:
        return

    try:
        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)
    except Exception as create_exc:  # noqa: BLE001
        if _hf_status_code(create_exc) != 429:
            raise
        # Throttled on create: accept the repo if a fresh lookup succeeds.
        api.repo_info(repo_id=repo_id, repo_type=repo_type)


def _upload_export(
    *,
    api: Any,
    export_dir: Path,
    repo_id: str,
    repo_type: str,
    head: str,
) -> str:
    info = api.upload_folder(
        repo_id=repo_id,
        repo_type=repo_type,
        folder_path=str(export_dir),
        commit_message=f"Sync ctx {head[:7]}",
        commit_description=f"GitHub commit: {head}",
        delete_patterns="*",
    )
    return str(getattr(info, "commit_url", info))


def _upload_readme_card(
    *,
    api: Any,
    repo: Path,
    repo_id: str,
    repo_type: str,
    head: str,
) -> str:
    """Upload only ``README.md``, rendered with the HF card metadata.

    The rendered text is written to a scratch directory, sent with
    ``api.upload_file`` as ``README.md``, and the scratch directory is removed
    afterwards whether or not the upload succeeded. The return value is the
    string form of the result's ``commit_url`` attribute, or of the result
    itself when it has none.
    """
    source_text = (repo / "README.md").read_text(encoding="utf-8")
    rendered = with_hf_repo_card_metadata(source_text)
    scratch = Path(tempfile.mkdtemp(prefix="ctx-hf-card-"))
    try:
        card_path = scratch / "README.md"
        card_path.write_text(rendered, encoding="utf-8", newline="\n")
        result = api.upload_file(
            repo_id=repo_id,
            repo_type=repo_type,
            path_or_fileobj=str(card_path),
            path_in_repo="README.md",
            commit_message=f"Sync ctx card {head[:7]}",
            commit_description=f"GitHub commit: {head}",
        )
        commit_ref = str(getattr(result, "commit_url", result))
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
    return commit_ref


def sync_to_huggingface(
    *,
    repo: Path,
    repo_id: str,
    repo_type: str,
    token: str,
) -> str:
    """Export the checkout at *repo* and upload it to Hugging Face.

    The tree is exported into a scratch directory, its README is patched with
    the HF card metadata, the target repo is ensured to exist, and the export
    is uploaded labelled with the current ``HEAD`` commit id. The scratch
    directory is removed on exit. Returns what ``_upload_export`` returns.
    """
    from huggingface_hub import HfApi

    head_sha = _git(repo, "rev-parse", "HEAD")
    scratch = Path(tempfile.mkdtemp(prefix="ctx-hf-upload-"))
    staging = scratch / "export"
    try:
        staging.mkdir()
        _export_tracked_tree(repo, staging)
        _patch_export_readme(staging)
        # The client is only created once the local export has succeeded.
        client = HfApi(token=token)
        _ensure_hf_repo_exists(api=client, repo_id=repo_id, repo_type=repo_type)
        commit_ref = _upload_export(
            api=client,
            repo_id=repo_id,
            repo_type=repo_type,
            export_dir=staging,
            head=head_sha,
        )
        return commit_ref
    finally:
        shutil.rmtree(scratch, ignore_errors=True)


def sync_card_to_huggingface(
    *,
    repo: Path,
    repo_id: str,
    repo_type: str,
    token: str,
) -> str:
    """Refresh just the repo card: upload the rendered ``README.md`` and nothing else.

    The target repo is ensured to exist first. Returns what
    ``_upload_readme_card`` returns.
    """
    from huggingface_hub import HfApi

    head_sha = _git(repo, "rev-parse", "HEAD")
    client = HfApi(token=token)
    _ensure_hf_repo_exists(api=client, repo_id=repo_id, repo_type=repo_type)
    commit_ref = _upload_readme_card(
        api=client,
        repo=repo,
        repo_id=repo_id,
        repo_type=repo_type,
        head=head_sha,
    )
    return commit_ref


def main() -> None:
    """Command-line entry point: parse options, run the chosen sync, print its result.

    ``--repo-id`` and ``--repo-type`` default to the ``HF_REPO_ID`` and
    ``HF_REPO_TYPE`` environment variables, falling back to the module
    defaults. The access token is read from ``HF_TOKEN``; the process exits
    with a message when it is missing or empty.
    """
    env = os.environ
    parser = argparse.ArgumentParser(
        description="Upload this git checkout to Hugging Face with repo-card metadata"
    )
    parser.add_argument("--repo", default=".", help="Git checkout path")
    parser.add_argument(
        "--repo-id",
        default=env.get("HF_REPO_ID", DEFAULT_REPO_ID),
        help="Hugging Face repo ID",
    )
    parser.add_argument(
        "--repo-type",
        default=env.get("HF_REPO_TYPE", DEFAULT_REPO_TYPE),
        help="Hugging Face repo type",
    )
    parser.add_argument(
        "--card-only",
        action="store_true",
        help="Only refresh README.md repo-card metadata without uploading artifacts",
    )
    options = parser.parse_args()

    token = env.get("HF_TOKEN")
    if not token:
        raise SystemExit("HF_TOKEN is required")

    if options.card_only:
        runner = sync_card_to_huggingface
    else:
        runner = sync_to_huggingface
    commit_ref = runner(
        repo=Path(options.repo).resolve(),
        repo_id=options.repo_id,
        repo_type=options.repo_type,
        token=token,
    )
    print(commit_ref)


if __name__ == "__main__":
    main()