| |
| """Sync the current git tree to Hugging Face with HF-only card metadata.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import hashlib |
| import os |
| import re |
| import shutil |
| import subprocess |
| import sys |
| import tempfile |
| from pathlib import Path |
| from typing import Any, Callable |
|
|
# Fallback Hugging Face target used by main() when neither the CLI flags nor
# the HF_REPO_ID / HF_REPO_TYPE environment variables supply one.
DEFAULT_REPO_ID = "Stevesolun/ctx"
DEFAULT_REPO_TYPE = "model"
|
|
# YAML front matter that with_hf_repo_card_metadata() prepends to the README.
# The literal ends with one blank line after the closing "---".
HF_CARD_METADATA = """---
license: mit
pretty_name: ctx
tags:
- agents
- mcp
- skills
- knowledge-graph
- llm-wiki
- recommendation-system
- harness
- codex
- claude-code
---

"""
|
|
# Leading bytes of a Git LFS pointer file; a file starting with these bytes is
# treated as an un-hydrated pointer rather than the real artifact.
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"
# Graph artifacts that must exist in the checkout before a sync, keyed by
# repo-relative path, with the minimum acceptable size in bytes.
HYDRATED_ARTIFACT_MIN_BYTES = {
    Path("graph/wiki-graph.tar.gz"): 100_000_000,
    Path("graph/wiki-graph-runtime.tar.gz"): 10_000_000,
    Path("graph/skills-sh-catalog.json.gz"): 1_000_000,
}
# Validator CLI flags that take an integer value, mapped to the keyword
# argument name used when calling validate_graph_artifacts().
GRAPH_VALIDATOR_INT_FLAGS = {
    "--min-nodes": "min_nodes",
    "--min-edges": "min_edges",
    "--min-skills-sh-nodes": "min_skills_sh_nodes",
    "--min-semantic-edges": "min_semantic_edges",
    "--expected-nodes": "expected_nodes",
    "--expected-edges": "expected_edges",
    "--expected-semantic-edges": "expected_semantic_edges",
    "--expected-harness-nodes": "expected_harness_nodes",
    "--expected-skills-sh-nodes": "expected_skills_sh_nodes",
    "--expected-skills-sh-catalog-entries": "expected_skills_sh_catalog_entries",
    "--expected-skills-sh-converted": "expected_skills_sh_converted",
    "--expected-skill-pages": "expected_skill_pages",
    "--expected-agent-pages": "expected_agent_pages",
    "--expected-mcp-pages": "expected_mcp_pages",
    "--expected-harness-pages": "expected_harness_pages",
    "--line-threshold": "line_threshold",
    "--max-stage-lines": "max_stage_lines",
}
|
|
|
|
def with_hf_repo_card_metadata(readme_text: str) -> str:
    """Prefix *readme_text* with the Hugging Face repo-card front matter.

    Front matter already present at the top of the README is removed by
    ``_strip_leading_yaml_frontmatter`` before ``HF_CARD_METADATA`` is added.
    """
    body = _strip_leading_yaml_frontmatter(readme_text)
    return f"{HF_CARD_METADATA}{body}"
|
|
|
|
| def _strip_leading_yaml_frontmatter(text: str) -> str: |
| if not text.startswith("---\n"): |
| return text |
| end = text.find("\n---\n", 4) |
| if end == -1: |
| return text |
| return text[end + len("\n---\n") :].lstrip("\n") |
|
|
|
|
def _git(repo: Path, *args: str) -> str:
    """Run ``git <args>`` with *repo* as working directory; return stripped stdout.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    command = ["git", *args]
    completed = subprocess.run(
        command,
        cwd=repo,
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout.strip()
|
|
|
|
def _git_bytes(repo: Path, *args: str) -> bytes:
    """Run ``git <args>`` with *repo* as working directory; return raw stdout bytes.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    command = ["git", *args]
    completed = subprocess.run(
        command,
        cwd=repo,
        stdout=subprocess.PIPE,
        check=True,
    )
    return completed.stdout
|
|
|
|
def _iter_tracked_files(repo: Path) -> list[Path]:
    """List the paths git tracks in *repo*, relative to the repo root.

    Entries come from ``git ls-files -z`` (NUL separated) and are decoded as
    UTF-8.

    Raises:
        ValueError: if an entry is absolute or contains a ``..`` component.
    """
    tracked: list[Path] = []
    entries = _git_bytes(repo, "ls-files", "-z").split(b"\0")
    # filter(None, ...) drops the empty chunk left by the trailing NUL.
    for entry in filter(None, entries):
        candidate = Path(entry.decode("utf-8"))
        is_safe = not candidate.is_absolute() and ".." not in candidate.parts
        if not is_safe:
            raise ValueError(f"unsafe git path: {candidate}")
        tracked.append(candidate)
    return tracked
|
|
|
|
def _assert_hydrated_artifacts(repo: Path) -> None:
    """Verify every required graph artifact is present and fully hydrated.

    For each entry of ``HYDRATED_ARTIFACT_MIN_BYTES`` the file must exist, must
    not be a Git LFS pointer, must be at least the configured size, and must
    pass ``_assert_matches_lfs_pointer``. Once all artifacts pass,
    ``_validate_graph_artifact_integrity`` runs once for the whole repo.

    Raises:
        FileNotFoundError: if a required artifact is missing.
        RuntimeError: if an artifact is an LFS pointer or is smaller than its
            minimum size; the helpers called here also raise ``RuntimeError``
            for their own failures.
    """
    for rel, min_bytes in HYDRATED_ARTIFACT_MIN_BYTES.items():
        artifact = repo / rel
        if not artifact.is_file():
            raise FileNotFoundError(
                f"{rel.as_posix()} is required before Hugging Face sync"
            )
        # Look for a pointer before checking the size: pointer files are far
        # smaller than any configured minimum, so a size-first check would
        # always hide this more specific diagnosis.
        with artifact.open("rb") as fh:
            prefix = fh.read(len(LFS_POINTER_PREFIX))
        if prefix == LFS_POINTER_PREFIX:
            raise RuntimeError(
                f"{rel.as_posix()} is a Git LFS pointer, not the hydrated artifact"
            )
        size = artifact.stat().st_size
        if size < min_bytes:
            raise RuntimeError(
                f"{rel.as_posix()} is {size:,} bytes; expected at least "
                f"{min_bytes:,}. Download or rebuild graph release artifacts "
                "before publishing."
            )
        _assert_matches_lfs_pointer(repo, rel, artifact)
    _validate_graph_artifact_integrity(repo)
|
|
|
|
def _parse_lfs_pointer(text: str) -> tuple[str, int] | None:
    """Extract ``(sha256 oid, size)`` from the text of a Git LFS pointer.

    Returns ``None`` when *text* does not start with the LFS version line, or
    when it yields no non-empty oid or no integer size. If a key appears more
    than once, the last occurrence decides.
    """
    if not text.startswith(LFS_POINTER_PREFIX.decode("ascii")):
        return None
    digest: str | None = None
    byte_count: int | None = None
    for entry in text.splitlines():
        if entry.startswith("oid sha256:"):
            digest = entry.partition(":")[2].strip()
            continue
        if not entry.startswith("size "):
            continue
        raw_size = entry.partition(" ")[2].strip()
        try:
            byte_count = int(raw_size)
        except ValueError:
            # An unparsable size invalidates any size seen earlier.
            byte_count = None
    if byte_count is None or not digest:
        return None
    return digest, byte_count
|
|
|
|
def _assert_matches_lfs_pointer(repo: Path, rel: Path, artifact: Path) -> None:
    """Check *artifact* against the LFS pointer committed at ``HEAD:<rel>``.

    The check is skipped silently when ``git show`` fails for that path or
    when the committed blob does not parse as an LFS pointer.

    Raises:
        RuntimeError: if the file's SHA-256 digest or byte size differs from
            the values recorded in the pointer.
    """
    try:
        committed = _git_bytes(repo, "show", f"HEAD:{rel.as_posix()}")
    except subprocess.CalledProcessError:
        return
    contract = _parse_lfs_pointer(committed.decode("utf-8", errors="replace"))
    if contract is None:
        return
    expected_oid, expected_size = contract
    actual_size = artifact.stat().st_size
    digest = hashlib.sha256()
    with artifact.open("rb") as stream:
        # Hash in 1 MiB chunks so the artifact is never held in memory whole.
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    actual_oid = digest.hexdigest()
    if actual_oid == expected_oid and actual_size == expected_size:
        return
    raise RuntimeError(
        f"{rel.as_posix()} does not match HEAD LFS pointer: "
        f"sha256:{actual_oid} size:{actual_size}; expected "
        f"sha256:{expected_oid} size:{expected_size}"
    )
|
|
|
|
def _validate_graph_artifact_integrity(repo: Path) -> None:
    """Run the repo's graph artifact validator against ``<repo>/graph``.

    Raises:
        RuntimeError: wrapping any exception raised while building the
            validator keyword arguments or while running the validator.
    """
    validator = _load_graph_artifact_validator(repo)
    try:
        options = _graph_validation_kwargs(repo)
        validator(repo / "graph", **options)
    except Exception as exc:
        message = (
            "graph artifact integrity validation failed before Hugging Face sync: "
            f"{exc}"
        )
        raise RuntimeError(message) from exc
|
|
|
|
def _load_graph_artifact_validator(repo: Path) -> Callable[..., object]:
    """Import and return ``validate_graph_artifacts`` from ``<repo>/src``.

    ``<repo>/src`` is inserted at the front of ``sys.path`` unless it is
    already listed there.
    """
    src_entry = str(repo / "src")
    if src_entry not in sys.path:
        sys.path.insert(0, src_entry)
    from validate_graph_artifacts import validate_graph_artifacts as validator

    return validator
|
|
|
|
def _graph_validation_kwargs(repo: Path) -> dict[str, object]:
    """Translate ``ci_preflight.GRAPH_VALIDATE_ARGS`` into validator kwargs.

    The first element of ``GRAPH_VALIDATE_ARGS`` is skipped and the rest is
    read as CLI-style tokens: ``--graph-dir graph`` is accepted and dropped,
    ``--deep`` becomes ``deep=True``, and each flag listed in
    ``GRAPH_VALIDATOR_INT_FLAGS`` consumes the following token as an ``int``.

    Raises:
        RuntimeError: on an unknown flag, a flag missing its value, or a
            ``--graph-dir`` value other than ``"graph"``.
        ValueError: if a value for an integer flag is not a valid integer.
    """
    scripts_entry = str(repo / "scripts")
    if scripts_entry not in sys.path:
        sys.path.insert(0, scripts_entry)
    from ci_preflight import GRAPH_VALIDATE_ARGS

    missing = object()  # sentinel: the token stream ended before a value
    remaining = iter(list(GRAPH_VALIDATE_ARGS[1:]))
    parsed: dict[str, object] = {}
    for flag in remaining:
        if flag == "--deep":
            parsed["deep"] = True
            continue
        if flag == "--graph-dir":
            graph_dir = next(remaining, missing)
            if graph_dir is missing:
                raise RuntimeError("GRAPH_VALIDATE_ARGS has --graph-dir without a value")
            if graph_dir != "graph":
                raise RuntimeError(
                    "Hugging Face graph validation expects GRAPH_VALIDATE_ARGS "
                    f"to use --graph-dir graph, got {graph_dir!r}"
                )
            continue
        keyword = GRAPH_VALIDATOR_INT_FLAGS.get(flag)
        if keyword is None:
            raise RuntimeError(
                f"Hugging Face sync does not understand graph validator flag {flag!r}"
            )
        raw_value = next(remaining, missing)
        if raw_value is missing:
            raise RuntimeError(f"GRAPH_VALIDATE_ARGS has {flag} without a value")
        parsed[keyword] = int(raw_value)
    return parsed
|
|
|
|
def _assert_repo_stats_current(repo: Path) -> None:
    """Run ``src/update_repo_stats.py --check`` with *repo* as working directory.

    Raises:
        FileNotFoundError: if the updater script is missing.
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    updater = repo.joinpath("src", "update_repo_stats.py")
    if updater.is_file():
        command = [sys.executable, str(updater), "--check"]
        subprocess.run(command, cwd=repo, check=True)
        return
    raise FileNotFoundError("src/update_repo_stats.py is required before Hugging Face sync")
|
|
|
|
def _export_tracked_tree(repo: Path, export_dir: Path) -> None:
    """Copy every git-tracked regular file of *repo* into *export_dir*.

    ``_assert_repo_stats_current`` and ``_assert_hydrated_artifacts`` run
    first; ``_copy_hydrated_artifacts`` runs after the tracked files are
    copied. Tracked entries that are not regular files on disk are skipped.

    Raises:
        ValueError: if a tracked path is a symlink, or if its source or
            target resolves outside the repo or export directory.
    """
    _assert_repo_stats_current(repo)
    _assert_hydrated_artifacts(repo)
    repo_root = repo.resolve()
    export_root = export_dir.resolve()
    for rel in _iter_tracked_files(repo):
        candidate = repo_root / rel
        source = candidate.resolve()
        if source != repo_root and not source.is_relative_to(repo_root):
            raise ValueError(f"unsafe source path: {rel}")
        # Test the unresolved path: resolve() has already followed any link,
        # so the resolved path would never report itself as a symlink.
        if candidate.is_symlink():
            raise ValueError(f"refusing to follow symlink during HF sync: {rel}")
        if not source.is_file():
            continue
        target = (export_root / rel).resolve()
        if target != export_root and not target.is_relative_to(export_root):
            raise ValueError(f"unsafe export path: {rel}")
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
    _copy_hydrated_artifacts(repo_root, export_root)
|
|
|
|
def _copy_hydrated_artifacts(repo_root: Path, export_root: Path) -> None:
    """Copy each required graph artifact into the export tree.

    Every key of ``HYDRATED_ARTIFACT_MIN_BYTES`` is copied, whether or not git
    tracks it.

    Raises:
        ValueError: if a source or target resolves outside its root.
    """
    for rel in HYDRATED_ARTIFACT_MIN_BYTES:
        source = (repo_root / rel).resolve()
        if not (source == repo_root or source.is_relative_to(repo_root)):
            raise ValueError(f"unsafe artifact source path: {rel}")
        target = (export_root / rel).resolve()
        if not (target == export_root or target.is_relative_to(export_root)):
            raise ValueError(f"unsafe artifact export path: {rel}")
        destination_dir = target.parent
        destination_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
|
|
|
|
def _patch_export_readme(export_dir: Path) -> None:
    """Rewrite ``README.md`` in *export_dir* with the HF card metadata on top.

    The file is read and written as UTF-8, and written with ``\\n`` newlines.
    """
    readme_path = export_dir / "README.md"
    original = readme_path.read_text(encoding="utf-8")
    patched = with_hf_repo_card_metadata(original)
    readme_path.write_text(patched, encoding="utf-8", newline="\n")
|
|
|
|
| def _hf_status_code(exc: BaseException) -> int | None: |
| response = getattr(exc, "response", None) |
| status = getattr(response, "status_code", None) |
| if isinstance(status, int): |
| return status |
| match = re.search(r"\b(4\d\d|5\d\d)\b", str(exc)) |
| if match: |
| return int(match.group(1)) |
| return None |
|
|
|
|
def _ensure_hf_repo_exists(*, api: Any, repo_id: str, repo_type: str) -> None:
    """Make sure the target Hugging Face repo exists, creating it on a 404.

    A lookup failure with any status other than 404 is re-raised. If creation
    fails with status 429, the repo is looked up once more and that lookup's
    outcome decides; any other creation failure is re-raised.
    """
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
    except Exception as lookup_error:
        if _hf_status_code(lookup_error) != 404:
            raise
    else:
        return

    try:
        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)
    except Exception as creation_error:
        if _hf_status_code(creation_error) != 429:
            raise
        # Creation was rejected with 429: succeed only if the repo can now be
        # looked up; an error from this lookup propagates to the caller.
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
|
|
|
|
| def _upload_export( |
| *, |
| api: Any, |
| export_dir: Path, |
| repo_id: str, |
| repo_type: str, |
| head: str, |
| ) -> str: |
| info = api.upload_folder( |
| repo_id=repo_id, |
| repo_type=repo_type, |
| folder_path=str(export_dir), |
| commit_message=f"Sync ctx {head[:7]}", |
| commit_description=f"GitHub commit: {head}", |
| delete_patterns="*", |
| ) |
| return str(getattr(info, "commit_url", info)) |
|
|
|
|
def _upload_readme_card(
    *,
    api: Any,
    repo: Path,
    repo_id: str,
    repo_type: str,
    head: str,
) -> str:
    """Upload only the rendered ``README.md`` card and return the commit URL.

    The repo's README is rendered with ``with_hf_repo_card_metadata``, written
    to a temporary directory, and sent with ``api.upload_file``. The temporary
    directory is removed afterwards, whether or not the upload succeeds.
    """
    source_readme = repo / "README.md"
    card_text = with_hf_repo_card_metadata(source_readme.read_text(encoding="utf-8"))
    scratch = Path(tempfile.mkdtemp(prefix="ctx-hf-card-"))
    try:
        card_path = scratch / "README.md"
        card_path.write_text(card_text, encoding="utf-8", newline="\n")
        short_head = head[:7]
        result = api.upload_file(
            repo_id=repo_id,
            repo_type=repo_type,
            path_or_fileobj=str(card_path),
            path_in_repo="README.md",
            commit_message=f"Sync ctx card {short_head}",
            commit_description=f"GitHub commit: {head}",
        )
        url = getattr(result, "commit_url", result)
        return str(url)
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
|
|
|
|
def sync_to_huggingface(
    *,
    repo: Path,
    repo_id: str,
    repo_type: str,
    token: str,
) -> str:
    """Export the checkout, upload it to Hugging Face, and return the commit URL.

    Tracked files and the required graph artifacts are copied into a temporary
    export directory, the exported README gets the HF card metadata, and the
    folder is uploaded after the target repo is confirmed or created. The
    HEAD commit hash is used for the commit message and description. The
    temporary workspace is removed on exit, including on failure.
    """
    from huggingface_hub import HfApi

    head = _git(repo, "rev-parse", "HEAD")
    scratch = Path(tempfile.mkdtemp(prefix="ctx-hf-upload-"))
    staging = scratch / "export"
    try:
        staging.mkdir()
        _export_tracked_tree(repo, staging)
        _patch_export_readme(staging)
        client = HfApi(token=token)
        _ensure_hf_repo_exists(api=client, repo_id=repo_id, repo_type=repo_type)
        commit_url = _upload_export(
            api=client,
            repo_id=repo_id,
            repo_type=repo_type,
            export_dir=staging,
            head=head,
        )
        return commit_url
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
|
|
|
|
def sync_card_to_huggingface(
    *,
    repo: Path,
    repo_id: str,
    repo_type: str,
    token: str,
) -> str:
    """Refresh only the repo-card ``README.md`` on Hugging Face.

    The target repo is confirmed or created first; the return value is the
    commit URL reported by ``_upload_readme_card``.
    """
    from huggingface_hub import HfApi

    head = _git(repo, "rev-parse", "HEAD")
    client = HfApi(token=token)
    _ensure_hf_repo_exists(api=client, repo_id=repo_id, repo_type=repo_type)
    commit_url = _upload_readme_card(
        api=client,
        repo=repo,
        repo_id=repo_id,
        repo_type=repo_type,
        head=head,
    )
    return commit_url
|
|
|
|
def main() -> None:
    """Command-line entry point: parse options, sync, and print the commit URL.

    ``--repo-id`` and ``--repo-type`` default to the ``HF_REPO_ID`` and
    ``HF_REPO_TYPE`` environment variables, then to the module defaults. The
    ``HF_TOKEN`` environment variable is mandatory; without it the process
    exits with an error message.
    """
    cli = argparse.ArgumentParser(
        description="Upload this git checkout to Hugging Face with repo-card metadata"
    )
    cli.add_argument("--repo", default=".", help="Git checkout path")
    cli.add_argument(
        "--repo-id",
        default=os.environ.get("HF_REPO_ID", DEFAULT_REPO_ID),
        help="Hugging Face repo ID",
    )
    cli.add_argument(
        "--repo-type",
        default=os.environ.get("HF_REPO_TYPE", DEFAULT_REPO_TYPE),
        help="Hugging Face repo type",
    )
    cli.add_argument(
        "--card-only",
        action="store_true",
        help="Only refresh README.md repo-card metadata without uploading artifacts",
    )
    options = cli.parse_args()

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise SystemExit("HF_TOKEN is required")

    if options.card_only:
        runner = sync_card_to_huggingface
    else:
        runner = sync_to_huggingface
    commit_url = runner(
        repo=Path(options.repo).resolve(),
        repo_id=options.repo_id,
        repo_type=options.repo_type,
        token=hf_token,
    )
    print(commit_url)
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|