File size: 10,001 Bytes
b751bb5
 
 
 
 
 
 
 
 
 
 
 
53d5d9f
b751bb5
53d5d9f
37d98fb
 
b751bb5
 
53d5d9f
 
b751bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37d98fb
 
 
 
 
53d5d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37d98fb
 
 
 
 
b751bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53d5d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b751bb5
37d98fb
 
b751bb5
 
 
 
 
 
 
 
 
 
37d98fb
b751bb5
53d5d9f
 
 
 
 
 
 
 
b751bb5
 
 
 
 
 
 
37d98fb
 
b751bb5
53d5d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b751bb5
53d5d9f
 
 
 
 
 
b751bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53d5d9f
 
 
 
 
 
37d98fb
 
 
 
53d5d9f
37d98fb
53d5d9f
 
 
 
 
 
 
 
37d98fb
b751bb5
37d98fb
 
 
b751bb5
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env python3
"""
Upload trained artifacts to Hugging Face Hub.

This repo uses local-path inference. The upload is intended so you can later
download these directories into the same folder layout and run inference.
"""

from __future__ import annotations

import argparse
import os
import shutil
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path

LARGE_FILE_UPLOAD_THRESHOLD_BYTES = 100 * 1024 * 1024


def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.")
    parser.add_argument(
        "--repo-id",
        required=True,
        help="HF repo id, e.g. 'yourname/admesh-intent-iab-v1'.",
    )
    parser.add_argument(
        "--token",
        default=os.environ.get("HF_TOKEN"),
        help="HF token. If omitted, uses env HF_TOKEN.",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create the repo as private.",
    )
    parser.add_argument(
        "--include-multitask",
        action="store_true",
        help="Upload multitask intent model output directory.",
    )
    parser.add_argument(
        "--include-iab",
        action="store_true",
        help="Upload IAB classifier model output directory.",
    )
    parser.add_argument(
        "--include-calibration",
        action="store_true",
        help="Upload artifacts/calibration directory.",
    )
    parser.add_argument(
        "--include-hf-readme",
        action="store_true",
        help="Upload a Hugging Face model card file as README.md in the Hub repo root.",
    )
    parser.add_argument(
        "--include-serving-code",
        action="store_true",
        help="Upload core runtime Python/code files required for Hub trust_remote_code inference.",
    )
    parser.add_argument(
        "--include-root-checkpoint",
        action="store_true",
        help="Upload root-level compatibility checkpoint/tokenizer files used by transformers.pipeline loader.",
    )
    parser.add_argument(
        "--include-all",
        action="store_true",
        help=(
            "Upload everything needed for end-to-end Hub usage: multitask + iab + calibration + "
            "HF README + serving code + root checkpoint/tokenizer files."
        ),
    )
    parser.add_argument(
        "--hf-readme-path",
        default="HF_MODEL_CARD.md",
        help="Local path to the HF model card markdown to upload as README.md (relative to repo root).",
    )
    parser.add_argument(
        "--multitask-dir",
        default="multitask_intent_model_output",
        help="Path to multitask intent output directory (relative to this script's base).",
    )
    parser.add_argument(
        "--iab-dir",
        default="iab_classifier_model_output",
        help="Path to IAB classifier model output directory (relative to this script's base).",
    )
    parser.add_argument(
        "--calibration-dir",
        default="artifacts/calibration",
        help="Path to calibration artifacts directory (relative to this script's base).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be uploaded without actually uploading.",
    )
    return parser.parse_args()


def _iter_local_files(path: Path) -> list[Path]:
    if path.is_file():
        return [path]
    return sorted(p for p in path.rglob("*") if p.is_file())


def _remote_file_paths(path_in_repo: str, local_path: Path) -> list[str]:
    if local_path.is_file():
        return [path_in_repo]
    return [
        f"{path_in_repo}/{file_path.relative_to(local_path).as_posix()}"
        for file_path in _iter_local_files(local_path)
    ]


def _requires_large_upload(local_path: Path) -> bool:
    """Return True when any file at or under *local_path* is at least
    LARGE_FILE_UPLOAD_THRESHOLD_BYTES, i.e. needs the large-folder upload path.
    """
    for file_path in _iter_local_files(local_path):
        if file_path.stat().st_size >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES:
            return True
    return False


def _upload_via_large_folder(api, repo_id: str, repo_path: str, local_path: Path) -> None:
    """Stage *local_path* under *repo_path* inside a temp directory and push it
    with the Hub's resumable large-folder API.

    The staging step preserves the repo-relative layout so the whole temp root
    can be uploaded in one call.
    """
    with tempfile.TemporaryDirectory(prefix="hf_large_upload_") as tmp_dir:
        staging_root = Path(tmp_dir)
        destination = staging_root / repo_path
        destination.parent.mkdir(parents=True, exist_ok=True)
        if local_path.is_file():
            shutil.copy2(local_path, destination)
        else:
            ignored = shutil.ignore_patterns(".cache", "__pycache__")
            shutil.copytree(local_path, destination, ignore=ignored)
        # Drop any resumable-upload bookkeeping carried over from previous
        # local attempts so the staged tree starts clean.
        shutil.rmtree(destination / ".cache", ignore_errors=True)
        api.upload_large_folder(
            repo_id=repo_id,
            repo_type="model",
            folder_path=str(staging_root),
            print_report=False,
        )


def _verify_remote_upload(api, repo_id: str, repo_path: str, local_path: Path) -> None:
    """Poll the Hub until every file expected under *repo_path* is listed.

    Performs up to four listings with a linearly growing backoff between
    attempts; raises RuntimeError naming (up to 20 of) the still-missing
    remote paths if the final attempt comes up short.
    """
    expected = set(_remote_file_paths(repo_path, local_path))
    max_attempts = 4
    for attempt in range(max_attempts):
        remote = set(api.list_repo_files(repo_id=repo_id, repo_type="model"))
        still_missing = sorted(expected - remote)
        if not still_missing:
            return
        if attempt + 1 == max_attempts:
            raise RuntimeError(
                "Upload completed but the following remote files are still missing: "
                + ", ".join(still_missing[:20])
            )
        time.sleep(2 * (attempt + 1))


def main() -> int:
    """Entry point: resolve local artifact paths and upload each selected item
    to the Hugging Face Hub repo given by ``--repo-id``.

    Returns:
        Process exit code: 0 on success, 2 for usage errors (missing token,
        nothing selected, missing huggingface_hub dependency).
    """
    # Wall-clock and monotonic timestamps for the summary printed at the end.
    started_at = time.perf_counter()
    started_wall = datetime.now(timezone.utc).isoformat()
    args = _parse_args()
    if not args.token:
        print("Missing HF token. Provide --token or set env HF_TOKEN.", file=sys.stderr)
        return 2

    # All relative CLI paths are resolved against the parent of this script's
    # directory (the repo root, assuming the script lives one level down).
    repo_root = Path(__file__).resolve().parent.parent

    multitask_dir = (repo_root / args.multitask_dir).resolve()
    iab_dir = (repo_root / args.iab_dir).resolve()
    calibration_dir = (repo_root / args.calibration_dir).resolve()
    hf_readme_path = (repo_root / args.hf_readme_path).resolve()

    # --include-all is sugar for switching on every individual include flag.
    if args.include_all:
        args.include_multitask = True
        args.include_iab = True
        args.include_calibration = True
        args.include_hf_readme = True
        args.include_serving_code = True
        args.include_root_checkpoint = True

    # Each entry is (path inside the Hub repo, local path to upload from).
    to_upload: list[tuple[str, Path]] = []
    if args.include_multitask:
        to_upload.append(("multitask_intent_model_output", multitask_dir))
    if args.include_iab:
        to_upload.append(("iab_classifier_model_output", iab_dir))
    if args.include_calibration:
        to_upload.append(("artifacts/calibration", calibration_dir))
    if args.include_hf_readme:
        to_upload.append(("README.md", hf_readme_path))

    if args.include_serving_code:
        # Files needed by trust_remote_code execution path.
        for rel in [
            "pipeline.py",
            "config.py",
            "config.json",
            "combined_inference.py",
            "model_runtime.py",
            "multitask_runtime.py",
            "multitask_model.py",
            "schemas.py",
            "inference_intent_type.py",
            "inference_subtype.py",
            "inference_decision_phase.py",
            "inference_iab_classifier.py",
            "iab_classifier.py",
            "iab_taxonomy.py",
        ]:
            to_upload.append((rel, (repo_root / rel).resolve()))

    if args.include_root_checkpoint:
        # Root-level checkpoint/tokenizer files expected by the
        # transformers.pipeline loader.
        for rel in [
            "model.safetensors",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.txt",
        ]:
            to_upload.append((rel, (repo_root / rel).resolve()))

    if not to_upload:
        print(
            "Nothing to upload. Pass include flags (e.g. --include-all), or one/more of: "
            "--include-multitask --include-iab --include-calibration --include-hf-readme "
            "--include-serving-code --include-root-checkpoint.",
            file=sys.stderr,
        )
        return 2

    # Import lazily so `--dry-run` works without extra deps.
    try:
        from huggingface_hub import HfApi
    except ModuleNotFoundError:
        print("Missing dependency: huggingface_hub. Install with: pip install huggingface_hub", file=sys.stderr)
        return 2

    api = HfApi(token=args.token)
    # exist_ok=True makes repeated runs against the same repo idempotent.
    api.create_repo(repo_id=args.repo_id, repo_type="model", private=args.private, exist_ok=True)

    for repo_path, local_dir in to_upload:
        # Missing local paths are skipped (with a warning), not fatal, so a
        # partial checkout can still upload what it has.
        if not local_dir.exists():
            print(f"[SKIP] {repo_path}: local path does not exist: {local_dir}", file=sys.stderr)
            continue
        if args.dry_run:
            print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}")
            continue
        step_start = time.perf_counter()
        # Items containing any file >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES go
        # through the resumable large-folder path; everything else uses the
        # standard single-file/folder upload API.
        mode = "large-folder" if _requires_large_upload(local_dir) else "standard"
        print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path} ({mode})")
        if mode == "large-folder":
            _upload_via_large_folder(api, args.repo_id, repo_path, local_dir)
        elif local_dir.is_file():
            api.upload_file(
                repo_id=args.repo_id,
                repo_type="model",
                path_or_fileobj=str(local_dir),
                path_in_repo=repo_path,
            )
        else:
            api.upload_folder(
                repo_id=args.repo_id,
                repo_type="model",
                folder_path=str(local_dir),
                path_in_repo=repo_path,
            )
        # Confirm the Hub actually lists every expected file before moving on.
        _verify_remote_upload(api, args.repo_id, repo_path, local_dir)
        print(f"[DONE ] {repo_path} took {(time.perf_counter() - step_start):.2f}s")

    ended_wall = datetime.now(timezone.utc).isoformat()
    elapsed_s = time.perf_counter() - started_at
    print(f"Upload complete.\nstart: {started_wall}\nend:   {ended_wall}\ntotal: {elapsed_s:.2f}s")
    return 0


# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())