Spaces:
Runtime error
Runtime error
File size: 2,295 Bytes
951f760 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | #!/usr/bin/env python3
from __future__ import annotations
import os
from pathlib import Path
def _download_file(*, repo_id: str, filename: str, local_dir: str, token: str | None, subfolder: str | None = None) -> Path:
    """Download a single file from a Hugging Face model repo into ``local_dir``.

    Args:
        repo_id: Hub repository to download from (always treated as a
            ``"model"`` repo).
        filename: Name of the file inside the repo (relative to ``subfolder``
            when one is given).
        local_dir: Directory handed to ``hf_hub_download`` as ``local_dir``.
        token: Hub access token, or ``None``.
        subfolder: Optional folder inside the repo that contains ``filename``.

    Returns:
        The path reported by ``hf_hub_download``, wrapped in a ``Path``.
    """
    import inspect

    # Function-scope import: huggingface_hub is only needed once a download
    # is actually requested.
    from huggingface_hub import hf_hub_download

    download_kwargs = {
        "repo_id": repo_id,
        "repo_type": "model",
        "filename": filename,
        "subfolder": subfolder,
        "token": token,
        "local_dir": local_dir,
    }

    # ``local_dir_use_symlinks`` is deprecated in huggingface_hub (ignored
    # since 0.23) and may be absent from newer signatures, where passing it
    # would raise TypeError. Forward it only when the installed version still
    # declares it, so older releases keep writing real files instead of
    # symlinks. NOTE(review): confirm against the pinned huggingface_hub.
    try:
        accepted = inspect.signature(hf_hub_download).parameters
    except (TypeError, ValueError):
        accepted = {}
    if "local_dir_use_symlinks" in accepted:
        download_kwargs["local_dir_use_symlinks"] = False

    return Path(hf_hub_download(**download_kwargs))
def resolve_tokenizer_cache_repo(*, output_repo: str, retina_cache_repo: str) -> str:
    """Pick the repo id that holds the cached tokenizer.

    Environment variables are consulted first, in priority order; the first
    one with a non-empty value wins. When none is set, ``output_repo`` is
    used if truthy, otherwise ``retina_cache_repo`` is returned as-is.
    """
    env_priority = (
        "HYDRA_TOKENIZER_CACHE_REPO",
        "FEATHER_HF_OUTPUT_REPO",
        "HF_REPO_ID",
        "HYDRA_RETINA_CACHE_REPO",
        "FEATHER_HF_RETINA_CACHE_REPO",
    )
    for env_name in env_priority:
        candidate = os.environ.get(env_name)
        if candidate:
            return candidate
    # No environment override: fall back to the explicit arguments.
    if output_repo:
        return output_repo
    return retina_cache_repo
def tokenizer_cache_prefix() -> str:
    """Return the repo subfolder for the tokenizer of the configured vocab size.

    The vocab size is read from the ``HYDRA_VOCAB_SIZE`` environment variable
    and defaults to 65536. An empty or whitespace-only value is treated as
    unset, so a blank export does not crash the lookup.

    Raises:
        ValueError: If ``HYDRA_VOCAB_SIZE`` is set to a non-integer value.
    """
    # ``or`` (rather than a ``get`` default) so that "" also falls back.
    raw_size = os.environ.get("HYDRA_VOCAB_SIZE", "").strip() or "65536"
    vocab_size = int(raw_size)
    return f"tokenizer/vocab{vocab_size}"
def hydrate_benchmark_assets(*, cache_dir: Path, output_repo: str, tokenizer_repo: str, token: str | None) -> dict[str, str]:
    """Ensure the benchmark checkpoint and tokenizer files exist under ``cache_dir``.

    Files already present locally are reused; missing ones are downloaded.

    Args:
        cache_dir: Local directory for the assets (created if missing). The
            tokenizer files live in its ``tokenizer`` subdirectory.
        output_repo: Repo id that ``best_bpb.pt`` is downloaded from.
        tokenizer_repo: Fallback repo id for the tokenizer files; environment
            overrides are applied by ``resolve_tokenizer_cache_repo``.
        token: Hub access token, or ``None``.

    Returns:
        A dict with ``"checkpoint_path"`` (path of ``best_bpb.pt``) and
        ``"tokenizer_dir"`` (directory holding ``tokenizer.pkl`` and
        ``token_bytes.pt``).
    """
    import shutil

    cache_dir.mkdir(parents=True, exist_ok=True)
    tok_dir = cache_dir / "tokenizer"
    tok_dir.mkdir(parents=True, exist_ok=True)

    tok_repo = resolve_tokenizer_cache_repo(output_repo=tokenizer_repo, retina_cache_repo=tokenizer_repo)
    tok_prefix = tokenizer_cache_prefix()

    ckpt_path = cache_dir / "best_bpb.pt"
    if not ckpt_path.exists():
        ckpt_path = _download_file(repo_id=output_repo, filename="best_bpb.pt", local_dir=str(cache_dir), token=token)

    for asset_name in ("tokenizer.pkl", "token_bytes.pt"):
        flat_path = tok_dir / asset_name
        if flat_path.exists():
            continue
        downloaded = _download_file(repo_id=tok_repo, filename=asset_name, local_dir=str(tok_dir), token=token, subfolder=tok_prefix)
        # hf_hub_download mirrors the repo layout under local_dir, so a file
        # requested with a subfolder lands in tok_dir/<subfolder>/<name>, not
        # directly in tok_dir. Copy it to the flat location that the cache
        # check above and the returned "tokenizer_dir" both refer to. The
        # nested copy is left in place for the library's own bookkeeping.
        if not flat_path.exists():
            shutil.copy2(downloaded, flat_path)

    return {
        "checkpoint_path": str(ckpt_path),
        "tokenizer_dir": str(tok_dir),
    }
|