Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import os | |
| from pathlib import Path | |
def _download_file(*, repo_id: str, filename: str, local_dir: str, token: str | None, subfolder: str | None = None) -> Path:
    """Download one file from a Hugging Face model repo into *local_dir*.

    Args:
        repo_id: Hub repository to download from (always treated as a model repo).
        filename: Name of the file inside the repo (relative to *subfolder*).
        local_dir: Directory handed to ``hf_hub_download`` as ``local_dir``.
        token: Hub access token, or ``None``.
        subfolder: Optional folder inside the repo that contains *filename*.

    Returns:
        The path reported by ``hf_hub_download``, wrapped in a ``Path``.
    """
    import inspect

    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import hf_hub_download

    extra_kwargs = {}
    # ``local_dir_use_symlinks`` is deprecated (ignored) in recent
    # huggingface_hub releases and may be absent from the signature entirely,
    # in which case passing it would raise TypeError. Only forward it when the
    # installed version still declares it, so older versions keep writing real
    # files instead of symlinks.
    if "local_dir_use_symlinks" in inspect.signature(hf_hub_download).parameters:
        extra_kwargs["local_dir_use_symlinks"] = False

    path = hf_hub_download(
        repo_id=repo_id,
        repo_type="model",
        filename=filename,
        subfolder=subfolder,
        token=token,
        local_dir=local_dir,
        **extra_kwargs,
    )
    return Path(path)
def resolve_tokenizer_cache_repo(*, output_repo: str, retina_cache_repo: str) -> str:
    """Pick the repo id used for the tokenizer cache.

    The first non-empty value wins, checked in this order: the environment
    variables listed below, then *output_repo*, then *retina_cache_repo*.
    """
    env_names = (
        "HYDRA_TOKENIZER_CACHE_REPO",
        "FEATHER_HF_OUTPUT_REPO",
        "HF_REPO_ID",
        "HYDRA_RETINA_CACHE_REPO",
        "FEATHER_HF_RETINA_CACHE_REPO",
    )
    for env_name in env_names:
        candidate = os.environ.get(env_name)
        if candidate:
            return candidate
    # No environment override: fall back to the explicit arguments.
    if output_repo:
        return output_repo
    return retina_cache_repo
def tokenizer_cache_prefix() -> str:
    """Return the repo subfolder that holds the tokenizer for the active vocab size.

    The vocab size is read from the ``HYDRA_VOCAB_SIZE`` environment variable
    and defaults to 65536 when the variable is unset, empty, or whitespace
    only.

    Raises:
        ValueError: If ``HYDRA_VOCAB_SIZE`` is set to a non-integer value.
    """
    # An empty value is treated as "unset" rather than being fed to int(),
    # which would raise on the empty string.
    raw = os.environ.get("HYDRA_VOCAB_SIZE", "").strip()
    vocab_size = int(raw) if raw else 65536
    return f"tokenizer/vocab{vocab_size}"
def hydrate_benchmark_assets(*, cache_dir: Path, output_repo: str, tokenizer_repo: str, token: str | None) -> dict[str, str]:
    """Make sure the checkpoint and tokenizer files are present under *cache_dir*.

    Files that already exist locally are reused; missing ones are downloaded
    from the Hub.

    Args:
        cache_dir: Local directory for the checkpoint; tokenizer files go in
            its ``tokenizer`` subdirectory. Created if missing.
        output_repo: Repo that holds ``best_bpb.pt``.
        tokenizer_repo: Fallback repo for tokenizer files when no environment
            override is set (see ``resolve_tokenizer_cache_repo``).
        token: Hub access token, or ``None``.

    Returns:
        A dict with ``"checkpoint_path"`` and ``"tokenizer_dir"`` as strings.
    """
    import shutil

    cache_dir.mkdir(parents=True, exist_ok=True)
    tok_dir = cache_dir / "tokenizer"
    tok_dir.mkdir(parents=True, exist_ok=True)
    tok_repo = resolve_tokenizer_cache_repo(output_repo=tokenizer_repo, retina_cache_repo=tokenizer_repo)
    tok_prefix = tokenizer_cache_prefix()

    ckpt_path = cache_dir / "best_bpb.pt"
    if not ckpt_path.exists():
        ckpt_path = _download_file(repo_id=output_repo, filename="best_bpb.pt", local_dir=str(cache_dir), token=token)

    for asset_name in ("tokenizer.pkl", "token_bytes.pt"):
        target = tok_dir / asset_name
        if target.exists():
            continue
        fetched = _download_file(repo_id=tok_repo, filename=asset_name, local_dir=str(tok_dir), token=token, subfolder=tok_prefix)
        # hf_hub_download keeps the repo-relative path (including the
        # subfolder) below local_dir, so the file does not land directly in
        # tok_dir. Copy it to the flat location that the existence check above
        # and the returned "tokenizer_dir" both rely on.
        if fetched.resolve() != target.resolve():
            shutil.copy2(fetched, target)

    return {
        "checkpoint_path": str(ckpt_path),
        "tokenizer_dir": str(tok_dir),
    }