| |
| """ |
| Upload dataset_builder code repository to Hugging Face Hub. |
| |
| Usage: |
| # Upload to your personal account: |
| python upload_code_to_hf.py --hf_user YOUR_USERNAME |
| |
| # Upload to an organization: |
| python upload_code_to_hf.py --hf_user YOUR_ORG |
| |
| # Custom repository name: |
| python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_name my-dataset-builder |
| |
| # Upload to a model repository (default): |
| python upload_code_to_hf.py --hf_user YOUR_USERNAME |
| |
| # Upload to a space repository: |
| python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_type space |
| """ |
|
|
import argparse
import logging
import os
import time
from fnmatch import fnmatchcase
from pathlib import Path

from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import HfHubHTTPError
|
|
# Script-wide logging: INFO and above, each line stamped with time and level.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
|
|
| |
# Names and shell-style glob patterns that must never be uploaded. Each entry
# is matched against every individual path component (directories and the
# file name alike), so "node_modules" prunes a whole tree and "*.egg-info"
# prunes any directory with that suffix.
EXCLUDE_PATTERNS = {
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".git",
    ".gitignore",
    ".DS_Store",
    "*.log",
    "*.swp",
    "*.swo",
    "*~",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    "dist",
    "build",
    ".venv",
    "venv",
    "env",
    ".env",
    "node_modules",
    ".idea",
    ".vscode",
    ".cursor",
}


# File names that are uploaded even though they would otherwise be filtered
# out (e.g. ".gitignore" is a dotfile and is also listed in EXCLUDE_PATTERNS).
# This override applies to the file name only; it never rescues a file that
# lives inside an excluded or hidden directory.
ALWAYS_INCLUDE = {
    ".gitignore",
    "README.md",
    "requirements.txt",
    "setup.py",
    "pyproject.toml",
}


def _matches_exclude_pattern(name: str) -> bool:
    """Return True if a single path component matches any EXCLUDE_PATTERNS entry."""
    # fnmatchcase keeps matching case-sensitive on every platform; entries
    # without wildcards behave like plain equality.
    return any(fnmatchcase(name, pattern) for pattern in EXCLUDE_PATTERNS)


def should_exclude(file_path: Path, root: Path) -> bool:
    """Check if a file should be excluded from upload.

    Args:
        file_path: Path of the candidate file; must be located under ``root``.
        root: Directory the upload is rooted at.

    Returns:
        True when the file must be skipped, False when it should be uploaded.

    Raises:
        ValueError: If ``file_path`` is not relative to ``root`` (raised by
            ``Path.relative_to``).
    """
    rel_path = file_path.relative_to(root)

    # Directories are checked first so that nothing inside a hidden or
    # excluded directory is uploaded -- not even a README.md or setup.py.
    for directory in rel_path.parts[:-1]:
        if directory.startswith(".") or _matches_exclude_pattern(directory):
            return True

    file_name = rel_path.name
    if file_name in ALWAYS_INCLUDE:
        return False

    # Remaining dotfiles are skipped, as is anything matching a pattern
    # (this covers the *.pyc / *.pyo / *.pyd / *.log suffixes as well).
    return file_name.startswith(".") or _matches_exclude_pattern(file_name)
|
|
|
|
def get_files_to_upload(root: Path) -> list[Path]:
    """Collect the files under ``root`` that pass the exclusion filter.

    Walks ``root`` recursively, keeps regular files for which
    ``should_exclude`` returns False, and returns them in sorted order.
    """
    selected = [
        entry
        for entry in root.rglob("*")
        if entry.is_file() and not should_exclude(entry, root)
    ]
    selected.sort()
    return selected
|
|
|
|
def _http_status(error: Exception) -> "int | None":
    """Return the HTTP status code carried by ``error``, or None if there is none."""
    status = getattr(error, "status_code", None)
    if status is not None:
        return status
    # Compare with None explicitly: a requests.Response is falsy for every
    # 4xx/5xx status, so a truthiness test would hide the 429 we look for.
    response = getattr(error, "response", None)
    if response is not None:
        return getattr(response, "status_code", None)
    return None


def _upload_with_retry(
    api: HfApi,
    file_path: Path,
    path_in_repo: str,
    repo_id: str,
    repo_type: str,
    max_attempts: int = 3,
) -> None:
    """Upload one file, retrying rate limits and generic errors; raise once attempts run out."""
    for attempt in range(1, max_attempts + 1):
        try:
            # Passing the path lets the client read the file itself instead
            # of this script holding the whole content in memory.
            api.upload_file(
                path_or_fileobj=str(file_path),
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            return
        except HfHubHTTPError as e:
            # Only rate limiting is retried for HTTP errors; on the last
            # attempt the error is re-raised so the caller records a failure
            # instead of the file silently dropping out of the counts.
            if _http_status(e) != 429 or attempt == max_attempts:
                raise
            wait_time = min(5.0 * (2 ** (attempt - 1)), 60.0)
            logger.warning(
                f" Rate limited (429). Waiting {wait_time:.1f}s "
                f"(attempt {attempt}/{max_attempts})..."
            )
            time.sleep(wait_time)
        except Exception as e:
            if attempt == max_attempts:
                raise
            wait_time = 2.0 * attempt
            logger.warning(
                f" Error: {e}. Waiting {wait_time:.1f}s "
                f"(attempt {attempt}/{max_attempts})..."
            )
            time.sleep(wait_time)


def upload_code_repo(
    api: HfApi,
    repo_id: str,
    code_dir: Path,
    repo_type: str = "model",
    delay_between_files: float = 1.0,
) -> int:
    """Upload code repository to Hugging Face Hub.

    Creates the repository if needed (public, ``exist_ok=True``), then uploads
    every file returned by ``get_files_to_upload`` one at a time. A failure on
    one file is logged and counted; the remaining files are still attempted.

    Args:
        api: Hub client used for repository creation and uploads.
        repo_id: Target repository in ``owner/name`` form.
        code_dir: Local directory whose files are uploaded.
        repo_type: Repository type passed through to the Hub client.
        delay_between_files: Seconds to sleep between consecutive uploads.

    Returns:
        The number of files that could not be uploaded (0 when everything
        succeeded or there was nothing to upload).
    """
    logger.info(f"Uploading code from {code_dir} to {repo_id} (type: {repo_type})")

    # Use the client that was passed in so that its token/endpoint settings
    # apply to repository creation as well as to the uploads.
    api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True, private=False)
    logger.info(f"Repository {repo_id} created/verified.")

    files = get_files_to_upload(code_dir)
    total = len(files)
    logger.info(f"Found {total} files to upload.")

    if not files:
        logger.warning("No files to upload!")
        return 0

    successful = 0
    failed = 0

    for idx, file_path in enumerate(files, 1):
        # Repository paths always use forward slashes.
        path_in_repo = file_path.relative_to(code_dir).as_posix()

        try:
            # stat() is inside the try so a file that disappears mid-run is
            # recorded as one failure rather than aborting the whole upload.
            size_kb = file_path.stat().st_size / 1024
            logger.info(f"[{idx}/{total}] Uploading {path_in_repo} ({size_kb:.1f} KB)...")
            _upload_with_retry(api, file_path, path_in_repo, repo_id, repo_type)
        except Exception as e:
            failed += 1
            logger.error(f"[{idx}/{total}] ✗ Failed to upload {path_in_repo}: {e}")
        else:
            successful += 1
            logger.info(f"[{idx}/{total}] ✓ {path_in_repo} uploaded.")

        # Pause between files (not after the last one) to stay under rate limits.
        if idx < total:
            time.sleep(delay_between_files)

    logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
    return failed
|
|
|
|
def main() -> int:
    """Parse command-line arguments and run the upload.

    Returns:
        0 on success, 1 when the code directory is invalid, the Hugging Face
        login cannot be verified, or the upload fails. Invalid command-line
        values terminate through ``argparse`` (exit status 2).
    """
    parser = argparse.ArgumentParser(
        description="Upload dataset_builder code repository to Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--hf_user", type=str, required=True,
        help="Hugging Face username or organization name",
    )
    parser.add_argument(
        "--repo_name", type=str, default="dataset-builder",
        help="Repository name (default: dataset-builder)",
    )
    parser.add_argument(
        "--repo_type", type=str, default="model", choices=["model", "space"],
        help="Repository type (default: model)",
    )
    parser.add_argument(
        "--code_dir", type=str, default=None,
        help="Code directory to upload (default: directory containing this script)",
    )
    parser.add_argument(
        "--delay", type=float, default=1.0,
        help="Delay in seconds between file uploads (default: 1.0)",
    )
    args = parser.parse_args()

    # time.sleep() rejects negative values, so fail fast with a usage error
    # instead of crashing after the first file has been uploaded.
    if args.delay < 0:
        parser.error("--delay must be a non-negative number of seconds")

    # Without --code_dir the directory holding this script is uploaded.
    if args.code_dir:
        code_dir = Path(args.code_dir).resolve()
    else:
        code_dir = Path(__file__).parent.resolve()

    # is_dir() also rejects a path that exists but is a regular file.
    if not code_dir.is_dir():
        logger.error(f"Code directory not found or not a directory: {code_dir}")
        return 1

    api = HfApi()
    try:
        user_info = api.whoami()
        logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")
    except Exception as e:
        logger.error(
            "Not logged in to Hugging Face. Please run:\n"
            " huggingface-cli login\n"
            "or set the HF_TOKEN environment variable.\n"
            f"Underlying error: {e}"
        )
        return 1

    repo_id = f"{args.hf_user}/{args.repo_name}"

    logger.info("=" * 60)
    logger.info("Upload Plan:")
    logger.info(f" Code directory: {code_dir}")
    logger.info(f" Repository: {repo_id} (type: {args.repo_type})")
    logger.info("=" * 60)

    try:
        outcome = upload_code_repo(
            api=api,
            repo_id=repo_id,
            code_dir=code_dir,
            repo_type=args.repo_type,
            delay_between_files=args.delay,
        )
    except Exception as e:
        logger.error(f"✗ Code repository upload failed: {e}")
        return 1

    # A truthy result is read as a count of files that failed to upload;
    # a falsy result (None or 0) is treated as a fully successful run.
    if outcome:
        logger.error(f"✗ Code repository upload finished with {outcome} failed file(s).")
        return 1

    logger.info("✓ Code repository upload completed!")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s result to the interpreter as the process exit status;
    # a None result exits with status 0.
    raise SystemExit(main())
|
|
|
|