#!/usr/bin/env python3
"""
Upload dataset_builder code repository to Hugging Face Hub.

Usage:
    # Upload to your personal account:
    python upload_code_to_hf.py --hf_user YOUR_USERNAME

    # Upload to an organization:
    python upload_code_to_hf.py --hf_user YOUR_ORG

    # Custom repository name:
    python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_name my-dataset-builder

    # Upload to a model repository (default):
    python upload_code_to_hf.py --hf_user YOUR_USERNAME

    # Upload to a space repository:
    python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_type space
"""

import argparse
import fnmatch
import logging
import os
import sys
import time
from pathlib import Path
from typing import Optional

from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import HfHubHTTPError

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Files/directories to exclude from upload. Entries are shell-style patterns
# matched against every individual path component (see should_exclude).
EXCLUDE_PATTERNS = {
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".git",
    ".gitignore",
    ".DS_Store",
    "*.log",
    "*.swp",
    "*.swo",
    "*~",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    "dist",
    "build",
    ".venv",
    "venv",
    "env",
    ".env",
    "node_modules",
    ".idea",
    ".vscode",
    ".cursor",
}

# File names to always include (even if they match exclude patterns), as long
# as they do not live inside an excluded directory.
ALWAYS_INCLUDE = {
    ".gitignore",
    "README.md",
    "requirements.txt",
    "setup.py",
    "pyproject.toml",
}

# Total number of upload attempts per file before it is counted as failed.
MAX_UPLOAD_ATTEMPTS = 3


def _matches_exclude_pattern(name: str) -> bool:
    """Return True if a single path component matches any EXCLUDE_PATTERNS entry."""
    return any(fnmatch.fnmatchcase(name, pattern) for pattern in EXCLUDE_PATTERNS)


def should_exclude(file_path: Path, root: Path) -> bool:
    """Check if a file should be excluded from upload.

    Args:
        file_path: Path of the candidate file; must be located under ``root``.
        root: Directory the upload is rooted at.

    Returns:
        True if the file must be skipped, False if it should be uploaded.

    Raises:
        ValueError: If ``file_path`` is not relative to ``root``.
    """
    rel_path = file_path.relative_to(root)

    # Directories are checked first so that an ALWAYS_INCLUDE file name cannot
    # pull content out of an excluded tree (e.g. node_modules/pkg/README.md).
    for directory in rel_path.parent.parts:
        if directory.startswith(".") or _matches_exclude_pattern(directory):
            return True

    file_name = rel_path.name
    if file_name in ALWAYS_INCLUDE:
        return False

    # Hidden files and anything matching a pattern (globs included) are skipped.
    return file_name.startswith(".") or _matches_exclude_pattern(file_name)


def get_files_to_upload(root: Path) -> list[Path]:
    """Get all files under ``root`` to upload, sorted, minus excluded ones."""
    return sorted(
        path
        for path in root.rglob("*")
        if path.is_file() and not should_exclude(path, root)
    )


def _http_status(error: Exception) -> Optional[int]:
    """Extract the HTTP status code carried by an HTTP error, if any."""
    status = getattr(error, "status_code", None)
    if status is not None:
        return status
    response = getattr(error, "response", None)
    # Compare against None explicitly: a requests.Response is falsy for
    # 4xx/5xx statuses, which are exactly the ones of interest here.
    if response is not None:
        return getattr(response, "status_code", None)
    return None


def _upload_with_retry(
    api: HfApi,
    content: bytes,
    path_in_repo: str,
    repo_id: str,
    repo_type: str,
) -> None:
    """Upload one file, retrying on rate limiting and on non-HTTP errors.

    Raises the last exception once MAX_UPLOAD_ATTEMPTS is exhausted, and raises
    immediately for HTTP errors other than 429.
    """
    for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1):
        try:
            api.upload_file(
                path_or_fileobj=content,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            return
        except HfHubHTTPError as e:
            # Only rate limiting is retried; on the final attempt the error is
            # re-raised so the caller can count the file as failed.
            if _http_status(e) != 429 or attempt == MAX_UPLOAD_ATTEMPTS:
                raise
            wait_time = min(5.0 * (2 ** (attempt - 1)), 60.0)
            logger.warning(
                f" Rate limited (429). Waiting {wait_time:.1f}s "
                f"(attempt {attempt}/{MAX_UPLOAD_ATTEMPTS})..."
            )
        except Exception as e:
            if attempt == MAX_UPLOAD_ATTEMPTS:
                raise
            wait_time = 2.0 * attempt
            logger.warning(
                f" Error: {e}. Waiting {wait_time:.1f}s "
                f"(attempt {attempt}/{MAX_UPLOAD_ATTEMPTS})..."
            )
        time.sleep(wait_time)


def upload_code_repo(
    api: HfApi,
    repo_id: str,
    code_dir: Path,
    repo_type: str = "model",
    delay_between_files: float = 1.0,
) -> tuple[int, int]:
    """Upload code repository to Hugging Face Hub.

    Args:
        api: Client used for the per-file uploads.
        repo_id: Target repository in ``owner/name`` form.
        code_dir: Local directory whose files are uploaded.
        repo_type: Hub repository type passed through to the Hub calls.
        delay_between_files: Seconds to sleep between consecutive files.

    Returns:
        A ``(successful, failed)`` pair of file counts.

    Raises:
        Exception: Whatever ``create_repo`` raises; per-file upload errors are
            logged and counted instead of propagated.
    """
    logger.info(f"Uploading code from {code_dir} to {repo_id} (type: {repo_type})")

    # Create repo
    create_repo(repo_id, repo_type=repo_type, exist_ok=True, private=False)
    logger.info(f"Repository {repo_id} created/verified.")

    # Get all files to upload
    files = get_files_to_upload(code_dir)
    total = len(files)
    logger.info(f"Found {total} files to upload.")

    if total == 0:
        logger.warning("No files to upload!")
        return 0, 0

    successful = 0
    failed = 0

    for idx, file_path in enumerate(files, 1):
        # Repository paths always use forward slashes.
        path_in_repo = file_path.relative_to(code_dir).as_posix()

        try:
            # stat/read live inside the try so a file that vanished or is
            # unreadable is counted as failed rather than aborting the run.
            size_kb = file_path.stat().st_size / 1024
            logger.info(f"[{idx}/{total}] Uploading {path_in_repo} ({size_kb:.1f} KB)...")

            with open(file_path, "rb") as f:
                content = f.read()

            _upload_with_retry(api, content, path_in_repo, repo_id, repo_type)
        except Exception as e:
            failed += 1
            logger.error(f"[{idx}/{total}] ✗ Failed to upload {path_in_repo}: {e}")
        else:
            successful += 1
            logger.info(f"[{idx}/{total}] ✓ {path_in_repo} uploaded.")

        # Add delay between files (except for last file)
        if idx < total:
            time.sleep(delay_between_files)

    logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
    return successful, failed


def main() -> int:
    """Parse CLI arguments and run the upload.

    Returns:
        Process exit status: 0 on full success, 1 if the code directory is
        missing, authentication fails, or any file failed to upload.
    """
    parser = argparse.ArgumentParser(
        description="Upload dataset_builder code repository to Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--hf_user",
        type=str,
        required=True,
        help="Hugging Face username or organization name",
    )
    parser.add_argument(
        "--repo_name",
        type=str,
        default="dataset-builder",
        help="Repository name (default: dataset-builder)",
    )
    parser.add_argument(
        "--repo_type",
        type=str,
        default="model",
        choices=["model", "space"],
        help="Repository type (default: model)",
    )
    parser.add_argument(
        "--code_dir",
        type=str,
        default=None,
        help="Code directory to upload (default: directory containing this script)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay in seconds between file uploads (default: 1.0)",
    )

    args = parser.parse_args()

    # Determine code directory
    if args.code_dir:
        code_dir = Path(args.code_dir).resolve()
    else:
        code_dir = Path(__file__).parent.resolve()

    # is_dir() also rejects a path that exists but is a regular file.
    if not code_dir.is_dir():
        logger.error(f"Code directory not found: {code_dir}")
        return 1

    # Verify authentication
    api = HfApi()
    try:
        user_info = api.whoami()
        logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")
    except Exception:
        logger.error(
            "Not logged in to Hugging Face. Please run:\n"
            " huggingface-cli login\n"
            "or set the HF_TOKEN environment variable."
        )
        return 1

    repo_id = f"{args.hf_user}/{args.repo_name}"

    logger.info("=" * 60)
    logger.info("Upload Plan:")
    logger.info(f" Code directory: {code_dir}")
    logger.info(f" Repository: {repo_id} (type: {args.repo_type})")
    logger.info("=" * 60)

    try:
        _, failed = upload_code_repo(
            api=api,
            repo_id=repo_id,
            code_dir=code_dir,
            repo_type=args.repo_type,
            delay_between_files=args.delay,
        )
    except Exception as e:
        logger.error(f"✗ Code repository upload failed: {e}")
        return 1

    if failed:
        logger.error(f"✗ Code repository upload failed: {failed} file(s) could not be uploaded")
        return 1

    logger.info("✓ Code repository upload completed!")
    return 0


if __name__ == "__main__":
    sys.exit(main())