dataset-builder / upload_code_to_hf.py
DouDou
Upload upload_code_to_hf.py with huggingface_hub
7793dac verified
raw
history blame
7.99 kB
#!/usr/bin/env python3
"""
Upload dataset_builder code repository to Hugging Face Hub.
Usage:
# Upload to your personal account:
python upload_code_to_hf.py --hf_user YOUR_USERNAME
# Upload to an organization:
python upload_code_to_hf.py --hf_user YOUR_ORG
# Custom repository name:
python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_name my-dataset-builder
# Upload to a model repository (default):
python upload_code_to_hf.py --hf_user YOUR_USERNAME
# Upload to a space repository:
python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_type space
"""
import argparse
import fnmatch
import logging
import os
import time
from pathlib import Path

from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import HfHubHTTPError
# Process-wide logging setup: INFO and above, each record prefixed with a
# timestamp and its level name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
_LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
# Names and glob patterns that must never be uploaded. Every component of a
# path (each directory as well as the file name) is matched against these.
EXCLUDE_PATTERNS = {
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".git",
    ".gitignore",
    ".DS_Store",
    "*.log",
    "*.swp",
    "*.swo",
    "*~",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    "dist",
    "build",
    ".venv",
    "venv",
    "env",
    ".env",
    "node_modules",
    ".idea",
    ".vscode",
    ".cursor",
}

# File names that are uploaded even though their own name would otherwise be
# rejected (hidden-file rule or an EXCLUDE_PATTERNS entry). This exemption
# never rescues a file that lives inside an excluded directory.
ALWAYS_INCLUDE = {
    ".gitignore",
    "README.md",
    "requirements.txt",
    "setup.py",
    "pyproject.toml",
}


def _is_excluded_name(name: str) -> bool:
    """Return True if a single path component is hidden or matches a pattern."""
    if name.startswith("."):
        return True
    # fnmatchcase handles both the literal names ("build") and the glob
    # entries ("*.egg-info", "*~"); plain set membership never matched globs.
    return any(fnmatch.fnmatchcase(name, pattern) for pattern in EXCLUDE_PATTERNS)


def should_exclude(file_path: Path, root: Path) -> bool:
    """Decide whether *file_path* must be left out of the upload.

    Args:
        file_path: Path of the candidate file; must be located under *root*.
        root: Directory the upload is relative to.

    Returns:
        True when the file should be skipped, False when it should be uploaded.

    Raises:
        ValueError: If *file_path* is not located under *root*
            (raised by ``Path.relative_to``).
    """
    rel_path = file_path.relative_to(root)

    # Directories are checked first so that an ALWAYS_INCLUDE name such as
    # README.md or setup.py inside node_modules/, venv/ or .git/ stays out.
    if any(_is_excluded_name(directory) for directory in rel_path.parent.parts):
        return True

    if rel_path.name in ALWAYS_INCLUDE:
        return False

    return _is_excluded_name(rel_path.name)
def get_files_to_upload(root: Path) -> list[Path]:
    """Return the files under *root* that pass the exclusion filter, sorted.

    The tree below *root* is searched recursively; only entries that are
    files and that ``should_exclude`` does not reject are kept.
    """
    selected = [
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file() and not should_exclude(candidate, root)
    ]
    selected.sort()
    return selected
def _http_status(error):
    """Return the HTTP status code attached to *error*, or None if it has none."""
    status = getattr(error, "status_code", None)
    if status is not None:
        return status
    # Compare the response against None instead of testing its truthiness: a
    # requests.Response evaluates falsy for every 4xx/5xx status, which would
    # hide the very 429 this lookup exists to detect.
    response = getattr(error, "response", None)
    return getattr(response, "status_code", None)


def _upload_with_retries(api, content, path_in_repo, repo_id, repo_type, max_attempts=3):
    """Upload one file, retrying on rate limits and on non-HTTP errors.

    Returns normally on success. Once *max_attempts* is used up the last
    exception is re-raised, so the caller always observes either success or
    an error. HTTP errors other than 429 are re-raised immediately.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            api.upload_file(
                path_or_fileobj=content,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            return
        except HfHubHTTPError as e:
            if _http_status(e) != 429 or attempt == max_attempts:
                raise
            # Exponential back-off for rate limiting, capped at one minute.
            wait_time = min(5.0 * (2 ** (attempt - 1)), 60.0)
            logger.warning(
                f" Rate limited (429). Waiting {wait_time:.1f}s (attempt {attempt}/{max_attempts})..."
            )
        except Exception as e:
            if attempt == max_attempts:
                raise
            wait_time = 2.0 * attempt
            logger.warning(f" Error: {e}. Waiting {wait_time:.1f}s (attempt {attempt}/{max_attempts})...")
        time.sleep(wait_time)


def upload_code_repo(
    api: HfApi,
    repo_id: str,
    code_dir: Path,
    repo_type: str = "model",
    delay_between_files: float = 1.0,
):
    """Upload every non-excluded file under *code_dir* to a Hub repository.

    The repository is created if it does not exist yet (as a public repo).
    Files are uploaded one at a time; a failure on one file is logged and
    counted, and the remaining files are still attempted.

    Args:
        api: Hub client used both to create the repository and to upload.
        repo_id: Target repository in ``owner/name`` form.
        code_dir: Local directory whose files are uploaded.
        repo_type: Repository type passed through to the Hub client.
        delay_between_files: Seconds to sleep between consecutive files.
    """
    logger.info(f"Uploading code from {code_dir} to {repo_id} (type: {repo_type})")

    # Create the repo through the supplied client so that the same client
    # configuration is used for creation and for the uploads below.
    api.create_repo(repo_id, repo_type=repo_type, exist_ok=True, private=False)
    logger.info(f"Repository {repo_id} created/verified.")

    files = get_files_to_upload(code_dir)
    total = len(files)
    logger.info(f"Found {total} files to upload.")
    if total == 0:
        logger.warning("No files to upload!")
        return

    successful = 0
    failed = 0
    for idx, file_path in enumerate(files, 1):
        # Repository paths always use forward slashes.
        path_in_repo = file_path.relative_to(code_dir).as_posix()
        try:
            # stat/read live inside the try so a file that vanished or is
            # unreadable is counted as one failure instead of aborting the run.
            size_kb = file_path.stat().st_size / 1024
            logger.info(f"[{idx}/{total}] Uploading {path_in_repo} ({size_kb:.1f} KB)...")
            content = file_path.read_bytes()
            _upload_with_retries(api, content, path_in_repo, repo_id, repo_type)
        except Exception as e:
            failed += 1
            logger.error(f"[{idx}/{total}] ✗ Failed to upload {path_in_repo}: {e}")
        else:
            successful += 1
            logger.info(f"[{idx}/{total}] ✓ {path_in_repo} uploaded.")

        # Pause between files, but not after the last one.
        if idx < total:
            time.sleep(delay_between_files)

    logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
def main():
    """Parse command-line arguments and upload the code directory to the Hub.

    Exits with status 1 (via ``SystemExit``) when the code directory is not a
    directory, when the Hugging Face identity check fails, or when the upload
    raises; argparse exits with status 2 on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Upload dataset_builder code repository to Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--hf_user", type=str, required=True,
        help="Hugging Face username or organization name",
    )
    parser.add_argument(
        "--repo_name", type=str, default="dataset-builder",
        help="Repository name (default: dataset-builder)",
    )
    parser.add_argument(
        "--repo_type", type=str, default="model", choices=["model", "space"],
        help="Repository type (default: model)",
    )
    parser.add_argument(
        "--code_dir", type=str, default=None,
        help="Code directory to upload (default: directory containing this script)",
    )
    parser.add_argument(
        "--delay", type=float, default=1.0,
        help="Delay in seconds between file uploads (default: 1.0)",
    )
    args = parser.parse_args()

    # time.sleep rejects negative values, so refuse them up front.
    if args.delay < 0:
        parser.error("--delay must be non-negative")

    # Without --code_dir, upload the directory this script lives in.
    if args.code_dir:
        code_dir = Path(args.code_dir).resolve()
    else:
        code_dir = Path(__file__).parent.resolve()
    # is_dir() also rejects a path that exists but is a regular file.
    if not code_dir.is_dir():
        logger.error(f"Code directory not found: {code_dir}")
        raise SystemExit(1)

    # Verify authentication before touching the repository.
    api = HfApi()
    try:
        user_info = api.whoami()
    except Exception:
        logger.error(
            "Not logged in to Hugging Face. Please run:\n"
            " huggingface-cli login\n"
            "or set the HF_TOKEN environment variable."
        )
        raise SystemExit(1)
    logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")

    repo_id = f"{args.hf_user}/{args.repo_name}"
    logger.info("=" * 60)
    logger.info("Upload Plan:")
    logger.info(f" Code directory: {code_dir}")
    logger.info(f" Repository: {repo_id} (type: {args.repo_type})")
    logger.info("=" * 60)

    try:
        upload_code_repo(
            api=api,
            repo_id=repo_id,
            code_dir=code_dir,
            repo_type=args.repo_type,
            delay_between_files=args.delay,
        )
    except Exception as e:
        logger.error(f"✗ Code repository upload failed: {e}")
        # Non-zero status so shells and CI can detect the failure.
        raise SystemExit(1)
    logger.info("✓ Code repository upload completed!")


if __name__ == "__main__":
    main()