KeyVID / upload.py

Upload folder using huggingface_hub

d31d787 verified 19 days ago

7.22 kB

	#!/usr/bin/env python3
	"""
	Upload KeyVID model to Hugging Face Hub (Optimized for Speed)
	"""

	import os
	import time
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from fnmatch import fnmatchcase
	from pathlib import Path

	from huggingface_hub import HfApi, login, upload_folder
	from tqdm import tqdm

	# Target model repository on the Hugging Face Hub
	MODEL_ID = "RyanWW/KeyVID"

	# Local directory whose contents are uploaded
	KEYVID_PATH = "/dockerx/groups/KeyVID_hf_model"

	# Upload tuning
	MAX_WORKERS = 8  # number of concurrent upload threads
	CHUNK_SIZE = 100 * 1024 ** 2  # 100 MB; intended chunk size for large files

	def should_exclude_file(file_path, exclude_patterns):
	    """Return True when ``file_path`` matches any of ``exclude_patterns``.

	    Patterns use case-sensitive ``fnmatch`` glob syntax and are anchored, so
	    ``.git`` excludes the ``.git`` directory but not ``.gitattributes``, and
	    ``*.pyc`` excludes ``a.pyc`` but not ``a.pyc.txt``.

	    Args:
	        file_path: Path of the file, relative to the upload root
	            (``str`` or ``Path``).
	        exclude_patterns: Iterable of glob strings. Three forms are accepted:
	            * ``name`` or ``*.ext`` -- compared with every path component,
	              so it matches a file or directory at any depth;
	            * ``dir/`` (trailing slash) -- compared with directory components
	              only, never with the file name itself;
	            * ``a/b`` (contains a slash) -- compared with the path and each
	              of its ancestor directories, starting from the root. Note that
	              ``*`` also matches ``/`` in ``fnmatch``.

	    Returns:
	        bool: True if the file should be skipped.
	    """
	    parts = Path(file_path).parts
	    for pattern in exclude_patterns:
	        directory_only = pattern.endswith("/")
	        glob = pattern.rstrip("/")
	        if not glob:
	            # An empty pattern would otherwise exclude every file.
	            continue
	        # Directory patterns never apply to the final (file name) component.
	        scope = parts[:-1] if directory_only else parts
	        if "/" in glob:
	            candidates = (
	                "/".join(scope[:end]) for end in range(1, len(scope) + 1)
	            )
	        else:
	            candidates = scope
	        if any(fnmatchcase(candidate, glob) for candidate in candidates):
	            return True
	    return False

	def get_files_to_upload(keyvid_dir, exclude_patterns):
	    """Collect the files under ``keyvid_dir`` that survive the exclusion list.

	    Args:
	        keyvid_dir: ``Path`` of the directory to scan recursively.
	        exclude_patterns: Patterns forwarded to ``should_exclude_file``.

	    Returns:
	        tuple: ``(files, total_size)`` where ``files`` is a list of
	        ``(relative_path, size_in_bytes)`` pairs and ``total_size`` is the
	        sum of those sizes.
	    """
	    print("🔍 Scanning files...")

	    selected = []
	    for entry in tqdm(keyvid_dir.rglob("*"), desc="Scanning"):
	        # Directories and other non-file entries are never uploaded.
	        if not entry.is_file():
	            continue
	        rel_path = entry.relative_to(keyvid_dir)
	        if should_exclude_file(rel_path, exclude_patterns):
	            continue
	        selected.append((rel_path, entry.stat().st_size))

	    return selected, sum(size for _, size in selected)

	def format_size(size_bytes):
	    """Render a byte count as a two-decimal string with a binary unit.

	    The value is divided by 1024 until it drops below 1024 or the units
	    ``B`` through ``TB`` are exhausted; anything larger is reported in ``PB``.
	    """
	    units = ('B', 'KB', 'MB', 'GB', 'TB')
	    value = size_bytes
	    index = 0
	    while index < len(units):
	        if value < 1024.0:
	            return f"{value:.2f} {units[index]}"
	        value = value / 1024.0
	        index += 1
	    return f"{value:.2f} PB"

	def upload_file_wrapper(args):
	    """Upload one file and report the outcome instead of raising.

	    Takes a single tuple so it can be submitted directly to an executor.

	    Args:
	        args: ``(api, keyvid_dir, file_path, repo_id)`` where ``api`` exposes
	            ``upload_file``, ``keyvid_dir`` is the local root ``Path``,
	            ``file_path`` is the path relative to that root and ``repo_id``
	            is the target repository.

	    Returns:
	        tuple: ``(file_path, True, None)`` on success, or
	        ``(file_path, False, error_message)`` when anything raised.
	    """
	    api, keyvid_dir, file_path, repo_id = args
	    try:
	        local_path = keyvid_dir / file_path
	        api.upload_file(
	            path_or_fileobj=str(local_path),
	            # Always use forward slashes for the repo path; str() on a
	            # Windows path would produce backslashes.
	            path_in_repo=Path(file_path).as_posix(),
	            repo_id=repo_id,
	            repo_type="model",
	            commit_message="Upload KeyVID model files",
	        )
	    except Exception as e:
	        # Deliberately broad: one failed file must not abort the whole batch;
	        # the caller collects and reports the error text.
	        return (file_path, False, str(e))
	    return (file_path, True, None)

	def _build_ignore_patterns(exclude_patterns):
	    """Translate the local exclusion list into globs for ``upload_folder``.

	    ``upload_folder`` filters with fnmatch-style globs applied to the
	    repo-relative path (``*`` also spans ``/``), so each bare name is expanded
	    to cover a file or directory of that name at the root and at any depth.
	    """
	    globs = []
	    for pattern in exclude_patterns:
	        if "*" in pattern:
	            # e.g. "*.pyc" already matches at any depth.
	            globs.append(pattern)
	        elif pattern.endswith("/"):
	            name = pattern.rstrip("/")
	            globs.extend([f"{name}/*", f"*/{name}/*"])
	        else:
	            globs.extend(
	                [pattern, f"*/{pattern}", f"{pattern}/*", f"*/{pattern}/*"]
	            )
	    return globs


	def _upload_files_concurrently(api, keyvid_dir, files_to_upload):
	    """Upload files one ``upload_file`` call at a time on a thread pool.

	    Returns a list of ``(relative_path, error_message)`` for every failure.
	    """
	    jobs = [
	        (api, keyvid_dir, rel_path, MODEL_ID)
	        for rel_path, _ in files_to_upload
	    ]
	    failed = []
	    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
	        futures = [executor.submit(upload_file_wrapper, job) for job in jobs]
	        with tqdm(total=len(futures), desc="Uploading") as pbar:
	            for future in as_completed(futures):
	                rel_path, ok, error = future.result()
	                if not ok:
	                    failed.append((rel_path, error))
	                pbar.update(1)
	    return failed


	def main():
	    """Upload the KeyVID directory to the Hugging Face Hub.

	    Steps: verify credentials, scan ``KEYVID_PATH`` and print statistics, ask
	    for confirmation, then upload with ``upload_folder``. If that raises, fall
	    back to uploading each file individually on ``MAX_WORKERS`` threads.
	    Returns ``None`` in every case; problems are reported on stdout.
	    """
	    print("🚀 Starting KeyVID upload to Hugging Face (Optimized)...")
	    print(f"Repository: {MODEL_ID}")

	    # Check authentication. Constructing HfApi() does not validate anything,
	    # so ask the Hub who we are; this raises when no valid token is available.
	    api = HfApi()
	    try:
	        api.whoami()
	    except Exception as e:
	        print("⚠️ Need to authenticate with Hugging Face")
	        print(f" Reason: {e}")
	        print("Please run: huggingface-cli login")
	        print("Or set HF_TOKEN environment variable")
	        return
	    print("✅ Hugging Face authentication found")

	    # Path to upload
	    keyvid_dir = Path(KEYVID_PATH)

	    if not keyvid_dir.is_dir():
	        print(f"❌ Error: KeyVID directory not found at {KEYVID_PATH}")
	        return

	    print(f"\n📁 Directory: {keyvid_dir}")

	    # Single source of truth for what is skipped; the upload_folder globs
	    # below are derived from this list so the statistics and the actual
	    # upload stay aligned.
	    exclude_patterns = [
	        "__pycache__",
	        ".git",
	        "*.pyc",
	        ".DS_Store",
	        "save_results/",  # results directory
	        "*.log",
	        "*.tmp",
	        "error.txt",
	        ".bash_history",
	        ".gitignore",
	        "upload.py",  # the upload script itself
	    ]

	    # Build the file list
	    files_to_upload, total_size = get_files_to_upload(keyvid_dir, exclude_patterns)

	    print("\n📊 Statistics:")
	    print(f" Files to upload: {len(files_to_upload)}")
	    print(f" Total size: {format_size(total_size)}")

	    if not files_to_upload:
	        print("⚠️ No files to upload!")
	        return

	    # Ask the user to confirm
	    response = input("\n❓ Proceed with upload? (y/n): ").strip().lower()
	    if response != 'y':
	        print("❌ Upload cancelled")
	        return

	    # Informational split only; both groups go through the same upload path.
	    large_file_threshold = 100 * 1024 * 1024  # 100MB
	    large_files = [f for f, s in files_to_upload if s > large_file_threshold]
	    small_files = [f for f, s in files_to_upload if s <= large_file_threshold]

	    print("\n📦 Upload strategy:")
	    print(f" Large files (>100MB): {len(large_files)}")
	    print(f" Small files: {len(small_files)}")
	    print(f" Concurrent workers: {MAX_WORKERS}")

	    # Method 1: upload_folder
	    print("\n⬆️ Uploading using optimized upload_folder...")
	    start_time = time.time()
	    try:
	        # NOTE(review): multi_commits / multi_commits_verbose were
	        # experimental options; confirm the installed huggingface_hub
	        # release still accepts them. A TypeError here is caught below and
	        # silently routes every run through the slower fallback.
	        upload_folder(
	            folder_path=str(keyvid_dir),
	            repo_id=MODEL_ID,
	            repo_type="model",
	            ignore_patterns=_build_ignore_patterns(exclude_patterns),
	            commit_message="Upload KeyVID model files",
	            multi_commits=True,
	            multi_commits_verbose=True,
	        )
	    except Exception as e:
	        print(f"❌ Error with upload_folder: {e}")
	        print("\n📝 Falling back to concurrent file upload...")
	    else:
	        elapsed_time = time.time() - start_time
	        print("\n✅ Upload complete!")
	        print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
	        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
	        return

	    # Method 2: concurrent per-file upload (fallback)
	    start_time = time.time()
	    failed_files = _upload_files_concurrently(api, keyvid_dir, files_to_upload)
	    elapsed_time = time.time() - start_time

	    if failed_files:
	        print(f"\n⚠️ {len(failed_files)} files failed to upload:")
	        # Cap the listing so a mass failure does not flood the terminal.
	        for file_path, error in failed_files[:10]:
	            print(f" - {file_path}: {error}")
	        if len(failed_files) > 10:
	            print(f" ... and {len(failed_files) - 10} more")
	    else:
	        print("\n✅ All files uploaded successfully!")
	    print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
	    print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")

	# Script entry point: run the upload only when executed directly, not on import.
	if __name__ == "__main__":
	main()