KeyVID / upload.py
RyanWW's picture
Upload folder using huggingface_hub
d31d787 verified
#!/usr/bin/env python3
"""
Upload KeyVID model to Hugging Face Hub (Optimized for Speed)
"""
from pathlib import Path
from huggingface_hub import HfApi, login, upload_folder
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
# Target model repository on the Hugging Face Hub ("<owner>/<name>").
MODEL_ID = "RyanWW/KeyVID"
# Local directory whose contents get uploaded.
KEYVID_PATH = "/dockerx/groups/KeyVID_hf_model"
# Upload tuning knobs.
MAX_WORKERS = 8  # number of concurrent upload threads
CHUNK_SIZE = 100 * 1024 ** 2  # 100MB chunks for large files
def should_exclude_file(file_path, exclude_patterns):
    """Return True when *file_path* matches any of *exclude_patterns*.

    Patterns use shell-style wildcards (``*``, ``?``, ``[...]``) and are
    compared case-sensitively:

    * A pattern ending in ``/`` (e.g. ``"save_results/"``) names a directory:
      it matches when any parent directory component of the path matches, or
      when the path starts with that (possibly multi-level) directory.
    * Any other pattern matches when the whole path matches it, or when any
      single path component (directory or file name) matches it.

    Matching whole components instead of raw substrings keeps ``".git"``
    from excluding ``.gitattributes``, ``"*.log"`` from excluding
    ``logger.py`` and ``"upload.py"`` from excluding ``model_upload.py``.

    Args:
        file_path: Path (``str`` or ``Path``) relative to the upload root.
        exclude_patterns: Iterable of pattern strings.

    Returns:
        bool: True if the file should be skipped.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    from fnmatch import fnmatchcase

    path = Path(file_path)
    components = path.parts
    posix_path = path.as_posix()

    for pattern in exclude_patterns:
        if pattern.endswith("/"):
            directory = pattern.rstrip("/")
            if not directory:
                continue
            # Only parent directories count for a directory pattern; the
            # final component is the file name itself.
            if any(fnmatchcase(part, directory) for part in components[:-1]):
                return True
            if fnmatchcase(posix_path, directory + "/*"):
                return True
        elif fnmatchcase(posix_path, pattern) or any(
            fnmatchcase(part, pattern) for part in components
        ):
            return True
    return False
def get_files_to_upload(keyvid_dir, exclude_patterns):
    """Collect the files under *keyvid_dir* that are not excluded.

    Walks the directory recursively, skips anything that is not a regular
    file or that ``should_exclude_file`` rejects, and records each remaining
    file's path relative to *keyvid_dir* together with its size.

    Args:
        keyvid_dir: ``Path`` of the directory to scan.
        exclude_patterns: Patterns forwarded to ``should_exclude_file``.

    Returns:
        tuple: ``(files, total_size)`` where ``files`` is a list of
        ``(relative_path, size_in_bytes)`` pairs and ``total_size`` is the
        sum of those sizes.
    """
    print("🔍 Scanning files...")
    collected = []
    byte_count = 0
    for entry in tqdm(keyvid_dir.rglob("*"), desc="Scanning"):
        if not entry.is_file():
            continue
        rel_path = entry.relative_to(keyvid_dir)
        if should_exclude_file(rel_path, exclude_patterns):
            continue
        entry_size = entry.stat().st_size
        collected.append((rel_path, entry_size))
        byte_count += entry_size
    return collected, byte_count
def format_size(size_bytes):
    """Render a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 and is then
    printed with two decimals and the matching unit (B, KB, MB, GB, TB);
    anything still at or above 1024 TB is reported in PB.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        str: For example ``"1.50 KB"``.
    """
    remaining = size_bytes
    for suffix in ("B", "KB", "MB", "GB", "TB"):
        if remaining < 1024.0:
            return f"{remaining:.2f} {suffix}"
        remaining = remaining / 1024.0
    return f"{remaining:.2f} PB"
def upload_file_wrapper(args):
    """Upload one file; packaged as a single-argument task for an executor.

    Args:
        args: Tuple ``(api, keyvid_dir, file_path, repo_id)`` where ``api``
            exposes ``upload_file``, ``keyvid_dir`` is the local root
            directory, ``file_path`` is the path relative to that root and
            ``repo_id`` is the destination repository.

    Returns:
        tuple: ``(file_path, True, None)`` on success, or
        ``(file_path, False, error_message)`` when any exception is raised.
        Exceptions are never propagated, so a single failure does not abort
        the other concurrent uploads.
    """
    api, keyvid_dir, file_path, repo_id = args
    try:
        local_source = str(keyvid_dir / file_path)
        api.upload_file(
            path_or_fileobj=local_source,
            path_in_repo=str(file_path),
            repo_id=repo_id,
            repo_type="model",
            commit_message="Upload KeyVID model files",
        )
    except Exception as exc:
        return (file_path, False, str(exc))
    else:
        return (file_path, True, None)
def _hub_ignore_patterns(exclude_patterns):
    """Expand bare exclude names into patterns for ``upload_folder``.

    ``upload_folder`` matches ``ignore_patterns`` with shell-style globbing
    against each file's path relative to the uploaded folder. A pattern such
    as ``"**/upload.py"`` requires a ``/`` in the path and therefore misses
    a file sitting directly in the folder root, so every name is emitted
    both in a root form and in a nested form.

    Args:
        exclude_patterns: Names/globs; a trailing ``/`` marks a
            directory-only pattern.

    Returns:
        list[str]: Patterns suitable for ``ignore_patterns``.
    """
    hub_patterns = []
    for pattern in exclude_patterns:
        name = pattern.rstrip("/")
        if not name:
            continue
        if not pattern.endswith("/"):
            # A file with this name, at the root or in any sub-directory.
            hub_patterns += [name, f"*/{name}"]
        # Everything below a directory with this name, at any depth.
        hub_patterns += [f"{name}/*", f"*/{name}/*"]
    return hub_patterns


def _upload_files_concurrently(api, keyvid_dir, files_to_upload):
    """Upload files one by one on a thread pool; return the failures.

    Args:
        api: Client object handed to ``upload_file_wrapper``.
        keyvid_dir: Local root directory of the files.
        files_to_upload: List of ``(relative_path, size)`` pairs.

    Returns:
        list: ``(relative_path, error_message)`` for every failed upload.
    """
    failed_files = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(
                upload_file_wrapper, (api, keyvid_dir, file_path, MODEL_ID)
            )
            for file_path, _ in files_to_upload
        ]
        with tqdm(total=len(futures), desc="Uploading") as pbar:
            for future in as_completed(futures):
                file_path, success, error = future.result()
                if not success:
                    failed_files.append((file_path, error))
                pbar.update(1)
    return failed_files


def main():
    """Interactively upload the KeyVID directory to the Hugging Face Hub.

    Steps: verify credentials, scan ``KEYVID_PATH`` for files to upload,
    print statistics, ask for confirmation, then upload with
    ``upload_folder``. If that call fails for any reason, fall back to
    uploading each scanned file individually on a thread pool.

    Returns:
        None. Problems are reported on stdout instead of being raised.
    """
    print("🚀 Starting KeyVID upload to Hugging Face (Optimized)...")
    print(f"Repository: {MODEL_ID}")

    # Constructing HfApi() never validates credentials, so ask the Hub who
    # we are; this fails when no valid token is available.
    api = HfApi()
    try:
        api.whoami()
    except Exception as e:
        print("⚠️ Need to authenticate with Hugging Face")
        print("Please run: huggingface-cli login")
        print("Or set HF_TOKEN environment variable")
        print(f"Details: {e}")
        return
    print("✅ Hugging Face authentication found")

    keyvid_dir = Path(KEYVID_PATH)
    if not keyvid_dir.is_dir():
        print(f"❌ Error: KeyVID directory not found at {KEYVID_PATH}")
        return
    print(f"\n📁 Directory: {keyvid_dir}")

    # Single source of truth for what is left out of the upload; both the
    # local scan and the upload_folder ignore list are derived from it.
    exclude_patterns = [
        "__pycache__",
        ".git",
        "*.pyc",
        ".DS_Store",
        "save_results/",  # generated results directory
        "*.log",
        "*.tmp",
        "error.txt",
        ".bash_history",
        ".gitignore",
        "upload.py",  # the upload script itself
    ]

    files_to_upload, total_size = get_files_to_upload(keyvid_dir, exclude_patterns)
    print("\n📊 Statistics:")
    print(f" Files to upload: {len(files_to_upload)}")
    print(f" Total size: {format_size(total_size)}")
    if not files_to_upload:
        print("⚠️ No files to upload!")
        return

    # Ask for confirmation; a closed stdin (non-interactive run) counts as "no".
    try:
        response = input("\n❓ Proceed with upload? (y/n): ").strip().lower()
    except EOFError:
        response = ""
    if response != 'y':
        print("❌ Upload cancelled")
        return

    # Informational split of the scanned files by size.
    large_file_threshold = 100 * 1024 * 1024  # 100MB
    large_files = [f for f, s in files_to_upload if s > large_file_threshold]
    small_files = [f for f, s in files_to_upload if s <= large_file_threshold]
    print("\n📦 Upload strategy:")
    print(f" Large files (>100MB): {len(large_files)}")
    print(f" Small files: {len(small_files)}")
    print(f" Concurrent workers: {MAX_WORKERS}")

    # Method 1: upload_folder, a single call for the whole directory.
    print("\n⬆️ Uploading using optimized upload_folder...")
    start_time = time.time()
    try:
        # NOTE(review): multi_commits / multi_commits_verbose are believed to
        # be experimental arguments that newer huggingface_hub releases may
        # no longer accept - confirm against the installed version. If they
        # are rejected, the except branch below triggers the fallback.
        upload_folder(
            folder_path=str(keyvid_dir),
            repo_id=MODEL_ID,
            repo_type="model",
            ignore_patterns=_hub_ignore_patterns(exclude_patterns),
            commit_message="Upload KeyVID model files",
            multi_commits=True,
            multi_commits_verbose=True,
        )
    except Exception as e:
        print(f"❌ Error with upload_folder: {e}")
        print("\n📝 Falling back to concurrent file upload...")
    else:
        elapsed_time = time.time() - start_time
        print("\n✅ Upload complete!")
        print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
        return

    # Method 2 (fallback): upload every scanned file individually.
    start_time = time.time()
    failed_files = _upload_files_concurrently(api, keyvid_dir, files_to_upload)
    elapsed_time = time.time() - start_time

    if failed_files:
        print(f"\n⚠️ {len(failed_files)} files failed to upload:")
        shown = failed_files[:10]
        for file_path, error in shown:
            print(f" - {file_path}: {error}")
        if len(failed_files) > len(shown):
            print(f" ... and {len(failed_files) - len(shown)} more")
    else:
        print("\n✅ All files uploaded successfully!")
    print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
    print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
if __name__ == "__main__":
main()