#!/usr/bin/env python3
"""
Upload KeyVID model to Hugging Face Hub (Optimized for Speed)
"""
from pathlib import Path
from huggingface_hub import HfApi, login, upload_folder
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
# Target model repository on the Hugging Face Hub.
MODEL_ID = "RyanWW/KeyVID"

# Local directory whose contents get uploaded.
KEYVID_PATH = "/dockerx/groups/KeyVID_hf_model"

# Upload tuning.
MAX_WORKERS = 8  # number of concurrent upload threads
CHUNK_SIZE = 100 * 1024 ** 2  # 100 MB chunks for large files
def should_exclude_file(file_path, exclude_patterns):
    """Return True when *file_path* matches any of *exclude_patterns*.

    Matching is anchored to whole path components rather than raw substrings,
    so ``".git"`` excludes the ``.git`` directory but not ``.gitattributes``,
    and ``"*.log"`` excludes ``train.log`` but not ``model.logits.bin``.

    Pattern forms:
      * ``"name/"`` -- a directory called ``name`` at any depth.
      * anything else -- a case-sensitive ``fnmatch`` glob tested against the
        whole path and against each path component; a pattern without
        wildcards therefore has to equal a component or the full path.

    Args:
        file_path: Path of the file (``str`` or ``pathlib`` path), normally
            relative to the upload root.
        exclude_patterns: Iterable of pattern strings.

    Returns:
        bool: True if the file should be skipped.
    """
    # Function-scope imports keep this helper self-contained.
    from fnmatch import fnmatchcase
    from pathlib import PurePath

    path = PurePath(file_path)
    components = path.parts
    posix_path = path.as_posix()  # forward slashes regardless of platform

    for pattern in exclude_patterns:
        if not pattern:
            # An empty pattern is a substring of everything; ignore it
            # instead of excluding every file.
            continue

        if pattern.endswith("/"):
            dir_glob = pattern.rstrip("/")
            if not dir_glob:
                continue
            # fnmatch's '*' also spans '/', so these two globs cover the
            # directory at the root and at any nesting depth.
            if fnmatchcase(posix_path, f"{dir_glob}/*") or fnmatchcase(
                posix_path, f"*/{dir_glob}/*"
            ):
                return True
            continue

        if fnmatchcase(posix_path, pattern):
            return True
        if any(fnmatchcase(part, pattern) for part in components):
            return True

    return False
def get_files_to_upload(keyvid_dir, exclude_patterns):
    """Walk *keyvid_dir* recursively and collect the files that survive filtering.

    Args:
        keyvid_dir: Root directory (a ``pathlib.Path``) to scan.
        exclude_patterns: Patterns handed to ``should_exclude_file`` together
            with each file's path relative to *keyvid_dir*.

    Returns:
        tuple: ``(files, total_size)`` where ``files`` is a list of
        ``(relative_path, size_in_bytes)`` pairs and ``total_size`` is the
        sum of those sizes.
    """
    print("🔍 Scanning files...")

    collected = []
    byte_count = 0
    for entry in tqdm(keyvid_dir.rglob("*"), desc="Scanning"):
        # Directories and other non-file entries are not uploaded themselves.
        if not entry.is_file():
            continue

        rel_path = entry.relative_to(keyvid_dir)
        if should_exclude_file(rel_path, exclude_patterns):
            continue

        entry_size = entry.stat().st_size
        collected.append((rel_path, entry_size))
        byte_count += entry_size

    return collected, byte_count
def format_size(size_bytes):
    """Render a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the ``TB`` step
    has been passed; anything larger is reported in ``PB``.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        str: For example ``"1.50 KB"``.
    """
    unit_names = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    step = 0
    # "not value < 1024.0" (rather than ">=") keeps the comparison identical
    # to an explicit less-than test for every input, NaN included.
    while step < len(unit_names) and not value < 1024.0:
        value /= 1024.0
        step += 1

    if step < len(unit_names):
        suffix = unit_names[step]
    else:
        suffix = 'PB'
    return f"{value:.2f} {suffix}"
def upload_file_wrapper(args):
    """Upload a single file and report the outcome instead of raising.

    All inputs arrive in one tuple so the function can be submitted to an
    executor with a single argument.

    Args:
        args: ``(api, keyvid_dir, file_path, repo_id)`` -- ``api`` must expose
            ``upload_file``; ``keyvid_dir`` is the local root; ``file_path``
            is the file's path relative to that root; ``repo_id`` is the
            target repository.

    Returns:
        tuple: ``(file_path, True, None)`` on success, or
        ``(file_path, False, error_message)`` if the upload raised.
    """
    from pathlib import PurePath

    api, keyvid_dir, file_path, repo_id = args
    try:
        local_path = keyvid_dir / file_path
        # Repository paths use forward slashes; str() of a Windows-flavoured
        # path would embed backslashes in the repo path.
        if isinstance(file_path, PurePath):
            path_in_repo = file_path.as_posix()
        else:
            path_in_repo = str(file_path)

        api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type="model",
            commit_message="Upload KeyVID model files",
        )
    except Exception as exc:
        # Deliberately broad: one failing file is reported to the caller
        # rather than aborting the other uploads.
        return (file_path, False, str(exc))
    return (file_path, True, None)
def _build_ignore_patterns(exclude_patterns):
    """Expand the scan exclusion list into fnmatch globs for ``upload_folder``.

    fnmatch's ``*`` also matches ``/``, so a leading ``**/`` would require at
    least one directory level and miss entries at the repository root. Each
    name is therefore emitted both bare (root level) and with a ``*/`` prefix
    (nested).
    """
    globs = []
    for pattern in exclude_patterns:
        if pattern.endswith("/"):
            # Directory pattern: everything below it, at the root or nested.
            name = pattern.rstrip("/")
            globs.extend([f"{name}/*", f"*/{name}/*"])
        elif "*" in pattern:
            # Already a glob such as "*.pyc"; '*' spans directories too.
            globs.append(pattern)
        else:
            # A bare name may be a file or a directory, at any depth.
            globs.extend(
                [pattern, f"*/{pattern}", f"{pattern}/*", f"*/{pattern}/*"]
            )
    return globs


def _upload_files_individually(api, keyvid_dir, files_to_upload):
    """Upload each file separately on a thread pool; return the failures.

    Returns a list of ``(file_path, error_message)`` pairs.
    """
    # NOTE(review): each upload_file call presumably creates its own commit;
    # concurrent commits to one repository may conflict -- confirm.
    failed_files = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(
                upload_file_wrapper, (api, keyvid_dir, file_path, MODEL_ID)
            )
            for file_path, _ in files_to_upload
        ]
        with tqdm(total=len(futures), desc="Uploading") as pbar:
            for future in as_completed(futures):
                file_path, success, error = future.result()
                if not success:
                    failed_files.append((file_path, error))
                pbar.update(1)
    return failed_files


def main():
    """Interactively upload the KeyVID directory to the Hugging Face Hub.

    Steps: verify credentials, scan ``KEYVID_PATH`` while applying the
    exclusion list, print statistics, ask for confirmation, then upload with
    ``upload_folder``. If that call raises, every file is uploaded
    individually on a thread pool instead.

    Returns:
        None. Problems are reported on stdout.
    """
    print("🚀 Starting KeyVID upload to Hugging Face (Optimized)...")
    print(f"Repository: {MODEL_ID}")

    # Check authentication. Constructing HfApi alone never fails, so ask the
    # Hub who we are to find out whether a usable token is configured.
    try:
        api = HfApi()
        api.whoami()
    except Exception as exc:
        print("⚠️ Need to authenticate with Hugging Face")
        print("Please run: huggingface-cli login")
        print("Or set HF_TOKEN environment variable")
        print(f"Details: {exc}")
        return
    print("✅ Hugging Face authentication found")

    # Directory to upload
    keyvid_dir = Path(KEYVID_PATH)
    if not keyvid_dir.exists():
        print(f"❌ Error: KeyVID directory not found at {KEYVID_PATH}")
        return
    print(f"\n📁 Directory: {keyvid_dir}")

    # Single source of truth for what is skipped; the upload_folder ignore
    # globs below are derived from this list so both code paths agree.
    exclude_patterns = [
        "__pycache__",
        ".git",
        "*.pyc",
        ".DS_Store",
        "save_results/",  # results directory
        "*.log",
        "*.tmp",
        "error.txt",
        ".bash_history",
        ".gitignore",
        "upload.py",  # the upload script itself
    ]

    files_to_upload, total_size = get_files_to_upload(keyvid_dir, exclude_patterns)
    print("\n📊 Statistics:")
    print(f" Files to upload: {len(files_to_upload)}")
    print(f" Total size: {format_size(total_size)}")
    if not files_to_upload:
        print("⚠️ No files to upload!")
        return

    # Ask the user to confirm before anything is sent.
    response = input("\n❓ Proceed with upload? (y/n): ").strip().lower()
    if response != 'y':
        print("❌ Upload cancelled")
        return

    # Informational split by size; both groups are uploaded the same way.
    large_file_threshold = 100 * 1024 * 1024  # 100MB
    large_files = [f for f, s in files_to_upload if s > large_file_threshold]
    small_files = [f for f, s in files_to_upload if s <= large_file_threshold]
    print("\n📦 Upload strategy:")
    print(f" Large files (>100MB): {len(large_files)}")
    print(f" Small files: {len(small_files)}")
    print(f" Concurrent workers: {MAX_WORKERS}")

    # Method 1: upload_folder
    print("\n⬆️ Uploading using optimized upload_folder...")
    start_time = time.time()
    try:
        # NOTE(review): multi_commits / multi_commits_verbose appear to be
        # deprecated in recent huggingface_hub releases -- confirm against the
        # installed version. A TypeError here lands in the fallback below.
        upload_folder(
            folder_path=str(keyvid_dir),
            repo_id=MODEL_ID,
            repo_type="model",
            ignore_patterns=_build_ignore_patterns(exclude_patterns),
            commit_message="Upload KeyVID model files",
            multi_commits=True,  # allow the upload to be split into several commits
            multi_commits_verbose=True,  # show detailed progress
        )
    except Exception as exc:
        print(f"❌ Error with upload_folder: {exc}")
    else:
        elapsed_time = time.time() - start_time
        print("\n✅ Upload complete!")
        print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
        return

    # Method 2 (fallback): upload files one by one, concurrently.
    print("\n📝 Falling back to concurrent file upload...")
    start_time = time.time()
    failed_files = _upload_files_individually(api, keyvid_dir, files_to_upload)
    elapsed_time = time.time() - start_time

    if failed_files:
        print(f"\n⚠️ {len(failed_files)} files failed to upload:")
        for file_path, error in failed_files[:10]:
            print(f" - {file_path}: {error}")
        if len(failed_files) > 10:
            # Make the truncation of the list explicit.
            print(f" ... and {len(failed_files) - 10} more")
    else:
        print("\n✅ All files uploaded successfully!")
    print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
    print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
if __name__ == "__main__":
    # Script entry point: start the upload only when run directly, not on import.
    main()