#!/usr/bin/env python3
"""
Upload KeyVID model to Hugging Face Hub (Optimized for Speed)

The script scans ``KEYVID_PATH``, reports what would be uploaded, asks for
confirmation, and then pushes the folder to ``MODEL_ID`` with
``upload_folder``.  If that call fails, it falls back to uploading the
files one by one from a thread pool.
"""

import inspect
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from fnmatch import fnmatchcase
from pathlib import Path

from huggingface_hub import HfApi, login, upload_folder
from tqdm import tqdm

# Model repository name on Hugging Face
MODEL_ID = "RyanWW/KeyVID"

# Path to the KeyVID directory
KEYVID_PATH = "/dockerx/groups/KeyVID_hf_model"

# Upload configuration
MAX_WORKERS = 8  # number of concurrent upload threads
CHUNK_SIZE = 100 * 1024 * 1024  # 100MB chunks for large files
LARGE_FILE_THRESHOLD = 100 * 1024 * 1024  # files above this count as "large"

# Single source of truth for what must never be uploaded.  A trailing "/"
# marks a directory name; everything else is matched against each path
# component (directory or file name) with shell-style wildcards.
EXCLUDE_PATTERNS = [
    "__pycache__",
    ".git",
    "*.pyc",
    ".DS_Store",
    "save_results/",  # results directory
    "*.log",
    "*.tmp",
    "error.txt",
    ".bash_history",
    ".gitignore",
    "upload.py",  # the upload script itself
]


def should_exclude_file(file_path, exclude_patterns):
    """Return True if *file_path* matches any of *exclude_patterns*.

    Args:
        file_path: Path (or string) of the file, normally relative to the
            upload root.
        exclude_patterns: Iterable of shell-style patterns.  A pattern ending
            in "/" is compared only against the directory components of the
            path; any other pattern is compared against every component,
            including the file name.  Empty patterns are ignored.

    Matching is done per path component and is case-sensitive, so ".git"
    excludes the ".git" directory but not ".gitattributes", and "*.log"
    excludes "run.log" but not "model.logits.bin".
    """
    components = Path(str(file_path)).parts
    if not components:
        return False
    directories = components[:-1]

    for pattern in exclude_patterns:
        name = pattern.rstrip("/")
        if not name:
            continue
        candidates = directories if pattern.endswith("/") else components
        if any(fnmatchcase(part, name) for part in candidates):
            return True
    return False


def _ignore_globs(exclude_patterns):
    """Translate component-style exclude patterns into fnmatch path globs.

    The result is meant for ``upload_folder(ignore_patterns=...)``.  fnmatch's
    "*" also matches "/", so every name is emitted both anchored at the
    repository root and behind a "*/" prefix; a "**/name" glob alone would
    miss entries sitting directly in the root folder.

    NOTE(review): for patterns with a wildcard in the middle (e.g. "a*b") the
    resulting glob can match across a "/" and is therefore slightly broader
    than should_exclude_file; none of the patterns in EXCLUDE_PATTERNS are
    affected.
    """
    globs = []
    for pattern in exclude_patterns:
        name = pattern.rstrip("/")
        if not name:
            continue
        if not pattern.endswith("/"):
            # The name may be the file itself, at the root or nested.
            globs.extend([name, f"*/{name}"])
        # The name may be a directory anywhere above the file.
        globs.extend([f"{name}/*", f"*/{name}/*"])
    return globs


def get_files_to_upload(keyvid_dir, exclude_patterns):
    """Collect the files under *keyvid_dir* that are not excluded.

    Args:
        keyvid_dir: ``Path`` of the directory to scan recursively.
        exclude_patterns: Patterns forwarded to ``should_exclude_file``.

    Returns:
        A tuple ``(files, total_size)`` where ``files`` is a list of
        ``(relative_path, size_in_bytes)`` pairs and ``total_size`` is the
        sum of those sizes.
    """
    files = []
    total_size = 0

    print("🔍 Scanning files...")
    for file_path in tqdm(keyvid_dir.rglob("*"), desc="Scanning"):
        if not file_path.is_file():
            continue
        relative_path = file_path.relative_to(keyvid_dir)
        if should_exclude_file(relative_path, exclude_patterns):
            continue
        file_size = file_path.stat().st_size
        files.append((relative_path, file_size))
        total_size += file_size

    return files, total_size


def format_size(size_bytes):
    """Format a byte count as a human-readable string (1024-based units)."""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.2f} PB"


def upload_file_wrapper(args):
    """Upload one file; shaped for use with ``ThreadPoolExecutor.submit``.

    Args:
        args: Tuple ``(api, keyvid_dir, file_path, repo_id)`` where
            ``file_path`` is relative to ``keyvid_dir``.

    Returns:
        ``(file_path, True, None)`` on success, or
        ``(file_path, False, error_message)`` if the upload raised.
        Exceptions are returned rather than raised so that one failing file
        does not abort the remaining uploads.
    """
    api, keyvid_dir, file_path, repo_id = args
    try:
        full_path = keyvid_dir / file_path
        api.upload_file(
            path_or_fileobj=str(full_path),
            # Repository paths always use forward slashes, whatever the OS.
            path_in_repo=Path(file_path).as_posix(),
            repo_id=repo_id,
            repo_type="model",
            commit_message="Upload KeyVID model files",
        )
        return (file_path, True, None)
    except Exception as e:
        return (file_path, False, str(e))


def _upload_files_concurrently(api, keyvid_dir, files_to_upload):
    """Fallback path: upload every file individually from a thread pool."""
    start_time = time.time()
    failed_files = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(
                upload_file_wrapper, (api, keyvid_dir, file_path, MODEL_ID)
            )
            for file_path, _ in files_to_upload
        ]

        with tqdm(total=len(files_to_upload), desc="Uploading") as pbar:
            for future in as_completed(futures):
                file_path, success, error = future.result()
                if not success:
                    failed_files.append((file_path, error))
                pbar.update(1)

    elapsed_time = time.time() - start_time

    if failed_files:
        print(f"\n⚠️ {len(failed_files)} files failed to upload:")
        for file_path, error in failed_files[:10]:
            print(f" - {file_path}: {error}")
        if len(failed_files) > 10:
            print(f" ... and {len(failed_files) - 10} more")
    else:
        print("\n✅ All files uploaded successfully!")

    print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
    print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")


def main():
    """Scan the model directory, confirm with the user, and upload it."""
    print("🚀 Starting KeyVID upload to Hugging Face (Optimized)...")
    print(f"Repository: {MODEL_ID}")

    # Check authentication.  Constructing HfApi() does not validate any
    # credentials, so ask the Hub who we are instead.
    api = HfApi()
    try:
        api.whoami()
    except Exception as e:
        print("⚠️ Need to authenticate with Hugging Face")
        print("Please run: huggingface-cli login")
        print("Or set HF_TOKEN environment variable")
        print(f"Details: {e}")
        return
    print("✅ Hugging Face authentication found")

    # Paths to upload
    keyvid_dir = Path(KEYVID_PATH)
    if not keyvid_dir.is_dir():
        print(f"❌ Error: KeyVID directory not found at {KEYVID_PATH}")
        return

    print(f"\n📁 Directory: {keyvid_dir}")

    exclude_patterns = EXCLUDE_PATTERNS

    # Build the file list
    files_to_upload, total_size = get_files_to_upload(keyvid_dir, exclude_patterns)

    print("\n📊 Statistics:")
    print(f" Files to upload: {len(files_to_upload)}")
    print(f" Total size: {format_size(total_size)}")

    if not files_to_upload:
        print("⚠️ No files to upload!")
        return

    # Ask the user for confirmation
    response = input("\n❓ Proceed with upload? (y/n): ").strip().lower()
    if response != 'y':
        print("❌ Upload cancelled")
        return

    # Report how the payload splits into large and small files
    large_files = [f for f, s in files_to_upload if s > LARGE_FILE_THRESHOLD]
    small_files = [f for f, s in files_to_upload if s <= LARGE_FILE_THRESHOLD]

    print("\n📦 Upload strategy:")
    print(f" Large files (>100MB): {len(large_files)}")
    print(f" Small files: {len(small_files)}")
    print(f" Concurrent workers: {MAX_WORKERS}")

    # Method 1: upload_folder (preferred)
    print("\n⬆️ Uploading using optimized upload_folder...")

    # NOTE(review): multi_commits / multi_commits_verbose were experimental
    # and appear to be absent from newer huggingface_hub releases — confirm
    # against the installed version.  Only pass the ones the installed
    # upload_folder accepts, so a missing keyword does not raise TypeError
    # and push every run into the slow per-file fallback.
    supported = inspect.signature(upload_folder).parameters
    optional_kwargs = {
        name: True
        for name in ("multi_commits", "multi_commits_verbose")
        if name in supported
    }

    start_time = time.time()
    try:
        upload_folder(
            folder_path=str(keyvid_dir),
            repo_id=MODEL_ID,
            repo_type="model",
            # Derived from the same list used for the scan above so the
            # statistics shown to the user match what is actually uploaded.
            ignore_patterns=_ignore_globs(exclude_patterns),
            commit_message="Upload KeyVID model files",
            **optional_kwargs,
        )
    except Exception as e:
        print(f"❌ Error with upload_folder: {e}")
        print("\n📝 Falling back to concurrent file upload...")

        # Method 2: upload individual files concurrently (fallback)
        _upload_files_concurrently(api, keyvid_dir, files_to_upload)
        return

    elapsed_time = time.time() - start_time
    print("\n✅ Upload complete!")
    print(f"⏱️ Time taken: {elapsed_time/60:.2f} minutes")
    print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")


if __name__ == "__main__":
    main()