File size: 7,224 Bytes

d31d787

#!/usr/bin/env python3
"""
Upload KeyVID model to Hugging Face Hub (Optimized for Speed)
"""

import fnmatch
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

from huggingface_hub import HfApi, login, upload_folder
from tqdm import tqdm

# Model repository name on Hugging Face
MODEL_ID = "RyanWW/KeyVID"

# Path to the KeyVID directory
KEYVID_PATH = "/dockerx/groups/KeyVID_hf_model"

# Upload configuration
MAX_WORKERS = 8  # number of concurrent upload threads
# NOTE(review): CHUNK_SIZE does not appear to be referenced anywhere in this
# file as shown -- confirm whether it is still needed.
CHUNK_SIZE = 100 * 1024 * 1024  # 100MB chunks for large files

def should_exclude_file(file_path, exclude_patterns):
    """Return True if ``file_path`` matches any of ``exclude_patterns``.

    Patterns are shell-style globs (``fnmatch`` syntax, case-sensitive) and
    are matched against whole path components rather than raw substrings, so
    ``.git`` excludes ``.git/config`` but not ``.gitattributes``:

    * ``name`` or ``*.ext`` -- excluded when any component of the path
      (a directory or the file name itself) matches.
    * ``name/`` (trailing slash) -- excluded only when a *directory*
      component of the path matches.
    * ``a/b`` (inner slash) -- anchored at the start of the path; excludes
      that exact path and everything below it.

    Args:
        file_path: Relative path of the file, as ``str`` or ``Path``.
        exclude_patterns: Iterable of pattern strings. Empty patterns are
            ignored.

    Returns:
        bool: True when the file should be skipped.
    """
    parts = Path(file_path).parts
    posix_path = "/".join(parts)

    for raw_pattern in exclude_patterns:
        directories_only = raw_pattern.endswith("/")
        pattern = raw_pattern.rstrip("/")
        if not pattern:
            # An empty pattern would otherwise match every path.
            continue

        if "/" in pattern:
            # Multi-component pattern: match from the root of the relative path.
            if not directories_only and fnmatch.fnmatchcase(posix_path, pattern):
                return True
            if fnmatch.fnmatchcase(posix_path, pattern + "/*"):
                return True
            continue

        # Directory-only patterns must not match the final (file) component.
        candidates = parts[:-1] if directories_only else parts
        if any(fnmatch.fnmatchcase(part, pattern) for part in candidates):
            return True

    return False

def get_files_to_upload(keyvid_dir, exclude_patterns):
    """Collect the files under ``keyvid_dir`` that survive the exclusion filter.

    Args:
        keyvid_dir: ``Path`` of the directory to walk recursively.
        exclude_patterns: Patterns forwarded to ``should_exclude_file``.

    Returns:
        tuple: ``(files, total_size)`` where ``files`` is a list of
        ``(relative_path, size_in_bytes)`` pairs and ``total_size`` is the
        sum of those sizes.
    """
    print("🔍 Scanning files...")

    selected = []
    for entry in tqdm(keyvid_dir.rglob("*"), desc="Scanning"):
        # Directories and other non-file entries are never uploaded.
        if not entry.is_file():
            continue

        rel_path = entry.relative_to(keyvid_dir)
        if should_exclude_file(rel_path, exclude_patterns):
            continue

        selected.append((rel_path, entry.stat().st_size))

    return selected, sum(size for _, size in selected)

def format_size(size_bytes):
    """Render a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024, walking through
    B, KB, MB, GB and TB; anything larger is reported in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    index = 0
    while index < len(units):
        if value < 1024.0:
            return f"{value:.2f} {units[index]}"
        value = value / 1024.0
        index += 1
    # Still >= 1024 TB after exhausting the table: report in petabytes.
    return f"{value:.2f} PB"

def upload_file_wrapper(args):
    """Upload one file; packaged as a single-argument task for a thread pool.

    Args:
        args: Tuple ``(api, keyvid_dir, file_path, repo_id)`` where ``api``
            exposes ``upload_file``, ``keyvid_dir`` is the local root
            directory (``Path``), ``file_path`` is the file's path relative
            to that root, and ``repo_id`` is the target repository.

    Returns:
        tuple: ``(file_path, True, None)`` on success, or
        ``(file_path, False, error_message)`` if the upload raised.
    """
    api, keyvid_dir, file_path, repo_id = args
    try:
        full_path = keyvid_dir / file_path
        api.upload_file(
            path_or_fileobj=str(full_path),
            # Repository paths always use forward slashes; str() of a path
            # would produce backslashes on Windows.
            path_in_repo=Path(file_path).as_posix(),
            repo_id=repo_id,
            repo_type="model",
            commit_message="Upload KeyVID model files",
        )
    except Exception as e:
        # Deliberately broad: the failure is reported back to the caller so
        # one bad file does not abort the remaining uploads.
        return (file_path, False, str(e))
    return (file_path, True, None)

def _build_ignore_patterns(exclude_patterns):
    """Translate name-based exclusions into ``upload_folder`` ignore patterns.

    Each name yields patterns for the entry itself and for everything below
    a directory of that name, both at the repository root and nested. The
    bare ``name`` / ``name/**`` forms are needed because under fnmatch
    semantics ``**/name`` requires a literal ``/`` and therefore never
    matches a root-level entry.
    """
    ignore_patterns = []
    for pattern in exclude_patterns:
        name = pattern.rstrip("/")
        if not name:
            continue
        if not pattern.endswith("/"):
            # A trailing slash means "directories only": skip the file forms.
            ignore_patterns += [name, f"**/{name}"]
        ignore_patterns += [f"{name}/**", f"**/{name}/**"]
    return ignore_patterns


def _upload_files_concurrently(api, keyvid_dir, files_to_upload):
    """Upload files one at a time through a thread pool.

    Returns a list of ``(file_path, error_message)`` for every failed upload.
    """
    failed_files = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(
                upload_file_wrapper, (api, keyvid_dir, file_path, MODEL_ID)
            )
            for file_path, _ in files_to_upload
        ]
        with tqdm(total=len(futures), desc="Uploading") as pbar:
            for future in as_completed(futures):
                file_path, success, error = future.result()
                if not success:
                    failed_files.append((file_path, error))
                pbar.update(1)
    return failed_files


def main():
    """Scan ``KEYVID_PATH`` and upload its contents to the ``MODEL_ID`` repo.

    Steps: verify Hugging Face credentials, list the files that survive the
    exclusion patterns, ask for confirmation, then upload with
    ``upload_folder``. If that call raises, fall back to uploading each file
    individually through a thread pool. Returns ``None`` in every case;
    problems are reported on stdout.
    """
    print("🚀 Starting KeyVID upload to Hugging Face (Optimized)...")
    print(f"Repository: {MODEL_ID}")

    # Check authentication. Constructing HfApi never fails on a missing
    # token, so ask the Hub who we are to actually validate the credentials.
    api = HfApi()
    try:
        api.whoami()
    except Exception as e:
        print("⚠️  Need to authenticate with Hugging Face")
        print(f"   Reason: {e}")
        print("Please run: huggingface-cli login")
        print("Or set HF_TOKEN environment variable")
        return
    print("✅ Hugging Face authentication found")

    # Paths to upload
    keyvid_dir = Path(KEYVID_PATH)

    if not keyvid_dir.is_dir():
        print(f"❌ Error: KeyVID directory not found at {KEYVID_PATH}")
        return

    print(f"\n📁 Directory: {keyvid_dir}")

    # Single source of truth for exclusions: used both for the local scan
    # and (translated) for upload_folder, so the statistics shown to the
    # user describe what is actually uploaded.
    exclude_patterns = [
        "__pycache__",
        ".git",
        "*.pyc",
        ".DS_Store",
        "save_results/",  # skip the results directory
        "*.log",
        "*.tmp",
        "error.txt",
        ".bash_history",
        ".gitignore",
        "upload.py",  # skip the upload script itself
    ]

    # Collect the file list
    files_to_upload, total_size = get_files_to_upload(keyvid_dir, exclude_patterns)

    print("\n📊 Statistics:")
    print(f"   Files to upload: {len(files_to_upload)}")
    print(f"   Total size: {format_size(total_size)}")

    if not files_to_upload:
        print("⚠️  No files to upload!")
        return

    # Ask the user to confirm
    response = input("\n❓ Proceed with upload? (y/n): ").strip().lower()
    if response != 'y':
        print("❌ Upload cancelled")
        return

    # Informational breakdown only; both upload paths handle every file.
    large_file_threshold = 100 * 1024 * 1024  # 100MB
    large_count = sum(1 for _, size in files_to_upload if size > large_file_threshold)
    small_count = len(files_to_upload) - large_count

    print("\n📦 Upload strategy:")
    print(f"   Large files (>100MB): {large_count}")
    print(f"   Small files: {small_count}")
    print(f"   Concurrent workers: {MAX_WORKERS}")

    # Method 1: upload_folder (single call for the whole directory)
    print("\n⬆️  Uploading using optimized upload_folder...")
    start_time = time.time()
    try:
        # NOTE(review): multi_commits / multi_commits_verbose were
        # experimental options; newer huggingface_hub releases may reject
        # them, which would always trigger the fallback below -- confirm
        # against the installed version.
        upload_folder(
            folder_path=str(keyvid_dir),
            repo_id=MODEL_ID,
            repo_type="model",
            ignore_patterns=_build_ignore_patterns(exclude_patterns),
            commit_message="Upload KeyVID model files",
            multi_commits=True,  # allow the upload to be split into several commits
            multi_commits_verbose=True,  # show detailed progress
        )
    except Exception as e:
        # Deliberately broad: any failure here switches to the fallback.
        print(f"❌ Error with upload_folder: {e}")
        print("\n📝 Falling back to concurrent file upload...")
    else:
        elapsed_time = time.time() - start_time
        print("\n✅ Upload complete!")
        print(f"⏱️  Time taken: {elapsed_time/60:.2f} minutes")
        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
        return

    # Method 2: upload files individually and concurrently (fallback)
    start_time = time.time()
    failed_files = _upload_files_concurrently(api, keyvid_dir, files_to_upload)
    elapsed_time = time.time() - start_time

    if failed_files:
        shown = failed_files[:10]
        print(f"\n⚠️  {len(failed_files)} files failed to upload:")
        for file_path, error in shown:
            print(f"   - {file_path}: {error}")
        if len(failed_files) > len(shown):
            print(f"   ... and {len(failed_files) - len(shown)} more")
        print(f"⏱️  Time taken: {elapsed_time/60:.2f} minutes")
    else:
        print("\n✅ All files uploaded successfully!")
        print(f"⏱️  Time taken: {elapsed_time/60:.2f} minutes")
        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")

# Run the upload only when executed as a script, not when imported.
if __name__ == "__main__":
    main()