File size: 7,224 Bytes
d31d787
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python3
"""
Upload KeyVID model to Hugging Face Hub (Optimized for Speed)
"""

from pathlib import Path
from huggingface_hub import HfApi, login, upload_folder
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# Model repository name on Hugging Face (passed as ``repo_id`` to the upload calls)
MODEL_ID = "RyanWW/KeyVID"

# Path to the local KeyVID directory whose contents are uploaded
KEYVID_PATH = "/dockerx/groups/KeyVID_hf_model"

# Upload configuration
MAX_WORKERS = 8  # number of concurrent upload threads (used for the per-file thread pool)
# 100MB chunks for large files.
# NOTE(review): CHUNK_SIZE is not referenced anywhere in the visible code.
CHUNK_SIZE = 100 * 1024 * 1024

def should_exclude_file(file_path, exclude_patterns):
    """Return True if ``file_path`` matches any of ``exclude_patterns``.

    Patterns use shell-style wildcards (``fnmatch``) and are case-sensitive:

    * A pattern without ``/`` (e.g. ``"*.pyc"``, ``".git"``) is compared
      against each individual path component, so ``".git"`` excludes
      ``.git/config`` but not ``.gitattributes``.
    * A pattern ending in ``/`` (e.g. ``"save_results/"``) only matches
      directory components, i.e. every component except the final one.
    * A pattern with an inner ``/`` is matched against the whole path in
      POSIX form.

    Args:
        file_path: Path (``str`` or ``Path``), normally relative to the
            upload root.
        exclude_patterns: Iterable of pattern strings. Empty patterns are
            ignored.

    Returns:
        bool: True when the file should be skipped.
    """
    # Local import so the function does not rely on a new module-level import.
    from fnmatch import fnmatchcase

    path = Path(file_path)
    parts = path.parts
    if not parts:
        return False
    posix_path = path.as_posix()

    for raw_pattern in exclude_patterns:
        directory_only = raw_pattern.endswith("/")
        pattern = raw_pattern.rstrip("/")
        if not pattern:
            # An empty pattern would otherwise match every file.
            continue

        if "/" in pattern:
            # Multi-component pattern: compare against the full path.
            target = pattern + "/*" if directory_only else pattern
            if fnmatchcase(posix_path, target):
                return True
            continue

        # Single-component pattern: whole-component match instead of a
        # substring test, so "upload.py" does not hit "batch_upload.py".
        candidates = parts[:-1] if directory_only else parts
        if any(fnmatchcase(part, pattern) for part in candidates):
            return True

    return False

def get_files_to_upload(keyvid_dir, exclude_patterns):
    """Collect the files under ``keyvid_dir`` that are not excluded.

    Walks the directory recursively, skips anything that is not a regular
    file, and filters the remaining paths (relative to ``keyvid_dir``)
    through ``should_exclude_file``.

    Args:
        keyvid_dir: ``Path`` of the directory to scan.
        exclude_patterns: Patterns forwarded to ``should_exclude_file``.

    Returns:
        tuple: ``(files, total_size)`` where ``files`` is a list of
        ``(relative_path, size_in_bytes)`` tuples and ``total_size`` is the
        sum of those sizes.
    """
    print("🔍 Scanning files...")

    selected = []
    size_sum = 0
    for entry in tqdm(keyvid_dir.rglob("*"), desc="Scanning"):
        if not entry.is_file():
            continue

        rel = entry.relative_to(keyvid_dir)
        if should_exclude_file(rel, exclude_patterns):
            continue

        nbytes = entry.stat().st_size
        selected.append((rel, nbytes))
        size_sum += nbytes

    return selected, size_sum

def format_size(size_bytes):
    """Render a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the ``TB``
    unit has been passed; anything larger is reported in ``PB``.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        str: For example ``"1.50 KB"``.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    index = 0
    while index < len(units):
        if value < 1024.0:
            return f"{value:.2f} {units[index]}"
        value = value / 1024.0
        index += 1
    return f"{value:.2f} PB"

def upload_file_wrapper(args):
    """Upload one file and report the outcome instead of raising.

    Designed to be submitted to an executor, hence the single tuple
    argument.

    Args:
        args: Tuple ``(api, keyvid_dir, file_path, repo_id)``. ``api`` must
            provide an ``upload_file`` method, ``keyvid_dir`` is the local
            root directory and ``file_path`` the path relative to it (also
            used as the path inside the repository).

    Returns:
        tuple: ``(file_path, True, None)`` on success, or
        ``(file_path, False, error_message)`` if any exception was raised.
    """
    api, keyvid_dir, file_path, repo_id = args
    try:
        local_file = keyvid_dir / file_path
        upload_kwargs = {
            "path_or_fileobj": str(local_file),
            "path_in_repo": str(file_path),
            "repo_id": repo_id,
            "repo_type": "model",
            "commit_message": "Upload KeyVID model files",
        }
        api.upload_file(**upload_kwargs)
    except Exception as exc:
        # Best-effort: the caller collects failures rather than aborting.
        return (file_path, False, str(exc))
    else:
        return (file_path, True, None)

def _upload_files_concurrently(api, keyvid_dir, files_to_upload):
    """Upload each file individually on a thread pool and return the failures.

    Args:
        api: ``HfApi`` instance used for the uploads.
        keyvid_dir: Local root directory the relative paths belong to.
        files_to_upload: List of ``(relative_path, size)`` tuples.

    Returns:
        list: ``(relative_path, error_message)`` for every failed upload.
    """
    failed_files = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(upload_file_wrapper, (api, keyvid_dir, file_path, MODEL_ID))
            for file_path, _ in files_to_upload
        ]

        # Advance the bar for every finished task, successful or not.
        with tqdm(total=len(futures), desc="Uploading") as pbar:
            for future in as_completed(futures):
                file_path, success, error = future.result()
                if not success:
                    failed_files.append((file_path, error))
                pbar.update(1)
    return failed_files


def main():
    """Upload the KeyVID directory to the Hugging Face Hub.

    Steps:
      1. Verify that a usable Hugging Face token is available.
      2. Scan ``KEYVID_PATH`` and print file count / total size.
      3. Ask for confirmation on stdin.
      4. Try ``upload_folder``; if it raises, fall back to uploading every
         file individually on a thread pool.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    print("🚀 Starting KeyVID upload to Hugging Face (Optimized)...")
    print(f"Repository: {MODEL_ID}")

    # Check authentication. Constructing HfApi() alone never fails, so ask the
    # Hub who we are; this raises when no valid token is configured.
    try:
        api = HfApi()
        api.whoami()
        print("✅ Hugging Face authentication found")
    except Exception as e:
        print("⚠️  Need to authenticate with Hugging Face")
        print("Please run: huggingface-cli login")
        print("Or set HF_TOKEN environment variable")
        print(f"   Reason: {e}")
        return

    # Path to upload
    keyvid_dir = Path(KEYVID_PATH)

    # is_dir() rather than exists(): a regular file at this path is not usable.
    if not keyvid_dir.is_dir():
        print(f"❌ Error: KeyVID directory not found at {KEYVID_PATH}")
        return

    print(f"\n📁 Directory: {keyvid_dir}")

    # Exclusion patterns used for the local scan and the per-file fallback
    exclude_patterns = [
        "__pycache__",
        ".git",
        "*.pyc",
        ".DS_Store",
        "save_results/",  # skip the results directory
        "*.log",
        "*.tmp",
        "error.txt",
        ".bash_history",
        ".gitignore",
        "upload.py",  # skip the upload script itself
    ]

    # Build the file list
    files_to_upload, total_size = get_files_to_upload(keyvid_dir, exclude_patterns)

    print("\n📊 Statistics:")
    print(f"   Files to upload: {len(files_to_upload)}")
    print(f"   Total size: {format_size(total_size)}")

    if not files_to_upload:
        print("⚠️  No files to upload!")
        return

    # Ask the user for confirmation; a closed stdin counts as "no".
    try:
        response = input("\n❓ Proceed with upload? (y/n): ").strip().lower()
    except EOFError:
        response = ""
    if response != 'y':
        print("❌ Upload cancelled")
        return

    # Informational split by size (> 100MB counts as large)
    large_threshold = 100 * 1024 * 1024
    large_count = sum(1 for _, size in files_to_upload if size > large_threshold)
    small_count = len(files_to_upload) - large_count

    print("\n📦 Upload strategy:")
    print(f"   Large files (>100MB): {large_count}")
    print(f"   Small files: {small_count}")
    print(f"   Concurrent workers: {MAX_WORKERS}")

    # Ignore patterns for upload_folder, kept in sync with exclude_patterns so
    # the statistics above describe what is actually uploaded.
    nested_patterns = [
        "**/__pycache__/**",
        "**/.git/**",
        "**/*.pyc",
        "**/.DS_Store",
        "**/save_results/**",
        "**/*.log",
        "**/*.tmp",
        "**/error.txt",
        "**/.bash_history",
        "**/.gitignore",
        "**/upload.py",
    ]
    # NOTE(review): huggingface_hub appears to filter paths with fnmatch-style
    # matching, where a leading "**/" requires at least one directory
    # separator and therefore misses top-level entries. Adding the patterns
    # without that prefix is harmless either way — confirm against the
    # installed version.
    ignore_patterns = nested_patterns + [p[len("**/"):] for p in nested_patterns]

    # Method 1: upload_folder
    print("\n⬆️  Uploading using optimized upload_folder...")
    start_time = time.time()
    try:
        # NOTE(review): multi_commits / multi_commits_verbose may not be
        # accepted by newer huggingface_hub releases; if the call raises, the
        # per-file fallback below takes over. Confirm against the installed
        # version.
        upload_folder(
            folder_path=str(keyvid_dir),
            repo_id=MODEL_ID,
            repo_type="model",
            ignore_patterns=ignore_patterns,
            commit_message="Upload KeyVID model files",
            multi_commits=True,  # allow the upload to be split into several commits
            multi_commits_verbose=True,  # show detailed progress
        )
    except Exception as e:
        print(f"❌ Error with upload_folder: {e}")
        print("\n📝 Falling back to concurrent file upload...")
    else:
        elapsed_time = time.time() - start_time
        print("\n✅ Upload complete!")
        print(f"⏱️  Time taken: {elapsed_time/60:.2f} minutes")
        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")
        return

    # Method 2: upload files individually in parallel (fallback)
    start_time = time.time()
    failed_files = _upload_files_concurrently(api, keyvid_dir, files_to_upload)
    elapsed_time = time.time() - start_time

    if failed_files:
        shown = failed_files[:10]
        print(f"\n⚠️  {len(failed_files)} files failed to upload:")
        for file_path, error in shown:
            print(f"   - {file_path}: {error}")
        if len(failed_files) > len(shown):
            # Make the truncation of the list visible.
            print(f"   ... and {len(failed_files) - len(shown)} more")
        print(f"⏱️  Time taken: {elapsed_time/60:.2f} minutes")
    else:
        print("\n✅ All files uploaded successfully!")
        print(f"⏱️  Time taken: {elapsed_time/60:.2f} minutes")
        print(f"🔗 View model at: https://huggingface.co/{MODEL_ID}")

# Run the upload only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()