Upload folder using huggingface_hub
Browse files
- src/debug_paths.py +20 -0
- src/upload_to_hf.py +61 -52
src/debug_paths.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import glob
|
| 3 |
+
from config import Config
|
| 4 |
+
|
| 5 |
+
# Report where the script is running and which results directory is configured.
cwd = os.getcwd()
print(f"Current Working Directory: {cwd}")
print(f"Config.RESULTS_DIR: {Config.RESULTS_DIR}")

# List every entry under the configured results directory whose name
# matches "checkpoint-*".
checkpoint_pattern = os.path.join(Config.RESULTS_DIR, "checkpoint-*")
candidates = glob.glob(checkpoint_pattern)
print(f"Found {len(candidates)} candidates:")
for c in candidates:
    print(f" - {c}")

# Nothing matched via Config.RESULTS_DIR: retry with a hard-coded path that
# is relative to the current working directory.
if not candidates:
    print("Trying relative path './results/checkpoint-*'...")
    candidates = glob.glob("./results/checkpoint-*")
    print(f"Found {len(candidates)} candidates via relative:")
    for c in candidates:
        print(f" - {c}")
|
src/upload_to_hf.py
CHANGED
|
@@ -1,85 +1,94 @@
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
import glob
|
|
|
|
| 4 |
from huggingface_hub import HfApi, create_repo, upload_folder
|
| 5 |
from config import Config
|
| 6 |
|
| 7 |
def main():
|
| 8 |
-
print("🚀 开始上传
|
| 9 |
|
| 10 |
-
# 1. 检测登录
|
| 11 |
api = HfApi()
|
| 12 |
try:
|
| 13 |
user_info = api.whoami()
|
| 14 |
username = user_info['name']
|
| 15 |
-
print(f"✅
|
| 16 |
-
except
|
| 17 |
-
print("❌
|
| 18 |
return
|
| 19 |
|
| 20 |
-
# 定义仓库名称
|
| 21 |
model_repo_id = f"{username}/sentiment-analysis-bert-finetuned"
|
| 22 |
-
dataset_repo_id = f"{username}/sentiment-analysis-dataset-processed"
|
| 23 |
-
|
| 24 |
-
# ========================================================
|
| 25 |
-
# 2. 上传模型与代码 (合并到一个 Model Repo)
|
| 26 |
-
# ========================================================
|
| 27 |
-
print(f"\n📦 正在准备模型仓库: {model_repo_id}")
|
| 28 |
-
create_repo(repo_id=model_repo_id, repo_type="model", exist_ok=True)
|
| 29 |
|
| 30 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
candidates = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
|
|
|
|
|
|
|
|
|
|
| 32 |
if candidates:
|
| 33 |
candidates.sort(key=os.path.getmtime)
|
| 34 |
latest_ckpt = candidates[-1]
|
| 35 |
-
print(f"
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
repo_id=model_repo_id,
|
| 41 |
-
repo_type="model",
|
| 42 |
-
ignore_patterns=["optimizer.pt", "scheduler.pt", "rng_state.pth"] # 剔除大文件
|
| 43 |
-
)
|
| 44 |
else:
|
| 45 |
-
print("
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
print("
|
| 49 |
-
|
|
|
|
| 50 |
upload_folder(
|
| 51 |
-
folder_path=
|
| 52 |
repo_id=model_repo_id,
|
| 53 |
-
repo_type="model"
|
| 54 |
-
ignore_patterns=[
|
| 55 |
-
"results/*", "data/*", "__pycache__", "*.pyc", ".git", ".DS_Store",
|
| 56 |
-
"env", "venv", ".venv", ".ipynb_checkpoints", "**/*.pt"
|
| 57 |
-
]
|
| 58 |
)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
| 64 |
data_path = os.path.join(Config.DATA_DIR, "processed_dataset")
|
| 65 |
if os.path.exists(data_path):
|
| 66 |
-
print(f"\n
|
| 67 |
create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
upload_folder(
|
| 71 |
-
folder_path=data_path,
|
| 72 |
-
repo_id=dataset_repo_id,
|
| 73 |
-
repo_type="dataset"
|
| 74 |
-
)
|
| 75 |
-
print(f"✅ 数据集已同步: https://huggingface.co/datasets/{dataset_repo_id}")
|
| 76 |
-
else:
|
| 77 |
-
print("⚠️ 未找到 data/processed_dataset,跳过数据集上传。")
|
| 78 |
-
|
| 79 |
-
print("\n🎉 全部上传任务完成!")
|
| 80 |
-
|
| 81 |
if __name__ == "__main__":
|
| 82 |
-
# 解决相对导入
|
| 83 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 84 |
parent_dir = os.path.dirname(current_dir)
|
| 85 |
sys.path.append(parent_dir)
|
|
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
import glob
|
| 4 |
+
import shutil
|
| 5 |
from huggingface_hub import HfApi, create_repo, upload_folder
|
| 6 |
from config import Config
|
| 7 |
|
| 8 |
def main():
    """Stage project code plus the latest checkpoint and upload them to the Hub.

    Steps:
      1. Resolve the logged-in Hugging Face user; return early if that fails.
      2. Build a fresh staging directory containing the project files and the
         model/tokenizer files of the most recently modified checkpoint
         (checkpoint files are flattened into the staging root).
      3. Create the model repo if needed, upload the staging directory, and
         always remove the staging directory afterwards.
      4. If ``Config.DATA_DIR/processed_dataset`` exists, upload it to a
         separate dataset repo.

    Project files and the staging directory are resolved relative to the
    current working directory.
    """
    print("🚀 开始重新上传 (Code + Model Combined)...")

    api = HfApi()
    try:
        user_info = api.whoami()
        username = user_info['name']
    except Exception:
        # Any failure to resolve the user is reported as "not logged in";
        # unlike a bare ``except:``, Ctrl-C and SystemExit still propagate.
        print("❌ Please login first.")
        return
    else:
        print(f"✅ User: {username}")

    model_repo_id = f"{username}/sentiment-analysis-bert-finetuned"

    # 1. Prepare a temporary upload directory (merge strategy): combine
    #    everything locally first so the repo layout is exactly what we want.
    upload_dir = "hf_upload_staging"
    if os.path.exists(upload_dir):
        shutil.rmtree(upload_dir)
    os.makedirs(upload_dir)

    try:
        print(f"📦 Staging files to {upload_dir}...")

        # A. Project code (src, notebooks, ...) goes at the staging root.
        _stage_project_files(upload_dir)

        # B. Model weights from the latest checkpoint, flattened to the root.
        _stage_latest_checkpoint(upload_dir)

        # 2. Upload the staged directory.
        print(f"\n⬆️ Uploading entire {upload_dir} to https://huggingface.co/{model_repo_id}")
        create_repo(repo_id=model_repo_id, repo_type="model", exist_ok=True)

        upload_folder(
            folder_path=upload_dir,
            repo_id=model_repo_id,
            repo_type="model"
        )
    finally:
        # Cleanup runs even when staging or the upload fails, so a failed run
        # does not leave a stale staging directory behind.
        shutil.rmtree(upload_dir, ignore_errors=True)

    print("🎉 Done! Model and Code are now together in the repo root.")

    # Check dataset
    dataset_repo_id = f"{username}/sentiment-analysis-dataset-processed"
    data_path = os.path.join(Config.DATA_DIR, "processed_dataset")
    if os.path.exists(data_path):
        print(f"\n⬆️ Uploading dataset to https://huggingface.co/datasets/{dataset_repo_id}")
        create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)
        upload_folder(folder_path=data_path, repo_id=dataset_repo_id, repo_type="dataset")


def _stage_project_files(upload_dir):
    """Copy the project's code and documents into the root of *upload_dir*."""
    items_to_copy = ["src", "notebooks", "docs", "demo", "README.md", "requirements.txt", "*.pptx"]
    # Keep interpreter/editor/OS artefacts out of the uploaded repo.
    skip_junk = shutil.ignore_patterns(
        "__pycache__", "*.pyc", ".DS_Store", ".ipynb_checkpoints"
    )
    for pattern in items_to_copy:
        for item in glob.glob(pattern):
            dest = os.path.join(upload_dir, item)
            if os.path.isdir(item):
                shutil.copytree(item, dest, dirs_exist_ok=True, ignore=skip_junk)
            else:
                shutil.copy2(item, dest)


def _stage_latest_checkpoint(upload_dir):
    """Copy model/tokenizer files of the newest checkpoint into *upload_dir*.

    "Newest" means the ``checkpoint-*`` directory under ``Config.RESULTS_DIR``
    with the latest modification time. Prints a message and copies nothing
    when no checkpoint directory exists; prints a warning when the checkpoint
    contains no file that looks like model weights.
    """
    pattern = os.path.join(Config.RESULTS_DIR, "checkpoint-*")
    # Only directories count; this filters out e.g. zipped checkpoints.
    candidates = [c for c in glob.glob(pattern) if os.path.isdir(c)]
    if not candidates:
        print("❌ No checkpoints found in results/!")
        return

    latest_ckpt = sorted(candidates, key=os.path.getmtime)[-1]
    print(f"✅ Found latest checkpoint: {latest_ckpt}")

    # Files to copy from the checkpoint to the staging root.
    model_files = {
        "config.json",
        "model.safetensors",
        "pytorch_model.bin",
        "tokenizer.json",
        "vocab.txt",
        "tokenizer_config.json",
        "special_tokens_map.json",
    }

    found_weights = False
    for fname in os.listdir(latest_ckpt):
        if fname in model_files or fname.endswith((".safetensors", ".bin")):
            shutil.copy2(os.path.join(latest_ckpt, fname), os.path.join(upload_dir, fname))
            if "model" in fname or "pytorch" in fname:
                found_weights = True

    if not found_weights:
        print("⚠️ WARNING: No model weights (.bin or .safetensors) found in checkpoint!")
|
| 90 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
if __name__ == "__main__":
|
|
|
|
| 92 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 93 |
parent_dir = os.path.dirname(current_dir)
|
| 94 |
sys.path.append(parent_dir)
|