robot4 committed on
Commit
89f9a3e
·
verified ·
1 Parent(s): bda2946

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. src/debug_paths.py +20 -0
  2. src/upload_to_hf.py +61 -52
src/debug_paths.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Debug helper: verify where trainer checkpoints are searched for.

Prints the current working directory and the configured results directory,
then lists every ``checkpoint-*`` path found there; if nothing matches,
falls back to probing the relative ``./results`` directory.
"""
import os
import glob

from config import Config


def _report(candidates, suffix=""):
    # Single place for the "Found N candidates" summary so both probes
    # (configured path and relative fallback) print identically.
    print(f"Found {len(candidates)} candidates{suffix}:")
    for c in candidates:
        print(f" - {c}")


print(f"Current Working Directory: {os.getcwd()}")
print(f"Config.RESULTS_DIR: {Config.RESULTS_DIR}")

# Debug Finding Checkpoints
candidates = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
_report(candidates)

if not candidates:
    # Try relative path manual
    print("Trying relative path './results/checkpoint-*'...")
    candidates = glob.glob("./results/checkpoint-*")
    _report(candidates, " via relative")
src/upload_to_hf.py CHANGED
@@ -1,85 +1,94 @@
1
  import os
2
  import sys
3
  import glob
 
4
  from huggingface_hub import HfApi, create_repo, upload_folder
5
  from config import Config
6
 
7
  def main():
8
- print("🚀 开始上传全套项目 (代码 + 模型 + 数据) 到 Hugging Face...")
9
 
10
- # 1. 检测登录
11
  api = HfApi()
12
  try:
13
  user_info = api.whoami()
14
  username = user_info['name']
15
- print(f"✅ 当前登录用户: {username}")
16
- except Exception as e:
17
- print("❌ 未检测到登录状态!请先运行 'huggingface-cli login'")
18
  return
19
 
20
- # 定义仓库名称
21
  model_repo_id = f"{username}/sentiment-analysis-bert-finetuned"
22
- dataset_repo_id = f"{username}/sentiment-analysis-dataset-processed"
23
-
24
- # ========================================================
25
- # 2. 上传模型与代码 (合并到一个 Model Repo)
26
- # ========================================================
27
- print(f"\n📦 正在准备模型仓库: {model_repo_id}")
28
- create_repo(repo_id=model_repo_id, repo_type="model", exist_ok=True)
29
 
30
- # A. 上传最新模型权重 (到根目录)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  candidates = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
 
 
 
32
  if candidates:
33
  candidates.sort(key=os.path.getmtime)
34
  latest_ckpt = candidates[-1]
35
- print(f" ➡️ 发现最新模型: {latest_ckpt}")
36
- print(" ⬆️ 正在上传模型权重 (model.safetensors 等)... 这可能需要几分钟")
 
 
 
 
 
 
 
 
 
 
37
 
38
- upload_folder(
39
- folder_path=latest_ckpt,
40
- repo_id=model_repo_id,
41
- repo_type="model",
42
- ignore_patterns=["optimizer.pt", "scheduler.pt", "rng_state.pth"] # 剔除大文件
43
- )
44
  else:
45
- print(" ⚠️ 未找到 checkpoint,跳过模型权重上传。")
46
 
47
- # B. 上传项目代码 (到根目录)
48
- print(" ⬆️ 正在上传项目代码 (src, notebook, docs...)...")
49
- # 我们上传当前目录 '.',但要排除 data, results, venv 等杂物
 
50
  upload_folder(
51
- folder_path=".",
52
  repo_id=model_repo_id,
53
- repo_type="model",
54
- ignore_patterns=[
55
- "results/*", "data/*", "__pycache__", "*.pyc", ".git", ".DS_Store",
56
- "env", "venv", ".venv", ".ipynb_checkpoints", "**/*.pt"
57
- ]
58
  )
59
- print(f"✅ 代码与模型已同步: https://huggingface.co/{model_repo_id}")
60
-
61
- # ========================================================
62
- # 3. 上传数据集
63
- # ========================================================
 
 
64
  data_path = os.path.join(Config.DATA_DIR, "processed_dataset")
65
  if os.path.exists(data_path):
66
- print(f"\n📚 正在准备数据集仓库: {dataset_repo_id}")
67
  create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)
68
-
69
- print(" ⬆️ 正在上传数据集...")
70
- upload_folder(
71
- folder_path=data_path,
72
- repo_id=dataset_repo_id,
73
- repo_type="dataset"
74
- )
75
- print(f"✅ 数据集已同步: https://huggingface.co/datasets/{dataset_repo_id}")
76
- else:
77
- print("⚠️ 未找到 data/processed_dataset,跳过数据集上传。")
78
-
79
- print("\n🎉 全部上传任务完成!")
80
-
81
  if __name__ == "__main__":
82
- # 解决相对导入
83
  current_dir = os.path.dirname(os.path.abspath(__file__))
84
  parent_dir = os.path.dirname(current_dir)
85
  sys.path.append(parent_dir)
 
1
  import os
2
  import sys
3
  import glob
4
+ import shutil
5
  from huggingface_hub import HfApi, create_repo, upload_folder
6
  from config import Config
7
 
8
def main():
    """Stage project code and the latest checkpoint into one directory,
    then upload it (plus the processed dataset, if present) to the Hub.

    Side effects: creates/removes a local ``hf_upload_staging`` directory,
    creates Hub repos, and uploads files via ``huggingface_hub``.
    """
    print("🚀 开始重新上传 (Code + Model Combined)...")

    api = HfApi()
    try:
        user_info = api.whoami()
        username = user_info['name']
        print(f"✅ User: {username}")
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate;
    # any auth/network failure is treated as "not logged in".
    except Exception:
        print("❌ Please login first.")
        return

    model_repo_id = f"{username}/sentiment-analysis-bert-finetuned"

    # 1. Prepare a temporary staging directory (merge strategy):
    # combine everything before uploading so the repo structure is exact.
    upload_dir = "hf_upload_staging"
    if os.path.exists(upload_dir):
        shutil.rmtree(upload_dir)
    os.makedirs(upload_dir)

    try:
        print(f"📦 Staging files to {upload_dir}...")

        # A. Copy project code (src, notebook, etc.) to the staging root.
        items_to_copy = ["src", "notebooks", "docs", "demo", "README.md", "requirements.txt", "*.pptx"]
        for pattern in items_to_copy:
            for item in glob.glob(pattern):
                dest = os.path.join(upload_dir, item)
                if os.path.isdir(item):
                    shutil.copytree(item, dest, dirs_exist_ok=True)
                else:
                    shutil.copy2(item, dest)

        # B. Copy model weights, flattened to the staging root.
        # Find the latest checkpoint; filter out stray files (e.g. zips).
        candidates = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
        candidates = [c for c in candidates if os.path.isdir(c)]

        if candidates:
            # Newest checkpoint by modification time.
            candidates.sort(key=os.path.getmtime)
            latest_ckpt = candidates[-1]
            print(f"✅ Found latest checkpoint: {latest_ckpt}")

            # Files to copy from the checkpoint into the staging root.
            model_files = ["config.json", "model.safetensors", "pytorch_model.bin", "tokenizer.json", "vocab.txt", "tokenizer_config.json", "special_tokens_map.json"]

            found_weights = False
            for fname in os.listdir(latest_ckpt):
                if fname in model_files or fname.endswith(".safetensors") or fname.endswith(".bin"):
                    shutil.copy2(os.path.join(latest_ckpt, fname), os.path.join(upload_dir, fname))
                    if "model" in fname or "pytorch" in fname:
                        found_weights = True

            if not found_weights:
                print("⚠️ WARNING: No model weights (.bin or .safetensors) found in checkpoint!")
        else:
            print(" No checkpoints found in results/!")

        # 2. Upload the staged directory as a single folder.
        print(f"\n⬆️ Uploading entire {upload_dir} to https://huggingface.co/{model_repo_id}")
        create_repo(repo_id=model_repo_id, repo_type="model", exist_ok=True)

        upload_folder(
            folder_path=upload_dir,
            repo_id=model_repo_id,
            repo_type="model"
        )
    finally:
        # Cleanup: always remove the staging dir, even if the upload fails,
        # so a failed run does not leave (and later re-upload) stale files.
        shutil.rmtree(upload_dir)

    print("🎉 Done! Model and Code are now together in the repo root.")

    # Check dataset: upload the processed dataset if it exists locally.
    dataset_repo_id = f"{username}/sentiment-analysis-dataset-processed"
    data_path = os.path.join(Config.DATA_DIR, "processed_dataset")
    if os.path.exists(data_path):
        print(f"\n⬆️ Uploading dataset to https://huggingface.co/datasets/{dataset_repo_id}")
        create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)
        upload_folder(folder_path=data_path, repo_id=dataset_repo_id, repo_type="dataset")
+
 
 
 
 
 
 
 
 
 
 
 
91
  if __name__ == "__main__":
 
92
  current_dir = os.path.dirname(os.path.abspath(__file__))
93
  parent_dir = os.path.dirname(current_dir)
94
  sys.path.append(parent_dir)