Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

data/processed_dataset/dataset_dict.json +1 -0
data/processed_dataset/test/data-00000-of-00001.arrow +3 -0
data/processed_dataset/test/dataset_info.json +33 -0
data/processed_dataset/test/state.json +18 -0
data/processed_dataset/train/data-00000-of-00001.arrow +3 -0
data/processed_dataset/train/dataset_info.json +33 -0
data/processed_dataset/train/state.json +18 -0
results/images/data_distribution_2025-12-18_15-27-36.png +0 -0
results/images/metrics_2025-12-18_15-06-59.txt +4 -0
results/images/metrics_2025-12-18_15-19-18.txt +4 -0
results/images/metrics_2025-12-18_15-25-36.txt +4 -0
results/images/metrics_2025-12-18_15-27-41.txt +4 -0
results/images/training_metrics_2025-12-18_15-06-59.png +0 -0
results/images/training_metrics_2025-12-18_15-19-18.png +0 -0
results/images/training_metrics_2025-12-18_15-25-36.png +0 -0
results/images/training_metrics_2025-12-18_15-27-41.png +0 -0
src/upload_to_hf.py +34 -36

data/processed_dataset/dataset_dict.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"splits": ["train", "test"]}

data/processed_dataset/test/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a4590634c3f9bb97b2fb2047cffcbdd00122eb564e6563b8ecb9673a7aa881b
+size 44377040

data/processed_dataset/test/dataset_info.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "labels": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "List"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

data/processed_dataset/test/state.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "e68a6594db5a153c",
+  "_format_columns": [
+    "attention_mask",
+    "input_ids",
+    "labels",
+    "token_type_ids"
+  ],
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

data/processed_dataset/train/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9f4e04f36632cfd2ae601cca3c4541ed2a2987279e320e5b6c544067f92871f
+size 399379240

data/processed_dataset/train/dataset_info.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "labels": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "List"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

data/processed_dataset/train/state.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "c52fbe1364b1bc3b",
+  "_format_columns": [
+    "attention_mask",
+    "input_ids",
+    "labels",
+    "token_type_ids"
+  ],
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

results/images/data_distribution_2025-12-18_15-27-36.png ADDED Viewed

results/images/metrics_2025-12-18_15-06-59.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Timestamp: 2025-12-18_15-06-59
+Final Validation Accuracy: 0.7683
+Final Validation Loss: 0.5479554533958435
+Plot saved to: training_metrics_2025-12-18_15-06-59.png

results/images/metrics_2025-12-18_15-19-18.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Timestamp: 2025-12-18_15-19-18
+Final Validation Accuracy: 0.7719
+Final Validation Loss: 0.538950502872467
+Plot saved to: training_metrics_2025-12-18_15-19-18.png

results/images/metrics_2025-12-18_15-25-36.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Timestamp: 2025-12-18_15-25-36
+Final Validation Accuracy: 0.7719
+Final Validation Loss: 0.538950502872467
+Plot saved to: training_metrics_2025-12-18_15-25-36.png

results/images/metrics_2025-12-18_15-27-41.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Timestamp: 2025-12-18_15-27-41
+Final Validation Accuracy: 0.7746
+Final Validation Loss: 0.5276312828063965
+Plot saved to: training_metrics_2025-12-18_15-27-41.png

results/images/training_metrics_2025-12-18_15-06-59.png ADDED Viewed

results/images/training_metrics_2025-12-18_15-19-18.png ADDED Viewed

results/images/training_metrics_2025-12-18_15-25-36.png ADDED Viewed

results/images/training_metrics_2025-12-18_15-27-41.png ADDED Viewed

src/upload_to_hf.py CHANGED Viewed

@@ -6,7 +6,7 @@ from huggingface_hub import HfApi, create_repo, upload_folder
 from config import Config
 def main():
-    print("🚀 开始重新上传 (Code + Model Combined)...")
     api = HfApi()
     try:
@@ -17,77 +17,75 @@ def main():
         print("❌ Please login first.")
         return
-    model_repo_id = f"{username}/sentiment-analysis-bert-finetuned"
-    # 1. 准备临时上传目录 (Merge Strategy)
-    # create a temp dir to combine everything before uploading to ensure structure is perfect
     upload_dir = "hf_upload_staging"
     if os.path.exists(upload_dir):
         shutil.rmtree(upload_dir)
     os.makedirs(upload_dir)
-    print(f"📦 Staging files to {upload_dir}...")
-    # A. Copy Project Code (src, notebook, etc)
-    # We want these at the root
-    items_to_copy = ["src", "notebooks", "docs", "demo", "README.md", "requirements.txt", "*.pptx"]
     for pattern in items_to_copy:
         for item in glob.glob(pattern):
             dest = os.path.join(upload_dir, item)
             if os.path.isdir(item):
                 shutil.copytree(item, dest, dirs_exist_ok=True)
             else:
                 shutil.copy2(item, dest)
-    # B. Copy Model Weights (Flattened to root)
-    # Find latest checkpoint
     candidates = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
-    # Filter out zip files if any
     candidates = [c for c in candidates if os.path.isdir(c)]
     if candidates:
         candidates.sort(key=os.path.getmtime)
         latest_ckpt = candidates[-1]
-        print(f"✅ Found latest checkpoint: {latest_ckpt}")
-        # Files to copy from checkpoint to root
         model_files = ["config.json", "model.safetensors", "pytorch_model.bin", "tokenizer.json", "vocab.txt", "tokenizer_config.json", "special_tokens_map.json"]
-        found_weights = False
         for fname in os.listdir(latest_ckpt):
             if fname in model_files or fname.endswith(".safetensors") or fname.endswith(".bin"):
-                 # Copy to root of staging
                  shutil.copy2(os.path.join(latest_ckpt, fname), os.path.join(upload_dir, fname))
-                 if "model" in fname or "pytorch" in fname:
-                     found_weights = True
-        if not found_weights:
-            print("⚠️ WARNING: No model weights (.bin or .safetensors) found in checkpoint!")
     else:
-        print("❌ No checkpoints found in results/!")
-    # 2. Upload the Staged Directory
-    print(f"\n⬆️ Uploading entire {upload_dir} to https://huggingface.co/{model_repo_id}")
-    create_repo(repo_id=model_repo_id, repo_type="model", exist_ok=True)
     upload_folder(
         folder_path=upload_dir,
-        repo_id=model_repo_id,
         repo_type="model"
     )
     # Cleanup
     shutil.rmtree(upload_dir)
-    print("🎉 Done! Model and Code are now together in the repo root.")
-    # Check dataset
-    dataset_repo_id = f"{username}/sentiment-analysis-dataset-processed"
-    data_path = os.path.join(Config.DATA_DIR, "processed_dataset")
-    if os.path.exists(data_path):
-        print(f"\n⬆️ Uploading dataset to https://huggingface.co/datasets/{dataset_repo_id}")
-        create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)
-        upload_folder(folder_path=data_path, repo_id=dataset_repo_id, repo_type="dataset")
 if __name__ == "__main__":
     current_dir = os.path.dirname(os.path.abspath(__file__))
     parent_dir = os.path.dirname(current_dir)

 from config import Config
 def main():
+    print("🚀 开始全量上传 (All-in-One) 到 robot4/sentiment-analysis-bert-finetuned ...")
     api = HfApi()
     try:
         print("❌ Please login first.")
         return
+    # 目标仓库 (用户指定)
+    target_repo_id = "robot4/sentiment-analysis-bert-finetuned"
+    # 1. 准备临时上传目录
     upload_dir = "hf_upload_staging"
     if os.path.exists(upload_dir):
         shutil.rmtree(upload_dir)
     os.makedirs(upload_dir)
+    print(f"📦 正在打包所有文件到 {upload_dir}...")
+    # A. 复制项目代码和资源
+    # 包含了 data, src, docs, notebooks, demo, results/images 等
+    items_to_copy = [
+        "src", "notebooks", "docs", "demo", "data",
+        "README.md", "requirements.txt", "*.pptx"
+    ]
     for pattern in items_to_copy:
         for item in glob.glob(pattern):
             dest = os.path.join(upload_dir, item)
+            print(f"   - Adding {item}...")
             if os.path.isdir(item):
                 shutil.copytree(item, dest, dirs_exist_ok=True)
             else:
                 shutil.copy2(item, dest)
+    # B. 特殊处理 results 目录 (只传图片和 logs，不传所有 checkpoint 文件夹)
+    results_dest = os.path.join(upload_dir, "results")
+    os.makedirs(results_dest, exist_ok=True)
+    # 复制图片
+    if os.path.exists("results/images"):
+        shutil.copytree("results/images", os.path.join(results_dest, "images"), dirs_exist_ok=True)
+    # 复制 txt metrics
+    for txt in glob.glob("results/*.txt"):
+        shutil.copy2(txt, results_dest)
+    # C. 提取最新模型权重到根目录 (方便直接加载)
     candidates = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
     candidates = [c for c in candidates if os.path.isdir(c)]
     if candidates:
         candidates.sort(key=os.path.getmtime)
         latest_ckpt = candidates[-1]
+        print(f"✅ 提取最新模型权重: {latest_ckpt} -> 根目录")
         model_files = ["config.json", "model.safetensors", "pytorch_model.bin", "tokenizer.json", "vocab.txt", "tokenizer_config.json", "special_tokens_map.json"]
         for fname in os.listdir(latest_ckpt):
             if fname in model_files or fname.endswith(".safetensors") or fname.endswith(".bin"):
                  shutil.copy2(os.path.join(latest_ckpt, fname), os.path.join(upload_dir, fname))
     else:
+        print("⚠️ 未找到 Checkpoint，仅上传代码和数据。")
+    # 2. 执行上传
+    print(f"\n⬆️ 正在上传所有文件到 https://huggingface.co/{target_repo_id}")
+    create_repo(repo_id=target_repo_id, repo_type="model", exist_ok=True)
     upload_folder(
         folder_path=upload_dir,
+        repo_id=target_repo_id,
         repo_type="model"
     )
     # Cleanup
     shutil.rmtree(upload_dir)
+    print("🎉 上传完毕！")
 if __name__ == "__main__":
     current_dir = os.path.dirname(os.path.abspath(__file__))
     parent_dir = os.path.dirname(current_dir)