OliverSlivka
/

temp-sft-script

OliverSlivka committed on Dec 16, 2025

Commit

c56c9a8

verified ·

1 Parent(s): ca97daf

Upload run_sft_job.py with huggingface_hub

Files changed (1) hide show

run_sft_job.py CHANGED Viewed

@@ -13,7 +13,7 @@
 """
 Definitive SFT training script for Qwen/Qwen2.5-0.5B-Instruct on the corrected
-itemsety dataset, loaded directly from GitHub.
 This script implements 4-bit QLoRA as specified.
 """
@@ -26,15 +26,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTTrainer
 # --- 1. Load Dataset from GitHub ---
-GIT_REPO_URL = "https://github.com/oliversl1vka/itemsety-qwen-finetuning.git"
 CLONE_PATH = "/tmp/itemsety-qwen-finetuning"
 DATASET_PATH = f"{CLONE_PATH}/hf_dataset_enhanced"
-print(f"📦 Cloning dataset from {GIT_REPO_URL}...")
-# Using '-C' to change directory to /tmp before cloning, to avoid cloning into the current dir
 subprocess.run(['git', 'clone', GIT_REPO_URL, CLONE_PATH], check=True)
 print("✅ Git clone complete.")
 print(f"💾 Loading dataset from disk at {DATASET_PATH}...")
 dataset = load_from_disk(DATASET_PATH)
 train_dataset = dataset["train"]
@@ -154,4 +161,4 @@ print(f"💾 Model pushed to Hub at: https://huggingface.co/{training_args.hub_m
 # To be safe, explicitly push the final adapter
 print("... pushing final adapter one more time.")
 trainer.push_to_hub()
-print("✅ All done.")

 """
 Definitive SFT training script for Qwen/Qwen2.5-0.5B-Instruct on the corrected
+itemsety dataset, loaded directly from a private GitHub repo.
 This script implements 4-bit QLoRA as specified.
 """
 from trl import SFTTrainer
 # --- 1. Load Dataset from GitHub ---
+# The GitHub token is read from the environment so that no credential is ever
+# stored in this script. Export GITHUB_TOKEN (a token with read access to the
+# private dataset repo) before launching the job.
+import os
+GIT_TOKEN = os.environ.get("GITHUB_TOKEN")
+if not GIT_TOKEN:
+    raise RuntimeError(
+        "GITHUB_TOKEN is not set; it is required to clone the private dataset repo."
+    )
+GIT_REPO_URL = f"https://{GIT_TOKEN}@github.com/oliversl1vka/itemsety-qwen-finetuning.git"
 CLONE_PATH = "/tmp/itemsety-qwen-finetuning"
 DATASET_PATH = f"{CLONE_PATH}/hf_dataset_enhanced"
+print("📦 Cloning private dataset from GitHub...")
+try:
+    subprocess.run(['git', 'clone', GIT_REPO_URL, CLONE_PATH], check=True)
+except subprocess.CalledProcessError as err:
+    # CalledProcessError's message includes the full command line, and that
+    # command line contains the token. Re-raise a sanitized error, suppressing
+    # the chained original, so the token never reaches the job logs.
+    raise RuntimeError(f"git clone failed with exit code {err.returncode}") from None
 print("✅ Git clone complete.")
+# Security: Remove the .git directory to avoid leaving the token in the filesystem
+# (git records the authenticated remote URL in .git/config).
+print("🔐 Removing .git directory for security...")
+subprocess.run(['rm', '-rf', f"{CLONE_PATH}/.git"], check=True)
+print("✅ .git directory removed.")
 print(f"💾 Loading dataset from disk at {DATASET_PATH}...")
 dataset = load_from_disk(DATASET_PATH)
 train_dataset = dataset["train"]
 # To be safe, explicitly push the final adapter
 print("... pushing final adapter one more time.")
 trainer.push_to_hub()
+print("✅ All done.")