OliverSlivka committed on
Commit
c56c9a8
·
verified ·
1 Parent(s): ca97daf

Upload run_sft_job.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_sft_job.py +12 -5
run_sft_job.py CHANGED
@@ -13,7 +13,7 @@
13
 
14
  """
15
  Definitive SFT training script for Qwen/Qwen2.5-0.5B-Instruct on the corrected
16
- itemsety dataset, loaded directly from GitHub.
17
 
18
  This script implements 4-bit QLoRA as specified.
19
  """
@@ -26,15 +26,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
26
  from trl import SFTTrainer
27
 
28
  # --- 1. Load Dataset from GitHub ---
29
- GIT_REPO_URL = "https://github.com/oliversl1vka/itemsety-qwen-finetuning.git"
 
 
30
  CLONE_PATH = "/tmp/itemsety-qwen-finetuning"
31
  DATASET_PATH = f"{CLONE_PATH}/hf_dataset_enhanced"
32
 
33
- print(f"📦 Cloning dataset from {GIT_REPO_URL}...")
34
- # Using '-C' to change directory to /tmp before cloning, to avoid cloning into the current dir
35
  subprocess.run(['git', 'clone', GIT_REPO_URL, CLONE_PATH], check=True)
36
  print("✅ Git clone complete.")
37
 
 
 
 
 
 
 
38
  print(f"💾 Loading dataset from disk at {DATASET_PATH}...")
39
  dataset = load_from_disk(DATASET_PATH)
40
  train_dataset = dataset["train"]
@@ -154,4 +161,4 @@ print(f"💾 Model pushed to Hub at: https://huggingface.co/{training_args.hub_m
154
  # To be safe, explicitly push the final adapter
155
  print("... pushing final adapter one more time.")
156
  trainer.push_to_hub()
157
- print("✅ All done.")
 
13
 
14
  """
15
  Definitive SFT training script for Qwen/Qwen2.5-0.5B-Instruct on the corrected
16
+ itemsety dataset, loaded directly from a private GitHub repo.
17
 
18
  This script implements 4-bit QLoRA as specified.
19
  """
 
26
  from trl import SFTTrainer
27
 
28
  # --- 1. Load Dataset from GitHub ---
29
+ # Using the provided GitHub token for authentication
30
+ GIT_TOKEN = "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"
31
+ GIT_REPO_URL = f"https://{GIT_TOKEN}@github.com/oliversl1vka/itemsety-qwen-finetuning.git"
32
  CLONE_PATH = "/tmp/itemsety-qwen-finetuning"
33
  DATASET_PATH = f"{CLONE_PATH}/hf_dataset_enhanced"
34
 
35
+ print(f"📦 Cloning private dataset from GitHub...")
 
36
  subprocess.run(['git', 'clone', GIT_REPO_URL, CLONE_PATH], check=True)
37
  print("✅ Git clone complete.")
38
 
39
+ # Security: Remove the .git directory to avoid leaving the token in the filesystem
40
+ print("🔐 Removing .git directory for security...")
41
+ subprocess.run(['rm', '-rf', f"{CLONE_PATH}/.git"], check=True)
42
+ print("✅ .git directory removed.")
43
+
44
+
45
  print(f"💾 Loading dataset from disk at {DATASET_PATH}...")
46
  dataset = load_from_disk(DATASET_PATH)
47
  train_dataset = dataset["train"]
 
161
  # To be safe, explicitly push the final adapter
162
  print("... pushing final adapter one more time.")
163
  trainer.push_to_hub()
164
+ print("✅ All done.")