OliverSlivka committed on
Commit
ca97daf
·
verified ·
1 Parent(s): 886cfc7

Upload run_sft_job.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_sft_job.py +112 -90
run_sft_job.py CHANGED
@@ -2,134 +2,156 @@
2
  # /// script
3
  # dependencies = [
4
  # "trl>=0.12.0",
5
- # "peft>=0.7.0",
6
- # "transformers>=4.36.0",
7
- # "accelerate>=0.24.0",
8
- # "trackio",
9
- # "datasets"
 
10
  # ]
11
  # ///
12
 
13
  """
14
- Production-ready SFT training example with all best practices.
15
-
16
- This script demonstrates:
17
- - Trackio integration for real-time monitoring
18
- - LoRA/PEFT for efficient training
19
- - Proper Hub saving configuration
20
- - Train/eval split for monitoring
21
- - Checkpoint management
22
- - Optimized training parameters
23
-
24
- Usage with hf_jobs MCP tool:
25
- hf_jobs("uv", {
26
- "script": '''<paste this entire file>''',
27
- "flavor": "a10g-large",
28
- "timeout": "3h",
29
- "secrets": {"HF_TOKEN": "$HF_TOKEN"},
30
- })
31
-
32
- Or submit the script content directly inline without saving to a file.
33
  """
34
 
35
- import trackio
36
- from datasets import load_dataset
 
37
  from peft import LoraConfig
38
- from trl import SFTTrainer, SFTConfig
39
-
40
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- from datasets import Dataset
 
 
 
 
 
 
 
43
 
44
- # 1. Load Dataset
45
- print("📦 Loading dataset OliverSlivka/itemsety-real-training...")
46
- original_dataset = load_dataset("OliverSlivka/itemsety-real-training")
47
 
48
- def format_dataset(dataset):
49
- # Manually create a new dataset with a 'text' column.
50
- new_data = {"text": []}
51
- for example in dataset:
52
- text = ""
53
- for message in example["messages"]:
54
- role = message["role"]
55
- content = message["content"]
56
- text += f"**{role.capitalize()}:** {content}\n\n"
57
- new_data["text"].append(text)
58
- return Dataset.from_dict(new_data)
59
 
60
- train_dataset = format_dataset(original_dataset["train"])
61
- eval_dataset = format_dataset(original_dataset["validation"])
62
 
63
- print(f"✅ Dataset loaded and formatted. Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # 2. Training Configuration
66
- config = SFTConfig(
67
  # Hub settings
68
- output_dir="qwen3-0.6b-itemsety-sft",
69
  push_to_hub=True,
70
- hub_model_id="OliverSlivka/qwen3-0.6b-itemsety-sft", # <--- EDIT THIS
71
  hub_strategy="all_checkpoints",
72
 
73
  # Training parameters
74
  num_train_epochs=3,
75
- per_device_train_batch_size=1, # Changed from 4
76
  gradient_accumulation_steps=4,
77
- learning_rate=2e-5,
78
- max_length=2048, # Added
79
 
80
  # Logging & checkpointing
81
- logging_steps=5, # Changed from 10
82
  save_strategy="steps",
83
- save_steps=20, # Changed from 100
84
  save_total_limit=2,
85
 
86
- # Evaluation - IMPORTANT: Only enable if eval_dataset provided
87
  eval_strategy="steps",
88
- eval_steps=20, # Changed from 100
89
 
90
  # Optimization
91
- warmup_ratio=0.1,
92
- lr_scheduler_type="cosine",
93
-
94
- # Monitoring
95
- report_to="trackio", # Integrate with Trackio
96
- project="itemsety-finetune", # project name for the training name (trackio)
97
- run_name="qwen3-0.6b-sft-run-1", #Descriptive name for this training run
 
98
  )
99
 
100
- # LoRA configuration
101
- peft_config = LoraConfig(
102
- r=16,
103
- lora_alpha=32,
104
- lora_dropout=0.05,
105
- bias="none",
106
- task_type="CAUSAL_LM",
107
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # Added more target modules
108
- )
109
 
110
- # 4. Initialize Trainer
111
- print("🎯 Initializing trainer...")
112
  trainer = SFTTrainer(
113
- model="Qwen/Qwen3-0.6B", # Changed model
 
114
  train_dataset=train_dataset,
115
- eval_dataset=eval_dataset, # CRITICAL: Must provide eval_dataset when eval_strategy is enabled
116
- args=config,
117
  peft_config=peft_config,
118
- dataset_text_field="text",
 
 
 
119
  )
120
 
121
- # 5. Start Training
122
- print("🚀 Starting training...")
123
- trainer.train()
124
-
125
- print("✅ Training complete!")
126
- print(f"💾 Model pushed to Hub at: https://huggingface.co/{config.hub_model_id}")
127
- print("📊 View metrics at: https://huggingface.co/spaces/OliverSlivka/trackio")
128
 
129
- # 5. Start Training
130
  print("🚀 Starting training...")
131
  trainer.train()
132
 
133
  print("✅ Training complete!")
134
- print(f"💾 Model pushed to Hub at: https://huggingface.co/{config.hub_model_id}")
135
- print("📊 View metrics at: https://huggingface.co/spaces/OliverSlivka/trackio")
 
 
 
 
 
2
  # /// script
3
  # dependencies = [
4
  # "trl>=0.12.0",
5
+ # "peft>=0.11.1",
6
+ # "transformers>=4.41.2",
7
+ # "accelerate>=0.30.1",
8
+ # "datasets>=2.19.1",
9
+ # "bitsandbytes>=0.43.1",
10
+ # "trackio"
11
  # ]
12
  # ///
13
 
14
  """
15
+ Definitive SFT training script for Qwen/Qwen2.5-0.5B-Instruct on the corrected
16
+ itemsety dataset, loaded directly from GitHub.
17
+
18
+ This script implements 4-bit QLoRA as specified.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  """
20
 
21
import subprocess

import torch
from datasets import load_from_disk
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTConfig, SFTTrainer
27
+
28
# --- 1. Load Dataset from GitHub ---
GIT_REPO_URL = "https://github.com/oliversl1vka/itemsety-qwen-finetuning.git"
CLONE_PATH = "/tmp/itemsety-qwen-finetuning"
DATASET_PATH = f"{CLONE_PATH}/hf_dataset_enhanced"

print(f"📦 Cloning dataset from {GIT_REPO_URL}...")
# Clone into an explicit /tmp destination so nothing lands in the current dir.
# (list argv, shell=False: URL/path are never shell-interpreted)
subprocess.run(["git", "clone", GIT_REPO_URL, CLONE_PATH], check=True)
print("✅ Git clone complete.")

print(f"💾 Loading dataset from disk at {DATASET_PATH}...")
dataset = load_from_disk(DATASET_PATH)
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# Verification checks. Explicit raises instead of `assert`: asserts are
# stripped when Python runs with -O, and these guard real remote data.
if len(train_dataset) != 88:
    raise ValueError(f"Expected 88 train examples, got {len(train_dataset)}")
if len(eval_dataset) != 10:
    raise ValueError(f"Expected 10 val examples, got {len(eval_dataset)}")
if "messages" not in train_dataset.column_names:
    raise ValueError("Missing 'messages' column")
print(f"✅ Dataset loaded successfully. Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
48
+
49
+
50
# --- 2. Model and Tokenizer Configuration ---
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# QLoRA quantization: 4-bit NF4 weights with double quantization,
# computing activations in bfloat16 on top of the quantized base.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

print(f"🔥 Loading model '{MODEL_ID}' with 4-bit QLoRA...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate decide where the weights live
)
# The KV cache is only useful at inference time; turn it off for training.
model.config.use_cache = False
model.config.pretraining_tp = 1  # disable any tensor-parallel slicing of linear layers
69
 
 
 
 
70
 
71
# Tokenizer: reuse EOS as the padding token and right-pad, the usual
# setup for causal-LM fine-tuning.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
 
 
 
 
 
 
 
 
74
 
 
 
75
 
76
# --- 3. LoRA and Training Configuration ---
# Adapt every attention projection and every MLP projection in each layer.
_LORA_TARGET_MODULES = (
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
)

peft_config = LoraConfig(
    r=16,               # adapter rank
    lora_alpha=32,      # adapter scaling (alpha / r = 2.0)
    lora_dropout=0.05,
    bias="none",        # keep base-model biases frozen
    task_type="CAUSAL_LM",
    target_modules=list(_LORA_TARGET_MODULES),
)
94
 
95
# Training configuration. SFTConfig subclasses transformers.TrainingArguments
# and is what trl>=0.12 (as pinned in the script header) expects: SFT-specific
# options such as the maximum sequence length are fields here, not
# SFTTrainer keyword arguments.
training_args = SFTConfig(
    # Hub settings
    output_dir="qwen2.5-0.5b-itemsety-qlora",
    push_to_hub=True,
    hub_model_id="OliverSlivka/qwen2.5-0.5b-itemsety-qlora-final",
    hub_strategy="all_checkpoints",

    # Training parameters
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # Common for QLoRA
    optim="paged_adamw_8bit",  # paged 8-bit AdamW keeps optimizer memory low
    max_length=2048,  # max sequence length (called max_seq_length in older trl)

    # Logging & checkpointing
    logging_steps=5,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,

    # Evaluation
    eval_strategy="steps",
    eval_steps=20,

    # Optimization
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    max_grad_norm=0.3,
    max_steps=-1,  # Train for num_train_epochs

    # Experiment tracking (Trackio)
    report_to="trackio",
    run_name="qwen-itemsety-qlora-run-final",
)
130
 
 
 
 
 
 
 
 
 
 
131
 
132
# --- 4. Initialize Trainer ---
print("🎯 Initializing SFTTrainer...")
# NOTE: trl>=0.12 (as pinned in the script header) removed SFTTrainer's
# `max_seq_length`, `dataset_text_field` and `packing` keyword arguments —
# they are SFTConfig fields now — and the tokenizer is passed as
# `processing_class`. Passing the old kwargs raises a TypeError.
# The dataset's "messages" column is in the conversational (role/content)
# format that TRL applies the chat template to automatically, so no
# dataset_text_field is needed.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    args=training_args,
)
145
 
 
 
 
 
 
 
 
146
 
147
# --- 5. Start Training ---
print("🚀 Starting training...")
trainer.train()

print("✅ Training complete!")
print(f"💾 Model pushed to Hub at: https://huggingface.co/{training_args.hub_model_id}")

# One final explicit push guards against the last checkpoint upload being skipped.
print("... pushing final adapter one more time.")
trainer.push_to_hub()
print("✅ All done.")