davidsmts committed on
Commit
d3d40a6
·
verified ·
1 Parent(s): b418708

Upload run_sft_training.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_sft_training.py +120 -0
run_sft_training.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
# /// script
# dependencies = [
#     "trl>=0.12.0",
#     "peft>=0.7.0",
#     "transformers>=4.36.0",
#     "accelerate>=0.24.0",
#     "trackio",  # For real-time monitoring
# ]
# ///

"""
Production-ready SFT training example with all best practices.

This script demonstrates:
- Trackio integration for real-time monitoring
- LoRA/PEFT for efficient training
- Proper Hub saving configuration
- Train/eval split for monitoring
- Checkpoint management
- Optimized training parameters

Usage with hf_jobs MCP tool:
    hf_jobs("uv", {
        "script": '''<paste this entire file>''',
        "flavor": "a10g-large",
        "timeout": "3h",
        "secrets": {"HF_TOKEN": "$HF_TOKEN"},
    })

Or submit the script content directly inline without saving to a file.
"""

import trackio
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig


# Load dataset
print("📦 Loading dataset...")
dataset = load_dataset("trl-lib/Capybara", split="train")
print(f"✅ Dataset loaded: {len(dataset)} examples")

# Create train/eval split (10% held out for eval; fixed seed for reproducibility)
print("🔀 Creating train/eval split...")
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]
print(f" Train: {len(train_dataset)} examples")
print(f" Eval: {len(eval_dataset)} examples")

# Note: For memory-constrained demos, skip eval by using full dataset as train_dataset
# and removing eval_dataset, eval_strategy, and eval_steps from config below

# Training configuration
config = SFTConfig(
    # CRITICAL: Hub settings
    output_dir="qwen-capybara-sft",
    push_to_hub=True,
    hub_model_id="davidsmts/qwen-capybara-sft",
    hub_strategy="every_save",  # Push checkpoints

    # Training parameters
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 4 * 4 = 16
    learning_rate=2e-5,
    # max_length=1024,  # Default - only set if you need different sequence length

    # Logging & checkpointing
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,  # Keep only the 2 most recent checkpoints on disk

    # Evaluation - IMPORTANT: Only enable if eval_dataset provided
    eval_strategy="steps",
    eval_steps=100,

    # Optimization
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    # Monitoring
    report_to="trackio",  # Integrate with Trackio
    project="sft-finetuning",  # Project name for the training run (trackio)
    run_name="qwen-capybara-sft",  # Descriptive name for this training run
)

# LoRA configuration: adapt only the attention q/v projections for efficiency
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

# Initialize and train
print("🎯 Initializing trainer...")
trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # CRITICAL: Must provide eval_dataset when eval_strategy is enabled
    args=config,
    peft_config=peft_config,
)

print("🚀 Starting training...")
trainer.train()

print("💾 Pushing to Hub...")
trainer.push_to_hub()

# Finish Trackio tracking (closes the run so metrics are flushed;
# without this call the imported trackio module was never used)
trackio.finish()

print("✅ Complete! Model at: https://huggingface.co/davidsmts/qwen-capybara-sft")
print("📊 View metrics at: https://huggingface.co/spaces/davidsmts/trackio")