Upload train_qwen3_8b_hf.py with huggingface_hub
Browse files- train_qwen3_8b_hf.py +10 -10
train_qwen3_8b_hf.py
CHANGED
|
@@ -53,7 +53,7 @@ eval_dataset = dataset_split["test"]
|
|
| 53 |
print(f" Train: {len(train_dataset)} examples")
|
| 54 |
print(f" Eval: {len(eval_dataset)} examples")
|
| 55 |
|
| 56 |
-
# Training configuration
|
| 57 |
config = SFTConfig(
|
| 58 |
# Hub settings
|
| 59 |
output_dir="qwen3-8b-vyvo-copilot",
|
|
@@ -62,26 +62,27 @@ config = SFTConfig(
|
|
| 62 |
hub_strategy="every_save",
|
| 63 |
hub_private_repo=False,
|
| 64 |
|
| 65 |
-
# Training parameters -
|
| 66 |
num_train_epochs=3,
|
| 67 |
-
per_device_train_batch_size=2,
|
| 68 |
-
gradient_accumulation_steps=8,
|
| 69 |
learning_rate=2e-4,
|
| 70 |
-
max_length=2048,
|
| 71 |
|
| 72 |
# Memory optimization
|
| 73 |
gradient_checkpointing=True,
|
|
|
|
| 74 |
bf16=True,
|
|
|
|
| 75 |
|
| 76 |
# Logging & checkpointing
|
| 77 |
logging_steps=10,
|
| 78 |
save_strategy="steps",
|
| 79 |
save_steps=200,
|
| 80 |
-
save_total_limit=
|
| 81 |
|
| 82 |
-
# Evaluation
|
| 83 |
-
eval_strategy="steps",
|
| 84 |
-
eval_steps=200,
|
| 85 |
|
| 86 |
# Optimization
|
| 87 |
warmup_ratio=0.05,
|
|
@@ -109,7 +110,6 @@ print("🎯 Initializing trainer with Qwen/Qwen3-8B...")
|
|
| 109 |
trainer = SFTTrainer(
|
| 110 |
model="Qwen/Qwen3-8B",
|
| 111 |
train_dataset=train_dataset,
|
| 112 |
-
eval_dataset=eval_dataset,
|
| 113 |
args=config,
|
| 114 |
peft_config=peft_config,
|
| 115 |
)
|
|
|
|
| 53 |
print(f" Train: {len(train_dataset)} examples")
|
| 54 |
print(f" Eval: {len(eval_dataset)} examples")
|
| 55 |
|
| 56 |
+
# Training configuration - optimized for memory on A10G
|
| 57 |
config = SFTConfig(
|
| 58 |
# Hub settings
|
| 59 |
output_dir="qwen3-8b-vyvo-copilot",
|
|
|
|
| 62 |
hub_strategy="every_save",
|
| 63 |
hub_private_repo=False,
|
| 64 |
|
| 65 |
+
# Training parameters - reduced for memory
|
| 66 |
num_train_epochs=3,
|
| 67 |
+
per_device_train_batch_size=1, # Reduced from 2
|
| 68 |
+
gradient_accumulation_steps=16, # Increased to maintain effective batch size
|
| 69 |
learning_rate=2e-4,
|
| 70 |
+
max_length=1024, # Reduced from 2048 to save memory
|
| 71 |
|
| 72 |
# Memory optimization
|
| 73 |
gradient_checkpointing=True,
|
| 74 |
+
gradient_checkpointing_kwargs={"use_reentrant": False},
|
| 75 |
bf16=True,
|
| 76 |
+
optim="adamw_8bit", # Use 8-bit optimizer to save memory
|
| 77 |
|
| 78 |
# Logging & checkpointing
|
| 79 |
logging_steps=10,
|
| 80 |
save_strategy="steps",
|
| 81 |
save_steps=200,
|
| 82 |
+
save_total_limit=2,
|
| 83 |
|
| 84 |
+
# Evaluation - skip eval during training to save memory
|
| 85 |
+
eval_strategy="no",
|
|
|
|
| 86 |
|
| 87 |
# Optimization
|
| 88 |
warmup_ratio=0.05,
|
|
|
|
| 110 |
trainer = SFTTrainer(
|
| 111 |
model="Qwen/Qwen3-8B",
|
| 112 |
train_dataset=train_dataset,
|
|
|
|
| 113 |
args=config,
|
| 114 |
peft_config=peft_config,
|
| 115 |
)
|