Upload train_production.py with huggingface_hub
Browse files
- train_production.py +10 -8
train_production.py
CHANGED
|
@@ -30,7 +30,10 @@ peft_config = LoraConfig(
|
|
| 30 |
task_type="CAUSAL_LM"
|
| 31 |
)
|
| 32 |
|
| 33 |
-
# Training configuration
|
|
|
|
|
|
|
|
|
|
| 34 |
training_args = SFTConfig(
|
| 35 |
output_dir="comfyui-specialist-v1",
|
| 36 |
num_train_epochs=3,
|
|
@@ -38,24 +41,23 @@ training_args = SFTConfig(
|
|
| 38 |
per_device_eval_batch_size=2,
|
| 39 |
gradient_accumulation_steps=8, # Effective batch size: 16
|
| 40 |
learning_rate=2e-4,
|
| 41 |
-
|
| 42 |
-
logging_steps=
|
| 43 |
-
eval_strategy="
|
| 44 |
-
|
| 45 |
-
save_strategy="steps",
|
| 46 |
-
save_steps=50,
|
| 47 |
save_total_limit=3,
|
| 48 |
load_best_model_at_end=True,
|
| 49 |
metric_for_best_model="eval_loss",
|
| 50 |
greater_is_better=False,
|
| 51 |
push_to_hub=True,
|
| 52 |
hub_model_id="lokegud/comfyui-specialist-v1",
|
| 53 |
-
hub_strategy="
|
| 54 |
hub_private_repo=False,
|
| 55 |
report_to="trackio",
|
| 56 |
project="comfyui-specialist",
|
| 57 |
run_name="production-v1",
|
| 58 |
gradient_checkpointing=True,
|
|
|
|
| 59 |
max_length=2048, # Longer context for full workflows
|
| 60 |
dataset_text_field="messages", # Chat format
|
| 61 |
)
|
|
|
|
| 30 |
task_type="CAUSAL_LM"
|
| 31 |
)
|
| 32 |
|
| 33 |
+
# Training configuration - Fixed for 702 examples
|
| 34 |
+
# With 702 examples: 597 train, 105 eval
|
| 35 |
+
# Steps per epoch: 597 / (batch size 2 * 8 gradient-accumulation steps = 16 effective) = ~37 optimizer steps/epoch
|
| 36 |
+
# Total steps: 37 * 3 epochs = ~111 steps
|
| 37 |
training_args = SFTConfig(
|
| 38 |
output_dir="comfyui-specialist-v1",
|
| 39 |
num_train_epochs=3,
|
|
|
|
| 41 |
per_device_eval_batch_size=2,
|
| 42 |
gradient_accumulation_steps=8, # Effective batch size: 16
|
| 43 |
learning_rate=2e-4,
|
| 44 |
+
warmup_ratio=0.1, # Warm up for 10% of training (~11 steps)
|
| 45 |
+
logging_steps=1, # Log every step
|
| 46 |
+
eval_strategy="epoch", # Evaluate after each epoch
|
| 47 |
+
save_strategy="epoch", # Save after each epoch
|
|
|
|
|
|
|
| 48 |
save_total_limit=3,
|
| 49 |
load_best_model_at_end=True,
|
| 50 |
metric_for_best_model="eval_loss",
|
| 51 |
greater_is_better=False,
|
| 52 |
push_to_hub=True,
|
| 53 |
hub_model_id="lokegud/comfyui-specialist-v1",
|
| 54 |
+
hub_strategy="end", # Only push final model
|
| 55 |
hub_private_repo=False,
|
| 56 |
report_to="trackio",
|
| 57 |
project="comfyui-specialist",
|
| 58 |
run_name="production-v1",
|
| 59 |
gradient_checkpointing=True,
|
| 60 |
+
bf16=True, # Faster training with bf16
|
| 61 |
max_length=2048, # Longer context for full workflows
|
| 62 |
dataset_text_field="messages", # Chat format
|
| 63 |
)
|