lokegud committed on
Commit
f4cfc9b
·
verified ·
1 Parent(s): 69c41e0

Upload train_production.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_production.py +10 -8
train_production.py CHANGED
@@ -30,7 +30,10 @@ peft_config = LoraConfig(
30
  task_type="CAUSAL_LM"
31
  )
32
 
33
- # Training configuration
 
 
 
34
  training_args = SFTConfig(
35
  output_dir="comfyui-specialist-v1",
36
  num_train_epochs=3,
@@ -38,24 +41,23 @@ training_args = SFTConfig(
38
  per_device_eval_batch_size=2,
39
  gradient_accumulation_steps=8, # Effective batch size: 16
40
  learning_rate=2e-4,
41
- warmup_steps=20,
42
- logging_steps=5,
43
- eval_strategy="steps",
44
- eval_steps=20,
45
- save_strategy="steps",
46
- save_steps=50,
47
  save_total_limit=3,
48
  load_best_model_at_end=True,
49
  metric_for_best_model="eval_loss",
50
  greater_is_better=False,
51
  push_to_hub=True,
52
  hub_model_id="lokegud/comfyui-specialist-v1",
53
- hub_strategy="every_save",
54
  hub_private_repo=False,
55
  report_to="trackio",
56
  project="comfyui-specialist",
57
  run_name="production-v1",
58
  gradient_checkpointing=True,
 
59
  max_length=2048, # Longer context for full workflows
60
  dataset_text_field="messages", # Chat format
61
  )
 
30
  task_type="CAUSAL_LM"
31
  )
32
 
33
+ # Training configuration - Fixed for 702 examples
34
+ # With 702 examples: 597 train, 105 eval
35
+ # Steps per epoch: 597 / (2 * 8) = ~37 steps/epoch
36
+ # Total steps: 37 * 3 epochs = ~111 steps
37
  training_args = SFTConfig(
38
  output_dir="comfyui-specialist-v1",
39
  num_train_epochs=3,
 
41
  per_device_eval_batch_size=2,
42
  gradient_accumulation_steps=8, # Effective batch size: 16
43
  learning_rate=2e-4,
44
+ warmup_ratio=0.1, # Warm up for 10% of training (~11 steps)
45
+ logging_steps=1, # Log every step
46
+ eval_strategy="epoch", # Evaluate after each epoch
47
+ save_strategy="epoch", # Save after each epoch
 
 
48
  save_total_limit=3,
49
  load_best_model_at_end=True,
50
  metric_for_best_model="eval_loss",
51
  greater_is_better=False,
52
  push_to_hub=True,
53
  hub_model_id="lokegud/comfyui-specialist-v1",
54
+ hub_strategy="end", # Only push final model
55
  hub_private_repo=False,
56
  report_to="trackio",
57
  project="comfyui-specialist",
58
  run_name="production-v1",
59
  gradient_checkpointing=True,
60
+ bf16=True, # Faster training with bf16
61
  max_length=2048, # Longer context for full workflows
62
  dataset_text_field="messages", # Chat format
63
  )