ssdataanalysis committed on
Commit
38eb889
·
verified ·
1 Parent(s): 779c4ca

Switch to packing=True, batch=4, grad_acc=4, step-based checkpoints for speed

Browse files
Files changed (1) hide show
  1. train.py +8 -8
train.py CHANGED
@@ -87,8 +87,6 @@ output_dir = os.environ.get("OUTPUT_DIR", "ssdataanalysis/gemma-4-E4B-hebrew-fir
87
  print(f"=== Training {model_id} -> {output_dir} ===")
88
 
89
  train_dataset = prepare_dataset(hebrew_ratio=0.5, max_total=120000)
90
- # No eval dataset to avoid OOM during evaluation on A10G 24GB
91
- # We will rely on training loss and periodic checkpointing
92
 
93
  print("Loading tokenizer...")
94
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
@@ -112,28 +110,30 @@ peft_config = LoraConfig(
112
  exclude_modules=["vision_tower", "multi_modal_projector"],
113
  )
114
 
 
115
  training_args = SFTConfig(
116
  output_dir=output_dir,
117
  num_train_epochs=3,
118
- per_device_train_batch_size=1,
119
- gradient_accumulation_steps=16,
120
  learning_rate=2e-4,
121
  lr_scheduler_type="cosine",
122
  warmup_steps=500,
123
  weight_decay=0.01,
124
  max_length=2048,
125
- packing=False,
126
  bf16=True,
127
  logging_strategy="steps",
128
  logging_steps=10,
129
  logging_first_step=True,
130
  eval_strategy="no",
131
- save_strategy="epoch",
132
- save_total_limit=2,
 
133
  push_to_hub=True,
134
  hub_model_id=output_dir,
135
  report_to="trackio",
136
- run_name=output_dir,
137
  remove_unused_columns=False,
138
  disable_tqdm=True,
139
  dataset_num_proc=4,
 
87
  print(f"=== Training {model_id} -> {output_dir} ===")
88
 
89
  train_dataset = prepare_dataset(hebrew_ratio=0.5, max_total=120000)
 
 
90
 
91
  print("Loading tokenizer...")
92
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
110
  exclude_modules=["vision_tower", "multi_modal_projector"],
111
  )
112
 
113
+ # Optimized: packing=True, larger batch, step-based checkpoints
114
  training_args = SFTConfig(
115
  output_dir=output_dir,
116
  num_train_epochs=3,
117
+ per_device_train_batch_size=4,
118
+ gradient_accumulation_steps=4,
119
  learning_rate=2e-4,
120
  lr_scheduler_type="cosine",
121
  warmup_steps=500,
122
  weight_decay=0.01,
123
  max_length=2048,
124
+ packing=True,
125
  bf16=True,
126
  logging_strategy="steps",
127
  logging_steps=10,
128
  logging_first_step=True,
129
  eval_strategy="no",
130
+ save_strategy="steps",
131
+ save_steps=500,
132
+ save_total_limit=3,
133
  push_to_hub=True,
134
  hub_model_id=output_dir,
135
  report_to="trackio",
136
+ run_name=output_dir + "-fast",
137
  remove_unused_columns=False,
138
  disable_tqdm=True,
139
  dataset_num_proc=4,