stmasson committed on
Commit
73c7a79
·
verified ·
1 Parent(s): d24b4ff

Upload scripts/train_alizee_v2_stage1_sft.py with huggingface_hub

Browse files
scripts/train_alizee_v2_stage1_sft.py CHANGED
@@ -219,22 +219,26 @@ def format_coding_sample(example):
219
 
220
  return {"messages": messages, "source": "coding"}
221
 
222
- # Format datasets
223
  print("\n🔄 Formatting datasets...")
224
- print(" Formatting reasoning samples...")
 
 
225
  reasoning_formatted = ocr_full.map(
226
  format_reasoning_sample,
227
  remove_columns=ocr_full.column_names,
228
- num_proc=8,
229
- desc="Formatting reasoning"
 
230
  )
231
 
232
  print(" Formatting coding samples...")
233
  coding_formatted = coding_ds_final.map(
234
  format_coding_sample,
235
  remove_columns=coding_ds_final.column_names,
236
- num_proc=4,
237
- desc="Formatting coding"
 
238
  )
239
 
240
  # Combine and shuffle
 
219
 
220
  return {"messages": messages, "source": "coding"}
221
 
222
+ # Format datasets with better memory handling
223
  print("\n🔄 Formatting datasets...")
224
+ print(" Formatting reasoning samples (this may take a few minutes)...")
225
+
226
+ # Use lower parallelism to avoid OOM (map is still per-example; batched=True is not set)
227
  reasoning_formatted = ocr_full.map(
228
  format_reasoning_sample,
229
  remove_columns=ocr_full.column_names,
230
+ num_proc=4, # Reduced from 8 to avoid memory issues
231
+ desc="Formatting reasoning",
232
+ load_from_cache_file=False, # Always recompute; this alone does not stop cache files being written to disk
233
  )
234
 
235
  print(" Formatting coding samples...")
236
  coding_formatted = coding_ds_final.map(
237
  format_coding_sample,
238
  remove_columns=coding_ds_final.column_names,
239
+ num_proc=2, # Reduced parallelism
240
+ desc="Formatting coding",
241
+ load_from_cache_file=False,
242
  )
243
 
244
  # Combine and shuffle