stmasson committed on
Commit
6da594c
·
verified ·
1 Parent(s): a8da371

Upload scripts/train_alizee_v2_stage1_sft.py with huggingface_hub

Browse files
scripts/train_alizee_v2_stage1_sft.py CHANGED
@@ -160,6 +160,9 @@ for i, sample in enumerate(coding_ds):
160
  coding_ds_final = Dataset.from_list(coding_samples)
161
  print(f" Collected {len(coding_ds_final)} coding samples")
162
 
 
 
 
163
  # Format functions for different data sources
164
  def format_reasoning_sample(example):
165
  """Format OpenCodeReasoning sample for instruction tuning.
@@ -169,15 +172,20 @@ def format_reasoning_sample(example):
169
  - output: reasoning trace / expected output explanation
170
  - solution: the actual code
171
  """
 
 
 
 
 
172
  # Create a reasoning-enhanced prompt
173
  messages = [
174
  {
175
  "role": "user",
176
- "content": f"Solve the following programming problem. Think through it step by step.\n\n{example['input']}"
177
  },
178
  {
179
  "role": "assistant",
180
- "content": f"Let me think through this problem step by step.\n\n{example['output']}\n\nHere's my solution:\n\n```python\n{example['solution']}\n```"
181
  }
182
  ]
183
 
@@ -185,8 +193,8 @@ def format_reasoning_sample(example):
185
 
186
  def format_coding_sample(example):
187
  """Format starcoderdata sample for capability preservation."""
188
- # Extract code content
189
- content = example.get("content", "")
190
 
191
  # Create a simple code completion task
192
  lines = content.split("\n")
@@ -300,6 +308,10 @@ training_config = SFTConfig(
300
  dataloader_num_workers=4,
301
  remove_unused_columns=True,
302
  packing=False, # Disable packing for long sequences
 
 
 
 
303
  )
304
 
305
  # Initialize trainer
 
160
  coding_ds_final = Dataset.from_list(coding_samples)
161
  print(f" Collected {len(coding_ds_final)} coding samples")
162
 
163
+ # Approximate character limit for 32K tokens (assuming ~4 chars per token average)
164
+ MAX_CHARS = MAX_SEQ_LENGTH * 3 # ~98K chars, slightly conservative
165
+
166
  # Format functions for different data sources
167
  def format_reasoning_sample(example):
168
  """Format OpenCodeReasoning sample for instruction tuning.
 
172
  - output: reasoning trace / expected output explanation
173
  - solution: the actual code
174
  """
175
+ # Truncate very long fields to prevent memory issues
176
+ input_text = str(example.get('input', ''))[:30000]
177
+ output_text = str(example.get('output', ''))[:50000]
178
+ solution_text = str(example.get('solution', ''))[:15000]
179
+
180
  # Create a reasoning-enhanced prompt
181
  messages = [
182
  {
183
  "role": "user",
184
+ "content": f"Solve the following programming problem. Think through it step by step.\n\n{input_text}"
185
  },
186
  {
187
  "role": "assistant",
188
+ "content": f"Let me think through this problem step by step.\n\n{output_text}\n\nHere's my solution:\n\n```python\n{solution_text}\n```"
189
  }
190
  ]
191
 
 
193
 
194
  def format_coding_sample(example):
195
  """Format starcoderdata sample for capability preservation."""
196
+ # Extract code content and truncate to prevent memory issues
197
+ content = str(example.get("content", ""))[:40000]
198
 
199
  # Create a simple code completion task
200
  lines = content.split("\n")
 
308
  dataloader_num_workers=4,
309
  remove_unused_columns=True,
310
  packing=False, # Disable packing for long sequences
311
+
312
+ # Memory-efficient tokenization (reduce parallel processes to save RAM)
313
+ dataset_num_proc=1, # Single process to avoid OOM during tokenization
314
+ dataset_batch_size=100, # Smaller batches during tokenization
315
  )
316
 
317
  # Initialize trainer