Upload scripts/train_alizee_v2_stage1_sft.py with huggingface_hub
Browse files
scripts/train_alizee_v2_stage1_sft.py
CHANGED
|
@@ -160,6 +160,9 @@ for i, sample in enumerate(coding_ds):
|
|
| 160 |
coding_ds_final = Dataset.from_list(coding_samples)
|
| 161 |
print(f" Collected {len(coding_ds_final)} coding samples")
|
| 162 |
|
|
|
|
|
|
|
|
|
|
| 163 |
# Format functions for different data sources
|
| 164 |
def format_reasoning_sample(example):
|
| 165 |
"""Format OpenCodeReasoning sample for instruction tuning.
|
|
@@ -169,15 +172,20 @@ def format_reasoning_sample(example):
|
|
| 169 |
- output: reasoning trace / expected output explanation
|
| 170 |
- solution: the actual code
|
| 171 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
# Create a reasoning-enhanced prompt
|
| 173 |
messages = [
|
| 174 |
{
|
| 175 |
"role": "user",
|
| 176 |
-
"content": f"Solve the following programming problem. Think through it step by step.\n\n{
|
| 177 |
},
|
| 178 |
{
|
| 179 |
"role": "assistant",
|
| 180 |
-
"content": f"Let me think through this problem step by step.\n\n{
|
| 181 |
}
|
| 182 |
]
|
| 183 |
|
|
@@ -185,8 +193,8 @@ def format_reasoning_sample(example):
|
|
| 185 |
|
| 186 |
def format_coding_sample(example):
|
| 187 |
"""Format starcoderdata sample for capability preservation."""
|
| 188 |
-
# Extract code content
|
| 189 |
-
content = example.get("content", "")
|
| 190 |
|
| 191 |
# Create a simple code completion task
|
| 192 |
lines = content.split("\n")
|
|
@@ -300,6 +308,10 @@ training_config = SFTConfig(
|
|
| 300 |
dataloader_num_workers=4,
|
| 301 |
remove_unused_columns=True,
|
| 302 |
packing=False, # Disable packing for long sequences
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
)
|
| 304 |
|
| 305 |
# Initialize trainer
|
|
|
|
| 160 |
coding_ds_final = Dataset.from_list(coding_samples)
|
| 161 |
print(f" Collected {len(coding_ds_final)} coding samples")
|
| 162 |
|
| 163 |
+
# Approximate character limit for 32K tokens: text averages ~4 chars per token, but a multiplier of 3 is used to stay conservative
|
| 164 |
+
MAX_CHARS = MAX_SEQ_LENGTH * 3 # ~98K chars, slightly conservative
|
| 165 |
+
|
| 166 |
# Format functions for different data sources
|
| 167 |
def format_reasoning_sample(example):
|
| 168 |
"""Format OpenCodeReasoning sample for instruction tuning.
|
|
|
|
| 172 |
- output: reasoning trace / expected output explanation
|
| 173 |
- solution: the actual code
|
| 174 |
"""
|
| 175 |
+
# Truncate very long fields to prevent memory issues
|
| 176 |
+
input_text = str(example.get('input', ''))[:30000]
|
| 177 |
+
output_text = str(example.get('output', ''))[:50000]
|
| 178 |
+
solution_text = str(example.get('solution', ''))[:15000]
|
| 179 |
+
|
| 180 |
# Create a reasoning-enhanced prompt
|
| 181 |
messages = [
|
| 182 |
{
|
| 183 |
"role": "user",
|
| 184 |
+
"content": f"Solve the following programming problem. Think through it step by step.\n\n{input_text}"
|
| 185 |
},
|
| 186 |
{
|
| 187 |
"role": "assistant",
|
| 188 |
+
"content": f"Let me think through this problem step by step.\n\n{output_text}\n\nHere's my solution:\n\n```python\n{solution_text}\n```"
|
| 189 |
}
|
| 190 |
]
|
| 191 |
|
|
|
|
| 193 |
|
| 194 |
def format_coding_sample(example):
|
| 195 |
"""Format starcoderdata sample for capability preservation."""
|
| 196 |
+
# Extract code content and truncate to prevent memory issues
|
| 197 |
+
content = str(example.get("content", ""))[:40000]
|
| 198 |
|
| 199 |
# Create a simple code completion task
|
| 200 |
lines = content.split("\n")
|
|
|
|
| 308 |
dataloader_num_workers=4,
|
| 309 |
remove_unused_columns=True,
|
| 310 |
packing=False, # Disable packing for long sequences
|
| 311 |
+
|
| 312 |
+
# Memory-efficient tokenization (reduce parallel processes to save RAM)
|
| 313 |
+
dataset_num_proc=1, # Single process to avoid OOM during tokenization
|
| 314 |
+
dataset_batch_size=100, # Smaller batches during tokenization
|
| 315 |
)
|
| 316 |
|
| 317 |
# Initialize trainer
|