stmasson committed on
Commit
6da594c
·
verified ·
1 Parent(s): a8da371

Upload scripts/train_alizee_v2_stage1_sft.py with huggingface_hub

Browse files
scripts/train_alizee_v2_stage1_sft.py CHANGED
@@ -160,6 +160,9 @@ for i, sample in enumerate(coding_ds):
160
  coding_ds_final = Dataset.from_list(coding_samples)
161
  print(f" Collected {len(coding_ds_final)} coding samples")
162
 
 
 
 
163
  # Format functions for different data sources
164
  def format_reasoning_sample(example):
165
  """Format OpenCodeReasoning sample for instruction tuning.
@@ -169,15 +172,20 @@ def format_reasoning_sample(example):
169
  - output: reasoning trace / expected output explanation
170
  - solution: the actual code
171
  """
 
 
 
 
 
172
  # Create a reasoning-enhanced prompt
173
  messages = [
174
  {
175
  "role": "user",
176
- "content": f"Solve the following programming problem. Think through it step by step.\n\n{example['input']}"
177
  },
178
  {
179
  "role": "assistant",
180
- "content": f"Let me think through this problem step by step.\n\n{example['output']}\n\nHere's my solution:\n\n```python\n{example['solution']}\n```"
181
  }
182
  ]
183
 
@@ -185,8 +193,8 @@ def format_reasoning_sample(example):
185
 
186
  def format_coding_sample(example):
187
  """Format starcoderdata sample for capability preservation."""
188
- # Extract code content
189
- content = example.get("content", "")
190
 
191
  # Create a simple code completion task
192
  lines = content.split("\n")
@@ -300,6 +308,10 @@ training_config = SFTConfig(
300
  dataloader_num_workers=4,
301
  remove_unused_columns=True,
302
  packing=False, # Disable packing for long sequences
 
 
 
 
303
  )
304
 
305
  # Initialize trainer
 
160
  coding_ds_final = Dataset.from_list(coding_samples)
161
  print(f" Collected {len(coding_ds_final)} coding samples")
162
 
163
+ # Approximate character limit for 32K tokens (assuming ~4 chars per token average)
164
+ MAX_CHARS = MAX_SEQ_LENGTH * 3 # ~98K chars, slightly conservative
165
+
166
  # Format functions for different data sources
167
  def format_reasoning_sample(example):
168
  """Format OpenCodeReasoning sample for instruction tuning.
 
172
  - output: reasoning trace / expected output explanation
173
  - solution: the actual code
174
  """
175
+ # Truncate very long fields to prevent memory issues
176
+ input_text = str(example.get('input', ''))[:30000]
177
+ output_text = str(example.get('output', ''))[:50000]
178
+ solution_text = str(example.get('solution', ''))[:15000]
179
+
180
  # Create a reasoning-enhanced prompt
181
  messages = [
182
  {
183
  "role": "user",
184
+ "content": f"Solve the following programming problem. Think through it step by step.\n\n{input_text}"
185
  },
186
  {
187
  "role": "assistant",
188
+ "content": f"Let me think through this problem step by step.\n\n{output_text}\n\nHere's my solution:\n\n```python\n{solution_text}\n```"
189
  }
190
  ]
191
 
 
193
 
194
  def format_coding_sample(example):
195
  """Format starcoderdata sample for capability preservation."""
196
+ # Extract code content and truncate to prevent memory issues
197
+ content = str(example.get("content", ""))[:40000]
198
 
199
  # Create a simple code completion task
200
  lines = content.split("\n")
 
308
  dataloader_num_workers=4,
309
  remove_unused_columns=True,
310
  packing=False, # Disable packing for long sequences
311
+
312
+ # Memory-efficient tokenization (reduce parallel processes to save RAM)
313
+ dataset_num_proc=1, # Single process to avoid OOM during tokenization
314
+ dataset_batch_size=100, # Smaller batches during tokenization
315
  )
316
 
317
  # Initialize trainer