stmasson committed on
Commit
ab24506
·
verified ·
1 Parent(s): ab21097

Upload scripts/train_alizee_v2_stage1_sft.py with huggingface_hub

Browse files
scripts/train_alizee_v2_stage1_sft.py CHANGED
@@ -50,7 +50,9 @@ LEARNING_RATE = 5e-5
50
  EFFECTIVE_BATCH_SIZE = 256
51
  PER_DEVICE_BATCH = 1
52
  GRADIENT_ACCUMULATION = EFFECTIVE_BATCH_SIZE // PER_DEVICE_BATCH
53
- MAX_SEQ_LENGTH = 32768
 
 
54
  NUM_EPOCHS = 2
55
  WARMUP_RATIO = 0.05
56
 
@@ -160,8 +162,8 @@ for i, sample in enumerate(coding_ds):
160
  coding_ds_final = Dataset.from_list(coding_samples)
161
  print(f" Collected {len(coding_ds_final)} coding samples")
162
 
163
- # Approximate character limit for 32K tokens (assuming ~4 chars per token average)
164
- MAX_CHARS = MAX_SEQ_LENGTH * 3 # ~98K chars, slightly conservative
165
 
166
  # Format functions for different data sources
167
  def format_reasoning_sample(example):
@@ -172,10 +174,10 @@ def format_reasoning_sample(example):
172
  - output: reasoning trace / expected output explanation
173
  - solution: the actual code
174
  """
175
- # Truncate very long fields to prevent memory issues
176
- input_text = str(example.get('input', ''))[:30000]
177
- output_text = str(example.get('output', ''))[:50000]
178
- solution_text = str(example.get('solution', ''))[:15000]
179
 
180
  # Create a reasoning-enhanced prompt
181
  messages = [
@@ -193,8 +195,8 @@ def format_reasoning_sample(example):
193
 
194
  def format_coding_sample(example):
195
  """Format starcoderdata sample for capability preservation."""
196
- # Extract code content and truncate to prevent memory issues
197
- content = str(example.get("content", ""))[:40000]
198
 
199
  # Create a simple code completion task
200
  lines = content.split("\n")
 
50
  EFFECTIVE_BATCH_SIZE = 256
51
  PER_DEVICE_BATCH = 1
52
  GRADIENT_ACCUMULATION = EFFECTIVE_BATCH_SIZE // PER_DEVICE_BATCH
53
+ # Reduced from 32K to 8K to avoid disk storage overflow during tokenization
54
+ # (32K × 860K samples creates ~100GB+ tokenized dataset that exceeds pod storage)
55
+ MAX_SEQ_LENGTH = 8192
56
  NUM_EPOCHS = 2
57
  WARMUP_RATIO = 0.05
58
 
 
162
  coding_ds_final = Dataset.from_list(coding_samples)
163
  print(f" Collected {len(coding_ds_final)} coding samples")
164
 
165
+ # Approximate character limit for 8K tokens (assuming ~4 chars per token average)
166
+ MAX_CHARS = MAX_SEQ_LENGTH * 3 # ~24K chars for 8K tokens
167
 
168
  # Format functions for different data sources
169
  def format_reasoning_sample(example):
 
174
  - output: reasoning trace / expected output explanation
175
  - solution: the actual code
176
  """
177
+ # Truncate fields to fit within 8K tokens (~24K chars total)
178
+ input_text = str(example.get('input', ''))[:8000]
179
+ output_text = str(example.get('output', ''))[:12000]
180
+ solution_text = str(example.get('solution', ''))[:4000]
181
 
182
  # Create a reasoning-enhanced prompt
183
  messages = [
 
195
 
196
  def format_coding_sample(example):
197
  """Format starcoderdata sample for capability preservation."""
198
+ # Extract code content and truncate to fit 8K context (~20K chars)
199
+ content = str(example.get("content", ""))[:20000]
200
 
201
  # Create a simple code completion task
202
  lines = content.split("\n")