CharlesCNorton committed on
Commit ·
a47c121
1
Parent(s): 474171e
Remove verbose batch progress output from train_llm.py
Browse files

Training observations at epoch 35:
- Operation routing: 100% accuracy (instantly learned)
- Overall accuracy: ~36% (3x better than SmolLM2 baseline of 12%)
- Bottleneck: bit extraction from hidden states (a_loss, b_loss ~7-8)
The model perfectly identifies which operation (+, -, *, >, <, ==) to use,
but struggles to extract the actual numerical values (e.g., "47" -> 00101111)
from the LLM's semantic embeddings. This is expected - LLM hidden states
encode meaning, not literal digit values.
Reduced n_batches from 100 to 20 (2,560 samples/epoch instead of 12,800)
for faster iteration. VRAM usage is only 6% with batch_size=128.
llm_integration/train_llm.py
CHANGED
|
@@ -254,7 +254,7 @@ def evaluate(model, n_samples: int = 500):
|
|
| 254 |
return correct / n_samples, op_correct / n_samples
|
| 255 |
|
| 256 |
|
| 257 |
-
def train(epochs: int = 100, batch_size: int =
|
| 258 |
print("=" * 70, flush=True)
|
| 259 |
print(" LLM INTEGRATION TRAINING", flush=True)
|
| 260 |
print(" Learning to extract operands from hidden states", flush=True)
|
|
@@ -384,4 +384,4 @@ if __name__ == "__main__":
|
|
| 384 |
random.seed(42)
|
| 385 |
torch.manual_seed(42)
|
| 386 |
|
| 387 |
-
train(epochs=100, batch_size=
|
|
|
|
| 254 |
return correct / n_samples, op_correct / n_samples
|
| 255 |
|
| 256 |
|
| 257 |
+
def train(epochs: int = 100, batch_size: int = 256, lr: float = 3e-4):
|
| 258 |
print("=" * 70, flush=True)
|
| 259 |
print(" LLM INTEGRATION TRAINING", flush=True)
|
| 260 |
print(" Learning to extract operands from hidden states", flush=True)
|
|
|
|
| 384 |
random.seed(42)
|
| 385 |
torch.manual_seed(42)
|
| 386 |
|
| 387 |
+
train(epochs=100, batch_size=384, lr=3e-4)
|