Fix ValidationDataset EOS separator inconsistency
Browse files
Always appending EOS in ValidationDataset, regardless of use_eos_separator,
made val_loss incomparable to train_loss when the flag is False.
Now all three dataset classes (Packed, Mixed, Validation) share the same
conditional EOS logic.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- llm_lab/data/dataset.py +2 -1
llm_lab/data/dataset.py
CHANGED
|
@@ -205,7 +205,8 @@ class ValidationDataset:
|
|
| 205 |
if not token_ids:
|
| 206 |
continue
|
| 207 |
|
| 208 |
-
|
|
|
|
| 209 |
buffer.extend(token_ids)
|
| 210 |
|
| 211 |
while len(buffer) >= self.config.max_seq_len + 1 and count < self.num_samples:
|
|
|
|
| 205 |
if not token_ids:
|
| 206 |
continue
|
| 207 |
|
| 208 |
+
if self.config.use_eos_separator:
|
| 209 |
+
token_ids.append(self.tokenizer.eos_id)
|
| 210 |
buffer.extend(token_ids)
|
| 211 |
|
| 212 |
while len(buffer) >= self.config.max_seq_len + 1 and count < self.num_samples:
|