Update HROM_Trainer.py

HROM_Trainer.py  +18 -15  CHANGED
@@ -28,30 +28,33 @@ logging.basicConfig(
 
 # Configuration
 CONFIG = {
+    # --- Scaled Parameters ---
     "dim": 768,
-    "n_layers": …
-    "n_heads": …
-    "ff_dim": …
+    "n_layers": 16,
+    "n_heads": 16,
+    "ff_dim": 3072, # Explicitly set to 4 * dim
+
+    # --- Kept Parameters ---
     "dropout": 0.1,
     "max_seq_len": 512,
-    "…
+    "vocab_size": 32000, # Fixed by tokenizer
+
+    # --- Training/Dataset Parameters ---
+    "batch_size": 12,
     "checkpoint_interval": 2000,
     "debug_interval": 400,
-    # …
+    # --- ADDED CoQA and QuAC ---
     "datasets": ["daily_dialog", "empathetic_dialogues", "blended_skill_talk", "AlekseyKorshuk/persona-chat"],
-    # …
-    "…
-    # …
-    "…
-    "…
-    # Adjusted samples per dataset: with 4 datasets, 50k each gives 200k total samples
-    "tokenizer_train_samples_per_dataset": 50000,
-    "learning_rate": 2e-5,
+    "tokenizer_name": "hrom_tokenizer.json", # New name for expanded tokenizer
+    "checkpoint_dir": "checkpoints", # Separate directory for expanded data model
+    # --- Increased samples per dataset slightly for tokenizer ---
+    "tokenizer_train_samples_per_dataset": 100000, # Use same limit for all, incl. new ones
+    "learning_rate": 1e-5,
     "warmup_steps": 1000,
-    "max_turns": 8, # …
+    "max_turns": 8, # Keep max_turns limit for Q&A datasets too
     "max_checkpoints": 5,
     "num_epochs": 30,
-    "grad_accum_steps": …
+    "grad_accum_steps": 16
 }
 
 # --- Model Definition (HROM, HROMBlock, HROMAttention, SwiGLU, RoPE) ---
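
The new values imply some arithmetic worth checking: ff_dim is 4 * dim, the per-head size is dim / n_heads, each optimizer step sees batch_size * grad_accum_steps samples, and the tokenizer corpus is four datasets at 100k samples each. Below is a minimal standalone sanity-check sketch; it is hypothetical and not part of HROM_Trainer.py, and the size estimate assumes a three-projection (gate/up/down) SwiGLU feed-forward and a tied embedding/output matrix, ignoring norms and biases.

# Hypothetical sanity checks; values copied from the CONFIG above.
cfg = {
    "dim": 768, "n_layers": 16, "n_heads": 16, "ff_dim": 3072,
    "vocab_size": 32000, "batch_size": 12, "grad_accum_steps": 16,
    "tokenizer_train_samples_per_dataset": 100000,
}

assert cfg["ff_dim"] == 4 * cfg["dim"]   # "Explicitly set to 4 * dim"
assert cfg["dim"] % cfg["n_heads"] == 0  # per-head size must be an integer

print("head_dim:", cfg["dim"] // cfg["n_heads"])                              # 48
print("effective batch:", cfg["batch_size"] * cfg["grad_accum_steps"])        # 12 * 16 = 192
print("tokenizer samples:", 4 * cfg["tokenizer_train_samples_per_dataset"])   # 400000

# Rough size estimate (assumption): tied embedding matrix, per-layer
# attention at 4*dim^2, three-projection SwiGLU FFN at 3*dim*ff_dim;
# norms and biases ignored.
per_layer = 4 * cfg["dim"] ** 2 + 3 * cfg["dim"] * cfg["ff_dim"]
total = cfg["vocab_size"] * cfg["dim"] + cfg["n_layers"] * per_layer
print(f"~{total / 1e6:.0f}M parameters")  # ~176M

Under those assumptions the scaled model lands around 176M parameters, and each optimizer update averages over an effective batch of 192 samples.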
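
Since batch_size is only 12 while grad_accum_steps is 16, the update rule is worth spelling out. The following is a sketch of gradient accumulation as conventionally written in PyTorch; the model, optimizer, and data loader are dummy stand-ins, since the trainer's actual loop is not shown in this diff.

import torch
from torch import nn

ACCUM = 16        # CONFIG["grad_accum_steps"]
MICRO_BATCH = 12  # CONFIG["batch_size"]

# Dummy stand-ins so the loop runs; HROM's real model and data
# pipeline are not part of this diff.
model = nn.Linear(768, 768)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # CONFIG["learning_rate"]
loader = (torch.randn(MICRO_BATCH, 768) for _ in range(64))

optimizer.zero_grad()
for step, batch in enumerate(loader):
    loss = model(batch).pow(2).mean()  # placeholder loss
    (loss / ACCUM).backward()          # scale so grads accumulate to a mean over 12*16=192 samples
    if (step + 1) % ACCUM == 0:        # one optimizer step per 16 micro-batches
        optimizer.step()
        optimizer.zero_grad()

Dividing each loss by ACCUM makes the accumulated gradient equal the mean over the full 192-sample effective batch rather than the sum of 16 micro-batch means, so gradient magnitudes stay comparable to training with one large batch.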