Commit 44d0907
Parent: 5ef77ef

config fixes
__pycache__/configuration_minitransformer.cpython-312.pyc
CHANGED

Binary files a/__pycache__/configuration_minitransformer.cpython-312.pyc and b/__pycache__/configuration_minitransformer.cpython-312.pyc differ
config.json
CHANGED

@@ -3,12 +3,12 @@
   "_name_or_path": "Transformer_500M",
   "architectures": ["MiniTransformer"],
   "n_embd": 768,
-  "n_heads":
-  "n_layers":
+  "n_heads": 24,
+  "n_layers": 27,
   "seq_len": 8192,
-  "window_size":
+  "window_size": 8192,
   "vocab_size": 200064,
-  "mlp_scale":
+  "mlp_scale": 4,
   "bias": false,
   "dropout": 0.0,
   "num_eigh": 24,
@@ -17,14 +17,14 @@
   "global_bsz": 524288,
   "bsz": 1,
   "warmup_steps": 1907,
-  "eval_period":
+  "eval_period": 50,
   "save_period": 500,
   "max_lr": 3.0e-4,
   "min_lr": 3.0e-5,
   "max_norm": 1.0,
   "dilation": 1,
-  "fsdp":
-  "ddp":
+  "fsdp": false,
+  "ddp": true,
   "mixed_precision": true,
   "torch_dtype": "bfloat16",
   "use_cpu_offload": false,
@@ -42,8 +42,7 @@
     "buffer": "bfloat16"
   },
   "fsdp_modules": [
-    "
-    "MLP"
+    "AttentionLayer"
   ],
   "use_activation_checkpointing": true,
   "use_flash_fft": true,
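Taken together, the new values pin the model shape (24 heads and 27 layers on a 768-dim embedding, giving 32-dim heads) and switch the parallelism flags to plain DDP. The sketch below is a minimal sanity check of those values, assuming config.json sits in the working directory; the assertions are illustrative and not part of this repo.

import json

# Load the committed config and sanity-check the newly pinned values.
# Field names come from config.json above; the checks are illustrative.
with open("config.json") as f:
    cfg = json.load(f)

# 768 / 24 = 32: the embedding must split evenly across attention heads.
assert cfg["n_embd"] % cfg["n_heads"] == 0, "n_embd must be divisible by n_heads"

# window_size == seq_len == 8192, so windowed attention spans the full sequence.
assert cfg["window_size"] <= cfg["seq_len"]

# This commit sets fsdp=false and ddp=true: at most one data-parallel strategy.
assert not (cfg["fsdp"] and cfg["ddp"]), "fsdp and ddp are mutually exclusive"

print(f"heads={cfg['n_heads']} layers={cfg['n_layers']} "
      f"head_dim={cfg['n_embd'] // cfg['n_heads']}")

Note that with "fsdp" now false, the "fsdp_modules" list (reduced to "AttentionLayer") is inert until FSDP is re-enabled; it presumably names the module classes the trainer's auto-wrap policy targets.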
configuration_minitransformer.py
CHANGED

@@ -8,12 +8,12 @@ class MiniTransformerConfig(PretrainedConfig):
         self,
         bsz: int = 1,
         n_embd: int = 768,
-        n_heads: int =
-        n_layers: int =
+        n_heads: int = 24,
+        n_layers: int = 27,
         seq_len: int = 8192,
-        window_size: int =
+        window_size: int = 8192,
         vocab_size: int = 200064,
-        mlp_scale: int =
+        mlp_scale: int = 4,
         bias: bool = False,
         dropout: float = 0.0,
         softcap: float = 50.0,
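The diff only shows the __init__ signature, but the surrounding class presumably follows the standard PretrainedConfig pattern: store each argument as an attribute and forward the rest to the base class. Below is a minimal sketch under that assumption; the model_type string and the attribute assignments are assumptions, not part of the diff.

from transformers import PretrainedConfig

class MiniTransformerConfig(PretrainedConfig):
    model_type = "minitransformer"  # assumed registration name, not in the diff

    def __init__(
        self,
        bsz: int = 1,
        n_embd: int = 768,
        n_heads: int = 24,
        n_layers: int = 27,
        seq_len: int = 8192,
        window_size: int = 8192,
        vocab_size: int = 200064,
        mlp_scale: int = 4,
        bias: bool = False,
        dropout: float = 0.0,
        softcap: float = 50.0,
        **kwargs,
    ):
        # Store each hyperparameter as an attribute so that
        # save_pretrained / from_pretrained round-trip them via config.json.
        self.bsz = bsz
        self.n_embd = n_embd
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.seq_len = seq_len
        self.window_size = window_size
        self.vocab_size = vocab_size
        self.mlp_scale = mlp_scale
        self.bias = bias
        self.dropout = dropout
        self.softcap = softcap
        super().__init__(**kwargs)

With these defaults, MiniTransformerConfig() should now match the committed config.json on every field the two files share, so instantiating the config without arguments picks up the same n_heads, n_layers, window_size, and mlp_scale.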