yagizdevre committed
Commit 44d0907 · Parent(s): 5ef77ef

config fixes

__pycache__/configuration_minitransformer.cpython-312.pyc CHANGED
Binary files a/__pycache__/configuration_minitransformer.cpython-312.pyc and b/__pycache__/configuration_minitransformer.cpython-312.pyc differ
 
config.json CHANGED
@@ -3,12 +3,12 @@
   "_name_or_path": "Transformer_500M",
   "architectures": ["MiniTransformer"],
   "n_embd": 768,
-  "n_heads": 8,
-  "n_layers": 25,
+  "n_heads": 24,
+  "n_layers": 27,
   "seq_len": 8192,
-  "window_size": 1024,
+  "window_size": 8192,
   "vocab_size": 200064,
-  "mlp_scale": 12,
+  "mlp_scale": 4,
   "bias": false,
   "dropout": 0.0,
   "num_eigh": 24,
@@ -17,14 +17,14 @@
   "global_bsz": 524288,
   "bsz": 1,
   "warmup_steps": 1907,
-  "eval_period": 25,
+  "eval_period": 50,
   "save_period": 500,
   "max_lr": 3.0e-4,
   "min_lr": 3.0e-5,
   "max_norm": 1.0,
   "dilation": 1,
-  "fsdp": true,
-  "ddp": false,
+  "fsdp": false,
+  "ddp": true,
   "mixed_precision": true,
   "torch_dtype": "bfloat16",
   "use_cpu_offload": false,
@@ -42,8 +42,7 @@
   "buffer": "bfloat16"
   },
   "fsdp_modules": [
-    "Attention",
-    "MLP"
+    "AttentionLayer"
   ],
   "use_activation_checkpointing": true,
   "use_flash_fft": true,
configuration_minitransformer.py CHANGED
@@ -8,12 +8,12 @@ class MiniTransformerConfig(PretrainedConfig):
         self,
         bsz: int = 1,
         n_embd: int = 768,
-        n_heads: int = 8,
-        n_layers: int = 25,
+        n_heads: int = 24,
+        n_layers: int = 27,
         seq_len: int = 8192,
-        window_size: int = 1024,
+        window_size: int = 8192,
         vocab_size: int = 200064,
-        mlp_scale: int = 12,
+        mlp_scale: int = 4,
         bias: bool = False,
         dropout: float = 0.0,
         softcap: float = 50.0,
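
Since MiniTransformerConfig subclasses PretrainedConfig, the updated defaults round-trip through the stock transformers serialization. A minimal sketch (the local path is just an example, and it assumes __init__ forwards **kwargs to super() as PretrainedConfig subclasses normally do):

from configuration_minitransformer import MiniTransformerConfig

# Instantiate with the post-commit defaults, then round-trip to disk.
cfg = MiniTransformerConfig()
cfg.save_pretrained("./Transformer_500M")   # writes config.json
reloaded = MiniTransformerConfig.from_pretrained("./Transformer_500M")

assert reloaded.n_heads == 24 and reloaded.n_layers == 27
assert reloaded.window_size == reloaded.seq_len == 8192
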