ItsMaxNorm committed on
Commit
f26127b
·
verified ·
1 Parent(s): 2cf2eda

Training in progress, epoch 1

Browse files
Files changed (2) hide show
  1. config.json +13 -1
  2. training_args.bin +1 -1
config.json CHANGED
@@ -5,6 +5,7 @@
5
  "architectures": [
6
  "LLaDAModelLM"
7
  ],
 
8
  "attention_dropout": 0.0,
9
  "attention_layer_norm": false,
10
  "attention_layer_norm_with_affine": true,
@@ -16,21 +17,27 @@
16
  "bias_for_layer_norm": false,
17
  "block_group_size": 1,
18
  "block_type": "llama",
 
19
  "d_model": 4096,
20
  "embedding_dropout": 0.0,
21
  "embedding_size": 126464,
22
  "eos_token_id": 126081,
23
  "flash_attention": false,
 
 
24
  "include_bias": false,
25
  "include_qkv_bias": false,
26
  "init_cutoff_factor": null,
27
  "init_device": "meta",
28
  "init_fn": "mitchell",
29
  "init_std": 0.02,
 
30
  "input_emb_norm": false,
 
31
  "layer_norm_type": "rms",
32
  "layer_norm_with_affine": true,
33
  "mask_token_id": 126336,
 
34
  "max_sequence_length": 4096,
35
  "mlp_hidden_size": 12288,
36
  "mlp_ratio": 4,
@@ -39,16 +46,21 @@
39
  "n_heads": 32,
40
  "n_kv_heads": 32,
41
  "n_layers": 32,
 
 
 
42
  "pad_token_id": 126081,
43
  "precision": "amp_bf16",
 
44
  "residual_dropout": 0.0,
45
  "rms_norm_eps": 1e-05,
46
  "rope": true,
47
  "rope_full_precision": true,
 
48
  "rope_theta": 500000.0,
49
  "scale_logits": false,
 
50
  "torch_dtype": "float16",
51
- "train_max_sequence_length": 1024,
52
  "transformers_version": "4.52.3",
53
  "use_cache": false,
54
  "vocab_size": 126464,
 
5
  "architectures": [
6
  "LLaDAModelLM"
7
  ],
8
+ "attention_bias": false,
9
  "attention_dropout": 0.0,
10
  "attention_layer_norm": false,
11
  "attention_layer_norm_with_affine": true,
 
17
  "bias_for_layer_norm": false,
18
  "block_group_size": 1,
19
  "block_type": "llama",
20
+ "bos_token_id": 1,
21
  "d_model": 4096,
22
  "embedding_dropout": 0.0,
23
  "embedding_size": 126464,
24
  "eos_token_id": 126081,
25
  "flash_attention": false,
26
+ "hidden_act": "silu",
27
+ "hidden_size": 4096,
28
  "include_bias": false,
29
  "include_qkv_bias": false,
30
  "init_cutoff_factor": null,
31
  "init_device": "meta",
32
  "init_fn": "mitchell",
33
  "init_std": 0.02,
34
+ "initializer_range": 0.02,
35
  "input_emb_norm": false,
36
+ "intermediate_size": 11008,
37
  "layer_norm_type": "rms",
38
  "layer_norm_with_affine": true,
39
  "mask_token_id": 126336,
40
+ "max_position_embeddings": 2048,
41
  "max_sequence_length": 4096,
42
  "mlp_hidden_size": 12288,
43
  "mlp_ratio": 4,
 
46
  "n_heads": 32,
47
  "n_kv_heads": 32,
48
  "n_layers": 32,
49
+ "num_attention_heads": 32,
50
+ "num_hidden_layers": 32,
51
+ "num_key_value_heads": 32,
52
  "pad_token_id": 126081,
53
  "precision": "amp_bf16",
54
+ "pretraining_tp": 1,
55
  "residual_dropout": 0.0,
56
  "rms_norm_eps": 1e-05,
57
  "rope": true,
58
  "rope_full_precision": true,
59
+ "rope_scaling": null,
60
  "rope_theta": 500000.0,
61
  "scale_logits": false,
62
+ "tie_word_embeddings": false,
63
  "torch_dtype": "float16",
 
64
  "transformers_version": "4.52.3",
65
  "use_cache": false,
66
  "vocab_size": 126464,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5587f3f4ddbbaae1869a6389609b91a93f206cebb44db643d6043469496f5d97
3
  size 9784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8939327b78d6c988f6265ad644f6ff3b5353dabf95d8b23342868e91c99d877
3
  size 9784