maidacundo commited on
Commit
686faaf
·
verified ·
1 Parent(s): ab9e7c1

Training in progress, step 10, checkpoint

Browse files
last-checkpoint/config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OpenMythosForCausalLM"
4
+ ],
5
+ "bias": false,
6
+ "block_size": 256,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "effective_expected_depth": 10,
10
+ "eos_token_id": 2,
11
+ "head_dim": 64,
12
+ "init_values": {
13
+ "embed_scale": 16.0,
14
+ "embedding": 0.03952847075210474,
15
+ "out_proj": 0.008838834764831844,
16
+ "std": 0.03952847075210474
17
+ },
18
+ "injection_type": "lti",
19
+ "intermediate_size": 1024,
20
+ "kv_lora_rank": 512,
21
+ "mean_backprop_depth": 2,
22
+ "mean_recurrence": 4,
23
+ "model_type": "open_mythos",
24
+ "moe_intermediate_size": 1024,
25
+ "moe_top_k": 2,
26
+ "n_embd": 256,
27
+ "n_heads": 4,
28
+ "n_kv_heads": 4,
29
+ "n_layers": 4,
30
+ "n_layers_in_coda": 1,
31
+ "n_layers_in_prelude": 1,
32
+ "n_layers_in_recurrent_block": 2,
33
+ "n_routed_experts": 8,
34
+ "n_shared_experts": 2,
35
+ "norm_eps": 1e-06,
36
+ "num_key_value_heads": 4,
37
+ "pad_token_id": 0,
38
+ "q_lora_rank": 1536,
39
+ "qk_bias": true,
40
+ "rope_base": 50000.0,
41
+ "rope_head_dim": 64,
42
+ "state_init": "like-init",
43
+ "tie_embeddings": true,
44
+ "tie_word_embeddings": true,
45
+ "transformers_version": "5.6.0",
46
+ "use_cache": false,
47
+ "use_mla": false,
48
+ "use_moe": false,
49
+ "vocab_size": 50257
50
+ }
last-checkpoint/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.6.0"
9
+ }
last-checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cc1acb58f6552f17f47c9e79c248b5766ef129b9cc7004d0289960bcfa126ad
3
+ size 68602152
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2812cf244d8d46428667984a8e6ab4a612874e817adebcbd079c4d0b057e17c
3
+ size 137100235
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcd679242a831d4484be1b031fb6a525641a1324383532f6e1c8bb5ac52e4ce7
3
+ size 14455
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f931434598425c3d35e696dc1ef9a0302880efc75ad45f5e1fbe43cfc68f080
3
+ size 1465
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 10,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1,
14
+ "grad_norm": 3.904756546020508,
15
+ "learning_rate": 0.0,
16
+ "loss": 11.039416313171387,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.2,
21
+ "grad_norm": 2.952423572540283,
22
+ "learning_rate": 0.00015,
23
+ "loss": 11.029916763305664,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.3,
28
+ "grad_norm": 2.548621654510498,
29
+ "learning_rate": 0.0003,
30
+ "loss": 10.987221717834473,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.4,
35
+ "grad_norm": 2.724034309387207,
36
+ "learning_rate": 0.000288581929876693,
37
+ "loss": 10.806486129760742,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.5,
42
+ "grad_norm": 2.913846731185913,
43
+ "learning_rate": 0.00025606601717798207,
44
+ "loss": 10.382536888122559,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.6,
49
+ "grad_norm": 3.6747395992279053,
50
+ "learning_rate": 0.00020740251485476345,
51
+ "loss": 10.37621784210205,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.7,
56
+ "grad_norm": 2.790466785430908,
57
+ "learning_rate": 0.00015,
58
+ "loss": 10.06509780883789,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.8,
63
+ "grad_norm": 3.1298794746398926,
64
+ "learning_rate": 9.259748514523653e-05,
65
+ "loss": 10.32823371887207,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.9,
70
+ "grad_norm": 4.239963054656982,
71
+ "learning_rate": 4.3933982822017876e-05,
72
+ "loss": 9.877490997314453,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 1.0,
77
+ "grad_norm": 3.074774742126465,
78
+ "learning_rate": 1.1418070123306989e-05,
79
+ "loss": 9.85549545288086,
80
+ "step": 10
81
+ }
82
+ ],
83
+ "logging_steps": 1,
84
+ "max_steps": 10,
85
+ "num_input_tokens_seen": 0,
86
+ "num_train_epochs": 9223372036854775807,
87
+ "save_steps": 999999,
88
+ "stateful_callbacks": {
89
+ "TrainerControl": {
90
+ "args": {
91
+ "should_epoch_stop": false,
92
+ "should_evaluate": false,
93
+ "should_log": false,
94
+ "should_save": true,
95
+ "should_training_stop": true
96
+ },
97
+ "attributes": {}
98
+ }
99
+ },
100
+ "total_flos": 131090350080.0,
101
+ "train_batch_size": 2,
102
+ "trial_name": null,
103
+ "trial_params": null
104
+ }
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e7be4b80f34efa8c6ffa736724ae3152fa29c25aa950c50b57dfee4c09cee52
3
+ size 5329