Charlie81 commited on
Commit
1400f6b
·
1 Parent(s): d05c72b

delete checkpoints

Browse files
checkpoints/checkpoint-60/config.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "architectures": [
3
- "MyOlmoeForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "clip_qkv": null,
8
- "eos_token_id": 50279,
9
- "hidden_act": "silu",
10
- "hidden_size": 2048,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 1024,
13
- "max_position_embeddings": 4096,
14
- "max_small_expert_count": 64,
15
- "model_type": "olmoe",
16
- "norm_topk_prob": false,
17
- "num_attention_heads": 16,
18
- "num_experts": 64,
19
- "num_experts_per_tok": 2,
20
- "num_hidden_layers": 16,
21
- "num_key_value_heads": 16,
22
- "num_small_experts": 64,
23
- "output_router_logits": false,
24
- "pad_token_id": 1,
25
- "rms_norm_eps": 1e-05,
26
- "rope_scaling": null,
27
- "rope_theta": 10000.0,
28
- "router_aux_loss_coef": 0.01,
29
- "small_expert_count": 64,
30
- "small_expert_intermediate_ratio": 64,
31
- "small_expert_intermediate_size": 0,
32
- "small_expert_sparsity_coef": 0.1,
33
- "small_expert_strategy": "constant",
34
- "tie_word_embeddings": false,
35
- "torch_dtype": "bfloat16",
36
- "transformers_version": "4.55.2",
37
- "use_cache": true,
38
- "vocab_size": 50304
39
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-60/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "eos_token_id": 50279,
4
- "pad_token_id": 1,
5
- "transformers_version": "4.55.2"
6
- }
 
 
 
 
 
 
 
checkpoints/checkpoint-60/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a1b9651bad1a045a178e22cf198d5070c94d4374f6331086e38801fe8d88ca3
3
- size 4997482624
 
 
 
 
checkpoints/checkpoint-60/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:025d40cedfb2f13f4718def7d040eca03796140d639f599d9eca5bddda51839a
3
- size 4997867120
 
 
 
 
checkpoints/checkpoint-60/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:928fe194d9b0d49311de80482038d636aded9ebe83b373ca36e2fff712895265
3
- size 3856242664
 
 
 
 
checkpoints/checkpoint-60/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-60/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0363c4418f9eb060e1b46f43324bf9f6da0ab67754604a2fe1c3ae1d08a99c32
3
- size 25858571
 
 
 
 
checkpoints/checkpoint-60/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
- size 14645
 
 
 
 
checkpoints/checkpoint-60/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e1be529198179cd559ddcb4c59a9f665944a456be4a70f4f5dcf79350fe0534
3
- size 1465
 
 
 
 
checkpoints/checkpoint-60/trainer_state.json DELETED
@@ -1,76 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.0029433948380213027,
6
- "eval_steps": 500,
7
- "global_step": 60,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0004905658063368838,
14
- "grad_norm": 32.25,
15
- "learning_rate": 1.471550032701112e-07,
16
- "loss": 104.4204,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0009811316126737675,
21
- "grad_norm": 33.25,
22
- "learning_rate": 3.106605624591236e-07,
23
- "loss": 105.3427,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.0014716974190106514,
28
- "grad_norm": 37.25,
29
- "learning_rate": 4.7416612164813603e-07,
30
- "loss": 105.0909,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.001962263225347535,
35
- "grad_norm": 27.75,
36
- "learning_rate": 6.376716808371485e-07,
37
- "loss": 105.2529,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.002452829031684419,
42
- "grad_norm": 27.5,
43
- "learning_rate": 8.011772400261609e-07,
44
- "loss": 105.3971,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.0029433948380213027,
49
- "grad_norm": 30.75,
50
- "learning_rate": 9.646827992151733e-07,
51
- "loss": 105.0396,
52
- "step": 60
53
- }
54
- ],
55
- "logging_steps": 10,
56
- "max_steps": 61155,
57
- "num_input_tokens_seen": 0,
58
- "num_train_epochs": 3,
59
- "save_steps": 20,
60
- "stateful_callbacks": {
61
- "TrainerControl": {
62
- "args": {
63
- "should_epoch_stop": false,
64
- "should_evaluate": false,
65
- "should_log": false,
66
- "should_save": true,
67
- "should_training_stop": false
68
- },
69
- "attributes": {}
70
- }
71
- },
72
- "total_flos": 1.6096442745618432e+17,
73
- "train_batch_size": 2,
74
- "trial_name": null,
75
- "trial_params": null
76
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-60/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
3
- size 5713
 
 
 
 
checkpoints/checkpoint-80/config.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "architectures": [
3
- "MyOlmoeForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "clip_qkv": null,
8
- "eos_token_id": 50279,
9
- "hidden_act": "silu",
10
- "hidden_size": 2048,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 1024,
13
- "max_position_embeddings": 4096,
14
- "max_small_expert_count": 64,
15
- "model_type": "olmoe",
16
- "norm_topk_prob": false,
17
- "num_attention_heads": 16,
18
- "num_experts": 64,
19
- "num_experts_per_tok": 2,
20
- "num_hidden_layers": 16,
21
- "num_key_value_heads": 16,
22
- "num_small_experts": 64,
23
- "output_router_logits": false,
24
- "pad_token_id": 1,
25
- "rms_norm_eps": 1e-05,
26
- "rope_scaling": null,
27
- "rope_theta": 10000.0,
28
- "router_aux_loss_coef": 0.01,
29
- "small_expert_count": 64,
30
- "small_expert_intermediate_ratio": 64,
31
- "small_expert_intermediate_size": 0,
32
- "small_expert_sparsity_coef": 0.1,
33
- "small_expert_strategy": "constant",
34
- "tie_word_embeddings": false,
35
- "torch_dtype": "bfloat16",
36
- "transformers_version": "4.55.2",
37
- "use_cache": true,
38
- "vocab_size": 50304
39
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-80/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "eos_token_id": 50279,
4
- "pad_token_id": 1,
5
- "transformers_version": "4.55.2"
6
- }
 
 
 
 
 
 
 
checkpoints/checkpoint-80/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a1b9651bad1a045a178e22cf198d5070c94d4374f6331086e38801fe8d88ca3
3
- size 4997482624
 
 
 
 
checkpoints/checkpoint-80/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:43fb39d4048f4c7f63e0b0989fc374720af15131f2b290760d93b0eca2f0ca3e
3
- size 4997867120
 
 
 
 
checkpoints/checkpoint-80/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a9fcd7b16ae46a33e32a0eab4ea6b1a02770b62e98f7ffaba025c588e1336e6
3
- size 3856242664
 
 
 
 
checkpoints/checkpoint-80/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-80/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:358cc79253a54a16e91c5c08071aba70ca7494d5c48e423c55d77d2ba49212bf
3
- size 25858571
 
 
 
 
checkpoints/checkpoint-80/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
- size 14645
 
 
 
 
checkpoints/checkpoint-80/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d484e81fa22cdbcf66c2585a42950fa1db3e6db36dd8936a72f76c2844202008
3
- size 1465
 
 
 
 
checkpoints/checkpoint-80/trainer_state.json DELETED
@@ -1,90 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.00392452645069507,
6
- "eval_steps": 500,
7
- "global_step": 80,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0004905658063368838,
14
- "grad_norm": 32.25,
15
- "learning_rate": 1.471550032701112e-07,
16
- "loss": 104.4204,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0009811316126737675,
21
- "grad_norm": 33.25,
22
- "learning_rate": 3.106605624591236e-07,
23
- "loss": 105.3427,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.0014716974190106514,
28
- "grad_norm": 37.25,
29
- "learning_rate": 4.7416612164813603e-07,
30
- "loss": 105.0909,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.001962263225347535,
35
- "grad_norm": 27.75,
36
- "learning_rate": 6.376716808371485e-07,
37
- "loss": 105.2529,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.002452829031684419,
42
- "grad_norm": 27.5,
43
- "learning_rate": 8.011772400261609e-07,
44
- "loss": 105.3971,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.0029433948380213027,
49
- "grad_norm": 30.75,
50
- "learning_rate": 9.646827992151733e-07,
51
- "loss": 105.0396,
52
- "step": 60
53
- },
54
- {
55
- "epoch": 0.003433960644358187,
56
- "grad_norm": 27.75,
57
- "learning_rate": 1.1281883584041859e-06,
58
- "loss": 104.2232,
59
- "step": 70
60
- },
61
- {
62
- "epoch": 0.00392452645069507,
63
- "grad_norm": 29.625,
64
- "learning_rate": 1.2916939175931983e-06,
65
- "loss": 104.437,
66
- "step": 80
67
- }
68
- ],
69
- "logging_steps": 10,
70
- "max_steps": 61155,
71
- "num_input_tokens_seen": 0,
72
- "num_train_epochs": 3,
73
- "save_steps": 20,
74
- "stateful_callbacks": {
75
- "TrainerControl": {
76
- "args": {
77
- "should_epoch_stop": false,
78
- "should_evaluate": false,
79
- "should_log": false,
80
- "should_save": true,
81
- "should_training_stop": false
82
- },
83
- "attributes": {}
84
- }
85
- },
86
- "total_flos": 2.1461923660824576e+17,
87
- "train_batch_size": 2,
88
- "trial_name": null,
89
- "trial_params": null
90
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-80/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
3
- size 5713