aixk committed on
Commit
ce2b3cc
·
1 Parent(s): 9669de6

FastPlus125m Geodesic Backup checkpoint-105 at global step 105

Browse files
checkpoint-105/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "FastPlus125mForCausalLM"
4
+ ],
5
+ "dtype": "float32",
6
+ "hidden_size": 768,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 2048,
9
+ "kd_alpha": 0.4,
10
+ "kd_temperature": 2.5,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "fastplus_125m",
13
+ "num_attention_heads": 12,
14
+ "num_hidden_layers": 16,
15
+ "tie_word_embeddings": true,
16
+ "transformers_version": "5.12.0",
17
+ "use_cache": false,
18
+ "vocab_size": 1792
19
+ }
checkpoint-105/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.12.0"
6
+ }
checkpoint-105/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d70ae1d4d421134a39bb7bb2497c9e0e29a6696d5439cde6980193f470bba78
3
+ size 475022672
checkpoint-105/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:940f8cc2971db7ee16e49a5cdcc7e1d9bfbca930991a9314512f9823f9f4c4c5
3
+ size 949694539
checkpoint-105/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934
3
+ size 14645
checkpoint-105/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0604fb6aed7728e4c26776057bda0c591a130bb89b1efd24f5a809be15d4fc7
3
+ size 1383
checkpoint-105/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:715d93f94f69cc2a9664de6febc48ab1cd682b7056ab9016d52301f7c2c872b2
3
+ size 7737
checkpoint-105/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-105/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "extra_special_tokens": [
6
+ "<|user|>",
7
+ "<|assistant|>"
8
+ ],
9
+ "is_local": false,
10
+ "local_files_only": false,
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "pad_token": "<pad>",
13
+ "tokenizer_class": "TokenizersBackend",
14
+ "unk_token": "<unk>"
15
+ }
checkpoint-105/trainer_state.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 15.0,
6
+ "eval_steps": 500,
7
+ "global_step": 105,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.4285714285714286,
14
+ "grad_norm": 35220.515625,
15
+ "learning_rate": 0.0,
16
+ "lorentz_dist_loss": 1.0369,
17
+ "loss": 12.631961822509766,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 2.857142857142857,
22
+ "grad_norm": 40192.5859375,
23
+ "learning_rate": 3.06e-06,
24
+ "lorentz_dist_loss": 1.0303,
25
+ "loss": 12.552436828613281,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 4.285714285714286,
30
+ "grad_norm": 27514.802734375,
31
+ "learning_rate": 6.46e-06,
32
+ "lorentz_dist_loss": 1.0012,
33
+ "loss": 12.204200744628906,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 5.714285714285714,
38
+ "grad_norm": 43463.203125,
39
+ "learning_rate": 9.86e-06,
40
+ "lorentz_dist_loss": 1.0322,
41
+ "loss": 12.575341033935548,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 7.142857142857143,
46
+ "grad_norm": 46014.42578125,
47
+ "learning_rate": 1.326e-05,
48
+ "lorentz_dist_loss": 0.9906,
49
+ "loss": 12.076918792724609,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 8.571428571428571,
54
+ "grad_norm": 21427.990234375,
55
+ "learning_rate": 1.6660000000000003e-05,
56
+ "lorentz_dist_loss": 0.9057,
57
+ "loss": 11.058214569091797,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 10.0,
62
+ "grad_norm": 12262.89453125,
63
+ "learning_rate": 2.006e-05,
64
+ "lorentz_dist_loss": 0.8058,
65
+ "loss": 9.858463287353516,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 11.428571428571429,
70
+ "grad_norm": 12813.310546875,
71
+ "learning_rate": 2.3460000000000002e-05,
72
+ "lorentz_dist_loss": 0.7448,
73
+ "loss": 9.127013397216796,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 12.857142857142858,
78
+ "grad_norm": 7791.78955078125,
79
+ "learning_rate": 2.6860000000000004e-05,
80
+ "lorentz_dist_loss": 0.7176,
81
+ "loss": 8.800634765625,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 14.285714285714286,
86
+ "grad_norm": 7794.1259765625,
87
+ "learning_rate": 3.026e-05,
88
+ "lorentz_dist_loss": 0.7014,
89
+ "loss": 8.605958557128906,
90
+ "step": 100
91
+ }
92
+ ],
93
+ "logging_steps": 10,
94
+ "max_steps": 105,
95
+ "num_input_tokens_seen": 0,
96
+ "num_train_epochs": 15,
97
+ "save_steps": 200,
98
+ "stateful_callbacks": {
99
+ "TrainerControl": {
100
+ "args": {
101
+ "should_epoch_stop": false,
102
+ "should_evaluate": false,
103
+ "should_log": false,
104
+ "should_save": true,
105
+ "should_training_stop": true
106
+ },
107
+ "attributes": {}
108
+ }
109
+ },
110
+ "total_flos": 896596143621120.0,
111
+ "train_batch_size": 12,
112
+ "trial_name": null,
113
+ "trial_params": null
114
+ }
checkpoint-105/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81294502804efcb3a9510bb4aef4a5dfafe7bd4697bb802b14ff6b17f0d6bc98
3
+ size 5201