QuantaSparkLabs commited on
Commit
2243c4f
·
verified ·
1 Parent(s): 8c23b4b

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-100/config.json +34 -0
  2. checkpoint-100/generation_config.json +10 -0
  3. checkpoint-100/model.safetensors +3 -0
  4. checkpoint-100/optimizer.pt +3 -0
  5. checkpoint-100/rng_state.pth +3 -0
  6. checkpoint-100/scheduler.pt +3 -0
  7. checkpoint-100/trainer_state.json +104 -0
  8. checkpoint-100/training_args.bin +3 -0
  9. checkpoint-1000/config.json +34 -0
  10. checkpoint-1000/generation_config.json +10 -0
  11. checkpoint-1000/model.safetensors +3 -0
  12. checkpoint-1000/optimizer.pt +3 -0
  13. checkpoint-1000/rng_state.pth +3 -0
  14. checkpoint-1000/scheduler.pt +3 -0
  15. checkpoint-1000/trainer_state.json +734 -0
  16. checkpoint-1000/training_args.bin +3 -0
  17. checkpoint-1050/config.json +34 -0
  18. checkpoint-1050/generation_config.json +10 -0
  19. checkpoint-1050/model.safetensors +3 -0
  20. checkpoint-1050/optimizer.pt +3 -0
  21. checkpoint-1050/rng_state.pth +3 -0
  22. checkpoint-1050/scheduler.pt +3 -0
  23. checkpoint-1050/trainer_state.json +769 -0
  24. checkpoint-1050/training_args.bin +3 -0
  25. checkpoint-1100/config.json +34 -0
  26. checkpoint-1100/generation_config.json +10 -0
  27. checkpoint-1100/model.safetensors +3 -0
  28. checkpoint-1100/optimizer.pt +3 -0
  29. checkpoint-1100/rng_state.pth +3 -0
  30. checkpoint-1100/scheduler.pt +3 -0
  31. checkpoint-1100/trainer_state.json +804 -0
  32. checkpoint-1100/training_args.bin +3 -0
  33. checkpoint-1150/config.json +34 -0
  34. checkpoint-1150/generation_config.json +10 -0
  35. checkpoint-1150/model.safetensors +3 -0
  36. checkpoint-1150/optimizer.pt +3 -0
  37. checkpoint-1150/rng_state.pth +3 -0
  38. checkpoint-1150/scheduler.pt +3 -0
  39. checkpoint-1150/trainer_state.json +839 -0
  40. checkpoint-1150/training_args.bin +3 -0
  41. checkpoint-1200/config.json +34 -0
  42. checkpoint-1200/generation_config.json +10 -0
  43. checkpoint-1200/model.safetensors +3 -0
  44. checkpoint-1200/optimizer.pt +3 -0
  45. checkpoint-1200/rng_state.pth +3 -0
  46. checkpoint-1200/scheduler.pt +3 -0
  47. checkpoint-1200/trainer_state.json +874 -0
  48. checkpoint-1200/training_args.bin +3 -0
  49. checkpoint-1250/config.json +34 -0
  50. checkpoint-1250/generation_config.json +10 -0
checkpoint-100/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-100/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }
checkpoint-100/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
3
+ size 1452
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e17dc36b91710af38919a82d28733c78849a9938e64d50b056fde953e08af55
3
+ size 13823
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:656c952bc98f1ba6483f4b602ab79d3a7eb64d231d7b2b6ae517f06e7e137155
3
+ size 14455
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a487ad8594fd184a1dbb5f7128f07682e4d70038d880a5d45dd29b502807a0b
3
+ size 1465
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 12.5,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.25,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 0.009989987484355445,
16
+ "loss": 0.0,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 2.5,
21
+ "grad_norm": 0.0,
22
+ "learning_rate": 0.009964956195244054,
23
+ "loss": 0.0,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 3.75,
28
+ "grad_norm": 0.0,
29
+ "learning_rate": 0.009939924906132666,
30
+ "loss": 0.0,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.0,
36
+ "learning_rate": 0.009914893617021277,
37
+ "loss": 0.0,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 6.25,
42
+ "grad_norm": 0.0,
43
+ "learning_rate": 0.009889862327909888,
44
+ "loss": 0.0,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 7.5,
49
+ "grad_norm": 0.0,
50
+ "learning_rate": 0.009864831038798498,
51
+ "loss": 0.0,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 8.75,
56
+ "grad_norm": 0.0,
57
+ "learning_rate": 0.009839799749687109,
58
+ "loss": 0.0,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 10.0,
63
+ "grad_norm": 0.0,
64
+ "learning_rate": 0.00981476846057572,
65
+ "loss": 0.0,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 11.25,
70
+ "grad_norm": 0.0,
71
+ "learning_rate": 0.00978973717146433,
72
+ "loss": 0.0,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 12.5,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 0.009764705882352941,
79
+ "loss": 0.0,
80
+ "step": 100
81
+ }
82
+ ],
83
+ "logging_steps": 10,
84
+ "max_steps": 4000,
85
+ "num_input_tokens_seen": 0,
86
+ "num_train_epochs": 500,
87
+ "save_steps": 50,
88
+ "stateful_callbacks": {
89
+ "TrainerControl": {
90
+ "args": {
91
+ "should_epoch_stop": false,
92
+ "should_evaluate": false,
93
+ "should_log": false,
94
+ "should_save": true,
95
+ "should_training_stop": false
96
+ },
97
+ "attributes": {}
98
+ }
99
+ },
100
+ "total_flos": 675648.0,
101
+ "train_batch_size": 64,
102
+ "trial_name": null,
103
+ "trial_params": null
104
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
3
+ size 5137
checkpoint-1000/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
3
+ size 1452
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4683cfa458233e00c198c54038d450bfd06ca52f719705e01fc34a4845b539a2
3
+ size 13823
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92683043c9a8610fa78e10e63d70e47ebd8152c60d1cab4d893b74a45bb5db4
3
+ size 14455
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:433f845f661e9be47ccaee189de347cf46b20ad6176b2cfd945b4c290cad9fc8
3
+ size 1465
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 125.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.25,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 0.009989987484355445,
16
+ "loss": 0.0,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 2.5,
21
+ "grad_norm": 0.0,
22
+ "learning_rate": 0.009964956195244054,
23
+ "loss": 0.0,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 3.75,
28
+ "grad_norm": 0.0,
29
+ "learning_rate": 0.009939924906132666,
30
+ "loss": 0.0,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.0,
36
+ "learning_rate": 0.009914893617021277,
37
+ "loss": 0.0,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 6.25,
42
+ "grad_norm": 0.0,
43
+ "learning_rate": 0.009889862327909888,
44
+ "loss": 0.0,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 7.5,
49
+ "grad_norm": 0.0,
50
+ "learning_rate": 0.009864831038798498,
51
+ "loss": 0.0,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 8.75,
56
+ "grad_norm": 0.0,
57
+ "learning_rate": 0.009839799749687109,
58
+ "loss": 0.0,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 10.0,
63
+ "grad_norm": 0.0,
64
+ "learning_rate": 0.00981476846057572,
65
+ "loss": 0.0,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 11.25,
70
+ "grad_norm": 0.0,
71
+ "learning_rate": 0.00978973717146433,
72
+ "loss": 0.0,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 12.5,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 0.009764705882352941,
79
+ "loss": 0.0,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 13.75,
84
+ "grad_norm": 0.0,
85
+ "learning_rate": 0.009739674593241552,
86
+ "loss": 0.0,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 15.0,
91
+ "grad_norm": 0.0,
92
+ "learning_rate": 0.009714643304130162,
93
+ "loss": 0.0,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 16.25,
98
+ "grad_norm": 0.0,
99
+ "learning_rate": 0.009689612015018775,
100
+ "loss": 0.0,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 17.5,
105
+ "grad_norm": 0.0,
106
+ "learning_rate": 0.009664580725907385,
107
+ "loss": 0.0,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 18.75,
112
+ "grad_norm": 0.0,
113
+ "learning_rate": 0.009639549436795996,
114
+ "loss": 0.0,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 20.0,
119
+ "grad_norm": 0.0,
120
+ "learning_rate": 0.009614518147684605,
121
+ "loss": 0.0,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 21.25,
126
+ "grad_norm": 0.0,
127
+ "learning_rate": 0.009589486858573217,
128
+ "loss": 0.0,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 22.5,
133
+ "grad_norm": 0.0,
134
+ "learning_rate": 0.009564455569461828,
135
+ "loss": 0.0,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 23.75,
140
+ "grad_norm": 0.0,
141
+ "learning_rate": 0.009539424280350439,
142
+ "loss": 0.0,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 25.0,
147
+ "grad_norm": 0.0,
148
+ "learning_rate": 0.00951439299123905,
149
+ "loss": 0.0,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 26.25,
154
+ "grad_norm": 0.0,
155
+ "learning_rate": 0.00948936170212766,
156
+ "loss": 0.0,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 27.5,
161
+ "grad_norm": 0.0,
162
+ "learning_rate": 0.00946433041301627,
163
+ "loss": 0.0,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 28.75,
168
+ "grad_norm": 0.0,
169
+ "learning_rate": 0.009439299123904881,
170
+ "loss": 0.0,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 30.0,
175
+ "grad_norm": 0.0,
176
+ "learning_rate": 0.009414267834793492,
177
+ "loss": 0.0,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 31.25,
182
+ "grad_norm": 0.0,
183
+ "learning_rate": 0.009389236545682102,
184
+ "loss": 0.0,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 32.5,
189
+ "grad_norm": 0.0,
190
+ "learning_rate": 0.009364205256570713,
191
+ "loss": 0.0,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 33.75,
196
+ "grad_norm": 0.0,
197
+ "learning_rate": 0.009339173967459325,
198
+ "loss": 0.0,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 35.0,
203
+ "grad_norm": 0.0,
204
+ "learning_rate": 0.009314142678347936,
205
+ "loss": 0.0,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 36.25,
210
+ "grad_norm": 0.0,
211
+ "learning_rate": 0.009289111389236547,
212
+ "loss": 0.0,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 37.5,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 0.009264080100125156,
219
+ "loss": 0.0,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 38.75,
224
+ "grad_norm": 0.0,
225
+ "learning_rate": 0.009239048811013768,
226
+ "loss": 0.0,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 40.0,
231
+ "grad_norm": 0.0,
232
+ "learning_rate": 0.009214017521902379,
233
+ "loss": 0.0,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 41.25,
238
+ "grad_norm": 0.0,
239
+ "learning_rate": 0.00918898623279099,
240
+ "loss": 0.0,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 42.5,
245
+ "grad_norm": 0.0,
246
+ "learning_rate": 0.0091639549436796,
247
+ "loss": 0.0,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 43.75,
252
+ "grad_norm": 0.0,
253
+ "learning_rate": 0.00913892365456821,
254
+ "loss": 0.0,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 45.0,
259
+ "grad_norm": 0.0,
260
+ "learning_rate": 0.009113892365456821,
261
+ "loss": 0.0,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 46.25,
266
+ "grad_norm": 0.0,
267
+ "learning_rate": 0.009088861076345432,
268
+ "loss": 0.0,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 47.5,
273
+ "grad_norm": 0.0,
274
+ "learning_rate": 0.009063829787234043,
275
+ "loss": 0.0,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 48.75,
280
+ "grad_norm": 0.0,
281
+ "learning_rate": 0.009038798498122653,
282
+ "loss": 0.0,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 50.0,
287
+ "grad_norm": 0.0,
288
+ "learning_rate": 0.009013767209011264,
289
+ "loss": 0.0,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 51.25,
294
+ "grad_norm": 0.0,
295
+ "learning_rate": 0.008988735919899874,
296
+ "loss": 0.0,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 52.5,
301
+ "grad_norm": 0.0,
302
+ "learning_rate": 0.008963704630788487,
303
+ "loss": 0.0,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 53.75,
308
+ "grad_norm": 0.0,
309
+ "learning_rate": 0.008938673341677096,
310
+ "loss": 0.0,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 55.0,
315
+ "grad_norm": 0.0,
316
+ "learning_rate": 0.008913642052565706,
317
+ "loss": 0.0,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 56.25,
322
+ "grad_norm": 0.0,
323
+ "learning_rate": 0.008888610763454317,
324
+ "loss": 0.0,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 57.5,
329
+ "grad_norm": 0.0,
330
+ "learning_rate": 0.00886357947434293,
331
+ "loss": 0.0,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 58.75,
336
+ "grad_norm": 0.0,
337
+ "learning_rate": 0.00883854818523154,
338
+ "loss": 0.0,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 60.0,
343
+ "grad_norm": 0.0,
344
+ "learning_rate": 0.00881351689612015,
345
+ "loss": 0.0,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 61.25,
350
+ "grad_norm": 0.0,
351
+ "learning_rate": 0.008788485607008761,
352
+ "loss": 0.0,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 62.5,
357
+ "grad_norm": 0.0,
358
+ "learning_rate": 0.008763454317897372,
359
+ "loss": 0.0,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 63.75,
364
+ "grad_norm": 0.0,
365
+ "learning_rate": 0.008738423028785983,
366
+ "loss": 0.0,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 65.0,
371
+ "grad_norm": 0.0,
372
+ "learning_rate": 0.008713391739674593,
373
+ "loss": 0.0,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 66.25,
378
+ "grad_norm": 0.0,
379
+ "learning_rate": 0.008688360450563204,
380
+ "loss": 0.0,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 67.5,
385
+ "grad_norm": 0.0,
386
+ "learning_rate": 0.008663329161451815,
387
+ "loss": 0.0,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 68.75,
392
+ "grad_norm": 0.0,
393
+ "learning_rate": 0.008638297872340425,
394
+ "loss": 0.0,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 70.0,
399
+ "grad_norm": 0.0,
400
+ "learning_rate": 0.008613266583229038,
401
+ "loss": 0.0,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 71.25,
406
+ "grad_norm": 0.0,
407
+ "learning_rate": 0.008588235294117647,
408
+ "loss": 0.0,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 72.5,
413
+ "grad_norm": 0.0,
414
+ "learning_rate": 0.008563204005006257,
415
+ "loss": 0.0,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 73.75,
420
+ "grad_norm": 0.0,
421
+ "learning_rate": 0.008538172715894868,
422
+ "loss": 0.0,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 75.0,
427
+ "grad_norm": 0.0,
428
+ "learning_rate": 0.00851314142678348,
429
+ "loss": 0.0,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 76.25,
434
+ "grad_norm": 0.0,
435
+ "learning_rate": 0.00848811013767209,
436
+ "loss": 0.0,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 77.5,
441
+ "grad_norm": 0.0,
442
+ "learning_rate": 0.008463078848560701,
443
+ "loss": 0.0,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 78.75,
448
+ "grad_norm": 0.0,
449
+ "learning_rate": 0.008438047559449312,
450
+ "loss": 0.0,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 80.0,
455
+ "grad_norm": 0.0,
456
+ "learning_rate": 0.008413016270337923,
457
+ "loss": 0.0,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 81.25,
462
+ "grad_norm": 0.0,
463
+ "learning_rate": 0.008387984981226533,
464
+ "loss": 0.0,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 82.5,
469
+ "grad_norm": 0.0,
470
+ "learning_rate": 0.008362953692115144,
471
+ "loss": 0.0,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 83.75,
476
+ "grad_norm": 0.0,
477
+ "learning_rate": 0.008337922403003755,
478
+ "loss": 0.0,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 85.0,
483
+ "grad_norm": 0.0,
484
+ "learning_rate": 0.008312891113892365,
485
+ "loss": 0.0,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 86.25,
490
+ "grad_norm": 0.0,
491
+ "learning_rate": 0.008287859824780976,
492
+ "loss": 0.0,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 87.5,
497
+ "grad_norm": 0.0,
498
+ "learning_rate": 0.008262828535669588,
499
+ "loss": 0.0,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 88.75,
504
+ "grad_norm": 0.0,
505
+ "learning_rate": 0.008237797246558197,
506
+ "loss": 0.0,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 90.0,
511
+ "grad_norm": 0.0,
512
+ "learning_rate": 0.008212765957446808,
513
+ "loss": 0.0,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 91.25,
518
+ "grad_norm": 0.0,
519
+ "learning_rate": 0.008187734668335419,
520
+ "loss": 0.0,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 92.5,
525
+ "grad_norm": 0.0,
526
+ "learning_rate": 0.008162703379224031,
527
+ "loss": 0.0,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 93.75,
532
+ "grad_norm": 0.0,
533
+ "learning_rate": 0.008137672090112642,
534
+ "loss": 0.0,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 95.0,
539
+ "grad_norm": 0.0,
540
+ "learning_rate": 0.008112640801001252,
541
+ "loss": 0.0,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 96.25,
546
+ "grad_norm": 0.0,
547
+ "learning_rate": 0.008087609511889863,
548
+ "loss": 0.0,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 97.5,
553
+ "grad_norm": 0.0,
554
+ "learning_rate": 0.008062578222778474,
555
+ "loss": 0.0,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 98.75,
560
+ "grad_norm": 0.0,
561
+ "learning_rate": 0.008037546933667084,
562
+ "loss": 0.0,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 100.0,
567
+ "grad_norm": 0.0,
568
+ "learning_rate": 0.008012515644555695,
569
+ "loss": 0.0,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 101.25,
574
+ "grad_norm": 0.0,
575
+ "learning_rate": 0.007987484355444305,
576
+ "loss": 0.0,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 102.5,
581
+ "grad_norm": 0.0,
582
+ "learning_rate": 0.007962453066332916,
583
+ "loss": 0.0,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 103.75,
588
+ "grad_norm": 0.0,
589
+ "learning_rate": 0.007937421777221527,
590
+ "loss": 0.0,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 105.0,
595
+ "grad_norm": 0.0,
596
+ "learning_rate": 0.007912390488110137,
597
+ "loss": 0.0,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 106.25,
602
+ "grad_norm": 0.0,
603
+ "learning_rate": 0.007887359198998748,
604
+ "loss": 0.0,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 107.5,
609
+ "grad_norm": 0.0,
610
+ "learning_rate": 0.007862327909887359,
611
+ "loss": 0.0,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 108.75,
616
+ "grad_norm": 0.0,
617
+ "learning_rate": 0.00783729662077597,
618
+ "loss": 0.0,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 110.0,
623
+ "grad_norm": 0.0,
624
+ "learning_rate": 0.007812265331664581,
625
+ "loss": 0.0,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 111.25,
630
+ "grad_norm": 0.0,
631
+ "learning_rate": 0.0077872340425531915,
632
+ "loss": 0.0,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 112.5,
637
+ "grad_norm": 0.0,
638
+ "learning_rate": 0.007762202753441803,
639
+ "loss": 0.0,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 113.75,
644
+ "grad_norm": 0.0,
645
+ "learning_rate": 0.007737171464330414,
646
+ "loss": 0.0,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 115.0,
651
+ "grad_norm": 0.0,
652
+ "learning_rate": 0.0077121401752190235,
653
+ "loss": 0.0,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 116.25,
658
+ "grad_norm": 0.0,
659
+ "learning_rate": 0.007687108886107634,
660
+ "loss": 0.0,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 117.5,
665
+ "grad_norm": 0.0,
666
+ "learning_rate": 0.007662077596996246,
667
+ "loss": 0.0,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 118.75,
672
+ "grad_norm": 0.0,
673
+ "learning_rate": 0.007637046307884856,
674
+ "loss": 0.0,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 120.0,
679
+ "grad_norm": 0.0,
680
+ "learning_rate": 0.007612015018773467,
681
+ "loss": 0.0,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 121.25,
686
+ "grad_norm": 0.0,
687
+ "learning_rate": 0.007586983729662078,
688
+ "loss": 0.0,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 122.5,
693
+ "grad_norm": 0.0,
694
+ "learning_rate": 0.007561952440550689,
695
+ "loss": 0.0,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 123.75,
700
+ "grad_norm": 0.0,
701
+ "learning_rate": 0.007536921151439299,
702
+ "loss": 0.0,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 125.0,
707
+ "grad_norm": 0.0,
708
+ "learning_rate": 0.0075118898623279095,
709
+ "loss": 0.0,
710
+ "step": 1000
711
+ }
712
+ ],
713
+ "logging_steps": 10,
714
+ "max_steps": 4000,
715
+ "num_input_tokens_seen": 0,
716
+ "num_train_epochs": 500,
717
+ "save_steps": 50,
718
+ "stateful_callbacks": {
719
+ "TrainerControl": {
720
+ "args": {
721
+ "should_epoch_stop": false,
722
+ "should_evaluate": false,
723
+ "should_log": false,
724
+ "should_save": true,
725
+ "should_training_stop": false
726
+ },
727
+ "attributes": {}
728
+ }
729
+ },
730
+ "total_flos": 6750000.0,
731
+ "train_batch_size": 64,
732
+ "trial_name": null,
733
+ "trial_params": null
734
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
3
+ size 5137
checkpoint-1050/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-1050/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }
checkpoint-1050/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
3
+ size 1452
checkpoint-1050/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0587ee5489e3230570b30b2c6399ad8da6204af076e1b7662ccdb717d4f61c53
3
+ size 13823
checkpoint-1050/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72dedc896a3c9245030a09cd03b9a14e0fdca07e3a751912053a18a37a5e6782
3
+ size 14455
checkpoint-1050/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82fa2536fa054f9bf5776f9ff6684daf9790146183edb054be6762da536dacb2
3
+ size 1465
checkpoint-1050/trainer_state.json ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 131.25,
6
+ "eval_steps": 500,
7
+ "global_step": 1050,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.25,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 0.009989987484355445,
16
+ "loss": 0.0,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 2.5,
21
+ "grad_norm": 0.0,
22
+ "learning_rate": 0.009964956195244054,
23
+ "loss": 0.0,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 3.75,
28
+ "grad_norm": 0.0,
29
+ "learning_rate": 0.009939924906132666,
30
+ "loss": 0.0,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.0,
36
+ "learning_rate": 0.009914893617021277,
37
+ "loss": 0.0,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 6.25,
42
+ "grad_norm": 0.0,
43
+ "learning_rate": 0.009889862327909888,
44
+ "loss": 0.0,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 7.5,
49
+ "grad_norm": 0.0,
50
+ "learning_rate": 0.009864831038798498,
51
+ "loss": 0.0,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 8.75,
56
+ "grad_norm": 0.0,
57
+ "learning_rate": 0.009839799749687109,
58
+ "loss": 0.0,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 10.0,
63
+ "grad_norm": 0.0,
64
+ "learning_rate": 0.00981476846057572,
65
+ "loss": 0.0,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 11.25,
70
+ "grad_norm": 0.0,
71
+ "learning_rate": 0.00978973717146433,
72
+ "loss": 0.0,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 12.5,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 0.009764705882352941,
79
+ "loss": 0.0,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 13.75,
84
+ "grad_norm": 0.0,
85
+ "learning_rate": 0.009739674593241552,
86
+ "loss": 0.0,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 15.0,
91
+ "grad_norm": 0.0,
92
+ "learning_rate": 0.009714643304130162,
93
+ "loss": 0.0,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 16.25,
98
+ "grad_norm": 0.0,
99
+ "learning_rate": 0.009689612015018775,
100
+ "loss": 0.0,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 17.5,
105
+ "grad_norm": 0.0,
106
+ "learning_rate": 0.009664580725907385,
107
+ "loss": 0.0,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 18.75,
112
+ "grad_norm": 0.0,
113
+ "learning_rate": 0.009639549436795996,
114
+ "loss": 0.0,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 20.0,
119
+ "grad_norm": 0.0,
120
+ "learning_rate": 0.009614518147684605,
121
+ "loss": 0.0,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 21.25,
126
+ "grad_norm": 0.0,
127
+ "learning_rate": 0.009589486858573217,
128
+ "loss": 0.0,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 22.5,
133
+ "grad_norm": 0.0,
134
+ "learning_rate": 0.009564455569461828,
135
+ "loss": 0.0,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 23.75,
140
+ "grad_norm": 0.0,
141
+ "learning_rate": 0.009539424280350439,
142
+ "loss": 0.0,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 25.0,
147
+ "grad_norm": 0.0,
148
+ "learning_rate": 0.00951439299123905,
149
+ "loss": 0.0,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 26.25,
154
+ "grad_norm": 0.0,
155
+ "learning_rate": 0.00948936170212766,
156
+ "loss": 0.0,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 27.5,
161
+ "grad_norm": 0.0,
162
+ "learning_rate": 0.00946433041301627,
163
+ "loss": 0.0,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 28.75,
168
+ "grad_norm": 0.0,
169
+ "learning_rate": 0.009439299123904881,
170
+ "loss": 0.0,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 30.0,
175
+ "grad_norm": 0.0,
176
+ "learning_rate": 0.009414267834793492,
177
+ "loss": 0.0,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 31.25,
182
+ "grad_norm": 0.0,
183
+ "learning_rate": 0.009389236545682102,
184
+ "loss": 0.0,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 32.5,
189
+ "grad_norm": 0.0,
190
+ "learning_rate": 0.009364205256570713,
191
+ "loss": 0.0,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 33.75,
196
+ "grad_norm": 0.0,
197
+ "learning_rate": 0.009339173967459325,
198
+ "loss": 0.0,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 35.0,
203
+ "grad_norm": 0.0,
204
+ "learning_rate": 0.009314142678347936,
205
+ "loss": 0.0,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 36.25,
210
+ "grad_norm": 0.0,
211
+ "learning_rate": 0.009289111389236547,
212
+ "loss": 0.0,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 37.5,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 0.009264080100125156,
219
+ "loss": 0.0,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 38.75,
224
+ "grad_norm": 0.0,
225
+ "learning_rate": 0.009239048811013768,
226
+ "loss": 0.0,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 40.0,
231
+ "grad_norm": 0.0,
232
+ "learning_rate": 0.009214017521902379,
233
+ "loss": 0.0,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 41.25,
238
+ "grad_norm": 0.0,
239
+ "learning_rate": 0.00918898623279099,
240
+ "loss": 0.0,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 42.5,
245
+ "grad_norm": 0.0,
246
+ "learning_rate": 0.0091639549436796,
247
+ "loss": 0.0,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 43.75,
252
+ "grad_norm": 0.0,
253
+ "learning_rate": 0.00913892365456821,
254
+ "loss": 0.0,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 45.0,
259
+ "grad_norm": 0.0,
260
+ "learning_rate": 0.009113892365456821,
261
+ "loss": 0.0,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 46.25,
266
+ "grad_norm": 0.0,
267
+ "learning_rate": 0.009088861076345432,
268
+ "loss": 0.0,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 47.5,
273
+ "grad_norm": 0.0,
274
+ "learning_rate": 0.009063829787234043,
275
+ "loss": 0.0,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 48.75,
280
+ "grad_norm": 0.0,
281
+ "learning_rate": 0.009038798498122653,
282
+ "loss": 0.0,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 50.0,
287
+ "grad_norm": 0.0,
288
+ "learning_rate": 0.009013767209011264,
289
+ "loss": 0.0,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 51.25,
294
+ "grad_norm": 0.0,
295
+ "learning_rate": 0.008988735919899874,
296
+ "loss": 0.0,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 52.5,
301
+ "grad_norm": 0.0,
302
+ "learning_rate": 0.008963704630788487,
303
+ "loss": 0.0,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 53.75,
308
+ "grad_norm": 0.0,
309
+ "learning_rate": 0.008938673341677096,
310
+ "loss": 0.0,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 55.0,
315
+ "grad_norm": 0.0,
316
+ "learning_rate": 0.008913642052565706,
317
+ "loss": 0.0,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 56.25,
322
+ "grad_norm": 0.0,
323
+ "learning_rate": 0.008888610763454317,
324
+ "loss": 0.0,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 57.5,
329
+ "grad_norm": 0.0,
330
+ "learning_rate": 0.00886357947434293,
331
+ "loss": 0.0,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 58.75,
336
+ "grad_norm": 0.0,
337
+ "learning_rate": 0.00883854818523154,
338
+ "loss": 0.0,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 60.0,
343
+ "grad_norm": 0.0,
344
+ "learning_rate": 0.00881351689612015,
345
+ "loss": 0.0,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 61.25,
350
+ "grad_norm": 0.0,
351
+ "learning_rate": 0.008788485607008761,
352
+ "loss": 0.0,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 62.5,
357
+ "grad_norm": 0.0,
358
+ "learning_rate": 0.008763454317897372,
359
+ "loss": 0.0,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 63.75,
364
+ "grad_norm": 0.0,
365
+ "learning_rate": 0.008738423028785983,
366
+ "loss": 0.0,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 65.0,
371
+ "grad_norm": 0.0,
372
+ "learning_rate": 0.008713391739674593,
373
+ "loss": 0.0,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 66.25,
378
+ "grad_norm": 0.0,
379
+ "learning_rate": 0.008688360450563204,
380
+ "loss": 0.0,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 67.5,
385
+ "grad_norm": 0.0,
386
+ "learning_rate": 0.008663329161451815,
387
+ "loss": 0.0,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 68.75,
392
+ "grad_norm": 0.0,
393
+ "learning_rate": 0.008638297872340425,
394
+ "loss": 0.0,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 70.0,
399
+ "grad_norm": 0.0,
400
+ "learning_rate": 0.008613266583229038,
401
+ "loss": 0.0,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 71.25,
406
+ "grad_norm": 0.0,
407
+ "learning_rate": 0.008588235294117647,
408
+ "loss": 0.0,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 72.5,
413
+ "grad_norm": 0.0,
414
+ "learning_rate": 0.008563204005006257,
415
+ "loss": 0.0,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 73.75,
420
+ "grad_norm": 0.0,
421
+ "learning_rate": 0.008538172715894868,
422
+ "loss": 0.0,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 75.0,
427
+ "grad_norm": 0.0,
428
+ "learning_rate": 0.00851314142678348,
429
+ "loss": 0.0,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 76.25,
434
+ "grad_norm": 0.0,
435
+ "learning_rate": 0.00848811013767209,
436
+ "loss": 0.0,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 77.5,
441
+ "grad_norm": 0.0,
442
+ "learning_rate": 0.008463078848560701,
443
+ "loss": 0.0,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 78.75,
448
+ "grad_norm": 0.0,
449
+ "learning_rate": 0.008438047559449312,
450
+ "loss": 0.0,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 80.0,
455
+ "grad_norm": 0.0,
456
+ "learning_rate": 0.008413016270337923,
457
+ "loss": 0.0,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 81.25,
462
+ "grad_norm": 0.0,
463
+ "learning_rate": 0.008387984981226533,
464
+ "loss": 0.0,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 82.5,
469
+ "grad_norm": 0.0,
470
+ "learning_rate": 0.008362953692115144,
471
+ "loss": 0.0,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 83.75,
476
+ "grad_norm": 0.0,
477
+ "learning_rate": 0.008337922403003755,
478
+ "loss": 0.0,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 85.0,
483
+ "grad_norm": 0.0,
484
+ "learning_rate": 0.008312891113892365,
485
+ "loss": 0.0,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 86.25,
490
+ "grad_norm": 0.0,
491
+ "learning_rate": 0.008287859824780976,
492
+ "loss": 0.0,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 87.5,
497
+ "grad_norm": 0.0,
498
+ "learning_rate": 0.008262828535669588,
499
+ "loss": 0.0,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 88.75,
504
+ "grad_norm": 0.0,
505
+ "learning_rate": 0.008237797246558197,
506
+ "loss": 0.0,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 90.0,
511
+ "grad_norm": 0.0,
512
+ "learning_rate": 0.008212765957446808,
513
+ "loss": 0.0,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 91.25,
518
+ "grad_norm": 0.0,
519
+ "learning_rate": 0.008187734668335419,
520
+ "loss": 0.0,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 92.5,
525
+ "grad_norm": 0.0,
526
+ "learning_rate": 0.008162703379224031,
527
+ "loss": 0.0,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 93.75,
532
+ "grad_norm": 0.0,
533
+ "learning_rate": 0.008137672090112642,
534
+ "loss": 0.0,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 95.0,
539
+ "grad_norm": 0.0,
540
+ "learning_rate": 0.008112640801001252,
541
+ "loss": 0.0,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 96.25,
546
+ "grad_norm": 0.0,
547
+ "learning_rate": 0.008087609511889863,
548
+ "loss": 0.0,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 97.5,
553
+ "grad_norm": 0.0,
554
+ "learning_rate": 0.008062578222778474,
555
+ "loss": 0.0,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 98.75,
560
+ "grad_norm": 0.0,
561
+ "learning_rate": 0.008037546933667084,
562
+ "loss": 0.0,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 100.0,
567
+ "grad_norm": 0.0,
568
+ "learning_rate": 0.008012515644555695,
569
+ "loss": 0.0,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 101.25,
574
+ "grad_norm": 0.0,
575
+ "learning_rate": 0.007987484355444305,
576
+ "loss": 0.0,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 102.5,
581
+ "grad_norm": 0.0,
582
+ "learning_rate": 0.007962453066332916,
583
+ "loss": 0.0,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 103.75,
588
+ "grad_norm": 0.0,
589
+ "learning_rate": 0.007937421777221527,
590
+ "loss": 0.0,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 105.0,
595
+ "grad_norm": 0.0,
596
+ "learning_rate": 0.007912390488110137,
597
+ "loss": 0.0,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 106.25,
602
+ "grad_norm": 0.0,
603
+ "learning_rate": 0.007887359198998748,
604
+ "loss": 0.0,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 107.5,
609
+ "grad_norm": 0.0,
610
+ "learning_rate": 0.007862327909887359,
611
+ "loss": 0.0,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 108.75,
616
+ "grad_norm": 0.0,
617
+ "learning_rate": 0.00783729662077597,
618
+ "loss": 0.0,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 110.0,
623
+ "grad_norm": 0.0,
624
+ "learning_rate": 0.007812265331664581,
625
+ "loss": 0.0,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 111.25,
630
+ "grad_norm": 0.0,
631
+ "learning_rate": 0.0077872340425531915,
632
+ "loss": 0.0,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 112.5,
637
+ "grad_norm": 0.0,
638
+ "learning_rate": 0.007762202753441803,
639
+ "loss": 0.0,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 113.75,
644
+ "grad_norm": 0.0,
645
+ "learning_rate": 0.007737171464330414,
646
+ "loss": 0.0,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 115.0,
651
+ "grad_norm": 0.0,
652
+ "learning_rate": 0.0077121401752190235,
653
+ "loss": 0.0,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 116.25,
658
+ "grad_norm": 0.0,
659
+ "learning_rate": 0.007687108886107634,
660
+ "loss": 0.0,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 117.5,
665
+ "grad_norm": 0.0,
666
+ "learning_rate": 0.007662077596996246,
667
+ "loss": 0.0,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 118.75,
672
+ "grad_norm": 0.0,
673
+ "learning_rate": 0.007637046307884856,
674
+ "loss": 0.0,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 120.0,
679
+ "grad_norm": 0.0,
680
+ "learning_rate": 0.007612015018773467,
681
+ "loss": 0.0,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 121.25,
686
+ "grad_norm": 0.0,
687
+ "learning_rate": 0.007586983729662078,
688
+ "loss": 0.0,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 122.5,
693
+ "grad_norm": 0.0,
694
+ "learning_rate": 0.007561952440550689,
695
+ "loss": 0.0,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 123.75,
700
+ "grad_norm": 0.0,
701
+ "learning_rate": 0.007536921151439299,
702
+ "loss": 0.0,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 125.0,
707
+ "grad_norm": 0.0,
708
+ "learning_rate": 0.0075118898623279095,
709
+ "loss": 0.0,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 126.25,
714
+ "grad_norm": 0.0,
715
+ "learning_rate": 0.007486858573216521,
716
+ "loss": 0.0,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 127.5,
721
+ "grad_norm": 0.0,
722
+ "learning_rate": 0.007461827284105132,
723
+ "loss": 0.0,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 128.75,
728
+ "grad_norm": 0.0,
729
+ "learning_rate": 0.007436795994993742,
730
+ "loss": 0.0,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 130.0,
735
+ "grad_norm": 0.0,
736
+ "learning_rate": 0.007411764705882354,
737
+ "loss": 0.0,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 131.25,
742
+ "grad_norm": 0.0,
743
+ "learning_rate": 0.0073867334167709645,
744
+ "loss": 0.0,
745
+ "step": 1050
746
+ }
747
+ ],
748
+ "logging_steps": 10,
749
+ "max_steps": 4000,
750
+ "num_input_tokens_seen": 0,
751
+ "num_train_epochs": 500,
752
+ "save_steps": 50,
753
+ "stateful_callbacks": {
754
+ "TrainerControl": {
755
+ "args": {
756
+ "should_epoch_stop": false,
757
+ "should_evaluate": false,
758
+ "should_log": false,
759
+ "should_save": true,
760
+ "should_training_stop": false
761
+ },
762
+ "attributes": {}
763
+ }
764
+ },
765
+ "total_flos": 7087824.0,
766
+ "train_batch_size": 64,
767
+ "trial_name": null,
768
+ "trial_params": null
769
+ }
checkpoint-1050/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
3
+ size 5137
checkpoint-1100/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-1100/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }
checkpoint-1100/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
3
+ size 1452
checkpoint-1100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecfcf884ab0004c59872c46381bfe51f9a073a67de02523a5acf9692d5ba866
3
+ size 13823
checkpoint-1100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fc2de446fe61209ed367ff94c7ffb565e5e69564436f15adf49d20829abf178
3
+ size 14455
checkpoint-1100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb7541fe30b75a402f49ee32b1224b2f0f71b7e0c5a8d479bb8a263674caa2b4
3
+ size 1465
checkpoint-1100/trainer_state.json ADDED
@@ -0,0 +1,804 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 137.5,
6
+ "eval_steps": 500,
7
+ "global_step": 1100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.25,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 0.009989987484355445,
16
+ "loss": 0.0,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 2.5,
21
+ "grad_norm": 0.0,
22
+ "learning_rate": 0.009964956195244054,
23
+ "loss": 0.0,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 3.75,
28
+ "grad_norm": 0.0,
29
+ "learning_rate": 0.009939924906132666,
30
+ "loss": 0.0,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.0,
36
+ "learning_rate": 0.009914893617021277,
37
+ "loss": 0.0,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 6.25,
42
+ "grad_norm": 0.0,
43
+ "learning_rate": 0.009889862327909888,
44
+ "loss": 0.0,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 7.5,
49
+ "grad_norm": 0.0,
50
+ "learning_rate": 0.009864831038798498,
51
+ "loss": 0.0,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 8.75,
56
+ "grad_norm": 0.0,
57
+ "learning_rate": 0.009839799749687109,
58
+ "loss": 0.0,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 10.0,
63
+ "grad_norm": 0.0,
64
+ "learning_rate": 0.00981476846057572,
65
+ "loss": 0.0,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 11.25,
70
+ "grad_norm": 0.0,
71
+ "learning_rate": 0.00978973717146433,
72
+ "loss": 0.0,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 12.5,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 0.009764705882352941,
79
+ "loss": 0.0,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 13.75,
84
+ "grad_norm": 0.0,
85
+ "learning_rate": 0.009739674593241552,
86
+ "loss": 0.0,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 15.0,
91
+ "grad_norm": 0.0,
92
+ "learning_rate": 0.009714643304130162,
93
+ "loss": 0.0,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 16.25,
98
+ "grad_norm": 0.0,
99
+ "learning_rate": 0.009689612015018775,
100
+ "loss": 0.0,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 17.5,
105
+ "grad_norm": 0.0,
106
+ "learning_rate": 0.009664580725907385,
107
+ "loss": 0.0,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 18.75,
112
+ "grad_norm": 0.0,
113
+ "learning_rate": 0.009639549436795996,
114
+ "loss": 0.0,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 20.0,
119
+ "grad_norm": 0.0,
120
+ "learning_rate": 0.009614518147684605,
121
+ "loss": 0.0,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 21.25,
126
+ "grad_norm": 0.0,
127
+ "learning_rate": 0.009589486858573217,
128
+ "loss": 0.0,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 22.5,
133
+ "grad_norm": 0.0,
134
+ "learning_rate": 0.009564455569461828,
135
+ "loss": 0.0,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 23.75,
140
+ "grad_norm": 0.0,
141
+ "learning_rate": 0.009539424280350439,
142
+ "loss": 0.0,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 25.0,
147
+ "grad_norm": 0.0,
148
+ "learning_rate": 0.00951439299123905,
149
+ "loss": 0.0,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 26.25,
154
+ "grad_norm": 0.0,
155
+ "learning_rate": 0.00948936170212766,
156
+ "loss": 0.0,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 27.5,
161
+ "grad_norm": 0.0,
162
+ "learning_rate": 0.00946433041301627,
163
+ "loss": 0.0,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 28.75,
168
+ "grad_norm": 0.0,
169
+ "learning_rate": 0.009439299123904881,
170
+ "loss": 0.0,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 30.0,
175
+ "grad_norm": 0.0,
176
+ "learning_rate": 0.009414267834793492,
177
+ "loss": 0.0,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 31.25,
182
+ "grad_norm": 0.0,
183
+ "learning_rate": 0.009389236545682102,
184
+ "loss": 0.0,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 32.5,
189
+ "grad_norm": 0.0,
190
+ "learning_rate": 0.009364205256570713,
191
+ "loss": 0.0,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 33.75,
196
+ "grad_norm": 0.0,
197
+ "learning_rate": 0.009339173967459325,
198
+ "loss": 0.0,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 35.0,
203
+ "grad_norm": 0.0,
204
+ "learning_rate": 0.009314142678347936,
205
+ "loss": 0.0,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 36.25,
210
+ "grad_norm": 0.0,
211
+ "learning_rate": 0.009289111389236547,
212
+ "loss": 0.0,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 37.5,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 0.009264080100125156,
219
+ "loss": 0.0,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 38.75,
224
+ "grad_norm": 0.0,
225
+ "learning_rate": 0.009239048811013768,
226
+ "loss": 0.0,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 40.0,
231
+ "grad_norm": 0.0,
232
+ "learning_rate": 0.009214017521902379,
233
+ "loss": 0.0,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 41.25,
238
+ "grad_norm": 0.0,
239
+ "learning_rate": 0.00918898623279099,
240
+ "loss": 0.0,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 42.5,
245
+ "grad_norm": 0.0,
246
+ "learning_rate": 0.0091639549436796,
247
+ "loss": 0.0,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 43.75,
252
+ "grad_norm": 0.0,
253
+ "learning_rate": 0.00913892365456821,
254
+ "loss": 0.0,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 45.0,
259
+ "grad_norm": 0.0,
260
+ "learning_rate": 0.009113892365456821,
261
+ "loss": 0.0,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 46.25,
266
+ "grad_norm": 0.0,
267
+ "learning_rate": 0.009088861076345432,
268
+ "loss": 0.0,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 47.5,
273
+ "grad_norm": 0.0,
274
+ "learning_rate": 0.009063829787234043,
275
+ "loss": 0.0,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 48.75,
280
+ "grad_norm": 0.0,
281
+ "learning_rate": 0.009038798498122653,
282
+ "loss": 0.0,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 50.0,
287
+ "grad_norm": 0.0,
288
+ "learning_rate": 0.009013767209011264,
289
+ "loss": 0.0,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 51.25,
294
+ "grad_norm": 0.0,
295
+ "learning_rate": 0.008988735919899874,
296
+ "loss": 0.0,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 52.5,
301
+ "grad_norm": 0.0,
302
+ "learning_rate": 0.008963704630788487,
303
+ "loss": 0.0,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 53.75,
308
+ "grad_norm": 0.0,
309
+ "learning_rate": 0.008938673341677096,
310
+ "loss": 0.0,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 55.0,
315
+ "grad_norm": 0.0,
316
+ "learning_rate": 0.008913642052565706,
317
+ "loss": 0.0,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 56.25,
322
+ "grad_norm": 0.0,
323
+ "learning_rate": 0.008888610763454317,
324
+ "loss": 0.0,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 57.5,
329
+ "grad_norm": 0.0,
330
+ "learning_rate": 0.00886357947434293,
331
+ "loss": 0.0,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 58.75,
336
+ "grad_norm": 0.0,
337
+ "learning_rate": 0.00883854818523154,
338
+ "loss": 0.0,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 60.0,
343
+ "grad_norm": 0.0,
344
+ "learning_rate": 0.00881351689612015,
345
+ "loss": 0.0,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 61.25,
350
+ "grad_norm": 0.0,
351
+ "learning_rate": 0.008788485607008761,
352
+ "loss": 0.0,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 62.5,
357
+ "grad_norm": 0.0,
358
+ "learning_rate": 0.008763454317897372,
359
+ "loss": 0.0,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 63.75,
364
+ "grad_norm": 0.0,
365
+ "learning_rate": 0.008738423028785983,
366
+ "loss": 0.0,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 65.0,
371
+ "grad_norm": 0.0,
372
+ "learning_rate": 0.008713391739674593,
373
+ "loss": 0.0,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 66.25,
378
+ "grad_norm": 0.0,
379
+ "learning_rate": 0.008688360450563204,
380
+ "loss": 0.0,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 67.5,
385
+ "grad_norm": 0.0,
386
+ "learning_rate": 0.008663329161451815,
387
+ "loss": 0.0,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 68.75,
392
+ "grad_norm": 0.0,
393
+ "learning_rate": 0.008638297872340425,
394
+ "loss": 0.0,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 70.0,
399
+ "grad_norm": 0.0,
400
+ "learning_rate": 0.008613266583229038,
401
+ "loss": 0.0,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 71.25,
406
+ "grad_norm": 0.0,
407
+ "learning_rate": 0.008588235294117647,
408
+ "loss": 0.0,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 72.5,
413
+ "grad_norm": 0.0,
414
+ "learning_rate": 0.008563204005006257,
415
+ "loss": 0.0,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 73.75,
420
+ "grad_norm": 0.0,
421
+ "learning_rate": 0.008538172715894868,
422
+ "loss": 0.0,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 75.0,
427
+ "grad_norm": 0.0,
428
+ "learning_rate": 0.00851314142678348,
429
+ "loss": 0.0,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 76.25,
434
+ "grad_norm": 0.0,
435
+ "learning_rate": 0.00848811013767209,
436
+ "loss": 0.0,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 77.5,
441
+ "grad_norm": 0.0,
442
+ "learning_rate": 0.008463078848560701,
443
+ "loss": 0.0,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 78.75,
448
+ "grad_norm": 0.0,
449
+ "learning_rate": 0.008438047559449312,
450
+ "loss": 0.0,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 80.0,
455
+ "grad_norm": 0.0,
456
+ "learning_rate": 0.008413016270337923,
457
+ "loss": 0.0,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 81.25,
462
+ "grad_norm": 0.0,
463
+ "learning_rate": 0.008387984981226533,
464
+ "loss": 0.0,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 82.5,
469
+ "grad_norm": 0.0,
470
+ "learning_rate": 0.008362953692115144,
471
+ "loss": 0.0,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 83.75,
476
+ "grad_norm": 0.0,
477
+ "learning_rate": 0.008337922403003755,
478
+ "loss": 0.0,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 85.0,
483
+ "grad_norm": 0.0,
484
+ "learning_rate": 0.008312891113892365,
485
+ "loss": 0.0,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 86.25,
490
+ "grad_norm": 0.0,
491
+ "learning_rate": 0.008287859824780976,
492
+ "loss": 0.0,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 87.5,
497
+ "grad_norm": 0.0,
498
+ "learning_rate": 0.008262828535669588,
499
+ "loss": 0.0,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 88.75,
504
+ "grad_norm": 0.0,
505
+ "learning_rate": 0.008237797246558197,
506
+ "loss": 0.0,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 90.0,
511
+ "grad_norm": 0.0,
512
+ "learning_rate": 0.008212765957446808,
513
+ "loss": 0.0,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 91.25,
518
+ "grad_norm": 0.0,
519
+ "learning_rate": 0.008187734668335419,
520
+ "loss": 0.0,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 92.5,
525
+ "grad_norm": 0.0,
526
+ "learning_rate": 0.008162703379224031,
527
+ "loss": 0.0,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 93.75,
532
+ "grad_norm": 0.0,
533
+ "learning_rate": 0.008137672090112642,
534
+ "loss": 0.0,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 95.0,
539
+ "grad_norm": 0.0,
540
+ "learning_rate": 0.008112640801001252,
541
+ "loss": 0.0,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 96.25,
546
+ "grad_norm": 0.0,
547
+ "learning_rate": 0.008087609511889863,
548
+ "loss": 0.0,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 97.5,
553
+ "grad_norm": 0.0,
554
+ "learning_rate": 0.008062578222778474,
555
+ "loss": 0.0,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 98.75,
560
+ "grad_norm": 0.0,
561
+ "learning_rate": 0.008037546933667084,
562
+ "loss": 0.0,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 100.0,
567
+ "grad_norm": 0.0,
568
+ "learning_rate": 0.008012515644555695,
569
+ "loss": 0.0,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 101.25,
574
+ "grad_norm": 0.0,
575
+ "learning_rate": 0.007987484355444305,
576
+ "loss": 0.0,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 102.5,
581
+ "grad_norm": 0.0,
582
+ "learning_rate": 0.007962453066332916,
583
+ "loss": 0.0,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 103.75,
588
+ "grad_norm": 0.0,
589
+ "learning_rate": 0.007937421777221527,
590
+ "loss": 0.0,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 105.0,
595
+ "grad_norm": 0.0,
596
+ "learning_rate": 0.007912390488110137,
597
+ "loss": 0.0,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 106.25,
602
+ "grad_norm": 0.0,
603
+ "learning_rate": 0.007887359198998748,
604
+ "loss": 0.0,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 107.5,
609
+ "grad_norm": 0.0,
610
+ "learning_rate": 0.007862327909887359,
611
+ "loss": 0.0,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 108.75,
616
+ "grad_norm": 0.0,
617
+ "learning_rate": 0.00783729662077597,
618
+ "loss": 0.0,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 110.0,
623
+ "grad_norm": 0.0,
624
+ "learning_rate": 0.007812265331664581,
625
+ "loss": 0.0,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 111.25,
630
+ "grad_norm": 0.0,
631
+ "learning_rate": 0.0077872340425531915,
632
+ "loss": 0.0,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 112.5,
637
+ "grad_norm": 0.0,
638
+ "learning_rate": 0.007762202753441803,
639
+ "loss": 0.0,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 113.75,
644
+ "grad_norm": 0.0,
645
+ "learning_rate": 0.007737171464330414,
646
+ "loss": 0.0,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 115.0,
651
+ "grad_norm": 0.0,
652
+ "learning_rate": 0.0077121401752190235,
653
+ "loss": 0.0,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 116.25,
658
+ "grad_norm": 0.0,
659
+ "learning_rate": 0.007687108886107634,
660
+ "loss": 0.0,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 117.5,
665
+ "grad_norm": 0.0,
666
+ "learning_rate": 0.007662077596996246,
667
+ "loss": 0.0,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 118.75,
672
+ "grad_norm": 0.0,
673
+ "learning_rate": 0.007637046307884856,
674
+ "loss": 0.0,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 120.0,
679
+ "grad_norm": 0.0,
680
+ "learning_rate": 0.007612015018773467,
681
+ "loss": 0.0,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 121.25,
686
+ "grad_norm": 0.0,
687
+ "learning_rate": 0.007586983729662078,
688
+ "loss": 0.0,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 122.5,
693
+ "grad_norm": 0.0,
694
+ "learning_rate": 0.007561952440550689,
695
+ "loss": 0.0,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 123.75,
700
+ "grad_norm": 0.0,
701
+ "learning_rate": 0.007536921151439299,
702
+ "loss": 0.0,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 125.0,
707
+ "grad_norm": 0.0,
708
+ "learning_rate": 0.0075118898623279095,
709
+ "loss": 0.0,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 126.25,
714
+ "grad_norm": 0.0,
715
+ "learning_rate": 0.007486858573216521,
716
+ "loss": 0.0,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 127.5,
721
+ "grad_norm": 0.0,
722
+ "learning_rate": 0.007461827284105132,
723
+ "loss": 0.0,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 128.75,
728
+ "grad_norm": 0.0,
729
+ "learning_rate": 0.007436795994993742,
730
+ "loss": 0.0,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 130.0,
735
+ "grad_norm": 0.0,
736
+ "learning_rate": 0.007411764705882354,
737
+ "loss": 0.0,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 131.25,
742
+ "grad_norm": 0.0,
743
+ "learning_rate": 0.0073867334167709645,
744
+ "loss": 0.0,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 132.5,
749
+ "grad_norm": 0.0,
750
+ "learning_rate": 0.007361702127659574,
751
+ "loss": 0.0,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 133.75,
756
+ "grad_norm": 0.0,
757
+ "learning_rate": 0.007336670838548185,
758
+ "loss": 0.0,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 135.0,
763
+ "grad_norm": 0.0,
764
+ "learning_rate": 0.007311639549436796,
765
+ "loss": 0.0,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 136.25,
770
+ "grad_norm": 0.0,
771
+ "learning_rate": 0.007286608260325407,
772
+ "loss": 0.0,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 137.5,
777
+ "grad_norm": 0.0,
778
+ "learning_rate": 0.007261576971214018,
779
+ "loss": 0.0,
780
+ "step": 1100
781
+ }
782
+ ],
783
+ "logging_steps": 10,
784
+ "max_steps": 4000,
785
+ "num_input_tokens_seen": 0,
786
+ "num_train_epochs": 500,
787
+ "save_steps": 50,
788
+ "stateful_callbacks": {
789
+ "TrainerControl": {
790
+ "args": {
791
+ "should_epoch_stop": false,
792
+ "should_evaluate": false,
793
+ "should_log": false,
794
+ "should_save": true,
795
+ "should_training_stop": false
796
+ },
797
+ "attributes": {}
798
+ }
799
+ },
800
+ "total_flos": 7425648.0,
801
+ "train_batch_size": 64,
802
+ "trial_name": null,
803
+ "trial_params": null
804
+ }
checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
3
+ size 5137
checkpoint-1150/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-1150/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }
checkpoint-1150/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
3
+ size 1452
checkpoint-1150/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c6f07608caca2628e4e7d4f07ad180212b4eccace16f6c14e1ba4b9c0752cd6
3
+ size 13823
checkpoint-1150/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adc2989b41697bc86300e49c8243db2e3b464f9053a52911e9b2ba6e76a2eee9
3
+ size 14455
checkpoint-1150/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d053bc3befa0c54cb32d83b5ef57ea57d883adf0189ecfb06b92af8c072919c
3
+ size 1465
checkpoint-1150/trainer_state.json ADDED
@@ -0,0 +1,839 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 143.75,
6
+ "eval_steps": 500,
7
+ "global_step": 1150,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.25,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 0.009989987484355445,
16
+ "loss": 0.0,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 2.5,
21
+ "grad_norm": 0.0,
22
+ "learning_rate": 0.009964956195244054,
23
+ "loss": 0.0,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 3.75,
28
+ "grad_norm": 0.0,
29
+ "learning_rate": 0.009939924906132666,
30
+ "loss": 0.0,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.0,
36
+ "learning_rate": 0.009914893617021277,
37
+ "loss": 0.0,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 6.25,
42
+ "grad_norm": 0.0,
43
+ "learning_rate": 0.009889862327909888,
44
+ "loss": 0.0,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 7.5,
49
+ "grad_norm": 0.0,
50
+ "learning_rate": 0.009864831038798498,
51
+ "loss": 0.0,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 8.75,
56
+ "grad_norm": 0.0,
57
+ "learning_rate": 0.009839799749687109,
58
+ "loss": 0.0,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 10.0,
63
+ "grad_norm": 0.0,
64
+ "learning_rate": 0.00981476846057572,
65
+ "loss": 0.0,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 11.25,
70
+ "grad_norm": 0.0,
71
+ "learning_rate": 0.00978973717146433,
72
+ "loss": 0.0,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 12.5,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 0.009764705882352941,
79
+ "loss": 0.0,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 13.75,
84
+ "grad_norm": 0.0,
85
+ "learning_rate": 0.009739674593241552,
86
+ "loss": 0.0,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 15.0,
91
+ "grad_norm": 0.0,
92
+ "learning_rate": 0.009714643304130162,
93
+ "loss": 0.0,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 16.25,
98
+ "grad_norm": 0.0,
99
+ "learning_rate": 0.009689612015018775,
100
+ "loss": 0.0,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 17.5,
105
+ "grad_norm": 0.0,
106
+ "learning_rate": 0.009664580725907385,
107
+ "loss": 0.0,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 18.75,
112
+ "grad_norm": 0.0,
113
+ "learning_rate": 0.009639549436795996,
114
+ "loss": 0.0,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 20.0,
119
+ "grad_norm": 0.0,
120
+ "learning_rate": 0.009614518147684605,
121
+ "loss": 0.0,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 21.25,
126
+ "grad_norm": 0.0,
127
+ "learning_rate": 0.009589486858573217,
128
+ "loss": 0.0,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 22.5,
133
+ "grad_norm": 0.0,
134
+ "learning_rate": 0.009564455569461828,
135
+ "loss": 0.0,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 23.75,
140
+ "grad_norm": 0.0,
141
+ "learning_rate": 0.009539424280350439,
142
+ "loss": 0.0,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 25.0,
147
+ "grad_norm": 0.0,
148
+ "learning_rate": 0.00951439299123905,
149
+ "loss": 0.0,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 26.25,
154
+ "grad_norm": 0.0,
155
+ "learning_rate": 0.00948936170212766,
156
+ "loss": 0.0,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 27.5,
161
+ "grad_norm": 0.0,
162
+ "learning_rate": 0.00946433041301627,
163
+ "loss": 0.0,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 28.75,
168
+ "grad_norm": 0.0,
169
+ "learning_rate": 0.009439299123904881,
170
+ "loss": 0.0,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 30.0,
175
+ "grad_norm": 0.0,
176
+ "learning_rate": 0.009414267834793492,
177
+ "loss": 0.0,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 31.25,
182
+ "grad_norm": 0.0,
183
+ "learning_rate": 0.009389236545682102,
184
+ "loss": 0.0,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 32.5,
189
+ "grad_norm": 0.0,
190
+ "learning_rate": 0.009364205256570713,
191
+ "loss": 0.0,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 33.75,
196
+ "grad_norm": 0.0,
197
+ "learning_rate": 0.009339173967459325,
198
+ "loss": 0.0,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 35.0,
203
+ "grad_norm": 0.0,
204
+ "learning_rate": 0.009314142678347936,
205
+ "loss": 0.0,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 36.25,
210
+ "grad_norm": 0.0,
211
+ "learning_rate": 0.009289111389236547,
212
+ "loss": 0.0,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 37.5,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 0.009264080100125156,
219
+ "loss": 0.0,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 38.75,
224
+ "grad_norm": 0.0,
225
+ "learning_rate": 0.009239048811013768,
226
+ "loss": 0.0,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 40.0,
231
+ "grad_norm": 0.0,
232
+ "learning_rate": 0.009214017521902379,
233
+ "loss": 0.0,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 41.25,
238
+ "grad_norm": 0.0,
239
+ "learning_rate": 0.00918898623279099,
240
+ "loss": 0.0,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 42.5,
245
+ "grad_norm": 0.0,
246
+ "learning_rate": 0.0091639549436796,
247
+ "loss": 0.0,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 43.75,
252
+ "grad_norm": 0.0,
253
+ "learning_rate": 0.00913892365456821,
254
+ "loss": 0.0,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 45.0,
259
+ "grad_norm": 0.0,
260
+ "learning_rate": 0.009113892365456821,
261
+ "loss": 0.0,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 46.25,
266
+ "grad_norm": 0.0,
267
+ "learning_rate": 0.009088861076345432,
268
+ "loss": 0.0,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 47.5,
273
+ "grad_norm": 0.0,
274
+ "learning_rate": 0.009063829787234043,
275
+ "loss": 0.0,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 48.75,
280
+ "grad_norm": 0.0,
281
+ "learning_rate": 0.009038798498122653,
282
+ "loss": 0.0,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 50.0,
287
+ "grad_norm": 0.0,
288
+ "learning_rate": 0.009013767209011264,
289
+ "loss": 0.0,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 51.25,
294
+ "grad_norm": 0.0,
295
+ "learning_rate": 0.008988735919899874,
296
+ "loss": 0.0,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 52.5,
301
+ "grad_norm": 0.0,
302
+ "learning_rate": 0.008963704630788487,
303
+ "loss": 0.0,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 53.75,
308
+ "grad_norm": 0.0,
309
+ "learning_rate": 0.008938673341677096,
310
+ "loss": 0.0,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 55.0,
315
+ "grad_norm": 0.0,
316
+ "learning_rate": 0.008913642052565706,
317
+ "loss": 0.0,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 56.25,
322
+ "grad_norm": 0.0,
323
+ "learning_rate": 0.008888610763454317,
324
+ "loss": 0.0,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 57.5,
329
+ "grad_norm": 0.0,
330
+ "learning_rate": 0.00886357947434293,
331
+ "loss": 0.0,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 58.75,
336
+ "grad_norm": 0.0,
337
+ "learning_rate": 0.00883854818523154,
338
+ "loss": 0.0,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 60.0,
343
+ "grad_norm": 0.0,
344
+ "learning_rate": 0.00881351689612015,
345
+ "loss": 0.0,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 61.25,
350
+ "grad_norm": 0.0,
351
+ "learning_rate": 0.008788485607008761,
352
+ "loss": 0.0,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 62.5,
357
+ "grad_norm": 0.0,
358
+ "learning_rate": 0.008763454317897372,
359
+ "loss": 0.0,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 63.75,
364
+ "grad_norm": 0.0,
365
+ "learning_rate": 0.008738423028785983,
366
+ "loss": 0.0,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 65.0,
371
+ "grad_norm": 0.0,
372
+ "learning_rate": 0.008713391739674593,
373
+ "loss": 0.0,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 66.25,
378
+ "grad_norm": 0.0,
379
+ "learning_rate": 0.008688360450563204,
380
+ "loss": 0.0,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 67.5,
385
+ "grad_norm": 0.0,
386
+ "learning_rate": 0.008663329161451815,
387
+ "loss": 0.0,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 68.75,
392
+ "grad_norm": 0.0,
393
+ "learning_rate": 0.008638297872340425,
394
+ "loss": 0.0,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 70.0,
399
+ "grad_norm": 0.0,
400
+ "learning_rate": 0.008613266583229038,
401
+ "loss": 0.0,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 71.25,
406
+ "grad_norm": 0.0,
407
+ "learning_rate": 0.008588235294117647,
408
+ "loss": 0.0,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 72.5,
413
+ "grad_norm": 0.0,
414
+ "learning_rate": 0.008563204005006257,
415
+ "loss": 0.0,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 73.75,
420
+ "grad_norm": 0.0,
421
+ "learning_rate": 0.008538172715894868,
422
+ "loss": 0.0,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 75.0,
427
+ "grad_norm": 0.0,
428
+ "learning_rate": 0.00851314142678348,
429
+ "loss": 0.0,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 76.25,
434
+ "grad_norm": 0.0,
435
+ "learning_rate": 0.00848811013767209,
436
+ "loss": 0.0,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 77.5,
441
+ "grad_norm": 0.0,
442
+ "learning_rate": 0.008463078848560701,
443
+ "loss": 0.0,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 78.75,
448
+ "grad_norm": 0.0,
449
+ "learning_rate": 0.008438047559449312,
450
+ "loss": 0.0,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 80.0,
455
+ "grad_norm": 0.0,
456
+ "learning_rate": 0.008413016270337923,
457
+ "loss": 0.0,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 81.25,
462
+ "grad_norm": 0.0,
463
+ "learning_rate": 0.008387984981226533,
464
+ "loss": 0.0,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 82.5,
469
+ "grad_norm": 0.0,
470
+ "learning_rate": 0.008362953692115144,
471
+ "loss": 0.0,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 83.75,
476
+ "grad_norm": 0.0,
477
+ "learning_rate": 0.008337922403003755,
478
+ "loss": 0.0,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 85.0,
483
+ "grad_norm": 0.0,
484
+ "learning_rate": 0.008312891113892365,
485
+ "loss": 0.0,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 86.25,
490
+ "grad_norm": 0.0,
491
+ "learning_rate": 0.008287859824780976,
492
+ "loss": 0.0,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 87.5,
497
+ "grad_norm": 0.0,
498
+ "learning_rate": 0.008262828535669588,
499
+ "loss": 0.0,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 88.75,
504
+ "grad_norm": 0.0,
505
+ "learning_rate": 0.008237797246558197,
506
+ "loss": 0.0,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 90.0,
511
+ "grad_norm": 0.0,
512
+ "learning_rate": 0.008212765957446808,
513
+ "loss": 0.0,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 91.25,
518
+ "grad_norm": 0.0,
519
+ "learning_rate": 0.008187734668335419,
520
+ "loss": 0.0,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 92.5,
525
+ "grad_norm": 0.0,
526
+ "learning_rate": 0.008162703379224031,
527
+ "loss": 0.0,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 93.75,
532
+ "grad_norm": 0.0,
533
+ "learning_rate": 0.008137672090112642,
534
+ "loss": 0.0,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 95.0,
539
+ "grad_norm": 0.0,
540
+ "learning_rate": 0.008112640801001252,
541
+ "loss": 0.0,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 96.25,
546
+ "grad_norm": 0.0,
547
+ "learning_rate": 0.008087609511889863,
548
+ "loss": 0.0,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 97.5,
553
+ "grad_norm": 0.0,
554
+ "learning_rate": 0.008062578222778474,
555
+ "loss": 0.0,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 98.75,
560
+ "grad_norm": 0.0,
561
+ "learning_rate": 0.008037546933667084,
562
+ "loss": 0.0,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 100.0,
567
+ "grad_norm": 0.0,
568
+ "learning_rate": 0.008012515644555695,
569
+ "loss": 0.0,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 101.25,
574
+ "grad_norm": 0.0,
575
+ "learning_rate": 0.007987484355444305,
576
+ "loss": 0.0,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 102.5,
581
+ "grad_norm": 0.0,
582
+ "learning_rate": 0.007962453066332916,
583
+ "loss": 0.0,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 103.75,
588
+ "grad_norm": 0.0,
589
+ "learning_rate": 0.007937421777221527,
590
+ "loss": 0.0,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 105.0,
595
+ "grad_norm": 0.0,
596
+ "learning_rate": 0.007912390488110137,
597
+ "loss": 0.0,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 106.25,
602
+ "grad_norm": 0.0,
603
+ "learning_rate": 0.007887359198998748,
604
+ "loss": 0.0,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 107.5,
609
+ "grad_norm": 0.0,
610
+ "learning_rate": 0.007862327909887359,
611
+ "loss": 0.0,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 108.75,
616
+ "grad_norm": 0.0,
617
+ "learning_rate": 0.00783729662077597,
618
+ "loss": 0.0,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 110.0,
623
+ "grad_norm": 0.0,
624
+ "learning_rate": 0.007812265331664581,
625
+ "loss": 0.0,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 111.25,
630
+ "grad_norm": 0.0,
631
+ "learning_rate": 0.0077872340425531915,
632
+ "loss": 0.0,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 112.5,
637
+ "grad_norm": 0.0,
638
+ "learning_rate": 0.007762202753441803,
639
+ "loss": 0.0,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 113.75,
644
+ "grad_norm": 0.0,
645
+ "learning_rate": 0.007737171464330414,
646
+ "loss": 0.0,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 115.0,
651
+ "grad_norm": 0.0,
652
+ "learning_rate": 0.0077121401752190235,
653
+ "loss": 0.0,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 116.25,
658
+ "grad_norm": 0.0,
659
+ "learning_rate": 0.007687108886107634,
660
+ "loss": 0.0,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 117.5,
665
+ "grad_norm": 0.0,
666
+ "learning_rate": 0.007662077596996246,
667
+ "loss": 0.0,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 118.75,
672
+ "grad_norm": 0.0,
673
+ "learning_rate": 0.007637046307884856,
674
+ "loss": 0.0,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 120.0,
679
+ "grad_norm": 0.0,
680
+ "learning_rate": 0.007612015018773467,
681
+ "loss": 0.0,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 121.25,
686
+ "grad_norm": 0.0,
687
+ "learning_rate": 0.007586983729662078,
688
+ "loss": 0.0,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 122.5,
693
+ "grad_norm": 0.0,
694
+ "learning_rate": 0.007561952440550689,
695
+ "loss": 0.0,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 123.75,
700
+ "grad_norm": 0.0,
701
+ "learning_rate": 0.007536921151439299,
702
+ "loss": 0.0,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 125.0,
707
+ "grad_norm": 0.0,
708
+ "learning_rate": 0.0075118898623279095,
709
+ "loss": 0.0,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 126.25,
714
+ "grad_norm": 0.0,
715
+ "learning_rate": 0.007486858573216521,
716
+ "loss": 0.0,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 127.5,
721
+ "grad_norm": 0.0,
722
+ "learning_rate": 0.007461827284105132,
723
+ "loss": 0.0,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 128.75,
728
+ "grad_norm": 0.0,
729
+ "learning_rate": 0.007436795994993742,
730
+ "loss": 0.0,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 130.0,
735
+ "grad_norm": 0.0,
736
+ "learning_rate": 0.007411764705882354,
737
+ "loss": 0.0,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 131.25,
742
+ "grad_norm": 0.0,
743
+ "learning_rate": 0.0073867334167709645,
744
+ "loss": 0.0,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 132.5,
749
+ "grad_norm": 0.0,
750
+ "learning_rate": 0.007361702127659574,
751
+ "loss": 0.0,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 133.75,
756
+ "grad_norm": 0.0,
757
+ "learning_rate": 0.007336670838548185,
758
+ "loss": 0.0,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 135.0,
763
+ "grad_norm": 0.0,
764
+ "learning_rate": 0.007311639549436796,
765
+ "loss": 0.0,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 136.25,
770
+ "grad_norm": 0.0,
771
+ "learning_rate": 0.007286608260325407,
772
+ "loss": 0.0,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 137.5,
777
+ "grad_norm": 0.0,
778
+ "learning_rate": 0.007261576971214018,
779
+ "loss": 0.0,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 138.75,
784
+ "grad_norm": 0.0,
785
+ "learning_rate": 0.007236545682102628,
786
+ "loss": 0.0,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 140.0,
791
+ "grad_norm": 0.0,
792
+ "learning_rate": 0.00721151439299124,
793
+ "loss": 0.0,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 141.25,
798
+ "grad_norm": 0.0,
799
+ "learning_rate": 0.00718648310387985,
800
+ "loss": 0.0,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 142.5,
805
+ "grad_norm": 0.0,
806
+ "learning_rate": 0.00716145181476846,
807
+ "loss": 0.0,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 143.75,
812
+ "grad_norm": 0.0,
813
+ "learning_rate": 0.007136420525657071,
814
+ "loss": 0.0,
815
+ "step": 1150
816
+ }
817
+ ],
818
+ "logging_steps": 10,
819
+ "max_steps": 4000,
820
+ "num_input_tokens_seen": 0,
821
+ "num_train_epochs": 500,
822
+ "save_steps": 50,
823
+ "stateful_callbacks": {
824
+ "TrainerControl": {
825
+ "args": {
826
+ "should_epoch_stop": false,
827
+ "should_evaluate": false,
828
+ "should_log": false,
829
+ "should_save": true,
830
+ "should_training_stop": false
831
+ },
832
+ "attributes": {}
833
+ }
834
+ },
835
+ "total_flos": 7763472.0,
836
+ "train_batch_size": 64,
837
+ "trial_name": null,
838
+ "trial_params": null
839
+ }
checkpoint-1150/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
3
+ size 5137
checkpoint-1200/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-1200/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }
checkpoint-1200/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
3
+ size 1452
checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3975bc7a21f5a4fd5059daec4f18dc973217774caa4f4484f1069b0e3cf0034e
3
+ size 13823
checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5992258b0e831b29c37beeefed96237fb5573ddb793970294c8b6f2dc3098fd8
3
+ size 14455
checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:822b71f69a7890e9b8716f7f8e72637712c24b0c354b6f54e6d112260cd7264c
3
+ size 1465
checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,874 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 150.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.25,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 0.009989987484355445,
16
+ "loss": 0.0,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 2.5,
21
+ "grad_norm": 0.0,
22
+ "learning_rate": 0.009964956195244054,
23
+ "loss": 0.0,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 3.75,
28
+ "grad_norm": 0.0,
29
+ "learning_rate": 0.009939924906132666,
30
+ "loss": 0.0,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.0,
36
+ "learning_rate": 0.009914893617021277,
37
+ "loss": 0.0,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 6.25,
42
+ "grad_norm": 0.0,
43
+ "learning_rate": 0.009889862327909888,
44
+ "loss": 0.0,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 7.5,
49
+ "grad_norm": 0.0,
50
+ "learning_rate": 0.009864831038798498,
51
+ "loss": 0.0,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 8.75,
56
+ "grad_norm": 0.0,
57
+ "learning_rate": 0.009839799749687109,
58
+ "loss": 0.0,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 10.0,
63
+ "grad_norm": 0.0,
64
+ "learning_rate": 0.00981476846057572,
65
+ "loss": 0.0,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 11.25,
70
+ "grad_norm": 0.0,
71
+ "learning_rate": 0.00978973717146433,
72
+ "loss": 0.0,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 12.5,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 0.009764705882352941,
79
+ "loss": 0.0,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 13.75,
84
+ "grad_norm": 0.0,
85
+ "learning_rate": 0.009739674593241552,
86
+ "loss": 0.0,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 15.0,
91
+ "grad_norm": 0.0,
92
+ "learning_rate": 0.009714643304130162,
93
+ "loss": 0.0,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 16.25,
98
+ "grad_norm": 0.0,
99
+ "learning_rate": 0.009689612015018775,
100
+ "loss": 0.0,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 17.5,
105
+ "grad_norm": 0.0,
106
+ "learning_rate": 0.009664580725907385,
107
+ "loss": 0.0,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 18.75,
112
+ "grad_norm": 0.0,
113
+ "learning_rate": 0.009639549436795996,
114
+ "loss": 0.0,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 20.0,
119
+ "grad_norm": 0.0,
120
+ "learning_rate": 0.009614518147684605,
121
+ "loss": 0.0,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 21.25,
126
+ "grad_norm": 0.0,
127
+ "learning_rate": 0.009589486858573217,
128
+ "loss": 0.0,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 22.5,
133
+ "grad_norm": 0.0,
134
+ "learning_rate": 0.009564455569461828,
135
+ "loss": 0.0,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 23.75,
140
+ "grad_norm": 0.0,
141
+ "learning_rate": 0.009539424280350439,
142
+ "loss": 0.0,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 25.0,
147
+ "grad_norm": 0.0,
148
+ "learning_rate": 0.00951439299123905,
149
+ "loss": 0.0,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 26.25,
154
+ "grad_norm": 0.0,
155
+ "learning_rate": 0.00948936170212766,
156
+ "loss": 0.0,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 27.5,
161
+ "grad_norm": 0.0,
162
+ "learning_rate": 0.00946433041301627,
163
+ "loss": 0.0,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 28.75,
168
+ "grad_norm": 0.0,
169
+ "learning_rate": 0.009439299123904881,
170
+ "loss": 0.0,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 30.0,
175
+ "grad_norm": 0.0,
176
+ "learning_rate": 0.009414267834793492,
177
+ "loss": 0.0,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 31.25,
182
+ "grad_norm": 0.0,
183
+ "learning_rate": 0.009389236545682102,
184
+ "loss": 0.0,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 32.5,
189
+ "grad_norm": 0.0,
190
+ "learning_rate": 0.009364205256570713,
191
+ "loss": 0.0,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 33.75,
196
+ "grad_norm": 0.0,
197
+ "learning_rate": 0.009339173967459325,
198
+ "loss": 0.0,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 35.0,
203
+ "grad_norm": 0.0,
204
+ "learning_rate": 0.009314142678347936,
205
+ "loss": 0.0,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 36.25,
210
+ "grad_norm": 0.0,
211
+ "learning_rate": 0.009289111389236547,
212
+ "loss": 0.0,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 37.5,
217
+ "grad_norm": 0.0,
218
+ "learning_rate": 0.009264080100125156,
219
+ "loss": 0.0,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 38.75,
224
+ "grad_norm": 0.0,
225
+ "learning_rate": 0.009239048811013768,
226
+ "loss": 0.0,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 40.0,
231
+ "grad_norm": 0.0,
232
+ "learning_rate": 0.009214017521902379,
233
+ "loss": 0.0,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 41.25,
238
+ "grad_norm": 0.0,
239
+ "learning_rate": 0.00918898623279099,
240
+ "loss": 0.0,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 42.5,
245
+ "grad_norm": 0.0,
246
+ "learning_rate": 0.0091639549436796,
247
+ "loss": 0.0,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 43.75,
252
+ "grad_norm": 0.0,
253
+ "learning_rate": 0.00913892365456821,
254
+ "loss": 0.0,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 45.0,
259
+ "grad_norm": 0.0,
260
+ "learning_rate": 0.009113892365456821,
261
+ "loss": 0.0,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 46.25,
266
+ "grad_norm": 0.0,
267
+ "learning_rate": 0.009088861076345432,
268
+ "loss": 0.0,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 47.5,
273
+ "grad_norm": 0.0,
274
+ "learning_rate": 0.009063829787234043,
275
+ "loss": 0.0,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 48.75,
280
+ "grad_norm": 0.0,
281
+ "learning_rate": 0.009038798498122653,
282
+ "loss": 0.0,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 50.0,
287
+ "grad_norm": 0.0,
288
+ "learning_rate": 0.009013767209011264,
289
+ "loss": 0.0,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 51.25,
294
+ "grad_norm": 0.0,
295
+ "learning_rate": 0.008988735919899874,
296
+ "loss": 0.0,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 52.5,
301
+ "grad_norm": 0.0,
302
+ "learning_rate": 0.008963704630788487,
303
+ "loss": 0.0,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 53.75,
308
+ "grad_norm": 0.0,
309
+ "learning_rate": 0.008938673341677096,
310
+ "loss": 0.0,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 55.0,
315
+ "grad_norm": 0.0,
316
+ "learning_rate": 0.008913642052565706,
317
+ "loss": 0.0,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 56.25,
322
+ "grad_norm": 0.0,
323
+ "learning_rate": 0.008888610763454317,
324
+ "loss": 0.0,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 57.5,
329
+ "grad_norm": 0.0,
330
+ "learning_rate": 0.00886357947434293,
331
+ "loss": 0.0,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 58.75,
336
+ "grad_norm": 0.0,
337
+ "learning_rate": 0.00883854818523154,
338
+ "loss": 0.0,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 60.0,
343
+ "grad_norm": 0.0,
344
+ "learning_rate": 0.00881351689612015,
345
+ "loss": 0.0,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 61.25,
350
+ "grad_norm": 0.0,
351
+ "learning_rate": 0.008788485607008761,
352
+ "loss": 0.0,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 62.5,
357
+ "grad_norm": 0.0,
358
+ "learning_rate": 0.008763454317897372,
359
+ "loss": 0.0,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 63.75,
364
+ "grad_norm": 0.0,
365
+ "learning_rate": 0.008738423028785983,
366
+ "loss": 0.0,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 65.0,
371
+ "grad_norm": 0.0,
372
+ "learning_rate": 0.008713391739674593,
373
+ "loss": 0.0,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 66.25,
378
+ "grad_norm": 0.0,
379
+ "learning_rate": 0.008688360450563204,
380
+ "loss": 0.0,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 67.5,
385
+ "grad_norm": 0.0,
386
+ "learning_rate": 0.008663329161451815,
387
+ "loss": 0.0,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 68.75,
392
+ "grad_norm": 0.0,
393
+ "learning_rate": 0.008638297872340425,
394
+ "loss": 0.0,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 70.0,
399
+ "grad_norm": 0.0,
400
+ "learning_rate": 0.008613266583229038,
401
+ "loss": 0.0,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 71.25,
406
+ "grad_norm": 0.0,
407
+ "learning_rate": 0.008588235294117647,
408
+ "loss": 0.0,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 72.5,
413
+ "grad_norm": 0.0,
414
+ "learning_rate": 0.008563204005006257,
415
+ "loss": 0.0,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 73.75,
420
+ "grad_norm": 0.0,
421
+ "learning_rate": 0.008538172715894868,
422
+ "loss": 0.0,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 75.0,
427
+ "grad_norm": 0.0,
428
+ "learning_rate": 0.00851314142678348,
429
+ "loss": 0.0,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 76.25,
434
+ "grad_norm": 0.0,
435
+ "learning_rate": 0.00848811013767209,
436
+ "loss": 0.0,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 77.5,
441
+ "grad_norm": 0.0,
442
+ "learning_rate": 0.008463078848560701,
443
+ "loss": 0.0,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 78.75,
448
+ "grad_norm": 0.0,
449
+ "learning_rate": 0.008438047559449312,
450
+ "loss": 0.0,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 80.0,
455
+ "grad_norm": 0.0,
456
+ "learning_rate": 0.008413016270337923,
457
+ "loss": 0.0,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 81.25,
462
+ "grad_norm": 0.0,
463
+ "learning_rate": 0.008387984981226533,
464
+ "loss": 0.0,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 82.5,
469
+ "grad_norm": 0.0,
470
+ "learning_rate": 0.008362953692115144,
471
+ "loss": 0.0,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 83.75,
476
+ "grad_norm": 0.0,
477
+ "learning_rate": 0.008337922403003755,
478
+ "loss": 0.0,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 85.0,
483
+ "grad_norm": 0.0,
484
+ "learning_rate": 0.008312891113892365,
485
+ "loss": 0.0,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 86.25,
490
+ "grad_norm": 0.0,
491
+ "learning_rate": 0.008287859824780976,
492
+ "loss": 0.0,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 87.5,
497
+ "grad_norm": 0.0,
498
+ "learning_rate": 0.008262828535669588,
499
+ "loss": 0.0,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 88.75,
504
+ "grad_norm": 0.0,
505
+ "learning_rate": 0.008237797246558197,
506
+ "loss": 0.0,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 90.0,
511
+ "grad_norm": 0.0,
512
+ "learning_rate": 0.008212765957446808,
513
+ "loss": 0.0,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 91.25,
518
+ "grad_norm": 0.0,
519
+ "learning_rate": 0.008187734668335419,
520
+ "loss": 0.0,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 92.5,
525
+ "grad_norm": 0.0,
526
+ "learning_rate": 0.008162703379224031,
527
+ "loss": 0.0,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 93.75,
532
+ "grad_norm": 0.0,
533
+ "learning_rate": 0.008137672090112642,
534
+ "loss": 0.0,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 95.0,
539
+ "grad_norm": 0.0,
540
+ "learning_rate": 0.008112640801001252,
541
+ "loss": 0.0,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 96.25,
546
+ "grad_norm": 0.0,
547
+ "learning_rate": 0.008087609511889863,
548
+ "loss": 0.0,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 97.5,
553
+ "grad_norm": 0.0,
554
+ "learning_rate": 0.008062578222778474,
555
+ "loss": 0.0,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 98.75,
560
+ "grad_norm": 0.0,
561
+ "learning_rate": 0.008037546933667084,
562
+ "loss": 0.0,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 100.0,
567
+ "grad_norm": 0.0,
568
+ "learning_rate": 0.008012515644555695,
569
+ "loss": 0.0,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 101.25,
574
+ "grad_norm": 0.0,
575
+ "learning_rate": 0.007987484355444305,
576
+ "loss": 0.0,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 102.5,
581
+ "grad_norm": 0.0,
582
+ "learning_rate": 0.007962453066332916,
583
+ "loss": 0.0,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 103.75,
588
+ "grad_norm": 0.0,
589
+ "learning_rate": 0.007937421777221527,
590
+ "loss": 0.0,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 105.0,
595
+ "grad_norm": 0.0,
596
+ "learning_rate": 0.007912390488110137,
597
+ "loss": 0.0,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 106.25,
602
+ "grad_norm": 0.0,
603
+ "learning_rate": 0.007887359198998748,
604
+ "loss": 0.0,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 107.5,
609
+ "grad_norm": 0.0,
610
+ "learning_rate": 0.007862327909887359,
611
+ "loss": 0.0,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 108.75,
616
+ "grad_norm": 0.0,
617
+ "learning_rate": 0.00783729662077597,
618
+ "loss": 0.0,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 110.0,
623
+ "grad_norm": 0.0,
624
+ "learning_rate": 0.007812265331664581,
625
+ "loss": 0.0,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 111.25,
630
+ "grad_norm": 0.0,
631
+ "learning_rate": 0.0077872340425531915,
632
+ "loss": 0.0,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 112.5,
637
+ "grad_norm": 0.0,
638
+ "learning_rate": 0.007762202753441803,
639
+ "loss": 0.0,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 113.75,
644
+ "grad_norm": 0.0,
645
+ "learning_rate": 0.007737171464330414,
646
+ "loss": 0.0,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 115.0,
651
+ "grad_norm": 0.0,
652
+ "learning_rate": 0.0077121401752190235,
653
+ "loss": 0.0,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 116.25,
658
+ "grad_norm": 0.0,
659
+ "learning_rate": 0.007687108886107634,
660
+ "loss": 0.0,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 117.5,
665
+ "grad_norm": 0.0,
666
+ "learning_rate": 0.007662077596996246,
667
+ "loss": 0.0,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 118.75,
672
+ "grad_norm": 0.0,
673
+ "learning_rate": 0.007637046307884856,
674
+ "loss": 0.0,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 120.0,
679
+ "grad_norm": 0.0,
680
+ "learning_rate": 0.007612015018773467,
681
+ "loss": 0.0,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 121.25,
686
+ "grad_norm": 0.0,
687
+ "learning_rate": 0.007586983729662078,
688
+ "loss": 0.0,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 122.5,
693
+ "grad_norm": 0.0,
694
+ "learning_rate": 0.007561952440550689,
695
+ "loss": 0.0,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 123.75,
700
+ "grad_norm": 0.0,
701
+ "learning_rate": 0.007536921151439299,
702
+ "loss": 0.0,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 125.0,
707
+ "grad_norm": 0.0,
708
+ "learning_rate": 0.0075118898623279095,
709
+ "loss": 0.0,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 126.25,
714
+ "grad_norm": 0.0,
715
+ "learning_rate": 0.007486858573216521,
716
+ "loss": 0.0,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 127.5,
721
+ "grad_norm": 0.0,
722
+ "learning_rate": 0.007461827284105132,
723
+ "loss": 0.0,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 128.75,
728
+ "grad_norm": 0.0,
729
+ "learning_rate": 0.007436795994993742,
730
+ "loss": 0.0,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 130.0,
735
+ "grad_norm": 0.0,
736
+ "learning_rate": 0.007411764705882354,
737
+ "loss": 0.0,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 131.25,
742
+ "grad_norm": 0.0,
743
+ "learning_rate": 0.0073867334167709645,
744
+ "loss": 0.0,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 132.5,
749
+ "grad_norm": 0.0,
750
+ "learning_rate": 0.007361702127659574,
751
+ "loss": 0.0,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 133.75,
756
+ "grad_norm": 0.0,
757
+ "learning_rate": 0.007336670838548185,
758
+ "loss": 0.0,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 135.0,
763
+ "grad_norm": 0.0,
764
+ "learning_rate": 0.007311639549436796,
765
+ "loss": 0.0,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 136.25,
770
+ "grad_norm": 0.0,
771
+ "learning_rate": 0.007286608260325407,
772
+ "loss": 0.0,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 137.5,
777
+ "grad_norm": 0.0,
778
+ "learning_rate": 0.007261576971214018,
779
+ "loss": 0.0,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 138.75,
784
+ "grad_norm": 0.0,
785
+ "learning_rate": 0.007236545682102628,
786
+ "loss": 0.0,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 140.0,
791
+ "grad_norm": 0.0,
792
+ "learning_rate": 0.00721151439299124,
793
+ "loss": 0.0,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 141.25,
798
+ "grad_norm": 0.0,
799
+ "learning_rate": 0.00718648310387985,
800
+ "loss": 0.0,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 142.5,
805
+ "grad_norm": 0.0,
806
+ "learning_rate": 0.00716145181476846,
807
+ "loss": 0.0,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 143.75,
812
+ "grad_norm": 0.0,
813
+ "learning_rate": 0.007136420525657071,
814
+ "loss": 0.0,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 145.0,
819
+ "grad_norm": 0.0,
820
+ "learning_rate": 0.007111389236545682,
821
+ "loss": 0.0,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 146.25,
826
+ "grad_norm": 0.0,
827
+ "learning_rate": 0.007086357947434293,
828
+ "loss": 0.0,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 147.5,
833
+ "grad_norm": 0.0,
834
+ "learning_rate": 0.007061326658322904,
835
+ "loss": 0.0,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 148.75,
840
+ "grad_norm": 0.0,
841
+ "learning_rate": 0.007036295369211515,
842
+ "loss": 0.0,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 150.0,
847
+ "grad_norm": 0.0,
848
+ "learning_rate": 0.007011264080100125,
849
+ "loss": 0.0,
850
+ "step": 1200
851
+ }
852
+ ],
853
+ "logging_steps": 10,
854
+ "max_steps": 4000,
855
+ "num_input_tokens_seen": 0,
856
+ "num_train_epochs": 500,
857
+ "save_steps": 50,
858
+ "stateful_callbacks": {
859
+ "TrainerControl": {
860
+ "args": {
861
+ "should_epoch_stop": false,
862
+ "should_evaluate": false,
863
+ "should_log": false,
864
+ "should_save": true,
865
+ "should_training_stop": false
866
+ },
867
+ "attributes": {}
868
+ }
869
+ },
870
+ "total_flos": 8100000.0,
871
+ "train_batch_size": 64,
872
+ "trial_name": null,
873
+ "trial_params": null
874
+ }
checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
3
+ size 5137
checkpoint-1250/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "bos_token_id": 1,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.0,
11
+ "eos_token_id": 1,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_embd": 1,
16
+ "n_head": 1,
17
+ "n_inner": 1,
18
+ "n_layer": 1,
19
+ "n_positions": 1,
20
+ "pad_token_id": 0,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.0,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "5.12.0",
32
+ "use_cache": false,
33
+ "vocab_size": 2
34
+ }
checkpoint-1250/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.12.0",
9
+ "use_cache": true
10
+ }