kiatkock commited on
Commit
60c10e4
·
verified ·
1 Parent(s): b533f03

Upload folder using huggingface_hub

Browse files
checkpoint-20/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "time_moe_50m",
3
+ "apply_aux_loss": true,
4
+ "architectures": [
5
+ "TimeMoeForPrediction"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
+ "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
+ },
12
+ "channel_configs": [
13
+ [
14
+ 63,
15
+ 1,
16
+ 1
17
+ ],
18
+ [
19
+ 6,
20
+ 1,
21
+ 4
22
+ ],
23
+ [
24
+ 5,
25
+ 1,
26
+ 1
27
+ ],
28
+ [
29
+ 5,
30
+ 1,
31
+ 1
32
+ ]
33
+ ],
34
+ "embedding_hidden_size": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 384,
37
+ "horizon_lengths": [
38
+ 1,
39
+ 8,
40
+ 32,
41
+ 64
42
+ ],
43
+ "initializer_range": 0.02,
44
+ "input_size": 42,
45
+ "intermediate_size": 1536,
46
+ "max_position_embeddings": 4096,
47
+ "model_type": "time_moe",
48
+ "num_attention_heads": 12,
49
+ "num_experts": 8,
50
+ "num_experts_per_tok": 2,
51
+ "num_hidden_layers": 12,
52
+ "num_key_value_heads": 12,
53
+ "rms_norm_eps": 1e-06,
54
+ "rope_theta": 10000,
55
+ "router_aux_loss_factor": 0.02,
56
+ "tie_word_embeddings": false,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.40.1",
59
+ "use_cache": true,
60
+ "use_dense": false
61
+ }
checkpoint-20/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.40.1"
4
+ }
checkpoint-20/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c12078447ee98547e56faad88559e765882ac7da469062736061810ce9b3a2
3
+ size 484301192
checkpoint-20/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77b00521a0d9544854ff7ed31aa3b9ba5e26092e4298a16638fb4001baa40d5b
3
+ size 968903096
checkpoint-20/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5373721d9a470ab26caf9e22d632c992bacafe2ec2814ebe07a78f261437d8e5
3
+ size 14391
checkpoint-20/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a076ec7b5b7c2ad5f2da5df74c16e3cf45ecadac23b8bd64ca627a3ae91013b6
3
+ size 1465
checkpoint-20/trainer_state.json ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9876543209876543,
5
+ "eval_steps": 10,
6
+ "global_step": 20,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04938271604938271,
13
+ "grad_norm": 1.3728221654891968,
14
+ "learning_rate": 9.992293334332821e-05,
15
+ "loss": 0.5092,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.09876543209876543,
20
+ "grad_norm": 0.4933850169181824,
21
+ "learning_rate": 9.969220851487845e-05,
22
+ "loss": 0.4755,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.14814814814814814,
27
+ "grad_norm": 1.8201042413711548,
28
+ "learning_rate": 9.930924800994192e-05,
29
+ "loss": 0.5033,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.19753086419753085,
34
+ "grad_norm": 0.3733859360218048,
35
+ "learning_rate": 9.877641290737885e-05,
36
+ "loss": 0.4497,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.24691358024691357,
41
+ "grad_norm": 0.5732631683349609,
42
+ "learning_rate": 9.809698831278218e-05,
43
+ "loss": 0.4617,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.2962962962962963,
48
+ "grad_norm": 0.5703950524330139,
49
+ "learning_rate": 9.72751631047092e-05,
50
+ "loss": 0.4452,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.345679012345679,
55
+ "grad_norm": 0.27186131477355957,
56
+ "learning_rate": 9.631600410885231e-05,
57
+ "loss": 0.4493,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.3950617283950617,
62
+ "grad_norm": 0.42532485723495483,
63
+ "learning_rate": 9.522542485937369e-05,
64
+ "loss": 0.4452,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.4444444444444444,
69
+ "grad_norm": 0.43852725625038147,
70
+ "learning_rate": 9.401014914000078e-05,
71
+ "loss": 0.4326,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.49382716049382713,
76
+ "grad_norm": 0.28129202127456665,
77
+ "learning_rate": 9.267766952966369e-05,
78
+ "loss": 0.4344,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.49382716049382713,
83
+ "eval_loss": 0.43601080775260925,
84
+ "eval_runtime": 86.6474,
85
+ "eval_samples_per_second": 10.883,
86
+ "eval_steps_per_second": 0.681,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.5432098765432098,
91
+ "grad_norm": 0.1692819446325302,
92
+ "learning_rate": 9.123620120825459e-05,
93
+ "loss": 0.4306,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.5925925925925926,
98
+ "grad_norm": 0.3884420096874237,
99
+ "learning_rate": 8.969463130731183e-05,
100
+ "loss": 0.429,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.6419753086419753,
105
+ "grad_norm": 0.3931941092014313,
106
+ "learning_rate": 8.806246411789872e-05,
107
+ "loss": 0.4224,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.691358024691358,
112
+ "grad_norm": 0.14838409423828125,
113
+ "learning_rate": 8.634976249348867e-05,
114
+ "loss": 0.4411,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.7407407407407407,
119
+ "grad_norm": 0.14195731282234192,
120
+ "learning_rate": 8.456708580912724e-05,
121
+ "loss": 0.4014,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.7901234567901234,
126
+ "grad_norm": 0.31687867641448975,
127
+ "learning_rate": 8.27254248593737e-05,
128
+ "loss": 0.4185,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.8395061728395061,
133
+ "grad_norm": 0.2713411748409271,
134
+ "learning_rate": 8.083613409639764e-05,
135
+ "loss": 0.4236,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.8888888888888888,
140
+ "grad_norm": 0.12129750102758408,
141
+ "learning_rate": 7.891086162600579e-05,
142
+ "loss": 0.4118,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.9382716049382716,
147
+ "grad_norm": 0.12624289095401764,
148
+ "learning_rate": 7.696147739319612e-05,
149
+ "loss": 0.4228,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.9876543209876543,
154
+ "grad_norm": 0.19285239279270172,
155
+ "learning_rate": 7.500000000000001e-05,
156
+ "loss": 0.3954,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.9876543209876543,
161
+ "eval_loss": 0.42280665040016174,
162
+ "eval_runtime": 84.8191,
163
+ "eval_samples_per_second": 11.118,
164
+ "eval_steps_per_second": 0.696,
165
+ "step": 20
166
+ }
167
+ ],
168
+ "logging_steps": 1,
169
+ "max_steps": 40,
170
+ "num_input_tokens_seen": 0,
171
+ "num_train_epochs": 2,
172
+ "save_steps": 500,
173
+ "total_flos": 59305762897920.0,
174
+ "train_batch_size": 8,
175
+ "trial_name": null,
176
+ "trial_params": null
177
+ }
checkpoint-20/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f77884ddd41b5ba98022ed7940c5beac63484e3fee92ab74006e9bad8e19e3
3
+ size 5585
checkpoint-40/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "time_moe_50m",
3
+ "apply_aux_loss": true,
4
+ "architectures": [
5
+ "TimeMoeForPrediction"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
+ "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
+ },
12
+ "channel_configs": [
13
+ [
14
+ 63,
15
+ 1,
16
+ 1
17
+ ],
18
+ [
19
+ 6,
20
+ 1,
21
+ 4
22
+ ],
23
+ [
24
+ 5,
25
+ 1,
26
+ 1
27
+ ],
28
+ [
29
+ 5,
30
+ 1,
31
+ 1
32
+ ]
33
+ ],
34
+ "embedding_hidden_size": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 384,
37
+ "horizon_lengths": [
38
+ 1,
39
+ 8,
40
+ 32,
41
+ 64
42
+ ],
43
+ "initializer_range": 0.02,
44
+ "input_size": 42,
45
+ "intermediate_size": 1536,
46
+ "max_position_embeddings": 4096,
47
+ "model_type": "time_moe",
48
+ "num_attention_heads": 12,
49
+ "num_experts": 8,
50
+ "num_experts_per_tok": 2,
51
+ "num_hidden_layers": 12,
52
+ "num_key_value_heads": 12,
53
+ "rms_norm_eps": 1e-06,
54
+ "rope_theta": 10000,
55
+ "router_aux_loss_factor": 0.02,
56
+ "tie_word_embeddings": false,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.40.1",
59
+ "use_cache": true,
60
+ "use_dense": false
61
+ }
checkpoint-40/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.40.1"
4
+ }
checkpoint-40/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1270c0f0619d0adb8fa6573f5c1485fc3eb67167357e845c4a0a638512ffc4d
3
+ size 484301192
checkpoint-40/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eb59b42b0a035b09eb28f12cb7f6cb5dfb0a484d6cff2ab1f72cb9f6a7069b8
3
+ size 968903096
checkpoint-40/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b1fcb242e52a73ed5f8f4daef23a59696d5f4b7963226e7adfac0f95b9ddb94
3
+ size 14391
checkpoint-40/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a66f30d36945a615551c789574577fb691ccdc857f214a99bda842d314afc0
3
+ size 1465
checkpoint-40/trainer_state.json ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9753086419753085,
5
+ "eval_steps": 10,
6
+ "global_step": 40,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04938271604938271,
13
+ "grad_norm": 1.3728221654891968,
14
+ "learning_rate": 9.992293334332821e-05,
15
+ "loss": 0.5092,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.09876543209876543,
20
+ "grad_norm": 0.4933850169181824,
21
+ "learning_rate": 9.969220851487845e-05,
22
+ "loss": 0.4755,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.14814814814814814,
27
+ "grad_norm": 1.8201042413711548,
28
+ "learning_rate": 9.930924800994192e-05,
29
+ "loss": 0.5033,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.19753086419753085,
34
+ "grad_norm": 0.3733859360218048,
35
+ "learning_rate": 9.877641290737885e-05,
36
+ "loss": 0.4497,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.24691358024691357,
41
+ "grad_norm": 0.5732631683349609,
42
+ "learning_rate": 9.809698831278218e-05,
43
+ "loss": 0.4617,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.2962962962962963,
48
+ "grad_norm": 0.5703950524330139,
49
+ "learning_rate": 9.72751631047092e-05,
50
+ "loss": 0.4452,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.345679012345679,
55
+ "grad_norm": 0.27186131477355957,
56
+ "learning_rate": 9.631600410885231e-05,
57
+ "loss": 0.4493,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.3950617283950617,
62
+ "grad_norm": 0.42532485723495483,
63
+ "learning_rate": 9.522542485937369e-05,
64
+ "loss": 0.4452,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.4444444444444444,
69
+ "grad_norm": 0.43852725625038147,
70
+ "learning_rate": 9.401014914000078e-05,
71
+ "loss": 0.4326,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.49382716049382713,
76
+ "grad_norm": 0.28129202127456665,
77
+ "learning_rate": 9.267766952966369e-05,
78
+ "loss": 0.4344,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.49382716049382713,
83
+ "eval_loss": 0.43601080775260925,
84
+ "eval_runtime": 86.6474,
85
+ "eval_samples_per_second": 10.883,
86
+ "eval_steps_per_second": 0.681,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.5432098765432098,
91
+ "grad_norm": 0.1692819446325302,
92
+ "learning_rate": 9.123620120825459e-05,
93
+ "loss": 0.4306,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.5925925925925926,
98
+ "grad_norm": 0.3884420096874237,
99
+ "learning_rate": 8.969463130731183e-05,
100
+ "loss": 0.429,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.6419753086419753,
105
+ "grad_norm": 0.3931941092014313,
106
+ "learning_rate": 8.806246411789872e-05,
107
+ "loss": 0.4224,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.691358024691358,
112
+ "grad_norm": 0.14838409423828125,
113
+ "learning_rate": 8.634976249348867e-05,
114
+ "loss": 0.4411,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.7407407407407407,
119
+ "grad_norm": 0.14195731282234192,
120
+ "learning_rate": 8.456708580912724e-05,
121
+ "loss": 0.4014,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.7901234567901234,
126
+ "grad_norm": 0.31687867641448975,
127
+ "learning_rate": 8.27254248593737e-05,
128
+ "loss": 0.4185,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.8395061728395061,
133
+ "grad_norm": 0.2713411748409271,
134
+ "learning_rate": 8.083613409639764e-05,
135
+ "loss": 0.4236,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.8888888888888888,
140
+ "grad_norm": 0.12129750102758408,
141
+ "learning_rate": 7.891086162600579e-05,
142
+ "loss": 0.4118,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.9382716049382716,
147
+ "grad_norm": 0.12624289095401764,
148
+ "learning_rate": 7.696147739319612e-05,
149
+ "loss": 0.4228,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.9876543209876543,
154
+ "grad_norm": 0.19285239279270172,
155
+ "learning_rate": 7.500000000000001e-05,
156
+ "loss": 0.3954,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.9876543209876543,
161
+ "eval_loss": 0.42280665040016174,
162
+ "eval_runtime": 84.8191,
163
+ "eval_samples_per_second": 11.118,
164
+ "eval_steps_per_second": 0.696,
165
+ "step": 20
166
+ },
167
+ {
168
+ "epoch": 1.037037037037037,
169
+ "grad_norm": 0.22232137620449066,
170
+ "learning_rate": 7.303852260680389e-05,
171
+ "loss": 0.4039,
172
+ "step": 21
173
+ },
174
+ {
175
+ "epoch": 1.0864197530864197,
176
+ "grad_norm": 0.17555947601795197,
177
+ "learning_rate": 7.108913837399423e-05,
178
+ "loss": 0.4044,
179
+ "step": 22
180
+ },
181
+ {
182
+ "epoch": 1.1358024691358024,
183
+ "grad_norm": 0.06463441997766495,
184
+ "learning_rate": 6.916386590360238e-05,
185
+ "loss": 0.4013,
186
+ "step": 23
187
+ },
188
+ {
189
+ "epoch": 1.1851851851851851,
190
+ "grad_norm": 0.18348956108093262,
191
+ "learning_rate": 6.727457514062632e-05,
192
+ "loss": 0.4213,
193
+ "step": 24
194
+ },
195
+ {
196
+ "epoch": 1.2345679012345678,
197
+ "grad_norm": 0.20464195311069489,
198
+ "learning_rate": 6.543291419087276e-05,
199
+ "loss": 0.4081,
200
+ "step": 25
201
+ },
202
+ {
203
+ "epoch": 1.2839506172839505,
204
+ "grad_norm": 0.1978413313627243,
205
+ "learning_rate": 6.365023750651134e-05,
206
+ "loss": 0.3962,
207
+ "step": 26
208
+ },
209
+ {
210
+ "epoch": 1.3333333333333333,
211
+ "grad_norm": 0.08643993735313416,
212
+ "learning_rate": 6.193753588210128e-05,
213
+ "loss": 0.4005,
214
+ "step": 27
215
+ },
216
+ {
217
+ "epoch": 1.382716049382716,
218
+ "grad_norm": 0.12691061198711395,
219
+ "learning_rate": 6.030536869268818e-05,
220
+ "loss": 0.4051,
221
+ "step": 28
222
+ },
223
+ {
224
+ "epoch": 1.4320987654320987,
225
+ "grad_norm": 0.1899513453245163,
226
+ "learning_rate": 5.8763798791745415e-05,
227
+ "loss": 0.4226,
228
+ "step": 29
229
+ },
230
+ {
231
+ "epoch": 1.4814814814814814,
232
+ "grad_norm": 0.16831448674201965,
233
+ "learning_rate": 5.7322330470336315e-05,
234
+ "loss": 0.4278,
235
+ "step": 30
236
+ },
237
+ {
238
+ "epoch": 1.4814814814814814,
239
+ "eval_loss": 0.416827529668808,
240
+ "eval_runtime": 90.8316,
241
+ "eval_samples_per_second": 10.382,
242
+ "eval_steps_per_second": 0.65,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 1.5308641975308643,
247
+ "grad_norm": 0.08136623352766037,
248
+ "learning_rate": 5.5989850859999227e-05,
249
+ "loss": 0.4267,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 1.5802469135802468,
254
+ "grad_norm": 0.06590854376554489,
255
+ "learning_rate": 5.4774575140626315e-05,
256
+ "loss": 0.414,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 1.6296296296296298,
261
+ "grad_norm": 0.1640368402004242,
262
+ "learning_rate": 5.36839958911477e-05,
263
+ "loss": 0.3984,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 1.6790123456790123,
268
+ "grad_norm": 0.07538054138422012,
269
+ "learning_rate": 5.2724836895290805e-05,
270
+ "loss": 0.3977,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 1.7283950617283952,
275
+ "grad_norm": 0.10800465941429138,
276
+ "learning_rate": 5.190301168721783e-05,
277
+ "loss": 0.3962,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 1.7777777777777777,
282
+ "grad_norm": 0.05263437330722809,
283
+ "learning_rate": 5.122358709262116e-05,
284
+ "loss": 0.406,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 1.8271604938271606,
289
+ "grad_norm": 0.10143052786588669,
290
+ "learning_rate": 5.0690751990058084e-05,
291
+ "loss": 0.4073,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 1.876543209876543,
296
+ "grad_norm": 0.10162568092346191,
297
+ "learning_rate": 5.030779148512156e-05,
298
+ "loss": 0.3952,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 1.925925925925926,
303
+ "grad_norm": 0.12134301662445068,
304
+ "learning_rate": 5.00770666566718e-05,
305
+ "loss": 0.3962,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 1.9753086419753085,
310
+ "grad_norm": 0.09298070520162582,
311
+ "learning_rate": 5e-05,
312
+ "loss": 0.4101,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 1.9753086419753085,
317
+ "eval_loss": 0.4142439365386963,
318
+ "eval_runtime": 91.1081,
319
+ "eval_samples_per_second": 10.35,
320
+ "eval_steps_per_second": 0.648,
321
+ "step": 40
322
+ }
323
+ ],
324
+ "logging_steps": 1,
325
+ "max_steps": 40,
326
+ "num_input_tokens_seen": 0,
327
+ "num_train_epochs": 2,
328
+ "save_steps": 500,
329
+ "total_flos": 117147185971200.0,
330
+ "train_batch_size": 8,
331
+ "trial_name": null,
332
+ "trial_params": null
333
+ }
checkpoint-40/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f77884ddd41b5ba98022ed7940c5beac63484e3fee92ab74006e9bad8e19e3
3
+ size 5585
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b0f8d6a29a701a2e5f989864080355d3c8535d9fb4648d862d3c41cc38720ff
3
  size 484301192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1270c0f0619d0adb8fa6573f5c1485fc3eb67167357e845c4a0a638512ffc4d
3
  size 484301192
tb_logs/events.out.tfevents.1760155830.Kiat.34980.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:778447207426355cc5d22aca433c7e96eb22fcd695619bda085bd7ac59bf9a13
3
+ size 15054