kiatkock commited on
Commit
015b6ca
·
verified ·
1 Parent(s): 60c10e4

Upload folder using huggingface_hub

Browse files
checkpoint-1012/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "time_moe_50m",
3
+ "apply_aux_loss": true,
4
+ "architectures": [
5
+ "TimeMoeForPrediction"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
+ "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
+ },
12
+ "channel_configs": [
13
+ [
14
+ 63,
15
+ 1,
16
+ 1
17
+ ],
18
+ [
19
+ 6,
20
+ 1,
21
+ 4
22
+ ],
23
+ [
24
+ 5,
25
+ 1,
26
+ 1
27
+ ],
28
+ [
29
+ 5,
30
+ 1,
31
+ 1
32
+ ]
33
+ ],
34
+ "embedding_hidden_size": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 384,
37
+ "horizon_lengths": [
38
+ 1,
39
+ 8,
40
+ 32,
41
+ 64
42
+ ],
43
+ "initializer_range": 0.02,
44
+ "input_size": 42,
45
+ "intermediate_size": 1536,
46
+ "max_position_embeddings": 4096,
47
+ "model_type": "time_moe",
48
+ "num_attention_heads": 12,
49
+ "num_experts": 8,
50
+ "num_experts_per_tok": 2,
51
+ "num_hidden_layers": 12,
52
+ "num_key_value_heads": 12,
53
+ "rms_norm_eps": 1e-06,
54
+ "rope_theta": 10000,
55
+ "router_aux_loss_factor": 0.02,
56
+ "tie_word_embeddings": false,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.40.1",
59
+ "use_cache": true,
60
+ "use_dense": false
61
+ }
checkpoint-1012/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.40.1"
4
+ }
checkpoint-1012/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6b88928e1afca14f30bdd89c066721d058297e625931a890a86e98f678b84e7
3
+ size 484301192
checkpoint-1012/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e79974ab08ca0fccbfb17cd4c57676c930d13a46445285f20686faf9d0409b
3
+ size 968924920
checkpoint-1012/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cbe4833c475023cd0b15013740d375640598d423ffe268731c6a5fe33bce683
3
+ size 14645
checkpoint-1012/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db999be284ba0abf1c609173979ea3f83c7c9b8cfb47a4d480f93d361d849447
3
+ size 1465
checkpoint-1012/trainer_state.json ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9995061728395062,
5
+ "eval_steps": 250,
6
+ "global_step": 1012,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0009876543209876543,
13
+ "grad_norm": 1.221229910850525,
14
+ "learning_rate": 9.99999698845987e-05,
15
+ "loss": 0.4907,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.04938271604938271,
20
+ "grad_norm": 0.05035819858312607,
21
+ "learning_rate": 9.99247492630919e-05,
22
+ "loss": 0.3903,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.09876543209876543,
27
+ "grad_norm": 0.14429445564746857,
28
+ "learning_rate": 9.969945006624004e-05,
29
+ "loss": 0.3593,
30
+ "step": 100
31
+ },
32
+ {
33
+ "epoch": 0.14814814814814814,
34
+ "grad_norm": 0.15117663145065308,
35
+ "learning_rate": 9.932545872389141e-05,
36
+ "loss": 0.3437,
37
+ "step": 150
38
+ },
39
+ {
40
+ "epoch": 0.19753086419753085,
41
+ "grad_norm": 0.19646206498146057,
42
+ "learning_rate": 9.880502668597476e-05,
43
+ "loss": 0.3354,
44
+ "step": 200
45
+ },
46
+ {
47
+ "epoch": 0.24691358024691357,
48
+ "grad_norm": 0.25732266902923584,
49
+ "learning_rate": 9.814128698403919e-05,
50
+ "loss": 0.3283,
51
+ "step": 250
52
+ },
53
+ {
54
+ "epoch": 0.24691358024691357,
55
+ "eval_loss": 0.3421019911766052,
56
+ "eval_runtime": 137.4213,
57
+ "eval_samples_per_second": 686.211,
58
+ "eval_steps_per_second": 21.445,
59
+ "step": 250
60
+ },
61
+ {
62
+ "epoch": 0.2962962962962963,
63
+ "grad_norm": 0.12474808096885681,
64
+ "learning_rate": 9.733823537021955e-05,
65
+ "loss": 0.3266,
66
+ "step": 300
67
+ },
68
+ {
69
+ "epoch": 0.345679012345679,
70
+ "grad_norm": 0.12876740097999573,
71
+ "learning_rate": 9.640070626257307e-05,
72
+ "loss": 0.3259,
73
+ "step": 350
74
+ },
75
+ {
76
+ "epoch": 0.3950617283950617,
77
+ "grad_norm": 0.20461653172969818,
78
+ "learning_rate": 9.533434364159762e-05,
79
+ "loss": 0.3248,
80
+ "step": 400
81
+ },
82
+ {
83
+ "epoch": 0.4444444444444444,
84
+ "grad_norm": 0.22041602432727814,
85
+ "learning_rate": 9.414556707313633e-05,
86
+ "loss": 0.3219,
87
+ "step": 450
88
+ },
89
+ {
90
+ "epoch": 0.49382716049382713,
91
+ "grad_norm": 0.27202293276786804,
92
+ "learning_rate": 9.284153306221289e-05,
93
+ "loss": 0.3178,
94
+ "step": 500
95
+ },
96
+ {
97
+ "epoch": 0.49382716049382713,
98
+ "eval_loss": 0.33308419585227966,
99
+ "eval_runtime": 140.6684,
100
+ "eval_samples_per_second": 670.371,
101
+ "eval_steps_per_second": 20.95,
102
+ "step": 500
103
+ },
104
+ {
105
+ "epoch": 0.5432098765432098,
106
+ "grad_norm": 0.1661626547574997,
107
+ "learning_rate": 9.143009197044932e-05,
108
+ "loss": 0.3211,
109
+ "step": 550
110
+ },
111
+ {
112
+ "epoch": 0.5925925925925926,
113
+ "grad_norm": 0.14871230721473694,
114
+ "learning_rate": 8.991974075642621e-05,
115
+ "loss": 0.3207,
116
+ "step": 600
117
+ },
118
+ {
119
+ "epoch": 0.6419753086419753,
120
+ "grad_norm": 0.17097660899162292,
121
+ "learning_rate": 8.831957182349119e-05,
122
+ "loss": 0.3185,
123
+ "step": 650
124
+ },
125
+ {
126
+ "epoch": 0.691358024691358,
127
+ "grad_norm": 0.1582462042570114,
128
+ "learning_rate": 8.663921828295474e-05,
129
+ "loss": 0.3173,
130
+ "step": 700
131
+ },
132
+ {
133
+ "epoch": 0.7407407407407407,
134
+ "grad_norm": 0.25067687034606934,
135
+ "learning_rate": 8.488879596219216e-05,
136
+ "loss": 0.3191,
137
+ "step": 750
138
+ },
139
+ {
140
+ "epoch": 0.7407407407407407,
141
+ "eval_loss": 0.33356761932373047,
142
+ "eval_runtime": 145.0159,
143
+ "eval_samples_per_second": 650.274,
144
+ "eval_steps_per_second": 20.322,
145
+ "step": 750
146
+ },
147
+ {
148
+ "epoch": 0.7901234567901234,
149
+ "grad_norm": 0.1577720046043396,
150
+ "learning_rate": 8.307884250676648e-05,
151
+ "loss": 0.3153,
152
+ "step": 800
153
+ },
154
+ {
155
+ "epoch": 0.8395061728395061,
156
+ "grad_norm": 0.166303813457489,
157
+ "learning_rate": 8.122025394318091e-05,
158
+ "loss": 0.3166,
159
+ "step": 850
160
+ },
161
+ {
162
+ "epoch": 0.8888888888888888,
163
+ "grad_norm": 0.17451900243759155,
164
+ "learning_rate": 7.932421908415695e-05,
165
+ "loss": 0.3156,
166
+ "step": 900
167
+ },
168
+ {
169
+ "epoch": 0.9382716049382716,
170
+ "grad_norm": 0.18235546350479126,
171
+ "learning_rate": 7.740215217132219e-05,
172
+ "loss": 0.3155,
173
+ "step": 950
174
+ },
175
+ {
176
+ "epoch": 0.9876543209876543,
177
+ "grad_norm": 0.22571995854377747,
178
+ "learning_rate": 7.546562416080285e-05,
179
+ "loss": 0.3185,
180
+ "step": 1000
181
+ },
182
+ {
183
+ "epoch": 0.9876543209876543,
184
+ "eval_loss": 0.33335721492767334,
185
+ "eval_runtime": 144.7926,
186
+ "eval_samples_per_second": 651.276,
187
+ "eval_steps_per_second": 20.353,
188
+ "step": 1000
189
+ }
190
+ ],
191
+ "logging_steps": 50,
192
+ "max_steps": 2024,
193
+ "num_input_tokens_seen": 0,
194
+ "num_train_epochs": 2,
195
+ "save_steps": 500,
196
+ "total_flos": 5930576289792000.0,
197
+ "train_batch_size": 16,
198
+ "trial_name": null,
199
+ "trial_params": null
200
+ }
checkpoint-1012/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8c0e5ae1d16b61cb0bbe6e1913a36d352f5b976b4d77e04f450f90733900582
3
+ size 5521
checkpoint-2024/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "time_moe_50m",
3
+ "apply_aux_loss": true,
4
+ "architectures": [
5
+ "TimeMoeForPrediction"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
+ "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
+ },
12
+ "channel_configs": [
13
+ [
14
+ 63,
15
+ 1,
16
+ 1
17
+ ],
18
+ [
19
+ 6,
20
+ 1,
21
+ 4
22
+ ],
23
+ [
24
+ 5,
25
+ 1,
26
+ 1
27
+ ],
28
+ [
29
+ 5,
30
+ 1,
31
+ 1
32
+ ]
33
+ ],
34
+ "embedding_hidden_size": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 384,
37
+ "horizon_lengths": [
38
+ 1,
39
+ 8,
40
+ 32,
41
+ 64
42
+ ],
43
+ "initializer_range": 0.02,
44
+ "input_size": 42,
45
+ "intermediate_size": 1536,
46
+ "max_position_embeddings": 4096,
47
+ "model_type": "time_moe",
48
+ "num_attention_heads": 12,
49
+ "num_experts": 8,
50
+ "num_experts_per_tok": 2,
51
+ "num_hidden_layers": 12,
52
+ "num_key_value_heads": 12,
53
+ "rms_norm_eps": 1e-06,
54
+ "rope_theta": 10000,
55
+ "router_aux_loss_factor": 0.02,
56
+ "tie_word_embeddings": false,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.40.1",
59
+ "use_cache": true,
60
+ "use_dense": false
61
+ }
checkpoint-2024/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.40.1"
4
+ }
checkpoint-2024/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b84220e3bd1ccdeaecfa9308850b05ee17de6761e6f83d061265fd0c1623007
3
+ size 484301192
checkpoint-2024/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef94c5e1a4f5e32ac50ddac26d38fd3f3641155ebfc964f641ab2f2c49ab201
3
+ size 968924920
checkpoint-2024/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67a6938a14783e6890baf297362eece64c8df71219ffc210b560d0a690a4c99f
3
+ size 14645
checkpoint-2024/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89efdf2689bb6790e533b3e51ce5908662bf122670c4e379468efd264d7232ac
3
+ size 1465
checkpoint-2024/trainer_state.json ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9990123456790123,
5
+ "eval_steps": 250,
6
+ "global_step": 2024,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0009876543209876543,
13
+ "grad_norm": 1.221229910850525,
14
+ "learning_rate": 9.99999698845987e-05,
15
+ "loss": 0.4907,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.04938271604938271,
20
+ "grad_norm": 0.05035819858312607,
21
+ "learning_rate": 9.99247492630919e-05,
22
+ "loss": 0.3903,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.09876543209876543,
27
+ "grad_norm": 0.14429445564746857,
28
+ "learning_rate": 9.969945006624004e-05,
29
+ "loss": 0.3593,
30
+ "step": 100
31
+ },
32
+ {
33
+ "epoch": 0.14814814814814814,
34
+ "grad_norm": 0.15117663145065308,
35
+ "learning_rate": 9.932545872389141e-05,
36
+ "loss": 0.3437,
37
+ "step": 150
38
+ },
39
+ {
40
+ "epoch": 0.19753086419753085,
41
+ "grad_norm": 0.19646206498146057,
42
+ "learning_rate": 9.880502668597476e-05,
43
+ "loss": 0.3354,
44
+ "step": 200
45
+ },
46
+ {
47
+ "epoch": 0.24691358024691357,
48
+ "grad_norm": 0.25732266902923584,
49
+ "learning_rate": 9.814128698403919e-05,
50
+ "loss": 0.3283,
51
+ "step": 250
52
+ },
53
+ {
54
+ "epoch": 0.24691358024691357,
55
+ "eval_loss": 0.3421019911766052,
56
+ "eval_runtime": 137.4213,
57
+ "eval_samples_per_second": 686.211,
58
+ "eval_steps_per_second": 21.445,
59
+ "step": 250
60
+ },
61
+ {
62
+ "epoch": 0.2962962962962963,
63
+ "grad_norm": 0.12474808096885681,
64
+ "learning_rate": 9.733823537021955e-05,
65
+ "loss": 0.3266,
66
+ "step": 300
67
+ },
68
+ {
69
+ "epoch": 0.345679012345679,
70
+ "grad_norm": 0.12876740097999573,
71
+ "learning_rate": 9.640070626257307e-05,
72
+ "loss": 0.3259,
73
+ "step": 350
74
+ },
75
+ {
76
+ "epoch": 0.3950617283950617,
77
+ "grad_norm": 0.20461653172969818,
78
+ "learning_rate": 9.533434364159762e-05,
79
+ "loss": 0.3248,
80
+ "step": 400
81
+ },
82
+ {
83
+ "epoch": 0.4444444444444444,
84
+ "grad_norm": 0.22041602432727814,
85
+ "learning_rate": 9.414556707313633e-05,
86
+ "loss": 0.3219,
87
+ "step": 450
88
+ },
89
+ {
90
+ "epoch": 0.49382716049382713,
91
+ "grad_norm": 0.27202293276786804,
92
+ "learning_rate": 9.284153306221289e-05,
93
+ "loss": 0.3178,
94
+ "step": 500
95
+ },
96
+ {
97
+ "epoch": 0.49382716049382713,
98
+ "eval_loss": 0.33308419585227966,
99
+ "eval_runtime": 140.6684,
100
+ "eval_samples_per_second": 670.371,
101
+ "eval_steps_per_second": 20.95,
102
+ "step": 500
103
+ },
104
+ {
105
+ "epoch": 0.5432098765432098,
106
+ "grad_norm": 0.1661626547574997,
107
+ "learning_rate": 9.143009197044932e-05,
108
+ "loss": 0.3211,
109
+ "step": 550
110
+ },
111
+ {
112
+ "epoch": 0.5925925925925926,
113
+ "grad_norm": 0.14871230721473694,
114
+ "learning_rate": 8.991974075642621e-05,
115
+ "loss": 0.3207,
116
+ "step": 600
117
+ },
118
+ {
119
+ "epoch": 0.6419753086419753,
120
+ "grad_norm": 0.17097660899162292,
121
+ "learning_rate": 8.831957182349119e-05,
122
+ "loss": 0.3185,
123
+ "step": 650
124
+ },
125
+ {
126
+ "epoch": 0.691358024691358,
127
+ "grad_norm": 0.1582462042570114,
128
+ "learning_rate": 8.663921828295474e-05,
129
+ "loss": 0.3173,
130
+ "step": 700
131
+ },
132
+ {
133
+ "epoch": 0.7407407407407407,
134
+ "grad_norm": 0.25067687034606934,
135
+ "learning_rate": 8.488879596219216e-05,
136
+ "loss": 0.3191,
137
+ "step": 750
138
+ },
139
+ {
140
+ "epoch": 0.7407407407407407,
141
+ "eval_loss": 0.33356761932373047,
142
+ "eval_runtime": 145.0159,
143
+ "eval_samples_per_second": 650.274,
144
+ "eval_steps_per_second": 20.322,
145
+ "step": 750
146
+ },
147
+ {
148
+ "epoch": 0.7901234567901234,
149
+ "grad_norm": 0.1577720046043396,
150
+ "learning_rate": 8.307884250676648e-05,
151
+ "loss": 0.3153,
152
+ "step": 800
153
+ },
154
+ {
155
+ "epoch": 0.8395061728395061,
156
+ "grad_norm": 0.166303813457489,
157
+ "learning_rate": 8.122025394318091e-05,
158
+ "loss": 0.3166,
159
+ "step": 850
160
+ },
161
+ {
162
+ "epoch": 0.8888888888888888,
163
+ "grad_norm": 0.17451900243759155,
164
+ "learning_rate": 7.932421908415695e-05,
165
+ "loss": 0.3156,
166
+ "step": 900
167
+ },
168
+ {
169
+ "epoch": 0.9382716049382716,
170
+ "grad_norm": 0.18235546350479126,
171
+ "learning_rate": 7.740215217132219e-05,
172
+ "loss": 0.3155,
173
+ "step": 950
174
+ },
175
+ {
176
+ "epoch": 0.9876543209876543,
177
+ "grad_norm": 0.22571995854377747,
178
+ "learning_rate": 7.546562416080285e-05,
179
+ "loss": 0.3185,
180
+ "step": 1000
181
+ },
182
+ {
183
+ "epoch": 0.9876543209876543,
184
+ "eval_loss": 0.33335721492767334,
185
+ "eval_runtime": 144.7926,
186
+ "eval_samples_per_second": 651.276,
187
+ "eval_steps_per_second": 20.353,
188
+ "step": 1000
189
+ },
190
+ {
191
+ "epoch": 1.037037037037037,
192
+ "grad_norm": 0.3201523721218109,
193
+ "learning_rate": 7.35262930653857e-05,
194
+ "loss": 0.3183,
195
+ "step": 1050
196
+ },
197
+ {
198
+ "epoch": 1.0864197530864197,
199
+ "grad_norm": 0.3311742842197418,
200
+ "learning_rate": 7.159583377259385e-05,
201
+ "loss": 0.3172,
202
+ "step": 1100
203
+ },
204
+ {
205
+ "epoch": 1.1358024691358024,
206
+ "grad_norm": 0.2752295434474945,
207
+ "learning_rate": 6.968586776117559e-05,
208
+ "loss": 0.3166,
209
+ "step": 1150
210
+ },
211
+ {
212
+ "epoch": 1.1851851851851851,
213
+ "grad_norm": 0.33451607823371887,
214
+ "learning_rate": 6.780789313911721e-05,
215
+ "loss": 0.3166,
216
+ "step": 1200
217
+ },
218
+ {
219
+ "epoch": 1.2345679012345678,
220
+ "grad_norm": 0.12275319546461105,
221
+ "learning_rate": 6.597321542435513e-05,
222
+ "loss": 0.3155,
223
+ "step": 1250
224
+ },
225
+ {
226
+ "epoch": 1.2345679012345678,
227
+ "eval_loss": 0.33012306690216064,
228
+ "eval_runtime": 140.4999,
229
+ "eval_samples_per_second": 671.175,
230
+ "eval_steps_per_second": 20.975,
231
+ "step": 1250
232
+ },
233
+ {
234
+ "epoch": 1.2839506172839505,
235
+ "grad_norm": 0.2870579659938812,
236
+ "learning_rate": 6.419287948489126e-05,
237
+ "loss": 0.3154,
238
+ "step": 1300
239
+ },
240
+ {
241
+ "epoch": 1.3333333333333333,
242
+ "grad_norm": 0.08633468300104141,
243
+ "learning_rate": 6.247760304803671e-05,
244
+ "loss": 0.3138,
245
+ "step": 1350
246
+ },
247
+ {
248
+ "epoch": 1.382716049382716,
249
+ "grad_norm": 0.29040753841400146,
250
+ "learning_rate": 6.083771217906143e-05,
251
+ "loss": 0.3101,
252
+ "step": 1400
253
+ },
254
+ {
255
+ "epoch": 1.4320987654320987,
256
+ "grad_norm": 0.2062658816576004,
257
+ "learning_rate": 5.928307911767258e-05,
258
+ "loss": 0.3143,
259
+ "step": 1450
260
+ },
261
+ {
262
+ "epoch": 1.4814814814814814,
263
+ "grad_norm": 0.23233149945735931,
264
+ "learning_rate": 5.7823062846549435e-05,
265
+ "loss": 0.3152,
266
+ "step": 1500
267
+ },
268
+ {
269
+ "epoch": 1.4814814814814814,
270
+ "eval_loss": 0.33253857493400574,
271
+ "eval_runtime": 139.0406,
272
+ "eval_samples_per_second": 678.219,
273
+ "eval_steps_per_second": 21.195,
274
+ "step": 1500
275
+ },
276
+ {
277
+ "epoch": 1.5308641975308643,
278
+ "grad_norm": 0.1579607129096985,
279
+ "learning_rate": 5.646645274971599e-05,
280
+ "loss": 0.3171,
281
+ "step": 1550
282
+ },
283
+ {
284
+ "epoch": 1.5802469135802468,
285
+ "grad_norm": 0.38029882311820984,
286
+ "learning_rate": 5.522141569993096e-05,
287
+ "loss": 0.3148,
288
+ "step": 1600
289
+ },
290
+ {
291
+ "epoch": 1.6296296296296298,
292
+ "grad_norm": 0.26864030957221985,
293
+ "learning_rate": 5.409544689363224e-05,
294
+ "loss": 0.3146,
295
+ "step": 1650
296
+ },
297
+ {
298
+ "epoch": 1.6790123456790123,
299
+ "grad_norm": 0.41325536370277405,
300
+ "learning_rate": 5.309532472941261e-05,
301
+ "loss": 0.3135,
302
+ "step": 1700
303
+ },
304
+ {
305
+ "epoch": 1.7283950617283952,
306
+ "grad_norm": 0.26135507225990295,
307
+ "learning_rate": 5.222707000166053e-05,
308
+ "loss": 0.3127,
309
+ "step": 1750
310
+ },
311
+ {
312
+ "epoch": 1.7283950617283952,
313
+ "eval_loss": 0.3314434289932251,
314
+ "eval_runtime": 137.4334,
315
+ "eval_samples_per_second": 686.151,
316
+ "eval_steps_per_second": 21.443,
317
+ "step": 1750
318
+ },
319
+ {
320
+ "epoch": 1.7777777777777777,
321
+ "grad_norm": 0.35287410020828247,
322
+ "learning_rate": 5.1495909655022966e-05,
323
+ "loss": 0.3127,
324
+ "step": 1800
325
+ },
326
+ {
327
+ "epoch": 1.8271604938271606,
328
+ "grad_norm": 0.2525063157081604,
329
+ "learning_rate": 5.090624531789053e-05,
330
+ "loss": 0.3116,
331
+ "step": 1850
332
+ },
333
+ {
334
+ "epoch": 1.876543209876543,
335
+ "grad_norm": 0.5191920399665833,
336
+ "learning_rate": 5.0461626804335035e-05,
337
+ "loss": 0.312,
338
+ "step": 1900
339
+ },
340
+ {
341
+ "epoch": 1.925925925925926,
342
+ "grad_norm": 0.3505001664161682,
343
+ "learning_rate": 5.0164730744019504e-05,
344
+ "loss": 0.3109,
345
+ "step": 1950
346
+ },
347
+ {
348
+ "epoch": 1.9753086419753085,
349
+ "grad_norm": 0.2468252331018448,
350
+ "learning_rate": 5.001734446872988e-05,
351
+ "loss": 0.3124,
352
+ "step": 2000
353
+ },
354
+ {
355
+ "epoch": 1.9753086419753085,
356
+ "eval_loss": 0.32719919085502625,
357
+ "eval_runtime": 138.3829,
358
+ "eval_samples_per_second": 681.443,
359
+ "eval_steps_per_second": 21.296,
360
+ "step": 2000
361
+ }
362
+ ],
363
+ "logging_steps": 50,
364
+ "max_steps": 2024,
365
+ "num_input_tokens_seen": 0,
366
+ "num_train_epochs": 2,
367
+ "save_steps": 500,
368
+ "total_flos": 1.185529522028544e+16,
369
+ "train_batch_size": 16,
370
+ "trial_name": null,
371
+ "trial_params": null
372
+ }
checkpoint-2024/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8c0e5ae1d16b61cb0bbe6e1913a36d352f5b976b4d77e04f450f90733900582
3
+ size 5521
config.json CHANGED
@@ -1,61 +1,61 @@
1
- {
2
- "_name_or_path": "time_moe_50m",
3
- "apply_aux_loss": true,
4
- "architectures": [
5
- "TimeMoeForPrediction"
6
- ],
7
- "attention_dropout": 0.0,
8
- "auto_map": {
9
- "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
- "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
- },
12
- "channel_configs": [
13
- [
14
- 63,
15
- 1,
16
- 1
17
- ],
18
- [
19
- 6,
20
- 1,
21
- 4
22
- ],
23
- [
24
- 5,
25
- 1,
26
- 1
27
- ],
28
- [
29
- 5,
30
- 1,
31
- 1
32
- ]
33
- ],
34
- "embedding_hidden_size": 128,
35
- "hidden_act": "silu",
36
- "hidden_size": 384,
37
- "horizon_lengths": [
38
- 1,
39
- 8,
40
- 32,
41
- 64
42
- ],
43
- "initializer_range": 0.02,
44
- "input_size": 42,
45
- "intermediate_size": 1536,
46
- "max_position_embeddings": 4096,
47
- "model_type": "time_moe",
48
- "num_attention_heads": 12,
49
- "num_experts": 8,
50
- "num_experts_per_tok": 2,
51
- "num_hidden_layers": 12,
52
- "num_key_value_heads": 12,
53
- "rms_norm_eps": 1e-06,
54
- "rope_theta": 10000,
55
- "router_aux_loss_factor": 0.02,
56
- "tie_word_embeddings": false,
57
- "torch_dtype": "float32",
58
- "transformers_version": "4.40.1",
59
- "use_cache": true,
60
- "use_dense": false
61
- }
 
1
+ {
2
+ "_name_or_path": "time_moe_50m",
3
+ "apply_aux_loss": true,
4
+ "architectures": [
5
+ "TimeMoeForPrediction"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
+ "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
+ },
12
+ "channel_configs": [
13
+ [
14
+ 63,
15
+ 1,
16
+ 1
17
+ ],
18
+ [
19
+ 6,
20
+ 1,
21
+ 4
22
+ ],
23
+ [
24
+ 5,
25
+ 1,
26
+ 1
27
+ ],
28
+ [
29
+ 5,
30
+ 1,
31
+ 1
32
+ ]
33
+ ],
34
+ "embedding_hidden_size": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 384,
37
+ "horizon_lengths": [
38
+ 1,
39
+ 8,
40
+ 32,
41
+ 64
42
+ ],
43
+ "initializer_range": 0.02,
44
+ "input_size": 42,
45
+ "intermediate_size": 1536,
46
+ "max_position_embeddings": 4096,
47
+ "model_type": "time_moe",
48
+ "num_attention_heads": 12,
49
+ "num_experts": 8,
50
+ "num_experts_per_tok": 2,
51
+ "num_hidden_layers": 12,
52
+ "num_key_value_heads": 12,
53
+ "rms_norm_eps": 1e-06,
54
+ "rope_theta": 10000,
55
+ "router_aux_loss_factor": 0.02,
56
+ "tie_word_embeddings": false,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.40.1",
59
+ "use_cache": true,
60
+ "use_dense": false
61
+ }
generation_config.json CHANGED
@@ -1,4 +1,4 @@
1
- {
2
- "_from_model_config": true,
3
- "transformers_version": "4.40.1"
4
- }
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.40.1"
4
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1270c0f0619d0adb8fa6573f5c1485fc3eb67167357e845c4a0a638512ffc4d
3
  size 484301192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b84220e3bd1ccdeaecfa9308850b05ee17de6761e6f83d061265fd0c1623007
3
  size 484301192
tb_logs/events.out.tfevents.1760423030.luyao1.2719331.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb25d9dda7f93a6f276b76a0b21b6c59b62fd6c2a847b11fab0dca71de14368d
3
+ size 16427
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4f77884ddd41b5ba98022ed7940c5beac63484e3fee92ab74006e9bad8e19e3
3
- size 5585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8c0e5ae1d16b61cb0bbe6e1913a36d352f5b976b4d77e04f450f90733900582
3
+ size 5521