andrew-healey committed
Commit 48ba06a · verified · 1 Parent(s): 34a829d

Upload folder using huggingface_hub

half_total_bs_sqrt_lr/args.json ADDED
@@ -0,0 +1 @@
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "half_total_bs_sqrt_lr", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 12, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": null, "warmup_steps": null, "group": "shrinking_big_runs_2", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1337, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": null, "batch_size": 4, "total_batch_size": 262144, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": false, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": null, "n_embd": 768}
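The args.json above records the full configuration of this run: selective attention, n_embd 768, n_heads 12, batch_size 4 with total_batch_size 262144, and max_lr 4e-05 with decay enabled. A minimal sketch for reading it back for analysis, assuming only the standard library (the cfg name is illustrative):

import json
from types import SimpleNamespace

# Load the saved run configuration back into attribute-style access,
# mirroring the argparse-style namespace it appears to have been dumped from.
with open("half_total_bs_sqrt_lr/args.json") as f:
    cfg = SimpleNamespace(**json.load(f))

print(cfg.attention_kind, cfg.n_embd, cfg.max_lr, cfg.total_batch_size)
# selective 768 4e-05 262144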
half_total_bs_sqrt_lr/dataloader_02499.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5fcd93c5e67d67c2b8db7c6b19f314688fcb1792fa1900f091ff53035c716d9
3
+ size 964
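dataloader_02499.pt (and the model and optimizer checkpoints below) are stored as Git LFS pointers: three lines giving the spec version, a sha256 oid, and the size in bytes of the actual blob. A minimal sketch for checking a downloaded blob against such a pointer, assuming only the three-line format shown above (the file paths in the example are hypothetical):

import hashlib
import os

def matches_lfs_pointer(pointer_path, blob_path):
    # Parse the pointer's "version ...", "oid sha256:<hex>", "size <bytes>" lines.
    fields = dict(line.strip().split(" ", 1) for line in open(pointer_path))
    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])
    # Hash the blob in 1 MiB chunks so large checkpoints need not fit in memory.
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(blob_path) == expected_size

# Example (hypothetical local paths):
# matches_lfs_pointer("dataloader_02499.pt.pointer", "dataloader_02499.pt")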
half_total_bs_sqrt_lr/log2.txt ADDED
@@ -0,0 +1,303 @@
1
+ max_steps: 2500
2
+ 0 val loss 10.9982
3
+ 0 val perplexity 59766.1484
4
+ 0 train 10.999202 (lr=5.5944e-08) (hash(x)=24614019)
5
+ 10 train 10.982239 (lr=6.1538e-07) (hash(x)=21487749)
6
+ 20 train 10.934907 (lr=1.1748e-06) (hash(x)=20797237)
7
+ 30 train 10.861868 (lr=1.7343e-06) (hash(x)=23863724)
8
+ 40 train 10.760084 (lr=2.2937e-06) (hash(x)=20255871)
9
+ 50 train 10.623187 (lr=2.8531e-06) (hash(x)=19287357)
10
+ 60 train 10.467155 (lr=3.4126e-06) (hash(x)=19757121)
11
+ 70 train 10.267252 (lr=3.9720e-06) (hash(x)=22844276)
12
+ 80 train 10.078154 (lr=4.5315e-06) (hash(x)=19450560)
13
+ 90 train 9.912544 (lr=5.0909e-06) (hash(x)=21604092)
14
+ 100 val loss 9.7852
15
+ 100 val perplexity 17768.8633
16
+ 100 train 9.804387 (lr=5.6503e-06) (hash(x)=20874784)
17
+ 110 train 9.746110 (lr=6.2098e-06) (hash(x)=17548923)
18
+ 120 train 9.669116 (lr=6.7692e-06) (hash(x)=24955796)
19
+ 130 train 9.607136 (lr=7.3287e-06) (hash(x)=17608570)
20
+ 140 train 9.592934 (lr=7.8881e-06) (hash(x)=17029140)
21
+ 150 train 9.558821 (lr=8.4476e-06) (hash(x)=18428573)
22
+ 160 train 9.520965 (lr=9.0070e-06) (hash(x)=16410069)
23
+ 170 train 9.475086 (lr=9.5664e-06) (hash(x)=20030582)
24
+ 180 train 9.372977 (lr=1.0126e-05) (hash(x)=18857445)
25
+ 190 train 9.345454 (lr=1.0685e-05) (hash(x)=19796984)
26
+ 200 val loss 9.2528
27
+ 200 val perplexity 10433.2715
28
+ 200 train 9.247358 (lr=1.1245e-05) (hash(x)=20858271)
29
+ 210 train 9.156383 (lr=1.1804e-05) (hash(x)=18035683)
30
+ 220 train 9.050435 (lr=1.2364e-05) (hash(x)=17370295)
31
+ 230 train 9.004829 (lr=1.2923e-05) (hash(x)=23199515)
32
+ 240 train 8.921954 (lr=1.3483e-05) (hash(x)=20512454)
33
+ 250 train 8.795404 (lr=1.4042e-05) (hash(x)=25534459)
34
+ 260 train 8.736985 (lr=1.4601e-05) (hash(x)=22751192)
35
+ 270 train 8.569971 (lr=1.5161e-05) (hash(x)=19043782)
36
+ 280 train 8.449539 (lr=1.5720e-05) (hash(x)=19850067)
37
+ 290 train 8.354852 (lr=1.6280e-05) (hash(x)=21416956)
38
+ 300 val loss 8.2570
39
+ 300 val perplexity 3854.6638
40
+ 300 train 8.208916 (lr=1.6839e-05) (hash(x)=19608904)
41
+ 310 train 8.230871 (lr=1.7399e-05) (hash(x)=20982652)
42
+ 320 train 8.080886 (lr=1.7958e-05) (hash(x)=22691225)
43
+ 330 train 7.918680 (lr=1.8517e-05) (hash(x)=18378460)
44
+ 340 train 7.845694 (lr=1.9077e-05) (hash(x)=21223131)
45
+ 350 train 7.787889 (lr=1.9636e-05) (hash(x)=19308251)
46
+ 360 train 7.674394 (lr=2.0196e-05) (hash(x)=22490407)
47
+ 370 train 7.607284 (lr=2.0755e-05) (hash(x)=21833349)
48
+ 380 train 7.664894 (lr=2.1315e-05) (hash(x)=18713401)
49
+ 390 train 7.503607 (lr=2.1874e-05) (hash(x)=20490463)
50
+ 400 val loss 7.5439
51
+ 400 val perplexity 1889.1376
52
+ 400 train 7.461785 (lr=2.2434e-05) (hash(x)=20073576)
53
+ 410 train 7.383060 (lr=2.2993e-05) (hash(x)=19019422)
54
+ 420 train 7.392637 (lr=2.3552e-05) (hash(x)=22822152)
55
+ 430 train 7.373105 (lr=2.4112e-05) (hash(x)=21350620)
56
+ 440 train 7.428552 (lr=2.4671e-05) (hash(x)=15218868)
57
+ 450 train 7.394410 (lr=2.5231e-05) (hash(x)=17018676)
58
+ 460 train 7.293060 (lr=2.5790e-05) (hash(x)=24111981)
59
+ 470 train 7.267486 (lr=2.6350e-05) (hash(x)=18594586)
60
+ 480 train 7.226883 (lr=2.6909e-05) (hash(x)=16403425)
61
+ 490 train 7.170289 (lr=2.7469e-05) (hash(x)=19880070)
62
+ 500 val loss 7.2658
63
+ 500 val perplexity 1430.4702
64
+ 500 train 7.122260 (lr=2.8028e-05) (hash(x)=18499475)
65
+ 510 train 7.177752 (lr=2.8587e-05) (hash(x)=19645366)
66
+ 520 train 7.081666 (lr=2.9147e-05) (hash(x)=19054316)
67
+ 530 train 7.239734 (lr=2.9706e-05) (hash(x)=18524225)
68
+ 540 train 7.184512 (lr=3.0266e-05) (hash(x)=19437682)
69
+ 550 train 7.193623 (lr=3.0825e-05) (hash(x)=21418385)
70
+ 560 train 7.125282 (lr=3.1385e-05) (hash(x)=19530717)
71
+ 570 train 7.236847 (lr=3.1944e-05) (hash(x)=21931309)
72
+ 580 train 7.072907 (lr=3.2503e-05) (hash(x)=21228512)
73
+ 590 train 7.073298 (lr=3.3063e-05) (hash(x)=19376632)
74
+ 600 val loss 7.1176
75
+ 600 val perplexity 1233.4994
76
+ 600 train 7.014085 (lr=3.3622e-05) (hash(x)=20259915)
77
+ 610 train 7.024707 (lr=3.4182e-05) (hash(x)=19233893)
78
+ 620 train 7.160138 (lr=3.4741e-05) (hash(x)=18605673)
79
+ 630 train 7.087669 (lr=3.5301e-05) (hash(x)=20980149)
80
+ 640 train 7.051309 (lr=3.5860e-05) (hash(x)=19953548)
81
+ 650 train 6.946896 (lr=3.6420e-05) (hash(x)=22383615)
82
+ 660 train 7.027440 (lr=3.6979e-05) (hash(x)=22297279)
83
+ 670 train 6.953912 (lr=3.7538e-05) (hash(x)=20110140)
84
+ 680 train 6.937315 (lr=3.8098e-05) (hash(x)=20797751)
85
+ 690 train 6.860869 (lr=3.8657e-05) (hash(x)=20377877)
86
+ 700 val loss 6.9829
87
+ 700 val perplexity 1078.0793
88
+ 700 train 6.861719 (lr=3.9217e-05) (hash(x)=18112896)
89
+ 710 train 7.025198 (lr=3.9776e-05) (hash(x)=30311772)
90
+ 720 train 7.081313 (lr=3.9999e-05) (hash(x)=21182016)
91
+ 730 train 6.996511 (lr=3.9994e-05) (hash(x)=20250282)
92
+ 740 train 6.857033 (lr=3.9983e-05) (hash(x)=18458726)
93
+ 750 train 6.941055 (lr=3.9966e-05) (hash(x)=21484915)
94
+ 760 train 6.778360 (lr=3.9944e-05) (hash(x)=19340760)
95
+ 770 train 6.940674 (lr=3.9916e-05) (hash(x)=19567825)
96
+ 780 train 6.882836 (lr=3.9882e-05) (hash(x)=18523049)
97
+ 790 train 6.828885 (lr=3.9843e-05) (hash(x)=18616721)
98
+ 800 val loss 6.8576
99
+ 800 val perplexity 951.0393
100
+ 800 train 6.943720 (lr=3.9799e-05) (hash(x)=18489619)
101
+ 810 train 6.824470 (lr=3.9749e-05) (hash(x)=24417196)
102
+ 820 train 6.836069 (lr=3.9694e-05) (hash(x)=19079680)
103
+ 830 train 6.697251 (lr=3.9633e-05) (hash(x)=20163900)
104
+ 840 train 6.713286 (lr=3.9566e-05) (hash(x)=23229895)
105
+ 850 train 6.703739 (lr=3.9494e-05) (hash(x)=21951781)
106
+ 860 train 6.873831 (lr=3.9417e-05) (hash(x)=21381778)
107
+ 870 train 6.772340 (lr=3.9334e-05) (hash(x)=22595145)
108
+ 880 train 6.727268 (lr=3.9246e-05) (hash(x)=19115149)
109
+ 890 train 6.777977 (lr=3.9153e-05) (hash(x)=25135395)
110
+ 900 val loss 6.7608
111
+ 900 val perplexity 863.3621
112
+ 900 train 6.765095 (lr=3.9054e-05) (hash(x)=19138143)
113
+ 910 train 6.695135 (lr=3.8950e-05) (hash(x)=22351027)
114
+ 920 train 6.616197 (lr=3.8841e-05) (hash(x)=21193325)
115
+ 930 train 6.552218 (lr=3.8727e-05) (hash(x)=17472842)
116
+ 940 train 6.550955 (lr=3.8607e-05) (hash(x)=19985222)
117
+ 950 train 6.767941 (lr=3.8482e-05) (hash(x)=19887117)
118
+ 960 train 6.763002 (lr=3.8352e-05) (hash(x)=22783633)
119
+ 970 train 6.610527 (lr=3.8217e-05) (hash(x)=18654267)
120
+ 980 train 6.667663 (lr=3.8077e-05) (hash(x)=19489001)
121
+ 990 train 6.698734 (lr=3.7933e-05) (hash(x)=19289232)
122
+ 1000 val loss 6.6755
123
+ 1000 val perplexity 792.7429
124
+ 1000 train 6.601619 (lr=3.7783e-05) (hash(x)=18147207)
125
+ 1010 train 6.534192 (lr=3.7628e-05) (hash(x)=21130846)
126
+ 1020 train 6.543934 (lr=3.7468e-05) (hash(x)=20931367)
127
+ 1030 train 6.502700 (lr=3.7304e-05) (hash(x)=21853038)
128
+ 1040 train 6.465909 (lr=3.7135e-05) (hash(x)=22488988)
129
+ 1050 train 6.679924 (lr=3.6961e-05) (hash(x)=10846124)
130
+ 1060 train 6.754278 (lr=3.6782e-05) (hash(x)=21103681)
131
+ 1070 train 6.568912 (lr=3.6599e-05) (hash(x)=20748793)
132
+ 1080 train 6.615229 (lr=3.6412e-05) (hash(x)=24405845)
133
+ 1090 train 6.766366 (lr=3.6220e-05) (hash(x)=22840904)
134
+ 1100 val loss 6.6007
135
+ 1100 val perplexity 735.5790
136
+ 1100 train 6.498883 (lr=3.6023e-05) (hash(x)=21209470)
137
+ 1110 train 6.855779 (lr=3.5823e-05) (hash(x)=23760214)
138
+ 1120 train 6.481510 (lr=3.5618e-05) (hash(x)=21572797)
139
+ 1130 train 6.348232 (lr=3.5408e-05) (hash(x)=23050434)
140
+ 1140 train 6.705042 (lr=3.5195e-05) (hash(x)=26147459)
141
+ 1150 train 6.415875 (lr=3.4977e-05) (hash(x)=17502168)
142
+ 1160 train 6.676663 (lr=3.4756e-05) (hash(x)=19433723)
143
+ 1170 train 6.607038 (lr=3.4530e-05) (hash(x)=20604003)
144
+ 1180 train 6.468834 (lr=3.4301e-05) (hash(x)=22095370)
145
+ 1190 train 6.467143 (lr=3.4068e-05) (hash(x)=20112111)
146
+ 1200 val loss 6.5294
147
+ 1200 val perplexity 684.9926
148
+ 1200 train 6.523627 (lr=3.3831e-05) (hash(x)=23773208)
149
+ 1210 train 6.469047 (lr=3.3590e-05) (hash(x)=19000106)
150
+ 1220 train 6.433692 (lr=3.3346e-05) (hash(x)=22564305)
151
+ 1230 train 6.392467 (lr=3.3099e-05) (hash(x)=18484115)
152
+ 1240 train 6.436927 (lr=3.2847e-05) (hash(x)=18482057)
153
+ 1250 train 6.589318 (lr=3.2593e-05) (hash(x)=19655261)
154
+ 1260 train 6.451422 (lr=3.2335e-05) (hash(x)=18742891)
155
+ 1270 train 6.492024 (lr=3.2074e-05) (hash(x)=22674964)
156
+ 1280 train 6.495492 (lr=3.1810e-05) (hash(x)=19356586)
157
+ 1290 train 6.421260 (lr=3.1543e-05) (hash(x)=23330715)
158
+ 1300 val loss 6.4681
159
+ 1300 val perplexity 644.2678
160
+ 1300 train 6.401929 (lr=3.1273e-05) (hash(x)=20011153)
161
+ 1310 train 6.296085 (lr=3.1000e-05) (hash(x)=20013937)
162
+ 1320 train 6.334275 (lr=3.0724e-05) (hash(x)=21245751)
163
+ 1330 train 6.286648 (lr=3.0446e-05) (hash(x)=18218831)
164
+ 1340 train 6.456255 (lr=3.0165e-05) (hash(x)=19393527)
165
+ 1350 train 6.435429 (lr=2.9881e-05) (hash(x)=20965851)
166
+ 1360 train 6.547733 (lr=2.9595e-05) (hash(x)=22247436)
167
+ 1370 train 6.440526 (lr=2.9307e-05) (hash(x)=22757203)
168
+ 1380 train 6.450113 (lr=2.9016e-05) (hash(x)=18779213)
169
+ 1390 train 6.351770 (lr=2.8723e-05) (hash(x)=19426568)
170
+ 1400 val loss 6.4178
171
+ 1400 val perplexity 612.6356
172
+ 1400 train 6.401601 (lr=2.8428e-05) (hash(x)=19566329)
173
+ 1410 train 6.252581 (lr=2.8132e-05) (hash(x)=17636349)
174
+ 1420 train 6.271146 (lr=2.7833e-05) (hash(x)=23450878)
175
+ 1430 train 6.365708 (lr=2.7532e-05) (hash(x)=23379565)
176
+ 1440 train 6.475684 (lr=2.7230e-05) (hash(x)=16707949)
177
+ 1450 train 6.491540 (lr=2.6926e-05) (hash(x)=19564066)
178
+ 1460 train 6.357025 (lr=2.6620e-05) (hash(x)=17509928)
179
+ 1470 train 6.295128 (lr=2.6314e-05) (hash(x)=19145424)
180
+ 1480 train 6.260694 (lr=2.6005e-05) (hash(x)=20136952)
181
+ 1490 train 6.271736 (lr=2.5696e-05) (hash(x)=19182341)
182
+ 1500 val loss 6.3765
183
+ 1500 val perplexity 587.8658
184
+ 1500 train 6.235629 (lr=2.5385e-05) (hash(x)=16455771)
185
+ 1510 train 6.317551 (lr=2.5074e-05) (hash(x)=19817914)
186
+ 1520 train 6.215010 (lr=2.4761e-05) (hash(x)=20202182)
187
+ 1530 train 6.174621 (lr=2.4448e-05) (hash(x)=19052022)
188
+ 1540 train 6.231863 (lr=2.4133e-05) (hash(x)=15006357)
189
+ 1550 train 6.194119 (lr=2.3818e-05) (hash(x)=19767351)
190
+ 1560 train 6.265318 (lr=2.3503e-05) (hash(x)=20849000)
191
+ 1570 train 6.275762 (lr=2.3187e-05) (hash(x)=16243823)
192
+ 1580 train 6.420673 (lr=2.2871e-05) (hash(x)=19401948)
193
+ 1590 train 6.273223 (lr=2.2554e-05) (hash(x)=18825747)
194
+ 1600 val loss 6.3368
195
+ 1600 val perplexity 564.9612
196
+ 1600 train 6.265001 (lr=2.2238e-05) (hash(x)=20685248)
197
+ 1610 train 6.201800 (lr=2.1921e-05) (hash(x)=15316525)
198
+ 1620 train 6.143041 (lr=2.1604e-05) (hash(x)=18598785)
199
+ 1630 train 6.244697 (lr=2.1287e-05) (hash(x)=17902896)
200
+ 1640 train 6.342398 (lr=2.0971e-05) (hash(x)=22964168)
201
+ 1650 train 6.294670 (lr=2.0655e-05) (hash(x)=22144561)
202
+ 1660 train 6.248938 (lr=2.0339e-05) (hash(x)=22338904)
203
+ 1670 train 6.260442 (lr=2.0024e-05) (hash(x)=19682248)
204
+ 1680 train 6.164854 (lr=1.9709e-05) (hash(x)=17634615)
205
+ 1690 train 6.113889 (lr=1.9396e-05) (hash(x)=21122086)
206
+ 1700 val loss 6.3067
207
+ 1700 val perplexity 548.2374
208
+ 1700 train 6.182898 (lr=1.9083e-05) (hash(x)=18774327)
209
+ 1710 train 6.288434 (lr=1.8770e-05) (hash(x)=18681638)
210
+ 1720 train 6.395947 (lr=1.8459e-05) (hash(x)=12734360)
211
+ 1730 train 6.304706 (lr=1.8149e-05) (hash(x)=16881623)
212
+ 1740 train 6.292227 (lr=1.7840e-05) (hash(x)=18682791)
213
+ 1750 train 6.160336 (lr=1.7533e-05) (hash(x)=20382530)
214
+ 1760 train 6.127007 (lr=1.7227e-05) (hash(x)=21838844)
215
+ 1770 train 6.210433 (lr=1.6922e-05) (hash(x)=24508512)
216
+ 1780 train 6.336304 (lr=1.6619e-05) (hash(x)=20538110)
217
+ 1790 train 6.336246 (lr=1.6317e-05) (hash(x)=22444676)
218
+ 1800 val loss 6.2749
219
+ 1800 val perplexity 531.0672
220
+ 1800 train 6.165369 (lr=1.6018e-05) (hash(x)=19604219)
221
+ 1810 train 6.201979 (lr=1.5720e-05) (hash(x)=18899323)
222
+ 1820 train 6.157725 (lr=1.5424e-05) (hash(x)=17080605)
223
+ 1830 train 6.210185 (lr=1.5130e-05) (hash(x)=23581365)
224
+ 1840 train 6.113577 (lr=1.4838e-05) (hash(x)=21100558)
225
+ 1850 train 6.305647 (lr=1.4549e-05) (hash(x)=21163025)
226
+ 1860 train 6.175817 (lr=1.4262e-05) (hash(x)=19370953)
227
+ 1870 train 6.193404 (lr=1.3977e-05) (hash(x)=18968563)
228
+ 1880 train 6.264138 (lr=1.3694e-05) (hash(x)=19052948)
229
+ 1890 train 6.129352 (lr=1.3415e-05) (hash(x)=17522561)
230
+ 1900 val loss 6.2527
231
+ 1900 val perplexity 519.4227
232
+ 1900 train 6.191400 (lr=1.3138e-05) (hash(x)=17771350)
233
+ 1910 train 6.256058 (lr=1.2863e-05) (hash(x)=20297161)
234
+ 1920 train 6.216085 (lr=1.2592e-05) (hash(x)=18138349)
235
+ 1930 train 6.125598 (lr=1.2323e-05) (hash(x)=20032776)
236
+ 1940 train 6.237648 (lr=1.2057e-05) (hash(x)=19811309)
237
+ 1950 train 6.274024 (lr=1.1795e-05) (hash(x)=20604200)
238
+ 1960 train 6.306325 (lr=1.1536e-05) (hash(x)=19535927)
239
+ 1970 train 6.230391 (lr=1.1279e-05) (hash(x)=21285554)
240
+ 1980 train 6.204495 (lr=1.1027e-05) (hash(x)=19395085)
241
+ 1990 train 6.167343 (lr=1.0777e-05) (hash(x)=18641559)
242
+ 2000 val loss 6.2338
243
+ 2000 val perplexity 509.6657
244
+ 2000 train 6.120337 (lr=1.0531e-05) (hash(x)=19061766)
245
+ 2010 train 6.141171 (lr=1.0289e-05) (hash(x)=19920514)
246
+ 2020 train 6.300436 (lr=1.0050e-05) (hash(x)=16377887)
247
+ 2030 train 6.265210 (lr=9.8151e-06) (hash(x)=20982613)
248
+ 2040 train 6.193947 (lr=9.5838e-06) (hash(x)=18382837)
249
+ 2050 train 6.209549 (lr=9.3564e-06) (hash(x)=20629556)
250
+ 2060 train 6.118829 (lr=9.1328e-06) (hash(x)=21419862)
251
+ 2070 train 6.151879 (lr=8.9133e-06) (hash(x)=19255727)
252
+ 2080 train 6.115231 (lr=8.6978e-06) (hash(x)=22371977)
253
+ 2090 train 6.228383 (lr=8.4865e-06) (hash(x)=19801756)
254
+ 2100 val loss 6.2179
255
+ 2100 val perplexity 501.6340
256
+ 2100 train 6.242752 (lr=8.2793e-06) (hash(x)=19775253)
257
+ 2110 train 6.178238 (lr=8.0764e-06) (hash(x)=19908328)
258
+ 2120 train 6.212610 (lr=7.8778e-06) (hash(x)=16866379)
259
+ 2130 train 6.120737 (lr=7.6836e-06) (hash(x)=18603879)
260
+ 2140 train 6.153936 (lr=7.4938e-06) (hash(x)=17410022)
261
+ 2150 train 6.117592 (lr=7.3085e-06) (hash(x)=20670670)
262
+ 2160 train 6.240406 (lr=7.1277e-06) (hash(x)=18937483)
263
+ 2170 train 6.288017 (lr=6.9516e-06) (hash(x)=17747478)
264
+ 2180 train 6.246072 (lr=6.7801e-06) (hash(x)=22007580)
265
+ 2190 train 6.121523 (lr=6.6133e-06) (hash(x)=30629796)
266
+ 2200 val loss 6.2060
267
+ 2200 val perplexity 495.6978
268
+ 2200 train 6.136976 (lr=6.4513e-06) (hash(x)=18376034)
269
+ 2210 train 6.118820 (lr=6.2941e-06) (hash(x)=21805545)
270
+ 2220 train 6.141832 (lr=6.1418e-06) (hash(x)=21333227)
271
+ 2230 train 6.273334 (lr=5.9944e-06) (hash(x)=22196262)
272
+ 2240 train 6.295862 (lr=5.8519e-06) (hash(x)=19313786)
273
+ 2250 train 6.202902 (lr=5.7145e-06) (hash(x)=20647579)
274
+ 2260 train 6.134450 (lr=5.5821e-06) (hash(x)=17364516)
275
+ 2270 train 6.018724 (lr=5.4547e-06) (hash(x)=15770875)
276
+ 2280 train 6.145643 (lr=5.3325e-06) (hash(x)=21775829)
277
+ 2290 train 6.097759 (lr=5.2155e-06) (hash(x)=16146754)
278
+ 2300 val loss 6.1989
279
+ 2300 val perplexity 492.2006
280
+ 2300 train 6.115635 (lr=5.1037e-06) (hash(x)=19619330)
281
+ 2310 train 6.031117 (lr=4.9971e-06) (hash(x)=20440748)
282
+ 2320 train 6.219659 (lr=4.8957e-06) (hash(x)=18953334)
283
+ 2330 train 6.249330 (lr=4.7997e-06) (hash(x)=16187606)
284
+ 2340 train 6.242335 (lr=4.7090e-06) (hash(x)=20090275)
285
+ 2350 train 6.105347 (lr=4.6236e-06) (hash(x)=20652023)
286
+ 2360 train 6.152916 (lr=4.5437e-06) (hash(x)=22572496)
287
+ 2370 train 6.171911 (lr=4.4691e-06) (hash(x)=17579410)
288
+ 2380 train 6.125903 (lr=4.4000e-06) (hash(x)=20725075)
289
+ 2390 train 6.098174 (lr=4.3363e-06) (hash(x)=30725634)
290
+ 2400 val loss 6.1907
291
+ 2400 val perplexity 488.1908
292
+ 2400 train 6.180841 (lr=4.2781e-06) (hash(x)=21726726)
293
+ 2410 train 6.272202 (lr=4.2253e-06) (hash(x)=22029774)
294
+ 2420 train 6.233039 (lr=4.1781e-06) (hash(x)=20055172)
295
+ 2430 train 6.133156 (lr=4.1364e-06) (hash(x)=20121441)
296
+ 2440 train 6.214977 (lr=4.1003e-06) (hash(x)=22067748)
297
+ 2450 train 6.164004 (lr=4.0697e-06) (hash(x)=18254046)
298
+ 2460 train 6.142705 (lr=4.0446e-06) (hash(x)=18234469)
299
+ 2470 train 6.125352 (lr=4.0251e-06) (hash(x)=20676963)
300
+ 2480 train 6.086622 (lr=4.0112e-06) (hash(x)=21024535)
301
+ 2490 train 6.098122 (lr=4.0028e-06) (hash(x)=20148844)
302
+ 2499 val loss 6.1838
303
+ 2499 val perplexity 484.8386
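The log above ends at step 2499 with a validation loss of 6.1838 and perplexity 484.8386; the logged perplexity is just exp(loss). The learning-rate column is consistent with a linear warmup over roughly 715 steps to max_lr 4e-05 (reached around step 720), followed by a decay to about a tenth of max_lr by the final step; the 715 is inferred from the logged values, not stated in args.json. A small sketch checking both against the log:

import math

# Perplexity should be exp(loss), up to the 4-decimal rounding of the printed loss.
for step, val_loss in [(0, 10.9982), (1000, 6.6755), (2499, 6.1838)]:
    print(step, round(math.exp(val_loss), 2))
# 0 -> ~59766, 1000 -> ~792.7, 2499 -> ~484.8, matching the logged perplexities

# Early lr values match a linear warmup of max_lr * (step + 1) / 715 (715 inferred).
max_lr, warmup_steps = 4e-05, 715
for step in (0, 10, 100):
    print(step, f"{max_lr * (step + 1) / warmup_steps:.4e}")
# 0 -> 5.5944e-08, 10 -> 6.1538e-07, 100 -> 5.6503e-06, as logged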
half_total_bs_sqrt_lr/model_02499.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13a6d7fde9e22a3131910003859302b0e281c3b358900e6b86dff348ffbe86b4
3
+ size 548152706
half_total_bs_sqrt_lr/optimizer_02499.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7fd9f58186a971a7b23c5bcf8c8794618d840697c11bd6e539028514567804f
3
+ size 995652870
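model_02499.pt (~548 MB) and optimizer_02499.pt (~996 MB) are the final step-2499 checkpoint of the run. A minimal loading sketch, assuming they are plain torch.save outputs; the actual model class and optimizer choice live in the training code, which is not part of this commit, so those lines are placeholders:

import torch

# Load onto CPU first; map_location avoids needing the original GPU device.
model_state = torch.load("half_total_bs_sqrt_lr/model_02499.pt", map_location="cpu")
optim_state = torch.load("half_total_bs_sqrt_lr/optimizer_02499.pt", map_location="cpu")

# Placeholder wiring -- the real model class, its constructor arguments, and the
# optimizer come from the training code, which this commit does not include.
# model = SelectiveAttentionGPT(n_embd=768, n_heads=12)   # hypothetical class name
# model.load_state_dict(model_state)
# optimizer = torch.optim.AdamW(model.parameters(), lr=4e-05)
# optimizer.load_state_dict(optim_state)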