Upload folder using huggingface_hub
Browse files
half_total_bs_sqrt_lr/args.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hellaswag": true, "attention_kind": "selective", "log_dir": "half_total_bs_sqrt_lr", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 12, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": null, "warmup_steps": null, "group": "shrinking_big_runs_2", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1337, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": null, "batch_size": 4, "total_batch_size": 262144, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": false, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": null, "n_embd": 768}
|
half_total_bs_sqrt_lr/dataloader_02499.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5fcd93c5e67d67c2b8db7c6b19f314688fcb1792fa1900f091ff53035c716d9
|
| 3 |
+
size 964
|
half_total_bs_sqrt_lr/log2.txt
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
max_steps: 2500
|
| 2 |
+
0 val loss 10.9982
|
| 3 |
+
0 val perplexity 59766.1484
|
| 4 |
+
0 train 10.999202 (lr=5.5944e-08) (hash(x)=24614019)
|
| 5 |
+
10 train 10.982239 (lr=6.1538e-07) (hash(x)=21487749)
|
| 6 |
+
20 train 10.934907 (lr=1.1748e-06) (hash(x)=20797237)
|
| 7 |
+
30 train 10.861868 (lr=1.7343e-06) (hash(x)=23863724)
|
| 8 |
+
40 train 10.760084 (lr=2.2937e-06) (hash(x)=20255871)
|
| 9 |
+
50 train 10.623187 (lr=2.8531e-06) (hash(x)=19287357)
|
| 10 |
+
60 train 10.467155 (lr=3.4126e-06) (hash(x)=19757121)
|
| 11 |
+
70 train 10.267252 (lr=3.9720e-06) (hash(x)=22844276)
|
| 12 |
+
80 train 10.078154 (lr=4.5315e-06) (hash(x)=19450560)
|
| 13 |
+
90 train 9.912544 (lr=5.0909e-06) (hash(x)=21604092)
|
| 14 |
+
100 val loss 9.7852
|
| 15 |
+
100 val perplexity 17768.8633
|
| 16 |
+
100 train 9.804387 (lr=5.6503e-06) (hash(x)=20874784)
|
| 17 |
+
110 train 9.746110 (lr=6.2098e-06) (hash(x)=17548923)
|
| 18 |
+
120 train 9.669116 (lr=6.7692e-06) (hash(x)=24955796)
|
| 19 |
+
130 train 9.607136 (lr=7.3287e-06) (hash(x)=17608570)
|
| 20 |
+
140 train 9.592934 (lr=7.8881e-06) (hash(x)=17029140)
|
| 21 |
+
150 train 9.558821 (lr=8.4476e-06) (hash(x)=18428573)
|
| 22 |
+
160 train 9.520965 (lr=9.0070e-06) (hash(x)=16410069)
|
| 23 |
+
170 train 9.475086 (lr=9.5664e-06) (hash(x)=20030582)
|
| 24 |
+
180 train 9.372977 (lr=1.0126e-05) (hash(x)=18857445)
|
| 25 |
+
190 train 9.345454 (lr=1.0685e-05) (hash(x)=19796984)
|
| 26 |
+
200 val loss 9.2528
|
| 27 |
+
200 val perplexity 10433.2715
|
| 28 |
+
200 train 9.247358 (lr=1.1245e-05) (hash(x)=20858271)
|
| 29 |
+
210 train 9.156383 (lr=1.1804e-05) (hash(x)=18035683)
|
| 30 |
+
220 train 9.050435 (lr=1.2364e-05) (hash(x)=17370295)
|
| 31 |
+
230 train 9.004829 (lr=1.2923e-05) (hash(x)=23199515)
|
| 32 |
+
240 train 8.921954 (lr=1.3483e-05) (hash(x)=20512454)
|
| 33 |
+
250 train 8.795404 (lr=1.4042e-05) (hash(x)=25534459)
|
| 34 |
+
260 train 8.736985 (lr=1.4601e-05) (hash(x)=22751192)
|
| 35 |
+
270 train 8.569971 (lr=1.5161e-05) (hash(x)=19043782)
|
| 36 |
+
280 train 8.449539 (lr=1.5720e-05) (hash(x)=19850067)
|
| 37 |
+
290 train 8.354852 (lr=1.6280e-05) (hash(x)=21416956)
|
| 38 |
+
300 val loss 8.2570
|
| 39 |
+
300 val perplexity 3854.6638
|
| 40 |
+
300 train 8.208916 (lr=1.6839e-05) (hash(x)=19608904)
|
| 41 |
+
310 train 8.230871 (lr=1.7399e-05) (hash(x)=20982652)
|
| 42 |
+
320 train 8.080886 (lr=1.7958e-05) (hash(x)=22691225)
|
| 43 |
+
330 train 7.918680 (lr=1.8517e-05) (hash(x)=18378460)
|
| 44 |
+
340 train 7.845694 (lr=1.9077e-05) (hash(x)=21223131)
|
| 45 |
+
350 train 7.787889 (lr=1.9636e-05) (hash(x)=19308251)
|
| 46 |
+
360 train 7.674394 (lr=2.0196e-05) (hash(x)=22490407)
|
| 47 |
+
370 train 7.607284 (lr=2.0755e-05) (hash(x)=21833349)
|
| 48 |
+
380 train 7.664894 (lr=2.1315e-05) (hash(x)=18713401)
|
| 49 |
+
390 train 7.503607 (lr=2.1874e-05) (hash(x)=20490463)
|
| 50 |
+
400 val loss 7.5439
|
| 51 |
+
400 val perplexity 1889.1376
|
| 52 |
+
400 train 7.461785 (lr=2.2434e-05) (hash(x)=20073576)
|
| 53 |
+
410 train 7.383060 (lr=2.2993e-05) (hash(x)=19019422)
|
| 54 |
+
420 train 7.392637 (lr=2.3552e-05) (hash(x)=22822152)
|
| 55 |
+
430 train 7.373105 (lr=2.4112e-05) (hash(x)=21350620)
|
| 56 |
+
440 train 7.428552 (lr=2.4671e-05) (hash(x)=15218868)
|
| 57 |
+
450 train 7.394410 (lr=2.5231e-05) (hash(x)=17018676)
|
| 58 |
+
460 train 7.293060 (lr=2.5790e-05) (hash(x)=24111981)
|
| 59 |
+
470 train 7.267486 (lr=2.6350e-05) (hash(x)=18594586)
|
| 60 |
+
480 train 7.226883 (lr=2.6909e-05) (hash(x)=16403425)
|
| 61 |
+
490 train 7.170289 (lr=2.7469e-05) (hash(x)=19880070)
|
| 62 |
+
500 val loss 7.2658
|
| 63 |
+
500 val perplexity 1430.4702
|
| 64 |
+
500 train 7.122260 (lr=2.8028e-05) (hash(x)=18499475)
|
| 65 |
+
510 train 7.177752 (lr=2.8587e-05) (hash(x)=19645366)
|
| 66 |
+
520 train 7.081666 (lr=2.9147e-05) (hash(x)=19054316)
|
| 67 |
+
530 train 7.239734 (lr=2.9706e-05) (hash(x)=18524225)
|
| 68 |
+
540 train 7.184512 (lr=3.0266e-05) (hash(x)=19437682)
|
| 69 |
+
550 train 7.193623 (lr=3.0825e-05) (hash(x)=21418385)
|
| 70 |
+
560 train 7.125282 (lr=3.1385e-05) (hash(x)=19530717)
|
| 71 |
+
570 train 7.236847 (lr=3.1944e-05) (hash(x)=21931309)
|
| 72 |
+
580 train 7.072907 (lr=3.2503e-05) (hash(x)=21228512)
|
| 73 |
+
590 train 7.073298 (lr=3.3063e-05) (hash(x)=19376632)
|
| 74 |
+
600 val loss 7.1176
|
| 75 |
+
600 val perplexity 1233.4994
|
| 76 |
+
600 train 7.014085 (lr=3.3622e-05) (hash(x)=20259915)
|
| 77 |
+
610 train 7.024707 (lr=3.4182e-05) (hash(x)=19233893)
|
| 78 |
+
620 train 7.160138 (lr=3.4741e-05) (hash(x)=18605673)
|
| 79 |
+
630 train 7.087669 (lr=3.5301e-05) (hash(x)=20980149)
|
| 80 |
+
640 train 7.051309 (lr=3.5860e-05) (hash(x)=19953548)
|
| 81 |
+
650 train 6.946896 (lr=3.6420e-05) (hash(x)=22383615)
|
| 82 |
+
660 train 7.027440 (lr=3.6979e-05) (hash(x)=22297279)
|
| 83 |
+
670 train 6.953912 (lr=3.7538e-05) (hash(x)=20110140)
|
| 84 |
+
680 train 6.937315 (lr=3.8098e-05) (hash(x)=20797751)
|
| 85 |
+
690 train 6.860869 (lr=3.8657e-05) (hash(x)=20377877)
|
| 86 |
+
700 val loss 6.9829
|
| 87 |
+
700 val perplexity 1078.0793
|
| 88 |
+
700 train 6.861719 (lr=3.9217e-05) (hash(x)=18112896)
|
| 89 |
+
710 train 7.025198 (lr=3.9776e-05) (hash(x)=30311772)
|
| 90 |
+
720 train 7.081313 (lr=3.9999e-05) (hash(x)=21182016)
|
| 91 |
+
730 train 6.996511 (lr=3.9994e-05) (hash(x)=20250282)
|
| 92 |
+
740 train 6.857033 (lr=3.9983e-05) (hash(x)=18458726)
|
| 93 |
+
750 train 6.941055 (lr=3.9966e-05) (hash(x)=21484915)
|
| 94 |
+
760 train 6.778360 (lr=3.9944e-05) (hash(x)=19340760)
|
| 95 |
+
770 train 6.940674 (lr=3.9916e-05) (hash(x)=19567825)
|
| 96 |
+
780 train 6.882836 (lr=3.9882e-05) (hash(x)=18523049)
|
| 97 |
+
790 train 6.828885 (lr=3.9843e-05) (hash(x)=18616721)
|
| 98 |
+
800 val loss 6.8576
|
| 99 |
+
800 val perplexity 951.0393
|
| 100 |
+
800 train 6.943720 (lr=3.9799e-05) (hash(x)=18489619)
|
| 101 |
+
810 train 6.824470 (lr=3.9749e-05) (hash(x)=24417196)
|
| 102 |
+
820 train 6.836069 (lr=3.9694e-05) (hash(x)=19079680)
|
| 103 |
+
830 train 6.697251 (lr=3.9633e-05) (hash(x)=20163900)
|
| 104 |
+
840 train 6.713286 (lr=3.9566e-05) (hash(x)=23229895)
|
| 105 |
+
850 train 6.703739 (lr=3.9494e-05) (hash(x)=21951781)
|
| 106 |
+
860 train 6.873831 (lr=3.9417e-05) (hash(x)=21381778)
|
| 107 |
+
870 train 6.772340 (lr=3.9334e-05) (hash(x)=22595145)
|
| 108 |
+
880 train 6.727268 (lr=3.9246e-05) (hash(x)=19115149)
|
| 109 |
+
890 train 6.777977 (lr=3.9153e-05) (hash(x)=25135395)
|
| 110 |
+
900 val loss 6.7608
|
| 111 |
+
900 val perplexity 863.3621
|
| 112 |
+
900 train 6.765095 (lr=3.9054e-05) (hash(x)=19138143)
|
| 113 |
+
910 train 6.695135 (lr=3.8950e-05) (hash(x)=22351027)
|
| 114 |
+
920 train 6.616197 (lr=3.8841e-05) (hash(x)=21193325)
|
| 115 |
+
930 train 6.552218 (lr=3.8727e-05) (hash(x)=17472842)
|
| 116 |
+
940 train 6.550955 (lr=3.8607e-05) (hash(x)=19985222)
|
| 117 |
+
950 train 6.767941 (lr=3.8482e-05) (hash(x)=19887117)
|
| 118 |
+
960 train 6.763002 (lr=3.8352e-05) (hash(x)=22783633)
|
| 119 |
+
970 train 6.610527 (lr=3.8217e-05) (hash(x)=18654267)
|
| 120 |
+
980 train 6.667663 (lr=3.8077e-05) (hash(x)=19489001)
|
| 121 |
+
990 train 6.698734 (lr=3.7933e-05) (hash(x)=19289232)
|
| 122 |
+
1000 val loss 6.6755
|
| 123 |
+
1000 val perplexity 792.7429
|
| 124 |
+
1000 train 6.601619 (lr=3.7783e-05) (hash(x)=18147207)
|
| 125 |
+
1010 train 6.534192 (lr=3.7628e-05) (hash(x)=21130846)
|
| 126 |
+
1020 train 6.543934 (lr=3.7468e-05) (hash(x)=20931367)
|
| 127 |
+
1030 train 6.502700 (lr=3.7304e-05) (hash(x)=21853038)
|
| 128 |
+
1040 train 6.465909 (lr=3.7135e-05) (hash(x)=22488988)
|
| 129 |
+
1050 train 6.679924 (lr=3.6961e-05) (hash(x)=10846124)
|
| 130 |
+
1060 train 6.754278 (lr=3.6782e-05) (hash(x)=21103681)
|
| 131 |
+
1070 train 6.568912 (lr=3.6599e-05) (hash(x)=20748793)
|
| 132 |
+
1080 train 6.615229 (lr=3.6412e-05) (hash(x)=24405845)
|
| 133 |
+
1090 train 6.766366 (lr=3.6220e-05) (hash(x)=22840904)
|
| 134 |
+
1100 val loss 6.6007
|
| 135 |
+
1100 val perplexity 735.5790
|
| 136 |
+
1100 train 6.498883 (lr=3.6023e-05) (hash(x)=21209470)
|
| 137 |
+
1110 train 6.855779 (lr=3.5823e-05) (hash(x)=23760214)
|
| 138 |
+
1120 train 6.481510 (lr=3.5618e-05) (hash(x)=21572797)
|
| 139 |
+
1130 train 6.348232 (lr=3.5408e-05) (hash(x)=23050434)
|
| 140 |
+
1140 train 6.705042 (lr=3.5195e-05) (hash(x)=26147459)
|
| 141 |
+
1150 train 6.415875 (lr=3.4977e-05) (hash(x)=17502168)
|
| 142 |
+
1160 train 6.676663 (lr=3.4756e-05) (hash(x)=19433723)
|
| 143 |
+
1170 train 6.607038 (lr=3.4530e-05) (hash(x)=20604003)
|
| 144 |
+
1180 train 6.468834 (lr=3.4301e-05) (hash(x)=22095370)
|
| 145 |
+
1190 train 6.467143 (lr=3.4068e-05) (hash(x)=20112111)
|
| 146 |
+
1200 val loss 6.5294
|
| 147 |
+
1200 val perplexity 684.9926
|
| 148 |
+
1200 train 6.523627 (lr=3.3831e-05) (hash(x)=23773208)
|
| 149 |
+
1210 train 6.469047 (lr=3.3590e-05) (hash(x)=19000106)
|
| 150 |
+
1220 train 6.433692 (lr=3.3346e-05) (hash(x)=22564305)
|
| 151 |
+
1230 train 6.392467 (lr=3.3099e-05) (hash(x)=18484115)
|
| 152 |
+
1240 train 6.436927 (lr=3.2847e-05) (hash(x)=18482057)
|
| 153 |
+
1250 train 6.589318 (lr=3.2593e-05) (hash(x)=19655261)
|
| 154 |
+
1260 train 6.451422 (lr=3.2335e-05) (hash(x)=18742891)
|
| 155 |
+
1270 train 6.492024 (lr=3.2074e-05) (hash(x)=22674964)
|
| 156 |
+
1280 train 6.495492 (lr=3.1810e-05) (hash(x)=19356586)
|
| 157 |
+
1290 train 6.421260 (lr=3.1543e-05) (hash(x)=23330715)
|
| 158 |
+
1300 val loss 6.4681
|
| 159 |
+
1300 val perplexity 644.2678
|
| 160 |
+
1300 train 6.401929 (lr=3.1273e-05) (hash(x)=20011153)
|
| 161 |
+
1310 train 6.296085 (lr=3.1000e-05) (hash(x)=20013937)
|
| 162 |
+
1320 train 6.334275 (lr=3.0724e-05) (hash(x)=21245751)
|
| 163 |
+
1330 train 6.286648 (lr=3.0446e-05) (hash(x)=18218831)
|
| 164 |
+
1340 train 6.456255 (lr=3.0165e-05) (hash(x)=19393527)
|
| 165 |
+
1350 train 6.435429 (lr=2.9881e-05) (hash(x)=20965851)
|
| 166 |
+
1360 train 6.547733 (lr=2.9595e-05) (hash(x)=22247436)
|
| 167 |
+
1370 train 6.440526 (lr=2.9307e-05) (hash(x)=22757203)
|
| 168 |
+
1380 train 6.450113 (lr=2.9016e-05) (hash(x)=18779213)
|
| 169 |
+
1390 train 6.351770 (lr=2.8723e-05) (hash(x)=19426568)
|
| 170 |
+
1400 val loss 6.4178
|
| 171 |
+
1400 val perplexity 612.6356
|
| 172 |
+
1400 train 6.401601 (lr=2.8428e-05) (hash(x)=19566329)
|
| 173 |
+
1410 train 6.252581 (lr=2.8132e-05) (hash(x)=17636349)
|
| 174 |
+
1420 train 6.271146 (lr=2.7833e-05) (hash(x)=23450878)
|
| 175 |
+
1430 train 6.365708 (lr=2.7532e-05) (hash(x)=23379565)
|
| 176 |
+
1440 train 6.475684 (lr=2.7230e-05) (hash(x)=16707949)
|
| 177 |
+
1450 train 6.491540 (lr=2.6926e-05) (hash(x)=19564066)
|
| 178 |
+
1460 train 6.357025 (lr=2.6620e-05) (hash(x)=17509928)
|
| 179 |
+
1470 train 6.295128 (lr=2.6314e-05) (hash(x)=19145424)
|
| 180 |
+
1480 train 6.260694 (lr=2.6005e-05) (hash(x)=20136952)
|
| 181 |
+
1490 train 6.271736 (lr=2.5696e-05) (hash(x)=19182341)
|
| 182 |
+
1500 val loss 6.3765
|
| 183 |
+
1500 val perplexity 587.8658
|
| 184 |
+
1500 train 6.235629 (lr=2.5385e-05) (hash(x)=16455771)
|
| 185 |
+
1510 train 6.317551 (lr=2.5074e-05) (hash(x)=19817914)
|
| 186 |
+
1520 train 6.215010 (lr=2.4761e-05) (hash(x)=20202182)
|
| 187 |
+
1530 train 6.174621 (lr=2.4448e-05) (hash(x)=19052022)
|
| 188 |
+
1540 train 6.231863 (lr=2.4133e-05) (hash(x)=15006357)
|
| 189 |
+
1550 train 6.194119 (lr=2.3818e-05) (hash(x)=19767351)
|
| 190 |
+
1560 train 6.265318 (lr=2.3503e-05) (hash(x)=20849000)
|
| 191 |
+
1570 train 6.275762 (lr=2.3187e-05) (hash(x)=16243823)
|
| 192 |
+
1580 train 6.420673 (lr=2.2871e-05) (hash(x)=19401948)
|
| 193 |
+
1590 train 6.273223 (lr=2.2554e-05) (hash(x)=18825747)
|
| 194 |
+
1600 val loss 6.3368
|
| 195 |
+
1600 val perplexity 564.9612
|
| 196 |
+
1600 train 6.265001 (lr=2.2238e-05) (hash(x)=20685248)
|
| 197 |
+
1610 train 6.201800 (lr=2.1921e-05) (hash(x)=15316525)
|
| 198 |
+
1620 train 6.143041 (lr=2.1604e-05) (hash(x)=18598785)
|
| 199 |
+
1630 train 6.244697 (lr=2.1287e-05) (hash(x)=17902896)
|
| 200 |
+
1640 train 6.342398 (lr=2.0971e-05) (hash(x)=22964168)
|
| 201 |
+
1650 train 6.294670 (lr=2.0655e-05) (hash(x)=22144561)
|
| 202 |
+
1660 train 6.248938 (lr=2.0339e-05) (hash(x)=22338904)
|
| 203 |
+
1670 train 6.260442 (lr=2.0024e-05) (hash(x)=19682248)
|
| 204 |
+
1680 train 6.164854 (lr=1.9709e-05) (hash(x)=17634615)
|
| 205 |
+
1690 train 6.113889 (lr=1.9396e-05) (hash(x)=21122086)
|
| 206 |
+
1700 val loss 6.3067
|
| 207 |
+
1700 val perplexity 548.2374
|
| 208 |
+
1700 train 6.182898 (lr=1.9083e-05) (hash(x)=18774327)
|
| 209 |
+
1710 train 6.288434 (lr=1.8770e-05) (hash(x)=18681638)
|
| 210 |
+
1720 train 6.395947 (lr=1.8459e-05) (hash(x)=12734360)
|
| 211 |
+
1730 train 6.304706 (lr=1.8149e-05) (hash(x)=16881623)
|
| 212 |
+
1740 train 6.292227 (lr=1.7840e-05) (hash(x)=18682791)
|
| 213 |
+
1750 train 6.160336 (lr=1.7533e-05) (hash(x)=20382530)
|
| 214 |
+
1760 train 6.127007 (lr=1.7227e-05) (hash(x)=21838844)
|
| 215 |
+
1770 train 6.210433 (lr=1.6922e-05) (hash(x)=24508512)
|
| 216 |
+
1780 train 6.336304 (lr=1.6619e-05) (hash(x)=20538110)
|
| 217 |
+
1790 train 6.336246 (lr=1.6317e-05) (hash(x)=22444676)
|
| 218 |
+
1800 val loss 6.2749
|
| 219 |
+
1800 val perplexity 531.0672
|
| 220 |
+
1800 train 6.165369 (lr=1.6018e-05) (hash(x)=19604219)
|
| 221 |
+
1810 train 6.201979 (lr=1.5720e-05) (hash(x)=18899323)
|
| 222 |
+
1820 train 6.157725 (lr=1.5424e-05) (hash(x)=17080605)
|
| 223 |
+
1830 train 6.210185 (lr=1.5130e-05) (hash(x)=23581365)
|
| 224 |
+
1840 train 6.113577 (lr=1.4838e-05) (hash(x)=21100558)
|
| 225 |
+
1850 train 6.305647 (lr=1.4549e-05) (hash(x)=21163025)
|
| 226 |
+
1860 train 6.175817 (lr=1.4262e-05) (hash(x)=19370953)
|
| 227 |
+
1870 train 6.193404 (lr=1.3977e-05) (hash(x)=18968563)
|
| 228 |
+
1880 train 6.264138 (lr=1.3694e-05) (hash(x)=19052948)
|
| 229 |
+
1890 train 6.129352 (lr=1.3415e-05) (hash(x)=17522561)
|
| 230 |
+
1900 val loss 6.2527
|
| 231 |
+
1900 val perplexity 519.4227
|
| 232 |
+
1900 train 6.191400 (lr=1.3138e-05) (hash(x)=17771350)
|
| 233 |
+
1910 train 6.256058 (lr=1.2863e-05) (hash(x)=20297161)
|
| 234 |
+
1920 train 6.216085 (lr=1.2592e-05) (hash(x)=18138349)
|
| 235 |
+
1930 train 6.125598 (lr=1.2323e-05) (hash(x)=20032776)
|
| 236 |
+
1940 train 6.237648 (lr=1.2057e-05) (hash(x)=19811309)
|
| 237 |
+
1950 train 6.274024 (lr=1.1795e-05) (hash(x)=20604200)
|
| 238 |
+
1960 train 6.306325 (lr=1.1536e-05) (hash(x)=19535927)
|
| 239 |
+
1970 train 6.230391 (lr=1.1279e-05) (hash(x)=21285554)
|
| 240 |
+
1980 train 6.204495 (lr=1.1027e-05) (hash(x)=19395085)
|
| 241 |
+
1990 train 6.167343 (lr=1.0777e-05) (hash(x)=18641559)
|
| 242 |
+
2000 val loss 6.2338
|
| 243 |
+
2000 val perplexity 509.6657
|
| 244 |
+
2000 train 6.120337 (lr=1.0531e-05) (hash(x)=19061766)
|
| 245 |
+
2010 train 6.141171 (lr=1.0289e-05) (hash(x)=19920514)
|
| 246 |
+
2020 train 6.300436 (lr=1.0050e-05) (hash(x)=16377887)
|
| 247 |
+
2030 train 6.265210 (lr=9.8151e-06) (hash(x)=20982613)
|
| 248 |
+
2040 train 6.193947 (lr=9.5838e-06) (hash(x)=18382837)
|
| 249 |
+
2050 train 6.209549 (lr=9.3564e-06) (hash(x)=20629556)
|
| 250 |
+
2060 train 6.118829 (lr=9.1328e-06) (hash(x)=21419862)
|
| 251 |
+
2070 train 6.151879 (lr=8.9133e-06) (hash(x)=19255727)
|
| 252 |
+
2080 train 6.115231 (lr=8.6978e-06) (hash(x)=22371977)
|
| 253 |
+
2090 train 6.228383 (lr=8.4865e-06) (hash(x)=19801756)
|
| 254 |
+
2100 val loss 6.2179
|
| 255 |
+
2100 val perplexity 501.6340
|
| 256 |
+
2100 train 6.242752 (lr=8.2793e-06) (hash(x)=19775253)
|
| 257 |
+
2110 train 6.178238 (lr=8.0764e-06) (hash(x)=19908328)
|
| 258 |
+
2120 train 6.212610 (lr=7.8778e-06) (hash(x)=16866379)
|
| 259 |
+
2130 train 6.120737 (lr=7.6836e-06) (hash(x)=18603879)
|
| 260 |
+
2140 train 6.153936 (lr=7.4938e-06) (hash(x)=17410022)
|
| 261 |
+
2150 train 6.117592 (lr=7.3085e-06) (hash(x)=20670670)
|
| 262 |
+
2160 train 6.240406 (lr=7.1277e-06) (hash(x)=18937483)
|
| 263 |
+
2170 train 6.288017 (lr=6.9516e-06) (hash(x)=17747478)
|
| 264 |
+
2180 train 6.246072 (lr=6.7801e-06) (hash(x)=22007580)
|
| 265 |
+
2190 train 6.121523 (lr=6.6133e-06) (hash(x)=30629796)
|
| 266 |
+
2200 val loss 6.2060
|
| 267 |
+
2200 val perplexity 495.6978
|
| 268 |
+
2200 train 6.136976 (lr=6.4513e-06) (hash(x)=18376034)
|
| 269 |
+
2210 train 6.118820 (lr=6.2941e-06) (hash(x)=21805545)
|
| 270 |
+
2220 train 6.141832 (lr=6.1418e-06) (hash(x)=21333227)
|
| 271 |
+
2230 train 6.273334 (lr=5.9944e-06) (hash(x)=22196262)
|
| 272 |
+
2240 train 6.295862 (lr=5.8519e-06) (hash(x)=19313786)
|
| 273 |
+
2250 train 6.202902 (lr=5.7145e-06) (hash(x)=20647579)
|
| 274 |
+
2260 train 6.134450 (lr=5.5821e-06) (hash(x)=17364516)
|
| 275 |
+
2270 train 6.018724 (lr=5.4547e-06) (hash(x)=15770875)
|
| 276 |
+
2280 train 6.145643 (lr=5.3325e-06) (hash(x)=21775829)
|
| 277 |
+
2290 train 6.097759 (lr=5.2155e-06) (hash(x)=16146754)
|
| 278 |
+
2300 val loss 6.1989
|
| 279 |
+
2300 val perplexity 492.2006
|
| 280 |
+
2300 train 6.115635 (lr=5.1037e-06) (hash(x)=19619330)
|
| 281 |
+
2310 train 6.031117 (lr=4.9971e-06) (hash(x)=20440748)
|
| 282 |
+
2320 train 6.219659 (lr=4.8957e-06) (hash(x)=18953334)
|
| 283 |
+
2330 train 6.249330 (lr=4.7997e-06) (hash(x)=16187606)
|
| 284 |
+
2340 train 6.242335 (lr=4.7090e-06) (hash(x)=20090275)
|
| 285 |
+
2350 train 6.105347 (lr=4.6236e-06) (hash(x)=20652023)
|
| 286 |
+
2360 train 6.152916 (lr=4.5437e-06) (hash(x)=22572496)
|
| 287 |
+
2370 train 6.171911 (lr=4.4691e-06) (hash(x)=17579410)
|
| 288 |
+
2380 train 6.125903 (lr=4.4000e-06) (hash(x)=20725075)
|
| 289 |
+
2390 train 6.098174 (lr=4.3363e-06) (hash(x)=30725634)
|
| 290 |
+
2400 val loss 6.1907
|
| 291 |
+
2400 val perplexity 488.1908
|
| 292 |
+
2400 train 6.180841 (lr=4.2781e-06) (hash(x)=21726726)
|
| 293 |
+
2410 train 6.272202 (lr=4.2253e-06) (hash(x)=22029774)
|
| 294 |
+
2420 train 6.233039 (lr=4.1781e-06) (hash(x)=20055172)
|
| 295 |
+
2430 train 6.133156 (lr=4.1364e-06) (hash(x)=20121441)
|
| 296 |
+
2440 train 6.214977 (lr=4.1003e-06) (hash(x)=22067748)
|
| 297 |
+
2450 train 6.164004 (lr=4.0697e-06) (hash(x)=18254046)
|
| 298 |
+
2460 train 6.142705 (lr=4.0446e-06) (hash(x)=18234469)
|
| 299 |
+
2470 train 6.125352 (lr=4.0251e-06) (hash(x)=20676963)
|
| 300 |
+
2480 train 6.086622 (lr=4.0112e-06) (hash(x)=21024535)
|
| 301 |
+
2490 train 6.098122 (lr=4.0028e-06) (hash(x)=20148844)
|
| 302 |
+
2499 val loss 6.1838
|
| 303 |
+
2499 val perplexity 484.8386
|
half_total_bs_sqrt_lr/model_02499.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13a6d7fde9e22a3131910003859302b0e281c3b358900e6b86dff348ffbe86b4
|
| 3 |
+
size 548152706
|
half_total_bs_sqrt_lr/optimizer_02499.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7fd9f58186a971a7b23c5bcf8c8794618d840697c11bd6e539028514567804f
|
| 3 |
+
size 995652870
|