| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9420444444444445, | |
| "eval_steps": 500, | |
| "global_step": 5624, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 9.912928587976149, | |
| "learning_rate": 1.4e-07, | |
| "loss": 1.0098, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 9.065321089366511, | |
| "learning_rate": 2.8e-07, | |
| "loss": 1.0032, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 4.529282680442283, | |
| "learning_rate": 4.2e-07, | |
| "loss": 0.9767, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 4.079773534866118, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.9341, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.7504240808921168, | |
| "learning_rate": 7.000000000000001e-07, | |
| "loss": 0.8727, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.7446189301316752, | |
| "learning_rate": 8.4e-07, | |
| "loss": 0.7997, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.5893489037586176, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 0.7828, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.5798012459841311, | |
| "learning_rate": 1.12e-06, | |
| "loss": 0.7671, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.5143143417454488, | |
| "learning_rate": 1.26e-06, | |
| "loss": 0.777, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.5006881361687121, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "loss": 0.7709, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.5268772561224019, | |
| "learning_rate": 1.54e-06, | |
| "loss": 0.7751, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.49059329535011015, | |
| "learning_rate": 1.68e-06, | |
| "loss": 0.7588, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.548982179156723, | |
| "learning_rate": 1.82e-06, | |
| "loss": 0.758, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.5118740800557817, | |
| "learning_rate": 1.9600000000000003e-06, | |
| "loss": 0.7492, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.47988356348194033, | |
| "learning_rate": 2.1e-06, | |
| "loss": 0.7479, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.5324095582498372, | |
| "learning_rate": 2.24e-06, | |
| "loss": 0.7344, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.49578185528674784, | |
| "learning_rate": 2.38e-06, | |
| "loss": 0.7379, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.4751722809020323, | |
| "learning_rate": 2.52e-06, | |
| "loss": 0.7515, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.4898512842949614, | |
| "learning_rate": 2.66e-06, | |
| "loss": 0.7428, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.4938014103724035, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 0.7356, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.4853179196888149, | |
| "learning_rate": 2.94e-06, | |
| "loss": 0.7338, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.5006261354893382, | |
| "learning_rate": 3.08e-06, | |
| "loss": 0.7228, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.49494536099466524, | |
| "learning_rate": 3.22e-06, | |
| "loss": 0.7371, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.4745560090617258, | |
| "learning_rate": 3.36e-06, | |
| "loss": 0.7374, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.458424659300056, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.7284, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.4918105642778609, | |
| "learning_rate": 3.64e-06, | |
| "loss": 0.719, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.45994092727545755, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 0.7328, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.4888877840053054, | |
| "learning_rate": 3.920000000000001e-06, | |
| "loss": 0.7257, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.4891132357037931, | |
| "learning_rate": 4.059999999999999e-06, | |
| "loss": 0.7146, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.4659780107286472, | |
| "learning_rate": 4.2e-06, | |
| "loss": 0.7207, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.4747662452681582, | |
| "learning_rate": 4.34e-06, | |
| "loss": 0.7196, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.46183058951309874, | |
| "learning_rate": 4.48e-06, | |
| "loss": 0.7166, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.47556837186042844, | |
| "learning_rate": 4.62e-06, | |
| "loss": 0.7138, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.4646419935884572, | |
| "learning_rate": 4.76e-06, | |
| "loss": 0.7166, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.47208612393069765, | |
| "learning_rate": 4.9e-06, | |
| "loss": 0.7071, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.47395551626034477, | |
| "learning_rate": 5.04e-06, | |
| "loss": 0.7081, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.46256038389399284, | |
| "learning_rate": 5.1799999999999995e-06, | |
| "loss": 0.7112, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.44989559880311664, | |
| "learning_rate": 5.32e-06, | |
| "loss": 0.7157, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.4759980664139243, | |
| "learning_rate": 5.46e-06, | |
| "loss": 0.716, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.47761427911509746, | |
| "learning_rate": 5.600000000000001e-06, | |
| "loss": 0.6936, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.4823631066912239, | |
| "learning_rate": 5.739999999999999e-06, | |
| "loss": 0.7096, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.4692563644972781, | |
| "learning_rate": 5.88e-06, | |
| "loss": 0.6955, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.4758216043542266, | |
| "learning_rate": 6.02e-06, | |
| "loss": 0.7046, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.4607724176991764, | |
| "learning_rate": 6.16e-06, | |
| "loss": 0.7071, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.47650098464440593, | |
| "learning_rate": 6.3e-06, | |
| "loss": 0.6948, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.4927763843500283, | |
| "learning_rate": 6.44e-06, | |
| "loss": 0.7138, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.44343044028786904, | |
| "learning_rate": 6.58e-06, | |
| "loss": 0.7033, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.45708129790603597, | |
| "learning_rate": 6.72e-06, | |
| "loss": 0.7038, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.47564264251663835, | |
| "learning_rate": 6.8599999999999995e-06, | |
| "loss": 0.6974, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.4561386006973232, | |
| "learning_rate": 7e-06, | |
| "loss": 0.702, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.4318637464381274, | |
| "learning_rate": 6.999934216315939e-06, | |
| "loss": 0.7054, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.47772094451329594, | |
| "learning_rate": 6.999736867736609e-06, | |
| "loss": 0.6946, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.45891608711087106, | |
| "learning_rate": 6.9994079616804764e-06, | |
| "loss": 0.6952, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.46731862765960264, | |
| "learning_rate": 6.9989475105113426e-06, | |
| "loss": 0.6888, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.4667223098464595, | |
| "learning_rate": 6.998355531537879e-06, | |
| "loss": 0.7017, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.46285196540927176, | |
| "learning_rate": 6.997632047012975e-06, | |
| "loss": 0.7051, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.48044807815149254, | |
| "learning_rate": 6.996777084132904e-06, | |
| "loss": 0.701, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.47600970966063727, | |
| "learning_rate": 6.995790675036298e-06, | |
| "loss": 0.7001, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.4494522317826872, | |
| "learning_rate": 6.994672856802944e-06, | |
| "loss": 0.7042, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.4623294450089233, | |
| "learning_rate": 6.993423671452386e-06, | |
| "loss": 0.69, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.43825456028915594, | |
| "learning_rate": 6.9920431659423436e-06, | |
| "loss": 0.6996, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.4568055452742323, | |
| "learning_rate": 6.990531392166956e-06, | |
| "loss": 0.6939, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.4302767633743081, | |
| "learning_rate": 6.988888406954821e-06, | |
| "loss": 0.6898, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.4762852616798798, | |
| "learning_rate": 6.9871142720668644e-06, | |
| "loss": 0.703, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.4572026337069386, | |
| "learning_rate": 6.985209054194017e-06, | |
| "loss": 0.7004, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.45803902960498666, | |
| "learning_rate": 6.983172824954708e-06, | |
| "loss": 0.6853, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.44353624606381903, | |
| "learning_rate": 6.9810056608921725e-06, | |
| "loss": 0.7074, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.44517458769087626, | |
| "learning_rate": 6.978707643471573e-06, | |
| "loss": 0.6988, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.4616555458392388, | |
| "learning_rate": 6.97627885907694e-06, | |
| "loss": 0.7034, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.4770896081066365, | |
| "learning_rate": 6.973719399007923e-06, | |
| "loss": 0.6935, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.45665921054521347, | |
| "learning_rate": 6.9710293594763545e-06, | |
| "loss": 0.6773, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.48834217157342125, | |
| "learning_rate": 6.968208841602645e-06, | |
| "loss": 0.6974, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.4661409470252182, | |
| "learning_rate": 6.965257951411967e-06, | |
| "loss": 0.6796, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.4249423447942054, | |
| "learning_rate": 6.962176799830279e-06, | |
| "loss": 0.686, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.4517631229399239, | |
| "learning_rate": 6.958965502680155e-06, | |
| "loss": 0.6968, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.4334006789419362, | |
| "learning_rate": 6.955624180676427e-06, | |
| "loss": 0.705, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.44354874837116653, | |
| "learning_rate": 6.9521529594216516e-06, | |
| "loss": 0.6954, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.4606606226964418, | |
| "learning_rate": 6.948551969401381e-06, | |
| "loss": 0.6965, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.46221163538458165, | |
| "learning_rate": 6.94482134597927e-06, | |
| "loss": 0.695, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.4636824720485381, | |
| "learning_rate": 6.940961229391975e-06, | |
| "loss": 0.6919, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.4450527833539268, | |
| "learning_rate": 6.936971764743891e-06, | |
| "loss": 0.6977, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.4358125416971688, | |
| "learning_rate": 6.932853102001694e-06, | |
| "loss": 0.6998, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.45623590289661414, | |
| "learning_rate": 6.928605395988701e-06, | |
| "loss": 0.6954, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.4536975058820564, | |
| "learning_rate": 6.924228806379058e-06, | |
| "loss": 0.6742, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.4563719379438227, | |
| "learning_rate": 6.919723497691728e-06, | |
| "loss": 0.6921, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.45279224746852664, | |
| "learning_rate": 6.915089639284313e-06, | |
| "loss": 0.6861, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.466062080319079, | |
| "learning_rate": 6.910327405346686e-06, | |
| "loss": 0.6895, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.443881137156012, | |
| "learning_rate": 6.905436974894443e-06, | |
| "loss": 0.7008, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.47752762402129206, | |
| "learning_rate": 6.900418531762173e-06, | |
| "loss": 0.6985, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.4542692407893758, | |
| "learning_rate": 6.89527226459655e-06, | |
| "loss": 0.6822, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.4314820719874765, | |
| "learning_rate": 6.889998366849237e-06, | |
| "loss": 0.691, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.4278370127210443, | |
| "learning_rate": 6.884597036769621e-06, | |
| "loss": 0.689, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.45134601911703476, | |
| "learning_rate": 6.879068477397353e-06, | |
| "loss": 0.6898, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.45160503192413054, | |
| "learning_rate": 6.87341289655472e-06, | |
| "loss": 0.6869, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.41025143635863104, | |
| "learning_rate": 6.867630506838833e-06, | |
| "loss": 0.6984, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.46520301654074564, | |
| "learning_rate": 6.861721525613633e-06, | |
| "loss": 0.6843, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.451991102882798, | |
| "learning_rate": 6.8556861750017235e-06, | |
| "loss": 0.6962, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.418111038766468, | |
| "learning_rate": 6.849524681876018e-06, | |
| "loss": 0.6797, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.4403261547939229, | |
| "learning_rate": 6.843237277851211e-06, | |
| "loss": 0.6965, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.426598785059419, | |
| "learning_rate": 6.836824199275074e-06, | |
| "loss": 0.6821, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.42988247771547117, | |
| "learning_rate": 6.830285687219569e-06, | |
| "loss": 0.6911, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.452230475071558, | |
| "learning_rate": 6.823621987471789e-06, | |
| "loss": 0.6851, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.4267205539811686, | |
| "learning_rate": 6.816833350524716e-06, | |
| "loss": 0.6777, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.44148424584394874, | |
| "learning_rate": 6.809920031567808e-06, | |
| "loss": 0.6838, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.43306877795839893, | |
| "learning_rate": 6.802882290477399e-06, | |
| "loss": 0.6864, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.4952482617663558, | |
| "learning_rate": 6.79572039180694e-06, | |
| "loss": 0.6904, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.45382453893592856, | |
| "learning_rate": 6.788434604777048e-06, | |
| "loss": 0.6795, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.452960843334945, | |
| "learning_rate": 6.781025203265388e-06, | |
| "loss": 0.6891, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.4537364245497661, | |
| "learning_rate": 6.773492465796373e-06, | |
| "loss": 0.6907, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.44929090527897886, | |
| "learning_rate": 6.765836675530703e-06, | |
| "loss": 0.6798, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.46381413350008455, | |
| "learning_rate": 6.758058120254715e-06, | |
| "loss": 0.6716, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.4309028536458763, | |
| "learning_rate": 6.750157092369563e-06, | |
| "loss": 0.6799, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.43717422966700575, | |
| "learning_rate": 6.742133888880233e-06, | |
| "loss": 0.6883, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.4459700930425581, | |
| "learning_rate": 6.7339888113843696e-06, | |
| "loss": 0.6891, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.44045298948848877, | |
| "learning_rate": 6.725722166060951e-06, | |
| "loss": 0.6817, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.4485899862146157, | |
| "learning_rate": 6.717334263658766e-06, | |
| "loss": 0.6897, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.45682000330961775, | |
| "learning_rate": 6.70882541948474e-06, | |
| "loss": 0.6776, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.48037041295136884, | |
| "learning_rate": 6.700195953392085e-06, | |
| "loss": 0.6872, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.44334741491819346, | |
| "learning_rate": 6.691446189768268e-06, | |
| "loss": 0.6798, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.4674740757760583, | |
| "learning_rate": 6.682576457522825e-06, | |
| "loss": 0.6977, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.4696181980144796, | |
| "learning_rate": 6.673587090074993e-06, | |
| "loss": 0.6896, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.4593954697303246, | |
| "learning_rate": 6.664478425341176e-06, | |
| "loss": 0.6749, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.41647753357217115, | |
| "learning_rate": 6.655250805722244e-06, | |
| "loss": 0.6894, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.4245409839045758, | |
| "learning_rate": 6.645904578090662e-06, | |
| "loss": 0.6693, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.45490183172736, | |
| "learning_rate": 6.636440093777451e-06, | |
| "loss": 0.6881, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.4633877447287089, | |
| "learning_rate": 6.626857708558979e-06, | |
| "loss": 0.6953, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.45069656102358646, | |
| "learning_rate": 6.617157782643591e-06, | |
| "loss": 0.6787, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.44438426822862237, | |
| "learning_rate": 6.6073406806580646e-06, | |
| "loss": 0.6859, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.4335460798475662, | |
| "learning_rate": 6.597406771633906e-06, | |
| "loss": 0.6829, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.4282672786086354, | |
| "learning_rate": 6.587356428993477e-06, | |
| "loss": 0.6831, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.46465171297436636, | |
| "learning_rate": 6.577190030535957e-06, | |
| "loss": 0.6778, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.4590812961346198, | |
| "learning_rate": 6.566907958423142e-06, | |
| "loss": 0.6701, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.4180631333820519, | |
| "learning_rate": 6.5565105991650815e-06, | |
| "loss": 0.6825, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.42684427340923925, | |
| "learning_rate": 6.545998343605544e-06, | |
| "loss": 0.6823, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.6515643833482546, | |
| "learning_rate": 6.5353715869073275e-06, | |
| "loss": 0.6748, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.42995190312179654, | |
| "learning_rate": 6.524630728537408e-06, | |
| "loss": 0.6896, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.4307066820527156, | |
| "learning_rate": 6.513776172251919e-06, | |
| "loss": 0.6821, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.4401373902110004, | |
| "learning_rate": 6.5028083260809735e-06, | |
| "loss": 0.6729, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.420372235119902, | |
| "learning_rate": 6.491727602313334e-06, | |
| "loss": 0.6812, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.44387468527179835, | |
| "learning_rate": 6.4805344174808986e-06, | |
| "loss": 0.6713, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.4224291568526637, | |
| "learning_rate": 6.4692291923430634e-06, | |
| "loss": 0.6928, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.42342827072921446, | |
| "learning_rate": 6.457812351870889e-06, | |
| "loss": 0.6925, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.4614687139520872, | |
| "learning_rate": 6.446284325231132e-06, | |
| "loss": 0.6804, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.4513094113300999, | |
| "learning_rate": 6.434645545770116e-06, | |
| "loss": 0.649, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.46129242006354043, | |
| "learning_rate": 6.422896450997434e-06, | |
| "loss": 0.6244, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.44352477273420793, | |
| "learning_rate": 6.411037482569509e-06, | |
| "loss": 0.6231, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.43347730975194065, | |
| "learning_rate": 6.399069086272988e-06, | |
| "loss": 0.6163, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.5042235757137699, | |
| "learning_rate": 6.386991712007985e-06, | |
| "loss": 0.6295, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.4635765704926019, | |
| "learning_rate": 6.374805813771171e-06, | |
| "loss": 0.6145, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.4672283056367441, | |
| "learning_rate": 6.362511849638706e-06, | |
| "loss": 0.6248, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.44386378239345664, | |
| "learning_rate": 6.3501102817490184e-06, | |
| "loss": 0.6208, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.45014512458671113, | |
| "learning_rate": 6.337601576285438e-06, | |
| "loss": 0.6241, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.47077991205008496, | |
| "learning_rate": 6.324986203458665e-06, | |
| "loss": 0.637, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.43971957336428713, | |
| "learning_rate": 6.3122646374891014e-06, | |
| "loss": 0.6274, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.45398595356146343, | |
| "learning_rate": 6.299437356589018e-06, | |
| "loss": 0.6172, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 0.4638039927896387, | |
| "learning_rate": 6.2865048429445835e-06, | |
| "loss": 0.6162, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.456884430778857, | |
| "learning_rate": 6.273467582697736e-06, | |
| "loss": 0.6358, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.4513273711536076, | |
| "learning_rate": 6.260326065927908e-06, | |
| "loss": 0.6256, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.4585546365167011, | |
| "learning_rate": 6.247080786633608e-06, | |
| "loss": 0.6343, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.4837809920582229, | |
| "learning_rate": 6.233732242713847e-06, | |
| "loss": 0.6205, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.45062031874118463, | |
| "learning_rate": 6.220280935949423e-06, | |
| "loss": 0.6181, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.4934582241182996, | |
| "learning_rate": 6.206727371984055e-06, | |
| "loss": 0.6101, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.45848465100131724, | |
| "learning_rate": 6.193072060305386e-06, | |
| "loss": 0.6274, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 0.49225379713590917, | |
| "learning_rate": 6.17931551422582e-06, | |
| "loss": 0.6287, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 0.43783738072351636, | |
| "learning_rate": 6.165458250863233e-06, | |
| "loss": 0.6322, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.45111919610212603, | |
| "learning_rate": 6.15150079112153e-06, | |
| "loss": 0.6343, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.7283719867926337, | |
| "learning_rate": 6.137443659671066e-06, | |
| "loss": 0.6245, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.4317614230374671, | |
| "learning_rate": 6.123287384928924e-06, | |
| "loss": 0.6252, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.43630742763076885, | |
| "learning_rate": 6.1090324990390505e-06, | |
| "loss": 0.6281, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 0.49179102646470696, | |
| "learning_rate": 6.09467953785225e-06, | |
| "loss": 0.6304, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.4269421327683836, | |
| "learning_rate": 6.080229040906045e-06, | |
| "loss": 0.6205, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.44873848635658836, | |
| "learning_rate": 6.065681551404392e-06, | |
| "loss": 0.6203, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 0.43522811508044484, | |
| "learning_rate": 6.051037616197267e-06, | |
| "loss": 0.6233, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.43363424076560303, | |
| "learning_rate": 6.036297785760099e-06, | |
| "loss": 0.6274, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.4420787259752861, | |
| "learning_rate": 6.0214626141730895e-06, | |
| "loss": 0.6388, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.445119846862499, | |
| "learning_rate": 6.006532659100377e-06, | |
| "loss": 0.6107, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.4380767674114949, | |
| "learning_rate": 5.991508481769071e-06, | |
| "loss": 0.6341, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.44003117819419657, | |
| "learning_rate": 5.976390646948166e-06, | |
| "loss": 0.6344, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.45806509086322245, | |
| "learning_rate": 5.961179722927302e-06, | |
| "loss": 0.6283, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.4545928600817147, | |
| "learning_rate": 5.9458762814954016e-06, | |
| "loss": 0.6254, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 0.4438181707408447, | |
| "learning_rate": 5.930480897919185e-06, | |
| "loss": 0.631, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.44695115171581695, | |
| "learning_rate": 5.9149941509215366e-06, | |
| "loss": 0.6338, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.4280430227739119, | |
| "learning_rate": 5.899416622659754e-06, | |
| "loss": 0.6182, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 0.458726186518369, | |
| "learning_rate": 5.883748898703666e-06, | |
| "loss": 0.6162, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.43445566304338457, | |
| "learning_rate": 5.8679915680136155e-06, | |
| "loss": 0.6228, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.44895947980462597, | |
| "learning_rate": 5.852145222918326e-06, | |
| "loss": 0.6373, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 0.43403817083393664, | |
| "learning_rate": 5.83621045909263e-06, | |
| "loss": 0.6376, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.4673939224968789, | |
| "learning_rate": 5.820187875535083e-06, | |
| "loss": 0.6215, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.46323588428022766, | |
| "learning_rate": 5.804078074545439e-06, | |
| "loss": 0.6187, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.4530033509696719, | |
| "learning_rate": 5.7878816617020204e-06, | |
| "loss": 0.6239, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 0.4317929663828983, | |
| "learning_rate": 5.771599245838943e-06, | |
| "loss": 0.6168, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.436592310414347, | |
| "learning_rate": 5.7552314390232364e-06, | |
| "loss": 0.6179, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.4702835623046126, | |
| "learning_rate": 5.738778856531832e-06, | |
| "loss": 0.6272, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 0.4619318889613922, | |
| "learning_rate": 5.72224211682844e-06, | |
| "loss": 0.6256, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.49429029776316813, | |
| "learning_rate": 5.705621841540292e-06, | |
| "loss": 0.6283, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.47054367378052575, | |
| "learning_rate": 5.688918655434783e-06, | |
| "loss": 0.6156, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.45638233691668284, | |
| "learning_rate": 5.67213318639598e-06, | |
| "loss": 0.6257, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.43819489071261747, | |
| "learning_rate": 5.655266065401021e-06, | |
| "loss": 0.6255, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.45603698357049277, | |
| "learning_rate": 5.638317926496398e-06, | |
| "loss": 0.6267, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.45518318702227223, | |
| "learning_rate": 5.6212894067741176e-06, | |
| "loss": 0.6357, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 0.4402683023420712, | |
| "learning_rate": 5.604181146347758e-06, | |
| "loss": 0.6311, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.4498808898227514, | |
| "learning_rate": 5.5869937883284065e-06, | |
| "loss": 0.6213, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.46040698115780887, | |
| "learning_rate": 5.569727978800478e-06, | |
| "loss": 0.6223, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 0.44168864627397236, | |
| "learning_rate": 5.552384366797435e-06, | |
| "loss": 0.6268, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.45494321235524204, | |
| "learning_rate": 5.534963604277388e-06, | |
| "loss": 0.6193, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.44543538788588954, | |
| "learning_rate": 5.517466346098587e-06, | |
| "loss": 0.6311, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.45370006917207745, | |
| "learning_rate": 5.4998932499948055e-06, | |
| "loss": 0.6263, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.4457705866746906, | |
| "learning_rate": 5.482244976550616e-06, | |
| "loss": 0.6267, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 0.44178347775287935, | |
| "learning_rate": 5.464522189176559e-06, | |
| "loss": 0.6168, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 0.4510685099498634, | |
| "learning_rate": 5.446725554084202e-06, | |
| "loss": 0.6071, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.4463056440103558, | |
| "learning_rate": 5.4288557402611e-06, | |
| "loss": 0.6193, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 0.4450825773000299, | |
| "learning_rate": 5.410913419445647e-06, | |
| "loss": 0.6114, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.4609214677792106, | |
| "learning_rate": 5.3928992661018194e-06, | |
| "loss": 0.6255, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.48687583594807843, | |
| "learning_rate": 5.374813957393832e-06, | |
| "loss": 0.6286, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 0.47549284042607015, | |
| "learning_rate": 5.356658173160674e-06, | |
| "loss": 0.6143, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.49532165280916113, | |
| "learning_rate": 5.338432595890562e-06, | |
| "loss": 0.6249, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.45253915740067313, | |
| "learning_rate": 5.320137910695275e-06, | |
| "loss": 0.6257, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 0.43721435814923637, | |
| "learning_rate": 5.301774805284408e-06, | |
| "loss": 0.6178, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.4683301857922748, | |
| "learning_rate": 5.2833439699395175e-06, | |
| "loss": 0.6173, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.43871464981194036, | |
| "learning_rate": 5.264846097488175e-06, | |
| "loss": 0.6214, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.4524085111628937, | |
| "learning_rate": 5.246281883277922e-06, | |
| "loss": 0.6346, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.4468406698869542, | |
| "learning_rate": 5.227652025150132e-06, | |
| "loss": 0.614, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 0.468252187542662, | |
| "learning_rate": 5.208957223413776e-06, | |
| "loss": 0.6057, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 0.46458186348478814, | |
| "learning_rate": 5.1901981808191e-06, | |
| "loss": 0.6192, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.4589397282179608, | |
| "learning_rate": 5.1713756025312095e-06, | |
| "loss": 0.6197, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 0.4733441471283767, | |
| "learning_rate": 5.1524901961035555e-06, | |
| "loss": 0.6146, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.49573981085967583, | |
| "learning_rate": 5.1335426714513436e-06, | |
| "loss": 0.6205, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.45753588591278177, | |
| "learning_rate": 5.114533740824848e-06, | |
| "loss": 0.6194, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.44981584915327405, | |
| "learning_rate": 5.095464118782631e-06, | |
| "loss": 0.6285, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.44941448245640475, | |
| "learning_rate": 5.076334522164687e-06, | |
| "loss": 0.6183, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.46348841235648264, | |
| "learning_rate": 5.057145670065498e-06, | |
| "loss": 0.6178, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 0.4819885899523623, | |
| "learning_rate": 5.037898283806995e-06, | |
| "loss": 0.6209, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.45974762343297226, | |
| "learning_rate": 5.018593086911453e-06, | |
| "loss": 0.6144, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 0.4832719455105882, | |
| "learning_rate": 4.999230805074284e-06, | |
| "loss": 0.6255, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 0.4580501245903807, | |
| "learning_rate": 4.979812166136764e-06, | |
| "loss": 0.622, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.4869292416366864, | |
| "learning_rate": 4.960337900058668e-06, | |
| "loss": 0.6295, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.44734991176527494, | |
| "learning_rate": 4.940808738890834e-06, | |
| "loss": 0.61, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.4836741219786191, | |
| "learning_rate": 4.921225416747647e-06, | |
| "loss": 0.6131, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.43868937063180397, | |
| "learning_rate": 4.901588669779433e-06, | |
| "loss": 0.6261, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 0.4549440779907735, | |
| "learning_rate": 4.881899236144797e-06, | |
| "loss": 0.6216, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 0.4561309327019534, | |
| "learning_rate": 4.862157855982875e-06, | |
| "loss": 0.6262, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.4521274007767562, | |
| "learning_rate": 4.8423652713855e-06, | |
| "loss": 0.6214, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 0.4876373591113174, | |
| "learning_rate": 4.822522226369323e-06, | |
| "loss": 0.6303, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.4403247558369275, | |
| "learning_rate": 4.802629466847827e-06, | |
| "loss": 0.6236, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.4392883872725244, | |
| "learning_rate": 4.782687740603308e-06, | |
| "loss": 0.6125, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 0.44359149108855517, | |
| "learning_rate": 4.762697797258742e-06, | |
| "loss": 0.6208, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 0.45892783125410747, | |
| "learning_rate": 4.742660388249629e-06, | |
| "loss": 0.6146, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 0.46353318895549067, | |
| "learning_rate": 4.722576266795729e-06, | |
| "loss": 0.6199, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.4642990741363008, | |
| "learning_rate": 4.702446187872758e-06, | |
| "loss": 0.6182, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.44827792507065956, | |
| "learning_rate": 4.682270908184003e-06, | |
| "loss": 0.6246, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 0.45544933714150454, | |
| "learning_rate": 4.662051186131876e-06, | |
| "loss": 0.6256, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 0.4485500362120205, | |
| "learning_rate": 4.641787781789412e-06, | |
| "loss": 0.6181, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.42631048877270405, | |
| "learning_rate": 4.6214814568716894e-06, | |
| "loss": 0.6331, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 0.4714279586473698, | |
| "learning_rate": 4.601132974707202e-06, | |
| "loss": 0.628, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 0.4228608375782349, | |
| "learning_rate": 4.5807431002091605e-06, | |
| "loss": 0.6054, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.46872660848782277, | |
| "learning_rate": 4.560312599846746e-06, | |
| "loss": 0.6102, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 0.4379038714391558, | |
| "learning_rate": 4.539842241616287e-06, | |
| "loss": 0.6143, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.4719919574560488, | |
| "learning_rate": 4.519332795012404e-06, | |
| "loss": 0.6197, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.4560470541146194, | |
| "learning_rate": 4.498785030999068e-06, | |
| "loss": 0.6132, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 0.48502107778992737, | |
| "learning_rate": 4.478199721980633e-06, | |
| "loss": 0.631, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.45288928959662245, | |
| "learning_rate": 4.457577641772792e-06, | |
| "loss": 0.6148, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.45740004712492455, | |
| "learning_rate": 4.436919565573495e-06, | |
| "loss": 0.613, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.4680089016865197, | |
| "learning_rate": 4.416226269933802e-06, | |
| "loss": 0.6109, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 0.4498754217059588, | |
| "learning_rate": 4.395498532728697e-06, | |
| "loss": 0.627, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 0.490510820257092, | |
| "learning_rate": 4.374737133127847e-06, | |
| "loss": 0.6287, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 0.4384793154811805, | |
| "learning_rate": 4.35394285156631e-06, | |
| "loss": 0.6265, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 0.42053564372682345, | |
| "learning_rate": 4.3331164697151995e-06, | |
| "loss": 0.6123, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 0.44499220286710817, | |
| "learning_rate": 4.3122587704523015e-06, | |
| "loss": 0.6196, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 0.4681953108721627, | |
| "learning_rate": 4.291370537832641e-06, | |
| "loss": 0.6301, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.4245150987038812, | |
| "learning_rate": 4.2704525570590185e-06, | |
| "loss": 0.6203, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 0.4738423212960381, | |
| "learning_rate": 4.2495056144524824e-06, | |
| "loss": 0.6159, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.49926406862961464, | |
| "learning_rate": 4.228530497422779e-06, | |
| "loss": 0.6193, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.4423739374256911, | |
| "learning_rate": 4.207527994438748e-06, | |
| "loss": 0.617, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 0.44692873617751755, | |
| "learning_rate": 4.186498894998689e-06, | |
| "loss": 0.6135, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.4358994979972626, | |
| "learning_rate": 4.165443989600678e-06, | |
| "loss": 0.6121, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.46452930431844286, | |
| "learning_rate": 4.144364069712854e-06, | |
| "loss": 0.6167, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.4816111574015236, | |
| "learning_rate": 4.123259927743669e-06, | |
| "loss": 0.6203, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.45232518080467465, | |
| "learning_rate": 4.102132357012098e-06, | |
| "loss": 0.6199, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.45515782747165817, | |
| "learning_rate": 4.08098215171782e-06, | |
| "loss": 0.6174, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.44933646029392305, | |
| "learning_rate": 4.059810106911363e-06, | |
| "loss": 0.6188, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 0.45633219759975596, | |
| "learning_rate": 4.038617018464217e-06, | |
| "loss": 0.6168, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.4663774750339656, | |
| "learning_rate": 4.017403683038914e-06, | |
| "loss": 0.6199, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.4565589400061048, | |
| "learning_rate": 3.996170898059087e-06, | |
| "loss": 0.6187, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.45638098232431645, | |
| "learning_rate": 3.97491946167949e-06, | |
| "loss": 0.6133, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.4330737687010161, | |
| "learning_rate": 3.9536501727559956e-06, | |
| "loss": 0.6179, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.44620897297773393, | |
| "learning_rate": 3.932363830815563e-06, | |
| "loss": 0.606, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.4727298461430969, | |
| "learning_rate": 3.911061236026192e-06, | |
| "loss": 0.5804, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.5332182751767908, | |
| "learning_rate": 3.889743189166831e-06, | |
| "loss": 0.5552, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.471875496548638, | |
| "learning_rate": 3.868410491597286e-06, | |
| "loss": 0.5467, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.4869637805163024, | |
| "learning_rate": 3.847063945228094e-06, | |
| "loss": 0.5691, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 0.4714418364302173, | |
| "learning_rate": 3.825704352490375e-06, | |
| "loss": 0.5788, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.49636094733662106, | |
| "learning_rate": 3.804332516305672e-06, | |
| "loss": 0.5583, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 0.48087980189754664, | |
| "learning_rate": 3.782949240055768e-06, | |
| "loss": 0.5632, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 0.4873147689537464, | |
| "learning_rate": 3.7615553275524852e-06, | |
| "loss": 0.5602, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.4603275098510104, | |
| "learning_rate": 3.74015158300747e-06, | |
| "loss": 0.5641, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.5162191764305892, | |
| "learning_rate": 3.7187388110019604e-06, | |
| "loss": 0.5628, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.49005627074608765, | |
| "learning_rate": 3.697317816456546e-06, | |
| "loss": 0.559, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.4585568665283943, | |
| "learning_rate": 3.6758894046009037e-06, | |
| "loss": 0.547, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 0.4506260874603515, | |
| "learning_rate": 3.6544543809435346e-06, | |
| "loss": 0.5433, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.46595533436834136, | |
| "learning_rate": 3.6330135512414822e-06, | |
| "loss": 0.5666, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.4690150184503375, | |
| "learning_rate": 3.6115677214700397e-06, | |
| "loss": 0.5596, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.4683369095498927, | |
| "learning_rate": 3.5901176977924606e-06, | |
| "loss": 0.5458, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.4710288608351933, | |
| "learning_rate": 3.568664286529646e-06, | |
| "loss": 0.5507, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.4928542807361932, | |
| "learning_rate": 3.5472082941298433e-06, | |
| "loss": 0.5665, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.4972921543225756, | |
| "learning_rate": 3.5257505271383217e-06, | |
| "loss": 0.5586, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.4855107426051562, | |
| "learning_rate": 3.504291792167063e-06, | |
| "loss": 0.5615, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.4623236179613674, | |
| "learning_rate": 3.4828328958644326e-06, | |
| "loss": 0.5638, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.46028741167647896, | |
| "learning_rate": 3.4613746448848622e-06, | |
| "loss": 0.5464, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.46156508300115645, | |
| "learning_rate": 3.439917845858524e-06, | |
| "loss": 0.567, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.5669489602625127, | |
| "learning_rate": 3.418463305361013e-06, | |
| "loss": 0.5524, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.49099941076825016, | |
| "learning_rate": 3.3970118298830207e-06, | |
| "loss": 0.5591, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.5207064606888653, | |
| "learning_rate": 3.3755642258000265e-06, | |
| "loss": 0.5538, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.4830219809120518, | |
| "learning_rate": 3.3541212993419773e-06, | |
| "loss": 0.5475, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.4801836621711601, | |
| "learning_rate": 3.3326838565629895e-06, | |
| "loss": 0.5413, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.47387958333244534, | |
| "learning_rate": 3.31125270331104e-06, | |
| "loss": 0.5537, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.5090490511350312, | |
| "learning_rate": 3.289828645197681e-06, | |
| "loss": 0.5567, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.5286353188714713, | |
| "learning_rate": 3.2684124875677518e-06, | |
| "loss": 0.5589, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.4927074981163475, | |
| "learning_rate": 3.247005035469109e-06, | |
| "loss": 0.5697, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 0.47340856305327644, | |
| "learning_rate": 3.2256070936223603e-06, | |
| "loss": 0.5687, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.5115028667136483, | |
| "learning_rate": 3.2042194663906193e-06, | |
| "loss": 0.5625, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.4723602653535651, | |
| "learning_rate": 3.182842957749263e-06, | |
| "loss": 0.5633, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.4679538952450783, | |
| "learning_rate": 3.1614783712557156e-06, | |
| "loss": 0.5572, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.48051919166640805, | |
| "learning_rate": 3.1401265100192383e-06, | |
| "loss": 0.5648, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 0.4594423765819446, | |
| "learning_rate": 3.1187881766707425e-06, | |
| "loss": 0.5595, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 0.49220125939314296, | |
| "learning_rate": 3.0974641733326154e-06, | |
| "loss": 0.5479, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 0.4944265110257382, | |
| "learning_rate": 3.0761553015885717e-06, | |
| "loss": 0.5502, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.495744161270211, | |
| "learning_rate": 3.0548623624535165e-06, | |
| "loss": 0.5629, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.478561888744776, | |
| "learning_rate": 3.0335861563434403e-06, | |
| "loss": 0.5597, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.4946624980435279, | |
| "learning_rate": 3.012327483045325e-06, | |
| "loss": 0.556, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 0.4913013156645444, | |
| "learning_rate": 2.9910871416870855e-06, | |
| "loss": 0.5638, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.46629667333688474, | |
| "learning_rate": 2.9698659307075224e-06, | |
| "loss": 0.5508, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.47577400823898375, | |
| "learning_rate": 2.948664647826318e-06, | |
| "loss": 0.5518, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.48528006049817207, | |
| "learning_rate": 2.9274840900140375e-06, | |
| "loss": 0.5582, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 0.5499143301618472, | |
| "learning_rate": 2.906325053462181e-06, | |
| "loss": 0.548, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 0.4772816560553211, | |
| "learning_rate": 2.8851883335532496e-06, | |
| "loss": 0.5523, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.49887071761697505, | |
| "learning_rate": 2.8640747248308445e-06, | |
| "loss": 0.5544, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.4853842631362592, | |
| "learning_rate": 2.8429850209698053e-06, | |
| "loss": 0.5558, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.45895465861964546, | |
| "learning_rate": 2.8219200147463677e-06, | |
| "loss": 0.5598, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.4662802877247775, | |
| "learning_rate": 2.8008804980083695e-06, | |
| "loss": 0.5551, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.4881083174435456, | |
| "learning_rate": 2.7798672616454785e-06, | |
| "loss": 0.5511, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.5016617932642891, | |
| "learning_rate": 2.75888109555947e-06, | |
| "loss": 0.5438, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.4831166076149674, | |
| "learning_rate": 2.7379227886345244e-06, | |
| "loss": 0.5598, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.4953933886035155, | |
| "learning_rate": 2.716993128707581e-06, | |
| "loss": 0.5609, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 0.503170266490847, | |
| "learning_rate": 2.696092902538716e-06, | |
| "loss": 0.5488, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.5098380667106547, | |
| "learning_rate": 2.675222895781574e-06, | |
| "loss": 0.5539, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.49948084086860606, | |
| "learning_rate": 2.6543838929538285e-06, | |
| "loss": 0.5581, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 0.4872613273522286, | |
| "learning_rate": 2.6335766774076965e-06, | |
| "loss": 0.5562, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.47926716145131487, | |
| "learning_rate": 2.6128020313004875e-06, | |
| "loss": 0.5561, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.49339314189894584, | |
| "learning_rate": 2.592060735565206e-06, | |
| "loss": 0.5633, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.4888816777932096, | |
| "learning_rate": 2.5713535698811926e-06, | |
| "loss": 0.5623, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.47873225411797143, | |
| "learning_rate": 2.550681312644815e-06, | |
| "loss": 0.5629, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 0.4985498589688127, | |
| "learning_rate": 2.5300447409402104e-06, | |
| "loss": 0.5517, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 0.4699404709889953, | |
| "learning_rate": 2.509444630510071e-06, | |
| "loss": 0.5542, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.5471742855253533, | |
| "learning_rate": 2.4888817557264883e-06, | |
| "loss": 0.5573, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.4890601716460387, | |
| "learning_rate": 2.468356889561835e-06, | |
| "loss": 0.5496, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.4884550896007432, | |
| "learning_rate": 2.4478708035597206e-06, | |
| "loss": 0.5517, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.53082092791935, | |
| "learning_rate": 2.427424267805977e-06, | |
| "loss": 0.5643, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.4588900957688972, | |
| "learning_rate": 2.407018050899719e-06, | |
| "loss": 0.5588, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.4930240761419014, | |
| "learning_rate": 2.3866529199244454e-06, | |
| "loss": 0.5534, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 0.4995410840918172, | |
| "learning_rate": 2.36632964041921e-06, | |
| "loss": 0.5526, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.4889682103736911, | |
| "learning_rate": 2.3460489763498393e-06, | |
| "loss": 0.5575, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.47254332660748083, | |
| "learning_rate": 2.3258116900802188e-06, | |
| "loss": 0.5641, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.5271806756431864, | |
| "learning_rate": 2.3056185423436304e-06, | |
| "loss": 0.5515, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.5014716634327129, | |
| "learning_rate": 2.2854702922141627e-06, | |
| "loss": 0.5578, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.48930981901485066, | |
| "learning_rate": 2.265367697078168e-06, | |
| "loss": 0.5648, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.4822043988267899, | |
| "learning_rate": 2.245311512605801e-06, | |
| "loss": 0.5554, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.4978119631671631, | |
| "learning_rate": 2.2253024927226053e-06, | |
| "loss": 0.5586, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.49756480432664524, | |
| "learning_rate": 2.2053413895811764e-06, | |
| "loss": 0.5578, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.4671920108876918, | |
| "learning_rate": 2.1854289535328864e-06, | |
| "loss": 0.5557, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.513655855548841, | |
| "learning_rate": 2.165565933099682e-06, | |
| "loss": 0.5589, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.46274876339767745, | |
| "learning_rate": 2.1457530749459373e-06, | |
| "loss": 0.5588, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.48340392958868733, | |
| "learning_rate": 2.1259911238503988e-06, | |
| "loss": 0.5481, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.5024001177410511, | |
| "learning_rate": 2.1062808226781767e-06, | |
| "loss": 0.5604, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.4794062865649958, | |
| "learning_rate": 2.0866229123528305e-06, | |
| "loss": 0.552, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.49502474291815657, | |
| "learning_rate": 2.0670181318285076e-06, | |
| "loss": 0.5526, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.4912138589836612, | |
| "learning_rate": 2.0474672180621754e-06, | |
| "loss": 0.5433, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.46287983551015915, | |
| "learning_rate": 2.027970905985908e-06, | |
| "loss": 0.5607, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.4818908530273005, | |
| "learning_rate": 2.008529928479269e-06, | |
| "loss": 0.5552, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.49475825963312386, | |
| "learning_rate": 1.9891450163417574e-06, | |
| "loss": 0.5473, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.5090335613659759, | |
| "learning_rate": 1.9698168982653334e-06, | |
| "loss": 0.5469, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.48712846229296525, | |
| "learning_rate": 1.950546300807037e-06, | |
| "loss": 0.5526, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.5087151308068611, | |
| "learning_rate": 1.931333948361664e-06, | |
| "loss": 0.563, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.4770122954574883, | |
| "learning_rate": 1.9121805631345406e-06, | |
| "loss": 0.5588, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.49875337542296333, | |
| "learning_rate": 1.8930868651143776e-06, | |
| "loss": 0.5556, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.46661280379905284, | |
| "learning_rate": 1.8740535720462034e-06, | |
| "loss": 0.5518, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 0.49444595207088565, | |
| "learning_rate": 1.8550813994043814e-06, | |
| "loss": 0.5679, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 0.48381227476419236, | |
| "learning_rate": 1.8361710603657162e-06, | |
| "loss": 0.5572, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.5055312948711096, | |
| "learning_rate": 1.8173232657826508e-06, | |
| "loss": 0.5538, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.4686625212413926, | |
| "learning_rate": 1.7985387241565343e-06, | |
| "loss": 0.559, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.4804255341689684, | |
| "learning_rate": 1.7798181416109966e-06, | |
| "loss": 0.544, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.5090131219052505, | |
| "learning_rate": 1.7611622218654e-06, | |
| "loss": 0.5565, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.4823380469403731, | |
| "learning_rate": 1.7425716662083936e-06, | |
| "loss": 0.5586, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.5039478306212927, | |
| "learning_rate": 1.7240471734715416e-06, | |
| "loss": 0.5582, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.48106143586192984, | |
| "learning_rate": 1.7055894400030597e-06, | |
| "loss": 0.5527, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.4948095621947108, | |
| "learning_rate": 1.6871991596416367e-06, | |
| "loss": 0.5534, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.47985601211032985, | |
| "learning_rate": 1.668877023690356e-06, | |
| "loss": 0.5514, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.5044751224020304, | |
| "learning_rate": 1.6506237208907045e-06, | |
| "loss": 0.5541, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.5080452899508979, | |
| "learning_rate": 1.6324399373966833e-06, | |
| "loss": 0.5506, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.4931986436565961, | |
| "learning_rate": 1.6143263567490192e-06, | |
| "loss": 0.5736, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 0.4684816221900875, | |
| "learning_rate": 1.596283659849464e-06, | |
| "loss": 0.556, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 0.4785014812413059, | |
| "learning_rate": 1.5783125249352016e-06, | |
| "loss": 0.5579, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.5116019647376474, | |
| "learning_rate": 1.5604136275533513e-06, | |
| "loss": 0.5552, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.5395436792240803, | |
| "learning_rate": 1.5425876405355793e-06, | |
| "loss": 0.5384, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.4900436595350879, | |
| "learning_rate": 1.5248352339727968e-06, | |
| "loss": 0.5622, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.47513280378884526, | |
| "learning_rate": 1.5071570751899785e-06, | |
| "loss": 0.5636, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.4839906292088417, | |
| "learning_rate": 1.4895538287210727e-06, | |
| "loss": 0.5527, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.5376958097507211, | |
| "learning_rate": 1.4720261562840272e-06, | |
| "loss": 0.5635, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.48771290149288943, | |
| "learning_rate": 1.4545747167559066e-06, | |
| "loss": 0.564, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.4854524808894032, | |
| "learning_rate": 1.4372001661481314e-06, | |
| "loss": 0.5598, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.4700143505212195, | |
| "learning_rate": 1.4199031575818126e-06, | |
| "loss": 0.5375, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.4915439052479703, | |
| "learning_rate": 1.4026843412632083e-06, | |
| "loss": 0.5548, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.4869720592283153, | |
| "learning_rate": 1.385544364459273e-06, | |
| "loss": 0.5571, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.4716126280570366, | |
| "learning_rate": 1.3684838714733317e-06, | |
| "loss": 0.5516, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.4965381533290548, | |
| "learning_rate": 1.3515035036208578e-06, | |
| "loss": 0.5578, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.49674828915458996, | |
| "learning_rate": 1.3346038992053705e-06, | |
| "loss": 0.5498, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.47680857026122736, | |
| "learning_rate": 1.3177856934944328e-06, | |
| "loss": 0.5531, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 0.4870948629881832, | |
| "learning_rate": 1.3010495186957768e-06, | |
| "loss": 0.552, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 0.483089196852953, | |
| "learning_rate": 1.2843960039335355e-06, | |
| "loss": 0.5564, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.5140997811965615, | |
| "learning_rate": 1.2678257752245992e-06, | |
| "loss": 0.5504, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.4779902409617231, | |
| "learning_rate": 1.2513394554550753e-06, | |
| "loss": 0.5478, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.47680861915825756, | |
| "learning_rate": 1.2349376643568792e-06, | |
| "loss": 0.5555, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.47618772244534097, | |
| "learning_rate": 1.218621018484434e-06, | |
| "loss": 0.5509, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.46991117646305874, | |
| "learning_rate": 1.202390131191501e-06, | |
| "loss": 0.5572, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.48145576248836425, | |
| "learning_rate": 1.1862456126081136e-06, | |
| "loss": 0.562, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.49862994419451123, | |
| "learning_rate": 1.170188069617649e-06, | |
| "loss": 0.5574, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.5025682535998525, | |
| "learning_rate": 1.1542181058340122e-06, | |
| "loss": 0.5569, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.47850092658350835, | |
| "learning_rate": 1.1383363215789488e-06, | |
| "loss": 0.5543, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.5044422425999335, | |
| "learning_rate": 1.1225433138594741e-06, | |
| "loss": 0.5599, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.47419325850109234, | |
| "learning_rate": 1.1068396763454339e-06, | |
| "loss": 0.5586, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.560597143802205, | |
| "learning_rate": 1.0912259993471857e-06, | |
| "loss": 0.5524, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.5148468793364267, | |
| "learning_rate": 1.0757028697934152e-06, | |
| "loss": 0.5084, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.5017714203601242, | |
| "learning_rate": 1.060270871209064e-06, | |
| "loss": 0.5156, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.49357251631602217, | |
| "learning_rate": 1.0449305836934003e-06, | |
| "loss": 0.5109, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 0.4936913138076729, | |
| "learning_rate": 1.02968258389821e-06, | |
| "loss": 0.5158, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.5049259973539401, | |
| "learning_rate": 1.0145274450061254e-06, | |
| "loss": 0.5217, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.517079836314341, | |
| "learning_rate": 9.994657367090686e-07, | |
| "loss": 0.5136, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 0.4837364294449262, | |
| "learning_rate": 9.844980251868449e-07, | |
| "loss": 0.518, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.4869343961795407, | |
| "learning_rate": 9.696248730858605e-07, | |
| "loss": 0.5132, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.5085658265111329, | |
| "learning_rate": 9.54846839497964e-07, | |
| "loss": 0.5165, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 0.47424129042024027, | |
| "learning_rate": 9.401644799394382e-07, | |
| "loss": 0.5215, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 0.4991885159298539, | |
| "learning_rate": 9.255783463301111e-07, | |
| "loss": 0.5092, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.47972707851164975, | |
| "learning_rate": 9.110889869726167e-07, | |
| "loss": 0.5289, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.48477312158187885, | |
| "learning_rate": 8.966969465317753e-07, | |
| "loss": 0.5373, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 0.5150113149802942, | |
| "learning_rate": 8.824027660141253e-07, | |
| "loss": 0.5144, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.5012820847152873, | |
| "learning_rate": 8.682069827475828e-07, | |
| "loss": 0.5232, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.536197598669663, | |
| "learning_rate": 8.541101303612473e-07, | |
| "loss": 0.5312, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.47456874746453287, | |
| "learning_rate": 8.401127387653379e-07, | |
| "loss": 0.5021, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 0.5022494921077733, | |
| "learning_rate": 8.262153341312734e-07, | |
| "loss": 0.5039, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 0.5128622291867768, | |
| "learning_rate": 8.124184388719e-07, | |
| "loss": 0.5189, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 0.49970434341288505, | |
| "learning_rate": 7.987225716218441e-07, | |
| "loss": 0.5266, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.4990813361708124, | |
| "learning_rate": 7.851282472180222e-07, | |
| "loss": 0.5189, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.5361324180050252, | |
| "learning_rate": 7.716359766802858e-07, | |
| "loss": 0.5283, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.49325303865409753, | |
| "learning_rate": 7.582462671922154e-07, | |
| "loss": 0.5134, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 0.5074499214352016, | |
| "learning_rate": 7.449596220820492e-07, | |
| "loss": 0.5219, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 0.48687866167974014, | |
| "learning_rate": 7.317765408037668e-07, | |
| "loss": 0.5131, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 0.5209017279406115, | |
| "learning_rate": 7.186975189183119e-07, | |
| "loss": 0.5263, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 0.5017929271897994, | |
| "learning_rate": 7.057230480749689e-07, | |
| "loss": 0.5221, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.4909543768911595, | |
| "learning_rate": 6.928536159928746e-07, | |
| "loss": 0.5082, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 0.5217040631589964, | |
| "learning_rate": 6.800897064426877e-07, | |
| "loss": 0.5136, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 0.5007485735211247, | |
| "learning_rate": 6.674317992284038e-07, | |
| "loss": 0.5158, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.495432605404129, | |
| "learning_rate": 6.548803701693218e-07, | |
| "loss": 0.5191, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 0.5457479536125451, | |
| "learning_rate": 6.424358910821511e-07, | |
| "loss": 0.5144, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.5106414076169086, | |
| "learning_rate": 6.300988297632804e-07, | |
| "loss": 0.5288, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.5211736668510725, | |
| "learning_rate": 6.178696499711915e-07, | |
| "loss": 0.5218, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 0.4891406143758845, | |
| "learning_rate": 6.057488114090288e-07, | |
| "loss": 0.5107, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.5178228254981688, | |
| "learning_rate": 5.937367697073139e-07, | |
| "loss": 0.5004, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.49831173988741256, | |
| "learning_rate": 5.818339764068217e-07, | |
| "loss": 0.5167, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 0.5445792027132667, | |
| "learning_rate": 5.700408789416051e-07, | |
| "loss": 0.5251, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.5412064520692698, | |
| "learning_rate": 5.58357920622179e-07, | |
| "loss": 0.5185, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.5194173017222409, | |
| "learning_rate": 5.467855406188503e-07, | |
| "loss": 0.5213, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 0.530585691377951, | |
| "learning_rate": 5.353241739452134e-07, | |
| "loss": 0.5213, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 0.5334266089134705, | |
| "learning_rate": 5.239742514417958e-07, | |
| "loss": 0.5213, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 0.5323190599173516, | |
| "learning_rate": 5.127361997598647e-07, | |
| "loss": 0.5173, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 0.4977075988891876, | |
| "learning_rate": 5.016104413453866e-07, | |
| "loss": 0.5163, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 0.5072133518376746, | |
| "learning_rate": 4.905973944231479e-07, | |
| "loss": 0.5147, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.5089446326634548, | |
| "learning_rate": 4.796974729810328e-07, | |
| "loss": 0.5206, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.5173579821056443, | |
| "learning_rate": 4.6891108675446453e-07, | |
| "loss": 0.5233, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.49509093398735665, | |
| "learning_rate": 4.5823864121099967e-07, | |
| "loss": 0.5143, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 0.510739525920679, | |
| "learning_rate": 4.476805375350865e-07, | |
| "loss": 0.5204, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.5285640385275354, | |
| "learning_rate": 4.372371726129854e-07, | |
| "loss": 0.5226, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.49804779846917624, | |
| "learning_rate": 4.269089390178512e-07, | |
| "loss": 0.5257, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 0.4960403523798791, | |
| "learning_rate": 4.1669622499497205e-07, | |
| "loss": 0.5224, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.509776799973484, | |
| "learning_rate": 4.0659941444717833e-07, | |
| "loss": 0.5153, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.48108044641737857, | |
| "learning_rate": 3.966188869204094e-07, | |
| "loss": 0.5175, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 0.5141883943099625, | |
| "learning_rate": 3.8675501758944926e-07, | |
| "loss": 0.5147, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.5086149236998669, | |
| "learning_rate": 3.7700817724381983e-07, | |
| "loss": 0.5128, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.5107670739104685, | |
| "learning_rate": 3.6737873227384263e-07, | |
| "loss": 0.5162, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.48090817905611477, | |
| "learning_rate": 3.578670446568711e-07, | |
| "loss": 0.5289, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 0.5149098967385166, | |
| "learning_rate": 3.484734719436782e-07, | |
| "loss": 0.5224, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 0.4967090096149114, | |
| "learning_rate": 3.3919836724501743e-07, | |
| "loss": 0.5064, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 0.49198009223776107, | |
| "learning_rate": 3.3004207921835004e-07, | |
| "loss": 0.526, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.5260886992405347, | |
| "learning_rate": 3.210049520547388e-07, | |
| "loss": 0.5278, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.49827609520509064, | |
| "learning_rate": 3.1208732546590843e-07, | |
| "loss": 0.5269, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 0.5199185251610714, | |
| "learning_rate": 3.0328953467147543e-07, | |
| "loss": 0.5125, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 0.5165139482645277, | |
| "learning_rate": 2.946119103863483e-07, | |
| "loss": 0.5095, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 0.48760733590102007, | |
| "learning_rate": 2.86054778808296e-07, | |
| "loss": 0.5262, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.49481920675979196, | |
| "learning_rate": 2.7761846160568403e-07, | |
| "loss": 0.5209, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.5017608349952136, | |
| "learning_rate": 2.69303275905384e-07, | |
| "loss": 0.5137, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 0.5222144874040826, | |
| "learning_rate": 2.611095342808526e-07, | |
| "loss": 0.5162, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.4928255848647095, | |
| "learning_rate": 2.530375447403815e-07, | |
| "loss": 0.5176, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 0.530457616289496, | |
| "learning_rate": 2.4508761071551906e-07, | |
| "loss": 0.5181, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 0.5147706319208548, | |
| "learning_rate": 2.3726003104966393e-07, | |
| "loss": 0.5095, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.523763253857449, | |
| "learning_rate": 2.2955509998683214e-07, | |
| "loss": 0.5108, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 0.5323084690421006, | |
| "learning_rate": 2.2197310716059603e-07, | |
| "loss": 0.511, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 0.5088461348117514, | |
| "learning_rate": 2.1451433758319543e-07, | |
| "loss": 0.5265, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 0.5478220673331649, | |
| "learning_rate": 2.0717907163482507e-07, | |
| "loss": 0.5112, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 0.5414027895276027, | |
| "learning_rate": 1.9996758505309593e-07, | |
| "loss": 0.5231, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.4983898932091525, | |
| "learning_rate": 1.9288014892266753e-07, | |
| "loss": 0.5105, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.5093531347734784, | |
| "learning_rate": 1.8591702966505952e-07, | |
| "loss": 0.5127, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 0.677948629367298, | |
| "learning_rate": 1.790784890286352e-07, | |
| "loss": 0.5219, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 0.5010683504531009, | |
| "learning_rate": 1.7236478407876555e-07, | |
| "loss": 0.5054, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 0.5179768835662841, | |
| "learning_rate": 1.6577616718816123e-07, | |
| "loss": 0.5251, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 0.5087954420227027, | |
| "learning_rate": 1.5931288602738958e-07, | |
| "loss": 0.5137, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.5083448366233918, | |
| "learning_rate": 1.5297518355556132e-07, | |
| "loss": 0.5059, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.5170972166302202, | |
| "learning_rate": 1.467632980112023e-07, | |
| "loss": 0.5214, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.5145933451855358, | |
| "learning_rate": 1.406774629032923e-07, | |
| "loss": 0.511, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.5012480980422283, | |
| "learning_rate": 1.347179070024903e-07, | |
| "loss": 0.5179, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 0.5157422802936725, | |
| "learning_rate": 1.2888485433253521e-07, | |
| "loss": 0.5193, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 0.5104197669978088, | |
| "learning_rate": 1.2317852416182378e-07, | |
| "loss": 0.5221, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 0.48689303934415246, | |
| "learning_rate": 1.1759913099516816e-07, | |
| "loss": 0.5118, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.5105879788600957, | |
| "learning_rate": 1.1214688456573247e-07, | |
| "loss": 0.5178, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.4742285263986786, | |
| "learning_rate": 1.0682198982714814e-07, | |
| "loss": 0.534, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 0.5096564376650945, | |
| "learning_rate": 1.0162464694581235e-07, | |
| "loss": 0.5272, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 0.5068212494030221, | |
| "learning_rate": 9.65550512933605e-08, | |
| "loss": 0.5252, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.5132873703711879, | |
| "learning_rate": 9.16133934393224e-08, | |
| "loss": 0.5161, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.496214740845792, | |
| "learning_rate": 8.67998591439612e-08, | |
| "loss": 0.518, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 0.5257117991696062, | |
| "learning_rate": 8.21146293512876e-08, | |
| "loss": 0.5201, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.5038613162646833, | |
| "learning_rate": 7.755788018225961e-08, | |
| "loss": 0.5439, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.5108263338716986, | |
| "learning_rate": 7.31297829281617e-08, | |
| "loss": 0.5132, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 0.5149498952477054, | |
| "learning_rate": 6.883050404416552e-08, | |
| "loss": 0.5111, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 0.5047749285108949, | |
| "learning_rate": 6.46602051430732e-08, | |
| "loss": 0.5307, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 0.5156795764243357, | |
| "learning_rate": 6.061904298924253e-08, | |
| "loss": 0.5285, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 0.5144201053701509, | |
| "learning_rate": 5.670716949269278e-08, | |
| "loss": 0.5148, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 0.507394331507882, | |
| "learning_rate": 5.2924731703395564e-08, | |
| "loss": 0.5206, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.48368946217469994, | |
| "learning_rate": 4.927187180574666e-08, | |
| "loss": 0.526, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.5047554925764675, | |
| "learning_rate": 4.574872711322103e-08, | |
| "loss": 0.5126, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.4949463708763226, | |
| "learning_rate": 4.2355430063211405e-08, | |
| "loss": 0.5204, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 0.5079311960306774, | |
| "learning_rate": 3.909210821205017e-08, | |
| "loss": 0.5189, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.4902741464423996, | |
| "learning_rate": 3.595888423021354e-08, | |
| "loss": 0.513, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.5421885848655773, | |
| "learning_rate": 3.295587589771071e-08, | |
| "loss": 0.5093, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 0.49756539244831294, | |
| "learning_rate": 3.008319609965676e-08, | |
| "loss": 0.5144, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.5074328229331989, | |
| "learning_rate": 2.734095282202942e-08, | |
| "loss": 0.5133, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.49772891591572227, | |
| "learning_rate": 2.4729249147608378e-08, | |
| "loss": 0.5251, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 0.5088034449477752, | |
| "learning_rate": 2.224818325210237e-08, | |
| "loss": 0.5175, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 0.4826584150965653, | |
| "learning_rate": 1.9897848400456496e-08, | |
| "loss": 0.5141, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.5172662799041124, | |
| "learning_rate": 1.7678332943348807e-08, | |
| "loss": 0.5197, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.48940063691629393, | |
| "learning_rate": 1.5589720313866794e-08, | |
| "loss": 0.5059, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.517098264403305, | |
| "learning_rate": 1.3632089024371574e-08, | |
| "loss": 0.5141, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.48979313956431636, | |
| "learning_rate": 1.1805512663549345e-08, | |
| "loss": 0.5136, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.48660701715860905, | |
| "learning_rate": 1.0110059893640055e-08, | |
| "loss": 0.5212, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 0.4841422308843411, | |
| "learning_rate": 8.54579444786152e-09, | |
| "loss": 0.5228, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 0.4851052180007293, | |
| "learning_rate": 7.112775128009174e-09, | |
| "loss": 0.5146, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.49400550323274894, | |
| "learning_rate": 5.811055802249721e-09, | |
| "loss": 0.5277, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.512928633478054, | |
| "learning_rate": 4.640685403093147e-09, | |
| "loss": 0.5216, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 0.48929257944769156, | |
| "learning_rate": 3.6017079255547534e-09, | |
| "loss": 0.5172, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.5049424795736568, | |
| "learning_rate": 2.6941624255001904e-09, | |
| "loss": 0.5147, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.5046094240590331, | |
| "learning_rate": 1.9180830181797505e-09, | |
| "loss": 0.5222, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 0.5035868811303936, | |
| "learning_rate": 1.273498876942558e-09, | |
| "loss": 0.511, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.48822795586505313, | |
| "learning_rate": 7.604342321435032e-10, | |
| "loss": 0.5222, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.5199444169728372, | |
| "learning_rate": 3.789083702293028e-10, | |
| "loss": 0.5236, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 0.528390397947652, | |
| "learning_rate": 1.2893563301535904e-10, | |
| "loss": 0.5187, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.49922315857737276, | |
| "learning_rate": 1.0525417146023396e-11, | |
| "loss": 0.5179, | |
| "step": 5620 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5624, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "total_flos": 2354981319475200.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |