{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.3932878867330886,
  "eval_steps": 500,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00026219192448872575,
      "grad_norm": 22.20619010925293,
      "learning_rate": 0.0,
      "loss": 10.5131,
      "step": 1
    },
    {
      "epoch": 0.0026219192448872575,
      "grad_norm": 22.429588317871094,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 10.4662,
      "step": 10
    },
    {
      "epoch": 0.005243838489774515,
      "grad_norm": 22.83245086669922,
      "learning_rate": 9.5e-05,
      "loss": 10.1612,
      "step": 20
    },
    {
      "epoch": 0.007865757734661772,
      "grad_norm": 23.247602462768555,
      "learning_rate": 0.000145,
      "loss": 9.5256,
      "step": 30
    },
    {
      "epoch": 0.01048767697954903,
      "grad_norm": 23.51291275024414,
      "learning_rate": 0.00019500000000000002,
      "loss": 8.5708,
      "step": 40
    },
    {
      "epoch": 0.013109596224436287,
      "grad_norm": 22.496492385864258,
      "learning_rate": 0.000245,
      "loss": 7.3388,
      "step": 50
    },
    {
      "epoch": 0.015731515469323543,
      "grad_norm": 16.345460891723633,
      "learning_rate": 0.000295,
      "loss": 5.9703,
      "step": 60
    },
    {
      "epoch": 0.018353434714210803,
      "grad_norm": 3.921259880065918,
      "learning_rate": 0.000345,
      "loss": 4.9478,
      "step": 70
    },
    {
      "epoch": 0.02097535395909806,
      "grad_norm": 7.0385589599609375,
      "learning_rate": 0.000395,
      "loss": 4.6803,
      "step": 80
    },
    {
      "epoch": 0.023597273203985317,
      "grad_norm": 2.6207873821258545,
      "learning_rate": 0.00044500000000000003,
      "loss": 4.4974,
      "step": 90
    },
    {
      "epoch": 0.026219192448872573,
      "grad_norm": 1.9961260557174683,
      "learning_rate": 0.000495,
      "loss": 4.3314,
      "step": 100
    },
    {
      "epoch": 0.028841111693759833,
      "grad_norm": 1.6183704137802124,
      "learning_rate": 0.000545,
      "loss": 4.1959,
      "step": 110
    },
    {
      "epoch": 0.03146303093864709,
      "grad_norm": 1.331021785736084,
      "learning_rate": 0.0005949999999999999,
      "loss": 4.0158,
      "step": 120
    },
    {
      "epoch": 0.03408495018353435,
      "grad_norm": 1.14554762840271,
      "learning_rate": 0.0006450000000000001,
      "loss": 3.9321,
      "step": 130
    },
    {
      "epoch": 0.03670686942842161,
      "grad_norm": 0.9175837635993958,
      "learning_rate": 0.000695,
      "loss": 3.802,
      "step": 140
    },
    {
      "epoch": 0.03932878867330886,
      "grad_norm": 0.7335033416748047,
      "learning_rate": 0.000745,
      "loss": 3.6618,
      "step": 150
    },
    {
      "epoch": 0.04195070791819612,
      "grad_norm": 0.5916274785995483,
      "learning_rate": 0.000795,
      "loss": 3.5341,
      "step": 160
    },
    {
      "epoch": 0.04457262716308338,
      "grad_norm": 0.4947799742221832,
      "learning_rate": 0.0008449999999999999,
      "loss": 3.5311,
      "step": 170
    },
    {
      "epoch": 0.04719454640797063,
      "grad_norm": 0.40263015031814575,
      "learning_rate": 0.0008950000000000001,
      "loss": 3.4709,
      "step": 180
    },
    {
      "epoch": 0.04981646565285789,
      "grad_norm": 0.32677406072616577,
      "learning_rate": 0.000945,
      "loss": 3.2973,
      "step": 190
    },
    {
      "epoch": 0.05243838489774515,
      "grad_norm": 0.3071628212928772,
      "learning_rate": 0.000995,
      "loss": 3.28,
      "step": 200
    },
    {
      "epoch": 0.05506030414263241,
      "grad_norm": 0.3233015835285187,
      "learning_rate": 0.001045,
      "loss": 3.2038,
      "step": 210
    },
    {
      "epoch": 0.05768222338751967,
      "grad_norm": 0.39402100443840027,
      "learning_rate": 0.001095,
      "loss": 3.1627,
      "step": 220
    },
    {
      "epoch": 0.060304142632406924,
      "grad_norm": 0.5528343915939331,
      "learning_rate": 0.001145,
      "loss": 3.1341,
      "step": 230
    },
    {
      "epoch": 0.06292606187729417,
      "grad_norm": 0.4888489842414856,
      "learning_rate": 0.001195,
      "loss": 3.0192,
      "step": 240
    },
    {
      "epoch": 0.06554798112218144,
      "grad_norm": 0.5662292838096619,
      "learning_rate": 0.0012450000000000002,
      "loss": 2.991,
      "step": 250
    },
    {
      "epoch": 0.0681699003670687,
      "grad_norm": 0.5800466537475586,
      "learning_rate": 0.001295,
      "loss": 2.992,
      "step": 260
    },
    {
      "epoch": 0.07079181961195595,
      "grad_norm": 0.5511091947555542,
      "learning_rate": 0.001345,
      "loss": 2.9246,
      "step": 270
    },
    {
      "epoch": 0.07341373885684321,
      "grad_norm": 0.7486537098884583,
      "learning_rate": 0.001395,
      "loss": 2.8996,
      "step": 280
    },
    {
      "epoch": 0.07603565810173046,
      "grad_norm": 0.6995801329612732,
      "learning_rate": 0.001445,
      "loss": 2.7945,
      "step": 290
    },
    {
      "epoch": 0.07865757734661773,
      "grad_norm": 0.7938666939735413,
      "learning_rate": 0.0014950000000000002,
      "loss": 2.7632,
      "step": 300
    },
    {
      "epoch": 0.08127949659150498,
      "grad_norm": 0.7555065155029297,
      "learning_rate": 0.001545,
      "loss": 2.7513,
      "step": 310
    },
    {
      "epoch": 0.08390141583639224,
      "grad_norm": 0.7714865803718567,
      "learning_rate": 0.001595,
      "loss": 2.6165,
      "step": 320
    },
    {
      "epoch": 0.08652333508127949,
      "grad_norm": 0.7604843974113464,
      "learning_rate": 0.001645,
      "loss": 2.6391,
      "step": 330
    },
    {
      "epoch": 0.08914525432616675,
      "grad_norm": 0.7840315699577332,
      "learning_rate": 0.0016950000000000001,
      "loss": 2.5818,
      "step": 340
    },
    {
      "epoch": 0.09176717357105402,
      "grad_norm": 1.0126832723617554,
      "learning_rate": 0.0017450000000000002,
      "loss": 2.5417,
      "step": 350
    },
    {
      "epoch": 0.09438909281594127,
      "grad_norm": 1.0092129707336426,
      "learning_rate": 0.001795,
      "loss": 2.4844,
      "step": 360
    },
    {
      "epoch": 0.09701101206082853,
      "grad_norm": 1.1585489511489868,
      "learning_rate": 0.001845,
      "loss": 2.4645,
      "step": 370
    },
    {
      "epoch": 0.09963293130571578,
      "grad_norm": 1.0778034925460815,
      "learning_rate": 0.001895,
      "loss": 2.4003,
      "step": 380
    },
    {
      "epoch": 0.10225485055060304,
      "grad_norm": 1.146636962890625,
      "learning_rate": 0.0019450000000000001,
      "loss": 2.3466,
      "step": 390
    },
    {
      "epoch": 0.1048767697954903,
      "grad_norm": 0.9742526412010193,
      "learning_rate": 0.0019950000000000002,
      "loss": 2.3088,
      "step": 400
    },
    {
      "epoch": 0.10749868904037756,
      "grad_norm": 1.3035728931427002,
      "learning_rate": 0.0019999657054386192,
      "loss": 2.2834,
      "step": 410
    },
    {
      "epoch": 0.11012060828526482,
      "grad_norm": 1.0689384937286377,
      "learning_rate": 0.0019998471593574603,
      "loss": 2.2473,
      "step": 420
    },
    {
      "epoch": 0.11274252753015207,
      "grad_norm": 1.1519441604614258,
      "learning_rate": 0.001999643948402709,
      "loss": 2.1925,
      "step": 430
    },
    {
      "epoch": 0.11536444677503933,
      "grad_norm": 0.9427940249443054,
      "learning_rate": 0.0019993560897818255,
      "loss": 2.1774,
      "step": 440
    },
    {
      "epoch": 0.11798636601992658,
      "grad_norm": 0.9017934203147888,
      "learning_rate": 0.0019989836078700496,
      "loss": 2.152,
      "step": 450
    },
    {
      "epoch": 0.12060828526481385,
      "grad_norm": 1.018966555595398,
      "learning_rate": 0.001998526534208335,
      "loss": 2.0825,
      "step": 460
    },
    {
      "epoch": 0.1232302045097011,
      "grad_norm": 1.0533466339111328,
      "learning_rate": 0.0019979849075006813,
      "loss": 2.1358,
      "step": 470
    },
    {
      "epoch": 0.12585212375458835,
      "grad_norm": 0.941605806350708,
      "learning_rate": 0.001997358773610856,
      "loss": 2.0524,
      "step": 480
    },
    {
      "epoch": 0.12847404299947562,
      "grad_norm": 0.8877449035644531,
      "learning_rate": 0.0019966481855585075,
      "loss": 2.0308,
      "step": 490
    },
    {
      "epoch": 0.13109596224436287,
      "grad_norm": 0.8652307391166687,
      "learning_rate": 0.001995853203514682,
      "loss": 2.012,
      "step": 500
    },
    {
      "epoch": 0.13371788148925012,
      "grad_norm": 0.8943641781806946,
      "learning_rate": 0.0019949738947967217,
      "loss": 1.9729,
      "step": 510
    },
    {
      "epoch": 0.1363398007341374,
      "grad_norm": 0.9359736442565918,
      "learning_rate": 0.001994010333862568,
      "loss": 1.9997,
      "step": 520
    },
    {
      "epoch": 0.13896171997902465,
      "grad_norm": 1.0085017681121826,
      "learning_rate": 0.001992962602304456,
      "loss": 1.937,
      "step": 530
    },
    {
      "epoch": 0.1415836392239119,
      "grad_norm": 0.7549618482589722,
      "learning_rate": 0.0019918307888420065,
      "loss": 1.9268,
      "step": 540
    },
    {
      "epoch": 0.14420555846879915,
      "grad_norm": 0.8932085037231445,
      "learning_rate": 0.0019906149893147104,
      "loss": 1.9014,
      "step": 550
    },
    {
      "epoch": 0.14682747771368643,
      "grad_norm": 0.8130724430084229,
      "learning_rate": 0.001989315306673817,
      "loss": 1.8577,
      "step": 560
    },
    {
      "epoch": 0.14944939695857368,
      "grad_norm": 0.8497139811515808,
      "learning_rate": 0.0019879318509736137,
      "loss": 1.8185,
      "step": 570
    },
    {
      "epoch": 0.15207131620346093,
      "grad_norm": 0.6299962997436523,
      "learning_rate": 0.001986464739362106,
      "loss": 1.811,
      "step": 580
    },
    {
      "epoch": 0.1546932354483482,
      "grad_norm": 0.7180768251419067,
      "learning_rate": 0.0019849140960711024,
      "loss": 1.7944,
      "step": 590
    },
    {
      "epoch": 0.15731515469323545,
      "grad_norm": 0.8082334399223328,
      "learning_rate": 0.0019832800524056888,
      "loss": 1.8333,
      "step": 600
    },
    {
      "epoch": 0.1599370739381227,
      "grad_norm": 0.8284159302711487,
      "learning_rate": 0.0019815627467331142,
      "loss": 1.811,
      "step": 610
    },
    {
      "epoch": 0.16255899318300995,
      "grad_norm": 0.7332941293716431,
      "learning_rate": 0.0019797623244710715,
      "loss": 1.7704,
      "step": 620
    },
    {
      "epoch": 0.16518091242789723,
      "grad_norm": 0.7234723567962646,
      "learning_rate": 0.0019778789380753862,
      "loss": 1.7558,
      "step": 630
    },
    {
      "epoch": 0.16780283167278448,
      "grad_norm": 0.693242073059082,
      "learning_rate": 0.001975912747027104,
      "loss": 1.742,
      "step": 640
    },
    {
      "epoch": 0.17042475091767173,
      "grad_norm": 0.8523733019828796,
      "learning_rate": 0.0019738639178189885,
      "loss": 1.7438,
      "step": 650
    },
    {
      "epoch": 0.17304667016255898,
      "grad_norm": 0.7505561709403992,
      "learning_rate": 0.001971732623941422,
      "loss": 1.7251,
      "step": 660
    },
    {
      "epoch": 0.17566858940744626,
      "grad_norm": 0.7338821887969971,
      "learning_rate": 0.0019695190458677144,
      "loss": 1.7281,
      "step": 670
    },
    {
      "epoch": 0.1782905086523335,
      "grad_norm": 0.8278585076332092,
      "learning_rate": 0.001967223371038823,
      "loss": 1.6983,
      "step": 680
    },
    {
      "epoch": 0.18091242789722076,
      "grad_norm": 0.6785498261451721,
      "learning_rate": 0.0019648457938474776,
      "loss": 1.7018,
      "step": 690
    },
    {
      "epoch": 0.18353434714210803,
      "grad_norm": 0.7954968810081482,
      "learning_rate": 0.0019623865156217215,
      "loss": 1.6978,
      "step": 700
    },
    {
      "epoch": 0.18615626638699528,
      "grad_norm": 0.6877925992012024,
      "learning_rate": 0.001959845744607864,
      "loss": 1.6693,
      "step": 710
    },
    {
      "epoch": 0.18877818563188253,
      "grad_norm": 0.6183112859725952,
      "learning_rate": 0.001957223695952844,
      "loss": 1.656,
      "step": 720
    },
    {
      "epoch": 0.19140010487676978,
      "grad_norm": 0.6864896416664124,
      "learning_rate": 0.0019545205916860152,
      "loss": 1.6188,
      "step": 730
    },
    {
      "epoch": 0.19402202412165706,
      "grad_norm": 0.6678555011749268,
      "learning_rate": 0.0019517366607003429,
      "loss": 1.6195,
      "step": 740
    },
    {
      "epoch": 0.1966439433665443,
      "grad_norm": 0.724320113658905,
      "learning_rate": 0.0019488721387330222,
      "loss": 1.6067,
      "step": 750
    },
    {
      "epoch": 0.19926586261143156,
      "grad_norm": 0.6665757298469543,
      "learning_rate": 0.0019459272683455162,
      "loss": 1.5781,
      "step": 760
    },
    {
      "epoch": 0.20188778185631884,
      "grad_norm": 0.7139772772789001,
      "learning_rate": 0.0019429022989030176,
      "loss": 1.5647,
      "step": 770
    },
    {
      "epoch": 0.2045097011012061,
      "grad_norm": 0.6505457758903503,
      "learning_rate": 0.0019397974865533315,
      "loss": 1.5869,
      "step": 780
    },
    {
      "epoch": 0.20713162034609334,
      "grad_norm": 0.6815754175186157,
      "learning_rate": 0.001936613094205186,
      "loss": 1.5848,
      "step": 790
    },
    {
      "epoch": 0.2097535395909806,
      "grad_norm": 0.6977171897888184,
      "learning_rate": 0.00193334939150597,
      "loss": 1.5284,
      "step": 800
    },
    {
      "epoch": 0.21237545883586786,
      "grad_norm": 0.5965753197669983,
      "learning_rate": 0.0019300066548188998,
      "loss": 1.5468,
      "step": 810
    },
    {
      "epoch": 0.2149973780807551,
      "grad_norm": 0.596052885055542,
      "learning_rate": 0.001926585167199616,
      "loss": 1.5579,
      "step": 820
    },
    {
      "epoch": 0.21761929732564236,
      "grad_norm": 0.6821017861366272,
      "learning_rate": 0.001923085218372218,
      "loss": 1.4984,
      "step": 830
    },
    {
      "epoch": 0.22024121657052964,
      "grad_norm": 0.6523297429084778,
      "learning_rate": 0.0019195071047047277,
      "loss": 1.537,
      "step": 840
    },
    {
      "epoch": 0.2228631358154169,
      "grad_norm": 0.648935079574585,
      "learning_rate": 0.0019158511291839945,
      "loss": 1.5192,
      "step": 850
    },
    {
      "epoch": 0.22548505506030414,
      "grad_norm": 0.6102792620658875,
      "learning_rate": 0.0019121176013900407,
      "loss": 1.5209,
      "step": 860
    },
    {
      "epoch": 0.2281069743051914,
      "grad_norm": 0.6573307514190674,
      "learning_rate": 0.0019083068374698448,
      "loss": 1.49,
      "step": 870
    },
    {
      "epoch": 0.23072889355007867,
      "grad_norm": 0.6355723738670349,
      "learning_rate": 0.0019044191601105727,
      "loss": 1.4929,
      "step": 880
    },
    {
      "epoch": 0.23335081279496592,
      "grad_norm": 0.5931225419044495,
      "learning_rate": 0.0019004548985122511,
      "loss": 1.4813,
      "step": 890
    },
    {
      "epoch": 0.23597273203985317,
      "grad_norm": 0.6640650629997253,
      "learning_rate": 0.0018964143883598936,
      "loss": 1.4808,
      "step": 900
    },
    {
      "epoch": 0.23859465128474042,
      "grad_norm": 0.6377866268157959,
      "learning_rate": 0.0018922979717950748,
      "loss": 1.4901,
      "step": 910
    },
    {
      "epoch": 0.2412165705296277,
      "grad_norm": 0.6502982378005981,
      "learning_rate": 0.0018881059973869581,
      "loss": 1.4501,
      "step": 920
    },
    {
      "epoch": 0.24383848977451494,
      "grad_norm": 0.602969765663147,
      "learning_rate": 0.0018838388201027805,
      "loss": 1.4661,
      "step": 930
    },
    {
      "epoch": 0.2464604090194022,
      "grad_norm": 0.6061879396438599,
      "learning_rate": 0.001879496801277794,
      "loss": 1.4408,
      "step": 940
    },
    {
      "epoch": 0.24908232826428947,
      "grad_norm": 0.8049127459526062,
      "learning_rate": 0.001875080308584669,
      "loss": 1.4466,
      "step": 950
    },
    {
      "epoch": 0.2517042475091767,
      "grad_norm": 0.46771517395973206,
      "learning_rate": 0.00187058971600236,
      "loss": 1.4382,
      "step": 960
    },
    {
      "epoch": 0.254326166754064,
      "grad_norm": 0.6081333756446838,
      "learning_rate": 0.001866025403784439,
      "loss": 1.4518,
      "step": 970
    },
    {
      "epoch": 0.25694808599895125,
      "grad_norm": 0.6247040033340454,
      "learning_rate": 0.0018613877584268944,
      "loss": 1.4639,
      "step": 980
    },
    {
      "epoch": 0.2595700052438385,
      "grad_norm": 0.5699506998062134,
      "learning_rate": 0.0018566771726354063,
      "loss": 1.4218,
      "step": 990
    },
    {
      "epoch": 0.26219192448872575,
      "grad_norm": 0.5360729694366455,
      "learning_rate": 0.0018518940452920906,
      "loss": 1.4189,
      "step": 1000
    },
    {
      "epoch": 0.264813843733613,
      "grad_norm": 0.5921474695205688,
      "learning_rate": 0.0018470387814217232,
      "loss": 1.424,
      "step": 1010
    },
    {
      "epoch": 0.26743576297850025,
      "grad_norm": 0.6162559986114502,
      "learning_rate": 0.0018421117921574438,
      "loss": 1.4307,
      "step": 1020
    },
    {
      "epoch": 0.2700576822233875,
      "grad_norm": 0.5530286431312561,
      "learning_rate": 0.001837113494705942,
      "loss": 1.4158,
      "step": 1030
    },
    {
      "epoch": 0.2726796014682748,
      "grad_norm": 0.5585499405860901,
      "learning_rate": 0.0018320443123121283,
      "loss": 1.3861,
      "step": 1040
    },
    {
      "epoch": 0.27530152071316205,
      "grad_norm": 0.6225973963737488,
      "learning_rate": 0.0018269046742232966,
      "loss": 1.3942,
      "step": 1050
    },
    {
      "epoch": 0.2779234399580493,
      "grad_norm": 0.49642321467399597,
      "learning_rate": 0.0018216950156527737,
      "loss": 1.3912,
      "step": 1060
    },
    {
      "epoch": 0.28054535920293655,
      "grad_norm": 0.6089576482772827,
      "learning_rate": 0.0018164157777430681,
      "loss": 1.3732,
      "step": 1070
    },
    {
      "epoch": 0.2831672784478238,
      "grad_norm": 0.5753847360610962,
      "learning_rate": 0.0018110674075285157,
      "loss": 1.398,
      "step": 1080
    },
    {
      "epoch": 0.28578919769271105,
      "grad_norm": 0.5357734560966492,
      "learning_rate": 0.0018056503578974242,
      "loss": 1.3851,
      "step": 1090
    },
    {
      "epoch": 0.2884111169375983,
      "grad_norm": 0.5319791436195374,
      "learning_rate": 0.001800165087553724,
      "loss": 1.3804,
      "step": 1100
    },
    {
      "epoch": 0.2910330361824856,
      "grad_norm": 0.5765709280967712,
      "learning_rate": 0.0017946120609781276,
      "loss": 1.3534,
      "step": 1110
    },
    {
      "epoch": 0.29365495542737285,
      "grad_norm": 0.48765453696250916,
      "learning_rate": 0.001788991748388796,
      "loss": 1.3693,
      "step": 1120
    },
    {
      "epoch": 0.2962768746722601,
      "grad_norm": 0.5916075110435486,
      "learning_rate": 0.001783304625701524,
      "loss": 1.3697,
      "step": 1130
    },
    {
      "epoch": 0.29889879391714735,
      "grad_norm": 0.411699503660202,
      "learning_rate": 0.0017775511744894384,
      "loss": 1.3588,
      "step": 1140
    },
    {
      "epoch": 0.3015207131620346,
      "grad_norm": 0.5155631899833679,
      "learning_rate": 0.0017717318819422214,
      "loss": 1.3697,
      "step": 1150
    },
    {
      "epoch": 0.30414263240692185,
      "grad_norm": 0.5687488913536072,
      "learning_rate": 0.0017658472408248551,
      "loss": 1.3558,
      "step": 1160
    },
    {
      "epoch": 0.3067645516518091,
      "grad_norm": 0.5609891414642334,
      "learning_rate": 0.0017598977494358967,
      "loss": 1.3376,
      "step": 1170
    },
    {
      "epoch": 0.3093864708966964,
      "grad_norm": 0.5137512683868408,
      "learning_rate": 0.0017538839115652817,
      "loss": 1.3534,
      "step": 1180
    },
    {
      "epoch": 0.31200839014158366,
      "grad_norm": 0.5840641260147095,
      "learning_rate": 0.001747806236451666,
      "loss": 1.3394,
      "step": 1190
    },
    {
      "epoch": 0.3146303093864709,
      "grad_norm": 0.5758949518203735,
      "learning_rate": 0.0017416652387393027,
      "loss": 1.3417,
      "step": 1200
    },
    {
      "epoch": 0.31725222863135816,
      "grad_norm": 0.5121742486953735,
      "learning_rate": 0.0017354614384344658,
      "loss": 1.341,
      "step": 1210
    },
    {
      "epoch": 0.3198741478762454,
      "grad_norm": 0.5056650638580322,
      "learning_rate": 0.001729195360861414,
      "loss": 1.316,
      "step": 1220
    },
    {
      "epoch": 0.32249606712113266,
      "grad_norm": 0.4782615602016449,
      "learning_rate": 0.0017228675366179106,
      "loss": 1.3226,
      "step": 1230
    },
    {
      "epoch": 0.3251179863660199,
      "grad_norm": 0.49403342604637146,
      "learning_rate": 0.0017164785015302906,
      "loss": 1.37,
      "step": 1240
    },
    {
      "epoch": 0.3277399056109072,
      "grad_norm": 0.4836321175098419,
      "learning_rate": 0.0017100287966080906,
      "loss": 1.3272,
      "step": 1250
    },
    {
      "epoch": 0.33036182485579446,
      "grad_norm": 0.48174890875816345,
      "learning_rate": 0.001703518967998236,
      "loss": 1.3148,
      "step": 1260
    },
    {
      "epoch": 0.3329837441006817,
      "grad_norm": 0.4627121090888977,
      "learning_rate": 0.001696949566938795,
      "loss": 1.3161,
      "step": 1270
    },
    {
      "epoch": 0.33560566334556896,
      "grad_norm": 0.470414936542511,
      "learning_rate": 0.0016903211497123003,
      "loss": 1.3313,
      "step": 1280
    },
    {
      "epoch": 0.3382275825904562,
      "grad_norm": 0.4437310993671417,
      "learning_rate": 0.0016836342775986446,
      "loss": 1.3073,
      "step": 1290
    },
    {
      "epoch": 0.34084950183534346,
      "grad_norm": 0.47688329219818115,
      "learning_rate": 0.0016768895168275534,
      "loss": 1.3128,
      "step": 1300
    },
    {
      "epoch": 0.3434714210802307,
      "grad_norm": 0.5143507122993469,
      "learning_rate": 0.0016700874385306363,
      "loss": 1.3357,
      "step": 1310
    },
    {
      "epoch": 0.34609334032511796,
      "grad_norm": 0.4100657105445862,
      "learning_rate": 0.0016632286186930275,
      "loss": 1.3061,
      "step": 1320
    },
    {
      "epoch": 0.34871525957000526,
      "grad_norm": 0.4421868920326233,
      "learning_rate": 0.0016563136381046088,
      "loss": 1.3158,
      "step": 1330
    },
    {
      "epoch": 0.3513371788148925,
      "grad_norm": 0.4668099582195282,
      "learning_rate": 0.0016493430823108332,
      "loss": 1.3088,
      "step": 1340
    },
    {
      "epoch": 0.35395909805977976,
      "grad_norm": 0.5451709032058716,
      "learning_rate": 0.0016423175415631404,
      "loss": 1.3344,
      "step": 1350
    },
    {
      "epoch": 0.356581017304667,
      "grad_norm": 0.45294106006622314,
      "learning_rate": 0.0016352376107689754,
      "loss": 1.2778,
      "step": 1360
    },
    {
      "epoch": 0.35920293654955426,
      "grad_norm": 0.4404051601886749,
      "learning_rate": 0.0016281038894414143,
      "loss": 1.2871,
      "step": 1370
    },
    {
      "epoch": 0.3618248557944415,
      "grad_norm": 0.45863279700279236,
      "learning_rate": 0.0016209169816483971,
      "loss": 1.3286,
      "step": 1380
    },
    {
      "epoch": 0.36444677503932876,
      "grad_norm": 0.45011425018310547,
      "learning_rate": 0.0016136774959615784,
      "loss": 1.2979,
      "step": 1390
    },
    {
      "epoch": 0.36706869428421607,
      "grad_norm": 0.5113876461982727,
      "learning_rate": 0.0016063860454047943,
      "loss": 1.3088,
      "step": 1400
    },
    {
      "epoch": 0.3696906135291033,
      "grad_norm": 0.40740302205085754,
      "learning_rate": 0.001599043247402151,
      "loss": 1.2703,
      "step": 1410
    },
    {
      "epoch": 0.37231253277399057,
      "grad_norm": 0.4261358976364136,
      "learning_rate": 0.0015916497237257455,
      "loss": 1.2681,
      "step": 1420
    },
    {
      "epoch": 0.3749344520188778,
      "grad_norm": 0.4349290132522583,
      "learning_rate": 0.0015842061004430145,
      "loss": 1.317,
      "step": 1430
    },
    {
      "epoch": 0.37755637126376507,
      "grad_norm": 0.4363626539707184,
      "learning_rate": 0.0015767130078637183,
      "loss": 1.2707,
      "step": 1440
    },
    {
      "epoch": 0.3801782905086523,
      "grad_norm": 0.41238006949424744,
      "learning_rate": 0.0015691710804865706,
      "loss": 1.2763,
      "step": 1450
    },
    {
      "epoch": 0.38280020975353957,
      "grad_norm": 0.476226270198822,
      "learning_rate": 0.0015615809569455089,
      "loss": 1.3037,
      "step": 1460
    },
    {
      "epoch": 0.38542212899842687,
      "grad_norm": 0.45900896191596985,
      "learning_rate": 0.0015539432799556159,
      "loss": 1.287,
      "step": 1470
    },
    {
      "epoch": 0.3880440482433141,
      "grad_norm": 0.3873949348926544,
      "learning_rate": 0.0015462586962586972,
      "loss": 1.2793,
      "step": 1480
    },
    {
      "epoch": 0.39066596748820137,
      "grad_norm": 0.4380306601524353,
      "learning_rate": 0.001538527856568515,
      "loss": 1.2916,
      "step": 1490
    },
    {
      "epoch": 0.3932878867330886,
      "grad_norm": 0.39479300379753113,
      "learning_rate": 0.0015307514155156895,
      "loss": 1.272,
      "step": 1500
    }
  ],
  "logging_steps": 10,
  "max_steps": 3814,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0095396499845284e+18,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}