{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 8661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034644032565390613, "grad_norm": 24.105030711898273, "learning_rate": 5.190311418685121e-07, "loss": 2.8149, "step": 10 }, { "epoch": 0.0069288065130781226, "grad_norm": 13.101411602211778, "learning_rate": 1.0957324106113035e-06, "loss": 2.6533, "step": 20 }, { "epoch": 0.010393209769617183, "grad_norm": 11.8076022930929, "learning_rate": 1.6724336793540945e-06, "loss": 2.4007, "step": 30 }, { "epoch": 0.013857613026156245, "grad_norm": 4.7580336362411515, "learning_rate": 2.249134948096886e-06, "loss": 1.8865, "step": 40 }, { "epoch": 0.017322016282695307, "grad_norm": 2.9114980071240395, "learning_rate": 2.825836216839677e-06, "loss": 1.4207, "step": 50 }, { "epoch": 0.020786419539234366, "grad_norm": 1.7324565160527228, "learning_rate": 3.402537485582469e-06, "loss": 1.0113, "step": 60 }, { "epoch": 0.024250822795773428, "grad_norm": 2.1155590344472155, "learning_rate": 3.9792387543252595e-06, "loss": 0.7313, "step": 70 }, { "epoch": 0.02771522605231249, "grad_norm": 0.6681966791277029, "learning_rate": 4.555940023068051e-06, "loss": 0.5804, "step": 80 }, { "epoch": 0.03117962930885155, "grad_norm": 2.046887683182866, "learning_rate": 5.132641291810842e-06, "loss": 0.5134, "step": 90 }, { "epoch": 0.034644032565390614, "grad_norm": 1.0043907474739053, "learning_rate": 5.709342560553633e-06, "loss": 0.4659, "step": 100 }, { "epoch": 0.03810843582192967, "grad_norm": 0.6198895627168727, "learning_rate": 6.286043829296424e-06, "loss": 0.4432, "step": 110 }, { "epoch": 0.04157283907846873, "grad_norm": 0.5789693949791498, "learning_rate": 6.862745098039216e-06, "loss": 0.4208, "step": 120 }, { "epoch": 0.0450372423350078, "grad_norm": 0.5151956509234089, "learning_rate": 7.439446366782007e-06, "loss": 0.4103, "step": 130 }, { "epoch": 0.048501645591546856, "grad_norm": 0.6554047963560565, "learning_rate": 8.016147635524798e-06, "loss": 0.3938, "step": 140 }, { "epoch": 0.051966048848085915, "grad_norm": 0.504860048364916, "learning_rate": 8.592848904267588e-06, "loss": 0.384, "step": 150 }, { "epoch": 0.05543045210462498, "grad_norm": 0.6045057320065914, "learning_rate": 9.169550173010382e-06, "loss": 0.3781, "step": 160 }, { "epoch": 0.05889485536116404, "grad_norm": 1.055910114221381, "learning_rate": 9.746251441753172e-06, "loss": 0.3695, "step": 170 }, { "epoch": 0.0623592586177031, "grad_norm": 0.462726140305608, "learning_rate": 1.0322952710495964e-05, "loss": 0.3577, "step": 180 }, { "epoch": 0.06582366187424216, "grad_norm": 0.4950470445818375, "learning_rate": 1.0899653979238756e-05, "loss": 0.3593, "step": 190 }, { "epoch": 0.06928806513078123, "grad_norm": 0.4011301599299419, "learning_rate": 1.1476355247981546e-05, "loss": 0.3469, "step": 200 }, { "epoch": 0.07275246838732029, "grad_norm": 1.5395956991225512, "learning_rate": 1.2053056516724338e-05, "loss": 0.3466, "step": 210 }, { "epoch": 0.07621687164385935, "grad_norm": 0.4782287164298744, "learning_rate": 1.2629757785467128e-05, "loss": 0.3384, "step": 220 }, { "epoch": 0.0796812749003984, "grad_norm": 0.4264649009974007, "learning_rate": 1.3206459054209918e-05, "loss": 0.335, "step": 230 }, { "epoch": 0.08314567815693746, "grad_norm": 0.5500552360743377, "learning_rate": 1.3783160322952712e-05, "loss": 0.3294, "step": 240 }, { "epoch": 0.08661008141347652, "grad_norm": 0.5287918177337965, "learning_rate": 1.4359861591695503e-05, "loss": 0.328, "step": 250 }, { "epoch": 0.0900744846700156, "grad_norm": 0.4510077521972178, "learning_rate": 1.4936562860438294e-05, "loss": 0.3294, "step": 260 }, { "epoch": 0.09353888792655465, "grad_norm": 0.47410104891468646, "learning_rate": 1.5513264129181084e-05, "loss": 0.322, "step": 270 }, { "epoch": 0.09700329118309371, "grad_norm": 0.5064048354634346, "learning_rate": 1.6089965397923876e-05, "loss": 0.3222, "step": 280 }, { "epoch": 0.10046769443963277, "grad_norm": 0.5163925193924696, "learning_rate": 1.6666666666666667e-05, "loss": 0.3104, "step": 290 }, { "epoch": 0.10393209769617183, "grad_norm": 0.5854453216694896, "learning_rate": 1.7243367935409456e-05, "loss": 0.3108, "step": 300 }, { "epoch": 0.1073965009527109, "grad_norm": 0.47989721671414387, "learning_rate": 1.782006920415225e-05, "loss": 0.3122, "step": 310 }, { "epoch": 0.11086090420924996, "grad_norm": 0.4198890022476431, "learning_rate": 1.8396770472895043e-05, "loss": 0.3013, "step": 320 }, { "epoch": 0.11432530746578902, "grad_norm": 0.35484886374124996, "learning_rate": 1.897347174163783e-05, "loss": 0.308, "step": 330 }, { "epoch": 0.11778971072232808, "grad_norm": 0.6798994381335608, "learning_rate": 1.9550173010380623e-05, "loss": 0.3017, "step": 340 }, { "epoch": 0.12125411397886714, "grad_norm": 0.5104370022480478, "learning_rate": 2.0126874279123415e-05, "loss": 0.3009, "step": 350 }, { "epoch": 0.1247185172354062, "grad_norm": 0.43181121899772806, "learning_rate": 2.0703575547866204e-05, "loss": 0.2984, "step": 360 }, { "epoch": 0.12818292049194527, "grad_norm": 0.44631400797502857, "learning_rate": 2.1280276816609e-05, "loss": 0.3011, "step": 370 }, { "epoch": 0.1316473237484843, "grad_norm": 1.3618600184569662, "learning_rate": 2.185697808535179e-05, "loss": 0.3028, "step": 380 }, { "epoch": 0.13511172700502339, "grad_norm": 0.4288866277270222, "learning_rate": 2.243367935409458e-05, "loss": 0.2966, "step": 390 }, { "epoch": 0.13857613026156246, "grad_norm": 0.43831756556428697, "learning_rate": 2.301038062283737e-05, "loss": 0.2916, "step": 400 }, { "epoch": 0.1420405335181015, "grad_norm": 0.422949206010911, "learning_rate": 2.3587081891580163e-05, "loss": 0.2979, "step": 410 }, { "epoch": 0.14550493677464058, "grad_norm": 0.5022509237000374, "learning_rate": 2.4163783160322955e-05, "loss": 0.2895, "step": 420 }, { "epoch": 0.14896934003117962, "grad_norm": 0.49397467400296585, "learning_rate": 2.4740484429065743e-05, "loss": 0.2898, "step": 430 }, { "epoch": 0.1524337432877187, "grad_norm": 0.3494967743075288, "learning_rate": 2.531718569780854e-05, "loss": 0.279, "step": 440 }, { "epoch": 0.15589814654425777, "grad_norm": 0.45661458601823307, "learning_rate": 2.5893886966551327e-05, "loss": 0.291, "step": 450 }, { "epoch": 0.1593625498007968, "grad_norm": 0.5438985700428273, "learning_rate": 2.647058823529412e-05, "loss": 0.29, "step": 460 }, { "epoch": 0.16282695305733588, "grad_norm": 0.4788529828590826, "learning_rate": 2.704728950403691e-05, "loss": 0.2866, "step": 470 }, { "epoch": 0.16629135631387493, "grad_norm": 0.4154664374547961, "learning_rate": 2.7623990772779702e-05, "loss": 0.2825, "step": 480 }, { "epoch": 0.169755759570414, "grad_norm": 0.45665598437011556, "learning_rate": 2.820069204152249e-05, "loss": 0.2839, "step": 490 }, { "epoch": 0.17322016282695304, "grad_norm": 0.5602183672259513, "learning_rate": 2.8777393310265283e-05, "loss": 0.285, "step": 500 }, { "epoch": 0.17668456608349212, "grad_norm": 0.4595470907703161, "learning_rate": 2.9354094579008075e-05, "loss": 0.2845, "step": 510 }, { "epoch": 0.1801489693400312, "grad_norm": 0.35972117453020297, "learning_rate": 2.9930795847750863e-05, "loss": 0.2772, "step": 520 }, { "epoch": 0.18361337259657023, "grad_norm": 0.4722911335764227, "learning_rate": 3.0507497116493655e-05, "loss": 0.2774, "step": 530 }, { "epoch": 0.1870777758531093, "grad_norm": 0.4212365397286255, "learning_rate": 3.108419838523645e-05, "loss": 0.2778, "step": 540 }, { "epoch": 0.19054217910964835, "grad_norm": 0.4437718583316477, "learning_rate": 3.166089965397924e-05, "loss": 0.2763, "step": 550 }, { "epoch": 0.19400658236618742, "grad_norm": 0.5795580292549793, "learning_rate": 3.2237600922722034e-05, "loss": 0.2777, "step": 560 }, { "epoch": 0.1974709856227265, "grad_norm": 0.38801773200166495, "learning_rate": 3.2814302191464826e-05, "loss": 0.2782, "step": 570 }, { "epoch": 0.20093538887926554, "grad_norm": 0.39386050551197965, "learning_rate": 3.339100346020762e-05, "loss": 0.2756, "step": 580 }, { "epoch": 0.20439979213580461, "grad_norm": 0.4094962883057876, "learning_rate": 3.39677047289504e-05, "loss": 0.2725, "step": 590 }, { "epoch": 0.20786419539234366, "grad_norm": 0.46243563531324183, "learning_rate": 3.4544405997693194e-05, "loss": 0.2679, "step": 600 }, { "epoch": 0.21132859864888273, "grad_norm": 0.44066593111685165, "learning_rate": 3.5121107266435986e-05, "loss": 0.2785, "step": 610 }, { "epoch": 0.2147930019054218, "grad_norm": 0.3559776358805474, "learning_rate": 3.569780853517878e-05, "loss": 0.2717, "step": 620 }, { "epoch": 0.21825740516196085, "grad_norm": 0.46610388128285485, "learning_rate": 3.627450980392157e-05, "loss": 0.2697, "step": 630 }, { "epoch": 0.22172180841849992, "grad_norm": 0.40480766460689316, "learning_rate": 3.685121107266436e-05, "loss": 0.2703, "step": 640 }, { "epoch": 0.22518621167503897, "grad_norm": 0.4307168132201555, "learning_rate": 3.7427912341407154e-05, "loss": 0.2732, "step": 650 }, { "epoch": 0.22865061493157804, "grad_norm": 0.4259888241847833, "learning_rate": 3.800461361014994e-05, "loss": 0.2701, "step": 660 }, { "epoch": 0.23211501818811708, "grad_norm": 0.45912634835480254, "learning_rate": 3.858131487889274e-05, "loss": 0.2715, "step": 670 }, { "epoch": 0.23557942144465616, "grad_norm": 0.4028410357459808, "learning_rate": 3.915801614763553e-05, "loss": 0.2722, "step": 680 }, { "epoch": 0.23904382470119523, "grad_norm": 0.48493013541150326, "learning_rate": 3.973471741637832e-05, "loss": 0.2671, "step": 690 }, { "epoch": 0.24250822795773427, "grad_norm": 0.3565349550219391, "learning_rate": 4.031141868512111e-05, "loss": 0.2663, "step": 700 }, { "epoch": 0.24597263121427335, "grad_norm": 0.32316152566585826, "learning_rate": 4.0888119953863905e-05, "loss": 0.2725, "step": 710 }, { "epoch": 0.2494370344708124, "grad_norm": 0.5136690083745088, "learning_rate": 4.146482122260669e-05, "loss": 0.2655, "step": 720 }, { "epoch": 0.25290143772735146, "grad_norm": 0.40785543102527044, "learning_rate": 4.204152249134948e-05, "loss": 0.2668, "step": 730 }, { "epoch": 0.25636584098389054, "grad_norm": 0.38517456097463165, "learning_rate": 4.2618223760092274e-05, "loss": 0.257, "step": 740 }, { "epoch": 0.2598302442404296, "grad_norm": 0.38202204582529214, "learning_rate": 4.3194925028835065e-05, "loss": 0.2657, "step": 750 }, { "epoch": 0.2632946474969686, "grad_norm": 0.5057316578616624, "learning_rate": 4.377162629757786e-05, "loss": 0.2669, "step": 760 }, { "epoch": 0.2667590507535077, "grad_norm": 0.4288116267424748, "learning_rate": 4.434832756632065e-05, "loss": 0.2636, "step": 770 }, { "epoch": 0.27022345401004677, "grad_norm": 0.37093943217557107, "learning_rate": 4.4925028835063434e-05, "loss": 0.2647, "step": 780 }, { "epoch": 0.27368785726658584, "grad_norm": 0.34353485608546885, "learning_rate": 4.5501730103806226e-05, "loss": 0.2598, "step": 790 }, { "epoch": 0.2771522605231249, "grad_norm": 0.35239301626467756, "learning_rate": 4.607843137254902e-05, "loss": 0.2646, "step": 800 }, { "epoch": 0.28061666377966393, "grad_norm": 0.3842430524450989, "learning_rate": 4.6655132641291816e-05, "loss": 0.2619, "step": 810 }, { "epoch": 0.284081067036203, "grad_norm": 0.35682883446316815, "learning_rate": 4.723183391003461e-05, "loss": 0.2561, "step": 820 }, { "epoch": 0.2875454702927421, "grad_norm": 0.3734875488865965, "learning_rate": 4.78085351787774e-05, "loss": 0.2584, "step": 830 }, { "epoch": 0.29100987354928115, "grad_norm": 0.4315870557925161, "learning_rate": 4.8385236447520185e-05, "loss": 0.2613, "step": 840 }, { "epoch": 0.2944742768058202, "grad_norm": 0.4103742391218009, "learning_rate": 4.896193771626298e-05, "loss": 0.2624, "step": 850 }, { "epoch": 0.29793868006235924, "grad_norm": 0.3591092100235818, "learning_rate": 4.953863898500577e-05, "loss": 0.2597, "step": 860 }, { "epoch": 0.3014030833188983, "grad_norm": 0.3623515286988262, "learning_rate": 4.999999187639266e-05, "loss": 0.2593, "step": 870 }, { "epoch": 0.3048674865754374, "grad_norm": 0.3048594211509306, "learning_rate": 4.999970755069012e-05, "loss": 0.2601, "step": 880 }, { "epoch": 0.30833188983197646, "grad_norm": 0.45002930011250586, "learning_rate": 4.9999017049900046e-05, "loss": 0.2543, "step": 890 }, { "epoch": 0.31179629308851553, "grad_norm": 0.35358665328747896, "learning_rate": 4.999792038524113e-05, "loss": 0.2559, "step": 900 }, { "epoch": 0.31526069634505455, "grad_norm": 0.2967473429143083, "learning_rate": 4.9996417574531085e-05, "loss": 0.2576, "step": 910 }, { "epoch": 0.3187250996015936, "grad_norm": 0.38465662558696595, "learning_rate": 4.9994508642186376e-05, "loss": 0.2603, "step": 920 }, { "epoch": 0.3221895028581327, "grad_norm": 0.31492562032892685, "learning_rate": 4.9992193619221796e-05, "loss": 0.2576, "step": 930 }, { "epoch": 0.32565390611467177, "grad_norm": 0.34729263239739333, "learning_rate": 4.998947254324998e-05, "loss": 0.2547, "step": 940 }, { "epoch": 0.3291183093712108, "grad_norm": 0.3488064460618837, "learning_rate": 4.998634545848076e-05, "loss": 0.2515, "step": 950 }, { "epoch": 0.33258271262774985, "grad_norm": 0.26471687398645055, "learning_rate": 4.9982812415720496e-05, "loss": 0.2574, "step": 960 }, { "epoch": 0.3360471158842889, "grad_norm": 0.2644922688346079, "learning_rate": 4.997887347237122e-05, "loss": 0.2586, "step": 970 }, { "epoch": 0.339511519140828, "grad_norm": 0.7989330166644754, "learning_rate": 4.99745286924297e-05, "loss": 0.2552, "step": 980 }, { "epoch": 0.3429759223973671, "grad_norm": 0.26650336882677617, "learning_rate": 4.9969778146486424e-05, "loss": 0.2524, "step": 990 }, { "epoch": 0.3464403256539061, "grad_norm": 0.2899936429270715, "learning_rate": 4.996462191172443e-05, "loss": 0.2543, "step": 1000 }, { "epoch": 0.34990472891044516, "grad_norm": 0.26123609655286284, "learning_rate": 4.9959060071918055e-05, "loss": 0.2533, "step": 1010 }, { "epoch": 0.35336913216698423, "grad_norm": 0.33570339771203644, "learning_rate": 4.99530927174316e-05, "loss": 0.2563, "step": 1020 }, { "epoch": 0.3568335354235233, "grad_norm": 0.3014454205427117, "learning_rate": 4.9946719945217814e-05, "loss": 0.2499, "step": 1030 }, { "epoch": 0.3602979386800624, "grad_norm": 0.27325651600933015, "learning_rate": 4.9939941858816366e-05, "loss": 0.2511, "step": 1040 }, { "epoch": 0.3637623419366014, "grad_norm": 0.32465788692548037, "learning_rate": 4.9932758568352144e-05, "loss": 0.2479, "step": 1050 }, { "epoch": 0.36722674519314047, "grad_norm": 0.2652235360508281, "learning_rate": 4.9925170190533454e-05, "loss": 0.2517, "step": 1060 }, { "epoch": 0.37069114844967954, "grad_norm": 0.3970874000599188, "learning_rate": 4.991717684865014e-05, "loss": 0.2476, "step": 1070 }, { "epoch": 0.3741555517062186, "grad_norm": 0.2722867182843485, "learning_rate": 4.990877867257157e-05, "loss": 0.2529, "step": 1080 }, { "epoch": 0.3776199549627577, "grad_norm": 0.2619052903321827, "learning_rate": 4.989997579874454e-05, "loss": 0.2469, "step": 1090 }, { "epoch": 0.3810843582192967, "grad_norm": 0.2955918646015672, "learning_rate": 4.9890768370191046e-05, "loss": 0.2502, "step": 1100 }, { "epoch": 0.3845487614758358, "grad_norm": 0.387831684494149, "learning_rate": 4.988115653650596e-05, "loss": 0.2425, "step": 1110 }, { "epoch": 0.38801316473237485, "grad_norm": 0.32722861574894646, "learning_rate": 4.98711404538546e-05, "loss": 0.248, "step": 1120 }, { "epoch": 0.3914775679889139, "grad_norm": 0.29851671037489946, "learning_rate": 4.986072028497021e-05, "loss": 0.2477, "step": 1130 }, { "epoch": 0.394941971245453, "grad_norm": 0.32504507070930905, "learning_rate": 4.984989619915128e-05, "loss": 0.2483, "step": 1140 }, { "epoch": 0.398406374501992, "grad_norm": 0.2992367844222804, "learning_rate": 4.9838668372258844e-05, "loss": 0.2434, "step": 1150 }, { "epoch": 0.4018707777585311, "grad_norm": 0.35335395639024886, "learning_rate": 4.982703698671356e-05, "loss": 0.2515, "step": 1160 }, { "epoch": 0.40533518101507016, "grad_norm": 0.2724105498054968, "learning_rate": 4.9815002231492806e-05, "loss": 0.2422, "step": 1170 }, { "epoch": 0.40879958427160923, "grad_norm": 0.26081802141731797, "learning_rate": 4.9802564302127584e-05, "loss": 0.2477, "step": 1180 }, { "epoch": 0.4122639875281483, "grad_norm": 0.26509078283121335, "learning_rate": 4.978972340069934e-05, "loss": 0.2428, "step": 1190 }, { "epoch": 0.4157283907846873, "grad_norm": 0.3268402808784247, "learning_rate": 4.977647973583669e-05, "loss": 0.245, "step": 1200 }, { "epoch": 0.4191927940412264, "grad_norm": 0.26931679508398965, "learning_rate": 4.9762833522712e-05, "loss": 0.2461, "step": 1210 }, { "epoch": 0.42265719729776546, "grad_norm": 0.28796254136886007, "learning_rate": 4.9748784983037955e-05, "loss": 0.2464, "step": 1220 }, { "epoch": 0.42612160055430454, "grad_norm": 0.2903443309315686, "learning_rate": 4.9734334345063884e-05, "loss": 0.2462, "step": 1230 }, { "epoch": 0.4295860038108436, "grad_norm": 0.2474488914617406, "learning_rate": 4.971948184357211e-05, "loss": 0.241, "step": 1240 }, { "epoch": 0.4330504070673826, "grad_norm": 0.255276664365212, "learning_rate": 4.970422771987411e-05, "loss": 0.239, "step": 1250 }, { "epoch": 0.4365148103239217, "grad_norm": 0.24990399483865394, "learning_rate": 4.968857222180656e-05, "loss": 0.2466, "step": 1260 }, { "epoch": 0.43997921358046077, "grad_norm": 0.35045726721574383, "learning_rate": 4.9672515603727385e-05, "loss": 0.2423, "step": 1270 }, { "epoch": 0.44344361683699984, "grad_norm": 0.24435346062621818, "learning_rate": 4.965605812651155e-05, "loss": 0.2407, "step": 1280 }, { "epoch": 0.44690802009353886, "grad_norm": 0.24411085271480595, "learning_rate": 4.96392000575469e-05, "loss": 0.2381, "step": 1290 }, { "epoch": 0.45037242335007793, "grad_norm": 0.2774791617724539, "learning_rate": 4.962194167072971e-05, "loss": 0.2397, "step": 1300 }, { "epoch": 0.453836826606617, "grad_norm": 0.33768453163385515, "learning_rate": 4.960428324646036e-05, "loss": 0.2391, "step": 1310 }, { "epoch": 0.4573012298631561, "grad_norm": 0.26599849394188135, "learning_rate": 4.958622507163868e-05, "loss": 0.2372, "step": 1320 }, { "epoch": 0.46076563311969515, "grad_norm": 0.34654816144848893, "learning_rate": 4.9567767439659315e-05, "loss": 0.2405, "step": 1330 }, { "epoch": 0.46423003637623417, "grad_norm": 0.2964452922825174, "learning_rate": 4.954891065040701e-05, "loss": 0.2424, "step": 1340 }, { "epoch": 0.46769443963277324, "grad_norm": 0.26622673063396735, "learning_rate": 4.952965501025165e-05, "loss": 0.2396, "step": 1350 }, { "epoch": 0.4711588428893123, "grad_norm": 0.25967043414851687, "learning_rate": 4.9510000832043356e-05, "loss": 0.2421, "step": 1360 }, { "epoch": 0.4746232461458514, "grad_norm": 0.22164309133223403, "learning_rate": 4.948994843510737e-05, "loss": 0.2429, "step": 1370 }, { "epoch": 0.47808764940239046, "grad_norm": 0.3316472047464383, "learning_rate": 4.9469498145238855e-05, "loss": 0.2426, "step": 1380 }, { "epoch": 0.4815520526589295, "grad_norm": 0.3356169844309959, "learning_rate": 4.944865029469764e-05, "loss": 0.2355, "step": 1390 }, { "epoch": 0.48501645591546855, "grad_norm": 0.268486815716583, "learning_rate": 4.9427405222202784e-05, "loss": 0.2368, "step": 1400 }, { "epoch": 0.4884808591720076, "grad_norm": 0.2334743696157166, "learning_rate": 4.9405763272927086e-05, "loss": 0.2439, "step": 1410 }, { "epoch": 0.4919452624285467, "grad_norm": 0.2957491864509931, "learning_rate": 4.938372479849149e-05, "loss": 0.237, "step": 1420 }, { "epoch": 0.49540966568508576, "grad_norm": 0.26896125577295615, "learning_rate": 4.936129015695936e-05, "loss": 0.2354, "step": 1430 }, { "epoch": 0.4988740689416248, "grad_norm": 0.25816020668305967, "learning_rate": 4.9338459712830656e-05, "loss": 0.2374, "step": 1440 }, { "epoch": 0.5023384721981639, "grad_norm": 0.24981495367506942, "learning_rate": 4.9315233837036016e-05, "loss": 0.2332, "step": 1450 }, { "epoch": 0.5058028754547029, "grad_norm": 0.2554391345730699, "learning_rate": 4.9291612906930754e-05, "loss": 0.2383, "step": 1460 }, { "epoch": 0.509267278711242, "grad_norm": 0.26653068047488826, "learning_rate": 4.926759730628868e-05, "loss": 0.2411, "step": 1470 }, { "epoch": 0.5127316819677811, "grad_norm": 0.2402237748204006, "learning_rate": 4.9243187425295915e-05, "loss": 0.2332, "step": 1480 }, { "epoch": 0.5161960852243201, "grad_norm": 0.24413036670385513, "learning_rate": 4.921838366054451e-05, "loss": 0.2396, "step": 1490 }, { "epoch": 0.5196604884808592, "grad_norm": 0.22909060071304008, "learning_rate": 4.919318641502604e-05, "loss": 0.2349, "step": 1500 }, { "epoch": 0.5231248917373982, "grad_norm": 0.22701914207506868, "learning_rate": 4.9167596098125036e-05, "loss": 0.2324, "step": 1510 }, { "epoch": 0.5265892949939373, "grad_norm": 0.25285812755375375, "learning_rate": 4.9141613125612316e-05, "loss": 0.2361, "step": 1520 }, { "epoch": 0.5300536982504763, "grad_norm": 0.23531009106142328, "learning_rate": 4.911523791963828e-05, "loss": 0.2389, "step": 1530 }, { "epoch": 0.5335181015070154, "grad_norm": 0.23036998762323121, "learning_rate": 4.908847090872599e-05, "loss": 0.2349, "step": 1540 }, { "epoch": 0.5369825047635545, "grad_norm": 0.2771142315120943, "learning_rate": 4.906131252776426e-05, "loss": 0.2384, "step": 1550 }, { "epoch": 0.5404469080200935, "grad_norm": 0.22571300533030822, "learning_rate": 4.9033763218000555e-05, "loss": 0.2307, "step": 1560 }, { "epoch": 0.5439113112766326, "grad_norm": 0.2351873545158354, "learning_rate": 4.9005823427033856e-05, "loss": 0.2353, "step": 1570 }, { "epoch": 0.5473757145331717, "grad_norm": 0.2540211756984853, "learning_rate": 4.897749360880735e-05, "loss": 0.2324, "step": 1580 }, { "epoch": 0.5508401177897108, "grad_norm": 0.25457759095229565, "learning_rate": 4.894877422360106e-05, "loss": 0.233, "step": 1590 }, { "epoch": 0.5543045210462498, "grad_norm": 0.270030962078855, "learning_rate": 4.8919665738024424e-05, "loss": 0.2415, "step": 1600 }, { "epoch": 0.5577689243027888, "grad_norm": 0.2570316973112016, "learning_rate": 4.8890168625008624e-05, "loss": 0.2342, "step": 1610 }, { "epoch": 0.5612333275593279, "grad_norm": 0.27096347274503246, "learning_rate": 4.8860283363798974e-05, "loss": 0.2279, "step": 1620 }, { "epoch": 0.5646977308158669, "grad_norm": 0.24193443897499534, "learning_rate": 4.8830010439947096e-05, "loss": 0.2337, "step": 1630 }, { "epoch": 0.568162134072406, "grad_norm": 0.29770893374153723, "learning_rate": 4.879935034530304e-05, "loss": 0.2308, "step": 1640 }, { "epoch": 0.5716265373289451, "grad_norm": 0.25131732918770927, "learning_rate": 4.876830357800729e-05, "loss": 0.2294, "step": 1650 }, { "epoch": 0.5750909405854842, "grad_norm": 0.35319194823747296, "learning_rate": 4.87368706424827e-05, "loss": 0.231, "step": 1660 }, { "epoch": 0.5785553438420232, "grad_norm": 0.23506203264124303, "learning_rate": 4.8705052049426254e-05, "loss": 0.2353, "step": 1670 }, { "epoch": 0.5820197470985623, "grad_norm": 0.26231316656015746, "learning_rate": 4.867284831580078e-05, "loss": 0.2379, "step": 1680 }, { "epoch": 0.5854841503551014, "grad_norm": 0.23941392513796939, "learning_rate": 4.8640259964826584e-05, "loss": 0.2308, "step": 1690 }, { "epoch": 0.5889485536116404, "grad_norm": 0.2531735045781861, "learning_rate": 4.860728752597291e-05, "loss": 0.2315, "step": 1700 }, { "epoch": 0.5924129568681794, "grad_norm": 0.23922451826735386, "learning_rate": 4.8573931534949354e-05, "loss": 0.2334, "step": 1710 }, { "epoch": 0.5958773601247185, "grad_norm": 0.26345705856662766, "learning_rate": 4.8540192533697155e-05, "loss": 0.2326, "step": 1720 }, { "epoch": 0.5993417633812576, "grad_norm": 0.25324175129413945, "learning_rate": 4.85060710703804e-05, "loss": 0.2333, "step": 1730 }, { "epoch": 0.6028061666377966, "grad_norm": 0.25577386258261664, "learning_rate": 4.84715676993771e-05, "loss": 0.2362, "step": 1740 }, { "epoch": 0.6062705698943357, "grad_norm": 0.27948607132517533, "learning_rate": 4.843668298127022e-05, "loss": 0.2304, "step": 1750 }, { "epoch": 0.6097349731508748, "grad_norm": 0.2560173418476038, "learning_rate": 4.840141748283851e-05, "loss": 0.2362, "step": 1760 }, { "epoch": 0.6131993764074138, "grad_norm": 0.24729226299066756, "learning_rate": 4.8365771777047356e-05, "loss": 0.2317, "step": 1770 }, { "epoch": 0.6166637796639529, "grad_norm": 0.2568219765818277, "learning_rate": 4.832974644303944e-05, "loss": 0.2393, "step": 1780 }, { "epoch": 0.620128182920492, "grad_norm": 0.2306059252401264, "learning_rate": 4.829334206612534e-05, "loss": 0.2367, "step": 1790 }, { "epoch": 0.6235925861770311, "grad_norm": 0.31020261629615153, "learning_rate": 4.8256559237774e-05, "loss": 0.2326, "step": 1800 }, { "epoch": 0.62705698943357, "grad_norm": 0.27321518814530843, "learning_rate": 4.821939855560318e-05, "loss": 0.2341, "step": 1810 }, { "epoch": 0.6305213926901091, "grad_norm": 0.3177547595944014, "learning_rate": 4.8181860623369646e-05, "loss": 0.235, "step": 1820 }, { "epoch": 0.6339857959466482, "grad_norm": 0.2631656385352305, "learning_rate": 4.814394605095946e-05, "loss": 0.2325, "step": 1830 }, { "epoch": 0.6374501992031872, "grad_norm": 0.19501128079802074, "learning_rate": 4.810565545437802e-05, "loss": 0.2318, "step": 1840 }, { "epoch": 0.6409146024597263, "grad_norm": 0.23539186170837376, "learning_rate": 4.806698945574006e-05, "loss": 0.2322, "step": 1850 }, { "epoch": 0.6443790057162654, "grad_norm": 0.2474455157721665, "learning_rate": 4.8027948683259546e-05, "loss": 0.2319, "step": 1860 }, { "epoch": 0.6478434089728045, "grad_norm": 0.21665268684721978, "learning_rate": 4.798853377123948e-05, "loss": 0.2277, "step": 1870 }, { "epoch": 0.6513078122293435, "grad_norm": 0.24192469144287532, "learning_rate": 4.794874536006152e-05, "loss": 0.2263, "step": 1880 }, { "epoch": 0.6547722154858826, "grad_norm": 0.2372551879205524, "learning_rate": 4.790858409617573e-05, "loss": 0.227, "step": 1890 }, { "epoch": 0.6582366187424216, "grad_norm": 0.23445846730497275, "learning_rate": 4.786805063208992e-05, "loss": 0.2349, "step": 1900 }, { "epoch": 0.6617010219989606, "grad_norm": 0.22188437526617993, "learning_rate": 4.782714562635914e-05, "loss": 0.2298, "step": 1910 }, { "epoch": 0.6651654252554997, "grad_norm": 0.22756117335293902, "learning_rate": 4.7785869743574915e-05, "loss": 0.2245, "step": 1920 }, { "epoch": 0.6686298285120388, "grad_norm": 0.20717649696487608, "learning_rate": 4.7744223654354506e-05, "loss": 0.2331, "step": 1930 }, { "epoch": 0.6720942317685779, "grad_norm": 0.21788412171242105, "learning_rate": 4.7702208035329996e-05, "loss": 0.2207, "step": 1940 }, { "epoch": 0.6755586350251169, "grad_norm": 0.23790582790590165, "learning_rate": 4.765982356913728e-05, "loss": 0.2299, "step": 1950 }, { "epoch": 0.679023038281656, "grad_norm": 0.254738526683431, "learning_rate": 4.7617070944404975e-05, "loss": 0.2277, "step": 1960 }, { "epoch": 0.6824874415381951, "grad_norm": 0.2628656210512643, "learning_rate": 4.757395085574326e-05, "loss": 0.2297, "step": 1970 }, { "epoch": 0.6859518447947341, "grad_norm": 0.20591189751455907, "learning_rate": 4.7530464003732545e-05, "loss": 0.2248, "step": 1980 }, { "epoch": 0.6894162480512732, "grad_norm": 0.2576351313383852, "learning_rate": 4.7486611094912146e-05, "loss": 0.2251, "step": 1990 }, { "epoch": 0.6928806513078122, "grad_norm": 0.20856524397248571, "learning_rate": 4.744239284176876e-05, "loss": 0.2291, "step": 2000 }, { "epoch": 0.6963450545643513, "grad_norm": 0.2186827003230637, "learning_rate": 4.73978099627249e-05, "loss": 0.2229, "step": 2010 }, { "epoch": 0.6998094578208903, "grad_norm": 0.20914686503222382, "learning_rate": 4.7352863182127246e-05, "loss": 0.2206, "step": 2020 }, { "epoch": 0.7032738610774294, "grad_norm": 0.22559468853582607, "learning_rate": 4.730755323023482e-05, "loss": 0.2319, "step": 2030 }, { "epoch": 0.7067382643339685, "grad_norm": 0.23247057053881534, "learning_rate": 4.72618808432072e-05, "loss": 0.2261, "step": 2040 }, { "epoch": 0.7102026675905075, "grad_norm": 0.22994852903066657, "learning_rate": 4.7215846763092486e-05, "loss": 0.2275, "step": 2050 }, { "epoch": 0.7136670708470466, "grad_norm": 0.21562072555254103, "learning_rate": 4.716945173781528e-05, "loss": 0.2275, "step": 2060 }, { "epoch": 0.7171314741035857, "grad_norm": 0.1964162535848221, "learning_rate": 4.7122696521164564e-05, "loss": 0.2267, "step": 2070 }, { "epoch": 0.7205958773601248, "grad_norm": 0.23405157909799276, "learning_rate": 4.7075581872781375e-05, "loss": 0.2293, "step": 2080 }, { "epoch": 0.7240602806166638, "grad_norm": 0.22767428517421429, "learning_rate": 4.7028108558146526e-05, "loss": 0.2273, "step": 2090 }, { "epoch": 0.7275246838732028, "grad_norm": 0.28600372042549504, "learning_rate": 4.698027734856816e-05, "loss": 0.2297, "step": 2100 }, { "epoch": 0.7309890871297419, "grad_norm": 0.2298031761231605, "learning_rate": 4.693208902116918e-05, "loss": 0.2227, "step": 2110 }, { "epoch": 0.7344534903862809, "grad_norm": 0.2116599416195417, "learning_rate": 4.688354435887467e-05, "loss": 0.2248, "step": 2120 }, { "epoch": 0.73791789364282, "grad_norm": 0.20160724544535827, "learning_rate": 4.683464415039918e-05, "loss": 0.2197, "step": 2130 }, { "epoch": 0.7413822968993591, "grad_norm": 0.22040076084389237, "learning_rate": 4.678538919023383e-05, "loss": 0.2306, "step": 2140 }, { "epoch": 0.7448467001558982, "grad_norm": 0.24384427587101223, "learning_rate": 4.673578027863351e-05, "loss": 0.226, "step": 2150 }, { "epoch": 0.7483111034124372, "grad_norm": 0.21129176345098202, "learning_rate": 4.6685818221603804e-05, "loss": 0.2298, "step": 2160 }, { "epoch": 0.7517755066689763, "grad_norm": 0.2763625585319092, "learning_rate": 4.663550383088792e-05, "loss": 0.2253, "step": 2170 }, { "epoch": 0.7552399099255154, "grad_norm": 0.21704903419555124, "learning_rate": 4.6584837923953516e-05, "loss": 0.2215, "step": 2180 }, { "epoch": 0.7587043131820544, "grad_norm": 0.21517162264960232, "learning_rate": 4.653382132397938e-05, "loss": 0.2251, "step": 2190 }, { "epoch": 0.7621687164385934, "grad_norm": 0.2018666939738909, "learning_rate": 4.648245485984207e-05, "loss": 0.2239, "step": 2200 }, { "epoch": 0.7656331196951325, "grad_norm": 0.18928657263154086, "learning_rate": 4.64307393661025e-05, "loss": 0.2222, "step": 2210 }, { "epoch": 0.7690975229516716, "grad_norm": 0.23077452960308834, "learning_rate": 4.63786756829923e-05, "loss": 0.2254, "step": 2220 }, { "epoch": 0.7725619262082106, "grad_norm": 0.19369514792151565, "learning_rate": 4.63262646564002e-05, "loss": 0.2214, "step": 2230 }, { "epoch": 0.7760263294647497, "grad_norm": 0.19788506342985074, "learning_rate": 4.627350713785829e-05, "loss": 0.2199, "step": 2240 }, { "epoch": 0.7794907327212888, "grad_norm": 0.1892817176323716, "learning_rate": 4.622040398452819e-05, "loss": 0.2209, "step": 2250 }, { "epoch": 0.7829551359778278, "grad_norm": 0.21413217334371104, "learning_rate": 4.616695605918712e-05, "loss": 0.2259, "step": 2260 }, { "epoch": 0.7864195392343669, "grad_norm": 0.2516901660820717, "learning_rate": 4.6113164230213844e-05, "loss": 0.2224, "step": 2270 }, { "epoch": 0.789883942490906, "grad_norm": 0.23275016126252404, "learning_rate": 4.605902937157465e-05, "loss": 0.2269, "step": 2280 }, { "epoch": 0.793348345747445, "grad_norm": 0.2121809202061214, "learning_rate": 4.600455236280905e-05, "loss": 0.2231, "step": 2290 }, { "epoch": 0.796812749003984, "grad_norm": 0.19630769074682533, "learning_rate": 4.5949734089015544e-05, "loss": 0.2207, "step": 2300 }, { "epoch": 0.8002771522605231, "grad_norm": 0.21355754308769215, "learning_rate": 4.589457544083725e-05, "loss": 0.224, "step": 2310 }, { "epoch": 0.8037415555170622, "grad_norm": 0.21050957794732314, "learning_rate": 4.5839077314447385e-05, "loss": 0.2238, "step": 2320 }, { "epoch": 0.8072059587736012, "grad_norm": 0.22620144254655492, "learning_rate": 4.578324061153477e-05, "loss": 0.2252, "step": 2330 }, { "epoch": 0.8106703620301403, "grad_norm": 0.19924326647557386, "learning_rate": 4.5727066239289117e-05, "loss": 0.2239, "step": 2340 }, { "epoch": 0.8141347652866794, "grad_norm": 0.22935473461207423, "learning_rate": 4.5670555110386316e-05, "loss": 0.222, "step": 2350 }, { "epoch": 0.8175991685432185, "grad_norm": 0.2651712827400202, "learning_rate": 4.561370814297363e-05, "loss": 0.2225, "step": 2360 }, { "epoch": 0.8210635717997575, "grad_norm": 0.2182312756063536, "learning_rate": 4.555652626065473e-05, "loss": 0.2238, "step": 2370 }, { "epoch": 0.8245279750562966, "grad_norm": 0.20447815035283293, "learning_rate": 4.549901039247474e-05, "loss": 0.2212, "step": 2380 }, { "epoch": 0.8279923783128356, "grad_norm": 0.17858513767092687, "learning_rate": 4.544116147290509e-05, "loss": 0.223, "step": 2390 }, { "epoch": 0.8314567815693746, "grad_norm": 0.21126944513637871, "learning_rate": 4.5382980441828385e-05, "loss": 0.2253, "step": 2400 }, { "epoch": 0.8349211848259137, "grad_norm": 0.2343427045564657, "learning_rate": 4.5324468244523086e-05, "loss": 0.2176, "step": 2410 }, { "epoch": 0.8383855880824528, "grad_norm": 0.19348098291090032, "learning_rate": 4.52656258316482e-05, "loss": 0.2171, "step": 2420 }, { "epoch": 0.8418499913389919, "grad_norm": 0.18351830211981723, "learning_rate": 4.5206454159227783e-05, "loss": 0.2209, "step": 2430 }, { "epoch": 0.8453143945955309, "grad_norm": 0.21496767724862215, "learning_rate": 4.514695418863547e-05, "loss": 0.2209, "step": 2440 }, { "epoch": 0.84877879785207, "grad_norm": 0.19946299406668855, "learning_rate": 4.508712688657879e-05, "loss": 0.2202, "step": 2450 }, { "epoch": 0.8522432011086091, "grad_norm": 0.19377744464389568, "learning_rate": 4.50269732250835e-05, "loss": 0.2201, "step": 2460 }, { "epoch": 0.8557076043651481, "grad_norm": 0.2366384856135975, "learning_rate": 4.496649418147778e-05, "loss": 0.2149, "step": 2470 }, { "epoch": 0.8591720076216872, "grad_norm": 0.21068619938805416, "learning_rate": 4.490569073837636e-05, "loss": 0.2184, "step": 2480 }, { "epoch": 0.8626364108782262, "grad_norm": 0.2315187829791829, "learning_rate": 4.4844563883664554e-05, "loss": 0.222, "step": 2490 }, { "epoch": 0.8661008141347653, "grad_norm": 0.18649456076146348, "learning_rate": 4.478311461048219e-05, "loss": 0.2209, "step": 2500 }, { "epoch": 0.8695652173913043, "grad_norm": 0.23315042553004378, "learning_rate": 4.472134391720751e-05, "loss": 0.2224, "step": 2510 }, { "epoch": 0.8730296206478434, "grad_norm": 0.1951271064725934, "learning_rate": 4.465925280744094e-05, "loss": 0.2239, "step": 2520 }, { "epoch": 0.8764940239043825, "grad_norm": 0.1913261231013653, "learning_rate": 4.459684228998873e-05, "loss": 0.2179, "step": 2530 }, { "epoch": 0.8799584271609215, "grad_norm": 0.19492329842413578, "learning_rate": 4.453411337884666e-05, "loss": 0.2162, "step": 2540 }, { "epoch": 0.8834228304174606, "grad_norm": 0.2055009502555625, "learning_rate": 4.4471067093183475e-05, "loss": 0.2165, "step": 2550 }, { "epoch": 0.8868872336739997, "grad_norm": 0.19549219766107842, "learning_rate": 4.4407704457324394e-05, "loss": 0.2158, "step": 2560 }, { "epoch": 0.8903516369305388, "grad_norm": 0.1850796577943829, "learning_rate": 4.4344026500734415e-05, "loss": 0.2172, "step": 2570 }, { "epoch": 0.8938160401870777, "grad_norm": 0.19643486062098017, "learning_rate": 4.428003425800164e-05, "loss": 0.2208, "step": 2580 }, { "epoch": 0.8972804434436168, "grad_norm": 0.21099302341164966, "learning_rate": 4.4215728768820406e-05, "loss": 0.2194, "step": 2590 }, { "epoch": 0.9007448467001559, "grad_norm": 0.24764922928675714, "learning_rate": 4.415111107797445e-05, "loss": 0.2193, "step": 2600 }, { "epoch": 0.9042092499566949, "grad_norm": 0.22064705519176767, "learning_rate": 4.4086182235319904e-05, "loss": 0.2148, "step": 2610 }, { "epoch": 0.907673653213234, "grad_norm": 0.2131199868612154, "learning_rate": 4.402094329576825e-05, "loss": 0.2233, "step": 2620 }, { "epoch": 0.9111380564697731, "grad_norm": 0.2184325870922078, "learning_rate": 4.395539531926914e-05, "loss": 0.2227, "step": 2630 }, { "epoch": 0.9146024597263122, "grad_norm": 0.20257196030375807, "learning_rate": 4.388953937079327e-05, "loss": 0.2145, "step": 2640 }, { "epoch": 0.9180668629828512, "grad_norm": 0.2052608168052879, "learning_rate": 4.3823376520314964e-05, "loss": 0.2176, "step": 2650 }, { "epoch": 0.9215312662393903, "grad_norm": 0.20116038778213127, "learning_rate": 4.3756907842794855e-05, "loss": 0.2209, "step": 2660 }, { "epoch": 0.9249956694959294, "grad_norm": 0.2094115752703769, "learning_rate": 4.369013441816242e-05, "loss": 0.2186, "step": 2670 }, { "epoch": 0.9284600727524683, "grad_norm": 0.21116605286596796, "learning_rate": 4.362305733129841e-05, "loss": 0.2177, "step": 2680 }, { "epoch": 0.9319244760090074, "grad_norm": 0.2355250090651405, "learning_rate": 4.355567767201725e-05, "loss": 0.216, "step": 2690 }, { "epoch": 0.9353888792655465, "grad_norm": 0.21712887070689368, "learning_rate": 4.3487996535049296e-05, "loss": 0.2194, "step": 2700 }, { "epoch": 0.9388532825220856, "grad_norm": 0.21715912584157251, "learning_rate": 4.342001502002309e-05, "loss": 0.2187, "step": 2710 }, { "epoch": 0.9423176857786246, "grad_norm": 0.23840653028574527, "learning_rate": 4.3351734231447436e-05, "loss": 0.2127, "step": 2720 }, { "epoch": 0.9457820890351637, "grad_norm": 0.18655391680385583, "learning_rate": 4.328315527869357e-05, "loss": 0.2213, "step": 2730 }, { "epoch": 0.9492464922917028, "grad_norm": 0.20595739324476314, "learning_rate": 4.321427927597697e-05, "loss": 0.2172, "step": 2740 }, { "epoch": 0.9527108955482418, "grad_norm": 0.16761465601551198, "learning_rate": 4.31451073423394e-05, "loss": 0.2153, "step": 2750 }, { "epoch": 0.9561752988047809, "grad_norm": 0.18621316019170056, "learning_rate": 4.3075640601630664e-05, "loss": 0.2156, "step": 2760 }, { "epoch": 0.95963970206132, "grad_norm": 0.17305717604504345, "learning_rate": 4.300588018249033e-05, "loss": 0.2129, "step": 2770 }, { "epoch": 0.963104105317859, "grad_norm": 0.18044851481990093, "learning_rate": 4.2935827218329434e-05, "loss": 0.2154, "step": 2780 }, { "epoch": 0.966568508574398, "grad_norm": 0.20518001148705736, "learning_rate": 4.2865482847312043e-05, "loss": 0.2132, "step": 2790 }, { "epoch": 0.9700329118309371, "grad_norm": 0.3349155151219487, "learning_rate": 4.279484821233678e-05, "loss": 0.2202, "step": 2800 }, { "epoch": 0.9734973150874762, "grad_norm": 0.18282251380283435, "learning_rate": 4.2723924461018225e-05, "loss": 0.2186, "step": 2810 }, { "epoch": 0.9769617183440152, "grad_norm": 0.2173702476740624, "learning_rate": 4.265271274566829e-05, "loss": 0.22, "step": 2820 }, { "epoch": 0.9804261216005543, "grad_norm": 0.19184429102715528, "learning_rate": 4.2581214223277495e-05, "loss": 0.2077, "step": 2830 }, { "epoch": 0.9838905248570934, "grad_norm": 0.20991477426484062, "learning_rate": 4.250943005549618e-05, "loss": 0.2208, "step": 2840 }, { "epoch": 0.9873549281136325, "grad_norm": 0.2285331325208064, "learning_rate": 4.2437361408615614e-05, "loss": 0.2151, "step": 2850 }, { "epoch": 0.9908193313701715, "grad_norm": 0.2148804733407693, "learning_rate": 4.2365009453549046e-05, "loss": 0.2216, "step": 2860 }, { "epoch": 0.9942837346267106, "grad_norm": 0.23671514755116507, "learning_rate": 4.22923753658127e-05, "loss": 0.2197, "step": 2870 }, { "epoch": 0.9977481378832496, "grad_norm": 0.21988104341412565, "learning_rate": 4.221946032550665e-05, "loss": 0.2143, "step": 2880 }, { "epoch": 1.0010393209769617, "grad_norm": 0.20204127186990958, "learning_rate": 4.214626551729569e-05, "loss": 0.2095, "step": 2890 }, { "epoch": 1.0045037242335009, "grad_norm": 0.19674007109910524, "learning_rate": 4.207279213039003e-05, "loss": 0.192, "step": 2900 }, { "epoch": 1.0079681274900398, "grad_norm": 0.18771217665260514, "learning_rate": 4.199904135852598e-05, "loss": 0.1936, "step": 2910 }, { "epoch": 1.0114325307465788, "grad_norm": 0.2045135750011863, "learning_rate": 4.192501439994664e-05, "loss": 0.1942, "step": 2920 }, { "epoch": 1.014896934003118, "grad_norm": 0.18370214635615664, "learning_rate": 4.185071245738231e-05, "loss": 0.1891, "step": 2930 }, { "epoch": 1.018361337259657, "grad_norm": 0.18667193497723014, "learning_rate": 4.177613673803106e-05, "loss": 0.1944, "step": 2940 }, { "epoch": 1.0218257405161961, "grad_norm": 0.1791392276679071, "learning_rate": 4.170128845353902e-05, "loss": 0.189, "step": 2950 }, { "epoch": 1.025290143772735, "grad_norm": 0.16725027485938548, "learning_rate": 4.162616881998075e-05, "loss": 0.1985, "step": 2960 }, { "epoch": 1.0287545470292743, "grad_norm": 0.2029662849352073, "learning_rate": 4.155077905783949e-05, "loss": 0.1938, "step": 2970 }, { "epoch": 1.0322189502858132, "grad_norm": 0.21676480591029273, "learning_rate": 4.14751203919873e-05, "loss": 0.1938, "step": 2980 }, { "epoch": 1.0356833535423524, "grad_norm": 0.18194277030338057, "learning_rate": 4.1399194051665146e-05, "loss": 0.1943, "step": 2990 }, { "epoch": 1.0391477567988914, "grad_norm": 0.22187757323234317, "learning_rate": 4.1323001270463e-05, "loss": 0.1956, "step": 3000 }, { "epoch": 1.0426121600554306, "grad_norm": 0.17994994526393276, "learning_rate": 4.1246543286299714e-05, "loss": 0.196, "step": 3010 }, { "epoch": 1.0460765633119695, "grad_norm": 0.2006941244750973, "learning_rate": 4.1169821341402956e-05, "loss": 0.1928, "step": 3020 }, { "epoch": 1.0495409665685085, "grad_norm": 0.18127266652072002, "learning_rate": 4.109283668228903e-05, "loss": 0.1883, "step": 3030 }, { "epoch": 1.0530053698250477, "grad_norm": 0.17064172550512371, "learning_rate": 4.101559055974258e-05, "loss": 0.1944, "step": 3040 }, { "epoch": 1.0564697730815866, "grad_norm": 0.17513257115164024, "learning_rate": 4.09380842287963e-05, "loss": 0.1924, "step": 3050 }, { "epoch": 1.0599341763381258, "grad_norm": 0.17679873359445633, "learning_rate": 4.0860318948710574e-05, "loss": 0.1967, "step": 3060 }, { "epoch": 1.0633985795946648, "grad_norm": 0.17716657233908994, "learning_rate": 4.0782295982952954e-05, "loss": 0.1904, "step": 3070 }, { "epoch": 1.066862982851204, "grad_norm": 0.20587418971229102, "learning_rate": 4.0704016599177655e-05, "loss": 0.1944, "step": 3080 }, { "epoch": 1.070327386107743, "grad_norm": 0.19036738753494378, "learning_rate": 4.062548206920499e-05, "loss": 0.1927, "step": 3090 }, { "epoch": 1.073791789364282, "grad_norm": 0.19921829291538054, "learning_rate": 4.054669366900066e-05, "loss": 0.1917, "step": 3100 }, { "epoch": 1.077256192620821, "grad_norm": 0.20317583824797536, "learning_rate": 4.0467652678655056e-05, "loss": 0.1914, "step": 3110 }, { "epoch": 1.08072059587736, "grad_norm": 0.1848009075000212, "learning_rate": 4.038836038236245e-05, "loss": 0.1868, "step": 3120 }, { "epoch": 1.0841849991338992, "grad_norm": 0.2082390217876895, "learning_rate": 4.0308818068400125e-05, "loss": 0.1897, "step": 3130 }, { "epoch": 1.0876494023904382, "grad_norm": 0.23521100374901133, "learning_rate": 4.022902702910745e-05, "loss": 0.1849, "step": 3140 }, { "epoch": 1.0911138056469774, "grad_norm": 0.18191505448762496, "learning_rate": 4.014898856086489e-05, "loss": 0.1909, "step": 3150 }, { "epoch": 1.0945782089035163, "grad_norm": 0.19423731854983112, "learning_rate": 4.006870396407294e-05, "loss": 0.1925, "step": 3160 }, { "epoch": 1.0980426121600555, "grad_norm": 0.17415753971247672, "learning_rate": 3.998817454313096e-05, "loss": 0.1961, "step": 3170 }, { "epoch": 1.1015070154165945, "grad_norm": 0.17868260694918264, "learning_rate": 3.9907401606416054e-05, "loss": 0.1984, "step": 3180 }, { "epoch": 1.1049714186731336, "grad_norm": 0.16583150690918594, "learning_rate": 3.9826386466261765e-05, "loss": 0.1948, "step": 3190 }, { "epoch": 1.1084358219296726, "grad_norm": 0.19154779416193093, "learning_rate": 3.9745130438936744e-05, "loss": 0.187, "step": 3200 }, { "epoch": 1.1119002251862118, "grad_norm": 0.18741249451292638, "learning_rate": 3.96636348446234e-05, "loss": 0.1907, "step": 3210 }, { "epoch": 1.1153646284427507, "grad_norm": 0.18322276372490287, "learning_rate": 3.958190100739643e-05, "loss": 0.1872, "step": 3220 }, { "epoch": 1.1188290316992897, "grad_norm": 0.18729813363303588, "learning_rate": 3.94999302552013e-05, "loss": 0.1942, "step": 3230 }, { "epoch": 1.122293434955829, "grad_norm": 0.19705653891217498, "learning_rate": 3.941772391983271e-05, "loss": 0.1912, "step": 3240 }, { "epoch": 1.1257578382123679, "grad_norm": 0.17707126700086195, "learning_rate": 3.9335283336912873e-05, "loss": 0.192, "step": 3250 }, { "epoch": 1.129222241468907, "grad_norm": 0.17283665381012417, "learning_rate": 3.925260984586991e-05, "loss": 0.1904, "step": 3260 }, { "epoch": 1.132686644725446, "grad_norm": 0.18010720437594213, "learning_rate": 3.916970478991604e-05, "loss": 0.1943, "step": 3270 }, { "epoch": 1.1361510479819852, "grad_norm": 0.15919684820270277, "learning_rate": 3.908656951602574e-05, "loss": 0.1897, "step": 3280 }, { "epoch": 1.1396154512385241, "grad_norm": 0.1931598714015755, "learning_rate": 3.9003205374913906e-05, "loss": 0.1901, "step": 3290 }, { "epoch": 1.1430798544950633, "grad_norm": 0.17545621617123003, "learning_rate": 3.891961372101387e-05, "loss": 0.1869, "step": 3300 }, { "epoch": 1.1465442577516023, "grad_norm": 0.17341255082466775, "learning_rate": 3.883579591245542e-05, "loss": 0.1899, "step": 3310 }, { "epoch": 1.1500086610081413, "grad_norm": 0.16204232239326744, "learning_rate": 3.8751753311042704e-05, "loss": 0.1897, "step": 3320 }, { "epoch": 1.1534730642646804, "grad_norm": 0.17553623491312426, "learning_rate": 3.8667487282232144e-05, "loss": 0.187, "step": 3330 }, { "epoch": 1.1569374675212194, "grad_norm": 0.1774831508823347, "learning_rate": 3.8582999195110215e-05, "loss": 0.1943, "step": 3340 }, { "epoch": 1.1604018707777586, "grad_norm": 0.17946778025767848, "learning_rate": 3.849829042237123e-05, "loss": 0.1929, "step": 3350 }, { "epoch": 1.1638662740342975, "grad_norm": 0.17173672286352726, "learning_rate": 3.841336234029501e-05, "loss": 0.195, "step": 3360 }, { "epoch": 1.1673306772908367, "grad_norm": 0.17994745465860756, "learning_rate": 3.832821632872454e-05, "loss": 0.19, "step": 3370 }, { "epoch": 1.1707950805473757, "grad_norm": 0.1850088732651661, "learning_rate": 3.8242853771043566e-05, "loss": 0.1957, "step": 3380 }, { "epoch": 1.1742594838039149, "grad_norm": 0.15908666104639177, "learning_rate": 3.815727605415406e-05, "loss": 0.1915, "step": 3390 }, { "epoch": 1.1777238870604538, "grad_norm": 0.18156692156906576, "learning_rate": 3.807148456845378e-05, "loss": 0.188, "step": 3400 }, { "epoch": 1.1811882903169928, "grad_norm": 0.17113960858415064, "learning_rate": 3.798548070781357e-05, "loss": 0.1893, "step": 3410 }, { "epoch": 1.184652693573532, "grad_norm": 0.18236211522160944, "learning_rate": 3.789926586955484e-05, "loss": 0.1859, "step": 3420 }, { "epoch": 1.188117096830071, "grad_norm": 0.16797194145110902, "learning_rate": 3.7812841454426715e-05, "loss": 0.1901, "step": 3430 }, { "epoch": 1.1915815000866101, "grad_norm": 0.1988755262539599, "learning_rate": 3.772620886658342e-05, "loss": 0.1942, "step": 3440 }, { "epoch": 1.195045903343149, "grad_norm": 0.18276504002195174, "learning_rate": 3.7639369513561374e-05, "loss": 0.1901, "step": 3450 }, { "epoch": 1.1985103065996883, "grad_norm": 0.19420509251762302, "learning_rate": 3.7552324806256356e-05, "loss": 0.1893, "step": 3460 }, { "epoch": 1.2019747098562272, "grad_norm": 0.1856528154175482, "learning_rate": 3.7465076158900565e-05, "loss": 0.1926, "step": 3470 }, { "epoch": 1.2054391131127664, "grad_norm": 0.15954596519502212, "learning_rate": 3.737762498903967e-05, "loss": 0.1928, "step": 3480 }, { "epoch": 1.2089035163693054, "grad_norm": 0.16035794848815701, "learning_rate": 3.728997271750975e-05, "loss": 0.1911, "step": 3490 }, { "epoch": 1.2123679196258443, "grad_norm": 0.1860891805383301, "learning_rate": 3.720212076841424e-05, "loss": 0.1906, "step": 3500 }, { "epoch": 1.2158323228823835, "grad_norm": 0.18968549769475154, "learning_rate": 3.7114070569100745e-05, "loss": 0.1915, "step": 3510 }, { "epoch": 1.2192967261389225, "grad_norm": 0.18632674935712812, "learning_rate": 3.70258235501379e-05, "loss": 0.2005, "step": 3520 }, { "epoch": 1.2227611293954617, "grad_norm": 0.17630114385410808, "learning_rate": 3.693738114529211e-05, "loss": 0.1932, "step": 3530 }, { "epoch": 1.2262255326520006, "grad_norm": 0.17065976632694624, "learning_rate": 3.6848744791504244e-05, "loss": 0.1924, "step": 3540 }, { "epoch": 1.2296899359085398, "grad_norm": 0.16123026277843103, "learning_rate": 3.675991592886629e-05, "loss": 0.1921, "step": 3550 }, { "epoch": 1.2331543391650788, "grad_norm": 0.15903672080165102, "learning_rate": 3.667089600059799e-05, "loss": 0.1872, "step": 3560 }, { "epoch": 1.236618742421618, "grad_norm": 0.17243581096797103, "learning_rate": 3.658168645302333e-05, "loss": 0.1933, "step": 3570 }, { "epoch": 1.240083145678157, "grad_norm": 0.18100700093073868, "learning_rate": 3.6492288735547104e-05, "loss": 0.1951, "step": 3580 }, { "epoch": 1.2435475489346959, "grad_norm": 0.17490828663332014, "learning_rate": 3.640270430063133e-05, "loss": 0.1914, "step": 3590 }, { "epoch": 1.247011952191235, "grad_norm": 0.15940099515313444, "learning_rate": 3.6312934603771674e-05, "loss": 0.1894, "step": 3600 }, { "epoch": 1.2504763554477742, "grad_norm": 0.18052826346513526, "learning_rate": 3.622298110347377e-05, "loss": 0.1891, "step": 3610 }, { "epoch": 1.2539407587043132, "grad_norm": 0.16612752065480388, "learning_rate": 3.613284526122954e-05, "loss": 0.1908, "step": 3620 }, { "epoch": 1.2574051619608522, "grad_norm": 0.18743922356428885, "learning_rate": 3.604252854149347e-05, "loss": 0.1883, "step": 3630 }, { "epoch": 1.2608695652173914, "grad_norm": 0.181478054238508, "learning_rate": 3.595203241165878e-05, "loss": 0.1878, "step": 3640 }, { "epoch": 1.2643339684739303, "grad_norm": 0.1804892389398887, "learning_rate": 3.586135834203362e-05, "loss": 0.1893, "step": 3650 }, { "epoch": 1.2677983717304695, "grad_norm": 0.1741226930201246, "learning_rate": 3.5770507805817135e-05, "loss": 0.1912, "step": 3660 }, { "epoch": 1.2712627749870085, "grad_norm": 0.16334377723582452, "learning_rate": 3.5679482279075584e-05, "loss": 0.1892, "step": 3670 }, { "epoch": 1.2747271782435474, "grad_norm": 0.17047126134525575, "learning_rate": 3.558828324071831e-05, "loss": 0.1907, "step": 3680 }, { "epoch": 1.2781915815000866, "grad_norm": 0.17298916476177462, "learning_rate": 3.549691217247375e-05, "loss": 0.1906, "step": 3690 }, { "epoch": 1.2816559847566258, "grad_norm": 0.15717399662492984, "learning_rate": 3.540537055886533e-05, "loss": 0.1934, "step": 3700 }, { "epoch": 1.2851203880131647, "grad_norm": 0.17584831578937637, "learning_rate": 3.531365988718736e-05, "loss": 0.1851, "step": 3710 }, { "epoch": 1.2885847912697037, "grad_norm": 0.16205306891044194, "learning_rate": 3.522178164748089e-05, "loss": 0.1861, "step": 3720 }, { "epoch": 1.292049194526243, "grad_norm": 0.15057670492815758, "learning_rate": 3.5129737332509456e-05, "loss": 0.1906, "step": 3730 }, { "epoch": 1.2955135977827819, "grad_norm": 0.15822993127615093, "learning_rate": 3.503752843773486e-05, "loss": 0.1838, "step": 3740 }, { "epoch": 1.298978001039321, "grad_norm": 0.15864785027235162, "learning_rate": 3.4945156461292854e-05, "loss": 0.1952, "step": 3750 }, { "epoch": 1.30244240429586, "grad_norm": 0.18227602995981887, "learning_rate": 3.485262290396883e-05, "loss": 0.1898, "step": 3760 }, { "epoch": 1.305906807552399, "grad_norm": 0.16200311831559636, "learning_rate": 3.475992926917341e-05, "loss": 0.1929, "step": 3770 }, { "epoch": 1.3093712108089381, "grad_norm": 0.15259911553775118, "learning_rate": 3.4667077062918e-05, "loss": 0.1872, "step": 3780 }, { "epoch": 1.3128356140654773, "grad_norm": 0.18125600690337332, "learning_rate": 3.457406779379039e-05, "loss": 0.1925, "step": 3790 }, { "epoch": 1.3163000173220163, "grad_norm": 0.192188947967991, "learning_rate": 3.448090297293016e-05, "loss": 0.1871, "step": 3800 }, { "epoch": 1.3197644205785553, "grad_norm": 0.17328021183720274, "learning_rate": 3.438758411400421e-05, "loss": 0.189, "step": 3810 }, { "epoch": 1.3232288238350944, "grad_norm": 0.16788815861788564, "learning_rate": 3.4294112733182084e-05, "loss": 0.1916, "step": 3820 }, { "epoch": 1.3266932270916334, "grad_norm": 0.15688398565420658, "learning_rate": 3.420049034911139e-05, "loss": 0.1882, "step": 3830 }, { "epoch": 1.3301576303481726, "grad_norm": 0.1714798078013055, "learning_rate": 3.410671848289315e-05, "loss": 0.1882, "step": 3840 }, { "epoch": 1.3336220336047115, "grad_norm": 0.16309203688542842, "learning_rate": 3.401279865805702e-05, "loss": 0.1914, "step": 3850 }, { "epoch": 1.3370864368612507, "grad_norm": 0.15748111773147097, "learning_rate": 3.391873240053656e-05, "loss": 0.1866, "step": 3860 }, { "epoch": 1.3405508401177897, "grad_norm": 0.1820518471519293, "learning_rate": 3.382452123864448e-05, "loss": 0.1865, "step": 3870 }, { "epoch": 1.3440152433743289, "grad_norm": 0.15772861765330992, "learning_rate": 3.373016670304774e-05, "loss": 0.1928, "step": 3880 }, { "epoch": 1.3474796466308678, "grad_norm": 0.15938918092133955, "learning_rate": 3.3635670326742755e-05, "loss": 0.1913, "step": 3890 }, { "epoch": 1.3509440498874068, "grad_norm": 0.1721339531229463, "learning_rate": 3.354103364503045e-05, "loss": 0.1907, "step": 3900 }, { "epoch": 1.354408453143946, "grad_norm": 0.15410885348307193, "learning_rate": 3.3446258195491305e-05, "loss": 0.1879, "step": 3910 }, { "epoch": 1.357872856400485, "grad_norm": 0.16165063244129996, "learning_rate": 3.3351345517960386e-05, "loss": 0.1898, "step": 3920 }, { "epoch": 1.3613372596570241, "grad_norm": 0.16569690135925177, "learning_rate": 3.325629715450235e-05, "loss": 0.1873, "step": 3930 }, { "epoch": 1.364801662913563, "grad_norm": 0.16487442057352042, "learning_rate": 3.3161114649386335e-05, "loss": 0.1888, "step": 3940 }, { "epoch": 1.3682660661701023, "grad_norm": 0.17731395997046875, "learning_rate": 3.306579954906095e-05, "loss": 0.1881, "step": 3950 }, { "epoch": 1.3717304694266412, "grad_norm": 0.18039964361054459, "learning_rate": 3.2970353402129065e-05, "loss": 0.1873, "step": 3960 }, { "epoch": 1.3751948726831804, "grad_norm": 0.17793927736310747, "learning_rate": 3.287477775932271e-05, "loss": 0.1912, "step": 3970 }, { "epoch": 1.3786592759397194, "grad_norm": 0.1790070950599728, "learning_rate": 3.2779074173477845e-05, "loss": 0.1837, "step": 3980 }, { "epoch": 1.3821236791962583, "grad_norm": 0.16536456635439983, "learning_rate": 3.2683244199509164e-05, "loss": 0.1822, "step": 3990 }, { "epoch": 1.3855880824527975, "grad_norm": 0.17630304243331663, "learning_rate": 3.258728939438479e-05, "loss": 0.1875, "step": 4000 }, { "epoch": 1.3890524857093367, "grad_norm": 0.17420428081742365, "learning_rate": 3.249121131710102e-05, "loss": 0.1883, "step": 4010 }, { "epoch": 1.3925168889658757, "grad_norm": 0.17543770138618733, "learning_rate": 3.239501152865698e-05, "loss": 0.1871, "step": 4020 }, { "epoch": 1.3959812922224146, "grad_norm": 0.1592954498035186, "learning_rate": 3.229869159202925e-05, "loss": 0.1845, "step": 4030 }, { "epoch": 1.3994456954789538, "grad_norm": 0.1969202150910442, "learning_rate": 3.2202253072146485e-05, "loss": 0.1899, "step": 4040 }, { "epoch": 1.4029100987354928, "grad_norm": 0.16611324585892465, "learning_rate": 3.2105697535863974e-05, "loss": 0.1884, "step": 4050 }, { "epoch": 1.406374501992032, "grad_norm": 0.16159007802640257, "learning_rate": 3.200902655193822e-05, "loss": 0.185, "step": 4060 }, { "epoch": 1.409838905248571, "grad_norm": 0.1580252831987218, "learning_rate": 3.1912241691001396e-05, "loss": 0.1883, "step": 4070 }, { "epoch": 1.4133033085051099, "grad_norm": 0.17023457596293415, "learning_rate": 3.181534452553589e-05, "loss": 0.1864, "step": 4080 }, { "epoch": 1.416767711761649, "grad_norm": 0.16337679260042998, "learning_rate": 3.1718336629848674e-05, "loss": 0.1877, "step": 4090 }, { "epoch": 1.4202321150181882, "grad_norm": 0.1899260224945417, "learning_rate": 3.162121958004584e-05, "loss": 0.188, "step": 4100 }, { "epoch": 1.4236965182747272, "grad_norm": 0.1901828231396822, "learning_rate": 3.1523994954006875e-05, "loss": 0.1868, "step": 4110 }, { "epoch": 1.4271609215312662, "grad_norm": 0.1791821381475306, "learning_rate": 3.142666433135911e-05, "loss": 0.1889, "step": 4120 }, { "epoch": 1.4306253247878054, "grad_norm": 0.16547869947285684, "learning_rate": 3.132922929345199e-05, "loss": 0.1884, "step": 4130 }, { "epoch": 1.4340897280443443, "grad_norm": 0.1549076050063216, "learning_rate": 3.123169142333145e-05, "loss": 0.1884, "step": 4140 }, { "epoch": 1.4375541313008835, "grad_norm": 0.15295140029133414, "learning_rate": 3.1134052305714146e-05, "loss": 0.1852, "step": 4150 }, { "epoch": 1.4410185345574225, "grad_norm": 0.16777477030335766, "learning_rate": 3.1036313526961716e-05, "loss": 0.1849, "step": 4160 }, { "epoch": 1.4444829378139614, "grad_norm": 0.14671702288800448, "learning_rate": 3.093847667505502e-05, "loss": 0.1918, "step": 4170 }, { "epoch": 1.4479473410705006, "grad_norm": 0.15525539418252593, "learning_rate": 3.084054333956833e-05, "loss": 0.1866, "step": 4180 }, { "epoch": 1.4514117443270398, "grad_norm": 0.16462043975916238, "learning_rate": 3.0742515111643496e-05, "loss": 0.1863, "step": 4190 }, { "epoch": 1.4548761475835787, "grad_norm": 0.15447538862682672, "learning_rate": 3.064439358396412e-05, "loss": 0.1871, "step": 4200 }, { "epoch": 1.4583405508401177, "grad_norm": 0.19391772756705172, "learning_rate": 3.0546180350729646e-05, "loss": 0.1907, "step": 4210 }, { "epoch": 1.461804954096657, "grad_norm": 0.16839531563433688, "learning_rate": 3.0447877007629494e-05, "loss": 0.1872, "step": 4220 }, { "epoch": 1.4652693573531959, "grad_norm": 0.1667638250714929, "learning_rate": 3.0349485151817104e-05, "loss": 0.1891, "step": 4230 }, { "epoch": 1.468733760609735, "grad_norm": 0.15657114860167654, "learning_rate": 3.0251006381884e-05, "loss": 0.1888, "step": 4240 }, { "epoch": 1.472198163866274, "grad_norm": 0.1616644177415703, "learning_rate": 3.0152442297833817e-05, "loss": 0.1866, "step": 4250 }, { "epoch": 1.475662567122813, "grad_norm": 0.16936489968998167, "learning_rate": 3.005379450105631e-05, "loss": 0.1861, "step": 4260 }, { "epoch": 1.4791269703793521, "grad_norm": 0.17783057212322842, "learning_rate": 2.995506459430133e-05, "loss": 0.1921, "step": 4270 }, { "epoch": 1.4825913736358913, "grad_norm": 0.18253028978511793, "learning_rate": 2.9856254181652777e-05, "loss": 0.1931, "step": 4280 }, { "epoch": 1.4860557768924303, "grad_norm": 0.17108679951308503, "learning_rate": 2.9757364868502558e-05, "loss": 0.187, "step": 4290 }, { "epoch": 1.4895201801489693, "grad_norm": 0.17127968110442396, "learning_rate": 2.9658398261524477e-05, "loss": 0.1856, "step": 4300 }, { "epoch": 1.4929845834055084, "grad_norm": 0.15243800455802872, "learning_rate": 2.9559355968648163e-05, "loss": 0.1878, "step": 4310 }, { "epoch": 1.4964489866620474, "grad_norm": 0.16208265788754178, "learning_rate": 2.9460239599032898e-05, "loss": 0.1831, "step": 4320 }, { "epoch": 1.4999133899185866, "grad_norm": 0.17721043816157045, "learning_rate": 2.9361050763041552e-05, "loss": 0.1855, "step": 4330 }, { "epoch": 1.5033777931751255, "grad_norm": 0.1624251601306164, "learning_rate": 2.926179107221433e-05, "loss": 0.1905, "step": 4340 }, { "epoch": 1.5068421964316645, "grad_norm": 0.17242171402814246, "learning_rate": 2.916246213924263e-05, "loss": 0.1892, "step": 4350 }, { "epoch": 1.5103065996882037, "grad_norm": 0.18060628796516595, "learning_rate": 2.9063065577942873e-05, "loss": 0.1868, "step": 4360 }, { "epoch": 1.5137710029447429, "grad_norm": 0.16420679560340506, "learning_rate": 2.896360300323022e-05, "loss": 0.189, "step": 4370 }, { "epoch": 1.5172354062012818, "grad_norm": 0.1657190366825487, "learning_rate": 2.8864076031092375e-05, "loss": 0.1862, "step": 4380 }, { "epoch": 1.5206998094578208, "grad_norm": 0.14827671598571107, "learning_rate": 2.8764486278563313e-05, "loss": 0.187, "step": 4390 }, { "epoch": 1.52416421271436, "grad_norm": 0.1696723630570198, "learning_rate": 2.8664835363697028e-05, "loss": 0.1878, "step": 4400 }, { "epoch": 1.5276286159708992, "grad_norm": 0.16415531283137735, "learning_rate": 2.8565124905541224e-05, "loss": 0.1884, "step": 4410 }, { "epoch": 1.5310930192274381, "grad_norm": 0.1711666868251127, "learning_rate": 2.8465356524111014e-05, "loss": 0.1885, "step": 4420 }, { "epoch": 1.534557422483977, "grad_norm": 0.15183248647521377, "learning_rate": 2.8365531840362586e-05, "loss": 0.185, "step": 4430 }, { "epoch": 1.538021825740516, "grad_norm": 0.14509942126397887, "learning_rate": 2.826565247616692e-05, "loss": 0.1882, "step": 4440 }, { "epoch": 1.5414862289970552, "grad_norm": 0.1638920162743926, "learning_rate": 2.816572005428337e-05, "loss": 0.1868, "step": 4450 }, { "epoch": 1.5449506322535944, "grad_norm": 0.15578153131291148, "learning_rate": 2.8065736198333337e-05, "loss": 0.1811, "step": 4460 }, { "epoch": 1.5484150355101334, "grad_norm": 0.16655001845904682, "learning_rate": 2.796570253277389e-05, "loss": 0.1867, "step": 4470 }, { "epoch": 1.5518794387666723, "grad_norm": 0.16373974199509916, "learning_rate": 2.786562068287134e-05, "loss": 0.1841, "step": 4480 }, { "epoch": 1.5553438420232115, "grad_norm": 0.17150797676546983, "learning_rate": 2.7765492274674887e-05, "loss": 0.1885, "step": 4490 }, { "epoch": 1.5588082452797507, "grad_norm": 0.1477537468546959, "learning_rate": 2.7665318934990153e-05, "loss": 0.1836, "step": 4500 }, { "epoch": 1.5622726485362897, "grad_norm": 0.1553596996678754, "learning_rate": 2.7565102291352785e-05, "loss": 0.1828, "step": 4510 }, { "epoch": 1.5657370517928286, "grad_norm": 0.1653908672889608, "learning_rate": 2.7464843972001985e-05, "loss": 0.1841, "step": 4520 }, { "epoch": 1.5692014550493676, "grad_norm": 0.16834209700469813, "learning_rate": 2.7364545605854077e-05, "loss": 0.1868, "step": 4530 }, { "epoch": 1.5726658583059068, "grad_norm": 0.16296690623854307, "learning_rate": 2.7264208822476016e-05, "loss": 0.1871, "step": 4540 }, { "epoch": 1.576130261562446, "grad_norm": 0.1465435846295834, "learning_rate": 2.716383525205896e-05, "loss": 0.1845, "step": 4550 }, { "epoch": 1.579594664818985, "grad_norm": 0.16909003341569917, "learning_rate": 2.7063426525391732e-05, "loss": 0.1818, "step": 4560 }, { "epoch": 1.5830590680755239, "grad_norm": 0.16069669006075715, "learning_rate": 2.6962984273834346e-05, "loss": 0.1844, "step": 4570 }, { "epoch": 1.586523471332063, "grad_norm": 0.1750416978664114, "learning_rate": 2.686251012929151e-05, "loss": 0.1876, "step": 4580 }, { "epoch": 1.5899878745886022, "grad_norm": 0.16259236460228596, "learning_rate": 2.6762005724186084e-05, "loss": 0.1817, "step": 4590 }, { "epoch": 1.5934522778451412, "grad_norm": 0.14258206554047775, "learning_rate": 2.6661472691432614e-05, "loss": 0.1908, "step": 4600 }, { "epoch": 1.5969166811016802, "grad_norm": 0.15573453304245355, "learning_rate": 2.6560912664410724e-05, "loss": 0.1847, "step": 4610 }, { "epoch": 1.6003810843582191, "grad_norm": 0.1598455887435374, "learning_rate": 2.646032727693864e-05, "loss": 0.1883, "step": 4620 }, { "epoch": 1.6038454876147583, "grad_norm": 0.16653009190610318, "learning_rate": 2.6359718163246627e-05, "loss": 0.1817, "step": 4630 }, { "epoch": 1.6073098908712975, "grad_norm": 0.16139062116102534, "learning_rate": 2.6259086957950434e-05, "loss": 0.186, "step": 4640 }, { "epoch": 1.6107742941278365, "grad_norm": 0.1408890193988467, "learning_rate": 2.615843529602472e-05, "loss": 0.1883, "step": 4650 }, { "epoch": 1.6142386973843754, "grad_norm": 0.15912928421327938, "learning_rate": 2.6057764812776524e-05, "loss": 0.1818, "step": 4660 }, { "epoch": 1.6177031006409146, "grad_norm": 0.1531498559224487, "learning_rate": 2.595707714381867e-05, "loss": 0.1815, "step": 4670 }, { "epoch": 1.6211675038974538, "grad_norm": 0.15818993699570458, "learning_rate": 2.585637392504321e-05, "loss": 0.189, "step": 4680 }, { "epoch": 1.6246319071539927, "grad_norm": 0.1550856346527227, "learning_rate": 2.575565679259483e-05, "loss": 0.1851, "step": 4690 }, { "epoch": 1.6280963104105317, "grad_norm": 0.17234204432744546, "learning_rate": 2.5654927382844274e-05, "loss": 0.1856, "step": 4700 }, { "epoch": 1.631560713667071, "grad_norm": 0.14720447626557742, "learning_rate": 2.555418733236176e-05, "loss": 0.1849, "step": 4710 }, { "epoch": 1.6350251169236099, "grad_norm": 0.15155258081793999, "learning_rate": 2.545343827789039e-05, "loss": 0.185, "step": 4720 }, { "epoch": 1.638489520180149, "grad_norm": 0.1585948164278231, "learning_rate": 2.5352681856319556e-05, "loss": 0.1818, "step": 4730 }, { "epoch": 1.641953923436688, "grad_norm": 0.16020278019866965, "learning_rate": 2.5251919704658323e-05, "loss": 0.1847, "step": 4740 }, { "epoch": 1.645418326693227, "grad_norm": 0.1619196729529339, "learning_rate": 2.5151153460008898e-05, "loss": 0.1851, "step": 4750 }, { "epoch": 1.6488827299497661, "grad_norm": 0.17892607432188418, "learning_rate": 2.5050384759539946e-05, "loss": 0.1884, "step": 4760 }, { "epoch": 1.6523471332063053, "grad_norm": 0.1453101210669921, "learning_rate": 2.4949615240460053e-05, "loss": 0.1829, "step": 4770 }, { "epoch": 1.6558115364628443, "grad_norm": 0.15218576172212772, "learning_rate": 2.4848846539991108e-05, "loss": 0.1815, "step": 4780 }, { "epoch": 1.6592759397193833, "grad_norm": 0.14916349890801867, "learning_rate": 2.474808029534168e-05, "loss": 0.1831, "step": 4790 }, { "epoch": 1.6627403429759224, "grad_norm": 0.16152070439341162, "learning_rate": 2.464731814368045e-05, "loss": 0.1845, "step": 4800 }, { "epoch": 1.6662047462324616, "grad_norm": 0.2549593516440478, "learning_rate": 2.4546561722109614e-05, "loss": 0.1821, "step": 4810 }, { "epoch": 1.6696691494890006, "grad_norm": 0.1423882724472122, "learning_rate": 2.4445812667638242e-05, "loss": 0.1824, "step": 4820 }, { "epoch": 1.6731335527455395, "grad_norm": 0.16756009716869377, "learning_rate": 2.4345072617155732e-05, "loss": 0.1861, "step": 4830 }, { "epoch": 1.6765979560020785, "grad_norm": 0.16461215951788935, "learning_rate": 2.424434320740518e-05, "loss": 0.1832, "step": 4840 }, { "epoch": 1.6800623592586177, "grad_norm": 0.14517098753523155, "learning_rate": 2.4143626074956796e-05, "loss": 0.1785, "step": 4850 }, { "epoch": 1.6835267625151569, "grad_norm": 0.15108292412387223, "learning_rate": 2.4042922856181337e-05, "loss": 0.1827, "step": 4860 }, { "epoch": 1.6869911657716958, "grad_norm": 0.15525515390004765, "learning_rate": 2.394223518722348e-05, "loss": 0.1838, "step": 4870 }, { "epoch": 1.6904555690282348, "grad_norm": 0.15512655551886048, "learning_rate": 2.3841564703975287e-05, "loss": 0.1812, "step": 4880 }, { "epoch": 1.693919972284774, "grad_norm": 0.16187683902086444, "learning_rate": 2.374091304204958e-05, "loss": 0.1832, "step": 4890 }, { "epoch": 1.6973843755413132, "grad_norm": 0.15579000956703476, "learning_rate": 2.364028183675337e-05, "loss": 0.1806, "step": 4900 }, { "epoch": 1.7008487787978521, "grad_norm": 0.15600046355547195, "learning_rate": 2.353967272306137e-05, "loss": 0.1844, "step": 4910 }, { "epoch": 1.704313182054391, "grad_norm": 0.15757986710095487, "learning_rate": 2.3439087335589285e-05, "loss": 0.1841, "step": 4920 }, { "epoch": 1.70777758531093, "grad_norm": 0.1452879429548844, "learning_rate": 2.333852730856739e-05, "loss": 0.1846, "step": 4930 }, { "epoch": 1.7112419885674692, "grad_norm": 0.14279806262433956, "learning_rate": 2.3237994275813918e-05, "loss": 0.1846, "step": 4940 }, { "epoch": 1.7147063918240084, "grad_norm": 0.168480732012115, "learning_rate": 2.3137489870708494e-05, "loss": 0.1854, "step": 4950 }, { "epoch": 1.7181707950805474, "grad_norm": 0.14771704725144452, "learning_rate": 2.303701572616566e-05, "loss": 0.1781, "step": 4960 }, { "epoch": 1.7216351983370863, "grad_norm": 0.15399297657379268, "learning_rate": 2.2936573474608274e-05, "loss": 0.1851, "step": 4970 }, { "epoch": 1.7250996015936255, "grad_norm": 0.14268904873654373, "learning_rate": 2.283616474794104e-05, "loss": 0.1793, "step": 4980 }, { "epoch": 1.7285640048501647, "grad_norm": 0.1458152221590982, "learning_rate": 2.273579117752399e-05, "loss": 0.18, "step": 4990 }, { "epoch": 1.7320284081067037, "grad_norm": 0.157752398066733, "learning_rate": 2.2635454394145926e-05, "loss": 0.1804, "step": 5000 }, { "epoch": 1.7354928113632426, "grad_norm": 0.16354623197521176, "learning_rate": 2.253515602799802e-05, "loss": 0.18, "step": 5010 }, { "epoch": 1.7389572146197816, "grad_norm": 0.14987547413644828, "learning_rate": 2.2434897708647225e-05, "loss": 0.1884, "step": 5020 }, { "epoch": 1.7424216178763208, "grad_norm": 0.13264921207181166, "learning_rate": 2.233468106500985e-05, "loss": 0.1817, "step": 5030 }, { "epoch": 1.74588602113286, "grad_norm": 0.16074846687523298, "learning_rate": 2.2234507725325115e-05, "loss": 0.1821, "step": 5040 }, { "epoch": 1.749350424389399, "grad_norm": 0.14912108468148355, "learning_rate": 2.2134379317128666e-05, "loss": 0.1831, "step": 5050 }, { "epoch": 1.7528148276459379, "grad_norm": 0.15108125406874282, "learning_rate": 2.2034297467226117e-05, "loss": 0.1849, "step": 5060 }, { "epoch": 1.756279230902477, "grad_norm": 0.1841322575834389, "learning_rate": 2.193426380166667e-05, "loss": 0.1814, "step": 5070 }, { "epoch": 1.7597436341590162, "grad_norm": 0.15923619065201655, "learning_rate": 2.183427994571663e-05, "loss": 0.1826, "step": 5080 }, { "epoch": 1.7632080374155552, "grad_norm": 0.15869671284509257, "learning_rate": 2.1734347523833088e-05, "loss": 0.1825, "step": 5090 }, { "epoch": 1.7666724406720942, "grad_norm": 0.15221224358826752, "learning_rate": 2.163446815963742e-05, "loss": 0.182, "step": 5100 }, { "epoch": 1.7701368439286331, "grad_norm": 0.1460640105447675, "learning_rate": 2.1534643475888995e-05, "loss": 0.1823, "step": 5110 }, { "epoch": 1.7736012471851723, "grad_norm": 0.1521505743886422, "learning_rate": 2.1434875094458785e-05, "loss": 0.1874, "step": 5120 }, { "epoch": 1.7770656504417115, "grad_norm": 0.14416303035440375, "learning_rate": 2.133516463630297e-05, "loss": 0.1808, "step": 5130 }, { "epoch": 1.7805300536982505, "grad_norm": 0.1586845052440452, "learning_rate": 2.1235513721436693e-05, "loss": 0.1841, "step": 5140 }, { "epoch": 1.7839944569547894, "grad_norm": 0.14468498059776927, "learning_rate": 2.113592396890764e-05, "loss": 0.181, "step": 5150 }, { "epoch": 1.7874588602113286, "grad_norm": 0.14200788749614593, "learning_rate": 2.1036396996769785e-05, "loss": 0.1806, "step": 5160 }, { "epoch": 1.7909232634678678, "grad_norm": 0.14145049327765216, "learning_rate": 2.093693442205713e-05, "loss": 0.1844, "step": 5170 }, { "epoch": 1.7943876667244067, "grad_norm": 0.14047484844001326, "learning_rate": 2.0837537860757378e-05, "loss": 0.1856, "step": 5180 }, { "epoch": 1.7978520699809457, "grad_norm": 0.15798548962919395, "learning_rate": 2.073820892778568e-05, "loss": 0.1781, "step": 5190 }, { "epoch": 1.801316473237485, "grad_norm": 0.15599305625176324, "learning_rate": 2.063894923695846e-05, "loss": 0.1846, "step": 5200 }, { "epoch": 1.8047808764940239, "grad_norm": 0.15451286542161505, "learning_rate": 2.0539760400967105e-05, "loss": 0.1814, "step": 5210 }, { "epoch": 1.808245279750563, "grad_norm": 0.15151680436099774, "learning_rate": 2.0440644031351846e-05, "loss": 0.187, "step": 5220 }, { "epoch": 1.811709683007102, "grad_norm": 0.16740847608877021, "learning_rate": 2.0341601738475532e-05, "loss": 0.1788, "step": 5230 }, { "epoch": 1.815174086263641, "grad_norm": 0.15255748452386272, "learning_rate": 2.0242635131497444e-05, "loss": 0.1799, "step": 5240 }, { "epoch": 1.8186384895201801, "grad_norm": 0.21370458116557772, "learning_rate": 2.0143745818347226e-05, "loss": 0.1859, "step": 5250 }, { "epoch": 1.8221028927767193, "grad_norm": 0.15178882291540086, "learning_rate": 2.004493540569867e-05, "loss": 0.1816, "step": 5260 }, { "epoch": 1.8255672960332583, "grad_norm": 0.1381809582876828, "learning_rate": 1.9946205498943693e-05, "loss": 0.1782, "step": 5270 }, { "epoch": 1.8290316992897973, "grad_norm": 0.15231248860338162, "learning_rate": 1.9847557702166185e-05, "loss": 0.182, "step": 5280 }, { "epoch": 1.8324961025463364, "grad_norm": 0.13540143942421462, "learning_rate": 1.9748993618116003e-05, "loss": 0.1802, "step": 5290 }, { "epoch": 1.8359605058028756, "grad_norm": 0.15057686984095897, "learning_rate": 1.9650514848182902e-05, "loss": 0.1845, "step": 5300 }, { "epoch": 1.8394249090594146, "grad_norm": 0.15657239979982757, "learning_rate": 1.9552122992370515e-05, "loss": 0.1816, "step": 5310 }, { "epoch": 1.8428893123159535, "grad_norm": 0.14942811146448692, "learning_rate": 1.9453819649270356e-05, "loss": 0.1881, "step": 5320 }, { "epoch": 1.8463537155724925, "grad_norm": 0.14206236299677405, "learning_rate": 1.9355606416035893e-05, "loss": 0.1798, "step": 5330 }, { "epoch": 1.8498181188290317, "grad_norm": 0.15512685603221288, "learning_rate": 1.925748488835651e-05, "loss": 0.1787, "step": 5340 }, { "epoch": 1.8532825220855709, "grad_norm": 0.14951858713327895, "learning_rate": 1.9159456660431675e-05, "loss": 0.1808, "step": 5350 }, { "epoch": 1.8567469253421098, "grad_norm": 0.1518891829482899, "learning_rate": 1.906152332494499e-05, "loss": 0.1847, "step": 5360 }, { "epoch": 1.8602113285986488, "grad_norm": 0.1599605482136836, "learning_rate": 1.8963686473038286e-05, "loss": 0.1758, "step": 5370 }, { "epoch": 1.863675731855188, "grad_norm": 0.14097789058459068, "learning_rate": 1.8865947694285863e-05, "loss": 0.1814, "step": 5380 }, { "epoch": 1.8671401351117272, "grad_norm": 0.13734664780231215, "learning_rate": 1.876830857666855e-05, "loss": 0.1813, "step": 5390 }, { "epoch": 1.8706045383682661, "grad_norm": 0.15463681734647824, "learning_rate": 1.867077070654802e-05, "loss": 0.1807, "step": 5400 }, { "epoch": 1.874068941624805, "grad_norm": 0.16737868012866194, "learning_rate": 1.85733356686409e-05, "loss": 0.1791, "step": 5410 }, { "epoch": 1.877533344881344, "grad_norm": 0.1422554567768386, "learning_rate": 1.847600504599312e-05, "loss": 0.1812, "step": 5420 }, { "epoch": 1.8809977481378832, "grad_norm": 0.1499775475287378, "learning_rate": 1.8378780419954168e-05, "loss": 0.1791, "step": 5430 }, { "epoch": 1.8844621513944224, "grad_norm": 0.1620107234869922, "learning_rate": 1.828166337015133e-05, "loss": 0.1798, "step": 5440 }, { "epoch": 1.8879265546509614, "grad_norm": 0.14943512022105485, "learning_rate": 1.8184655474464122e-05, "loss": 0.1769, "step": 5450 }, { "epoch": 1.8913909579075003, "grad_norm": 0.1636002811165978, "learning_rate": 1.8087758308998607e-05, "loss": 0.1828, "step": 5460 }, { "epoch": 1.8948553611640395, "grad_norm": 0.1488164351792101, "learning_rate": 1.7990973448061788e-05, "loss": 0.1793, "step": 5470 }, { "epoch": 1.8983197644205787, "grad_norm": 0.1495335119087624, "learning_rate": 1.7894302464136028e-05, "loss": 0.1804, "step": 5480 }, { "epoch": 1.9017841676771177, "grad_norm": 0.13881655138176263, "learning_rate": 1.7797746927853524e-05, "loss": 0.1808, "step": 5490 }, { "epoch": 1.9052485709336566, "grad_norm": 0.14655156873185612, "learning_rate": 1.770130840797075e-05, "loss": 0.1824, "step": 5500 }, { "epoch": 1.9087129741901956, "grad_norm": 0.1401693117484928, "learning_rate": 1.7604988471343026e-05, "loss": 0.1836, "step": 5510 }, { "epoch": 1.9121773774467348, "grad_norm": 0.13663044985397266, "learning_rate": 1.750878868289898e-05, "loss": 0.179, "step": 5520 }, { "epoch": 1.915641780703274, "grad_norm": 0.1440844464161243, "learning_rate": 1.741271060561522e-05, "loss": 0.1794, "step": 5530 }, { "epoch": 1.919106183959813, "grad_norm": 0.1579545402684227, "learning_rate": 1.731675580049085e-05, "loss": 0.1762, "step": 5540 }, { "epoch": 1.9225705872163519, "grad_norm": 0.14451711020259372, "learning_rate": 1.7220925826522158e-05, "loss": 0.179, "step": 5550 }, { "epoch": 1.926034990472891, "grad_norm": 0.16318851480604618, "learning_rate": 1.71252222406773e-05, "loss": 0.1794, "step": 5560 }, { "epoch": 1.9294993937294302, "grad_norm": 0.16754335602378248, "learning_rate": 1.7029646597870934e-05, "loss": 0.1809, "step": 5570 }, { "epoch": 1.9329637969859692, "grad_norm": 0.14425404689789675, "learning_rate": 1.693420045093905e-05, "loss": 0.181, "step": 5580 }, { "epoch": 1.9364282002425082, "grad_norm": 0.14950486896828924, "learning_rate": 1.6838885350613664e-05, "loss": 0.1834, "step": 5590 }, { "epoch": 1.9398926034990471, "grad_norm": 0.15401643917378996, "learning_rate": 1.674370284549765e-05, "loss": 0.18, "step": 5600 }, { "epoch": 1.9433570067555863, "grad_norm": 0.1457933918907984, "learning_rate": 1.6648654482039616e-05, "loss": 0.1798, "step": 5610 }, { "epoch": 1.9468214100121255, "grad_norm": 0.15104837558607415, "learning_rate": 1.6553741804508704e-05, "loss": 0.1808, "step": 5620 }, { "epoch": 1.9502858132686645, "grad_norm": 0.15234854679585655, "learning_rate": 1.6458966354969553e-05, "loss": 0.183, "step": 5630 }, { "epoch": 1.9537502165252034, "grad_norm": 0.15054697844231169, "learning_rate": 1.6364329673257244e-05, "loss": 0.1782, "step": 5640 }, { "epoch": 1.9572146197817426, "grad_norm": 0.19519516147127766, "learning_rate": 1.6269833296952267e-05, "loss": 0.1779, "step": 5650 }, { "epoch": 1.9606790230382818, "grad_norm": 0.14953949146669174, "learning_rate": 1.617547876135553e-05, "loss": 0.1817, "step": 5660 }, { "epoch": 1.9641434262948207, "grad_norm": 0.14098620367788547, "learning_rate": 1.6081267599463446e-05, "loss": 0.1795, "step": 5670 }, { "epoch": 1.9676078295513597, "grad_norm": 0.1516762253034418, "learning_rate": 1.598720134194298e-05, "loss": 0.175, "step": 5680 }, { "epoch": 1.971072232807899, "grad_norm": 0.14592317747183597, "learning_rate": 1.5893281517106852e-05, "loss": 0.1817, "step": 5690 }, { "epoch": 1.9745366360644379, "grad_norm": 0.1367156708009888, "learning_rate": 1.5799509650888605e-05, "loss": 0.1792, "step": 5700 }, { "epoch": 1.978001039320977, "grad_norm": 0.15222974333942974, "learning_rate": 1.5705887266817926e-05, "loss": 0.1838, "step": 5710 }, { "epoch": 1.981465442577516, "grad_norm": 0.15063980268483795, "learning_rate": 1.5612415885995803e-05, "loss": 0.1798, "step": 5720 }, { "epoch": 1.984929845834055, "grad_norm": 0.14361029580183732, "learning_rate": 1.551909702706984e-05, "loss": 0.1775, "step": 5730 }, { "epoch": 1.9883942490905941, "grad_norm": 0.14905006323407718, "learning_rate": 1.5425932206209617e-05, "loss": 0.1853, "step": 5740 }, { "epoch": 1.9918586523471333, "grad_norm": 0.1473397517873399, "learning_rate": 1.533292293708201e-05, "loss": 0.1857, "step": 5750 }, { "epoch": 1.9953230556036723, "grad_norm": 0.14626760437882622, "learning_rate": 1.52400707308266e-05, "loss": 0.1741, "step": 5760 }, { "epoch": 1.9987874588602113, "grad_norm": 0.1405339530408193, "learning_rate": 1.5147377096031173e-05, "loss": 0.1757, "step": 5770 }, { "epoch": 2.0020786419539234, "grad_norm": 0.17093438476339481, "learning_rate": 1.5054843538707147e-05, "loss": 0.1598, "step": 5780 }, { "epoch": 2.0055430452104623, "grad_norm": 0.15448133744376852, "learning_rate": 1.4962471562265151e-05, "loss": 0.1509, "step": 5790 }, { "epoch": 2.0090074484670017, "grad_norm": 0.15883932121601194, "learning_rate": 1.4870262667490553e-05, "loss": 0.1508, "step": 5800 }, { "epoch": 2.0124718517235407, "grad_norm": 0.15799483695764974, "learning_rate": 1.4778218352519113e-05, "loss": 0.1514, "step": 5810 }, { "epoch": 2.0159362549800797, "grad_norm": 0.1397582105470391, "learning_rate": 1.4686340112812644e-05, "loss": 0.1513, "step": 5820 }, { "epoch": 2.0194006582366186, "grad_norm": 0.13988381555476553, "learning_rate": 1.4594629441134674e-05, "loss": 0.1516, "step": 5830 }, { "epoch": 2.0228650614931576, "grad_norm": 0.15357440805298642, "learning_rate": 1.4503087827526257e-05, "loss": 0.1537, "step": 5840 }, { "epoch": 2.026329464749697, "grad_norm": 0.140105602913093, "learning_rate": 1.4411716759281701e-05, "loss": 0.1489, "step": 5850 }, { "epoch": 2.029793868006236, "grad_norm": 0.1371171850215917, "learning_rate": 1.4320517720924423e-05, "loss": 0.1478, "step": 5860 }, { "epoch": 2.033258271262775, "grad_norm": 0.1415040646234407, "learning_rate": 1.4229492194182864e-05, "loss": 0.1511, "step": 5870 }, { "epoch": 2.036722674519314, "grad_norm": 0.1455346789180852, "learning_rate": 1.4138641657966387e-05, "loss": 0.1541, "step": 5880 }, { "epoch": 2.0401870777758533, "grad_norm": 0.14764527095741217, "learning_rate": 1.4047967588341216e-05, "loss": 0.1528, "step": 5890 }, { "epoch": 2.0436514810323922, "grad_norm": 0.13949032357196248, "learning_rate": 1.3957471458506536e-05, "loss": 0.1539, "step": 5900 }, { "epoch": 2.047115884288931, "grad_norm": 0.13705465172650183, "learning_rate": 1.386715473877046e-05, "loss": 0.1457, "step": 5910 }, { "epoch": 2.05058028754547, "grad_norm": 0.12956466312791168, "learning_rate": 1.3777018896526236e-05, "loss": 0.1473, "step": 5920 }, { "epoch": 2.0540446908020096, "grad_norm": 0.15368302078148194, "learning_rate": 1.3687065396228332e-05, "loss": 0.1497, "step": 5930 }, { "epoch": 2.0575090940585485, "grad_norm": 0.1428450265623596, "learning_rate": 1.3597295699368668e-05, "loss": 0.1514, "step": 5940 }, { "epoch": 2.0609734973150875, "grad_norm": 0.1281258398579668, "learning_rate": 1.3507711264452905e-05, "loss": 0.1483, "step": 5950 }, { "epoch": 2.0644379005716265, "grad_norm": 0.15262843729207556, "learning_rate": 1.3418313546976676e-05, "loss": 0.1466, "step": 5960 }, { "epoch": 2.0679023038281654, "grad_norm": 0.14904369630104097, "learning_rate": 1.332910399940202e-05, "loss": 0.1454, "step": 5970 }, { "epoch": 2.071366707084705, "grad_norm": 0.1423441662005472, "learning_rate": 1.324008407113371e-05, "loss": 0.1501, "step": 5980 }, { "epoch": 2.074831110341244, "grad_norm": 0.1422975070772413, "learning_rate": 1.3151255208495755e-05, "loss": 0.154, "step": 5990 }, { "epoch": 2.0782955135977828, "grad_norm": 0.1352228664513086, "learning_rate": 1.306261885470789e-05, "loss": 0.1506, "step": 6000 }, { "epoch": 2.0817599168543217, "grad_norm": 0.13493941433636564, "learning_rate": 1.2974176449862101e-05, "loss": 0.148, "step": 6010 }, { "epoch": 2.085224320110861, "grad_norm": 0.13664044137951584, "learning_rate": 1.2885929430899258e-05, "loss": 0.1479, "step": 6020 }, { "epoch": 2.0886887233674, "grad_norm": 0.13427253141306963, "learning_rate": 1.279787923158577e-05, "loss": 0.1451, "step": 6030 }, { "epoch": 2.092153126623939, "grad_norm": 0.18500494498072673, "learning_rate": 1.2710027282490247e-05, "loss": 0.1504, "step": 6040 }, { "epoch": 2.095617529880478, "grad_norm": 0.1384525206350356, "learning_rate": 1.2622375010960335e-05, "loss": 0.1502, "step": 6050 }, { "epoch": 2.099081933137017, "grad_norm": 0.1417115574643469, "learning_rate": 1.2534923841099445e-05, "loss": 0.1522, "step": 6060 }, { "epoch": 2.1025463363935564, "grad_norm": 0.14117258627489582, "learning_rate": 1.2447675193743651e-05, "loss": 0.149, "step": 6070 }, { "epoch": 2.1060107396500953, "grad_norm": 0.13976875251148813, "learning_rate": 1.2360630486438635e-05, "loss": 0.1479, "step": 6080 }, { "epoch": 2.1094751429066343, "grad_norm": 0.13438983639128466, "learning_rate": 1.2273791133416584e-05, "loss": 0.1493, "step": 6090 }, { "epoch": 2.1129395461631733, "grad_norm": 0.12916426405356554, "learning_rate": 1.2187158545573295e-05, "loss": 0.1462, "step": 6100 }, { "epoch": 2.1164039494197127, "grad_norm": 0.14884278506870566, "learning_rate": 1.2100734130445173e-05, "loss": 0.1534, "step": 6110 }, { "epoch": 2.1198683526762516, "grad_norm": 0.14763268804961824, "learning_rate": 1.2014519292186428e-05, "loss": 0.1504, "step": 6120 }, { "epoch": 2.1233327559327906, "grad_norm": 0.14546584264873522, "learning_rate": 1.1928515431546233e-05, "loss": 0.1549, "step": 6130 }, { "epoch": 2.1267971591893295, "grad_norm": 0.13000521630439743, "learning_rate": 1.1842723945845948e-05, "loss": 0.1515, "step": 6140 }, { "epoch": 2.1302615624458685, "grad_norm": 0.1380689347039955, "learning_rate": 1.1757146228956445e-05, "loss": 0.1534, "step": 6150 }, { "epoch": 2.133725965702408, "grad_norm": 0.13885554681042553, "learning_rate": 1.1671783671275467e-05, "loss": 0.1477, "step": 6160 }, { "epoch": 2.137190368958947, "grad_norm": 0.1413517568795959, "learning_rate": 1.1586637659704994e-05, "loss": 0.1489, "step": 6170 }, { "epoch": 2.140654772215486, "grad_norm": 0.1445567017190447, "learning_rate": 1.1501709577628777e-05, "loss": 0.1527, "step": 6180 }, { "epoch": 2.144119175472025, "grad_norm": 0.13842122845453322, "learning_rate": 1.1417000804889793e-05, "loss": 0.149, "step": 6190 }, { "epoch": 2.147583578728564, "grad_norm": 0.1351342875816243, "learning_rate": 1.1332512717767862e-05, "loss": 0.1499, "step": 6200 }, { "epoch": 2.151047981985103, "grad_norm": 0.19203561762793064, "learning_rate": 1.1248246688957307e-05, "loss": 0.1516, "step": 6210 }, { "epoch": 2.154512385241642, "grad_norm": 0.13275502783261534, "learning_rate": 1.1164204087544589e-05, "loss": 0.1496, "step": 6220 }, { "epoch": 2.157976788498181, "grad_norm": 0.14124554657127336, "learning_rate": 1.108038627898613e-05, "loss": 0.1469, "step": 6230 }, { "epoch": 2.16144119175472, "grad_norm": 0.13662855483900213, "learning_rate": 1.0996794625086102e-05, "loss": 0.1533, "step": 6240 }, { "epoch": 2.1649055950112595, "grad_norm": 0.13136794147532813, "learning_rate": 1.091343048397426e-05, "loss": 0.1479, "step": 6250 }, { "epoch": 2.1683699982677984, "grad_norm": 0.14289580428302312, "learning_rate": 1.0830295210083968e-05, "loss": 0.1523, "step": 6260 }, { "epoch": 2.1718344015243374, "grad_norm": 0.14336144046136406, "learning_rate": 1.0747390154130097e-05, "loss": 0.1482, "step": 6270 }, { "epoch": 2.1752988047808763, "grad_norm": 0.12351594136318876, "learning_rate": 1.0664716663087132e-05, "loss": 0.148, "step": 6280 }, { "epoch": 2.1787632080374157, "grad_norm": 0.1263105200697639, "learning_rate": 1.0582276080167299e-05, "loss": 0.1488, "step": 6290 }, { "epoch": 2.1822276112939547, "grad_norm": 0.1409302104392171, "learning_rate": 1.0500069744798696e-05, "loss": 0.1493, "step": 6300 }, { "epoch": 2.1856920145504937, "grad_norm": 0.13878678763750232, "learning_rate": 1.0418098992603576e-05, "loss": 0.1467, "step": 6310 }, { "epoch": 2.1891564178070326, "grad_norm": 0.13130670045208906, "learning_rate": 1.033636515537661e-05, "loss": 0.15, "step": 6320 }, { "epoch": 2.1926208210635716, "grad_norm": 0.13037720388849533, "learning_rate": 1.0254869561063263e-05, "loss": 0.1514, "step": 6330 }, { "epoch": 2.196085224320111, "grad_norm": 0.1392467131479532, "learning_rate": 1.0173613533738238e-05, "loss": 0.1497, "step": 6340 }, { "epoch": 2.19954962757665, "grad_norm": 0.13583441746880653, "learning_rate": 1.0092598393583949e-05, "loss": 0.1533, "step": 6350 }, { "epoch": 2.203014030833189, "grad_norm": 0.138478347457145, "learning_rate": 1.001182545686904e-05, "loss": 0.1474, "step": 6360 }, { "epoch": 2.206478434089728, "grad_norm": 0.13196884066043799, "learning_rate": 9.931296035927068e-06, "loss": 0.1485, "step": 6370 }, { "epoch": 2.2099428373462673, "grad_norm": 0.12628759927748884, "learning_rate": 9.851011439135105e-06, "loss": 0.1512, "step": 6380 }, { "epoch": 2.2134072406028062, "grad_norm": 0.12720352197837026, "learning_rate": 9.770972970892553e-06, "loss": 0.1492, "step": 6390 }, { "epoch": 2.216871643859345, "grad_norm": 0.1288529643249653, "learning_rate": 9.691181931599886e-06, "loss": 0.1463, "step": 6400 }, { "epoch": 2.220336047115884, "grad_norm": 0.12600916062409626, "learning_rate": 9.611639617637558e-06, "loss": 0.1488, "step": 6410 }, { "epoch": 2.2238004503724236, "grad_norm": 0.14329418696142437, "learning_rate": 9.532347321344956e-06, "loss": 0.1521, "step": 6420 }, { "epoch": 2.2272648536289625, "grad_norm": 0.13819893815319562, "learning_rate": 9.453306330999349e-06, "loss": 0.1501, "step": 6430 }, { "epoch": 2.2307292568855015, "grad_norm": 0.13277309165484377, "learning_rate": 9.37451793079502e-06, "loss": 0.1515, "step": 6440 }, { "epoch": 2.2341936601420405, "grad_norm": 0.1304498393414478, "learning_rate": 9.29598340082236e-06, "loss": 0.1498, "step": 6450 }, { "epoch": 2.2376580633985794, "grad_norm": 0.13297679247054883, "learning_rate": 9.217704017047057e-06, "loss": 0.151, "step": 6460 }, { "epoch": 2.241122466655119, "grad_norm": 0.12871920957838523, "learning_rate": 9.139681051289425e-06, "loss": 0.1507, "step": 6470 }, { "epoch": 2.244586869911658, "grad_norm": 0.1377467169551894, "learning_rate": 9.061915771203695e-06, "loss": 0.1477, "step": 6480 }, { "epoch": 2.2480512731681968, "grad_norm": 0.14354881718167706, "learning_rate": 8.984409440257427e-06, "loss": 0.1496, "step": 6490 }, { "epoch": 2.2515156764247357, "grad_norm": 0.13146580616283166, "learning_rate": 8.907163317710976e-06, "loss": 0.1499, "step": 6500 }, { "epoch": 2.2549800796812747, "grad_norm": 0.14028927070593003, "learning_rate": 8.830178658597038e-06, "loss": 0.1482, "step": 6510 }, { "epoch": 2.258444482937814, "grad_norm": 0.13406176038514053, "learning_rate": 8.75345671370029e-06, "loss": 0.1513, "step": 6520 }, { "epoch": 2.261908886194353, "grad_norm": 0.1281531685378758, "learning_rate": 8.676998729537009e-06, "loss": 0.1487, "step": 6530 }, { "epoch": 2.265373289450892, "grad_norm": 0.1334136953967956, "learning_rate": 8.600805948334858e-06, "loss": 0.1461, "step": 6540 }, { "epoch": 2.268837692707431, "grad_norm": 0.13263104957255284, "learning_rate": 8.524879608012714e-06, "loss": 0.1494, "step": 6550 }, { "epoch": 2.2723020959639704, "grad_norm": 0.14074464264886974, "learning_rate": 8.449220942160512e-06, "loss": 0.1492, "step": 6560 }, { "epoch": 2.2757664992205093, "grad_norm": 0.13291686489725904, "learning_rate": 8.373831180019256e-06, "loss": 0.1468, "step": 6570 }, { "epoch": 2.2792309024770483, "grad_norm": 0.13968516954362226, "learning_rate": 8.298711546460986e-06, "loss": 0.148, "step": 6580 }, { "epoch": 2.2826953057335873, "grad_norm": 0.13617881495577386, "learning_rate": 8.223863261968945e-06, "loss": 0.1514, "step": 6590 }, { "epoch": 2.2861597089901267, "grad_norm": 0.13589641843226832, "learning_rate": 8.149287542617686e-06, "loss": 0.147, "step": 6600 }, { "epoch": 2.2896241122466656, "grad_norm": 0.13498192446917104, "learning_rate": 8.074985600053361e-06, "loss": 0.1559, "step": 6610 }, { "epoch": 2.2930885155032046, "grad_norm": 0.13109156975966305, "learning_rate": 8.000958641474021e-06, "loss": 0.1524, "step": 6620 }, { "epoch": 2.2965529187597435, "grad_norm": 0.1320540630523605, "learning_rate": 7.927207869609984e-06, "loss": 0.1493, "step": 6630 }, { "epoch": 2.3000173220162825, "grad_norm": 0.14127205971076925, "learning_rate": 7.853734482704309e-06, "loss": 0.1511, "step": 6640 }, { "epoch": 2.303481725272822, "grad_norm": 0.13466133744323935, "learning_rate": 7.780539674493345e-06, "loss": 0.1506, "step": 6650 }, { "epoch": 2.306946128529361, "grad_norm": 0.1286157583033884, "learning_rate": 7.707624634187308e-06, "loss": 0.1527, "step": 6660 }, { "epoch": 2.3104105317859, "grad_norm": 0.1380982881943608, "learning_rate": 7.63499054645096e-06, "loss": 0.1484, "step": 6670 }, { "epoch": 2.313874935042439, "grad_norm": 0.15156686175057582, "learning_rate": 7.562638591384396e-06, "loss": 0.1461, "step": 6680 }, { "epoch": 2.3173393382989778, "grad_norm": 0.12803039542374958, "learning_rate": 7.4905699445038255e-06, "loss": 0.1483, "step": 6690 }, { "epoch": 2.320803741555517, "grad_norm": 0.13309461688414237, "learning_rate": 7.418785776722514e-06, "loss": 0.151, "step": 6700 }, { "epoch": 2.324268144812056, "grad_norm": 0.12574985983373116, "learning_rate": 7.34728725433172e-06, "loss": 0.1466, "step": 6710 }, { "epoch": 2.327732548068595, "grad_norm": 0.13375589775606558, "learning_rate": 7.276075538981778e-06, "loss": 0.1511, "step": 6720 }, { "epoch": 2.3311969513251345, "grad_norm": 0.13021533231229976, "learning_rate": 7.205151787663222e-06, "loss": 0.1486, "step": 6730 }, { "epoch": 2.3346613545816735, "grad_norm": 0.1238991538662092, "learning_rate": 7.134517152687953e-06, "loss": 0.1467, "step": 6740 }, { "epoch": 2.3381257578382124, "grad_norm": 0.13319985151049268, "learning_rate": 7.064172781670569e-06, "loss": 0.1504, "step": 6750 }, { "epoch": 2.3415901610947514, "grad_norm": 0.13427753958581112, "learning_rate": 6.994119817509678e-06, "loss": 0.1454, "step": 6760 }, { "epoch": 2.3450545643512903, "grad_norm": 0.1329147728324323, "learning_rate": 6.924359398369342e-06, "loss": 0.1487, "step": 6770 }, { "epoch": 2.3485189676078297, "grad_norm": 0.11940132635278913, "learning_rate": 6.854892657660605e-06, "loss": 0.1476, "step": 6780 }, { "epoch": 2.3519833708643687, "grad_norm": 0.13086880472397885, "learning_rate": 6.785720724023042e-06, "loss": 0.1483, "step": 6790 }, { "epoch": 2.3554477741209077, "grad_norm": 0.12994412754505075, "learning_rate": 6.716844721306443e-06, "loss": 0.1496, "step": 6800 }, { "epoch": 2.3589121773774466, "grad_norm": 0.13309217588951705, "learning_rate": 6.648265768552569e-06, "loss": 0.1469, "step": 6810 }, { "epoch": 2.3623765806339856, "grad_norm": 0.1353366584822473, "learning_rate": 6.579984979976925e-06, "loss": 0.151, "step": 6820 }, { "epoch": 2.365840983890525, "grad_norm": 0.13072638016170673, "learning_rate": 6.512003464950706e-06, "loss": 0.1498, "step": 6830 }, { "epoch": 2.369305387147064, "grad_norm": 0.14086411924508258, "learning_rate": 6.444322327982752e-06, "loss": 0.1488, "step": 6840 }, { "epoch": 2.372769790403603, "grad_norm": 0.13022025324073322, "learning_rate": 6.376942668701586e-06, "loss": 0.1505, "step": 6850 }, { "epoch": 2.376234193660142, "grad_norm": 0.1270304545640671, "learning_rate": 6.309865581837584e-06, "loss": 0.1467, "step": 6860 }, { "epoch": 2.3796985969166813, "grad_norm": 0.13824906046376823, "learning_rate": 6.243092157205146e-06, "loss": 0.1479, "step": 6870 }, { "epoch": 2.3831630001732202, "grad_norm": 0.13322342141123356, "learning_rate": 6.1766234796850426e-06, "loss": 0.1532, "step": 6880 }, { "epoch": 2.386627403429759, "grad_norm": 0.13622336055549258, "learning_rate": 6.110460629206735e-06, "loss": 0.1494, "step": 6890 }, { "epoch": 2.390091806686298, "grad_norm": 0.13141145530513035, "learning_rate": 6.044604680730856e-06, "loss": 0.1478, "step": 6900 }, { "epoch": 2.3935562099428376, "grad_norm": 0.12753936723220544, "learning_rate": 5.979056704231759e-06, "loss": 0.1508, "step": 6910 }, { "epoch": 2.3970206131993765, "grad_norm": 0.1282678691982685, "learning_rate": 5.9138177646800934e-06, "loss": 0.1494, "step": 6920 }, { "epoch": 2.4004850164559155, "grad_norm": 0.12550541746949806, "learning_rate": 5.848888922025553e-06, "loss": 0.1474, "step": 6930 }, { "epoch": 2.4039494197124545, "grad_norm": 0.13234115800985993, "learning_rate": 5.7842712311796025e-06, "loss": 0.1463, "step": 6940 }, { "epoch": 2.4074138229689934, "grad_norm": 0.12467209107241967, "learning_rate": 5.719965741998368e-06, "loss": 0.1491, "step": 6950 }, { "epoch": 2.410878226225533, "grad_norm": 0.13573330603028008, "learning_rate": 5.655973499265582e-06, "loss": 0.1513, "step": 6960 }, { "epoch": 2.414342629482072, "grad_norm": 0.13240853708299855, "learning_rate": 5.59229554267561e-06, "loss": 0.1516, "step": 6970 }, { "epoch": 2.4178070327386108, "grad_norm": 0.1329534082302391, "learning_rate": 5.528932906816522e-06, "loss": 0.1517, "step": 6980 }, { "epoch": 2.4212714359951497, "grad_norm": 0.12911430987167502, "learning_rate": 5.465886621153346e-06, "loss": 0.1457, "step": 6990 }, { "epoch": 2.4247358392516887, "grad_norm": 0.1334215432640577, "learning_rate": 5.403157710011267e-06, "loss": 0.1534, "step": 7000 }, { "epoch": 2.428200242508228, "grad_norm": 0.12233095847822573, "learning_rate": 5.340747192559064e-06, "loss": 0.1443, "step": 7010 }, { "epoch": 2.431664645764767, "grad_norm": 0.12214722447020973, "learning_rate": 5.278656082792488e-06, "loss": 0.1506, "step": 7020 }, { "epoch": 2.435129049021306, "grad_norm": 0.1312038495892653, "learning_rate": 5.216885389517808e-06, "loss": 0.1494, "step": 7030 }, { "epoch": 2.438593452277845, "grad_norm": 0.12877996640447992, "learning_rate": 5.155436116335455e-06, "loss": 0.1498, "step": 7040 }, { "epoch": 2.4420578555343844, "grad_norm": 0.13785796047700058, "learning_rate": 5.094309261623642e-06, "loss": 0.1493, "step": 7050 }, { "epoch": 2.4455222587909233, "grad_norm": 0.12689388560349374, "learning_rate": 5.0335058185222245e-06, "loss": 0.148, "step": 7060 }, { "epoch": 2.4489866620474623, "grad_norm": 0.13349070966216434, "learning_rate": 4.973026774916504e-06, "loss": 0.1491, "step": 7070 }, { "epoch": 2.4524510653040013, "grad_norm": 0.13124780154477322, "learning_rate": 4.912873113421215e-06, "loss": 0.1472, "step": 7080 }, { "epoch": 2.4559154685605407, "grad_norm": 0.12289322814948568, "learning_rate": 4.853045811364532e-06, "loss": 0.151, "step": 7090 }, { "epoch": 2.4593798718170796, "grad_norm": 0.1231683131497469, "learning_rate": 4.793545840772221e-06, "loss": 0.1477, "step": 7100 }, { "epoch": 2.4628442750736186, "grad_norm": 0.12204021116853747, "learning_rate": 4.734374168351807e-06, "loss": 0.1464, "step": 7110 }, { "epoch": 2.4663086783301575, "grad_norm": 0.12705870894938584, "learning_rate": 4.675531755476922e-06, "loss": 0.1487, "step": 7120 }, { "epoch": 2.4697730815866965, "grad_norm": 0.13311385467368475, "learning_rate": 4.617019558171623e-06, "loss": 0.1518, "step": 7130 }, { "epoch": 2.473237484843236, "grad_norm": 0.12633200495803268, "learning_rate": 4.558838527094916e-06, "loss": 0.1487, "step": 7140 }, { "epoch": 2.476701888099775, "grad_norm": 0.12821618157907513, "learning_rate": 4.500989607525271e-06, "loss": 0.1513, "step": 7150 }, { "epoch": 2.480166291356314, "grad_norm": 0.12368299726159533, "learning_rate": 4.443473739345275e-06, "loss": 0.1486, "step": 7160 }, { "epoch": 2.483630694612853, "grad_norm": 0.13459364837403084, "learning_rate": 4.386291857026381e-06, "loss": 0.1507, "step": 7170 }, { "epoch": 2.4870950978693918, "grad_norm": 0.12921146609560327, "learning_rate": 4.329444889613687e-06, "loss": 0.1506, "step": 7180 }, { "epoch": 2.490559501125931, "grad_norm": 0.13011095251066665, "learning_rate": 4.272933760710893e-06, "loss": 0.1472, "step": 7190 }, { "epoch": 2.49402390438247, "grad_norm": 0.13394025522445424, "learning_rate": 4.2167593884652325e-06, "loss": 0.1482, "step": 7200 }, { "epoch": 2.497488307639009, "grad_norm": 0.12341718153206974, "learning_rate": 4.160922685552612e-06, "loss": 0.1492, "step": 7210 }, { "epoch": 2.5009527108955485, "grad_norm": 0.12332806421502722, "learning_rate": 4.105424559162754e-06, "loss": 0.151, "step": 7220 }, { "epoch": 2.5044171141520875, "grad_norm": 0.12451895183900567, "learning_rate": 4.05026591098446e-06, "loss": 0.1477, "step": 7230 }, { "epoch": 2.5078815174086264, "grad_norm": 0.12237447886665374, "learning_rate": 3.995447637190955e-06, "loss": 0.1434, "step": 7240 }, { "epoch": 2.5113459206651654, "grad_norm": 0.12468896676434474, "learning_rate": 3.940970628425353e-06, "loss": 0.1509, "step": 7250 }, { "epoch": 2.5148103239217043, "grad_norm": 0.12229284415257566, "learning_rate": 3.886835769786154e-06, "loss": 0.1508, "step": 7260 }, { "epoch": 2.5182747271782437, "grad_norm": 0.12284739761632071, "learning_rate": 3.833043940812889e-06, "loss": 0.1469, "step": 7270 }, { "epoch": 2.5217391304347827, "grad_norm": 0.12171858269842417, "learning_rate": 3.7795960154718175e-06, "loss": 0.148, "step": 7280 }, { "epoch": 2.5252035336913217, "grad_norm": 0.1279096545024547, "learning_rate": 3.726492862141717e-06, "loss": 0.1492, "step": 7290 }, { "epoch": 2.5286679369478606, "grad_norm": 0.1261825689932827, "learning_rate": 3.67373534359981e-06, "loss": 0.1485, "step": 7300 }, { "epoch": 2.5321323402043996, "grad_norm": 0.12731710139244515, "learning_rate": 3.621324317007704e-06, "loss": 0.1485, "step": 7310 }, { "epoch": 2.535596743460939, "grad_norm": 0.1224027049400114, "learning_rate": 3.569260633897495e-06, "loss": 0.1476, "step": 7320 }, { "epoch": 2.539061146717478, "grad_norm": 0.12095976657546359, "learning_rate": 3.517545140157927e-06, "loss": 0.1466, "step": 7330 }, { "epoch": 2.542525549974017, "grad_norm": 0.12165268564942827, "learning_rate": 3.466178676020626e-06, "loss": 0.1463, "step": 7340 }, { "epoch": 2.545989953230556, "grad_norm": 0.12226265951849115, "learning_rate": 3.415162076046488e-06, "loss": 0.1485, "step": 7350 }, { "epoch": 2.549454356487095, "grad_norm": 0.12637110596902743, "learning_rate": 3.364496169112083e-06, "loss": 0.1487, "step": 7360 }, { "epoch": 2.5529187597436342, "grad_norm": 0.12810226008624373, "learning_rate": 3.3141817783962e-06, "loss": 0.146, "step": 7370 }, { "epoch": 2.556383163000173, "grad_norm": 0.1252993130112932, "learning_rate": 3.264219721366496e-06, "loss": 0.1465, "step": 7380 }, { "epoch": 2.559847566256712, "grad_norm": 0.1217860788587577, "learning_rate": 3.2146108097661746e-06, "loss": 0.1465, "step": 7390 }, { "epoch": 2.5633119695132516, "grad_norm": 0.1306358146817434, "learning_rate": 3.165355849600829e-06, "loss": 0.1488, "step": 7400 }, { "epoch": 2.5667763727697905, "grad_norm": 0.1243046537823559, "learning_rate": 3.116455641125332e-06, "loss": 0.1457, "step": 7410 }, { "epoch": 2.5702407760263295, "grad_norm": 0.13155702663844496, "learning_rate": 3.0679109788308293e-06, "loss": 0.1507, "step": 7420 }, { "epoch": 2.5737051792828685, "grad_norm": 0.12565136055486759, "learning_rate": 3.0197226514318527e-06, "loss": 0.1447, "step": 7430 }, { "epoch": 2.5771695825394074, "grad_norm": 0.12227279079368177, "learning_rate": 2.9718914418534747e-06, "loss": 0.1481, "step": 7440 }, { "epoch": 2.580633985795947, "grad_norm": 0.12446579351502253, "learning_rate": 2.9244181272186257e-06, "loss": 0.1508, "step": 7450 }, { "epoch": 2.584098389052486, "grad_norm": 0.12176081031880796, "learning_rate": 2.8773034788354384e-06, "loss": 0.1516, "step": 7460 }, { "epoch": 2.5875627923090248, "grad_norm": 0.12394944777774412, "learning_rate": 2.8305482621847152e-06, "loss": 0.146, "step": 7470 }, { "epoch": 2.5910271955655637, "grad_norm": 0.12488028679932459, "learning_rate": 2.784153236907522e-06, "loss": 0.1473, "step": 7480 }, { "epoch": 2.5944915988221027, "grad_norm": 0.12228989346664527, "learning_rate": 2.7381191567928064e-06, "loss": 0.1481, "step": 7490 }, { "epoch": 2.597956002078642, "grad_norm": 0.12775499380593486, "learning_rate": 2.6924467697651778e-06, "loss": 0.1489, "step": 7500 }, { "epoch": 2.601420405335181, "grad_norm": 0.123666079771719, "learning_rate": 2.6471368178727583e-06, "loss": 0.1457, "step": 7510 }, { "epoch": 2.60488480859172, "grad_norm": 0.12267432282437238, "learning_rate": 2.6021900372750956e-06, "loss": 0.1457, "step": 7520 }, { "epoch": 2.6083492118482594, "grad_norm": 0.12433547335629408, "learning_rate": 2.5576071582312428e-06, "loss": 0.1461, "step": 7530 }, { "epoch": 2.611813615104798, "grad_norm": 0.12415914526721374, "learning_rate": 2.51338890508786e-06, "loss": 0.1503, "step": 7540 }, { "epoch": 2.6152780183613373, "grad_norm": 0.1228630751843417, "learning_rate": 2.4695359962674608e-06, "loss": 0.1478, "step": 7550 }, { "epoch": 2.6187424216178763, "grad_norm": 0.12239232127572298, "learning_rate": 2.4260491442567506e-06, "loss": 0.148, "step": 7560 }, { "epoch": 2.6222068248744153, "grad_norm": 0.13105355595668536, "learning_rate": 2.3829290555950264e-06, "loss": 0.1463, "step": 7570 }, { "epoch": 2.6256712281309547, "grad_norm": 0.12268630715908427, "learning_rate": 2.340176430862723e-06, "loss": 0.1458, "step": 7580 }, { "epoch": 2.6291356313874936, "grad_norm": 0.12235644638631886, "learning_rate": 2.2977919646700068e-06, "loss": 0.1455, "step": 7590 }, { "epoch": 2.6326000346440326, "grad_norm": 0.11798600614702796, "learning_rate": 2.255776345645494e-06, "loss": 0.142, "step": 7600 }, { "epoch": 2.6360644379005715, "grad_norm": 0.12407906704297328, "learning_rate": 2.2141302564250926e-06, "loss": 0.1502, "step": 7610 }, { "epoch": 2.6395288411571105, "grad_norm": 0.11753331905833096, "learning_rate": 2.17285437364087e-06, "loss": 0.1428, "step": 7620 }, { "epoch": 2.64299324441365, "grad_norm": 0.12007274689026741, "learning_rate": 2.131949367910077e-06, "loss": 0.1465, "step": 7630 }, { "epoch": 2.646457647670189, "grad_norm": 0.125472794336859, "learning_rate": 2.0914159038242704e-06, "loss": 0.1423, "step": 7640 }, { "epoch": 2.649922050926728, "grad_norm": 0.12784584316147593, "learning_rate": 2.051254639938477e-06, "loss": 0.1476, "step": 7650 }, { "epoch": 2.653386454183267, "grad_norm": 0.11853028227441036, "learning_rate": 2.0114662287605335e-06, "loss": 0.1486, "step": 7660 }, { "epoch": 2.6568508574398058, "grad_norm": 0.1191674746816265, "learning_rate": 1.97205131674045e-06, "loss": 0.1492, "step": 7670 }, { "epoch": 2.660315260696345, "grad_norm": 0.1250727768460615, "learning_rate": 1.933010544259939e-06, "loss": 0.1454, "step": 7680 }, { "epoch": 2.663779663952884, "grad_norm": 0.12320213371283693, "learning_rate": 1.8943445456219815e-06, "loss": 0.1472, "step": 7690 }, { "epoch": 2.667244067209423, "grad_norm": 0.11891666023211327, "learning_rate": 1.8560539490405399e-06, "loss": 0.1492, "step": 7700 }, { "epoch": 2.6707084704659625, "grad_norm": 0.125346930924718, "learning_rate": 1.8181393766303595e-06, "loss": 0.1462, "step": 7710 }, { "epoch": 2.6741728737225015, "grad_norm": 0.1321032171582539, "learning_rate": 1.7806014443968289e-06, "loss": 0.1436, "step": 7720 }, { "epoch": 2.6776372769790404, "grad_norm": 0.12117133427851705, "learning_rate": 1.7434407622259951e-06, "loss": 0.1466, "step": 7730 }, { "epoch": 2.6811016802355794, "grad_norm": 0.11718318211141446, "learning_rate": 1.7066579338746668e-06, "loss": 0.1456, "step": 7740 }, { "epoch": 2.6845660834921183, "grad_norm": 0.1283761135380607, "learning_rate": 1.670253556960563e-06, "loss": 0.1464, "step": 7750 }, { "epoch": 2.6880304867486577, "grad_norm": 0.12456996594619513, "learning_rate": 1.6342282229526468e-06, "loss": 0.1482, "step": 7760 }, { "epoch": 2.6914948900051967, "grad_norm": 0.11703659504283041, "learning_rate": 1.5985825171614953e-06, "loss": 0.1455, "step": 7770 }, { "epoch": 2.6949592932617357, "grad_norm": 0.11962329640139921, "learning_rate": 1.5633170187297846e-06, "loss": 0.1468, "step": 7780 }, { "epoch": 2.6984236965182746, "grad_norm": 0.12759004303067803, "learning_rate": 1.5284323006229035e-06, "loss": 0.1457, "step": 7790 }, { "epoch": 2.7018880997748136, "grad_norm": 0.12164262479291138, "learning_rate": 1.4939289296196063e-06, "loss": 0.1451, "step": 7800 }, { "epoch": 2.705352503031353, "grad_norm": 0.11841849076099206, "learning_rate": 1.4598074663028483e-06, "loss": 0.1442, "step": 7810 }, { "epoch": 2.708816906287892, "grad_norm": 0.1279272278852782, "learning_rate": 1.4260684650506478e-06, "loss": 0.1466, "step": 7820 }, { "epoch": 2.712281309544431, "grad_norm": 0.11952115983833288, "learning_rate": 1.3927124740270885e-06, "loss": 0.1457, "step": 7830 }, { "epoch": 2.71574571280097, "grad_norm": 0.12162884781857089, "learning_rate": 1.3597400351734151e-06, "loss": 0.1427, "step": 7840 }, { "epoch": 2.719210116057509, "grad_norm": 0.13471370194895713, "learning_rate": 1.327151684199221e-06, "loss": 0.1442, "step": 7850 }, { "epoch": 2.7226745193140482, "grad_norm": 0.12907986475959193, "learning_rate": 1.2949479505737494e-06, "loss": 0.1455, "step": 7860 }, { "epoch": 2.726138922570587, "grad_norm": 0.12566383969973927, "learning_rate": 1.263129357517301e-06, "loss": 0.1463, "step": 7870 }, { "epoch": 2.729603325827126, "grad_norm": 0.1195570593454166, "learning_rate": 1.2316964219927119e-06, "loss": 0.1422, "step": 7880 }, { "epoch": 2.7330677290836656, "grad_norm": 0.12438741593202014, "learning_rate": 1.2006496546969642e-06, "loss": 0.1506, "step": 7890 }, { "epoch": 2.7365321323402045, "grad_norm": 0.13311421492284156, "learning_rate": 1.1699895600529087e-06, "loss": 0.1484, "step": 7900 }, { "epoch": 2.7399965355967435, "grad_norm": 0.13353566400942274, "learning_rate": 1.1397166362010243e-06, "loss": 0.1472, "step": 7910 }, { "epoch": 2.7434609388532825, "grad_norm": 0.12303792368683632, "learning_rate": 1.109831374991377e-06, "loss": 0.1455, "step": 7920 }, { "epoch": 2.7469253421098214, "grad_norm": 0.1164932218834871, "learning_rate": 1.080334261975577e-06, "loss": 0.1456, "step": 7930 }, { "epoch": 2.750389745366361, "grad_norm": 0.12413166278304175, "learning_rate": 1.051225776398937e-06, "loss": 0.1475, "step": 7940 }, { "epoch": 2.7538541486229, "grad_norm": 0.11917463944574379, "learning_rate": 1.0225063911926597e-06, "loss": 0.1473, "step": 7950 }, { "epoch": 2.7573185518794388, "grad_norm": 0.21002619780579418, "learning_rate": 9.94176572966149e-07, "loss": 0.1499, "step": 7960 }, { "epoch": 2.7607829551359777, "grad_norm": 0.12190061305916731, "learning_rate": 9.662367819994467e-07, "loss": 0.1448, "step": 7970 }, { "epoch": 2.7642473583925167, "grad_norm": 0.11828260636790151, "learning_rate": 9.386874722357469e-07, "loss": 0.1492, "step": 7980 }, { "epoch": 2.767711761649056, "grad_norm": 0.11503639232862127, "learning_rate": 9.115290912740132e-07, "loss": 0.1448, "step": 7990 }, { "epoch": 2.771176164905595, "grad_norm": 0.1189533885403222, "learning_rate": 8.847620803617257e-07, "loss": 0.1467, "step": 8000 }, { "epoch": 2.774640568162134, "grad_norm": 0.12701672502597688, "learning_rate": 8.583868743876844e-07, "loss": 0.1462, "step": 8010 }, { "epoch": 2.7781049714186734, "grad_norm": 0.1220861131545684, "learning_rate": 8.324039018749674e-07, "loss": 0.1453, "step": 8020 }, { "epoch": 2.781569374675212, "grad_norm": 0.12031457421341588, "learning_rate": 8.068135849739617e-07, "loss": 0.1437, "step": 8030 }, { "epoch": 2.7850337779317513, "grad_norm": 0.12017875343248997, "learning_rate": 7.816163394554932e-07, "loss": 0.1478, "step": 8040 }, { "epoch": 2.7884981811882903, "grad_norm": 0.1192917350903116, "learning_rate": 7.56812574704091e-07, "loss": 0.1436, "step": 8050 }, { "epoch": 2.7919625844448293, "grad_norm": 0.11524716943825414, "learning_rate": 7.32402693711326e-07, "loss": 0.1451, "step": 8060 }, { "epoch": 2.7954269877013687, "grad_norm": 0.12406089861167721, "learning_rate": 7.083870930692516e-07, "loss": 0.1459, "step": 8070 }, { "epoch": 2.7988913909579076, "grad_norm": 0.12315834818935076, "learning_rate": 6.847661629639873e-07, "loss": 0.1451, "step": 8080 }, { "epoch": 2.8023557942144466, "grad_norm": 0.11732007572900043, "learning_rate": 6.615402871693487e-07, "loss": 0.1473, "step": 8090 }, { "epoch": 2.8058201974709855, "grad_norm": 0.11711430732640442, "learning_rate": 6.387098430406441e-07, "loss": 0.1481, "step": 8100 }, { "epoch": 2.8092846007275245, "grad_norm": 0.11662597413638771, "learning_rate": 6.162752015085122e-07, "loss": 0.1462, "step": 8110 }, { "epoch": 2.812749003984064, "grad_norm": 0.11644716676700212, "learning_rate": 5.942367270729165e-07, "loss": 0.1454, "step": 8120 }, { "epoch": 2.816213407240603, "grad_norm": 0.1210826937584845, "learning_rate": 5.725947777972224e-07, "loss": 0.1482, "step": 8130 }, { "epoch": 2.819677810497142, "grad_norm": 0.11717664884081884, "learning_rate": 5.513497053023647e-07, "loss": 0.1446, "step": 8140 }, { "epoch": 2.823142213753681, "grad_norm": 0.11782433267361038, "learning_rate": 5.305018547611451e-07, "loss": 0.1479, "step": 8150 }, { "epoch": 2.8266066170102198, "grad_norm": 0.12844697710634922, "learning_rate": 5.100515648926329e-07, "loss": 0.1457, "step": 8160 }, { "epoch": 2.830071020266759, "grad_norm": 0.1259382993927401, "learning_rate": 4.899991679566423e-07, "loss": 0.1482, "step": 8170 }, { "epoch": 2.833535423523298, "grad_norm": 0.12112661112999973, "learning_rate": 4.703449897483503e-07, "loss": 0.1468, "step": 8180 }, { "epoch": 2.836999826779837, "grad_norm": 0.11907259047343774, "learning_rate": 4.5108934959299243e-07, "loss": 0.1412, "step": 8190 }, { "epoch": 2.8404642300363765, "grad_norm": 0.11046949957522073, "learning_rate": 4.322325603406813e-07, "loss": 0.1457, "step": 8200 }, { "epoch": 2.8439286332929155, "grad_norm": 0.11728964170089871, "learning_rate": 4.137749283613268e-07, "loss": 0.148, "step": 8210 }, { "epoch": 2.8473930365494544, "grad_norm": 0.12458638671812242, "learning_rate": 3.9571675353964053e-07, "loss": 0.1477, "step": 8220 }, { "epoch": 2.8508574398059934, "grad_norm": 0.12603956939519112, "learning_rate": 3.780583292702894e-07, "loss": 0.1431, "step": 8230 }, { "epoch": 2.8543218430625323, "grad_norm": 0.11737849831673626, "learning_rate": 3.607999424531078e-07, "loss": 0.1459, "step": 8240 }, { "epoch": 2.8577862463190717, "grad_norm": 0.11861750859649005, "learning_rate": 3.4394187348844866e-07, "loss": 0.1484, "step": 8250 }, { "epoch": 2.8612506495756107, "grad_norm": 0.12007382110446226, "learning_rate": 3.274843962726204e-07, "loss": 0.1461, "step": 8260 }, { "epoch": 2.8647150528321497, "grad_norm": 0.1114021812553446, "learning_rate": 3.114277781934433e-07, "loss": 0.1408, "step": 8270 }, { "epoch": 2.8681794560886886, "grad_norm": 0.12459777820858844, "learning_rate": 2.957722801258944e-07, "loss": 0.1469, "step": 8280 }, { "epoch": 2.8716438593452276, "grad_norm": 0.12059709971421617, "learning_rate": 2.805181564278864e-07, "loss": 0.1476, "step": 8290 }, { "epoch": 2.875108262601767, "grad_norm": 0.11316078775210853, "learning_rate": 2.6566565493611475e-07, "loss": 0.1447, "step": 8300 }, { "epoch": 2.878572665858306, "grad_norm": 0.12343927697834793, "learning_rate": 2.512150169620503e-07, "loss": 0.1499, "step": 8310 }, { "epoch": 2.882037069114845, "grad_norm": 0.1264438568086182, "learning_rate": 2.371664772880061e-07, "loss": 0.1475, "step": 8320 }, { "epoch": 2.885501472371384, "grad_norm": 0.11834929752982455, "learning_rate": 2.2352026416331829e-07, "loss": 0.1484, "step": 8330 }, { "epoch": 2.888965875627923, "grad_norm": 0.12074802080378956, "learning_rate": 2.1027659930066e-07, "loss": 0.1464, "step": 8340 }, { "epoch": 2.8924302788844622, "grad_norm": 0.11880030905569529, "learning_rate": 1.97435697872414e-07, "loss": 0.1465, "step": 8350 }, { "epoch": 2.895894682141001, "grad_norm": 0.1286519420227983, "learning_rate": 1.8499776850719463e-07, "loss": 0.1466, "step": 8360 }, { "epoch": 2.89935908539754, "grad_norm": 0.12408525205621782, "learning_rate": 1.7296301328644516e-07, "loss": 0.1467, "step": 8370 }, { "epoch": 2.9028234886540796, "grad_norm": 0.11413939977206788, "learning_rate": 1.613316277411625e-07, "loss": 0.1432, "step": 8380 }, { "epoch": 2.9062878919106185, "grad_norm": 0.12094926364512971, "learning_rate": 1.5010380084871933e-07, "loss": 0.1482, "step": 8390 }, { "epoch": 2.9097522951671575, "grad_norm": 0.1173931691129058, "learning_rate": 1.392797150297942e-07, "loss": 0.1476, "step": 8400 }, { "epoch": 2.9132166984236965, "grad_norm": 0.11719997871862918, "learning_rate": 1.2885954614540175e-07, "loss": 0.1463, "step": 8410 }, { "epoch": 2.9166811016802354, "grad_norm": 0.11982420345633238, "learning_rate": 1.1884346349404774e-07, "loss": 0.1508, "step": 8420 }, { "epoch": 2.920145504936775, "grad_norm": 0.12475530168973786, "learning_rate": 1.0923162980896185e-07, "loss": 0.15, "step": 8430 }, { "epoch": 2.923609908193314, "grad_norm": 0.11892325463016702, "learning_rate": 1.000242012554664e-07, "loss": 0.1476, "step": 8440 }, { "epoch": 2.9270743114498528, "grad_norm": 0.1192790735851098, "learning_rate": 9.122132742843681e-08, "loss": 0.1454, "step": 8450 }, { "epoch": 2.9305387147063917, "grad_norm": 0.1170936103181331, "learning_rate": 8.28231513498673e-08, "loss": 0.1482, "step": 8460 }, { "epoch": 2.9340031179629307, "grad_norm": 0.11575536846669715, "learning_rate": 7.48298094665506e-08, "loss": 0.1482, "step": 8470 }, { "epoch": 2.93746752121947, "grad_norm": 0.10817448466285368, "learning_rate": 6.724143164785757e-08, "loss": 0.1454, "step": 8480 }, { "epoch": 2.940931924476009, "grad_norm": 0.11464554359428555, "learning_rate": 6.005814118363317e-08, "loss": 0.1438, "step": 8490 }, { "epoch": 2.944396327732548, "grad_norm": 0.12385487233495297, "learning_rate": 5.328005478218989e-08, "loss": 0.1463, "step": 8500 }, { "epoch": 2.947860730989087, "grad_norm": 0.11878438059832382, "learning_rate": 4.69072825684036e-08, "loss": 0.1464, "step": 8510 }, { "epoch": 2.951325134245626, "grad_norm": 0.11382553656848247, "learning_rate": 4.093992808194558e-08, "loss": 0.1427, "step": 8520 }, { "epoch": 2.9547895375021653, "grad_norm": 0.1221402366685275, "learning_rate": 3.537808827557276e-08, "loss": 0.1469, "step": 8530 }, { "epoch": 2.9582539407587043, "grad_norm": 0.11836429164824389, "learning_rate": 3.0221853513576207e-08, "loss": 0.1471, "step": 8540 }, { "epoch": 2.9617183440152433, "grad_norm": 0.11592227533291538, "learning_rate": 2.5471307570298918e-08, "loss": 0.1436, "step": 8550 }, { "epoch": 2.9651827472717827, "grad_norm": 0.12005026306688542, "learning_rate": 2.112652762878142e-08, "loss": 0.1466, "step": 8560 }, { "epoch": 2.9686471505283216, "grad_norm": 0.12062875615569613, "learning_rate": 1.71875842795044e-08, "loss": 0.1457, "step": 8570 }, { "epoch": 2.9721115537848606, "grad_norm": 0.12033280556169802, "learning_rate": 1.3654541519242392e-08, "loss": 0.1437, "step": 8580 }, { "epoch": 2.9755759570413995, "grad_norm": 0.12337008384975914, "learning_rate": 1.0527456750025755e-08, "loss": 0.1468, "step": 8590 }, { "epoch": 2.9790403602979385, "grad_norm": 0.12167133579723895, "learning_rate": 7.80638077820528e-09, "loss": 0.1519, "step": 8600 }, { "epoch": 2.982504763554478, "grad_norm": 0.11751314218922883, "learning_rate": 5.491357813627862e-09, "loss": 0.1494, "step": 8610 }, { "epoch": 2.985969166811017, "grad_norm": 0.12293930219223904, "learning_rate": 3.582425468920403e-09, "loss": 0.1483, "step": 8620 }, { "epoch": 2.989433570067556, "grad_norm": 0.12371328514211692, "learning_rate": 2.0796147588791894e-09, "loss": 0.1446, "step": 8630 }, { "epoch": 2.992897973324095, "grad_norm": 0.1283635993974664, "learning_rate": 9.829500999564144e-10, "loss": 0.148, "step": 8640 }, { "epoch": 2.9963623765806338, "grad_norm": 0.11829545701564251, "learning_rate": 2.924493098743764e-10, "loss": 0.1419, "step": 8650 }, { "epoch": 2.999826779837173, "grad_norm": 0.1191095821300158, "learning_rate": 8.123607339594053e-12, "loss": 0.1497, "step": 8660 }, { "epoch": 3.0, "step": 8661, "total_flos": 1.6970304064115966e+19, "train_loss": 0.20868314632318158, "train_runtime": 603947.8558, "train_samples_per_second": 0.459, "train_steps_per_second": 0.014 } ], "logging_steps": 10, "max_steps": 8661, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6970304064115966e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }