{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5775, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005194805194805195, "grad_norm": 1.3003292727167528, "learning_rate": 2.8735632183908046e-06, "loss": 0.5747, "step": 10 }, { "epoch": 0.01038961038961039, "grad_norm": 0.8164205391130951, "learning_rate": 5.747126436781609e-06, "loss": 0.5181, "step": 20 }, { "epoch": 0.015584415584415584, "grad_norm": 0.488946468114722, "learning_rate": 8.620689655172414e-06, "loss": 0.4525, "step": 30 }, { "epoch": 0.02077922077922078, "grad_norm": 0.5240257767228009, "learning_rate": 1.1494252873563218e-05, "loss": 0.4153, "step": 40 }, { "epoch": 0.025974025974025976, "grad_norm": 0.3878833187137561, "learning_rate": 1.4367816091954022e-05, "loss": 0.3917, "step": 50 }, { "epoch": 0.03116883116883117, "grad_norm": 0.3932438541306065, "learning_rate": 1.7241379310344828e-05, "loss": 0.3707, "step": 60 }, { "epoch": 0.03636363636363636, "grad_norm": 0.4466114932768719, "learning_rate": 2.0114942528735632e-05, "loss": 0.3605, "step": 70 }, { "epoch": 0.04155844155844156, "grad_norm": 0.8032738854038463, "learning_rate": 2.2988505747126437e-05, "loss": 0.3591, "step": 80 }, { "epoch": 0.046753246753246755, "grad_norm": 0.543812557370986, "learning_rate": 2.5862068965517244e-05, "loss": 0.3472, "step": 90 }, { "epoch": 0.05194805194805195, "grad_norm": 0.5733743111451876, "learning_rate": 2.8735632183908045e-05, "loss": 0.3341, "step": 100 }, { "epoch": 0.05714285714285714, "grad_norm": 0.9136032757030011, "learning_rate": 3.160919540229885e-05, "loss": 0.34, "step": 110 }, { "epoch": 0.06233766233766234, "grad_norm": 0.8015940010159968, "learning_rate": 3.4482758620689657e-05, "loss": 0.3346, "step": 120 }, { "epoch": 0.06753246753246753, "grad_norm": 0.5317651402718742, "learning_rate": 3.735632183908046e-05, "loss": 0.3205, "step": 130 }, { "epoch": 0.07272727272727272, "grad_norm": 0.9794116224334949, "learning_rate": 4.0229885057471265e-05, "loss": 0.3247, "step": 140 }, { "epoch": 0.07792207792207792, "grad_norm": 0.5926607382301553, "learning_rate": 4.3103448275862066e-05, "loss": 0.3137, "step": 150 }, { "epoch": 0.08311688311688312, "grad_norm": 0.501504811182817, "learning_rate": 4.597701149425287e-05, "loss": 0.3196, "step": 160 }, { "epoch": 0.08831168831168831, "grad_norm": 0.4844466938932139, "learning_rate": 4.885057471264368e-05, "loss": 0.3185, "step": 170 }, { "epoch": 0.09350649350649351, "grad_norm": 0.8792222600261355, "learning_rate": 4.999985842691236e-05, "loss": 0.317, "step": 180 }, { "epoch": 0.0987012987012987, "grad_norm": 0.538260729358137, "learning_rate": 4.999899326385009e-05, "loss": 0.3122, "step": 190 }, { "epoch": 0.1038961038961039, "grad_norm": 0.5532332343125024, "learning_rate": 4.99973416166265e-05, "loss": 0.308, "step": 200 }, { "epoch": 0.10909090909090909, "grad_norm": 0.5065931571721304, "learning_rate": 4.999490353720347e-05, "loss": 0.305, "step": 210 }, { "epoch": 0.11428571428571428, "grad_norm": 0.4661004020519144, "learning_rate": 4.9991679102284494e-05, "loss": 0.3031, "step": 220 }, { "epoch": 0.11948051948051948, "grad_norm": 0.4350447913864305, "learning_rate": 4.998766841331236e-05, "loss": 0.2979, "step": 230 }, { "epoch": 0.12467532467532468, "grad_norm": 0.3782265622233307, "learning_rate": 4.998287159646586e-05, "loss": 0.3035, "step": 240 }, { "epoch": 0.12987012987012986, "grad_norm": 0.3863507226733708, "learning_rate": 4.997728880265592e-05, "loss": 0.3024, "step": 250 }, { "epoch": 0.13506493506493505, "grad_norm": 0.45465564793802865, "learning_rate": 4.9970920207520756e-05, "loss": 0.2984, "step": 260 }, { "epoch": 0.14025974025974025, "grad_norm": 0.331617769411873, "learning_rate": 4.9963766011420394e-05, "loss": 0.2947, "step": 270 }, { "epoch": 0.14545454545454545, "grad_norm": 0.4323281120205803, "learning_rate": 4.9955826439430384e-05, "loss": 0.2885, "step": 280 }, { "epoch": 0.15064935064935064, "grad_norm": 0.3828139513983012, "learning_rate": 4.994710174133469e-05, "loss": 0.2957, "step": 290 }, { "epoch": 0.15584415584415584, "grad_norm": 0.45442672024966796, "learning_rate": 4.9937592191617846e-05, "loss": 0.2929, "step": 300 }, { "epoch": 0.16103896103896104, "grad_norm": 0.4324048943234123, "learning_rate": 4.992729808945629e-05, "loss": 0.287, "step": 310 }, { "epoch": 0.16623376623376623, "grad_norm": 0.3358736775493111, "learning_rate": 4.991621975870901e-05, "loss": 0.2831, "step": 320 }, { "epoch": 0.17142857142857143, "grad_norm": 0.33113790181444713, "learning_rate": 4.990435754790731e-05, "loss": 0.2868, "step": 330 }, { "epoch": 0.17662337662337663, "grad_norm": 0.3207965425558816, "learning_rate": 4.9891711830243845e-05, "loss": 0.2911, "step": 340 }, { "epoch": 0.18181818181818182, "grad_norm": 0.361708114933415, "learning_rate": 4.987828300356091e-05, "loss": 0.2857, "step": 350 }, { "epoch": 0.18701298701298702, "grad_norm": 0.34168626805695096, "learning_rate": 4.9864071490337896e-05, "loss": 0.2849, "step": 360 }, { "epoch": 0.19220779220779222, "grad_norm": 0.2908926746764898, "learning_rate": 4.9849077737678e-05, "loss": 0.2794, "step": 370 }, { "epoch": 0.1974025974025974, "grad_norm": 0.5087944109212423, "learning_rate": 4.983330221729419e-05, "loss": 0.2787, "step": 380 }, { "epoch": 0.2025974025974026, "grad_norm": 0.27804001121817934, "learning_rate": 4.9816745425494326e-05, "loss": 0.2759, "step": 390 }, { "epoch": 0.2077922077922078, "grad_norm": 0.4013635528070106, "learning_rate": 4.979940788316556e-05, "loss": 0.2817, "step": 400 }, { "epoch": 0.21298701298701297, "grad_norm": 0.25511672135848257, "learning_rate": 4.978129013575796e-05, "loss": 0.2785, "step": 410 }, { "epoch": 0.21818181818181817, "grad_norm": 0.298813003536876, "learning_rate": 4.976239275326733e-05, "loss": 0.2803, "step": 420 }, { "epoch": 0.22337662337662337, "grad_norm": 0.25617082862388485, "learning_rate": 4.974271633021729e-05, "loss": 0.2736, "step": 430 }, { "epoch": 0.22857142857142856, "grad_norm": 0.29247425322362797, "learning_rate": 4.9722261485640584e-05, "loss": 0.2767, "step": 440 }, { "epoch": 0.23376623376623376, "grad_norm": 0.33775737625365165, "learning_rate": 4.9701028863059563e-05, "loss": 0.2753, "step": 450 }, { "epoch": 0.23896103896103896, "grad_norm": 0.38447368527387143, "learning_rate": 4.967901913046598e-05, "loss": 0.2805, "step": 460 }, { "epoch": 0.24415584415584415, "grad_norm": 0.36418471782114614, "learning_rate": 4.9656232980299976e-05, "loss": 0.2707, "step": 470 }, { "epoch": 0.24935064935064935, "grad_norm": 0.2826049309528878, "learning_rate": 4.963267112942826e-05, "loss": 0.2775, "step": 480 }, { "epoch": 0.2545454545454545, "grad_norm": 0.3501209948268494, "learning_rate": 4.9608334319121584e-05, "loss": 0.2731, "step": 490 }, { "epoch": 0.2597402597402597, "grad_norm": 0.3608819312798831, "learning_rate": 4.958322331503141e-05, "loss": 0.2707, "step": 500 }, { "epoch": 0.2649350649350649, "grad_norm": 0.299020142675044, "learning_rate": 4.9557338907165833e-05, "loss": 0.2732, "step": 510 }, { "epoch": 0.2701298701298701, "grad_norm": 0.2685575579055106, "learning_rate": 4.9530681909864724e-05, "loss": 0.2689, "step": 520 }, { "epoch": 0.2753246753246753, "grad_norm": 0.23478660501034596, "learning_rate": 4.950325316177409e-05, "loss": 0.2726, "step": 530 }, { "epoch": 0.2805194805194805, "grad_norm": 0.26267550311986976, "learning_rate": 4.947505352581974e-05, "loss": 0.2688, "step": 540 }, { "epoch": 0.2857142857142857, "grad_norm": 0.2804831055299143, "learning_rate": 4.944608388918005e-05, "loss": 0.2724, "step": 550 }, { "epoch": 0.2909090909090909, "grad_norm": 0.36597728037439853, "learning_rate": 4.941634516325816e-05, "loss": 0.2674, "step": 560 }, { "epoch": 0.2961038961038961, "grad_norm": 0.27054673370836463, "learning_rate": 4.9385838283653216e-05, "loss": 0.2649, "step": 570 }, { "epoch": 0.3012987012987013, "grad_norm": 0.3066834081735395, "learning_rate": 4.9354564210130976e-05, "loss": 0.2677, "step": 580 }, { "epoch": 0.3064935064935065, "grad_norm": 0.2950401672059928, "learning_rate": 4.93225239265936e-05, "loss": 0.2622, "step": 590 }, { "epoch": 0.3116883116883117, "grad_norm": 0.31772179112234966, "learning_rate": 4.928971844104868e-05, "loss": 0.2641, "step": 600 }, { "epoch": 0.3168831168831169, "grad_norm": 0.27924556453889027, "learning_rate": 4.9256148785577606e-05, "loss": 0.2647, "step": 610 }, { "epoch": 0.3220779220779221, "grad_norm": 0.28644663252200886, "learning_rate": 4.9221816016302966e-05, "loss": 0.2625, "step": 620 }, { "epoch": 0.32727272727272727, "grad_norm": 0.2606246292323375, "learning_rate": 4.9186721213355455e-05, "loss": 0.2636, "step": 630 }, { "epoch": 0.33246753246753247, "grad_norm": 0.3172496714001626, "learning_rate": 4.915086548083978e-05, "loss": 0.2683, "step": 640 }, { "epoch": 0.33766233766233766, "grad_norm": 0.24927905040341644, "learning_rate": 4.9114249946800003e-05, "loss": 0.2654, "step": 650 }, { "epoch": 0.34285714285714286, "grad_norm": 0.32250447729794757, "learning_rate": 4.907687576318401e-05, "loss": 0.2636, "step": 660 }, { "epoch": 0.34805194805194806, "grad_norm": 0.32565974721616914, "learning_rate": 4.903874410580731e-05, "loss": 0.2601, "step": 670 }, { "epoch": 0.35324675324675325, "grad_norm": 0.23517482221948124, "learning_rate": 4.899985617431597e-05, "loss": 0.2611, "step": 680 }, { "epoch": 0.35844155844155845, "grad_norm": 0.2438523561922534, "learning_rate": 4.896021319214895e-05, "loss": 0.2601, "step": 690 }, { "epoch": 0.36363636363636365, "grad_norm": 0.2668670929832916, "learning_rate": 4.8919816406499584e-05, "loss": 0.2696, "step": 700 }, { "epoch": 0.36883116883116884, "grad_norm": 0.24937135360115686, "learning_rate": 4.887866708827633e-05, "loss": 0.2602, "step": 710 }, { "epoch": 0.37402597402597404, "grad_norm": 0.23999629542679116, "learning_rate": 4.8836766532062804e-05, "loss": 0.2588, "step": 720 }, { "epoch": 0.37922077922077924, "grad_norm": 0.23334349462758497, "learning_rate": 4.879411605607704e-05, "loss": 0.2606, "step": 730 }, { "epoch": 0.38441558441558443, "grad_norm": 0.2215145938323352, "learning_rate": 4.8750717002130024e-05, "loss": 0.2567, "step": 740 }, { "epoch": 0.38961038961038963, "grad_norm": 0.3218548674660387, "learning_rate": 4.870657073558349e-05, "loss": 0.2627, "step": 750 }, { "epoch": 0.3948051948051948, "grad_norm": 0.29152150655446074, "learning_rate": 4.866167864530693e-05, "loss": 0.2561, "step": 760 }, { "epoch": 0.4, "grad_norm": 0.29823420885887736, "learning_rate": 4.8616042143633937e-05, "loss": 0.2594, "step": 770 }, { "epoch": 0.4051948051948052, "grad_norm": 0.256792339442467, "learning_rate": 4.856966266631777e-05, "loss": 0.2565, "step": 780 }, { "epoch": 0.4103896103896104, "grad_norm": 0.25804885977573755, "learning_rate": 4.8522541672486156e-05, "loss": 0.2577, "step": 790 }, { "epoch": 0.4155844155844156, "grad_norm": 0.2903609751193798, "learning_rate": 4.84746806445954e-05, "loss": 0.2534, "step": 800 }, { "epoch": 0.42077922077922075, "grad_norm": 0.26400859462593973, "learning_rate": 4.8426081088383756e-05, "loss": 0.2586, "step": 810 }, { "epoch": 0.42597402597402595, "grad_norm": 0.2960587838599708, "learning_rate": 4.837674453282404e-05, "loss": 0.261, "step": 820 }, { "epoch": 0.43116883116883115, "grad_norm": 0.23331561495605277, "learning_rate": 4.832667253007554e-05, "loss": 0.2536, "step": 830 }, { "epoch": 0.43636363636363634, "grad_norm": 0.24020811230350025, "learning_rate": 4.8275866655435175e-05, "loss": 0.2564, "step": 840 }, { "epoch": 0.44155844155844154, "grad_norm": 0.22023402453548904, "learning_rate": 4.8224328507287946e-05, "loss": 0.2562, "step": 850 }, { "epoch": 0.44675324675324674, "grad_norm": 0.293317498213313, "learning_rate": 4.8172059707056626e-05, "loss": 0.2565, "step": 860 }, { "epoch": 0.45194805194805193, "grad_norm": 0.2692215437341758, "learning_rate": 4.811906189915078e-05, "loss": 0.2506, "step": 870 }, { "epoch": 0.45714285714285713, "grad_norm": 0.23694698773474526, "learning_rate": 4.806533675091501e-05, "loss": 0.2518, "step": 880 }, { "epoch": 0.4623376623376623, "grad_norm": 0.22885916720084376, "learning_rate": 4.80108859525765e-05, "loss": 0.252, "step": 890 }, { "epoch": 0.4675324675324675, "grad_norm": 0.24916135308130166, "learning_rate": 4.795571121719187e-05, "loss": 0.253, "step": 900 }, { "epoch": 0.4727272727272727, "grad_norm": 0.24894984701102493, "learning_rate": 4.7899814280593226e-05, "loss": 0.2529, "step": 910 }, { "epoch": 0.4779220779220779, "grad_norm": 0.2723728137565129, "learning_rate": 4.78431969013336e-05, "loss": 0.2555, "step": 920 }, { "epoch": 0.4831168831168831, "grad_norm": 0.3183305552275493, "learning_rate": 4.778586086063159e-05, "loss": 0.2514, "step": 930 }, { "epoch": 0.4883116883116883, "grad_norm": 0.2414413013327865, "learning_rate": 4.772780796231537e-05, "loss": 0.2484, "step": 940 }, { "epoch": 0.4935064935064935, "grad_norm": 0.22563943539011178, "learning_rate": 4.766904003276589e-05, "loss": 0.2539, "step": 950 }, { "epoch": 0.4987012987012987, "grad_norm": 0.30205872999506944, "learning_rate": 4.760955892085942e-05, "loss": 0.2527, "step": 960 }, { "epoch": 0.5038961038961038, "grad_norm": 0.2860786126383834, "learning_rate": 4.754936649790942e-05, "loss": 0.2516, "step": 970 }, { "epoch": 0.509090909090909, "grad_norm": 0.27572406640999436, "learning_rate": 4.7488464657607635e-05, "loss": 0.2498, "step": 980 }, { "epoch": 0.5142857142857142, "grad_norm": 0.24673507061013106, "learning_rate": 4.7426855315964535e-05, "loss": 0.2531, "step": 990 }, { "epoch": 0.5194805194805194, "grad_norm": 0.21033978842271397, "learning_rate": 4.736454041124904e-05, "loss": 0.2504, "step": 1000 }, { "epoch": 0.5246753246753246, "grad_norm": 0.24381830272002009, "learning_rate": 4.7301521903927505e-05, "loss": 0.2428, "step": 1010 }, { "epoch": 0.5298701298701298, "grad_norm": 0.2334282560856222, "learning_rate": 4.723780177660209e-05, "loss": 0.2501, "step": 1020 }, { "epoch": 0.535064935064935, "grad_norm": 0.2751808654527514, "learning_rate": 4.717338203394836e-05, "loss": 0.2507, "step": 1030 }, { "epoch": 0.5402597402597402, "grad_norm": 0.2697637268340861, "learning_rate": 4.71082647026522e-05, "loss": 0.2503, "step": 1040 }, { "epoch": 0.5454545454545454, "grad_norm": 0.22403492548323756, "learning_rate": 4.7042451831346136e-05, "loss": 0.2495, "step": 1050 }, { "epoch": 0.5506493506493506, "grad_norm": 0.22425594786040917, "learning_rate": 4.697594549054474e-05, "loss": 0.2475, "step": 1060 }, { "epoch": 0.5558441558441558, "grad_norm": 0.24265650046282458, "learning_rate": 4.690874777257964e-05, "loss": 0.2491, "step": 1070 }, { "epoch": 0.561038961038961, "grad_norm": 0.22077426247794457, "learning_rate": 4.684086079153359e-05, "loss": 0.2449, "step": 1080 }, { "epoch": 0.5662337662337662, "grad_norm": 0.30484192484636535, "learning_rate": 4.6772286683174025e-05, "loss": 0.245, "step": 1090 }, { "epoch": 0.5714285714285714, "grad_norm": 0.22182045074526108, "learning_rate": 4.670302760488582e-05, "loss": 0.2477, "step": 1100 }, { "epoch": 0.5766233766233766, "grad_norm": 0.2200234503444333, "learning_rate": 4.663308573560343e-05, "loss": 0.2506, "step": 1110 }, { "epoch": 0.5818181818181818, "grad_norm": 0.21994287683589026, "learning_rate": 4.656246327574238e-05, "loss": 0.2421, "step": 1120 }, { "epoch": 0.587012987012987, "grad_norm": 0.272334351757034, "learning_rate": 4.649116244712998e-05, "loss": 0.2476, "step": 1130 }, { "epoch": 0.5922077922077922, "grad_norm": 0.2236806239622702, "learning_rate": 4.641918549293545e-05, "loss": 0.2454, "step": 1140 }, { "epoch": 0.5974025974025974, "grad_norm": 0.23546182291174625, "learning_rate": 4.634653467759936e-05, "loss": 0.2477, "step": 1150 }, { "epoch": 0.6025974025974026, "grad_norm": 0.2505748637769869, "learning_rate": 4.6273212286762376e-05, "loss": 0.2449, "step": 1160 }, { "epoch": 0.6077922077922078, "grad_norm": 0.18468403137918, "learning_rate": 4.619922062719335e-05, "loss": 0.2432, "step": 1170 }, { "epoch": 0.612987012987013, "grad_norm": 0.23235694779454488, "learning_rate": 4.6124562026716766e-05, "loss": 0.2457, "step": 1180 }, { "epoch": 0.6181818181818182, "grad_norm": 0.2587545121058708, "learning_rate": 4.604923883413946e-05, "loss": 0.2467, "step": 1190 }, { "epoch": 0.6233766233766234, "grad_norm": 0.23524218849591322, "learning_rate": 4.59732534191768e-05, "loss": 0.2425, "step": 1200 }, { "epoch": 0.6285714285714286, "grad_norm": 0.20482111091171828, "learning_rate": 4.589660817237805e-05, "loss": 0.2446, "step": 1210 }, { "epoch": 0.6337662337662338, "grad_norm": 0.23935891812153473, "learning_rate": 4.581930550505122e-05, "loss": 0.2359, "step": 1220 }, { "epoch": 0.638961038961039, "grad_norm": 0.2088944848253035, "learning_rate": 4.5741347849187186e-05, "loss": 0.2435, "step": 1230 }, { "epoch": 0.6441558441558441, "grad_norm": 0.19769218239953817, "learning_rate": 4.566273765738318e-05, "loss": 0.2429, "step": 1240 }, { "epoch": 0.6493506493506493, "grad_norm": 0.21104513975336958, "learning_rate": 4.558347740276562e-05, "loss": 0.2414, "step": 1250 }, { "epoch": 0.6545454545454545, "grad_norm": 0.22299730594468617, "learning_rate": 4.550356957891232e-05, "loss": 0.2405, "step": 1260 }, { "epoch": 0.6597402597402597, "grad_norm": 0.2553717349606562, "learning_rate": 4.5423016699774025e-05, "loss": 0.242, "step": 1270 }, { "epoch": 0.6649350649350649, "grad_norm": 0.184621184174687, "learning_rate": 4.5341821299595334e-05, "loss": 0.2377, "step": 1280 }, { "epoch": 0.6701298701298701, "grad_norm": 0.1931019421949112, "learning_rate": 4.525998593283496e-05, "loss": 0.2401, "step": 1290 }, { "epoch": 0.6753246753246753, "grad_norm": 0.2327800605150306, "learning_rate": 4.517751317408537e-05, "loss": 0.2405, "step": 1300 }, { "epoch": 0.6805194805194805, "grad_norm": 0.21060753158636902, "learning_rate": 4.5094405617991796e-05, "loss": 0.2363, "step": 1310 }, { "epoch": 0.6857142857142857, "grad_norm": 0.18780386461582757, "learning_rate": 4.501066587917058e-05, "loss": 0.2437, "step": 1320 }, { "epoch": 0.6909090909090909, "grad_norm": 0.2671736773226871, "learning_rate": 4.4926296592126946e-05, "loss": 0.2431, "step": 1330 }, { "epoch": 0.6961038961038961, "grad_norm": 0.23810395896846592, "learning_rate": 4.484130041117211e-05, "loss": 0.243, "step": 1340 }, { "epoch": 0.7012987012987013, "grad_norm": 0.18025186403432847, "learning_rate": 4.475568001033974e-05, "loss": 0.2457, "step": 1350 }, { "epoch": 0.7064935064935065, "grad_norm": 0.21115305940327297, "learning_rate": 4.466943808330189e-05, "loss": 0.2415, "step": 1360 }, { "epoch": 0.7116883116883117, "grad_norm": 0.25256979094205834, "learning_rate": 4.45825773432842e-05, "loss": 0.2407, "step": 1370 }, { "epoch": 0.7168831168831169, "grad_norm": 0.22014008453128092, "learning_rate": 4.449510052298056e-05, "loss": 0.2357, "step": 1380 }, { "epoch": 0.7220779220779221, "grad_norm": 0.20062628753000003, "learning_rate": 4.440701037446714e-05, "loss": 0.2396, "step": 1390 }, { "epoch": 0.7272727272727273, "grad_norm": 0.2083485579826855, "learning_rate": 4.431830966911582e-05, "loss": 0.2391, "step": 1400 }, { "epoch": 0.7324675324675325, "grad_norm": 0.18184305857175756, "learning_rate": 4.422900119750695e-05, "loss": 0.2355, "step": 1410 }, { "epoch": 0.7376623376623377, "grad_norm": 0.2048232593631597, "learning_rate": 4.4139087769341625e-05, "loss": 0.2332, "step": 1420 }, { "epoch": 0.7428571428571429, "grad_norm": 0.21132211989073768, "learning_rate": 4.4048572213353234e-05, "loss": 0.2422, "step": 1430 }, { "epoch": 0.7480519480519481, "grad_norm": 0.19792895604344352, "learning_rate": 4.39574573772185e-05, "loss": 0.2334, "step": 1440 }, { "epoch": 0.7532467532467533, "grad_norm": 0.22178836320367148, "learning_rate": 4.3865746127467876e-05, "loss": 0.2423, "step": 1450 }, { "epoch": 0.7584415584415585, "grad_norm": 0.21967650568135474, "learning_rate": 4.3773441349395374e-05, "loss": 0.2357, "step": 1460 }, { "epoch": 0.7636363636363637, "grad_norm": 0.1917556477695145, "learning_rate": 4.368054594696775e-05, "loss": 0.2443, "step": 1470 }, { "epoch": 0.7688311688311689, "grad_norm": 0.20969861600848638, "learning_rate": 4.3587062842733216e-05, "loss": 0.2341, "step": 1480 }, { "epoch": 0.7740259740259741, "grad_norm": 0.19871375239851857, "learning_rate": 4.349299497772945e-05, "loss": 0.2361, "step": 1490 }, { "epoch": 0.7792207792207793, "grad_norm": 0.19082750332598916, "learning_rate": 4.339834531139104e-05, "loss": 0.2316, "step": 1500 }, { "epoch": 0.7844155844155845, "grad_norm": 0.2177029161255871, "learning_rate": 4.330311682145645e-05, "loss": 0.2343, "step": 1510 }, { "epoch": 0.7896103896103897, "grad_norm": 0.20562958726540304, "learning_rate": 4.320731250387429e-05, "loss": 0.2401, "step": 1520 }, { "epoch": 0.7948051948051948, "grad_norm": 0.2070237852219627, "learning_rate": 4.311093537270905e-05, "loss": 0.2374, "step": 1530 }, { "epoch": 0.8, "grad_norm": 0.18967395002327114, "learning_rate": 4.301398846004634e-05, "loss": 0.2363, "step": 1540 }, { "epoch": 0.8051948051948052, "grad_norm": 0.1970271386066234, "learning_rate": 4.291647481589742e-05, "loss": 0.2302, "step": 1550 }, { "epoch": 0.8103896103896104, "grad_norm": 0.18673676151020974, "learning_rate": 4.28183975081033e-05, "loss": 0.2416, "step": 1560 }, { "epoch": 0.8155844155844156, "grad_norm": 0.2183111540642943, "learning_rate": 4.271975962223821e-05, "loss": 0.2342, "step": 1570 }, { "epoch": 0.8207792207792208, "grad_norm": 0.1792298886397136, "learning_rate": 4.2620564261512496e-05, "loss": 0.2388, "step": 1580 }, { "epoch": 0.825974025974026, "grad_norm": 0.21429193275126804, "learning_rate": 4.2520814546675037e-05, "loss": 0.2323, "step": 1590 }, { "epoch": 0.8311688311688312, "grad_norm": 0.1923357673969473, "learning_rate": 4.242051361591505e-05, "loss": 0.2398, "step": 1600 }, { "epoch": 0.8363636363636363, "grad_norm": 0.1825902322292911, "learning_rate": 4.2319664624763325e-05, "loss": 0.2355, "step": 1610 }, { "epoch": 0.8415584415584415, "grad_norm": 0.1708452665847616, "learning_rate": 4.2218270745993016e-05, "loss": 0.2361, "step": 1620 }, { "epoch": 0.8467532467532467, "grad_norm": 0.2003983431936864, "learning_rate": 4.211633516951975e-05, "loss": 0.237, "step": 1630 }, { "epoch": 0.8519480519480519, "grad_norm": 0.1809948763155965, "learning_rate": 4.201386110230134e-05, "loss": 0.2291, "step": 1640 }, { "epoch": 0.8571428571428571, "grad_norm": 0.19621979591943875, "learning_rate": 4.1910851768236825e-05, "loss": 0.2284, "step": 1650 }, { "epoch": 0.8623376623376623, "grad_norm": 0.20784597945629102, "learning_rate": 4.180731040806511e-05, "loss": 0.2359, "step": 1660 }, { "epoch": 0.8675324675324675, "grad_norm": 0.22581034014160772, "learning_rate": 4.170324027926297e-05, "loss": 0.2329, "step": 1670 }, { "epoch": 0.8727272727272727, "grad_norm": 0.1789163119753752, "learning_rate": 4.159864465594255e-05, "loss": 0.2338, "step": 1680 }, { "epoch": 0.8779220779220779, "grad_norm": 0.1949206924337472, "learning_rate": 4.1493526828748416e-05, "loss": 0.2392, "step": 1690 }, { "epoch": 0.8831168831168831, "grad_norm": 0.20147429000086556, "learning_rate": 4.1387890104754004e-05, "loss": 0.233, "step": 1700 }, { "epoch": 0.8883116883116883, "grad_norm": 0.1537005161376695, "learning_rate": 4.128173780735753e-05, "loss": 0.2291, "step": 1710 }, { "epoch": 0.8935064935064935, "grad_norm": 0.17777763693741433, "learning_rate": 4.117507327617751e-05, "loss": 0.2291, "step": 1720 }, { "epoch": 0.8987012987012987, "grad_norm": 0.174198062693491, "learning_rate": 4.1067899866947665e-05, "loss": 0.2294, "step": 1730 }, { "epoch": 0.9038961038961039, "grad_norm": 0.1884364748511166, "learning_rate": 4.096022095141132e-05, "loss": 0.235, "step": 1740 }, { "epoch": 0.9090909090909091, "grad_norm": 0.1912652069094164, "learning_rate": 4.085203991721535e-05, "loss": 0.2318, "step": 1750 }, { "epoch": 0.9142857142857143, "grad_norm": 0.206558739242339, "learning_rate": 4.0743360167803614e-05, "loss": 0.2317, "step": 1760 }, { "epoch": 0.9194805194805195, "grad_norm": 0.18252166114267931, "learning_rate": 4.063418512230987e-05, "loss": 0.2346, "step": 1770 }, { "epoch": 0.9246753246753247, "grad_norm": 0.18463778266166328, "learning_rate": 4.0524518215450166e-05, "loss": 0.2306, "step": 1780 }, { "epoch": 0.9298701298701298, "grad_norm": 0.207923278938462, "learning_rate": 4.041436289741489e-05, "loss": 0.2301, "step": 1790 }, { "epoch": 0.935064935064935, "grad_norm": 0.25335695776490813, "learning_rate": 4.0303722633760085e-05, "loss": 0.2258, "step": 1800 }, { "epoch": 0.9402597402597402, "grad_norm": 0.16805426564943104, "learning_rate": 4.019260090529854e-05, "loss": 0.2284, "step": 1810 }, { "epoch": 0.9454545454545454, "grad_norm": 0.1884837989936669, "learning_rate": 4.008100120799019e-05, "loss": 0.2285, "step": 1820 }, { "epoch": 0.9506493506493506, "grad_norm": 0.19643081968195814, "learning_rate": 3.996892705283222e-05, "loss": 0.2354, "step": 1830 }, { "epoch": 0.9558441558441558, "grad_norm": 0.18949399067442121, "learning_rate": 3.9856381965748506e-05, "loss": 0.234, "step": 1840 }, { "epoch": 0.961038961038961, "grad_norm": 0.18422150470243814, "learning_rate": 3.974336948747879e-05, "loss": 0.228, "step": 1850 }, { "epoch": 0.9662337662337662, "grad_norm": 0.179837212988977, "learning_rate": 3.962989317346722e-05, "loss": 0.2283, "step": 1860 }, { "epoch": 0.9714285714285714, "grad_norm": 0.18190735457463206, "learning_rate": 3.951595659375048e-05, "loss": 0.2337, "step": 1870 }, { "epoch": 0.9766233766233766, "grad_norm": 0.17903757429753223, "learning_rate": 3.9401563332845545e-05, "loss": 0.2225, "step": 1880 }, { "epoch": 0.9818181818181818, "grad_norm": 0.18775008592274955, "learning_rate": 3.928671698963686e-05, "loss": 0.226, "step": 1890 }, { "epoch": 0.987012987012987, "grad_norm": 0.16101556678112905, "learning_rate": 3.917142117726312e-05, "loss": 0.2312, "step": 1900 }, { "epoch": 0.9922077922077922, "grad_norm": 0.18133929144854413, "learning_rate": 3.90556795230036e-05, "loss": 0.2264, "step": 1910 }, { "epoch": 0.9974025974025974, "grad_norm": 0.21238218367672568, "learning_rate": 3.893949566816404e-05, "loss": 0.2273, "step": 1920 }, { "epoch": 1.0025974025974025, "grad_norm": 0.180697571813627, "learning_rate": 3.8822873267962115e-05, "loss": 0.2148, "step": 1930 }, { "epoch": 1.0077922077922077, "grad_norm": 0.18289201426564544, "learning_rate": 3.870581599141239e-05, "loss": 0.1982, "step": 1940 }, { "epoch": 1.0129870129870129, "grad_norm": 0.1916290587115548, "learning_rate": 3.858832752121093e-05, "loss": 0.1972, "step": 1950 }, { "epoch": 1.018181818181818, "grad_norm": 0.16005158183997184, "learning_rate": 3.847041155361941e-05, "loss": 0.1945, "step": 1960 }, { "epoch": 1.0233766233766233, "grad_norm": 0.1933154004002062, "learning_rate": 3.835207179834886e-05, "loss": 0.1957, "step": 1970 }, { "epoch": 1.0285714285714285, "grad_norm": 0.19020919057253263, "learning_rate": 3.823331197844293e-05, "loss": 0.1997, "step": 1980 }, { "epoch": 1.0337662337662337, "grad_norm": 0.19051536230519941, "learning_rate": 3.8114135830160766e-05, "loss": 0.1965, "step": 1990 }, { "epoch": 1.0389610389610389, "grad_norm": 0.2061945657419727, "learning_rate": 3.799454710285949e-05, "loss": 0.199, "step": 2000 }, { "epoch": 1.044155844155844, "grad_norm": 0.1634129561802415, "learning_rate": 3.787454955887619e-05, "loss": 0.1979, "step": 2010 }, { "epoch": 1.0493506493506493, "grad_norm": 0.19104378460061372, "learning_rate": 3.775414697340962e-05, "loss": 0.1976, "step": 2020 }, { "epoch": 1.0545454545454545, "grad_norm": 0.17070052143672057, "learning_rate": 3.763334313440134e-05, "loss": 0.1958, "step": 2030 }, { "epoch": 1.0597402597402596, "grad_norm": 0.18748046307285904, "learning_rate": 3.7512141842416674e-05, "loss": 0.1941, "step": 2040 }, { "epoch": 1.0649350649350648, "grad_norm": 0.1989042435608863, "learning_rate": 3.739054691052501e-05, "loss": 0.1964, "step": 2050 }, { "epoch": 1.07012987012987, "grad_norm": 0.21700073670750156, "learning_rate": 3.726856216417992e-05, "loss": 0.1987, "step": 2060 }, { "epoch": 1.0753246753246752, "grad_norm": 0.1899511809741937, "learning_rate": 3.71461914410988e-05, "loss": 0.1972, "step": 2070 }, { "epoch": 1.0805194805194804, "grad_norm": 0.16164124372624425, "learning_rate": 3.702343859114208e-05, "loss": 0.196, "step": 2080 }, { "epoch": 1.0857142857142856, "grad_norm": 0.1962886097471118, "learning_rate": 3.690030747619218e-05, "loss": 0.1972, "step": 2090 }, { "epoch": 1.0909090909090908, "grad_norm": 0.20624255109168987, "learning_rate": 3.6776801970031956e-05, "loss": 0.1975, "step": 2100 }, { "epoch": 1.096103896103896, "grad_norm": 0.17127975336052367, "learning_rate": 3.665292595822286e-05, "loss": 0.1969, "step": 2110 }, { "epoch": 1.1012987012987012, "grad_norm": 0.22765509302500941, "learning_rate": 3.6528683337982675e-05, "loss": 0.204, "step": 2120 }, { "epoch": 1.1064935064935064, "grad_norm": 0.17313358078957405, "learning_rate": 3.640407801806292e-05, "loss": 0.2018, "step": 2130 }, { "epoch": 1.1116883116883116, "grad_norm": 0.16881351547306925, "learning_rate": 3.62791139186259e-05, "loss": 0.195, "step": 2140 }, { "epoch": 1.1168831168831168, "grad_norm": 0.1639645171706093, "learning_rate": 3.6153794971121305e-05, "loss": 0.1961, "step": 2150 }, { "epoch": 1.122077922077922, "grad_norm": 0.15232520727472273, "learning_rate": 3.602812511816262e-05, "loss": 0.1966, "step": 2160 }, { "epoch": 1.1272727272727272, "grad_norm": 0.1617732944247109, "learning_rate": 3.590210831340297e-05, "loss": 0.1994, "step": 2170 }, { "epoch": 1.1324675324675324, "grad_norm": 0.1614529353275871, "learning_rate": 3.577574852141089e-05, "loss": 0.1964, "step": 2180 }, { "epoch": 1.1376623376623376, "grad_norm": 0.1678102166459782, "learning_rate": 3.564904971754546e-05, "loss": 0.2012, "step": 2190 }, { "epoch": 1.1428571428571428, "grad_norm": 0.20333612359210218, "learning_rate": 3.552201588783127e-05, "loss": 0.1956, "step": 2200 }, { "epoch": 1.148051948051948, "grad_norm": 0.17535467832478063, "learning_rate": 3.5394651028833084e-05, "loss": 0.1967, "step": 2210 }, { "epoch": 1.1532467532467532, "grad_norm": 0.16637781141145597, "learning_rate": 3.526695914753002e-05, "loss": 0.2002, "step": 2220 }, { "epoch": 1.1584415584415584, "grad_norm": 0.17958171262199632, "learning_rate": 3.5138944261189545e-05, "loss": 0.1995, "step": 2230 }, { "epoch": 1.1636363636363636, "grad_norm": 0.16457847074469398, "learning_rate": 3.501061039724106e-05, "loss": 0.1961, "step": 2240 }, { "epoch": 1.1688311688311688, "grad_norm": 0.17382166641018432, "learning_rate": 3.48819615931492e-05, "loss": 0.1966, "step": 2250 }, { "epoch": 1.174025974025974, "grad_norm": 0.1796672158320363, "learning_rate": 3.475300189628685e-05, "loss": 0.2005, "step": 2260 }, { "epoch": 1.1792207792207792, "grad_norm": 0.19277006012807987, "learning_rate": 3.4623735363807706e-05, "loss": 0.1973, "step": 2270 }, { "epoch": 1.1844155844155844, "grad_norm": 0.16718509709095522, "learning_rate": 3.449416606251878e-05, "loss": 0.1966, "step": 2280 }, { "epoch": 1.1896103896103896, "grad_norm": 0.15948163188018513, "learning_rate": 3.436429806875236e-05, "loss": 0.1951, "step": 2290 }, { "epoch": 1.1948051948051948, "grad_norm": 0.14931342836105496, "learning_rate": 3.423413546823776e-05, "loss": 0.1981, "step": 2300 }, { "epoch": 1.2, "grad_norm": 0.17649267762779447, "learning_rate": 3.410368235597285e-05, "loss": 0.1965, "step": 2310 }, { "epoch": 1.2051948051948052, "grad_norm": 0.18378320454721475, "learning_rate": 3.3972942836095146e-05, "loss": 0.1984, "step": 2320 }, { "epoch": 1.2103896103896103, "grad_norm": 0.16895275790268519, "learning_rate": 3.3841921021752764e-05, "loss": 0.1973, "step": 2330 }, { "epoch": 1.2155844155844155, "grad_norm": 0.16837240010707993, "learning_rate": 3.3710621034974966e-05, "loss": 0.1996, "step": 2340 }, { "epoch": 1.2207792207792207, "grad_norm": 0.17505091322912789, "learning_rate": 3.357904700654248e-05, "loss": 0.1983, "step": 2350 }, { "epoch": 1.225974025974026, "grad_norm": 0.16274246547553728, "learning_rate": 3.3447203075857596e-05, "loss": 0.1999, "step": 2360 }, { "epoch": 1.2311688311688311, "grad_norm": 0.17154176441709548, "learning_rate": 3.3315093390813856e-05, "loss": 0.1958, "step": 2370 }, { "epoch": 1.2363636363636363, "grad_norm": 0.1661630930719061, "learning_rate": 3.318272210766564e-05, "loss": 0.1952, "step": 2380 }, { "epoch": 1.2415584415584415, "grad_norm": 0.17507230307092086, "learning_rate": 3.3050093390897355e-05, "loss": 0.1949, "step": 2390 }, { "epoch": 1.2467532467532467, "grad_norm": 0.17495618433072496, "learning_rate": 3.291721141309241e-05, "loss": 0.2, "step": 2400 }, { "epoch": 1.251948051948052, "grad_norm": 0.16517186947249202, "learning_rate": 3.278408035480202e-05, "loss": 0.1958, "step": 2410 }, { "epoch": 1.2571428571428571, "grad_norm": 0.15684230284407272, "learning_rate": 3.2650704404413556e-05, "loss": 0.1953, "step": 2420 }, { "epoch": 1.2623376623376623, "grad_norm": 0.18306937206961088, "learning_rate": 3.251708775801893e-05, "loss": 0.1951, "step": 2430 }, { "epoch": 1.2675324675324675, "grad_norm": 0.15796065260907452, "learning_rate": 3.2383234619282456e-05, "loss": 0.1975, "step": 2440 }, { "epoch": 1.2727272727272727, "grad_norm": 0.1590509287945099, "learning_rate": 3.2249149199308645e-05, "loss": 0.1966, "step": 2450 }, { "epoch": 1.277922077922078, "grad_norm": 0.1578980037591031, "learning_rate": 3.211483571650974e-05, "loss": 0.1942, "step": 2460 }, { "epoch": 1.283116883116883, "grad_norm": 0.15431493580944897, "learning_rate": 3.198029839647297e-05, "loss": 0.1963, "step": 2470 }, { "epoch": 1.2883116883116883, "grad_norm": 0.15203081144923972, "learning_rate": 3.184554147182764e-05, "loss": 0.192, "step": 2480 }, { "epoch": 1.2935064935064935, "grad_norm": 0.1648836808861431, "learning_rate": 3.171056918211195e-05, "loss": 0.1952, "step": 2490 }, { "epoch": 1.2987012987012987, "grad_norm": 0.17970050092893974, "learning_rate": 3.157538577363962e-05, "loss": 0.1953, "step": 2500 }, { "epoch": 1.3038961038961039, "grad_norm": 0.18613322149465572, "learning_rate": 3.1439995499366285e-05, "loss": 0.1932, "step": 2510 }, { "epoch": 1.309090909090909, "grad_norm": 0.17435271630087446, "learning_rate": 3.130440261875575e-05, "loss": 0.1925, "step": 2520 }, { "epoch": 1.3142857142857143, "grad_norm": 0.17016839586778193, "learning_rate": 3.1168611397645894e-05, "loss": 0.194, "step": 2530 }, { "epoch": 1.3194805194805195, "grad_norm": 0.16245806925048073, "learning_rate": 3.103262610811455e-05, "loss": 0.1916, "step": 2540 }, { "epoch": 1.3246753246753247, "grad_norm": 0.14467063077481634, "learning_rate": 3.0896451028345054e-05, "loss": 0.1926, "step": 2550 }, { "epoch": 1.3298701298701299, "grad_norm": 0.1564688986937739, "learning_rate": 3.076009044249165e-05, "loss": 0.1936, "step": 2560 }, { "epoch": 1.335064935064935, "grad_norm": 0.16082121945843722, "learning_rate": 3.0623548640544747e-05, "loss": 0.1967, "step": 2570 }, { "epoch": 1.3402597402597403, "grad_norm": 0.14645012653654418, "learning_rate": 3.0486829918195902e-05, "loss": 0.1948, "step": 2580 }, { "epoch": 1.3454545454545455, "grad_norm": 0.1554338631637541, "learning_rate": 3.0349938576702734e-05, "loss": 0.1954, "step": 2590 }, { "epoch": 1.3506493506493507, "grad_norm": 0.190100650523074, "learning_rate": 3.021287892275352e-05, "loss": 0.1994, "step": 2600 }, { "epoch": 1.3558441558441559, "grad_norm": 0.16042048393322747, "learning_rate": 3.0075655268331792e-05, "loss": 0.195, "step": 2610 }, { "epoch": 1.361038961038961, "grad_norm": 0.15621284876054406, "learning_rate": 2.9938271930580637e-05, "loss": 0.1945, "step": 2620 }, { "epoch": 1.3662337662337662, "grad_norm": 0.1619700504975918, "learning_rate": 2.980073323166686e-05, "loss": 0.1945, "step": 2630 }, { "epoch": 1.3714285714285714, "grad_norm": 0.16844868277250866, "learning_rate": 2.9663043498645055e-05, "loss": 0.1968, "step": 2640 }, { "epoch": 1.3766233766233766, "grad_norm": 0.16162842712288464, "learning_rate": 2.9525207063321407e-05, "loss": 0.1929, "step": 2650 }, { "epoch": 1.3818181818181818, "grad_norm": 0.15290543489779018, "learning_rate": 2.938722826211749e-05, "loss": 0.196, "step": 2660 }, { "epoch": 1.387012987012987, "grad_norm": 0.15019873611737544, "learning_rate": 2.9249111435933774e-05, "loss": 0.1971, "step": 2670 }, { "epoch": 1.3922077922077922, "grad_norm": 0.16303163651181884, "learning_rate": 2.9110860930013086e-05, "loss": 0.1953, "step": 2680 }, { "epoch": 1.3974025974025974, "grad_norm": 0.1664726514638493, "learning_rate": 2.8972481093803904e-05, "loss": 0.1908, "step": 2690 }, { "epoch": 1.4025974025974026, "grad_norm": 0.15130472315263208, "learning_rate": 2.8833976280823518e-05, "loss": 0.1921, "step": 2700 }, { "epoch": 1.4077922077922078, "grad_norm": 0.15072736412248944, "learning_rate": 2.8695350848521075e-05, "loss": 0.1978, "step": 2710 }, { "epoch": 1.412987012987013, "grad_norm": 0.15699378092430566, "learning_rate": 2.8556609158140463e-05, "loss": 0.1964, "step": 2720 }, { "epoch": 1.4181818181818182, "grad_norm": 0.16242931571448047, "learning_rate": 2.8417755574583137e-05, "loss": 0.1923, "step": 2730 }, { "epoch": 1.4233766233766234, "grad_norm": 0.17561234437581505, "learning_rate": 2.827879446627079e-05, "loss": 0.1968, "step": 2740 }, { "epoch": 1.4285714285714286, "grad_norm": 0.1960744043552905, "learning_rate": 2.8139730205007885e-05, "loss": 0.1957, "step": 2750 }, { "epoch": 1.4337662337662338, "grad_norm": 0.17235052078230464, "learning_rate": 2.8000567165844166e-05, "loss": 0.1915, "step": 2760 }, { "epoch": 1.438961038961039, "grad_norm": 0.15593868664885746, "learning_rate": 2.786130972693699e-05, "loss": 0.1948, "step": 2770 }, { "epoch": 1.4441558441558442, "grad_norm": 0.16960231230641076, "learning_rate": 2.7721962269413577e-05, "loss": 0.1939, "step": 2780 }, { "epoch": 1.4493506493506494, "grad_norm": 0.1696728165311113, "learning_rate": 2.7582529177233203e-05, "loss": 0.1941, "step": 2790 }, { "epoch": 1.4545454545454546, "grad_norm": 0.16089525009448685, "learning_rate": 2.7443014837049247e-05, "loss": 0.1949, "step": 2800 }, { "epoch": 1.4597402597402598, "grad_norm": 0.14342624961264755, "learning_rate": 2.7303423638071223e-05, "loss": 0.1907, "step": 2810 }, { "epoch": 1.464935064935065, "grad_norm": 0.16377445863374662, "learning_rate": 2.7163759971926668e-05, "loss": 0.1929, "step": 2820 }, { "epoch": 1.4701298701298702, "grad_norm": 0.14893765229334818, "learning_rate": 2.7024028232522962e-05, "loss": 0.1892, "step": 2830 }, { "epoch": 1.4753246753246754, "grad_norm": 0.15599419394354383, "learning_rate": 2.688423281590913e-05, "loss": 0.1957, "step": 2840 }, { "epoch": 1.4805194805194806, "grad_norm": 0.15330477393788722, "learning_rate": 2.6744378120137526e-05, "loss": 0.1895, "step": 2850 }, { "epoch": 1.4857142857142858, "grad_norm": 0.17621145634836316, "learning_rate": 2.660446854512545e-05, "loss": 0.1945, "step": 2860 }, { "epoch": 1.490909090909091, "grad_norm": 0.15862268063341767, "learning_rate": 2.6464508492516742e-05, "loss": 0.1942, "step": 2870 }, { "epoch": 1.4961038961038962, "grad_norm": 0.17315942606336032, "learning_rate": 2.6324502365543313e-05, "loss": 0.1905, "step": 2880 }, { "epoch": 1.5012987012987011, "grad_norm": 0.16945429053310193, "learning_rate": 2.618445456888658e-05, "loss": 0.1915, "step": 2890 }, { "epoch": 1.5064935064935066, "grad_norm": 0.16427452855902688, "learning_rate": 2.604436950853893e-05, "loss": 0.191, "step": 2900 }, { "epoch": 1.5116883116883115, "grad_norm": 0.14123750251448794, "learning_rate": 2.5904251591665078e-05, "loss": 0.1902, "step": 2910 }, { "epoch": 1.516883116883117, "grad_norm": 0.15868241668499022, "learning_rate": 2.5764105226463447e-05, "loss": 0.1916, "step": 2920 }, { "epoch": 1.522077922077922, "grad_norm": 0.17175562912305398, "learning_rate": 2.562393482202744e-05, "loss": 0.1918, "step": 2930 }, { "epoch": 1.5272727272727273, "grad_norm": 0.1506170694591851, "learning_rate": 2.5483744788206755e-05, "loss": 0.1924, "step": 2940 }, { "epoch": 1.5324675324675323, "grad_norm": 0.1573731670528784, "learning_rate": 2.5343539535468665e-05, "loss": 0.1933, "step": 2950 }, { "epoch": 1.5376623376623377, "grad_norm": 0.15102349530813702, "learning_rate": 2.5203323474759188e-05, "loss": 0.1878, "step": 2960 }, { "epoch": 1.5428571428571427, "grad_norm": 0.15486309972010434, "learning_rate": 2.5063101017364433e-05, "loss": 0.1924, "step": 2970 }, { "epoch": 1.5480519480519481, "grad_norm": 0.16776236741984477, "learning_rate": 2.4922876574771705e-05, "loss": 0.1923, "step": 2980 }, { "epoch": 1.553246753246753, "grad_norm": 0.14493538153609314, "learning_rate": 2.4782654558530767e-05, "loss": 0.1895, "step": 2990 }, { "epoch": 1.5584415584415585, "grad_norm": 0.1521099389385944, "learning_rate": 2.464243938011509e-05, "loss": 0.1953, "step": 3000 }, { "epoch": 1.5636363636363635, "grad_norm": 0.1520228941252825, "learning_rate": 2.4502235450782976e-05, "loss": 0.1918, "step": 3010 }, { "epoch": 1.568831168831169, "grad_norm": 0.14357994502338198, "learning_rate": 2.4362047181438835e-05, "loss": 0.1893, "step": 3020 }, { "epoch": 1.5740259740259739, "grad_norm": 0.1419085126020348, "learning_rate": 2.4221878982494423e-05, "loss": 0.1954, "step": 3030 }, { "epoch": 1.5792207792207793, "grad_norm": 0.149414700589382, "learning_rate": 2.4081735263730047e-05, "loss": 0.1918, "step": 3040 }, { "epoch": 1.5844155844155843, "grad_norm": 0.14522347482433445, "learning_rate": 2.3941620434155854e-05, "loss": 0.1915, "step": 3050 }, { "epoch": 1.5896103896103897, "grad_norm": 0.13817121165231117, "learning_rate": 2.380153890187314e-05, "loss": 0.1931, "step": 3060 }, { "epoch": 1.5948051948051947, "grad_norm": 0.1512561465765382, "learning_rate": 2.366149507393563e-05, "loss": 0.1882, "step": 3070 }, { "epoch": 1.6, "grad_norm": 0.15918442476721073, "learning_rate": 2.352149335621084e-05, "loss": 0.188, "step": 3080 }, { "epoch": 1.605194805194805, "grad_norm": 0.15634396731194353, "learning_rate": 2.3381538153241474e-05, "loss": 0.1867, "step": 3090 }, { "epoch": 1.6103896103896105, "grad_norm": 0.1586607565553087, "learning_rate": 2.3241633868106878e-05, "loss": 0.1939, "step": 3100 }, { "epoch": 1.6155844155844155, "grad_norm": 0.1560480985454943, "learning_rate": 2.310178490228446e-05, "loss": 0.1913, "step": 3110 }, { "epoch": 1.6207792207792209, "grad_norm": 0.1451574194093105, "learning_rate": 2.296199565551125e-05, "loss": 0.1901, "step": 3120 }, { "epoch": 1.6259740259740258, "grad_norm": 0.16572964821298886, "learning_rate": 2.2822270525645507e-05, "loss": 0.1921, "step": 3130 }, { "epoch": 1.6311688311688313, "grad_norm": 0.1526008677355272, "learning_rate": 2.268261390852831e-05, "loss": 0.1911, "step": 3140 }, { "epoch": 1.6363636363636362, "grad_norm": 0.15191599502910447, "learning_rate": 2.254303019784526e-05, "loss": 0.1918, "step": 3150 }, { "epoch": 1.6415584415584417, "grad_norm": 0.1463054127627514, "learning_rate": 2.240352378498834e-05, "loss": 0.1909, "step": 3160 }, { "epoch": 1.6467532467532466, "grad_norm": 0.14379825200348653, "learning_rate": 2.226409905891763e-05, "loss": 0.1889, "step": 3170 }, { "epoch": 1.651948051948052, "grad_norm": 0.20939441149322338, "learning_rate": 2.2124760406023315e-05, "loss": 0.19, "step": 3180 }, { "epoch": 1.657142857142857, "grad_norm": 0.14501247701314718, "learning_rate": 2.198551220998768e-05, "loss": 0.1873, "step": 3190 }, { "epoch": 1.6623376623376624, "grad_norm": 0.14787111177950868, "learning_rate": 2.1846358851647162e-05, "loss": 0.1892, "step": 3200 }, { "epoch": 1.6675324675324674, "grad_norm": 0.14627855688686608, "learning_rate": 2.1707304708854547e-05, "loss": 0.1957, "step": 3210 }, { "epoch": 1.6727272727272728, "grad_norm": 0.14191753318818107, "learning_rate": 2.156835415634123e-05, "loss": 0.1862, "step": 3220 }, { "epoch": 1.6779220779220778, "grad_norm": 0.14766613612356988, "learning_rate": 2.1429511565579612e-05, "loss": 0.188, "step": 3230 }, { "epoch": 1.6831168831168832, "grad_norm": 0.13830710205530652, "learning_rate": 2.129078130464553e-05, "loss": 0.1913, "step": 3240 }, { "epoch": 1.6883116883116882, "grad_norm": 0.14720437692164431, "learning_rate": 2.1152167738080848e-05, "loss": 0.1898, "step": 3250 }, { "epoch": 1.6935064935064936, "grad_norm": 0.14698633131391733, "learning_rate": 2.1013675226756178e-05, "loss": 0.1924, "step": 3260 }, { "epoch": 1.6987012987012986, "grad_norm": 0.1419475844570598, "learning_rate": 2.0875308127733634e-05, "loss": 0.1894, "step": 3270 }, { "epoch": 1.703896103896104, "grad_norm": 0.14518860678193932, "learning_rate": 2.0737070794129776e-05, "loss": 0.1897, "step": 3280 }, { "epoch": 1.709090909090909, "grad_norm": 0.13929880201308206, "learning_rate": 2.059896757497869e-05, "loss": 0.1894, "step": 3290 }, { "epoch": 1.7142857142857144, "grad_norm": 0.1560629988591411, "learning_rate": 2.046100281509511e-05, "loss": 0.1883, "step": 3300 }, { "epoch": 1.7194805194805194, "grad_norm": 0.14187945981501732, "learning_rate": 2.0323180854937775e-05, "loss": 0.1893, "step": 3310 }, { "epoch": 1.7246753246753248, "grad_norm": 0.13855333314294954, "learning_rate": 2.018550603047281e-05, "loss": 0.1851, "step": 3320 }, { "epoch": 1.7298701298701298, "grad_norm": 0.15293602282420193, "learning_rate": 2.004798267303743e-05, "loss": 0.1919, "step": 3330 }, { "epoch": 1.7350649350649352, "grad_norm": 0.1510311020666268, "learning_rate": 1.9910615109203533e-05, "loss": 0.188, "step": 3340 }, { "epoch": 1.7402597402597402, "grad_norm": 0.15491321483535742, "learning_rate": 1.977340766064169e-05, "loss": 0.1892, "step": 3350 }, { "epoch": 1.7454545454545456, "grad_norm": 0.14485954144018753, "learning_rate": 1.9636364643985132e-05, "loss": 0.1906, "step": 3360 }, { "epoch": 1.7506493506493506, "grad_norm": 0.1594394775258021, "learning_rate": 1.949949037069396e-05, "loss": 0.193, "step": 3370 }, { "epoch": 1.755844155844156, "grad_norm": 0.14909440982518038, "learning_rate": 1.9362789146919498e-05, "loss": 0.1891, "step": 3380 }, { "epoch": 1.761038961038961, "grad_norm": 0.15409122998170965, "learning_rate": 1.922626527336884e-05, "loss": 0.1887, "step": 3390 }, { "epoch": 1.7662337662337664, "grad_norm": 0.14506982813181468, "learning_rate": 1.90899230451695e-05, "loss": 0.1892, "step": 3400 }, { "epoch": 1.7714285714285714, "grad_norm": 0.1429677030388747, "learning_rate": 1.8953766751734343e-05, "loss": 0.1863, "step": 3410 }, { "epoch": 1.7766233766233768, "grad_norm": 0.14323572969542486, "learning_rate": 1.8817800676626572e-05, "loss": 0.1874, "step": 3420 }, { "epoch": 1.7818181818181817, "grad_norm": 0.13341435839039215, "learning_rate": 1.8682029097425063e-05, "loss": 0.1915, "step": 3430 }, { "epoch": 1.7870129870129872, "grad_norm": 0.13841332912364393, "learning_rate": 1.8546456285589663e-05, "loss": 0.1871, "step": 3440 }, { "epoch": 1.7922077922077921, "grad_norm": 0.14609137476892148, "learning_rate": 1.8411086506326893e-05, "loss": 0.1878, "step": 3450 }, { "epoch": 1.7974025974025976, "grad_norm": 0.14038564796108288, "learning_rate": 1.8275924018455765e-05, "loss": 0.1884, "step": 3460 }, { "epoch": 1.8025974025974025, "grad_norm": 0.14343484338333484, "learning_rate": 1.814097307427374e-05, "loss": 0.1915, "step": 3470 }, { "epoch": 1.807792207792208, "grad_norm": 0.1396561043467835, "learning_rate": 1.8006237919423004e-05, "loss": 0.1916, "step": 3480 }, { "epoch": 1.812987012987013, "grad_norm": 0.142463801296892, "learning_rate": 1.7871722792756852e-05, "loss": 0.1876, "step": 3490 }, { "epoch": 1.8181818181818183, "grad_norm": 0.1336858975526505, "learning_rate": 1.7737431926206383e-05, "loss": 0.1851, "step": 3500 }, { "epoch": 1.8233766233766233, "grad_norm": 0.1426293194672696, "learning_rate": 1.760336954464729e-05, "loss": 0.1865, "step": 3510 }, { "epoch": 1.8285714285714287, "grad_norm": 0.15015722852688704, "learning_rate": 1.7469539865767015e-05, "loss": 0.1881, "step": 3520 }, { "epoch": 1.8337662337662337, "grad_norm": 0.13366343514009785, "learning_rate": 1.7335947099932022e-05, "loss": 0.1852, "step": 3530 }, { "epoch": 1.838961038961039, "grad_norm": 0.14856809610613397, "learning_rate": 1.720259545005533e-05, "loss": 0.1843, "step": 3540 }, { "epoch": 1.844155844155844, "grad_norm": 0.1448912483018553, "learning_rate": 1.7069489111464304e-05, "loss": 0.1864, "step": 3550 }, { "epoch": 1.8493506493506493, "grad_norm": 0.1359780492493188, "learning_rate": 1.693663227176867e-05, "loss": 0.1877, "step": 3560 }, { "epoch": 1.8545454545454545, "grad_norm": 0.13772337189235462, "learning_rate": 1.680402911072874e-05, "loss": 0.1873, "step": 3570 }, { "epoch": 1.8597402597402597, "grad_norm": 0.14393209031617119, "learning_rate": 1.6671683800123932e-05, "loss": 0.1839, "step": 3580 }, { "epoch": 1.864935064935065, "grad_norm": 0.13956682462779404, "learning_rate": 1.6539600503621572e-05, "loss": 0.1874, "step": 3590 }, { "epoch": 1.87012987012987, "grad_norm": 0.1391253168614093, "learning_rate": 1.6407783376645803e-05, "loss": 0.1845, "step": 3600 }, { "epoch": 1.8753246753246753, "grad_norm": 0.13132359363192808, "learning_rate": 1.6276236566246916e-05, "loss": 0.1873, "step": 3610 }, { "epoch": 1.8805194805194805, "grad_norm": 0.14070930287195552, "learning_rate": 1.614496421097091e-05, "loss": 0.1884, "step": 3620 }, { "epoch": 1.8857142857142857, "grad_norm": 0.14427194349525638, "learning_rate": 1.6013970440729204e-05, "loss": 0.1889, "step": 3630 }, { "epoch": 1.8909090909090909, "grad_norm": 0.13521516110226986, "learning_rate": 1.588325937666878e-05, "loss": 0.1839, "step": 3640 }, { "epoch": 1.896103896103896, "grad_norm": 0.13376048212412542, "learning_rate": 1.5752835131042494e-05, "loss": 0.1839, "step": 3650 }, { "epoch": 1.9012987012987013, "grad_norm": 0.14138808345532902, "learning_rate": 1.5622701807079733e-05, "loss": 0.1871, "step": 3660 }, { "epoch": 1.9064935064935065, "grad_norm": 0.13070839091776826, "learning_rate": 1.5492863498857287e-05, "loss": 0.1823, "step": 3670 }, { "epoch": 1.9116883116883117, "grad_norm": 0.13600301900618547, "learning_rate": 1.5363324291170545e-05, "loss": 0.1868, "step": 3680 }, { "epoch": 1.9168831168831169, "grad_norm": 0.1400192321902119, "learning_rate": 1.5234088259405056e-05, "loss": 0.1867, "step": 3690 }, { "epoch": 1.922077922077922, "grad_norm": 0.13023052808570348, "learning_rate": 1.5105159469408209e-05, "loss": 0.1821, "step": 3700 }, { "epoch": 1.9272727272727272, "grad_norm": 0.14335411330199735, "learning_rate": 1.4976541977361402e-05, "loss": 0.189, "step": 3710 }, { "epoch": 1.9324675324675324, "grad_norm": 0.1336525441650799, "learning_rate": 1.48482398296524e-05, "loss": 0.1834, "step": 3720 }, { "epoch": 1.9376623376623376, "grad_norm": 0.14165962885271705, "learning_rate": 1.4720257062748022e-05, "loss": 0.1866, "step": 3730 }, { "epoch": 1.9428571428571428, "grad_norm": 0.13695354269783466, "learning_rate": 1.4592597703067187e-05, "loss": 0.1832, "step": 3740 }, { "epoch": 1.948051948051948, "grad_norm": 0.13868718580766137, "learning_rate": 1.446526576685418e-05, "loss": 0.183, "step": 3750 }, { "epoch": 1.9532467532467532, "grad_norm": 0.13843203988888092, "learning_rate": 1.4338265260052387e-05, "loss": 0.1842, "step": 3760 }, { "epoch": 1.9584415584415584, "grad_norm": 0.13271796860429588, "learning_rate": 1.4211600178178174e-05, "loss": 0.1832, "step": 3770 }, { "epoch": 1.9636363636363636, "grad_norm": 0.14628648866036364, "learning_rate": 1.4085274506195245e-05, "loss": 0.186, "step": 3780 }, { "epoch": 1.9688311688311688, "grad_norm": 0.14975084928952903, "learning_rate": 1.3959292218389248e-05, "loss": 0.1842, "step": 3790 }, { "epoch": 1.974025974025974, "grad_norm": 0.1304201723081621, "learning_rate": 1.383365727824275e-05, "loss": 0.1835, "step": 3800 }, { "epoch": 1.9792207792207792, "grad_norm": 0.15224990383617293, "learning_rate": 1.3708373638310531e-05, "loss": 0.1858, "step": 3810 }, { "epoch": 1.9844155844155844, "grad_norm": 0.13275223694042823, "learning_rate": 1.358344524009528e-05, "loss": 0.1832, "step": 3820 }, { "epoch": 1.9896103896103896, "grad_norm": 0.14813458539626156, "learning_rate": 1.3458876013923499e-05, "loss": 0.1866, "step": 3830 }, { "epoch": 1.9948051948051948, "grad_norm": 0.1330421923733361, "learning_rate": 1.3334669878821948e-05, "loss": 0.1827, "step": 3840 }, { "epoch": 2.0, "grad_norm": 0.14316381473328288, "learning_rate": 1.3210830742394298e-05, "loss": 0.1799, "step": 3850 }, { "epoch": 2.005194805194805, "grad_norm": 0.1512401129922707, "learning_rate": 1.3087362500698237e-05, "loss": 0.154, "step": 3860 }, { "epoch": 2.0103896103896104, "grad_norm": 0.14562927746343074, "learning_rate": 1.2964269038122836e-05, "loss": 0.1505, "step": 3870 }, { "epoch": 2.0155844155844154, "grad_norm": 0.13594065019007026, "learning_rate": 1.2841554227266373e-05, "loss": 0.151, "step": 3880 }, { "epoch": 2.020779220779221, "grad_norm": 0.14424278321251643, "learning_rate": 1.2719221928814545e-05, "loss": 0.1489, "step": 3890 }, { "epoch": 2.0259740259740258, "grad_norm": 0.14407964341771956, "learning_rate": 1.2597275991418928e-05, "loss": 0.1534, "step": 3900 }, { "epoch": 2.031168831168831, "grad_norm": 0.13922191062743117, "learning_rate": 1.247572025157595e-05, "loss": 0.1498, "step": 3910 }, { "epoch": 2.036363636363636, "grad_norm": 0.1315526163391668, "learning_rate": 1.2354558533506176e-05, "loss": 0.1486, "step": 3920 }, { "epoch": 2.0415584415584416, "grad_norm": 0.14073786153063886, "learning_rate": 1.2233794649033991e-05, "loss": 0.1561, "step": 3930 }, { "epoch": 2.0467532467532465, "grad_norm": 0.1428562906764405, "learning_rate": 1.211343239746768e-05, "loss": 0.1509, "step": 3940 }, { "epoch": 2.051948051948052, "grad_norm": 0.14280097216113558, "learning_rate": 1.199347556547993e-05, "loss": 0.1517, "step": 3950 }, { "epoch": 2.057142857142857, "grad_norm": 0.13105429870703852, "learning_rate": 1.187392792698864e-05, "loss": 0.1469, "step": 3960 }, { "epoch": 2.0623376623376624, "grad_norm": 0.14347982471711343, "learning_rate": 1.1754793243038239e-05, "loss": 0.1507, "step": 3970 }, { "epoch": 2.0675324675324673, "grad_norm": 0.14904614300033936, "learning_rate": 1.1636075261681315e-05, "loss": 0.1549, "step": 3980 }, { "epoch": 2.0727272727272728, "grad_norm": 0.13730655354719687, "learning_rate": 1.1517777717860776e-05, "loss": 0.1503, "step": 3990 }, { "epoch": 2.0779220779220777, "grad_norm": 0.14862501974595033, "learning_rate": 1.1399904333292269e-05, "loss": 0.1509, "step": 4000 }, { "epoch": 2.083116883116883, "grad_norm": 0.13183718490837706, "learning_rate": 1.1282458816347128e-05, "loss": 0.1509, "step": 4010 }, { "epoch": 2.088311688311688, "grad_norm": 0.13591710857386252, "learning_rate": 1.1165444861935701e-05, "loss": 0.1514, "step": 4020 }, { "epoch": 2.0935064935064935, "grad_norm": 0.13367182049390872, "learning_rate": 1.1048866151391102e-05, "loss": 0.1479, "step": 4030 }, { "epoch": 2.0987012987012985, "grad_norm": 0.13675222806448303, "learning_rate": 1.0932726352353393e-05, "loss": 0.1499, "step": 4040 }, { "epoch": 2.103896103896104, "grad_norm": 0.13838118900424312, "learning_rate": 1.081702911865423e-05, "loss": 0.1487, "step": 4050 }, { "epoch": 2.109090909090909, "grad_norm": 0.13434786073921717, "learning_rate": 1.0701778090201858e-05, "loss": 0.1533, "step": 4060 }, { "epoch": 2.1142857142857143, "grad_norm": 0.13473555962628936, "learning_rate": 1.0586976892866615e-05, "loss": 0.1505, "step": 4070 }, { "epoch": 2.1194805194805193, "grad_norm": 0.14196721586166652, "learning_rate": 1.0472629138366874e-05, "loss": 0.1468, "step": 4080 }, { "epoch": 2.1246753246753247, "grad_norm": 0.13373540922822927, "learning_rate": 1.0358738424155435e-05, "loss": 0.1514, "step": 4090 }, { "epoch": 2.1298701298701297, "grad_norm": 0.13735599472516719, "learning_rate": 1.024530833330629e-05, "loss": 0.1518, "step": 4100 }, { "epoch": 2.135064935064935, "grad_norm": 0.14328294051153082, "learning_rate": 1.0132342434401937e-05, "loss": 0.1507, "step": 4110 }, { "epoch": 2.14025974025974, "grad_norm": 0.1364465855704536, "learning_rate": 1.0019844281421107e-05, "loss": 0.1523, "step": 4120 }, { "epoch": 2.1454545454545455, "grad_norm": 0.1350058874546942, "learning_rate": 9.90781741362694e-06, "loss": 0.149, "step": 4130 }, { "epoch": 2.1506493506493505, "grad_norm": 0.12932693371267076, "learning_rate": 9.796265355455647e-06, "loss": 0.1535, "step": 4140 }, { "epoch": 2.155844155844156, "grad_norm": 0.13891031796141848, "learning_rate": 9.685191616405643e-06, "loss": 0.1511, "step": 4150 }, { "epoch": 2.161038961038961, "grad_norm": 0.12826558374453936, "learning_rate": 9.574599690927105e-06, "loss": 0.1497, "step": 4160 }, { "epoch": 2.1662337662337663, "grad_norm": 0.1356328538309424, "learning_rate": 9.46449305831204e-06, "loss": 0.1518, "step": 4170 }, { "epoch": 2.1714285714285713, "grad_norm": 0.1297425436242765, "learning_rate": 9.354875182584846e-06, "loss": 0.1534, "step": 4180 }, { "epoch": 2.1766233766233767, "grad_norm": 0.13530241943073487, "learning_rate": 9.245749512393334e-06, "loss": 0.1503, "step": 4190 }, { "epoch": 2.1818181818181817, "grad_norm": 0.1408560476379785, "learning_rate": 9.1371194809002e-06, "loss": 0.1492, "step": 4200 }, { "epoch": 2.187012987012987, "grad_norm": 0.13372249178503884, "learning_rate": 9.028988505675034e-06, "loss": 0.1507, "step": 4210 }, { "epoch": 2.192207792207792, "grad_norm": 0.13921966813026387, "learning_rate": 8.9213599885868e-06, "loss": 0.1531, "step": 4220 }, { "epoch": 2.1974025974025975, "grad_norm": 0.13262776415903957, "learning_rate": 8.814237315696817e-06, "loss": 0.1504, "step": 4230 }, { "epoch": 2.2025974025974024, "grad_norm": 0.1301186665851314, "learning_rate": 8.707623857152208e-06, "loss": 0.1492, "step": 4240 }, { "epoch": 2.207792207792208, "grad_norm": 0.13579689779095455, "learning_rate": 8.60152296707993e-06, "loss": 0.1521, "step": 4250 }, { "epoch": 2.212987012987013, "grad_norm": 0.14637447855770952, "learning_rate": 8.495937983481158e-06, "loss": 0.1487, "step": 4260 }, { "epoch": 2.2181818181818183, "grad_norm": 0.1289270701627187, "learning_rate": 8.390872228126362e-06, "loss": 0.1532, "step": 4270 }, { "epoch": 2.2233766233766232, "grad_norm": 0.13704790669730998, "learning_rate": 8.286329006450735e-06, "loss": 0.154, "step": 4280 }, { "epoch": 2.2285714285714286, "grad_norm": 0.1342079590854756, "learning_rate": 8.182311607450264e-06, "loss": 0.1519, "step": 4290 }, { "epoch": 2.2337662337662336, "grad_norm": 0.1289098530992212, "learning_rate": 8.078823303578198e-06, "loss": 0.1491, "step": 4300 }, { "epoch": 2.238961038961039, "grad_norm": 0.1332579940433215, "learning_rate": 7.9758673506421e-06, "loss": 0.1519, "step": 4310 }, { "epoch": 2.244155844155844, "grad_norm": 0.12978196786783464, "learning_rate": 7.87344698770148e-06, "loss": 0.1494, "step": 4320 }, { "epoch": 2.2493506493506494, "grad_norm": 0.1331325286342844, "learning_rate": 7.77156543696582e-06, "loss": 0.1496, "step": 4330 }, { "epoch": 2.2545454545454544, "grad_norm": 0.13175031314804772, "learning_rate": 7.670225903693229e-06, "loss": 0.1494, "step": 4340 }, { "epoch": 2.25974025974026, "grad_norm": 0.13252344104105662, "learning_rate": 7.5694315760896086e-06, "loss": 0.1501, "step": 4350 }, { "epoch": 2.264935064935065, "grad_norm": 0.1292549372813496, "learning_rate": 7.469185625208347e-06, "loss": 0.1493, "step": 4360 }, { "epoch": 2.27012987012987, "grad_norm": 0.14892873039516652, "learning_rate": 7.369491204850537e-06, "loss": 0.151, "step": 4370 }, { "epoch": 2.275324675324675, "grad_norm": 0.1334849414038533, "learning_rate": 7.270351451465806e-06, "loss": 0.1516, "step": 4380 }, { "epoch": 2.2805194805194806, "grad_norm": 0.13092908930580768, "learning_rate": 7.171769484053575e-06, "loss": 0.1471, "step": 4390 }, { "epoch": 2.2857142857142856, "grad_norm": 0.13163937976410825, "learning_rate": 7.0737484040649864e-06, "loss": 0.1493, "step": 4400 }, { "epoch": 2.290909090909091, "grad_norm": 0.12981020405290916, "learning_rate": 6.9762912953052706e-06, "loss": 0.1554, "step": 4410 }, { "epoch": 2.296103896103896, "grad_norm": 0.13098768915549264, "learning_rate": 6.879401223836823e-06, "loss": 0.1493, "step": 4420 }, { "epoch": 2.3012987012987014, "grad_norm": 0.4012203963638076, "learning_rate": 6.783081237882649e-06, "loss": 0.1485, "step": 4430 }, { "epoch": 2.3064935064935064, "grad_norm": 0.13383556830226412, "learning_rate": 6.68733436773051e-06, "loss": 0.1496, "step": 4440 }, { "epoch": 2.311688311688312, "grad_norm": 0.1333510593064724, "learning_rate": 6.592163625637582e-06, "loss": 0.1541, "step": 4450 }, { "epoch": 2.3168831168831168, "grad_norm": 0.1565215404000228, "learning_rate": 6.497572005735689e-06, "loss": 0.1501, "step": 4460 }, { "epoch": 2.322077922077922, "grad_norm": 0.13259468650053935, "learning_rate": 6.4035624839370975e-06, "loss": 0.1551, "step": 4470 }, { "epoch": 2.327272727272727, "grad_norm": 0.1307362325637928, "learning_rate": 6.310138017840917e-06, "loss": 0.1486, "step": 4480 }, { "epoch": 2.3324675324675326, "grad_norm": 0.13133190490766602, "learning_rate": 6.217301546640022e-06, "loss": 0.1518, "step": 4490 }, { "epoch": 2.3376623376623376, "grad_norm": 0.13025882458056662, "learning_rate": 6.125055991028583e-06, "loss": 0.1494, "step": 4500 }, { "epoch": 2.342857142857143, "grad_norm": 0.1254143759117903, "learning_rate": 6.0334042531102005e-06, "loss": 0.1458, "step": 4510 }, { "epoch": 2.348051948051948, "grad_norm": 0.12809326669267687, "learning_rate": 5.942349216306614e-06, "loss": 0.1471, "step": 4520 }, { "epoch": 2.3532467532467534, "grad_norm": 0.12509372527529825, "learning_rate": 5.851893745266945e-06, "loss": 0.1478, "step": 4530 }, { "epoch": 2.3584415584415583, "grad_norm": 0.13151188136576936, "learning_rate": 5.7620406857776e-06, "loss": 0.1495, "step": 4540 }, { "epoch": 2.3636363636363638, "grad_norm": 0.1347993314789362, "learning_rate": 5.67279286467274e-06, "loss": 0.1504, "step": 4550 }, { "epoch": 2.3688311688311687, "grad_norm": 0.13205701694453234, "learning_rate": 5.584153089745345e-06, "loss": 0.1492, "step": 4560 }, { "epoch": 2.374025974025974, "grad_norm": 0.130155592941109, "learning_rate": 5.4961241496588655e-06, "loss": 0.1522, "step": 4570 }, { "epoch": 2.379220779220779, "grad_norm": 0.13048892356720782, "learning_rate": 5.408708813859531e-06, "loss": 0.1464, "step": 4580 }, { "epoch": 2.3844155844155845, "grad_norm": 0.12447265332394425, "learning_rate": 5.3219098324891496e-06, "loss": 0.1487, "step": 4590 }, { "epoch": 2.3896103896103895, "grad_norm": 0.12292541161875337, "learning_rate": 5.235729936298661e-06, "loss": 0.149, "step": 4600 }, { "epoch": 2.394805194805195, "grad_norm": 0.12776875389985792, "learning_rate": 5.15017183656217e-06, "loss": 0.151, "step": 4610 }, { "epoch": 2.4, "grad_norm": 0.12547722558900828, "learning_rate": 5.065238224991698e-06, "loss": 0.1484, "step": 4620 }, { "epoch": 2.4051948051948053, "grad_norm": 0.12365440404472366, "learning_rate": 4.980931773652453e-06, "loss": 0.1465, "step": 4630 }, { "epoch": 2.4103896103896103, "grad_norm": 0.12432434525146745, "learning_rate": 4.897255134878786e-06, "loss": 0.1483, "step": 4640 }, { "epoch": 2.4155844155844157, "grad_norm": 0.139186530570493, "learning_rate": 4.814210941190755e-06, "loss": 0.1518, "step": 4650 }, { "epoch": 2.4207792207792207, "grad_norm": 0.12528679762457748, "learning_rate": 4.731801805211286e-06, "loss": 0.1489, "step": 4660 }, { "epoch": 2.425974025974026, "grad_norm": 0.12450530026135737, "learning_rate": 4.650030319583987e-06, "loss": 0.1505, "step": 4670 }, { "epoch": 2.431168831168831, "grad_norm": 0.13039926074492833, "learning_rate": 4.568899056891604e-06, "loss": 0.1485, "step": 4680 }, { "epoch": 2.4363636363636365, "grad_norm": 0.12466326477523655, "learning_rate": 4.488410569575028e-06, "loss": 0.1475, "step": 4690 }, { "epoch": 2.4415584415584415, "grad_norm": 0.12528932051361105, "learning_rate": 4.408567389853055e-06, "loss": 0.1516, "step": 4700 }, { "epoch": 2.446753246753247, "grad_norm": 0.1259262307284031, "learning_rate": 4.329372029642678e-06, "loss": 0.1511, "step": 4710 }, { "epoch": 2.451948051948052, "grad_norm": 0.12907058499763246, "learning_rate": 4.250826980480105e-06, "loss": 0.1458, "step": 4720 }, { "epoch": 2.4571428571428573, "grad_norm": 0.13360791308885997, "learning_rate": 4.172934713442328e-06, "loss": 0.1506, "step": 4730 }, { "epoch": 2.4623376623376623, "grad_norm": 0.1246999497148764, "learning_rate": 4.095697679069382e-06, "loss": 0.1474, "step": 4740 }, { "epoch": 2.4675324675324677, "grad_norm": 0.12422306029077695, "learning_rate": 4.019118307287307e-06, "loss": 0.1465, "step": 4750 }, { "epoch": 2.4727272727272727, "grad_norm": 0.12816176999453854, "learning_rate": 3.943199007331633e-06, "loss": 0.1476, "step": 4760 }, { "epoch": 2.477922077922078, "grad_norm": 0.1283395199969505, "learning_rate": 3.8679421676716235e-06, "loss": 0.1505, "step": 4770 }, { "epoch": 2.483116883116883, "grad_norm": 0.12466666439258484, "learning_rate": 3.793350155935113e-06, "loss": 0.1504, "step": 4780 }, { "epoch": 2.4883116883116885, "grad_norm": 0.12152148883126895, "learning_rate": 3.7194253188340412e-06, "loss": 0.1471, "step": 4790 }, { "epoch": 2.4935064935064934, "grad_norm": 0.12943483158505822, "learning_rate": 3.6461699820905915e-06, "loss": 0.1488, "step": 4800 }, { "epoch": 2.498701298701299, "grad_norm": 0.1237730899808012, "learning_rate": 3.5735864503640693e-06, "loss": 0.1477, "step": 4810 }, { "epoch": 2.503896103896104, "grad_norm": 0.12374067827054203, "learning_rate": 3.50167700717835e-06, "loss": 0.1502, "step": 4820 }, { "epoch": 2.509090909090909, "grad_norm": 0.1271358088834976, "learning_rate": 3.4304439148500624e-06, "loss": 0.1446, "step": 4830 }, { "epoch": 2.5142857142857142, "grad_norm": 0.1272280855510312, "learning_rate": 3.3598894144173913e-06, "loss": 0.1496, "step": 4840 }, { "epoch": 2.5194805194805197, "grad_norm": 0.1258197921752763, "learning_rate": 3.290015725569626e-06, "loss": 0.15, "step": 4850 }, { "epoch": 2.5246753246753246, "grad_norm": 0.12473206628431092, "learning_rate": 3.220825046577261e-06, "loss": 0.1462, "step": 4860 }, { "epoch": 2.5298701298701296, "grad_norm": 0.11843223945225219, "learning_rate": 3.152319554222885e-06, "loss": 0.1477, "step": 4870 }, { "epoch": 2.535064935064935, "grad_norm": 0.1259012256625921, "learning_rate": 3.084501403732673e-06, "loss": 0.1516, "step": 4880 }, { "epoch": 2.5402597402597404, "grad_norm": 0.12425830875342217, "learning_rate": 3.017372728708595e-06, "loss": 0.1508, "step": 4890 }, { "epoch": 2.5454545454545454, "grad_norm": 0.12294050842755987, "learning_rate": 2.950935641061275e-06, "loss": 0.1468, "step": 4900 }, { "epoch": 2.5506493506493504, "grad_norm": 0.12210571551971311, "learning_rate": 2.8851922309435815e-06, "loss": 0.1494, "step": 4910 }, { "epoch": 2.555844155844156, "grad_norm": 0.12349634158702745, "learning_rate": 2.8201445666848346e-06, "loss": 0.1441, "step": 4920 }, { "epoch": 2.5610389610389612, "grad_norm": 0.12742872858385557, "learning_rate": 2.755794694725741e-06, "loss": 0.1457, "step": 4930 }, { "epoch": 2.566233766233766, "grad_norm": 0.1242587205799011, "learning_rate": 2.6921446395540284e-06, "loss": 0.1474, "step": 4940 }, { "epoch": 2.571428571428571, "grad_norm": 0.1266773031643187, "learning_rate": 2.6291964036407545e-06, "loss": 0.1472, "step": 4950 }, { "epoch": 2.5766233766233766, "grad_norm": 0.1253376470917521, "learning_rate": 2.5669519673772847e-06, "loss": 0.1471, "step": 4960 }, { "epoch": 2.581818181818182, "grad_norm": 0.12644089026425634, "learning_rate": 2.5054132890130087e-06, "loss": 0.1521, "step": 4970 }, { "epoch": 2.587012987012987, "grad_norm": 0.1197473844911881, "learning_rate": 2.444582304593723e-06, "loss": 0.1478, "step": 4980 }, { "epoch": 2.592207792207792, "grad_norm": 0.11990806649152073, "learning_rate": 2.3844609279007226e-06, "loss": 0.1499, "step": 4990 }, { "epoch": 2.5974025974025974, "grad_norm": 0.1202956364762698, "learning_rate": 2.325051050390595e-06, "loss": 0.1478, "step": 5000 }, { "epoch": 2.602597402597403, "grad_norm": 0.12255235260438935, "learning_rate": 2.266354541135726e-06, "loss": 0.1489, "step": 5010 }, { "epoch": 2.6077922077922078, "grad_norm": 0.12671799892499586, "learning_rate": 2.2083732467654603e-06, "loss": 0.1462, "step": 5020 }, { "epoch": 2.6129870129870127, "grad_norm": 0.12223780402274406, "learning_rate": 2.1511089914080464e-06, "loss": 0.1494, "step": 5030 }, { "epoch": 2.618181818181818, "grad_norm": 0.12145201814175376, "learning_rate": 2.094563576633221e-06, "loss": 0.149, "step": 5040 }, { "epoch": 2.6233766233766236, "grad_norm": 0.12317156500158179, "learning_rate": 2.038738781395552e-06, "loss": 0.1525, "step": 5050 }, { "epoch": 2.6285714285714286, "grad_norm": 0.12189047171784712, "learning_rate": 1.9836363619784552e-06, "loss": 0.1463, "step": 5060 }, { "epoch": 2.6337662337662335, "grad_norm": 0.12030818805338123, "learning_rate": 1.929258051938945e-06, "loss": 0.1463, "step": 5070 }, { "epoch": 2.638961038961039, "grad_norm": 0.12106285056082301, "learning_rate": 1.8756055620530898e-06, "loss": 0.1471, "step": 5080 }, { "epoch": 2.6441558441558444, "grad_norm": 0.12038600565908254, "learning_rate": 1.8226805802622094e-06, "loss": 0.1494, "step": 5090 }, { "epoch": 2.6493506493506493, "grad_norm": 0.12263633853976644, "learning_rate": 1.770484771619743e-06, "loss": 0.1499, "step": 5100 }, { "epoch": 2.6545454545454543, "grad_norm": 0.12334958408238764, "learning_rate": 1.7190197782389035e-06, "loss": 0.1484, "step": 5110 }, { "epoch": 2.6597402597402597, "grad_norm": 0.12029874859542598, "learning_rate": 1.6682872192409632e-06, "loss": 0.148, "step": 5120 }, { "epoch": 2.664935064935065, "grad_norm": 0.11743108789978497, "learning_rate": 1.618288690704367e-06, "loss": 0.1464, "step": 5130 }, { "epoch": 2.67012987012987, "grad_norm": 0.12483269486534702, "learning_rate": 1.5690257656144846e-06, "loss": 0.1456, "step": 5140 }, { "epoch": 2.675324675324675, "grad_norm": 0.1207903651113982, "learning_rate": 1.520499993814148e-06, "loss": 0.1444, "step": 5150 }, { "epoch": 2.6805194805194805, "grad_norm": 0.12216310435997871, "learning_rate": 1.472712901954873e-06, "loss": 0.1487, "step": 5160 }, { "epoch": 2.685714285714286, "grad_norm": 0.12421180243276636, "learning_rate": 1.4256659934488215e-06, "loss": 0.1468, "step": 5170 }, { "epoch": 2.690909090909091, "grad_norm": 0.12291390409864483, "learning_rate": 1.3793607484215458e-06, "loss": 0.1492, "step": 5180 }, { "epoch": 2.696103896103896, "grad_norm": 0.12424898483815008, "learning_rate": 1.3337986236653777e-06, "loss": 0.1498, "step": 5190 }, { "epoch": 2.7012987012987013, "grad_norm": 0.12220447786976209, "learning_rate": 1.2889810525936214e-06, "loss": 0.1464, "step": 5200 }, { "epoch": 2.7064935064935067, "grad_norm": 0.1170794743951447, "learning_rate": 1.244909445195444e-06, "loss": 0.1479, "step": 5210 }, { "epoch": 2.7116883116883117, "grad_norm": 0.11982789622673459, "learning_rate": 1.2015851879915302e-06, "loss": 0.1456, "step": 5220 }, { "epoch": 2.7168831168831167, "grad_norm": 0.11731066611016075, "learning_rate": 1.1590096439904496e-06, "loss": 0.1441, "step": 5230 }, { "epoch": 2.722077922077922, "grad_norm": 0.12315215332934029, "learning_rate": 1.1171841526457838e-06, "loss": 0.1491, "step": 5240 }, { "epoch": 2.7272727272727275, "grad_norm": 0.11853604447437521, "learning_rate": 1.0761100298139788e-06, "loss": 0.1471, "step": 5250 }, { "epoch": 2.7324675324675325, "grad_norm": 0.11999351610266512, "learning_rate": 1.0357885677129614e-06, "loss": 0.1505, "step": 5260 }, { "epoch": 2.7376623376623375, "grad_norm": 0.12232868615312523, "learning_rate": 9.962210348814504e-07, "loss": 0.1446, "step": 5270 }, { "epoch": 2.742857142857143, "grad_norm": 0.12255388742745053, "learning_rate": 9.574086761391043e-07, "loss": 0.1488, "step": 5280 }, { "epoch": 2.7480519480519483, "grad_norm": 0.11949069963969916, "learning_rate": 9.193527125473056e-07, "loss": 0.149, "step": 5290 }, { "epoch": 2.7532467532467533, "grad_norm": 0.11543627566803302, "learning_rate": 8.820543413707694e-07, "loss": 0.1498, "step": 5300 }, { "epoch": 2.7584415584415583, "grad_norm": 0.12420569841945647, "learning_rate": 8.455147360398819e-07, "loss": 0.149, "step": 5310 }, { "epoch": 2.7636363636363637, "grad_norm": 0.11841650740456904, "learning_rate": 8.097350461137631e-07, "loss": 0.1458, "step": 5320 }, { "epoch": 2.768831168831169, "grad_norm": 0.12370250243324289, "learning_rate": 7.747163972441213e-07, "loss": 0.1474, "step": 5330 }, { "epoch": 2.774025974025974, "grad_norm": 0.1214996353399532, "learning_rate": 7.404598911398331e-07, "loss": 0.1493, "step": 5340 }, { "epoch": 2.779220779220779, "grad_norm": 0.1195478340774238, "learning_rate": 7.069666055322777e-07, "loss": 0.1482, "step": 5350 }, { "epoch": 2.7844155844155845, "grad_norm": 0.12483131247062451, "learning_rate": 6.742375941414247e-07, "loss": 0.1472, "step": 5360 }, { "epoch": 2.78961038961039, "grad_norm": 0.11952354364943257, "learning_rate": 6.422738866427053e-07, "loss": 0.1495, "step": 5370 }, { "epoch": 2.794805194805195, "grad_norm": 0.11960938019103264, "learning_rate": 6.110764886346043e-07, "loss": 0.1478, "step": 5380 }, { "epoch": 2.8, "grad_norm": 0.11946979649886329, "learning_rate": 5.806463816070251e-07, "loss": 0.1435, "step": 5390 }, { "epoch": 2.8051948051948052, "grad_norm": 0.12094569119420252, "learning_rate": 5.509845229103999e-07, "loss": 0.1477, "step": 5400 }, { "epoch": 2.8103896103896107, "grad_norm": 0.12040943576693716, "learning_rate": 5.220918457255947e-07, "loss": 0.147, "step": 5410 }, { "epoch": 2.8155844155844156, "grad_norm": 0.12306327075134675, "learning_rate": 4.939692590345324e-07, "loss": 0.1459, "step": 5420 }, { "epoch": 2.8207792207792206, "grad_norm": 0.11635380002309499, "learning_rate": 4.6661764759159954e-07, "loss": 0.146, "step": 5430 }, { "epoch": 2.825974025974026, "grad_norm": 0.12155462126032764, "learning_rate": 4.400378718958209e-07, "loss": 0.1477, "step": 5440 }, { "epoch": 2.8311688311688314, "grad_norm": 0.13019986073768214, "learning_rate": 4.1423076816376747e-07, "loss": 0.15, "step": 5450 }, { "epoch": 2.8363636363636364, "grad_norm": 0.11990566287768059, "learning_rate": 3.8919714830327194e-07, "loss": 0.1463, "step": 5460 }, { "epoch": 2.8415584415584414, "grad_norm": 0.123316194099736, "learning_rate": 3.6493779988786835e-07, "loss": 0.1493, "step": 5470 }, { "epoch": 2.846753246753247, "grad_norm": 0.11382332839422425, "learning_rate": 3.414534861320262e-07, "loss": 0.1459, "step": 5480 }, { "epoch": 2.851948051948052, "grad_norm": 0.11644720907565642, "learning_rate": 3.187449458671249e-07, "loss": 0.1523, "step": 5490 }, { "epoch": 2.857142857142857, "grad_norm": 0.11676245617873343, "learning_rate": 2.968128935182279e-07, "loss": 0.1477, "step": 5500 }, { "epoch": 2.862337662337662, "grad_norm": 0.12250814554782062, "learning_rate": 2.756580190815927e-07, "loss": 0.1473, "step": 5510 }, { "epoch": 2.8675324675324676, "grad_norm": 0.11988558217156425, "learning_rate": 2.5528098810296276e-07, "loss": 0.148, "step": 5520 }, { "epoch": 2.8727272727272726, "grad_norm": 0.11907029230620142, "learning_rate": 2.3568244165664555e-07, "loss": 0.1479, "step": 5530 }, { "epoch": 2.877922077922078, "grad_norm": 0.1175605990368118, "learning_rate": 2.1686299632532049e-07, "loss": 0.1454, "step": 5540 }, { "epoch": 2.883116883116883, "grad_norm": 0.1170893554332405, "learning_rate": 1.9882324418065978e-07, "loss": 0.1463, "step": 5550 }, { "epoch": 2.8883116883116884, "grad_norm": 0.1217388184223315, "learning_rate": 1.8156375276468795e-07, "loss": 0.1469, "step": 5560 }, { "epoch": 2.8935064935064934, "grad_norm": 0.12104004201060858, "learning_rate": 1.6508506507193766e-07, "loss": 0.1482, "step": 5570 }, { "epoch": 2.898701298701299, "grad_norm": 0.12023073487651949, "learning_rate": 1.4938769953236064e-07, "loss": 0.1482, "step": 5580 }, { "epoch": 2.9038961038961038, "grad_norm": 0.11829219458577808, "learning_rate": 1.344721499950241e-07, "loss": 0.1466, "step": 5590 }, { "epoch": 2.909090909090909, "grad_norm": 0.1202978727970241, "learning_rate": 1.203388857125537e-07, "loss": 0.1446, "step": 5600 }, { "epoch": 2.914285714285714, "grad_norm": 0.12003327209462132, "learning_rate": 1.0698835132640361e-07, "loss": 0.1487, "step": 5610 }, { "epoch": 2.9194805194805196, "grad_norm": 0.1181424516998344, "learning_rate": 9.442096685283452e-08, "loss": 0.1465, "step": 5620 }, { "epoch": 2.9246753246753245, "grad_norm": 0.1230036027460985, "learning_rate": 8.263712766972686e-08, "loss": 0.1471, "step": 5630 }, { "epoch": 2.92987012987013, "grad_norm": 0.12320775023601876, "learning_rate": 7.163720450412415e-08, "loss": 0.1483, "step": 5640 }, { "epoch": 2.935064935064935, "grad_norm": 0.11757950962836114, "learning_rate": 6.142154342057282e-08, "loss": 0.1451, "step": 5650 }, { "epoch": 2.9402597402597404, "grad_norm": 0.12075101363106212, "learning_rate": 5.199046581024214e-08, "loss": 0.1457, "step": 5660 }, { "epoch": 2.9454545454545453, "grad_norm": 0.11591455658677356, "learning_rate": 4.334426838080719e-08, "loss": 0.1454, "step": 5670 }, { "epoch": 2.9506493506493507, "grad_norm": 0.11974339645293, "learning_rate": 3.5483223147114716e-08, "loss": 0.1493, "step": 5680 }, { "epoch": 2.9558441558441557, "grad_norm": 0.12221319049722304, "learning_rate": 2.8407577422628895e-08, "loss": 0.146, "step": 5690 }, { "epoch": 2.961038961038961, "grad_norm": 0.11694250162651787, "learning_rate": 2.2117553811643044e-08, "loss": 0.1458, "step": 5700 }, { "epoch": 2.966233766233766, "grad_norm": 0.11631918845285252, "learning_rate": 1.6613350202282496e-08, "loss": 0.1494, "step": 5710 }, { "epoch": 2.9714285714285715, "grad_norm": 0.12032018694766353, "learning_rate": 1.1895139760284557e-08, "loss": 0.1492, "step": 5720 }, { "epoch": 2.9766233766233765, "grad_norm": 0.1160647794022908, "learning_rate": 7.963070923533433e-09, "loss": 0.146, "step": 5730 }, { "epoch": 2.981818181818182, "grad_norm": 0.12107552388620237, "learning_rate": 4.817267397405623e-09, "loss": 0.1482, "step": 5740 }, { "epoch": 2.987012987012987, "grad_norm": 0.12616234731784437, "learning_rate": 2.4578281508702563e-09, "loss": 0.1466, "step": 5750 }, { "epoch": 2.9922077922077923, "grad_norm": 0.11975678944265007, "learning_rate": 8.848274133860246e-10, "loss": 0.1484, "step": 5760 }, { "epoch": 2.9974025974025973, "grad_norm": 0.12134318440981517, "learning_rate": 9.831467255028148e-11, "loss": 0.1452, "step": 5770 }, { "epoch": 3.0, "step": 5775, "total_flos": 1.816985160646656e+16, "train_loss": 0.20176902750353792, "train_runtime": 231849.8596, "train_samples_per_second": 1.195, "train_steps_per_second": 0.025 } ], "logging_steps": 10, "max_steps": 5775, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.816985160646656e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }