| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9999964067682114, |
| "eval_steps": 3000, |
| "global_step": 69575, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0007186463577205975, |
| "grad_norm": 0.33678868412971497, |
| "learning_rate": 9.992957240388071e-05, |
| "loss": 0.3609, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.001437292715441195, |
| "grad_norm": 0.3722863495349884, |
| "learning_rate": 9.985770750988143e-05, |
| "loss": 0.2822, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0021559390731617925, |
| "grad_norm": 0.3654504120349884, |
| "learning_rate": 9.978584261588215e-05, |
| "loss": 0.2715, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.00287458543088239, |
| "grad_norm": 0.3333148956298828, |
| "learning_rate": 9.971397772188287e-05, |
| "loss": 0.273, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0035932317886029873, |
| "grad_norm": 0.28888237476348877, |
| "learning_rate": 9.964211282788358e-05, |
| "loss": 0.2777, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.004311878146323585, |
| "grad_norm": 0.4001215696334839, |
| "learning_rate": 9.95702479338843e-05, |
| "loss": 0.2622, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.005030524504044182, |
| "grad_norm": 0.38211333751678467, |
| "learning_rate": 9.949838303988502e-05, |
| "loss": 0.27, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.00574917086176478, |
| "grad_norm": 0.3811050057411194, |
| "learning_rate": 9.942651814588574e-05, |
| "loss": 0.2599, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.006467817219485378, |
| "grad_norm": 0.41944026947021484, |
| "learning_rate": 9.935465325188645e-05, |
| "loss": 0.2539, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0071864635772059746, |
| "grad_norm": 0.4201948046684265, |
| "learning_rate": 9.928278835788718e-05, |
| "loss": 0.2501, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.007905109934926572, |
| "grad_norm": 0.34886258840560913, |
| "learning_rate": 9.92109234638879e-05, |
| "loss": 0.2578, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.00862375629264717, |
| "grad_norm": 0.369731605052948, |
| "learning_rate": 9.91390585698886e-05, |
| "loss": 0.2558, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.009342402650367768, |
| "grad_norm": 0.2964828610420227, |
| "learning_rate": 9.906719367588934e-05, |
| "loss": 0.2445, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.010061049008088364, |
| "grad_norm": 0.27640748023986816, |
| "learning_rate": 9.899532878189006e-05, |
| "loss": 0.2477, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.010779695365808962, |
| "grad_norm": 0.4146655797958374, |
| "learning_rate": 9.892346388789076e-05, |
| "loss": 0.2427, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.01149834172352956, |
| "grad_norm": 0.24966135621070862, |
| "learning_rate": 9.88515989938915e-05, |
| "loss": 0.243, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.012216988081250157, |
| "grad_norm": 0.39003002643585205, |
| "learning_rate": 9.877973409989221e-05, |
| "loss": 0.2514, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.012935634438970755, |
| "grad_norm": 0.3037317991256714, |
| "learning_rate": 9.870786920589292e-05, |
| "loss": 0.2444, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.013654280796691351, |
| "grad_norm": 0.4129560589790344, |
| "learning_rate": 9.863600431189365e-05, |
| "loss": 0.2488, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.014372927154411949, |
| "grad_norm": 0.3825244903564453, |
| "learning_rate": 9.856413941789436e-05, |
| "loss": 0.244, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.015091573512132547, |
| "grad_norm": 0.30103355646133423, |
| "learning_rate": 9.849227452389508e-05, |
| "loss": 0.2409, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.015810219869853145, |
| "grad_norm": 0.3930550515651703, |
| "learning_rate": 9.842040962989581e-05, |
| "loss": 0.2282, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.01652886622757374, |
| "grad_norm": 0.31128013134002686, |
| "learning_rate": 9.834854473589651e-05, |
| "loss": 0.239, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.01724751258529434, |
| "grad_norm": 0.3896910846233368, |
| "learning_rate": 9.827667984189723e-05, |
| "loss": 0.2354, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.017966158943014936, |
| "grad_norm": 0.35376253724098206, |
| "learning_rate": 9.820481494789797e-05, |
| "loss": 0.2368, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.018684805300735536, |
| "grad_norm": 0.31663399934768677, |
| "learning_rate": 9.813295005389867e-05, |
| "loss": 0.2358, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.019403451658456132, |
| "grad_norm": 0.29948070645332336, |
| "learning_rate": 9.806108515989939e-05, |
| "loss": 0.2331, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.020122098016176728, |
| "grad_norm": 0.37420302629470825, |
| "learning_rate": 9.798922026590011e-05, |
| "loss": 0.2323, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.020840744373897328, |
| "grad_norm": 0.38153383135795593, |
| "learning_rate": 9.791735537190083e-05, |
| "loss": 0.2348, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.021559390731617924, |
| "grad_norm": 0.30417364835739136, |
| "learning_rate": 9.784549047790155e-05, |
| "loss": 0.242, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.022278037089338523, |
| "grad_norm": 0.31648948788642883, |
| "learning_rate": 9.777362558390227e-05, |
| "loss": 0.2322, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.02299668344705912, |
| "grad_norm": 0.28670674562454224, |
| "learning_rate": 9.770176068990299e-05, |
| "loss": 0.2264, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.023715329804779715, |
| "grad_norm": 0.3444724977016449, |
| "learning_rate": 9.76298957959037e-05, |
| "loss": 0.2286, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.024433976162500315, |
| "grad_norm": 0.3509560227394104, |
| "learning_rate": 9.755803090190442e-05, |
| "loss": 0.2296, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.02515262252022091, |
| "grad_norm": 0.34082117676734924, |
| "learning_rate": 9.748616600790514e-05, |
| "loss": 0.2286, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.02587126887794151, |
| "grad_norm": 0.39878740906715393, |
| "learning_rate": 9.741430111390586e-05, |
| "loss": 0.2284, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.026589915235662107, |
| "grad_norm": 0.3792494833469391, |
| "learning_rate": 9.734243621990658e-05, |
| "loss": 0.2232, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.027308561593382703, |
| "grad_norm": 0.26393282413482666, |
| "learning_rate": 9.72705713259073e-05, |
| "loss": 0.22, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.028027207951103302, |
| "grad_norm": 0.2707872986793518, |
| "learning_rate": 9.719870643190802e-05, |
| "loss": 0.2195, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.028745854308823898, |
| "grad_norm": 0.2854771018028259, |
| "learning_rate": 9.712684153790874e-05, |
| "loss": 0.2227, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.029464500666544498, |
| "grad_norm": 0.29345157742500305, |
| "learning_rate": 9.705497664390946e-05, |
| "loss": 0.2247, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.030183147024265094, |
| "grad_norm": 0.29724326729774475, |
| "learning_rate": 9.698311174991018e-05, |
| "loss": 0.2239, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.030901793381985693, |
| "grad_norm": 0.31227824091911316, |
| "learning_rate": 9.69112468559109e-05, |
| "loss": 0.2265, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.03162043973970629, |
| "grad_norm": 0.31163766980171204, |
| "learning_rate": 9.683938196191162e-05, |
| "loss": 0.2266, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.03233908609742689, |
| "grad_norm": 0.2654283344745636, |
| "learning_rate": 9.676751706791232e-05, |
| "loss": 0.2233, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.03305773245514748, |
| "grad_norm": 0.27485188841819763, |
| "learning_rate": 9.669565217391304e-05, |
| "loss": 0.2252, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.03377637881286808, |
| "grad_norm": 0.28361964225769043, |
| "learning_rate": 9.662378727991377e-05, |
| "loss": 0.2272, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.03449502517058868, |
| "grad_norm": 0.45227479934692383, |
| "learning_rate": 9.655192238591448e-05, |
| "loss": 0.2219, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.03521367152830927, |
| "grad_norm": 0.26741498708724976, |
| "learning_rate": 9.64800574919152e-05, |
| "loss": 0.2244, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.03593231788602987, |
| "grad_norm": 0.38418859243392944, |
| "learning_rate": 9.640819259791593e-05, |
| "loss": 0.2213, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.03665096424375047, |
| "grad_norm": 0.28470662236213684, |
| "learning_rate": 9.633632770391664e-05, |
| "loss": 0.2265, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.03736961060147107, |
| "grad_norm": 0.3410508632659912, |
| "learning_rate": 9.626446280991736e-05, |
| "loss": 0.2172, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.038088256959191664, |
| "grad_norm": 0.30783000588417053, |
| "learning_rate": 9.619259791591809e-05, |
| "loss": 0.2198, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.038806903316912264, |
| "grad_norm": 0.2574290335178375, |
| "learning_rate": 9.61207330219188e-05, |
| "loss": 0.2149, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.039525549674632864, |
| "grad_norm": 0.2963494062423706, |
| "learning_rate": 9.604886812791951e-05, |
| "loss": 0.2193, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.040244196032353456, |
| "grad_norm": 0.33474215865135193, |
| "learning_rate": 9.597700323392023e-05, |
| "loss": 0.2202, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.040962842390074056, |
| "grad_norm": 0.3282819390296936, |
| "learning_rate": 9.590513833992095e-05, |
| "loss": 0.2249, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.041681488747794655, |
| "grad_norm": 0.4898041784763336, |
| "learning_rate": 9.583327344592167e-05, |
| "loss": 0.2199, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.042400135105515255, |
| "grad_norm": 0.2768670618534088, |
| "learning_rate": 9.576140855192239e-05, |
| "loss": 0.217, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.04311878146323585, |
| "grad_norm": 0.3368474543094635, |
| "learning_rate": 9.568954365792311e-05, |
| "loss": 0.2171, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.04311878146323585, |
| "eval_loss": 0.21704652905464172, |
| "eval_runtime": 2340.123, |
| "eval_samples_per_second": 25.037, |
| "eval_steps_per_second": 3.13, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.04383742782095645, |
| "grad_norm": 0.4556143283843994, |
| "learning_rate": 9.561767876392383e-05, |
| "loss": 0.2196, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.044556074178677046, |
| "grad_norm": 0.3851192891597748, |
| "learning_rate": 9.554581386992455e-05, |
| "loss": 0.2154, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.04527472053639764, |
| "grad_norm": 0.33037668466567993, |
| "learning_rate": 9.547394897592527e-05, |
| "loss": 0.2146, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.04599336689411824, |
| "grad_norm": 0.5030372142791748, |
| "learning_rate": 9.540208408192598e-05, |
| "loss": 0.2163, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.04671201325183884, |
| "grad_norm": 0.2891259789466858, |
| "learning_rate": 9.53302191879267e-05, |
| "loss": 0.2102, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.04743065960955943, |
| "grad_norm": 0.3370216190814972, |
| "learning_rate": 9.525835429392742e-05, |
| "loss": 0.226, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.04814930596728003, |
| "grad_norm": 0.3481488525867462, |
| "learning_rate": 9.518648939992814e-05, |
| "loss": 0.2102, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.04886795232500063, |
| "grad_norm": 0.23386803269386292, |
| "learning_rate": 9.511462450592886e-05, |
| "loss": 0.2094, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.04958659868272123, |
| "grad_norm": 0.3331839144229889, |
| "learning_rate": 9.504275961192958e-05, |
| "loss": 0.2133, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.05030524504044182, |
| "grad_norm": 0.29888877272605896, |
| "learning_rate": 9.49708947179303e-05, |
| "loss": 0.2085, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.05102389139816242, |
| "grad_norm": 0.284204363822937, |
| "learning_rate": 9.489902982393102e-05, |
| "loss": 0.2084, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.05174253775588302, |
| "grad_norm": 0.3944644033908844, |
| "learning_rate": 9.482716492993174e-05, |
| "loss": 0.2084, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.052461184113603614, |
| "grad_norm": 0.3057551085948944, |
| "learning_rate": 9.475530003593244e-05, |
| "loss": 0.2092, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.05317983047132421, |
| "grad_norm": 0.30618202686309814, |
| "learning_rate": 9.468343514193318e-05, |
| "loss": 0.206, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.05389847682904481, |
| "grad_norm": 0.2912580370903015, |
| "learning_rate": 9.46115702479339e-05, |
| "loss": 0.2097, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.054617123186765405, |
| "grad_norm": 0.29069676995277405, |
| "learning_rate": 9.45397053539346e-05, |
| "loss": 0.2172, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.055335769544486005, |
| "grad_norm": 0.35476160049438477, |
| "learning_rate": 9.446784045993533e-05, |
| "loss": 0.2142, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.056054415902206604, |
| "grad_norm": 0.38042518496513367, |
| "learning_rate": 9.439597556593605e-05, |
| "loss": 0.2093, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.056773062259927204, |
| "grad_norm": 0.33690425753593445, |
| "learning_rate": 9.432411067193676e-05, |
| "loss": 0.2088, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.057491708617647796, |
| "grad_norm": 0.329703152179718, |
| "learning_rate": 9.425224577793749e-05, |
| "loss": 0.2112, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.058210354975368396, |
| "grad_norm": 0.35252898931503296, |
| "learning_rate": 9.41803808839382e-05, |
| "loss": 0.2093, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.058929001333088996, |
| "grad_norm": 0.36909961700439453, |
| "learning_rate": 9.410851598993891e-05, |
| "loss": 0.2077, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.05964764769080959, |
| "grad_norm": 0.3794455826282501, |
| "learning_rate": 9.403665109593965e-05, |
| "loss": 0.2063, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.06036629404853019, |
| "grad_norm": 0.31799831986427307, |
| "learning_rate": 9.396478620194035e-05, |
| "loss": 0.2079, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.06108494040625079, |
| "grad_norm": 0.2503960132598877, |
| "learning_rate": 9.389292130794107e-05, |
| "loss": 0.2069, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.06180358676397139, |
| "grad_norm": 0.32965216040611267, |
| "learning_rate": 9.38210564139418e-05, |
| "loss": 0.21, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.06252223312169199, |
| "grad_norm": 0.2706376314163208, |
| "learning_rate": 9.374919151994251e-05, |
| "loss": 0.204, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.06324087947941258, |
| "grad_norm": 0.3276236951351166, |
| "learning_rate": 9.367732662594323e-05, |
| "loss": 0.2069, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.06395952583713317, |
| "grad_norm": 0.3109481930732727, |
| "learning_rate": 9.360546173194396e-05, |
| "loss": 0.2147, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.06467817219485378, |
| "grad_norm": 0.3856702446937561, |
| "learning_rate": 9.353359683794467e-05, |
| "loss": 0.21, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.06539681855257437, |
| "grad_norm": 0.32052862644195557, |
| "learning_rate": 9.346173194394539e-05, |
| "loss": 0.2112, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.06611546491029496, |
| "grad_norm": 0.3940321207046509, |
| "learning_rate": 9.33898670499461e-05, |
| "loss": 0.2063, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.06683411126801557, |
| "grad_norm": 0.290620893239975, |
| "learning_rate": 9.331800215594682e-05, |
| "loss": 0.211, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.06755275762573616, |
| "grad_norm": 0.2583450973033905, |
| "learning_rate": 9.324613726194754e-05, |
| "loss": 0.2043, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.06827140398345675, |
| "grad_norm": 0.5912581086158752, |
| "learning_rate": 9.317427236794826e-05, |
| "loss": 0.2077, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.06899005034117736, |
| "grad_norm": 0.4294145107269287, |
| "learning_rate": 9.310240747394898e-05, |
| "loss": 0.2103, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.06970869669889795, |
| "grad_norm": 0.2989557385444641, |
| "learning_rate": 9.30305425799497e-05, |
| "loss": 0.2006, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.07042734305661855, |
| "grad_norm": 0.3520444333553314, |
| "learning_rate": 9.29586776859504e-05, |
| "loss": 0.2054, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.07114598941433915, |
| "grad_norm": 0.3374841511249542, |
| "learning_rate": 9.288681279195114e-05, |
| "loss": 0.2058, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.07186463577205975, |
| "grad_norm": 0.5366376042366028, |
| "learning_rate": 9.281494789795186e-05, |
| "loss": 0.2063, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.07258328212978035, |
| "grad_norm": 0.3531434237957001, |
| "learning_rate": 9.274308300395256e-05, |
| "loss": 0.1993, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.07330192848750094, |
| "grad_norm": 0.406619668006897, |
| "learning_rate": 9.26712181099533e-05, |
| "loss": 0.2, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.07402057484522154, |
| "grad_norm": 0.5909317135810852, |
| "learning_rate": 9.259935321595402e-05, |
| "loss": 0.2023, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.07473922120294214, |
| "grad_norm": 0.3451712727546692, |
| "learning_rate": 9.252748832195472e-05, |
| "loss": 0.2073, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.07545786756066274, |
| "grad_norm": 0.26411768794059753, |
| "learning_rate": 9.245562342795545e-05, |
| "loss": 0.2077, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.07617651391838333, |
| "grad_norm": 0.36952096223831177, |
| "learning_rate": 9.238375853395616e-05, |
| "loss": 0.1997, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.07689516027610394, |
| "grad_norm": 0.5570283532142639, |
| "learning_rate": 9.231189363995688e-05, |
| "loss": 0.2082, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.07761380663382453, |
| "grad_norm": 0.2923007607460022, |
| "learning_rate": 9.224002874595761e-05, |
| "loss": 0.2009, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.07833245299154512, |
| "grad_norm": 0.36968836188316345, |
| "learning_rate": 9.216816385195832e-05, |
| "loss": 0.2073, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.07905109934926573, |
| "grad_norm": 0.34387850761413574, |
| "learning_rate": 9.209629895795904e-05, |
| "loss": 0.2056, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.07976974570698632, |
| "grad_norm": 0.36204349994659424, |
| "learning_rate": 9.202443406395977e-05, |
| "loss": 0.2009, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.08048839206470691, |
| "grad_norm": 0.517398476600647, |
| "learning_rate": 9.195256916996047e-05, |
| "loss": 0.2025, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.08120703842242752, |
| "grad_norm": 0.3113616704940796, |
| "learning_rate": 9.18807042759612e-05, |
| "loss": 0.1983, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.08192568478014811, |
| "grad_norm": 0.32107213139533997, |
| "learning_rate": 9.180883938196193e-05, |
| "loss": 0.2044, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.0826443311378687, |
| "grad_norm": 0.45938199758529663, |
| "learning_rate": 9.173697448796263e-05, |
| "loss": 0.201, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.08336297749558931, |
| "grad_norm": 0.39211779832839966, |
| "learning_rate": 9.166510959396335e-05, |
| "loss": 0.2122, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.0840816238533099, |
| "grad_norm": 0.49038565158843994, |
| "learning_rate": 9.159324469996407e-05, |
| "loss": 0.197, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.08480027021103051, |
| "grad_norm": 0.3352994918823242, |
| "learning_rate": 9.152137980596479e-05, |
| "loss": 0.2036, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.0855189165687511, |
| "grad_norm": 0.4314139485359192, |
| "learning_rate": 9.144951491196551e-05, |
| "loss": 0.194, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.0862375629264717, |
| "grad_norm": 0.2832717001438141, |
| "learning_rate": 9.137765001796623e-05, |
| "loss": 0.1988, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.0862375629264717, |
| "eval_loss": 0.20054864883422852, |
| "eval_runtime": 2378.5965, |
| "eval_samples_per_second": 24.632, |
| "eval_steps_per_second": 3.079, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.0869562092841923, |
| "grad_norm": 0.37466466426849365, |
| "learning_rate": 9.130578512396695e-05, |
| "loss": 0.2006, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.0876748556419129, |
| "grad_norm": 0.3644261658191681, |
| "learning_rate": 9.123392022996767e-05, |
| "loss": 0.2013, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.08839350199963349, |
| "grad_norm": 0.3186008632183075, |
| "learning_rate": 9.116205533596838e-05, |
| "loss": 0.1953, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.08911214835735409, |
| "grad_norm": 0.51948481798172, |
| "learning_rate": 9.10901904419691e-05, |
| "loss": 0.196, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.08983079471507469, |
| "grad_norm": 0.3450036346912384, |
| "learning_rate": 9.101832554796982e-05, |
| "loss": 0.1954, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.09054944107279528, |
| "grad_norm": 0.3176559805870056, |
| "learning_rate": 9.094646065397054e-05, |
| "loss": 0.1995, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.09126808743051588, |
| "grad_norm": 0.39915382862091064, |
| "learning_rate": 9.087459575997126e-05, |
| "loss": 0.2022, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.09198673378823648, |
| "grad_norm": 0.3537696301937103, |
| "learning_rate": 9.080273086597198e-05, |
| "loss": 0.1983, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.09270538014595707, |
| "grad_norm": 0.35816970467567444, |
| "learning_rate": 9.07308659719727e-05, |
| "loss": 0.2011, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.09342402650367768, |
| "grad_norm": 0.27828651666641235, |
| "learning_rate": 9.065900107797342e-05, |
| "loss": 0.196, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.09414267286139827, |
| "grad_norm": 0.294121116399765, |
| "learning_rate": 9.058713618397414e-05, |
| "loss": 0.1979, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.09486131921911886, |
| "grad_norm": 0.3522706925868988, |
| "learning_rate": 9.051527128997486e-05, |
| "loss": 0.1986, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.09557996557683947, |
| "grad_norm": 0.2693149745464325, |
| "learning_rate": 9.044340639597558e-05, |
| "loss": 0.2033, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.09629861193456006, |
| "grad_norm": 0.3474609851837158, |
| "learning_rate": 9.037154150197628e-05, |
| "loss": 0.1967, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.09701725829228067, |
| "grad_norm": 0.4667980372905731, |
| "learning_rate": 9.029967660797701e-05, |
| "loss": 0.1965, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.09773590465000126, |
| "grad_norm": 0.3009561002254486, |
| "learning_rate": 9.022781171397773e-05, |
| "loss": 0.1897, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.09845455100772185, |
| "grad_norm": 0.3703483045101166, |
| "learning_rate": 9.015594681997844e-05, |
| "loss": 0.2068, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.09917319736544246, |
| "grad_norm": 0.3561961054801941, |
| "learning_rate": 9.008408192597916e-05, |
| "loss": 0.1902, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.09989184372316305, |
| "grad_norm": 0.381755530834198, |
| "learning_rate": 9.001221703197989e-05, |
| "loss": 0.1939, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.10061049008088364, |
| "grad_norm": 0.298585444688797, |
| "learning_rate": 8.99403521379806e-05, |
| "loss": 0.1948, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.10132913643860425, |
| "grad_norm": 0.24976646900177002, |
| "learning_rate": 8.986848724398131e-05, |
| "loss": 0.192, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.10204778279632484, |
| "grad_norm": 0.3998195230960846, |
| "learning_rate": 8.979662234998203e-05, |
| "loss": 0.1979, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.10276642915404544, |
| "grad_norm": 0.2840961515903473, |
| "learning_rate": 8.972475745598275e-05, |
| "loss": 0.1976, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.10348507551176604, |
| "grad_norm": 0.39742761850357056, |
| "learning_rate": 8.965289256198347e-05, |
| "loss": 0.1904, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.10420372186948663, |
| "grad_norm": 0.2676410377025604, |
| "learning_rate": 8.958102766798419e-05, |
| "loss": 0.1974, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.10492236822720723, |
| "grad_norm": 0.32223057746887207, |
| "learning_rate": 8.950916277398491e-05, |
| "loss": 0.1942, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.10564101458492783, |
| "grad_norm": 0.35473334789276123, |
| "learning_rate": 8.943729787998563e-05, |
| "loss": 0.1976, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.10635966094264843, |
| "grad_norm": 0.2410697489976883, |
| "learning_rate": 8.936543298598635e-05, |
| "loss": 0.1979, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.10707830730036902, |
| "grad_norm": 0.2326819747686386, |
| "learning_rate": 8.929356809198707e-05, |
| "loss": 0.193, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.10779695365808963, |
| "grad_norm": 0.39694592356681824, |
| "learning_rate": 8.922170319798779e-05, |
| "loss": 0.1952, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.10851560001581022, |
| "grad_norm": 0.2801777720451355, |
| "learning_rate": 8.91498383039885e-05, |
| "loss": 0.1913, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.10923424637353081, |
| "grad_norm": 0.27637767791748047, |
| "learning_rate": 8.907797340998922e-05, |
| "loss": 0.1894, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.10995289273125142, |
| "grad_norm": 0.47819074988365173, |
| "learning_rate": 8.900610851598994e-05, |
| "loss": 0.19, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.11067153908897201, |
| "grad_norm": 0.37279731035232544, |
| "learning_rate": 8.893424362199066e-05, |
| "loss": 0.193, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.11139018544669262, |
| "grad_norm": 0.3399754464626312, |
| "learning_rate": 8.886237872799138e-05, |
| "loss": 0.1901, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.11210883180441321, |
| "grad_norm": 0.2655271887779236, |
| "learning_rate": 8.87905138339921e-05, |
| "loss": 0.1902, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.1128274781621338, |
| "grad_norm": 0.2509619891643524, |
| "learning_rate": 8.871864893999282e-05, |
| "loss": 0.1929, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.11354612451985441, |
| "grad_norm": 0.37877124547958374, |
| "learning_rate": 8.864678404599354e-05, |
| "loss": 0.1935, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.114264770877575, |
| "grad_norm": 0.3020787835121155, |
| "learning_rate": 8.857491915199424e-05, |
| "loss": 0.1943, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.11498341723529559, |
| "grad_norm": 0.37178000807762146, |
| "learning_rate": 8.850305425799498e-05, |
| "loss": 0.1874, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.1157020635930162, |
| "grad_norm": 0.29959267377853394, |
| "learning_rate": 8.84311893639957e-05, |
| "loss": 0.1915, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.11642070995073679, |
| "grad_norm": 0.3364570736885071, |
| "learning_rate": 8.83607617678764e-05, |
| "loss": 0.1929, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.11713935630845738, |
| "grad_norm": 0.4932682514190674, |
| "learning_rate": 8.828889687387712e-05, |
| "loss": 0.1903, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.11785800266617799, |
| "grad_norm": 0.32474425435066223, |
| "learning_rate": 8.821703197987782e-05, |
| "loss": 0.1884, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.11857664902389858, |
| "grad_norm": 0.2824951410293579, |
| "learning_rate": 8.814516708587856e-05, |
| "loss": 0.1847, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.11929529538161918, |
| "grad_norm": 0.4078387916088104, |
| "learning_rate": 8.807330219187928e-05, |
| "loss": 0.1909, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.12001394173933978, |
| "grad_norm": 0.42612573504447937, |
| "learning_rate": 8.800143729787998e-05, |
| "loss": 0.191, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.12073258809706038, |
| "grad_norm": 0.3414117991924286, |
| "learning_rate": 8.792957240388071e-05, |
| "loss": 0.1929, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.12145123445478097, |
| "grad_norm": 0.2816385328769684, |
| "learning_rate": 8.785770750988142e-05, |
| "loss": 0.1958, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.12216988081250157, |
| "grad_norm": 0.29827359318733215, |
| "learning_rate": 8.778584261588214e-05, |
| "loss": 0.191, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.12288852717022217, |
| "grad_norm": 0.27113088965415955, |
| "learning_rate": 8.771397772188287e-05, |
| "loss": 0.1908, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.12360717352794277, |
| "grad_norm": 0.3588126003742218, |
| "learning_rate": 8.764211282788358e-05, |
| "loss": 0.1951, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.12432581988566337, |
| "grad_norm": 0.2723435163497925, |
| "learning_rate": 8.75702479338843e-05, |
| "loss": 0.1916, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.12504446624338397, |
| "grad_norm": 0.37845754623413086, |
| "learning_rate": 8.749838303988503e-05, |
| "loss": 0.1967, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.12576311260110457, |
| "grad_norm": 0.36663511395454407, |
| "learning_rate": 8.742651814588573e-05, |
| "loss": 0.183, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.12648175895882516, |
| "grad_norm": 0.34882092475891113, |
| "learning_rate": 8.735465325188645e-05, |
| "loss": 0.1895, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.12720040531654575, |
| "grad_norm": 0.3167667090892792, |
| "learning_rate": 8.728278835788719e-05, |
| "loss": 0.1933, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.12791905167426634, |
| "grad_norm": 0.4581696093082428, |
| "learning_rate": 8.721092346388789e-05, |
| "loss": 0.1909, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.12863769803198694, |
| "grad_norm": 0.3376115560531616, |
| "learning_rate": 8.713905856988861e-05, |
| "loss": 0.189, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.12935634438970756, |
| "grad_norm": 0.4183780550956726, |
| "learning_rate": 8.706719367588933e-05, |
| "loss": 0.1933, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.12935634438970756, |
| "eval_loss": 0.18834719061851501, |
| "eval_runtime": 2348.9887, |
| "eval_samples_per_second": 24.943, |
| "eval_steps_per_second": 3.118, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.13007499074742815, |
| "grad_norm": 0.36423614621162415, |
| "learning_rate": 8.699532878189005e-05, |
| "loss": 0.1926, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.13079363710514874, |
| "grad_norm": 0.47257256507873535, |
| "learning_rate": 8.692346388789077e-05, |
| "loss": 0.1982, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.13151228346286933, |
| "grad_norm": 0.2503010034561157, |
| "learning_rate": 8.685159899389149e-05, |
| "loss": 0.1874, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.13223092982058993, |
| "grad_norm": 0.2888598144054413, |
| "learning_rate": 8.67797340998922e-05, |
| "loss": 0.1927, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.13294957617831055, |
| "grad_norm": 0.6369989514350891, |
| "learning_rate": 8.670786920589293e-05, |
| "loss": 0.1843, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.13366822253603114, |
| "grad_norm": 0.33099669218063354, |
| "learning_rate": 8.663600431189364e-05, |
| "loss": 0.1872, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.13438686889375173, |
| "grad_norm": 0.3384752869606018, |
| "learning_rate": 8.656413941789436e-05, |
| "loss": 0.187, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.13510551525147232, |
| "grad_norm": 0.3799266219139099, |
| "learning_rate": 8.649227452389508e-05, |
| "loss": 0.1866, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.13582416160919292, |
| "grad_norm": 0.26931485533714294, |
| "learning_rate": 8.64204096298958e-05, |
| "loss": 0.1815, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.1365428079669135, |
| "grad_norm": 0.4372073709964752, |
| "learning_rate": 8.634854473589652e-05, |
| "loss": 0.1907, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.13726145432463413, |
| "grad_norm": 0.34800344705581665, |
| "learning_rate": 8.627667984189724e-05, |
| "loss": 0.1832, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.13798010068235472, |
| "grad_norm": 0.3955633044242859, |
| "learning_rate": 8.620481494789796e-05, |
| "loss": 0.1884, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.13869874704007532, |
| "grad_norm": 0.37053999304771423, |
| "learning_rate": 8.613295005389868e-05, |
| "loss": 0.1845, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.1394173933977959, |
| "grad_norm": 0.25118228793144226, |
| "learning_rate": 8.606108515989938e-05, |
| "loss": 0.1785, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.1401360397555165, |
| "grad_norm": 0.3882904052734375, |
| "learning_rate": 8.598922026590012e-05, |
| "loss": 0.1851, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.1408546861132371, |
| "grad_norm": 0.2123679518699646, |
| "learning_rate": 8.591735537190084e-05, |
| "loss": 0.1886, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.1415733324709577, |
| "grad_norm": 0.28561949729919434, |
| "learning_rate": 8.584549047790154e-05, |
| "loss": 0.1766, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.1422919788286783, |
| "grad_norm": 0.2941133379936218, |
| "learning_rate": 8.577362558390227e-05, |
| "loss": 0.1889, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.1430106251863989, |
| "grad_norm": 0.3725387454032898, |
| "learning_rate": 8.570176068990299e-05, |
| "loss": 0.1887, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.1437292715441195, |
| "grad_norm": 0.34473615884780884, |
| "learning_rate": 8.56298957959037e-05, |
| "loss": 0.1876, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.14444791790184008, |
| "grad_norm": 0.3055415749549866, |
| "learning_rate": 8.555803090190443e-05, |
| "loss": 0.1844, |
| "step": 10050 |
| }, |
| { |
| "epoch": 0.1451665642595607, |
| "grad_norm": 0.308893084526062, |
| "learning_rate": 8.548616600790515e-05, |
| "loss": 0.1819, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.1458852106172813, |
| "grad_norm": 0.36378395557403564, |
| "learning_rate": 8.541430111390586e-05, |
| "loss": 0.1824, |
| "step": 10150 |
| }, |
| { |
| "epoch": 0.1466038569750019, |
| "grad_norm": 0.43480271100997925, |
| "learning_rate": 8.534243621990659e-05, |
| "loss": 0.1843, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.14732250333272248, |
| "grad_norm": 0.3115452527999878, |
| "learning_rate": 8.527200862378728e-05, |
| "loss": 0.187, |
| "step": 10250 |
| }, |
| { |
| "epoch": 0.14804114969044307, |
| "grad_norm": 0.4598091244697571, |
| "learning_rate": 8.520014372978801e-05, |
| "loss": 0.188, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.14875979604816367, |
| "grad_norm": 0.3190801739692688, |
| "learning_rate": 8.512827883578872e-05, |
| "loss": 0.1888, |
| "step": 10350 |
| }, |
| { |
| "epoch": 0.1494784424058843, |
| "grad_norm": 0.33876556158065796, |
| "learning_rate": 8.505785123966943e-05, |
| "loss": 0.1945, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.15019708876360488, |
| "grad_norm": 0.49205857515335083, |
| "learning_rate": 8.498598634567014e-05, |
| "loss": 0.1862, |
| "step": 10450 |
| }, |
| { |
| "epoch": 0.15091573512132547, |
| "grad_norm": 0.2966972887516022, |
| "learning_rate": 8.491412145167086e-05, |
| "loss": 0.1822, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.15163438147904607, |
| "grad_norm": 0.33791208267211914, |
| "learning_rate": 8.484225655767159e-05, |
| "loss": 0.1854, |
| "step": 10550 |
| }, |
| { |
| "epoch": 0.15235302783676666, |
| "grad_norm": 0.34383195638656616, |
| "learning_rate": 8.47703916636723e-05, |
| "loss": 0.1809, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.15307167419448725, |
| "grad_norm": 0.3117673397064209, |
| "learning_rate": 8.469852676967301e-05, |
| "loss": 0.1782, |
| "step": 10650 |
| }, |
| { |
| "epoch": 0.15379032055220787, |
| "grad_norm": 0.38107767701148987, |
| "learning_rate": 8.462666187567373e-05, |
| "loss": 0.1776, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.15450896690992846, |
| "grad_norm": 0.30095821619033813, |
| "learning_rate": 8.455479698167445e-05, |
| "loss": 0.1802, |
| "step": 10750 |
| }, |
| { |
| "epoch": 0.15522761326764906, |
| "grad_norm": 0.3196362257003784, |
| "learning_rate": 8.448293208767517e-05, |
| "loss": 0.1823, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.15594625962536965, |
| "grad_norm": 0.4483869671821594, |
| "learning_rate": 8.441106719367589e-05, |
| "loss": 0.1847, |
| "step": 10850 |
| }, |
| { |
| "epoch": 0.15666490598309024, |
| "grad_norm": 0.32411614060401917, |
| "learning_rate": 8.433920229967661e-05, |
| "loss": 0.1851, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.15738355234081086, |
| "grad_norm": 0.48899659514427185, |
| "learning_rate": 8.426733740567733e-05, |
| "loss": 0.1864, |
| "step": 10950 |
| }, |
| { |
| "epoch": 0.15810219869853145, |
| "grad_norm": 0.38691896200180054, |
| "learning_rate": 8.419547251167805e-05, |
| "loss": 0.1869, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.15882084505625205, |
| "grad_norm": 0.34995898604393005, |
| "learning_rate": 8.412360761767877e-05, |
| "loss": 0.1796, |
| "step": 11050 |
| }, |
| { |
| "epoch": 0.15953949141397264, |
| "grad_norm": 0.32180970907211304, |
| "learning_rate": 8.405174272367949e-05, |
| "loss": 0.1857, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.16025813777169323, |
| "grad_norm": 0.286548912525177, |
| "learning_rate": 8.39798778296802e-05, |
| "loss": 0.1817, |
| "step": 11150 |
| }, |
| { |
| "epoch": 0.16097678412941382, |
| "grad_norm": 0.26015704870224, |
| "learning_rate": 8.390801293568093e-05, |
| "loss": 0.1815, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.16169543048713444, |
| "grad_norm": 0.33400681614875793, |
| "learning_rate": 8.383614804168164e-05, |
| "loss": 0.1842, |
| "step": 11250 |
| }, |
| { |
| "epoch": 0.16241407684485504, |
| "grad_norm": 0.3002636134624481, |
| "learning_rate": 8.376428314768236e-05, |
| "loss": 0.1868, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.16313272320257563, |
| "grad_norm": 0.43470489978790283, |
| "learning_rate": 8.369241825368308e-05, |
| "loss": 0.1797, |
| "step": 11350 |
| }, |
| { |
| "epoch": 0.16385136956029622, |
| "grad_norm": 0.26626238226890564, |
| "learning_rate": 8.36205533596838e-05, |
| "loss": 0.1812, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.16457001591801682, |
| "grad_norm": 0.4098244905471802, |
| "learning_rate": 8.354868846568452e-05, |
| "loss": 0.1817, |
| "step": 11450 |
| }, |
| { |
| "epoch": 0.1652886622757374, |
| "grad_norm": 0.39458343386650085, |
| "learning_rate": 8.347682357168524e-05, |
| "loss": 0.1825, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.16600730863345803, |
| "grad_norm": 0.3881920874118805, |
| "learning_rate": 8.340495867768595e-05, |
| "loss": 0.1808, |
| "step": 11550 |
| }, |
| { |
| "epoch": 0.16672595499117862, |
| "grad_norm": 0.4013212323188782, |
| "learning_rate": 8.333309378368668e-05, |
| "loss": 0.1812, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.1674446013488992, |
| "grad_norm": 0.5738546848297119, |
| "learning_rate": 8.32612288896874e-05, |
| "loss": 0.1787, |
| "step": 11650 |
| }, |
| { |
| "epoch": 0.1681632477066198, |
| "grad_norm": 0.3394758701324463, |
| "learning_rate": 8.31893639956881e-05, |
| "loss": 0.1824, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.1688818940643404, |
| "grad_norm": 0.3730837106704712, |
| "learning_rate": 8.311749910168884e-05, |
| "loss": 0.1915, |
| "step": 11750 |
| }, |
| { |
| "epoch": 0.16960054042206102, |
| "grad_norm": 0.47480309009552, |
| "learning_rate": 8.304563420768955e-05, |
| "loss": 0.1838, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.1703191867797816, |
| "grad_norm": 0.37515339255332947, |
| "learning_rate": 8.297376931369026e-05, |
| "loss": 0.1813, |
| "step": 11850 |
| }, |
| { |
| "epoch": 0.1710378331375022, |
| "grad_norm": 0.39568060636520386, |
| "learning_rate": 8.290190441969099e-05, |
| "loss": 0.1792, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.1717564794952228, |
| "grad_norm": 0.2854001224040985, |
| "learning_rate": 8.283003952569171e-05, |
| "loss": 0.1809, |
| "step": 11950 |
| }, |
| { |
| "epoch": 0.1724751258529434, |
| "grad_norm": 0.2876518964767456, |
| "learning_rate": 8.275817463169242e-05, |
| "loss": 0.1817, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.1724751258529434, |
| "eval_loss": 0.1806262880563736, |
| "eval_runtime": 2341.5668, |
| "eval_samples_per_second": 25.022, |
| "eval_steps_per_second": 3.128, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.17319377221066398, |
| "grad_norm": 0.32629990577697754, |
| "learning_rate": 8.268630973769315e-05, |
| "loss": 0.1814, |
| "step": 12050 |
| }, |
| { |
| "epoch": 0.1739124185683846, |
| "grad_norm": 0.3833070695400238, |
| "learning_rate": 8.261444484369386e-05, |
| "loss": 0.1794, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.1746310649261052, |
| "grad_norm": 0.3579089343547821, |
| "learning_rate": 8.254257994969457e-05, |
| "loss": 0.177, |
| "step": 12150 |
| }, |
| { |
| "epoch": 0.1753497112838258, |
| "grad_norm": 0.3816784918308258, |
| "learning_rate": 8.247071505569531e-05, |
| "loss": 0.1809, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.17606835764154638, |
| "grad_norm": 0.23603789508342743, |
| "learning_rate": 8.239885016169601e-05, |
| "loss": 0.1809, |
| "step": 12250 |
| }, |
| { |
| "epoch": 0.17678700399926697, |
| "grad_norm": 0.39853760600090027, |
| "learning_rate": 8.232698526769673e-05, |
| "loss": 0.1836, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.17750565035698757, |
| "grad_norm": 0.4424062669277191, |
| "learning_rate": 8.225512037369745e-05, |
| "loss": 0.1781, |
| "step": 12350 |
| }, |
| { |
| "epoch": 0.17822429671470819, |
| "grad_norm": 0.36165034770965576, |
| "learning_rate": 8.218325547969817e-05, |
| "loss": 0.1828, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.17894294307242878, |
| "grad_norm": 0.3120635449886322, |
| "learning_rate": 8.211139058569889e-05, |
| "loss": 0.178, |
| "step": 12450 |
| }, |
| { |
| "epoch": 0.17966158943014937, |
| "grad_norm": 0.37852615118026733, |
| "learning_rate": 8.203952569169961e-05, |
| "loss": 0.185, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.18038023578786996, |
| "grad_norm": 0.38624006509780884, |
| "learning_rate": 8.196766079770033e-05, |
| "loss": 0.1806, |
| "step": 12550 |
| }, |
| { |
| "epoch": 0.18109888214559056, |
| "grad_norm": 0.26871535181999207, |
| "learning_rate": 8.189579590370105e-05, |
| "loss": 0.181, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.18181752850331118, |
| "grad_norm": 0.3296714425086975, |
| "learning_rate": 8.182393100970177e-05, |
| "loss": 0.1803, |
| "step": 12650 |
| }, |
| { |
| "epoch": 0.18253617486103177, |
| "grad_norm": 0.2623511254787445, |
| "learning_rate": 8.175206611570248e-05, |
| "loss": 0.178, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.18325482121875236, |
| "grad_norm": 0.35815101861953735, |
| "learning_rate": 8.16802012217032e-05, |
| "loss": 0.1766, |
| "step": 12750 |
| }, |
| { |
| "epoch": 0.18397346757647295, |
| "grad_norm": 0.3817836046218872, |
| "learning_rate": 8.160833632770391e-05, |
| "loss": 0.1845, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.18469211393419355, |
| "grad_norm": 0.2875792682170868, |
| "learning_rate": 8.153647143370464e-05, |
| "loss": 0.1781, |
| "step": 12850 |
| }, |
| { |
| "epoch": 0.18541076029191414, |
| "grad_norm": 0.3971846103668213, |
| "learning_rate": 8.146460653970536e-05, |
| "loss": 0.1766, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.18612940664963476, |
| "grad_norm": 0.39499327540397644, |
| "learning_rate": 8.139274164570607e-05, |
| "loss": 0.1778, |
| "step": 12950 |
| }, |
| { |
| "epoch": 0.18684805300735535, |
| "grad_norm": 0.45246168971061707, |
| "learning_rate": 8.13208767517068e-05, |
| "loss": 0.1798, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.18756669936507595, |
| "grad_norm": 0.2600598931312561, |
| "learning_rate": 8.124901185770752e-05, |
| "loss": 0.1777, |
| "step": 13050 |
| }, |
| { |
| "epoch": 0.18828534572279654, |
| "grad_norm": 0.29498159885406494, |
| "learning_rate": 8.117714696370822e-05, |
| "loss": 0.1834, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.18900399208051713, |
| "grad_norm": 0.3159259855747223, |
| "learning_rate": 8.110528206970896e-05, |
| "loss": 0.1769, |
| "step": 13150 |
| }, |
| { |
| "epoch": 0.18972263843823772, |
| "grad_norm": 0.3468966484069824, |
| "learning_rate": 8.103341717570968e-05, |
| "loss": 0.1812, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.19044128479595834, |
| "grad_norm": 0.2982788681983948, |
| "learning_rate": 8.096155228171038e-05, |
| "loss": 0.1793, |
| "step": 13250 |
| }, |
| { |
| "epoch": 0.19115993115367894, |
| "grad_norm": 0.40844494104385376, |
| "learning_rate": 8.088968738771111e-05, |
| "loss": 0.1765, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.19187857751139953, |
| "grad_norm": 0.35525286197662354, |
| "learning_rate": 8.081782249371182e-05, |
| "loss": 0.1739, |
| "step": 13350 |
| }, |
| { |
| "epoch": 0.19259722386912012, |
| "grad_norm": 0.42295753955841064, |
| "learning_rate": 8.074595759971254e-05, |
| "loss": 0.178, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.1933158702268407, |
| "grad_norm": 0.28371748328208923, |
| "learning_rate": 8.067409270571327e-05, |
| "loss": 0.1768, |
| "step": 13450 |
| }, |
| { |
| "epoch": 0.19403451658456133, |
| "grad_norm": 0.36987873911857605, |
| "learning_rate": 8.060222781171398e-05, |
| "loss": 0.1795, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.19475316294228193, |
| "grad_norm": 0.3212873339653015, |
| "learning_rate": 8.05303629177147e-05, |
| "loss": 0.1751, |
| "step": 13550 |
| }, |
| { |
| "epoch": 0.19547180930000252, |
| "grad_norm": 0.3947288393974304, |
| "learning_rate": 8.045849802371543e-05, |
| "loss": 0.1727, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.1961904556577231, |
| "grad_norm": 0.28673598170280457, |
| "learning_rate": 8.038663312971613e-05, |
| "loss": 0.1863, |
| "step": 13650 |
| }, |
| { |
| "epoch": 0.1969091020154437, |
| "grad_norm": 0.28890731930732727, |
| "learning_rate": 8.031476823571685e-05, |
| "loss": 0.1777, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.1976277483731643, |
| "grad_norm": 0.36219891905784607, |
| "learning_rate": 8.024290334171759e-05, |
| "loss": 0.1745, |
| "step": 13750 |
| }, |
| { |
| "epoch": 0.19834639473088492, |
| "grad_norm": 0.37695997953414917, |
| "learning_rate": 8.017103844771829e-05, |
| "loss": 0.1807, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.1990650410886055, |
| "grad_norm": 0.3192157447338104, |
| "learning_rate": 8.009917355371901e-05, |
| "loss": 0.173, |
| "step": 13850 |
| }, |
| { |
| "epoch": 0.1997836874463261, |
| "grad_norm": 0.4570382833480835, |
| "learning_rate": 8.002730865971973e-05, |
| "loss": 0.1723, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.2005023338040467, |
| "grad_norm": 0.3775467276573181, |
| "learning_rate": 7.995544376572045e-05, |
| "loss": 0.1814, |
| "step": 13950 |
| }, |
| { |
| "epoch": 0.2012209801617673, |
| "grad_norm": 0.32216453552246094, |
| "learning_rate": 7.988357887172117e-05, |
| "loss": 0.1788, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.20193962651948788, |
| "grad_norm": 0.30940186977386475, |
| "learning_rate": 7.981171397772189e-05, |
| "loss": 0.1767, |
| "step": 14050 |
| }, |
| { |
| "epoch": 0.2026582728772085, |
| "grad_norm": 0.5685888528823853, |
| "learning_rate": 7.97398490837226e-05, |
| "loss": 0.1824, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.2033769192349291, |
| "grad_norm": 0.3270226716995239, |
| "learning_rate": 7.966798418972333e-05, |
| "loss": 0.1816, |
| "step": 14150 |
| }, |
| { |
| "epoch": 0.20409556559264969, |
| "grad_norm": 0.2310134619474411, |
| "learning_rate": 7.959611929572404e-05, |
| "loss": 0.1812, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.20481421195037028, |
| "grad_norm": 0.33357006311416626, |
| "learning_rate": 7.952425440172476e-05, |
| "loss": 0.1783, |
| "step": 14250 |
| }, |
| { |
| "epoch": 0.20553285830809087, |
| "grad_norm": 0.26054659485816956, |
| "learning_rate": 7.945238950772548e-05, |
| "loss": 0.1752, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.20625150466581146, |
| "grad_norm": 0.2842780649662018, |
| "learning_rate": 7.93805246137262e-05, |
| "loss": 0.1738, |
| "step": 14350 |
| }, |
| { |
| "epoch": 0.20697015102353208, |
| "grad_norm": 0.4128149747848511, |
| "learning_rate": 7.930865971972692e-05, |
| "loss": 0.1705, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.20768879738125268, |
| "grad_norm": 0.36563193798065186, |
| "learning_rate": 7.923823212360762e-05, |
| "loss": 0.1683, |
| "step": 14450 |
| }, |
| { |
| "epoch": 0.20840744373897327, |
| "grad_norm": 0.32785964012145996, |
| "learning_rate": 7.916636722960834e-05, |
| "loss": 0.1705, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.20912609009669386, |
| "grad_norm": 0.3670963943004608, |
| "learning_rate": 7.909450233560906e-05, |
| "loss": 0.1786, |
| "step": 14550 |
| }, |
| { |
| "epoch": 0.20984473645441445, |
| "grad_norm": 0.33668437600135803, |
| "learning_rate": 7.902263744160978e-05, |
| "loss": 0.1715, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.21056338281213507, |
| "grad_norm": 0.29565784335136414, |
| "learning_rate": 7.89507725476105e-05, |
| "loss": 0.1726, |
| "step": 14650 |
| }, |
| { |
| "epoch": 0.21128202916985567, |
| "grad_norm": 0.42704764008522034, |
| "learning_rate": 7.88789076536112e-05, |
| "loss": 0.1814, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.21200067552757626, |
| "grad_norm": 0.49560704827308655, |
| "learning_rate": 7.880704275961194e-05, |
| "loss": 0.1763, |
| "step": 14750 |
| }, |
| { |
| "epoch": 0.21271932188529685, |
| "grad_norm": 0.39118367433547974, |
| "learning_rate": 7.873517786561266e-05, |
| "loss": 0.1784, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.21343796824301745, |
| "grad_norm": 0.30129724740982056, |
| "learning_rate": 7.866331297161336e-05, |
| "loss": 0.1768, |
| "step": 14850 |
| }, |
| { |
| "epoch": 0.21415661460073804, |
| "grad_norm": 0.3319786787033081, |
| "learning_rate": 7.85914480776141e-05, |
| "loss": 0.1786, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.21487526095845866, |
| "grad_norm": 0.3003113865852356, |
| "learning_rate": 7.851958318361481e-05, |
| "loss": 0.1791, |
| "step": 14950 |
| }, |
| { |
| "epoch": 0.21559390731617925, |
| "grad_norm": 0.34315207600593567, |
| "learning_rate": 7.844771828961552e-05, |
| "loss": 0.173, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.21559390731617925, |
| "eval_loss": 0.17433789372444153, |
| "eval_runtime": 2337.8325, |
| "eval_samples_per_second": 25.062, |
| "eval_steps_per_second": 3.133, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.21631255367389984, |
| "grad_norm": 0.4181901812553406, |
| "learning_rate": 7.837585339561625e-05, |
| "loss": 0.1783, |
| "step": 15050 |
| }, |
| { |
| "epoch": 0.21703120003162044, |
| "grad_norm": 0.3226313889026642, |
| "learning_rate": 7.830398850161696e-05, |
| "loss": 0.1703, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.21774984638934103, |
| "grad_norm": 0.2730088233947754, |
| "learning_rate": 7.823212360761768e-05, |
| "loss": 0.1757, |
| "step": 15150 |
| }, |
| { |
| "epoch": 0.21846849274706162, |
| "grad_norm": 0.39994746446609497, |
| "learning_rate": 7.816025871361841e-05, |
| "loss": 0.1755, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.21918713910478224, |
| "grad_norm": 0.3416014015674591, |
| "learning_rate": 7.808839381961912e-05, |
| "loss": 0.178, |
| "step": 15250 |
| }, |
| { |
| "epoch": 0.21990578546250283, |
| "grad_norm": 0.3324638605117798, |
| "learning_rate": 7.801652892561983e-05, |
| "loss": 0.174, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.22062443182022343, |
| "grad_norm": 0.4316116273403168, |
| "learning_rate": 7.794466403162057e-05, |
| "loss": 0.1701, |
| "step": 15350 |
| }, |
| { |
| "epoch": 0.22134307817794402, |
| "grad_norm": 0.3134278655052185, |
| "learning_rate": 7.787279913762127e-05, |
| "loss": 0.1761, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.2220617245356646, |
| "grad_norm": 0.31829679012298584, |
| "learning_rate": 7.780093424362199e-05, |
| "loss": 0.1744, |
| "step": 15450 |
| }, |
| { |
| "epoch": 0.22278037089338523, |
| "grad_norm": 0.7381129264831543, |
| "learning_rate": 7.772906934962272e-05, |
| "loss": 0.1722, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.22349901725110582, |
| "grad_norm": 0.28715264797210693, |
| "learning_rate": 7.765720445562343e-05, |
| "loss": 0.1757, |
| "step": 15550 |
| }, |
| { |
| "epoch": 0.22421766360882642, |
| "grad_norm": 0.307982474565506, |
| "learning_rate": 7.758533956162415e-05, |
| "loss": 0.169, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.224936309966547, |
| "grad_norm": 0.45478326082229614, |
| "learning_rate": 7.751347466762487e-05, |
| "loss": 0.1792, |
| "step": 15650 |
| }, |
| { |
| "epoch": 0.2256549563242676, |
| "grad_norm": 0.45371562242507935, |
| "learning_rate": 7.744160977362559e-05, |
| "loss": 0.18, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.2263736026819882, |
| "grad_norm": 0.3575720489025116, |
| "learning_rate": 7.73697448796263e-05, |
| "loss": 0.1764, |
| "step": 15750 |
| }, |
| { |
| "epoch": 0.22709224903970882, |
| "grad_norm": 0.41387391090393066, |
| "learning_rate": 7.729787998562703e-05, |
| "loss": 0.1697, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.2278108953974294, |
| "grad_norm": 0.4434982240200043, |
| "learning_rate": 7.722601509162774e-05, |
| "loss": 0.1742, |
| "step": 15850 |
| }, |
| { |
| "epoch": 0.22852954175515, |
| "grad_norm": 0.44597572088241577, |
| "learning_rate": 7.715415019762846e-05, |
| "loss": 0.1745, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.2292481881128706, |
| "grad_norm": 0.3628135323524475, |
| "learning_rate": 7.708228530362917e-05, |
| "loss": 0.1703, |
| "step": 15950 |
| }, |
| { |
| "epoch": 0.22996683447059119, |
| "grad_norm": 0.28047481179237366, |
| "learning_rate": 7.70104204096299e-05, |
| "loss": 0.1743, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.23068548082831178, |
| "grad_norm": 0.3026706278324127, |
| "learning_rate": 7.693855551563062e-05, |
| "loss": 0.1694, |
| "step": 16050 |
| }, |
| { |
| "epoch": 0.2314041271860324, |
| "grad_norm": 0.3518725037574768, |
| "learning_rate": 7.686669062163133e-05, |
| "loss": 0.1751, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.232122773543753, |
| "grad_norm": 0.41190633177757263, |
| "learning_rate": 7.679482572763206e-05, |
| "loss": 0.1744, |
| "step": 16150 |
| }, |
| { |
| "epoch": 0.23284141990147358, |
| "grad_norm": 0.39950138330459595, |
| "learning_rate": 7.672296083363278e-05, |
| "loss": 0.1714, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.23356006625919418, |
| "grad_norm": 0.3585628867149353, |
| "learning_rate": 7.665109593963348e-05, |
| "loss": 0.1693, |
| "step": 16250 |
| }, |
| { |
| "epoch": 0.23427871261691477, |
| "grad_norm": 0.4004826843738556, |
| "learning_rate": 7.657923104563422e-05, |
| "loss": 0.1755, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.2349973589746354, |
| "grad_norm": 0.360116183757782, |
| "learning_rate": 7.650736615163494e-05, |
| "loss": 0.1673, |
| "step": 16350 |
| }, |
| { |
| "epoch": 0.23571600533235598, |
| "grad_norm": 0.344833642244339, |
| "learning_rate": 7.643550125763564e-05, |
| "loss": 0.1746, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.23643465169007657, |
| "grad_norm": 0.32852986454963684, |
| "learning_rate": 7.636363636363637e-05, |
| "loss": 0.1748, |
| "step": 16450 |
| }, |
| { |
| "epoch": 0.23715329804779717, |
| "grad_norm": 0.3062026798725128, |
| "learning_rate": 7.629177146963708e-05, |
| "loss": 0.1755, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.23787194440551776, |
| "grad_norm": 0.4373960793018341, |
| "learning_rate": 7.62199065756378e-05, |
| "loss": 0.1759, |
| "step": 16550 |
| }, |
| { |
| "epoch": 0.23859059076323835, |
| "grad_norm": 0.36062178015708923, |
| "learning_rate": 7.614804168163853e-05, |
| "loss": 0.1693, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.23930923712095897, |
| "grad_norm": 0.514687716960907, |
| "learning_rate": 7.607905138339922e-05, |
| "loss": 0.1721, |
| "step": 16650 |
| }, |
| { |
| "epoch": 0.24002788347867957, |
| "grad_norm": 0.2862984538078308, |
| "learning_rate": 7.600718648939992e-05, |
| "loss": 0.1748, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.24074652983640016, |
| "grad_norm": 0.4001787304878235, |
| "learning_rate": 7.593532159540066e-05, |
| "loss": 0.1663, |
| "step": 16750 |
| }, |
| { |
| "epoch": 0.24146517619412075, |
| "grad_norm": 0.4565765857696533, |
| "learning_rate": 7.586345670140138e-05, |
| "loss": 0.1766, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.24218382255184134, |
| "grad_norm": 0.2525484561920166, |
| "learning_rate": 7.579159180740208e-05, |
| "loss": 0.172, |
| "step": 16850 |
| }, |
| { |
| "epoch": 0.24290246890956194, |
| "grad_norm": 0.5037934184074402, |
| "learning_rate": 7.571972691340281e-05, |
| "loss": 0.1735, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.24362111526728256, |
| "grad_norm": 0.33634111285209656, |
| "learning_rate": 7.564786201940352e-05, |
| "loss": 0.177, |
| "step": 16950 |
| }, |
| { |
| "epoch": 0.24433976162500315, |
| "grad_norm": 0.31144094467163086, |
| "learning_rate": 7.557599712540424e-05, |
| "loss": 0.1718, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.24505840798272374, |
| "grad_norm": 0.2749430239200592, |
| "learning_rate": 7.550413223140497e-05, |
| "loss": 0.1739, |
| "step": 17050 |
| }, |
| { |
| "epoch": 0.24577705434044433, |
| "grad_norm": 0.4118638336658478, |
| "learning_rate": 7.543226733740568e-05, |
| "loss": 0.173, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.24649570069816493, |
| "grad_norm": 0.2785220146179199, |
| "learning_rate": 7.53604024434064e-05, |
| "loss": 0.1749, |
| "step": 17150 |
| }, |
| { |
| "epoch": 0.24721434705588555, |
| "grad_norm": 0.3517521321773529, |
| "learning_rate": 7.528853754940713e-05, |
| "loss": 0.1734, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.24793299341360614, |
| "grad_norm": 0.42283421754837036, |
| "learning_rate": 7.521667265540783e-05, |
| "loss": 0.1717, |
| "step": 17250 |
| }, |
| { |
| "epoch": 0.24865163977132673, |
| "grad_norm": 0.3272489607334137, |
| "learning_rate": 7.514480776140855e-05, |
| "loss": 0.1745, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.24937028612904732, |
| "grad_norm": 0.3763234317302704, |
| "learning_rate": 7.507294286740929e-05, |
| "loss": 0.1716, |
| "step": 17350 |
| }, |
| { |
| "epoch": 0.25008893248676795, |
| "grad_norm": 0.3796086013317108, |
| "learning_rate": 7.500107797340999e-05, |
| "loss": 0.1639, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.2508075788444885, |
| "grad_norm": 0.3655567169189453, |
| "learning_rate": 7.492921307941071e-05, |
| "loss": 0.1716, |
| "step": 17450 |
| }, |
| { |
| "epoch": 0.25152622520220913, |
| "grad_norm": 0.323050320148468, |
| "learning_rate": 7.485734818541143e-05, |
| "loss": 0.1704, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.2522448715599297, |
| "grad_norm": 0.3040378987789154, |
| "learning_rate": 7.478548329141215e-05, |
| "loss": 0.1655, |
| "step": 17550 |
| }, |
| { |
| "epoch": 0.2529635179176503, |
| "grad_norm": 0.371359646320343, |
| "learning_rate": 7.471361839741287e-05, |
| "loss": 0.1729, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.25368216427537094, |
| "grad_norm": 0.3760315775871277, |
| "learning_rate": 7.464175350341359e-05, |
| "loss": 0.1671, |
| "step": 17650 |
| }, |
| { |
| "epoch": 0.2544008106330915, |
| "grad_norm": 0.325546532869339, |
| "learning_rate": 7.45698886094143e-05, |
| "loss": 0.168, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.2551194569908121, |
| "grad_norm": 0.3608609735965729, |
| "learning_rate": 7.449802371541503e-05, |
| "loss": 0.1694, |
| "step": 17750 |
| }, |
| { |
| "epoch": 0.2558381033485327, |
| "grad_norm": 0.49022817611694336, |
| "learning_rate": 7.442615882141573e-05, |
| "loss": 0.1692, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.2565567497062533, |
| "grad_norm": 0.3275602459907532, |
| "learning_rate": 7.435429392741646e-05, |
| "loss": 0.1709, |
| "step": 17850 |
| }, |
| { |
| "epoch": 0.25727539606397387, |
| "grad_norm": 0.3520912230014801, |
| "learning_rate": 7.428242903341718e-05, |
| "loss": 0.1657, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.2579940424216945, |
| "grad_norm": 0.4564454257488251, |
| "learning_rate": 7.421056413941789e-05, |
| "loss": 0.1724, |
| "step": 17950 |
| }, |
| { |
| "epoch": 0.2587126887794151, |
| "grad_norm": 0.4266446530818939, |
| "learning_rate": 7.413869924541862e-05, |
| "loss": 0.1747, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.2587126887794151, |
| "eval_loss": 0.16929900646209717, |
| "eval_runtime": 2331.677, |
| "eval_samples_per_second": 25.128, |
| "eval_steps_per_second": 3.141, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.2594313351371357, |
| "grad_norm": 0.32367444038391113, |
| "learning_rate": 7.406683435141934e-05, |
| "loss": 0.1693, |
| "step": 18050 |
| }, |
| { |
| "epoch": 0.2601499814948563, |
| "grad_norm": 0.3971792161464691, |
| "learning_rate": 7.399496945742005e-05, |
| "loss": 0.1691, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.26086862785257686, |
| "grad_norm": 0.37220925092697144, |
| "learning_rate": 7.392310456342078e-05, |
| "loss": 0.1671, |
| "step": 18150 |
| }, |
| { |
| "epoch": 0.2615872742102975, |
| "grad_norm": 0.3948183059692383, |
| "learning_rate": 7.385123966942148e-05, |
| "loss": 0.1696, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.2623059205680181, |
| "grad_norm": 0.3619794249534607, |
| "learning_rate": 7.37808120733022e-05, |
| "loss": 0.1686, |
| "step": 18250 |
| }, |
| { |
| "epoch": 0.26302456692573867, |
| "grad_norm": 0.3705451190471649, |
| "learning_rate": 7.37089471793029e-05, |
| "loss": 0.1731, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.2637432132834593, |
| "grad_norm": 0.609754204750061, |
| "learning_rate": 7.363708228530364e-05, |
| "loss": 0.1683, |
| "step": 18350 |
| }, |
| { |
| "epoch": 0.26446185964117985, |
| "grad_norm": 0.42656949162483215, |
| "learning_rate": 7.356521739130436e-05, |
| "loss": 0.1684, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.2651805059989005, |
| "grad_norm": 0.34473884105682373, |
| "learning_rate": 7.349335249730506e-05, |
| "loss": 0.1701, |
| "step": 18450 |
| }, |
| { |
| "epoch": 0.2658991523566211, |
| "grad_norm": 0.37473055720329285, |
| "learning_rate": 7.34214876033058e-05, |
| "loss": 0.1681, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.26661779871434166, |
| "grad_norm": 0.3349181115627289, |
| "learning_rate": 7.334962270930651e-05, |
| "loss": 0.1732, |
| "step": 18550 |
| }, |
| { |
| "epoch": 0.2673364450720623, |
| "grad_norm": 0.3274621069431305, |
| "learning_rate": 7.327775781530722e-05, |
| "loss": 0.168, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.26805509142978284, |
| "grad_norm": 0.2889033257961273, |
| "learning_rate": 7.320589292130794e-05, |
| "loss": 0.1651, |
| "step": 18650 |
| }, |
| { |
| "epoch": 0.26877373778750346, |
| "grad_norm": 0.4824863374233246, |
| "learning_rate": 7.313402802730866e-05, |
| "loss": 0.1716, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.26949238414522403, |
| "grad_norm": 0.4139736294746399, |
| "learning_rate": 7.306216313330938e-05, |
| "loss": 0.1713, |
| "step": 18750 |
| }, |
| { |
| "epoch": 0.27021103050294465, |
| "grad_norm": 0.3236710727214813, |
| "learning_rate": 7.29902982393101e-05, |
| "loss": 0.1668, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.27092967686066527, |
| "grad_norm": 0.3383229970932007, |
| "learning_rate": 7.291843334531082e-05, |
| "loss": 0.1679, |
| "step": 18850 |
| }, |
| { |
| "epoch": 0.27164832321838583, |
| "grad_norm": 0.30610159039497375, |
| "learning_rate": 7.284656845131154e-05, |
| "loss": 0.1683, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.27236696957610645, |
| "grad_norm": 0.3125470280647278, |
| "learning_rate": 7.277470355731225e-05, |
| "loss": 0.1716, |
| "step": 18950 |
| }, |
| { |
| "epoch": 0.273085615933827, |
| "grad_norm": 0.4086760878562927, |
| "learning_rate": 7.270283866331297e-05, |
| "loss": 0.1664, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.27380426229154764, |
| "grad_norm": 0.3585663139820099, |
| "learning_rate": 7.263097376931369e-05, |
| "loss": 0.1647, |
| "step": 19050 |
| }, |
| { |
| "epoch": 0.27452290864926826, |
| "grad_norm": 0.3493901491165161, |
| "learning_rate": 7.255910887531441e-05, |
| "loss": 0.1671, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.2752415550069888, |
| "grad_norm": 0.37844619154930115, |
| "learning_rate": 7.248724398131513e-05, |
| "loss": 0.1654, |
| "step": 19150 |
| }, |
| { |
| "epoch": 0.27596020136470945, |
| "grad_norm": 0.3647208511829376, |
| "learning_rate": 7.241537908731585e-05, |
| "loss": 0.1672, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.27667884772243, |
| "grad_norm": 0.3517756462097168, |
| "learning_rate": 7.234351419331657e-05, |
| "loss": 0.1696, |
| "step": 19250 |
| }, |
| { |
| "epoch": 0.27739749408015063, |
| "grad_norm": 0.3229799270629883, |
| "learning_rate": 7.227164929931729e-05, |
| "loss": 0.166, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.27811614043787125, |
| "grad_norm": 0.2893189489841461, |
| "learning_rate": 7.219978440531801e-05, |
| "loss": 0.1642, |
| "step": 19350 |
| }, |
| { |
| "epoch": 0.2788347867955918, |
| "grad_norm": 0.3653119206428528, |
| "learning_rate": 7.212791951131873e-05, |
| "loss": 0.1638, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.27955343315331244, |
| "grad_norm": 0.3465476632118225, |
| "learning_rate": 7.205605461731945e-05, |
| "loss": 0.1724, |
| "step": 19450 |
| }, |
| { |
| "epoch": 0.280272079511033, |
| "grad_norm": 0.3858092129230499, |
| "learning_rate": 7.198418972332016e-05, |
| "loss": 0.1654, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.2809907258687536, |
| "grad_norm": 0.45877212285995483, |
| "learning_rate": 7.191232482932087e-05, |
| "loss": 0.1655, |
| "step": 19550 |
| }, |
| { |
| "epoch": 0.2817093722264742, |
| "grad_norm": 0.31164032220840454, |
| "learning_rate": 7.18404599353216e-05, |
| "loss": 0.1697, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.2824280185841948, |
| "grad_norm": 0.35051417350769043, |
| "learning_rate": 7.176859504132232e-05, |
| "loss": 0.1608, |
| "step": 19650 |
| }, |
| { |
| "epoch": 0.2831466649419154, |
| "grad_norm": 0.4664309620857239, |
| "learning_rate": 7.169673014732303e-05, |
| "loss": 0.1629, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.283865311299636, |
| "grad_norm": 0.29969966411590576, |
| "learning_rate": 7.162486525332376e-05, |
| "loss": 0.1711, |
| "step": 19750 |
| }, |
| { |
| "epoch": 0.2845839576573566, |
| "grad_norm": 0.3115592300891876, |
| "learning_rate": 7.155300035932448e-05, |
| "loss": 0.1585, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.2853026040150772, |
| "grad_norm": 0.35464227199554443, |
| "learning_rate": 7.148113546532518e-05, |
| "loss": 0.1615, |
| "step": 19850 |
| }, |
| { |
| "epoch": 0.2860212503727978, |
| "grad_norm": 0.30538317561149597, |
| "learning_rate": 7.140927057132592e-05, |
| "loss": 0.1658, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.2867398967305184, |
| "grad_norm": 0.4378170073032379, |
| "learning_rate": 7.133740567732664e-05, |
| "loss": 0.1666, |
| "step": 19950 |
| }, |
| { |
| "epoch": 0.287458543088239, |
| "grad_norm": 0.44222235679626465, |
| "learning_rate": 7.126554078332734e-05, |
| "loss": 0.1665, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.2881771894459596, |
| "grad_norm": 0.3468632400035858, |
| "learning_rate": 7.119367588932807e-05, |
| "loss": 0.1641, |
| "step": 20050 |
| }, |
| { |
| "epoch": 0.28889583580368017, |
| "grad_norm": 0.2810463607311249, |
| "learning_rate": 7.112181099532878e-05, |
| "loss": 0.1681, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.2896144821614008, |
| "grad_norm": 0.30284568667411804, |
| "learning_rate": 7.10499461013295e-05, |
| "loss": 0.1682, |
| "step": 20150 |
| }, |
| { |
| "epoch": 0.2903331285191214, |
| "grad_norm": 0.5109066367149353, |
| "learning_rate": 7.097808120733023e-05, |
| "loss": 0.1635, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.291051774876842, |
| "grad_norm": 0.32616057991981506, |
| "learning_rate": 7.090621631333094e-05, |
| "loss": 0.1596, |
| "step": 20250 |
| }, |
| { |
| "epoch": 0.2917704212345626, |
| "grad_norm": 0.31997406482696533, |
| "learning_rate": 7.083435141933166e-05, |
| "loss": 0.1652, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.29248906759228316, |
| "grad_norm": 0.3968636989593506, |
| "learning_rate": 7.076248652533239e-05, |
| "loss": 0.1657, |
| "step": 20350 |
| }, |
| { |
| "epoch": 0.2932077139500038, |
| "grad_norm": 0.3720139265060425, |
| "learning_rate": 7.06906216313331e-05, |
| "loss": 0.1649, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.29392636030772434, |
| "grad_norm": 0.3689219653606415, |
| "learning_rate": 7.061875673733381e-05, |
| "loss": 0.1654, |
| "step": 20450 |
| }, |
| { |
| "epoch": 0.29464500666544496, |
| "grad_norm": 0.4002225995063782, |
| "learning_rate": 7.054689184333453e-05, |
| "loss": 0.1603, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.2953636530231656, |
| "grad_norm": 0.30403926968574524, |
| "learning_rate": 7.047502694933525e-05, |
| "loss": 0.1637, |
| "step": 20550 |
| }, |
| { |
| "epoch": 0.29608229938088615, |
| "grad_norm": 0.42684683203697205, |
| "learning_rate": 7.040316205533597e-05, |
| "loss": 0.1665, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.29680094573860677, |
| "grad_norm": 0.3254065215587616, |
| "learning_rate": 7.033129716133669e-05, |
| "loss": 0.1653, |
| "step": 20650 |
| }, |
| { |
| "epoch": 0.29751959209632733, |
| "grad_norm": 0.2438507229089737, |
| "learning_rate": 7.025943226733741e-05, |
| "loss": 0.1694, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.29823823845404795, |
| "grad_norm": 0.49997007846832275, |
| "learning_rate": 7.018756737333813e-05, |
| "loss": 0.1679, |
| "step": 20750 |
| }, |
| { |
| "epoch": 0.2989568848117686, |
| "grad_norm": 0.417477548122406, |
| "learning_rate": 7.011570247933885e-05, |
| "loss": 0.1648, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.29967553116948914, |
| "grad_norm": 0.33265259861946106, |
| "learning_rate": 7.004383758533957e-05, |
| "loss": 0.1648, |
| "step": 20850 |
| }, |
| { |
| "epoch": 0.30039417752720976, |
| "grad_norm": 0.37442734837532043, |
| "learning_rate": 6.997197269134029e-05, |
| "loss": 0.1648, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.3011128238849303, |
| "grad_norm": 0.4252796173095703, |
| "learning_rate": 6.990010779734099e-05, |
| "loss": 0.1684, |
| "step": 20950 |
| }, |
| { |
| "epoch": 0.30183147024265095, |
| "grad_norm": 0.5813310742378235, |
| "learning_rate": 6.982824290334172e-05, |
| "loss": 0.1629, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.30183147024265095, |
| "eval_loss": 0.16490380465984344, |
| "eval_runtime": 2333.5649, |
| "eval_samples_per_second": 25.108, |
| "eval_steps_per_second": 3.139, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.30255011660037157, |
| "grad_norm": 0.3204265534877777, |
| "learning_rate": 6.975637800934244e-05, |
| "loss": 0.1652, |
| "step": 21050 |
| }, |
| { |
| "epoch": 0.30326876295809213, |
| "grad_norm": 0.4411029815673828, |
| "learning_rate": 6.968451311534315e-05, |
| "loss": 0.1665, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.30398740931581275, |
| "grad_norm": 0.2782106399536133, |
| "learning_rate": 6.961264822134388e-05, |
| "loss": 0.1642, |
| "step": 21150 |
| }, |
| { |
| "epoch": 0.3047060556735333, |
| "grad_norm": 0.3360287845134735, |
| "learning_rate": 6.95407833273446e-05, |
| "loss": 0.1662, |
| "step": 21200 |
| }, |
| { |
| "epoch": 0.30542470203125394, |
| "grad_norm": 0.3113015294075012, |
| "learning_rate": 6.94689184333453e-05, |
| "loss": 0.1665, |
| "step": 21250 |
| }, |
| { |
| "epoch": 0.3061433483889745, |
| "grad_norm": 0.3733580708503723, |
| "learning_rate": 6.939705353934604e-05, |
| "loss": 0.1613, |
| "step": 21300 |
| }, |
| { |
| "epoch": 0.3068619947466951, |
| "grad_norm": 0.3798975348472595, |
| "learning_rate": 6.932518864534674e-05, |
| "loss": 0.1599, |
| "step": 21350 |
| }, |
| { |
| "epoch": 0.30758064110441574, |
| "grad_norm": 0.4411696493625641, |
| "learning_rate": 6.925332375134746e-05, |
| "loss": 0.1613, |
| "step": 21400 |
| }, |
| { |
| "epoch": 0.3082992874621363, |
| "grad_norm": 0.29650819301605225, |
| "learning_rate": 6.91814588573482e-05, |
| "loss": 0.1619, |
| "step": 21450 |
| }, |
| { |
| "epoch": 0.3090179338198569, |
| "grad_norm": 0.32305335998535156, |
| "learning_rate": 6.91095939633489e-05, |
| "loss": 0.1578, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.3097365801775775, |
| "grad_norm": 0.3363495469093323, |
| "learning_rate": 6.903772906934962e-05, |
| "loss": 0.162, |
| "step": 21550 |
| }, |
| { |
| "epoch": 0.3104552265352981, |
| "grad_norm": 0.39038559794425964, |
| "learning_rate": 6.896586417535035e-05, |
| "loss": 0.1665, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.31117387289301873, |
| "grad_norm": 0.3347565233707428, |
| "learning_rate": 6.889399928135106e-05, |
| "loss": 0.1646, |
| "step": 21650 |
| }, |
| { |
| "epoch": 0.3118925192507393, |
| "grad_norm": 0.41805052757263184, |
| "learning_rate": 6.882213438735178e-05, |
| "loss": 0.1581, |
| "step": 21700 |
| }, |
| { |
| "epoch": 0.3126111656084599, |
| "grad_norm": 0.31128305196762085, |
| "learning_rate": 6.875026949335251e-05, |
| "loss": 0.1608, |
| "step": 21750 |
| }, |
| { |
| "epoch": 0.3133298119661805, |
| "grad_norm": 0.3111174702644348, |
| "learning_rate": 6.867840459935322e-05, |
| "loss": 0.1646, |
| "step": 21800 |
| }, |
| { |
| "epoch": 0.3140484583239011, |
| "grad_norm": 0.3375270366668701, |
| "learning_rate": 6.860653970535394e-05, |
| "loss": 0.1635, |
| "step": 21850 |
| }, |
| { |
| "epoch": 0.3147671046816217, |
| "grad_norm": 0.2751725912094116, |
| "learning_rate": 6.853467481135465e-05, |
| "loss": 0.1612, |
| "step": 21900 |
| }, |
| { |
| "epoch": 0.3154857510393423, |
| "grad_norm": 0.31462764739990234, |
| "learning_rate": 6.846280991735537e-05, |
| "loss": 0.1686, |
| "step": 21950 |
| }, |
| { |
| "epoch": 0.3162043973970629, |
| "grad_norm": 0.3871542811393738, |
| "learning_rate": 6.839094502335609e-05, |
| "loss": 0.1576, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.3169230437547835, |
| "grad_norm": 0.36108696460723877, |
| "learning_rate": 6.831908012935681e-05, |
| "loss": 0.1604, |
| "step": 22050 |
| }, |
| { |
| "epoch": 0.3176416901125041, |
| "grad_norm": 0.48631274700164795, |
| "learning_rate": 6.824721523535753e-05, |
| "loss": 0.1607, |
| "step": 22100 |
| }, |
| { |
| "epoch": 0.31836033647022466, |
| "grad_norm": 0.4918723404407501, |
| "learning_rate": 6.817535034135825e-05, |
| "loss": 0.1581, |
| "step": 22150 |
| }, |
| { |
| "epoch": 0.3190789828279453, |
| "grad_norm": 0.40690186619758606, |
| "learning_rate": 6.810348544735897e-05, |
| "loss": 0.1651, |
| "step": 22200 |
| }, |
| { |
| "epoch": 0.3197976291856659, |
| "grad_norm": 0.38321653008461, |
| "learning_rate": 6.803162055335969e-05, |
| "loss": 0.1617, |
| "step": 22250 |
| }, |
| { |
| "epoch": 0.32051627554338646, |
| "grad_norm": 0.38967370986938477, |
| "learning_rate": 6.795975565936041e-05, |
| "loss": 0.1655, |
| "step": 22300 |
| }, |
| { |
| "epoch": 0.3212349219011071, |
| "grad_norm": 0.37661367654800415, |
| "learning_rate": 6.788932806324111e-05, |
| "loss": 0.1634, |
| "step": 22350 |
| }, |
| { |
| "epoch": 0.32195356825882765, |
| "grad_norm": 0.2743198871612549, |
| "learning_rate": 6.781746316924183e-05, |
| "loss": 0.1672, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.32267221461654827, |
| "grad_norm": 0.37295830249786377, |
| "learning_rate": 6.774559827524255e-05, |
| "loss": 0.1564, |
| "step": 22450 |
| }, |
| { |
| "epoch": 0.3233908609742689, |
| "grad_norm": 0.32078495621681213, |
| "learning_rate": 6.767373338124327e-05, |
| "loss": 0.1609, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.32410950733198945, |
| "grad_norm": 0.3385615944862366, |
| "learning_rate": 6.760186848724399e-05, |
| "loss": 0.1648, |
| "step": 22550 |
| }, |
| { |
| "epoch": 0.3248281536897101, |
| "grad_norm": 0.3343018889427185, |
| "learning_rate": 6.75300035932447e-05, |
| "loss": 0.1618, |
| "step": 22600 |
| }, |
| { |
| "epoch": 0.32554680004743064, |
| "grad_norm": 0.43768858909606934, |
| "learning_rate": 6.745813869924542e-05, |
| "loss": 0.1633, |
| "step": 22650 |
| }, |
| { |
| "epoch": 0.32626544640515126, |
| "grad_norm": 0.26847851276397705, |
| "learning_rate": 6.738627380524613e-05, |
| "loss": 0.164, |
| "step": 22700 |
| }, |
| { |
| "epoch": 0.3269840927628719, |
| "grad_norm": 0.4442514181137085, |
| "learning_rate": 6.731440891124686e-05, |
| "loss": 0.1582, |
| "step": 22750 |
| }, |
| { |
| "epoch": 0.32770273912059245, |
| "grad_norm": 0.40202733874320984, |
| "learning_rate": 6.724254401724758e-05, |
| "loss": 0.1628, |
| "step": 22800 |
| }, |
| { |
| "epoch": 0.32842138547831307, |
| "grad_norm": 0.3353193402290344, |
| "learning_rate": 6.717067912324829e-05, |
| "loss": 0.1669, |
| "step": 22850 |
| }, |
| { |
| "epoch": 0.32914003183603363, |
| "grad_norm": 0.3123689591884613, |
| "learning_rate": 6.709881422924902e-05, |
| "loss": 0.1625, |
| "step": 22900 |
| }, |
| { |
| "epoch": 0.32985867819375425, |
| "grad_norm": 0.38110092282295227, |
| "learning_rate": 6.702694933524974e-05, |
| "loss": 0.1627, |
| "step": 22950 |
| }, |
| { |
| "epoch": 0.3305773245514748, |
| "grad_norm": 0.46331876516342163, |
| "learning_rate": 6.695508444125044e-05, |
| "loss": 0.162, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.33129597090919544, |
| "grad_norm": 0.272176593542099, |
| "learning_rate": 6.688321954725118e-05, |
| "loss": 0.1574, |
| "step": 23050 |
| }, |
| { |
| "epoch": 0.33201461726691606, |
| "grad_norm": 0.43498551845550537, |
| "learning_rate": 6.681135465325188e-05, |
| "loss": 0.1633, |
| "step": 23100 |
| }, |
| { |
| "epoch": 0.3327332636246366, |
| "grad_norm": 0.2999245822429657, |
| "learning_rate": 6.67394897592526e-05, |
| "loss": 0.1622, |
| "step": 23150 |
| }, |
| { |
| "epoch": 0.33345190998235724, |
| "grad_norm": 0.31831660866737366, |
| "learning_rate": 6.666762486525333e-05, |
| "loss": 0.1589, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.3341705563400778, |
| "grad_norm": 0.33583468198776245, |
| "learning_rate": 6.659575997125404e-05, |
| "loss": 0.1615, |
| "step": 23250 |
| }, |
| { |
| "epoch": 0.3348892026977984, |
| "grad_norm": 0.4801647365093231, |
| "learning_rate": 6.652389507725476e-05, |
| "loss": 0.1617, |
| "step": 23300 |
| }, |
| { |
| "epoch": 0.33560784905551905, |
| "grad_norm": 0.3660373389720917, |
| "learning_rate": 6.645203018325549e-05, |
| "loss": 0.1608, |
| "step": 23350 |
| }, |
| { |
| "epoch": 0.3363264954132396, |
| "grad_norm": 0.3422200679779053, |
| "learning_rate": 6.63801652892562e-05, |
| "loss": 0.1698, |
| "step": 23400 |
| }, |
| { |
| "epoch": 0.33704514177096023, |
| "grad_norm": 0.3418559730052948, |
| "learning_rate": 6.630830039525692e-05, |
| "loss": 0.1634, |
| "step": 23450 |
| }, |
| { |
| "epoch": 0.3377637881286808, |
| "grad_norm": 0.2738020420074463, |
| "learning_rate": 6.623643550125765e-05, |
| "loss": 0.1593, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.3384824344864014, |
| "grad_norm": 0.28123342990875244, |
| "learning_rate": 6.616457060725836e-05, |
| "loss": 0.1605, |
| "step": 23550 |
| }, |
| { |
| "epoch": 0.33920108084412204, |
| "grad_norm": 0.2701767683029175, |
| "learning_rate": 6.609270571325907e-05, |
| "loss": 0.162, |
| "step": 23600 |
| }, |
| { |
| "epoch": 0.3399197272018426, |
| "grad_norm": 0.3176400065422058, |
| "learning_rate": 6.60208408192598e-05, |
| "loss": 0.1603, |
| "step": 23650 |
| }, |
| { |
| "epoch": 0.3406383735595632, |
| "grad_norm": 0.359195739030838, |
| "learning_rate": 6.594897592526051e-05, |
| "loss": 0.1624, |
| "step": 23700 |
| }, |
| { |
| "epoch": 0.3413570199172838, |
| "grad_norm": 0.4724487364292145, |
| "learning_rate": 6.587711103126123e-05, |
| "loss": 0.1587, |
| "step": 23750 |
| }, |
| { |
| "epoch": 0.3420756662750044, |
| "grad_norm": 0.3473828434944153, |
| "learning_rate": 6.580524613726195e-05, |
| "loss": 0.1618, |
| "step": 23800 |
| }, |
| { |
| "epoch": 0.342794312632725, |
| "grad_norm": 0.25780320167541504, |
| "learning_rate": 6.573338124326267e-05, |
| "loss": 0.1617, |
| "step": 23850 |
| }, |
| { |
| "epoch": 0.3435129589904456, |
| "grad_norm": 0.43437114357948303, |
| "learning_rate": 6.566151634926339e-05, |
| "loss": 0.1653, |
| "step": 23900 |
| }, |
| { |
| "epoch": 0.3442316053481662, |
| "grad_norm": 0.3810344934463501, |
| "learning_rate": 6.558965145526411e-05, |
| "loss": 0.1609, |
| "step": 23950 |
| }, |
| { |
| "epoch": 0.3449502517058868, |
| "grad_norm": 0.32231196761131287, |
| "learning_rate": 6.551778656126483e-05, |
| "loss": 0.1661, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.3449502517058868, |
| "eval_loss": 0.16020701825618744, |
| "eval_runtime": 2356.1463, |
| "eval_samples_per_second": 24.867, |
| "eval_steps_per_second": 3.108, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.3456688980636074, |
| "grad_norm": 0.3379141688346863, |
| "learning_rate": 6.544592166726555e-05, |
| "loss": 0.162, |
| "step": 24050 |
| }, |
| { |
| "epoch": 0.34638754442132796, |
| "grad_norm": 0.36986809968948364, |
| "learning_rate": 6.537405677326627e-05, |
| "loss": 0.1561, |
| "step": 24100 |
| }, |
| { |
| "epoch": 0.3471061907790486, |
| "grad_norm": 0.5383297801017761, |
| "learning_rate": 6.530219187926698e-05, |
| "loss": 0.1589, |
| "step": 24150 |
| }, |
| { |
| "epoch": 0.3478248371367692, |
| "grad_norm": 0.46481168270111084, |
| "learning_rate": 6.52303269852677e-05, |
| "loss": 0.1628, |
| "step": 24200 |
| }, |
| { |
| "epoch": 0.34854348349448977, |
| "grad_norm": 0.4319482743740082, |
| "learning_rate": 6.515846209126842e-05, |
| "loss": 0.1617, |
| "step": 24250 |
| }, |
| { |
| "epoch": 0.3492621298522104, |
| "grad_norm": 0.6843165159225464, |
| "learning_rate": 6.508659719726914e-05, |
| "loss": 0.1592, |
| "step": 24300 |
| }, |
| { |
| "epoch": 0.34998077620993095, |
| "grad_norm": 0.5816138982772827, |
| "learning_rate": 6.501473230326986e-05, |
| "loss": 0.1654, |
| "step": 24350 |
| }, |
| { |
| "epoch": 0.3506994225676516, |
| "grad_norm": 0.42936670780181885, |
| "learning_rate": 6.494286740927057e-05, |
| "loss": 0.1641, |
| "step": 24400 |
| }, |
| { |
| "epoch": 0.3514180689253722, |
| "grad_norm": 0.480822890996933, |
| "learning_rate": 6.48710025152713e-05, |
| "loss": 0.1569, |
| "step": 24450 |
| }, |
| { |
| "epoch": 0.35213671528309276, |
| "grad_norm": 0.39662879705429077, |
| "learning_rate": 6.4799137621272e-05, |
| "loss": 0.1595, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.3528553616408134, |
| "grad_norm": 0.31965863704681396, |
| "learning_rate": 6.472727272727272e-05, |
| "loss": 0.1616, |
| "step": 24550 |
| }, |
| { |
| "epoch": 0.35357400799853395, |
| "grad_norm": 0.3041664958000183, |
| "learning_rate": 6.465540783327346e-05, |
| "loss": 0.1576, |
| "step": 24600 |
| }, |
| { |
| "epoch": 0.35429265435625457, |
| "grad_norm": 0.32472431659698486, |
| "learning_rate": 6.458354293927416e-05, |
| "loss": 0.1577, |
| "step": 24650 |
| }, |
| { |
| "epoch": 0.35501130071397513, |
| "grad_norm": 0.6908242106437683, |
| "learning_rate": 6.451167804527488e-05, |
| "loss": 0.1614, |
| "step": 24700 |
| }, |
| { |
| "epoch": 0.35572994707169575, |
| "grad_norm": 0.31418710947036743, |
| "learning_rate": 6.443981315127561e-05, |
| "loss": 0.1586, |
| "step": 24750 |
| }, |
| { |
| "epoch": 0.35644859342941637, |
| "grad_norm": 0.4417416453361511, |
| "learning_rate": 6.436794825727632e-05, |
| "loss": 0.1563, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.35716723978713694, |
| "grad_norm": 0.35909807682037354, |
| "learning_rate": 6.429608336327704e-05, |
| "loss": 0.1576, |
| "step": 24850 |
| }, |
| { |
| "epoch": 0.35788588614485756, |
| "grad_norm": 0.6350358128547668, |
| "learning_rate": 6.422421846927776e-05, |
| "loss": 0.1644, |
| "step": 24900 |
| }, |
| { |
| "epoch": 0.3586045325025781, |
| "grad_norm": 0.368534117937088, |
| "learning_rate": 6.415235357527848e-05, |
| "loss": 0.1619, |
| "step": 24950 |
| }, |
| { |
| "epoch": 0.35932317886029874, |
| "grad_norm": 0.3108366131782532, |
| "learning_rate": 6.40804886812792e-05, |
| "loss": 0.1635, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.36004182521801936, |
| "grad_norm": 0.31624144315719604, |
| "learning_rate": 6.400862378727991e-05, |
| "loss": 0.1561, |
| "step": 25050 |
| }, |
| { |
| "epoch": 0.3607604715757399, |
| "grad_norm": 0.5008798837661743, |
| "learning_rate": 6.393675889328063e-05, |
| "loss": 0.1589, |
| "step": 25100 |
| }, |
| { |
| "epoch": 0.36147911793346055, |
| "grad_norm": 0.4550321400165558, |
| "learning_rate": 6.386489399928135e-05, |
| "loss": 0.1612, |
| "step": 25150 |
| }, |
| { |
| "epoch": 0.3621977642911811, |
| "grad_norm": 0.3721817433834076, |
| "learning_rate": 6.379302910528207e-05, |
| "loss": 0.1625, |
| "step": 25200 |
| }, |
| { |
| "epoch": 0.36291641064890173, |
| "grad_norm": 0.3496086001396179, |
| "learning_rate": 6.372116421128279e-05, |
| "loss": 0.1582, |
| "step": 25250 |
| }, |
| { |
| "epoch": 0.36363505700662235, |
| "grad_norm": 0.4079247713088989, |
| "learning_rate": 6.364929931728351e-05, |
| "loss": 0.1636, |
| "step": 25300 |
| }, |
| { |
| "epoch": 0.3643537033643429, |
| "grad_norm": 0.42480820417404175, |
| "learning_rate": 6.357743442328423e-05, |
| "loss": 0.1602, |
| "step": 25350 |
| }, |
| { |
| "epoch": 0.36507234972206354, |
| "grad_norm": 0.46133843064308167, |
| "learning_rate": 6.350556952928495e-05, |
| "loss": 0.1621, |
| "step": 25400 |
| }, |
| { |
| "epoch": 0.3657909960797841, |
| "grad_norm": 0.43702232837677, |
| "learning_rate": 6.343370463528567e-05, |
| "loss": 0.1581, |
| "step": 25450 |
| }, |
| { |
| "epoch": 0.3665096424375047, |
| "grad_norm": 0.30601567029953003, |
| "learning_rate": 6.336183974128639e-05, |
| "loss": 0.1582, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.3672282887952253, |
| "grad_norm": 0.33580270409584045, |
| "learning_rate": 6.32899748472871e-05, |
| "loss": 0.1644, |
| "step": 25550 |
| }, |
| { |
| "epoch": 0.3679469351529459, |
| "grad_norm": 0.3324548006057739, |
| "learning_rate": 6.321810995328782e-05, |
| "loss": 0.1618, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.36866558151066653, |
| "grad_norm": 0.3337075114250183, |
| "learning_rate": 6.314624505928854e-05, |
| "loss": 0.1571, |
| "step": 25650 |
| }, |
| { |
| "epoch": 0.3693842278683871, |
| "grad_norm": 0.30700790882110596, |
| "learning_rate": 6.307438016528926e-05, |
| "loss": 0.158, |
| "step": 25700 |
| }, |
| { |
| "epoch": 0.3701028742261077, |
| "grad_norm": 0.4422316551208496, |
| "learning_rate": 6.300251527128997e-05, |
| "loss": 0.1548, |
| "step": 25750 |
| }, |
| { |
| "epoch": 0.3708215205838283, |
| "grad_norm": 0.42883041501045227, |
| "learning_rate": 6.29306503772907e-05, |
| "loss": 0.1569, |
| "step": 25800 |
| }, |
| { |
| "epoch": 0.3715401669415489, |
| "grad_norm": 0.34244126081466675, |
| "learning_rate": 6.285878548329142e-05, |
| "loss": 0.1597, |
| "step": 25850 |
| }, |
| { |
| "epoch": 0.3722588132992695, |
| "grad_norm": 0.32595932483673096, |
| "learning_rate": 6.278692058929213e-05, |
| "loss": 0.1548, |
| "step": 25900 |
| }, |
| { |
| "epoch": 0.3729774596569901, |
| "grad_norm": 0.34954744577407837, |
| "learning_rate": 6.271505569529286e-05, |
| "loss": 0.1587, |
| "step": 25950 |
| }, |
| { |
| "epoch": 0.3736961060147107, |
| "grad_norm": 0.35941386222839355, |
| "learning_rate": 6.264319080129358e-05, |
| "loss": 0.1587, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.37441475237243127, |
| "grad_norm": 0.44000008702278137, |
| "learning_rate": 6.257132590729428e-05, |
| "loss": 0.1569, |
| "step": 26050 |
| }, |
| { |
| "epoch": 0.3751333987301519, |
| "grad_norm": 0.28137120604515076, |
| "learning_rate": 6.249946101329502e-05, |
| "loss": 0.1557, |
| "step": 26100 |
| }, |
| { |
| "epoch": 0.3758520450878725, |
| "grad_norm": 0.40533679723739624, |
| "learning_rate": 6.242759611929573e-05, |
| "loss": 0.1575, |
| "step": 26150 |
| }, |
| { |
| "epoch": 0.3765706914455931, |
| "grad_norm": 0.40215086936950684, |
| "learning_rate": 6.235573122529644e-05, |
| "loss": 0.1585, |
| "step": 26200 |
| }, |
| { |
| "epoch": 0.3772893378033137, |
| "grad_norm": 0.3472813069820404, |
| "learning_rate": 6.228386633129717e-05, |
| "loss": 0.1609, |
| "step": 26250 |
| }, |
| { |
| "epoch": 0.37800798416103426, |
| "grad_norm": 0.32452771067619324, |
| "learning_rate": 6.221200143729788e-05, |
| "loss": 0.1634, |
| "step": 26300 |
| }, |
| { |
| "epoch": 0.3787266305187549, |
| "grad_norm": 0.37654146552085876, |
| "learning_rate": 6.21415738411786e-05, |
| "loss": 0.1604, |
| "step": 26350 |
| }, |
| { |
| "epoch": 0.37944527687647545, |
| "grad_norm": 0.46723291277885437, |
| "learning_rate": 6.20697089471793e-05, |
| "loss": 0.157, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.38016392323419607, |
| "grad_norm": 0.42815640568733215, |
| "learning_rate": 6.199784405318002e-05, |
| "loss": 0.1615, |
| "step": 26450 |
| }, |
| { |
| "epoch": 0.3808825695919167, |
| "grad_norm": 0.4379606246948242, |
| "learning_rate": 6.192597915918075e-05, |
| "loss": 0.1638, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.38160121594963725, |
| "grad_norm": 0.5562979578971863, |
| "learning_rate": 6.185411426518146e-05, |
| "loss": 0.1608, |
| "step": 26550 |
| }, |
| { |
| "epoch": 0.38231986230735787, |
| "grad_norm": 0.33051741123199463, |
| "learning_rate": 6.178224937118218e-05, |
| "loss": 0.1548, |
| "step": 26600 |
| }, |
| { |
| "epoch": 0.38303850866507844, |
| "grad_norm": 0.2941145598888397, |
| "learning_rate": 6.171038447718291e-05, |
| "loss": 0.1551, |
| "step": 26650 |
| }, |
| { |
| "epoch": 0.38375715502279906, |
| "grad_norm": 0.4036601781845093, |
| "learning_rate": 6.163851958318362e-05, |
| "loss": 0.1592, |
| "step": 26700 |
| }, |
| { |
| "epoch": 0.3844758013805197, |
| "grad_norm": 0.4525456726551056, |
| "learning_rate": 6.156665468918433e-05, |
| "loss": 0.1553, |
| "step": 26750 |
| }, |
| { |
| "epoch": 0.38519444773824024, |
| "grad_norm": 0.4872748553752899, |
| "learning_rate": 6.149478979518505e-05, |
| "loss": 0.1582, |
| "step": 26800 |
| }, |
| { |
| "epoch": 0.38591309409596086, |
| "grad_norm": 0.451123982667923, |
| "learning_rate": 6.142292490118577e-05, |
| "loss": 0.1578, |
| "step": 26850 |
| }, |
| { |
| "epoch": 0.3866317404536814, |
| "grad_norm": 0.3543640077114105, |
| "learning_rate": 6.135106000718649e-05, |
| "loss": 0.1588, |
| "step": 26900 |
| }, |
| { |
| "epoch": 0.38735038681140205, |
| "grad_norm": 0.38010174036026, |
| "learning_rate": 6.127919511318721e-05, |
| "loss": 0.1575, |
| "step": 26950 |
| }, |
| { |
| "epoch": 0.38806903316912267, |
| "grad_norm": 0.39789631962776184, |
| "learning_rate": 6.120733021918793e-05, |
| "loss": 0.1578, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.38806903316912267, |
| "eval_loss": 0.15559689700603485, |
| "eval_runtime": 2336.5126, |
| "eval_samples_per_second": 25.076, |
| "eval_steps_per_second": 3.135, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.38878767952684323, |
| "grad_norm": 0.3860076367855072, |
| "learning_rate": 6.113546532518865e-05, |
| "loss": 0.1538, |
| "step": 27050 |
| }, |
| { |
| "epoch": 0.38950632588456385, |
| "grad_norm": 0.27461615204811096, |
| "learning_rate": 6.106360043118937e-05, |
| "loss": 0.1517, |
| "step": 27100 |
| }, |
| { |
| "epoch": 0.3902249722422844, |
| "grad_norm": 0.46942609548568726, |
| "learning_rate": 6.099173553719009e-05, |
| "loss": 0.1556, |
| "step": 27150 |
| }, |
| { |
| "epoch": 0.39094361860000504, |
| "grad_norm": 0.3067554831504822, |
| "learning_rate": 6.09198706431908e-05, |
| "loss": 0.1582, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.3916622649577256, |
| "grad_norm": 0.3194333016872406, |
| "learning_rate": 6.0848005749191526e-05, |
| "loss": 0.1576, |
| "step": 27250 |
| }, |
| { |
| "epoch": 0.3923809113154462, |
| "grad_norm": 0.5251829028129578, |
| "learning_rate": 6.0776140855192245e-05, |
| "loss": 0.1543, |
| "step": 27300 |
| }, |
| { |
| "epoch": 0.39309955767316684, |
| "grad_norm": 0.4211423099040985, |
| "learning_rate": 6.070427596119296e-05, |
| "loss": 0.1629, |
| "step": 27350 |
| }, |
| { |
| "epoch": 0.3938182040308874, |
| "grad_norm": 0.4318183660507202, |
| "learning_rate": 6.063241106719368e-05, |
| "loss": 0.1557, |
| "step": 27400 |
| }, |
| { |
| "epoch": 0.39453685038860803, |
| "grad_norm": 0.5136430263519287, |
| "learning_rate": 6.05605461731944e-05, |
| "loss": 0.157, |
| "step": 27450 |
| }, |
| { |
| "epoch": 0.3952554967463286, |
| "grad_norm": 0.35453012585639954, |
| "learning_rate": 6.0488681279195114e-05, |
| "loss": 0.1535, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.3959741431040492, |
| "grad_norm": 0.44323351979255676, |
| "learning_rate": 6.041681638519584e-05, |
| "loss": 0.1595, |
| "step": 27550 |
| }, |
| { |
| "epoch": 0.39669278946176983, |
| "grad_norm": 0.29345712065696716, |
| "learning_rate": 6.034495149119655e-05, |
| "loss": 0.1557, |
| "step": 27600 |
| }, |
| { |
| "epoch": 0.3974114358194904, |
| "grad_norm": 0.2903861701488495, |
| "learning_rate": 6.027308659719727e-05, |
| "loss": 0.1608, |
| "step": 27650 |
| }, |
| { |
| "epoch": 0.398130082177211, |
| "grad_norm": 0.30161532759666443, |
| "learning_rate": 6.0201221703197984e-05, |
| "loss": 0.1586, |
| "step": 27700 |
| }, |
| { |
| "epoch": 0.3988487285349316, |
| "grad_norm": 0.3740021288394928, |
| "learning_rate": 6.012935680919871e-05, |
| "loss": 0.154, |
| "step": 27750 |
| }, |
| { |
| "epoch": 0.3995673748926522, |
| "grad_norm": 0.4624473750591278, |
| "learning_rate": 6.005749191519943e-05, |
| "loss": 0.1544, |
| "step": 27800 |
| }, |
| { |
| "epoch": 0.4002860212503728, |
| "grad_norm": 0.36845239996910095, |
| "learning_rate": 5.998562702120014e-05, |
| "loss": 0.1556, |
| "step": 27850 |
| }, |
| { |
| "epoch": 0.4010046676080934, |
| "grad_norm": 0.5025461912155151, |
| "learning_rate": 5.991376212720087e-05, |
| "loss": 0.1531, |
| "step": 27900 |
| }, |
| { |
| "epoch": 0.401723313965814, |
| "grad_norm": 0.6607873439788818, |
| "learning_rate": 5.984189723320158e-05, |
| "loss": 0.1573, |
| "step": 27950 |
| }, |
| { |
| "epoch": 0.4024419603235346, |
| "grad_norm": 0.5143309831619263, |
| "learning_rate": 5.97700323392023e-05, |
| "loss": 0.1534, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.4031606066812552, |
| "grad_norm": 0.3975028991699219, |
| "learning_rate": 5.9698167445203024e-05, |
| "loss": 0.1568, |
| "step": 28050 |
| }, |
| { |
| "epoch": 0.40387925303897576, |
| "grad_norm": 0.3697468936443329, |
| "learning_rate": 5.962630255120374e-05, |
| "loss": 0.1476, |
| "step": 28100 |
| }, |
| { |
| "epoch": 0.4045978993966964, |
| "grad_norm": 0.8748229146003723, |
| "learning_rate": 5.9554437657204456e-05, |
| "loss": 0.1586, |
| "step": 28150 |
| }, |
| { |
| "epoch": 0.405316545754417, |
| "grad_norm": 0.43097078800201416, |
| "learning_rate": 5.948257276320518e-05, |
| "loss": 0.1576, |
| "step": 28200 |
| }, |
| { |
| "epoch": 0.40603519211213757, |
| "grad_norm": 0.3834463059902191, |
| "learning_rate": 5.9410707869205894e-05, |
| "loss": 0.1534, |
| "step": 28250 |
| }, |
| { |
| "epoch": 0.4067538384698582, |
| "grad_norm": 0.42111456394195557, |
| "learning_rate": 5.933884297520661e-05, |
| "loss": 0.1562, |
| "step": 28300 |
| }, |
| { |
| "epoch": 0.40747248482757875, |
| "grad_norm": 0.27892324328422546, |
| "learning_rate": 5.926697808120734e-05, |
| "loss": 0.1528, |
| "step": 28350 |
| }, |
| { |
| "epoch": 0.40819113118529937, |
| "grad_norm": 0.36843934655189514, |
| "learning_rate": 5.919655048508804e-05, |
| "loss": 0.1545, |
| "step": 28400 |
| }, |
| { |
| "epoch": 0.40890977754302, |
| "grad_norm": 0.4540693163871765, |
| "learning_rate": 5.9124685591088755e-05, |
| "loss": 0.1545, |
| "step": 28450 |
| }, |
| { |
| "epoch": 0.40962842390074056, |
| "grad_norm": 0.5054967403411865, |
| "learning_rate": 5.9052820697089474e-05, |
| "loss": 0.1549, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.4103470702584612, |
| "grad_norm": 0.27066469192504883, |
| "learning_rate": 5.89809558030902e-05, |
| "loss": 0.1517, |
| "step": 28550 |
| }, |
| { |
| "epoch": 0.41106571661618174, |
| "grad_norm": 0.3877571225166321, |
| "learning_rate": 5.890909090909091e-05, |
| "loss": 0.1524, |
| "step": 28600 |
| }, |
| { |
| "epoch": 0.41178436297390236, |
| "grad_norm": 0.385812371969223, |
| "learning_rate": 5.883722601509163e-05, |
| "loss": 0.1538, |
| "step": 28650 |
| }, |
| { |
| "epoch": 0.4125030093316229, |
| "grad_norm": 0.3364986777305603, |
| "learning_rate": 5.876536112109234e-05, |
| "loss": 0.1555, |
| "step": 28700 |
| }, |
| { |
| "epoch": 0.41322165568934355, |
| "grad_norm": 0.46070176362991333, |
| "learning_rate": 5.869349622709307e-05, |
| "loss": 0.1506, |
| "step": 28750 |
| }, |
| { |
| "epoch": 0.41394030204706417, |
| "grad_norm": 0.4020332396030426, |
| "learning_rate": 5.862163133309379e-05, |
| "loss": 0.1509, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.41465894840478473, |
| "grad_norm": 0.47747594118118286, |
| "learning_rate": 5.85497664390945e-05, |
| "loss": 0.1526, |
| "step": 28850 |
| }, |
| { |
| "epoch": 0.41537759476250535, |
| "grad_norm": 0.45608797669410706, |
| "learning_rate": 5.8477901545095226e-05, |
| "loss": 0.1543, |
| "step": 28900 |
| }, |
| { |
| "epoch": 0.4160962411202259, |
| "grad_norm": 0.2811453640460968, |
| "learning_rate": 5.840603665109594e-05, |
| "loss": 0.157, |
| "step": 28950 |
| }, |
| { |
| "epoch": 0.41681488747794654, |
| "grad_norm": 0.32047563791275024, |
| "learning_rate": 5.833417175709666e-05, |
| "loss": 0.1539, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.41753353383566716, |
| "grad_norm": 0.482090026140213, |
| "learning_rate": 5.8262306863097384e-05, |
| "loss": 0.1604, |
| "step": 29050 |
| }, |
| { |
| "epoch": 0.4182521801933877, |
| "grad_norm": 0.30156487226486206, |
| "learning_rate": 5.819187926697809e-05, |
| "loss": 0.155, |
| "step": 29100 |
| }, |
| { |
| "epoch": 0.41897082655110834, |
| "grad_norm": 0.36799871921539307, |
| "learning_rate": 5.8120014372978806e-05, |
| "loss": 0.1542, |
| "step": 29150 |
| }, |
| { |
| "epoch": 0.4196894729088289, |
| "grad_norm": 0.44371145963668823, |
| "learning_rate": 5.804814947897952e-05, |
| "loss": 0.1523, |
| "step": 29200 |
| }, |
| { |
| "epoch": 0.42040811926654953, |
| "grad_norm": 0.45741862058639526, |
| "learning_rate": 5.7976284584980244e-05, |
| "loss": 0.1526, |
| "step": 29250 |
| }, |
| { |
| "epoch": 0.42112676562427015, |
| "grad_norm": 0.33403101563453674, |
| "learning_rate": 5.790441969098096e-05, |
| "loss": 0.1534, |
| "step": 29300 |
| }, |
| { |
| "epoch": 0.4218454119819907, |
| "grad_norm": 0.31555646657943726, |
| "learning_rate": 5.7832554796981676e-05, |
| "loss": 0.1525, |
| "step": 29350 |
| }, |
| { |
| "epoch": 0.42256405833971133, |
| "grad_norm": 0.33370161056518555, |
| "learning_rate": 5.77606899029824e-05, |
| "loss": 0.159, |
| "step": 29400 |
| }, |
| { |
| "epoch": 0.4232827046974319, |
| "grad_norm": 0.41524410247802734, |
| "learning_rate": 5.7688825008983114e-05, |
| "loss": 0.1538, |
| "step": 29450 |
| }, |
| { |
| "epoch": 0.4240013510551525, |
| "grad_norm": 0.3497067391872406, |
| "learning_rate": 5.761696011498383e-05, |
| "loss": 0.1498, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.4247199974128731, |
| "grad_norm": 0.37955665588378906, |
| "learning_rate": 5.754509522098456e-05, |
| "loss": 0.1492, |
| "step": 29550 |
| }, |
| { |
| "epoch": 0.4254386437705937, |
| "grad_norm": 0.30920735001564026, |
| "learning_rate": 5.747323032698527e-05, |
| "loss": 0.1589, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.4261572901283143, |
| "grad_norm": 0.3532569110393524, |
| "learning_rate": 5.740136543298599e-05, |
| "loss": 0.1536, |
| "step": 29650 |
| }, |
| { |
| "epoch": 0.4268759364860349, |
| "grad_norm": 0.33438754081726074, |
| "learning_rate": 5.73295005389867e-05, |
| "loss": 0.1527, |
| "step": 29700 |
| }, |
| { |
| "epoch": 0.4275945828437555, |
| "grad_norm": 0.376600980758667, |
| "learning_rate": 5.725763564498743e-05, |
| "loss": 0.1521, |
| "step": 29750 |
| }, |
| { |
| "epoch": 0.4283132292014761, |
| "grad_norm": 0.48702993988990784, |
| "learning_rate": 5.718577075098814e-05, |
| "loss": 0.1545, |
| "step": 29800 |
| }, |
| { |
| "epoch": 0.4290318755591967, |
| "grad_norm": 0.2475581169128418, |
| "learning_rate": 5.711390585698886e-05, |
| "loss": 0.1544, |
| "step": 29850 |
| }, |
| { |
| "epoch": 0.4297505219169173, |
| "grad_norm": 0.37837105989456177, |
| "learning_rate": 5.7042040962989586e-05, |
| "loss": 0.1541, |
| "step": 29900 |
| }, |
| { |
| "epoch": 0.4304691682746379, |
| "grad_norm": 0.47116902470588684, |
| "learning_rate": 5.69701760689903e-05, |
| "loss": 0.1502, |
| "step": 29950 |
| }, |
| { |
| "epoch": 0.4311878146323585, |
| "grad_norm": 0.29833170771598816, |
| "learning_rate": 5.689831117499102e-05, |
| "loss": 0.1537, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.4311878146323585, |
| "eval_loss": 0.15278169512748718, |
| "eval_runtime": 2339.1715, |
| "eval_samples_per_second": 25.047, |
| "eval_steps_per_second": 3.131, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.43190646099007907, |
| "grad_norm": 0.3521724045276642, |
| "learning_rate": 5.682644628099174e-05, |
| "loss": 0.1482, |
| "step": 30050 |
| }, |
| { |
| "epoch": 0.4326251073477997, |
| "grad_norm": 0.2992171049118042, |
| "learning_rate": 5.6754581386992455e-05, |
| "loss": 0.1515, |
| "step": 30100 |
| }, |
| { |
| "epoch": 0.4333437537055203, |
| "grad_norm": 0.2566670775413513, |
| "learning_rate": 5.668271649299317e-05, |
| "loss": 0.155, |
| "step": 30150 |
| }, |
| { |
| "epoch": 0.43406240006324087, |
| "grad_norm": 0.42938894033432007, |
| "learning_rate": 5.66108515989939e-05, |
| "loss": 0.1514, |
| "step": 30200 |
| }, |
| { |
| "epoch": 0.4347810464209615, |
| "grad_norm": 0.3730209469795227, |
| "learning_rate": 5.653898670499461e-05, |
| "loss": 0.152, |
| "step": 30250 |
| }, |
| { |
| "epoch": 0.43549969277868206, |
| "grad_norm": 0.269544780254364, |
| "learning_rate": 5.6467121810995325e-05, |
| "loss": 0.1555, |
| "step": 30300 |
| }, |
| { |
| "epoch": 0.4362183391364027, |
| "grad_norm": 0.331620991230011, |
| "learning_rate": 5.639525691699605e-05, |
| "loss": 0.1558, |
| "step": 30350 |
| }, |
| { |
| "epoch": 0.43693698549412324, |
| "grad_norm": 0.2786395847797394, |
| "learning_rate": 5.632339202299677e-05, |
| "loss": 0.151, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.43765563185184386, |
| "grad_norm": 0.3796517550945282, |
| "learning_rate": 5.625152712899748e-05, |
| "loss": 0.1513, |
| "step": 30450 |
| }, |
| { |
| "epoch": 0.4383742782095645, |
| "grad_norm": 0.48707351088523865, |
| "learning_rate": 5.617966223499821e-05, |
| "loss": 0.1513, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.43909292456728505, |
| "grad_norm": 0.391174852848053, |
| "learning_rate": 5.610779734099893e-05, |
| "loss": 0.1536, |
| "step": 30550 |
| }, |
| { |
| "epoch": 0.43981157092500567, |
| "grad_norm": 0.4588553309440613, |
| "learning_rate": 5.603593244699964e-05, |
| "loss": 0.1506, |
| "step": 30600 |
| }, |
| { |
| "epoch": 0.44053021728272623, |
| "grad_norm": 0.6206002235412598, |
| "learning_rate": 5.5964067553000365e-05, |
| "loss": 0.1563, |
| "step": 30650 |
| }, |
| { |
| "epoch": 0.44124886364044685, |
| "grad_norm": 0.4559854567050934, |
| "learning_rate": 5.589220265900108e-05, |
| "loss": 0.1465, |
| "step": 30700 |
| }, |
| { |
| "epoch": 0.4419675099981675, |
| "grad_norm": 0.3629855811595917, |
| "learning_rate": 5.58203377650018e-05, |
| "loss": 0.1523, |
| "step": 30750 |
| }, |
| { |
| "epoch": 0.44268615635588804, |
| "grad_norm": 0.43928229808807373, |
| "learning_rate": 5.574847287100252e-05, |
| "loss": 0.1535, |
| "step": 30800 |
| }, |
| { |
| "epoch": 0.44340480271360866, |
| "grad_norm": 0.5060630440711975, |
| "learning_rate": 5.5676607977003235e-05, |
| "loss": 0.1504, |
| "step": 30850 |
| }, |
| { |
| "epoch": 0.4441234490713292, |
| "grad_norm": 0.7647538781166077, |
| "learning_rate": 5.5604743083003954e-05, |
| "loss": 0.1535, |
| "step": 30900 |
| }, |
| { |
| "epoch": 0.44484209542904984, |
| "grad_norm": 0.4331282377243042, |
| "learning_rate": 5.553287818900468e-05, |
| "loss": 0.1534, |
| "step": 30950 |
| }, |
| { |
| "epoch": 0.44556074178677046, |
| "grad_norm": 0.34580302238464355, |
| "learning_rate": 5.546101329500539e-05, |
| "loss": 0.1502, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.44627938814449103, |
| "grad_norm": 0.36288365721702576, |
| "learning_rate": 5.5389148401006105e-05, |
| "loss": 0.1549, |
| "step": 31050 |
| }, |
| { |
| "epoch": 0.44699803450221165, |
| "grad_norm": 0.319622278213501, |
| "learning_rate": 5.531728350700684e-05, |
| "loss": 0.1529, |
| "step": 31100 |
| }, |
| { |
| "epoch": 0.4477166808599322, |
| "grad_norm": 0.3031866252422333, |
| "learning_rate": 5.524541861300755e-05, |
| "loss": 0.152, |
| "step": 31150 |
| }, |
| { |
| "epoch": 0.44843532721765283, |
| "grad_norm": 0.6403375864028931, |
| "learning_rate": 5.517355371900826e-05, |
| "loss": 0.1491, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.4491539735753734, |
| "grad_norm": 0.3669109642505646, |
| "learning_rate": 5.510168882500899e-05, |
| "loss": 0.1505, |
| "step": 31250 |
| }, |
| { |
| "epoch": 0.449872619933094, |
| "grad_norm": 0.3309599459171295, |
| "learning_rate": 5.502982393100971e-05, |
| "loss": 0.1496, |
| "step": 31300 |
| }, |
| { |
| "epoch": 0.45059126629081464, |
| "grad_norm": 0.40921223163604736, |
| "learning_rate": 5.495795903701042e-05, |
| "loss": 0.1513, |
| "step": 31350 |
| }, |
| { |
| "epoch": 0.4513099126485352, |
| "grad_norm": 0.34538429975509644, |
| "learning_rate": 5.4886094143011145e-05, |
| "loss": 0.1477, |
| "step": 31400 |
| }, |
| { |
| "epoch": 0.4520285590062558, |
| "grad_norm": 0.3140852451324463, |
| "learning_rate": 5.4814229249011864e-05, |
| "loss": 0.1501, |
| "step": 31450 |
| }, |
| { |
| "epoch": 0.4527472053639764, |
| "grad_norm": 0.34375426173210144, |
| "learning_rate": 5.474236435501258e-05, |
| "loss": 0.1486, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.453465851721697, |
| "grad_norm": 0.2683236002922058, |
| "learning_rate": 5.46704994610133e-05, |
| "loss": 0.1521, |
| "step": 31550 |
| }, |
| { |
| "epoch": 0.45418449807941763, |
| "grad_norm": 0.2604835331439972, |
| "learning_rate": 5.4598634567014015e-05, |
| "loss": 0.1524, |
| "step": 31600 |
| }, |
| { |
| "epoch": 0.4549031444371382, |
| "grad_norm": 0.3806206285953522, |
| "learning_rate": 5.4526769673014734e-05, |
| "loss": 0.1537, |
| "step": 31650 |
| }, |
| { |
| "epoch": 0.4556217907948588, |
| "grad_norm": 0.34805959463119507, |
| "learning_rate": 5.445490477901546e-05, |
| "loss": 0.1507, |
| "step": 31700 |
| }, |
| { |
| "epoch": 0.4563404371525794, |
| "grad_norm": 0.36547720432281494, |
| "learning_rate": 5.438303988501617e-05, |
| "loss": 0.1544, |
| "step": 31750 |
| }, |
| { |
| "epoch": 0.4570590835103, |
| "grad_norm": 0.428213506937027, |
| "learning_rate": 5.431117499101689e-05, |
| "loss": 0.1522, |
| "step": 31800 |
| }, |
| { |
| "epoch": 0.4577777298680206, |
| "grad_norm": 0.42825666069984436, |
| "learning_rate": 5.423931009701762e-05, |
| "loss": 0.149, |
| "step": 31850 |
| }, |
| { |
| "epoch": 0.4584963762257412, |
| "grad_norm": 0.45687320828437805, |
| "learning_rate": 5.416744520301833e-05, |
| "loss": 0.1492, |
| "step": 31900 |
| }, |
| { |
| "epoch": 0.4592150225834618, |
| "grad_norm": 0.313772976398468, |
| "learning_rate": 5.409558030901904e-05, |
| "loss": 0.1475, |
| "step": 31950 |
| }, |
| { |
| "epoch": 0.45993366894118237, |
| "grad_norm": 0.43313875794410706, |
| "learning_rate": 5.402371541501976e-05, |
| "loss": 0.1533, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.460652315298903, |
| "grad_norm": 0.3474908173084259, |
| "learning_rate": 5.395185052102049e-05, |
| "loss": 0.1527, |
| "step": 32050 |
| }, |
| { |
| "epoch": 0.46137096165662356, |
| "grad_norm": 0.38463565707206726, |
| "learning_rate": 5.38799856270212e-05, |
| "loss": 0.1522, |
| "step": 32100 |
| }, |
| { |
| "epoch": 0.4620896080143442, |
| "grad_norm": 0.27655136585235596, |
| "learning_rate": 5.380812073302192e-05, |
| "loss": 0.1491, |
| "step": 32150 |
| }, |
| { |
| "epoch": 0.4628082543720648, |
| "grad_norm": 0.3056930899620056, |
| "learning_rate": 5.3736255839022644e-05, |
| "loss": 0.1439, |
| "step": 32200 |
| }, |
| { |
| "epoch": 0.46352690072978536, |
| "grad_norm": 0.4399576187133789, |
| "learning_rate": 5.3664390945023356e-05, |
| "loss": 0.1514, |
| "step": 32250 |
| }, |
| { |
| "epoch": 0.464245547087506, |
| "grad_norm": 0.32986631989479065, |
| "learning_rate": 5.3592526051024076e-05, |
| "loss": 0.1508, |
| "step": 32300 |
| }, |
| { |
| "epoch": 0.46496419344522655, |
| "grad_norm": 0.290811687707901, |
| "learning_rate": 5.35206611570248e-05, |
| "loss": 0.153, |
| "step": 32350 |
| }, |
| { |
| "epoch": 0.46568283980294717, |
| "grad_norm": 0.6335808634757996, |
| "learning_rate": 5.3448796263025514e-05, |
| "loss": 0.151, |
| "step": 32400 |
| }, |
| { |
| "epoch": 0.4664014861606678, |
| "grad_norm": 0.2713414132595062, |
| "learning_rate": 5.3376931369026226e-05, |
| "loss": 0.1501, |
| "step": 32450 |
| }, |
| { |
| "epoch": 0.46712013251838835, |
| "grad_norm": 0.5587482452392578, |
| "learning_rate": 5.330506647502695e-05, |
| "loss": 0.1495, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.467838778876109, |
| "grad_norm": 0.36849066615104675, |
| "learning_rate": 5.323320158102767e-05, |
| "loss": 0.1489, |
| "step": 32550 |
| }, |
| { |
| "epoch": 0.46855742523382954, |
| "grad_norm": 0.2901330590248108, |
| "learning_rate": 5.3161336687028383e-05, |
| "loss": 0.1488, |
| "step": 32600 |
| }, |
| { |
| "epoch": 0.46927607159155016, |
| "grad_norm": 0.28899720311164856, |
| "learning_rate": 5.308947179302911e-05, |
| "loss": 0.15, |
| "step": 32650 |
| }, |
| { |
| "epoch": 0.4699947179492708, |
| "grad_norm": 0.5915825963020325, |
| "learning_rate": 5.301760689902983e-05, |
| "loss": 0.1513, |
| "step": 32700 |
| }, |
| { |
| "epoch": 0.47071336430699134, |
| "grad_norm": 0.3386688828468323, |
| "learning_rate": 5.294574200503054e-05, |
| "loss": 0.1484, |
| "step": 32750 |
| }, |
| { |
| "epoch": 0.47143201066471196, |
| "grad_norm": 0.3731253445148468, |
| "learning_rate": 5.2873877111031267e-05, |
| "loss": 0.1532, |
| "step": 32800 |
| }, |
| { |
| "epoch": 0.47215065702243253, |
| "grad_norm": 0.31815454363822937, |
| "learning_rate": 5.280201221703198e-05, |
| "loss": 0.148, |
| "step": 32850 |
| }, |
| { |
| "epoch": 0.47286930338015315, |
| "grad_norm": 0.30257225036621094, |
| "learning_rate": 5.27301473230327e-05, |
| "loss": 0.1509, |
| "step": 32900 |
| }, |
| { |
| "epoch": 0.4735879497378737, |
| "grad_norm": 0.3791181743144989, |
| "learning_rate": 5.2658282429033424e-05, |
| "loss": 0.1484, |
| "step": 32950 |
| }, |
| { |
| "epoch": 0.47430659609559433, |
| "grad_norm": 0.39685317873954773, |
| "learning_rate": 5.2586417535034136e-05, |
| "loss": 0.1489, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.47430659609559433, |
| "eval_loss": 0.14942093193531036, |
| "eval_runtime": 2342.5781, |
| "eval_samples_per_second": 25.011, |
| "eval_steps_per_second": 3.126, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.47502524245331496, |
| "grad_norm": 0.5523746013641357, |
| "learning_rate": 5.2514552641034855e-05, |
| "loss": 0.147, |
| "step": 33050 |
| }, |
| { |
| "epoch": 0.4757438888110355, |
| "grad_norm": 0.36720308661460876, |
| "learning_rate": 5.244268774703558e-05, |
| "loss": 0.1517, |
| "step": 33100 |
| }, |
| { |
| "epoch": 0.47646253516875614, |
| "grad_norm": 0.45619773864746094, |
| "learning_rate": 5.237226015091628e-05, |
| "loss": 0.1517, |
| "step": 33150 |
| }, |
| { |
| "epoch": 0.4771811815264767, |
| "grad_norm": 0.31895503401756287, |
| "learning_rate": 5.2300395256917003e-05, |
| "loss": 0.1487, |
| "step": 33200 |
| }, |
| { |
| "epoch": 0.4778998278841973, |
| "grad_norm": 0.37323054671287537, |
| "learning_rate": 5.2228530362917716e-05, |
| "loss": 0.1479, |
| "step": 33250 |
| }, |
| { |
| "epoch": 0.47861847424191795, |
| "grad_norm": 0.4316665232181549, |
| "learning_rate": 5.215666546891843e-05, |
| "loss": 0.1465, |
| "step": 33300 |
| }, |
| { |
| "epoch": 0.4793371205996385, |
| "grad_norm": 0.4115291237831116, |
| "learning_rate": 5.2084800574919154e-05, |
| "loss": 0.1506, |
| "step": 33350 |
| }, |
| { |
| "epoch": 0.48005576695735913, |
| "grad_norm": 0.343179851770401, |
| "learning_rate": 5.201293568091987e-05, |
| "loss": 0.1473, |
| "step": 33400 |
| }, |
| { |
| "epoch": 0.4807744133150797, |
| "grad_norm": 0.3537336587905884, |
| "learning_rate": 5.1941070786920585e-05, |
| "loss": 0.149, |
| "step": 33450 |
| }, |
| { |
| "epoch": 0.4814930596728003, |
| "grad_norm": 0.34489813446998596, |
| "learning_rate": 5.186920589292131e-05, |
| "loss": 0.148, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.48221170603052094, |
| "grad_norm": 0.526101291179657, |
| "learning_rate": 5.179734099892203e-05, |
| "loss": 0.153, |
| "step": 33550 |
| }, |
| { |
| "epoch": 0.4829303523882415, |
| "grad_norm": 0.3146929144859314, |
| "learning_rate": 5.172547610492274e-05, |
| "loss": 0.1484, |
| "step": 33600 |
| }, |
| { |
| "epoch": 0.4836489987459621, |
| "grad_norm": 0.4303942918777466, |
| "learning_rate": 5.165361121092347e-05, |
| "loss": 0.1465, |
| "step": 33650 |
| }, |
| { |
| "epoch": 0.4843676451036827, |
| "grad_norm": 0.4743385910987854, |
| "learning_rate": 5.158174631692419e-05, |
| "loss": 0.1505, |
| "step": 33700 |
| }, |
| { |
| "epoch": 0.4850862914614033, |
| "grad_norm": 0.4490325152873993, |
| "learning_rate": 5.15098814229249e-05, |
| "loss": 0.1468, |
| "step": 33750 |
| }, |
| { |
| "epoch": 0.48580493781912387, |
| "grad_norm": 0.3729170858860016, |
| "learning_rate": 5.1438016528925626e-05, |
| "loss": 0.1495, |
| "step": 33800 |
| }, |
| { |
| "epoch": 0.4865235841768445, |
| "grad_norm": 0.2781628668308258, |
| "learning_rate": 5.136615163492634e-05, |
| "loss": 0.1479, |
| "step": 33850 |
| }, |
| { |
| "epoch": 0.4872422305345651, |
| "grad_norm": 0.30333954095840454, |
| "learning_rate": 5.129428674092706e-05, |
| "loss": 0.1433, |
| "step": 33900 |
| }, |
| { |
| "epoch": 0.4879608768922857, |
| "grad_norm": 0.30281704664230347, |
| "learning_rate": 5.122242184692778e-05, |
| "loss": 0.1454, |
| "step": 33950 |
| }, |
| { |
| "epoch": 0.4886795232500063, |
| "grad_norm": 0.25618281960487366, |
| "learning_rate": 5.1150556952928496e-05, |
| "loss": 0.151, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.48939816960772686, |
| "grad_norm": 0.34702569246292114, |
| "learning_rate": 5.1078692058929215e-05, |
| "loss": 0.1481, |
| "step": 34050 |
| }, |
| { |
| "epoch": 0.4901168159654475, |
| "grad_norm": 0.3548518121242523, |
| "learning_rate": 5.100682716492994e-05, |
| "loss": 0.1474, |
| "step": 34100 |
| }, |
| { |
| "epoch": 0.4908354623231681, |
| "grad_norm": 0.36157211661338806, |
| "learning_rate": 5.093496227093065e-05, |
| "loss": 0.1503, |
| "step": 34150 |
| }, |
| { |
| "epoch": 0.49155410868088867, |
| "grad_norm": 0.3797595500946045, |
| "learning_rate": 5.0863097376931365e-05, |
| "loss": 0.1432, |
| "step": 34200 |
| }, |
| { |
| "epoch": 0.4922727550386093, |
| "grad_norm": 0.33411136269569397, |
| "learning_rate": 5.079123248293209e-05, |
| "loss": 0.1497, |
| "step": 34250 |
| }, |
| { |
| "epoch": 0.49299140139632985, |
| "grad_norm": 0.5178517699241638, |
| "learning_rate": 5.071936758893281e-05, |
| "loss": 0.1526, |
| "step": 34300 |
| }, |
| { |
| "epoch": 0.4937100477540505, |
| "grad_norm": 0.4469529986381531, |
| "learning_rate": 5.064750269493352e-05, |
| "loss": 0.1478, |
| "step": 34350 |
| }, |
| { |
| "epoch": 0.4944286941117711, |
| "grad_norm": 0.3018459975719452, |
| "learning_rate": 5.057563780093425e-05, |
| "loss": 0.1444, |
| "step": 34400 |
| }, |
| { |
| "epoch": 0.49514734046949166, |
| "grad_norm": 0.38588500022888184, |
| "learning_rate": 5.050377290693497e-05, |
| "loss": 0.1492, |
| "step": 34450 |
| }, |
| { |
| "epoch": 0.4958659868272123, |
| "grad_norm": 0.38262608647346497, |
| "learning_rate": 5.043190801293568e-05, |
| "loss": 0.1454, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.49658463318493284, |
| "grad_norm": 0.32095280289649963, |
| "learning_rate": 5.0360043118936406e-05, |
| "loss": 0.1479, |
| "step": 34550 |
| }, |
| { |
| "epoch": 0.49730327954265346, |
| "grad_norm": 0.3346179127693176, |
| "learning_rate": 5.0288178224937125e-05, |
| "loss": 0.1511, |
| "step": 34600 |
| }, |
| { |
| "epoch": 0.49802192590037403, |
| "grad_norm": 0.5138089060783386, |
| "learning_rate": 5.021631333093784e-05, |
| "loss": 0.1511, |
| "step": 34650 |
| }, |
| { |
| "epoch": 0.49874057225809465, |
| "grad_norm": 0.664122462272644, |
| "learning_rate": 5.014444843693856e-05, |
| "loss": 0.149, |
| "step": 34700 |
| }, |
| { |
| "epoch": 0.49945921861581527, |
| "grad_norm": 0.43910887837409973, |
| "learning_rate": 5.0072583542939275e-05, |
| "loss": 0.149, |
| "step": 34750 |
| }, |
| { |
| "epoch": 0.5001778649735359, |
| "grad_norm": 0.4011009633541107, |
| "learning_rate": 5.0000718648939994e-05, |
| "loss": 0.1481, |
| "step": 34800 |
| }, |
| { |
| "epoch": 0.5008965113312565, |
| "grad_norm": 0.2885836660861969, |
| "learning_rate": 4.9928853754940713e-05, |
| "loss": 0.15, |
| "step": 34850 |
| }, |
| { |
| "epoch": 0.501615157688977, |
| "grad_norm": 0.5149396061897278, |
| "learning_rate": 4.985698886094143e-05, |
| "loss": 0.1506, |
| "step": 34900 |
| }, |
| { |
| "epoch": 0.5023338040466976, |
| "grad_norm": 0.3503366708755493, |
| "learning_rate": 4.978512396694215e-05, |
| "loss": 0.1457, |
| "step": 34950 |
| }, |
| { |
| "epoch": 0.5030524504044183, |
| "grad_norm": 0.4478318691253662, |
| "learning_rate": 4.971325907294287e-05, |
| "loss": 0.1468, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.5037710967621388, |
| "grad_norm": 0.4077318608760834, |
| "learning_rate": 4.964139417894359e-05, |
| "loss": 0.1421, |
| "step": 35050 |
| }, |
| { |
| "epoch": 0.5044897431198594, |
| "grad_norm": 0.4490613341331482, |
| "learning_rate": 4.95695292849443e-05, |
| "loss": 0.1471, |
| "step": 35100 |
| }, |
| { |
| "epoch": 0.5052083894775801, |
| "grad_norm": 0.4023280441761017, |
| "learning_rate": 4.949766439094503e-05, |
| "loss": 0.1467, |
| "step": 35150 |
| }, |
| { |
| "epoch": 0.5059270358353006, |
| "grad_norm": 0.426633358001709, |
| "learning_rate": 4.942579949694575e-05, |
| "loss": 0.1457, |
| "step": 35200 |
| }, |
| { |
| "epoch": 0.5066456821930212, |
| "grad_norm": 0.45067334175109863, |
| "learning_rate": 4.935537190082645e-05, |
| "loss": 0.1445, |
| "step": 35250 |
| }, |
| { |
| "epoch": 0.5073643285507419, |
| "grad_norm": 0.39662429690361023, |
| "learning_rate": 4.928350700682717e-05, |
| "loss": 0.145, |
| "step": 35300 |
| }, |
| { |
| "epoch": 0.5080829749084624, |
| "grad_norm": 0.3792784512042999, |
| "learning_rate": 4.921164211282789e-05, |
| "loss": 0.1501, |
| "step": 35350 |
| }, |
| { |
| "epoch": 0.508801621266183, |
| "grad_norm": 0.5802880525588989, |
| "learning_rate": 4.91397772188286e-05, |
| "loss": 0.1479, |
| "step": 35400 |
| }, |
| { |
| "epoch": 0.5095202676239036, |
| "grad_norm": 0.27767547965049744, |
| "learning_rate": 4.906791232482933e-05, |
| "loss": 0.1466, |
| "step": 35450 |
| }, |
| { |
| "epoch": 0.5102389139816242, |
| "grad_norm": 0.32960420846939087, |
| "learning_rate": 4.8996047430830046e-05, |
| "loss": 0.1441, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.5109575603393448, |
| "grad_norm": 0.3106556832790375, |
| "learning_rate": 4.892418253683076e-05, |
| "loss": 0.1495, |
| "step": 35550 |
| }, |
| { |
| "epoch": 0.5116762066970654, |
| "grad_norm": 0.37083184719085693, |
| "learning_rate": 4.885231764283148e-05, |
| "loss": 0.1483, |
| "step": 35600 |
| }, |
| { |
| "epoch": 0.512394853054786, |
| "grad_norm": 0.3685917854309082, |
| "learning_rate": 4.87804527488322e-05, |
| "loss": 0.1476, |
| "step": 35650 |
| }, |
| { |
| "epoch": 0.5131134994125066, |
| "grad_norm": 0.4368564784526825, |
| "learning_rate": 4.8708587854832915e-05, |
| "loss": 0.1474, |
| "step": 35700 |
| }, |
| { |
| "epoch": 0.5138321457702272, |
| "grad_norm": 0.3055019676685333, |
| "learning_rate": 4.8636722960833635e-05, |
| "loss": 0.1421, |
| "step": 35750 |
| }, |
| { |
| "epoch": 0.5145507921279477, |
| "grad_norm": 0.4695027470588684, |
| "learning_rate": 4.8564858066834354e-05, |
| "loss": 0.1493, |
| "step": 35800 |
| }, |
| { |
| "epoch": 0.5152694384856684, |
| "grad_norm": 0.35940343141555786, |
| "learning_rate": 4.849299317283507e-05, |
| "loss": 0.1489, |
| "step": 35850 |
| }, |
| { |
| "epoch": 0.515988084843389, |
| "grad_norm": 0.37435460090637207, |
| "learning_rate": 4.842112827883579e-05, |
| "loss": 0.1475, |
| "step": 35900 |
| }, |
| { |
| "epoch": 0.5167067312011095, |
| "grad_norm": 0.35947975516319275, |
| "learning_rate": 4.8349263384836504e-05, |
| "loss": 0.1467, |
| "step": 35950 |
| }, |
| { |
| "epoch": 0.5174253775588302, |
| "grad_norm": 0.3311944007873535, |
| "learning_rate": 4.827739849083723e-05, |
| "loss": 0.1465, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.5174253775588302, |
| "eval_loss": 0.14617860317230225, |
| "eval_runtime": 2341.814, |
| "eval_samples_per_second": 25.019, |
| "eval_steps_per_second": 3.127, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.5181440239165508, |
| "grad_norm": 0.3348549008369446, |
| "learning_rate": 4.820553359683795e-05, |
| "loss": 0.1499, |
| "step": 36050 |
| }, |
| { |
| "epoch": 0.5188626702742714, |
| "grad_norm": 0.3845024108886719, |
| "learning_rate": 4.813366870283866e-05, |
| "loss": 0.1479, |
| "step": 36100 |
| }, |
| { |
| "epoch": 0.519581316631992, |
| "grad_norm": 0.3920484483242035, |
| "learning_rate": 4.806324110671937e-05, |
| "loss": 0.143, |
| "step": 36150 |
| }, |
| { |
| "epoch": 0.5202999629897126, |
| "grad_norm": 0.3186359405517578, |
| "learning_rate": 4.799137621272009e-05, |
| "loss": 0.1464, |
| "step": 36200 |
| }, |
| { |
| "epoch": 0.5210186093474332, |
| "grad_norm": 0.3029921054840088, |
| "learning_rate": 4.79195113187208e-05, |
| "loss": 0.1479, |
| "step": 36250 |
| }, |
| { |
| "epoch": 0.5217372557051537, |
| "grad_norm": 0.37382078170776367, |
| "learning_rate": 4.784764642472153e-05, |
| "loss": 0.1424, |
| "step": 36300 |
| }, |
| { |
| "epoch": 0.5224559020628744, |
| "grad_norm": 0.3550768792629242, |
| "learning_rate": 4.777578153072225e-05, |
| "loss": 0.142, |
| "step": 36350 |
| }, |
| { |
| "epoch": 0.523174548420595, |
| "grad_norm": 0.33143946528434753, |
| "learning_rate": 4.770391663672296e-05, |
| "loss": 0.1453, |
| "step": 36400 |
| }, |
| { |
| "epoch": 0.5238931947783155, |
| "grad_norm": 0.40416479110717773, |
| "learning_rate": 4.763205174272368e-05, |
| "loss": 0.1435, |
| "step": 36450 |
| }, |
| { |
| "epoch": 0.5246118411360362, |
| "grad_norm": 0.3630949854850769, |
| "learning_rate": 4.7560186848724405e-05, |
| "loss": 0.1464, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.5253304874937568, |
| "grad_norm": 0.3661783039569855, |
| "learning_rate": 4.748832195472512e-05, |
| "loss": 0.1428, |
| "step": 36550 |
| }, |
| { |
| "epoch": 0.5260491338514773, |
| "grad_norm": 0.38454943895339966, |
| "learning_rate": 4.7416457060725837e-05, |
| "loss": 0.1437, |
| "step": 36600 |
| }, |
| { |
| "epoch": 0.5267677802091979, |
| "grad_norm": 0.4658641517162323, |
| "learning_rate": 4.734459216672656e-05, |
| "loss": 0.1445, |
| "step": 36650 |
| }, |
| { |
| "epoch": 0.5274864265669186, |
| "grad_norm": 0.3436949551105499, |
| "learning_rate": 4.7272727272727275e-05, |
| "loss": 0.1446, |
| "step": 36700 |
| }, |
| { |
| "epoch": 0.5282050729246391, |
| "grad_norm": 0.34002795815467834, |
| "learning_rate": 4.7200862378727994e-05, |
| "loss": 0.1441, |
| "step": 36750 |
| }, |
| { |
| "epoch": 0.5289237192823597, |
| "grad_norm": 0.40276002883911133, |
| "learning_rate": 4.712899748472871e-05, |
| "loss": 0.1479, |
| "step": 36800 |
| }, |
| { |
| "epoch": 0.5296423656400804, |
| "grad_norm": 0.5200607776641846, |
| "learning_rate": 4.705713259072943e-05, |
| "loss": 0.1443, |
| "step": 36850 |
| }, |
| { |
| "epoch": 0.530361011997801, |
| "grad_norm": 0.46847838163375854, |
| "learning_rate": 4.698526769673015e-05, |
| "loss": 0.1416, |
| "step": 36900 |
| }, |
| { |
| "epoch": 0.5310796583555215, |
| "grad_norm": 0.4312261641025543, |
| "learning_rate": 4.6913402802730863e-05, |
| "loss": 0.1483, |
| "step": 36950 |
| }, |
| { |
| "epoch": 0.5317983047132422, |
| "grad_norm": 0.3275109827518463, |
| "learning_rate": 4.684153790873159e-05, |
| "loss": 0.1467, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.5325169510709628, |
| "grad_norm": 0.4498741030693054, |
| "learning_rate": 4.676967301473231e-05, |
| "loss": 0.146, |
| "step": 37050 |
| }, |
| { |
| "epoch": 0.5332355974286833, |
| "grad_norm": 0.3524952530860901, |
| "learning_rate": 4.669780812073302e-05, |
| "loss": 0.1437, |
| "step": 37100 |
| }, |
| { |
| "epoch": 0.5339542437864039, |
| "grad_norm": 0.4064757823944092, |
| "learning_rate": 4.662594322673374e-05, |
| "loss": 0.1462, |
| "step": 37150 |
| }, |
| { |
| "epoch": 0.5346728901441246, |
| "grad_norm": 0.3992777168750763, |
| "learning_rate": 4.6554078332734466e-05, |
| "loss": 0.1451, |
| "step": 37200 |
| }, |
| { |
| "epoch": 0.5353915365018451, |
| "grad_norm": 0.34734034538269043, |
| "learning_rate": 4.648221343873518e-05, |
| "loss": 0.1478, |
| "step": 37250 |
| }, |
| { |
| "epoch": 0.5361101828595657, |
| "grad_norm": 0.29704445600509644, |
| "learning_rate": 4.64103485447359e-05, |
| "loss": 0.1421, |
| "step": 37300 |
| }, |
| { |
| "epoch": 0.5368288292172864, |
| "grad_norm": 0.3567189574241638, |
| "learning_rate": 4.6338483650736616e-05, |
| "loss": 0.1462, |
| "step": 37350 |
| }, |
| { |
| "epoch": 0.5375474755750069, |
| "grad_norm": 0.49130842089653015, |
| "learning_rate": 4.6266618756737335e-05, |
| "loss": 0.146, |
| "step": 37400 |
| }, |
| { |
| "epoch": 0.5382661219327275, |
| "grad_norm": 0.44548994302749634, |
| "learning_rate": 4.6194753862738055e-05, |
| "loss": 0.1409, |
| "step": 37450 |
| }, |
| { |
| "epoch": 0.5389847682904481, |
| "grad_norm": 0.426877498626709, |
| "learning_rate": 4.6122888968738774e-05, |
| "loss": 0.1421, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.5397034146481687, |
| "grad_norm": 0.4113141596317291, |
| "learning_rate": 4.605102407473949e-05, |
| "loss": 0.1478, |
| "step": 37550 |
| }, |
| { |
| "epoch": 0.5404220610058893, |
| "grad_norm": 0.4359269440174103, |
| "learning_rate": 4.597915918074021e-05, |
| "loss": 0.1393, |
| "step": 37600 |
| }, |
| { |
| "epoch": 0.5411407073636099, |
| "grad_norm": 0.5452132821083069, |
| "learning_rate": 4.590729428674093e-05, |
| "loss": 0.1489, |
| "step": 37650 |
| }, |
| { |
| "epoch": 0.5418593537213305, |
| "grad_norm": 0.41318729519844055, |
| "learning_rate": 4.583542939274165e-05, |
| "loss": 0.1413, |
| "step": 37700 |
| }, |
| { |
| "epoch": 0.5425780000790511, |
| "grad_norm": 0.35488757491111755, |
| "learning_rate": 4.576356449874237e-05, |
| "loss": 0.1415, |
| "step": 37750 |
| }, |
| { |
| "epoch": 0.5432966464367717, |
| "grad_norm": 0.34059152007102966, |
| "learning_rate": 4.569169960474309e-05, |
| "loss": 0.1456, |
| "step": 37800 |
| }, |
| { |
| "epoch": 0.5440152927944923, |
| "grad_norm": 0.35003551840782166, |
| "learning_rate": 4.56198347107438e-05, |
| "loss": 0.1491, |
| "step": 37850 |
| }, |
| { |
| "epoch": 0.5447339391522129, |
| "grad_norm": 0.25030508637428284, |
| "learning_rate": 4.5547969816744526e-05, |
| "loss": 0.1429, |
| "step": 37900 |
| }, |
| { |
| "epoch": 0.5454525855099335, |
| "grad_norm": 0.3320457339286804, |
| "learning_rate": 4.547610492274524e-05, |
| "loss": 0.1458, |
| "step": 37950 |
| }, |
| { |
| "epoch": 0.546171231867654, |
| "grad_norm": 0.29865363240242004, |
| "learning_rate": 4.540424002874596e-05, |
| "loss": 0.1432, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.5468898782253747, |
| "grad_norm": 0.29522374272346497, |
| "learning_rate": 4.533237513474668e-05, |
| "loss": 0.1429, |
| "step": 38050 |
| }, |
| { |
| "epoch": 0.5476085245830953, |
| "grad_norm": 0.29843688011169434, |
| "learning_rate": 4.5260510240747396e-05, |
| "loss": 0.1457, |
| "step": 38100 |
| }, |
| { |
| "epoch": 0.5483271709408158, |
| "grad_norm": 0.42439109086990356, |
| "learning_rate": 4.5188645346748115e-05, |
| "loss": 0.1434, |
| "step": 38150 |
| }, |
| { |
| "epoch": 0.5490458172985365, |
| "grad_norm": 0.4063067138195038, |
| "learning_rate": 4.5116780452748834e-05, |
| "loss": 0.1432, |
| "step": 38200 |
| }, |
| { |
| "epoch": 0.5497644636562571, |
| "grad_norm": 0.39855289459228516, |
| "learning_rate": 4.5044915558749553e-05, |
| "loss": 0.1461, |
| "step": 38250 |
| }, |
| { |
| "epoch": 0.5504831100139777, |
| "grad_norm": 0.3334510624408722, |
| "learning_rate": 4.497305066475027e-05, |
| "loss": 0.1463, |
| "step": 38300 |
| }, |
| { |
| "epoch": 0.5512017563716982, |
| "grad_norm": 0.4082428812980652, |
| "learning_rate": 4.490118577075099e-05, |
| "loss": 0.1466, |
| "step": 38350 |
| }, |
| { |
| "epoch": 0.5519204027294189, |
| "grad_norm": 0.32873401045799255, |
| "learning_rate": 4.4829320876751704e-05, |
| "loss": 0.1445, |
| "step": 38400 |
| }, |
| { |
| "epoch": 0.5526390490871395, |
| "grad_norm": 0.31031402945518494, |
| "learning_rate": 4.475745598275243e-05, |
| "loss": 0.1433, |
| "step": 38450 |
| }, |
| { |
| "epoch": 0.55335769544486, |
| "grad_norm": 0.45551490783691406, |
| "learning_rate": 4.468559108875315e-05, |
| "loss": 0.1398, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.5540763418025807, |
| "grad_norm": 0.3071674406528473, |
| "learning_rate": 4.461372619475386e-05, |
| "loss": 0.1447, |
| "step": 38550 |
| }, |
| { |
| "epoch": 0.5547949881603013, |
| "grad_norm": 0.4662153422832489, |
| "learning_rate": 4.454186130075459e-05, |
| "loss": 0.1416, |
| "step": 38600 |
| }, |
| { |
| "epoch": 0.5555136345180218, |
| "grad_norm": 0.42972332239151, |
| "learning_rate": 4.4469996406755306e-05, |
| "loss": 0.1448, |
| "step": 38650 |
| }, |
| { |
| "epoch": 0.5562322808757425, |
| "grad_norm": 0.5573573112487793, |
| "learning_rate": 4.439813151275602e-05, |
| "loss": 0.1449, |
| "step": 38700 |
| }, |
| { |
| "epoch": 0.5569509272334631, |
| "grad_norm": 0.460609495639801, |
| "learning_rate": 4.432626661875674e-05, |
| "loss": 0.1413, |
| "step": 38750 |
| }, |
| { |
| "epoch": 0.5576695735911836, |
| "grad_norm": 0.410047322511673, |
| "learning_rate": 4.4254401724757464e-05, |
| "loss": 0.1417, |
| "step": 38800 |
| }, |
| { |
| "epoch": 0.5583882199489042, |
| "grad_norm": 0.31715336441993713, |
| "learning_rate": 4.4182536830758176e-05, |
| "loss": 0.1396, |
| "step": 38850 |
| }, |
| { |
| "epoch": 0.5591068663066249, |
| "grad_norm": 0.2956278324127197, |
| "learning_rate": 4.4110671936758895e-05, |
| "loss": 0.1428, |
| "step": 38900 |
| }, |
| { |
| "epoch": 0.5598255126643454, |
| "grad_norm": 0.48904120922088623, |
| "learning_rate": 4.4038807042759614e-05, |
| "loss": 0.1451, |
| "step": 38950 |
| }, |
| { |
| "epoch": 0.560544159022066, |
| "grad_norm": 0.4974726140499115, |
| "learning_rate": 4.396694214876033e-05, |
| "loss": 0.141, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.560544159022066, |
| "eval_loss": 0.14253196120262146, |
| "eval_runtime": 2331.8967, |
| "eval_samples_per_second": 25.125, |
| "eval_steps_per_second": 3.141, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.5612628053797867, |
| "grad_norm": 0.386654794216156, |
| "learning_rate": 4.389507725476105e-05, |
| "loss": 0.1405, |
| "step": 39050 |
| }, |
| { |
| "epoch": 0.5619814517375072, |
| "grad_norm": 0.5421323776245117, |
| "learning_rate": 4.3823212360761765e-05, |
| "loss": 0.1433, |
| "step": 39100 |
| }, |
| { |
| "epoch": 0.5627000980952278, |
| "grad_norm": 0.3224097788333893, |
| "learning_rate": 4.375134746676249e-05, |
| "loss": 0.1406, |
| "step": 39150 |
| }, |
| { |
| "epoch": 0.5634187444529484, |
| "grad_norm": 0.29007741808891296, |
| "learning_rate": 4.367948257276321e-05, |
| "loss": 0.1444, |
| "step": 39200 |
| }, |
| { |
| "epoch": 0.564137390810669, |
| "grad_norm": 0.3608921468257904, |
| "learning_rate": 4.360761767876392e-05, |
| "loss": 0.1391, |
| "step": 39250 |
| }, |
| { |
| "epoch": 0.5648560371683896, |
| "grad_norm": 0.37967294454574585, |
| "learning_rate": 4.353575278476464e-05, |
| "loss": 0.1382, |
| "step": 39300 |
| }, |
| { |
| "epoch": 0.5655746835261102, |
| "grad_norm": 0.3170875310897827, |
| "learning_rate": 4.346388789076537e-05, |
| "loss": 0.1406, |
| "step": 39350 |
| }, |
| { |
| "epoch": 0.5662933298838309, |
| "grad_norm": 0.2862553596496582, |
| "learning_rate": 4.339202299676608e-05, |
| "loss": 0.1452, |
| "step": 39400 |
| }, |
| { |
| "epoch": 0.5670119762415514, |
| "grad_norm": 0.37836143374443054, |
| "learning_rate": 4.33201581027668e-05, |
| "loss": 0.1476, |
| "step": 39450 |
| }, |
| { |
| "epoch": 0.567730622599272, |
| "grad_norm": 0.4403735399246216, |
| "learning_rate": 4.324829320876752e-05, |
| "loss": 0.1419, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.5684492689569927, |
| "grad_norm": 0.2952041029930115, |
| "learning_rate": 4.3176428314768237e-05, |
| "loss": 0.1399, |
| "step": 39550 |
| }, |
| { |
| "epoch": 0.5691679153147132, |
| "grad_norm": 0.3520998954772949, |
| "learning_rate": 4.3104563420768956e-05, |
| "loss": 0.1414, |
| "step": 39600 |
| }, |
| { |
| "epoch": 0.5698865616724338, |
| "grad_norm": 0.667454719543457, |
| "learning_rate": 4.3032698526769675e-05, |
| "loss": 0.1414, |
| "step": 39650 |
| }, |
| { |
| "epoch": 0.5706052080301544, |
| "grad_norm": 0.37446698546409607, |
| "learning_rate": 4.2960833632770394e-05, |
| "loss": 0.1436, |
| "step": 39700 |
| }, |
| { |
| "epoch": 0.571323854387875, |
| "grad_norm": 0.46900761127471924, |
| "learning_rate": 4.288896873877111e-05, |
| "loss": 0.1414, |
| "step": 39750 |
| }, |
| { |
| "epoch": 0.5720425007455956, |
| "grad_norm": 0.3684402406215668, |
| "learning_rate": 4.281710384477183e-05, |
| "loss": 0.1447, |
| "step": 39800 |
| }, |
| { |
| "epoch": 0.5727611471033162, |
| "grad_norm": 0.3427809774875641, |
| "learning_rate": 4.274523895077255e-05, |
| "loss": 0.1414, |
| "step": 39850 |
| }, |
| { |
| "epoch": 0.5734797934610368, |
| "grad_norm": 0.3241630494594574, |
| "learning_rate": 4.267337405677327e-05, |
| "loss": 0.1397, |
| "step": 39900 |
| }, |
| { |
| "epoch": 0.5741984398187574, |
| "grad_norm": 0.3302249610424042, |
| "learning_rate": 4.260150916277399e-05, |
| "loss": 0.1412, |
| "step": 39950 |
| }, |
| { |
| "epoch": 0.574917086176478, |
| "grad_norm": 0.5053568482398987, |
| "learning_rate": 4.25296442687747e-05, |
| "loss": 0.1432, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.5756357325341985, |
| "grad_norm": 0.42272675037384033, |
| "learning_rate": 4.245777937477543e-05, |
| "loss": 0.146, |
| "step": 40050 |
| }, |
| { |
| "epoch": 0.5763543788919192, |
| "grad_norm": 0.3637758493423462, |
| "learning_rate": 4.238591448077615e-05, |
| "loss": 0.1418, |
| "step": 40100 |
| }, |
| { |
| "epoch": 0.5770730252496398, |
| "grad_norm": 0.4027983248233795, |
| "learning_rate": 4.231404958677686e-05, |
| "loss": 0.1414, |
| "step": 40150 |
| }, |
| { |
| "epoch": 0.5777916716073603, |
| "grad_norm": 0.2862512171268463, |
| "learning_rate": 4.224362199065757e-05, |
| "loss": 0.1416, |
| "step": 40200 |
| }, |
| { |
| "epoch": 0.578510317965081, |
| "grad_norm": 0.49394193291664124, |
| "learning_rate": 4.217175709665828e-05, |
| "loss": 0.1415, |
| "step": 40250 |
| }, |
| { |
| "epoch": 0.5792289643228016, |
| "grad_norm": 0.3553609549999237, |
| "learning_rate": 4.2099892202659e-05, |
| "loss": 0.1432, |
| "step": 40300 |
| }, |
| { |
| "epoch": 0.5799476106805221, |
| "grad_norm": 0.5765389204025269, |
| "learning_rate": 4.2028027308659726e-05, |
| "loss": 0.1437, |
| "step": 40350 |
| }, |
| { |
| "epoch": 0.5806662570382428, |
| "grad_norm": 0.376592755317688, |
| "learning_rate": 4.195616241466044e-05, |
| "loss": 0.1406, |
| "step": 40400 |
| }, |
| { |
| "epoch": 0.5813849033959634, |
| "grad_norm": 0.30451223254203796, |
| "learning_rate": 4.188429752066116e-05, |
| "loss": 0.1447, |
| "step": 40450 |
| }, |
| { |
| "epoch": 0.582103549753684, |
| "grad_norm": 0.40569472312927246, |
| "learning_rate": 4.181243262666188e-05, |
| "loss": 0.1475, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.5828221961114045, |
| "grad_norm": 0.5457615256309509, |
| "learning_rate": 4.1740567732662596e-05, |
| "loss": 0.1444, |
| "step": 40550 |
| }, |
| { |
| "epoch": 0.5835408424691252, |
| "grad_norm": 0.30570656061172485, |
| "learning_rate": 4.1668702838663315e-05, |
| "loss": 0.138, |
| "step": 40600 |
| }, |
| { |
| "epoch": 0.5842594888268458, |
| "grad_norm": 0.3235516846179962, |
| "learning_rate": 4.1596837944664034e-05, |
| "loss": 0.1429, |
| "step": 40650 |
| }, |
| { |
| "epoch": 0.5849781351845663, |
| "grad_norm": 0.3158193826675415, |
| "learning_rate": 4.152497305066475e-05, |
| "loss": 0.1411, |
| "step": 40700 |
| }, |
| { |
| "epoch": 0.585696781542287, |
| "grad_norm": 0.33914652466773987, |
| "learning_rate": 4.145310815666547e-05, |
| "loss": 0.1396, |
| "step": 40750 |
| }, |
| { |
| "epoch": 0.5864154279000076, |
| "grad_norm": 0.38597750663757324, |
| "learning_rate": 4.138124326266619e-05, |
| "loss": 0.1393, |
| "step": 40800 |
| }, |
| { |
| "epoch": 0.5871340742577281, |
| "grad_norm": 0.5863543152809143, |
| "learning_rate": 4.1309378368666904e-05, |
| "loss": 0.1418, |
| "step": 40850 |
| }, |
| { |
| "epoch": 0.5878527206154487, |
| "grad_norm": 0.44041261076927185, |
| "learning_rate": 4.123751347466763e-05, |
| "loss": 0.1407, |
| "step": 40900 |
| }, |
| { |
| "epoch": 0.5885713669731694, |
| "grad_norm": 0.41702163219451904, |
| "learning_rate": 4.116564858066835e-05, |
| "loss": 0.1406, |
| "step": 40950 |
| }, |
| { |
| "epoch": 0.5892900133308899, |
| "grad_norm": 0.304452121257782, |
| "learning_rate": 4.109378368666906e-05, |
| "loss": 0.1457, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.5900086596886105, |
| "grad_norm": 0.3917398452758789, |
| "learning_rate": 4.102191879266978e-05, |
| "loss": 0.1429, |
| "step": 41050 |
| }, |
| { |
| "epoch": 0.5907273060463312, |
| "grad_norm": 0.34760087728500366, |
| "learning_rate": 4.0950053898670506e-05, |
| "loss": 0.1384, |
| "step": 41100 |
| }, |
| { |
| "epoch": 0.5914459524040517, |
| "grad_norm": 0.36742857098579407, |
| "learning_rate": 4.087818900467122e-05, |
| "loss": 0.1386, |
| "step": 41150 |
| }, |
| { |
| "epoch": 0.5921645987617723, |
| "grad_norm": 0.31853848695755005, |
| "learning_rate": 4.080632411067194e-05, |
| "loss": 0.1363, |
| "step": 41200 |
| }, |
| { |
| "epoch": 0.592883245119493, |
| "grad_norm": 0.5090049505233765, |
| "learning_rate": 4.0734459216672656e-05, |
| "loss": 0.1372, |
| "step": 41250 |
| }, |
| { |
| "epoch": 0.5936018914772135, |
| "grad_norm": 0.38126641511917114, |
| "learning_rate": 4.0662594322673376e-05, |
| "loss": 0.1389, |
| "step": 41300 |
| }, |
| { |
| "epoch": 0.5943205378349341, |
| "grad_norm": 0.36152946949005127, |
| "learning_rate": 4.0590729428674095e-05, |
| "loss": 0.141, |
| "step": 41350 |
| }, |
| { |
| "epoch": 0.5950391841926547, |
| "grad_norm": 0.29812654852867126, |
| "learning_rate": 4.0518864534674814e-05, |
| "loss": 0.1395, |
| "step": 41400 |
| }, |
| { |
| "epoch": 0.5957578305503753, |
| "grad_norm": 0.38680991530418396, |
| "learning_rate": 4.044699964067553e-05, |
| "loss": 0.1411, |
| "step": 41450 |
| }, |
| { |
| "epoch": 0.5964764769080959, |
| "grad_norm": 0.4588952958583832, |
| "learning_rate": 4.037513474667625e-05, |
| "loss": 0.141, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.5971951232658165, |
| "grad_norm": 0.34070587158203125, |
| "learning_rate": 4.0303269852676964e-05, |
| "loss": 0.1402, |
| "step": 41550 |
| }, |
| { |
| "epoch": 0.5979137696235372, |
| "grad_norm": 0.4279496371746063, |
| "learning_rate": 4.023140495867769e-05, |
| "loss": 0.1407, |
| "step": 41600 |
| }, |
| { |
| "epoch": 0.5986324159812577, |
| "grad_norm": 0.3747555911540985, |
| "learning_rate": 4.015954006467841e-05, |
| "loss": 0.1388, |
| "step": 41650 |
| }, |
| { |
| "epoch": 0.5993510623389783, |
| "grad_norm": 0.46703338623046875, |
| "learning_rate": 4.008767517067912e-05, |
| "loss": 0.1444, |
| "step": 41700 |
| }, |
| { |
| "epoch": 0.6000697086966988, |
| "grad_norm": 0.2871130704879761, |
| "learning_rate": 4.001581027667984e-05, |
| "loss": 0.1397, |
| "step": 41750 |
| }, |
| { |
| "epoch": 0.6007883550544195, |
| "grad_norm": 0.5842506289482117, |
| "learning_rate": 3.994394538268057e-05, |
| "loss": 0.1393, |
| "step": 41800 |
| }, |
| { |
| "epoch": 0.6015070014121401, |
| "grad_norm": 0.5202719569206238, |
| "learning_rate": 3.987208048868128e-05, |
| "loss": 0.1407, |
| "step": 41850 |
| }, |
| { |
| "epoch": 0.6022256477698607, |
| "grad_norm": 0.39580488204956055, |
| "learning_rate": 3.9800215594682e-05, |
| "loss": 0.1425, |
| "step": 41900 |
| }, |
| { |
| "epoch": 0.6029442941275813, |
| "grad_norm": 0.35014522075653076, |
| "learning_rate": 3.972835070068272e-05, |
| "loss": 0.1419, |
| "step": 41950 |
| }, |
| { |
| "epoch": 0.6036629404853019, |
| "grad_norm": 0.4798388183116913, |
| "learning_rate": 3.9656485806683436e-05, |
| "loss": 0.1377, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.6036629404853019, |
| "eval_loss": 0.1396327018737793, |
| "eval_runtime": 2335.2661, |
| "eval_samples_per_second": 25.089, |
| "eval_steps_per_second": 3.136, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.6043815868430225, |
| "grad_norm": 0.5069937109947205, |
| "learning_rate": 3.9584620912684155e-05, |
| "loss": 0.1381, |
| "step": 42050 |
| }, |
| { |
| "epoch": 0.6051002332007431, |
| "grad_norm": 0.423450767993927, |
| "learning_rate": 3.9512756018684874e-05, |
| "loss": 0.1401, |
| "step": 42100 |
| }, |
| { |
| "epoch": 0.6058188795584637, |
| "grad_norm": 0.3367474675178528, |
| "learning_rate": 3.9440891124685594e-05, |
| "loss": 0.1389, |
| "step": 42150 |
| }, |
| { |
| "epoch": 0.6065375259161843, |
| "grad_norm": 0.33142003417015076, |
| "learning_rate": 3.936902623068631e-05, |
| "loss": 0.1403, |
| "step": 42200 |
| }, |
| { |
| "epoch": 0.6072561722739048, |
| "grad_norm": 0.4653840959072113, |
| "learning_rate": 3.929716133668703e-05, |
| "loss": 0.141, |
| "step": 42250 |
| }, |
| { |
| "epoch": 0.6079748186316255, |
| "grad_norm": 0.48399651050567627, |
| "learning_rate": 3.922529644268775e-05, |
| "loss": 0.1346, |
| "step": 42300 |
| }, |
| { |
| "epoch": 0.6086934649893461, |
| "grad_norm": 0.39922240376472473, |
| "learning_rate": 3.9154868846568454e-05, |
| "loss": 0.1397, |
| "step": 42350 |
| }, |
| { |
| "epoch": 0.6094121113470666, |
| "grad_norm": 0.370822012424469, |
| "learning_rate": 3.9083003952569166e-05, |
| "loss": 0.1442, |
| "step": 42400 |
| }, |
| { |
| "epoch": 0.6101307577047873, |
| "grad_norm": 0.33353352546691895, |
| "learning_rate": 3.901113905856989e-05, |
| "loss": 0.1382, |
| "step": 42450 |
| }, |
| { |
| "epoch": 0.6108494040625079, |
| "grad_norm": 0.34602826833724976, |
| "learning_rate": 3.893927416457061e-05, |
| "loss": 0.1376, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.6115680504202284, |
| "grad_norm": 0.40028107166290283, |
| "learning_rate": 3.8867409270571324e-05, |
| "loss": 0.137, |
| "step": 42550 |
| }, |
| { |
| "epoch": 0.612286696777949, |
| "grad_norm": 0.37151163816452026, |
| "learning_rate": 3.879554437657205e-05, |
| "loss": 0.1386, |
| "step": 42600 |
| }, |
| { |
| "epoch": 0.6130053431356697, |
| "grad_norm": 0.33360013365745544, |
| "learning_rate": 3.872367948257277e-05, |
| "loss": 0.1396, |
| "step": 42650 |
| }, |
| { |
| "epoch": 0.6137239894933902, |
| "grad_norm": 0.3027796447277069, |
| "learning_rate": 3.865181458857348e-05, |
| "loss": 0.1396, |
| "step": 42700 |
| }, |
| { |
| "epoch": 0.6144426358511108, |
| "grad_norm": 0.5223707556724548, |
| "learning_rate": 3.85799496945742e-05, |
| "loss": 0.1408, |
| "step": 42750 |
| }, |
| { |
| "epoch": 0.6151612822088315, |
| "grad_norm": 0.34969398379325867, |
| "learning_rate": 3.8508084800574926e-05, |
| "loss": 0.1403, |
| "step": 42800 |
| }, |
| { |
| "epoch": 0.615879928566552, |
| "grad_norm": 0.38828930258750916, |
| "learning_rate": 3.843621990657564e-05, |
| "loss": 0.1351, |
| "step": 42850 |
| }, |
| { |
| "epoch": 0.6165985749242726, |
| "grad_norm": 0.43073561787605286, |
| "learning_rate": 3.836435501257636e-05, |
| "loss": 0.1389, |
| "step": 42900 |
| }, |
| { |
| "epoch": 0.6173172212819933, |
| "grad_norm": 0.3976250886917114, |
| "learning_rate": 3.8292490118577076e-05, |
| "loss": 0.1389, |
| "step": 42950 |
| }, |
| { |
| "epoch": 0.6180358676397139, |
| "grad_norm": 0.37994539737701416, |
| "learning_rate": 3.8220625224577796e-05, |
| "loss": 0.1396, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.6187545139974344, |
| "grad_norm": 0.44032207131385803, |
| "learning_rate": 3.8148760330578515e-05, |
| "loss": 0.1407, |
| "step": 43050 |
| }, |
| { |
| "epoch": 0.619473160355155, |
| "grad_norm": 0.5004963278770447, |
| "learning_rate": 3.8076895436579234e-05, |
| "loss": 0.1444, |
| "step": 43100 |
| }, |
| { |
| "epoch": 0.6201918067128757, |
| "grad_norm": 0.43479058146476746, |
| "learning_rate": 3.800503054257995e-05, |
| "loss": 0.1394, |
| "step": 43150 |
| }, |
| { |
| "epoch": 0.6209104530705962, |
| "grad_norm": 0.3528944253921509, |
| "learning_rate": 3.793316564858067e-05, |
| "loss": 0.1366, |
| "step": 43200 |
| }, |
| { |
| "epoch": 0.6216290994283168, |
| "grad_norm": 0.3285259008407593, |
| "learning_rate": 3.786130075458139e-05, |
| "loss": 0.1386, |
| "step": 43250 |
| }, |
| { |
| "epoch": 0.6223477457860375, |
| "grad_norm": 0.38898158073425293, |
| "learning_rate": 3.77894358605821e-05, |
| "loss": 0.1418, |
| "step": 43300 |
| }, |
| { |
| "epoch": 0.623066392143758, |
| "grad_norm": 0.32555800676345825, |
| "learning_rate": 3.771757096658283e-05, |
| "loss": 0.1368, |
| "step": 43350 |
| }, |
| { |
| "epoch": 0.6237850385014786, |
| "grad_norm": 0.46589428186416626, |
| "learning_rate": 3.764570607258354e-05, |
| "loss": 0.1358, |
| "step": 43400 |
| }, |
| { |
| "epoch": 0.6245036848591992, |
| "grad_norm": 0.4179432988166809, |
| "learning_rate": 3.757384117858426e-05, |
| "loss": 0.1352, |
| "step": 43450 |
| }, |
| { |
| "epoch": 0.6252223312169198, |
| "grad_norm": 0.32889196276664734, |
| "learning_rate": 3.750197628458498e-05, |
| "loss": 0.1378, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.6259409775746404, |
| "grad_norm": 0.5279363393783569, |
| "learning_rate": 3.74301113905857e-05, |
| "loss": 0.1368, |
| "step": 43550 |
| }, |
| { |
| "epoch": 0.626659623932361, |
| "grad_norm": 0.36462467908859253, |
| "learning_rate": 3.735824649658642e-05, |
| "loss": 0.1381, |
| "step": 43600 |
| }, |
| { |
| "epoch": 0.6273782702900816, |
| "grad_norm": 0.333006888628006, |
| "learning_rate": 3.728638160258714e-05, |
| "loss": 0.137, |
| "step": 43650 |
| }, |
| { |
| "epoch": 0.6280969166478022, |
| "grad_norm": 0.3400765359401703, |
| "learning_rate": 3.7214516708587856e-05, |
| "loss": 0.139, |
| "step": 43700 |
| }, |
| { |
| "epoch": 0.6288155630055228, |
| "grad_norm": 0.4929637908935547, |
| "learning_rate": 3.7142651814588575e-05, |
| "loss": 0.1375, |
| "step": 43750 |
| }, |
| { |
| "epoch": 0.6295342093632434, |
| "grad_norm": 0.3783525824546814, |
| "learning_rate": 3.7070786920589294e-05, |
| "loss": 0.1383, |
| "step": 43800 |
| }, |
| { |
| "epoch": 0.630252855720964, |
| "grad_norm": 0.3588646650314331, |
| "learning_rate": 3.6998922026590014e-05, |
| "loss": 0.1383, |
| "step": 43850 |
| }, |
| { |
| "epoch": 0.6309715020786846, |
| "grad_norm": 0.2712932825088501, |
| "learning_rate": 3.692705713259073e-05, |
| "loss": 0.1346, |
| "step": 43900 |
| }, |
| { |
| "epoch": 0.6316901484364051, |
| "grad_norm": 0.3558711111545563, |
| "learning_rate": 3.685519223859145e-05, |
| "loss": 0.141, |
| "step": 43950 |
| }, |
| { |
| "epoch": 0.6324087947941258, |
| "grad_norm": 0.3488093912601471, |
| "learning_rate": 3.6783327344592164e-05, |
| "loss": 0.1391, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.6331274411518464, |
| "grad_norm": 0.409156858921051, |
| "learning_rate": 3.671146245059289e-05, |
| "loss": 0.137, |
| "step": 44050 |
| }, |
| { |
| "epoch": 0.633846087509567, |
| "grad_norm": 0.3519936800003052, |
| "learning_rate": 3.663959755659361e-05, |
| "loss": 0.1353, |
| "step": 44100 |
| }, |
| { |
| "epoch": 0.6345647338672876, |
| "grad_norm": 0.35567376017570496, |
| "learning_rate": 3.656773266259432e-05, |
| "loss": 0.1394, |
| "step": 44150 |
| }, |
| { |
| "epoch": 0.6352833802250082, |
| "grad_norm": 0.26964500546455383, |
| "learning_rate": 3.649586776859504e-05, |
| "loss": 0.1335, |
| "step": 44200 |
| }, |
| { |
| "epoch": 0.6360020265827288, |
| "grad_norm": 0.3359646797180176, |
| "learning_rate": 3.6424002874595766e-05, |
| "loss": 0.1422, |
| "step": 44250 |
| }, |
| { |
| "epoch": 0.6367206729404493, |
| "grad_norm": 0.31784096360206604, |
| "learning_rate": 3.635213798059648e-05, |
| "loss": 0.1379, |
| "step": 44300 |
| }, |
| { |
| "epoch": 0.63743931929817, |
| "grad_norm": 0.3648991286754608, |
| "learning_rate": 3.628171038447719e-05, |
| "loss": 0.1342, |
| "step": 44350 |
| }, |
| { |
| "epoch": 0.6381579656558906, |
| "grad_norm": 0.413467675447464, |
| "learning_rate": 3.62098454904779e-05, |
| "loss": 0.1381, |
| "step": 44400 |
| }, |
| { |
| "epoch": 0.6388766120136111, |
| "grad_norm": 0.5211649537086487, |
| "learning_rate": 3.613798059647862e-05, |
| "loss": 0.1408, |
| "step": 44450 |
| }, |
| { |
| "epoch": 0.6395952583713318, |
| "grad_norm": 0.358275830745697, |
| "learning_rate": 3.606611570247934e-05, |
| "loss": 0.1352, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.6403139047290524, |
| "grad_norm": 0.38519778847694397, |
| "learning_rate": 3.599425080848006e-05, |
| "loss": 0.1439, |
| "step": 44550 |
| }, |
| { |
| "epoch": 0.6410325510867729, |
| "grad_norm": 0.3506108820438385, |
| "learning_rate": 3.592238591448078e-05, |
| "loss": 0.1408, |
| "step": 44600 |
| }, |
| { |
| "epoch": 0.6417511974444936, |
| "grad_norm": 0.32862502336502075, |
| "learning_rate": 3.5850521020481496e-05, |
| "loss": 0.1375, |
| "step": 44650 |
| }, |
| { |
| "epoch": 0.6424698438022142, |
| "grad_norm": 0.34333035349845886, |
| "learning_rate": 3.5778656126482215e-05, |
| "loss": 0.1333, |
| "step": 44700 |
| }, |
| { |
| "epoch": 0.6431884901599347, |
| "grad_norm": 0.33827170729637146, |
| "learning_rate": 3.5706791232482935e-05, |
| "loss": 0.137, |
| "step": 44750 |
| }, |
| { |
| "epoch": 0.6439071365176553, |
| "grad_norm": 0.5813308954238892, |
| "learning_rate": 3.5634926338483654e-05, |
| "loss": 0.1375, |
| "step": 44800 |
| }, |
| { |
| "epoch": 0.644625782875376, |
| "grad_norm": 0.4128280580043793, |
| "learning_rate": 3.5563061444484366e-05, |
| "loss": 0.1377, |
| "step": 44850 |
| }, |
| { |
| "epoch": 0.6453444292330965, |
| "grad_norm": 0.33010971546173096, |
| "learning_rate": 3.549119655048509e-05, |
| "loss": 0.138, |
| "step": 44900 |
| }, |
| { |
| "epoch": 0.6460630755908171, |
| "grad_norm": 0.33612126111984253, |
| "learning_rate": 3.541933165648581e-05, |
| "loss": 0.1385, |
| "step": 44950 |
| }, |
| { |
| "epoch": 0.6467817219485378, |
| "grad_norm": 0.5942758321762085, |
| "learning_rate": 3.534746676248652e-05, |
| "loss": 0.1343, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.6467817219485378, |
| "eval_loss": 0.13751016557216644, |
| "eval_runtime": 2334.0416, |
| "eval_samples_per_second": 25.102, |
| "eval_steps_per_second": 3.138, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.6475003683062583, |
| "grad_norm": 0.49251455068588257, |
| "learning_rate": 3.527560186848724e-05, |
| "loss": 0.1368, |
| "step": 45050 |
| }, |
| { |
| "epoch": 0.6482190146639789, |
| "grad_norm": 0.306601345539093, |
| "learning_rate": 3.520373697448797e-05, |
| "loss": 0.1411, |
| "step": 45100 |
| }, |
| { |
| "epoch": 0.6489376610216995, |
| "grad_norm": 0.45201998949050903, |
| "learning_rate": 3.513187208048868e-05, |
| "loss": 0.1339, |
| "step": 45150 |
| }, |
| { |
| "epoch": 0.6496563073794202, |
| "grad_norm": 0.3716999590396881, |
| "learning_rate": 3.50600071864894e-05, |
| "loss": 0.1317, |
| "step": 45200 |
| }, |
| { |
| "epoch": 0.6503749537371407, |
| "grad_norm": 0.3443935811519623, |
| "learning_rate": 3.4988142292490126e-05, |
| "loss": 0.141, |
| "step": 45250 |
| }, |
| { |
| "epoch": 0.6510936000948613, |
| "grad_norm": 0.4641805589199066, |
| "learning_rate": 3.491627739849084e-05, |
| "loss": 0.1341, |
| "step": 45300 |
| }, |
| { |
| "epoch": 0.651812246452582, |
| "grad_norm": 0.5612492561340332, |
| "learning_rate": 3.484441250449156e-05, |
| "loss": 0.1433, |
| "step": 45350 |
| }, |
| { |
| "epoch": 0.6525308928103025, |
| "grad_norm": 0.36323145031929016, |
| "learning_rate": 3.4772547610492276e-05, |
| "loss": 0.1383, |
| "step": 45400 |
| }, |
| { |
| "epoch": 0.6532495391680231, |
| "grad_norm": 0.40366289019584656, |
| "learning_rate": 3.4700682716492995e-05, |
| "loss": 0.1363, |
| "step": 45450 |
| }, |
| { |
| "epoch": 0.6539681855257438, |
| "grad_norm": 0.318117618560791, |
| "learning_rate": 3.4628817822493714e-05, |
| "loss": 0.1347, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.6546868318834643, |
| "grad_norm": 0.43462902307510376, |
| "learning_rate": 3.4556952928494433e-05, |
| "loss": 0.1319, |
| "step": 45550 |
| }, |
| { |
| "epoch": 0.6554054782411849, |
| "grad_norm": 0.4332530200481415, |
| "learning_rate": 3.448508803449515e-05, |
| "loss": 0.1404, |
| "step": 45600 |
| }, |
| { |
| "epoch": 0.6561241245989055, |
| "grad_norm": 0.506473183631897, |
| "learning_rate": 3.441322314049587e-05, |
| "loss": 0.1366, |
| "step": 45650 |
| }, |
| { |
| "epoch": 0.6568427709566261, |
| "grad_norm": 0.3314126431941986, |
| "learning_rate": 3.4341358246496584e-05, |
| "loss": 0.1362, |
| "step": 45700 |
| }, |
| { |
| "epoch": 0.6575614173143467, |
| "grad_norm": 0.32727667689323425, |
| "learning_rate": 3.42694933524973e-05, |
| "loss": 0.1342, |
| "step": 45750 |
| }, |
| { |
| "epoch": 0.6582800636720673, |
| "grad_norm": 0.39728379249572754, |
| "learning_rate": 3.419762845849803e-05, |
| "loss": 0.135, |
| "step": 45800 |
| }, |
| { |
| "epoch": 0.6589987100297879, |
| "grad_norm": 0.4866426885128021, |
| "learning_rate": 3.412576356449874e-05, |
| "loss": 0.1346, |
| "step": 45850 |
| }, |
| { |
| "epoch": 0.6597173563875085, |
| "grad_norm": 0.47908952832221985, |
| "learning_rate": 3.405533596837945e-05, |
| "loss": 0.1378, |
| "step": 45900 |
| }, |
| { |
| "epoch": 0.6604360027452291, |
| "grad_norm": 0.490915983915329, |
| "learning_rate": 3.398347107438017e-05, |
| "loss": 0.1354, |
| "step": 45950 |
| }, |
| { |
| "epoch": 0.6611546491029496, |
| "grad_norm": 0.32982251048088074, |
| "learning_rate": 3.391160618038088e-05, |
| "loss": 0.1329, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.6618732954606703, |
| "grad_norm": 0.2903765141963959, |
| "learning_rate": 3.38397412863816e-05, |
| "loss": 0.1352, |
| "step": 46050 |
| }, |
| { |
| "epoch": 0.6625919418183909, |
| "grad_norm": 0.45338502526283264, |
| "learning_rate": 3.376787639238233e-05, |
| "loss": 0.1393, |
| "step": 46100 |
| }, |
| { |
| "epoch": 0.6633105881761114, |
| "grad_norm": 0.5581700801849365, |
| "learning_rate": 3.369601149838304e-05, |
| "loss": 0.1392, |
| "step": 46150 |
| }, |
| { |
| "epoch": 0.6640292345338321, |
| "grad_norm": 0.3929944932460785, |
| "learning_rate": 3.362414660438376e-05, |
| "loss": 0.135, |
| "step": 46200 |
| }, |
| { |
| "epoch": 0.6647478808915527, |
| "grad_norm": 0.3831471800804138, |
| "learning_rate": 3.355228171038448e-05, |
| "loss": 0.1374, |
| "step": 46250 |
| }, |
| { |
| "epoch": 0.6654665272492732, |
| "grad_norm": 0.319807231426239, |
| "learning_rate": 3.34804168163852e-05, |
| "loss": 0.1327, |
| "step": 46300 |
| }, |
| { |
| "epoch": 0.6661851736069939, |
| "grad_norm": 0.33727338910102844, |
| "learning_rate": 3.3408551922385916e-05, |
| "loss": 0.1338, |
| "step": 46350 |
| }, |
| { |
| "epoch": 0.6669038199647145, |
| "grad_norm": 0.37149056792259216, |
| "learning_rate": 3.3336687028386635e-05, |
| "loss": 0.1407, |
| "step": 46400 |
| }, |
| { |
| "epoch": 0.667622466322435, |
| "grad_norm": 0.29155993461608887, |
| "learning_rate": 3.3264822134387355e-05, |
| "loss": 0.1391, |
| "step": 46450 |
| }, |
| { |
| "epoch": 0.6683411126801556, |
| "grad_norm": 0.32874971628189087, |
| "learning_rate": 3.3192957240388074e-05, |
| "loss": 0.137, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.6690597590378763, |
| "grad_norm": 0.4324096143245697, |
| "learning_rate": 3.312109234638879e-05, |
| "loss": 0.1357, |
| "step": 46550 |
| }, |
| { |
| "epoch": 0.6697784053955969, |
| "grad_norm": 0.3491309881210327, |
| "learning_rate": 3.304922745238951e-05, |
| "loss": 0.1371, |
| "step": 46600 |
| }, |
| { |
| "epoch": 0.6704970517533174, |
| "grad_norm": 0.3900867998600006, |
| "learning_rate": 3.297736255839023e-05, |
| "loss": 0.1374, |
| "step": 46650 |
| }, |
| { |
| "epoch": 0.6712156981110381, |
| "grad_norm": 0.41907286643981934, |
| "learning_rate": 3.290549766439094e-05, |
| "loss": 0.1342, |
| "step": 46700 |
| }, |
| { |
| "epoch": 0.6719343444687587, |
| "grad_norm": 0.3794206380844116, |
| "learning_rate": 3.283363277039166e-05, |
| "loss": 0.1372, |
| "step": 46750 |
| }, |
| { |
| "epoch": 0.6726529908264792, |
| "grad_norm": 0.3382868468761444, |
| "learning_rate": 3.276176787639239e-05, |
| "loss": 0.1349, |
| "step": 46800 |
| }, |
| { |
| "epoch": 0.6733716371841998, |
| "grad_norm": 0.3015630543231964, |
| "learning_rate": 3.26899029823931e-05, |
| "loss": 0.1379, |
| "step": 46850 |
| }, |
| { |
| "epoch": 0.6740902835419205, |
| "grad_norm": 0.40960338711738586, |
| "learning_rate": 3.261803808839382e-05, |
| "loss": 0.1324, |
| "step": 46900 |
| }, |
| { |
| "epoch": 0.674808929899641, |
| "grad_norm": 0.30875468254089355, |
| "learning_rate": 3.254617319439454e-05, |
| "loss": 0.1323, |
| "step": 46950 |
| }, |
| { |
| "epoch": 0.6755275762573616, |
| "grad_norm": 0.35867223143577576, |
| "learning_rate": 3.247430830039526e-05, |
| "loss": 0.1367, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.6762462226150823, |
| "grad_norm": 0.3085405230522156, |
| "learning_rate": 3.240244340639598e-05, |
| "loss": 0.1402, |
| "step": 47050 |
| }, |
| { |
| "epoch": 0.6769648689728028, |
| "grad_norm": 0.30113691091537476, |
| "learning_rate": 3.2330578512396696e-05, |
| "loss": 0.1358, |
| "step": 47100 |
| }, |
| { |
| "epoch": 0.6776835153305234, |
| "grad_norm": 0.41874217987060547, |
| "learning_rate": 3.2258713618397415e-05, |
| "loss": 0.1359, |
| "step": 47150 |
| }, |
| { |
| "epoch": 0.6784021616882441, |
| "grad_norm": 0.3495503067970276, |
| "learning_rate": 3.2186848724398134e-05, |
| "loss": 0.14, |
| "step": 47200 |
| }, |
| { |
| "epoch": 0.6791208080459646, |
| "grad_norm": 0.45888441801071167, |
| "learning_rate": 3.2114983830398853e-05, |
| "loss": 0.1319, |
| "step": 47250 |
| }, |
| { |
| "epoch": 0.6798394544036852, |
| "grad_norm": 0.2843805253505707, |
| "learning_rate": 3.2043118936399566e-05, |
| "loss": 0.1338, |
| "step": 47300 |
| }, |
| { |
| "epoch": 0.6805581007614058, |
| "grad_norm": 0.5123314261436462, |
| "learning_rate": 3.197125404240029e-05, |
| "loss": 0.1408, |
| "step": 47350 |
| }, |
| { |
| "epoch": 0.6812767471191264, |
| "grad_norm": 0.38351091742515564, |
| "learning_rate": 3.189938914840101e-05, |
| "loss": 0.1369, |
| "step": 47400 |
| }, |
| { |
| "epoch": 0.681995393476847, |
| "grad_norm": 0.34910881519317627, |
| "learning_rate": 3.182752425440172e-05, |
| "loss": 0.1342, |
| "step": 47450 |
| }, |
| { |
| "epoch": 0.6827140398345676, |
| "grad_norm": 0.3906104564666748, |
| "learning_rate": 3.175565936040244e-05, |
| "loss": 0.1363, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.6834326861922883, |
| "grad_norm": 0.3617385923862457, |
| "learning_rate": 3.168379446640317e-05, |
| "loss": 0.1323, |
| "step": 47550 |
| }, |
| { |
| "epoch": 0.6841513325500088, |
| "grad_norm": 0.6945951581001282, |
| "learning_rate": 3.161192957240388e-05, |
| "loss": 0.1328, |
| "step": 47600 |
| }, |
| { |
| "epoch": 0.6848699789077294, |
| "grad_norm": 0.4061312973499298, |
| "learning_rate": 3.15400646784046e-05, |
| "loss": 0.1336, |
| "step": 47650 |
| }, |
| { |
| "epoch": 0.68558862526545, |
| "grad_norm": 0.3243468999862671, |
| "learning_rate": 3.1468199784405325e-05, |
| "loss": 0.1353, |
| "step": 47700 |
| }, |
| { |
| "epoch": 0.6863072716231706, |
| "grad_norm": 0.354533851146698, |
| "learning_rate": 3.139633489040604e-05, |
| "loss": 0.1324, |
| "step": 47750 |
| }, |
| { |
| "epoch": 0.6870259179808912, |
| "grad_norm": 0.34843167662620544, |
| "learning_rate": 3.132446999640676e-05, |
| "loss": 0.133, |
| "step": 47800 |
| }, |
| { |
| "epoch": 0.6877445643386118, |
| "grad_norm": 0.3735269606113434, |
| "learning_rate": 3.1252605102407476e-05, |
| "loss": 0.1333, |
| "step": 47850 |
| }, |
| { |
| "epoch": 0.6884632106963324, |
| "grad_norm": 0.5805733799934387, |
| "learning_rate": 3.1180740208408195e-05, |
| "loss": 0.1363, |
| "step": 47900 |
| }, |
| { |
| "epoch": 0.689181857054053, |
| "grad_norm": 0.4405001103878021, |
| "learning_rate": 3.1108875314408914e-05, |
| "loss": 0.1365, |
| "step": 47950 |
| }, |
| { |
| "epoch": 0.6899005034117736, |
| "grad_norm": 0.3531704545021057, |
| "learning_rate": 3.1037010420409626e-05, |
| "loss": 0.1315, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.6899005034117736, |
| "eval_loss": 0.13462956249713898, |
| "eval_runtime": 2343.1875, |
| "eval_samples_per_second": 25.004, |
| "eval_steps_per_second": 3.126, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.6906191497694942, |
| "grad_norm": 0.32714229822158813, |
| "learning_rate": 3.096514552641035e-05, |
| "loss": 0.1333, |
| "step": 48050 |
| }, |
| { |
| "epoch": 0.6913377961272148, |
| "grad_norm": 0.3133352994918823, |
| "learning_rate": 3.089328063241107e-05, |
| "loss": 0.135, |
| "step": 48100 |
| }, |
| { |
| "epoch": 0.6920564424849354, |
| "grad_norm": 0.3752550482749939, |
| "learning_rate": 3.0821415738411784e-05, |
| "loss": 0.1381, |
| "step": 48150 |
| }, |
| { |
| "epoch": 0.6927750888426559, |
| "grad_norm": 0.49100804328918457, |
| "learning_rate": 3.07495508444125e-05, |
| "loss": 0.135, |
| "step": 48200 |
| }, |
| { |
| "epoch": 0.6934937352003766, |
| "grad_norm": 0.36574167013168335, |
| "learning_rate": 3.067768595041323e-05, |
| "loss": 0.1308, |
| "step": 48250 |
| }, |
| { |
| "epoch": 0.6942123815580972, |
| "grad_norm": 0.3478115200996399, |
| "learning_rate": 3.0607258354293925e-05, |
| "loss": 0.1322, |
| "step": 48300 |
| }, |
| { |
| "epoch": 0.6949310279158177, |
| "grad_norm": 0.743575394153595, |
| "learning_rate": 3.053539346029465e-05, |
| "loss": 0.1356, |
| "step": 48350 |
| }, |
| { |
| "epoch": 0.6956496742735384, |
| "grad_norm": 0.3269284665584564, |
| "learning_rate": 3.046352856629537e-05, |
| "loss": 0.1327, |
| "step": 48400 |
| }, |
| { |
| "epoch": 0.696368320631259, |
| "grad_norm": 0.43469032645225525, |
| "learning_rate": 3.0391663672296082e-05, |
| "loss": 0.1341, |
| "step": 48450 |
| }, |
| { |
| "epoch": 0.6970869669889795, |
| "grad_norm": 0.46831926703453064, |
| "learning_rate": 3.0319798778296805e-05, |
| "loss": 0.1326, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.6978056133467001, |
| "grad_norm": 0.313024640083313, |
| "learning_rate": 3.0247933884297524e-05, |
| "loss": 0.1339, |
| "step": 48550 |
| }, |
| { |
| "epoch": 0.6985242597044208, |
| "grad_norm": 0.3206130564212799, |
| "learning_rate": 3.017606899029824e-05, |
| "loss": 0.1315, |
| "step": 48600 |
| }, |
| { |
| "epoch": 0.6992429060621413, |
| "grad_norm": 0.29317253828048706, |
| "learning_rate": 3.010420409629896e-05, |
| "loss": 0.1316, |
| "step": 48650 |
| }, |
| { |
| "epoch": 0.6999615524198619, |
| "grad_norm": 0.36203575134277344, |
| "learning_rate": 3.003233920229968e-05, |
| "loss": 0.1369, |
| "step": 48700 |
| }, |
| { |
| "epoch": 0.7006801987775826, |
| "grad_norm": 0.4818989932537079, |
| "learning_rate": 2.9960474308300397e-05, |
| "loss": 0.1312, |
| "step": 48750 |
| }, |
| { |
| "epoch": 0.7013988451353032, |
| "grad_norm": 0.4188140332698822, |
| "learning_rate": 2.9888609414301116e-05, |
| "loss": 0.1295, |
| "step": 48800 |
| }, |
| { |
| "epoch": 0.7021174914930237, |
| "grad_norm": 0.42703622579574585, |
| "learning_rate": 2.9816744520301832e-05, |
| "loss": 0.1357, |
| "step": 48850 |
| }, |
| { |
| "epoch": 0.7028361378507444, |
| "grad_norm": 0.3431522250175476, |
| "learning_rate": 2.974487962630255e-05, |
| "loss": 0.1343, |
| "step": 48900 |
| }, |
| { |
| "epoch": 0.703554784208465, |
| "grad_norm": 0.3017210364341736, |
| "learning_rate": 2.9673014732303273e-05, |
| "loss": 0.1357, |
| "step": 48950 |
| }, |
| { |
| "epoch": 0.7042734305661855, |
| "grad_norm": 0.32396504282951355, |
| "learning_rate": 2.960114983830399e-05, |
| "loss": 0.1348, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.7049920769239061, |
| "grad_norm": 0.4112311899662018, |
| "learning_rate": 2.9529284944304708e-05, |
| "loss": 0.1333, |
| "step": 49050 |
| }, |
| { |
| "epoch": 0.7057107232816268, |
| "grad_norm": 0.3641326427459717, |
| "learning_rate": 2.9457420050305427e-05, |
| "loss": 0.1377, |
| "step": 49100 |
| }, |
| { |
| "epoch": 0.7064293696393473, |
| "grad_norm": 0.41020339727401733, |
| "learning_rate": 2.9385555156306143e-05, |
| "loss": 0.1326, |
| "step": 49150 |
| }, |
| { |
| "epoch": 0.7071480159970679, |
| "grad_norm": 0.5133574604988098, |
| "learning_rate": 2.9313690262306865e-05, |
| "loss": 0.1404, |
| "step": 49200 |
| }, |
| { |
| "epoch": 0.7078666623547886, |
| "grad_norm": 0.4345245063304901, |
| "learning_rate": 2.9241825368307585e-05, |
| "loss": 0.1355, |
| "step": 49250 |
| }, |
| { |
| "epoch": 0.7085853087125091, |
| "grad_norm": 0.3085751235485077, |
| "learning_rate": 2.91699604743083e-05, |
| "loss": 0.1361, |
| "step": 49300 |
| }, |
| { |
| "epoch": 0.7093039550702297, |
| "grad_norm": 0.4417734444141388, |
| "learning_rate": 2.909809558030902e-05, |
| "loss": 0.1353, |
| "step": 49350 |
| }, |
| { |
| "epoch": 0.7100226014279503, |
| "grad_norm": 0.38431400060653687, |
| "learning_rate": 2.9026230686309742e-05, |
| "loss": 0.1371, |
| "step": 49400 |
| }, |
| { |
| "epoch": 0.7107412477856709, |
| "grad_norm": 0.420731782913208, |
| "learning_rate": 2.8954365792310458e-05, |
| "loss": 0.1292, |
| "step": 49450 |
| }, |
| { |
| "epoch": 0.7114598941433915, |
| "grad_norm": 0.3676437437534332, |
| "learning_rate": 2.8882500898311177e-05, |
| "loss": 0.1319, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.7121785405011121, |
| "grad_norm": 0.34518060088157654, |
| "learning_rate": 2.8810636004311896e-05, |
| "loss": 0.134, |
| "step": 49550 |
| }, |
| { |
| "epoch": 0.7128971868588327, |
| "grad_norm": 0.4851680099964142, |
| "learning_rate": 2.873877111031261e-05, |
| "loss": 0.129, |
| "step": 49600 |
| }, |
| { |
| "epoch": 0.7136158332165533, |
| "grad_norm": 0.29022541642189026, |
| "learning_rate": 2.8666906216313334e-05, |
| "loss": 0.1315, |
| "step": 49650 |
| }, |
| { |
| "epoch": 0.7143344795742739, |
| "grad_norm": 0.5281957387924194, |
| "learning_rate": 2.8595041322314053e-05, |
| "loss": 0.1353, |
| "step": 49700 |
| }, |
| { |
| "epoch": 0.7150531259319945, |
| "grad_norm": 0.36344966292381287, |
| "learning_rate": 2.852317642831477e-05, |
| "loss": 0.1378, |
| "step": 49750 |
| }, |
| { |
| "epoch": 0.7157717722897151, |
| "grad_norm": 0.42327845096588135, |
| "learning_rate": 2.8451311534315488e-05, |
| "loss": 0.1287, |
| "step": 49800 |
| }, |
| { |
| "epoch": 0.7164904186474357, |
| "grad_norm": 0.38851046562194824, |
| "learning_rate": 2.837944664031621e-05, |
| "loss": 0.1342, |
| "step": 49850 |
| }, |
| { |
| "epoch": 0.7172090650051562, |
| "grad_norm": 0.3646990656852722, |
| "learning_rate": 2.8307581746316926e-05, |
| "loss": 0.1334, |
| "step": 49900 |
| }, |
| { |
| "epoch": 0.7179277113628769, |
| "grad_norm": 0.3507033586502075, |
| "learning_rate": 2.8235716852317645e-05, |
| "loss": 0.1327, |
| "step": 49950 |
| }, |
| { |
| "epoch": 0.7186463577205975, |
| "grad_norm": 0.3572186827659607, |
| "learning_rate": 2.816385195831836e-05, |
| "loss": 0.1325, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.719365004078318, |
| "grad_norm": 0.43677380681037903, |
| "learning_rate": 2.809198706431908e-05, |
| "loss": 0.1346, |
| "step": 50050 |
| }, |
| { |
| "epoch": 0.7200836504360387, |
| "grad_norm": 0.3421187400817871, |
| "learning_rate": 2.8020122170319803e-05, |
| "loss": 0.1316, |
| "step": 50100 |
| }, |
| { |
| "epoch": 0.7208022967937593, |
| "grad_norm": 0.3370194435119629, |
| "learning_rate": 2.7948257276320515e-05, |
| "loss": 0.1318, |
| "step": 50150 |
| }, |
| { |
| "epoch": 0.7215209431514799, |
| "grad_norm": 0.36595064401626587, |
| "learning_rate": 2.7876392382321237e-05, |
| "loss": 0.1341, |
| "step": 50200 |
| }, |
| { |
| "epoch": 0.7222395895092004, |
| "grad_norm": 0.3835677206516266, |
| "learning_rate": 2.7804527488321956e-05, |
| "loss": 0.1293, |
| "step": 50250 |
| }, |
| { |
| "epoch": 0.7229582358669211, |
| "grad_norm": 0.40462374687194824, |
| "learning_rate": 2.7732662594322672e-05, |
| "loss": 0.1356, |
| "step": 50300 |
| }, |
| { |
| "epoch": 0.7236768822246417, |
| "grad_norm": 0.35873696208000183, |
| "learning_rate": 2.7660797700323395e-05, |
| "loss": 0.1347, |
| "step": 50350 |
| }, |
| { |
| "epoch": 0.7243955285823622, |
| "grad_norm": 0.40664613246917725, |
| "learning_rate": 2.7588932806324114e-05, |
| "loss": 0.1327, |
| "step": 50400 |
| }, |
| { |
| "epoch": 0.7251141749400829, |
| "grad_norm": 0.3808116912841797, |
| "learning_rate": 2.751706791232483e-05, |
| "loss": 0.1316, |
| "step": 50450 |
| }, |
| { |
| "epoch": 0.7258328212978035, |
| "grad_norm": 0.43004295229911804, |
| "learning_rate": 2.744520301832555e-05, |
| "loss": 0.1312, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.726551467655524, |
| "grad_norm": 0.27748802304267883, |
| "learning_rate": 2.737333812432627e-05, |
| "loss": 0.1347, |
| "step": 50550 |
| }, |
| { |
| "epoch": 0.7272701140132447, |
| "grad_norm": 0.2862594425678253, |
| "learning_rate": 2.7301473230326983e-05, |
| "loss": 0.133, |
| "step": 50600 |
| }, |
| { |
| "epoch": 0.7279887603709653, |
| "grad_norm": 0.4833882749080658, |
| "learning_rate": 2.7229608336327706e-05, |
| "loss": 0.1309, |
| "step": 50650 |
| }, |
| { |
| "epoch": 0.7287074067286858, |
| "grad_norm": 0.47044578194618225, |
| "learning_rate": 2.7157743442328425e-05, |
| "loss": 0.134, |
| "step": 50700 |
| }, |
| { |
| "epoch": 0.7294260530864064, |
| "grad_norm": 0.44328999519348145, |
| "learning_rate": 2.708587854832914e-05, |
| "loss": 0.1345, |
| "step": 50750 |
| }, |
| { |
| "epoch": 0.7301446994441271, |
| "grad_norm": 0.39716413617134094, |
| "learning_rate": 2.7014013654329863e-05, |
| "loss": 0.131, |
| "step": 50800 |
| }, |
| { |
| "epoch": 0.7308633458018476, |
| "grad_norm": 0.48879918456077576, |
| "learning_rate": 2.6942148760330582e-05, |
| "loss": 0.1362, |
| "step": 50850 |
| }, |
| { |
| "epoch": 0.7315819921595682, |
| "grad_norm": 0.3789602518081665, |
| "learning_rate": 2.6870283866331298e-05, |
| "loss": 0.1335, |
| "step": 50900 |
| }, |
| { |
| "epoch": 0.7323006385172889, |
| "grad_norm": 0.5360676050186157, |
| "learning_rate": 2.6798418972332017e-05, |
| "loss": 0.1368, |
| "step": 50950 |
| }, |
| { |
| "epoch": 0.7330192848750094, |
| "grad_norm": 0.3693239986896515, |
| "learning_rate": 2.672655407833274e-05, |
| "loss": 0.1324, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.7330192848750094, |
| "eval_loss": 0.13193804025650024, |
| "eval_runtime": 2341.8076, |
| "eval_samples_per_second": 25.019, |
| "eval_steps_per_second": 3.127, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.73373793123273, |
| "grad_norm": 0.4406863749027252, |
| "learning_rate": 2.6654689184333452e-05, |
| "loss": 0.1337, |
| "step": 51050 |
| }, |
| { |
| "epoch": 0.7344565775904506, |
| "grad_norm": 0.3431764245033264, |
| "learning_rate": 2.6582824290334174e-05, |
| "loss": 0.1288, |
| "step": 51100 |
| }, |
| { |
| "epoch": 0.7351752239481713, |
| "grad_norm": 0.4725956320762634, |
| "learning_rate": 2.651095939633489e-05, |
| "loss": 0.1324, |
| "step": 51150 |
| }, |
| { |
| "epoch": 0.7358938703058918, |
| "grad_norm": 0.2942321300506592, |
| "learning_rate": 2.643909450233561e-05, |
| "loss": 0.1328, |
| "step": 51200 |
| }, |
| { |
| "epoch": 0.7366125166636124, |
| "grad_norm": 0.3699278235435486, |
| "learning_rate": 2.6367229608336332e-05, |
| "loss": 0.1339, |
| "step": 51250 |
| }, |
| { |
| "epoch": 0.7373311630213331, |
| "grad_norm": 0.37523624300956726, |
| "learning_rate": 2.6295364714337044e-05, |
| "loss": 0.1296, |
| "step": 51300 |
| }, |
| { |
| "epoch": 0.7380498093790536, |
| "grad_norm": 0.31678199768066406, |
| "learning_rate": 2.6223499820337767e-05, |
| "loss": 0.1302, |
| "step": 51350 |
| }, |
| { |
| "epoch": 0.7387684557367742, |
| "grad_norm": 0.321972131729126, |
| "learning_rate": 2.6151634926338486e-05, |
| "loss": 0.1322, |
| "step": 51400 |
| }, |
| { |
| "epoch": 0.7394871020944949, |
| "grad_norm": 0.28065794706344604, |
| "learning_rate": 2.60797700323392e-05, |
| "loss": 0.1328, |
| "step": 51450 |
| }, |
| { |
| "epoch": 0.7402057484522154, |
| "grad_norm": 0.2923668324947357, |
| "learning_rate": 2.600790513833992e-05, |
| "loss": 0.1269, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.740924394809936, |
| "grad_norm": 0.3297905921936035, |
| "learning_rate": 2.5936040244340643e-05, |
| "loss": 0.1314, |
| "step": 51550 |
| }, |
| { |
| "epoch": 0.7416430411676566, |
| "grad_norm": 0.3125530779361725, |
| "learning_rate": 2.586417535034136e-05, |
| "loss": 0.1345, |
| "step": 51600 |
| }, |
| { |
| "epoch": 0.7423616875253772, |
| "grad_norm": 0.29957976937294006, |
| "learning_rate": 2.5792310456342078e-05, |
| "loss": 0.1339, |
| "step": 51650 |
| }, |
| { |
| "epoch": 0.7430803338830978, |
| "grad_norm": 0.45887959003448486, |
| "learning_rate": 2.57204455623428e-05, |
| "loss": 0.1294, |
| "step": 51700 |
| }, |
| { |
| "epoch": 0.7437989802408184, |
| "grad_norm": 0.44700437784194946, |
| "learning_rate": 2.5648580668343513e-05, |
| "loss": 0.1291, |
| "step": 51750 |
| }, |
| { |
| "epoch": 0.744517626598539, |
| "grad_norm": 0.30511099100112915, |
| "learning_rate": 2.5576715774344235e-05, |
| "loss": 0.1313, |
| "step": 51800 |
| }, |
| { |
| "epoch": 0.7452362729562596, |
| "grad_norm": 0.4534262418746948, |
| "learning_rate": 2.5504850880344954e-05, |
| "loss": 0.1343, |
| "step": 51850 |
| }, |
| { |
| "epoch": 0.7459549193139802, |
| "grad_norm": 0.4143444299697876, |
| "learning_rate": 2.543298598634567e-05, |
| "loss": 0.1305, |
| "step": 51900 |
| }, |
| { |
| "epoch": 0.7466735656717007, |
| "grad_norm": 0.4388984739780426, |
| "learning_rate": 2.536112109234639e-05, |
| "loss": 0.1307, |
| "step": 51950 |
| }, |
| { |
| "epoch": 0.7473922120294214, |
| "grad_norm": 0.33407118916511536, |
| "learning_rate": 2.528925619834711e-05, |
| "loss": 0.1287, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.748110858387142, |
| "grad_norm": 0.3362686336040497, |
| "learning_rate": 2.5217391304347827e-05, |
| "loss": 0.1305, |
| "step": 52050 |
| }, |
| { |
| "epoch": 0.7488295047448625, |
| "grad_norm": 0.3452841639518738, |
| "learning_rate": 2.5145526410348546e-05, |
| "loss": 0.1282, |
| "step": 52100 |
| }, |
| { |
| "epoch": 0.7495481511025832, |
| "grad_norm": 0.7336341142654419, |
| "learning_rate": 2.507366151634927e-05, |
| "loss": 0.134, |
| "step": 52150 |
| }, |
| { |
| "epoch": 0.7502667974603038, |
| "grad_norm": 0.40963059663772583, |
| "learning_rate": 2.500179662234998e-05, |
| "loss": 0.1259, |
| "step": 52200 |
| }, |
| { |
| "epoch": 0.7509854438180243, |
| "grad_norm": 0.40320196747779846, |
| "learning_rate": 2.4929931728350704e-05, |
| "loss": 0.1288, |
| "step": 52250 |
| }, |
| { |
| "epoch": 0.751704090175745, |
| "grad_norm": 0.391169011592865, |
| "learning_rate": 2.4859504132231407e-05, |
| "loss": 0.1292, |
| "step": 52300 |
| }, |
| { |
| "epoch": 0.7524227365334656, |
| "grad_norm": 0.456756591796875, |
| "learning_rate": 2.4787639238232126e-05, |
| "loss": 0.1287, |
| "step": 52350 |
| }, |
| { |
| "epoch": 0.7531413828911862, |
| "grad_norm": 0.3610314428806305, |
| "learning_rate": 2.471577434423284e-05, |
| "loss": 0.1287, |
| "step": 52400 |
| }, |
| { |
| "epoch": 0.7538600292489067, |
| "grad_norm": 0.3390330672264099, |
| "learning_rate": 2.4643909450233564e-05, |
| "loss": 0.1303, |
| "step": 52450 |
| }, |
| { |
| "epoch": 0.7545786756066274, |
| "grad_norm": 0.38022834062576294, |
| "learning_rate": 2.457204455623428e-05, |
| "loss": 0.1342, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.755297321964348, |
| "grad_norm": 0.37033766508102417, |
| "learning_rate": 2.4500179662235e-05, |
| "loss": 0.1296, |
| "step": 52550 |
| }, |
| { |
| "epoch": 0.7560159683220685, |
| "grad_norm": 0.39085620641708374, |
| "learning_rate": 2.4429752066115705e-05, |
| "loss": 0.1334, |
| "step": 52600 |
| }, |
| { |
| "epoch": 0.7567346146797892, |
| "grad_norm": 0.24048922955989838, |
| "learning_rate": 2.435788717211642e-05, |
| "loss": 0.1262, |
| "step": 52650 |
| }, |
| { |
| "epoch": 0.7574532610375098, |
| "grad_norm": 0.4394150376319885, |
| "learning_rate": 2.428602227811714e-05, |
| "loss": 0.1296, |
| "step": 52700 |
| }, |
| { |
| "epoch": 0.7581719073952303, |
| "grad_norm": 0.5719261765480042, |
| "learning_rate": 2.421415738411786e-05, |
| "loss": 0.1285, |
| "step": 52750 |
| }, |
| { |
| "epoch": 0.7588905537529509, |
| "grad_norm": 0.3501519560813904, |
| "learning_rate": 2.414229249011858e-05, |
| "loss": 0.1297, |
| "step": 52800 |
| }, |
| { |
| "epoch": 0.7596092001106716, |
| "grad_norm": 0.36480751633644104, |
| "learning_rate": 2.4070427596119298e-05, |
| "loss": 0.1298, |
| "step": 52850 |
| }, |
| { |
| "epoch": 0.7603278464683921, |
| "grad_norm": 0.48834893107414246, |
| "learning_rate": 2.3998562702120013e-05, |
| "loss": 0.1271, |
| "step": 52900 |
| }, |
| { |
| "epoch": 0.7610464928261127, |
| "grad_norm": 0.44482356309890747, |
| "learning_rate": 2.3926697808120736e-05, |
| "loss": 0.1306, |
| "step": 52950 |
| }, |
| { |
| "epoch": 0.7617651391838334, |
| "grad_norm": 0.5450196862220764, |
| "learning_rate": 2.385483291412145e-05, |
| "loss": 0.1271, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.7624837855415539, |
| "grad_norm": 0.3324733078479767, |
| "learning_rate": 2.378296802012217e-05, |
| "loss": 0.1329, |
| "step": 53050 |
| }, |
| { |
| "epoch": 0.7632024318992745, |
| "grad_norm": 0.34339848160743713, |
| "learning_rate": 2.371110312612289e-05, |
| "loss": 0.1321, |
| "step": 53100 |
| }, |
| { |
| "epoch": 0.7639210782569952, |
| "grad_norm": 0.3504130244255066, |
| "learning_rate": 2.363923823212361e-05, |
| "loss": 0.1281, |
| "step": 53150 |
| }, |
| { |
| "epoch": 0.7646397246147157, |
| "grad_norm": 0.46733903884887695, |
| "learning_rate": 2.3567373338124328e-05, |
| "loss": 0.1306, |
| "step": 53200 |
| }, |
| { |
| "epoch": 0.7653583709724363, |
| "grad_norm": 0.4953770041465759, |
| "learning_rate": 2.3495508444125047e-05, |
| "loss": 0.1308, |
| "step": 53250 |
| }, |
| { |
| "epoch": 0.7660770173301569, |
| "grad_norm": 0.3816761076450348, |
| "learning_rate": 2.3423643550125766e-05, |
| "loss": 0.1308, |
| "step": 53300 |
| }, |
| { |
| "epoch": 0.7667956636878775, |
| "grad_norm": 0.3412993848323822, |
| "learning_rate": 2.3351778656126482e-05, |
| "loss": 0.1272, |
| "step": 53350 |
| }, |
| { |
| "epoch": 0.7675143100455981, |
| "grad_norm": 0.5620062947273254, |
| "learning_rate": 2.32799137621272e-05, |
| "loss": 0.1308, |
| "step": 53400 |
| }, |
| { |
| "epoch": 0.7682329564033187, |
| "grad_norm": 0.44010090827941895, |
| "learning_rate": 2.320804886812792e-05, |
| "loss": 0.1271, |
| "step": 53450 |
| }, |
| { |
| "epoch": 0.7689516027610394, |
| "grad_norm": 0.3432207703590393, |
| "learning_rate": 2.313618397412864e-05, |
| "loss": 0.1291, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.7696702491187599, |
| "grad_norm": 0.45159292221069336, |
| "learning_rate": 2.3064319080129358e-05, |
| "loss": 0.1264, |
| "step": 53550 |
| }, |
| { |
| "epoch": 0.7703888954764805, |
| "grad_norm": 0.3891454339027405, |
| "learning_rate": 2.2992454186130077e-05, |
| "loss": 0.1311, |
| "step": 53600 |
| }, |
| { |
| "epoch": 0.771107541834201, |
| "grad_norm": 0.5852764844894409, |
| "learning_rate": 2.2920589292130796e-05, |
| "loss": 0.1307, |
| "step": 53650 |
| }, |
| { |
| "epoch": 0.7718261881919217, |
| "grad_norm": 0.48390915989875793, |
| "learning_rate": 2.2848724398131512e-05, |
| "loss": 0.1292, |
| "step": 53700 |
| }, |
| { |
| "epoch": 0.7725448345496423, |
| "grad_norm": 0.27066367864608765, |
| "learning_rate": 2.2776859504132235e-05, |
| "loss": 0.1316, |
| "step": 53750 |
| }, |
| { |
| "epoch": 0.7732634809073629, |
| "grad_norm": 0.32667478919029236, |
| "learning_rate": 2.270499461013295e-05, |
| "loss": 0.1261, |
| "step": 53800 |
| }, |
| { |
| "epoch": 0.7739821272650835, |
| "grad_norm": 0.35106027126312256, |
| "learning_rate": 2.263312971613367e-05, |
| "loss": 0.131, |
| "step": 53850 |
| }, |
| { |
| "epoch": 0.7747007736228041, |
| "grad_norm": 0.41025617718696594, |
| "learning_rate": 2.256126482213439e-05, |
| "loss": 0.1301, |
| "step": 53900 |
| }, |
| { |
| "epoch": 0.7754194199805247, |
| "grad_norm": 0.47719448804855347, |
| "learning_rate": 2.2489399928135108e-05, |
| "loss": 0.1255, |
| "step": 53950 |
| }, |
| { |
| "epoch": 0.7761380663382453, |
| "grad_norm": 0.38124901056289673, |
| "learning_rate": 2.2417535034135827e-05, |
| "loss": 0.1309, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.7761380663382453, |
| "eval_loss": 0.12959778308868408, |
| "eval_runtime": 2346.9836, |
| "eval_samples_per_second": 24.964, |
| "eval_steps_per_second": 3.121, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.7768567126959659, |
| "grad_norm": 0.48373425006866455, |
| "learning_rate": 2.2345670140136542e-05, |
| "loss": 0.1365, |
| "step": 54050 |
| }, |
| { |
| "epoch": 0.7775753590536865, |
| "grad_norm": 0.6832888722419739, |
| "learning_rate": 2.2273805246137265e-05, |
| "loss": 0.1315, |
| "step": 54100 |
| }, |
| { |
| "epoch": 0.778294005411407, |
| "grad_norm": 0.36773887276649475, |
| "learning_rate": 2.220194035213798e-05, |
| "loss": 0.13, |
| "step": 54150 |
| }, |
| { |
| "epoch": 0.7790126517691277, |
| "grad_norm": 0.4312587082386017, |
| "learning_rate": 2.21300754581387e-05, |
| "loss": 0.1291, |
| "step": 54200 |
| }, |
| { |
| "epoch": 0.7797312981268483, |
| "grad_norm": 0.3125370740890503, |
| "learning_rate": 2.205821056413942e-05, |
| "loss": 0.1307, |
| "step": 54250 |
| }, |
| { |
| "epoch": 0.7804499444845688, |
| "grad_norm": 0.617438554763794, |
| "learning_rate": 2.1986345670140138e-05, |
| "loss": 0.1343, |
| "step": 54300 |
| }, |
| { |
| "epoch": 0.7811685908422895, |
| "grad_norm": 0.45716118812561035, |
| "learning_rate": 2.1914480776140857e-05, |
| "loss": 0.129, |
| "step": 54350 |
| }, |
| { |
| "epoch": 0.7818872372000101, |
| "grad_norm": 0.3317459523677826, |
| "learning_rate": 2.1842615882141576e-05, |
| "loss": 0.1255, |
| "step": 54400 |
| }, |
| { |
| "epoch": 0.7826058835577306, |
| "grad_norm": 0.3295186161994934, |
| "learning_rate": 2.1770750988142295e-05, |
| "loss": 0.1254, |
| "step": 54450 |
| }, |
| { |
| "epoch": 0.7833245299154512, |
| "grad_norm": 0.32061442732810974, |
| "learning_rate": 2.169888609414301e-05, |
| "loss": 0.1276, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.7840431762731719, |
| "grad_norm": 0.5443814396858215, |
| "learning_rate": 2.162702120014373e-05, |
| "loss": 0.1318, |
| "step": 54550 |
| }, |
| { |
| "epoch": 0.7847618226308924, |
| "grad_norm": 0.36078059673309326, |
| "learning_rate": 2.155515630614445e-05, |
| "loss": 0.1261, |
| "step": 54600 |
| }, |
| { |
| "epoch": 0.785480468988613, |
| "grad_norm": 0.5464069247245789, |
| "learning_rate": 2.1483291412145168e-05, |
| "loss": 0.1287, |
| "step": 54650 |
| }, |
| { |
| "epoch": 0.7861991153463337, |
| "grad_norm": 0.39234134554862976, |
| "learning_rate": 2.1411426518145884e-05, |
| "loss": 0.1294, |
| "step": 54700 |
| }, |
| { |
| "epoch": 0.7869177617040543, |
| "grad_norm": 0.3379664123058319, |
| "learning_rate": 2.1339561624146606e-05, |
| "loss": 0.1315, |
| "step": 54750 |
| }, |
| { |
| "epoch": 0.7876364080617748, |
| "grad_norm": 0.5115400552749634, |
| "learning_rate": 2.1267696730147326e-05, |
| "loss": 0.1263, |
| "step": 54800 |
| }, |
| { |
| "epoch": 0.7883550544194955, |
| "grad_norm": 0.42865628004074097, |
| "learning_rate": 2.119583183614804e-05, |
| "loss": 0.1278, |
| "step": 54850 |
| }, |
| { |
| "epoch": 0.7890737007772161, |
| "grad_norm": 0.36056503653526306, |
| "learning_rate": 2.1123966942148764e-05, |
| "loss": 0.1284, |
| "step": 54900 |
| }, |
| { |
| "epoch": 0.7897923471349366, |
| "grad_norm": 0.3155864179134369, |
| "learning_rate": 2.105210204814948e-05, |
| "loss": 0.1293, |
| "step": 54950 |
| }, |
| { |
| "epoch": 0.7905109934926572, |
| "grad_norm": 0.37790244817733765, |
| "learning_rate": 2.09802371541502e-05, |
| "loss": 0.1267, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.7912296398503779, |
| "grad_norm": 0.3175414502620697, |
| "learning_rate": 2.0908372260150918e-05, |
| "loss": 0.1293, |
| "step": 55050 |
| }, |
| { |
| "epoch": 0.7919482862080984, |
| "grad_norm": 0.47029367089271545, |
| "learning_rate": 2.0836507366151637e-05, |
| "loss": 0.1288, |
| "step": 55100 |
| }, |
| { |
| "epoch": 0.792666932565819, |
| "grad_norm": 0.3179719150066376, |
| "learning_rate": 2.0764642472152353e-05, |
| "loss": 0.1253, |
| "step": 55150 |
| }, |
| { |
| "epoch": 0.7933855789235397, |
| "grad_norm": 0.6366003155708313, |
| "learning_rate": 2.069277757815307e-05, |
| "loss": 0.1286, |
| "step": 55200 |
| }, |
| { |
| "epoch": 0.7941042252812602, |
| "grad_norm": 0.3257125914096832, |
| "learning_rate": 2.0620912684153794e-05, |
| "loss": 0.1305, |
| "step": 55250 |
| }, |
| { |
| "epoch": 0.7948228716389808, |
| "grad_norm": 0.47698974609375, |
| "learning_rate": 2.054904779015451e-05, |
| "loss": 0.1287, |
| "step": 55300 |
| }, |
| { |
| "epoch": 0.7955415179967014, |
| "grad_norm": 0.701610267162323, |
| "learning_rate": 2.047718289615523e-05, |
| "loss": 0.1272, |
| "step": 55350 |
| }, |
| { |
| "epoch": 0.796260164354422, |
| "grad_norm": 0.388808935880661, |
| "learning_rate": 2.0405318002155948e-05, |
| "loss": 0.1255, |
| "step": 55400 |
| }, |
| { |
| "epoch": 0.7969788107121426, |
| "grad_norm": 0.5635932683944702, |
| "learning_rate": 2.0333453108156667e-05, |
| "loss": 0.1279, |
| "step": 55450 |
| }, |
| { |
| "epoch": 0.7976974570698632, |
| "grad_norm": 0.3552047908306122, |
| "learning_rate": 2.0261588214157383e-05, |
| "loss": 0.1244, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.7984161034275838, |
| "grad_norm": 0.38414454460144043, |
| "learning_rate": 2.0189723320158105e-05, |
| "loss": 0.127, |
| "step": 55550 |
| }, |
| { |
| "epoch": 0.7991347497853044, |
| "grad_norm": 0.4083007872104645, |
| "learning_rate": 2.011785842615882e-05, |
| "loss": 0.131, |
| "step": 55600 |
| }, |
| { |
| "epoch": 0.799853396143025, |
| "grad_norm": 0.3586987853050232, |
| "learning_rate": 2.004599353215954e-05, |
| "loss": 0.1294, |
| "step": 55650 |
| }, |
| { |
| "epoch": 0.8005720425007457, |
| "grad_norm": 0.3205280303955078, |
| "learning_rate": 1.9974128638160263e-05, |
| "loss": 0.1295, |
| "step": 55700 |
| }, |
| { |
| "epoch": 0.8012906888584662, |
| "grad_norm": 0.2761281132698059, |
| "learning_rate": 1.990226374416098e-05, |
| "loss": 0.1288, |
| "step": 55750 |
| }, |
| { |
| "epoch": 0.8020093352161868, |
| "grad_norm": 0.34950581192970276, |
| "learning_rate": 1.9830398850161698e-05, |
| "loss": 0.1297, |
| "step": 55800 |
| }, |
| { |
| "epoch": 0.8027279815739073, |
| "grad_norm": 0.35706281661987305, |
| "learning_rate": 1.9758533956162413e-05, |
| "loss": 0.1264, |
| "step": 55850 |
| }, |
| { |
| "epoch": 0.803446627931628, |
| "grad_norm": 0.40524542331695557, |
| "learning_rate": 1.9686669062163136e-05, |
| "loss": 0.1282, |
| "step": 55900 |
| }, |
| { |
| "epoch": 0.8041652742893486, |
| "grad_norm": 0.37444543838500977, |
| "learning_rate": 1.961480416816385e-05, |
| "loss": 0.125, |
| "step": 55950 |
| }, |
| { |
| "epoch": 0.8048839206470692, |
| "grad_norm": 0.46441900730133057, |
| "learning_rate": 1.954293927416457e-05, |
| "loss": 0.123, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.8056025670047898, |
| "grad_norm": 0.396161288022995, |
| "learning_rate": 1.947107438016529e-05, |
| "loss": 0.124, |
| "step": 56050 |
| }, |
| { |
| "epoch": 0.8063212133625104, |
| "grad_norm": 0.32282915711402893, |
| "learning_rate": 1.939920948616601e-05, |
| "loss": 0.1316, |
| "step": 56100 |
| }, |
| { |
| "epoch": 0.807039859720231, |
| "grad_norm": 0.6208024621009827, |
| "learning_rate": 1.9327344592166728e-05, |
| "loss": 0.1321, |
| "step": 56150 |
| }, |
| { |
| "epoch": 0.8077585060779515, |
| "grad_norm": 0.5710030198097229, |
| "learning_rate": 1.9255479698167447e-05, |
| "loss": 0.1252, |
| "step": 56200 |
| }, |
| { |
| "epoch": 0.8084771524356722, |
| "grad_norm": 0.49897143244743347, |
| "learning_rate": 1.9183614804168166e-05, |
| "loss": 0.1275, |
| "step": 56250 |
| }, |
| { |
| "epoch": 0.8091957987933928, |
| "grad_norm": 0.331696480512619, |
| "learning_rate": 1.9111749910168882e-05, |
| "loss": 0.1303, |
| "step": 56300 |
| }, |
| { |
| "epoch": 0.8099144451511133, |
| "grad_norm": 0.37506794929504395, |
| "learning_rate": 1.90398850161696e-05, |
| "loss": 0.1286, |
| "step": 56350 |
| }, |
| { |
| "epoch": 0.810633091508834, |
| "grad_norm": 0.3573820888996124, |
| "learning_rate": 1.896802012217032e-05, |
| "loss": 0.1345, |
| "step": 56400 |
| }, |
| { |
| "epoch": 0.8113517378665546, |
| "grad_norm": 0.3456536531448364, |
| "learning_rate": 1.889615522817104e-05, |
| "loss": 0.1246, |
| "step": 56450 |
| }, |
| { |
| "epoch": 0.8120703842242751, |
| "grad_norm": 0.30496513843536377, |
| "learning_rate": 1.8824290334171758e-05, |
| "loss": 0.1265, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.8127890305819957, |
| "grad_norm": 0.3960976302623749, |
| "learning_rate": 1.8752425440172477e-05, |
| "loss": 0.1256, |
| "step": 56550 |
| }, |
| { |
| "epoch": 0.8135076769397164, |
| "grad_norm": 0.6446784138679504, |
| "learning_rate": 1.8680560546173196e-05, |
| "loss": 0.1268, |
| "step": 56600 |
| }, |
| { |
| "epoch": 0.8142263232974369, |
| "grad_norm": 0.4691488742828369, |
| "learning_rate": 1.8608695652173912e-05, |
| "loss": 0.1313, |
| "step": 56650 |
| }, |
| { |
| "epoch": 0.8149449696551575, |
| "grad_norm": 0.34596702456474304, |
| "learning_rate": 1.853826805605462e-05, |
| "loss": 0.1246, |
| "step": 56700 |
| }, |
| { |
| "epoch": 0.8156636160128782, |
| "grad_norm": 0.2931258976459503, |
| "learning_rate": 1.8466403162055338e-05, |
| "loss": 0.1307, |
| "step": 56750 |
| }, |
| { |
| "epoch": 0.8163822623705987, |
| "grad_norm": 0.3972085416316986, |
| "learning_rate": 1.8394538268056057e-05, |
| "loss": 0.1247, |
| "step": 56800 |
| }, |
| { |
| "epoch": 0.8171009087283193, |
| "grad_norm": 0.3118221163749695, |
| "learning_rate": 1.8322673374056772e-05, |
| "loss": 0.1279, |
| "step": 56850 |
| }, |
| { |
| "epoch": 0.81781955508604, |
| "grad_norm": 0.3568058907985687, |
| "learning_rate": 1.8250808480057495e-05, |
| "loss": 0.1291, |
| "step": 56900 |
| }, |
| { |
| "epoch": 0.8185382014437605, |
| "grad_norm": 0.361229807138443, |
| "learning_rate": 1.817894358605821e-05, |
| "loss": 0.128, |
| "step": 56950 |
| }, |
| { |
| "epoch": 0.8192568478014811, |
| "grad_norm": 0.4731394648551941, |
| "learning_rate": 1.810707869205893e-05, |
| "loss": 0.1245, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.8192568478014811, |
| "eval_loss": 0.12679025530815125, |
| "eval_runtime": 2353.3114, |
| "eval_samples_per_second": 24.897, |
| "eval_steps_per_second": 3.112, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.8199754941592017, |
| "grad_norm": 0.34787485003471375, |
| "learning_rate": 1.803521379805965e-05, |
| "loss": 0.1297, |
| "step": 57050 |
| }, |
| { |
| "epoch": 0.8206941405169224, |
| "grad_norm": 0.3218761384487152, |
| "learning_rate": 1.7963348904060368e-05, |
| "loss": 0.1286, |
| "step": 57100 |
| }, |
| { |
| "epoch": 0.8214127868746429, |
| "grad_norm": 0.41955044865608215, |
| "learning_rate": 1.7891484010061084e-05, |
| "loss": 0.124, |
| "step": 57150 |
| }, |
| { |
| "epoch": 0.8221314332323635, |
| "grad_norm": 0.44354313611984253, |
| "learning_rate": 1.7819619116061806e-05, |
| "loss": 0.1268, |
| "step": 57200 |
| }, |
| { |
| "epoch": 0.8228500795900842, |
| "grad_norm": 0.4659808576107025, |
| "learning_rate": 1.7747754222062525e-05, |
| "loss": 0.1279, |
| "step": 57250 |
| }, |
| { |
| "epoch": 0.8235687259478047, |
| "grad_norm": 0.30388763546943665, |
| "learning_rate": 1.767588932806324e-05, |
| "loss": 0.1269, |
| "step": 57300 |
| }, |
| { |
| "epoch": 0.8242873723055253, |
| "grad_norm": 0.3265688121318817, |
| "learning_rate": 1.760402443406396e-05, |
| "loss": 0.1287, |
| "step": 57350 |
| }, |
| { |
| "epoch": 0.8250060186632459, |
| "grad_norm": 0.42907214164733887, |
| "learning_rate": 1.753215954006468e-05, |
| "loss": 0.1314, |
| "step": 57400 |
| }, |
| { |
| "epoch": 0.8257246650209665, |
| "grad_norm": 0.33852580189704895, |
| "learning_rate": 1.74602946460654e-05, |
| "loss": 0.1303, |
| "step": 57450 |
| }, |
| { |
| "epoch": 0.8264433113786871, |
| "grad_norm": 0.42768487334251404, |
| "learning_rate": 1.7388429752066114e-05, |
| "loss": 0.129, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.8271619577364077, |
| "grad_norm": 0.401076078414917, |
| "learning_rate": 1.7316564858066837e-05, |
| "loss": 0.1281, |
| "step": 57550 |
| }, |
| { |
| "epoch": 0.8278806040941283, |
| "grad_norm": 0.4327072501182556, |
| "learning_rate": 1.7244699964067552e-05, |
| "loss": 0.1233, |
| "step": 57600 |
| }, |
| { |
| "epoch": 0.8285992504518489, |
| "grad_norm": 0.3264212906360626, |
| "learning_rate": 1.717283507006827e-05, |
| "loss": 0.1274, |
| "step": 57650 |
| }, |
| { |
| "epoch": 0.8293178968095695, |
| "grad_norm": 0.3338124752044678, |
| "learning_rate": 1.7100970176068994e-05, |
| "loss": 0.1235, |
| "step": 57700 |
| }, |
| { |
| "epoch": 0.8300365431672901, |
| "grad_norm": 0.5374757647514343, |
| "learning_rate": 1.702910528206971e-05, |
| "loss": 0.1308, |
| "step": 57750 |
| }, |
| { |
| "epoch": 0.8307551895250107, |
| "grad_norm": 0.34573695063591003, |
| "learning_rate": 1.695724038807043e-05, |
| "loss": 0.1237, |
| "step": 57800 |
| }, |
| { |
| "epoch": 0.8314738358827313, |
| "grad_norm": 0.3776203691959381, |
| "learning_rate": 1.6885375494071148e-05, |
| "loss": 0.1258, |
| "step": 57850 |
| }, |
| { |
| "epoch": 0.8321924822404518, |
| "grad_norm": 0.4206998348236084, |
| "learning_rate": 1.6813510600071867e-05, |
| "loss": 0.1248, |
| "step": 57900 |
| }, |
| { |
| "epoch": 0.8329111285981725, |
| "grad_norm": 0.39265042543411255, |
| "learning_rate": 1.6741645706072583e-05, |
| "loss": 0.1287, |
| "step": 57950 |
| }, |
| { |
| "epoch": 0.8336297749558931, |
| "grad_norm": 0.4329000413417816, |
| "learning_rate": 1.667121810995329e-05, |
| "loss": 0.129, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.8343484213136136, |
| "grad_norm": 0.38255923986434937, |
| "learning_rate": 1.6599353215954008e-05, |
| "loss": 0.1251, |
| "step": 58050 |
| }, |
| { |
| "epoch": 0.8350670676713343, |
| "grad_norm": 0.4766669273376465, |
| "learning_rate": 1.6527488321954727e-05, |
| "loss": 0.1268, |
| "step": 58100 |
| }, |
| { |
| "epoch": 0.8357857140290549, |
| "grad_norm": 0.39188286662101746, |
| "learning_rate": 1.6455623427955443e-05, |
| "loss": 0.1249, |
| "step": 58150 |
| }, |
| { |
| "epoch": 0.8365043603867754, |
| "grad_norm": 0.3658916652202606, |
| "learning_rate": 1.6383758533956165e-05, |
| "loss": 0.1194, |
| "step": 58200 |
| }, |
| { |
| "epoch": 0.837223006744496, |
| "grad_norm": 0.3856219947338104, |
| "learning_rate": 1.631189363995688e-05, |
| "loss": 0.1324, |
| "step": 58250 |
| }, |
| { |
| "epoch": 0.8379416531022167, |
| "grad_norm": 0.3330176770687103, |
| "learning_rate": 1.62400287459576e-05, |
| "loss": 0.1275, |
| "step": 58300 |
| }, |
| { |
| "epoch": 0.8386602994599373, |
| "grad_norm": 0.2891123294830322, |
| "learning_rate": 1.616816385195832e-05, |
| "loss": 0.1254, |
| "step": 58350 |
| }, |
| { |
| "epoch": 0.8393789458176578, |
| "grad_norm": 0.41428953409194946, |
| "learning_rate": 1.609629895795904e-05, |
| "loss": 0.128, |
| "step": 58400 |
| }, |
| { |
| "epoch": 0.8400975921753785, |
| "grad_norm": 0.3331279754638672, |
| "learning_rate": 1.6024434063959758e-05, |
| "loss": 0.124, |
| "step": 58450 |
| }, |
| { |
| "epoch": 0.8408162385330991, |
| "grad_norm": 0.4965340495109558, |
| "learning_rate": 1.5952569169960473e-05, |
| "loss": 0.1224, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.8415348848908196, |
| "grad_norm": 0.3635750710964203, |
| "learning_rate": 1.5880704275961196e-05, |
| "loss": 0.1241, |
| "step": 58550 |
| }, |
| { |
| "epoch": 0.8422535312485403, |
| "grad_norm": 0.294466495513916, |
| "learning_rate": 1.580883938196191e-05, |
| "loss": 0.1277, |
| "step": 58600 |
| }, |
| { |
| "epoch": 0.8429721776062609, |
| "grad_norm": 0.3636401891708374, |
| "learning_rate": 1.573697448796263e-05, |
| "loss": 0.1278, |
| "step": 58650 |
| }, |
| { |
| "epoch": 0.8436908239639814, |
| "grad_norm": 0.2929422855377197, |
| "learning_rate": 1.566510959396335e-05, |
| "loss": 0.125, |
| "step": 58700 |
| }, |
| { |
| "epoch": 0.844409470321702, |
| "grad_norm": 0.4337705969810486, |
| "learning_rate": 1.559324469996407e-05, |
| "loss": 0.1232, |
| "step": 58750 |
| }, |
| { |
| "epoch": 0.8451281166794227, |
| "grad_norm": 0.39441823959350586, |
| "learning_rate": 1.5521379805964788e-05, |
| "loss": 0.1279, |
| "step": 58800 |
| }, |
| { |
| "epoch": 0.8458467630371432, |
| "grad_norm": 0.37334030866622925, |
| "learning_rate": 1.5449514911965507e-05, |
| "loss": 0.127, |
| "step": 58850 |
| }, |
| { |
| "epoch": 0.8465654093948638, |
| "grad_norm": 0.4512324333190918, |
| "learning_rate": 1.5377650017966226e-05, |
| "loss": 0.1247, |
| "step": 58900 |
| }, |
| { |
| "epoch": 0.8472840557525845, |
| "grad_norm": 0.39491838216781616, |
| "learning_rate": 1.5305785123966942e-05, |
| "loss": 0.129, |
| "step": 58950 |
| }, |
| { |
| "epoch": 0.848002702110305, |
| "grad_norm": 0.3801107704639435, |
| "learning_rate": 1.5233920229967661e-05, |
| "loss": 0.1254, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.8487213484680256, |
| "grad_norm": 0.33343058824539185, |
| "learning_rate": 1.516205533596838e-05, |
| "loss": 0.1259, |
| "step": 59050 |
| }, |
| { |
| "epoch": 0.8494399948257462, |
| "grad_norm": 0.3642762303352356, |
| "learning_rate": 1.50901904419691e-05, |
| "loss": 0.1265, |
| "step": 59100 |
| }, |
| { |
| "epoch": 0.8501586411834668, |
| "grad_norm": 0.3644247353076935, |
| "learning_rate": 1.5018325547969817e-05, |
| "loss": 0.1227, |
| "step": 59150 |
| }, |
| { |
| "epoch": 0.8508772875411874, |
| "grad_norm": 0.33389464020729065, |
| "learning_rate": 1.4946460653970537e-05, |
| "loss": 0.1217, |
| "step": 59200 |
| }, |
| { |
| "epoch": 0.851595933898908, |
| "grad_norm": 0.35677823424339294, |
| "learning_rate": 1.4874595759971255e-05, |
| "loss": 0.1273, |
| "step": 59250 |
| }, |
| { |
| "epoch": 0.8523145802566287, |
| "grad_norm": 0.32460057735443115, |
| "learning_rate": 1.4802730865971972e-05, |
| "loss": 0.1223, |
| "step": 59300 |
| }, |
| { |
| "epoch": 0.8530332266143492, |
| "grad_norm": 0.4664474427700043, |
| "learning_rate": 1.4730865971972693e-05, |
| "loss": 0.1258, |
| "step": 59350 |
| }, |
| { |
| "epoch": 0.8537518729720698, |
| "grad_norm": 0.43569889664649963, |
| "learning_rate": 1.465900107797341e-05, |
| "loss": 0.1225, |
| "step": 59400 |
| }, |
| { |
| "epoch": 0.8544705193297905, |
| "grad_norm": 0.41789641976356506, |
| "learning_rate": 1.458713618397413e-05, |
| "loss": 0.1226, |
| "step": 59450 |
| }, |
| { |
| "epoch": 0.855189165687511, |
| "grad_norm": 0.4204149842262268, |
| "learning_rate": 1.4515271289974847e-05, |
| "loss": 0.126, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.8559078120452316, |
| "grad_norm": 0.3769352436065674, |
| "learning_rate": 1.4443406395975568e-05, |
| "loss": 0.1222, |
| "step": 59550 |
| }, |
| { |
| "epoch": 0.8566264584029522, |
| "grad_norm": 0.3953361213207245, |
| "learning_rate": 1.4371541501976285e-05, |
| "loss": 0.1305, |
| "step": 59600 |
| }, |
| { |
| "epoch": 0.8573451047606728, |
| "grad_norm": 0.31381121277809143, |
| "learning_rate": 1.4299676607977003e-05, |
| "loss": 0.1255, |
| "step": 59650 |
| }, |
| { |
| "epoch": 0.8580637511183934, |
| "grad_norm": 0.35245615243911743, |
| "learning_rate": 1.4227811713977723e-05, |
| "loss": 0.1253, |
| "step": 59700 |
| }, |
| { |
| "epoch": 0.858782397476114, |
| "grad_norm": 0.30990514159202576, |
| "learning_rate": 1.415594681997844e-05, |
| "loss": 0.1231, |
| "step": 59750 |
| }, |
| { |
| "epoch": 0.8595010438338346, |
| "grad_norm": 0.3628983795642853, |
| "learning_rate": 1.4084081925979158e-05, |
| "loss": 0.1252, |
| "step": 59800 |
| }, |
| { |
| "epoch": 0.8602196901915552, |
| "grad_norm": 0.4560927152633667, |
| "learning_rate": 1.4012217031979879e-05, |
| "loss": 0.1232, |
| "step": 59850 |
| }, |
| { |
| "epoch": 0.8609383365492758, |
| "grad_norm": 0.3632454574108124, |
| "learning_rate": 1.3940352137980598e-05, |
| "loss": 0.1236, |
| "step": 59900 |
| }, |
| { |
| "epoch": 0.8616569829069963, |
| "grad_norm": 0.4410002529621124, |
| "learning_rate": 1.3868487243981315e-05, |
| "loss": 0.1218, |
| "step": 59950 |
| }, |
| { |
| "epoch": 0.862375629264717, |
| "grad_norm": 0.48087888956069946, |
| "learning_rate": 1.3796622349982036e-05, |
| "loss": 0.1252, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.862375629264717, |
| "eval_loss": 0.12454573065042496, |
| "eval_runtime": 2352.7822, |
| "eval_samples_per_second": 24.902, |
| "eval_steps_per_second": 3.113, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.8630942756224376, |
| "grad_norm": 0.3360944986343384, |
| "learning_rate": 1.3724757455982754e-05, |
| "loss": 0.1235, |
| "step": 60050 |
| }, |
| { |
| "epoch": 0.8638129219801581, |
| "grad_norm": 0.33876070380210876, |
| "learning_rate": 1.3652892561983471e-05, |
| "loss": 0.1245, |
| "step": 60100 |
| }, |
| { |
| "epoch": 0.8645315683378788, |
| "grad_norm": 0.4165988862514496, |
| "learning_rate": 1.3581027667984189e-05, |
| "loss": 0.1274, |
| "step": 60150 |
| }, |
| { |
| "epoch": 0.8652502146955994, |
| "grad_norm": 0.35403144359588623, |
| "learning_rate": 1.350916277398491e-05, |
| "loss": 0.1247, |
| "step": 60200 |
| }, |
| { |
| "epoch": 0.8659688610533199, |
| "grad_norm": 0.3999514579772949, |
| "learning_rate": 1.3438735177865614e-05, |
| "loss": 0.1261, |
| "step": 60250 |
| }, |
| { |
| "epoch": 0.8666875074110406, |
| "grad_norm": 0.33906814455986023, |
| "learning_rate": 1.3366870283866331e-05, |
| "loss": 0.1245, |
| "step": 60300 |
| }, |
| { |
| "epoch": 0.8674061537687612, |
| "grad_norm": 0.3145388662815094, |
| "learning_rate": 1.3295005389867052e-05, |
| "loss": 0.123, |
| "step": 60350 |
| }, |
| { |
| "epoch": 0.8681248001264817, |
| "grad_norm": 0.5379143953323364, |
| "learning_rate": 1.322314049586777e-05, |
| "loss": 0.1266, |
| "step": 60400 |
| }, |
| { |
| "epoch": 0.8688434464842023, |
| "grad_norm": 0.49181005358695984, |
| "learning_rate": 1.3151275601868487e-05, |
| "loss": 0.1233, |
| "step": 60450 |
| }, |
| { |
| "epoch": 0.869562092841923, |
| "grad_norm": 0.4837574064731598, |
| "learning_rate": 1.3079410707869205e-05, |
| "loss": 0.1291, |
| "step": 60500 |
| }, |
| { |
| "epoch": 0.8702807391996435, |
| "grad_norm": 0.37543419003486633, |
| "learning_rate": 1.3007545813869925e-05, |
| "loss": 0.1236, |
| "step": 60550 |
| }, |
| { |
| "epoch": 0.8709993855573641, |
| "grad_norm": 0.3877285122871399, |
| "learning_rate": 1.2935680919870643e-05, |
| "loss": 0.1229, |
| "step": 60600 |
| }, |
| { |
| "epoch": 0.8717180319150848, |
| "grad_norm": 0.34340938925743103, |
| "learning_rate": 1.2863816025871362e-05, |
| "loss": 0.1241, |
| "step": 60650 |
| }, |
| { |
| "epoch": 0.8724366782728054, |
| "grad_norm": 0.33106306195259094, |
| "learning_rate": 1.2791951131872083e-05, |
| "loss": 0.126, |
| "step": 60700 |
| }, |
| { |
| "epoch": 0.8731553246305259, |
| "grad_norm": 0.3516967296600342, |
| "learning_rate": 1.27200862378728e-05, |
| "loss": 0.126, |
| "step": 60750 |
| }, |
| { |
| "epoch": 0.8738739709882465, |
| "grad_norm": 0.3838764429092407, |
| "learning_rate": 1.2648221343873517e-05, |
| "loss": 0.1236, |
| "step": 60800 |
| }, |
| { |
| "epoch": 0.8745926173459672, |
| "grad_norm": 0.5866835117340088, |
| "learning_rate": 1.2576356449874238e-05, |
| "loss": 0.1243, |
| "step": 60850 |
| }, |
| { |
| "epoch": 0.8753112637036877, |
| "grad_norm": 0.42481040954589844, |
| "learning_rate": 1.2504491555874956e-05, |
| "loss": 0.1258, |
| "step": 60900 |
| }, |
| { |
| "epoch": 0.8760299100614083, |
| "grad_norm": 0.33105340600013733, |
| "learning_rate": 1.2432626661875673e-05, |
| "loss": 0.1242, |
| "step": 60950 |
| }, |
| { |
| "epoch": 0.876748556419129, |
| "grad_norm": 0.38015687465667725, |
| "learning_rate": 1.2360761767876392e-05, |
| "loss": 0.1246, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.8774672027768495, |
| "grad_norm": 0.35006070137023926, |
| "learning_rate": 1.2288896873877111e-05, |
| "loss": 0.1229, |
| "step": 61050 |
| }, |
| { |
| "epoch": 0.8781858491345701, |
| "grad_norm": 0.4309733808040619, |
| "learning_rate": 1.221703197987783e-05, |
| "loss": 0.1205, |
| "step": 61100 |
| }, |
| { |
| "epoch": 0.8789044954922908, |
| "grad_norm": 0.3616236448287964, |
| "learning_rate": 1.214516708587855e-05, |
| "loss": 0.1245, |
| "step": 61150 |
| }, |
| { |
| "epoch": 0.8796231418500113, |
| "grad_norm": 0.41629916429519653, |
| "learning_rate": 1.2073302191879267e-05, |
| "loss": 0.1216, |
| "step": 61200 |
| }, |
| { |
| "epoch": 0.8803417882077319, |
| "grad_norm": 0.283905565738678, |
| "learning_rate": 1.2001437297879986e-05, |
| "loss": 0.1221, |
| "step": 61250 |
| }, |
| { |
| "epoch": 0.8810604345654525, |
| "grad_norm": 0.439532995223999, |
| "learning_rate": 1.1929572403880705e-05, |
| "loss": 0.1294, |
| "step": 61300 |
| }, |
| { |
| "epoch": 0.8817790809231731, |
| "grad_norm": 0.41762885451316833, |
| "learning_rate": 1.1857707509881423e-05, |
| "loss": 0.125, |
| "step": 61350 |
| }, |
| { |
| "epoch": 0.8824977272808937, |
| "grad_norm": 0.361398845911026, |
| "learning_rate": 1.1785842615882142e-05, |
| "loss": 0.1196, |
| "step": 61400 |
| }, |
| { |
| "epoch": 0.8832163736386143, |
| "grad_norm": 0.4029219150543213, |
| "learning_rate": 1.171397772188286e-05, |
| "loss": 0.1225, |
| "step": 61450 |
| }, |
| { |
| "epoch": 0.883935019996335, |
| "grad_norm": 0.29444122314453125, |
| "learning_rate": 1.164211282788358e-05, |
| "loss": 0.125, |
| "step": 61500 |
| }, |
| { |
| "epoch": 0.8846536663540555, |
| "grad_norm": 0.3278166353702545, |
| "learning_rate": 1.1570247933884299e-05, |
| "loss": 0.1237, |
| "step": 61550 |
| }, |
| { |
| "epoch": 0.8853723127117761, |
| "grad_norm": 0.41596293449401855, |
| "learning_rate": 1.1498383039885016e-05, |
| "loss": 0.1239, |
| "step": 61600 |
| }, |
| { |
| "epoch": 0.8860909590694966, |
| "grad_norm": 0.3153913617134094, |
| "learning_rate": 1.1426518145885735e-05, |
| "loss": 0.1244, |
| "step": 61650 |
| }, |
| { |
| "epoch": 0.8868096054272173, |
| "grad_norm": 0.37591880559921265, |
| "learning_rate": 1.1354653251886455e-05, |
| "loss": 0.1205, |
| "step": 61700 |
| }, |
| { |
| "epoch": 0.8875282517849379, |
| "grad_norm": 0.28158771991729736, |
| "learning_rate": 1.1282788357887172e-05, |
| "loss": 0.1223, |
| "step": 61750 |
| }, |
| { |
| "epoch": 0.8882468981426584, |
| "grad_norm": 0.33741140365600586, |
| "learning_rate": 1.1210923463887891e-05, |
| "loss": 0.1309, |
| "step": 61800 |
| }, |
| { |
| "epoch": 0.8889655445003791, |
| "grad_norm": 0.4564758539199829, |
| "learning_rate": 1.113905856988861e-05, |
| "loss": 0.1243, |
| "step": 61850 |
| }, |
| { |
| "epoch": 0.8896841908580997, |
| "grad_norm": 0.6229817867279053, |
| "learning_rate": 1.106719367588933e-05, |
| "loss": 0.1264, |
| "step": 61900 |
| }, |
| { |
| "epoch": 0.8904028372158203, |
| "grad_norm": 0.35576000809669495, |
| "learning_rate": 1.0995328781890048e-05, |
| "loss": 0.1251, |
| "step": 61950 |
| }, |
| { |
| "epoch": 0.8911214835735409, |
| "grad_norm": 0.4052492380142212, |
| "learning_rate": 1.0923463887890766e-05, |
| "loss": 0.1219, |
| "step": 62000 |
| }, |
| { |
| "epoch": 0.8918401299312615, |
| "grad_norm": 0.33254197239875793, |
| "learning_rate": 1.0851598993891485e-05, |
| "loss": 0.1237, |
| "step": 62050 |
| }, |
| { |
| "epoch": 0.8925587762889821, |
| "grad_norm": 0.3527114987373352, |
| "learning_rate": 1.0779734099892202e-05, |
| "loss": 0.1215, |
| "step": 62100 |
| }, |
| { |
| "epoch": 0.8932774226467026, |
| "grad_norm": 0.36184269189834595, |
| "learning_rate": 1.0707869205892921e-05, |
| "loss": 0.1244, |
| "step": 62150 |
| }, |
| { |
| "epoch": 0.8939960690044233, |
| "grad_norm": 0.3354300558567047, |
| "learning_rate": 1.063600431189364e-05, |
| "loss": 0.12, |
| "step": 62200 |
| }, |
| { |
| "epoch": 0.8947147153621439, |
| "grad_norm": 0.38519734144210815, |
| "learning_rate": 1.0564139417894358e-05, |
| "loss": 0.1249, |
| "step": 62250 |
| }, |
| { |
| "epoch": 0.8954333617198644, |
| "grad_norm": 0.3049958348274231, |
| "learning_rate": 1.0492274523895079e-05, |
| "loss": 0.126, |
| "step": 62300 |
| }, |
| { |
| "epoch": 0.8961520080775851, |
| "grad_norm": 0.3645496070384979, |
| "learning_rate": 1.0420409629895798e-05, |
| "loss": 0.1267, |
| "step": 62350 |
| }, |
| { |
| "epoch": 0.8968706544353057, |
| "grad_norm": 0.48931238055229187, |
| "learning_rate": 1.0348544735896515e-05, |
| "loss": 0.1216, |
| "step": 62400 |
| }, |
| { |
| "epoch": 0.8975893007930262, |
| "grad_norm": 0.3747062385082245, |
| "learning_rate": 1.0276679841897234e-05, |
| "loss": 0.1231, |
| "step": 62450 |
| }, |
| { |
| "epoch": 0.8983079471507468, |
| "grad_norm": 0.3831491470336914, |
| "learning_rate": 1.0204814947897952e-05, |
| "loss": 0.1234, |
| "step": 62500 |
| }, |
| { |
| "epoch": 0.8990265935084675, |
| "grad_norm": 0.6291891932487488, |
| "learning_rate": 1.013295005389867e-05, |
| "loss": 0.1233, |
| "step": 62550 |
| }, |
| { |
| "epoch": 0.899745239866188, |
| "grad_norm": 0.30106669664382935, |
| "learning_rate": 1.006108515989939e-05, |
| "loss": 0.1221, |
| "step": 62600 |
| }, |
| { |
| "epoch": 0.9004638862239086, |
| "grad_norm": 0.3288785219192505, |
| "learning_rate": 9.989220265900107e-06, |
| "loss": 0.1273, |
| "step": 62650 |
| }, |
| { |
| "epoch": 0.9011825325816293, |
| "grad_norm": 0.33860695362091064, |
| "learning_rate": 9.917355371900826e-06, |
| "loss": 0.1217, |
| "step": 62700 |
| }, |
| { |
| "epoch": 0.9019011789393498, |
| "grad_norm": 0.3534477651119232, |
| "learning_rate": 9.845490477901546e-06, |
| "loss": 0.1217, |
| "step": 62750 |
| }, |
| { |
| "epoch": 0.9026198252970704, |
| "grad_norm": 0.4250911474227905, |
| "learning_rate": 9.773625583902265e-06, |
| "loss": 0.1236, |
| "step": 62800 |
| }, |
| { |
| "epoch": 0.9033384716547911, |
| "grad_norm": 0.3646557033061981, |
| "learning_rate": 9.703197987782968e-06, |
| "loss": 0.1273, |
| "step": 62850 |
| }, |
| { |
| "epoch": 0.9040571180125117, |
| "grad_norm": 0.3592480421066284, |
| "learning_rate": 9.631333093783687e-06, |
| "loss": 0.1233, |
| "step": 62900 |
| }, |
| { |
| "epoch": 0.9047757643702322, |
| "grad_norm": 0.4674370288848877, |
| "learning_rate": 9.559468199784406e-06, |
| "loss": 0.1194, |
| "step": 62950 |
| }, |
| { |
| "epoch": 0.9054944107279528, |
| "grad_norm": 0.6390477418899536, |
| "learning_rate": 9.487603305785123e-06, |
| "loss": 0.1193, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.9054944107279528, |
| "eval_loss": 0.12265664339065552, |
| "eval_runtime": 2338.7731, |
| "eval_samples_per_second": 25.052, |
| "eval_steps_per_second": 3.132, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.9062130570856735, |
| "grad_norm": 0.3408135175704956, |
| "learning_rate": 9.415738411785842e-06, |
| "loss": 0.1212, |
| "step": 63050 |
| }, |
| { |
| "epoch": 0.906931703443394, |
| "grad_norm": 0.3757847249507904, |
| "learning_rate": 9.343873517786562e-06, |
| "loss": 0.1223, |
| "step": 63100 |
| }, |
| { |
| "epoch": 0.9076503498011146, |
| "grad_norm": 0.4292818307876587, |
| "learning_rate": 9.27200862378728e-06, |
| "loss": 0.1239, |
| "step": 63150 |
| }, |
| { |
| "epoch": 0.9083689961588353, |
| "grad_norm": 0.3223564922809601, |
| "learning_rate": 9.200143729788e-06, |
| "loss": 0.1252, |
| "step": 63200 |
| }, |
| { |
| "epoch": 0.9090876425165558, |
| "grad_norm": 0.34768980741500854, |
| "learning_rate": 9.128278835788717e-06, |
| "loss": 0.125, |
| "step": 63250 |
| }, |
| { |
| "epoch": 0.9098062888742764, |
| "grad_norm": 0.38182663917541504, |
| "learning_rate": 9.056413941789436e-06, |
| "loss": 0.1222, |
| "step": 63300 |
| }, |
| { |
| "epoch": 0.910524935231997, |
| "grad_norm": 0.30806124210357666, |
| "learning_rate": 8.984549047790155e-06, |
| "loss": 0.1225, |
| "step": 63350 |
| }, |
| { |
| "epoch": 0.9112435815897176, |
| "grad_norm": 0.39036205410957336, |
| "learning_rate": 8.912684153790873e-06, |
| "loss": 0.124, |
| "step": 63400 |
| }, |
| { |
| "epoch": 0.9119622279474382, |
| "grad_norm": 0.31721031665802, |
| "learning_rate": 8.840819259791592e-06, |
| "loss": 0.1215, |
| "step": 63450 |
| }, |
| { |
| "epoch": 0.9126808743051588, |
| "grad_norm": 0.36282646656036377, |
| "learning_rate": 8.768954365792311e-06, |
| "loss": 0.1245, |
| "step": 63500 |
| }, |
| { |
| "epoch": 0.9133995206628794, |
| "grad_norm": 0.3416596055030823, |
| "learning_rate": 8.69708947179303e-06, |
| "loss": 0.1208, |
| "step": 63550 |
| }, |
| { |
| "epoch": 0.9141181670206, |
| "grad_norm": 0.46525707840919495, |
| "learning_rate": 8.62522457779375e-06, |
| "loss": 0.1254, |
| "step": 63600 |
| }, |
| { |
| "epoch": 0.9148368133783206, |
| "grad_norm": 0.49614182114601135, |
| "learning_rate": 8.553359683794467e-06, |
| "loss": 0.1224, |
| "step": 63650 |
| }, |
| { |
| "epoch": 0.9155554597360412, |
| "grad_norm": 0.36073940992355347, |
| "learning_rate": 8.481494789795186e-06, |
| "loss": 0.123, |
| "step": 63700 |
| }, |
| { |
| "epoch": 0.9162741060937618, |
| "grad_norm": 0.42497745156288147, |
| "learning_rate": 8.409629895795903e-06, |
| "loss": 0.1259, |
| "step": 63750 |
| }, |
| { |
| "epoch": 0.9169927524514824, |
| "grad_norm": 0.34898316860198975, |
| "learning_rate": 8.337765001796622e-06, |
| "loss": 0.1265, |
| "step": 63800 |
| }, |
| { |
| "epoch": 0.9177113988092029, |
| "grad_norm": 0.35650330781936646, |
| "learning_rate": 8.265900107797341e-06, |
| "loss": 0.1182, |
| "step": 63850 |
| }, |
| { |
| "epoch": 0.9184300451669236, |
| "grad_norm": 0.29349714517593384, |
| "learning_rate": 8.19403521379806e-06, |
| "loss": 0.1195, |
| "step": 63900 |
| }, |
| { |
| "epoch": 0.9191486915246442, |
| "grad_norm": 0.34547990560531616, |
| "learning_rate": 8.12217031979878e-06, |
| "loss": 0.1205, |
| "step": 63950 |
| }, |
| { |
| "epoch": 0.9198673378823647, |
| "grad_norm": 0.43164411187171936, |
| "learning_rate": 8.050305425799497e-06, |
| "loss": 0.1225, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.9205859842400854, |
| "grad_norm": 0.4483722746372223, |
| "learning_rate": 7.978440531800216e-06, |
| "loss": 0.1213, |
| "step": 64050 |
| }, |
| { |
| "epoch": 0.921304630597806, |
| "grad_norm": 0.42297491431236267, |
| "learning_rate": 7.906575637800935e-06, |
| "loss": 0.1239, |
| "step": 64100 |
| }, |
| { |
| "epoch": 0.9220232769555265, |
| "grad_norm": 0.3763730227947235, |
| "learning_rate": 7.834710743801653e-06, |
| "loss": 0.1219, |
| "step": 64150 |
| }, |
| { |
| "epoch": 0.9227419233132471, |
| "grad_norm": 0.39699843525886536, |
| "learning_rate": 7.762845849802372e-06, |
| "loss": 0.1187, |
| "step": 64200 |
| }, |
| { |
| "epoch": 0.9234605696709678, |
| "grad_norm": 0.3688933253288269, |
| "learning_rate": 7.69098095580309e-06, |
| "loss": 0.1191, |
| "step": 64250 |
| }, |
| { |
| "epoch": 0.9241792160286884, |
| "grad_norm": 0.411871075630188, |
| "learning_rate": 7.619116061803809e-06, |
| "loss": 0.1248, |
| "step": 64300 |
| }, |
| { |
| "epoch": 0.9248978623864089, |
| "grad_norm": 0.5669124722480774, |
| "learning_rate": 7.547251167804528e-06, |
| "loss": 0.1201, |
| "step": 64350 |
| }, |
| { |
| "epoch": 0.9256165087441296, |
| "grad_norm": 0.3789122700691223, |
| "learning_rate": 7.475386273805246e-06, |
| "loss": 0.1327, |
| "step": 64400 |
| }, |
| { |
| "epoch": 0.9263351551018502, |
| "grad_norm": 0.382330060005188, |
| "learning_rate": 7.4035213798059655e-06, |
| "loss": 0.1217, |
| "step": 64450 |
| }, |
| { |
| "epoch": 0.9270538014595707, |
| "grad_norm": 0.4880569279193878, |
| "learning_rate": 7.331656485806685e-06, |
| "loss": 0.1222, |
| "step": 64500 |
| }, |
| { |
| "epoch": 0.9277724478172914, |
| "grad_norm": 0.5543988943099976, |
| "learning_rate": 7.259791591807402e-06, |
| "loss": 0.1218, |
| "step": 64550 |
| }, |
| { |
| "epoch": 0.928491094175012, |
| "grad_norm": 0.36518344283103943, |
| "learning_rate": 7.187926697808121e-06, |
| "loss": 0.1217, |
| "step": 64600 |
| }, |
| { |
| "epoch": 0.9292097405327325, |
| "grad_norm": 0.39271408319473267, |
| "learning_rate": 7.116061803808839e-06, |
| "loss": 0.1214, |
| "step": 64650 |
| }, |
| { |
| "epoch": 0.9299283868904531, |
| "grad_norm": 0.4329274594783783, |
| "learning_rate": 7.0441969098095585e-06, |
| "loss": 0.1231, |
| "step": 64700 |
| }, |
| { |
| "epoch": 0.9306470332481738, |
| "grad_norm": 0.6806078553199768, |
| "learning_rate": 6.9723320158102776e-06, |
| "loss": 0.1214, |
| "step": 64750 |
| }, |
| { |
| "epoch": 0.9313656796058943, |
| "grad_norm": 0.4004870355129242, |
| "learning_rate": 6.900467121810995e-06, |
| "loss": 0.1263, |
| "step": 64800 |
| }, |
| { |
| "epoch": 0.9320843259636149, |
| "grad_norm": 0.48993903398513794, |
| "learning_rate": 6.828602227811715e-06, |
| "loss": 0.1223, |
| "step": 64850 |
| }, |
| { |
| "epoch": 0.9328029723213356, |
| "grad_norm": 0.5226307511329651, |
| "learning_rate": 6.756737333812432e-06, |
| "loss": 0.1221, |
| "step": 64900 |
| }, |
| { |
| "epoch": 0.9335216186790561, |
| "grad_norm": 0.28276339173316956, |
| "learning_rate": 6.6848724398131514e-06, |
| "loss": 0.1236, |
| "step": 64950 |
| }, |
| { |
| "epoch": 0.9342402650367767, |
| "grad_norm": 0.37379029393196106, |
| "learning_rate": 6.6130075458138705e-06, |
| "loss": 0.1204, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.9349589113944973, |
| "grad_norm": 0.3810805678367615, |
| "learning_rate": 6.541142651814589e-06, |
| "loss": 0.1179, |
| "step": 65050 |
| }, |
| { |
| "epoch": 0.935677557752218, |
| "grad_norm": 0.3675697147846222, |
| "learning_rate": 6.469277757815308e-06, |
| "loss": 0.1218, |
| "step": 65100 |
| }, |
| { |
| "epoch": 0.9363962041099385, |
| "grad_norm": 0.41229313611984253, |
| "learning_rate": 6.397412863816025e-06, |
| "loss": 0.1244, |
| "step": 65150 |
| }, |
| { |
| "epoch": 0.9371148504676591, |
| "grad_norm": 0.4059778153896332, |
| "learning_rate": 6.325547969816744e-06, |
| "loss": 0.1204, |
| "step": 65200 |
| }, |
| { |
| "epoch": 0.9378334968253798, |
| "grad_norm": 0.38567858934402466, |
| "learning_rate": 6.2536830758174635e-06, |
| "loss": 0.1173, |
| "step": 65250 |
| }, |
| { |
| "epoch": 0.9385521431831003, |
| "grad_norm": 0.36514681577682495, |
| "learning_rate": 6.181818181818183e-06, |
| "loss": 0.1188, |
| "step": 65300 |
| }, |
| { |
| "epoch": 0.9392707895408209, |
| "grad_norm": 0.37921783328056335, |
| "learning_rate": 6.109953287818901e-06, |
| "loss": 0.1234, |
| "step": 65350 |
| }, |
| { |
| "epoch": 0.9399894358985416, |
| "grad_norm": 0.41887423396110535, |
| "learning_rate": 6.038088393819619e-06, |
| "loss": 0.1169, |
| "step": 65400 |
| }, |
| { |
| "epoch": 0.9407080822562621, |
| "grad_norm": 0.41464152932167053, |
| "learning_rate": 5.966223499820338e-06, |
| "loss": 0.1208, |
| "step": 65450 |
| }, |
| { |
| "epoch": 0.9414267286139827, |
| "grad_norm": 0.3517071604728699, |
| "learning_rate": 5.894358605821057e-06, |
| "loss": 0.1222, |
| "step": 65500 |
| }, |
| { |
| "epoch": 0.9421453749717033, |
| "grad_norm": 0.39751720428466797, |
| "learning_rate": 5.822493711821776e-06, |
| "loss": 0.1201, |
| "step": 65550 |
| }, |
| { |
| "epoch": 0.9428640213294239, |
| "grad_norm": 0.42730578780174255, |
| "learning_rate": 5.750628817822494e-06, |
| "loss": 0.1197, |
| "step": 65600 |
| }, |
| { |
| "epoch": 0.9435826676871445, |
| "grad_norm": 0.4353543519973755, |
| "learning_rate": 5.678763923823212e-06, |
| "loss": 0.1231, |
| "step": 65650 |
| }, |
| { |
| "epoch": 0.9443013140448651, |
| "grad_norm": 0.4269670248031616, |
| "learning_rate": 5.606899029823931e-06, |
| "loss": 0.1229, |
| "step": 65700 |
| }, |
| { |
| "epoch": 0.9450199604025857, |
| "grad_norm": 0.4464121460914612, |
| "learning_rate": 5.53503413582465e-06, |
| "loss": 0.118, |
| "step": 65750 |
| }, |
| { |
| "epoch": 0.9457386067603063, |
| "grad_norm": 0.44756245613098145, |
| "learning_rate": 5.4631692418253686e-06, |
| "loss": 0.1225, |
| "step": 65800 |
| }, |
| { |
| "epoch": 0.9464572531180269, |
| "grad_norm": 0.3775683641433716, |
| "learning_rate": 5.391304347826087e-06, |
| "loss": 0.1232, |
| "step": 65850 |
| }, |
| { |
| "epoch": 0.9471758994757474, |
| "grad_norm": 0.5156663060188293, |
| "learning_rate": 5.319439453826806e-06, |
| "loss": 0.1183, |
| "step": 65900 |
| }, |
| { |
| "epoch": 0.9478945458334681, |
| "grad_norm": 0.44453561305999756, |
| "learning_rate": 5.247574559827524e-06, |
| "loss": 0.1243, |
| "step": 65950 |
| }, |
| { |
| "epoch": 0.9486131921911887, |
| "grad_norm": 0.4175598621368408, |
| "learning_rate": 5.175709665828243e-06, |
| "loss": 0.124, |
| "step": 66000 |
| }, |
| { |
| "epoch": 0.9486131921911887, |
| "eval_loss": 0.12094888836145401, |
| "eval_runtime": 2341.6257, |
| "eval_samples_per_second": 25.021, |
| "eval_steps_per_second": 3.128, |
| "step": 66000 |
| }, |
| { |
| "epoch": 0.9493318385489092, |
| "grad_norm": 0.37046581506729126, |
| "learning_rate": 5.1038447718289615e-06, |
| "loss": 0.1221, |
| "step": 66050 |
| }, |
| { |
| "epoch": 0.9500504849066299, |
| "grad_norm": 0.45041966438293457, |
| "learning_rate": 5.031979877829681e-06, |
| "loss": 0.1188, |
| "step": 66100 |
| }, |
| { |
| "epoch": 0.9507691312643505, |
| "grad_norm": 0.3930753469467163, |
| "learning_rate": 4.961552281710385e-06, |
| "loss": 0.1238, |
| "step": 66150 |
| }, |
| { |
| "epoch": 0.951487777622071, |
| "grad_norm": 0.3932070732116699, |
| "learning_rate": 4.889687387711103e-06, |
| "loss": 0.1223, |
| "step": 66200 |
| }, |
| { |
| "epoch": 0.9522064239797917, |
| "grad_norm": 0.40663453936576843, |
| "learning_rate": 4.817822493711822e-06, |
| "loss": 0.1189, |
| "step": 66250 |
| }, |
| { |
| "epoch": 0.9529250703375123, |
| "grad_norm": 0.5449784994125366, |
| "learning_rate": 4.745957599712541e-06, |
| "loss": 0.1217, |
| "step": 66300 |
| }, |
| { |
| "epoch": 0.9536437166952328, |
| "grad_norm": 0.4170607924461365, |
| "learning_rate": 4.674092705713259e-06, |
| "loss": 0.1203, |
| "step": 66350 |
| }, |
| { |
| "epoch": 0.9543623630529534, |
| "grad_norm": 0.4866325557231903, |
| "learning_rate": 4.6022278117139776e-06, |
| "loss": 0.1254, |
| "step": 66400 |
| }, |
| { |
| "epoch": 0.9550810094106741, |
| "grad_norm": 0.3833375573158264, |
| "learning_rate": 4.530362917714697e-06, |
| "loss": 0.1207, |
| "step": 66450 |
| }, |
| { |
| "epoch": 0.9557996557683947, |
| "grad_norm": 0.40516358613967896, |
| "learning_rate": 4.458498023715415e-06, |
| "loss": 0.1232, |
| "step": 66500 |
| }, |
| { |
| "epoch": 0.9565183021261152, |
| "grad_norm": 0.29924067854881287, |
| "learning_rate": 4.386633129716134e-06, |
| "loss": 0.1199, |
| "step": 66550 |
| }, |
| { |
| "epoch": 0.9572369484838359, |
| "grad_norm": 0.4255986213684082, |
| "learning_rate": 4.314768235716852e-06, |
| "loss": 0.1247, |
| "step": 66600 |
| }, |
| { |
| "epoch": 0.9579555948415565, |
| "grad_norm": 0.582648515701294, |
| "learning_rate": 4.242903341717571e-06, |
| "loss": 0.1179, |
| "step": 66650 |
| }, |
| { |
| "epoch": 0.958674241199277, |
| "grad_norm": 0.3907829821109772, |
| "learning_rate": 4.17103844771829e-06, |
| "loss": 0.1187, |
| "step": 66700 |
| }, |
| { |
| "epoch": 0.9593928875569976, |
| "grad_norm": 0.39846646785736084, |
| "learning_rate": 4.099173553719009e-06, |
| "loss": 0.1197, |
| "step": 66750 |
| }, |
| { |
| "epoch": 0.9601115339147183, |
| "grad_norm": 0.35272663831710815, |
| "learning_rate": 4.027308659719727e-06, |
| "loss": 0.1204, |
| "step": 66800 |
| }, |
| { |
| "epoch": 0.9608301802724388, |
| "grad_norm": 0.4485180974006653, |
| "learning_rate": 3.955443765720446e-06, |
| "loss": 0.1216, |
| "step": 66850 |
| }, |
| { |
| "epoch": 0.9615488266301594, |
| "grad_norm": 0.5025599002838135, |
| "learning_rate": 3.883578871721164e-06, |
| "loss": 0.1154, |
| "step": 66900 |
| }, |
| { |
| "epoch": 0.9622674729878801, |
| "grad_norm": 0.49099233746528625, |
| "learning_rate": 3.8117139777218826e-06, |
| "loss": 0.1175, |
| "step": 66950 |
| }, |
| { |
| "epoch": 0.9629861193456006, |
| "grad_norm": 0.33758753538131714, |
| "learning_rate": 3.7398490837226017e-06, |
| "loss": 0.1197, |
| "step": 67000 |
| }, |
| { |
| "epoch": 0.9637047657033212, |
| "grad_norm": 0.4802404046058655, |
| "learning_rate": 3.6679841897233204e-06, |
| "loss": 0.1185, |
| "step": 67050 |
| }, |
| { |
| "epoch": 0.9644234120610419, |
| "grad_norm": 0.3601958751678467, |
| "learning_rate": 3.596119295724039e-06, |
| "loss": 0.1195, |
| "step": 67100 |
| }, |
| { |
| "epoch": 0.9651420584187624, |
| "grad_norm": 0.3577285408973694, |
| "learning_rate": 3.5242544017247573e-06, |
| "loss": 0.1201, |
| "step": 67150 |
| }, |
| { |
| "epoch": 0.965860704776483, |
| "grad_norm": 0.4318629503250122, |
| "learning_rate": 3.4523895077254764e-06, |
| "loss": 0.1167, |
| "step": 67200 |
| }, |
| { |
| "epoch": 0.9665793511342036, |
| "grad_norm": 0.48125141859054565, |
| "learning_rate": 3.380524613726195e-06, |
| "loss": 0.1214, |
| "step": 67250 |
| }, |
| { |
| "epoch": 0.9672979974919242, |
| "grad_norm": 0.3523324728012085, |
| "learning_rate": 3.3086597197269134e-06, |
| "loss": 0.1225, |
| "step": 67300 |
| }, |
| { |
| "epoch": 0.9680166438496448, |
| "grad_norm": 0.4431188404560089, |
| "learning_rate": 3.236794825727632e-06, |
| "loss": 0.1196, |
| "step": 67350 |
| }, |
| { |
| "epoch": 0.9687352902073654, |
| "grad_norm": 0.42814207077026367, |
| "learning_rate": 3.1649299317283507e-06, |
| "loss": 0.1197, |
| "step": 67400 |
| }, |
| { |
| "epoch": 0.969453936565086, |
| "grad_norm": 0.37214395403862, |
| "learning_rate": 3.0930650377290694e-06, |
| "loss": 0.1218, |
| "step": 67450 |
| }, |
| { |
| "epoch": 0.9701725829228066, |
| "grad_norm": 0.45836591720581055, |
| "learning_rate": 3.021200143729788e-06, |
| "loss": 0.1253, |
| "step": 67500 |
| }, |
| { |
| "epoch": 0.9708912292805272, |
| "grad_norm": 0.3980534076690674, |
| "learning_rate": 2.9493352497305068e-06, |
| "loss": 0.1177, |
| "step": 67550 |
| }, |
| { |
| "epoch": 0.9716098756382477, |
| "grad_norm": 0.4024925231933594, |
| "learning_rate": 2.8774703557312255e-06, |
| "loss": 0.1191, |
| "step": 67600 |
| }, |
| { |
| "epoch": 0.9723285219959684, |
| "grad_norm": 0.3470667600631714, |
| "learning_rate": 2.805605461731944e-06, |
| "loss": 0.116, |
| "step": 67650 |
| }, |
| { |
| "epoch": 0.973047168353689, |
| "grad_norm": 0.3723811209201813, |
| "learning_rate": 2.733740567732663e-06, |
| "loss": 0.1214, |
| "step": 67700 |
| }, |
| { |
| "epoch": 0.9737658147114095, |
| "grad_norm": 0.3014863431453705, |
| "learning_rate": 2.6618756737333815e-06, |
| "loss": 0.1212, |
| "step": 67750 |
| }, |
| { |
| "epoch": 0.9744844610691302, |
| "grad_norm": 0.48357853293418884, |
| "learning_rate": 2.5900107797340997e-06, |
| "loss": 0.1244, |
| "step": 67800 |
| }, |
| { |
| "epoch": 0.9752031074268508, |
| "grad_norm": 0.5432282090187073, |
| "learning_rate": 2.518145885734819e-06, |
| "loss": 0.121, |
| "step": 67850 |
| }, |
| { |
| "epoch": 0.9759217537845714, |
| "grad_norm": 0.3833717703819275, |
| "learning_rate": 2.446280991735537e-06, |
| "loss": 0.118, |
| "step": 67900 |
| }, |
| { |
| "epoch": 0.976640400142292, |
| "grad_norm": 0.4205469787120819, |
| "learning_rate": 2.374416097736256e-06, |
| "loss": 0.1228, |
| "step": 67950 |
| }, |
| { |
| "epoch": 0.9773590465000126, |
| "grad_norm": 0.45980021357536316, |
| "learning_rate": 2.3025512037369745e-06, |
| "loss": 0.121, |
| "step": 68000 |
| }, |
| { |
| "epoch": 0.9780776928577332, |
| "grad_norm": 0.3673114478588104, |
| "learning_rate": 2.2306863097376936e-06, |
| "loss": 0.1205, |
| "step": 68050 |
| }, |
| { |
| "epoch": 0.9787963392154537, |
| "grad_norm": 0.42491433024406433, |
| "learning_rate": 2.158821415738412e-06, |
| "loss": 0.1198, |
| "step": 68100 |
| }, |
| { |
| "epoch": 0.9795149855731744, |
| "grad_norm": 0.2906801998615265, |
| "learning_rate": 2.0869565217391305e-06, |
| "loss": 0.122, |
| "step": 68150 |
| }, |
| { |
| "epoch": 0.980233631930895, |
| "grad_norm": 0.45080652832984924, |
| "learning_rate": 2.015091627739849e-06, |
| "loss": 0.1167, |
| "step": 68200 |
| }, |
| { |
| "epoch": 0.9809522782886155, |
| "grad_norm": 0.3137567937374115, |
| "learning_rate": 1.943226733740568e-06, |
| "loss": 0.1206, |
| "step": 68250 |
| }, |
| { |
| "epoch": 0.9816709246463362, |
| "grad_norm": 0.38510262966156006, |
| "learning_rate": 1.8713618397412865e-06, |
| "loss": 0.1197, |
| "step": 68300 |
| }, |
| { |
| "epoch": 0.9823895710040568, |
| "grad_norm": 0.32521912455558777, |
| "learning_rate": 1.799496945742005e-06, |
| "loss": 0.1213, |
| "step": 68350 |
| }, |
| { |
| "epoch": 0.9831082173617773, |
| "grad_norm": 0.4238761067390442, |
| "learning_rate": 1.727632051742724e-06, |
| "loss": 0.1216, |
| "step": 68400 |
| }, |
| { |
| "epoch": 0.9838268637194979, |
| "grad_norm": 0.39427056908607483, |
| "learning_rate": 1.6557671577434424e-06, |
| "loss": 0.122, |
| "step": 68450 |
| }, |
| { |
| "epoch": 0.9845455100772186, |
| "grad_norm": 0.343313604593277, |
| "learning_rate": 1.5839022637441613e-06, |
| "loss": 0.12, |
| "step": 68500 |
| }, |
| { |
| "epoch": 0.9852641564349391, |
| "grad_norm": 0.3430338203907013, |
| "learning_rate": 1.5120373697448797e-06, |
| "loss": 0.1252, |
| "step": 68550 |
| }, |
| { |
| "epoch": 0.9859828027926597, |
| "grad_norm": 0.3622065484523773, |
| "learning_rate": 1.4401724757455984e-06, |
| "loss": 0.1252, |
| "step": 68600 |
| }, |
| { |
| "epoch": 0.9867014491503804, |
| "grad_norm": 0.42893192172050476, |
| "learning_rate": 1.368307581746317e-06, |
| "loss": 0.1233, |
| "step": 68650 |
| }, |
| { |
| "epoch": 0.987420095508101, |
| "grad_norm": 0.3050183951854706, |
| "learning_rate": 1.2964426877470358e-06, |
| "loss": 0.1172, |
| "step": 68700 |
| }, |
| { |
| "epoch": 0.9881387418658215, |
| "grad_norm": 0.5666402578353882, |
| "learning_rate": 1.2245777937477545e-06, |
| "loss": 0.1238, |
| "step": 68750 |
| }, |
| { |
| "epoch": 0.9888573882235422, |
| "grad_norm": 0.35554978251457214, |
| "learning_rate": 1.152712899748473e-06, |
| "loss": 0.1194, |
| "step": 68800 |
| }, |
| { |
| "epoch": 0.9895760345812628, |
| "grad_norm": 0.4939674139022827, |
| "learning_rate": 1.0808480057491916e-06, |
| "loss": 0.1187, |
| "step": 68850 |
| }, |
| { |
| "epoch": 0.9902946809389833, |
| "grad_norm": 0.3537197709083557, |
| "learning_rate": 1.00898311174991e-06, |
| "loss": 0.119, |
| "step": 68900 |
| }, |
| { |
| "epoch": 0.9910133272967039, |
| "grad_norm": 0.5450920462608337, |
| "learning_rate": 9.371182177506289e-07, |
| "loss": 0.1203, |
| "step": 68950 |
| }, |
| { |
| "epoch": 0.9917319736544246, |
| "grad_norm": 0.33558523654937744, |
| "learning_rate": 8.652533237513475e-07, |
| "loss": 0.1225, |
| "step": 69000 |
| }, |
| { |
| "epoch": 0.9917319736544246, |
| "eval_loss": 0.11988582462072372, |
| "eval_runtime": 2351.3637, |
| "eval_samples_per_second": 24.917, |
| "eval_steps_per_second": 3.115, |
| "step": 69000 |
| }, |
| { |
| "epoch": 0.9924506200121451, |
| "grad_norm": 0.4475248456001282, |
| "learning_rate": 7.933884297520662e-07, |
| "loss": 0.119, |
| "step": 69050 |
| }, |
| { |
| "epoch": 0.9931692663698657, |
| "grad_norm": 0.34947699308395386, |
| "learning_rate": 7.215235357527848e-07, |
| "loss": 0.1205, |
| "step": 69100 |
| }, |
| { |
| "epoch": 0.9938879127275864, |
| "grad_norm": 0.4064067304134369, |
| "learning_rate": 6.496586417535035e-07, |
| "loss": 0.1215, |
| "step": 69150 |
| }, |
| { |
| "epoch": 0.9946065590853069, |
| "grad_norm": 0.5461844205856323, |
| "learning_rate": 5.77793747754222e-07, |
| "loss": 0.1214, |
| "step": 69200 |
| }, |
| { |
| "epoch": 0.9953252054430275, |
| "grad_norm": 0.4855654835700989, |
| "learning_rate": 5.059288537549407e-07, |
| "loss": 0.122, |
| "step": 69250 |
| }, |
| { |
| "epoch": 0.9960438518007481, |
| "grad_norm": 0.40837883949279785, |
| "learning_rate": 4.340639597556594e-07, |
| "loss": 0.1176, |
| "step": 69300 |
| }, |
| { |
| "epoch": 0.9967624981584687, |
| "grad_norm": 0.41178905963897705, |
| "learning_rate": 3.62199065756378e-07, |
| "loss": 0.1223, |
| "step": 69350 |
| }, |
| { |
| "epoch": 0.9974811445161893, |
| "grad_norm": 0.32353463768959045, |
| "learning_rate": 2.9033417175709665e-07, |
| "loss": 0.1182, |
| "step": 69400 |
| }, |
| { |
| "epoch": 0.9981997908739099, |
| "grad_norm": 0.40918976068496704, |
| "learning_rate": 2.1846927775781533e-07, |
| "loss": 0.1186, |
| "step": 69450 |
| }, |
| { |
| "epoch": 0.9989184372316305, |
| "grad_norm": 0.3727043569087982, |
| "learning_rate": 1.4660438375853396e-07, |
| "loss": 0.1196, |
| "step": 69500 |
| }, |
| { |
| "epoch": 0.9996370835893511, |
| "grad_norm": 0.30194342136383057, |
| "learning_rate": 7.473948975925261e-08, |
| "loss": 0.1217, |
| "step": 69550 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 69575, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 3000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0846345485833994e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |