diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.8881110722202776, + "epoch": 3.108388752770972, "eval_steps": 3806, - "global_step": 7612, + "global_step": 26642, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -53345,6 +53345,133324 @@ "eval_test_samples_per_second": 12.332, "eval_test_steps_per_second": 0.771, "step": 7612 + }, + { + "epoch": 0.8882277447205693, + "grad_norm": 1.08250892162323, + "learning_rate": 0.00028672756941133364, + "loss": 1.9172, + "step": 7613 + }, + { + "epoch": 0.888344417220861, + "grad_norm": 1.1378583908081055, + "learning_rate": 0.0002867213205765848, + "loss": 2.2683, + "step": 7614 + }, + { + "epoch": 0.8884610897211527, + "grad_norm": 1.0862494707107544, + "learning_rate": 0.0002867150703400109, + "loss": 2.1381, + "step": 7615 + }, + { + "epoch": 0.8885777622214444, + "grad_norm": 1.0403109788894653, + "learning_rate": 0.0002867088187016767, + "loss": 2.0912, + "step": 7616 + }, + { + "epoch": 0.888694434721736, + "grad_norm": 1.0656440258026123, + "learning_rate": 0.0002867025656616471, + "loss": 2.1179, + "step": 7617 + }, + { + "epoch": 0.8888111072220277, + "grad_norm": 1.617745280265808, + "learning_rate": 0.0002866963112199868, + "loss": 2.2645, + "step": 7618 + }, + { + "epoch": 0.8889277797223194, + "grad_norm": 1.3309426307678223, + "learning_rate": 0.00028669005537676083, + "loss": 2.3184, + "step": 7619 + }, + { + "epoch": 0.8890444522226111, + "grad_norm": 1.1338757276535034, + "learning_rate": 0.00028668379813203386, + "loss": 2.0862, + "step": 7620 + }, + { + "epoch": 0.8891611247229028, + "grad_norm": 0.9337488412857056, + "learning_rate": 0.00028667753948587085, + "loss": 2.0542, + "step": 7621 + }, + { + "epoch": 0.8892777972231944, + "grad_norm": 1.1324946880340576, + "learning_rate": 0.00028667127943833673, + "loss": 2.0001, + "step": 7622 + }, + { + "epoch": 0.8893944697234861, + "grad_norm": 0.9813324213027954, + "learning_rate": 0.0002866650179894962, + "loss": 2.1155, + "step": 7623 + }, + { + "epoch": 0.8895111422237778, + "grad_norm": 1.1691035032272339, + "learning_rate": 0.0002866587551394144, + "loss": 2.0888, + "step": 7624 + }, + { + "epoch": 0.8896278147240695, + "grad_norm": 1.0807366371154785, + "learning_rate": 0.00028665249088815617, + "loss": 2.1184, + "step": 7625 + }, + { + "epoch": 0.8897444872243612, + "grad_norm": 1.0275025367736816, + "learning_rate": 0.0002866462252357864, + "loss": 2.0111, + "step": 7626 + }, + { + "epoch": 0.8898611597246529, + "grad_norm": 1.1816335916519165, + "learning_rate": 0.00028663995818237003, + "loss": 2.3064, + "step": 7627 + }, + { + "epoch": 0.8899778322249445, + "grad_norm": 1.0611976385116577, + "learning_rate": 0.0002866336897279721, + "loss": 2.0971, + "step": 7628 + }, + { + "epoch": 0.8900945047252362, + "grad_norm": 1.030343770980835, + "learning_rate": 0.00028662741987265766, + "loss": 2.0998, + "step": 7629 + }, + { + "epoch": 0.8902111772255279, + "grad_norm": 1.042432188987732, + "learning_rate": 0.0002866211486164916, + "loss": 2.0548, + "step": 7630 + }, + { + "epoch": 0.8903278497258196, + "grad_norm": 1.1357702016830444, + "learning_rate": 0.00028661487595953893, + "loss": 2.236, + "step": 7631 + }, + { + "epoch": 0.8904445222261113, + "grad_norm": 1.2299506664276123, + "learning_rate": 0.00028660860190186477, + "loss": 2.2724, + "step": 7632 + }, + { + "epoch": 0.8905611947264029, + "grad_norm": 1.1576910018920898, + "learning_rate": 0.00028660232644353405, + "loss": 2.0453, + "step": 7633 + }, + { + "epoch": 0.8906778672266946, + "grad_norm": 1.1590412855148315, + "learning_rate": 0.0002865960495846119, + "loss": 2.1664, + "step": 7634 + }, + { + "epoch": 0.8907945397269863, + "grad_norm": 1.0263004302978516, + "learning_rate": 0.00028658977132516337, + "loss": 1.9667, + "step": 7635 + }, + { + "epoch": 0.890911212227278, + "grad_norm": 1.1873406171798706, + "learning_rate": 0.0002865834916652535, + "loss": 2.1737, + "step": 7636 + }, + { + "epoch": 0.8910278847275697, + "grad_norm": 1.1991950273513794, + "learning_rate": 0.0002865772106049475, + "loss": 2.1354, + "step": 7637 + }, + { + "epoch": 0.8911445572278613, + "grad_norm": 1.1854585409164429, + "learning_rate": 0.0002865709281443104, + "loss": 2.3185, + "step": 7638 + }, + { + "epoch": 0.891261229728153, + "grad_norm": 1.2650865316390991, + "learning_rate": 0.0002865646442834073, + "loss": 2.1125, + "step": 7639 + }, + { + "epoch": 0.8913779022284447, + "grad_norm": 1.215883493423462, + "learning_rate": 0.0002865583590223035, + "loss": 2.0998, + "step": 7640 + }, + { + "epoch": 0.8914945747287364, + "grad_norm": 1.0300604104995728, + "learning_rate": 0.000286552072361064, + "loss": 1.9049, + "step": 7641 + }, + { + "epoch": 0.8916112472290281, + "grad_norm": 1.1543309688568115, + "learning_rate": 0.000286545784299754, + "loss": 2.1492, + "step": 7642 + }, + { + "epoch": 0.8917279197293198, + "grad_norm": 1.0205514430999756, + "learning_rate": 0.00028653949483843877, + "loss": 2.092, + "step": 7643 + }, + { + "epoch": 0.8918445922296114, + "grad_norm": 1.1057915687561035, + "learning_rate": 0.0002865332039771834, + "loss": 2.0687, + "step": 7644 + }, + { + "epoch": 0.8919612647299031, + "grad_norm": 1.1866565942764282, + "learning_rate": 0.00028652691171605324, + "loss": 2.1003, + "step": 7645 + }, + { + "epoch": 0.8920779372301948, + "grad_norm": 1.2364860773086548, + "learning_rate": 0.0002865206180551134, + "loss": 2.229, + "step": 7646 + }, + { + "epoch": 0.8921946097304865, + "grad_norm": 1.1664913892745972, + "learning_rate": 0.00028651432299442914, + "loss": 2.1672, + "step": 7647 + }, + { + "epoch": 0.8923112822307782, + "grad_norm": 1.0692989826202393, + "learning_rate": 0.0002865080265340658, + "loss": 2.0868, + "step": 7648 + }, + { + "epoch": 0.8924279547310698, + "grad_norm": 1.0119680166244507, + "learning_rate": 0.0002865017286740886, + "loss": 1.9664, + "step": 7649 + }, + { + "epoch": 0.8925446272313615, + "grad_norm": 1.1600960493087769, + "learning_rate": 0.00028649542941456283, + "loss": 2.1485, + "step": 7650 + }, + { + "epoch": 0.8926612997316532, + "grad_norm": 1.22147536277771, + "learning_rate": 0.00028648912875555376, + "loss": 2.1145, + "step": 7651 + }, + { + "epoch": 0.8927779722319449, + "grad_norm": 1.1962738037109375, + "learning_rate": 0.0002864828266971268, + "loss": 2.2462, + "step": 7652 + }, + { + "epoch": 0.8928946447322366, + "grad_norm": 1.2948722839355469, + "learning_rate": 0.00028647652323934717, + "loss": 2.4294, + "step": 7653 + }, + { + "epoch": 0.8930113172325282, + "grad_norm": 1.268310546875, + "learning_rate": 0.0002864702183822804, + "loss": 2.2659, + "step": 7654 + }, + { + "epoch": 0.8931279897328199, + "grad_norm": 1.0589717626571655, + "learning_rate": 0.00028646391212599164, + "loss": 1.8144, + "step": 7655 + }, + { + "epoch": 0.8932446622331116, + "grad_norm": 1.1787922382354736, + "learning_rate": 0.00028645760447054637, + "loss": 2.1633, + "step": 7656 + }, + { + "epoch": 0.8933613347334033, + "grad_norm": 1.0891555547714233, + "learning_rate": 0.00028645129541600997, + "loss": 2.0988, + "step": 7657 + }, + { + "epoch": 0.893478007233695, + "grad_norm": 1.0574315786361694, + "learning_rate": 0.0002864449849624478, + "loss": 2.0956, + "step": 7658 + }, + { + "epoch": 0.8935946797339867, + "grad_norm": 1.102419137954712, + "learning_rate": 0.00028643867310992535, + "loss": 2.2295, + "step": 7659 + }, + { + "epoch": 0.8937113522342783, + "grad_norm": 1.1031811237335205, + "learning_rate": 0.0002864323598585081, + "loss": 2.1465, + "step": 7660 + }, + { + "epoch": 0.89382802473457, + "grad_norm": 1.2879818677902222, + "learning_rate": 0.0002864260452082613, + "loss": 2.1426, + "step": 7661 + }, + { + "epoch": 0.8939446972348617, + "grad_norm": 0.9326125979423523, + "learning_rate": 0.0002864197291592506, + "loss": 2.1553, + "step": 7662 + }, + { + "epoch": 0.8940613697351534, + "grad_norm": 1.3141180276870728, + "learning_rate": 0.00028641341171154144, + "loss": 2.0425, + "step": 7663 + }, + { + "epoch": 0.8941780422354451, + "grad_norm": 1.440615177154541, + "learning_rate": 0.0002864070928651992, + "loss": 2.0678, + "step": 7664 + }, + { + "epoch": 0.8942947147357367, + "grad_norm": 1.126431941986084, + "learning_rate": 0.0002864007726202896, + "loss": 2.0122, + "step": 7665 + }, + { + "epoch": 0.8944113872360284, + "grad_norm": 1.125025987625122, + "learning_rate": 0.00028639445097687795, + "loss": 2.1396, + "step": 7666 + }, + { + "epoch": 0.8945280597363201, + "grad_norm": 1.3569656610488892, + "learning_rate": 0.0002863881279350299, + "loss": 2.1112, + "step": 7667 + }, + { + "epoch": 0.8946447322366118, + "grad_norm": 1.400065541267395, + "learning_rate": 0.000286381803494811, + "loss": 2.1839, + "step": 7668 + }, + { + "epoch": 0.8947614047369035, + "grad_norm": 1.1277515888214111, + "learning_rate": 0.0002863754776562867, + "loss": 2.1539, + "step": 7669 + }, + { + "epoch": 0.8948780772371951, + "grad_norm": 1.0258126258850098, + "learning_rate": 0.0002863691504195227, + "loss": 2.0974, + "step": 7670 + }, + { + "epoch": 0.8949947497374868, + "grad_norm": 1.0710912942886353, + "learning_rate": 0.00028636282178458463, + "loss": 2.0746, + "step": 7671 + }, + { + "epoch": 0.8951114222377785, + "grad_norm": 1.4061764478683472, + "learning_rate": 0.00028635649175153796, + "loss": 2.0758, + "step": 7672 + }, + { + "epoch": 0.8952280947380702, + "grad_norm": 1.299557089805603, + "learning_rate": 0.00028635016032044837, + "loss": 2.2677, + "step": 7673 + }, + { + "epoch": 0.8953447672383619, + "grad_norm": 1.127527117729187, + "learning_rate": 0.00028634382749138156, + "loss": 2.0538, + "step": 7674 + }, + { + "epoch": 0.8954614397386536, + "grad_norm": 1.3079715967178345, + "learning_rate": 0.0002863374932644031, + "loss": 2.1402, + "step": 7675 + }, + { + "epoch": 0.8955781122389452, + "grad_norm": 1.086977243423462, + "learning_rate": 0.0002863311576395787, + "loss": 1.9497, + "step": 7676 + }, + { + "epoch": 0.8956947847392369, + "grad_norm": 1.0714901685714722, + "learning_rate": 0.000286324820616974, + "loss": 2.1334, + "step": 7677 + }, + { + "epoch": 0.8958114572395286, + "grad_norm": 1.062922477722168, + "learning_rate": 0.00028631848219665476, + "loss": 2.0645, + "step": 7678 + }, + { + "epoch": 0.8959281297398203, + "grad_norm": 1.1949241161346436, + "learning_rate": 0.00028631214237868663, + "loss": 2.1911, + "step": 7679 + }, + { + "epoch": 0.896044802240112, + "grad_norm": 1.071686029434204, + "learning_rate": 0.00028630580116313544, + "loss": 2.0933, + "step": 7680 + }, + { + "epoch": 0.8961614747404036, + "grad_norm": 1.1421473026275635, + "learning_rate": 0.00028629945855006674, + "loss": 2.0221, + "step": 7681 + }, + { + "epoch": 0.8962781472406953, + "grad_norm": 1.062211036682129, + "learning_rate": 0.0002862931145395464, + "loss": 2.1285, + "step": 7682 + }, + { + "epoch": 0.896394819740987, + "grad_norm": 1.0244603157043457, + "learning_rate": 0.0002862867691316402, + "loss": 2.1247, + "step": 7683 + }, + { + "epoch": 0.8965114922412787, + "grad_norm": 1.1377800703048706, + "learning_rate": 0.0002862804223264139, + "loss": 1.983, + "step": 7684 + }, + { + "epoch": 0.8966281647415704, + "grad_norm": 1.241559386253357, + "learning_rate": 0.00028627407412393327, + "loss": 2.0537, + "step": 7685 + }, + { + "epoch": 0.896744837241862, + "grad_norm": 1.0168758630752563, + "learning_rate": 0.00028626772452426414, + "loss": 2.0791, + "step": 7686 + }, + { + "epoch": 0.8968615097421537, + "grad_norm": 1.1788339614868164, + "learning_rate": 0.00028626137352747236, + "loss": 2.2621, + "step": 7687 + }, + { + "epoch": 0.8969781822424454, + "grad_norm": 1.2187483310699463, + "learning_rate": 0.0002862550211336238, + "loss": 2.2115, + "step": 7688 + }, + { + "epoch": 0.8970948547427371, + "grad_norm": 1.2273964881896973, + "learning_rate": 0.0002862486673427842, + "loss": 1.9595, + "step": 7689 + }, + { + "epoch": 0.8972115272430288, + "grad_norm": 1.2150580883026123, + "learning_rate": 0.0002862423121550195, + "loss": 2.2246, + "step": 7690 + }, + { + "epoch": 0.8973281997433205, + "grad_norm": 1.3260173797607422, + "learning_rate": 0.0002862359555703956, + "loss": 2.0123, + "step": 7691 + }, + { + "epoch": 0.8974448722436121, + "grad_norm": 1.022878646850586, + "learning_rate": 0.0002862295975889784, + "loss": 2.0525, + "step": 7692 + }, + { + "epoch": 0.8975615447439038, + "grad_norm": 1.2084789276123047, + "learning_rate": 0.00028622323821083376, + "loss": 2.0917, + "step": 7693 + }, + { + "epoch": 0.8976782172441955, + "grad_norm": 1.0034688711166382, + "learning_rate": 0.00028621687743602764, + "loss": 2.0447, + "step": 7694 + }, + { + "epoch": 0.8977948897444872, + "grad_norm": 1.1672208309173584, + "learning_rate": 0.00028621051526462596, + "loss": 2.059, + "step": 7695 + }, + { + "epoch": 0.8979115622447789, + "grad_norm": 1.1251784563064575, + "learning_rate": 0.00028620415169669473, + "loss": 2.0956, + "step": 7696 + }, + { + "epoch": 0.8980282347450705, + "grad_norm": 1.0544373989105225, + "learning_rate": 0.00028619778673229983, + "loss": 2.1053, + "step": 7697 + }, + { + "epoch": 0.8981449072453622, + "grad_norm": 1.1466302871704102, + "learning_rate": 0.0002861914203715074, + "loss": 2.1468, + "step": 7698 + }, + { + "epoch": 0.8982615797456539, + "grad_norm": 1.539823293685913, + "learning_rate": 0.0002861850526143832, + "loss": 2.2994, + "step": 7699 + }, + { + "epoch": 0.8983782522459456, + "grad_norm": 0.9858671426773071, + "learning_rate": 0.0002861786834609935, + "loss": 1.7827, + "step": 7700 + }, + { + "epoch": 0.8984949247462373, + "grad_norm": 1.1419137716293335, + "learning_rate": 0.00028617231291140416, + "loss": 2.36, + "step": 7701 + }, + { + "epoch": 0.898611597246529, + "grad_norm": 1.0229613780975342, + "learning_rate": 0.00028616594096568125, + "loss": 1.8795, + "step": 7702 + }, + { + "epoch": 0.8987282697468206, + "grad_norm": 1.0741506814956665, + "learning_rate": 0.0002861595676238909, + "loss": 2.1407, + "step": 7703 + }, + { + "epoch": 0.8988449422471123, + "grad_norm": 1.1516250371932983, + "learning_rate": 0.00028615319288609917, + "loss": 2.2622, + "step": 7704 + }, + { + "epoch": 0.898961614747404, + "grad_norm": 1.1305463314056396, + "learning_rate": 0.00028614681675237204, + "loss": 2.0332, + "step": 7705 + }, + { + "epoch": 0.8990782872476957, + "grad_norm": 1.0980333089828491, + "learning_rate": 0.00028614043922277566, + "loss": 1.9973, + "step": 7706 + }, + { + "epoch": 0.8991949597479874, + "grad_norm": 1.1538151502609253, + "learning_rate": 0.0002861340602973762, + "loss": 2.0912, + "step": 7707 + }, + { + "epoch": 0.899311632248279, + "grad_norm": 1.1670420169830322, + "learning_rate": 0.00028612767997623976, + "loss": 2.1639, + "step": 7708 + }, + { + "epoch": 0.8994283047485707, + "grad_norm": 1.0915011167526245, + "learning_rate": 0.00028612129825943247, + "loss": 2.0848, + "step": 7709 + }, + { + "epoch": 0.8995449772488624, + "grad_norm": 1.3376234769821167, + "learning_rate": 0.0002861149151470205, + "loss": 2.2912, + "step": 7710 + }, + { + "epoch": 0.8996616497491541, + "grad_norm": 1.0715789794921875, + "learning_rate": 0.00028610853063907004, + "loss": 2.1091, + "step": 7711 + }, + { + "epoch": 0.8997783222494458, + "grad_norm": 1.1968445777893066, + "learning_rate": 0.00028610214473564723, + "loss": 2.252, + "step": 7712 + }, + { + "epoch": 0.8998949947497374, + "grad_norm": 1.0611146688461304, + "learning_rate": 0.00028609575743681833, + "loss": 2.1571, + "step": 7713 + }, + { + "epoch": 0.9000116672500291, + "grad_norm": 1.2312525510787964, + "learning_rate": 0.00028608936874264954, + "loss": 1.9288, + "step": 7714 + }, + { + "epoch": 0.9001283397503208, + "grad_norm": 1.0590976476669312, + "learning_rate": 0.000286082978653207, + "loss": 2.0702, + "step": 7715 + }, + { + "epoch": 0.9002450122506125, + "grad_norm": 1.2139405012130737, + "learning_rate": 0.00028607658716855707, + "loss": 2.2277, + "step": 7716 + }, + { + "epoch": 0.9003616847509042, + "grad_norm": 1.146539330482483, + "learning_rate": 0.000286070194288766, + "loss": 1.8814, + "step": 7717 + }, + { + "epoch": 0.9004783572511958, + "grad_norm": 1.1422219276428223, + "learning_rate": 0.0002860638000139, + "loss": 2.083, + "step": 7718 + }, + { + "epoch": 0.9005950297514875, + "grad_norm": 1.2087074518203735, + "learning_rate": 0.0002860574043440254, + "loss": 2.2024, + "step": 7719 + }, + { + "epoch": 0.9007117022517792, + "grad_norm": 1.0318280458450317, + "learning_rate": 0.00028605100727920854, + "loss": 2.069, + "step": 7720 + }, + { + "epoch": 0.9008283747520709, + "grad_norm": 1.1610606908798218, + "learning_rate": 0.0002860446088195156, + "loss": 2.243, + "step": 7721 + }, + { + "epoch": 0.9009450472523626, + "grad_norm": 1.0863351821899414, + "learning_rate": 0.0002860382089650131, + "loss": 2.1728, + "step": 7722 + }, + { + "epoch": 0.9010617197526543, + "grad_norm": 1.1489291191101074, + "learning_rate": 0.0002860318077157673, + "loss": 2.1071, + "step": 7723 + }, + { + "epoch": 0.9011783922529459, + "grad_norm": 1.143589735031128, + "learning_rate": 0.0002860254050718445, + "loss": 2.252, + "step": 7724 + }, + { + "epoch": 0.9012950647532376, + "grad_norm": 1.0463134050369263, + "learning_rate": 0.0002860190010333112, + "loss": 2.1159, + "step": 7725 + }, + { + "epoch": 0.9014117372535293, + "grad_norm": 1.0598622560501099, + "learning_rate": 0.0002860125956002336, + "loss": 2.0983, + "step": 7726 + }, + { + "epoch": 0.901528409753821, + "grad_norm": 1.2513090372085571, + "learning_rate": 0.0002860061887726783, + "loss": 2.1201, + "step": 7727 + }, + { + "epoch": 0.9016450822541127, + "grad_norm": 1.2587553262710571, + "learning_rate": 0.0002859997805507117, + "loss": 2.0914, + "step": 7728 + }, + { + "epoch": 0.9017617547544043, + "grad_norm": 1.0462827682495117, + "learning_rate": 0.0002859933709344001, + "loss": 1.9636, + "step": 7729 + }, + { + "epoch": 0.901878427254696, + "grad_norm": 1.1109490394592285, + "learning_rate": 0.00028598695992381005, + "loss": 2.1869, + "step": 7730 + }, + { + "epoch": 0.9019950997549877, + "grad_norm": 1.1166107654571533, + "learning_rate": 0.00028598054751900797, + "loss": 2.0665, + "step": 7731 + }, + { + "epoch": 0.9021117722552794, + "grad_norm": 1.1760079860687256, + "learning_rate": 0.0002859741337200604, + "loss": 2.0794, + "step": 7732 + }, + { + "epoch": 0.9022284447555711, + "grad_norm": 1.2122858762741089, + "learning_rate": 0.00028596771852703375, + "loss": 2.2135, + "step": 7733 + }, + { + "epoch": 0.9023451172558627, + "grad_norm": 1.167136311531067, + "learning_rate": 0.0002859613019399946, + "loss": 2.0963, + "step": 7734 + }, + { + "epoch": 0.9024617897561544, + "grad_norm": 1.0690226554870605, + "learning_rate": 0.0002859548839590094, + "loss": 2.0805, + "step": 7735 + }, + { + "epoch": 0.9025784622564461, + "grad_norm": 1.1205998659133911, + "learning_rate": 0.0002859484645841447, + "loss": 2.2341, + "step": 7736 + }, + { + "epoch": 0.9026951347567378, + "grad_norm": 1.1102405786514282, + "learning_rate": 0.0002859420438154671, + "loss": 2.0092, + "step": 7737 + }, + { + "epoch": 0.9028118072570295, + "grad_norm": 1.293692946434021, + "learning_rate": 0.0002859356216530431, + "loss": 2.077, + "step": 7738 + }, + { + "epoch": 0.9029284797573212, + "grad_norm": 1.1980440616607666, + "learning_rate": 0.00028592919809693933, + "loss": 2.2695, + "step": 7739 + }, + { + "epoch": 0.9030451522576128, + "grad_norm": 1.1411552429199219, + "learning_rate": 0.0002859227731472224, + "loss": 2.2561, + "step": 7740 + }, + { + "epoch": 0.9031618247579045, + "grad_norm": 1.1395879983901978, + "learning_rate": 0.0002859163468039588, + "loss": 1.9521, + "step": 7741 + }, + { + "epoch": 0.9032784972581962, + "grad_norm": 1.0206725597381592, + "learning_rate": 0.0002859099190672153, + "loss": 2.082, + "step": 7742 + }, + { + "epoch": 0.9033951697584879, + "grad_norm": 1.1355072259902954, + "learning_rate": 0.0002859034899370584, + "loss": 2.1405, + "step": 7743 + }, + { + "epoch": 0.9035118422587796, + "grad_norm": 1.2100863456726074, + "learning_rate": 0.0002858970594135548, + "loss": 2.0859, + "step": 7744 + }, + { + "epoch": 0.9036285147590712, + "grad_norm": 0.9997634291648865, + "learning_rate": 0.0002858906274967712, + "loss": 2.1676, + "step": 7745 + }, + { + "epoch": 0.9037451872593629, + "grad_norm": 1.1456596851348877, + "learning_rate": 0.00028588419418677433, + "loss": 2.247, + "step": 7746 + }, + { + "epoch": 0.9038618597596546, + "grad_norm": 1.2252956628799438, + "learning_rate": 0.00028587775948363074, + "loss": 2.1176, + "step": 7747 + }, + { + "epoch": 0.9039785322599463, + "grad_norm": 1.403113603591919, + "learning_rate": 0.00028587132338740717, + "loss": 2.1192, + "step": 7748 + }, + { + "epoch": 0.904095204760238, + "grad_norm": 1.1618897914886475, + "learning_rate": 0.00028586488589817045, + "loss": 2.1536, + "step": 7749 + }, + { + "epoch": 0.9042118772605297, + "grad_norm": 1.0646613836288452, + "learning_rate": 0.0002858584470159872, + "loss": 2.0737, + "step": 7750 + }, + { + "epoch": 0.9043285497608213, + "grad_norm": 1.019668459892273, + "learning_rate": 0.00028585200674092424, + "loss": 2.1779, + "step": 7751 + }, + { + "epoch": 0.904445222261113, + "grad_norm": 1.0252569913864136, + "learning_rate": 0.0002858455650730483, + "loss": 2.1205, + "step": 7752 + }, + { + "epoch": 0.9045618947614047, + "grad_norm": 1.1750857830047607, + "learning_rate": 0.0002858391220124262, + "loss": 2.078, + "step": 7753 + }, + { + "epoch": 0.9046785672616964, + "grad_norm": 1.1839138269424438, + "learning_rate": 0.0002858326775591247, + "loss": 2.0449, + "step": 7754 + }, + { + "epoch": 0.9047952397619881, + "grad_norm": 1.1228492259979248, + "learning_rate": 0.0002858262317132106, + "loss": 1.9847, + "step": 7755 + }, + { + "epoch": 0.9049119122622797, + "grad_norm": 1.225654125213623, + "learning_rate": 0.00028581978447475077, + "loss": 2.1288, + "step": 7756 + }, + { + "epoch": 0.9050285847625714, + "grad_norm": 1.2148550748825073, + "learning_rate": 0.000285813335843812, + "loss": 2.0768, + "step": 7757 + }, + { + "epoch": 0.9051452572628631, + "grad_norm": 1.2137441635131836, + "learning_rate": 0.00028580688582046114, + "loss": 2.1451, + "step": 7758 + }, + { + "epoch": 0.9052619297631548, + "grad_norm": 1.0329149961471558, + "learning_rate": 0.00028580043440476515, + "loss": 2.2456, + "step": 7759 + }, + { + "epoch": 0.9053786022634465, + "grad_norm": 1.0256474018096924, + "learning_rate": 0.0002857939815967908, + "loss": 2.0517, + "step": 7760 + }, + { + "epoch": 0.9054952747637381, + "grad_norm": 1.0794750452041626, + "learning_rate": 0.000285787527396605, + "loss": 2.1002, + "step": 7761 + }, + { + "epoch": 0.9056119472640298, + "grad_norm": 1.0974162817001343, + "learning_rate": 0.0002857810718042747, + "loss": 2.1366, + "step": 7762 + }, + { + "epoch": 0.9057286197643215, + "grad_norm": 1.0529462099075317, + "learning_rate": 0.00028577461481986676, + "loss": 2.1174, + "step": 7763 + }, + { + "epoch": 0.9058452922646132, + "grad_norm": 1.1460424661636353, + "learning_rate": 0.00028576815644344825, + "loss": 2.2658, + "step": 7764 + }, + { + "epoch": 0.9059619647649049, + "grad_norm": 1.1285291910171509, + "learning_rate": 0.000285761696675086, + "loss": 1.9823, + "step": 7765 + }, + { + "epoch": 0.9060786372651966, + "grad_norm": 1.0286446809768677, + "learning_rate": 0.00028575523551484705, + "loss": 2.0345, + "step": 7766 + }, + { + "epoch": 0.9061953097654882, + "grad_norm": 1.1373339891433716, + "learning_rate": 0.0002857487729627983, + "loss": 2.2127, + "step": 7767 + }, + { + "epoch": 0.9063119822657799, + "grad_norm": 1.152403712272644, + "learning_rate": 0.0002857423090190068, + "loss": 2.1774, + "step": 7768 + }, + { + "epoch": 0.9064286547660716, + "grad_norm": 1.160611867904663, + "learning_rate": 0.0002857358436835396, + "loss": 2.0802, + "step": 7769 + }, + { + "epoch": 0.9065453272663633, + "grad_norm": 1.2061898708343506, + "learning_rate": 0.00028572937695646363, + "loss": 2.2178, + "step": 7770 + }, + { + "epoch": 0.906661999766655, + "grad_norm": 1.2304335832595825, + "learning_rate": 0.000285722908837846, + "loss": 2.2588, + "step": 7771 + }, + { + "epoch": 0.9067786722669466, + "grad_norm": 0.9622619152069092, + "learning_rate": 0.00028571643932775385, + "loss": 2.0929, + "step": 7772 + }, + { + "epoch": 0.9068953447672383, + "grad_norm": 1.1332268714904785, + "learning_rate": 0.00028570996842625405, + "loss": 2.1665, + "step": 7773 + }, + { + "epoch": 0.90701201726753, + "grad_norm": 1.106376051902771, + "learning_rate": 0.00028570349613341376, + "loss": 2.1319, + "step": 7774 + }, + { + "epoch": 0.9071286897678217, + "grad_norm": 1.3701610565185547, + "learning_rate": 0.00028569702244930014, + "loss": 2.2534, + "step": 7775 + }, + { + "epoch": 0.9072453622681134, + "grad_norm": 1.107193946838379, + "learning_rate": 0.00028569054737398025, + "loss": 2.1167, + "step": 7776 + }, + { + "epoch": 0.907362034768405, + "grad_norm": 1.10404634475708, + "learning_rate": 0.0002856840709075212, + "loss": 2.137, + "step": 7777 + }, + { + "epoch": 0.9074787072686967, + "grad_norm": 1.1885349750518799, + "learning_rate": 0.0002856775930499902, + "loss": 2.1915, + "step": 7778 + }, + { + "epoch": 0.9075953797689884, + "grad_norm": 1.2766343355178833, + "learning_rate": 0.00028567111380145435, + "loss": 2.2481, + "step": 7779 + }, + { + "epoch": 0.9077120522692801, + "grad_norm": 1.262192726135254, + "learning_rate": 0.0002856646331619808, + "loss": 2.3928, + "step": 7780 + }, + { + "epoch": 0.9078287247695718, + "grad_norm": 1.0744717121124268, + "learning_rate": 0.00028565815113163677, + "loss": 2.1239, + "step": 7781 + }, + { + "epoch": 0.9079453972698635, + "grad_norm": 0.8696849942207336, + "learning_rate": 0.00028565166771048954, + "loss": 1.9992, + "step": 7782 + }, + { + "epoch": 0.9080620697701551, + "grad_norm": 1.1494446992874146, + "learning_rate": 0.00028564518289860615, + "loss": 1.9723, + "step": 7783 + }, + { + "epoch": 0.9081787422704468, + "grad_norm": 1.1516201496124268, + "learning_rate": 0.00028563869669605393, + "loss": 2.1904, + "step": 7784 + }, + { + "epoch": 0.9082954147707385, + "grad_norm": 1.0313258171081543, + "learning_rate": 0.0002856322091029001, + "loss": 2.1028, + "step": 7785 + }, + { + "epoch": 0.9084120872710302, + "grad_norm": 1.1771332025527954, + "learning_rate": 0.00028562572011921195, + "loss": 2.0297, + "step": 7786 + }, + { + "epoch": 0.9085287597713219, + "grad_norm": 1.1348516941070557, + "learning_rate": 0.0002856192297450567, + "loss": 2.2442, + "step": 7787 + }, + { + "epoch": 0.9086454322716135, + "grad_norm": 1.1549097299575806, + "learning_rate": 0.00028561273798050166, + "loss": 2.0182, + "step": 7788 + }, + { + "epoch": 0.9087621047719052, + "grad_norm": 1.259285569190979, + "learning_rate": 0.00028560624482561414, + "loss": 2.1653, + "step": 7789 + }, + { + "epoch": 0.9088787772721969, + "grad_norm": 1.2278320789337158, + "learning_rate": 0.00028559975028046144, + "loss": 2.0127, + "step": 7790 + }, + { + "epoch": 0.9089954497724886, + "grad_norm": 1.2042667865753174, + "learning_rate": 0.0002855932543451109, + "loss": 2.2335, + "step": 7791 + }, + { + "epoch": 0.9091121222727803, + "grad_norm": 1.2016063928604126, + "learning_rate": 0.0002855867570196298, + "loss": 2.264, + "step": 7792 + }, + { + "epoch": 0.909228794773072, + "grad_norm": 1.2245522737503052, + "learning_rate": 0.0002855802583040856, + "loss": 2.0914, + "step": 7793 + }, + { + "epoch": 0.9093454672733636, + "grad_norm": 1.215088963508606, + "learning_rate": 0.0002855737581985456, + "loss": 2.2187, + "step": 7794 + }, + { + "epoch": 0.9094621397736553, + "grad_norm": 1.1132687330245972, + "learning_rate": 0.0002855672567030773, + "loss": 2.0239, + "step": 7795 + }, + { + "epoch": 0.909578812273947, + "grad_norm": 1.2770593166351318, + "learning_rate": 0.00028556075381774783, + "loss": 2.0882, + "step": 7796 + }, + { + "epoch": 0.9096954847742387, + "grad_norm": 1.1253643035888672, + "learning_rate": 0.00028555424954262494, + "loss": 2.1493, + "step": 7797 + }, + { + "epoch": 0.9098121572745304, + "grad_norm": 0.9871168732643127, + "learning_rate": 0.00028554774387777577, + "loss": 2.1118, + "step": 7798 + }, + { + "epoch": 0.909928829774822, + "grad_norm": 1.0510376691818237, + "learning_rate": 0.000285541236823268, + "loss": 1.9199, + "step": 7799 + }, + { + "epoch": 0.9100455022751137, + "grad_norm": 1.053801417350769, + "learning_rate": 0.00028553472837916893, + "loss": 2.1372, + "step": 7800 + }, + { + "epoch": 0.9101621747754054, + "grad_norm": 1.333701729774475, + "learning_rate": 0.0002855282185455461, + "loss": 2.2808, + "step": 7801 + }, + { + "epoch": 0.9102788472756971, + "grad_norm": 1.096773624420166, + "learning_rate": 0.000285521707322467, + "loss": 2.161, + "step": 7802 + }, + { + "epoch": 0.9103955197759888, + "grad_norm": 1.0421103239059448, + "learning_rate": 0.00028551519470999907, + "loss": 2.194, + "step": 7803 + }, + { + "epoch": 0.9105121922762804, + "grad_norm": 1.1424124240875244, + "learning_rate": 0.00028550868070820987, + "loss": 1.9777, + "step": 7804 + }, + { + "epoch": 0.9106288647765721, + "grad_norm": 1.1946487426757812, + "learning_rate": 0.000285502165317167, + "loss": 1.9936, + "step": 7805 + }, + { + "epoch": 0.9107455372768638, + "grad_norm": 1.3027219772338867, + "learning_rate": 0.00028549564853693783, + "loss": 2.2302, + "step": 7806 + }, + { + "epoch": 0.9108622097771555, + "grad_norm": 1.3015846014022827, + "learning_rate": 0.0002854891303675901, + "loss": 2.0724, + "step": 7807 + }, + { + "epoch": 0.9109788822774472, + "grad_norm": 1.0902661085128784, + "learning_rate": 0.00028548261080919125, + "loss": 2.1964, + "step": 7808 + }, + { + "epoch": 0.9110955547777388, + "grad_norm": 1.1172826290130615, + "learning_rate": 0.00028547608986180897, + "loss": 2.1225, + "step": 7809 + }, + { + "epoch": 0.9112122272780305, + "grad_norm": 1.262534499168396, + "learning_rate": 0.0002854695675255108, + "loss": 2.0888, + "step": 7810 + }, + { + "epoch": 0.9113288997783222, + "grad_norm": 1.0339456796646118, + "learning_rate": 0.00028546304380036434, + "loss": 2.0781, + "step": 7811 + }, + { + "epoch": 0.9114455722786139, + "grad_norm": 1.2915139198303223, + "learning_rate": 0.0002854565186864373, + "loss": 2.1095, + "step": 7812 + }, + { + "epoch": 0.9115622447789056, + "grad_norm": 1.2814085483551025, + "learning_rate": 0.0002854499921837972, + "loss": 2.1226, + "step": 7813 + }, + { + "epoch": 0.9116789172791973, + "grad_norm": 1.4063079357147217, + "learning_rate": 0.00028544346429251185, + "loss": 2.1849, + "step": 7814 + }, + { + "epoch": 0.9117955897794889, + "grad_norm": 1.160796046257019, + "learning_rate": 0.0002854369350126488, + "loss": 2.0403, + "step": 7815 + }, + { + "epoch": 0.9119122622797806, + "grad_norm": 1.211656928062439, + "learning_rate": 0.0002854304043442758, + "loss": 2.1501, + "step": 7816 + }, + { + "epoch": 0.9120289347800723, + "grad_norm": 1.0210412740707397, + "learning_rate": 0.0002854238722874606, + "loss": 1.8635, + "step": 7817 + }, + { + "epoch": 0.912145607280364, + "grad_norm": 1.1063917875289917, + "learning_rate": 0.0002854173388422708, + "loss": 2.0556, + "step": 7818 + }, + { + "epoch": 0.9122622797806557, + "grad_norm": 1.015636920928955, + "learning_rate": 0.0002854108040087742, + "loss": 1.9296, + "step": 7819 + }, + { + "epoch": 0.9123789522809473, + "grad_norm": 1.1993725299835205, + "learning_rate": 0.00028540426778703854, + "loss": 2.0952, + "step": 7820 + }, + { + "epoch": 0.912495624781239, + "grad_norm": 1.2124842405319214, + "learning_rate": 0.0002853977301771316, + "loss": 2.2658, + "step": 7821 + }, + { + "epoch": 0.9126122972815307, + "grad_norm": 1.4071087837219238, + "learning_rate": 0.0002853911911791211, + "loss": 2.0571, + "step": 7822 + }, + { + "epoch": 0.9127289697818224, + "grad_norm": 1.2533411979675293, + "learning_rate": 0.00028538465079307483, + "loss": 2.1099, + "step": 7823 + }, + { + "epoch": 0.9128456422821141, + "grad_norm": 1.0407071113586426, + "learning_rate": 0.0002853781090190607, + "loss": 2.0065, + "step": 7824 + }, + { + "epoch": 0.9129623147824057, + "grad_norm": 1.322742223739624, + "learning_rate": 0.0002853715658571464, + "loss": 2.0641, + "step": 7825 + }, + { + "epoch": 0.9130789872826974, + "grad_norm": 1.2417938709259033, + "learning_rate": 0.00028536502130739984, + "loss": 2.2228, + "step": 7826 + }, + { + "epoch": 0.9131956597829891, + "grad_norm": 1.1956738233566284, + "learning_rate": 0.0002853584753698888, + "loss": 2.2162, + "step": 7827 + }, + { + "epoch": 0.9133123322832808, + "grad_norm": 1.3124910593032837, + "learning_rate": 0.0002853519280446812, + "loss": 2.143, + "step": 7828 + }, + { + "epoch": 0.9134290047835725, + "grad_norm": 1.0715827941894531, + "learning_rate": 0.0002853453793318449, + "loss": 2.0415, + "step": 7829 + }, + { + "epoch": 0.9135456772838642, + "grad_norm": 1.0431458950042725, + "learning_rate": 0.0002853388292314478, + "loss": 1.954, + "step": 7830 + }, + { + "epoch": 0.9136623497841558, + "grad_norm": 1.1006070375442505, + "learning_rate": 0.00028533227774355777, + "loss": 1.9749, + "step": 7831 + }, + { + "epoch": 0.9137790222844475, + "grad_norm": 1.1014723777770996, + "learning_rate": 0.0002853257248682427, + "loss": 2.1695, + "step": 7832 + }, + { + "epoch": 0.9138956947847392, + "grad_norm": 1.2379391193389893, + "learning_rate": 0.0002853191706055707, + "loss": 2.0739, + "step": 7833 + }, + { + "epoch": 0.9140123672850309, + "grad_norm": 1.1776214838027954, + "learning_rate": 0.0002853126149556095, + "loss": 2.3194, + "step": 7834 + }, + { + "epoch": 0.9141290397853226, + "grad_norm": 1.243692398071289, + "learning_rate": 0.0002853060579184272, + "loss": 2.1802, + "step": 7835 + }, + { + "epoch": 0.9142457122856142, + "grad_norm": 1.3971835374832153, + "learning_rate": 0.00028529949949409163, + "loss": 2.0332, + "step": 7836 + }, + { + "epoch": 0.9143623847859059, + "grad_norm": 1.2208880186080933, + "learning_rate": 0.00028529293968267095, + "loss": 2.1889, + "step": 7837 + }, + { + "epoch": 0.9144790572861976, + "grad_norm": 1.2644108533859253, + "learning_rate": 0.00028528637848423315, + "loss": 2.148, + "step": 7838 + }, + { + "epoch": 0.9145957297864893, + "grad_norm": 1.176229476928711, + "learning_rate": 0.0002852798158988461, + "loss": 2.2087, + "step": 7839 + }, + { + "epoch": 0.914712402286781, + "grad_norm": 1.1895897388458252, + "learning_rate": 0.0002852732519265779, + "loss": 1.9236, + "step": 7840 + }, + { + "epoch": 0.9148290747870726, + "grad_norm": 1.3998881578445435, + "learning_rate": 0.0002852666865674967, + "loss": 2.2411, + "step": 7841 + }, + { + "epoch": 0.9149457472873643, + "grad_norm": 1.188551425933838, + "learning_rate": 0.0002852601198216705, + "loss": 2.0728, + "step": 7842 + }, + { + "epoch": 0.915062419787656, + "grad_norm": 1.1262788772583008, + "learning_rate": 0.0002852535516891673, + "loss": 2.2485, + "step": 7843 + }, + { + "epoch": 0.9151790922879477, + "grad_norm": 1.349350094795227, + "learning_rate": 0.0002852469821700553, + "loss": 2.1875, + "step": 7844 + }, + { + "epoch": 0.9152957647882394, + "grad_norm": 1.074832558631897, + "learning_rate": 0.00028524041126440255, + "loss": 2.044, + "step": 7845 + }, + { + "epoch": 0.9154124372885311, + "grad_norm": 1.087854266166687, + "learning_rate": 0.00028523383897227715, + "loss": 2.1714, + "step": 7846 + }, + { + "epoch": 0.9155291097888227, + "grad_norm": 1.2538983821868896, + "learning_rate": 0.00028522726529374724, + "loss": 2.1022, + "step": 7847 + }, + { + "epoch": 0.9156457822891144, + "grad_norm": 0.9983584880828857, + "learning_rate": 0.00028522069022888103, + "loss": 2.0169, + "step": 7848 + }, + { + "epoch": 0.9157624547894061, + "grad_norm": 1.2470539808273315, + "learning_rate": 0.00028521411377774666, + "loss": 1.9068, + "step": 7849 + }, + { + "epoch": 0.9158791272896978, + "grad_norm": 1.1137639284133911, + "learning_rate": 0.00028520753594041227, + "loss": 2.1524, + "step": 7850 + }, + { + "epoch": 0.9159957997899895, + "grad_norm": 1.29500412940979, + "learning_rate": 0.00028520095671694606, + "loss": 2.175, + "step": 7851 + }, + { + "epoch": 0.9161124722902811, + "grad_norm": 1.1957987546920776, + "learning_rate": 0.0002851943761074163, + "loss": 2.2702, + "step": 7852 + }, + { + "epoch": 0.9162291447905728, + "grad_norm": 1.1232460737228394, + "learning_rate": 0.00028518779411189116, + "loss": 2.0074, + "step": 7853 + }, + { + "epoch": 0.9163458172908645, + "grad_norm": 1.2542752027511597, + "learning_rate": 0.0002851812107304388, + "loss": 2.2688, + "step": 7854 + }, + { + "epoch": 0.9164624897911562, + "grad_norm": 1.1881414651870728, + "learning_rate": 0.0002851746259631276, + "loss": 2.0874, + "step": 7855 + }, + { + "epoch": 0.9165791622914479, + "grad_norm": 1.1058297157287598, + "learning_rate": 0.00028516803981002575, + "loss": 2.1618, + "step": 7856 + }, + { + "epoch": 0.9166958347917395, + "grad_norm": 1.098527431488037, + "learning_rate": 0.0002851614522712016, + "loss": 2.1235, + "step": 7857 + }, + { + "epoch": 0.9168125072920312, + "grad_norm": 0.9622920751571655, + "learning_rate": 0.0002851548633467233, + "loss": 1.9793, + "step": 7858 + }, + { + "epoch": 0.9169291797923229, + "grad_norm": 1.2490051984786987, + "learning_rate": 0.0002851482730366593, + "loss": 2.3245, + "step": 7859 + }, + { + "epoch": 0.9170458522926146, + "grad_norm": 1.061938762664795, + "learning_rate": 0.00028514168134107784, + "loss": 2.0628, + "step": 7860 + }, + { + "epoch": 0.9171625247929063, + "grad_norm": 1.0981899499893188, + "learning_rate": 0.00028513508826004733, + "loss": 2.0762, + "step": 7861 + }, + { + "epoch": 0.917279197293198, + "grad_norm": 1.2311816215515137, + "learning_rate": 0.000285128493793636, + "loss": 2.0394, + "step": 7862 + }, + { + "epoch": 0.9173958697934896, + "grad_norm": 1.001168131828308, + "learning_rate": 0.0002851218979419124, + "loss": 2.1072, + "step": 7863 + }, + { + "epoch": 0.9175125422937813, + "grad_norm": 1.007753849029541, + "learning_rate": 0.0002851153007049447, + "loss": 2.2206, + "step": 7864 + }, + { + "epoch": 0.917629214794073, + "grad_norm": 1.1129182577133179, + "learning_rate": 0.0002851087020828014, + "loss": 2.1927, + "step": 7865 + }, + { + "epoch": 0.9177458872943647, + "grad_norm": 1.1508362293243408, + "learning_rate": 0.0002851021020755509, + "loss": 2.1167, + "step": 7866 + }, + { + "epoch": 0.9178625597946564, + "grad_norm": 1.1573528051376343, + "learning_rate": 0.0002850955006832616, + "loss": 2.2322, + "step": 7867 + }, + { + "epoch": 0.917979232294948, + "grad_norm": 1.0917202234268188, + "learning_rate": 0.00028508889790600196, + "loss": 2.021, + "step": 7868 + }, + { + "epoch": 0.9180959047952397, + "grad_norm": 1.2129746675491333, + "learning_rate": 0.00028508229374384043, + "loss": 2.0734, + "step": 7869 + }, + { + "epoch": 0.9182125772955314, + "grad_norm": 1.1155372858047485, + "learning_rate": 0.00028507568819684546, + "loss": 2.2605, + "step": 7870 + }, + { + "epoch": 0.9183292497958231, + "grad_norm": 1.2175378799438477, + "learning_rate": 0.0002850690812650856, + "loss": 2.1077, + "step": 7871 + }, + { + "epoch": 0.9184459222961148, + "grad_norm": 1.0765875577926636, + "learning_rate": 0.00028506247294862925, + "loss": 2.0855, + "step": 7872 + }, + { + "epoch": 0.9185625947964065, + "grad_norm": 1.1159541606903076, + "learning_rate": 0.00028505586324754483, + "loss": 2.0779, + "step": 7873 + }, + { + "epoch": 0.9186792672966981, + "grad_norm": 1.3105417490005493, + "learning_rate": 0.0002850492521619011, + "loss": 2.4033, + "step": 7874 + }, + { + "epoch": 0.9187959397969898, + "grad_norm": 1.009418249130249, + "learning_rate": 0.0002850426396917664, + "loss": 1.9028, + "step": 7875 + }, + { + "epoch": 0.9189126122972815, + "grad_norm": 1.094577431678772, + "learning_rate": 0.00028503602583720945, + "loss": 2.0344, + "step": 7876 + }, + { + "epoch": 0.9190292847975732, + "grad_norm": 1.1921145915985107, + "learning_rate": 0.0002850294105982987, + "loss": 2.1592, + "step": 7877 + }, + { + "epoch": 0.9191459572978649, + "grad_norm": 1.1359472274780273, + "learning_rate": 0.0002850227939751027, + "loss": 2.2227, + "step": 7878 + }, + { + "epoch": 0.9192626297981565, + "grad_norm": 1.2673059701919556, + "learning_rate": 0.0002850161759676901, + "loss": 2.1116, + "step": 7879 + }, + { + "epoch": 0.9193793022984482, + "grad_norm": 1.1103332042694092, + "learning_rate": 0.0002850095565761295, + "loss": 1.9448, + "step": 7880 + }, + { + "epoch": 0.9194959747987399, + "grad_norm": 0.9415518641471863, + "learning_rate": 0.00028500293580048953, + "loss": 1.914, + "step": 7881 + }, + { + "epoch": 0.9196126472990316, + "grad_norm": 1.1358927488327026, + "learning_rate": 0.0002849963136408388, + "loss": 2.2, + "step": 7882 + }, + { + "epoch": 0.9197293197993233, + "grad_norm": 0.9987287521362305, + "learning_rate": 0.0002849896900972461, + "loss": 2.1424, + "step": 7883 + }, + { + "epoch": 0.919845992299615, + "grad_norm": 1.3493143320083618, + "learning_rate": 0.00028498306516977983, + "loss": 2.0956, + "step": 7884 + }, + { + "epoch": 0.9199626647999066, + "grad_norm": 1.1634291410446167, + "learning_rate": 0.00028497643885850893, + "loss": 2.1718, + "step": 7885 + }, + { + "epoch": 0.9200793373001983, + "grad_norm": 0.9872016310691833, + "learning_rate": 0.0002849698111635019, + "loss": 1.9881, + "step": 7886 + }, + { + "epoch": 0.92019600980049, + "grad_norm": 1.0770505666732788, + "learning_rate": 0.0002849631820848276, + "loss": 2.1701, + "step": 7887 + }, + { + "epoch": 0.9203126823007817, + "grad_norm": 1.0079565048217773, + "learning_rate": 0.00028495655162255466, + "loss": 2.1011, + "step": 7888 + }, + { + "epoch": 0.9204293548010734, + "grad_norm": 1.0251368284225464, + "learning_rate": 0.0002849499197767518, + "loss": 2.011, + "step": 7889 + }, + { + "epoch": 0.920546027301365, + "grad_norm": 1.0815380811691284, + "learning_rate": 0.00028494328654748784, + "loss": 1.9951, + "step": 7890 + }, + { + "epoch": 0.9206626998016567, + "grad_norm": 1.1054810285568237, + "learning_rate": 0.00028493665193483154, + "loss": 2.1325, + "step": 7891 + }, + { + "epoch": 0.9207793723019484, + "grad_norm": 1.3423104286193848, + "learning_rate": 0.00028493001593885166, + "loss": 2.1883, + "step": 7892 + }, + { + "epoch": 0.9208960448022401, + "grad_norm": 1.3647828102111816, + "learning_rate": 0.00028492337855961694, + "loss": 2.0973, + "step": 7893 + }, + { + "epoch": 0.9210127173025318, + "grad_norm": 1.106692910194397, + "learning_rate": 0.0002849167397971963, + "loss": 1.924, + "step": 7894 + }, + { + "epoch": 0.9211293898028234, + "grad_norm": 1.048444390296936, + "learning_rate": 0.0002849100996516585, + "loss": 2.0591, + "step": 7895 + }, + { + "epoch": 0.9212460623031151, + "grad_norm": 1.19859778881073, + "learning_rate": 0.0002849034581230724, + "loss": 2.0318, + "step": 7896 + }, + { + "epoch": 0.9213627348034068, + "grad_norm": 1.1398659944534302, + "learning_rate": 0.0002848968152115068, + "loss": 2.065, + "step": 7897 + }, + { + "epoch": 0.9214794073036985, + "grad_norm": 1.2344404458999634, + "learning_rate": 0.00028489017091703063, + "loss": 2.1977, + "step": 7898 + }, + { + "epoch": 0.9215960798039902, + "grad_norm": 1.114533543586731, + "learning_rate": 0.0002848835252397127, + "loss": 2.2007, + "step": 7899 + }, + { + "epoch": 0.9217127523042818, + "grad_norm": 1.003045678138733, + "learning_rate": 0.00028487687817962195, + "loss": 2.188, + "step": 7900 + }, + { + "epoch": 0.9218294248045735, + "grad_norm": 1.076112151145935, + "learning_rate": 0.00028487022973682733, + "loss": 2.0782, + "step": 7901 + }, + { + "epoch": 0.9219460973048652, + "grad_norm": 0.9810500741004944, + "learning_rate": 0.0002848635799113977, + "loss": 2.0297, + "step": 7902 + }, + { + "epoch": 0.9220627698051569, + "grad_norm": 1.0984736680984497, + "learning_rate": 0.00028485692870340205, + "loss": 2.0268, + "step": 7903 + }, + { + "epoch": 0.9221794423054486, + "grad_norm": 1.0281565189361572, + "learning_rate": 0.00028485027611290924, + "loss": 2.1521, + "step": 7904 + }, + { + "epoch": 0.9222961148057403, + "grad_norm": 1.1837942600250244, + "learning_rate": 0.0002848436221399883, + "loss": 2.2824, + "step": 7905 + }, + { + "epoch": 0.9224127873060319, + "grad_norm": 1.0392494201660156, + "learning_rate": 0.0002848369667847083, + "loss": 1.9969, + "step": 7906 + }, + { + "epoch": 0.9225294598063236, + "grad_norm": 1.128122329711914, + "learning_rate": 0.0002848303100471381, + "loss": 2.1629, + "step": 7907 + }, + { + "epoch": 0.9226461323066153, + "grad_norm": 1.1002190113067627, + "learning_rate": 0.00028482365192734676, + "loss": 1.931, + "step": 7908 + }, + { + "epoch": 0.922762804806907, + "grad_norm": 1.1181992292404175, + "learning_rate": 0.00028481699242540327, + "loss": 2.2601, + "step": 7909 + }, + { + "epoch": 0.9228794773071987, + "grad_norm": 1.1456444263458252, + "learning_rate": 0.0002848103315413767, + "loss": 2.1257, + "step": 7910 + }, + { + "epoch": 0.9229961498074903, + "grad_norm": 1.2153416872024536, + "learning_rate": 0.00028480366927533614, + "loss": 2.2968, + "step": 7911 + }, + { + "epoch": 0.923112822307782, + "grad_norm": 0.9701879024505615, + "learning_rate": 0.00028479700562735057, + "loss": 2.14, + "step": 7912 + }, + { + "epoch": 0.9232294948080737, + "grad_norm": 1.032896876335144, + "learning_rate": 0.0002847903405974891, + "loss": 1.9926, + "step": 7913 + }, + { + "epoch": 0.9233461673083654, + "grad_norm": 1.0410873889923096, + "learning_rate": 0.0002847836741858209, + "loss": 2.0235, + "step": 7914 + }, + { + "epoch": 0.9234628398086571, + "grad_norm": 0.9734470248222351, + "learning_rate": 0.00028477700639241503, + "loss": 2.147, + "step": 7915 + }, + { + "epoch": 0.9235795123089487, + "grad_norm": 1.13896644115448, + "learning_rate": 0.0002847703372173406, + "loss": 2.1255, + "step": 7916 + }, + { + "epoch": 0.9236961848092404, + "grad_norm": 1.0995146036148071, + "learning_rate": 0.00028476366666066666, + "loss": 2.3644, + "step": 7917 + }, + { + "epoch": 0.9238128573095321, + "grad_norm": 1.186931848526001, + "learning_rate": 0.00028475699472246254, + "loss": 2.3145, + "step": 7918 + }, + { + "epoch": 0.9239295298098238, + "grad_norm": 1.025596261024475, + "learning_rate": 0.00028475032140279734, + "loss": 2.1378, + "step": 7919 + }, + { + "epoch": 0.9240462023101155, + "grad_norm": 1.2172316312789917, + "learning_rate": 0.0002847436467017402, + "loss": 1.9826, + "step": 7920 + }, + { + "epoch": 0.9241628748104072, + "grad_norm": 1.1321165561676025, + "learning_rate": 0.0002847369706193604, + "loss": 2.202, + "step": 7921 + }, + { + "epoch": 0.9242795473106988, + "grad_norm": 1.3196816444396973, + "learning_rate": 0.00028473029315572704, + "loss": 2.2312, + "step": 7922 + }, + { + "epoch": 0.9243962198109905, + "grad_norm": 1.142581582069397, + "learning_rate": 0.0002847236143109094, + "loss": 2.1968, + "step": 7923 + }, + { + "epoch": 0.9245128923112822, + "grad_norm": 1.0839333534240723, + "learning_rate": 0.00028471693408497676, + "loss": 2.1189, + "step": 7924 + }, + { + "epoch": 0.9246295648115739, + "grad_norm": 1.1349208354949951, + "learning_rate": 0.0002847102524779983, + "loss": 2.233, + "step": 7925 + }, + { + "epoch": 0.9247462373118656, + "grad_norm": 1.125079870223999, + "learning_rate": 0.0002847035694900433, + "loss": 2.3328, + "step": 7926 + }, + { + "epoch": 0.9248629098121572, + "grad_norm": 1.1224796772003174, + "learning_rate": 0.0002846968851211811, + "loss": 2.0109, + "step": 7927 + }, + { + "epoch": 0.9249795823124489, + "grad_norm": 1.3803157806396484, + "learning_rate": 0.0002846901993714809, + "loss": 2.1086, + "step": 7928 + }, + { + "epoch": 0.9250962548127406, + "grad_norm": 1.2346559762954712, + "learning_rate": 0.0002846835122410121, + "loss": 2.1274, + "step": 7929 + }, + { + "epoch": 0.9252129273130323, + "grad_norm": 1.271288275718689, + "learning_rate": 0.00028467682372984403, + "loss": 2.2787, + "step": 7930 + }, + { + "epoch": 0.925329599813324, + "grad_norm": 1.115641474723816, + "learning_rate": 0.00028467013383804595, + "loss": 1.9883, + "step": 7931 + }, + { + "epoch": 0.9254462723136156, + "grad_norm": 1.206203818321228, + "learning_rate": 0.00028466344256568727, + "loss": 2.1691, + "step": 7932 + }, + { + "epoch": 0.9255629448139073, + "grad_norm": 1.0260014533996582, + "learning_rate": 0.00028465674991283733, + "loss": 2.2266, + "step": 7933 + }, + { + "epoch": 0.925679617314199, + "grad_norm": 1.2235004901885986, + "learning_rate": 0.0002846500558795655, + "loss": 2.2092, + "step": 7934 + }, + { + "epoch": 0.9257962898144907, + "grad_norm": 1.1769475936889648, + "learning_rate": 0.0002846433604659412, + "loss": 2.2245, + "step": 7935 + }, + { + "epoch": 0.9259129623147824, + "grad_norm": 1.0147637128829956, + "learning_rate": 0.0002846366636720339, + "loss": 1.9827, + "step": 7936 + }, + { + "epoch": 0.926029634815074, + "grad_norm": 1.1523207426071167, + "learning_rate": 0.00028462996549791296, + "loss": 2.0101, + "step": 7937 + }, + { + "epoch": 0.9261463073153657, + "grad_norm": 1.087465524673462, + "learning_rate": 0.0002846232659436478, + "loss": 2.1203, + "step": 7938 + }, + { + "epoch": 0.9262629798156574, + "grad_norm": 1.059519648551941, + "learning_rate": 0.0002846165650093079, + "loss": 2.0905, + "step": 7939 + }, + { + "epoch": 0.9263796523159491, + "grad_norm": 1.2698450088500977, + "learning_rate": 0.0002846098626949627, + "loss": 2.1074, + "step": 7940 + }, + { + "epoch": 0.9264963248162408, + "grad_norm": 1.2196731567382812, + "learning_rate": 0.00028460315900068173, + "loss": 2.1463, + "step": 7941 + }, + { + "epoch": 0.9266129973165325, + "grad_norm": 1.1229082345962524, + "learning_rate": 0.0002845964539265345, + "loss": 2.0763, + "step": 7942 + }, + { + "epoch": 0.9267296698168241, + "grad_norm": 1.1750860214233398, + "learning_rate": 0.00028458974747259046, + "loss": 1.9755, + "step": 7943 + }, + { + "epoch": 0.9268463423171158, + "grad_norm": 1.4648780822753906, + "learning_rate": 0.00028458303963891914, + "loss": 2.2324, + "step": 7944 + }, + { + "epoch": 0.9269630148174075, + "grad_norm": 1.1903760433197021, + "learning_rate": 0.00028457633042559014, + "loss": 2.2364, + "step": 7945 + }, + { + "epoch": 0.9270796873176992, + "grad_norm": 0.9651169776916504, + "learning_rate": 0.00028456961983267296, + "loss": 2.0429, + "step": 7946 + }, + { + "epoch": 0.9271963598179909, + "grad_norm": 1.207841157913208, + "learning_rate": 0.00028456290786023716, + "loss": 2.0652, + "step": 7947 + }, + { + "epoch": 0.9273130323182825, + "grad_norm": 1.440525770187378, + "learning_rate": 0.0002845561945083524, + "loss": 2.2971, + "step": 7948 + }, + { + "epoch": 0.9274297048185742, + "grad_norm": 1.2194563150405884, + "learning_rate": 0.00028454947977708816, + "loss": 2.0179, + "step": 7949 + }, + { + "epoch": 0.9275463773188659, + "grad_norm": 1.380210041999817, + "learning_rate": 0.0002845427636665141, + "loss": 2.0505, + "step": 7950 + }, + { + "epoch": 0.9276630498191576, + "grad_norm": 1.1838468313217163, + "learning_rate": 0.00028453604617669994, + "loss": 2.0019, + "step": 7951 + }, + { + "epoch": 0.9277797223194493, + "grad_norm": 1.0739247798919678, + "learning_rate": 0.00028452932730771516, + "loss": 2.0734, + "step": 7952 + }, + { + "epoch": 0.927896394819741, + "grad_norm": 1.0890172719955444, + "learning_rate": 0.0002845226070596295, + "loss": 2.0365, + "step": 7953 + }, + { + "epoch": 0.9280130673200326, + "grad_norm": 1.2135428190231323, + "learning_rate": 0.0002845158854325127, + "loss": 2.008, + "step": 7954 + }, + { + "epoch": 0.9281297398203243, + "grad_norm": 1.3276758193969727, + "learning_rate": 0.00028450916242643424, + "loss": 2.173, + "step": 7955 + }, + { + "epoch": 0.928246412320616, + "grad_norm": 1.1818102598190308, + "learning_rate": 0.00028450243804146406, + "loss": 2.0847, + "step": 7956 + }, + { + "epoch": 0.9283630848209077, + "grad_norm": 1.2531766891479492, + "learning_rate": 0.00028449571227767164, + "loss": 2.0871, + "step": 7957 + }, + { + "epoch": 0.9284797573211994, + "grad_norm": 1.0800575017929077, + "learning_rate": 0.00028448898513512685, + "loss": 2.1621, + "step": 7958 + }, + { + "epoch": 0.928596429821491, + "grad_norm": 1.1862022876739502, + "learning_rate": 0.0002844822566138994, + "loss": 2.0849, + "step": 7959 + }, + { + "epoch": 0.9287131023217827, + "grad_norm": 1.2874406576156616, + "learning_rate": 0.0002844755267140591, + "loss": 2.0717, + "step": 7960 + }, + { + "epoch": 0.9288297748220744, + "grad_norm": 1.294123888015747, + "learning_rate": 0.0002844687954356755, + "loss": 2.246, + "step": 7961 + }, + { + "epoch": 0.9289464473223661, + "grad_norm": 1.1568982601165771, + "learning_rate": 0.0002844620627788187, + "loss": 2.1659, + "step": 7962 + }, + { + "epoch": 0.9290631198226578, + "grad_norm": 1.0908862352371216, + "learning_rate": 0.0002844553287435583, + "loss": 1.9829, + "step": 7963 + }, + { + "epoch": 0.9291797923229494, + "grad_norm": 0.9742130637168884, + "learning_rate": 0.00028444859332996407, + "loss": 2.1574, + "step": 7964 + }, + { + "epoch": 0.9292964648232411, + "grad_norm": 1.2095483541488647, + "learning_rate": 0.000284441856538106, + "loss": 2.0934, + "step": 7965 + }, + { + "epoch": 0.9294131373235328, + "grad_norm": 1.0300589799880981, + "learning_rate": 0.00028443511836805384, + "loss": 2.0941, + "step": 7966 + }, + { + "epoch": 0.9295298098238245, + "grad_norm": 1.0023101568222046, + "learning_rate": 0.00028442837881987736, + "loss": 2.1971, + "step": 7967 + }, + { + "epoch": 0.9296464823241162, + "grad_norm": 1.1738405227661133, + "learning_rate": 0.0002844216378936465, + "loss": 2.1529, + "step": 7968 + }, + { + "epoch": 0.9297631548244079, + "grad_norm": 1.540298342704773, + "learning_rate": 0.00028441489558943126, + "loss": 2.2959, + "step": 7969 + }, + { + "epoch": 0.9298798273246995, + "grad_norm": 1.22186279296875, + "learning_rate": 0.0002844081519073014, + "loss": 2.2041, + "step": 7970 + }, + { + "epoch": 0.9299964998249912, + "grad_norm": 1.112317442893982, + "learning_rate": 0.0002844014068473268, + "loss": 1.9283, + "step": 7971 + }, + { + "epoch": 0.9301131723252829, + "grad_norm": 1.2543818950653076, + "learning_rate": 0.0002843946604095775, + "loss": 2.3105, + "step": 7972 + }, + { + "epoch": 0.9302298448255746, + "grad_norm": 1.0886249542236328, + "learning_rate": 0.0002843879125941234, + "loss": 2.0585, + "step": 7973 + }, + { + "epoch": 0.9303465173258663, + "grad_norm": 1.1438591480255127, + "learning_rate": 0.00028438116340103446, + "loss": 2.0573, + "step": 7974 + }, + { + "epoch": 0.9304631898261579, + "grad_norm": 1.1487195491790771, + "learning_rate": 0.00028437441283038056, + "loss": 2.1251, + "step": 7975 + }, + { + "epoch": 0.9305798623264496, + "grad_norm": 1.1828148365020752, + "learning_rate": 0.0002843676608822318, + "loss": 2.009, + "step": 7976 + }, + { + "epoch": 0.9306965348267413, + "grad_norm": 1.0608187913894653, + "learning_rate": 0.00028436090755665813, + "loss": 2.0421, + "step": 7977 + }, + { + "epoch": 0.930813207327033, + "grad_norm": 1.1767576932907104, + "learning_rate": 0.0002843541528537296, + "loss": 1.9502, + "step": 7978 + }, + { + "epoch": 0.9309298798273247, + "grad_norm": 1.1461434364318848, + "learning_rate": 0.0002843473967735161, + "loss": 2.0703, + "step": 7979 + }, + { + "epoch": 0.9310465523276164, + "grad_norm": 1.301177978515625, + "learning_rate": 0.0002843406393160878, + "loss": 2.2235, + "step": 7980 + }, + { + "epoch": 0.931163224827908, + "grad_norm": 1.1246711015701294, + "learning_rate": 0.00028433388048151474, + "loss": 2.1696, + "step": 7981 + }, + { + "epoch": 0.9312798973281997, + "grad_norm": 1.238592505455017, + "learning_rate": 0.000284327120269867, + "loss": 2.0646, + "step": 7982 + }, + { + "epoch": 0.9313965698284914, + "grad_norm": 1.1171120405197144, + "learning_rate": 0.0002843203586812146, + "loss": 2.2256, + "step": 7983 + }, + { + "epoch": 0.9315132423287831, + "grad_norm": 1.2186048030853271, + "learning_rate": 0.0002843135957156276, + "loss": 2.2604, + "step": 7984 + }, + { + "epoch": 0.9316299148290748, + "grad_norm": 1.2046220302581787, + "learning_rate": 0.00028430683137317626, + "loss": 2.0267, + "step": 7985 + }, + { + "epoch": 0.9317465873293664, + "grad_norm": 1.0360900163650513, + "learning_rate": 0.00028430006565393056, + "loss": 2.1402, + "step": 7986 + }, + { + "epoch": 0.9318632598296581, + "grad_norm": 1.0716800689697266, + "learning_rate": 0.00028429329855796077, + "loss": 2.327, + "step": 7987 + }, + { + "epoch": 0.9319799323299498, + "grad_norm": 1.1823192834854126, + "learning_rate": 0.0002842865300853369, + "loss": 2.3447, + "step": 7988 + }, + { + "epoch": 0.9320966048302415, + "grad_norm": 1.1025283336639404, + "learning_rate": 0.0002842797602361293, + "loss": 1.9651, + "step": 7989 + }, + { + "epoch": 0.9322132773305332, + "grad_norm": 1.0005701780319214, + "learning_rate": 0.00028427298901040793, + "loss": 1.8638, + "step": 7990 + }, + { + "epoch": 0.9323299498308248, + "grad_norm": 1.1309012174606323, + "learning_rate": 0.0002842662164082432, + "loss": 2.1501, + "step": 7991 + }, + { + "epoch": 0.9324466223311165, + "grad_norm": 1.0902622938156128, + "learning_rate": 0.00028425944242970515, + "loss": 2.0827, + "step": 7992 + }, + { + "epoch": 0.9325632948314082, + "grad_norm": 1.1535595655441284, + "learning_rate": 0.0002842526670748641, + "loss": 2.1014, + "step": 7993 + }, + { + "epoch": 0.9326799673316999, + "grad_norm": 1.090740442276001, + "learning_rate": 0.00028424589034379026, + "loss": 1.9866, + "step": 7994 + }, + { + "epoch": 0.9327966398319916, + "grad_norm": 1.3360283374786377, + "learning_rate": 0.0002842391122365539, + "loss": 2.2174, + "step": 7995 + }, + { + "epoch": 0.9329133123322833, + "grad_norm": 1.0601108074188232, + "learning_rate": 0.0002842323327532253, + "loss": 2.0622, + "step": 7996 + }, + { + "epoch": 0.9330299848325749, + "grad_norm": 1.2861580848693848, + "learning_rate": 0.0002842255518938747, + "loss": 2.2021, + "step": 7997 + }, + { + "epoch": 0.9331466573328666, + "grad_norm": 1.2600791454315186, + "learning_rate": 0.0002842187696585724, + "loss": 2.1096, + "step": 7998 + }, + { + "epoch": 0.9332633298331583, + "grad_norm": 1.1031837463378906, + "learning_rate": 0.00028421198604738874, + "loss": 1.9941, + "step": 7999 + }, + { + "epoch": 0.93338000233345, + "grad_norm": 1.1457524299621582, + "learning_rate": 0.0002842052010603941, + "loss": 2.1714, + "step": 8000 + }, + { + "epoch": 0.9334966748337417, + "grad_norm": 1.0752812623977661, + "learning_rate": 0.0002841984146976587, + "loss": 2.2774, + "step": 8001 + }, + { + "epoch": 0.9336133473340333, + "grad_norm": 1.3427860736846924, + "learning_rate": 0.00028419162695925295, + "loss": 2.077, + "step": 8002 + }, + { + "epoch": 0.933730019834325, + "grad_norm": 1.1079760789871216, + "learning_rate": 0.0002841848378452472, + "loss": 1.9641, + "step": 8003 + }, + { + "epoch": 0.9338466923346167, + "grad_norm": 1.1069691181182861, + "learning_rate": 0.0002841780473557119, + "loss": 2.1305, + "step": 8004 + }, + { + "epoch": 0.9339633648349084, + "grad_norm": 1.2477067708969116, + "learning_rate": 0.0002841712554907173, + "loss": 2.1529, + "step": 8005 + }, + { + "epoch": 0.9340800373352001, + "grad_norm": 1.3679028749465942, + "learning_rate": 0.000284164462250334, + "loss": 2.2925, + "step": 8006 + }, + { + "epoch": 0.9341967098354917, + "grad_norm": 0.9558112025260925, + "learning_rate": 0.00028415766763463225, + "loss": 2.0425, + "step": 8007 + }, + { + "epoch": 0.9343133823357834, + "grad_norm": 1.09604012966156, + "learning_rate": 0.0002841508716436826, + "loss": 2.1051, + "step": 8008 + }, + { + "epoch": 0.9344300548360751, + "grad_norm": 1.3421978950500488, + "learning_rate": 0.00028414407427755547, + "loss": 2.1908, + "step": 8009 + }, + { + "epoch": 0.9345467273363668, + "grad_norm": 0.9967025518417358, + "learning_rate": 0.0002841372755363213, + "loss": 2.0449, + "step": 8010 + }, + { + "epoch": 0.9346633998366585, + "grad_norm": 1.036281704902649, + "learning_rate": 0.00028413047542005066, + "loss": 2.0565, + "step": 8011 + }, + { + "epoch": 0.9347800723369502, + "grad_norm": 1.031887173652649, + "learning_rate": 0.00028412367392881396, + "loss": 2.0912, + "step": 8012 + }, + { + "epoch": 0.9348967448372418, + "grad_norm": 1.123057246208191, + "learning_rate": 0.00028411687106268173, + "loss": 2.038, + "step": 8013 + }, + { + "epoch": 0.9350134173375335, + "grad_norm": 1.1342356204986572, + "learning_rate": 0.0002841100668217245, + "loss": 2.175, + "step": 8014 + }, + { + "epoch": 0.9351300898378252, + "grad_norm": 1.0855133533477783, + "learning_rate": 0.0002841032612060128, + "loss": 2.0942, + "step": 8015 + }, + { + "epoch": 0.9352467623381169, + "grad_norm": 1.086283564567566, + "learning_rate": 0.0002840964542156172, + "loss": 2.212, + "step": 8016 + }, + { + "epoch": 0.9353634348384086, + "grad_norm": 0.9755321741104126, + "learning_rate": 0.0002840896458506083, + "loss": 2.2075, + "step": 8017 + }, + { + "epoch": 0.9354801073387002, + "grad_norm": 1.1790897846221924, + "learning_rate": 0.0002840828361110566, + "loss": 2.0502, + "step": 8018 + }, + { + "epoch": 0.9355967798389919, + "grad_norm": 1.0372095108032227, + "learning_rate": 0.00028407602499703273, + "loss": 1.9247, + "step": 8019 + }, + { + "epoch": 0.9357134523392836, + "grad_norm": 1.356740951538086, + "learning_rate": 0.0002840692125086073, + "loss": 2.0737, + "step": 8020 + }, + { + "epoch": 0.9358301248395753, + "grad_norm": 1.225103497505188, + "learning_rate": 0.000284062398645851, + "loss": 2.1519, + "step": 8021 + }, + { + "epoch": 0.935946797339867, + "grad_norm": 1.132494568824768, + "learning_rate": 0.0002840555834088344, + "loss": 2.2395, + "step": 8022 + }, + { + "epoch": 0.9360634698401586, + "grad_norm": 1.1819425821304321, + "learning_rate": 0.0002840487667976281, + "loss": 2.1722, + "step": 8023 + }, + { + "epoch": 0.9361801423404503, + "grad_norm": 0.9264653325080872, + "learning_rate": 0.0002840419488123029, + "loss": 2.1352, + "step": 8024 + }, + { + "epoch": 0.936296814840742, + "grad_norm": 1.0253020524978638, + "learning_rate": 0.0002840351294529293, + "loss": 2.146, + "step": 8025 + }, + { + "epoch": 0.9364134873410337, + "grad_norm": 1.164390206336975, + "learning_rate": 0.00028402830871957825, + "loss": 2.1589, + "step": 8026 + }, + { + "epoch": 0.9365301598413254, + "grad_norm": 1.1148897409439087, + "learning_rate": 0.00028402148661232023, + "loss": 2.1148, + "step": 8027 + }, + { + "epoch": 0.936646832341617, + "grad_norm": 1.425548791885376, + "learning_rate": 0.0002840146631312261, + "loss": 2.1824, + "step": 8028 + }, + { + "epoch": 0.9367635048419087, + "grad_norm": 1.0404874086380005, + "learning_rate": 0.0002840078382763665, + "loss": 2.1273, + "step": 8029 + }, + { + "epoch": 0.9368801773422004, + "grad_norm": 1.3619792461395264, + "learning_rate": 0.00028400101204781227, + "loss": 2.2627, + "step": 8030 + }, + { + "epoch": 0.9369968498424921, + "grad_norm": 1.2796903848648071, + "learning_rate": 0.0002839941844456341, + "loss": 2.1054, + "step": 8031 + }, + { + "epoch": 0.9371135223427838, + "grad_norm": 1.1620181798934937, + "learning_rate": 0.00028398735546990284, + "loss": 2.0345, + "step": 8032 + }, + { + "epoch": 0.9372301948430755, + "grad_norm": 0.9875627756118774, + "learning_rate": 0.00028398052512068926, + "loss": 1.9732, + "step": 8033 + }, + { + "epoch": 0.9373468673433671, + "grad_norm": 1.136635661125183, + "learning_rate": 0.00028397369339806415, + "loss": 1.9626, + "step": 8034 + }, + { + "epoch": 0.9374635398436588, + "grad_norm": 1.1389747858047485, + "learning_rate": 0.00028396686030209836, + "loss": 2.1703, + "step": 8035 + }, + { + "epoch": 0.9375802123439505, + "grad_norm": 1.1525416374206543, + "learning_rate": 0.0002839600258328627, + "loss": 2.1783, + "step": 8036 + }, + { + "epoch": 0.9376968848442422, + "grad_norm": 1.2706809043884277, + "learning_rate": 0.00028395318999042805, + "loss": 2.1995, + "step": 8037 + }, + { + "epoch": 0.9378135573445339, + "grad_norm": 1.2618913650512695, + "learning_rate": 0.00028394635277486524, + "loss": 2.1233, + "step": 8038 + }, + { + "epoch": 0.9379302298448255, + "grad_norm": 1.2359689474105835, + "learning_rate": 0.0002839395141862452, + "loss": 2.3207, + "step": 8039 + }, + { + "epoch": 0.9380469023451172, + "grad_norm": 1.1950842142105103, + "learning_rate": 0.0002839326742246388, + "loss": 2.0919, + "step": 8040 + }, + { + "epoch": 0.9381635748454089, + "grad_norm": 0.863520622253418, + "learning_rate": 0.00028392583289011693, + "loss": 1.7009, + "step": 8041 + }, + { + "epoch": 0.9382802473457006, + "grad_norm": 1.0755060911178589, + "learning_rate": 0.0002839189901827506, + "loss": 1.9531, + "step": 8042 + }, + { + "epoch": 0.9383969198459923, + "grad_norm": 1.1591105461120605, + "learning_rate": 0.00028391214610261063, + "loss": 1.8959, + "step": 8043 + }, + { + "epoch": 0.938513592346284, + "grad_norm": 0.995438814163208, + "learning_rate": 0.00028390530064976803, + "loss": 2.1515, + "step": 8044 + }, + { + "epoch": 0.9386302648465756, + "grad_norm": 1.3834409713745117, + "learning_rate": 0.0002838984538242937, + "loss": 2.1212, + "step": 8045 + }, + { + "epoch": 0.9387469373468673, + "grad_norm": 1.1725176572799683, + "learning_rate": 0.0002838916056262588, + "loss": 2.0669, + "step": 8046 + }, + { + "epoch": 0.938863609847159, + "grad_norm": 1.1477594375610352, + "learning_rate": 0.0002838847560557341, + "loss": 2.1584, + "step": 8047 + }, + { + "epoch": 0.9389802823474507, + "grad_norm": 1.2274786233901978, + "learning_rate": 0.0002838779051127908, + "loss": 2.2073, + "step": 8048 + }, + { + "epoch": 0.9390969548477424, + "grad_norm": 1.0868299007415771, + "learning_rate": 0.00028387105279749976, + "loss": 1.9608, + "step": 8049 + }, + { + "epoch": 0.939213627348034, + "grad_norm": 1.1210057735443115, + "learning_rate": 0.00028386419910993215, + "loss": 1.9145, + "step": 8050 + }, + { + "epoch": 0.9393302998483257, + "grad_norm": 1.2329264879226685, + "learning_rate": 0.0002838573440501589, + "loss": 2.1348, + "step": 8051 + }, + { + "epoch": 0.9394469723486174, + "grad_norm": 1.1412274837493896, + "learning_rate": 0.0002838504876182512, + "loss": 2.0951, + "step": 8052 + }, + { + "epoch": 0.9395636448489091, + "grad_norm": 1.1478779315948486, + "learning_rate": 0.00028384362981428, + "loss": 1.9855, + "step": 8053 + }, + { + "epoch": 0.9396803173492008, + "grad_norm": 1.7518608570098877, + "learning_rate": 0.00028383677063831655, + "loss": 2.152, + "step": 8054 + }, + { + "epoch": 0.9397969898494924, + "grad_norm": 1.2172746658325195, + "learning_rate": 0.0002838299100904318, + "loss": 2.1899, + "step": 8055 + }, + { + "epoch": 0.9399136623497841, + "grad_norm": 1.1631700992584229, + "learning_rate": 0.000283823048170697, + "loss": 1.9779, + "step": 8056 + }, + { + "epoch": 0.9400303348500758, + "grad_norm": 1.0075442790985107, + "learning_rate": 0.0002838161848791832, + "loss": 2.0036, + "step": 8057 + }, + { + "epoch": 0.9401470073503675, + "grad_norm": 1.252729058265686, + "learning_rate": 0.00028380932021596154, + "loss": 2.1096, + "step": 8058 + }, + { + "epoch": 0.9402636798506592, + "grad_norm": 1.2972831726074219, + "learning_rate": 0.0002838024541811033, + "loss": 2.0757, + "step": 8059 + }, + { + "epoch": 0.9403803523509509, + "grad_norm": 1.095494270324707, + "learning_rate": 0.00028379558677467956, + "loss": 1.9199, + "step": 8060 + }, + { + "epoch": 0.9404970248512425, + "grad_norm": 1.133412480354309, + "learning_rate": 0.00028378871799676153, + "loss": 1.9748, + "step": 8061 + }, + { + "epoch": 0.9406136973515342, + "grad_norm": 0.9724987745285034, + "learning_rate": 0.00028378184784742046, + "loss": 1.8655, + "step": 8062 + }, + { + "epoch": 0.9407303698518259, + "grad_norm": 1.0675561428070068, + "learning_rate": 0.0002837749763267275, + "loss": 2.0832, + "step": 8063 + }, + { + "epoch": 0.9408470423521176, + "grad_norm": 1.1619795560836792, + "learning_rate": 0.000283768103434754, + "loss": 2.1268, + "step": 8064 + }, + { + "epoch": 0.9409637148524093, + "grad_norm": 1.1240631341934204, + "learning_rate": 0.00028376122917157104, + "loss": 2.0749, + "step": 8065 + }, + { + "epoch": 0.9410803873527009, + "grad_norm": 1.0499290227890015, + "learning_rate": 0.00028375435353725003, + "loss": 2.081, + "step": 8066 + }, + { + "epoch": 0.9411970598529926, + "grad_norm": 1.1148730516433716, + "learning_rate": 0.0002837474765318622, + "loss": 2.002, + "step": 8067 + }, + { + "epoch": 0.9413137323532843, + "grad_norm": 1.073714017868042, + "learning_rate": 0.00028374059815547886, + "loss": 2.0479, + "step": 8068 + }, + { + "epoch": 0.941430404853576, + "grad_norm": 1.092005729675293, + "learning_rate": 0.00028373371840817127, + "loss": 2.0287, + "step": 8069 + }, + { + "epoch": 0.9415470773538677, + "grad_norm": 1.1563644409179688, + "learning_rate": 0.0002837268372900108, + "loss": 2.0854, + "step": 8070 + }, + { + "epoch": 0.9416637498541593, + "grad_norm": 1.0757368803024292, + "learning_rate": 0.0002837199548010688, + "loss": 1.9843, + "step": 8071 + }, + { + "epoch": 0.941780422354451, + "grad_norm": 1.2122461795806885, + "learning_rate": 0.00028371307094141653, + "loss": 2.2014, + "step": 8072 + }, + { + "epoch": 0.9418970948547427, + "grad_norm": 1.2680087089538574, + "learning_rate": 0.00028370618571112546, + "loss": 2.0988, + "step": 8073 + }, + { + "epoch": 0.9420137673550344, + "grad_norm": 1.232986330986023, + "learning_rate": 0.00028369929911026693, + "loss": 2.1144, + "step": 8074 + }, + { + "epoch": 0.9421304398553261, + "grad_norm": 1.2193726301193237, + "learning_rate": 0.0002836924111389123, + "loss": 2.0322, + "step": 8075 + }, + { + "epoch": 0.9422471123556178, + "grad_norm": 1.1362422704696655, + "learning_rate": 0.00028368552179713295, + "loss": 2.1026, + "step": 8076 + }, + { + "epoch": 0.9423637848559094, + "grad_norm": 1.0778822898864746, + "learning_rate": 0.00028367863108500044, + "loss": 2.0439, + "step": 8077 + }, + { + "epoch": 0.9424804573562011, + "grad_norm": 1.2044763565063477, + "learning_rate": 0.00028367173900258606, + "loss": 2.1494, + "step": 8078 + }, + { + "epoch": 0.9425971298564928, + "grad_norm": 1.0705093145370483, + "learning_rate": 0.00028366484554996136, + "loss": 2.1971, + "step": 8079 + }, + { + "epoch": 0.9427138023567845, + "grad_norm": 1.0616320371627808, + "learning_rate": 0.00028365795072719775, + "loss": 2.2245, + "step": 8080 + }, + { + "epoch": 0.9428304748570762, + "grad_norm": 1.102337121963501, + "learning_rate": 0.0002836510545343667, + "loss": 2.1624, + "step": 8081 + }, + { + "epoch": 0.9429471473573678, + "grad_norm": 1.206834077835083, + "learning_rate": 0.0002836441569715397, + "loss": 2.2607, + "step": 8082 + }, + { + "epoch": 0.9430638198576595, + "grad_norm": 1.122274398803711, + "learning_rate": 0.00028363725803878836, + "loss": 2.2481, + "step": 8083 + }, + { + "epoch": 0.9431804923579512, + "grad_norm": 1.1153028011322021, + "learning_rate": 0.0002836303577361841, + "loss": 1.9723, + "step": 8084 + }, + { + "epoch": 0.9432971648582429, + "grad_norm": 0.9497330784797668, + "learning_rate": 0.0002836234560637984, + "loss": 1.9421, + "step": 8085 + }, + { + "epoch": 0.9434138373585346, + "grad_norm": 1.1315723657608032, + "learning_rate": 0.0002836165530217029, + "loss": 1.9484, + "step": 8086 + }, + { + "epoch": 0.9435305098588262, + "grad_norm": 1.2637642621994019, + "learning_rate": 0.0002836096486099692, + "loss": 2.0616, + "step": 8087 + }, + { + "epoch": 0.9436471823591179, + "grad_norm": 1.0635923147201538, + "learning_rate": 0.0002836027428286688, + "loss": 2.0973, + "step": 8088 + }, + { + "epoch": 0.9437638548594096, + "grad_norm": 1.2142491340637207, + "learning_rate": 0.0002835958356778733, + "loss": 2.0334, + "step": 8089 + }, + { + "epoch": 0.9438805273597013, + "grad_norm": 1.1753416061401367, + "learning_rate": 0.0002835889271576543, + "loss": 2.0934, + "step": 8090 + }, + { + "epoch": 0.943997199859993, + "grad_norm": 0.9723520874977112, + "learning_rate": 0.00028358201726808344, + "loss": 1.9405, + "step": 8091 + }, + { + "epoch": 0.9441138723602847, + "grad_norm": 1.1786930561065674, + "learning_rate": 0.00028357510600923237, + "loss": 2.2649, + "step": 8092 + }, + { + "epoch": 0.9442305448605763, + "grad_norm": 0.908631443977356, + "learning_rate": 0.0002835681933811727, + "loss": 1.9585, + "step": 8093 + }, + { + "epoch": 0.944347217360868, + "grad_norm": 1.1110515594482422, + "learning_rate": 0.0002835612793839762, + "loss": 2.1671, + "step": 8094 + }, + { + "epoch": 0.9444638898611597, + "grad_norm": 1.2597066164016724, + "learning_rate": 0.0002835543640177144, + "loss": 2.1153, + "step": 8095 + }, + { + "epoch": 0.9445805623614514, + "grad_norm": 1.3384028673171997, + "learning_rate": 0.00028354744728245905, + "loss": 2.1445, + "step": 8096 + }, + { + "epoch": 0.9446972348617431, + "grad_norm": 1.0312447547912598, + "learning_rate": 0.0002835405291782819, + "loss": 2.1906, + "step": 8097 + }, + { + "epoch": 0.9448139073620347, + "grad_norm": 1.0775086879730225, + "learning_rate": 0.00028353360970525453, + "loss": 2.1816, + "step": 8098 + }, + { + "epoch": 0.9449305798623264, + "grad_norm": 1.2514700889587402, + "learning_rate": 0.00028352668886344885, + "loss": 2.1688, + "step": 8099 + }, + { + "epoch": 0.9450472523626181, + "grad_norm": 0.9777731895446777, + "learning_rate": 0.0002835197666529365, + "loss": 1.9729, + "step": 8100 + }, + { + "epoch": 0.9451639248629098, + "grad_norm": 1.2201296091079712, + "learning_rate": 0.0002835128430737893, + "loss": 2.351, + "step": 8101 + }, + { + "epoch": 0.9452805973632015, + "grad_norm": 1.0380215644836426, + "learning_rate": 0.0002835059181260789, + "loss": 2.086, + "step": 8102 + }, + { + "epoch": 0.9453972698634932, + "grad_norm": 1.137380599975586, + "learning_rate": 0.00028349899180987726, + "loss": 1.9504, + "step": 8103 + }, + { + "epoch": 0.9455139423637848, + "grad_norm": 1.257280945777893, + "learning_rate": 0.0002834920641252561, + "loss": 2.1467, + "step": 8104 + }, + { + "epoch": 0.9456306148640765, + "grad_norm": 1.2821887731552124, + "learning_rate": 0.00028348513507228727, + "loss": 2.2148, + "step": 8105 + }, + { + "epoch": 0.9457472873643682, + "grad_norm": 1.440726399421692, + "learning_rate": 0.0002834782046510425, + "loss": 2.1358, + "step": 8106 + }, + { + "epoch": 0.9458639598646599, + "grad_norm": 1.1144506931304932, + "learning_rate": 0.00028347127286159387, + "loss": 2.0668, + "step": 8107 + }, + { + "epoch": 0.9459806323649516, + "grad_norm": 1.1403063535690308, + "learning_rate": 0.00028346433970401295, + "loss": 2.0946, + "step": 8108 + }, + { + "epoch": 0.9460973048652432, + "grad_norm": 1.2581244707107544, + "learning_rate": 0.0002834574051783718, + "loss": 1.9428, + "step": 8109 + }, + { + "epoch": 0.9462139773655349, + "grad_norm": 1.0694200992584229, + "learning_rate": 0.00028345046928474223, + "loss": 2.213, + "step": 8110 + }, + { + "epoch": 0.9463306498658266, + "grad_norm": 1.1994606256484985, + "learning_rate": 0.0002834435320231962, + "loss": 2.328, + "step": 8111 + }, + { + "epoch": 0.9464473223661183, + "grad_norm": 1.1972627639770508, + "learning_rate": 0.0002834365933938057, + "loss": 2.1454, + "step": 8112 + }, + { + "epoch": 0.94656399486641, + "grad_norm": 1.1672194004058838, + "learning_rate": 0.00028342965339664243, + "loss": 2.1221, + "step": 8113 + }, + { + "epoch": 0.9466806673667016, + "grad_norm": 1.074910283088684, + "learning_rate": 0.00028342271203177856, + "loss": 2.0279, + "step": 8114 + }, + { + "epoch": 0.9467973398669933, + "grad_norm": 1.1531076431274414, + "learning_rate": 0.00028341576929928586, + "loss": 1.8983, + "step": 8115 + }, + { + "epoch": 0.946914012367285, + "grad_norm": 1.129930019378662, + "learning_rate": 0.0002834088251992365, + "loss": 2.161, + "step": 8116 + }, + { + "epoch": 0.9470306848675767, + "grad_norm": 1.0701541900634766, + "learning_rate": 0.00028340187973170233, + "loss": 2.006, + "step": 8117 + }, + { + "epoch": 0.9471473573678684, + "grad_norm": 1.0771695375442505, + "learning_rate": 0.0002833949328967554, + "loss": 2.1575, + "step": 8118 + }, + { + "epoch": 0.94726402986816, + "grad_norm": 1.0541114807128906, + "learning_rate": 0.0002833879846944678, + "loss": 2.2393, + "step": 8119 + }, + { + "epoch": 0.9473807023684517, + "grad_norm": 1.0611722469329834, + "learning_rate": 0.0002833810351249115, + "loss": 1.9036, + "step": 8120 + }, + { + "epoch": 0.9474973748687434, + "grad_norm": 1.2260329723358154, + "learning_rate": 0.0002833740841881584, + "loss": 2.1509, + "step": 8121 + }, + { + "epoch": 0.9476140473690351, + "grad_norm": 1.2320021390914917, + "learning_rate": 0.00028336713188428076, + "loss": 2.0144, + "step": 8122 + }, + { + "epoch": 0.9477307198693268, + "grad_norm": 1.004175066947937, + "learning_rate": 0.0002833601782133506, + "loss": 1.9562, + "step": 8123 + }, + { + "epoch": 0.9478473923696185, + "grad_norm": 1.1394189596176147, + "learning_rate": 0.00028335322317544, + "loss": 2.1836, + "step": 8124 + }, + { + "epoch": 0.9479640648699101, + "grad_norm": 1.1454873085021973, + "learning_rate": 0.000283346266770621, + "loss": 2.104, + "step": 8125 + }, + { + "epoch": 0.9480807373702018, + "grad_norm": 1.1854814291000366, + "learning_rate": 0.00028333930899896587, + "loss": 2.0868, + "step": 8126 + }, + { + "epoch": 0.9481974098704935, + "grad_norm": 1.1356117725372314, + "learning_rate": 0.0002833323498605466, + "loss": 2.087, + "step": 8127 + }, + { + "epoch": 0.9483140823707852, + "grad_norm": 1.1951093673706055, + "learning_rate": 0.00028332538935543533, + "loss": 2.145, + "step": 8128 + }, + { + "epoch": 0.9484307548710769, + "grad_norm": 1.1686854362487793, + "learning_rate": 0.0002833184274837043, + "loss": 2.0385, + "step": 8129 + }, + { + "epoch": 0.9485474273713685, + "grad_norm": 1.5062894821166992, + "learning_rate": 0.00028331146424542564, + "loss": 2.2127, + "step": 8130 + }, + { + "epoch": 0.9486640998716602, + "grad_norm": 1.1046174764633179, + "learning_rate": 0.0002833044996406716, + "loss": 2.2185, + "step": 8131 + }, + { + "epoch": 0.9487807723719519, + "grad_norm": 1.1530513763427734, + "learning_rate": 0.0002832975336695143, + "loss": 2.1074, + "step": 8132 + }, + { + "epoch": 0.9488974448722436, + "grad_norm": 1.082339882850647, + "learning_rate": 0.000283290566332026, + "loss": 2.0763, + "step": 8133 + }, + { + "epoch": 0.9490141173725353, + "grad_norm": 1.2863261699676514, + "learning_rate": 0.00028328359762827885, + "loss": 2.0659, + "step": 8134 + }, + { + "epoch": 0.949130789872827, + "grad_norm": 1.1780908107757568, + "learning_rate": 0.0002832766275583452, + "loss": 2.3535, + "step": 8135 + }, + { + "epoch": 0.9492474623731186, + "grad_norm": 1.2922214269638062, + "learning_rate": 0.0002832696561222973, + "loss": 2.2583, + "step": 8136 + }, + { + "epoch": 0.9493641348734103, + "grad_norm": 1.265657663345337, + "learning_rate": 0.00028326268332020733, + "loss": 2.322, + "step": 8137 + }, + { + "epoch": 0.949480807373702, + "grad_norm": 1.0961307287216187, + "learning_rate": 0.0002832557091521477, + "loss": 2.1338, + "step": 8138 + }, + { + "epoch": 0.9495974798739937, + "grad_norm": 1.1404774188995361, + "learning_rate": 0.0002832487336181906, + "loss": 2.0976, + "step": 8139 + }, + { + "epoch": 0.9497141523742854, + "grad_norm": 1.3784418106079102, + "learning_rate": 0.00028324175671840836, + "loss": 2.2391, + "step": 8140 + }, + { + "epoch": 0.949830824874577, + "grad_norm": 1.1654032468795776, + "learning_rate": 0.0002832347784528734, + "loss": 2.2134, + "step": 8141 + }, + { + "epoch": 0.9499474973748687, + "grad_norm": 1.0675255060195923, + "learning_rate": 0.00028322779882165795, + "loss": 2.2107, + "step": 8142 + }, + { + "epoch": 0.9500641698751604, + "grad_norm": 1.0934138298034668, + "learning_rate": 0.0002832208178248344, + "loss": 2.0774, + "step": 8143 + }, + { + "epoch": 0.9501808423754521, + "grad_norm": 1.1128185987472534, + "learning_rate": 0.00028321383546247523, + "loss": 2.0582, + "step": 8144 + }, + { + "epoch": 0.9502975148757438, + "grad_norm": 1.4919023513793945, + "learning_rate": 0.0002832068517346527, + "loss": 2.2244, + "step": 8145 + }, + { + "epoch": 0.9504141873760354, + "grad_norm": 1.087350606918335, + "learning_rate": 0.00028319986664143926, + "loss": 2.096, + "step": 8146 + }, + { + "epoch": 0.9505308598763271, + "grad_norm": 1.0506906509399414, + "learning_rate": 0.0002831928801829072, + "loss": 2.1618, + "step": 8147 + }, + { + "epoch": 0.9506475323766188, + "grad_norm": 1.21791410446167, + "learning_rate": 0.00028318589235912917, + "loss": 2.2156, + "step": 8148 + }, + { + "epoch": 0.9507642048769105, + "grad_norm": 1.2545193433761597, + "learning_rate": 0.00028317890317017746, + "loss": 2.143, + "step": 8149 + }, + { + "epoch": 0.9508808773772022, + "grad_norm": 1.0148595571517944, + "learning_rate": 0.0002831719126161246, + "loss": 2.06, + "step": 8150 + }, + { + "epoch": 0.9509975498774939, + "grad_norm": 1.239933967590332, + "learning_rate": 0.000283164920697043, + "loss": 2.2434, + "step": 8151 + }, + { + "epoch": 0.9511142223777855, + "grad_norm": 1.3807487487792969, + "learning_rate": 0.0002831579274130051, + "loss": 2.1964, + "step": 8152 + }, + { + "epoch": 0.9512308948780772, + "grad_norm": 1.1485446691513062, + "learning_rate": 0.00028315093276408355, + "loss": 2.3328, + "step": 8153 + }, + { + "epoch": 0.9513475673783689, + "grad_norm": 1.1917102336883545, + "learning_rate": 0.00028314393675035076, + "loss": 2.2353, + "step": 8154 + }, + { + "epoch": 0.9514642398786606, + "grad_norm": 1.0321202278137207, + "learning_rate": 0.00028313693937187924, + "loss": 2.0109, + "step": 8155 + }, + { + "epoch": 0.9515809123789523, + "grad_norm": 1.0930845737457275, + "learning_rate": 0.00028312994062874164, + "loss": 1.9803, + "step": 8156 + }, + { + "epoch": 0.9516975848792439, + "grad_norm": 1.1551553010940552, + "learning_rate": 0.00028312294052101043, + "loss": 2.1887, + "step": 8157 + }, + { + "epoch": 0.9518142573795356, + "grad_norm": 1.1400947570800781, + "learning_rate": 0.00028311593904875815, + "loss": 1.9861, + "step": 8158 + }, + { + "epoch": 0.9519309298798273, + "grad_norm": 1.1942070722579956, + "learning_rate": 0.00028310893621205744, + "loss": 2.224, + "step": 8159 + }, + { + "epoch": 0.952047602380119, + "grad_norm": 1.1012922525405884, + "learning_rate": 0.0002831019320109809, + "loss": 2.1488, + "step": 8160 + }, + { + "epoch": 0.9521642748804107, + "grad_norm": 1.1041576862335205, + "learning_rate": 0.0002830949264456011, + "loss": 2.0251, + "step": 8161 + }, + { + "epoch": 0.9522809473807023, + "grad_norm": 1.0272053480148315, + "learning_rate": 0.0002830879195159907, + "loss": 2.0084, + "step": 8162 + }, + { + "epoch": 0.952397619880994, + "grad_norm": 1.2432093620300293, + "learning_rate": 0.00028308091122222236, + "loss": 2.1132, + "step": 8163 + }, + { + "epoch": 0.9525142923812857, + "grad_norm": 1.2664804458618164, + "learning_rate": 0.0002830739015643687, + "loss": 2.095, + "step": 8164 + }, + { + "epoch": 0.9526309648815774, + "grad_norm": 1.2057737112045288, + "learning_rate": 0.0002830668905425024, + "loss": 2.2926, + "step": 8165 + }, + { + "epoch": 0.9527476373818691, + "grad_norm": 1.2911036014556885, + "learning_rate": 0.00028305987815669607, + "loss": 2.2699, + "step": 8166 + }, + { + "epoch": 0.9528643098821608, + "grad_norm": 1.4893916845321655, + "learning_rate": 0.0002830528644070226, + "loss": 2.1806, + "step": 8167 + }, + { + "epoch": 0.9529809823824524, + "grad_norm": 1.0955300331115723, + "learning_rate": 0.0002830458492935544, + "loss": 2.117, + "step": 8168 + }, + { + "epoch": 0.9530976548827441, + "grad_norm": 1.2271462678909302, + "learning_rate": 0.0002830388328163645, + "loss": 2.2163, + "step": 8169 + }, + { + "epoch": 0.9532143273830358, + "grad_norm": 1.2669918537139893, + "learning_rate": 0.00028303181497552554, + "loss": 2.1557, + "step": 8170 + }, + { + "epoch": 0.9533309998833275, + "grad_norm": 1.2140368223190308, + "learning_rate": 0.00028302479577111013, + "loss": 1.9106, + "step": 8171 + }, + { + "epoch": 0.9534476723836192, + "grad_norm": 1.0618327856063843, + "learning_rate": 0.0002830177752031912, + "loss": 2.0863, + "step": 8172 + }, + { + "epoch": 0.9535643448839108, + "grad_norm": 1.073552131652832, + "learning_rate": 0.00028301075327184143, + "loss": 2.0908, + "step": 8173 + }, + { + "epoch": 0.9536810173842025, + "grad_norm": 1.2572970390319824, + "learning_rate": 0.0002830037299771338, + "loss": 2.194, + "step": 8174 + }, + { + "epoch": 0.9537976898844942, + "grad_norm": 1.3042923212051392, + "learning_rate": 0.00028299670531914086, + "loss": 1.9562, + "step": 8175 + }, + { + "epoch": 0.9539143623847859, + "grad_norm": 1.1826660633087158, + "learning_rate": 0.0002829896792979356, + "loss": 1.9803, + "step": 8176 + }, + { + "epoch": 0.9540310348850776, + "grad_norm": 0.91898113489151, + "learning_rate": 0.0002829826519135908, + "loss": 1.9997, + "step": 8177 + }, + { + "epoch": 0.9541477073853692, + "grad_norm": 1.076879620552063, + "learning_rate": 0.0002829756231661794, + "loss": 2.2006, + "step": 8178 + }, + { + "epoch": 0.9542643798856609, + "grad_norm": 1.1377254724502563, + "learning_rate": 0.0002829685930557741, + "loss": 2.1695, + "step": 8179 + }, + { + "epoch": 0.9543810523859526, + "grad_norm": 1.125827431678772, + "learning_rate": 0.00028296156158244795, + "loss": 2.0992, + "step": 8180 + }, + { + "epoch": 0.9544977248862443, + "grad_norm": 1.2996283769607544, + "learning_rate": 0.00028295452874627374, + "loss": 2.0996, + "step": 8181 + }, + { + "epoch": 0.954614397386536, + "grad_norm": 1.2265254259109497, + "learning_rate": 0.0002829474945473244, + "loss": 2.3132, + "step": 8182 + }, + { + "epoch": 0.9547310698868277, + "grad_norm": 1.2239588499069214, + "learning_rate": 0.0002829404589856729, + "loss": 2.073, + "step": 8183 + }, + { + "epoch": 0.9548477423871193, + "grad_norm": 1.1903473138809204, + "learning_rate": 0.0002829334220613921, + "loss": 2.0619, + "step": 8184 + }, + { + "epoch": 0.954964414887411, + "grad_norm": 1.2774465084075928, + "learning_rate": 0.000282926383774555, + "loss": 2.0222, + "step": 8185 + }, + { + "epoch": 0.9550810873877027, + "grad_norm": 1.1468342542648315, + "learning_rate": 0.00028291934412523457, + "loss": 2.1611, + "step": 8186 + }, + { + "epoch": 0.9551977598879944, + "grad_norm": 1.038784146308899, + "learning_rate": 0.00028291230311350377, + "loss": 2.0832, + "step": 8187 + }, + { + "epoch": 0.9553144323882861, + "grad_norm": 1.1935580968856812, + "learning_rate": 0.0002829052607394356, + "loss": 2.1495, + "step": 8188 + }, + { + "epoch": 0.9554311048885777, + "grad_norm": 1.1878174543380737, + "learning_rate": 0.00028289821700310306, + "loss": 2.0499, + "step": 8189 + }, + { + "epoch": 0.9555477773888694, + "grad_norm": 1.0984762907028198, + "learning_rate": 0.0002828911719045792, + "loss": 2.0558, + "step": 8190 + }, + { + "epoch": 0.9556644498891611, + "grad_norm": 1.1178984642028809, + "learning_rate": 0.000282884125443937, + "loss": 2.0762, + "step": 8191 + }, + { + "epoch": 0.9557811223894528, + "grad_norm": 1.2136762142181396, + "learning_rate": 0.00028287707762124955, + "loss": 2.2106, + "step": 8192 + }, + { + "epoch": 0.9558977948897445, + "grad_norm": 1.2416975498199463, + "learning_rate": 0.0002828700284365899, + "loss": 2.0985, + "step": 8193 + }, + { + "epoch": 0.9560144673900361, + "grad_norm": 1.2264798879623413, + "learning_rate": 0.00028286297789003115, + "loss": 2.2406, + "step": 8194 + }, + { + "epoch": 0.9561311398903278, + "grad_norm": 1.0045137405395508, + "learning_rate": 0.0002828559259816464, + "loss": 1.9215, + "step": 8195 + }, + { + "epoch": 0.9562478123906195, + "grad_norm": 1.2263705730438232, + "learning_rate": 0.0002828488727115087, + "loss": 2.1909, + "step": 8196 + }, + { + "epoch": 0.9563644848909112, + "grad_norm": 1.1148275136947632, + "learning_rate": 0.0002828418180796912, + "loss": 2.1051, + "step": 8197 + }, + { + "epoch": 0.9564811573912029, + "grad_norm": 1.143176794052124, + "learning_rate": 0.00028283476208626707, + "loss": 1.984, + "step": 8198 + }, + { + "epoch": 0.9565978298914946, + "grad_norm": 1.2288734912872314, + "learning_rate": 0.0002828277047313094, + "loss": 2.128, + "step": 8199 + }, + { + "epoch": 0.9567145023917862, + "grad_norm": 0.9968921542167664, + "learning_rate": 0.0002828206460148914, + "loss": 2.0446, + "step": 8200 + }, + { + "epoch": 0.9568311748920779, + "grad_norm": 1.1876842975616455, + "learning_rate": 0.00028281358593708626, + "loss": 2.1808, + "step": 8201 + }, + { + "epoch": 0.9569478473923696, + "grad_norm": 1.2068727016448975, + "learning_rate": 0.00028280652449796706, + "loss": 2.2027, + "step": 8202 + }, + { + "epoch": 0.9570645198926613, + "grad_norm": 1.003962516784668, + "learning_rate": 0.00028279946169760715, + "loss": 2.1028, + "step": 8203 + }, + { + "epoch": 0.957181192392953, + "grad_norm": 1.1860018968582153, + "learning_rate": 0.00028279239753607966, + "loss": 2.0517, + "step": 8204 + }, + { + "epoch": 0.9572978648932446, + "grad_norm": 1.03196382522583, + "learning_rate": 0.0002827853320134579, + "loss": 2.2073, + "step": 8205 + }, + { + "epoch": 0.9574145373935363, + "grad_norm": 1.2101024389266968, + "learning_rate": 0.000282778265129815, + "loss": 2.1515, + "step": 8206 + }, + { + "epoch": 0.957531209893828, + "grad_norm": 1.1294724941253662, + "learning_rate": 0.0002827711968852243, + "loss": 2.2233, + "step": 8207 + }, + { + "epoch": 0.9576478823941197, + "grad_norm": 0.8366424441337585, + "learning_rate": 0.000282764127279759, + "loss": 1.8701, + "step": 8208 + }, + { + "epoch": 0.9577645548944114, + "grad_norm": 1.1299378871917725, + "learning_rate": 0.0002827570563134925, + "loss": 2.1372, + "step": 8209 + }, + { + "epoch": 0.957881227394703, + "grad_norm": 1.2314047813415527, + "learning_rate": 0.0002827499839864981, + "loss": 2.2204, + "step": 8210 + }, + { + "epoch": 0.9579978998949947, + "grad_norm": 1.1772931814193726, + "learning_rate": 0.00028274291029884905, + "loss": 2.328, + "step": 8211 + }, + { + "epoch": 0.9581145723952864, + "grad_norm": 1.0360417366027832, + "learning_rate": 0.0002827358352506187, + "loss": 2.0894, + "step": 8212 + }, + { + "epoch": 0.9582312448955781, + "grad_norm": 1.4237219095230103, + "learning_rate": 0.0002827287588418804, + "loss": 2.2285, + "step": 8213 + }, + { + "epoch": 0.9583479173958698, + "grad_norm": 1.2366126775741577, + "learning_rate": 0.0002827216810727075, + "loss": 2.0745, + "step": 8214 + }, + { + "epoch": 0.9584645898961615, + "grad_norm": 1.1245213747024536, + "learning_rate": 0.0002827146019431734, + "loss": 2.1602, + "step": 8215 + }, + { + "epoch": 0.9585812623964531, + "grad_norm": 1.0577305555343628, + "learning_rate": 0.0002827075214533515, + "loss": 1.8174, + "step": 8216 + }, + { + "epoch": 0.9586979348967448, + "grad_norm": 1.0090112686157227, + "learning_rate": 0.00028270043960331516, + "loss": 2.1771, + "step": 8217 + }, + { + "epoch": 0.9588146073970365, + "grad_norm": 1.039625883102417, + "learning_rate": 0.00028269335639313775, + "loss": 2.2029, + "step": 8218 + }, + { + "epoch": 0.9589312798973282, + "grad_norm": 1.3832927942276, + "learning_rate": 0.00028268627182289285, + "loss": 2.2689, + "step": 8219 + }, + { + "epoch": 0.9590479523976199, + "grad_norm": 1.1984143257141113, + "learning_rate": 0.0002826791858926538, + "loss": 2.3266, + "step": 8220 + }, + { + "epoch": 0.9591646248979115, + "grad_norm": 1.1688166856765747, + "learning_rate": 0.0002826720986024941, + "loss": 2.2871, + "step": 8221 + }, + { + "epoch": 0.9592812973982032, + "grad_norm": 1.2277954816818237, + "learning_rate": 0.00028266500995248716, + "loss": 2.1498, + "step": 8222 + }, + { + "epoch": 0.9593979698984949, + "grad_norm": 1.154105544090271, + "learning_rate": 0.00028265791994270655, + "loss": 2.0205, + "step": 8223 + }, + { + "epoch": 0.9595146423987866, + "grad_norm": 1.1360832452774048, + "learning_rate": 0.0002826508285732257, + "loss": 2.0879, + "step": 8224 + }, + { + "epoch": 0.9596313148990783, + "grad_norm": 1.0668489933013916, + "learning_rate": 0.0002826437358441182, + "loss": 2.1035, + "step": 8225 + }, + { + "epoch": 0.95974798739937, + "grad_norm": 1.2327197790145874, + "learning_rate": 0.0002826366417554575, + "loss": 2.2165, + "step": 8226 + }, + { + "epoch": 0.9598646598996616, + "grad_norm": 1.142493486404419, + "learning_rate": 0.0002826295463073172, + "loss": 2.2727, + "step": 8227 + }, + { + "epoch": 0.9599813323999533, + "grad_norm": 1.1141239404678345, + "learning_rate": 0.0002826224494997708, + "loss": 2.1615, + "step": 8228 + }, + { + "epoch": 0.960098004900245, + "grad_norm": 1.0457044839859009, + "learning_rate": 0.00028261535133289193, + "loss": 2.1148, + "step": 8229 + }, + { + "epoch": 0.9602146774005367, + "grad_norm": 1.1853407621383667, + "learning_rate": 0.00028260825180675416, + "loss": 2.136, + "step": 8230 + }, + { + "epoch": 0.9603313499008284, + "grad_norm": 1.132614016532898, + "learning_rate": 0.0002826011509214311, + "loss": 1.9433, + "step": 8231 + }, + { + "epoch": 0.96044802240112, + "grad_norm": 1.330064296722412, + "learning_rate": 0.00028259404867699635, + "loss": 2.1545, + "step": 8232 + }, + { + "epoch": 0.9605646949014117, + "grad_norm": 1.075301170349121, + "learning_rate": 0.0002825869450735235, + "loss": 2.1093, + "step": 8233 + }, + { + "epoch": 0.9606813674017034, + "grad_norm": 1.176295280456543, + "learning_rate": 0.0002825798401110863, + "loss": 2.2375, + "step": 8234 + }, + { + "epoch": 0.9607980399019951, + "grad_norm": 1.0720055103302002, + "learning_rate": 0.00028257273378975826, + "loss": 1.9553, + "step": 8235 + }, + { + "epoch": 0.9609147124022868, + "grad_norm": 1.1724997758865356, + "learning_rate": 0.0002825656261096132, + "loss": 2.1906, + "step": 8236 + }, + { + "epoch": 0.9610313849025784, + "grad_norm": 1.1456785202026367, + "learning_rate": 0.0002825585170707247, + "loss": 2.0945, + "step": 8237 + }, + { + "epoch": 0.9611480574028701, + "grad_norm": 1.3323817253112793, + "learning_rate": 0.00028255140667316647, + "loss": 2.2769, + "step": 8238 + }, + { + "epoch": 0.9612647299031618, + "grad_norm": 1.1791701316833496, + "learning_rate": 0.0002825442949170123, + "loss": 2.23, + "step": 8239 + }, + { + "epoch": 0.9613814024034535, + "grad_norm": 1.164123773574829, + "learning_rate": 0.00028253718180233574, + "loss": 2.1456, + "step": 8240 + }, + { + "epoch": 0.9614980749037452, + "grad_norm": 1.0328189134597778, + "learning_rate": 0.00028253006732921075, + "loss": 2.1419, + "step": 8241 + }, + { + "epoch": 0.9616147474040369, + "grad_norm": 1.03902006149292, + "learning_rate": 0.000282522951497711, + "loss": 1.957, + "step": 8242 + }, + { + "epoch": 0.9617314199043285, + "grad_norm": 1.12985360622406, + "learning_rate": 0.0002825158343079102, + "loss": 2.1927, + "step": 8243 + }, + { + "epoch": 0.9618480924046202, + "grad_norm": 1.1133475303649902, + "learning_rate": 0.0002825087157598822, + "loss": 2.0493, + "step": 8244 + }, + { + "epoch": 0.9619647649049119, + "grad_norm": 0.9886404871940613, + "learning_rate": 0.00028250159585370074, + "loss": 2.0445, + "step": 8245 + }, + { + "epoch": 0.9620814374052036, + "grad_norm": 1.2020630836486816, + "learning_rate": 0.0002824944745894397, + "loss": 2.0945, + "step": 8246 + }, + { + "epoch": 0.9621981099054953, + "grad_norm": 1.1121805906295776, + "learning_rate": 0.0002824873519671729, + "loss": 2.1217, + "step": 8247 + }, + { + "epoch": 0.9623147824057869, + "grad_norm": 1.2484077215194702, + "learning_rate": 0.0002824802279869741, + "loss": 2.1138, + "step": 8248 + }, + { + "epoch": 0.9624314549060786, + "grad_norm": 0.9396520256996155, + "learning_rate": 0.0002824731026489172, + "loss": 2.0046, + "step": 8249 + }, + { + "epoch": 0.9625481274063703, + "grad_norm": 1.170531153678894, + "learning_rate": 0.00028246597595307614, + "loss": 2.0479, + "step": 8250 + }, + { + "epoch": 0.962664799906662, + "grad_norm": 1.201873540878296, + "learning_rate": 0.0002824588478995247, + "loss": 2.1491, + "step": 8251 + }, + { + "epoch": 0.9627814724069537, + "grad_norm": 1.1090590953826904, + "learning_rate": 0.0002824517184883367, + "loss": 2.0872, + "step": 8252 + }, + { + "epoch": 0.9628981449072453, + "grad_norm": 1.2112096548080444, + "learning_rate": 0.00028244458771958634, + "loss": 2.1772, + "step": 8253 + }, + { + "epoch": 0.963014817407537, + "grad_norm": 1.1698068380355835, + "learning_rate": 0.0002824374555933473, + "loss": 2.1336, + "step": 8254 + }, + { + "epoch": 0.9631314899078287, + "grad_norm": 1.0988185405731201, + "learning_rate": 0.00028243032210969354, + "loss": 1.9909, + "step": 8255 + }, + { + "epoch": 0.9632481624081204, + "grad_norm": 1.053684949874878, + "learning_rate": 0.0002824231872686991, + "loss": 2.1157, + "step": 8256 + }, + { + "epoch": 0.9633648349084121, + "grad_norm": 1.2768152952194214, + "learning_rate": 0.0002824160510704379, + "loss": 2.2054, + "step": 8257 + }, + { + "epoch": 0.9634815074087038, + "grad_norm": 0.9983629584312439, + "learning_rate": 0.00028240891351498387, + "loss": 2.0478, + "step": 8258 + }, + { + "epoch": 0.9635981799089954, + "grad_norm": 1.3374063968658447, + "learning_rate": 0.0002824017746024111, + "loss": 2.1208, + "step": 8259 + }, + { + "epoch": 0.9637148524092871, + "grad_norm": 1.3085131645202637, + "learning_rate": 0.0002823946343327936, + "loss": 2.1765, + "step": 8260 + }, + { + "epoch": 0.9638315249095788, + "grad_norm": 1.1409640312194824, + "learning_rate": 0.00028238749270620525, + "loss": 2.0066, + "step": 8261 + }, + { + "epoch": 0.9639481974098705, + "grad_norm": 1.0925946235656738, + "learning_rate": 0.00028238034972272023, + "loss": 2.0207, + "step": 8262 + }, + { + "epoch": 0.9640648699101622, + "grad_norm": 1.2043496370315552, + "learning_rate": 0.00028237320538241256, + "loss": 2.0274, + "step": 8263 + }, + { + "epoch": 0.9641815424104538, + "grad_norm": 1.1978634595870972, + "learning_rate": 0.00028236605968535625, + "loss": 2.0778, + "step": 8264 + }, + { + "epoch": 0.9642982149107455, + "grad_norm": 1.2725282907485962, + "learning_rate": 0.0002823589126316254, + "loss": 2.0219, + "step": 8265 + }, + { + "epoch": 0.9644148874110372, + "grad_norm": 1.165954828262329, + "learning_rate": 0.0002823517642212942, + "loss": 2.2007, + "step": 8266 + }, + { + "epoch": 0.9645315599113289, + "grad_norm": 0.9821828007698059, + "learning_rate": 0.0002823446144544366, + "loss": 2.1289, + "step": 8267 + }, + { + "epoch": 0.9646482324116206, + "grad_norm": 1.3684319257736206, + "learning_rate": 0.00028233746333112685, + "loss": 2.1558, + "step": 8268 + }, + { + "epoch": 0.9647649049119122, + "grad_norm": 1.0596133470535278, + "learning_rate": 0.00028233031085143895, + "loss": 2.0674, + "step": 8269 + }, + { + "epoch": 0.9648815774122039, + "grad_norm": 1.2377238273620605, + "learning_rate": 0.0002823231570154472, + "loss": 2.0805, + "step": 8270 + }, + { + "epoch": 0.9649982499124956, + "grad_norm": 1.1593408584594727, + "learning_rate": 0.0002823160018232257, + "loss": 2.1799, + "step": 8271 + }, + { + "epoch": 0.9651149224127873, + "grad_norm": 1.184826374053955, + "learning_rate": 0.00028230884527484854, + "loss": 2.3889, + "step": 8272 + }, + { + "epoch": 0.965231594913079, + "grad_norm": 1.1369401216506958, + "learning_rate": 0.0002823016873703901, + "loss": 2.0836, + "step": 8273 + }, + { + "epoch": 0.9653482674133707, + "grad_norm": 1.460461974143982, + "learning_rate": 0.00028229452810992443, + "loss": 2.0699, + "step": 8274 + }, + { + "epoch": 0.9654649399136623, + "grad_norm": 1.3189477920532227, + "learning_rate": 0.00028228736749352583, + "loss": 2.1993, + "step": 8275 + }, + { + "epoch": 0.965581612413954, + "grad_norm": 1.26058030128479, + "learning_rate": 0.0002822802055212684, + "loss": 2.2803, + "step": 8276 + }, + { + "epoch": 0.9656982849142457, + "grad_norm": 1.2972522974014282, + "learning_rate": 0.0002822730421932266, + "loss": 2.2606, + "step": 8277 + }, + { + "epoch": 0.9658149574145374, + "grad_norm": 1.4111006259918213, + "learning_rate": 0.0002822658775094745, + "loss": 2.0898, + "step": 8278 + }, + { + "epoch": 0.9659316299148291, + "grad_norm": 1.0454007387161255, + "learning_rate": 0.0002822587114700865, + "loss": 2.0306, + "step": 8279 + }, + { + "epoch": 0.9660483024151207, + "grad_norm": 1.121336817741394, + "learning_rate": 0.0002822515440751368, + "loss": 2.1228, + "step": 8280 + }, + { + "epoch": 0.9661649749154124, + "grad_norm": 1.1571322679519653, + "learning_rate": 0.0002822443753246998, + "loss": 2.1472, + "step": 8281 + }, + { + "epoch": 0.9662816474157041, + "grad_norm": 1.2087817192077637, + "learning_rate": 0.00028223720521884974, + "loss": 2.1862, + "step": 8282 + }, + { + "epoch": 0.9663983199159958, + "grad_norm": 1.2527817487716675, + "learning_rate": 0.00028223003375766096, + "loss": 2.1467, + "step": 8283 + }, + { + "epoch": 0.9665149924162875, + "grad_norm": 1.275138258934021, + "learning_rate": 0.00028222286094120786, + "loss": 2.2933, + "step": 8284 + }, + { + "epoch": 0.9666316649165791, + "grad_norm": 1.2230603694915771, + "learning_rate": 0.0002822156867695647, + "loss": 2.1931, + "step": 8285 + }, + { + "epoch": 0.9667483374168708, + "grad_norm": 1.1503797769546509, + "learning_rate": 0.00028220851124280595, + "loss": 2.1642, + "step": 8286 + }, + { + "epoch": 0.9668650099171625, + "grad_norm": 1.178140640258789, + "learning_rate": 0.000282201334361006, + "loss": 1.9456, + "step": 8287 + }, + { + "epoch": 0.9669816824174542, + "grad_norm": 1.2639771699905396, + "learning_rate": 0.0002821941561242392, + "loss": 1.9634, + "step": 8288 + }, + { + "epoch": 0.9670983549177459, + "grad_norm": 1.141958475112915, + "learning_rate": 0.0002821869765325799, + "loss": 1.9702, + "step": 8289 + }, + { + "epoch": 0.9672150274180376, + "grad_norm": 1.354993224143982, + "learning_rate": 0.0002821797955861027, + "loss": 2.1109, + "step": 8290 + }, + { + "epoch": 0.9673316999183292, + "grad_norm": 1.268147349357605, + "learning_rate": 0.000282172613284882, + "loss": 2.2665, + "step": 8291 + }, + { + "epoch": 0.9674483724186209, + "grad_norm": 1.1758309602737427, + "learning_rate": 0.0002821654296289921, + "loss": 2.264, + "step": 8292 + }, + { + "epoch": 0.9675650449189126, + "grad_norm": 1.0995246171951294, + "learning_rate": 0.0002821582446185077, + "loss": 2.1188, + "step": 8293 + }, + { + "epoch": 0.9676817174192043, + "grad_norm": 1.0036988258361816, + "learning_rate": 0.00028215105825350306, + "loss": 1.9158, + "step": 8294 + }, + { + "epoch": 0.967798389919496, + "grad_norm": 1.211678147315979, + "learning_rate": 0.0002821438705340529, + "loss": 2.2971, + "step": 8295 + }, + { + "epoch": 0.9679150624197876, + "grad_norm": 1.271138310432434, + "learning_rate": 0.0002821366814602315, + "loss": 2.1032, + "step": 8296 + }, + { + "epoch": 0.9680317349200793, + "grad_norm": 1.1665290594100952, + "learning_rate": 0.0002821294910321136, + "loss": 2.4146, + "step": 8297 + }, + { + "epoch": 0.968148407420371, + "grad_norm": 1.1067897081375122, + "learning_rate": 0.00028212229924977366, + "loss": 2.1969, + "step": 8298 + }, + { + "epoch": 0.9682650799206627, + "grad_norm": 1.2453585863113403, + "learning_rate": 0.0002821151061132862, + "loss": 2.2727, + "step": 8299 + }, + { + "epoch": 0.9683817524209544, + "grad_norm": 1.3281707763671875, + "learning_rate": 0.00028210791162272585, + "loss": 2.1322, + "step": 8300 + }, + { + "epoch": 0.968498424921246, + "grad_norm": 1.2616461515426636, + "learning_rate": 0.0002821007157781672, + "loss": 2.1911, + "step": 8301 + }, + { + "epoch": 0.9686150974215377, + "grad_norm": 1.0912264585494995, + "learning_rate": 0.00028209351857968475, + "loss": 2.3, + "step": 8302 + }, + { + "epoch": 0.9687317699218294, + "grad_norm": 1.1035500764846802, + "learning_rate": 0.0002820863200273532, + "loss": 2.1752, + "step": 8303 + }, + { + "epoch": 0.9688484424221211, + "grad_norm": 1.200317144393921, + "learning_rate": 0.0002820791201212472, + "loss": 2.1553, + "step": 8304 + }, + { + "epoch": 0.9689651149224128, + "grad_norm": 1.0259754657745361, + "learning_rate": 0.0002820719188614413, + "loss": 1.9927, + "step": 8305 + }, + { + "epoch": 0.9690817874227045, + "grad_norm": 1.0987592935562134, + "learning_rate": 0.00028206471624801023, + "loss": 1.9633, + "step": 8306 + }, + { + "epoch": 0.9691984599229961, + "grad_norm": 1.0695778131484985, + "learning_rate": 0.00028205751228102863, + "loss": 2.0675, + "step": 8307 + }, + { + "epoch": 0.9693151324232878, + "grad_norm": 1.0745277404785156, + "learning_rate": 0.00028205030696057116, + "loss": 1.9808, + "step": 8308 + }, + { + "epoch": 0.9694318049235795, + "grad_norm": 0.976620078086853, + "learning_rate": 0.00028204310028671263, + "loss": 1.9237, + "step": 8309 + }, + { + "epoch": 0.9695484774238712, + "grad_norm": 1.1070432662963867, + "learning_rate": 0.0002820358922595275, + "loss": 2.1276, + "step": 8310 + }, + { + "epoch": 0.9696651499241629, + "grad_norm": 1.0531561374664307, + "learning_rate": 0.0002820286828790908, + "loss": 2.1124, + "step": 8311 + }, + { + "epoch": 0.9697818224244545, + "grad_norm": 1.2965134382247925, + "learning_rate": 0.0002820214721454771, + "loss": 2.1331, + "step": 8312 + }, + { + "epoch": 0.9698984949247462, + "grad_norm": 1.0575939416885376, + "learning_rate": 0.0002820142600587611, + "loss": 2.2837, + "step": 8313 + }, + { + "epoch": 0.9700151674250379, + "grad_norm": 1.0310932397842407, + "learning_rate": 0.0002820070466190177, + "loss": 2.0948, + "step": 8314 + }, + { + "epoch": 0.9701318399253296, + "grad_norm": 1.2115956544876099, + "learning_rate": 0.0002819998318263217, + "loss": 2.2752, + "step": 8315 + }, + { + "epoch": 0.9702485124256213, + "grad_norm": 1.149065375328064, + "learning_rate": 0.0002819926156807477, + "loss": 1.9522, + "step": 8316 + }, + { + "epoch": 0.970365184925913, + "grad_norm": 0.9922330975532532, + "learning_rate": 0.00028198539818237065, + "loss": 1.9251, + "step": 8317 + }, + { + "epoch": 0.9704818574262046, + "grad_norm": 1.2706127166748047, + "learning_rate": 0.00028197817933126544, + "loss": 2.0774, + "step": 8318 + }, + { + "epoch": 0.9705985299264963, + "grad_norm": 1.0291748046875, + "learning_rate": 0.00028197095912750673, + "loss": 2.0755, + "step": 8319 + }, + { + "epoch": 0.970715202426788, + "grad_norm": 1.1840827465057373, + "learning_rate": 0.00028196373757116954, + "loss": 2.0346, + "step": 8320 + }, + { + "epoch": 0.9708318749270797, + "grad_norm": 1.0966664552688599, + "learning_rate": 0.0002819565146623286, + "loss": 1.9731, + "step": 8321 + }, + { + "epoch": 0.9709485474273714, + "grad_norm": 1.1440420150756836, + "learning_rate": 0.0002819492904010588, + "loss": 1.9961, + "step": 8322 + }, + { + "epoch": 0.971065219927663, + "grad_norm": 1.2635407447814941, + "learning_rate": 0.00028194206478743516, + "loss": 2.0992, + "step": 8323 + }, + { + "epoch": 0.9711818924279547, + "grad_norm": 1.2578474283218384, + "learning_rate": 0.0002819348378215325, + "loss": 2.2337, + "step": 8324 + }, + { + "epoch": 0.9712985649282464, + "grad_norm": 0.9652823209762573, + "learning_rate": 0.0002819276095034257, + "loss": 2.1409, + "step": 8325 + }, + { + "epoch": 0.9714152374285381, + "grad_norm": 1.0495800971984863, + "learning_rate": 0.0002819203798331898, + "loss": 2.067, + "step": 8326 + }, + { + "epoch": 0.9715319099288298, + "grad_norm": 1.1244856119155884, + "learning_rate": 0.00028191314881089965, + "loss": 2.1464, + "step": 8327 + }, + { + "epoch": 0.9716485824291214, + "grad_norm": 1.2254797220230103, + "learning_rate": 0.0002819059164366303, + "loss": 2.1148, + "step": 8328 + }, + { + "epoch": 0.9717652549294131, + "grad_norm": 1.1152286529541016, + "learning_rate": 0.00028189868271045657, + "loss": 2.1618, + "step": 8329 + }, + { + "epoch": 0.9718819274297048, + "grad_norm": 1.2903615236282349, + "learning_rate": 0.0002818914476324537, + "loss": 2.1631, + "step": 8330 + }, + { + "epoch": 0.9719985999299965, + "grad_norm": 1.039218544960022, + "learning_rate": 0.0002818842112026965, + "loss": 2.1025, + "step": 8331 + }, + { + "epoch": 0.9721152724302882, + "grad_norm": 1.15164053440094, + "learning_rate": 0.00028187697342126007, + "loss": 2.1297, + "step": 8332 + }, + { + "epoch": 0.9722319449305798, + "grad_norm": 1.2151501178741455, + "learning_rate": 0.0002818697342882194, + "loss": 2.1798, + "step": 8333 + }, + { + "epoch": 0.9723486174308715, + "grad_norm": 1.1745020151138306, + "learning_rate": 0.0002818624938036495, + "loss": 2.2551, + "step": 8334 + }, + { + "epoch": 0.9724652899311632, + "grad_norm": 1.2850069999694824, + "learning_rate": 0.00028185525196762555, + "loss": 2.0516, + "step": 8335 + }, + { + "epoch": 0.9725819624314549, + "grad_norm": 1.2641571760177612, + "learning_rate": 0.0002818480087802226, + "loss": 2.173, + "step": 8336 + }, + { + "epoch": 0.9726986349317466, + "grad_norm": 1.1389837265014648, + "learning_rate": 0.00028184076424151566, + "loss": 2.2773, + "step": 8337 + }, + { + "epoch": 0.9728153074320383, + "grad_norm": 1.17267644405365, + "learning_rate": 0.00028183351835157986, + "loss": 2.1592, + "step": 8338 + }, + { + "epoch": 0.9729319799323299, + "grad_norm": 1.0841718912124634, + "learning_rate": 0.0002818262711104904, + "loss": 2.1612, + "step": 8339 + }, + { + "epoch": 0.9730486524326216, + "grad_norm": 1.2061266899108887, + "learning_rate": 0.0002818190225183223, + "loss": 2.1824, + "step": 8340 + }, + { + "epoch": 0.9731653249329133, + "grad_norm": 1.117701768875122, + "learning_rate": 0.00028181177257515075, + "loss": 2.0809, + "step": 8341 + }, + { + "epoch": 0.973281997433205, + "grad_norm": 1.1439893245697021, + "learning_rate": 0.00028180452128105096, + "loss": 2.0353, + "step": 8342 + }, + { + "epoch": 0.9733986699334967, + "grad_norm": 1.0863456726074219, + "learning_rate": 0.000281797268636098, + "loss": 1.9981, + "step": 8343 + }, + { + "epoch": 0.9735153424337883, + "grad_norm": 1.2070512771606445, + "learning_rate": 0.0002817900146403671, + "loss": 2.2007, + "step": 8344 + }, + { + "epoch": 0.97363201493408, + "grad_norm": 1.1665860414505005, + "learning_rate": 0.00028178275929393356, + "loss": 2.177, + "step": 8345 + }, + { + "epoch": 0.9737486874343717, + "grad_norm": 1.2070797681808472, + "learning_rate": 0.0002817755025968725, + "loss": 2.1367, + "step": 8346 + }, + { + "epoch": 0.9738653599346634, + "grad_norm": 1.2703830003738403, + "learning_rate": 0.0002817682445492591, + "loss": 2.2986, + "step": 8347 + }, + { + "epoch": 0.9739820324349551, + "grad_norm": 1.0449298620224, + "learning_rate": 0.00028176098515116864, + "loss": 1.9787, + "step": 8348 + }, + { + "epoch": 0.9740987049352468, + "grad_norm": 1.238974928855896, + "learning_rate": 0.00028175372440267645, + "loss": 2.3178, + "step": 8349 + }, + { + "epoch": 0.9742153774355384, + "grad_norm": 1.132002830505371, + "learning_rate": 0.00028174646230385767, + "loss": 2.2858, + "step": 8350 + }, + { + "epoch": 0.9743320499358301, + "grad_norm": 1.150915265083313, + "learning_rate": 0.00028173919885478776, + "loss": 1.8867, + "step": 8351 + }, + { + "epoch": 0.9744487224361218, + "grad_norm": 1.5092377662658691, + "learning_rate": 0.00028173193405554197, + "loss": 2.1723, + "step": 8352 + }, + { + "epoch": 0.9745653949364135, + "grad_norm": 1.230851411819458, + "learning_rate": 0.0002817246679061955, + "loss": 2.2273, + "step": 8353 + }, + { + "epoch": 0.9746820674367052, + "grad_norm": 1.1140971183776855, + "learning_rate": 0.00028171740040682365, + "loss": 2.2613, + "step": 8354 + }, + { + "epoch": 0.9747987399369968, + "grad_norm": 1.090557336807251, + "learning_rate": 0.000281710131557502, + "loss": 2.1515, + "step": 8355 + }, + { + "epoch": 0.9749154124372885, + "grad_norm": 1.2127017974853516, + "learning_rate": 0.00028170286135830575, + "loss": 2.1086, + "step": 8356 + }, + { + "epoch": 0.9750320849375802, + "grad_norm": 1.0603454113006592, + "learning_rate": 0.00028169558980931027, + "loss": 2.1853, + "step": 8357 + }, + { + "epoch": 0.9751487574378719, + "grad_norm": 1.033698558807373, + "learning_rate": 0.00028168831691059086, + "loss": 2.2924, + "step": 8358 + }, + { + "epoch": 0.9752654299381636, + "grad_norm": 1.0321924686431885, + "learning_rate": 0.00028168104266222314, + "loss": 2.0933, + "step": 8359 + }, + { + "epoch": 0.9753821024384552, + "grad_norm": 1.0763195753097534, + "learning_rate": 0.0002816737670642823, + "loss": 2.2794, + "step": 8360 + }, + { + "epoch": 0.9754987749387469, + "grad_norm": 1.1567615270614624, + "learning_rate": 0.0002816664901168439, + "loss": 2.1587, + "step": 8361 + }, + { + "epoch": 0.9756154474390386, + "grad_norm": 1.107566475868225, + "learning_rate": 0.0002816592118199834, + "loss": 1.9679, + "step": 8362 + }, + { + "epoch": 0.9757321199393303, + "grad_norm": 1.248289704322815, + "learning_rate": 0.0002816519321737761, + "loss": 2.0737, + "step": 8363 + }, + { + "epoch": 0.975848792439622, + "grad_norm": 0.9874007701873779, + "learning_rate": 0.0002816446511782976, + "loss": 2.0024, + "step": 8364 + }, + { + "epoch": 0.9759654649399137, + "grad_norm": 1.1389360427856445, + "learning_rate": 0.00028163736883362337, + "loss": 2.1868, + "step": 8365 + }, + { + "epoch": 0.9760821374402053, + "grad_norm": 1.0788893699645996, + "learning_rate": 0.00028163008513982883, + "loss": 2.1657, + "step": 8366 + }, + { + "epoch": 0.976198809940497, + "grad_norm": 1.2753651142120361, + "learning_rate": 0.00028162280009698957, + "loss": 2.1402, + "step": 8367 + }, + { + "epoch": 0.9763154824407887, + "grad_norm": 1.4105451107025146, + "learning_rate": 0.000281615513705181, + "loss": 2.1931, + "step": 8368 + }, + { + "epoch": 0.9764321549410804, + "grad_norm": 1.222701907157898, + "learning_rate": 0.0002816082259644788, + "loss": 2.2113, + "step": 8369 + }, + { + "epoch": 0.9765488274413721, + "grad_norm": 1.2087877988815308, + "learning_rate": 0.0002816009368749585, + "loss": 1.9737, + "step": 8370 + }, + { + "epoch": 0.9766654999416637, + "grad_norm": 1.161365270614624, + "learning_rate": 0.0002815936464366956, + "loss": 2.3201, + "step": 8371 + }, + { + "epoch": 0.9767821724419554, + "grad_norm": 1.0788600444793701, + "learning_rate": 0.0002815863546497656, + "loss": 2.0805, + "step": 8372 + }, + { + "epoch": 0.9768988449422471, + "grad_norm": 1.092759609222412, + "learning_rate": 0.00028157906151424434, + "loss": 2.0202, + "step": 8373 + }, + { + "epoch": 0.9770155174425388, + "grad_norm": 1.3063884973526, + "learning_rate": 0.0002815717670302072, + "loss": 2.1529, + "step": 8374 + }, + { + "epoch": 0.9771321899428305, + "grad_norm": 1.0026953220367432, + "learning_rate": 0.0002815644711977299, + "loss": 2.173, + "step": 8375 + }, + { + "epoch": 0.9772488624431221, + "grad_norm": 1.0771260261535645, + "learning_rate": 0.00028155717401688807, + "loss": 1.9798, + "step": 8376 + }, + { + "epoch": 0.9773655349434138, + "grad_norm": 1.1904833316802979, + "learning_rate": 0.0002815498754877573, + "loss": 2.0721, + "step": 8377 + }, + { + "epoch": 0.9774822074437055, + "grad_norm": 1.1632963418960571, + "learning_rate": 0.00028154257561041336, + "loss": 2.1552, + "step": 8378 + }, + { + "epoch": 0.9775988799439972, + "grad_norm": 1.137768030166626, + "learning_rate": 0.00028153527438493184, + "loss": 2.0586, + "step": 8379 + }, + { + "epoch": 0.9777155524442889, + "grad_norm": 1.0426682233810425, + "learning_rate": 0.00028152797181138847, + "loss": 2.1469, + "step": 8380 + }, + { + "epoch": 0.9778322249445806, + "grad_norm": 1.4699870347976685, + "learning_rate": 0.0002815206678898589, + "loss": 2.3198, + "step": 8381 + }, + { + "epoch": 0.9779488974448722, + "grad_norm": 1.1372932195663452, + "learning_rate": 0.00028151336262041894, + "loss": 1.8766, + "step": 8382 + }, + { + "epoch": 0.9780655699451639, + "grad_norm": 1.049477219581604, + "learning_rate": 0.0002815060560031443, + "loss": 2.1425, + "step": 8383 + }, + { + "epoch": 0.9781822424454556, + "grad_norm": 0.9914905428886414, + "learning_rate": 0.00028149874803811066, + "loss": 1.8114, + "step": 8384 + }, + { + "epoch": 0.9782989149457473, + "grad_norm": 1.1016443967819214, + "learning_rate": 0.0002814914387253939, + "loss": 2.3267, + "step": 8385 + }, + { + "epoch": 0.978415587446039, + "grad_norm": 0.9171959757804871, + "learning_rate": 0.0002814841280650696, + "loss": 1.8886, + "step": 8386 + }, + { + "epoch": 0.9785322599463306, + "grad_norm": 1.2178714275360107, + "learning_rate": 0.00028147681605721373, + "loss": 2.1287, + "step": 8387 + }, + { + "epoch": 0.9786489324466223, + "grad_norm": 1.072847604751587, + "learning_rate": 0.000281469502701902, + "loss": 2.2401, + "step": 8388 + }, + { + "epoch": 0.978765604946914, + "grad_norm": 1.098970651626587, + "learning_rate": 0.0002814621879992103, + "loss": 2.0289, + "step": 8389 + }, + { + "epoch": 0.9788822774472057, + "grad_norm": 1.1666202545166016, + "learning_rate": 0.0002814548719492144, + "loss": 2.0802, + "step": 8390 + }, + { + "epoch": 0.9789989499474974, + "grad_norm": 1.205284833908081, + "learning_rate": 0.00028144755455199015, + "loss": 2.0851, + "step": 8391 + }, + { + "epoch": 0.979115622447789, + "grad_norm": 0.9374419450759888, + "learning_rate": 0.00028144023580761344, + "loss": 1.9979, + "step": 8392 + }, + { + "epoch": 0.9792322949480807, + "grad_norm": 1.087666630744934, + "learning_rate": 0.0002814329157161601, + "loss": 2.1817, + "step": 8393 + }, + { + "epoch": 0.9793489674483724, + "grad_norm": 1.1261099576950073, + "learning_rate": 0.00028142559427770606, + "loss": 1.9472, + "step": 8394 + }, + { + "epoch": 0.9794656399486641, + "grad_norm": 1.6028270721435547, + "learning_rate": 0.0002814182714923272, + "loss": 2.3375, + "step": 8395 + }, + { + "epoch": 0.9795823124489558, + "grad_norm": 0.9850560426712036, + "learning_rate": 0.00028141094736009944, + "loss": 2.0898, + "step": 8396 + }, + { + "epoch": 0.9796989849492475, + "grad_norm": 1.0967754125595093, + "learning_rate": 0.0002814036218810987, + "loss": 2.2491, + "step": 8397 + }, + { + "epoch": 0.9798156574495391, + "grad_norm": 1.1734756231307983, + "learning_rate": 0.0002813962950554009, + "loss": 2.0952, + "step": 8398 + }, + { + "epoch": 0.9799323299498308, + "grad_norm": 1.0254671573638916, + "learning_rate": 0.00028138896688308205, + "loss": 1.9794, + "step": 8399 + }, + { + "epoch": 0.9800490024501225, + "grad_norm": 1.0723973512649536, + "learning_rate": 0.0002813816373642181, + "loss": 2.0726, + "step": 8400 + }, + { + "epoch": 0.9801656749504142, + "grad_norm": 1.3670190572738647, + "learning_rate": 0.000281374306498885, + "loss": 2.1504, + "step": 8401 + }, + { + "epoch": 0.9802823474507059, + "grad_norm": 1.0360450744628906, + "learning_rate": 0.00028136697428715883, + "loss": 2.0802, + "step": 8402 + }, + { + "epoch": 0.9803990199509975, + "grad_norm": 1.1239367723464966, + "learning_rate": 0.00028135964072911545, + "loss": 2.1702, + "step": 8403 + }, + { + "epoch": 0.9805156924512892, + "grad_norm": 1.0156246423721313, + "learning_rate": 0.0002813523058248311, + "loss": 2.0457, + "step": 8404 + }, + { + "epoch": 0.9806323649515809, + "grad_norm": 1.277430772781372, + "learning_rate": 0.0002813449695743816, + "loss": 2.181, + "step": 8405 + }, + { + "epoch": 0.9807490374518726, + "grad_norm": 1.2062636613845825, + "learning_rate": 0.00028133763197784315, + "loss": 2.1366, + "step": 8406 + }, + { + "epoch": 0.9808657099521643, + "grad_norm": 1.2069294452667236, + "learning_rate": 0.0002813302930352918, + "loss": 2.1033, + "step": 8407 + }, + { + "epoch": 0.980982382452456, + "grad_norm": 1.089626669883728, + "learning_rate": 0.0002813229527468036, + "loss": 2.0662, + "step": 8408 + }, + { + "epoch": 0.9810990549527476, + "grad_norm": 1.1025818586349487, + "learning_rate": 0.00028131561111245467, + "loss": 2.2366, + "step": 8409 + }, + { + "epoch": 0.9812157274530393, + "grad_norm": 1.0881199836730957, + "learning_rate": 0.00028130826813232106, + "loss": 2.0771, + "step": 8410 + }, + { + "epoch": 0.981332399953331, + "grad_norm": 1.0457500219345093, + "learning_rate": 0.00028130092380647897, + "loss": 2.0946, + "step": 8411 + }, + { + "epoch": 0.9814490724536227, + "grad_norm": 1.173556923866272, + "learning_rate": 0.0002812935781350045, + "loss": 2.154, + "step": 8412 + }, + { + "epoch": 0.9815657449539144, + "grad_norm": 1.1861302852630615, + "learning_rate": 0.00028128623111797383, + "loss": 2.3328, + "step": 8413 + }, + { + "epoch": 0.981682417454206, + "grad_norm": 1.2745448350906372, + "learning_rate": 0.00028127888275546306, + "loss": 2.2092, + "step": 8414 + }, + { + "epoch": 0.9817990899544977, + "grad_norm": 1.1801388263702393, + "learning_rate": 0.00028127153304754847, + "loss": 2.205, + "step": 8415 + }, + { + "epoch": 0.9819157624547894, + "grad_norm": 1.050021767616272, + "learning_rate": 0.0002812641819943061, + "loss": 2.237, + "step": 8416 + }, + { + "epoch": 0.9820324349550811, + "grad_norm": 1.1011805534362793, + "learning_rate": 0.00028125682959581237, + "loss": 2.2111, + "step": 8417 + }, + { + "epoch": 0.9821491074553728, + "grad_norm": 1.1345916986465454, + "learning_rate": 0.0002812494758521433, + "loss": 2.1432, + "step": 8418 + }, + { + "epoch": 0.9822657799556644, + "grad_norm": 1.1613448858261108, + "learning_rate": 0.0002812421207633752, + "loss": 2.2975, + "step": 8419 + }, + { + "epoch": 0.9823824524559561, + "grad_norm": 0.9716729521751404, + "learning_rate": 0.00028123476432958443, + "loss": 1.9594, + "step": 8420 + }, + { + "epoch": 0.9824991249562478, + "grad_norm": 1.0581358671188354, + "learning_rate": 0.00028122740655084713, + "loss": 2.073, + "step": 8421 + }, + { + "epoch": 0.9826157974565395, + "grad_norm": 1.1406636238098145, + "learning_rate": 0.00028122004742723954, + "loss": 2.0377, + "step": 8422 + }, + { + "epoch": 0.9827324699568312, + "grad_norm": 1.2253586053848267, + "learning_rate": 0.0002812126869588381, + "loss": 2.1314, + "step": 8423 + }, + { + "epoch": 0.9828491424571228, + "grad_norm": 1.2372732162475586, + "learning_rate": 0.0002812053251457189, + "loss": 2.0679, + "step": 8424 + }, + { + "epoch": 0.9829658149574145, + "grad_norm": 1.045002818107605, + "learning_rate": 0.00028119796198795854, + "loss": 2.266, + "step": 8425 + }, + { + "epoch": 0.9830824874577062, + "grad_norm": 1.1794880628585815, + "learning_rate": 0.0002811905974856331, + "loss": 2.2445, + "step": 8426 + }, + { + "epoch": 0.9831991599579979, + "grad_norm": 1.3008677959442139, + "learning_rate": 0.00028118323163881903, + "loss": 2.2492, + "step": 8427 + }, + { + "epoch": 0.9833158324582896, + "grad_norm": 1.3016449213027954, + "learning_rate": 0.0002811758644475927, + "loss": 2.0684, + "step": 8428 + }, + { + "epoch": 0.9834325049585813, + "grad_norm": 1.1016956567764282, + "learning_rate": 0.0002811684959120305, + "loss": 2.058, + "step": 8429 + }, + { + "epoch": 0.9835491774588729, + "grad_norm": 1.0128799676895142, + "learning_rate": 0.00028116112603220874, + "loss": 2.1849, + "step": 8430 + }, + { + "epoch": 0.9836658499591646, + "grad_norm": 0.9393819570541382, + "learning_rate": 0.0002811537548082039, + "loss": 1.7801, + "step": 8431 + }, + { + "epoch": 0.9837825224594563, + "grad_norm": 0.9905153512954712, + "learning_rate": 0.00028114638224009246, + "loss": 2.1395, + "step": 8432 + }, + { + "epoch": 0.983899194959748, + "grad_norm": 1.2274287939071655, + "learning_rate": 0.00028113900832795065, + "loss": 2.1264, + "step": 8433 + }, + { + "epoch": 0.9840158674600397, + "grad_norm": 1.1888220310211182, + "learning_rate": 0.0002811316330718551, + "loss": 2.1955, + "step": 8434 + }, + { + "epoch": 0.9841325399603313, + "grad_norm": 1.3645853996276855, + "learning_rate": 0.00028112425647188214, + "loss": 2.0856, + "step": 8435 + }, + { + "epoch": 0.984249212460623, + "grad_norm": 1.1636558771133423, + "learning_rate": 0.0002811168785281083, + "loss": 2.0528, + "step": 8436 + }, + { + "epoch": 0.9843658849609147, + "grad_norm": 1.2923616170883179, + "learning_rate": 0.0002811094992406101, + "loss": 2.2602, + "step": 8437 + }, + { + "epoch": 0.9844825574612064, + "grad_norm": 1.1869527101516724, + "learning_rate": 0.00028110211860946397, + "loss": 2.2393, + "step": 8438 + }, + { + "epoch": 0.9845992299614981, + "grad_norm": 1.0617257356643677, + "learning_rate": 0.0002810947366347465, + "loss": 2.0951, + "step": 8439 + }, + { + "epoch": 0.9847159024617897, + "grad_norm": 1.2048165798187256, + "learning_rate": 0.00028108735331653414, + "loss": 2.0123, + "step": 8440 + }, + { + "epoch": 0.9848325749620814, + "grad_norm": 1.0636767148971558, + "learning_rate": 0.0002810799686549035, + "loss": 2.0559, + "step": 8441 + }, + { + "epoch": 0.9849492474623731, + "grad_norm": 1.104831337928772, + "learning_rate": 0.00028107258264993106, + "loss": 2.0541, + "step": 8442 + }, + { + "epoch": 0.9850659199626648, + "grad_norm": 1.175878882408142, + "learning_rate": 0.0002810651953016935, + "loss": 2.2487, + "step": 8443 + }, + { + "epoch": 0.9851825924629565, + "grad_norm": 1.2473407983779907, + "learning_rate": 0.00028105780661026727, + "loss": 2.1377, + "step": 8444 + }, + { + "epoch": 0.9852992649632482, + "grad_norm": 1.146395206451416, + "learning_rate": 0.00028105041657572907, + "loss": 2.1538, + "step": 8445 + }, + { + "epoch": 0.9854159374635398, + "grad_norm": 1.3382686376571655, + "learning_rate": 0.0002810430251981555, + "loss": 2.2445, + "step": 8446 + }, + { + "epoch": 0.9855326099638315, + "grad_norm": 1.0485341548919678, + "learning_rate": 0.0002810356324776231, + "loss": 1.9699, + "step": 8447 + }, + { + "epoch": 0.9856492824641232, + "grad_norm": 1.2542130947113037, + "learning_rate": 0.00028102823841420863, + "loss": 2.0352, + "step": 8448 + }, + { + "epoch": 0.9857659549644149, + "grad_norm": 1.171365737915039, + "learning_rate": 0.0002810208430079887, + "loss": 2.1234, + "step": 8449 + }, + { + "epoch": 0.9858826274647066, + "grad_norm": 1.0738182067871094, + "learning_rate": 0.00028101344625903996, + "loss": 2.0285, + "step": 8450 + }, + { + "epoch": 0.9859992999649982, + "grad_norm": 1.1855552196502686, + "learning_rate": 0.000281006048167439, + "loss": 2.1634, + "step": 8451 + }, + { + "epoch": 0.9861159724652899, + "grad_norm": 1.192663311958313, + "learning_rate": 0.0002809986487332627, + "loss": 2.1521, + "step": 8452 + }, + { + "epoch": 0.9862326449655816, + "grad_norm": 1.3014391660690308, + "learning_rate": 0.0002809912479565877, + "loss": 2.2624, + "step": 8453 + }, + { + "epoch": 0.9863493174658733, + "grad_norm": 0.9768815040588379, + "learning_rate": 0.0002809838458374906, + "loss": 2.0382, + "step": 8454 + }, + { + "epoch": 0.986465989966165, + "grad_norm": 0.9702838063240051, + "learning_rate": 0.0002809764423760484, + "loss": 2.2466, + "step": 8455 + }, + { + "epoch": 0.9865826624664567, + "grad_norm": 1.1936497688293457, + "learning_rate": 0.0002809690375723375, + "loss": 2.142, + "step": 8456 + }, + { + "epoch": 0.9866993349667483, + "grad_norm": 1.2621668577194214, + "learning_rate": 0.000280961631426435, + "loss": 2.1655, + "step": 8457 + }, + { + "epoch": 0.98681600746704, + "grad_norm": 1.1535800695419312, + "learning_rate": 0.0002809542239384175, + "loss": 2.1403, + "step": 8458 + }, + { + "epoch": 0.9869326799673317, + "grad_norm": 1.1034220457077026, + "learning_rate": 0.0002809468151083618, + "loss": 2.1665, + "step": 8459 + }, + { + "epoch": 0.9870493524676234, + "grad_norm": 1.14629328250885, + "learning_rate": 0.0002809394049363447, + "loss": 2.0901, + "step": 8460 + }, + { + "epoch": 0.9871660249679151, + "grad_norm": 1.0558735132217407, + "learning_rate": 0.0002809319934224431, + "loss": 2.0175, + "step": 8461 + }, + { + "epoch": 0.9872826974682067, + "grad_norm": 1.1375048160552979, + "learning_rate": 0.00028092458056673384, + "loss": 2.0634, + "step": 8462 + }, + { + "epoch": 0.9873993699684984, + "grad_norm": 1.10844087600708, + "learning_rate": 0.0002809171663692936, + "loss": 2.1439, + "step": 8463 + }, + { + "epoch": 0.9875160424687901, + "grad_norm": 1.0668278932571411, + "learning_rate": 0.0002809097508301995, + "loss": 1.9255, + "step": 8464 + }, + { + "epoch": 0.9876327149690818, + "grad_norm": 1.1385581493377686, + "learning_rate": 0.0002809023339495282, + "loss": 2.2141, + "step": 8465 + }, + { + "epoch": 0.9877493874693735, + "grad_norm": 1.1005414724349976, + "learning_rate": 0.00028089491572735664, + "loss": 2.1551, + "step": 8466 + }, + { + "epoch": 0.9878660599696651, + "grad_norm": 1.1896798610687256, + "learning_rate": 0.0002808874961637618, + "loss": 2.0605, + "step": 8467 + }, + { + "epoch": 0.9879827324699568, + "grad_norm": 1.3479856252670288, + "learning_rate": 0.0002808800752588205, + "loss": 2.1034, + "step": 8468 + }, + { + "epoch": 0.9880994049702485, + "grad_norm": 1.2480638027191162, + "learning_rate": 0.00028087265301260974, + "loss": 2.2087, + "step": 8469 + }, + { + "epoch": 0.9882160774705402, + "grad_norm": 1.3170245885849, + "learning_rate": 0.00028086522942520647, + "loss": 2.2108, + "step": 8470 + }, + { + "epoch": 0.9883327499708319, + "grad_norm": 1.1817286014556885, + "learning_rate": 0.0002808578044966876, + "loss": 2.0207, + "step": 8471 + }, + { + "epoch": 0.9884494224711236, + "grad_norm": 1.1835331916809082, + "learning_rate": 0.0002808503782271301, + "loss": 2.0773, + "step": 8472 + }, + { + "epoch": 0.9885660949714152, + "grad_norm": 1.1216565370559692, + "learning_rate": 0.00028084295061661104, + "loss": 2.1821, + "step": 8473 + }, + { + "epoch": 0.9886827674717069, + "grad_norm": 1.0561095476150513, + "learning_rate": 0.00028083552166520735, + "loss": 1.9873, + "step": 8474 + }, + { + "epoch": 0.9887994399719986, + "grad_norm": 1.2927762269973755, + "learning_rate": 0.00028082809137299603, + "loss": 2.108, + "step": 8475 + }, + { + "epoch": 0.9889161124722903, + "grad_norm": 1.3962255716323853, + "learning_rate": 0.00028082065974005413, + "loss": 2.1726, + "step": 8476 + }, + { + "epoch": 0.989032784972582, + "grad_norm": 1.2600493431091309, + "learning_rate": 0.0002808132267664588, + "loss": 2.1914, + "step": 8477 + }, + { + "epoch": 0.9891494574728736, + "grad_norm": 1.332062840461731, + "learning_rate": 0.00028080579245228686, + "loss": 2.1793, + "step": 8478 + }, + { + "epoch": 0.9892661299731653, + "grad_norm": 1.200391411781311, + "learning_rate": 0.0002807983567976156, + "loss": 2.1145, + "step": 8479 + }, + { + "epoch": 0.989382802473457, + "grad_norm": 1.0419983863830566, + "learning_rate": 0.000280790919802522, + "loss": 1.9613, + "step": 8480 + }, + { + "epoch": 0.9894994749737487, + "grad_norm": 1.114866018295288, + "learning_rate": 0.00028078348146708327, + "loss": 2.1606, + "step": 8481 + }, + { + "epoch": 0.9896161474740404, + "grad_norm": 1.3525867462158203, + "learning_rate": 0.00028077604179137635, + "loss": 2.0649, + "step": 8482 + }, + { + "epoch": 0.989732819974332, + "grad_norm": 1.0911846160888672, + "learning_rate": 0.00028076860077547844, + "loss": 2.1013, + "step": 8483 + }, + { + "epoch": 0.9898494924746237, + "grad_norm": 1.0853066444396973, + "learning_rate": 0.00028076115841946676, + "loss": 2.0578, + "step": 8484 + }, + { + "epoch": 0.9899661649749154, + "grad_norm": 1.16364324092865, + "learning_rate": 0.0002807537147234183, + "loss": 2.1887, + "step": 8485 + }, + { + "epoch": 0.9900828374752071, + "grad_norm": 1.200290322303772, + "learning_rate": 0.00028074626968741044, + "loss": 2.0783, + "step": 8486 + }, + { + "epoch": 0.9901995099754988, + "grad_norm": 1.153157114982605, + "learning_rate": 0.0002807388233115202, + "loss": 2.0604, + "step": 8487 + }, + { + "epoch": 0.9903161824757905, + "grad_norm": 1.198935866355896, + "learning_rate": 0.0002807313755958248, + "loss": 2.1183, + "step": 8488 + }, + { + "epoch": 0.9904328549760821, + "grad_norm": 1.081579327583313, + "learning_rate": 0.0002807239265404015, + "loss": 2.1628, + "step": 8489 + }, + { + "epoch": 0.9905495274763738, + "grad_norm": 1.1212958097457886, + "learning_rate": 0.00028071647614532746, + "loss": 2.2543, + "step": 8490 + }, + { + "epoch": 0.9906661999766655, + "grad_norm": 1.037232518196106, + "learning_rate": 0.00028070902441068, + "loss": 1.9921, + "step": 8491 + }, + { + "epoch": 0.9907828724769572, + "grad_norm": 1.2631763219833374, + "learning_rate": 0.0002807015713365363, + "loss": 2.3163, + "step": 8492 + }, + { + "epoch": 0.9908995449772489, + "grad_norm": 1.1945841312408447, + "learning_rate": 0.00028069411692297364, + "loss": 2.0814, + "step": 8493 + }, + { + "epoch": 0.9910162174775405, + "grad_norm": 1.0567688941955566, + "learning_rate": 0.0002806866611700693, + "loss": 2.1193, + "step": 8494 + }, + { + "epoch": 0.9911328899778322, + "grad_norm": 1.1476421356201172, + "learning_rate": 0.0002806792040779006, + "loss": 2.223, + "step": 8495 + }, + { + "epoch": 0.9912495624781239, + "grad_norm": 1.0925215482711792, + "learning_rate": 0.00028067174564654486, + "loss": 2.2118, + "step": 8496 + }, + { + "epoch": 0.9913662349784156, + "grad_norm": 0.9513172507286072, + "learning_rate": 0.00028066428587607934, + "loss": 1.952, + "step": 8497 + }, + { + "epoch": 0.9914829074787073, + "grad_norm": 1.043384313583374, + "learning_rate": 0.0002806568247665814, + "loss": 2.0059, + "step": 8498 + }, + { + "epoch": 0.991599579978999, + "grad_norm": 1.1210662126541138, + "learning_rate": 0.0002806493623181284, + "loss": 2.2155, + "step": 8499 + }, + { + "epoch": 0.9917162524792906, + "grad_norm": 1.0080779790878296, + "learning_rate": 0.00028064189853079766, + "loss": 2.1747, + "step": 8500 + }, + { + "epoch": 0.9918329249795823, + "grad_norm": 1.0018770694732666, + "learning_rate": 0.00028063443340466663, + "loss": 1.7947, + "step": 8501 + }, + { + "epoch": 0.991949597479874, + "grad_norm": 1.0818088054656982, + "learning_rate": 0.0002806269669398127, + "loss": 2.3262, + "step": 8502 + }, + { + "epoch": 0.9920662699801657, + "grad_norm": 1.2682151794433594, + "learning_rate": 0.00028061949913631314, + "loss": 2.2623, + "step": 8503 + }, + { + "epoch": 0.9921829424804574, + "grad_norm": 1.2149672508239746, + "learning_rate": 0.00028061202999424555, + "loss": 2.0612, + "step": 8504 + }, + { + "epoch": 0.992299614980749, + "grad_norm": 0.9850306510925293, + "learning_rate": 0.0002806045595136872, + "loss": 2.0766, + "step": 8505 + }, + { + "epoch": 0.9924162874810407, + "grad_norm": 1.3014483451843262, + "learning_rate": 0.0002805970876947157, + "loss": 2.2558, + "step": 8506 + }, + { + "epoch": 0.9925329599813324, + "grad_norm": 0.9466602206230164, + "learning_rate": 0.00028058961453740835, + "loss": 2.0405, + "step": 8507 + }, + { + "epoch": 0.9926496324816241, + "grad_norm": 0.9543014764785767, + "learning_rate": 0.00028058214004184274, + "loss": 2.0313, + "step": 8508 + }, + { + "epoch": 0.9927663049819158, + "grad_norm": 1.10466730594635, + "learning_rate": 0.00028057466420809625, + "loss": 2.0763, + "step": 8509 + }, + { + "epoch": 0.9928829774822074, + "grad_norm": 1.318197250366211, + "learning_rate": 0.00028056718703624653, + "loss": 2.1712, + "step": 8510 + }, + { + "epoch": 0.9929996499824991, + "grad_norm": 1.4490689039230347, + "learning_rate": 0.0002805597085263709, + "loss": 2.2419, + "step": 8511 + }, + { + "epoch": 0.9931163224827908, + "grad_norm": 1.1558794975280762, + "learning_rate": 0.0002805522286785471, + "loss": 2.0301, + "step": 8512 + }, + { + "epoch": 0.9932329949830825, + "grad_norm": 1.0446854829788208, + "learning_rate": 0.00028054474749285253, + "loss": 2.2246, + "step": 8513 + }, + { + "epoch": 0.9933496674833742, + "grad_norm": 1.4565924406051636, + "learning_rate": 0.0002805372649693648, + "loss": 2.1235, + "step": 8514 + }, + { + "epoch": 0.9934663399836658, + "grad_norm": 1.204081654548645, + "learning_rate": 0.00028052978110816145, + "loss": 2.2219, + "step": 8515 + }, + { + "epoch": 0.9935830124839575, + "grad_norm": 1.2778921127319336, + "learning_rate": 0.00028052229590932006, + "loss": 2.068, + "step": 8516 + }, + { + "epoch": 0.9936996849842492, + "grad_norm": 1.1739259958267212, + "learning_rate": 0.00028051480937291823, + "loss": 2.013, + "step": 8517 + }, + { + "epoch": 0.9938163574845409, + "grad_norm": 1.0281388759613037, + "learning_rate": 0.00028050732149903366, + "loss": 2.1032, + "step": 8518 + }, + { + "epoch": 0.9939330299848326, + "grad_norm": 0.9697136282920837, + "learning_rate": 0.00028049983228774385, + "loss": 1.9136, + "step": 8519 + }, + { + "epoch": 0.9940497024851243, + "grad_norm": 1.344842553138733, + "learning_rate": 0.00028049234173912655, + "loss": 2.132, + "step": 8520 + }, + { + "epoch": 0.9941663749854159, + "grad_norm": 1.1000465154647827, + "learning_rate": 0.0002804848498532593, + "loss": 1.9828, + "step": 8521 + }, + { + "epoch": 0.9942830474857076, + "grad_norm": 1.2611247301101685, + "learning_rate": 0.0002804773566302199, + "loss": 2.1706, + "step": 8522 + }, + { + "epoch": 0.9943997199859993, + "grad_norm": 1.1655994653701782, + "learning_rate": 0.00028046986207008593, + "loss": 2.0653, + "step": 8523 + }, + { + "epoch": 0.994516392486291, + "grad_norm": 1.1648569107055664, + "learning_rate": 0.00028046236617293506, + "loss": 2.1099, + "step": 8524 + }, + { + "epoch": 0.9946330649865827, + "grad_norm": 1.0366568565368652, + "learning_rate": 0.0002804548689388451, + "loss": 2.0465, + "step": 8525 + }, + { + "epoch": 0.9947497374868743, + "grad_norm": 1.1669492721557617, + "learning_rate": 0.0002804473703678937, + "loss": 2.0739, + "step": 8526 + }, + { + "epoch": 0.994866409987166, + "grad_norm": 1.0669909715652466, + "learning_rate": 0.0002804398704601587, + "loss": 1.7998, + "step": 8527 + }, + { + "epoch": 0.9949830824874577, + "grad_norm": 1.097051978111267, + "learning_rate": 0.00028043236921571773, + "loss": 2.1319, + "step": 8528 + }, + { + "epoch": 0.9950997549877494, + "grad_norm": 1.3358629941940308, + "learning_rate": 0.00028042486663464863, + "loss": 2.1881, + "step": 8529 + }, + { + "epoch": 0.9952164274880411, + "grad_norm": 1.1652648448944092, + "learning_rate": 0.0002804173627170291, + "loss": 2.1947, + "step": 8530 + }, + { + "epoch": 0.9953330999883327, + "grad_norm": 1.1568177938461304, + "learning_rate": 0.00028040985746293694, + "loss": 2.029, + "step": 8531 + }, + { + "epoch": 0.9954497724886244, + "grad_norm": 1.145362377166748, + "learning_rate": 0.0002804023508724501, + "loss": 2.2776, + "step": 8532 + }, + { + "epoch": 0.9955664449889161, + "grad_norm": 1.1183449029922485, + "learning_rate": 0.0002803948429456462, + "loss": 2.2318, + "step": 8533 + }, + { + "epoch": 0.9956831174892078, + "grad_norm": 1.124826431274414, + "learning_rate": 0.00028038733368260325, + "loss": 2.1146, + "step": 8534 + }, + { + "epoch": 0.9957997899894995, + "grad_norm": 1.1463494300842285, + "learning_rate": 0.000280379823083399, + "loss": 1.9792, + "step": 8535 + }, + { + "epoch": 0.9959164624897912, + "grad_norm": 1.104211688041687, + "learning_rate": 0.0002803723111481113, + "loss": 2.1211, + "step": 8536 + }, + { + "epoch": 0.9960331349900828, + "grad_norm": 1.2741267681121826, + "learning_rate": 0.00028036479787681807, + "loss": 2.1798, + "step": 8537 + }, + { + "epoch": 0.9961498074903745, + "grad_norm": 1.2402268648147583, + "learning_rate": 0.0002803572832695972, + "loss": 2.1117, + "step": 8538 + }, + { + "epoch": 0.9962664799906662, + "grad_norm": 1.2622240781784058, + "learning_rate": 0.00028034976732652653, + "loss": 2.0985, + "step": 8539 + }, + { + "epoch": 0.9963831524909579, + "grad_norm": 1.0993117094039917, + "learning_rate": 0.0002803422500476841, + "loss": 2.1024, + "step": 8540 + }, + { + "epoch": 0.9964998249912496, + "grad_norm": 1.1869269609451294, + "learning_rate": 0.0002803347314331477, + "loss": 1.9418, + "step": 8541 + }, + { + "epoch": 0.9966164974915412, + "grad_norm": 1.0613274574279785, + "learning_rate": 0.00028032721148299537, + "loss": 1.9898, + "step": 8542 + }, + { + "epoch": 0.9967331699918329, + "grad_norm": 1.120569109916687, + "learning_rate": 0.00028031969019730503, + "loss": 1.9602, + "step": 8543 + }, + { + "epoch": 0.9968498424921246, + "grad_norm": 1.210964322090149, + "learning_rate": 0.0002803121675761547, + "loss": 2.1907, + "step": 8544 + }, + { + "epoch": 0.9969665149924163, + "grad_norm": 1.1123226881027222, + "learning_rate": 0.0002803046436196223, + "loss": 2.0054, + "step": 8545 + }, + { + "epoch": 0.997083187492708, + "grad_norm": 1.588776707649231, + "learning_rate": 0.0002802971183277858, + "loss": 2.0705, + "step": 8546 + }, + { + "epoch": 0.9971998599929996, + "grad_norm": 1.083274006843567, + "learning_rate": 0.00028028959170072333, + "loss": 2.0947, + "step": 8547 + }, + { + "epoch": 0.9973165324932913, + "grad_norm": 1.1744035482406616, + "learning_rate": 0.00028028206373851286, + "loss": 2.1762, + "step": 8548 + }, + { + "epoch": 0.997433204993583, + "grad_norm": 1.2143865823745728, + "learning_rate": 0.00028027453444123243, + "loss": 2.1852, + "step": 8549 + }, + { + "epoch": 0.9975498774938747, + "grad_norm": 1.1392228603363037, + "learning_rate": 0.0002802670038089601, + "loss": 2.3393, + "step": 8550 + }, + { + "epoch": 0.9976665499941664, + "grad_norm": 1.1831682920455933, + "learning_rate": 0.0002802594718417739, + "loss": 2.2125, + "step": 8551 + }, + { + "epoch": 0.997783222494458, + "grad_norm": 1.1950886249542236, + "learning_rate": 0.00028025193853975196, + "loss": 2.0133, + "step": 8552 + }, + { + "epoch": 0.9978998949947497, + "grad_norm": 1.041909098625183, + "learning_rate": 0.0002802444039029724, + "loss": 2.0029, + "step": 8553 + }, + { + "epoch": 0.9980165674950414, + "grad_norm": 1.1330770254135132, + "learning_rate": 0.0002802368679315133, + "loss": 2.1067, + "step": 8554 + }, + { + "epoch": 0.9981332399953331, + "grad_norm": 1.1280019283294678, + "learning_rate": 0.00028022933062545275, + "loss": 2.31, + "step": 8555 + }, + { + "epoch": 0.9982499124956248, + "grad_norm": 1.0707032680511475, + "learning_rate": 0.0002802217919848689, + "loss": 2.2395, + "step": 8556 + }, + { + "epoch": 0.9983665849959165, + "grad_norm": 1.0658494234085083, + "learning_rate": 0.00028021425200983995, + "loss": 2.0542, + "step": 8557 + }, + { + "epoch": 0.9984832574962081, + "grad_norm": 1.2873015403747559, + "learning_rate": 0.000280206710700444, + "loss": 2.1853, + "step": 8558 + }, + { + "epoch": 0.9985999299964998, + "grad_norm": 1.0851757526397705, + "learning_rate": 0.0002801991680567593, + "loss": 2.4179, + "step": 8559 + }, + { + "epoch": 0.9987166024967915, + "grad_norm": 1.0667115449905396, + "learning_rate": 0.000280191624078864, + "loss": 2.1914, + "step": 8560 + }, + { + "epoch": 0.9988332749970832, + "grad_norm": 0.9438336491584778, + "learning_rate": 0.0002801840787668363, + "loss": 2.0843, + "step": 8561 + }, + { + "epoch": 0.9989499474973749, + "grad_norm": 1.0751169919967651, + "learning_rate": 0.0002801765321207545, + "loss": 1.9964, + "step": 8562 + }, + { + "epoch": 0.9990666199976665, + "grad_norm": 0.9837767481803894, + "learning_rate": 0.00028016898414069674, + "loss": 2.0945, + "step": 8563 + }, + { + "epoch": 0.9991832924979582, + "grad_norm": 1.1031445264816284, + "learning_rate": 0.00028016143482674136, + "loss": 2.0953, + "step": 8564 + }, + { + "epoch": 0.9992999649982499, + "grad_norm": 1.303348183631897, + "learning_rate": 0.0002801538841789665, + "loss": 2.2799, + "step": 8565 + }, + { + "epoch": 0.9994166374985416, + "grad_norm": 1.0308955907821655, + "learning_rate": 0.0002801463321974505, + "loss": 2.0558, + "step": 8566 + }, + { + "epoch": 0.9995333099988333, + "grad_norm": 1.097693920135498, + "learning_rate": 0.00028013877888227165, + "loss": 2.03, + "step": 8567 + }, + { + "epoch": 0.999649982499125, + "grad_norm": 1.1555689573287964, + "learning_rate": 0.0002801312242335083, + "loss": 1.9089, + "step": 8568 + }, + { + "epoch": 0.9997666549994166, + "grad_norm": 1.0926133394241333, + "learning_rate": 0.00028012366825123876, + "loss": 2.0987, + "step": 8569 + }, + { + "epoch": 0.9998833274997083, + "grad_norm": 1.1150904893875122, + "learning_rate": 0.0002801161109355412, + "loss": 2.0418, + "step": 8570 + }, + { + "epoch": 1.0, + "grad_norm": 1.049277424812317, + "learning_rate": 0.0002801085522864942, + "loss": 2.1496, + "step": 8571 + }, + { + "epoch": 1.0001166725002917, + "grad_norm": 1.1079888343811035, + "learning_rate": 0.000280100992304176, + "loss": 2.0958, + "step": 8572 + }, + { + "epoch": 1.0002333450005834, + "grad_norm": 1.1800625324249268, + "learning_rate": 0.000280093430988665, + "loss": 2.1564, + "step": 8573 + }, + { + "epoch": 1.000350017500875, + "grad_norm": 1.2356460094451904, + "learning_rate": 0.00028008586834003953, + "loss": 2.107, + "step": 8574 + }, + { + "epoch": 1.0004666900011667, + "grad_norm": 1.3637752532958984, + "learning_rate": 0.0002800783043583781, + "loss": 1.887, + "step": 8575 + }, + { + "epoch": 1.0005833625014584, + "grad_norm": 1.1947885751724243, + "learning_rate": 0.000280070739043759, + "loss": 1.9248, + "step": 8576 + }, + { + "epoch": 1.00070003500175, + "grad_norm": 1.0732897520065308, + "learning_rate": 0.0002800631723962608, + "loss": 1.9666, + "step": 8577 + }, + { + "epoch": 1.0008167075020418, + "grad_norm": 1.1401915550231934, + "learning_rate": 0.0002800556044159618, + "loss": 2.0351, + "step": 8578 + }, + { + "epoch": 1.0009333800023335, + "grad_norm": 1.0915818214416504, + "learning_rate": 0.0002800480351029405, + "loss": 1.9706, + "step": 8579 + }, + { + "epoch": 1.0010500525026251, + "grad_norm": 1.5347741842269897, + "learning_rate": 0.00028004046445727544, + "loss": 1.9178, + "step": 8580 + }, + { + "epoch": 1.0011667250029168, + "grad_norm": 1.075046181678772, + "learning_rate": 0.00028003289247904503, + "loss": 2.144, + "step": 8581 + }, + { + "epoch": 1.0012833975032085, + "grad_norm": 1.1732796430587769, + "learning_rate": 0.0002800253191683278, + "loss": 2.1342, + "step": 8582 + }, + { + "epoch": 1.0014000700035002, + "grad_norm": 1.4022433757781982, + "learning_rate": 0.00028001774452520226, + "loss": 2.0814, + "step": 8583 + }, + { + "epoch": 1.0015167425037919, + "grad_norm": 1.23623788356781, + "learning_rate": 0.00028001016854974694, + "loss": 2.051, + "step": 8584 + }, + { + "epoch": 1.0016334150040835, + "grad_norm": 1.1741023063659668, + "learning_rate": 0.00028000259124204036, + "loss": 2.0828, + "step": 8585 + }, + { + "epoch": 1.0017500875043752, + "grad_norm": 1.1347084045410156, + "learning_rate": 0.0002799950126021611, + "loss": 2.1174, + "step": 8586 + }, + { + "epoch": 1.001866760004667, + "grad_norm": 1.1359676122665405, + "learning_rate": 0.00027998743263018763, + "loss": 2.1194, + "step": 8587 + }, + { + "epoch": 1.0019834325049586, + "grad_norm": 1.0878112316131592, + "learning_rate": 0.0002799798513261987, + "loss": 1.9913, + "step": 8588 + }, + { + "epoch": 1.0021001050052503, + "grad_norm": 1.0307419300079346, + "learning_rate": 0.00027997226869027274, + "loss": 1.9062, + "step": 8589 + }, + { + "epoch": 1.002216777505542, + "grad_norm": 1.11911940574646, + "learning_rate": 0.0002799646847224885, + "loss": 1.8802, + "step": 8590 + }, + { + "epoch": 1.0023334500058336, + "grad_norm": 1.0524044036865234, + "learning_rate": 0.0002799570994229245, + "loss": 1.8442, + "step": 8591 + }, + { + "epoch": 1.0024501225061253, + "grad_norm": 1.1169424057006836, + "learning_rate": 0.00027994951279165944, + "loss": 1.9317, + "step": 8592 + }, + { + "epoch": 1.002566795006417, + "grad_norm": 1.1147619485855103, + "learning_rate": 0.0002799419248287719, + "loss": 1.9926, + "step": 8593 + }, + { + "epoch": 1.0026834675067087, + "grad_norm": 1.2207691669464111, + "learning_rate": 0.00027993433553434057, + "loss": 2.2073, + "step": 8594 + }, + { + "epoch": 1.0028001400070004, + "grad_norm": 1.3933519124984741, + "learning_rate": 0.0002799267449084442, + "loss": 2.0016, + "step": 8595 + }, + { + "epoch": 1.002916812507292, + "grad_norm": 1.2097653150558472, + "learning_rate": 0.0002799191529511614, + "loss": 2.0036, + "step": 8596 + }, + { + "epoch": 1.0030334850075837, + "grad_norm": 1.1972713470458984, + "learning_rate": 0.0002799115596625709, + "loss": 1.9581, + "step": 8597 + }, + { + "epoch": 1.0031501575078754, + "grad_norm": 1.1541141271591187, + "learning_rate": 0.0002799039650427514, + "loss": 1.8399, + "step": 8598 + }, + { + "epoch": 1.003266830008167, + "grad_norm": 1.7322280406951904, + "learning_rate": 0.0002798963690917817, + "loss": 2.0398, + "step": 8599 + }, + { + "epoch": 1.0033835025084588, + "grad_norm": 1.0916794538497925, + "learning_rate": 0.0002798887718097404, + "loss": 2.1064, + "step": 8600 + }, + { + "epoch": 1.0035001750087504, + "grad_norm": 1.2899190187454224, + "learning_rate": 0.00027988117319670643, + "loss": 2.1897, + "step": 8601 + }, + { + "epoch": 1.0036168475090421, + "grad_norm": 1.3129909038543701, + "learning_rate": 0.00027987357325275845, + "loss": 2.0781, + "step": 8602 + }, + { + "epoch": 1.0037335200093338, + "grad_norm": 1.0974351167678833, + "learning_rate": 0.00027986597197797534, + "loss": 2.1416, + "step": 8603 + }, + { + "epoch": 1.0038501925096255, + "grad_norm": 1.192284345626831, + "learning_rate": 0.0002798583693724358, + "loss": 2.1187, + "step": 8604 + }, + { + "epoch": 1.0039668650099172, + "grad_norm": 1.168803334236145, + "learning_rate": 0.00027985076543621875, + "loss": 2.052, + "step": 8605 + }, + { + "epoch": 1.0040835375102088, + "grad_norm": 1.3634428977966309, + "learning_rate": 0.0002798431601694029, + "loss": 2.1114, + "step": 8606 + }, + { + "epoch": 1.0042002100105005, + "grad_norm": 1.1375110149383545, + "learning_rate": 0.00027983555357206723, + "loss": 2.1297, + "step": 8607 + }, + { + "epoch": 1.0043168825107922, + "grad_norm": 1.2018039226531982, + "learning_rate": 0.00027982794564429043, + "loss": 2.0403, + "step": 8608 + }, + { + "epoch": 1.004433555011084, + "grad_norm": 1.2503390312194824, + "learning_rate": 0.00027982033638615153, + "loss": 2.1304, + "step": 8609 + }, + { + "epoch": 1.0045502275113756, + "grad_norm": 1.1282567977905273, + "learning_rate": 0.00027981272579772937, + "loss": 2.0699, + "step": 8610 + }, + { + "epoch": 1.0046669000116673, + "grad_norm": 1.0542869567871094, + "learning_rate": 0.00027980511387910276, + "loss": 2.0074, + "step": 8611 + }, + { + "epoch": 1.004783572511959, + "grad_norm": 1.0929044485092163, + "learning_rate": 0.00027979750063035074, + "loss": 1.9725, + "step": 8612 + }, + { + "epoch": 1.0049002450122506, + "grad_norm": 1.1649510860443115, + "learning_rate": 0.00027978988605155213, + "loss": 1.823, + "step": 8613 + }, + { + "epoch": 1.0050169175125423, + "grad_norm": 1.2789961099624634, + "learning_rate": 0.0002797822701427859, + "loss": 2.0473, + "step": 8614 + }, + { + "epoch": 1.005133590012834, + "grad_norm": 1.153833270072937, + "learning_rate": 0.0002797746529041311, + "loss": 2.1074, + "step": 8615 + }, + { + "epoch": 1.0052502625131257, + "grad_norm": 1.0871700048446655, + "learning_rate": 0.0002797670343356666, + "loss": 2.0833, + "step": 8616 + }, + { + "epoch": 1.0053669350134173, + "grad_norm": 1.08542799949646, + "learning_rate": 0.0002797594144374713, + "loss": 1.9913, + "step": 8617 + }, + { + "epoch": 1.005483607513709, + "grad_norm": 1.1446900367736816, + "learning_rate": 0.00027975179320962435, + "loss": 2.2141, + "step": 8618 + }, + { + "epoch": 1.0056002800140007, + "grad_norm": 1.228852391242981, + "learning_rate": 0.0002797441706522047, + "loss": 2.1231, + "step": 8619 + }, + { + "epoch": 1.0057169525142924, + "grad_norm": 1.1658238172531128, + "learning_rate": 0.0002797365467652914, + "loss": 2.259, + "step": 8620 + }, + { + "epoch": 1.005833625014584, + "grad_norm": 1.2296218872070312, + "learning_rate": 0.0002797289215489634, + "loss": 2.0661, + "step": 8621 + }, + { + "epoch": 1.0059502975148757, + "grad_norm": 1.408633828163147, + "learning_rate": 0.00027972129500329987, + "loss": 1.9494, + "step": 8622 + }, + { + "epoch": 1.0060669700151674, + "grad_norm": 1.3375390768051147, + "learning_rate": 0.00027971366712837973, + "loss": 2.091, + "step": 8623 + }, + { + "epoch": 1.006183642515459, + "grad_norm": 1.1353908777236938, + "learning_rate": 0.00027970603792428216, + "loss": 1.968, + "step": 8624 + }, + { + "epoch": 1.0063003150157508, + "grad_norm": 1.2468032836914062, + "learning_rate": 0.00027969840739108624, + "loss": 2.1405, + "step": 8625 + }, + { + "epoch": 1.0064169875160425, + "grad_norm": 1.0732635259628296, + "learning_rate": 0.0002796907755288711, + "loss": 1.9224, + "step": 8626 + }, + { + "epoch": 1.0065336600163342, + "grad_norm": 1.0711276531219482, + "learning_rate": 0.0002796831423377158, + "loss": 2.0003, + "step": 8627 + }, + { + "epoch": 1.0066503325166258, + "grad_norm": 1.3690516948699951, + "learning_rate": 0.0002796755078176995, + "loss": 2.2302, + "step": 8628 + }, + { + "epoch": 1.0067670050169175, + "grad_norm": 1.0325586795806885, + "learning_rate": 0.00027966787196890137, + "loss": 2.026, + "step": 8629 + }, + { + "epoch": 1.0068836775172092, + "grad_norm": 1.1420327425003052, + "learning_rate": 0.00027966023479140045, + "loss": 2.1709, + "step": 8630 + }, + { + "epoch": 1.0070003500175009, + "grad_norm": 1.0304964780807495, + "learning_rate": 0.00027965259628527606, + "loss": 2.0034, + "step": 8631 + }, + { + "epoch": 1.0071170225177926, + "grad_norm": 1.1241950988769531, + "learning_rate": 0.0002796449564506073, + "loss": 2.0259, + "step": 8632 + }, + { + "epoch": 1.0072336950180842, + "grad_norm": 1.2976313829421997, + "learning_rate": 0.00027963731528747344, + "loss": 2.1363, + "step": 8633 + }, + { + "epoch": 1.007350367518376, + "grad_norm": 1.1761218309402466, + "learning_rate": 0.0002796296727959537, + "loss": 2.1026, + "step": 8634 + }, + { + "epoch": 1.0074670400186676, + "grad_norm": 1.2045050859451294, + "learning_rate": 0.0002796220289761272, + "loss": 2.0681, + "step": 8635 + }, + { + "epoch": 1.0075837125189593, + "grad_norm": 1.3527328968048096, + "learning_rate": 0.0002796143838280733, + "loss": 2.0201, + "step": 8636 + }, + { + "epoch": 1.007700385019251, + "grad_norm": 1.1884974241256714, + "learning_rate": 0.0002796067373518712, + "loss": 1.9358, + "step": 8637 + }, + { + "epoch": 1.0078170575195426, + "grad_norm": 1.1034621000289917, + "learning_rate": 0.00027959908954760013, + "loss": 1.9843, + "step": 8638 + }, + { + "epoch": 1.0079337300198343, + "grad_norm": 1.1248687505722046, + "learning_rate": 0.00027959144041533944, + "loss": 2.011, + "step": 8639 + }, + { + "epoch": 1.008050402520126, + "grad_norm": 1.3965603113174438, + "learning_rate": 0.0002795837899551684, + "loss": 2.191, + "step": 8640 + }, + { + "epoch": 1.0081670750204177, + "grad_norm": 1.258955478668213, + "learning_rate": 0.00027957613816716637, + "loss": 2.2797, + "step": 8641 + }, + { + "epoch": 1.0082837475207094, + "grad_norm": 1.1317604780197144, + "learning_rate": 0.00027956848505141267, + "loss": 1.9939, + "step": 8642 + }, + { + "epoch": 1.008400420021001, + "grad_norm": 1.4041026830673218, + "learning_rate": 0.0002795608306079865, + "loss": 2.3188, + "step": 8643 + }, + { + "epoch": 1.0085170925212927, + "grad_norm": 1.1596726179122925, + "learning_rate": 0.00027955317483696737, + "loss": 1.9746, + "step": 8644 + }, + { + "epoch": 1.0086337650215844, + "grad_norm": 1.1525185108184814, + "learning_rate": 0.0002795455177384346, + "loss": 1.9928, + "step": 8645 + }, + { + "epoch": 1.008750437521876, + "grad_norm": 1.2791533470153809, + "learning_rate": 0.00027953785931246754, + "loss": 2.1039, + "step": 8646 + }, + { + "epoch": 1.0088671100221678, + "grad_norm": 1.1869120597839355, + "learning_rate": 0.00027953019955914563, + "loss": 2.0644, + "step": 8647 + }, + { + "epoch": 1.0089837825224595, + "grad_norm": 1.3363511562347412, + "learning_rate": 0.00027952253847854825, + "loss": 2.0666, + "step": 8648 + }, + { + "epoch": 1.0091004550227511, + "grad_norm": 0.9779249429702759, + "learning_rate": 0.0002795148760707548, + "loss": 1.9617, + "step": 8649 + }, + { + "epoch": 1.0092171275230428, + "grad_norm": 1.1532750129699707, + "learning_rate": 0.0002795072123358448, + "loss": 1.9997, + "step": 8650 + }, + { + "epoch": 1.0093338000233345, + "grad_norm": 1.1248822212219238, + "learning_rate": 0.0002794995472738976, + "loss": 2.0224, + "step": 8651 + }, + { + "epoch": 1.0094504725236262, + "grad_norm": 1.0130969285964966, + "learning_rate": 0.0002794918808849927, + "loss": 2.0397, + "step": 8652 + }, + { + "epoch": 1.0095671450239179, + "grad_norm": 1.1942613124847412, + "learning_rate": 0.00027948421316920964, + "loss": 1.7693, + "step": 8653 + }, + { + "epoch": 1.0096838175242095, + "grad_norm": 1.2436870336532593, + "learning_rate": 0.00027947654412662783, + "loss": 2.3041, + "step": 8654 + }, + { + "epoch": 1.0098004900245012, + "grad_norm": 1.2081959247589111, + "learning_rate": 0.00027946887375732675, + "loss": 2.0362, + "step": 8655 + }, + { + "epoch": 1.009917162524793, + "grad_norm": 1.3242156505584717, + "learning_rate": 0.000279461202061386, + "loss": 2.2223, + "step": 8656 + }, + { + "epoch": 1.0100338350250846, + "grad_norm": 1.0796597003936768, + "learning_rate": 0.0002794535290388851, + "loss": 2.0864, + "step": 8657 + }, + { + "epoch": 1.0101505075253763, + "grad_norm": 1.159770131111145, + "learning_rate": 0.0002794458546899035, + "loss": 2.0548, + "step": 8658 + }, + { + "epoch": 1.010267180025668, + "grad_norm": 1.1382991075515747, + "learning_rate": 0.0002794381790145209, + "loss": 2.1231, + "step": 8659 + }, + { + "epoch": 1.0103838525259596, + "grad_norm": 1.2152676582336426, + "learning_rate": 0.00027943050201281684, + "loss": 2.0401, + "step": 8660 + }, + { + "epoch": 1.0105005250262513, + "grad_norm": 1.293216586112976, + "learning_rate": 0.0002794228236848708, + "loss": 1.9981, + "step": 8661 + }, + { + "epoch": 1.010617197526543, + "grad_norm": 1.0426223278045654, + "learning_rate": 0.00027941514403076245, + "loss": 1.9875, + "step": 8662 + }, + { + "epoch": 1.0107338700268347, + "grad_norm": 1.2301050424575806, + "learning_rate": 0.0002794074630505714, + "loss": 1.9763, + "step": 8663 + }, + { + "epoch": 1.0108505425271264, + "grad_norm": 1.1712013483047485, + "learning_rate": 0.0002793997807443773, + "loss": 2.1288, + "step": 8664 + }, + { + "epoch": 1.010967215027418, + "grad_norm": 1.145759105682373, + "learning_rate": 0.0002793920971122598, + "loss": 2.1715, + "step": 8665 + }, + { + "epoch": 1.0110838875277097, + "grad_norm": 1.1021326780319214, + "learning_rate": 0.00027938441215429846, + "loss": 2.0858, + "step": 8666 + }, + { + "epoch": 1.0112005600280014, + "grad_norm": 1.2655929327011108, + "learning_rate": 0.0002793767258705731, + "loss": 2.0073, + "step": 8667 + }, + { + "epoch": 1.011317232528293, + "grad_norm": 1.1676814556121826, + "learning_rate": 0.00027936903826116324, + "loss": 2.1092, + "step": 8668 + }, + { + "epoch": 1.0114339050285848, + "grad_norm": 1.331023097038269, + "learning_rate": 0.0002793613493261487, + "loss": 2.2499, + "step": 8669 + }, + { + "epoch": 1.0115505775288764, + "grad_norm": 1.3168443441390991, + "learning_rate": 0.0002793536590656092, + "loss": 2.0076, + "step": 8670 + }, + { + "epoch": 1.0116672500291681, + "grad_norm": 1.2159552574157715, + "learning_rate": 0.0002793459674796243, + "loss": 2.0005, + "step": 8671 + }, + { + "epoch": 1.0117839225294598, + "grad_norm": 1.1681499481201172, + "learning_rate": 0.00027933827456827395, + "loss": 1.9423, + "step": 8672 + }, + { + "epoch": 1.0119005950297515, + "grad_norm": 1.154765248298645, + "learning_rate": 0.0002793305803316378, + "loss": 1.99, + "step": 8673 + }, + { + "epoch": 1.0120172675300432, + "grad_norm": 1.0943512916564941, + "learning_rate": 0.00027932288476979555, + "loss": 1.8161, + "step": 8674 + }, + { + "epoch": 1.0121339400303349, + "grad_norm": 1.2202397584915161, + "learning_rate": 0.00027931518788282706, + "loss": 1.9924, + "step": 8675 + }, + { + "epoch": 1.0122506125306265, + "grad_norm": 1.1634773015975952, + "learning_rate": 0.00027930748967081216, + "loss": 2.0148, + "step": 8676 + }, + { + "epoch": 1.0123672850309182, + "grad_norm": 1.1358314752578735, + "learning_rate": 0.0002792997901338306, + "loss": 2.0064, + "step": 8677 + }, + { + "epoch": 1.01248395753121, + "grad_norm": 1.0921423435211182, + "learning_rate": 0.00027929208927196216, + "loss": 1.9778, + "step": 8678 + }, + { + "epoch": 1.0126006300315016, + "grad_norm": 1.0281314849853516, + "learning_rate": 0.00027928438708528676, + "loss": 2.0516, + "step": 8679 + }, + { + "epoch": 1.0127173025317933, + "grad_norm": 1.2284536361694336, + "learning_rate": 0.0002792766835738842, + "loss": 2.0112, + "step": 8680 + }, + { + "epoch": 1.012833975032085, + "grad_norm": 1.1011213064193726, + "learning_rate": 0.0002792689787378343, + "loss": 1.978, + "step": 8681 + }, + { + "epoch": 1.0129506475323766, + "grad_norm": 1.1710132360458374, + "learning_rate": 0.00027926127257721703, + "loss": 2.091, + "step": 8682 + }, + { + "epoch": 1.0130673200326683, + "grad_norm": 1.0787538290023804, + "learning_rate": 0.00027925356509211224, + "loss": 2.134, + "step": 8683 + }, + { + "epoch": 1.01318399253296, + "grad_norm": 1.165162205696106, + "learning_rate": 0.0002792458562825998, + "loss": 2.1295, + "step": 8684 + }, + { + "epoch": 1.0133006650332517, + "grad_norm": 1.0375875234603882, + "learning_rate": 0.00027923814614875966, + "loss": 1.9757, + "step": 8685 + }, + { + "epoch": 1.0134173375335433, + "grad_norm": 1.304068684577942, + "learning_rate": 0.0002792304346906718, + "loss": 1.9793, + "step": 8686 + }, + { + "epoch": 1.013534010033835, + "grad_norm": 1.0747500658035278, + "learning_rate": 0.00027922272190841605, + "loss": 2.0279, + "step": 8687 + }, + { + "epoch": 1.0136506825341267, + "grad_norm": 1.1576874256134033, + "learning_rate": 0.00027921500780207244, + "loss": 1.9979, + "step": 8688 + }, + { + "epoch": 1.0137673550344184, + "grad_norm": 1.1989753246307373, + "learning_rate": 0.0002792072923717209, + "loss": 2.1267, + "step": 8689 + }, + { + "epoch": 1.01388402753471, + "grad_norm": 1.2054005861282349, + "learning_rate": 0.0002791995756174415, + "loss": 1.956, + "step": 8690 + }, + { + "epoch": 1.0140007000350018, + "grad_norm": 1.1642941236495972, + "learning_rate": 0.0002791918575393141, + "loss": 2.0392, + "step": 8691 + }, + { + "epoch": 1.0141173725352934, + "grad_norm": 1.2727373838424683, + "learning_rate": 0.00027918413813741885, + "loss": 2.0242, + "step": 8692 + }, + { + "epoch": 1.0142340450355851, + "grad_norm": 1.3421907424926758, + "learning_rate": 0.0002791764174118357, + "loss": 2.0154, + "step": 8693 + }, + { + "epoch": 1.0143507175358768, + "grad_norm": 1.512200117111206, + "learning_rate": 0.00027916869536264474, + "loss": 2.3171, + "step": 8694 + }, + { + "epoch": 1.0144673900361685, + "grad_norm": 1.1241463422775269, + "learning_rate": 0.00027916097198992597, + "loss": 1.9437, + "step": 8695 + }, + { + "epoch": 1.0145840625364602, + "grad_norm": 1.1837345361709595, + "learning_rate": 0.00027915324729375945, + "loss": 1.8466, + "step": 8696 + }, + { + "epoch": 1.0147007350367518, + "grad_norm": 1.2690216302871704, + "learning_rate": 0.00027914552127422535, + "loss": 2.2599, + "step": 8697 + }, + { + "epoch": 1.0148174075370435, + "grad_norm": 1.186184048652649, + "learning_rate": 0.0002791377939314037, + "loss": 2.0785, + "step": 8698 + }, + { + "epoch": 1.0149340800373352, + "grad_norm": 1.5044922828674316, + "learning_rate": 0.00027913006526537453, + "loss": 2.1066, + "step": 8699 + }, + { + "epoch": 1.0150507525376269, + "grad_norm": 1.2343339920043945, + "learning_rate": 0.0002791223352762181, + "loss": 2.1915, + "step": 8700 + }, + { + "epoch": 1.0151674250379186, + "grad_norm": 1.3614939451217651, + "learning_rate": 0.00027911460396401445, + "loss": 2.1239, + "step": 8701 + }, + { + "epoch": 1.0152840975382103, + "grad_norm": 1.3690515756607056, + "learning_rate": 0.00027910687132884387, + "loss": 2.3003, + "step": 8702 + }, + { + "epoch": 1.015400770038502, + "grad_norm": 1.1060700416564941, + "learning_rate": 0.0002790991373707864, + "loss": 2.0235, + "step": 8703 + }, + { + "epoch": 1.0155174425387936, + "grad_norm": 1.2395387887954712, + "learning_rate": 0.0002790914020899222, + "loss": 2.1911, + "step": 8704 + }, + { + "epoch": 1.0156341150390853, + "grad_norm": 1.2011442184448242, + "learning_rate": 0.0002790836654863315, + "loss": 2.1013, + "step": 8705 + }, + { + "epoch": 1.015750787539377, + "grad_norm": 1.199929118156433, + "learning_rate": 0.00027907592756009454, + "loss": 1.9076, + "step": 8706 + }, + { + "epoch": 1.0158674600396687, + "grad_norm": 1.1816227436065674, + "learning_rate": 0.00027906818831129147, + "loss": 1.9126, + "step": 8707 + }, + { + "epoch": 1.0159841325399603, + "grad_norm": 1.1271976232528687, + "learning_rate": 0.0002790604477400026, + "loss": 2.0234, + "step": 8708 + }, + { + "epoch": 1.016100805040252, + "grad_norm": 1.2124170064926147, + "learning_rate": 0.0002790527058463081, + "loss": 2.1478, + "step": 8709 + }, + { + "epoch": 1.0162174775405437, + "grad_norm": 1.0142055749893188, + "learning_rate": 0.00027904496263028825, + "loss": 1.939, + "step": 8710 + }, + { + "epoch": 1.0163341500408354, + "grad_norm": 1.173797845840454, + "learning_rate": 0.0002790372180920234, + "loss": 2.0546, + "step": 8711 + }, + { + "epoch": 1.016450822541127, + "grad_norm": 1.2200902700424194, + "learning_rate": 0.0002790294722315937, + "loss": 2.0272, + "step": 8712 + }, + { + "epoch": 1.0165674950414187, + "grad_norm": 1.1772054433822632, + "learning_rate": 0.0002790217250490796, + "loss": 2.0169, + "step": 8713 + }, + { + "epoch": 1.0166841675417104, + "grad_norm": 1.2341771125793457, + "learning_rate": 0.00027901397654456125, + "loss": 2.0588, + "step": 8714 + }, + { + "epoch": 1.016800840042002, + "grad_norm": 1.326499104499817, + "learning_rate": 0.0002790062267181191, + "loss": 2.0907, + "step": 8715 + }, + { + "epoch": 1.0169175125422938, + "grad_norm": 1.2171703577041626, + "learning_rate": 0.00027899847556983343, + "loss": 1.9883, + "step": 8716 + }, + { + "epoch": 1.0170341850425855, + "grad_norm": 1.1123597621917725, + "learning_rate": 0.0002789907230997847, + "loss": 2.0884, + "step": 8717 + }, + { + "epoch": 1.0171508575428772, + "grad_norm": 1.1661514043807983, + "learning_rate": 0.00027898296930805314, + "loss": 2.2432, + "step": 8718 + }, + { + "epoch": 1.0172675300431688, + "grad_norm": 1.3348274230957031, + "learning_rate": 0.00027897521419471924, + "loss": 1.8806, + "step": 8719 + }, + { + "epoch": 1.0173842025434605, + "grad_norm": 1.3296594619750977, + "learning_rate": 0.00027896745775986335, + "loss": 2.0936, + "step": 8720 + }, + { + "epoch": 1.0175008750437522, + "grad_norm": 1.172450065612793, + "learning_rate": 0.0002789597000035658, + "loss": 2.1954, + "step": 8721 + }, + { + "epoch": 1.0176175475440439, + "grad_norm": 1.0680086612701416, + "learning_rate": 0.00027895194092590717, + "loss": 2.1213, + "step": 8722 + }, + { + "epoch": 1.0177342200443356, + "grad_norm": 1.109450101852417, + "learning_rate": 0.0002789441805269678, + "loss": 1.8926, + "step": 8723 + }, + { + "epoch": 1.0178508925446272, + "grad_norm": 1.387073278427124, + "learning_rate": 0.00027893641880682817, + "loss": 2.2819, + "step": 8724 + }, + { + "epoch": 1.017967565044919, + "grad_norm": 1.0873671770095825, + "learning_rate": 0.00027892865576556876, + "loss": 2.0594, + "step": 8725 + }, + { + "epoch": 1.0180842375452106, + "grad_norm": 1.142337441444397, + "learning_rate": 0.00027892089140327, + "loss": 1.9179, + "step": 8726 + }, + { + "epoch": 1.0182009100455023, + "grad_norm": 1.3819200992584229, + "learning_rate": 0.0002789131257200124, + "loss": 2.1026, + "step": 8727 + }, + { + "epoch": 1.018317582545794, + "grad_norm": 1.1211267709732056, + "learning_rate": 0.00027890535871587653, + "loss": 2.1986, + "step": 8728 + }, + { + "epoch": 1.0184342550460856, + "grad_norm": 1.271048665046692, + "learning_rate": 0.0002788975903909428, + "loss": 2.1228, + "step": 8729 + }, + { + "epoch": 1.0185509275463773, + "grad_norm": 1.2902750968933105, + "learning_rate": 0.0002788898207452918, + "loss": 2.0308, + "step": 8730 + }, + { + "epoch": 1.018667600046669, + "grad_norm": 1.0404094457626343, + "learning_rate": 0.0002788820497790041, + "loss": 1.798, + "step": 8731 + }, + { + "epoch": 1.0187842725469607, + "grad_norm": 1.1439718008041382, + "learning_rate": 0.0002788742774921602, + "loss": 2.0206, + "step": 8732 + }, + { + "epoch": 1.0189009450472524, + "grad_norm": 1.3579570055007935, + "learning_rate": 0.00027886650388484076, + "loss": 2.1178, + "step": 8733 + }, + { + "epoch": 1.019017617547544, + "grad_norm": 1.1819583177566528, + "learning_rate": 0.0002788587289571263, + "loss": 1.8938, + "step": 8734 + }, + { + "epoch": 1.0191342900478357, + "grad_norm": 1.0632025003433228, + "learning_rate": 0.0002788509527090974, + "loss": 1.8508, + "step": 8735 + }, + { + "epoch": 1.0192509625481274, + "grad_norm": 1.4425855875015259, + "learning_rate": 0.00027884317514083475, + "loss": 2.0423, + "step": 8736 + }, + { + "epoch": 1.019367635048419, + "grad_norm": 1.089010238647461, + "learning_rate": 0.00027883539625241896, + "loss": 2.0936, + "step": 8737 + }, + { + "epoch": 1.0194843075487108, + "grad_norm": 1.2387582063674927, + "learning_rate": 0.00027882761604393065, + "loss": 2.0574, + "step": 8738 + }, + { + "epoch": 1.0196009800490025, + "grad_norm": 1.4056012630462646, + "learning_rate": 0.0002788198345154505, + "loss": 2.1071, + "step": 8739 + }, + { + "epoch": 1.0197176525492941, + "grad_norm": 1.1904430389404297, + "learning_rate": 0.00027881205166705917, + "loss": 2.0168, + "step": 8740 + }, + { + "epoch": 1.0198343250495858, + "grad_norm": 1.1762046813964844, + "learning_rate": 0.00027880426749883727, + "loss": 2.1214, + "step": 8741 + }, + { + "epoch": 1.0199509975498775, + "grad_norm": 1.3238470554351807, + "learning_rate": 0.0002787964820108656, + "loss": 1.9916, + "step": 8742 + }, + { + "epoch": 1.0200676700501692, + "grad_norm": 1.2008336782455444, + "learning_rate": 0.0002787886952032249, + "loss": 2.0568, + "step": 8743 + }, + { + "epoch": 1.0201843425504609, + "grad_norm": 1.3080942630767822, + "learning_rate": 0.00027878090707599576, + "loss": 2.0364, + "step": 8744 + }, + { + "epoch": 1.0203010150507525, + "grad_norm": 1.1778491735458374, + "learning_rate": 0.000278773117629259, + "loss": 2.0253, + "step": 8745 + }, + { + "epoch": 1.0204176875510442, + "grad_norm": 1.1242454051971436, + "learning_rate": 0.0002787653268630954, + "loss": 1.9872, + "step": 8746 + }, + { + "epoch": 1.020534360051336, + "grad_norm": 1.1849719285964966, + "learning_rate": 0.0002787575347775857, + "loss": 1.9505, + "step": 8747 + }, + { + "epoch": 1.0206510325516276, + "grad_norm": 1.1247414350509644, + "learning_rate": 0.0002787497413728106, + "loss": 2.2297, + "step": 8748 + }, + { + "epoch": 1.0207677050519193, + "grad_norm": 1.2348648309707642, + "learning_rate": 0.000278741946648851, + "loss": 2.0617, + "step": 8749 + }, + { + "epoch": 1.020884377552211, + "grad_norm": 1.0942206382751465, + "learning_rate": 0.00027873415060578765, + "loss": 2.0951, + "step": 8750 + }, + { + "epoch": 1.0210010500525026, + "grad_norm": 1.2079945802688599, + "learning_rate": 0.00027872635324370146, + "loss": 1.908, + "step": 8751 + }, + { + "epoch": 1.0211177225527943, + "grad_norm": 1.1874041557312012, + "learning_rate": 0.00027871855456267314, + "loss": 1.9919, + "step": 8752 + }, + { + "epoch": 1.021234395053086, + "grad_norm": 1.1880239248275757, + "learning_rate": 0.00027871075456278356, + "loss": 2.0207, + "step": 8753 + }, + { + "epoch": 1.0213510675533777, + "grad_norm": 1.0912691354751587, + "learning_rate": 0.0002787029532441137, + "loss": 2.1361, + "step": 8754 + }, + { + "epoch": 1.0214677400536694, + "grad_norm": 1.1858954429626465, + "learning_rate": 0.00027869515060674433, + "loss": 2.0395, + "step": 8755 + }, + { + "epoch": 1.021584412553961, + "grad_norm": 1.1813734769821167, + "learning_rate": 0.0002786873466507563, + "loss": 2.03, + "step": 8756 + }, + { + "epoch": 1.0217010850542527, + "grad_norm": 1.4783133268356323, + "learning_rate": 0.0002786795413762306, + "loss": 2.183, + "step": 8757 + }, + { + "epoch": 1.0218177575545444, + "grad_norm": 1.0193562507629395, + "learning_rate": 0.00027867173478324816, + "loss": 1.8178, + "step": 8758 + }, + { + "epoch": 1.021934430054836, + "grad_norm": 1.5181009769439697, + "learning_rate": 0.00027866392687188985, + "loss": 2.0026, + "step": 8759 + }, + { + "epoch": 1.0220511025551278, + "grad_norm": 1.2705698013305664, + "learning_rate": 0.0002786561176422366, + "loss": 2.1675, + "step": 8760 + }, + { + "epoch": 1.0221677750554194, + "grad_norm": 1.017844557762146, + "learning_rate": 0.00027864830709436946, + "loss": 2.0169, + "step": 8761 + }, + { + "epoch": 1.0222844475557111, + "grad_norm": 1.1799933910369873, + "learning_rate": 0.0002786404952283693, + "loss": 2.0925, + "step": 8762 + }, + { + "epoch": 1.0224011200560028, + "grad_norm": 1.3301604986190796, + "learning_rate": 0.0002786326820443172, + "loss": 2.0572, + "step": 8763 + }, + { + "epoch": 1.0225177925562945, + "grad_norm": 1.2164790630340576, + "learning_rate": 0.000278624867542294, + "loss": 2.0458, + "step": 8764 + }, + { + "epoch": 1.0226344650565862, + "grad_norm": 1.2233937978744507, + "learning_rate": 0.00027861705172238093, + "loss": 2.03, + "step": 8765 + }, + { + "epoch": 1.0227511375568779, + "grad_norm": 1.0035635232925415, + "learning_rate": 0.0002786092345846588, + "loss": 1.9713, + "step": 8766 + }, + { + "epoch": 1.0228678100571695, + "grad_norm": 1.1201781034469604, + "learning_rate": 0.00027860141612920885, + "loss": 2.0416, + "step": 8767 + }, + { + "epoch": 1.0229844825574612, + "grad_norm": 1.5173919200897217, + "learning_rate": 0.000278593596356112, + "loss": 2.1707, + "step": 8768 + }, + { + "epoch": 1.023101155057753, + "grad_norm": 1.2531185150146484, + "learning_rate": 0.0002785857752654493, + "loss": 2.1308, + "step": 8769 + }, + { + "epoch": 1.0232178275580446, + "grad_norm": 1.1717915534973145, + "learning_rate": 0.00027857795285730197, + "loss": 2.004, + "step": 8770 + }, + { + "epoch": 1.0233345000583363, + "grad_norm": 1.483586072921753, + "learning_rate": 0.000278570129131751, + "loss": 2.1611, + "step": 8771 + }, + { + "epoch": 1.023451172558628, + "grad_norm": 1.4080744981765747, + "learning_rate": 0.00027856230408887744, + "loss": 2.3301, + "step": 8772 + }, + { + "epoch": 1.0235678450589196, + "grad_norm": 1.4131675958633423, + "learning_rate": 0.00027855447772876256, + "loss": 2.1723, + "step": 8773 + }, + { + "epoch": 1.0236845175592113, + "grad_norm": 1.447327733039856, + "learning_rate": 0.00027854665005148737, + "loss": 2.149, + "step": 8774 + }, + { + "epoch": 1.023801190059503, + "grad_norm": 1.4332095384597778, + "learning_rate": 0.0002785388210571331, + "loss": 2.1006, + "step": 8775 + }, + { + "epoch": 1.0239178625597947, + "grad_norm": 1.294086217880249, + "learning_rate": 0.00027853099074578086, + "loss": 1.9706, + "step": 8776 + }, + { + "epoch": 1.0240345350600863, + "grad_norm": 1.1597617864608765, + "learning_rate": 0.00027852315911751184, + "loss": 2.0558, + "step": 8777 + }, + { + "epoch": 1.024151207560378, + "grad_norm": 1.2465943098068237, + "learning_rate": 0.0002785153261724073, + "loss": 2.0736, + "step": 8778 + }, + { + "epoch": 1.0242678800606697, + "grad_norm": 1.123004674911499, + "learning_rate": 0.00027850749191054837, + "loss": 1.8735, + "step": 8779 + }, + { + "epoch": 1.0243845525609614, + "grad_norm": 1.309453010559082, + "learning_rate": 0.0002784996563320162, + "loss": 2.0806, + "step": 8780 + }, + { + "epoch": 1.024501225061253, + "grad_norm": 1.1564124822616577, + "learning_rate": 0.00027849181943689216, + "loss": 2.1151, + "step": 8781 + }, + { + "epoch": 1.0246178975615448, + "grad_norm": 1.3290754556655884, + "learning_rate": 0.0002784839812252574, + "loss": 1.8857, + "step": 8782 + }, + { + "epoch": 1.0247345700618364, + "grad_norm": 1.1916790008544922, + "learning_rate": 0.00027847614169719325, + "loss": 2.0437, + "step": 8783 + }, + { + "epoch": 1.0248512425621281, + "grad_norm": 1.2640352249145508, + "learning_rate": 0.000278468300852781, + "loss": 2.1923, + "step": 8784 + }, + { + "epoch": 1.0249679150624198, + "grad_norm": 1.211562156677246, + "learning_rate": 0.00027846045869210173, + "loss": 2.0349, + "step": 8785 + }, + { + "epoch": 1.0250845875627115, + "grad_norm": 1.1540685892105103, + "learning_rate": 0.000278452615215237, + "loss": 2.0645, + "step": 8786 + }, + { + "epoch": 1.0252012600630032, + "grad_norm": 1.1853848695755005, + "learning_rate": 0.00027844477042226795, + "loss": 2.0718, + "step": 8787 + }, + { + "epoch": 1.0253179325632948, + "grad_norm": 1.209814429283142, + "learning_rate": 0.000278436924313276, + "loss": 2.047, + "step": 8788 + }, + { + "epoch": 1.0254346050635865, + "grad_norm": 0.982694149017334, + "learning_rate": 0.00027842907688834244, + "loss": 1.9542, + "step": 8789 + }, + { + "epoch": 1.0255512775638782, + "grad_norm": 1.243787407875061, + "learning_rate": 0.0002784212281475486, + "loss": 2.0407, + "step": 8790 + }, + { + "epoch": 1.0256679500641699, + "grad_norm": 1.2239766120910645, + "learning_rate": 0.0002784133780909759, + "loss": 2.1123, + "step": 8791 + }, + { + "epoch": 1.0257846225644616, + "grad_norm": 1.112264633178711, + "learning_rate": 0.0002784055267187057, + "loss": 1.9384, + "step": 8792 + }, + { + "epoch": 1.0259012950647532, + "grad_norm": 0.9526240825653076, + "learning_rate": 0.0002783976740308195, + "loss": 1.928, + "step": 8793 + }, + { + "epoch": 1.026017967565045, + "grad_norm": 1.0406184196472168, + "learning_rate": 0.0002783898200273985, + "loss": 1.9517, + "step": 8794 + }, + { + "epoch": 1.0261346400653366, + "grad_norm": 1.1547185182571411, + "learning_rate": 0.00027838196470852424, + "loss": 2.1137, + "step": 8795 + }, + { + "epoch": 1.0262513125656283, + "grad_norm": 1.0774778127670288, + "learning_rate": 0.0002783741080742782, + "loss": 2.0562, + "step": 8796 + }, + { + "epoch": 1.02636798506592, + "grad_norm": 1.0546610355377197, + "learning_rate": 0.00027836625012474177, + "loss": 1.8149, + "step": 8797 + }, + { + "epoch": 1.0264846575662117, + "grad_norm": 1.1922143697738647, + "learning_rate": 0.0002783583908599964, + "loss": 2.0281, + "step": 8798 + }, + { + "epoch": 1.0266013300665033, + "grad_norm": 1.1093765497207642, + "learning_rate": 0.00027835053028012354, + "loss": 2.0025, + "step": 8799 + }, + { + "epoch": 1.026718002566795, + "grad_norm": 1.2224929332733154, + "learning_rate": 0.00027834266838520477, + "loss": 2.1267, + "step": 8800 + }, + { + "epoch": 1.0268346750670867, + "grad_norm": 1.0353941917419434, + "learning_rate": 0.0002783348051753215, + "loss": 2.0022, + "step": 8801 + }, + { + "epoch": 1.0269513475673784, + "grad_norm": 1.0847564935684204, + "learning_rate": 0.00027832694065055535, + "loss": 2.1527, + "step": 8802 + }, + { + "epoch": 1.02706802006767, + "grad_norm": 1.2783128023147583, + "learning_rate": 0.00027831907481098777, + "loss": 2.0128, + "step": 8803 + }, + { + "epoch": 1.0271846925679617, + "grad_norm": 1.1306607723236084, + "learning_rate": 0.0002783112076567003, + "loss": 2.1643, + "step": 8804 + }, + { + "epoch": 1.0273013650682534, + "grad_norm": 1.112945795059204, + "learning_rate": 0.0002783033391877746, + "loss": 2.022, + "step": 8805 + }, + { + "epoch": 1.027418037568545, + "grad_norm": 1.1831871271133423, + "learning_rate": 0.0002782954694042921, + "loss": 2.0637, + "step": 8806 + }, + { + "epoch": 1.0275347100688368, + "grad_norm": 1.1518025398254395, + "learning_rate": 0.0002782875983063345, + "loss": 2.037, + "step": 8807 + }, + { + "epoch": 1.0276513825691285, + "grad_norm": 1.1754765510559082, + "learning_rate": 0.0002782797258939833, + "loss": 2.0424, + "step": 8808 + }, + { + "epoch": 1.0277680550694202, + "grad_norm": 1.117531657218933, + "learning_rate": 0.0002782718521673202, + "loss": 2.0316, + "step": 8809 + }, + { + "epoch": 1.0278847275697118, + "grad_norm": 1.1303967237472534, + "learning_rate": 0.00027826397712642676, + "loss": 2.0507, + "step": 8810 + }, + { + "epoch": 1.0280014000700035, + "grad_norm": 1.2230921983718872, + "learning_rate": 0.00027825610077138467, + "loss": 1.9692, + "step": 8811 + }, + { + "epoch": 1.0281180725702952, + "grad_norm": 1.2521675825119019, + "learning_rate": 0.00027824822310227557, + "loss": 2.1673, + "step": 8812 + }, + { + "epoch": 1.0282347450705869, + "grad_norm": 1.105074167251587, + "learning_rate": 0.0002782403441191811, + "loss": 2.1281, + "step": 8813 + }, + { + "epoch": 1.0283514175708786, + "grad_norm": 1.1370817422866821, + "learning_rate": 0.000278232463822183, + "loss": 2.1081, + "step": 8814 + }, + { + "epoch": 1.0284680900711702, + "grad_norm": 1.293664813041687, + "learning_rate": 0.00027822458221136287, + "loss": 1.8977, + "step": 8815 + }, + { + "epoch": 1.028584762571462, + "grad_norm": 1.1958829164505005, + "learning_rate": 0.00027821669928680253, + "loss": 2.1307, + "step": 8816 + }, + { + "epoch": 1.0287014350717536, + "grad_norm": 1.7740507125854492, + "learning_rate": 0.00027820881504858364, + "loss": 2.0273, + "step": 8817 + }, + { + "epoch": 1.0288181075720453, + "grad_norm": 1.0974676609039307, + "learning_rate": 0.00027820092949678785, + "loss": 2.1309, + "step": 8818 + }, + { + "epoch": 1.028934780072337, + "grad_norm": 1.194632887840271, + "learning_rate": 0.00027819304263149706, + "loss": 2.0473, + "step": 8819 + }, + { + "epoch": 1.0290514525726286, + "grad_norm": 1.09019935131073, + "learning_rate": 0.000278185154452793, + "loss": 1.9587, + "step": 8820 + }, + { + "epoch": 1.0291681250729203, + "grad_norm": 1.153159499168396, + "learning_rate": 0.0002781772649607574, + "loss": 2.0123, + "step": 8821 + }, + { + "epoch": 1.029284797573212, + "grad_norm": 1.256695032119751, + "learning_rate": 0.000278169374155472, + "loss": 2.1232, + "step": 8822 + }, + { + "epoch": 1.0294014700735037, + "grad_norm": 1.3417302370071411, + "learning_rate": 0.0002781614820370188, + "loss": 2.1074, + "step": 8823 + }, + { + "epoch": 1.0295181425737954, + "grad_norm": 1.1904597282409668, + "learning_rate": 0.0002781535886054794, + "loss": 2.0094, + "step": 8824 + }, + { + "epoch": 1.029634815074087, + "grad_norm": 0.9488178491592407, + "learning_rate": 0.0002781456938609357, + "loss": 1.7812, + "step": 8825 + }, + { + "epoch": 1.0297514875743787, + "grad_norm": 1.107558250427246, + "learning_rate": 0.0002781377978034696, + "loss": 2.1204, + "step": 8826 + }, + { + "epoch": 1.0298681600746704, + "grad_norm": 1.067975640296936, + "learning_rate": 0.0002781299004331629, + "loss": 2.0235, + "step": 8827 + }, + { + "epoch": 1.029984832574962, + "grad_norm": 1.2171003818511963, + "learning_rate": 0.00027812200175009747, + "loss": 2.2239, + "step": 8828 + }, + { + "epoch": 1.0301015050752538, + "grad_norm": 1.0428277254104614, + "learning_rate": 0.0002781141017543552, + "loss": 1.9706, + "step": 8829 + }, + { + "epoch": 1.0302181775755455, + "grad_norm": 1.1872632503509521, + "learning_rate": 0.0002781062004460181, + "loss": 2.1527, + "step": 8830 + }, + { + "epoch": 1.0303348500758371, + "grad_norm": 1.115568995475769, + "learning_rate": 0.00027809829782516785, + "loss": 1.9521, + "step": 8831 + }, + { + "epoch": 1.0304515225761288, + "grad_norm": 0.993392288684845, + "learning_rate": 0.0002780903938918866, + "loss": 2.0237, + "step": 8832 + }, + { + "epoch": 1.0305681950764205, + "grad_norm": 1.182478427886963, + "learning_rate": 0.0002780824886462562, + "loss": 1.9785, + "step": 8833 + }, + { + "epoch": 1.0306848675767122, + "grad_norm": 1.1045560836791992, + "learning_rate": 0.0002780745820883586, + "loss": 2.0768, + "step": 8834 + }, + { + "epoch": 1.0308015400770039, + "grad_norm": 1.3310736417770386, + "learning_rate": 0.0002780666742182757, + "loss": 2.1291, + "step": 8835 + }, + { + "epoch": 1.0309182125772955, + "grad_norm": 1.0393128395080566, + "learning_rate": 0.00027805876503608957, + "loss": 1.8639, + "step": 8836 + }, + { + "epoch": 1.0310348850775872, + "grad_norm": 1.276726245880127, + "learning_rate": 0.00027805085454188225, + "loss": 2.1164, + "step": 8837 + }, + { + "epoch": 1.031151557577879, + "grad_norm": 1.0570024251937866, + "learning_rate": 0.0002780429427357356, + "loss": 1.8319, + "step": 8838 + }, + { + "epoch": 1.0312682300781706, + "grad_norm": 1.2614045143127441, + "learning_rate": 0.00027803502961773176, + "loss": 1.8881, + "step": 8839 + }, + { + "epoch": 1.0313849025784623, + "grad_norm": 1.1903163194656372, + "learning_rate": 0.00027802711518795274, + "loss": 1.9678, + "step": 8840 + }, + { + "epoch": 1.031501575078754, + "grad_norm": 1.2618528604507446, + "learning_rate": 0.0002780191994464805, + "loss": 1.9768, + "step": 8841 + }, + { + "epoch": 1.0316182475790456, + "grad_norm": 1.192734718322754, + "learning_rate": 0.0002780112823933973, + "loss": 2.0685, + "step": 8842 + }, + { + "epoch": 1.0317349200793373, + "grad_norm": 1.0661274194717407, + "learning_rate": 0.00027800336402878495, + "loss": 1.7927, + "step": 8843 + }, + { + "epoch": 1.031851592579629, + "grad_norm": 1.1246366500854492, + "learning_rate": 0.00027799544435272574, + "loss": 2.0657, + "step": 8844 + }, + { + "epoch": 1.0319682650799207, + "grad_norm": 1.2906720638275146, + "learning_rate": 0.0002779875233653017, + "loss": 2.1464, + "step": 8845 + }, + { + "epoch": 1.0320849375802124, + "grad_norm": 1.2213131189346313, + "learning_rate": 0.000277979601066595, + "loss": 2.0448, + "step": 8846 + }, + { + "epoch": 1.032201610080504, + "grad_norm": 1.305462121963501, + "learning_rate": 0.0002779716774566877, + "loss": 2.0338, + "step": 8847 + }, + { + "epoch": 1.0323182825807957, + "grad_norm": 1.349643588066101, + "learning_rate": 0.000277963752535662, + "loss": 2.1261, + "step": 8848 + }, + { + "epoch": 1.0324349550810874, + "grad_norm": 1.3503611087799072, + "learning_rate": 0.00027795582630359996, + "loss": 2.1481, + "step": 8849 + }, + { + "epoch": 1.032551627581379, + "grad_norm": 1.0370204448699951, + "learning_rate": 0.0002779478987605838, + "loss": 1.9435, + "step": 8850 + }, + { + "epoch": 1.0326683000816708, + "grad_norm": 1.2538262605667114, + "learning_rate": 0.00027793996990669586, + "loss": 1.9875, + "step": 8851 + }, + { + "epoch": 1.0327849725819624, + "grad_norm": 1.2269270420074463, + "learning_rate": 0.00027793203974201805, + "loss": 2.2583, + "step": 8852 + }, + { + "epoch": 1.0329016450822541, + "grad_norm": 1.3321806192398071, + "learning_rate": 0.00027792410826663283, + "loss": 2.1852, + "step": 8853 + }, + { + "epoch": 1.0330183175825458, + "grad_norm": 1.330703616142273, + "learning_rate": 0.00027791617548062223, + "loss": 1.9708, + "step": 8854 + }, + { + "epoch": 1.0331349900828375, + "grad_norm": 1.102867603302002, + "learning_rate": 0.00027790824138406865, + "loss": 2.0995, + "step": 8855 + }, + { + "epoch": 1.0332516625831292, + "grad_norm": 1.3331435918807983, + "learning_rate": 0.0002779003059770543, + "loss": 2.146, + "step": 8856 + }, + { + "epoch": 1.0333683350834209, + "grad_norm": 1.2715052366256714, + "learning_rate": 0.00027789236925966133, + "loss": 1.9581, + "step": 8857 + }, + { + "epoch": 1.0334850075837125, + "grad_norm": 1.2429187297821045, + "learning_rate": 0.0002778844312319722, + "loss": 2.0015, + "step": 8858 + }, + { + "epoch": 1.0336016800840042, + "grad_norm": 1.218944787979126, + "learning_rate": 0.000277876491894069, + "loss": 2.072, + "step": 8859 + }, + { + "epoch": 1.033718352584296, + "grad_norm": 0.9513140320777893, + "learning_rate": 0.0002778685512460343, + "loss": 1.7706, + "step": 8860 + }, + { + "epoch": 1.0338350250845876, + "grad_norm": 1.1809306144714355, + "learning_rate": 0.0002778606092879501, + "loss": 1.9906, + "step": 8861 + }, + { + "epoch": 1.0339516975848793, + "grad_norm": 1.135770320892334, + "learning_rate": 0.00027785266601989904, + "loss": 2.2153, + "step": 8862 + }, + { + "epoch": 1.034068370085171, + "grad_norm": 1.0926520824432373, + "learning_rate": 0.00027784472144196326, + "loss": 1.7847, + "step": 8863 + }, + { + "epoch": 1.0341850425854626, + "grad_norm": 1.1619987487792969, + "learning_rate": 0.0002778367755542252, + "loss": 2.0753, + "step": 8864 + }, + { + "epoch": 1.0343017150857543, + "grad_norm": 1.26513671875, + "learning_rate": 0.0002778288283567673, + "loss": 2.0008, + "step": 8865 + }, + { + "epoch": 1.034418387586046, + "grad_norm": 1.1973721981048584, + "learning_rate": 0.00027782087984967176, + "loss": 2.0445, + "step": 8866 + }, + { + "epoch": 1.0345350600863377, + "grad_norm": 1.0169360637664795, + "learning_rate": 0.00027781293003302114, + "loss": 2.0324, + "step": 8867 + }, + { + "epoch": 1.0346517325866293, + "grad_norm": 1.256615161895752, + "learning_rate": 0.00027780497890689775, + "loss": 2.145, + "step": 8868 + }, + { + "epoch": 1.034768405086921, + "grad_norm": 1.1990654468536377, + "learning_rate": 0.00027779702647138415, + "loss": 1.9416, + "step": 8869 + }, + { + "epoch": 1.0348850775872127, + "grad_norm": 1.3305789232254028, + "learning_rate": 0.0002777890727265627, + "loss": 1.9911, + "step": 8870 + }, + { + "epoch": 1.0350017500875044, + "grad_norm": 1.1946964263916016, + "learning_rate": 0.00027778111767251583, + "loss": 2.1024, + "step": 8871 + }, + { + "epoch": 1.035118422587796, + "grad_norm": 1.2459850311279297, + "learning_rate": 0.000277773161309326, + "loss": 2.2982, + "step": 8872 + }, + { + "epoch": 1.0352350950880878, + "grad_norm": 1.3698877096176147, + "learning_rate": 0.0002777652036370758, + "loss": 2.0948, + "step": 8873 + }, + { + "epoch": 1.0353517675883794, + "grad_norm": 1.1233516931533813, + "learning_rate": 0.00027775724465584767, + "loss": 2.0428, + "step": 8874 + }, + { + "epoch": 1.0354684400886711, + "grad_norm": 1.2948588132858276, + "learning_rate": 0.0002777492843657241, + "loss": 2.0332, + "step": 8875 + }, + { + "epoch": 1.0355851125889628, + "grad_norm": 1.2970420122146606, + "learning_rate": 0.00027774132276678766, + "loss": 2.1698, + "step": 8876 + }, + { + "epoch": 1.0357017850892545, + "grad_norm": 1.2025597095489502, + "learning_rate": 0.00027773335985912075, + "loss": 2.0361, + "step": 8877 + }, + { + "epoch": 1.0358184575895462, + "grad_norm": 1.3240916728973389, + "learning_rate": 0.0002777253956428061, + "loss": 2.2815, + "step": 8878 + }, + { + "epoch": 1.0359351300898378, + "grad_norm": 1.2047933340072632, + "learning_rate": 0.00027771743011792616, + "loss": 1.9722, + "step": 8879 + }, + { + "epoch": 1.0360518025901295, + "grad_norm": 1.3737297058105469, + "learning_rate": 0.00027770946328456355, + "loss": 2.1726, + "step": 8880 + }, + { + "epoch": 1.0361684750904212, + "grad_norm": 1.0671954154968262, + "learning_rate": 0.0002777014951428009, + "loss": 1.9008, + "step": 8881 + }, + { + "epoch": 1.0362851475907129, + "grad_norm": 1.3361129760742188, + "learning_rate": 0.0002776935256927207, + "loss": 2.2353, + "step": 8882 + }, + { + "epoch": 1.0364018200910046, + "grad_norm": 0.9833676218986511, + "learning_rate": 0.00027768555493440573, + "loss": 1.8882, + "step": 8883 + }, + { + "epoch": 1.0365184925912962, + "grad_norm": 1.0997051000595093, + "learning_rate": 0.0002776775828679385, + "loss": 1.871, + "step": 8884 + }, + { + "epoch": 1.036635165091588, + "grad_norm": 1.1071906089782715, + "learning_rate": 0.0002776696094934017, + "loss": 2.1216, + "step": 8885 + }, + { + "epoch": 1.0367518375918796, + "grad_norm": 1.0244262218475342, + "learning_rate": 0.00027766163481087787, + "loss": 2.0352, + "step": 8886 + }, + { + "epoch": 1.0368685100921713, + "grad_norm": 1.0871492624282837, + "learning_rate": 0.0002776536588204499, + "loss": 1.9954, + "step": 8887 + }, + { + "epoch": 1.036985182592463, + "grad_norm": 1.1262189149856567, + "learning_rate": 0.0002776456815222003, + "loss": 2.0455, + "step": 8888 + }, + { + "epoch": 1.0371018550927547, + "grad_norm": 1.1773192882537842, + "learning_rate": 0.0002776377029162119, + "loss": 2.0536, + "step": 8889 + }, + { + "epoch": 1.0372185275930463, + "grad_norm": 1.2335513830184937, + "learning_rate": 0.00027762972300256726, + "loss": 2.2227, + "step": 8890 + }, + { + "epoch": 1.037335200093338, + "grad_norm": 1.5021415948867798, + "learning_rate": 0.00027762174178134927, + "loss": 1.9944, + "step": 8891 + }, + { + "epoch": 1.0374518725936297, + "grad_norm": 1.2339730262756348, + "learning_rate": 0.00027761375925264054, + "loss": 2.0878, + "step": 8892 + }, + { + "epoch": 1.0375685450939214, + "grad_norm": 1.3152306079864502, + "learning_rate": 0.00027760577541652396, + "loss": 2.0962, + "step": 8893 + }, + { + "epoch": 1.037685217594213, + "grad_norm": 1.2403055429458618, + "learning_rate": 0.00027759779027308214, + "loss": 2.1887, + "step": 8894 + }, + { + "epoch": 1.0378018900945047, + "grad_norm": 1.3205935955047607, + "learning_rate": 0.0002775898038223979, + "loss": 1.9717, + "step": 8895 + }, + { + "epoch": 1.0379185625947964, + "grad_norm": 1.1838940382003784, + "learning_rate": 0.0002775818160645542, + "loss": 2.0544, + "step": 8896 + }, + { + "epoch": 1.038035235095088, + "grad_norm": 1.168263554573059, + "learning_rate": 0.0002775738269996337, + "loss": 2.0843, + "step": 8897 + }, + { + "epoch": 1.0381519075953798, + "grad_norm": 1.283225655555725, + "learning_rate": 0.00027756583662771916, + "loss": 2.1587, + "step": 8898 + }, + { + "epoch": 1.0382685800956715, + "grad_norm": 1.2145400047302246, + "learning_rate": 0.00027755784494889357, + "loss": 1.9828, + "step": 8899 + }, + { + "epoch": 1.0383852525959631, + "grad_norm": 1.2799615859985352, + "learning_rate": 0.0002775498519632397, + "loss": 2.135, + "step": 8900 + }, + { + "epoch": 1.0385019250962548, + "grad_norm": 1.664087176322937, + "learning_rate": 0.00027754185767084035, + "loss": 2.1001, + "step": 8901 + }, + { + "epoch": 1.0386185975965465, + "grad_norm": 1.266829252243042, + "learning_rate": 0.00027753386207177855, + "loss": 2.1186, + "step": 8902 + }, + { + "epoch": 1.0387352700968382, + "grad_norm": 1.201251745223999, + "learning_rate": 0.000277525865166137, + "loss": 2.1533, + "step": 8903 + }, + { + "epoch": 1.0388519425971299, + "grad_norm": 1.3261210918426514, + "learning_rate": 0.0002775178669539988, + "loss": 2.2365, + "step": 8904 + }, + { + "epoch": 1.0389686150974216, + "grad_norm": 1.0947729349136353, + "learning_rate": 0.00027750986743544677, + "loss": 2.0664, + "step": 8905 + }, + { + "epoch": 1.0390852875977132, + "grad_norm": 1.216394305229187, + "learning_rate": 0.0002775018666105638, + "loss": 2.0952, + "step": 8906 + }, + { + "epoch": 1.039201960098005, + "grad_norm": 1.205283522605896, + "learning_rate": 0.00027749386447943294, + "loss": 2.0567, + "step": 8907 + }, + { + "epoch": 1.0393186325982966, + "grad_norm": 1.1028059720993042, + "learning_rate": 0.00027748586104213695, + "loss": 2.011, + "step": 8908 + }, + { + "epoch": 1.0394353050985883, + "grad_norm": 1.150165319442749, + "learning_rate": 0.0002774778562987591, + "loss": 2.0012, + "step": 8909 + }, + { + "epoch": 1.03955197759888, + "grad_norm": 1.2839062213897705, + "learning_rate": 0.00027746985024938215, + "loss": 1.9573, + "step": 8910 + }, + { + "epoch": 1.0396686500991716, + "grad_norm": 1.2829228639602661, + "learning_rate": 0.00027746184289408917, + "loss": 2.0974, + "step": 8911 + }, + { + "epoch": 1.0397853225994633, + "grad_norm": 1.161543607711792, + "learning_rate": 0.00027745383423296313, + "loss": 2.194, + "step": 8912 + }, + { + "epoch": 1.039901995099755, + "grad_norm": 1.4036509990692139, + "learning_rate": 0.0002774458242660871, + "loss": 2.1507, + "step": 8913 + }, + { + "epoch": 1.0400186676000467, + "grad_norm": 1.3956220149993896, + "learning_rate": 0.0002774378129935441, + "loss": 2.1973, + "step": 8914 + }, + { + "epoch": 1.0401353401003384, + "grad_norm": 1.2576677799224854, + "learning_rate": 0.0002774298004154172, + "loss": 2.024, + "step": 8915 + }, + { + "epoch": 1.04025201260063, + "grad_norm": 1.2527408599853516, + "learning_rate": 0.0002774217865317894, + "loss": 2.1662, + "step": 8916 + }, + { + "epoch": 1.0403686851009217, + "grad_norm": 1.1583929061889648, + "learning_rate": 0.0002774137713427439, + "loss": 2.0481, + "step": 8917 + }, + { + "epoch": 1.0404853576012134, + "grad_norm": 1.2582197189331055, + "learning_rate": 0.00027740575484836366, + "loss": 2.0597, + "step": 8918 + }, + { + "epoch": 1.040602030101505, + "grad_norm": 1.1362075805664062, + "learning_rate": 0.0002773977370487319, + "loss": 2.0777, + "step": 8919 + }, + { + "epoch": 1.0407187026017968, + "grad_norm": 1.2490930557250977, + "learning_rate": 0.0002773897179439317, + "loss": 2.1727, + "step": 8920 + }, + { + "epoch": 1.0408353751020885, + "grad_norm": 1.4472897052764893, + "learning_rate": 0.00027738169753404614, + "loss": 2.164, + "step": 8921 + }, + { + "epoch": 1.0409520476023801, + "grad_norm": 1.3365968465805054, + "learning_rate": 0.0002773736758191584, + "loss": 2.0504, + "step": 8922 + }, + { + "epoch": 1.0410687201026718, + "grad_norm": 1.1817964315414429, + "learning_rate": 0.0002773656527993517, + "loss": 2.0875, + "step": 8923 + }, + { + "epoch": 1.0411853926029635, + "grad_norm": 1.1957645416259766, + "learning_rate": 0.0002773576284747091, + "loss": 2.1509, + "step": 8924 + }, + { + "epoch": 1.0413020651032552, + "grad_norm": 1.326164960861206, + "learning_rate": 0.00027734960284531385, + "loss": 2.1602, + "step": 8925 + }, + { + "epoch": 1.0414187376035469, + "grad_norm": 1.265036940574646, + "learning_rate": 0.00027734157591124926, + "loss": 1.8274, + "step": 8926 + }, + { + "epoch": 1.0415354101038385, + "grad_norm": 1.2169567346572876, + "learning_rate": 0.0002773335476725983, + "loss": 2.073, + "step": 8927 + }, + { + "epoch": 1.0416520826041302, + "grad_norm": 1.1108307838439941, + "learning_rate": 0.00027732551812944444, + "loss": 2.1616, + "step": 8928 + }, + { + "epoch": 1.041768755104422, + "grad_norm": 1.0887714624404907, + "learning_rate": 0.0002773174872818707, + "loss": 1.9776, + "step": 8929 + }, + { + "epoch": 1.0418854276047136, + "grad_norm": 1.16782808303833, + "learning_rate": 0.00027730945512996057, + "loss": 2.1781, + "step": 8930 + }, + { + "epoch": 1.0420021001050053, + "grad_norm": 1.181111216545105, + "learning_rate": 0.0002773014216737971, + "loss": 2.027, + "step": 8931 + }, + { + "epoch": 1.042118772605297, + "grad_norm": 1.238542079925537, + "learning_rate": 0.00027729338691346374, + "loss": 1.9652, + "step": 8932 + }, + { + "epoch": 1.0422354451055886, + "grad_norm": 1.4443281888961792, + "learning_rate": 0.00027728535084904366, + "loss": 2.1366, + "step": 8933 + }, + { + "epoch": 1.0423521176058803, + "grad_norm": 1.1644365787506104, + "learning_rate": 0.00027727731348062024, + "loss": 2.0173, + "step": 8934 + }, + { + "epoch": 1.042468790106172, + "grad_norm": 1.293247938156128, + "learning_rate": 0.00027726927480827685, + "loss": 2.1986, + "step": 8935 + }, + { + "epoch": 1.0425854626064637, + "grad_norm": 1.1435471773147583, + "learning_rate": 0.0002772612348320967, + "loss": 2.1253, + "step": 8936 + }, + { + "epoch": 1.0427021351067554, + "grad_norm": 1.1872448921203613, + "learning_rate": 0.0002772531935521632, + "loss": 1.9765, + "step": 8937 + }, + { + "epoch": 1.042818807607047, + "grad_norm": 1.0692615509033203, + "learning_rate": 0.0002772451509685597, + "loss": 1.9452, + "step": 8938 + }, + { + "epoch": 1.0429354801073387, + "grad_norm": 1.2560431957244873, + "learning_rate": 0.00027723710708136965, + "loss": 2.1579, + "step": 8939 + }, + { + "epoch": 1.0430521526076304, + "grad_norm": 1.1176836490631104, + "learning_rate": 0.0002772290618906763, + "loss": 2.1432, + "step": 8940 + }, + { + "epoch": 1.043168825107922, + "grad_norm": 1.2016528844833374, + "learning_rate": 0.0002772210153965633, + "loss": 2.2028, + "step": 8941 + }, + { + "epoch": 1.0432854976082138, + "grad_norm": 1.16740882396698, + "learning_rate": 0.0002772129675991137, + "loss": 1.9077, + "step": 8942 + }, + { + "epoch": 1.0434021701085054, + "grad_norm": 1.1940975189208984, + "learning_rate": 0.00027720491849841125, + "loss": 2.1942, + "step": 8943 + }, + { + "epoch": 1.0435188426087971, + "grad_norm": 1.209976315498352, + "learning_rate": 0.00027719686809453924, + "loss": 2.1464, + "step": 8944 + }, + { + "epoch": 1.0436355151090888, + "grad_norm": 1.1096906661987305, + "learning_rate": 0.0002771888163875812, + "loss": 2.0647, + "step": 8945 + }, + { + "epoch": 1.0437521876093805, + "grad_norm": 1.0400429964065552, + "learning_rate": 0.00027718076337762053, + "loss": 1.8875, + "step": 8946 + }, + { + "epoch": 1.0438688601096722, + "grad_norm": 1.2014433145523071, + "learning_rate": 0.0002771727090647408, + "loss": 1.9623, + "step": 8947 + }, + { + "epoch": 1.0439855326099639, + "grad_norm": 1.2250697612762451, + "learning_rate": 0.00027716465344902546, + "loss": 2.1068, + "step": 8948 + }, + { + "epoch": 1.0441022051102555, + "grad_norm": 1.4024370908737183, + "learning_rate": 0.00027715659653055794, + "loss": 2.2691, + "step": 8949 + }, + { + "epoch": 1.0442188776105472, + "grad_norm": 1.068717122077942, + "learning_rate": 0.00027714853830942195, + "loss": 1.9543, + "step": 8950 + }, + { + "epoch": 1.044335550110839, + "grad_norm": 1.0595226287841797, + "learning_rate": 0.00027714047878570086, + "loss": 2.0387, + "step": 8951 + }, + { + "epoch": 1.0444522226111306, + "grad_norm": 1.2328404188156128, + "learning_rate": 0.00027713241795947825, + "loss": 1.9968, + "step": 8952 + }, + { + "epoch": 1.0445688951114223, + "grad_norm": 1.4256110191345215, + "learning_rate": 0.0002771243558308378, + "loss": 2.0785, + "step": 8953 + }, + { + "epoch": 1.044685567611714, + "grad_norm": 1.1378482580184937, + "learning_rate": 0.000277116292399863, + "loss": 2.2506, + "step": 8954 + }, + { + "epoch": 1.0448022401120056, + "grad_norm": 1.1652264595031738, + "learning_rate": 0.0002771082276666374, + "loss": 2.1268, + "step": 8955 + }, + { + "epoch": 1.0449189126122973, + "grad_norm": 1.2726528644561768, + "learning_rate": 0.00027710016163124475, + "loss": 2.1857, + "step": 8956 + }, + { + "epoch": 1.045035585112589, + "grad_norm": 1.290517807006836, + "learning_rate": 0.00027709209429376854, + "loss": 2.1506, + "step": 8957 + }, + { + "epoch": 1.0451522576128807, + "grad_norm": 1.224327802658081, + "learning_rate": 0.0002770840256542924, + "loss": 2.1277, + "step": 8958 + }, + { + "epoch": 1.0452689301131723, + "grad_norm": 1.0330942869186401, + "learning_rate": 0.0002770759557129001, + "loss": 1.958, + "step": 8959 + }, + { + "epoch": 1.045385602613464, + "grad_norm": 1.3692070245742798, + "learning_rate": 0.00027706788446967516, + "loss": 2.0152, + "step": 8960 + }, + { + "epoch": 1.0455022751137557, + "grad_norm": 1.0713632106781006, + "learning_rate": 0.0002770598119247013, + "loss": 2.0926, + "step": 8961 + }, + { + "epoch": 1.0456189476140474, + "grad_norm": 1.0386263132095337, + "learning_rate": 0.0002770517380780623, + "loss": 2.2134, + "step": 8962 + }, + { + "epoch": 1.045735620114339, + "grad_norm": 1.3348023891448975, + "learning_rate": 0.00027704366292984176, + "loss": 2.0264, + "step": 8963 + }, + { + "epoch": 1.0458522926146308, + "grad_norm": 1.2625501155853271, + "learning_rate": 0.0002770355864801234, + "loss": 2.1812, + "step": 8964 + }, + { + "epoch": 1.0459689651149224, + "grad_norm": 1.228943109512329, + "learning_rate": 0.00027702750872899096, + "loss": 2.0686, + "step": 8965 + }, + { + "epoch": 1.0460856376152141, + "grad_norm": 1.2153180837631226, + "learning_rate": 0.0002770194296765282, + "loss": 1.8444, + "step": 8966 + }, + { + "epoch": 1.0462023101155058, + "grad_norm": 1.1364995241165161, + "learning_rate": 0.0002770113493228188, + "loss": 2.0097, + "step": 8967 + }, + { + "epoch": 1.0463189826157975, + "grad_norm": 1.0647897720336914, + "learning_rate": 0.0002770032676679467, + "loss": 2.0114, + "step": 8968 + }, + { + "epoch": 1.0464356551160892, + "grad_norm": 1.1720366477966309, + "learning_rate": 0.00027699518471199547, + "loss": 2.0376, + "step": 8969 + }, + { + "epoch": 1.0465523276163808, + "grad_norm": 1.1063640117645264, + "learning_rate": 0.00027698710045504903, + "loss": 1.9752, + "step": 8970 + }, + { + "epoch": 1.0466690001166725, + "grad_norm": 1.184598684310913, + "learning_rate": 0.0002769790148971912, + "loss": 2.0578, + "step": 8971 + }, + { + "epoch": 1.0467856726169642, + "grad_norm": 1.262190341949463, + "learning_rate": 0.00027697092803850577, + "loss": 2.0587, + "step": 8972 + }, + { + "epoch": 1.0469023451172559, + "grad_norm": 1.251043438911438, + "learning_rate": 0.00027696283987907656, + "loss": 2.0963, + "step": 8973 + }, + { + "epoch": 1.0470190176175476, + "grad_norm": 1.0949803590774536, + "learning_rate": 0.0002769547504189874, + "loss": 1.7875, + "step": 8974 + }, + { + "epoch": 1.0471356901178392, + "grad_norm": 1.123026967048645, + "learning_rate": 0.00027694665965832224, + "loss": 1.9572, + "step": 8975 + }, + { + "epoch": 1.047252362618131, + "grad_norm": 1.200104832649231, + "learning_rate": 0.0002769385675971648, + "loss": 2.1037, + "step": 8976 + }, + { + "epoch": 1.0473690351184226, + "grad_norm": 1.2237775325775146, + "learning_rate": 0.0002769304742355992, + "loss": 2.0386, + "step": 8977 + }, + { + "epoch": 1.0474857076187143, + "grad_norm": 1.1523629426956177, + "learning_rate": 0.00027692237957370914, + "loss": 1.9032, + "step": 8978 + }, + { + "epoch": 1.047602380119006, + "grad_norm": 1.1545665264129639, + "learning_rate": 0.0002769142836115786, + "loss": 1.9605, + "step": 8979 + }, + { + "epoch": 1.0477190526192977, + "grad_norm": 1.173844575881958, + "learning_rate": 0.00027690618634929157, + "loss": 1.997, + "step": 8980 + }, + { + "epoch": 1.0478357251195893, + "grad_norm": 1.1707327365875244, + "learning_rate": 0.0002768980877869319, + "loss": 1.9861, + "step": 8981 + }, + { + "epoch": 1.047952397619881, + "grad_norm": 1.1342962980270386, + "learning_rate": 0.0002768899879245836, + "loss": 2.0819, + "step": 8982 + }, + { + "epoch": 1.0480690701201727, + "grad_norm": 1.2173380851745605, + "learning_rate": 0.0002768818867623306, + "loss": 2.0313, + "step": 8983 + }, + { + "epoch": 1.0481857426204644, + "grad_norm": 1.255187749862671, + "learning_rate": 0.00027687378430025697, + "loss": 2.1834, + "step": 8984 + }, + { + "epoch": 1.048302415120756, + "grad_norm": 1.1390645503997803, + "learning_rate": 0.0002768656805384466, + "loss": 2.0394, + "step": 8985 + }, + { + "epoch": 1.0484190876210477, + "grad_norm": 1.210516095161438, + "learning_rate": 0.0002768575754769836, + "loss": 2.0533, + "step": 8986 + }, + { + "epoch": 1.0485357601213394, + "grad_norm": 1.280390977859497, + "learning_rate": 0.0002768494691159519, + "loss": 1.9776, + "step": 8987 + }, + { + "epoch": 1.048652432621631, + "grad_norm": 1.2039356231689453, + "learning_rate": 0.00027684136145543557, + "loss": 2.0723, + "step": 8988 + }, + { + "epoch": 1.0487691051219228, + "grad_norm": 1.1697885990142822, + "learning_rate": 0.0002768332524955187, + "loss": 1.8443, + "step": 8989 + }, + { + "epoch": 1.0488857776222145, + "grad_norm": 1.1126084327697754, + "learning_rate": 0.0002768251422362854, + "loss": 1.9315, + "step": 8990 + }, + { + "epoch": 1.0490024501225061, + "grad_norm": 1.1231951713562012, + "learning_rate": 0.0002768170306778196, + "loss": 2.0083, + "step": 8991 + }, + { + "epoch": 1.0491191226227978, + "grad_norm": 1.3342986106872559, + "learning_rate": 0.0002768089178202055, + "loss": 2.073, + "step": 8992 + }, + { + "epoch": 1.0492357951230895, + "grad_norm": 1.3511295318603516, + "learning_rate": 0.00027680080366352717, + "loss": 2.1173, + "step": 8993 + }, + { + "epoch": 1.0493524676233812, + "grad_norm": 1.0262317657470703, + "learning_rate": 0.0002767926882078687, + "loss": 1.883, + "step": 8994 + }, + { + "epoch": 1.0494691401236729, + "grad_norm": 1.3445383310317993, + "learning_rate": 0.0002767845714533143, + "loss": 2.151, + "step": 8995 + }, + { + "epoch": 1.0495858126239646, + "grad_norm": 1.312991738319397, + "learning_rate": 0.000276776453399948, + "loss": 2.2777, + "step": 8996 + }, + { + "epoch": 1.0497024851242562, + "grad_norm": 1.090330958366394, + "learning_rate": 0.0002767683340478541, + "loss": 2.0759, + "step": 8997 + }, + { + "epoch": 1.049819157624548, + "grad_norm": 1.106294870376587, + "learning_rate": 0.0002767602133971167, + "loss": 1.9552, + "step": 8998 + }, + { + "epoch": 1.0499358301248396, + "grad_norm": 1.2627832889556885, + "learning_rate": 0.00027675209144782, + "loss": 1.9626, + "step": 8999 + }, + { + "epoch": 1.0500525026251313, + "grad_norm": 1.3095756769180298, + "learning_rate": 0.0002767439682000481, + "loss": 2.2653, + "step": 9000 + }, + { + "epoch": 1.050169175125423, + "grad_norm": 1.1094481945037842, + "learning_rate": 0.00027673584365388537, + "loss": 2.023, + "step": 9001 + }, + { + "epoch": 1.0502858476257146, + "grad_norm": 1.1488072872161865, + "learning_rate": 0.00027672771780941596, + "loss": 2.0646, + "step": 9002 + }, + { + "epoch": 1.0504025201260063, + "grad_norm": 1.1627832651138306, + "learning_rate": 0.0002767195906667241, + "loss": 2.1048, + "step": 9003 + }, + { + "epoch": 1.050519192626298, + "grad_norm": 1.5224653482437134, + "learning_rate": 0.00027671146222589413, + "loss": 2.1257, + "step": 9004 + }, + { + "epoch": 1.0506358651265897, + "grad_norm": 1.1193078756332397, + "learning_rate": 0.0002767033324870102, + "loss": 2.0298, + "step": 9005 + }, + { + "epoch": 1.0507525376268814, + "grad_norm": 1.3435240983963013, + "learning_rate": 0.0002766952014501566, + "loss": 1.9057, + "step": 9006 + }, + { + "epoch": 1.050869210127173, + "grad_norm": 1.1998666524887085, + "learning_rate": 0.00027668706911541767, + "loss": 1.9705, + "step": 9007 + }, + { + "epoch": 1.0509858826274647, + "grad_norm": 1.3453034162521362, + "learning_rate": 0.0002766789354828778, + "loss": 2.0731, + "step": 9008 + }, + { + "epoch": 1.0511025551277564, + "grad_norm": 1.3866620063781738, + "learning_rate": 0.0002766708005526211, + "loss": 2.1222, + "step": 9009 + }, + { + "epoch": 1.051219227628048, + "grad_norm": 1.4078880548477173, + "learning_rate": 0.00027666266432473205, + "loss": 2.2455, + "step": 9010 + }, + { + "epoch": 1.0513359001283398, + "grad_norm": 1.361546516418457, + "learning_rate": 0.000276654526799295, + "loss": 2.1646, + "step": 9011 + }, + { + "epoch": 1.0514525726286315, + "grad_norm": 1.2852352857589722, + "learning_rate": 0.00027664638797639425, + "loss": 1.914, + "step": 9012 + }, + { + "epoch": 1.0515692451289231, + "grad_norm": 1.2121278047561646, + "learning_rate": 0.0002766382478561142, + "loss": 2.0509, + "step": 9013 + }, + { + "epoch": 1.0516859176292148, + "grad_norm": 1.243377685546875, + "learning_rate": 0.0002766301064385393, + "loss": 2.1686, + "step": 9014 + }, + { + "epoch": 1.0518025901295065, + "grad_norm": 1.2353010177612305, + "learning_rate": 0.00027662196372375386, + "loss": 2.2263, + "step": 9015 + }, + { + "epoch": 1.0519192626297982, + "grad_norm": 1.0806976556777954, + "learning_rate": 0.0002766138197118423, + "loss": 1.9504, + "step": 9016 + }, + { + "epoch": 1.0520359351300899, + "grad_norm": 1.2583180665969849, + "learning_rate": 0.00027660567440288914, + "loss": 2.1317, + "step": 9017 + }, + { + "epoch": 1.0521526076303815, + "grad_norm": 1.2451462745666504, + "learning_rate": 0.0002765975277969787, + "loss": 2.086, + "step": 9018 + }, + { + "epoch": 1.0522692801306732, + "grad_norm": 1.6680339574813843, + "learning_rate": 0.0002765893798941956, + "loss": 2.2928, + "step": 9019 + }, + { + "epoch": 1.052385952630965, + "grad_norm": 1.2179476022720337, + "learning_rate": 0.000276581230694624, + "loss": 2.1987, + "step": 9020 + }, + { + "epoch": 1.0525026251312566, + "grad_norm": 1.1253862380981445, + "learning_rate": 0.00027657308019834873, + "loss": 1.9743, + "step": 9021 + }, + { + "epoch": 1.0526192976315483, + "grad_norm": 1.2252869606018066, + "learning_rate": 0.0002765649284054541, + "loss": 2.2415, + "step": 9022 + }, + { + "epoch": 1.05273597013184, + "grad_norm": 1.1617258787155151, + "learning_rate": 0.00027655677531602467, + "loss": 2.0509, + "step": 9023 + }, + { + "epoch": 1.0528526426321316, + "grad_norm": 1.3719254732131958, + "learning_rate": 0.0002765486209301449, + "loss": 2.1413, + "step": 9024 + }, + { + "epoch": 1.0529693151324233, + "grad_norm": 1.1787364482879639, + "learning_rate": 0.0002765404652478995, + "loss": 1.9986, + "step": 9025 + }, + { + "epoch": 1.053085987632715, + "grad_norm": 1.2710044384002686, + "learning_rate": 0.00027653230826937277, + "loss": 2.0369, + "step": 9026 + }, + { + "epoch": 1.0532026601330067, + "grad_norm": 1.4237339496612549, + "learning_rate": 0.00027652414999464946, + "loss": 2.1631, + "step": 9027 + }, + { + "epoch": 1.0533193326332984, + "grad_norm": 1.3476570844650269, + "learning_rate": 0.00027651599042381405, + "loss": 1.904, + "step": 9028 + }, + { + "epoch": 1.05343600513359, + "grad_norm": 1.255308747291565, + "learning_rate": 0.0002765078295569512, + "loss": 2.0739, + "step": 9029 + }, + { + "epoch": 1.0535526776338817, + "grad_norm": 1.138414978981018, + "learning_rate": 0.0002764996673941454, + "loss": 1.8988, + "step": 9030 + }, + { + "epoch": 1.0536693501341734, + "grad_norm": 1.2277988195419312, + "learning_rate": 0.00027649150393548136, + "loss": 2.1019, + "step": 9031 + }, + { + "epoch": 1.053786022634465, + "grad_norm": 1.288956642150879, + "learning_rate": 0.0002764833391810437, + "loss": 2.0401, + "step": 9032 + }, + { + "epoch": 1.0539026951347568, + "grad_norm": 1.3677051067352295, + "learning_rate": 0.0002764751731309171, + "loss": 2.0636, + "step": 9033 + }, + { + "epoch": 1.0540193676350484, + "grad_norm": 1.2294483184814453, + "learning_rate": 0.00027646700578518615, + "loss": 1.9506, + "step": 9034 + }, + { + "epoch": 1.0541360401353401, + "grad_norm": 1.1512316465377808, + "learning_rate": 0.00027645883714393554, + "loss": 2.0858, + "step": 9035 + }, + { + "epoch": 1.0542527126356318, + "grad_norm": 1.3970381021499634, + "learning_rate": 0.00027645066720725, + "loss": 2.0876, + "step": 9036 + }, + { + "epoch": 1.0543693851359235, + "grad_norm": 1.5237236022949219, + "learning_rate": 0.00027644249597521414, + "loss": 2.0998, + "step": 9037 + }, + { + "epoch": 1.0544860576362152, + "grad_norm": 1.163118600845337, + "learning_rate": 0.00027643432344791265, + "loss": 2.2099, + "step": 9038 + }, + { + "epoch": 1.0546027301365068, + "grad_norm": 1.0881787538528442, + "learning_rate": 0.0002764261496254304, + "loss": 2.0944, + "step": 9039 + }, + { + "epoch": 1.0547194026367985, + "grad_norm": 1.194876790046692, + "learning_rate": 0.00027641797450785196, + "loss": 2.0653, + "step": 9040 + }, + { + "epoch": 1.0548360751370902, + "grad_norm": 1.1874984502792358, + "learning_rate": 0.00027640979809526227, + "loss": 1.9678, + "step": 9041 + }, + { + "epoch": 1.054952747637382, + "grad_norm": 1.0792856216430664, + "learning_rate": 0.0002764016203877459, + "loss": 2.1865, + "step": 9042 + }, + { + "epoch": 1.0550694201376736, + "grad_norm": 1.0591133832931519, + "learning_rate": 0.00027639344138538776, + "loss": 1.8362, + "step": 9043 + }, + { + "epoch": 1.0551860926379653, + "grad_norm": 1.1910874843597412, + "learning_rate": 0.00027638526108827256, + "loss": 2.0657, + "step": 9044 + }, + { + "epoch": 1.055302765138257, + "grad_norm": 1.227922797203064, + "learning_rate": 0.00027637707949648515, + "loss": 1.9647, + "step": 9045 + }, + { + "epoch": 1.0554194376385486, + "grad_norm": 1.2702058553695679, + "learning_rate": 0.00027636889661011036, + "loss": 2.1103, + "step": 9046 + }, + { + "epoch": 1.0555361101388403, + "grad_norm": 1.13642418384552, + "learning_rate": 0.000276360712429233, + "loss": 2.1045, + "step": 9047 + }, + { + "epoch": 1.055652782639132, + "grad_norm": 1.3745291233062744, + "learning_rate": 0.0002763525269539379, + "loss": 2.262, + "step": 9048 + }, + { + "epoch": 1.0557694551394237, + "grad_norm": 1.1929950714111328, + "learning_rate": 0.0002763443401843099, + "loss": 2.1674, + "step": 9049 + }, + { + "epoch": 1.0558861276397153, + "grad_norm": 1.0720913410186768, + "learning_rate": 0.0002763361521204339, + "loss": 2.0701, + "step": 9050 + }, + { + "epoch": 1.056002800140007, + "grad_norm": 1.2891552448272705, + "learning_rate": 0.00027632796276239486, + "loss": 2.2413, + "step": 9051 + }, + { + "epoch": 1.0561194726402987, + "grad_norm": 1.2798035144805908, + "learning_rate": 0.0002763197721102775, + "loss": 2.2197, + "step": 9052 + }, + { + "epoch": 1.0562361451405904, + "grad_norm": 1.2130005359649658, + "learning_rate": 0.0002763115801641669, + "loss": 2.2375, + "step": 9053 + }, + { + "epoch": 1.056352817640882, + "grad_norm": 1.204174518585205, + "learning_rate": 0.00027630338692414793, + "loss": 1.9776, + "step": 9054 + }, + { + "epoch": 1.0564694901411738, + "grad_norm": 1.3913856744766235, + "learning_rate": 0.00027629519239030553, + "loss": 2.109, + "step": 9055 + }, + { + "epoch": 1.0565861626414654, + "grad_norm": 1.3741308450698853, + "learning_rate": 0.0002762869965627246, + "loss": 2.0305, + "step": 9056 + }, + { + "epoch": 1.0567028351417571, + "grad_norm": 1.3456330299377441, + "learning_rate": 0.00027627879944149014, + "loss": 1.8767, + "step": 9057 + }, + { + "epoch": 1.0568195076420488, + "grad_norm": 1.144978404045105, + "learning_rate": 0.0002762706010266872, + "loss": 2.1158, + "step": 9058 + }, + { + "epoch": 1.0569361801423405, + "grad_norm": 1.2350778579711914, + "learning_rate": 0.0002762624013184006, + "loss": 2.105, + "step": 9059 + }, + { + "epoch": 1.0570528526426322, + "grad_norm": 1.0260390043258667, + "learning_rate": 0.00027625420031671555, + "loss": 1.811, + "step": 9060 + }, + { + "epoch": 1.0571695251429238, + "grad_norm": 1.0734834671020508, + "learning_rate": 0.00027624599802171697, + "loss": 1.9249, + "step": 9061 + }, + { + "epoch": 1.0572861976432155, + "grad_norm": 1.0975134372711182, + "learning_rate": 0.00027623779443348984, + "loss": 1.924, + "step": 9062 + }, + { + "epoch": 1.0574028701435072, + "grad_norm": 1.547428011894226, + "learning_rate": 0.00027622958955211927, + "loss": 2.1924, + "step": 9063 + }, + { + "epoch": 1.0575195426437989, + "grad_norm": 1.1448664665222168, + "learning_rate": 0.0002762213833776903, + "loss": 2.1958, + "step": 9064 + }, + { + "epoch": 1.0576362151440906, + "grad_norm": 1.2113491296768188, + "learning_rate": 0.0002762131759102881, + "loss": 1.9917, + "step": 9065 + }, + { + "epoch": 1.0577528876443822, + "grad_norm": 1.2494068145751953, + "learning_rate": 0.00027620496714999757, + "loss": 2.1158, + "step": 9066 + }, + { + "epoch": 1.057869560144674, + "grad_norm": 1.1492103338241577, + "learning_rate": 0.000276196757096904, + "loss": 1.9215, + "step": 9067 + }, + { + "epoch": 1.0579862326449656, + "grad_norm": 1.129713535308838, + "learning_rate": 0.0002761885457510924, + "loss": 1.9371, + "step": 9068 + }, + { + "epoch": 1.0581029051452573, + "grad_norm": 1.1982271671295166, + "learning_rate": 0.0002761803331126478, + "loss": 2.0574, + "step": 9069 + }, + { + "epoch": 1.058219577645549, + "grad_norm": 1.0938533544540405, + "learning_rate": 0.00027617211918165547, + "loss": 2.1282, + "step": 9070 + }, + { + "epoch": 1.0583362501458407, + "grad_norm": 1.1153026819229126, + "learning_rate": 0.00027616390395820056, + "loss": 1.9707, + "step": 9071 + }, + { + "epoch": 1.0584529226461323, + "grad_norm": 1.1629583835601807, + "learning_rate": 0.00027615568744236826, + "loss": 2.0149, + "step": 9072 + }, + { + "epoch": 1.058569595146424, + "grad_norm": 1.3525935411453247, + "learning_rate": 0.0002761474696342437, + "loss": 2.2733, + "step": 9073 + }, + { + "epoch": 1.0586862676467157, + "grad_norm": 1.1536011695861816, + "learning_rate": 0.00027613925053391204, + "loss": 2.0081, + "step": 9074 + }, + { + "epoch": 1.0588029401470074, + "grad_norm": 1.156983733177185, + "learning_rate": 0.00027613103014145855, + "loss": 2.0168, + "step": 9075 + }, + { + "epoch": 1.058919612647299, + "grad_norm": 1.1346995830535889, + "learning_rate": 0.00027612280845696843, + "loss": 2.047, + "step": 9076 + }, + { + "epoch": 1.0590362851475907, + "grad_norm": 1.2573487758636475, + "learning_rate": 0.00027611458548052685, + "loss": 2.1151, + "step": 9077 + }, + { + "epoch": 1.0591529576478824, + "grad_norm": 1.1435198783874512, + "learning_rate": 0.00027610636121221913, + "loss": 2.1234, + "step": 9078 + }, + { + "epoch": 1.059269630148174, + "grad_norm": 1.149248719215393, + "learning_rate": 0.0002760981356521306, + "loss": 1.9248, + "step": 9079 + }, + { + "epoch": 1.0593863026484658, + "grad_norm": 1.3117822408676147, + "learning_rate": 0.00027608990880034637, + "loss": 2.0563, + "step": 9080 + }, + { + "epoch": 1.0595029751487575, + "grad_norm": 1.07302725315094, + "learning_rate": 0.0002760816806569518, + "loss": 1.9277, + "step": 9081 + }, + { + "epoch": 1.0596196476490491, + "grad_norm": 1.2118889093399048, + "learning_rate": 0.00027607345122203223, + "loss": 2.1238, + "step": 9082 + }, + { + "epoch": 1.0597363201493408, + "grad_norm": 1.3651909828186035, + "learning_rate": 0.0002760652204956729, + "loss": 1.9709, + "step": 9083 + }, + { + "epoch": 1.0598529926496325, + "grad_norm": 1.329294204711914, + "learning_rate": 0.0002760569884779592, + "loss": 2.1975, + "step": 9084 + }, + { + "epoch": 1.0599696651499242, + "grad_norm": 1.1563721895217896, + "learning_rate": 0.00027604875516897643, + "loss": 2.0137, + "step": 9085 + }, + { + "epoch": 1.0600863376502159, + "grad_norm": 1.379671573638916, + "learning_rate": 0.00027604052056881003, + "loss": 1.8947, + "step": 9086 + }, + { + "epoch": 1.0602030101505076, + "grad_norm": 1.3459917306900024, + "learning_rate": 0.0002760322846775452, + "loss": 1.9058, + "step": 9087 + }, + { + "epoch": 1.0603196826507992, + "grad_norm": 1.1570396423339844, + "learning_rate": 0.0002760240474952675, + "loss": 2.0979, + "step": 9088 + }, + { + "epoch": 1.060436355151091, + "grad_norm": 1.4768277406692505, + "learning_rate": 0.0002760158090220622, + "loss": 2.0686, + "step": 9089 + }, + { + "epoch": 1.0605530276513826, + "grad_norm": 1.1829569339752197, + "learning_rate": 0.00027600756925801474, + "loss": 1.885, + "step": 9090 + }, + { + "epoch": 1.0606697001516743, + "grad_norm": 1.1076878309249878, + "learning_rate": 0.0002759993282032106, + "loss": 2.2296, + "step": 9091 + }, + { + "epoch": 1.060786372651966, + "grad_norm": 1.1200501918792725, + "learning_rate": 0.0002759910858577351, + "loss": 2.0738, + "step": 9092 + }, + { + "epoch": 1.0609030451522576, + "grad_norm": 1.2692538499832153, + "learning_rate": 0.0002759828422216738, + "loss": 1.9658, + "step": 9093 + }, + { + "epoch": 1.0610197176525493, + "grad_norm": 1.1259396076202393, + "learning_rate": 0.00027597459729511214, + "loss": 1.9009, + "step": 9094 + }, + { + "epoch": 1.061136390152841, + "grad_norm": 1.1756664514541626, + "learning_rate": 0.00027596635107813547, + "loss": 2.0971, + "step": 9095 + }, + { + "epoch": 1.0612530626531327, + "grad_norm": 1.2069684267044067, + "learning_rate": 0.00027595810357082946, + "loss": 2.1576, + "step": 9096 + }, + { + "epoch": 1.0613697351534244, + "grad_norm": 1.2820404767990112, + "learning_rate": 0.0002759498547732795, + "loss": 1.8876, + "step": 9097 + }, + { + "epoch": 1.061486407653716, + "grad_norm": 1.109025478363037, + "learning_rate": 0.00027594160468557107, + "loss": 1.9354, + "step": 9098 + }, + { + "epoch": 1.0616030801540077, + "grad_norm": 1.1186318397521973, + "learning_rate": 0.0002759333533077899, + "loss": 1.9206, + "step": 9099 + }, + { + "epoch": 1.0617197526542994, + "grad_norm": 1.096549391746521, + "learning_rate": 0.00027592510064002127, + "loss": 2.0044, + "step": 9100 + }, + { + "epoch": 1.061836425154591, + "grad_norm": 1.3431675434112549, + "learning_rate": 0.0002759168466823509, + "loss": 2.1912, + "step": 9101 + }, + { + "epoch": 1.0619530976548828, + "grad_norm": 1.1627616882324219, + "learning_rate": 0.00027590859143486425, + "loss": 2.0132, + "step": 9102 + }, + { + "epoch": 1.0620697701551745, + "grad_norm": 1.199549913406372, + "learning_rate": 0.00027590033489764703, + "loss": 2.0432, + "step": 9103 + }, + { + "epoch": 1.0621864426554661, + "grad_norm": 1.2848169803619385, + "learning_rate": 0.0002758920770707847, + "loss": 2.1156, + "step": 9104 + }, + { + "epoch": 1.0623031151557578, + "grad_norm": 1.0587717294692993, + "learning_rate": 0.000275883817954363, + "loss": 2.1518, + "step": 9105 + }, + { + "epoch": 1.0624197876560495, + "grad_norm": 1.4365332126617432, + "learning_rate": 0.0002758755575484675, + "loss": 1.9845, + "step": 9106 + }, + { + "epoch": 1.0625364601563412, + "grad_norm": 1.2611548900604248, + "learning_rate": 0.00027586729585318376, + "loss": 2.1069, + "step": 9107 + }, + { + "epoch": 1.0626531326566329, + "grad_norm": 1.2827955484390259, + "learning_rate": 0.0002758590328685975, + "loss": 2.0093, + "step": 9108 + }, + { + "epoch": 1.0627698051569245, + "grad_norm": 1.2768737077713013, + "learning_rate": 0.00027585076859479443, + "loss": 1.9009, + "step": 9109 + }, + { + "epoch": 1.0628864776572162, + "grad_norm": 1.084776520729065, + "learning_rate": 0.00027584250303186014, + "loss": 1.8338, + "step": 9110 + }, + { + "epoch": 1.063003150157508, + "grad_norm": 1.1322157382965088, + "learning_rate": 0.00027583423617988026, + "loss": 2.1801, + "step": 9111 + }, + { + "epoch": 1.0631198226577996, + "grad_norm": 1.1973985433578491, + "learning_rate": 0.00027582596803894067, + "loss": 2.2943, + "step": 9112 + }, + { + "epoch": 1.0632364951580913, + "grad_norm": 1.0914031267166138, + "learning_rate": 0.000275817698609127, + "loss": 1.9544, + "step": 9113 + }, + { + "epoch": 1.063353167658383, + "grad_norm": 1.2301666736602783, + "learning_rate": 0.00027580942789052495, + "loss": 2.0474, + "step": 9114 + }, + { + "epoch": 1.0634698401586746, + "grad_norm": 1.2546242475509644, + "learning_rate": 0.00027580115588322023, + "loss": 1.988, + "step": 9115 + }, + { + "epoch": 1.0635865126589663, + "grad_norm": 1.1171987056732178, + "learning_rate": 0.0002757928825872987, + "loss": 2.2474, + "step": 9116 + }, + { + "epoch": 1.063703185159258, + "grad_norm": 1.1409929990768433, + "learning_rate": 0.00027578460800284604, + "loss": 2.0419, + "step": 9117 + }, + { + "epoch": 1.0638198576595497, + "grad_norm": 1.2729579210281372, + "learning_rate": 0.00027577633212994804, + "loss": 1.9946, + "step": 9118 + }, + { + "epoch": 1.0639365301598414, + "grad_norm": 1.3615992069244385, + "learning_rate": 0.00027576805496869064, + "loss": 2.2795, + "step": 9119 + }, + { + "epoch": 1.064053202660133, + "grad_norm": 1.3911099433898926, + "learning_rate": 0.00027575977651915945, + "loss": 2.0736, + "step": 9120 + }, + { + "epoch": 1.0641698751604247, + "grad_norm": 1.1705975532531738, + "learning_rate": 0.00027575149678144034, + "loss": 2.1593, + "step": 9121 + }, + { + "epoch": 1.0642865476607164, + "grad_norm": 1.3246338367462158, + "learning_rate": 0.0002757432157556192, + "loss": 2.0225, + "step": 9122 + }, + { + "epoch": 1.064403220161008, + "grad_norm": 1.1184865236282349, + "learning_rate": 0.0002757349334417819, + "loss": 1.9372, + "step": 9123 + }, + { + "epoch": 1.0645198926612998, + "grad_norm": 1.3006443977355957, + "learning_rate": 0.0002757266498400142, + "loss": 2.0063, + "step": 9124 + }, + { + "epoch": 1.0646365651615914, + "grad_norm": 1.2461930513381958, + "learning_rate": 0.0002757183649504021, + "loss": 1.8429, + "step": 9125 + }, + { + "epoch": 1.0647532376618831, + "grad_norm": 1.265075445175171, + "learning_rate": 0.00027571007877303137, + "loss": 2.0825, + "step": 9126 + }, + { + "epoch": 1.0648699101621748, + "grad_norm": 1.2123692035675049, + "learning_rate": 0.00027570179130798793, + "loss": 2.0062, + "step": 9127 + }, + { + "epoch": 1.0649865826624665, + "grad_norm": 1.362673044204712, + "learning_rate": 0.0002756935025553578, + "loss": 2.1369, + "step": 9128 + }, + { + "epoch": 1.0651032551627582, + "grad_norm": 1.1861529350280762, + "learning_rate": 0.0002756852125152268, + "loss": 2.0073, + "step": 9129 + }, + { + "epoch": 1.0652199276630498, + "grad_norm": 1.027260184288025, + "learning_rate": 0.00027567692118768086, + "loss": 2.0767, + "step": 9130 + }, + { + "epoch": 1.0653366001633415, + "grad_norm": 1.187614917755127, + "learning_rate": 0.0002756686285728061, + "loss": 1.9843, + "step": 9131 + }, + { + "epoch": 1.0654532726636332, + "grad_norm": 1.1562708616256714, + "learning_rate": 0.00027566033467068824, + "loss": 2.0256, + "step": 9132 + }, + { + "epoch": 1.065569945163925, + "grad_norm": 1.2022854089736938, + "learning_rate": 0.00027565203948141346, + "loss": 2.032, + "step": 9133 + }, + { + "epoch": 1.0656866176642166, + "grad_norm": 1.1714707612991333, + "learning_rate": 0.00027564374300506774, + "loss": 1.9374, + "step": 9134 + }, + { + "epoch": 1.0658032901645083, + "grad_norm": 1.2753218412399292, + "learning_rate": 0.00027563544524173697, + "loss": 2.0684, + "step": 9135 + }, + { + "epoch": 1.0659199626648, + "grad_norm": 1.2672932147979736, + "learning_rate": 0.0002756271461915073, + "loss": 2.1086, + "step": 9136 + }, + { + "epoch": 1.0660366351650916, + "grad_norm": 1.1601849794387817, + "learning_rate": 0.00027561884585446464, + "loss": 2.0171, + "step": 9137 + }, + { + "epoch": 1.0661533076653833, + "grad_norm": 1.0948717594146729, + "learning_rate": 0.0002756105442306951, + "loss": 2.0171, + "step": 9138 + }, + { + "epoch": 1.066269980165675, + "grad_norm": 1.3848881721496582, + "learning_rate": 0.0002756022413202848, + "loss": 2.1569, + "step": 9139 + }, + { + "epoch": 1.0663866526659667, + "grad_norm": 1.1545408964157104, + "learning_rate": 0.0002755939371233197, + "loss": 1.8932, + "step": 9140 + }, + { + "epoch": 1.0665033251662583, + "grad_norm": 1.0657130479812622, + "learning_rate": 0.000275585631639886, + "loss": 1.9721, + "step": 9141 + }, + { + "epoch": 1.06661999766655, + "grad_norm": 1.206026315689087, + "learning_rate": 0.00027557732487006975, + "loss": 1.9091, + "step": 9142 + }, + { + "epoch": 1.0667366701668417, + "grad_norm": 1.289884090423584, + "learning_rate": 0.00027556901681395705, + "loss": 1.9433, + "step": 9143 + }, + { + "epoch": 1.0668533426671334, + "grad_norm": 1.1629921197891235, + "learning_rate": 0.00027556070747163406, + "loss": 2.1205, + "step": 9144 + }, + { + "epoch": 1.066970015167425, + "grad_norm": 1.2850277423858643, + "learning_rate": 0.00027555239684318687, + "loss": 2.2653, + "step": 9145 + }, + { + "epoch": 1.0670866876677167, + "grad_norm": 1.1501809358596802, + "learning_rate": 0.00027554408492870173, + "loss": 1.8291, + "step": 9146 + }, + { + "epoch": 1.0672033601680084, + "grad_norm": 1.1915228366851807, + "learning_rate": 0.0002755357717282647, + "loss": 2.0648, + "step": 9147 + }, + { + "epoch": 1.0673200326683001, + "grad_norm": 1.2063627243041992, + "learning_rate": 0.0002755274572419621, + "loss": 1.9426, + "step": 9148 + }, + { + "epoch": 1.0674367051685918, + "grad_norm": 1.2286139726638794, + "learning_rate": 0.00027551914146988, + "loss": 2.075, + "step": 9149 + }, + { + "epoch": 1.0675533776688835, + "grad_norm": 1.2688742876052856, + "learning_rate": 0.0002755108244121046, + "loss": 2.067, + "step": 9150 + }, + { + "epoch": 1.0676700501691752, + "grad_norm": 1.1984208822250366, + "learning_rate": 0.0002755025060687222, + "loss": 2.08, + "step": 9151 + }, + { + "epoch": 1.0677867226694668, + "grad_norm": 1.2036387920379639, + "learning_rate": 0.00027549418643981903, + "loss": 1.8795, + "step": 9152 + }, + { + "epoch": 1.0679033951697585, + "grad_norm": 1.169691562652588, + "learning_rate": 0.0002754858655254813, + "loss": 2.02, + "step": 9153 + }, + { + "epoch": 1.0680200676700502, + "grad_norm": 1.089033842086792, + "learning_rate": 0.00027547754332579526, + "loss": 2.1295, + "step": 9154 + }, + { + "epoch": 1.0681367401703419, + "grad_norm": 1.319284200668335, + "learning_rate": 0.00027546921984084727, + "loss": 2.0871, + "step": 9155 + }, + { + "epoch": 1.0682534126706336, + "grad_norm": 1.0653361082077026, + "learning_rate": 0.0002754608950707235, + "loss": 1.8835, + "step": 9156 + }, + { + "epoch": 1.0683700851709252, + "grad_norm": 1.1057729721069336, + "learning_rate": 0.00027545256901551036, + "loss": 2.1313, + "step": 9157 + }, + { + "epoch": 1.068486757671217, + "grad_norm": 1.3313961029052734, + "learning_rate": 0.0002754442416752941, + "loss": 2.137, + "step": 9158 + }, + { + "epoch": 1.0686034301715086, + "grad_norm": 1.4194010496139526, + "learning_rate": 0.00027543591305016107, + "loss": 1.898, + "step": 9159 + }, + { + "epoch": 1.0687201026718003, + "grad_norm": 1.1232064962387085, + "learning_rate": 0.0002754275831401976, + "loss": 1.891, + "step": 9160 + }, + { + "epoch": 1.068836775172092, + "grad_norm": 1.175803780555725, + "learning_rate": 0.0002754192519454901, + "loss": 2.0816, + "step": 9161 + }, + { + "epoch": 1.0689534476723836, + "grad_norm": 1.1510990858078003, + "learning_rate": 0.00027541091946612484, + "loss": 2.0567, + "step": 9162 + }, + { + "epoch": 1.0690701201726753, + "grad_norm": 1.1822712421417236, + "learning_rate": 0.00027540258570218825, + "loss": 2.0085, + "step": 9163 + }, + { + "epoch": 1.069186792672967, + "grad_norm": 1.1598196029663086, + "learning_rate": 0.00027539425065376677, + "loss": 1.8556, + "step": 9164 + }, + { + "epoch": 1.0693034651732587, + "grad_norm": 1.022118091583252, + "learning_rate": 0.00027538591432094676, + "loss": 1.9583, + "step": 9165 + }, + { + "epoch": 1.0694201376735504, + "grad_norm": 1.1001310348510742, + "learning_rate": 0.00027537757670381463, + "loss": 2.0484, + "step": 9166 + }, + { + "epoch": 1.069536810173842, + "grad_norm": 1.1097813844680786, + "learning_rate": 0.0002753692378024569, + "loss": 1.9582, + "step": 9167 + }, + { + "epoch": 1.0696534826741337, + "grad_norm": 1.217260718345642, + "learning_rate": 0.00027536089761695987, + "loss": 2.0006, + "step": 9168 + }, + { + "epoch": 1.0697701551744254, + "grad_norm": 1.2408560514450073, + "learning_rate": 0.0002753525561474101, + "loss": 2.0276, + "step": 9169 + }, + { + "epoch": 1.069886827674717, + "grad_norm": 1.231480598449707, + "learning_rate": 0.0002753442133938941, + "loss": 2.0798, + "step": 9170 + }, + { + "epoch": 1.0700035001750088, + "grad_norm": 1.1576576232910156, + "learning_rate": 0.0002753358693564982, + "loss": 2.2134, + "step": 9171 + }, + { + "epoch": 1.0701201726753005, + "grad_norm": 1.3607937097549438, + "learning_rate": 0.00027532752403530914, + "loss": 1.9109, + "step": 9172 + }, + { + "epoch": 1.0702368451755921, + "grad_norm": 1.2779821157455444, + "learning_rate": 0.0002753191774304133, + "loss": 2.1897, + "step": 9173 + }, + { + "epoch": 1.0703535176758838, + "grad_norm": 1.2769768238067627, + "learning_rate": 0.00027531082954189716, + "loss": 1.9434, + "step": 9174 + }, + { + "epoch": 1.0704701901761755, + "grad_norm": 1.426096796989441, + "learning_rate": 0.00027530248036984736, + "loss": 2.1285, + "step": 9175 + }, + { + "epoch": 1.0705868626764672, + "grad_norm": 1.3667900562286377, + "learning_rate": 0.0002752941299143503, + "loss": 2.2259, + "step": 9176 + }, + { + "epoch": 1.0707035351767589, + "grad_norm": 1.3192046880722046, + "learning_rate": 0.0002752857781754928, + "loss": 2.141, + "step": 9177 + }, + { + "epoch": 1.0708202076770506, + "grad_norm": 1.0968334674835205, + "learning_rate": 0.00027527742515336123, + "loss": 1.9784, + "step": 9178 + }, + { + "epoch": 1.0709368801773422, + "grad_norm": 1.2918574810028076, + "learning_rate": 0.00027526907084804225, + "loss": 1.9888, + "step": 9179 + }, + { + "epoch": 1.071053552677634, + "grad_norm": 1.1381746530532837, + "learning_rate": 0.0002752607152596225, + "loss": 2.084, + "step": 9180 + }, + { + "epoch": 1.0711702251779256, + "grad_norm": 1.3361589908599854, + "learning_rate": 0.00027525235838818856, + "loss": 2.2319, + "step": 9181 + }, + { + "epoch": 1.0712868976782173, + "grad_norm": 1.0385710000991821, + "learning_rate": 0.00027524400023382713, + "loss": 1.8426, + "step": 9182 + }, + { + "epoch": 1.071403570178509, + "grad_norm": 1.071835994720459, + "learning_rate": 0.0002752356407966248, + "loss": 2.0991, + "step": 9183 + }, + { + "epoch": 1.0715202426788006, + "grad_norm": 1.203639030456543, + "learning_rate": 0.0002752272800766682, + "loss": 1.9878, + "step": 9184 + }, + { + "epoch": 1.0716369151790923, + "grad_norm": 1.5209290981292725, + "learning_rate": 0.000275218918074044, + "loss": 2.1743, + "step": 9185 + }, + { + "epoch": 1.071753587679384, + "grad_norm": 1.1789690256118774, + "learning_rate": 0.000275210554788839, + "loss": 2.0504, + "step": 9186 + }, + { + "epoch": 1.0718702601796757, + "grad_norm": 1.1124354600906372, + "learning_rate": 0.00027520219022113984, + "loss": 1.9859, + "step": 9187 + }, + { + "epoch": 1.0719869326799674, + "grad_norm": 1.274032473564148, + "learning_rate": 0.0002751938243710332, + "loss": 2.1753, + "step": 9188 + }, + { + "epoch": 1.072103605180259, + "grad_norm": 1.0543005466461182, + "learning_rate": 0.00027518545723860587, + "loss": 2.0295, + "step": 9189 + }, + { + "epoch": 1.0722202776805507, + "grad_norm": 1.1248066425323486, + "learning_rate": 0.00027517708882394455, + "loss": 2.0893, + "step": 9190 + }, + { + "epoch": 1.0723369501808424, + "grad_norm": 1.1153616905212402, + "learning_rate": 0.000275168719127136, + "loss": 2.0991, + "step": 9191 + }, + { + "epoch": 1.072453622681134, + "grad_norm": 1.406193733215332, + "learning_rate": 0.000275160348148267, + "loss": 2.0361, + "step": 9192 + }, + { + "epoch": 1.0725702951814258, + "grad_norm": 1.1589758396148682, + "learning_rate": 0.00027515197588742426, + "loss": 1.9334, + "step": 9193 + }, + { + "epoch": 1.0726869676817175, + "grad_norm": 1.321773648262024, + "learning_rate": 0.0002751436023446947, + "loss": 1.9011, + "step": 9194 + }, + { + "epoch": 1.0728036401820091, + "grad_norm": 1.1227389574050903, + "learning_rate": 0.000275135227520165, + "loss": 2.058, + "step": 9195 + }, + { + "epoch": 1.0729203126823008, + "grad_norm": 1.228547215461731, + "learning_rate": 0.0002751268514139221, + "loss": 2.1349, + "step": 9196 + }, + { + "epoch": 1.0730369851825925, + "grad_norm": 1.3908007144927979, + "learning_rate": 0.0002751184740260528, + "loss": 2.2475, + "step": 9197 + }, + { + "epoch": 1.0731536576828842, + "grad_norm": 1.2612996101379395, + "learning_rate": 0.0002751100953566439, + "loss": 2.2373, + "step": 9198 + }, + { + "epoch": 1.0732703301831759, + "grad_norm": 1.3149768114089966, + "learning_rate": 0.0002751017154057823, + "loss": 2.283, + "step": 9199 + }, + { + "epoch": 1.0733870026834675, + "grad_norm": 1.0029534101486206, + "learning_rate": 0.0002750933341735548, + "loss": 1.8982, + "step": 9200 + }, + { + "epoch": 1.0735036751837592, + "grad_norm": 1.1369826793670654, + "learning_rate": 0.0002750849516600484, + "loss": 1.9538, + "step": 9201 + }, + { + "epoch": 1.073620347684051, + "grad_norm": 1.0744967460632324, + "learning_rate": 0.0002750765678653499, + "loss": 1.9775, + "step": 9202 + }, + { + "epoch": 1.0737370201843426, + "grad_norm": 1.325226068496704, + "learning_rate": 0.00027506818278954636, + "loss": 2.1596, + "step": 9203 + }, + { + "epoch": 1.0738536926846343, + "grad_norm": 1.1575961112976074, + "learning_rate": 0.00027505979643272454, + "loss": 2.0367, + "step": 9204 + }, + { + "epoch": 1.073970365184926, + "grad_norm": 1.1927365064620972, + "learning_rate": 0.00027505140879497146, + "loss": 2.0631, + "step": 9205 + }, + { + "epoch": 1.0740870376852176, + "grad_norm": 1.1815558671951294, + "learning_rate": 0.00027504301987637403, + "loss": 2.0357, + "step": 9206 + }, + { + "epoch": 1.0742037101855093, + "grad_norm": 1.147953748703003, + "learning_rate": 0.0002750346296770193, + "loss": 1.9603, + "step": 9207 + }, + { + "epoch": 1.074320382685801, + "grad_norm": 1.2048656940460205, + "learning_rate": 0.0002750262381969942, + "loss": 2.101, + "step": 9208 + }, + { + "epoch": 1.0744370551860927, + "grad_norm": 1.3659991025924683, + "learning_rate": 0.00027501784543638573, + "loss": 2.0112, + "step": 9209 + }, + { + "epoch": 1.0745537276863844, + "grad_norm": 1.464125156402588, + "learning_rate": 0.00027500945139528085, + "loss": 1.916, + "step": 9210 + }, + { + "epoch": 1.074670400186676, + "grad_norm": 1.1951738595962524, + "learning_rate": 0.00027500105607376665, + "loss": 2.1449, + "step": 9211 + }, + { + "epoch": 1.0747870726869677, + "grad_norm": 1.1267186403274536, + "learning_rate": 0.00027499265947193013, + "loss": 2.145, + "step": 9212 + }, + { + "epoch": 1.0749037451872594, + "grad_norm": 1.1289515495300293, + "learning_rate": 0.0002749842615898583, + "loss": 1.8167, + "step": 9213 + }, + { + "epoch": 1.075020417687551, + "grad_norm": 1.3691350221633911, + "learning_rate": 0.0002749758624276383, + "loss": 2.0377, + "step": 9214 + }, + { + "epoch": 1.0751370901878428, + "grad_norm": 1.2814959287643433, + "learning_rate": 0.0002749674619853572, + "loss": 2.0219, + "step": 9215 + }, + { + "epoch": 1.0752537626881344, + "grad_norm": 1.0872268676757812, + "learning_rate": 0.000274959060263102, + "loss": 2.0136, + "step": 9216 + }, + { + "epoch": 1.0753704351884261, + "grad_norm": 1.0906652212142944, + "learning_rate": 0.0002749506572609599, + "loss": 1.9197, + "step": 9217 + }, + { + "epoch": 1.0754871076887178, + "grad_norm": 1.0710935592651367, + "learning_rate": 0.00027494225297901797, + "loss": 2.0799, + "step": 9218 + }, + { + "epoch": 1.0756037801890095, + "grad_norm": 1.1245545148849487, + "learning_rate": 0.0002749338474173632, + "loss": 2.0633, + "step": 9219 + }, + { + "epoch": 1.0757204526893012, + "grad_norm": 1.3337260484695435, + "learning_rate": 0.00027492544057608297, + "loss": 2.1719, + "step": 9220 + }, + { + "epoch": 1.0758371251895928, + "grad_norm": 1.2054722309112549, + "learning_rate": 0.00027491703245526434, + "loss": 2.063, + "step": 9221 + }, + { + "epoch": 1.0759537976898845, + "grad_norm": 1.2237887382507324, + "learning_rate": 0.00027490862305499446, + "loss": 2.2195, + "step": 9222 + }, + { + "epoch": 1.0760704701901762, + "grad_norm": 1.4713069200515747, + "learning_rate": 0.0002749002123753604, + "loss": 1.9712, + "step": 9223 + }, + { + "epoch": 1.076187142690468, + "grad_norm": 1.2587413787841797, + "learning_rate": 0.0002748918004164496, + "loss": 2.0463, + "step": 9224 + }, + { + "epoch": 1.0763038151907596, + "grad_norm": 1.3868132829666138, + "learning_rate": 0.0002748833871783491, + "loss": 2.0419, + "step": 9225 + }, + { + "epoch": 1.0764204876910513, + "grad_norm": 1.1721978187561035, + "learning_rate": 0.00027487497266114605, + "loss": 2.1261, + "step": 9226 + }, + { + "epoch": 1.076537160191343, + "grad_norm": 1.1563804149627686, + "learning_rate": 0.00027486655686492783, + "loss": 2.0293, + "step": 9227 + }, + { + "epoch": 1.0766538326916346, + "grad_norm": 1.1759769916534424, + "learning_rate": 0.0002748581397897817, + "loss": 2.1843, + "step": 9228 + }, + { + "epoch": 1.0767705051919263, + "grad_norm": 1.3311902284622192, + "learning_rate": 0.00027484972143579475, + "loss": 2.0361, + "step": 9229 + }, + { + "epoch": 1.076887177692218, + "grad_norm": 1.2135446071624756, + "learning_rate": 0.00027484130180305445, + "loss": 2.193, + "step": 9230 + }, + { + "epoch": 1.0770038501925097, + "grad_norm": 1.2036139965057373, + "learning_rate": 0.000274832880891648, + "loss": 1.9402, + "step": 9231 + }, + { + "epoch": 1.0771205226928013, + "grad_norm": 1.3302704095840454, + "learning_rate": 0.0002748244587016626, + "loss": 1.9752, + "step": 9232 + }, + { + "epoch": 1.077237195193093, + "grad_norm": 1.2328853607177734, + "learning_rate": 0.00027481603523318566, + "loss": 2.0782, + "step": 9233 + }, + { + "epoch": 1.0773538676933847, + "grad_norm": 1.4642715454101562, + "learning_rate": 0.00027480761048630453, + "loss": 2.0862, + "step": 9234 + }, + { + "epoch": 1.0774705401936764, + "grad_norm": 1.3080590963363647, + "learning_rate": 0.00027479918446110654, + "loss": 2.0339, + "step": 9235 + }, + { + "epoch": 1.077587212693968, + "grad_norm": 1.0896624326705933, + "learning_rate": 0.00027479075715767897, + "loss": 2.1374, + "step": 9236 + }, + { + "epoch": 1.0777038851942597, + "grad_norm": 1.2416856288909912, + "learning_rate": 0.00027478232857610925, + "loss": 2.1385, + "step": 9237 + }, + { + "epoch": 1.0778205576945514, + "grad_norm": 1.4777979850769043, + "learning_rate": 0.0002747738987164847, + "loss": 2.1348, + "step": 9238 + }, + { + "epoch": 1.077937230194843, + "grad_norm": 1.2420002222061157, + "learning_rate": 0.0002747654675788928, + "loss": 2.1574, + "step": 9239 + }, + { + "epoch": 1.0780539026951348, + "grad_norm": 1.233688235282898, + "learning_rate": 0.00027475703516342093, + "loss": 2.0988, + "step": 9240 + }, + { + "epoch": 1.0781705751954265, + "grad_norm": 1.250838041305542, + "learning_rate": 0.0002747486014701564, + "loss": 2.2121, + "step": 9241 + }, + { + "epoch": 1.0782872476957182, + "grad_norm": 1.2788732051849365, + "learning_rate": 0.0002747401664991868, + "loss": 2.201, + "step": 9242 + }, + { + "epoch": 1.0784039201960098, + "grad_norm": 1.1390056610107422, + "learning_rate": 0.0002747317302505995, + "loss": 2.0284, + "step": 9243 + }, + { + "epoch": 1.0785205926963015, + "grad_norm": 1.0356953144073486, + "learning_rate": 0.00027472329272448196, + "loss": 2.0494, + "step": 9244 + }, + { + "epoch": 1.0786372651965932, + "grad_norm": 1.18488609790802, + "learning_rate": 0.0002747148539209216, + "loss": 2.1244, + "step": 9245 + }, + { + "epoch": 1.0787539376968849, + "grad_norm": 1.202146053314209, + "learning_rate": 0.000274706413840006, + "loss": 1.9158, + "step": 9246 + }, + { + "epoch": 1.0788706101971766, + "grad_norm": 1.040649175643921, + "learning_rate": 0.00027469797248182257, + "loss": 2.0177, + "step": 9247 + }, + { + "epoch": 1.0789872826974682, + "grad_norm": 1.2267236709594727, + "learning_rate": 0.00027468952984645886, + "loss": 2.1594, + "step": 9248 + }, + { + "epoch": 1.07910395519776, + "grad_norm": 1.186245322227478, + "learning_rate": 0.0002746810859340024, + "loss": 1.9862, + "step": 9249 + }, + { + "epoch": 1.0792206276980516, + "grad_norm": 1.0225163698196411, + "learning_rate": 0.00027467264074454076, + "loss": 2.1462, + "step": 9250 + }, + { + "epoch": 1.0793373001983433, + "grad_norm": 1.0182766914367676, + "learning_rate": 0.0002746641942781614, + "loss": 1.9916, + "step": 9251 + }, + { + "epoch": 1.079453972698635, + "grad_norm": 1.4444917440414429, + "learning_rate": 0.00027465574653495197, + "loss": 2.1234, + "step": 9252 + }, + { + "epoch": 1.0795706451989266, + "grad_norm": 1.0911942720413208, + "learning_rate": 0.000274647297515, + "loss": 1.8703, + "step": 9253 + }, + { + "epoch": 1.0796873176992183, + "grad_norm": 1.1696698665618896, + "learning_rate": 0.0002746388472183931, + "loss": 2.0465, + "step": 9254 + }, + { + "epoch": 1.07980399019951, + "grad_norm": 1.2597697973251343, + "learning_rate": 0.0002746303956452189, + "loss": 2.2355, + "step": 9255 + }, + { + "epoch": 1.0799206626998017, + "grad_norm": 1.1342101097106934, + "learning_rate": 0.0002746219427955649, + "loss": 2.0752, + "step": 9256 + }, + { + "epoch": 1.0800373352000934, + "grad_norm": 0.9484564661979675, + "learning_rate": 0.00027461348866951885, + "loss": 1.8921, + "step": 9257 + }, + { + "epoch": 1.080154007700385, + "grad_norm": 1.1088069677352905, + "learning_rate": 0.0002746050332671684, + "loss": 2.1049, + "step": 9258 + }, + { + "epoch": 1.0802706802006767, + "grad_norm": 1.212378978729248, + "learning_rate": 0.00027459657658860114, + "loss": 1.942, + "step": 9259 + }, + { + "epoch": 1.0803873527009684, + "grad_norm": 1.3129582405090332, + "learning_rate": 0.0002745881186339047, + "loss": 2.2998, + "step": 9260 + }, + { + "epoch": 1.08050402520126, + "grad_norm": 1.1681208610534668, + "learning_rate": 0.0002745796594031669, + "loss": 1.9895, + "step": 9261 + }, + { + "epoch": 1.0806206977015518, + "grad_norm": 1.134214997291565, + "learning_rate": 0.0002745711988964753, + "loss": 2.1357, + "step": 9262 + }, + { + "epoch": 1.0807373702018435, + "grad_norm": 1.255013346672058, + "learning_rate": 0.0002745627371139177, + "loss": 2.051, + "step": 9263 + }, + { + "epoch": 1.0808540427021351, + "grad_norm": 1.1550554037094116, + "learning_rate": 0.0002745542740555818, + "loss": 2.1305, + "step": 9264 + }, + { + "epoch": 1.0809707152024268, + "grad_norm": 1.1639984846115112, + "learning_rate": 0.00027454580972155533, + "loss": 2.2026, + "step": 9265 + }, + { + "epoch": 1.0810873877027185, + "grad_norm": 1.2417492866516113, + "learning_rate": 0.00027453734411192596, + "loss": 2.0863, + "step": 9266 + }, + { + "epoch": 1.0812040602030102, + "grad_norm": 1.064033031463623, + "learning_rate": 0.0002745288772267816, + "loss": 2.0842, + "step": 9267 + }, + { + "epoch": 1.0813207327033019, + "grad_norm": 1.0843671560287476, + "learning_rate": 0.00027452040906621, + "loss": 1.877, + "step": 9268 + }, + { + "epoch": 1.0814374052035935, + "grad_norm": 1.1688287258148193, + "learning_rate": 0.0002745119396302988, + "loss": 1.9242, + "step": 9269 + }, + { + "epoch": 1.0815540777038852, + "grad_norm": 1.139057993888855, + "learning_rate": 0.000274503468919136, + "loss": 2.0387, + "step": 9270 + }, + { + "epoch": 1.081670750204177, + "grad_norm": 1.179661512374878, + "learning_rate": 0.00027449499693280927, + "loss": 2.1213, + "step": 9271 + }, + { + "epoch": 1.0817874227044686, + "grad_norm": 1.0278830528259277, + "learning_rate": 0.0002744865236714065, + "loss": 1.9692, + "step": 9272 + }, + { + "epoch": 1.0819040952047603, + "grad_norm": 1.1672383546829224, + "learning_rate": 0.0002744780491350155, + "loss": 2.0495, + "step": 9273 + }, + { + "epoch": 1.082020767705052, + "grad_norm": 1.075474739074707, + "learning_rate": 0.00027446957332372406, + "loss": 2.1683, + "step": 9274 + }, + { + "epoch": 1.0821374402053436, + "grad_norm": 1.5442161560058594, + "learning_rate": 0.0002744610962376203, + "loss": 2.0804, + "step": 9275 + }, + { + "epoch": 1.0822541127056353, + "grad_norm": 1.1668349504470825, + "learning_rate": 0.0002744526178767918, + "loss": 2.0064, + "step": 9276 + }, + { + "epoch": 1.082370785205927, + "grad_norm": 1.380982756614685, + "learning_rate": 0.00027444413824132663, + "loss": 2.0131, + "step": 9277 + }, + { + "epoch": 1.0824874577062187, + "grad_norm": 1.1289968490600586, + "learning_rate": 0.00027443565733131266, + "loss": 2.1995, + "step": 9278 + }, + { + "epoch": 1.0826041302065104, + "grad_norm": 1.3285531997680664, + "learning_rate": 0.0002744271751468378, + "loss": 2.1573, + "step": 9279 + }, + { + "epoch": 1.082720802706802, + "grad_norm": 1.2997835874557495, + "learning_rate": 0.00027441869168799, + "loss": 2.2139, + "step": 9280 + }, + { + "epoch": 1.0828374752070937, + "grad_norm": 1.2776046991348267, + "learning_rate": 0.00027441020695485717, + "loss": 2.0587, + "step": 9281 + }, + { + "epoch": 1.0829541477073854, + "grad_norm": 1.0858681201934814, + "learning_rate": 0.00027440172094752734, + "loss": 2.0468, + "step": 9282 + }, + { + "epoch": 1.083070820207677, + "grad_norm": 1.3684496879577637, + "learning_rate": 0.0002743932336660884, + "loss": 2.1559, + "step": 9283 + }, + { + "epoch": 1.0831874927079688, + "grad_norm": 1.0148625373840332, + "learning_rate": 0.0002743847451106284, + "loss": 1.911, + "step": 9284 + }, + { + "epoch": 1.0833041652082605, + "grad_norm": 1.201215386390686, + "learning_rate": 0.0002743762552812353, + "loss": 2.053, + "step": 9285 + }, + { + "epoch": 1.0834208377085521, + "grad_norm": 1.2151210308074951, + "learning_rate": 0.0002743677641779971, + "loss": 1.9322, + "step": 9286 + }, + { + "epoch": 1.0835375102088438, + "grad_norm": 1.0595473051071167, + "learning_rate": 0.00027435927180100185, + "loss": 1.9988, + "step": 9287 + }, + { + "epoch": 1.0836541827091355, + "grad_norm": 1.2647051811218262, + "learning_rate": 0.00027435077815033765, + "loss": 2.138, + "step": 9288 + }, + { + "epoch": 1.0837708552094272, + "grad_norm": 1.272246241569519, + "learning_rate": 0.00027434228322609244, + "loss": 2.1369, + "step": 9289 + }, + { + "epoch": 1.0838875277097189, + "grad_norm": 1.3347810506820679, + "learning_rate": 0.0002743337870283544, + "loss": 2.07, + "step": 9290 + }, + { + "epoch": 1.0840042002100105, + "grad_norm": 1.1339551210403442, + "learning_rate": 0.00027432528955721156, + "loss": 2.1344, + "step": 9291 + }, + { + "epoch": 1.0841208727103022, + "grad_norm": 1.3168151378631592, + "learning_rate": 0.00027431679081275196, + "loss": 2.0957, + "step": 9292 + }, + { + "epoch": 1.084237545210594, + "grad_norm": 1.2686336040496826, + "learning_rate": 0.00027430829079506377, + "loss": 2.1409, + "step": 9293 + }, + { + "epoch": 1.0843542177108856, + "grad_norm": 1.204972267150879, + "learning_rate": 0.0002742997895042351, + "loss": 2.1616, + "step": 9294 + }, + { + "epoch": 1.0844708902111773, + "grad_norm": 1.221034288406372, + "learning_rate": 0.00027429128694035404, + "loss": 2.1652, + "step": 9295 + }, + { + "epoch": 1.084587562711469, + "grad_norm": 1.332228660583496, + "learning_rate": 0.0002742827831035088, + "loss": 2.0344, + "step": 9296 + }, + { + "epoch": 1.0847042352117606, + "grad_norm": 1.1711691617965698, + "learning_rate": 0.00027427427799378746, + "loss": 2.0921, + "step": 9297 + }, + { + "epoch": 1.0848209077120523, + "grad_norm": 1.2269469499588013, + "learning_rate": 0.0002742657716112783, + "loss": 1.9381, + "step": 9298 + }, + { + "epoch": 1.084937580212344, + "grad_norm": 1.1982911825180054, + "learning_rate": 0.0002742572639560694, + "loss": 2.1442, + "step": 9299 + }, + { + "epoch": 1.0850542527126357, + "grad_norm": 1.1733388900756836, + "learning_rate": 0.000274248755028249, + "loss": 1.9522, + "step": 9300 + }, + { + "epoch": 1.0851709252129274, + "grad_norm": 1.2937228679656982, + "learning_rate": 0.0002742402448279053, + "loss": 2.253, + "step": 9301 + }, + { + "epoch": 1.085287597713219, + "grad_norm": 1.316644549369812, + "learning_rate": 0.00027423173335512657, + "loss": 2.1053, + "step": 9302 + }, + { + "epoch": 1.0854042702135107, + "grad_norm": 1.4868671894073486, + "learning_rate": 0.00027422322061000095, + "loss": 2.3185, + "step": 9303 + }, + { + "epoch": 1.0855209427138024, + "grad_norm": 1.4360547065734863, + "learning_rate": 0.00027421470659261683, + "loss": 2.2479, + "step": 9304 + }, + { + "epoch": 1.085637615214094, + "grad_norm": 1.271989107131958, + "learning_rate": 0.0002742061913030624, + "loss": 1.9566, + "step": 9305 + }, + { + "epoch": 1.0857542877143858, + "grad_norm": 1.2564327716827393, + "learning_rate": 0.0002741976747414259, + "loss": 1.9359, + "step": 9306 + }, + { + "epoch": 1.0858709602146774, + "grad_norm": 1.1049844026565552, + "learning_rate": 0.00027418915690779555, + "loss": 1.9676, + "step": 9307 + }, + { + "epoch": 1.0859876327149691, + "grad_norm": 1.4046356678009033, + "learning_rate": 0.0002741806378022599, + "loss": 2.0145, + "step": 9308 + }, + { + "epoch": 1.0861043052152608, + "grad_norm": 1.2193827629089355, + "learning_rate": 0.0002741721174249071, + "loss": 2.2192, + "step": 9309 + }, + { + "epoch": 1.0862209777155525, + "grad_norm": 1.1333039999008179, + "learning_rate": 0.0002741635957758255, + "loss": 2.0535, + "step": 9310 + }, + { + "epoch": 1.0863376502158442, + "grad_norm": 1.2315549850463867, + "learning_rate": 0.00027415507285510345, + "loss": 2.2352, + "step": 9311 + }, + { + "epoch": 1.0864543227161358, + "grad_norm": 1.4062620401382446, + "learning_rate": 0.00027414654866282927, + "loss": 2.0342, + "step": 9312 + }, + { + "epoch": 1.0865709952164275, + "grad_norm": 1.426464557647705, + "learning_rate": 0.0002741380231990914, + "loss": 2.1215, + "step": 9313 + }, + { + "epoch": 1.0866876677167192, + "grad_norm": 1.0712597370147705, + "learning_rate": 0.0002741294964639781, + "loss": 2.0218, + "step": 9314 + }, + { + "epoch": 1.086804340217011, + "grad_norm": 1.1136319637298584, + "learning_rate": 0.00027412096845757795, + "loss": 2.0283, + "step": 9315 + }, + { + "epoch": 1.0869210127173026, + "grad_norm": 1.1049823760986328, + "learning_rate": 0.00027411243917997924, + "loss": 2.0157, + "step": 9316 + }, + { + "epoch": 1.0870376852175943, + "grad_norm": 1.0332460403442383, + "learning_rate": 0.00027410390863127036, + "loss": 1.9072, + "step": 9317 + }, + { + "epoch": 1.087154357717886, + "grad_norm": 1.1944077014923096, + "learning_rate": 0.0002740953768115399, + "loss": 2.0673, + "step": 9318 + }, + { + "epoch": 1.0872710302181776, + "grad_norm": 1.2577471733093262, + "learning_rate": 0.0002740868437208761, + "loss": 1.9814, + "step": 9319 + }, + { + "epoch": 1.0873877027184693, + "grad_norm": 1.3698046207427979, + "learning_rate": 0.0002740783093593675, + "loss": 2.1026, + "step": 9320 + }, + { + "epoch": 1.087504375218761, + "grad_norm": 1.0835788249969482, + "learning_rate": 0.0002740697737271027, + "loss": 2.0756, + "step": 9321 + }, + { + "epoch": 1.0876210477190527, + "grad_norm": 1.2777167558670044, + "learning_rate": 0.00027406123682417017, + "loss": 2.1112, + "step": 9322 + }, + { + "epoch": 1.0877377202193443, + "grad_norm": 1.1304547786712646, + "learning_rate": 0.00027405269865065816, + "loss": 2.0744, + "step": 9323 + }, + { + "epoch": 1.087854392719636, + "grad_norm": 1.2030346393585205, + "learning_rate": 0.00027404415920665543, + "loss": 2.0604, + "step": 9324 + }, + { + "epoch": 1.0879710652199277, + "grad_norm": 1.3779184818267822, + "learning_rate": 0.0002740356184922504, + "loss": 1.9987, + "step": 9325 + }, + { + "epoch": 1.0880877377202194, + "grad_norm": 1.2660824060440063, + "learning_rate": 0.0002740270765075317, + "loss": 2.1108, + "step": 9326 + }, + { + "epoch": 1.088204410220511, + "grad_norm": 1.110802173614502, + "learning_rate": 0.00027401853325258776, + "loss": 2.1704, + "step": 9327 + }, + { + "epoch": 1.0883210827208027, + "grad_norm": 1.1566698551177979, + "learning_rate": 0.0002740099887275073, + "loss": 2.2331, + "step": 9328 + }, + { + "epoch": 1.0884377552210944, + "grad_norm": 1.2753061056137085, + "learning_rate": 0.00027400144293237877, + "loss": 2.0337, + "step": 9329 + }, + { + "epoch": 1.088554427721386, + "grad_norm": 1.0670663118362427, + "learning_rate": 0.00027399289586729075, + "loss": 1.8866, + "step": 9330 + }, + { + "epoch": 1.0886711002216778, + "grad_norm": 1.1699252128601074, + "learning_rate": 0.000273984347532332, + "loss": 2.1428, + "step": 9331 + }, + { + "epoch": 1.0887877727219695, + "grad_norm": 1.2044696807861328, + "learning_rate": 0.000273975797927591, + "loss": 2.0027, + "step": 9332 + }, + { + "epoch": 1.0889044452222612, + "grad_norm": 1.2366106510162354, + "learning_rate": 0.00027396724705315643, + "loss": 2.0624, + "step": 9333 + }, + { + "epoch": 1.0890211177225528, + "grad_norm": 1.228948712348938, + "learning_rate": 0.00027395869490911694, + "loss": 2.1684, + "step": 9334 + }, + { + "epoch": 1.0891377902228445, + "grad_norm": 1.1924843788146973, + "learning_rate": 0.00027395014149556116, + "loss": 2.037, + "step": 9335 + }, + { + "epoch": 1.0892544627231362, + "grad_norm": 1.102662205696106, + "learning_rate": 0.00027394158681257784, + "loss": 2.0762, + "step": 9336 + }, + { + "epoch": 1.0893711352234279, + "grad_norm": 1.1731386184692383, + "learning_rate": 0.00027393303086025556, + "loss": 2.1084, + "step": 9337 + }, + { + "epoch": 1.0894878077237196, + "grad_norm": 1.114320993423462, + "learning_rate": 0.00027392447363868306, + "loss": 2.0756, + "step": 9338 + }, + { + "epoch": 1.0896044802240112, + "grad_norm": 1.1156225204467773, + "learning_rate": 0.0002739159151479491, + "loss": 1.906, + "step": 9339 + }, + { + "epoch": 1.089721152724303, + "grad_norm": 1.272825002670288, + "learning_rate": 0.00027390735538814235, + "loss": 2.1666, + "step": 9340 + }, + { + "epoch": 1.0898378252245946, + "grad_norm": 1.0476069450378418, + "learning_rate": 0.0002738987943593516, + "loss": 2.1643, + "step": 9341 + }, + { + "epoch": 1.0899544977248863, + "grad_norm": 1.039000153541565, + "learning_rate": 0.0002738902320616655, + "loss": 1.9183, + "step": 9342 + }, + { + "epoch": 1.090071170225178, + "grad_norm": 1.0761175155639648, + "learning_rate": 0.00027388166849517297, + "loss": 2.1458, + "step": 9343 + }, + { + "epoch": 1.0901878427254696, + "grad_norm": 1.186829686164856, + "learning_rate": 0.00027387310365996256, + "loss": 2.1005, + "step": 9344 + }, + { + "epoch": 1.0903045152257613, + "grad_norm": 1.3386694192886353, + "learning_rate": 0.0002738645375561233, + "loss": 2.1122, + "step": 9345 + }, + { + "epoch": 1.090421187726053, + "grad_norm": 1.312024712562561, + "learning_rate": 0.0002738559701837438, + "loss": 2.0973, + "step": 9346 + }, + { + "epoch": 1.0905378602263447, + "grad_norm": 1.2377150058746338, + "learning_rate": 0.00027384740154291315, + "loss": 2.2147, + "step": 9347 + }, + { + "epoch": 1.0906545327266364, + "grad_norm": 1.030424952507019, + "learning_rate": 0.00027383883163371983, + "loss": 2.1104, + "step": 9348 + }, + { + "epoch": 1.090771205226928, + "grad_norm": 1.104631781578064, + "learning_rate": 0.00027383026045625295, + "loss": 1.8778, + "step": 9349 + }, + { + "epoch": 1.0908878777272197, + "grad_norm": 1.3067842721939087, + "learning_rate": 0.0002738216880106012, + "loss": 2.1522, + "step": 9350 + }, + { + "epoch": 1.0910045502275114, + "grad_norm": 1.2189576625823975, + "learning_rate": 0.00027381311429685353, + "loss": 1.9045, + "step": 9351 + }, + { + "epoch": 1.091121222727803, + "grad_norm": 1.26433527469635, + "learning_rate": 0.0002738045393150988, + "loss": 2.1391, + "step": 9352 + }, + { + "epoch": 1.0912378952280948, + "grad_norm": 1.344917893409729, + "learning_rate": 0.000273795963065426, + "loss": 1.9544, + "step": 9353 + }, + { + "epoch": 1.0913545677283865, + "grad_norm": 1.2641197443008423, + "learning_rate": 0.0002737873855479239, + "loss": 2.033, + "step": 9354 + }, + { + "epoch": 1.0914712402286781, + "grad_norm": 1.2253217697143555, + "learning_rate": 0.0002737788067626814, + "loss": 1.9974, + "step": 9355 + }, + { + "epoch": 1.0915879127289698, + "grad_norm": 1.2913838624954224, + "learning_rate": 0.00027377022670978766, + "loss": 2.0661, + "step": 9356 + }, + { + "epoch": 1.0917045852292615, + "grad_norm": 1.1678638458251953, + "learning_rate": 0.0002737616453893314, + "loss": 1.9034, + "step": 9357 + }, + { + "epoch": 1.0918212577295532, + "grad_norm": 1.1088312864303589, + "learning_rate": 0.00027375306280140167, + "loss": 2.0663, + "step": 9358 + }, + { + "epoch": 1.0919379302298449, + "grad_norm": 1.407297968864441, + "learning_rate": 0.00027374447894608746, + "loss": 2.136, + "step": 9359 + }, + { + "epoch": 1.0920546027301365, + "grad_norm": 1.2882040739059448, + "learning_rate": 0.00027373589382347774, + "loss": 2.0954, + "step": 9360 + }, + { + "epoch": 1.0921712752304282, + "grad_norm": 1.3192304372787476, + "learning_rate": 0.00027372730743366146, + "loss": 1.9548, + "step": 9361 + }, + { + "epoch": 1.09228794773072, + "grad_norm": 1.5108163356781006, + "learning_rate": 0.0002737187197767277, + "loss": 2.181, + "step": 9362 + }, + { + "epoch": 1.0924046202310116, + "grad_norm": 1.1648602485656738, + "learning_rate": 0.0002737101308527655, + "loss": 1.9405, + "step": 9363 + }, + { + "epoch": 1.0925212927313033, + "grad_norm": 1.2880988121032715, + "learning_rate": 0.00027370154066186386, + "loss": 2.1061, + "step": 9364 + }, + { + "epoch": 1.092637965231595, + "grad_norm": 1.106539249420166, + "learning_rate": 0.0002736929492041118, + "loss": 2.0305, + "step": 9365 + }, + { + "epoch": 1.0927546377318866, + "grad_norm": 1.0936692953109741, + "learning_rate": 0.0002736843564795985, + "loss": 2.0045, + "step": 9366 + }, + { + "epoch": 1.0928713102321783, + "grad_norm": 1.166637659072876, + "learning_rate": 0.00027367576248841293, + "loss": 2.0916, + "step": 9367 + }, + { + "epoch": 1.09298798273247, + "grad_norm": 1.1080912351608276, + "learning_rate": 0.00027366716723064427, + "loss": 1.9942, + "step": 9368 + }, + { + "epoch": 1.0931046552327617, + "grad_norm": 1.204696774482727, + "learning_rate": 0.00027365857070638154, + "loss": 1.9044, + "step": 9369 + }, + { + "epoch": 1.0932213277330534, + "grad_norm": 1.303788185119629, + "learning_rate": 0.0002736499729157139, + "loss": 2.1278, + "step": 9370 + }, + { + "epoch": 1.093338000233345, + "grad_norm": 1.0343326330184937, + "learning_rate": 0.0002736413738587305, + "loss": 1.9158, + "step": 9371 + }, + { + "epoch": 1.0934546727336367, + "grad_norm": 1.124210000038147, + "learning_rate": 0.0002736327735355204, + "loss": 2.0658, + "step": 9372 + }, + { + "epoch": 1.0935713452339284, + "grad_norm": 1.1541519165039062, + "learning_rate": 0.0002736241719461729, + "loss": 2.0053, + "step": 9373 + }, + { + "epoch": 1.09368801773422, + "grad_norm": 1.1800364255905151, + "learning_rate": 0.00027361556909077705, + "loss": 2.0588, + "step": 9374 + }, + { + "epoch": 1.0938046902345118, + "grad_norm": 1.1445355415344238, + "learning_rate": 0.0002736069649694221, + "loss": 1.9771, + "step": 9375 + }, + { + "epoch": 1.0939213627348034, + "grad_norm": 1.305708646774292, + "learning_rate": 0.00027359835958219725, + "loss": 2.0951, + "step": 9376 + }, + { + "epoch": 1.0940380352350951, + "grad_norm": 1.1455620527267456, + "learning_rate": 0.0002735897529291917, + "loss": 2.0215, + "step": 9377 + }, + { + "epoch": 1.0941547077353868, + "grad_norm": 1.2927223443984985, + "learning_rate": 0.00027358114501049456, + "loss": 2.0876, + "step": 9378 + }, + { + "epoch": 1.0942713802356785, + "grad_norm": 1.1478662490844727, + "learning_rate": 0.0002735725358261953, + "loss": 2.0188, + "step": 9379 + }, + { + "epoch": 1.0943880527359702, + "grad_norm": 1.202154278755188, + "learning_rate": 0.00027356392537638294, + "loss": 1.919, + "step": 9380 + }, + { + "epoch": 1.0945047252362619, + "grad_norm": 1.1267924308776855, + "learning_rate": 0.00027355531366114695, + "loss": 1.9543, + "step": 9381 + }, + { + "epoch": 1.0946213977365535, + "grad_norm": 1.4510258436203003, + "learning_rate": 0.00027354670068057644, + "loss": 1.9889, + "step": 9382 + }, + { + "epoch": 1.0947380702368452, + "grad_norm": 1.044642448425293, + "learning_rate": 0.0002735380864347608, + "loss": 1.9349, + "step": 9383 + }, + { + "epoch": 1.094854742737137, + "grad_norm": 1.1410776376724243, + "learning_rate": 0.0002735294709237892, + "loss": 1.9223, + "step": 9384 + }, + { + "epoch": 1.0949714152374286, + "grad_norm": 1.2975244522094727, + "learning_rate": 0.00027352085414775114, + "loss": 2.2392, + "step": 9385 + }, + { + "epoch": 1.0950880877377203, + "grad_norm": 1.325164556503296, + "learning_rate": 0.0002735122361067359, + "loss": 2.0227, + "step": 9386 + }, + { + "epoch": 1.095204760238012, + "grad_norm": 1.2552756071090698, + "learning_rate": 0.0002735036168008327, + "loss": 2.1105, + "step": 9387 + }, + { + "epoch": 1.0953214327383036, + "grad_norm": 1.2039252519607544, + "learning_rate": 0.000273494996230131, + "loss": 2.0904, + "step": 9388 + }, + { + "epoch": 1.0954381052385953, + "grad_norm": 1.033353567123413, + "learning_rate": 0.00027348637439472017, + "loss": 1.9966, + "step": 9389 + }, + { + "epoch": 1.095554777738887, + "grad_norm": 1.2533904314041138, + "learning_rate": 0.00027347775129468954, + "loss": 2.0425, + "step": 9390 + }, + { + "epoch": 1.0956714502391787, + "grad_norm": 1.389890193939209, + "learning_rate": 0.00027346912693012855, + "loss": 1.9655, + "step": 9391 + }, + { + "epoch": 1.0957881227394703, + "grad_norm": 1.1307008266448975, + "learning_rate": 0.00027346050130112653, + "loss": 2.0696, + "step": 9392 + }, + { + "epoch": 1.095904795239762, + "grad_norm": 1.4222792387008667, + "learning_rate": 0.00027345187440777304, + "loss": 2.142, + "step": 9393 + }, + { + "epoch": 1.0960214677400537, + "grad_norm": 0.9509802460670471, + "learning_rate": 0.0002734432462501575, + "loss": 1.9824, + "step": 9394 + }, + { + "epoch": 1.0961381402403454, + "grad_norm": 1.266930341720581, + "learning_rate": 0.00027343461682836917, + "loss": 1.8651, + "step": 9395 + }, + { + "epoch": 1.096254812740637, + "grad_norm": 1.284184455871582, + "learning_rate": 0.0002734259861424977, + "loss": 2.1746, + "step": 9396 + }, + { + "epoch": 1.0963714852409288, + "grad_norm": 1.1464365720748901, + "learning_rate": 0.00027341735419263245, + "loss": 2.3258, + "step": 9397 + }, + { + "epoch": 1.0964881577412204, + "grad_norm": 1.3090358972549438, + "learning_rate": 0.000273408720978863, + "loss": 2.0023, + "step": 9398 + }, + { + "epoch": 1.0966048302415121, + "grad_norm": 1.109755039215088, + "learning_rate": 0.0002734000865012788, + "loss": 2.053, + "step": 9399 + }, + { + "epoch": 1.0967215027418038, + "grad_norm": 1.148348331451416, + "learning_rate": 0.0002733914507599693, + "loss": 2.1402, + "step": 9400 + }, + { + "epoch": 1.0968381752420955, + "grad_norm": 1.2166134119033813, + "learning_rate": 0.00027338281375502416, + "loss": 2.0962, + "step": 9401 + }, + { + "epoch": 1.0969548477423872, + "grad_norm": 1.218364953994751, + "learning_rate": 0.0002733741754865328, + "loss": 2.0113, + "step": 9402 + }, + { + "epoch": 1.0970715202426788, + "grad_norm": 1.399631381034851, + "learning_rate": 0.0002733655359545849, + "loss": 2.1216, + "step": 9403 + }, + { + "epoch": 1.0971881927429705, + "grad_norm": 1.2380608320236206, + "learning_rate": 0.0002733568951592699, + "loss": 2.0548, + "step": 9404 + }, + { + "epoch": 1.0973048652432622, + "grad_norm": 1.1385014057159424, + "learning_rate": 0.00027334825310067743, + "loss": 2.031, + "step": 9405 + }, + { + "epoch": 1.0974215377435539, + "grad_norm": 1.276147723197937, + "learning_rate": 0.0002733396097788971, + "loss": 2.0529, + "step": 9406 + }, + { + "epoch": 1.0975382102438456, + "grad_norm": 1.1235965490341187, + "learning_rate": 0.0002733309651940185, + "loss": 2.1213, + "step": 9407 + }, + { + "epoch": 1.0976548827441373, + "grad_norm": 1.312536597251892, + "learning_rate": 0.0002733223193461312, + "loss": 2.0238, + "step": 9408 + }, + { + "epoch": 1.097771555244429, + "grad_norm": 1.2968465089797974, + "learning_rate": 0.0002733136722353249, + "loss": 2.1098, + "step": 9409 + }, + { + "epoch": 1.0978882277447206, + "grad_norm": 1.5808018445968628, + "learning_rate": 0.00027330502386168915, + "loss": 2.0874, + "step": 9410 + }, + { + "epoch": 1.0980049002450123, + "grad_norm": 1.2288923263549805, + "learning_rate": 0.0002732963742253137, + "loss": 2.1357, + "step": 9411 + }, + { + "epoch": 1.098121572745304, + "grad_norm": 1.2204713821411133, + "learning_rate": 0.00027328772332628823, + "loss": 1.9875, + "step": 9412 + }, + { + "epoch": 1.0982382452455957, + "grad_norm": 1.1050786972045898, + "learning_rate": 0.00027327907116470236, + "loss": 1.8844, + "step": 9413 + }, + { + "epoch": 1.0983549177458873, + "grad_norm": 1.109100103378296, + "learning_rate": 0.0002732704177406458, + "loss": 1.9502, + "step": 9414 + }, + { + "epoch": 1.098471590246179, + "grad_norm": 1.3034703731536865, + "learning_rate": 0.0002732617630542083, + "loss": 2.1653, + "step": 9415 + }, + { + "epoch": 1.0985882627464707, + "grad_norm": 1.2017830610275269, + "learning_rate": 0.00027325310710547954, + "loss": 2.0564, + "step": 9416 + }, + { + "epoch": 1.0987049352467624, + "grad_norm": 1.193279504776001, + "learning_rate": 0.00027324444989454926, + "loss": 2.1143, + "step": 9417 + }, + { + "epoch": 1.098821607747054, + "grad_norm": 1.052464485168457, + "learning_rate": 0.0002732357914215072, + "loss": 1.8406, + "step": 9418 + }, + { + "epoch": 1.0989382802473457, + "grad_norm": 1.2581509351730347, + "learning_rate": 0.0002732271316864432, + "loss": 1.8289, + "step": 9419 + }, + { + "epoch": 1.0990549527476374, + "grad_norm": 1.2326833009719849, + "learning_rate": 0.0002732184706894469, + "loss": 2.0662, + "step": 9420 + }, + { + "epoch": 1.099171625247929, + "grad_norm": 1.1496878862380981, + "learning_rate": 0.00027320980843060816, + "loss": 2.1053, + "step": 9421 + }, + { + "epoch": 1.0992882977482208, + "grad_norm": 1.4983607530593872, + "learning_rate": 0.0002732011449100168, + "loss": 2.2643, + "step": 9422 + }, + { + "epoch": 1.0994049702485125, + "grad_norm": 1.3479522466659546, + "learning_rate": 0.0002731924801277626, + "loss": 2.0817, + "step": 9423 + }, + { + "epoch": 1.0995216427488042, + "grad_norm": 1.3968791961669922, + "learning_rate": 0.00027318381408393544, + "loss": 2.1587, + "step": 9424 + }, + { + "epoch": 1.0996383152490958, + "grad_norm": 1.1030641794204712, + "learning_rate": 0.0002731751467786251, + "loss": 2.1739, + "step": 9425 + }, + { + "epoch": 1.0997549877493875, + "grad_norm": 1.113232135772705, + "learning_rate": 0.00027316647821192143, + "loss": 1.9997, + "step": 9426 + }, + { + "epoch": 1.0998716602496792, + "grad_norm": 1.0747950077056885, + "learning_rate": 0.0002731578083839143, + "loss": 2.0284, + "step": 9427 + }, + { + "epoch": 1.0999883327499709, + "grad_norm": 1.070237636566162, + "learning_rate": 0.00027314913729469366, + "loss": 2.0337, + "step": 9428 + }, + { + "epoch": 1.1001050052502626, + "grad_norm": 1.010141372680664, + "learning_rate": 0.0002731404649443493, + "loss": 1.8746, + "step": 9429 + }, + { + "epoch": 1.1002216777505542, + "grad_norm": 1.3636751174926758, + "learning_rate": 0.00027313179133297123, + "loss": 2.0318, + "step": 9430 + }, + { + "epoch": 1.100338350250846, + "grad_norm": 1.188959002494812, + "learning_rate": 0.0002731231164606493, + "loss": 1.9687, + "step": 9431 + }, + { + "epoch": 1.1004550227511376, + "grad_norm": 1.0263779163360596, + "learning_rate": 0.00027311444032747346, + "loss": 1.9444, + "step": 9432 + }, + { + "epoch": 1.1005716952514293, + "grad_norm": 1.1685564517974854, + "learning_rate": 0.0002731057629335336, + "loss": 2.1302, + "step": 9433 + }, + { + "epoch": 1.100688367751721, + "grad_norm": 1.3347052335739136, + "learning_rate": 0.0002730970842789198, + "loss": 2.1127, + "step": 9434 + }, + { + "epoch": 1.1008050402520126, + "grad_norm": 1.1347357034683228, + "learning_rate": 0.0002730884043637219, + "loss": 2.1579, + "step": 9435 + }, + { + "epoch": 1.1009217127523043, + "grad_norm": 1.285332202911377, + "learning_rate": 0.00027307972318803, + "loss": 2.143, + "step": 9436 + }, + { + "epoch": 1.101038385252596, + "grad_norm": 1.2469111680984497, + "learning_rate": 0.000273071040751934, + "loss": 2.0327, + "step": 9437 + }, + { + "epoch": 1.1011550577528877, + "grad_norm": 1.0155411958694458, + "learning_rate": 0.00027306235705552395, + "loss": 2.0176, + "step": 9438 + }, + { + "epoch": 1.1012717302531794, + "grad_norm": 1.1273242235183716, + "learning_rate": 0.0002730536720988899, + "loss": 2.1771, + "step": 9439 + }, + { + "epoch": 1.101388402753471, + "grad_norm": 1.305579662322998, + "learning_rate": 0.00027304498588212177, + "loss": 2.1558, + "step": 9440 + }, + { + "epoch": 1.1015050752537627, + "grad_norm": 1.0088105201721191, + "learning_rate": 0.0002730362984053098, + "loss": 1.9782, + "step": 9441 + }, + { + "epoch": 1.1016217477540544, + "grad_norm": 1.1589090824127197, + "learning_rate": 0.0002730276096685439, + "loss": 2.1928, + "step": 9442 + }, + { + "epoch": 1.101738420254346, + "grad_norm": 1.3741521835327148, + "learning_rate": 0.00027301891967191423, + "loss": 2.2139, + "step": 9443 + }, + { + "epoch": 1.1018550927546378, + "grad_norm": 1.1455014944076538, + "learning_rate": 0.0002730102284155108, + "loss": 1.9338, + "step": 9444 + }, + { + "epoch": 1.1019717652549295, + "grad_norm": 1.3157881498336792, + "learning_rate": 0.0002730015358994238, + "loss": 1.8168, + "step": 9445 + }, + { + "epoch": 1.1020884377552211, + "grad_norm": 1.0905866622924805, + "learning_rate": 0.00027299284212374325, + "loss": 2.1053, + "step": 9446 + }, + { + "epoch": 1.1022051102555128, + "grad_norm": 1.1214457750320435, + "learning_rate": 0.00027298414708855936, + "loss": 2.1103, + "step": 9447 + }, + { + "epoch": 1.1023217827558045, + "grad_norm": 1.3779507875442505, + "learning_rate": 0.0002729754507939622, + "loss": 2.0669, + "step": 9448 + }, + { + "epoch": 1.1024384552560962, + "grad_norm": 1.0529378652572632, + "learning_rate": 0.000272966753240042, + "loss": 1.9684, + "step": 9449 + }, + { + "epoch": 1.1025551277563879, + "grad_norm": 1.208350419998169, + "learning_rate": 0.00027295805442688884, + "loss": 2.2266, + "step": 9450 + }, + { + "epoch": 1.1026718002566795, + "grad_norm": 1.0768959522247314, + "learning_rate": 0.000272949354354593, + "loss": 2.0081, + "step": 9451 + }, + { + "epoch": 1.1027884727569712, + "grad_norm": 1.1614242792129517, + "learning_rate": 0.0002729406530232446, + "loss": 1.9554, + "step": 9452 + }, + { + "epoch": 1.102905145257263, + "grad_norm": 1.188103437423706, + "learning_rate": 0.00027293195043293387, + "loss": 2.1513, + "step": 9453 + }, + { + "epoch": 1.1030218177575546, + "grad_norm": 1.3512078523635864, + "learning_rate": 0.00027292324658375103, + "loss": 2.1267, + "step": 9454 + }, + { + "epoch": 1.1031384902578463, + "grad_norm": 1.214401364326477, + "learning_rate": 0.00027291454147578627, + "loss": 2.0532, + "step": 9455 + }, + { + "epoch": 1.103255162758138, + "grad_norm": 0.9520347714424133, + "learning_rate": 0.00027290583510912984, + "loss": 1.9823, + "step": 9456 + }, + { + "epoch": 1.1033718352584296, + "grad_norm": 1.0424461364746094, + "learning_rate": 0.0002728971274838721, + "loss": 2.1897, + "step": 9457 + }, + { + "epoch": 1.1034885077587213, + "grad_norm": 1.1650031805038452, + "learning_rate": 0.00027288841860010325, + "loss": 2.184, + "step": 9458 + }, + { + "epoch": 1.103605180259013, + "grad_norm": 1.202614665031433, + "learning_rate": 0.00027287970845791355, + "loss": 1.9572, + "step": 9459 + }, + { + "epoch": 1.1037218527593047, + "grad_norm": 1.1378921270370483, + "learning_rate": 0.00027287099705739333, + "loss": 2.1076, + "step": 9460 + }, + { + "epoch": 1.1038385252595964, + "grad_norm": 1.1541748046875, + "learning_rate": 0.0002728622843986328, + "loss": 1.9755, + "step": 9461 + }, + { + "epoch": 1.103955197759888, + "grad_norm": 1.1271101236343384, + "learning_rate": 0.0002728535704817225, + "loss": 2.1535, + "step": 9462 + }, + { + "epoch": 1.1040718702601797, + "grad_norm": 1.2103214263916016, + "learning_rate": 0.00027284485530675257, + "loss": 1.9578, + "step": 9463 + }, + { + "epoch": 1.1041885427604714, + "grad_norm": 1.0653421878814697, + "learning_rate": 0.0002728361388738135, + "loss": 2.0871, + "step": 9464 + }, + { + "epoch": 1.104305215260763, + "grad_norm": 1.2166041135787964, + "learning_rate": 0.00027282742118299555, + "loss": 2.0696, + "step": 9465 + }, + { + "epoch": 1.1044218877610548, + "grad_norm": 1.1998540163040161, + "learning_rate": 0.00027281870223438905, + "loss": 2.1625, + "step": 9466 + }, + { + "epoch": 1.1045385602613464, + "grad_norm": 1.260169267654419, + "learning_rate": 0.0002728099820280846, + "loss": 1.9196, + "step": 9467 + }, + { + "epoch": 1.1046552327616381, + "grad_norm": 1.3334521055221558, + "learning_rate": 0.0002728012605641724, + "loss": 2.0928, + "step": 9468 + }, + { + "epoch": 1.1047719052619298, + "grad_norm": 1.2205456495285034, + "learning_rate": 0.0002727925378427429, + "loss": 2.2451, + "step": 9469 + }, + { + "epoch": 1.1048885777622215, + "grad_norm": 1.2322574853897095, + "learning_rate": 0.00027278381386388657, + "loss": 2.0956, + "step": 9470 + }, + { + "epoch": 1.1050052502625132, + "grad_norm": 1.0131317377090454, + "learning_rate": 0.00027277508862769384, + "loss": 1.9874, + "step": 9471 + }, + { + "epoch": 1.1051219227628049, + "grad_norm": 1.1432182788848877, + "learning_rate": 0.00027276636213425517, + "loss": 1.8967, + "step": 9472 + }, + { + "epoch": 1.1052385952630965, + "grad_norm": 1.135315179824829, + "learning_rate": 0.00027275763438366104, + "loss": 2.0626, + "step": 9473 + }, + { + "epoch": 1.1053552677633882, + "grad_norm": 1.034879207611084, + "learning_rate": 0.0002727489053760019, + "loss": 2.0486, + "step": 9474 + }, + { + "epoch": 1.10547194026368, + "grad_norm": 1.2345077991485596, + "learning_rate": 0.00027274017511136827, + "loss": 2.1738, + "step": 9475 + }, + { + "epoch": 1.1055886127639716, + "grad_norm": 1.1917273998260498, + "learning_rate": 0.00027273144358985063, + "loss": 2.1428, + "step": 9476 + }, + { + "epoch": 1.1057052852642633, + "grad_norm": 1.1877082586288452, + "learning_rate": 0.00027272271081153944, + "loss": 1.9384, + "step": 9477 + }, + { + "epoch": 1.105821957764555, + "grad_norm": 1.179397702217102, + "learning_rate": 0.0002727139767765254, + "loss": 2.0088, + "step": 9478 + }, + { + "epoch": 1.1059386302648466, + "grad_norm": 1.1883246898651123, + "learning_rate": 0.0002727052414848989, + "loss": 1.8012, + "step": 9479 + }, + { + "epoch": 1.1060553027651383, + "grad_norm": 1.2807027101516724, + "learning_rate": 0.0002726965049367505, + "loss": 2.0614, + "step": 9480 + }, + { + "epoch": 1.10617197526543, + "grad_norm": 1.2272145748138428, + "learning_rate": 0.0002726877671321709, + "loss": 2.1116, + "step": 9481 + }, + { + "epoch": 1.1062886477657217, + "grad_norm": 1.1470316648483276, + "learning_rate": 0.00027267902807125063, + "loss": 2.0777, + "step": 9482 + }, + { + "epoch": 1.1064053202660133, + "grad_norm": 1.2658120393753052, + "learning_rate": 0.0002726702877540802, + "loss": 2.0848, + "step": 9483 + }, + { + "epoch": 1.106521992766305, + "grad_norm": 1.290461778640747, + "learning_rate": 0.0002726615461807503, + "loss": 2.142, + "step": 9484 + }, + { + "epoch": 1.1066386652665967, + "grad_norm": 1.0796080827713013, + "learning_rate": 0.00027265280335135155, + "loss": 1.9385, + "step": 9485 + }, + { + "epoch": 1.1067553377668884, + "grad_norm": 1.2941999435424805, + "learning_rate": 0.00027264405926597456, + "loss": 2.1707, + "step": 9486 + }, + { + "epoch": 1.10687201026718, + "grad_norm": 1.2212209701538086, + "learning_rate": 0.00027263531392471, + "loss": 2.1553, + "step": 9487 + }, + { + "epoch": 1.1069886827674718, + "grad_norm": 1.0480059385299683, + "learning_rate": 0.0002726265673276485, + "loss": 2.1601, + "step": 9488 + }, + { + "epoch": 1.1071053552677634, + "grad_norm": 1.3497401475906372, + "learning_rate": 0.00027261781947488076, + "loss": 2.1998, + "step": 9489 + }, + { + "epoch": 1.1072220277680551, + "grad_norm": 1.1616915464401245, + "learning_rate": 0.0002726090703664975, + "loss": 2.2188, + "step": 9490 + }, + { + "epoch": 1.1073387002683468, + "grad_norm": 1.0390393733978271, + "learning_rate": 0.0002726003200025893, + "loss": 2.0239, + "step": 9491 + }, + { + "epoch": 1.1074553727686385, + "grad_norm": 1.2790207862854004, + "learning_rate": 0.0002725915683832471, + "loss": 2.1564, + "step": 9492 + }, + { + "epoch": 1.1075720452689302, + "grad_norm": 1.0970231294631958, + "learning_rate": 0.0002725828155085614, + "loss": 1.9725, + "step": 9493 + }, + { + "epoch": 1.1076887177692218, + "grad_norm": 1.1655380725860596, + "learning_rate": 0.00027257406137862303, + "loss": 2.0433, + "step": 9494 + }, + { + "epoch": 1.1078053902695135, + "grad_norm": 1.2848435640335083, + "learning_rate": 0.00027256530599352273, + "loss": 2.0769, + "step": 9495 + }, + { + "epoch": 1.1079220627698052, + "grad_norm": 1.1662037372589111, + "learning_rate": 0.0002725565493533513, + "loss": 2.0459, + "step": 9496 + }, + { + "epoch": 1.1080387352700969, + "grad_norm": 1.4507036209106445, + "learning_rate": 0.0002725477914581995, + "loss": 2.0844, + "step": 9497 + }, + { + "epoch": 1.1081554077703886, + "grad_norm": 1.1828505992889404, + "learning_rate": 0.00027253903230815813, + "loss": 1.9489, + "step": 9498 + }, + { + "epoch": 1.1082720802706802, + "grad_norm": 1.130625605583191, + "learning_rate": 0.00027253027190331794, + "loss": 2.0221, + "step": 9499 + }, + { + "epoch": 1.108388752770972, + "grad_norm": 1.1372264623641968, + "learning_rate": 0.00027252151024376983, + "loss": 1.8296, + "step": 9500 + }, + { + "epoch": 1.1085054252712636, + "grad_norm": 1.2164490222930908, + "learning_rate": 0.0002725127473296046, + "loss": 2.2271, + "step": 9501 + }, + { + "epoch": 1.1086220977715553, + "grad_norm": 1.2087407112121582, + "learning_rate": 0.00027250398316091304, + "loss": 2.0422, + "step": 9502 + }, + { + "epoch": 1.108738770271847, + "grad_norm": 1.331469178199768, + "learning_rate": 0.0002724952177377861, + "loss": 2.1156, + "step": 9503 + }, + { + "epoch": 1.1088554427721387, + "grad_norm": 1.193974494934082, + "learning_rate": 0.0002724864510603146, + "loss": 2.0751, + "step": 9504 + }, + { + "epoch": 1.1089721152724303, + "grad_norm": 1.3163397312164307, + "learning_rate": 0.00027247768312858937, + "loss": 2.1273, + "step": 9505 + }, + { + "epoch": 1.109088787772722, + "grad_norm": 1.221242904663086, + "learning_rate": 0.00027246891394270146, + "loss": 2.1663, + "step": 9506 + }, + { + "epoch": 1.1092054602730137, + "grad_norm": 1.2018502950668335, + "learning_rate": 0.0002724601435027416, + "loss": 2.1756, + "step": 9507 + }, + { + "epoch": 1.1093221327733054, + "grad_norm": 1.0985620021820068, + "learning_rate": 0.0002724513718088008, + "loss": 1.8892, + "step": 9508 + }, + { + "epoch": 1.109438805273597, + "grad_norm": 1.2874614000320435, + "learning_rate": 0.00027244259886097006, + "loss": 2.2121, + "step": 9509 + }, + { + "epoch": 1.1095554777738887, + "grad_norm": 1.2253642082214355, + "learning_rate": 0.0002724338246593402, + "loss": 2.0979, + "step": 9510 + }, + { + "epoch": 1.1096721502741804, + "grad_norm": 1.172404408454895, + "learning_rate": 0.0002724250492040023, + "loss": 2.0154, + "step": 9511 + }, + { + "epoch": 1.109788822774472, + "grad_norm": 1.1238443851470947, + "learning_rate": 0.00027241627249504724, + "loss": 2.1182, + "step": 9512 + }, + { + "epoch": 1.1099054952747638, + "grad_norm": 1.1956653594970703, + "learning_rate": 0.000272407494532566, + "loss": 2.0849, + "step": 9513 + }, + { + "epoch": 1.1100221677750555, + "grad_norm": 1.2173490524291992, + "learning_rate": 0.00027239871531664964, + "loss": 2.0218, + "step": 9514 + }, + { + "epoch": 1.1101388402753471, + "grad_norm": 1.0315494537353516, + "learning_rate": 0.0002723899348473892, + "loss": 2.0483, + "step": 9515 + }, + { + "epoch": 1.1102555127756388, + "grad_norm": 1.4010947942733765, + "learning_rate": 0.00027238115312487566, + "loss": 2.2254, + "step": 9516 + }, + { + "epoch": 1.1103721852759305, + "grad_norm": 1.057565450668335, + "learning_rate": 0.00027237237014920003, + "loss": 1.9922, + "step": 9517 + }, + { + "epoch": 1.1104888577762222, + "grad_norm": 1.0641676187515259, + "learning_rate": 0.00027236358592045345, + "loss": 2.0031, + "step": 9518 + }, + { + "epoch": 1.1106055302765139, + "grad_norm": 1.2187505960464478, + "learning_rate": 0.00027235480043872684, + "loss": 2.0995, + "step": 9519 + }, + { + "epoch": 1.1107222027768056, + "grad_norm": 1.4618386030197144, + "learning_rate": 0.0002723460137041114, + "loss": 2.1978, + "step": 9520 + }, + { + "epoch": 1.1108388752770972, + "grad_norm": 1.0482466220855713, + "learning_rate": 0.0002723372257166982, + "loss": 1.9671, + "step": 9521 + }, + { + "epoch": 1.110955547777389, + "grad_norm": 1.259421706199646, + "learning_rate": 0.0002723284364765783, + "loss": 1.9324, + "step": 9522 + }, + { + "epoch": 1.1110722202776806, + "grad_norm": 1.104828953742981, + "learning_rate": 0.0002723196459838429, + "loss": 2.0075, + "step": 9523 + }, + { + "epoch": 1.1111888927779723, + "grad_norm": 1.2022008895874023, + "learning_rate": 0.00027231085423858303, + "loss": 2.1273, + "step": 9524 + }, + { + "epoch": 1.111305565278264, + "grad_norm": 1.0804094076156616, + "learning_rate": 0.00027230206124088995, + "loss": 1.8553, + "step": 9525 + }, + { + "epoch": 1.1114222377785556, + "grad_norm": 1.1916043758392334, + "learning_rate": 0.00027229326699085467, + "loss": 2.1323, + "step": 9526 + }, + { + "epoch": 1.1115389102788473, + "grad_norm": 1.148671269416809, + "learning_rate": 0.0002722844714885684, + "loss": 1.8634, + "step": 9527 + }, + { + "epoch": 1.111655582779139, + "grad_norm": 1.0664409399032593, + "learning_rate": 0.00027227567473412246, + "loss": 2.0155, + "step": 9528 + }, + { + "epoch": 1.1117722552794307, + "grad_norm": 1.160117745399475, + "learning_rate": 0.00027226687672760795, + "loss": 2.1008, + "step": 9529 + }, + { + "epoch": 1.1118889277797224, + "grad_norm": 1.1611489057540894, + "learning_rate": 0.000272258077469116, + "loss": 2.1974, + "step": 9530 + }, + { + "epoch": 1.112005600280014, + "grad_norm": 1.298043131828308, + "learning_rate": 0.00027224927695873797, + "loss": 2.1255, + "step": 9531 + }, + { + "epoch": 1.1121222727803057, + "grad_norm": 1.2241904735565186, + "learning_rate": 0.000272240475196565, + "loss": 2.112, + "step": 9532 + }, + { + "epoch": 1.1122389452805974, + "grad_norm": 1.3407636880874634, + "learning_rate": 0.0002722316721826883, + "loss": 1.9636, + "step": 9533 + }, + { + "epoch": 1.112355617780889, + "grad_norm": 1.3392572402954102, + "learning_rate": 0.00027222286791719923, + "loss": 2.0775, + "step": 9534 + }, + { + "epoch": 1.1124722902811808, + "grad_norm": 1.4452239274978638, + "learning_rate": 0.00027221406240018913, + "loss": 2.0279, + "step": 9535 + }, + { + "epoch": 1.1125889627814725, + "grad_norm": 1.0407930612564087, + "learning_rate": 0.0002722052556317491, + "loss": 2.0504, + "step": 9536 + }, + { + "epoch": 1.1127056352817641, + "grad_norm": 1.2623990774154663, + "learning_rate": 0.0002721964476119705, + "loss": 2.007, + "step": 9537 + }, + { + "epoch": 1.1128223077820558, + "grad_norm": 1.10405433177948, + "learning_rate": 0.0002721876383409447, + "loss": 2.0594, + "step": 9538 + }, + { + "epoch": 1.1129389802823475, + "grad_norm": 1.2437491416931152, + "learning_rate": 0.00027217882781876296, + "loss": 1.9332, + "step": 9539 + }, + { + "epoch": 1.1130556527826392, + "grad_norm": 1.0316307544708252, + "learning_rate": 0.00027217001604551666, + "loss": 1.9617, + "step": 9540 + }, + { + "epoch": 1.1131723252829309, + "grad_norm": 1.1228755712509155, + "learning_rate": 0.00027216120302129713, + "loss": 2.1236, + "step": 9541 + }, + { + "epoch": 1.1132889977832225, + "grad_norm": 1.1963136196136475, + "learning_rate": 0.0002721523887461957, + "loss": 1.9737, + "step": 9542 + }, + { + "epoch": 1.1134056702835142, + "grad_norm": 1.2562259435653687, + "learning_rate": 0.0002721435732203039, + "loss": 2.0315, + "step": 9543 + }, + { + "epoch": 1.113522342783806, + "grad_norm": 1.2581369876861572, + "learning_rate": 0.0002721347564437129, + "loss": 2.0391, + "step": 9544 + }, + { + "epoch": 1.1136390152840976, + "grad_norm": 1.2445714473724365, + "learning_rate": 0.00027212593841651427, + "loss": 2.1282, + "step": 9545 + }, + { + "epoch": 1.1137556877843893, + "grad_norm": 1.0976946353912354, + "learning_rate": 0.0002721171191387993, + "loss": 2.0744, + "step": 9546 + }, + { + "epoch": 1.113872360284681, + "grad_norm": 1.3046761751174927, + "learning_rate": 0.0002721082986106595, + "loss": 2.1123, + "step": 9547 + }, + { + "epoch": 1.1139890327849726, + "grad_norm": 1.2153654098510742, + "learning_rate": 0.00027209947683218635, + "loss": 2.0815, + "step": 9548 + }, + { + "epoch": 1.1141057052852643, + "grad_norm": 1.3017604351043701, + "learning_rate": 0.0002720906538034712, + "loss": 2.0827, + "step": 9549 + }, + { + "epoch": 1.114222377785556, + "grad_norm": 1.1949310302734375, + "learning_rate": 0.0002720818295246056, + "loss": 2.1511, + "step": 9550 + }, + { + "epoch": 1.1143390502858477, + "grad_norm": 1.2034103870391846, + "learning_rate": 0.00027207300399568097, + "loss": 1.98, + "step": 9551 + }, + { + "epoch": 1.1144557227861394, + "grad_norm": 1.2078145742416382, + "learning_rate": 0.0002720641772167888, + "loss": 2.0663, + "step": 9552 + }, + { + "epoch": 1.114572395286431, + "grad_norm": 1.1812310218811035, + "learning_rate": 0.00027205534918802067, + "loss": 2.224, + "step": 9553 + }, + { + "epoch": 1.1146890677867227, + "grad_norm": 1.266080379486084, + "learning_rate": 0.0002720465199094681, + "loss": 2.1832, + "step": 9554 + }, + { + "epoch": 1.1148057402870144, + "grad_norm": 1.285591721534729, + "learning_rate": 0.0002720376893812225, + "loss": 2.0371, + "step": 9555 + }, + { + "epoch": 1.114922412787306, + "grad_norm": 1.1352022886276245, + "learning_rate": 0.00027202885760337547, + "loss": 2.0191, + "step": 9556 + }, + { + "epoch": 1.1150390852875978, + "grad_norm": 1.3025691509246826, + "learning_rate": 0.00027202002457601867, + "loss": 2.0918, + "step": 9557 + }, + { + "epoch": 1.1151557577878894, + "grad_norm": 1.1233248710632324, + "learning_rate": 0.00027201119029924355, + "loss": 2.0303, + "step": 9558 + }, + { + "epoch": 1.1152724302881811, + "grad_norm": 1.0278006792068481, + "learning_rate": 0.00027200235477314183, + "loss": 2.1712, + "step": 9559 + }, + { + "epoch": 1.1153891027884728, + "grad_norm": 1.1589641571044922, + "learning_rate": 0.0002719935179978049, + "loss": 1.8725, + "step": 9560 + }, + { + "epoch": 1.1155057752887645, + "grad_norm": 1.3328922986984253, + "learning_rate": 0.0002719846799733245, + "loss": 2.1633, + "step": 9561 + }, + { + "epoch": 1.1156224477890562, + "grad_norm": 1.2169110774993896, + "learning_rate": 0.0002719758406997922, + "loss": 2.0131, + "step": 9562 + }, + { + "epoch": 1.1157391202893479, + "grad_norm": 1.22648024559021, + "learning_rate": 0.00027196700017729976, + "loss": 2.0905, + "step": 9563 + }, + { + "epoch": 1.1158557927896395, + "grad_norm": 1.2131574153900146, + "learning_rate": 0.0002719581584059387, + "loss": 2.2186, + "step": 9564 + }, + { + "epoch": 1.1159724652899312, + "grad_norm": 1.3162003755569458, + "learning_rate": 0.0002719493153858007, + "loss": 2.0891, + "step": 9565 + }, + { + "epoch": 1.116089137790223, + "grad_norm": 1.1172175407409668, + "learning_rate": 0.0002719404711169775, + "loss": 2.0774, + "step": 9566 + }, + { + "epoch": 1.1162058102905146, + "grad_norm": 1.0015214681625366, + "learning_rate": 0.0002719316255995607, + "loss": 1.8738, + "step": 9567 + }, + { + "epoch": 1.1163224827908063, + "grad_norm": 1.3591252565383911, + "learning_rate": 0.00027192277883364207, + "loss": 2.0264, + "step": 9568 + }, + { + "epoch": 1.116439155291098, + "grad_norm": 1.2397801876068115, + "learning_rate": 0.0002719139308193133, + "loss": 2.0842, + "step": 9569 + }, + { + "epoch": 1.1165558277913896, + "grad_norm": 1.227992057800293, + "learning_rate": 0.0002719050815566661, + "loss": 2.076, + "step": 9570 + }, + { + "epoch": 1.1166725002916813, + "grad_norm": 1.287450909614563, + "learning_rate": 0.00027189623104579224, + "loss": 1.9974, + "step": 9571 + }, + { + "epoch": 1.116789172791973, + "grad_norm": 1.2426726818084717, + "learning_rate": 0.00027188737928678347, + "loss": 2.1441, + "step": 9572 + }, + { + "epoch": 1.1169058452922647, + "grad_norm": 1.0110561847686768, + "learning_rate": 0.0002718785262797315, + "loss": 1.8933, + "step": 9573 + }, + { + "epoch": 1.1170225177925563, + "grad_norm": 1.5608481168746948, + "learning_rate": 0.0002718696720247282, + "loss": 2.2141, + "step": 9574 + }, + { + "epoch": 1.117139190292848, + "grad_norm": 1.3006771802902222, + "learning_rate": 0.0002718608165218653, + "loss": 2.1422, + "step": 9575 + }, + { + "epoch": 1.1172558627931397, + "grad_norm": 1.1689735651016235, + "learning_rate": 0.0002718519597712346, + "loss": 2.0871, + "step": 9576 + }, + { + "epoch": 1.1173725352934314, + "grad_norm": 1.1509026288986206, + "learning_rate": 0.00027184310177292794, + "loss": 2.103, + "step": 9577 + }, + { + "epoch": 1.117489207793723, + "grad_norm": 1.4717973470687866, + "learning_rate": 0.00027183424252703707, + "loss": 1.9951, + "step": 9578 + }, + { + "epoch": 1.1176058802940148, + "grad_norm": 1.1510344743728638, + "learning_rate": 0.000271825382033654, + "loss": 1.9834, + "step": 9579 + }, + { + "epoch": 1.1177225527943064, + "grad_norm": 1.0859894752502441, + "learning_rate": 0.00027181652029287045, + "loss": 2.0919, + "step": 9580 + }, + { + "epoch": 1.1178392252945981, + "grad_norm": 1.0937994718551636, + "learning_rate": 0.00027180765730477836, + "loss": 2.0073, + "step": 9581 + }, + { + "epoch": 1.1179558977948898, + "grad_norm": 1.3171007633209229, + "learning_rate": 0.00027179879306946954, + "loss": 2.0014, + "step": 9582 + }, + { + "epoch": 1.1180725702951815, + "grad_norm": 1.0755501985549927, + "learning_rate": 0.0002717899275870359, + "loss": 2.0629, + "step": 9583 + }, + { + "epoch": 1.1181892427954732, + "grad_norm": 1.1378886699676514, + "learning_rate": 0.00027178106085756944, + "loss": 2.0984, + "step": 9584 + }, + { + "epoch": 1.1183059152957648, + "grad_norm": 1.178866982460022, + "learning_rate": 0.000271772192881162, + "loss": 1.9566, + "step": 9585 + }, + { + "epoch": 1.1184225877960565, + "grad_norm": 1.3206379413604736, + "learning_rate": 0.0002717633236579055, + "loss": 2.0296, + "step": 9586 + }, + { + "epoch": 1.1185392602963482, + "grad_norm": 1.1358979940414429, + "learning_rate": 0.00027175445318789197, + "loss": 1.929, + "step": 9587 + }, + { + "epoch": 1.1186559327966399, + "grad_norm": 1.056700587272644, + "learning_rate": 0.00027174558147121324, + "loss": 1.9014, + "step": 9588 + }, + { + "epoch": 1.1187726052969316, + "grad_norm": 1.3770489692687988, + "learning_rate": 0.0002717367085079614, + "loss": 2.2182, + "step": 9589 + }, + { + "epoch": 1.1188892777972232, + "grad_norm": 1.1988270282745361, + "learning_rate": 0.00027172783429822836, + "loss": 2.1117, + "step": 9590 + }, + { + "epoch": 1.119005950297515, + "grad_norm": 1.2013095617294312, + "learning_rate": 0.00027171895884210614, + "loss": 2.0851, + "step": 9591 + }, + { + "epoch": 1.1191226227978066, + "grad_norm": 1.1735059022903442, + "learning_rate": 0.0002717100821396868, + "loss": 2.0193, + "step": 9592 + }, + { + "epoch": 1.1192392952980983, + "grad_norm": 1.1974194049835205, + "learning_rate": 0.0002717012041910623, + "loss": 2.0122, + "step": 9593 + }, + { + "epoch": 1.11935596779839, + "grad_norm": 1.2081313133239746, + "learning_rate": 0.0002716923249963247, + "loss": 2.1075, + "step": 9594 + }, + { + "epoch": 1.1194726402986817, + "grad_norm": 1.1839325428009033, + "learning_rate": 0.000271683444555566, + "loss": 2.1953, + "step": 9595 + }, + { + "epoch": 1.1195893127989733, + "grad_norm": 1.1807608604431152, + "learning_rate": 0.0002716745628688784, + "loss": 2.0252, + "step": 9596 + }, + { + "epoch": 1.119705985299265, + "grad_norm": 1.1019201278686523, + "learning_rate": 0.00027166567993635384, + "loss": 1.8156, + "step": 9597 + }, + { + "epoch": 1.1198226577995567, + "grad_norm": 1.1740559339523315, + "learning_rate": 0.0002716567957580845, + "loss": 2.0182, + "step": 9598 + }, + { + "epoch": 1.1199393302998484, + "grad_norm": 1.1516708135604858, + "learning_rate": 0.00027164791033416235, + "loss": 1.8833, + "step": 9599 + }, + { + "epoch": 1.12005600280014, + "grad_norm": 1.110507607460022, + "learning_rate": 0.00027163902366467965, + "loss": 1.7717, + "step": 9600 + }, + { + "epoch": 1.1201726753004317, + "grad_norm": 1.228191614151001, + "learning_rate": 0.0002716301357497285, + "loss": 2.2738, + "step": 9601 + }, + { + "epoch": 1.1202893478007234, + "grad_norm": 1.3156989812850952, + "learning_rate": 0.0002716212465894009, + "loss": 2.0228, + "step": 9602 + }, + { + "epoch": 1.120406020301015, + "grad_norm": 1.2239199876785278, + "learning_rate": 0.00027161235618378924, + "loss": 2.0178, + "step": 9603 + }, + { + "epoch": 1.1205226928013068, + "grad_norm": 1.071122407913208, + "learning_rate": 0.00027160346453298546, + "loss": 2.1598, + "step": 9604 + }, + { + "epoch": 1.1206393653015985, + "grad_norm": 1.3446706533432007, + "learning_rate": 0.00027159457163708186, + "loss": 2.0728, + "step": 9605 + }, + { + "epoch": 1.1207560378018901, + "grad_norm": 1.3556681871414185, + "learning_rate": 0.0002715856774961706, + "loss": 2.0676, + "step": 9606 + }, + { + "epoch": 1.1208727103021818, + "grad_norm": 1.180435299873352, + "learning_rate": 0.0002715767821103439, + "loss": 1.9864, + "step": 9607 + }, + { + "epoch": 1.1209893828024735, + "grad_norm": 1.2330987453460693, + "learning_rate": 0.0002715678854796941, + "loss": 1.9973, + "step": 9608 + }, + { + "epoch": 1.1211060553027652, + "grad_norm": 1.1692805290222168, + "learning_rate": 0.0002715589876043131, + "loss": 2.0055, + "step": 9609 + }, + { + "epoch": 1.1212227278030569, + "grad_norm": 1.2844858169555664, + "learning_rate": 0.0002715500884842934, + "loss": 1.9492, + "step": 9610 + }, + { + "epoch": 1.1213394003033486, + "grad_norm": 1.3445658683776855, + "learning_rate": 0.00027154118811972724, + "loss": 2.1555, + "step": 9611 + }, + { + "epoch": 1.1214560728036402, + "grad_norm": 1.2735170125961304, + "learning_rate": 0.0002715322865107068, + "loss": 2.0202, + "step": 9612 + }, + { + "epoch": 1.121572745303932, + "grad_norm": 1.1270676851272583, + "learning_rate": 0.00027152338365732447, + "loss": 2.1711, + "step": 9613 + }, + { + "epoch": 1.1216894178042236, + "grad_norm": 1.0217646360397339, + "learning_rate": 0.00027151447955967245, + "loss": 1.9509, + "step": 9614 + }, + { + "epoch": 1.1218060903045153, + "grad_norm": 1.346962571144104, + "learning_rate": 0.00027150557421784304, + "loss": 2.1806, + "step": 9615 + }, + { + "epoch": 1.121922762804807, + "grad_norm": 1.1648635864257812, + "learning_rate": 0.0002714966676319287, + "loss": 2.2525, + "step": 9616 + }, + { + "epoch": 1.1220394353050986, + "grad_norm": 1.2320654392242432, + "learning_rate": 0.00027148775980202164, + "loss": 1.9008, + "step": 9617 + }, + { + "epoch": 1.1221561078053903, + "grad_norm": 1.1784230470657349, + "learning_rate": 0.00027147885072821424, + "loss": 2.0406, + "step": 9618 + }, + { + "epoch": 1.122272780305682, + "grad_norm": 1.129900336265564, + "learning_rate": 0.0002714699404105987, + "loss": 2.0696, + "step": 9619 + }, + { + "epoch": 1.1223894528059737, + "grad_norm": 0.9862661957740784, + "learning_rate": 0.0002714610288492677, + "loss": 1.9176, + "step": 9620 + }, + { + "epoch": 1.1225061253062654, + "grad_norm": 1.0501576662063599, + "learning_rate": 0.00027145211604431347, + "loss": 1.9332, + "step": 9621 + }, + { + "epoch": 1.122622797806557, + "grad_norm": 1.16960608959198, + "learning_rate": 0.00027144320199582844, + "loss": 2.0088, + "step": 9622 + }, + { + "epoch": 1.1227394703068487, + "grad_norm": 1.0072211027145386, + "learning_rate": 0.0002714342867039049, + "loss": 2.0445, + "step": 9623 + }, + { + "epoch": 1.1228561428071404, + "grad_norm": 1.2301033735275269, + "learning_rate": 0.0002714253701686354, + "loss": 2.1695, + "step": 9624 + }, + { + "epoch": 1.122972815307432, + "grad_norm": 1.2876256704330444, + "learning_rate": 0.0002714164523901123, + "loss": 1.9481, + "step": 9625 + }, + { + "epoch": 1.1230894878077238, + "grad_norm": 1.1196144819259644, + "learning_rate": 0.0002714075333684282, + "loss": 2.0392, + "step": 9626 + }, + { + "epoch": 1.1232061603080155, + "grad_norm": 1.4023382663726807, + "learning_rate": 0.00027139861310367537, + "loss": 2.2094, + "step": 9627 + }, + { + "epoch": 1.1233228328083071, + "grad_norm": 1.1951162815093994, + "learning_rate": 0.0002713896915959464, + "loss": 2.1088, + "step": 9628 + }, + { + "epoch": 1.1234395053085988, + "grad_norm": 1.378577470779419, + "learning_rate": 0.0002713807688453337, + "loss": 2.1375, + "step": 9629 + }, + { + "epoch": 1.1235561778088905, + "grad_norm": 1.105467677116394, + "learning_rate": 0.00027137184485192986, + "loss": 2.0656, + "step": 9630 + }, + { + "epoch": 1.1236728503091822, + "grad_norm": 1.4370661973953247, + "learning_rate": 0.0002713629196158273, + "loss": 2.0422, + "step": 9631 + }, + { + "epoch": 1.1237895228094739, + "grad_norm": 1.3544954061508179, + "learning_rate": 0.00027135399313711866, + "loss": 2.0935, + "step": 9632 + }, + { + "epoch": 1.1239061953097655, + "grad_norm": 1.3323084115982056, + "learning_rate": 0.00027134506541589637, + "loss": 2.2044, + "step": 9633 + }, + { + "epoch": 1.1240228678100572, + "grad_norm": 1.4375839233398438, + "learning_rate": 0.0002713361364522531, + "loss": 2.0535, + "step": 9634 + }, + { + "epoch": 1.124139540310349, + "grad_norm": 1.178259253501892, + "learning_rate": 0.00027132720624628126, + "loss": 2.2366, + "step": 9635 + }, + { + "epoch": 1.1242562128106406, + "grad_norm": 1.2055314779281616, + "learning_rate": 0.0002713182747980736, + "loss": 2.0451, + "step": 9636 + }, + { + "epoch": 1.1243728853109323, + "grad_norm": 1.1322182416915894, + "learning_rate": 0.00027130934210772254, + "loss": 2.016, + "step": 9637 + }, + { + "epoch": 1.124489557811224, + "grad_norm": 1.2121999263763428, + "learning_rate": 0.00027130040817532083, + "loss": 2.1704, + "step": 9638 + }, + { + "epoch": 1.1246062303115156, + "grad_norm": 1.125906229019165, + "learning_rate": 0.00027129147300096105, + "loss": 2.1212, + "step": 9639 + }, + { + "epoch": 1.1247229028118073, + "grad_norm": 1.3649176359176636, + "learning_rate": 0.0002712825365847358, + "loss": 2.093, + "step": 9640 + }, + { + "epoch": 1.124839575312099, + "grad_norm": 1.198368787765503, + "learning_rate": 0.0002712735989267377, + "loss": 2.0818, + "step": 9641 + }, + { + "epoch": 1.1249562478123907, + "grad_norm": 1.190515160560608, + "learning_rate": 0.00027126466002705944, + "loss": 1.9162, + "step": 9642 + }, + { + "epoch": 1.1250729203126824, + "grad_norm": 1.2289265394210815, + "learning_rate": 0.00027125571988579364, + "loss": 2.0952, + "step": 9643 + }, + { + "epoch": 1.125189592812974, + "grad_norm": 1.2850427627563477, + "learning_rate": 0.0002712467785030331, + "loss": 2.0641, + "step": 9644 + }, + { + "epoch": 1.1253062653132657, + "grad_norm": 1.3160196542739868, + "learning_rate": 0.0002712378358788704, + "loss": 2.088, + "step": 9645 + }, + { + "epoch": 1.1254229378135574, + "grad_norm": 1.0874605178833008, + "learning_rate": 0.00027122889201339835, + "loss": 2.0314, + "step": 9646 + }, + { + "epoch": 1.125539610313849, + "grad_norm": 1.4792605638504028, + "learning_rate": 0.00027121994690670957, + "loss": 2.1524, + "step": 9647 + }, + { + "epoch": 1.1256562828141408, + "grad_norm": 1.0722304582595825, + "learning_rate": 0.00027121100055889683, + "loss": 2.0203, + "step": 9648 + }, + { + "epoch": 1.1257729553144324, + "grad_norm": 1.1398872137069702, + "learning_rate": 0.0002712020529700529, + "loss": 2.2373, + "step": 9649 + }, + { + "epoch": 1.1258896278147241, + "grad_norm": 1.304833173751831, + "learning_rate": 0.0002711931041402705, + "loss": 1.997, + "step": 9650 + }, + { + "epoch": 1.1260063003150158, + "grad_norm": 1.1130452156066895, + "learning_rate": 0.0002711841540696424, + "loss": 1.9736, + "step": 9651 + }, + { + "epoch": 1.1261229728153075, + "grad_norm": 1.342430830001831, + "learning_rate": 0.0002711752027582614, + "loss": 2.1356, + "step": 9652 + }, + { + "epoch": 1.1262396453155992, + "grad_norm": 1.0588865280151367, + "learning_rate": 0.0002711662502062203, + "loss": 1.8864, + "step": 9653 + }, + { + "epoch": 1.1263563178158909, + "grad_norm": 1.2315301895141602, + "learning_rate": 0.0002711572964136119, + "loss": 2.114, + "step": 9654 + }, + { + "epoch": 1.1264729903161825, + "grad_norm": 1.2654531002044678, + "learning_rate": 0.00027114834138052907, + "loss": 1.9786, + "step": 9655 + }, + { + "epoch": 1.1265896628164742, + "grad_norm": 1.2124079465866089, + "learning_rate": 0.00027113938510706467, + "loss": 2.0645, + "step": 9656 + }, + { + "epoch": 1.126706335316766, + "grad_norm": 1.257242202758789, + "learning_rate": 0.00027113042759331136, + "loss": 2.1436, + "step": 9657 + }, + { + "epoch": 1.1268230078170576, + "grad_norm": 1.2559441328048706, + "learning_rate": 0.0002711214688393622, + "loss": 2.0278, + "step": 9658 + }, + { + "epoch": 1.1269396803173493, + "grad_norm": 1.3080034255981445, + "learning_rate": 0.00027111250884530997, + "loss": 2.2427, + "step": 9659 + }, + { + "epoch": 1.127056352817641, + "grad_norm": 1.0176080465316772, + "learning_rate": 0.0002711035476112476, + "loss": 2.1046, + "step": 9660 + }, + { + "epoch": 1.1271730253179326, + "grad_norm": 1.200033187866211, + "learning_rate": 0.00027109458513726795, + "loss": 2.0325, + "step": 9661 + }, + { + "epoch": 1.1272896978182243, + "grad_norm": 1.23715341091156, + "learning_rate": 0.00027108562142346397, + "loss": 2.1511, + "step": 9662 + }, + { + "epoch": 1.127406370318516, + "grad_norm": 1.2322689294815063, + "learning_rate": 0.0002710766564699286, + "loss": 2.077, + "step": 9663 + }, + { + "epoch": 1.1275230428188077, + "grad_norm": 1.2163972854614258, + "learning_rate": 0.00027106769027675465, + "loss": 2.1541, + "step": 9664 + }, + { + "epoch": 1.1276397153190993, + "grad_norm": 1.2805052995681763, + "learning_rate": 0.0002710587228440353, + "loss": 2.1087, + "step": 9665 + }, + { + "epoch": 1.127756387819391, + "grad_norm": 1.3737610578536987, + "learning_rate": 0.0002710497541718633, + "loss": 2.1857, + "step": 9666 + }, + { + "epoch": 1.1278730603196827, + "grad_norm": 1.2048375606536865, + "learning_rate": 0.0002710407842603317, + "loss": 2.0426, + "step": 9667 + }, + { + "epoch": 1.1279897328199744, + "grad_norm": 1.2739081382751465, + "learning_rate": 0.00027103181310953356, + "loss": 2.1509, + "step": 9668 + }, + { + "epoch": 1.128106405320266, + "grad_norm": 1.2231788635253906, + "learning_rate": 0.0002710228407195618, + "loss": 2.1366, + "step": 9669 + }, + { + "epoch": 1.1282230778205578, + "grad_norm": 1.3594759702682495, + "learning_rate": 0.00027101386709050943, + "loss": 2.0137, + "step": 9670 + }, + { + "epoch": 1.1283397503208494, + "grad_norm": 1.1970117092132568, + "learning_rate": 0.00027100489222246955, + "loss": 2.0478, + "step": 9671 + }, + { + "epoch": 1.1284564228211411, + "grad_norm": 1.416100025177002, + "learning_rate": 0.00027099591611553516, + "loss": 2.244, + "step": 9672 + }, + { + "epoch": 1.1285730953214328, + "grad_norm": 1.1671524047851562, + "learning_rate": 0.00027098693876979933, + "loss": 2.0293, + "step": 9673 + }, + { + "epoch": 1.1286897678217245, + "grad_norm": 1.1276428699493408, + "learning_rate": 0.00027097796018535504, + "loss": 1.9538, + "step": 9674 + }, + { + "epoch": 1.1288064403220162, + "grad_norm": 1.1326102018356323, + "learning_rate": 0.00027096898036229553, + "loss": 2.0507, + "step": 9675 + }, + { + "epoch": 1.1289231128223078, + "grad_norm": 1.0924543142318726, + "learning_rate": 0.0002709599993007137, + "loss": 2.0436, + "step": 9676 + }, + { + "epoch": 1.1290397853225995, + "grad_norm": 1.225992202758789, + "learning_rate": 0.0002709510170007029, + "loss": 2.1991, + "step": 9677 + }, + { + "epoch": 1.1291564578228912, + "grad_norm": 1.2498246431350708, + "learning_rate": 0.00027094203346235597, + "loss": 2.0651, + "step": 9678 + }, + { + "epoch": 1.1292731303231829, + "grad_norm": 1.1696828603744507, + "learning_rate": 0.00027093304868576625, + "loss": 2.0428, + "step": 9679 + }, + { + "epoch": 1.1293898028234746, + "grad_norm": 1.1843234300613403, + "learning_rate": 0.0002709240626710268, + "loss": 2.0491, + "step": 9680 + }, + { + "epoch": 1.1295064753237662, + "grad_norm": 1.1282566785812378, + "learning_rate": 0.00027091507541823084, + "loss": 1.9481, + "step": 9681 + }, + { + "epoch": 1.129623147824058, + "grad_norm": 1.1728781461715698, + "learning_rate": 0.00027090608692747147, + "loss": 1.8499, + "step": 9682 + }, + { + "epoch": 1.1297398203243496, + "grad_norm": 1.3533419370651245, + "learning_rate": 0.0002708970971988419, + "loss": 2.1735, + "step": 9683 + }, + { + "epoch": 1.1298564928246413, + "grad_norm": 1.0953425168991089, + "learning_rate": 0.0002708881062324353, + "loss": 2.013, + "step": 9684 + }, + { + "epoch": 1.129973165324933, + "grad_norm": 1.1758815050125122, + "learning_rate": 0.0002708791140283449, + "loss": 1.9987, + "step": 9685 + }, + { + "epoch": 1.1300898378252247, + "grad_norm": 0.9729447364807129, + "learning_rate": 0.00027087012058666385, + "loss": 2.0628, + "step": 9686 + }, + { + "epoch": 1.1302065103255163, + "grad_norm": 1.1795330047607422, + "learning_rate": 0.0002708611259074856, + "loss": 2.0799, + "step": 9687 + }, + { + "epoch": 1.130323182825808, + "grad_norm": 1.0948549509048462, + "learning_rate": 0.00027085212999090316, + "loss": 1.9828, + "step": 9688 + }, + { + "epoch": 1.1304398553260997, + "grad_norm": 1.1970778703689575, + "learning_rate": 0.0002708431328370099, + "loss": 1.8983, + "step": 9689 + }, + { + "epoch": 1.1305565278263914, + "grad_norm": 1.2247334718704224, + "learning_rate": 0.0002708341344458991, + "loss": 2.1069, + "step": 9690 + }, + { + "epoch": 1.130673200326683, + "grad_norm": 1.3524270057678223, + "learning_rate": 0.00027082513481766396, + "loss": 2.0984, + "step": 9691 + }, + { + "epoch": 1.1307898728269747, + "grad_norm": 1.038651704788208, + "learning_rate": 0.0002708161339523979, + "loss": 1.9917, + "step": 9692 + }, + { + "epoch": 1.1309065453272664, + "grad_norm": 1.1023204326629639, + "learning_rate": 0.0002708071318501941, + "loss": 2.1435, + "step": 9693 + }, + { + "epoch": 1.131023217827558, + "grad_norm": 1.1346518993377686, + "learning_rate": 0.000270798128511146, + "loss": 2.1263, + "step": 9694 + }, + { + "epoch": 1.1311398903278498, + "grad_norm": 1.2587190866470337, + "learning_rate": 0.00027078912393534686, + "loss": 2.1006, + "step": 9695 + }, + { + "epoch": 1.1312565628281415, + "grad_norm": 1.1245579719543457, + "learning_rate": 0.0002707801181228901, + "loss": 2.0077, + "step": 9696 + }, + { + "epoch": 1.1313732353284331, + "grad_norm": 1.1734681129455566, + "learning_rate": 0.00027077111107386896, + "loss": 1.9796, + "step": 9697 + }, + { + "epoch": 1.1314899078287248, + "grad_norm": 1.0368608236312866, + "learning_rate": 0.000270762102788377, + "loss": 1.9216, + "step": 9698 + }, + { + "epoch": 1.1316065803290165, + "grad_norm": 1.1948826313018799, + "learning_rate": 0.00027075309326650746, + "loss": 1.8285, + "step": 9699 + }, + { + "epoch": 1.1317232528293082, + "grad_norm": 1.2370431423187256, + "learning_rate": 0.0002707440825083538, + "loss": 2.1768, + "step": 9700 + }, + { + "epoch": 1.1318399253295999, + "grad_norm": 1.2242242097854614, + "learning_rate": 0.0002707350705140094, + "loss": 1.9321, + "step": 9701 + }, + { + "epoch": 1.1319565978298916, + "grad_norm": 1.010064721107483, + "learning_rate": 0.00027072605728356764, + "loss": 1.8232, + "step": 9702 + }, + { + "epoch": 1.1320732703301832, + "grad_norm": 1.30732262134552, + "learning_rate": 0.0002707170428171221, + "loss": 2.0541, + "step": 9703 + }, + { + "epoch": 1.132189942830475, + "grad_norm": 1.1488397121429443, + "learning_rate": 0.00027070802711476615, + "loss": 2.1085, + "step": 9704 + }, + { + "epoch": 1.1323066153307666, + "grad_norm": 1.2006334066390991, + "learning_rate": 0.0002706990101765932, + "loss": 2.1811, + "step": 9705 + }, + { + "epoch": 1.1324232878310583, + "grad_norm": 1.1911262273788452, + "learning_rate": 0.00027068999200269685, + "loss": 2.0652, + "step": 9706 + }, + { + "epoch": 1.13253996033135, + "grad_norm": 1.2589620351791382, + "learning_rate": 0.00027068097259317053, + "loss": 2.1442, + "step": 9707 + }, + { + "epoch": 1.1326566328316416, + "grad_norm": 1.0525439977645874, + "learning_rate": 0.00027067195194810774, + "loss": 2.1329, + "step": 9708 + }, + { + "epoch": 1.1327733053319333, + "grad_norm": 1.22475266456604, + "learning_rate": 0.00027066293006760195, + "loss": 2.3513, + "step": 9709 + }, + { + "epoch": 1.132889977832225, + "grad_norm": 1.312070608139038, + "learning_rate": 0.00027065390695174676, + "loss": 2.1567, + "step": 9710 + }, + { + "epoch": 1.1330066503325167, + "grad_norm": 1.3260318040847778, + "learning_rate": 0.0002706448826006357, + "loss": 2.2043, + "step": 9711 + }, + { + "epoch": 1.1331233228328084, + "grad_norm": 1.0634435415267944, + "learning_rate": 0.0002706358570143623, + "loss": 2.0212, + "step": 9712 + }, + { + "epoch": 1.1332399953331, + "grad_norm": 1.234865427017212, + "learning_rate": 0.00027062683019302017, + "loss": 2.0807, + "step": 9713 + }, + { + "epoch": 1.1333566678333917, + "grad_norm": 1.1264276504516602, + "learning_rate": 0.00027061780213670283, + "loss": 2.1931, + "step": 9714 + }, + { + "epoch": 1.1334733403336834, + "grad_norm": 1.0623116493225098, + "learning_rate": 0.0002706087728455039, + "loss": 2.1939, + "step": 9715 + }, + { + "epoch": 1.133590012833975, + "grad_norm": 1.1027743816375732, + "learning_rate": 0.000270599742319517, + "loss": 1.9902, + "step": 9716 + }, + { + "epoch": 1.1337066853342668, + "grad_norm": 1.418882131576538, + "learning_rate": 0.00027059071055883575, + "loss": 2.1457, + "step": 9717 + }, + { + "epoch": 1.1338233578345585, + "grad_norm": 1.1937106847763062, + "learning_rate": 0.00027058167756355376, + "loss": 2.2047, + "step": 9718 + }, + { + "epoch": 1.1339400303348501, + "grad_norm": 1.119477391242981, + "learning_rate": 0.00027057264333376464, + "loss": 1.9892, + "step": 9719 + }, + { + "epoch": 1.1340567028351418, + "grad_norm": 1.1791480779647827, + "learning_rate": 0.00027056360786956217, + "loss": 2.1285, + "step": 9720 + }, + { + "epoch": 1.1341733753354335, + "grad_norm": 1.1428619623184204, + "learning_rate": 0.0002705545711710399, + "loss": 2.1651, + "step": 9721 + }, + { + "epoch": 1.1342900478357252, + "grad_norm": 1.1818666458129883, + "learning_rate": 0.00027054553323829154, + "loss": 2.0674, + "step": 9722 + }, + { + "epoch": 1.1344067203360169, + "grad_norm": 1.1373348236083984, + "learning_rate": 0.00027053649407141083, + "loss": 1.9964, + "step": 9723 + }, + { + "epoch": 1.1345233928363085, + "grad_norm": 1.177549123764038, + "learning_rate": 0.00027052745367049137, + "loss": 2.1142, + "step": 9724 + }, + { + "epoch": 1.1346400653366002, + "grad_norm": 1.2826123237609863, + "learning_rate": 0.00027051841203562705, + "loss": 2.079, + "step": 9725 + }, + { + "epoch": 1.134756737836892, + "grad_norm": 1.3321903944015503, + "learning_rate": 0.00027050936916691144, + "loss": 2.2915, + "step": 9726 + }, + { + "epoch": 1.1348734103371836, + "grad_norm": 1.2435592412948608, + "learning_rate": 0.0002705003250644384, + "loss": 2.1483, + "step": 9727 + }, + { + "epoch": 1.1349900828374753, + "grad_norm": 1.1196547746658325, + "learning_rate": 0.0002704912797283016, + "loss": 2.158, + "step": 9728 + }, + { + "epoch": 1.135106755337767, + "grad_norm": 1.2943716049194336, + "learning_rate": 0.00027048223315859486, + "loss": 2.0128, + "step": 9729 + }, + { + "epoch": 1.1352234278380586, + "grad_norm": 1.1004536151885986, + "learning_rate": 0.000270473185355412, + "loss": 2.1053, + "step": 9730 + }, + { + "epoch": 1.1353401003383503, + "grad_norm": 1.2499701976776123, + "learning_rate": 0.0002704641363188467, + "loss": 2.0366, + "step": 9731 + }, + { + "epoch": 1.135456772838642, + "grad_norm": 1.2226425409317017, + "learning_rate": 0.000270455086048993, + "loss": 2.2823, + "step": 9732 + }, + { + "epoch": 1.1355734453389337, + "grad_norm": 1.0184659957885742, + "learning_rate": 0.0002704460345459445, + "loss": 2.0384, + "step": 9733 + }, + { + "epoch": 1.1356901178392254, + "grad_norm": 1.0382251739501953, + "learning_rate": 0.00027043698180979507, + "loss": 2.0529, + "step": 9734 + }, + { + "epoch": 1.135806790339517, + "grad_norm": 1.0427591800689697, + "learning_rate": 0.0002704279278406386, + "loss": 1.9752, + "step": 9735 + }, + { + "epoch": 1.1359234628398087, + "grad_norm": 1.2128881216049194, + "learning_rate": 0.000270418872638569, + "loss": 2.2085, + "step": 9736 + }, + { + "epoch": 1.1360401353401004, + "grad_norm": 1.1709425449371338, + "learning_rate": 0.00027040981620368005, + "loss": 2.0189, + "step": 9737 + }, + { + "epoch": 1.136156807840392, + "grad_norm": 1.2845230102539062, + "learning_rate": 0.0002704007585360657, + "loss": 2.0739, + "step": 9738 + }, + { + "epoch": 1.1362734803406838, + "grad_norm": 1.2981795072555542, + "learning_rate": 0.0002703916996358198, + "loss": 1.9161, + "step": 9739 + }, + { + "epoch": 1.1363901528409754, + "grad_norm": 1.2118186950683594, + "learning_rate": 0.00027038263950303636, + "loss": 1.9959, + "step": 9740 + }, + { + "epoch": 1.1365068253412671, + "grad_norm": 1.4078824520111084, + "learning_rate": 0.0002703735781378092, + "loss": 2.0841, + "step": 9741 + }, + { + "epoch": 1.1366234978415588, + "grad_norm": 1.23219633102417, + "learning_rate": 0.0002703645155402323, + "loss": 2.1623, + "step": 9742 + }, + { + "epoch": 1.1367401703418505, + "grad_norm": 1.2156487703323364, + "learning_rate": 0.0002703554517103996, + "loss": 2.1223, + "step": 9743 + }, + { + "epoch": 1.1368568428421422, + "grad_norm": 1.198148488998413, + "learning_rate": 0.00027034638664840506, + "loss": 2.219, + "step": 9744 + }, + { + "epoch": 1.1369735153424338, + "grad_norm": 1.0938366651535034, + "learning_rate": 0.0002703373203543427, + "loss": 1.9993, + "step": 9745 + }, + { + "epoch": 1.1370901878427255, + "grad_norm": 1.243309497833252, + "learning_rate": 0.00027032825282830643, + "loss": 2.1173, + "step": 9746 + }, + { + "epoch": 1.1372068603430172, + "grad_norm": 1.4162733554840088, + "learning_rate": 0.0002703191840703904, + "loss": 2.093, + "step": 9747 + }, + { + "epoch": 1.137323532843309, + "grad_norm": 1.1954545974731445, + "learning_rate": 0.0002703101140806884, + "loss": 2.0716, + "step": 9748 + }, + { + "epoch": 1.1374402053436006, + "grad_norm": 0.9654387831687927, + "learning_rate": 0.00027030104285929464, + "loss": 1.9601, + "step": 9749 + }, + { + "epoch": 1.1375568778438923, + "grad_norm": 1.3276971578598022, + "learning_rate": 0.00027029197040630314, + "loss": 2.0542, + "step": 9750 + }, + { + "epoch": 1.137673550344184, + "grad_norm": 1.2786835432052612, + "learning_rate": 0.00027028289672180786, + "loss": 2.2432, + "step": 9751 + }, + { + "epoch": 1.1377902228444756, + "grad_norm": 1.1727213859558105, + "learning_rate": 0.00027027382180590297, + "loss": 2.105, + "step": 9752 + }, + { + "epoch": 1.1379068953447673, + "grad_norm": 1.3188812732696533, + "learning_rate": 0.00027026474565868246, + "loss": 2.0946, + "step": 9753 + }, + { + "epoch": 1.138023567845059, + "grad_norm": 1.1590051651000977, + "learning_rate": 0.00027025566828024045, + "loss": 2.0627, + "step": 9754 + }, + { + "epoch": 1.1381402403453507, + "grad_norm": 1.367296576499939, + "learning_rate": 0.0002702465896706711, + "loss": 2.2598, + "step": 9755 + }, + { + "epoch": 1.1382569128456423, + "grad_norm": 1.1855076551437378, + "learning_rate": 0.00027023750983006846, + "loss": 2.1953, + "step": 9756 + }, + { + "epoch": 1.138373585345934, + "grad_norm": 1.2012473344802856, + "learning_rate": 0.0002702284287585267, + "loss": 2.1239, + "step": 9757 + }, + { + "epoch": 1.1384902578462257, + "grad_norm": 1.2607510089874268, + "learning_rate": 0.0002702193464561399, + "loss": 2.024, + "step": 9758 + }, + { + "epoch": 1.1386069303465174, + "grad_norm": 1.3352845907211304, + "learning_rate": 0.0002702102629230023, + "loss": 2.0853, + "step": 9759 + }, + { + "epoch": 1.138723602846809, + "grad_norm": 1.086637258529663, + "learning_rate": 0.000270201178159208, + "loss": 2.0137, + "step": 9760 + }, + { + "epoch": 1.1388402753471008, + "grad_norm": 1.5114994049072266, + "learning_rate": 0.00027019209216485126, + "loss": 2.2158, + "step": 9761 + }, + { + "epoch": 1.1389569478473924, + "grad_norm": 1.233950138092041, + "learning_rate": 0.00027018300494002616, + "loss": 1.9003, + "step": 9762 + }, + { + "epoch": 1.1390736203476841, + "grad_norm": 1.3010090589523315, + "learning_rate": 0.00027017391648482707, + "loss": 2.2099, + "step": 9763 + }, + { + "epoch": 1.1391902928479758, + "grad_norm": 1.2004984617233276, + "learning_rate": 0.000270164826799348, + "loss": 2.0935, + "step": 9764 + }, + { + "epoch": 1.1393069653482675, + "grad_norm": 1.0557868480682373, + "learning_rate": 0.0002701557358836833, + "loss": 1.873, + "step": 9765 + }, + { + "epoch": 1.1394236378485592, + "grad_norm": 1.1809786558151245, + "learning_rate": 0.00027014664373792725, + "loss": 2.0709, + "step": 9766 + }, + { + "epoch": 1.1395403103488508, + "grad_norm": 1.0166131258010864, + "learning_rate": 0.00027013755036217407, + "loss": 1.8145, + "step": 9767 + }, + { + "epoch": 1.1396569828491425, + "grad_norm": 1.1588740348815918, + "learning_rate": 0.000270128455756518, + "loss": 1.9758, + "step": 9768 + }, + { + "epoch": 1.1397736553494342, + "grad_norm": 1.0468508005142212, + "learning_rate": 0.00027011935992105334, + "loss": 2.0532, + "step": 9769 + }, + { + "epoch": 1.1398903278497259, + "grad_norm": 1.0413731336593628, + "learning_rate": 0.0002701102628558744, + "loss": 1.8882, + "step": 9770 + }, + { + "epoch": 1.1400070003500176, + "grad_norm": 1.1221672296524048, + "learning_rate": 0.00027010116456107545, + "loss": 2.0852, + "step": 9771 + }, + { + "epoch": 1.1401236728503092, + "grad_norm": 1.298263669013977, + "learning_rate": 0.0002700920650367508, + "loss": 2.0122, + "step": 9772 + }, + { + "epoch": 1.140240345350601, + "grad_norm": 1.248782753944397, + "learning_rate": 0.0002700829642829949, + "loss": 2.0505, + "step": 9773 + }, + { + "epoch": 1.1403570178508926, + "grad_norm": 1.391043782234192, + "learning_rate": 0.000270073862299902, + "loss": 2.2582, + "step": 9774 + }, + { + "epoch": 1.1404736903511843, + "grad_norm": 1.1944888830184937, + "learning_rate": 0.0002700647590875665, + "loss": 2.2156, + "step": 9775 + }, + { + "epoch": 1.140590362851476, + "grad_norm": 1.2255440950393677, + "learning_rate": 0.00027005565464608273, + "loss": 2.1343, + "step": 9776 + }, + { + "epoch": 1.1407070353517677, + "grad_norm": 1.1814182996749878, + "learning_rate": 0.00027004654897554517, + "loss": 1.8482, + "step": 9777 + }, + { + "epoch": 1.1408237078520593, + "grad_norm": 1.2564513683319092, + "learning_rate": 0.000270037442076048, + "loss": 2.1622, + "step": 9778 + }, + { + "epoch": 1.140940380352351, + "grad_norm": 1.2301987409591675, + "learning_rate": 0.00027002833394768587, + "loss": 2.1914, + "step": 9779 + }, + { + "epoch": 1.1410570528526427, + "grad_norm": 1.1453806161880493, + "learning_rate": 0.0002700192245905531, + "loss": 1.913, + "step": 9780 + }, + { + "epoch": 1.1411737253529344, + "grad_norm": 1.082458734512329, + "learning_rate": 0.0002700101140047441, + "loss": 1.9363, + "step": 9781 + }, + { + "epoch": 1.141290397853226, + "grad_norm": 1.2409130334854126, + "learning_rate": 0.00027000100219035333, + "loss": 2.0607, + "step": 9782 + }, + { + "epoch": 1.1414070703535177, + "grad_norm": 1.2660067081451416, + "learning_rate": 0.00026999188914747534, + "loss": 2.0959, + "step": 9783 + }, + { + "epoch": 1.1415237428538094, + "grad_norm": 1.259856939315796, + "learning_rate": 0.0002699827748762045, + "loss": 2.0626, + "step": 9784 + }, + { + "epoch": 1.141640415354101, + "grad_norm": 1.103835940361023, + "learning_rate": 0.00026997365937663534, + "loss": 1.7898, + "step": 9785 + }, + { + "epoch": 1.1417570878543928, + "grad_norm": 1.1506315469741821, + "learning_rate": 0.0002699645426488624, + "loss": 2.0884, + "step": 9786 + }, + { + "epoch": 1.1418737603546845, + "grad_norm": 1.2379387617111206, + "learning_rate": 0.00026995542469298006, + "loss": 2.0633, + "step": 9787 + }, + { + "epoch": 1.1419904328549761, + "grad_norm": 1.117091417312622, + "learning_rate": 0.00026994630550908295, + "loss": 1.9603, + "step": 9788 + }, + { + "epoch": 1.1421071053552678, + "grad_norm": 1.1526103019714355, + "learning_rate": 0.00026993718509726557, + "loss": 1.9596, + "step": 9789 + }, + { + "epoch": 1.1422237778555595, + "grad_norm": 1.0837900638580322, + "learning_rate": 0.0002699280634576225, + "loss": 2.1387, + "step": 9790 + }, + { + "epoch": 1.1423404503558512, + "grad_norm": 1.2101702690124512, + "learning_rate": 0.00026991894059024833, + "loss": 1.8945, + "step": 9791 + }, + { + "epoch": 1.1424571228561429, + "grad_norm": 1.0493550300598145, + "learning_rate": 0.0002699098164952376, + "loss": 2.1592, + "step": 9792 + }, + { + "epoch": 1.1425737953564346, + "grad_norm": 1.2286564111709595, + "learning_rate": 0.0002699006911726849, + "loss": 2.0285, + "step": 9793 + }, + { + "epoch": 1.1426904678567262, + "grad_norm": 0.9487320780754089, + "learning_rate": 0.0002698915646226848, + "loss": 1.9299, + "step": 9794 + }, + { + "epoch": 1.142807140357018, + "grad_norm": 1.3022748231887817, + "learning_rate": 0.0002698824368453319, + "loss": 1.8932, + "step": 9795 + }, + { + "epoch": 1.1429238128573096, + "grad_norm": 1.3224750757217407, + "learning_rate": 0.0002698733078407209, + "loss": 2.093, + "step": 9796 + }, + { + "epoch": 1.1430404853576013, + "grad_norm": 1.1548513174057007, + "learning_rate": 0.0002698641776089464, + "loss": 2.0809, + "step": 9797 + }, + { + "epoch": 1.143157157857893, + "grad_norm": 1.26144540309906, + "learning_rate": 0.0002698550461501031, + "loss": 2.1406, + "step": 9798 + }, + { + "epoch": 1.1432738303581846, + "grad_norm": 1.1724704504013062, + "learning_rate": 0.0002698459134642856, + "loss": 2.0267, + "step": 9799 + }, + { + "epoch": 1.1433905028584763, + "grad_norm": 1.2571685314178467, + "learning_rate": 0.0002698367795515886, + "loss": 2.0824, + "step": 9800 + }, + { + "epoch": 1.143507175358768, + "grad_norm": 1.1692239046096802, + "learning_rate": 0.00026982764441210673, + "loss": 2.1685, + "step": 9801 + }, + { + "epoch": 1.1436238478590597, + "grad_norm": 1.0909687280654907, + "learning_rate": 0.0002698185080459348, + "loss": 2.0564, + "step": 9802 + }, + { + "epoch": 1.1437405203593514, + "grad_norm": 1.1360085010528564, + "learning_rate": 0.0002698093704531675, + "loss": 2.1257, + "step": 9803 + }, + { + "epoch": 1.143857192859643, + "grad_norm": 1.2335052490234375, + "learning_rate": 0.0002698002316338995, + "loss": 2.1157, + "step": 9804 + }, + { + "epoch": 1.1439738653599347, + "grad_norm": 1.0874868631362915, + "learning_rate": 0.00026979109158822563, + "loss": 1.9456, + "step": 9805 + }, + { + "epoch": 1.1440905378602264, + "grad_norm": 1.1679658889770508, + "learning_rate": 0.0002697819503162406, + "loss": 2.1264, + "step": 9806 + }, + { + "epoch": 1.144207210360518, + "grad_norm": 1.0479247570037842, + "learning_rate": 0.0002697728078180391, + "loss": 2.1199, + "step": 9807 + }, + { + "epoch": 1.1443238828608098, + "grad_norm": 1.187138557434082, + "learning_rate": 0.000269763664093716, + "loss": 1.9892, + "step": 9808 + }, + { + "epoch": 1.1444405553611015, + "grad_norm": 1.3603103160858154, + "learning_rate": 0.0002697545191433661, + "loss": 2.0772, + "step": 9809 + }, + { + "epoch": 1.1445572278613931, + "grad_norm": 1.2355250120162964, + "learning_rate": 0.00026974537296708414, + "loss": 1.88, + "step": 9810 + }, + { + "epoch": 1.1446739003616848, + "grad_norm": 1.0399878025054932, + "learning_rate": 0.000269736225564965, + "loss": 2.011, + "step": 9811 + }, + { + "epoch": 1.1447905728619765, + "grad_norm": 0.9858202934265137, + "learning_rate": 0.00026972707693710354, + "loss": 1.9303, + "step": 9812 + }, + { + "epoch": 1.1449072453622682, + "grad_norm": 1.015482783317566, + "learning_rate": 0.0002697179270835944, + "loss": 2.1175, + "step": 9813 + }, + { + "epoch": 1.1450239178625599, + "grad_norm": 1.2909680604934692, + "learning_rate": 0.0002697087760045327, + "loss": 2.1588, + "step": 9814 + }, + { + "epoch": 1.1451405903628515, + "grad_norm": 1.0426818132400513, + "learning_rate": 0.00026969962370001317, + "loss": 1.8626, + "step": 9815 + }, + { + "epoch": 1.1452572628631432, + "grad_norm": 1.117774486541748, + "learning_rate": 0.0002696904701701307, + "loss": 2.0492, + "step": 9816 + }, + { + "epoch": 1.145373935363435, + "grad_norm": 1.0476402044296265, + "learning_rate": 0.0002696813154149802, + "loss": 1.9222, + "step": 9817 + }, + { + "epoch": 1.1454906078637266, + "grad_norm": 1.2127071619033813, + "learning_rate": 0.00026967215943465655, + "loss": 1.9799, + "step": 9818 + }, + { + "epoch": 1.1456072803640183, + "grad_norm": 1.2824044227600098, + "learning_rate": 0.0002696630022292547, + "loss": 2.1475, + "step": 9819 + }, + { + "epoch": 1.14572395286431, + "grad_norm": 1.1600638628005981, + "learning_rate": 0.0002696538437988696, + "loss": 2.169, + "step": 9820 + }, + { + "epoch": 1.1458406253646016, + "grad_norm": 1.3683440685272217, + "learning_rate": 0.00026964468414359614, + "loss": 2.1509, + "step": 9821 + }, + { + "epoch": 1.1459572978648933, + "grad_norm": 1.2552990913391113, + "learning_rate": 0.00026963552326352924, + "loss": 1.9135, + "step": 9822 + }, + { + "epoch": 1.146073970365185, + "grad_norm": 1.2699071168899536, + "learning_rate": 0.00026962636115876405, + "loss": 2.2513, + "step": 9823 + }, + { + "epoch": 1.1461906428654767, + "grad_norm": 1.3657035827636719, + "learning_rate": 0.0002696171978293954, + "loss": 2.2929, + "step": 9824 + }, + { + "epoch": 1.1463073153657684, + "grad_norm": 1.054972767829895, + "learning_rate": 0.0002696080332755183, + "loss": 2.0072, + "step": 9825 + }, + { + "epoch": 1.14642398786606, + "grad_norm": 1.3196265697479248, + "learning_rate": 0.00026959886749722777, + "loss": 2.029, + "step": 9826 + }, + { + "epoch": 1.1465406603663517, + "grad_norm": 1.1266400814056396, + "learning_rate": 0.0002695897004946189, + "loss": 2.2192, + "step": 9827 + }, + { + "epoch": 1.1466573328666434, + "grad_norm": 1.069173812866211, + "learning_rate": 0.0002695805322677866, + "loss": 2.0221, + "step": 9828 + }, + { + "epoch": 1.146774005366935, + "grad_norm": 1.0864801406860352, + "learning_rate": 0.000269571362816826, + "loss": 2.03, + "step": 9829 + }, + { + "epoch": 1.1468906778672268, + "grad_norm": 1.2071483135223389, + "learning_rate": 0.0002695621921418321, + "loss": 2.1286, + "step": 9830 + }, + { + "epoch": 1.1470073503675184, + "grad_norm": 1.27816641330719, + "learning_rate": 0.00026955302024290003, + "loss": 2.0897, + "step": 9831 + }, + { + "epoch": 1.1471240228678101, + "grad_norm": 1.3859035968780518, + "learning_rate": 0.0002695438471201249, + "loss": 2.104, + "step": 9832 + }, + { + "epoch": 1.1472406953681018, + "grad_norm": 1.2668944597244263, + "learning_rate": 0.00026953467277360173, + "loss": 1.9709, + "step": 9833 + }, + { + "epoch": 1.1473573678683935, + "grad_norm": 1.2481917142868042, + "learning_rate": 0.0002695254972034256, + "loss": 2.2233, + "step": 9834 + }, + { + "epoch": 1.1474740403686852, + "grad_norm": 1.0139180421829224, + "learning_rate": 0.0002695163204096918, + "loss": 1.9558, + "step": 9835 + }, + { + "epoch": 1.1475907128689768, + "grad_norm": 1.1689006090164185, + "learning_rate": 0.0002695071423924953, + "loss": 1.9871, + "step": 9836 + }, + { + "epoch": 1.1477073853692685, + "grad_norm": 1.1349986791610718, + "learning_rate": 0.0002694979631519313, + "loss": 1.8001, + "step": 9837 + }, + { + "epoch": 1.1478240578695602, + "grad_norm": 1.247206449508667, + "learning_rate": 0.00026948878268809496, + "loss": 2.1544, + "step": 9838 + }, + { + "epoch": 1.147940730369852, + "grad_norm": 1.3211133480072021, + "learning_rate": 0.00026947960100108146, + "loss": 2.0442, + "step": 9839 + }, + { + "epoch": 1.1480574028701436, + "grad_norm": 1.0992604494094849, + "learning_rate": 0.000269470418090986, + "loss": 2.0936, + "step": 9840 + }, + { + "epoch": 1.1481740753704353, + "grad_norm": 1.2333875894546509, + "learning_rate": 0.00026946123395790375, + "loss": 2.1768, + "step": 9841 + }, + { + "epoch": 1.148290747870727, + "grad_norm": 1.4252636432647705, + "learning_rate": 0.00026945204860193, + "loss": 2.1242, + "step": 9842 + }, + { + "epoch": 1.1484074203710186, + "grad_norm": 1.1101677417755127, + "learning_rate": 0.00026944286202315983, + "loss": 2.0742, + "step": 9843 + }, + { + "epoch": 1.1485240928713103, + "grad_norm": 1.1435989141464233, + "learning_rate": 0.00026943367422168854, + "loss": 2.0925, + "step": 9844 + }, + { + "epoch": 1.148640765371602, + "grad_norm": 1.3671088218688965, + "learning_rate": 0.0002694244851976114, + "loss": 2.2099, + "step": 9845 + }, + { + "epoch": 1.1487574378718937, + "grad_norm": 1.1181617975234985, + "learning_rate": 0.00026941529495102374, + "loss": 2.0464, + "step": 9846 + }, + { + "epoch": 1.1488741103721853, + "grad_norm": 1.0872167348861694, + "learning_rate": 0.00026940610348202067, + "loss": 1.9319, + "step": 9847 + }, + { + "epoch": 1.148990782872477, + "grad_norm": 1.0967248678207397, + "learning_rate": 0.00026939691079069756, + "loss": 2.0007, + "step": 9848 + }, + { + "epoch": 1.1491074553727687, + "grad_norm": 1.0314033031463623, + "learning_rate": 0.00026938771687714977, + "loss": 2.15, + "step": 9849 + }, + { + "epoch": 1.1492241278730604, + "grad_norm": 1.1862359046936035, + "learning_rate": 0.00026937852174147253, + "loss": 2.0508, + "step": 9850 + }, + { + "epoch": 1.149340800373352, + "grad_norm": 1.146220326423645, + "learning_rate": 0.00026936932538376116, + "loss": 2.0996, + "step": 9851 + }, + { + "epoch": 1.1494574728736437, + "grad_norm": 1.2495145797729492, + "learning_rate": 0.0002693601278041111, + "loss": 2.3663, + "step": 9852 + }, + { + "epoch": 1.1495741453739354, + "grad_norm": 1.2073489427566528, + "learning_rate": 0.0002693509290026176, + "loss": 2.0232, + "step": 9853 + }, + { + "epoch": 1.1496908178742271, + "grad_norm": 1.160000205039978, + "learning_rate": 0.00026934172897937603, + "loss": 2.17, + "step": 9854 + }, + { + "epoch": 1.1498074903745188, + "grad_norm": 1.1604682207107544, + "learning_rate": 0.00026933252773448177, + "loss": 2.0925, + "step": 9855 + }, + { + "epoch": 1.1499241628748105, + "grad_norm": 1.3137726783752441, + "learning_rate": 0.00026932332526803025, + "loss": 2.0556, + "step": 9856 + }, + { + "epoch": 1.1500408353751022, + "grad_norm": 1.2413783073425293, + "learning_rate": 0.00026931412158011684, + "loss": 2.1088, + "step": 9857 + }, + { + "epoch": 1.1501575078753938, + "grad_norm": 1.119777798652649, + "learning_rate": 0.000269304916670837, + "loss": 2.302, + "step": 9858 + }, + { + "epoch": 1.1502741803756855, + "grad_norm": 1.2224061489105225, + "learning_rate": 0.0002692957105402861, + "loss": 2.2036, + "step": 9859 + }, + { + "epoch": 1.1503908528759772, + "grad_norm": 1.354736566543579, + "learning_rate": 0.00026928650318855954, + "loss": 2.2087, + "step": 9860 + }, + { + "epoch": 1.1505075253762689, + "grad_norm": 1.0648866891860962, + "learning_rate": 0.0002692772946157529, + "loss": 2.1132, + "step": 9861 + }, + { + "epoch": 1.1506241978765606, + "grad_norm": 1.0072146654129028, + "learning_rate": 0.00026926808482196155, + "loss": 1.9877, + "step": 9862 + }, + { + "epoch": 1.1507408703768522, + "grad_norm": 1.310222864151001, + "learning_rate": 0.00026925887380728096, + "loss": 2.1779, + "step": 9863 + }, + { + "epoch": 1.150857542877144, + "grad_norm": 0.9839957356452942, + "learning_rate": 0.00026924966157180667, + "loss": 1.9795, + "step": 9864 + }, + { + "epoch": 1.1509742153774356, + "grad_norm": 1.1014920473098755, + "learning_rate": 0.0002692404481156342, + "loss": 2.1285, + "step": 9865 + }, + { + "epoch": 1.1510908878777273, + "grad_norm": 1.0458134412765503, + "learning_rate": 0.00026923123343885894, + "loss": 2.0461, + "step": 9866 + }, + { + "epoch": 1.151207560378019, + "grad_norm": 1.1019227504730225, + "learning_rate": 0.0002692220175415766, + "loss": 2.0266, + "step": 9867 + }, + { + "epoch": 1.1513242328783106, + "grad_norm": 1.1750710010528564, + "learning_rate": 0.0002692128004238825, + "loss": 2.0092, + "step": 9868 + }, + { + "epoch": 1.1514409053786023, + "grad_norm": 1.23758864402771, + "learning_rate": 0.0002692035820858724, + "loss": 2.1323, + "step": 9869 + }, + { + "epoch": 1.151557577878894, + "grad_norm": 1.443543791770935, + "learning_rate": 0.00026919436252764175, + "loss": 2.1305, + "step": 9870 + }, + { + "epoch": 1.1516742503791857, + "grad_norm": 1.1546494960784912, + "learning_rate": 0.0002691851417492862, + "loss": 2.0983, + "step": 9871 + }, + { + "epoch": 1.1517909228794774, + "grad_norm": 1.1096794605255127, + "learning_rate": 0.00026917591975090133, + "loss": 2.0982, + "step": 9872 + }, + { + "epoch": 1.151907595379769, + "grad_norm": 1.1923706531524658, + "learning_rate": 0.00026916669653258263, + "loss": 2.0877, + "step": 9873 + }, + { + "epoch": 1.1520242678800607, + "grad_norm": 1.3138145208358765, + "learning_rate": 0.00026915747209442585, + "loss": 2.2375, + "step": 9874 + }, + { + "epoch": 1.1521409403803524, + "grad_norm": 1.126878023147583, + "learning_rate": 0.00026914824643652655, + "loss": 2.0177, + "step": 9875 + }, + { + "epoch": 1.152257612880644, + "grad_norm": 1.2745544910430908, + "learning_rate": 0.00026913901955898036, + "loss": 2.0544, + "step": 9876 + }, + { + "epoch": 1.1523742853809358, + "grad_norm": 1.0198371410369873, + "learning_rate": 0.000269129791461883, + "loss": 1.8851, + "step": 9877 + }, + { + "epoch": 1.1524909578812275, + "grad_norm": 1.3760044574737549, + "learning_rate": 0.00026912056214533015, + "loss": 1.9968, + "step": 9878 + }, + { + "epoch": 1.1526076303815191, + "grad_norm": 1.1768633127212524, + "learning_rate": 0.00026911133160941743, + "loss": 2.1124, + "step": 9879 + }, + { + "epoch": 1.1527243028818108, + "grad_norm": 1.0722051858901978, + "learning_rate": 0.00026910209985424045, + "loss": 1.9322, + "step": 9880 + }, + { + "epoch": 1.1528409753821025, + "grad_norm": 1.3343799114227295, + "learning_rate": 0.0002690928668798951, + "loss": 1.9588, + "step": 9881 + }, + { + "epoch": 1.1529576478823942, + "grad_norm": 1.1658910512924194, + "learning_rate": 0.0002690836326864769, + "loss": 1.9071, + "step": 9882 + }, + { + "epoch": 1.1530743203826859, + "grad_norm": 1.4053434133529663, + "learning_rate": 0.0002690743972740818, + "loss": 1.9051, + "step": 9883 + }, + { + "epoch": 1.1531909928829776, + "grad_norm": 1.1939725875854492, + "learning_rate": 0.00026906516064280535, + "loss": 2.1084, + "step": 9884 + }, + { + "epoch": 1.1533076653832692, + "grad_norm": 1.2887675762176514, + "learning_rate": 0.0002690559227927434, + "loss": 2.1928, + "step": 9885 + }, + { + "epoch": 1.153424337883561, + "grad_norm": 1.313315510749817, + "learning_rate": 0.0002690466837239917, + "loss": 2.1709, + "step": 9886 + }, + { + "epoch": 1.1535410103838526, + "grad_norm": 1.2639052867889404, + "learning_rate": 0.00026903744343664604, + "loss": 2.1551, + "step": 9887 + }, + { + "epoch": 1.1536576828841443, + "grad_norm": 1.3974181413650513, + "learning_rate": 0.0002690282019308022, + "loss": 2.0386, + "step": 9888 + }, + { + "epoch": 1.153774355384436, + "grad_norm": 1.0460482835769653, + "learning_rate": 0.00026901895920655597, + "loss": 2.1117, + "step": 9889 + }, + { + "epoch": 1.1538910278847276, + "grad_norm": 1.3053325414657593, + "learning_rate": 0.00026900971526400326, + "loss": 2.0329, + "step": 9890 + }, + { + "epoch": 1.1540077003850193, + "grad_norm": 1.2027649879455566, + "learning_rate": 0.0002690004701032398, + "loss": 2.1625, + "step": 9891 + }, + { + "epoch": 1.154124372885311, + "grad_norm": 1.2040224075317383, + "learning_rate": 0.0002689912237243614, + "loss": 2.0798, + "step": 9892 + }, + { + "epoch": 1.1542410453856027, + "grad_norm": 1.2416067123413086, + "learning_rate": 0.000268981976127464, + "loss": 1.9315, + "step": 9893 + }, + { + "epoch": 1.1543577178858944, + "grad_norm": 1.3136309385299683, + "learning_rate": 0.0002689727273126435, + "loss": 2.1665, + "step": 9894 + }, + { + "epoch": 1.154474390386186, + "grad_norm": 1.2140884399414062, + "learning_rate": 0.00026896347727999563, + "loss": 2.1544, + "step": 9895 + }, + { + "epoch": 1.1545910628864777, + "grad_norm": 1.285676121711731, + "learning_rate": 0.0002689542260296165, + "loss": 2.0869, + "step": 9896 + }, + { + "epoch": 1.1547077353867694, + "grad_norm": 1.121633529663086, + "learning_rate": 0.00026894497356160185, + "loss": 1.9924, + "step": 9897 + }, + { + "epoch": 1.154824407887061, + "grad_norm": 1.1265716552734375, + "learning_rate": 0.0002689357198760476, + "loss": 2.1667, + "step": 9898 + }, + { + "epoch": 1.1549410803873528, + "grad_norm": 1.4234519004821777, + "learning_rate": 0.0002689264649730498, + "loss": 2.1418, + "step": 9899 + }, + { + "epoch": 1.1550577528876445, + "grad_norm": 1.2395105361938477, + "learning_rate": 0.0002689172088527043, + "loss": 2.0705, + "step": 9900 + }, + { + "epoch": 1.1551744253879361, + "grad_norm": 1.427287220954895, + "learning_rate": 0.0002689079515151071, + "loss": 2.0564, + "step": 9901 + }, + { + "epoch": 1.1552910978882278, + "grad_norm": 1.1395361423492432, + "learning_rate": 0.00026889869296035414, + "loss": 2.1415, + "step": 9902 + }, + { + "epoch": 1.1554077703885195, + "grad_norm": 1.2824885845184326, + "learning_rate": 0.00026888943318854143, + "loss": 2.0873, + "step": 9903 + }, + { + "epoch": 1.1555244428888112, + "grad_norm": 1.090143084526062, + "learning_rate": 0.0002688801721997649, + "loss": 2.0979, + "step": 9904 + }, + { + "epoch": 1.1556411153891029, + "grad_norm": 1.1075350046157837, + "learning_rate": 0.00026887090999412063, + "loss": 1.8745, + "step": 9905 + }, + { + "epoch": 1.1557577878893945, + "grad_norm": 1.3182563781738281, + "learning_rate": 0.00026886164657170466, + "loss": 1.9756, + "step": 9906 + }, + { + "epoch": 1.1558744603896862, + "grad_norm": 1.2429423332214355, + "learning_rate": 0.0002688523819326129, + "loss": 2.0664, + "step": 9907 + }, + { + "epoch": 1.155991132889978, + "grad_norm": 1.163681983947754, + "learning_rate": 0.0002688431160769415, + "loss": 2.0259, + "step": 9908 + }, + { + "epoch": 1.1561078053902696, + "grad_norm": 1.061734676361084, + "learning_rate": 0.00026883384900478656, + "loss": 2.065, + "step": 9909 + }, + { + "epoch": 1.1562244778905613, + "grad_norm": 1.2768739461898804, + "learning_rate": 0.00026882458071624405, + "loss": 2.0053, + "step": 9910 + }, + { + "epoch": 1.156341150390853, + "grad_norm": 1.0754743814468384, + "learning_rate": 0.00026881531121141004, + "loss": 1.9566, + "step": 9911 + }, + { + "epoch": 1.1564578228911446, + "grad_norm": 1.350901484489441, + "learning_rate": 0.0002688060404903807, + "loss": 2.4046, + "step": 9912 + }, + { + "epoch": 1.1565744953914363, + "grad_norm": 1.3624615669250488, + "learning_rate": 0.00026879676855325215, + "loss": 1.9976, + "step": 9913 + }, + { + "epoch": 1.156691167891728, + "grad_norm": 1.1291580200195312, + "learning_rate": 0.00026878749540012044, + "loss": 1.8352, + "step": 9914 + }, + { + "epoch": 1.1568078403920197, + "grad_norm": 1.3411929607391357, + "learning_rate": 0.0002687782210310818, + "loss": 2.1222, + "step": 9915 + }, + { + "epoch": 1.1569245128923114, + "grad_norm": 1.2348448038101196, + "learning_rate": 0.0002687689454462322, + "loss": 2.1129, + "step": 9916 + }, + { + "epoch": 1.157041185392603, + "grad_norm": 1.2034497261047363, + "learning_rate": 0.00026875966864566806, + "loss": 2.1983, + "step": 9917 + }, + { + "epoch": 1.1571578578928947, + "grad_norm": 1.1420350074768066, + "learning_rate": 0.0002687503906294853, + "loss": 2.0748, + "step": 9918 + }, + { + "epoch": 1.1572745303931864, + "grad_norm": 1.1672927141189575, + "learning_rate": 0.00026874111139778024, + "loss": 1.9197, + "step": 9919 + }, + { + "epoch": 1.157391202893478, + "grad_norm": 1.3435498476028442, + "learning_rate": 0.00026873183095064897, + "loss": 2.2148, + "step": 9920 + }, + { + "epoch": 1.1575078753937698, + "grad_norm": 1.1741989850997925, + "learning_rate": 0.0002687225492881879, + "loss": 2.1214, + "step": 9921 + }, + { + "epoch": 1.1576245478940614, + "grad_norm": 1.2619848251342773, + "learning_rate": 0.000268713266410493, + "loss": 2.0332, + "step": 9922 + }, + { + "epoch": 1.1577412203943531, + "grad_norm": 1.2095389366149902, + "learning_rate": 0.00026870398231766077, + "loss": 1.9468, + "step": 9923 + }, + { + "epoch": 1.1578578928946448, + "grad_norm": 1.3181484937667847, + "learning_rate": 0.0002686946970097872, + "loss": 1.9113, + "step": 9924 + }, + { + "epoch": 1.1579745653949365, + "grad_norm": 1.2650699615478516, + "learning_rate": 0.0002686854104869688, + "loss": 2.2189, + "step": 9925 + }, + { + "epoch": 1.1580912378952282, + "grad_norm": 1.4011900424957275, + "learning_rate": 0.00026867612274930164, + "loss": 2.2279, + "step": 9926 + }, + { + "epoch": 1.1582079103955198, + "grad_norm": 0.9596818089485168, + "learning_rate": 0.00026866683379688205, + "loss": 1.9677, + "step": 9927 + }, + { + "epoch": 1.1583245828958115, + "grad_norm": 1.1772689819335938, + "learning_rate": 0.00026865754362980637, + "loss": 2.0277, + "step": 9928 + }, + { + "epoch": 1.1584412553961032, + "grad_norm": 1.1574606895446777, + "learning_rate": 0.0002686482522481709, + "loss": 2.0883, + "step": 9929 + }, + { + "epoch": 1.158557927896395, + "grad_norm": 1.1863399744033813, + "learning_rate": 0.00026863895965207197, + "loss": 1.8188, + "step": 9930 + }, + { + "epoch": 1.1586746003966866, + "grad_norm": 1.3394492864608765, + "learning_rate": 0.00026862966584160586, + "loss": 2.0136, + "step": 9931 + }, + { + "epoch": 1.1587912728969783, + "grad_norm": 1.2607885599136353, + "learning_rate": 0.00026862037081686903, + "loss": 1.8894, + "step": 9932 + }, + { + "epoch": 1.15890794539727, + "grad_norm": 1.0936375856399536, + "learning_rate": 0.0002686110745779577, + "loss": 2.2098, + "step": 9933 + }, + { + "epoch": 1.1590246178975616, + "grad_norm": 1.2908929586410522, + "learning_rate": 0.0002686017771249683, + "loss": 2.0975, + "step": 9934 + }, + { + "epoch": 1.1591412903978533, + "grad_norm": 1.2237728834152222, + "learning_rate": 0.0002685924784579973, + "loss": 2.2048, + "step": 9935 + }, + { + "epoch": 1.159257962898145, + "grad_norm": 1.3544232845306396, + "learning_rate": 0.00026858317857714106, + "loss": 2.1133, + "step": 9936 + }, + { + "epoch": 1.1593746353984367, + "grad_norm": 1.503271222114563, + "learning_rate": 0.0002685738774824959, + "loss": 2.1776, + "step": 9937 + }, + { + "epoch": 1.1594913078987283, + "grad_norm": 1.2638696432113647, + "learning_rate": 0.00026856457517415825, + "loss": 2.0993, + "step": 9938 + }, + { + "epoch": 1.15960798039902, + "grad_norm": 1.1316637992858887, + "learning_rate": 0.0002685552716522247, + "loss": 2.1038, + "step": 9939 + }, + { + "epoch": 1.1597246528993117, + "grad_norm": 1.2923678159713745, + "learning_rate": 0.0002685459669167915, + "loss": 2.2774, + "step": 9940 + }, + { + "epoch": 1.1598413253996034, + "grad_norm": 1.171929121017456, + "learning_rate": 0.0002685366609679553, + "loss": 2.1386, + "step": 9941 + }, + { + "epoch": 1.159957997899895, + "grad_norm": 1.1656694412231445, + "learning_rate": 0.0002685273538058124, + "loss": 2.1722, + "step": 9942 + }, + { + "epoch": 1.1600746704001867, + "grad_norm": 1.0853703022003174, + "learning_rate": 0.0002685180454304594, + "loss": 1.9261, + "step": 9943 + }, + { + "epoch": 1.1601913429004784, + "grad_norm": 1.0350781679153442, + "learning_rate": 0.0002685087358419928, + "loss": 2.0715, + "step": 9944 + }, + { + "epoch": 1.16030801540077, + "grad_norm": 1.188889741897583, + "learning_rate": 0.0002684994250405091, + "loss": 2.0312, + "step": 9945 + }, + { + "epoch": 1.1604246879010618, + "grad_norm": 1.200624704360962, + "learning_rate": 0.00026849011302610467, + "loss": 2.0138, + "step": 9946 + }, + { + "epoch": 1.1605413604013535, + "grad_norm": 1.0937570333480835, + "learning_rate": 0.0002684807997988763, + "loss": 2.0865, + "step": 9947 + }, + { + "epoch": 1.1606580329016452, + "grad_norm": 1.0363706350326538, + "learning_rate": 0.0002684714853589204, + "loss": 2.0623, + "step": 9948 + }, + { + "epoch": 1.1607747054019368, + "grad_norm": 1.3220024108886719, + "learning_rate": 0.0002684621697063335, + "loss": 2.0067, + "step": 9949 + }, + { + "epoch": 1.1608913779022285, + "grad_norm": 1.273321270942688, + "learning_rate": 0.0002684528528412122, + "loss": 2.1021, + "step": 9950 + }, + { + "epoch": 1.1610080504025202, + "grad_norm": 1.0796383619308472, + "learning_rate": 0.00026844353476365327, + "loss": 2.0547, + "step": 9951 + }, + { + "epoch": 1.1611247229028119, + "grad_norm": 1.1004422903060913, + "learning_rate": 0.000268434215473753, + "loss": 2.097, + "step": 9952 + }, + { + "epoch": 1.1612413954031036, + "grad_norm": 1.1555829048156738, + "learning_rate": 0.00026842489497160823, + "loss": 2.0846, + "step": 9953 + }, + { + "epoch": 1.1613580679033952, + "grad_norm": 1.532447099685669, + "learning_rate": 0.00026841557325731546, + "loss": 2.1977, + "step": 9954 + }, + { + "epoch": 1.161474740403687, + "grad_norm": 1.1352564096450806, + "learning_rate": 0.0002684062503309714, + "loss": 1.8839, + "step": 9955 + }, + { + "epoch": 1.1615914129039786, + "grad_norm": 1.1665725708007812, + "learning_rate": 0.0002683969261926727, + "loss": 1.9794, + "step": 9956 + }, + { + "epoch": 1.1617080854042703, + "grad_norm": 1.175874948501587, + "learning_rate": 0.00026838760084251596, + "loss": 2.0007, + "step": 9957 + }, + { + "epoch": 1.161824757904562, + "grad_norm": 1.1641740798950195, + "learning_rate": 0.00026837827428059796, + "loss": 1.8775, + "step": 9958 + }, + { + "epoch": 1.1619414304048536, + "grad_norm": 1.2906073331832886, + "learning_rate": 0.00026836894650701524, + "loss": 1.9918, + "step": 9959 + }, + { + "epoch": 1.1620581029051453, + "grad_norm": 1.084316611289978, + "learning_rate": 0.0002683596175218646, + "loss": 2.0714, + "step": 9960 + }, + { + "epoch": 1.162174775405437, + "grad_norm": 1.2785414457321167, + "learning_rate": 0.00026835028732524274, + "loss": 2.2057, + "step": 9961 + }, + { + "epoch": 1.1622914479057287, + "grad_norm": 1.4282269477844238, + "learning_rate": 0.0002683409559172464, + "loss": 2.2526, + "step": 9962 + }, + { + "epoch": 1.1624081204060204, + "grad_norm": 1.212430715560913, + "learning_rate": 0.0002683316232979722, + "loss": 2.1439, + "step": 9963 + }, + { + "epoch": 1.162524792906312, + "grad_norm": 1.304101824760437, + "learning_rate": 0.0002683222894675171, + "loss": 2.1012, + "step": 9964 + }, + { + "epoch": 1.1626414654066037, + "grad_norm": 1.1017274856567383, + "learning_rate": 0.0002683129544259777, + "loss": 2.0778, + "step": 9965 + }, + { + "epoch": 1.1627581379068954, + "grad_norm": 1.1357083320617676, + "learning_rate": 0.0002683036181734508, + "loss": 2.0369, + "step": 9966 + }, + { + "epoch": 1.162874810407187, + "grad_norm": 1.2076669931411743, + "learning_rate": 0.00026829428071003326, + "loss": 2.0519, + "step": 9967 + }, + { + "epoch": 1.1629914829074788, + "grad_norm": 1.2105305194854736, + "learning_rate": 0.00026828494203582183, + "loss": 2.0697, + "step": 9968 + }, + { + "epoch": 1.1631081554077705, + "grad_norm": 1.3569914102554321, + "learning_rate": 0.00026827560215091333, + "loss": 2.2379, + "step": 9969 + }, + { + "epoch": 1.1632248279080621, + "grad_norm": 1.16325044631958, + "learning_rate": 0.00026826626105540453, + "loss": 1.9268, + "step": 9970 + }, + { + "epoch": 1.1633415004083538, + "grad_norm": 1.1321425437927246, + "learning_rate": 0.00026825691874939235, + "loss": 2.0875, + "step": 9971 + }, + { + "epoch": 1.1634581729086455, + "grad_norm": 1.2183457612991333, + "learning_rate": 0.00026824757523297366, + "loss": 2.0919, + "step": 9972 + }, + { + "epoch": 1.1635748454089372, + "grad_norm": 1.3288251161575317, + "learning_rate": 0.00026823823050624513, + "loss": 2.0379, + "step": 9973 + }, + { + "epoch": 1.1636915179092289, + "grad_norm": 1.2035043239593506, + "learning_rate": 0.00026822888456930395, + "loss": 2.0798, + "step": 9974 + }, + { + "epoch": 1.1638081904095205, + "grad_norm": 1.2517714500427246, + "learning_rate": 0.0002682195374222467, + "loss": 2.0597, + "step": 9975 + }, + { + "epoch": 1.1639248629098122, + "grad_norm": 1.213984489440918, + "learning_rate": 0.00026821018906517054, + "loss": 2.0628, + "step": 9976 + }, + { + "epoch": 1.164041535410104, + "grad_norm": 1.2036124467849731, + "learning_rate": 0.00026820083949817216, + "loss": 2.2146, + "step": 9977 + }, + { + "epoch": 1.1641582079103956, + "grad_norm": 1.2108614444732666, + "learning_rate": 0.00026819148872134864, + "loss": 2.012, + "step": 9978 + }, + { + "epoch": 1.1642748804106873, + "grad_norm": 1.294071912765503, + "learning_rate": 0.0002681821367347968, + "loss": 2.1273, + "step": 9979 + }, + { + "epoch": 1.164391552910979, + "grad_norm": 1.1565132141113281, + "learning_rate": 0.0002681727835386137, + "loss": 2.0539, + "step": 9980 + }, + { + "epoch": 1.1645082254112706, + "grad_norm": 1.2192389965057373, + "learning_rate": 0.00026816342913289635, + "loss": 2.0836, + "step": 9981 + }, + { + "epoch": 1.1646248979115623, + "grad_norm": 1.0538619756698608, + "learning_rate": 0.00026815407351774153, + "loss": 1.9928, + "step": 9982 + }, + { + "epoch": 1.164741570411854, + "grad_norm": 1.161969780921936, + "learning_rate": 0.00026814471669324636, + "loss": 2.1444, + "step": 9983 + }, + { + "epoch": 1.1648582429121457, + "grad_norm": 1.1297155618667603, + "learning_rate": 0.0002681353586595078, + "loss": 2.0135, + "step": 9984 + }, + { + "epoch": 1.1649749154124374, + "grad_norm": 1.3156975507736206, + "learning_rate": 0.0002681259994166229, + "loss": 2.1732, + "step": 9985 + }, + { + "epoch": 1.165091587912729, + "grad_norm": 1.2246063947677612, + "learning_rate": 0.0002681166389646887, + "loss": 2.0602, + "step": 9986 + }, + { + "epoch": 1.1652082604130207, + "grad_norm": 1.088401198387146, + "learning_rate": 0.0002681072773038021, + "loss": 2.0881, + "step": 9987 + }, + { + "epoch": 1.1653249329133124, + "grad_norm": 1.2919981479644775, + "learning_rate": 0.0002680979144340604, + "loss": 1.9816, + "step": 9988 + }, + { + "epoch": 1.165441605413604, + "grad_norm": 1.1500402688980103, + "learning_rate": 0.0002680885503555604, + "loss": 2.105, + "step": 9989 + }, + { + "epoch": 1.1655582779138958, + "grad_norm": 1.1306864023208618, + "learning_rate": 0.00026807918506839936, + "loss": 2.054, + "step": 9990 + }, + { + "epoch": 1.1656749504141874, + "grad_norm": 1.1688882112503052, + "learning_rate": 0.0002680698185726743, + "loss": 2.1126, + "step": 9991 + }, + { + "epoch": 1.1657916229144791, + "grad_norm": 1.3126535415649414, + "learning_rate": 0.00026806045086848234, + "loss": 1.9432, + "step": 9992 + }, + { + "epoch": 1.1659082954147708, + "grad_norm": 1.146417260169983, + "learning_rate": 0.0002680510819559205, + "loss": 1.904, + "step": 9993 + }, + { + "epoch": 1.1660249679150625, + "grad_norm": 1.3583636283874512, + "learning_rate": 0.0002680417118350861, + "loss": 2.0429, + "step": 9994 + }, + { + "epoch": 1.1661416404153542, + "grad_norm": 1.3169888257980347, + "learning_rate": 0.0002680323405060761, + "loss": 2.1944, + "step": 9995 + }, + { + "epoch": 1.1662583129156459, + "grad_norm": 1.2725993394851685, + "learning_rate": 0.0002680229679689877, + "loss": 2.1877, + "step": 9996 + }, + { + "epoch": 1.1663749854159375, + "grad_norm": 1.3453469276428223, + "learning_rate": 0.00026801359422391817, + "loss": 2.0661, + "step": 9997 + }, + { + "epoch": 1.1664916579162292, + "grad_norm": 1.1546169519424438, + "learning_rate": 0.0002680042192709645, + "loss": 2.0765, + "step": 9998 + }, + { + "epoch": 1.166608330416521, + "grad_norm": 1.206926703453064, + "learning_rate": 0.000267994843110224, + "loss": 2.2172, + "step": 9999 + }, + { + "epoch": 1.1667250029168126, + "grad_norm": 1.033019781112671, + "learning_rate": 0.0002679854657417939, + "loss": 2.1509, + "step": 10000 + }, + { + "epoch": 1.1668416754171043, + "grad_norm": 1.4590277671813965, + "learning_rate": 0.00026797608716577136, + "loss": 2.19, + "step": 10001 + }, + { + "epoch": 1.166958347917396, + "grad_norm": 1.1243574619293213, + "learning_rate": 0.00026796670738225354, + "loss": 2.0964, + "step": 10002 + }, + { + "epoch": 1.1670750204176876, + "grad_norm": 1.139386534690857, + "learning_rate": 0.0002679573263913378, + "loss": 2.1083, + "step": 10003 + }, + { + "epoch": 1.1671916929179793, + "grad_norm": 1.13433837890625, + "learning_rate": 0.00026794794419312135, + "loss": 2.1615, + "step": 10004 + }, + { + "epoch": 1.167308365418271, + "grad_norm": 1.4901155233383179, + "learning_rate": 0.00026793856078770144, + "loss": 1.9968, + "step": 10005 + }, + { + "epoch": 1.1674250379185627, + "grad_norm": 1.2849189043045044, + "learning_rate": 0.00026792917617517533, + "loss": 2.0563, + "step": 10006 + }, + { + "epoch": 1.1675417104188544, + "grad_norm": 1.2035950422286987, + "learning_rate": 0.0002679197903556404, + "loss": 2.2418, + "step": 10007 + }, + { + "epoch": 1.167658382919146, + "grad_norm": 1.2776048183441162, + "learning_rate": 0.00026791040332919383, + "loss": 2.2095, + "step": 10008 + }, + { + "epoch": 1.1677750554194377, + "grad_norm": 1.1744449138641357, + "learning_rate": 0.000267901015095933, + "loss": 2.2609, + "step": 10009 + }, + { + "epoch": 1.1678917279197294, + "grad_norm": 1.1266899108886719, + "learning_rate": 0.00026789162565595524, + "loss": 1.9976, + "step": 10010 + }, + { + "epoch": 1.168008400420021, + "grad_norm": 1.1169884204864502, + "learning_rate": 0.00026788223500935783, + "loss": 2.1671, + "step": 10011 + }, + { + "epoch": 1.1681250729203128, + "grad_norm": 1.1401387453079224, + "learning_rate": 0.0002678728431562382, + "loss": 2.1999, + "step": 10012 + }, + { + "epoch": 1.1682417454206044, + "grad_norm": 0.9802332520484924, + "learning_rate": 0.0002678634500966937, + "loss": 1.8897, + "step": 10013 + }, + { + "epoch": 1.1683584179208961, + "grad_norm": 1.3368421792984009, + "learning_rate": 0.0002678540558308216, + "loss": 2.1819, + "step": 10014 + }, + { + "epoch": 1.1684750904211878, + "grad_norm": 1.323469638824463, + "learning_rate": 0.0002678446603587194, + "loss": 2.1821, + "step": 10015 + }, + { + "epoch": 1.1685917629214795, + "grad_norm": 1.2193127870559692, + "learning_rate": 0.0002678352636804846, + "loss": 2.1807, + "step": 10016 + }, + { + "epoch": 1.1687084354217712, + "grad_norm": 1.1513031721115112, + "learning_rate": 0.00026782586579621436, + "loss": 2.1231, + "step": 10017 + }, + { + "epoch": 1.1688251079220628, + "grad_norm": 1.04558265209198, + "learning_rate": 0.0002678164667060063, + "loss": 1.9637, + "step": 10018 + }, + { + "epoch": 1.1689417804223545, + "grad_norm": 1.3752740621566772, + "learning_rate": 0.0002678070664099578, + "loss": 2.1365, + "step": 10019 + }, + { + "epoch": 1.1690584529226462, + "grad_norm": 1.3556299209594727, + "learning_rate": 0.00026779766490816624, + "loss": 2.1225, + "step": 10020 + }, + { + "epoch": 1.1691751254229379, + "grad_norm": 1.1645947694778442, + "learning_rate": 0.00026778826220072926, + "loss": 2.0289, + "step": 10021 + }, + { + "epoch": 1.1692917979232296, + "grad_norm": 1.3013070821762085, + "learning_rate": 0.00026777885828774414, + "loss": 2.0835, + "step": 10022 + }, + { + "epoch": 1.1694084704235213, + "grad_norm": 1.1582709550857544, + "learning_rate": 0.00026776945316930856, + "loss": 2.0732, + "step": 10023 + }, + { + "epoch": 1.169525142923813, + "grad_norm": 1.4147725105285645, + "learning_rate": 0.00026776004684551984, + "loss": 2.1339, + "step": 10024 + }, + { + "epoch": 1.1696418154241046, + "grad_norm": 1.2095255851745605, + "learning_rate": 0.0002677506393164756, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 1.1697584879243963, + "grad_norm": 1.2769280672073364, + "learning_rate": 0.0002677412305822734, + "loss": 2.0139, + "step": 10026 + }, + { + "epoch": 1.169875160424688, + "grad_norm": 1.0888121128082275, + "learning_rate": 0.0002677318206430107, + "loss": 2.033, + "step": 10027 + }, + { + "epoch": 1.1699918329249797, + "grad_norm": 1.1641013622283936, + "learning_rate": 0.000267722409498785, + "loss": 2.0379, + "step": 10028 + }, + { + "epoch": 1.1701085054252713, + "grad_norm": 1.1879850625991821, + "learning_rate": 0.000267712997149694, + "loss": 2.1183, + "step": 10029 + }, + { + "epoch": 1.170225177925563, + "grad_norm": 1.7696596384048462, + "learning_rate": 0.0002677035835958352, + "loss": 2.0826, + "step": 10030 + }, + { + "epoch": 1.1703418504258547, + "grad_norm": 1.183815598487854, + "learning_rate": 0.00026769416883730624, + "loss": 2.1149, + "step": 10031 + }, + { + "epoch": 1.1704585229261464, + "grad_norm": 1.1283410787582397, + "learning_rate": 0.0002676847528742046, + "loss": 2.309, + "step": 10032 + }, + { + "epoch": 1.170575195426438, + "grad_norm": 1.201621651649475, + "learning_rate": 0.0002676753357066281, + "loss": 1.9623, + "step": 10033 + }, + { + "epoch": 1.1706918679267297, + "grad_norm": 1.4450818300247192, + "learning_rate": 0.00026766591733467425, + "loss": 2.1157, + "step": 10034 + }, + { + "epoch": 1.1708085404270214, + "grad_norm": 1.2228639125823975, + "learning_rate": 0.0002676564977584406, + "loss": 2.0379, + "step": 10035 + }, + { + "epoch": 1.170925212927313, + "grad_norm": 1.1096078157424927, + "learning_rate": 0.0002676470769780249, + "loss": 1.9957, + "step": 10036 + }, + { + "epoch": 1.1710418854276048, + "grad_norm": 1.1901484727859497, + "learning_rate": 0.0002676376549935249, + "loss": 2.0665, + "step": 10037 + }, + { + "epoch": 1.1711585579278965, + "grad_norm": 1.332900047302246, + "learning_rate": 0.00026762823180503806, + "loss": 2.1194, + "step": 10038 + }, + { + "epoch": 1.1712752304281882, + "grad_norm": 1.1504193544387817, + "learning_rate": 0.00026761880741266224, + "loss": 2.0277, + "step": 10039 + }, + { + "epoch": 1.1713919029284798, + "grad_norm": 1.0983707904815674, + "learning_rate": 0.0002676093818164951, + "loss": 2.0978, + "step": 10040 + }, + { + "epoch": 1.1715085754287715, + "grad_norm": 1.2358222007751465, + "learning_rate": 0.00026759995501663433, + "loss": 2.0331, + "step": 10041 + }, + { + "epoch": 1.1716252479290632, + "grad_norm": 1.3228750228881836, + "learning_rate": 0.00026759052701317773, + "loss": 2.2331, + "step": 10042 + }, + { + "epoch": 1.1717419204293549, + "grad_norm": 1.3809207677841187, + "learning_rate": 0.0002675810978062229, + "loss": 2.1504, + "step": 10043 + }, + { + "epoch": 1.1718585929296466, + "grad_norm": 1.1296147108078003, + "learning_rate": 0.00026757166739586777, + "loss": 1.8267, + "step": 10044 + }, + { + "epoch": 1.1719752654299382, + "grad_norm": 1.1965994834899902, + "learning_rate": 0.0002675622357822099, + "loss": 2.1709, + "step": 10045 + }, + { + "epoch": 1.17209193793023, + "grad_norm": 1.6508861780166626, + "learning_rate": 0.00026755280296534726, + "loss": 2.0552, + "step": 10046 + }, + { + "epoch": 1.1722086104305216, + "grad_norm": 1.2599486112594604, + "learning_rate": 0.0002675433689453775, + "loss": 2.1883, + "step": 10047 + }, + { + "epoch": 1.1723252829308133, + "grad_norm": 1.1422585248947144, + "learning_rate": 0.00026753393372239853, + "loss": 1.951, + "step": 10048 + }, + { + "epoch": 1.172441955431105, + "grad_norm": 1.06069815158844, + "learning_rate": 0.000267524497296508, + "loss": 2.1518, + "step": 10049 + }, + { + "epoch": 1.1725586279313966, + "grad_norm": 1.2679853439331055, + "learning_rate": 0.00026751505966780397, + "loss": 1.8548, + "step": 10050 + }, + { + "epoch": 1.1726753004316883, + "grad_norm": 1.1471222639083862, + "learning_rate": 0.0002675056208363841, + "loss": 1.9082, + "step": 10051 + }, + { + "epoch": 1.17279197293198, + "grad_norm": 1.1264952421188354, + "learning_rate": 0.0002674961808023463, + "loss": 2.021, + "step": 10052 + }, + { + "epoch": 1.1729086454322717, + "grad_norm": 1.261679768562317, + "learning_rate": 0.00026748673956578843, + "loss": 2.1092, + "step": 10053 + }, + { + "epoch": 1.1730253179325634, + "grad_norm": 1.295212745666504, + "learning_rate": 0.0002674772971268083, + "loss": 2.1052, + "step": 10054 + }, + { + "epoch": 1.173141990432855, + "grad_norm": 1.0696054697036743, + "learning_rate": 0.0002674678534855039, + "loss": 1.9815, + "step": 10055 + }, + { + "epoch": 1.1732586629331467, + "grad_norm": 1.1533875465393066, + "learning_rate": 0.00026745840864197316, + "loss": 2.0211, + "step": 10056 + }, + { + "epoch": 1.1733753354334384, + "grad_norm": 1.1303168535232544, + "learning_rate": 0.0002674489625963139, + "loss": 2.1193, + "step": 10057 + }, + { + "epoch": 1.17349200793373, + "grad_norm": 1.3480620384216309, + "learning_rate": 0.0002674395153486241, + "loss": 2.2201, + "step": 10058 + }, + { + "epoch": 1.1736086804340218, + "grad_norm": 1.2818983793258667, + "learning_rate": 0.00026743006689900156, + "loss": 2.071, + "step": 10059 + }, + { + "epoch": 1.1737253529343135, + "grad_norm": 1.1331571340560913, + "learning_rate": 0.00026742061724754446, + "loss": 1.7576, + "step": 10060 + }, + { + "epoch": 1.1738420254346051, + "grad_norm": 1.1932610273361206, + "learning_rate": 0.0002674111663943506, + "loss": 2.0596, + "step": 10061 + }, + { + "epoch": 1.1739586979348968, + "grad_norm": 1.1135770082473755, + "learning_rate": 0.000267401714339518, + "loss": 2.048, + "step": 10062 + }, + { + "epoch": 1.1740753704351885, + "grad_norm": 1.2977936267852783, + "learning_rate": 0.00026739226108314466, + "loss": 2.2029, + "step": 10063 + }, + { + "epoch": 1.1741920429354802, + "grad_norm": 1.2998350858688354, + "learning_rate": 0.00026738280662532863, + "loss": 2.0228, + "step": 10064 + }, + { + "epoch": 1.1743087154357719, + "grad_norm": 1.437227487564087, + "learning_rate": 0.0002673733509661678, + "loss": 2.2339, + "step": 10065 + }, + { + "epoch": 1.1744253879360635, + "grad_norm": 1.1540383100509644, + "learning_rate": 0.00026736389410576025, + "loss": 1.942, + "step": 10066 + }, + { + "epoch": 1.1745420604363552, + "grad_norm": 1.2906166315078735, + "learning_rate": 0.00026735443604420414, + "loss": 1.9855, + "step": 10067 + }, + { + "epoch": 1.174658732936647, + "grad_norm": 1.1327084302902222, + "learning_rate": 0.0002673449767815973, + "loss": 2.0318, + "step": 10068 + }, + { + "epoch": 1.1747754054369386, + "grad_norm": 1.0924851894378662, + "learning_rate": 0.0002673355163180379, + "loss": 2.0818, + "step": 10069 + }, + { + "epoch": 1.1748920779372303, + "grad_norm": 1.2424955368041992, + "learning_rate": 0.0002673260546536241, + "loss": 2.2902, + "step": 10070 + }, + { + "epoch": 1.175008750437522, + "grad_norm": 1.1665661334991455, + "learning_rate": 0.0002673165917884539, + "loss": 1.7644, + "step": 10071 + }, + { + "epoch": 1.1751254229378136, + "grad_norm": 1.1736347675323486, + "learning_rate": 0.0002673071277226254, + "loss": 1.9382, + "step": 10072 + }, + { + "epoch": 1.1752420954381053, + "grad_norm": 1.1475410461425781, + "learning_rate": 0.0002672976624562367, + "loss": 2.1179, + "step": 10073 + }, + { + "epoch": 1.175358767938397, + "grad_norm": 1.141765832901001, + "learning_rate": 0.0002672881959893859, + "loss": 2.071, + "step": 10074 + }, + { + "epoch": 1.1754754404386887, + "grad_norm": 1.3384666442871094, + "learning_rate": 0.00026727872832217126, + "loss": 2.0171, + "step": 10075 + }, + { + "epoch": 1.1755921129389804, + "grad_norm": 1.1300323009490967, + "learning_rate": 0.0002672692594546908, + "loss": 1.9974, + "step": 10076 + }, + { + "epoch": 1.175708785439272, + "grad_norm": 1.1909817457199097, + "learning_rate": 0.00026725978938704276, + "loss": 2.1235, + "step": 10077 + }, + { + "epoch": 1.1758254579395637, + "grad_norm": 1.3532555103302002, + "learning_rate": 0.00026725031811932533, + "loss": 2.0085, + "step": 10078 + }, + { + "epoch": 1.1759421304398554, + "grad_norm": 1.2228202819824219, + "learning_rate": 0.00026724084565163663, + "loss": 2.1284, + "step": 10079 + }, + { + "epoch": 1.176058802940147, + "grad_norm": 1.2257877588272095, + "learning_rate": 0.0002672313719840749, + "loss": 2.1502, + "step": 10080 + }, + { + "epoch": 1.1761754754404388, + "grad_norm": 1.2849698066711426, + "learning_rate": 0.0002672218971167383, + "loss": 2.085, + "step": 10081 + }, + { + "epoch": 1.1762921479407304, + "grad_norm": 1.3033835887908936, + "learning_rate": 0.0002672124210497252, + "loss": 2.0522, + "step": 10082 + }, + { + "epoch": 1.1764088204410221, + "grad_norm": 1.2014316320419312, + "learning_rate": 0.0002672029437831337, + "loss": 2.0432, + "step": 10083 + }, + { + "epoch": 1.1765254929413138, + "grad_norm": 1.267681360244751, + "learning_rate": 0.00026719346531706205, + "loss": 2.2079, + "step": 10084 + }, + { + "epoch": 1.1766421654416055, + "grad_norm": 1.4333158731460571, + "learning_rate": 0.00026718398565160856, + "loss": 2.081, + "step": 10085 + }, + { + "epoch": 1.1767588379418972, + "grad_norm": 1.1728124618530273, + "learning_rate": 0.0002671745047868715, + "loss": 1.9394, + "step": 10086 + }, + { + "epoch": 1.1768755104421889, + "grad_norm": 1.178229808807373, + "learning_rate": 0.00026716502272294915, + "loss": 1.8278, + "step": 10087 + }, + { + "epoch": 1.1769921829424805, + "grad_norm": 1.3062236309051514, + "learning_rate": 0.00026715553945993976, + "loss": 2.1328, + "step": 10088 + }, + { + "epoch": 1.1771088554427722, + "grad_norm": 1.1582645177841187, + "learning_rate": 0.00026714605499794175, + "loss": 2.1267, + "step": 10089 + }, + { + "epoch": 1.177225527943064, + "grad_norm": 1.2229959964752197, + "learning_rate": 0.00026713656933705337, + "loss": 2.0407, + "step": 10090 + }, + { + "epoch": 1.1773422004433556, + "grad_norm": 1.31804358959198, + "learning_rate": 0.000267127082477373, + "loss": 2.0084, + "step": 10091 + }, + { + "epoch": 1.1774588729436473, + "grad_norm": 1.2464474439620972, + "learning_rate": 0.0002671175944189989, + "loss": 2.0214, + "step": 10092 + }, + { + "epoch": 1.177575545443939, + "grad_norm": 1.0940114259719849, + "learning_rate": 0.00026710810516202953, + "loss": 1.8038, + "step": 10093 + }, + { + "epoch": 1.1776922179442306, + "grad_norm": 1.4336086511611938, + "learning_rate": 0.0002670986147065632, + "loss": 2.1934, + "step": 10094 + }, + { + "epoch": 1.1778088904445223, + "grad_norm": 1.1543877124786377, + "learning_rate": 0.00026708912305269843, + "loss": 2.0968, + "step": 10095 + }, + { + "epoch": 1.177925562944814, + "grad_norm": 1.2865254878997803, + "learning_rate": 0.00026707963020053345, + "loss": 2.1213, + "step": 10096 + }, + { + "epoch": 1.1780422354451057, + "grad_norm": 1.0691670179367065, + "learning_rate": 0.0002670701361501667, + "loss": 2.0944, + "step": 10097 + }, + { + "epoch": 1.1781589079453973, + "grad_norm": 1.2649232149124146, + "learning_rate": 0.00026706064090169663, + "loss": 2.2727, + "step": 10098 + }, + { + "epoch": 1.178275580445689, + "grad_norm": 1.2323157787322998, + "learning_rate": 0.00026705114445522174, + "loss": 2.032, + "step": 10099 + }, + { + "epoch": 1.1783922529459807, + "grad_norm": 1.2597943544387817, + "learning_rate": 0.00026704164681084044, + "loss": 2.0534, + "step": 10100 + }, + { + "epoch": 1.1785089254462724, + "grad_norm": 1.241778016090393, + "learning_rate": 0.0002670321479686511, + "loss": 2.2349, + "step": 10101 + }, + { + "epoch": 1.178625597946564, + "grad_norm": 1.1672505140304565, + "learning_rate": 0.0002670226479287523, + "loss": 2.0292, + "step": 10102 + }, + { + "epoch": 1.1787422704468558, + "grad_norm": 1.2553045749664307, + "learning_rate": 0.00026701314669124257, + "loss": 2.1407, + "step": 10103 + }, + { + "epoch": 1.1788589429471474, + "grad_norm": 1.3723976612091064, + "learning_rate": 0.0002670036442562202, + "loss": 2.1602, + "step": 10104 + }, + { + "epoch": 1.1789756154474391, + "grad_norm": 1.073043704032898, + "learning_rate": 0.00026699414062378394, + "loss": 1.9592, + "step": 10105 + }, + { + "epoch": 1.1790922879477308, + "grad_norm": 1.1750538349151611, + "learning_rate": 0.0002669846357940321, + "loss": 2.0134, + "step": 10106 + }, + { + "epoch": 1.1792089604480225, + "grad_norm": 1.1525838375091553, + "learning_rate": 0.00026697512976706344, + "loss": 2.0858, + "step": 10107 + }, + { + "epoch": 1.1793256329483142, + "grad_norm": 1.1028214693069458, + "learning_rate": 0.0002669656225429763, + "loss": 2.2214, + "step": 10108 + }, + { + "epoch": 1.1794423054486058, + "grad_norm": 1.2900185585021973, + "learning_rate": 0.0002669561141218694, + "loss": 2.2806, + "step": 10109 + }, + { + "epoch": 1.1795589779488975, + "grad_norm": 1.3673791885375977, + "learning_rate": 0.0002669466045038412, + "loss": 2.2153, + "step": 10110 + }, + { + "epoch": 1.1796756504491892, + "grad_norm": 1.1697533130645752, + "learning_rate": 0.00026693709368899035, + "loss": 2.1746, + "step": 10111 + }, + { + "epoch": 1.1797923229494809, + "grad_norm": 1.4079018831253052, + "learning_rate": 0.00026692758167741536, + "loss": 2.1525, + "step": 10112 + }, + { + "epoch": 1.1799089954497726, + "grad_norm": 1.1029361486434937, + "learning_rate": 0.00026691806846921494, + "loss": 1.9702, + "step": 10113 + }, + { + "epoch": 1.1800256679500642, + "grad_norm": 1.4035210609436035, + "learning_rate": 0.00026690855406448776, + "loss": 1.9492, + "step": 10114 + }, + { + "epoch": 1.180142340450356, + "grad_norm": 1.148276925086975, + "learning_rate": 0.00026689903846333226, + "loss": 2.2286, + "step": 10115 + }, + { + "epoch": 1.1802590129506476, + "grad_norm": 1.1648149490356445, + "learning_rate": 0.00026688952166584726, + "loss": 2.067, + "step": 10116 + }, + { + "epoch": 1.1803756854509393, + "grad_norm": 1.21876859664917, + "learning_rate": 0.0002668800036721314, + "loss": 1.9511, + "step": 10117 + }, + { + "epoch": 1.180492357951231, + "grad_norm": 1.1100807189941406, + "learning_rate": 0.00026687048448228323, + "loss": 2.2318, + "step": 10118 + }, + { + "epoch": 1.1806090304515227, + "grad_norm": 1.3997268676757812, + "learning_rate": 0.00026686096409640157, + "loss": 2.1671, + "step": 10119 + }, + { + "epoch": 1.1807257029518143, + "grad_norm": 1.2030202150344849, + "learning_rate": 0.000266851442514585, + "loss": 2.1008, + "step": 10120 + }, + { + "epoch": 1.180842375452106, + "grad_norm": 1.1663613319396973, + "learning_rate": 0.0002668419197369323, + "loss": 1.9998, + "step": 10121 + }, + { + "epoch": 1.1809590479523977, + "grad_norm": 1.1147220134735107, + "learning_rate": 0.00026683239576354227, + "loss": 2.017, + "step": 10122 + }, + { + "epoch": 1.1810757204526894, + "grad_norm": 1.5408233404159546, + "learning_rate": 0.0002668228705945135, + "loss": 2.1052, + "step": 10123 + }, + { + "epoch": 1.181192392952981, + "grad_norm": 1.031060814857483, + "learning_rate": 0.0002668133442299448, + "loss": 1.9025, + "step": 10124 + }, + { + "epoch": 1.1813090654532727, + "grad_norm": 0.9904870986938477, + "learning_rate": 0.0002668038166699349, + "loss": 2.0015, + "step": 10125 + }, + { + "epoch": 1.1814257379535644, + "grad_norm": 1.276110053062439, + "learning_rate": 0.00026679428791458263, + "loss": 2.214, + "step": 10126 + }, + { + "epoch": 1.181542410453856, + "grad_norm": 1.2200284004211426, + "learning_rate": 0.0002667847579639867, + "loss": 2.1353, + "step": 10127 + }, + { + "epoch": 1.1816590829541478, + "grad_norm": 1.1057108640670776, + "learning_rate": 0.000266775226818246, + "loss": 2.0716, + "step": 10128 + }, + { + "epoch": 1.1817757554544395, + "grad_norm": 1.2997833490371704, + "learning_rate": 0.00026676569447745923, + "loss": 2.0553, + "step": 10129 + }, + { + "epoch": 1.1818924279547312, + "grad_norm": 1.2412422895431519, + "learning_rate": 0.00026675616094172527, + "loss": 1.9182, + "step": 10130 + }, + { + "epoch": 1.1820091004550228, + "grad_norm": 1.2496330738067627, + "learning_rate": 0.000266746626211143, + "loss": 2.1876, + "step": 10131 + }, + { + "epoch": 1.1821257729553145, + "grad_norm": 1.0753084421157837, + "learning_rate": 0.0002667370902858112, + "loss": 1.9621, + "step": 10132 + }, + { + "epoch": 1.1822424454556062, + "grad_norm": 1.1301792860031128, + "learning_rate": 0.0002667275531658287, + "loss": 1.8292, + "step": 10133 + }, + { + "epoch": 1.1823591179558979, + "grad_norm": 1.1169438362121582, + "learning_rate": 0.0002667180148512944, + "loss": 1.9521, + "step": 10134 + }, + { + "epoch": 1.1824757904561896, + "grad_norm": 1.1141080856323242, + "learning_rate": 0.00026670847534230723, + "loss": 1.9453, + "step": 10135 + }, + { + "epoch": 1.1825924629564812, + "grad_norm": 1.2187728881835938, + "learning_rate": 0.00026669893463896605, + "loss": 1.9886, + "step": 10136 + }, + { + "epoch": 1.182709135456773, + "grad_norm": 1.3249305486679077, + "learning_rate": 0.0002666893927413697, + "loss": 2.1273, + "step": 10137 + }, + { + "epoch": 1.1828258079570646, + "grad_norm": 1.0757001638412476, + "learning_rate": 0.00026667984964961724, + "loss": 2.0096, + "step": 10138 + }, + { + "epoch": 1.1829424804573563, + "grad_norm": 1.3128293752670288, + "learning_rate": 0.00026667030536380753, + "loss": 2.1415, + "step": 10139 + }, + { + "epoch": 1.183059152957648, + "grad_norm": 1.386872410774231, + "learning_rate": 0.0002666607598840395, + "loss": 1.9806, + "step": 10140 + }, + { + "epoch": 1.1831758254579396, + "grad_norm": 1.3693536520004272, + "learning_rate": 0.00026665121321041207, + "loss": 2.1179, + "step": 10141 + }, + { + "epoch": 1.1832924979582313, + "grad_norm": 1.2143958806991577, + "learning_rate": 0.00026664166534302424, + "loss": 2.0813, + "step": 10142 + }, + { + "epoch": 1.183409170458523, + "grad_norm": 1.312560796737671, + "learning_rate": 0.00026663211628197514, + "loss": 2.1426, + "step": 10143 + }, + { + "epoch": 1.1835258429588147, + "grad_norm": 1.2514631748199463, + "learning_rate": 0.00026662256602736354, + "loss": 1.9226, + "step": 10144 + }, + { + "epoch": 1.1836425154591064, + "grad_norm": 1.4091469049453735, + "learning_rate": 0.00026661301457928853, + "loss": 2.2667, + "step": 10145 + }, + { + "epoch": 1.183759187959398, + "grad_norm": 1.3043783903121948, + "learning_rate": 0.00026660346193784916, + "loss": 2.1158, + "step": 10146 + }, + { + "epoch": 1.1838758604596897, + "grad_norm": 1.363008737564087, + "learning_rate": 0.00026659390810314447, + "loss": 2.1616, + "step": 10147 + }, + { + "epoch": 1.1839925329599814, + "grad_norm": 1.1773818731307983, + "learning_rate": 0.00026658435307527344, + "loss": 2.1028, + "step": 10148 + }, + { + "epoch": 1.184109205460273, + "grad_norm": 1.343398094177246, + "learning_rate": 0.00026657479685433523, + "loss": 2.1262, + "step": 10149 + }, + { + "epoch": 1.1842258779605648, + "grad_norm": 1.1335811614990234, + "learning_rate": 0.00026656523944042877, + "loss": 2.0406, + "step": 10150 + }, + { + "epoch": 1.1843425504608565, + "grad_norm": 1.007930040359497, + "learning_rate": 0.0002665556808336532, + "loss": 1.9773, + "step": 10151 + }, + { + "epoch": 1.1844592229611481, + "grad_norm": 1.2737056016921997, + "learning_rate": 0.0002665461210341076, + "loss": 1.9982, + "step": 10152 + }, + { + "epoch": 1.1845758954614398, + "grad_norm": 1.1818861961364746, + "learning_rate": 0.0002665365600418911, + "loss": 1.966, + "step": 10153 + }, + { + "epoch": 1.1846925679617315, + "grad_norm": 1.3958324193954468, + "learning_rate": 0.00026652699785710294, + "loss": 2.2315, + "step": 10154 + }, + { + "epoch": 1.1848092404620232, + "grad_norm": 1.2340235710144043, + "learning_rate": 0.000266517434479842, + "loss": 1.9727, + "step": 10155 + }, + { + "epoch": 1.1849259129623149, + "grad_norm": 1.2612842321395874, + "learning_rate": 0.0002665078699102076, + "loss": 2.0688, + "step": 10156 + }, + { + "epoch": 1.1850425854626065, + "grad_norm": 1.2605226039886475, + "learning_rate": 0.0002664983041482988, + "loss": 2.2038, + "step": 10157 + }, + { + "epoch": 1.1851592579628982, + "grad_norm": 1.0097236633300781, + "learning_rate": 0.0002664887371942149, + "loss": 2.0072, + "step": 10158 + }, + { + "epoch": 1.18527593046319, + "grad_norm": 1.239092230796814, + "learning_rate": 0.0002664791690480549, + "loss": 2.2056, + "step": 10159 + }, + { + "epoch": 1.1853926029634816, + "grad_norm": 1.3445111513137817, + "learning_rate": 0.00026646959970991815, + "loss": 2.0628, + "step": 10160 + }, + { + "epoch": 1.1855092754637733, + "grad_norm": 1.1773109436035156, + "learning_rate": 0.0002664600291799037, + "loss": 2.1516, + "step": 10161 + }, + { + "epoch": 1.185625947964065, + "grad_norm": 1.160248041152954, + "learning_rate": 0.00026645045745811094, + "loss": 2.0572, + "step": 10162 + }, + { + "epoch": 1.1857426204643566, + "grad_norm": 1.0968750715255737, + "learning_rate": 0.00026644088454463893, + "loss": 2.0511, + "step": 10163 + }, + { + "epoch": 1.1858592929646483, + "grad_norm": 1.2979117631912231, + "learning_rate": 0.00026643131043958704, + "loss": 2.0781, + "step": 10164 + }, + { + "epoch": 1.18597596546494, + "grad_norm": 1.0482323169708252, + "learning_rate": 0.00026642173514305453, + "loss": 2.1043, + "step": 10165 + }, + { + "epoch": 1.1860926379652317, + "grad_norm": 1.0550209283828735, + "learning_rate": 0.00026641215865514056, + "loss": 1.9607, + "step": 10166 + }, + { + "epoch": 1.1862093104655234, + "grad_norm": 1.4123841524124146, + "learning_rate": 0.00026640258097594444, + "loss": 2.1558, + "step": 10167 + }, + { + "epoch": 1.186325982965815, + "grad_norm": 1.0859668254852295, + "learning_rate": 0.00026639300210556553, + "loss": 2.1677, + "step": 10168 + }, + { + "epoch": 1.1864426554661067, + "grad_norm": 1.1217740774154663, + "learning_rate": 0.00026638342204410304, + "loss": 1.9166, + "step": 10169 + }, + { + "epoch": 1.1865593279663984, + "grad_norm": 1.2091461420059204, + "learning_rate": 0.00026637384079165633, + "loss": 2.0244, + "step": 10170 + }, + { + "epoch": 1.18667600046669, + "grad_norm": 1.3855825662612915, + "learning_rate": 0.0002663642583483248, + "loss": 2.0949, + "step": 10171 + }, + { + "epoch": 1.1867926729669818, + "grad_norm": 1.3386998176574707, + "learning_rate": 0.00026635467471420763, + "loss": 2.0901, + "step": 10172 + }, + { + "epoch": 1.1869093454672734, + "grad_norm": 1.1961250305175781, + "learning_rate": 0.0002663450898894043, + "loss": 2.0926, + "step": 10173 + }, + { + "epoch": 1.1870260179675651, + "grad_norm": 1.2157498598098755, + "learning_rate": 0.00026633550387401416, + "loss": 2.021, + "step": 10174 + }, + { + "epoch": 1.1871426904678568, + "grad_norm": 1.3289293050765991, + "learning_rate": 0.0002663259166681365, + "loss": 2.1553, + "step": 10175 + }, + { + "epoch": 1.1872593629681485, + "grad_norm": 1.4312156438827515, + "learning_rate": 0.0002663163282718708, + "loss": 2.1546, + "step": 10176 + }, + { + "epoch": 1.1873760354684402, + "grad_norm": 1.299649953842163, + "learning_rate": 0.0002663067386853165, + "loss": 2.147, + "step": 10177 + }, + { + "epoch": 1.1874927079687319, + "grad_norm": 1.3314993381500244, + "learning_rate": 0.00026629714790857284, + "loss": 2.1788, + "step": 10178 + }, + { + "epoch": 1.1876093804690235, + "grad_norm": 1.164818525314331, + "learning_rate": 0.0002662875559417394, + "loss": 1.9964, + "step": 10179 + }, + { + "epoch": 1.1877260529693152, + "grad_norm": 1.3485544919967651, + "learning_rate": 0.00026627796278491563, + "loss": 1.9937, + "step": 10180 + }, + { + "epoch": 1.187842725469607, + "grad_norm": 1.0280841588974, + "learning_rate": 0.00026626836843820085, + "loss": 1.8756, + "step": 10181 + }, + { + "epoch": 1.1879593979698986, + "grad_norm": 1.1750953197479248, + "learning_rate": 0.00026625877290169464, + "loss": 2.1776, + "step": 10182 + }, + { + "epoch": 1.1880760704701903, + "grad_norm": 1.2360007762908936, + "learning_rate": 0.0002662491761754964, + "loss": 2.1637, + "step": 10183 + }, + { + "epoch": 1.188192742970482, + "grad_norm": 1.1754859685897827, + "learning_rate": 0.00026623957825970573, + "loss": 2.1398, + "step": 10184 + }, + { + "epoch": 1.1883094154707736, + "grad_norm": 1.228100299835205, + "learning_rate": 0.00026622997915442203, + "loss": 2.1145, + "step": 10185 + }, + { + "epoch": 1.1884260879710653, + "grad_norm": 1.095301866531372, + "learning_rate": 0.0002662203788597448, + "loss": 1.9774, + "step": 10186 + }, + { + "epoch": 1.188542760471357, + "grad_norm": 1.3370425701141357, + "learning_rate": 0.0002662107773757736, + "loss": 2.0465, + "step": 10187 + }, + { + "epoch": 1.1886594329716487, + "grad_norm": 1.2149207592010498, + "learning_rate": 0.00026620117470260794, + "loss": 2.2266, + "step": 10188 + }, + { + "epoch": 1.1887761054719403, + "grad_norm": 1.2455626726150513, + "learning_rate": 0.00026619157084034747, + "loss": 2.0153, + "step": 10189 + }, + { + "epoch": 1.188892777972232, + "grad_norm": 1.2064237594604492, + "learning_rate": 0.0002661819657890917, + "loss": 2.1385, + "step": 10190 + }, + { + "epoch": 1.1890094504725237, + "grad_norm": 1.068413257598877, + "learning_rate": 0.0002661723595489401, + "loss": 1.9483, + "step": 10191 + }, + { + "epoch": 1.1891261229728154, + "grad_norm": 1.3133875131607056, + "learning_rate": 0.0002661627521199924, + "loss": 2.2178, + "step": 10192 + }, + { + "epoch": 1.189242795473107, + "grad_norm": 1.2009085416793823, + "learning_rate": 0.0002661531435023481, + "loss": 2.1108, + "step": 10193 + }, + { + "epoch": 1.1893594679733988, + "grad_norm": 1.1192198991775513, + "learning_rate": 0.0002661435336961069, + "loss": 2.0577, + "step": 10194 + }, + { + "epoch": 1.1894761404736904, + "grad_norm": 1.3912808895111084, + "learning_rate": 0.0002661339227013683, + "loss": 2.1196, + "step": 10195 + }, + { + "epoch": 1.1895928129739821, + "grad_norm": 1.2174440622329712, + "learning_rate": 0.0002661243105182322, + "loss": 1.8845, + "step": 10196 + }, + { + "epoch": 1.1897094854742738, + "grad_norm": 1.245046615600586, + "learning_rate": 0.0002661146971467979, + "loss": 2.1313, + "step": 10197 + }, + { + "epoch": 1.1898261579745655, + "grad_norm": 1.208012342453003, + "learning_rate": 0.00026610508258716523, + "loss": 2.0641, + "step": 10198 + }, + { + "epoch": 1.1899428304748572, + "grad_norm": 1.2075780630111694, + "learning_rate": 0.0002660954668394339, + "loss": 2.0945, + "step": 10199 + }, + { + "epoch": 1.1900595029751488, + "grad_norm": 1.1837822198867798, + "learning_rate": 0.0002660858499037035, + "loss": 2.1916, + "step": 10200 + }, + { + "epoch": 1.1901761754754405, + "grad_norm": 1.3139578104019165, + "learning_rate": 0.00026607623178007383, + "loss": 2.2036, + "step": 10201 + }, + { + "epoch": 1.1902928479757322, + "grad_norm": 1.2335492372512817, + "learning_rate": 0.00026606661246864456, + "loss": 1.907, + "step": 10202 + }, + { + "epoch": 1.1904095204760239, + "grad_norm": 1.1669628620147705, + "learning_rate": 0.0002660569919695154, + "loss": 2.1009, + "step": 10203 + }, + { + "epoch": 1.1905261929763156, + "grad_norm": 1.1115080118179321, + "learning_rate": 0.00026604737028278604, + "loss": 2.1211, + "step": 10204 + }, + { + "epoch": 1.1906428654766072, + "grad_norm": 1.2213644981384277, + "learning_rate": 0.0002660377474085563, + "loss": 2.1208, + "step": 10205 + }, + { + "epoch": 1.190759537976899, + "grad_norm": 1.3473763465881348, + "learning_rate": 0.0002660281233469259, + "loss": 2.2099, + "step": 10206 + }, + { + "epoch": 1.1908762104771906, + "grad_norm": 1.1316083669662476, + "learning_rate": 0.0002660184980979947, + "loss": 1.9278, + "step": 10207 + }, + { + "epoch": 1.1909928829774823, + "grad_norm": 1.2070655822753906, + "learning_rate": 0.00026600887166186224, + "loss": 2.0203, + "step": 10208 + }, + { + "epoch": 1.191109555477774, + "grad_norm": 1.1023552417755127, + "learning_rate": 0.00026599924403862866, + "loss": 2.0464, + "step": 10209 + }, + { + "epoch": 1.1912262279780657, + "grad_norm": 1.1516669988632202, + "learning_rate": 0.0002659896152283935, + "loss": 2.1513, + "step": 10210 + }, + { + "epoch": 1.1913429004783573, + "grad_norm": 1.118140697479248, + "learning_rate": 0.0002659799852312567, + "loss": 2.2249, + "step": 10211 + }, + { + "epoch": 1.191459572978649, + "grad_norm": 1.2780100107192993, + "learning_rate": 0.0002659703540473181, + "loss": 2.119, + "step": 10212 + }, + { + "epoch": 1.1915762454789407, + "grad_norm": 1.2371554374694824, + "learning_rate": 0.00026596072167667744, + "loss": 2.2081, + "step": 10213 + }, + { + "epoch": 1.1916929179792324, + "grad_norm": 1.1784406900405884, + "learning_rate": 0.00026595108811943477, + "loss": 2.1325, + "step": 10214 + }, + { + "epoch": 1.191809590479524, + "grad_norm": 1.102130651473999, + "learning_rate": 0.0002659414533756897, + "loss": 2.1182, + "step": 10215 + }, + { + "epoch": 1.1919262629798157, + "grad_norm": 1.2270684242248535, + "learning_rate": 0.0002659318174455423, + "loss": 2.1429, + "step": 10216 + }, + { + "epoch": 1.1920429354801074, + "grad_norm": 1.2214397192001343, + "learning_rate": 0.00026592218032909246, + "loss": 2.0846, + "step": 10217 + }, + { + "epoch": 1.192159607980399, + "grad_norm": 1.2880628108978271, + "learning_rate": 0.00026591254202644005, + "loss": 1.955, + "step": 10218 + }, + { + "epoch": 1.1922762804806908, + "grad_norm": 1.0777024030685425, + "learning_rate": 0.0002659029025376849, + "loss": 2.0678, + "step": 10219 + }, + { + "epoch": 1.1923929529809825, + "grad_norm": 1.170137882232666, + "learning_rate": 0.0002658932618629271, + "loss": 2.0951, + "step": 10220 + }, + { + "epoch": 1.1925096254812741, + "grad_norm": 1.2886980772018433, + "learning_rate": 0.0002658836200022665, + "loss": 2.1609, + "step": 10221 + }, + { + "epoch": 1.1926262979815658, + "grad_norm": 1.137818694114685, + "learning_rate": 0.00026587397695580306, + "loss": 2.1221, + "step": 10222 + }, + { + "epoch": 1.1927429704818575, + "grad_norm": 1.177474856376648, + "learning_rate": 0.0002658643327236367, + "loss": 2.1142, + "step": 10223 + }, + { + "epoch": 1.1928596429821492, + "grad_norm": 1.1347167491912842, + "learning_rate": 0.0002658546873058676, + "loss": 2.1427, + "step": 10224 + }, + { + "epoch": 1.1929763154824409, + "grad_norm": 1.022446870803833, + "learning_rate": 0.00026584504070259555, + "loss": 2.0798, + "step": 10225 + }, + { + "epoch": 1.1930929879827326, + "grad_norm": 1.1086723804473877, + "learning_rate": 0.00026583539291392066, + "loss": 1.9744, + "step": 10226 + }, + { + "epoch": 1.1932096604830242, + "grad_norm": 0.9821906089782715, + "learning_rate": 0.0002658257439399429, + "loss": 2.0196, + "step": 10227 + }, + { + "epoch": 1.193326332983316, + "grad_norm": 1.1154145002365112, + "learning_rate": 0.0002658160937807623, + "loss": 1.8942, + "step": 10228 + }, + { + "epoch": 1.1934430054836076, + "grad_norm": 1.234824299812317, + "learning_rate": 0.0002658064424364789, + "loss": 2.0748, + "step": 10229 + }, + { + "epoch": 1.1935596779838993, + "grad_norm": 1.1448549032211304, + "learning_rate": 0.0002657967899071927, + "loss": 2.0856, + "step": 10230 + }, + { + "epoch": 1.193676350484191, + "grad_norm": 1.1714259386062622, + "learning_rate": 0.00026578713619300395, + "loss": 2.19, + "step": 10231 + }, + { + "epoch": 1.1937930229844826, + "grad_norm": 1.0980596542358398, + "learning_rate": 0.0002657774812940126, + "loss": 1.9646, + "step": 10232 + }, + { + "epoch": 1.1939096954847743, + "grad_norm": 1.2830361127853394, + "learning_rate": 0.00026576782521031873, + "loss": 2.0376, + "step": 10233 + }, + { + "epoch": 1.194026367985066, + "grad_norm": 1.149314284324646, + "learning_rate": 0.00026575816794202243, + "loss": 2.2641, + "step": 10234 + }, + { + "epoch": 1.1941430404853577, + "grad_norm": 1.2593708038330078, + "learning_rate": 0.00026574850948922387, + "loss": 1.9824, + "step": 10235 + }, + { + "epoch": 1.1942597129856494, + "grad_norm": 1.1672879457473755, + "learning_rate": 0.0002657388498520232, + "loss": 2.0907, + "step": 10236 + }, + { + "epoch": 1.194376385485941, + "grad_norm": 1.1219220161437988, + "learning_rate": 0.0002657291890305205, + "loss": 2.115, + "step": 10237 + }, + { + "epoch": 1.1944930579862327, + "grad_norm": 1.1199793815612793, + "learning_rate": 0.0002657195270248159, + "loss": 2.0332, + "step": 10238 + }, + { + "epoch": 1.1946097304865244, + "grad_norm": 1.0903578996658325, + "learning_rate": 0.0002657098638350097, + "loss": 2.2117, + "step": 10239 + }, + { + "epoch": 1.194726402986816, + "grad_norm": 1.234298586845398, + "learning_rate": 0.00026570019946120196, + "loss": 1.895, + "step": 10240 + }, + { + "epoch": 1.1948430754871078, + "grad_norm": 1.1367624998092651, + "learning_rate": 0.0002656905339034928, + "loss": 2.1085, + "step": 10241 + }, + { + "epoch": 1.1949597479873995, + "grad_norm": 1.2459850311279297, + "learning_rate": 0.00026568086716198265, + "loss": 2.1555, + "step": 10242 + }, + { + "epoch": 1.1950764204876911, + "grad_norm": 1.0217846632003784, + "learning_rate": 0.0002656711992367715, + "loss": 2.0796, + "step": 10243 + }, + { + "epoch": 1.1951930929879828, + "grad_norm": 1.2572325468063354, + "learning_rate": 0.00026566153012795975, + "loss": 2.0777, + "step": 10244 + }, + { + "epoch": 1.1953097654882745, + "grad_norm": 1.207937240600586, + "learning_rate": 0.0002656518598356475, + "loss": 2.0639, + "step": 10245 + }, + { + "epoch": 1.1954264379885662, + "grad_norm": 1.1972731351852417, + "learning_rate": 0.0002656421883599351, + "loss": 2.0177, + "step": 10246 + }, + { + "epoch": 1.1955431104888579, + "grad_norm": 1.0506081581115723, + "learning_rate": 0.0002656325157009227, + "loss": 1.9097, + "step": 10247 + }, + { + "epoch": 1.1956597829891495, + "grad_norm": 1.2066998481750488, + "learning_rate": 0.0002656228418587107, + "loss": 2.1935, + "step": 10248 + }, + { + "epoch": 1.1957764554894412, + "grad_norm": 1.1616549491882324, + "learning_rate": 0.00026561316683339936, + "loss": 2.0721, + "step": 10249 + }, + { + "epoch": 1.195893127989733, + "grad_norm": 1.4355740547180176, + "learning_rate": 0.0002656034906250889, + "loss": 2.1101, + "step": 10250 + }, + { + "epoch": 1.1960098004900246, + "grad_norm": 1.2941348552703857, + "learning_rate": 0.0002655938132338797, + "loss": 2.0978, + "step": 10251 + }, + { + "epoch": 1.1961264729903163, + "grad_norm": 1.3257925510406494, + "learning_rate": 0.0002655841346598721, + "loss": 2.0409, + "step": 10252 + }, + { + "epoch": 1.196243145490608, + "grad_norm": 1.3226481676101685, + "learning_rate": 0.0002655744549031664, + "loss": 2.1084, + "step": 10253 + }, + { + "epoch": 1.1963598179908996, + "grad_norm": 1.1341036558151245, + "learning_rate": 0.000265564773963863, + "loss": 2.0601, + "step": 10254 + }, + { + "epoch": 1.1964764904911913, + "grad_norm": 1.2124048471450806, + "learning_rate": 0.0002655550918420621, + "loss": 2.126, + "step": 10255 + }, + { + "epoch": 1.196593162991483, + "grad_norm": 1.2240475416183472, + "learning_rate": 0.00026554540853786424, + "loss": 2.077, + "step": 10256 + }, + { + "epoch": 1.1967098354917747, + "grad_norm": 1.266058087348938, + "learning_rate": 0.0002655357240513698, + "loss": 2.094, + "step": 10257 + }, + { + "epoch": 1.1968265079920664, + "grad_norm": 1.5204591751098633, + "learning_rate": 0.0002655260383826791, + "loss": 2.0689, + "step": 10258 + }, + { + "epoch": 1.196943180492358, + "grad_norm": 1.5969682931900024, + "learning_rate": 0.0002655163515318926, + "loss": 1.9785, + "step": 10259 + }, + { + "epoch": 1.1970598529926497, + "grad_norm": 1.1357702016830444, + "learning_rate": 0.00026550666349911073, + "loss": 2.1897, + "step": 10260 + }, + { + "epoch": 1.1971765254929414, + "grad_norm": 1.190864086151123, + "learning_rate": 0.0002654969742844339, + "loss": 2.0652, + "step": 10261 + }, + { + "epoch": 1.197293197993233, + "grad_norm": 1.0842078924179077, + "learning_rate": 0.0002654872838879625, + "loss": 2.0912, + "step": 10262 + }, + { + "epoch": 1.1974098704935248, + "grad_norm": 1.1389212608337402, + "learning_rate": 0.0002654775923097971, + "loss": 1.9525, + "step": 10263 + }, + { + "epoch": 1.1975265429938164, + "grad_norm": 1.2264033555984497, + "learning_rate": 0.0002654678995500381, + "loss": 2.2965, + "step": 10264 + }, + { + "epoch": 1.1976432154941081, + "grad_norm": 1.220826268196106, + "learning_rate": 0.000265458205608786, + "loss": 2.1109, + "step": 10265 + }, + { + "epoch": 1.1977598879943998, + "grad_norm": 1.1492527723312378, + "learning_rate": 0.0002654485104861413, + "loss": 2.0241, + "step": 10266 + }, + { + "epoch": 1.1978765604946915, + "grad_norm": 1.1981534957885742, + "learning_rate": 0.00026543881418220457, + "loss": 2.0996, + "step": 10267 + }, + { + "epoch": 1.1979932329949832, + "grad_norm": 1.1822422742843628, + "learning_rate": 0.00026542911669707623, + "loss": 2.102, + "step": 10268 + }, + { + "epoch": 1.1981099054952749, + "grad_norm": 1.2160522937774658, + "learning_rate": 0.0002654194180308568, + "loss": 2.0309, + "step": 10269 + }, + { + "epoch": 1.1982265779955665, + "grad_norm": 1.2740799188613892, + "learning_rate": 0.00026540971818364697, + "loss": 2.0318, + "step": 10270 + }, + { + "epoch": 1.1983432504958582, + "grad_norm": 1.1101064682006836, + "learning_rate": 0.00026540001715554717, + "loss": 1.9956, + "step": 10271 + }, + { + "epoch": 1.19845992299615, + "grad_norm": 1.3112033605575562, + "learning_rate": 0.000265390314946658, + "loss": 2.0592, + "step": 10272 + }, + { + "epoch": 1.1985765954964416, + "grad_norm": 1.0831652879714966, + "learning_rate": 0.00026538061155708005, + "loss": 2.0069, + "step": 10273 + }, + { + "epoch": 1.1986932679967333, + "grad_norm": 1.0188546180725098, + "learning_rate": 0.00026537090698691387, + "loss": 1.9373, + "step": 10274 + }, + { + "epoch": 1.198809940497025, + "grad_norm": 1.083798885345459, + "learning_rate": 0.00026536120123626017, + "loss": 1.9268, + "step": 10275 + }, + { + "epoch": 1.1989266129973166, + "grad_norm": 1.0555356740951538, + "learning_rate": 0.00026535149430521946, + "loss": 2.1175, + "step": 10276 + }, + { + "epoch": 1.1990432854976083, + "grad_norm": 1.1320419311523438, + "learning_rate": 0.0002653417861938925, + "loss": 1.9555, + "step": 10277 + }, + { + "epoch": 1.1991599579979, + "grad_norm": 1.2614846229553223, + "learning_rate": 0.00026533207690237977, + "loss": 1.8622, + "step": 10278 + }, + { + "epoch": 1.1992766304981917, + "grad_norm": 1.29086434841156, + "learning_rate": 0.000265322366430782, + "loss": 1.9491, + "step": 10279 + }, + { + "epoch": 1.1993933029984833, + "grad_norm": 2.4858853816986084, + "learning_rate": 0.00026531265477919985, + "loss": 2.2176, + "step": 10280 + }, + { + "epoch": 1.199509975498775, + "grad_norm": 1.2669347524642944, + "learning_rate": 0.00026530294194773403, + "loss": 2.2759, + "step": 10281 + }, + { + "epoch": 1.1996266479990667, + "grad_norm": 1.0797653198242188, + "learning_rate": 0.0002652932279364853, + "loss": 2.0901, + "step": 10282 + }, + { + "epoch": 1.1997433204993584, + "grad_norm": 1.2317181825637817, + "learning_rate": 0.00026528351274555413, + "loss": 2.1746, + "step": 10283 + }, + { + "epoch": 1.19985999299965, + "grad_norm": 1.1640512943267822, + "learning_rate": 0.0002652737963750415, + "loss": 2.1749, + "step": 10284 + }, + { + "epoch": 1.1999766654999418, + "grad_norm": 1.217170238494873, + "learning_rate": 0.00026526407882504797, + "loss": 1.9624, + "step": 10285 + }, + { + "epoch": 1.2000933380002334, + "grad_norm": 1.1501352787017822, + "learning_rate": 0.0002652543600956743, + "loss": 1.9952, + "step": 10286 + }, + { + "epoch": 1.2002100105005251, + "grad_norm": 1.0373914241790771, + "learning_rate": 0.0002652446401870213, + "loss": 2.0315, + "step": 10287 + }, + { + "epoch": 1.2003266830008168, + "grad_norm": 1.0709574222564697, + "learning_rate": 0.00026523491909918976, + "loss": 2.0869, + "step": 10288 + }, + { + "epoch": 1.2004433555011085, + "grad_norm": 1.155549168586731, + "learning_rate": 0.0002652251968322804, + "loss": 2.0951, + "step": 10289 + }, + { + "epoch": 1.2005600280014002, + "grad_norm": 1.1440171003341675, + "learning_rate": 0.000265215473386394, + "loss": 2.1499, + "step": 10290 + }, + { + "epoch": 1.2006767005016918, + "grad_norm": 1.250311255455017, + "learning_rate": 0.00026520574876163133, + "loss": 1.985, + "step": 10291 + }, + { + "epoch": 1.2007933730019835, + "grad_norm": 1.213796615600586, + "learning_rate": 0.00026519602295809337, + "loss": 2.0073, + "step": 10292 + }, + { + "epoch": 1.2009100455022752, + "grad_norm": 1.1519955396652222, + "learning_rate": 0.00026518629597588073, + "loss": 2.0779, + "step": 10293 + }, + { + "epoch": 1.2010267180025669, + "grad_norm": 1.16828453540802, + "learning_rate": 0.0002651765678150943, + "loss": 2.102, + "step": 10294 + }, + { + "epoch": 1.2011433905028586, + "grad_norm": 1.4801474809646606, + "learning_rate": 0.00026516683847583515, + "loss": 2.2162, + "step": 10295 + }, + { + "epoch": 1.2012600630031502, + "grad_norm": 1.0925140380859375, + "learning_rate": 0.00026515710795820387, + "loss": 2.0734, + "step": 10296 + }, + { + "epoch": 1.201376735503442, + "grad_norm": 1.169199824333191, + "learning_rate": 0.00026514737626230145, + "loss": 2.0235, + "step": 10297 + }, + { + "epoch": 1.2014934080037336, + "grad_norm": 1.101043462753296, + "learning_rate": 0.0002651376433882288, + "loss": 2.0383, + "step": 10298 + }, + { + "epoch": 1.2016100805040253, + "grad_norm": 1.274351954460144, + "learning_rate": 0.0002651279093360867, + "loss": 2.2468, + "step": 10299 + }, + { + "epoch": 1.201726753004317, + "grad_norm": 1.2217761278152466, + "learning_rate": 0.0002651181741059762, + "loss": 2.1221, + "step": 10300 + }, + { + "epoch": 1.2018434255046087, + "grad_norm": 1.2852427959442139, + "learning_rate": 0.0002651084376979982, + "loss": 1.9758, + "step": 10301 + }, + { + "epoch": 1.2019600980049003, + "grad_norm": 1.3186724185943604, + "learning_rate": 0.00026509870011225356, + "loss": 2.2239, + "step": 10302 + }, + { + "epoch": 1.202076770505192, + "grad_norm": 1.068009614944458, + "learning_rate": 0.0002650889613488433, + "loss": 2.0334, + "step": 10303 + }, + { + "epoch": 1.2021934430054837, + "grad_norm": 1.4364984035491943, + "learning_rate": 0.00026507922140786837, + "loss": 2.0207, + "step": 10304 + }, + { + "epoch": 1.2023101155057754, + "grad_norm": 1.0092648267745972, + "learning_rate": 0.00026506948028942975, + "loss": 1.9331, + "step": 10305 + }, + { + "epoch": 1.202426788006067, + "grad_norm": 1.2595384120941162, + "learning_rate": 0.00026505973799362834, + "loss": 2.2464, + "step": 10306 + }, + { + "epoch": 1.2025434605063587, + "grad_norm": 1.1169443130493164, + "learning_rate": 0.00026504999452056524, + "loss": 1.9705, + "step": 10307 + }, + { + "epoch": 1.2026601330066504, + "grad_norm": 1.3681296110153198, + "learning_rate": 0.0002650402498703414, + "loss": 2.1363, + "step": 10308 + }, + { + "epoch": 1.202776805506942, + "grad_norm": 1.3504279851913452, + "learning_rate": 0.0002650305040430579, + "loss": 1.9122, + "step": 10309 + }, + { + "epoch": 1.2028934780072338, + "grad_norm": 1.248176097869873, + "learning_rate": 0.0002650207570388157, + "loss": 1.9547, + "step": 10310 + }, + { + "epoch": 1.2030101505075255, + "grad_norm": 1.348126769065857, + "learning_rate": 0.00026501100885771594, + "loss": 2.0926, + "step": 10311 + }, + { + "epoch": 1.2031268230078171, + "grad_norm": 1.315871000289917, + "learning_rate": 0.0002650012594998596, + "loss": 2.2681, + "step": 10312 + }, + { + "epoch": 1.2032434955081088, + "grad_norm": 1.16068434715271, + "learning_rate": 0.0002649915089653477, + "loss": 2.0359, + "step": 10313 + }, + { + "epoch": 1.2033601680084005, + "grad_norm": 1.1779128313064575, + "learning_rate": 0.00026498175725428143, + "loss": 1.9334, + "step": 10314 + }, + { + "epoch": 1.2034768405086922, + "grad_norm": 1.0353200435638428, + "learning_rate": 0.0002649720043667619, + "loss": 2.1668, + "step": 10315 + }, + { + "epoch": 1.2035935130089839, + "grad_norm": 1.0386884212493896, + "learning_rate": 0.00026496225030289017, + "loss": 2.0152, + "step": 10316 + }, + { + "epoch": 1.2037101855092756, + "grad_norm": 1.2730461359024048, + "learning_rate": 0.00026495249506276736, + "loss": 2.128, + "step": 10317 + }, + { + "epoch": 1.2038268580095672, + "grad_norm": 1.0739109516143799, + "learning_rate": 0.00026494273864649455, + "loss": 1.9755, + "step": 10318 + }, + { + "epoch": 1.203943530509859, + "grad_norm": 1.1325052976608276, + "learning_rate": 0.0002649329810541729, + "loss": 2.0429, + "step": 10319 + }, + { + "epoch": 1.2040602030101506, + "grad_norm": 1.2301591634750366, + "learning_rate": 0.00026492322228590363, + "loss": 2.0801, + "step": 10320 + }, + { + "epoch": 1.2041768755104423, + "grad_norm": 1.1817668676376343, + "learning_rate": 0.00026491346234178793, + "loss": 2.1166, + "step": 10321 + }, + { + "epoch": 1.204293548010734, + "grad_norm": 1.231053352355957, + "learning_rate": 0.0002649037012219269, + "loss": 2.1233, + "step": 10322 + }, + { + "epoch": 1.2044102205110256, + "grad_norm": 1.1628011465072632, + "learning_rate": 0.00026489393892642173, + "loss": 2.1551, + "step": 10323 + }, + { + "epoch": 1.2045268930113173, + "grad_norm": 1.2171502113342285, + "learning_rate": 0.0002648841754553737, + "loss": 2.0958, + "step": 10324 + }, + { + "epoch": 1.204643565511609, + "grad_norm": 1.332535982131958, + "learning_rate": 0.00026487441080888394, + "loss": 2.1302, + "step": 10325 + }, + { + "epoch": 1.2047602380119007, + "grad_norm": 1.0329338312149048, + "learning_rate": 0.0002648646449870538, + "loss": 2.1137, + "step": 10326 + }, + { + "epoch": 1.2048769105121924, + "grad_norm": 1.0938559770584106, + "learning_rate": 0.00026485487798998434, + "loss": 1.9522, + "step": 10327 + }, + { + "epoch": 1.204993583012484, + "grad_norm": 1.0622965097427368, + "learning_rate": 0.00026484510981777693, + "loss": 1.9346, + "step": 10328 + }, + { + "epoch": 1.2051102555127757, + "grad_norm": 1.1769027709960938, + "learning_rate": 0.0002648353404705329, + "loss": 2.1902, + "step": 10329 + }, + { + "epoch": 1.2052269280130674, + "grad_norm": 1.1002458333969116, + "learning_rate": 0.0002648255699483534, + "loss": 1.9804, + "step": 10330 + }, + { + "epoch": 1.205343600513359, + "grad_norm": 1.1539772748947144, + "learning_rate": 0.00026481579825133976, + "loss": 1.9397, + "step": 10331 + }, + { + "epoch": 1.2054602730136508, + "grad_norm": 1.194138526916504, + "learning_rate": 0.0002648060253795932, + "loss": 1.9728, + "step": 10332 + }, + { + "epoch": 1.2055769455139425, + "grad_norm": 1.2165802717208862, + "learning_rate": 0.00026479625133321523, + "loss": 2.0825, + "step": 10333 + }, + { + "epoch": 1.2056936180142341, + "grad_norm": 1.2265808582305908, + "learning_rate": 0.0002647864761123071, + "loss": 1.9916, + "step": 10334 + }, + { + "epoch": 1.2058102905145258, + "grad_norm": 1.262939691543579, + "learning_rate": 0.00026477669971697006, + "loss": 2.2558, + "step": 10335 + }, + { + "epoch": 1.2059269630148175, + "grad_norm": 1.380492091178894, + "learning_rate": 0.00026476692214730556, + "loss": 2.1361, + "step": 10336 + }, + { + "epoch": 1.2060436355151092, + "grad_norm": 1.197738528251648, + "learning_rate": 0.00026475714340341486, + "loss": 2.213, + "step": 10337 + }, + { + "epoch": 1.2061603080154009, + "grad_norm": 1.1891705989837646, + "learning_rate": 0.0002647473634853994, + "loss": 2.0189, + "step": 10338 + }, + { + "epoch": 1.2062769805156925, + "grad_norm": 1.242497444152832, + "learning_rate": 0.00026473758239336067, + "loss": 2.016, + "step": 10339 + }, + { + "epoch": 1.2063936530159842, + "grad_norm": 1.2181851863861084, + "learning_rate": 0.00026472780012739986, + "loss": 1.9276, + "step": 10340 + }, + { + "epoch": 1.206510325516276, + "grad_norm": 1.184330940246582, + "learning_rate": 0.0002647180166876185, + "loss": 1.9565, + "step": 10341 + }, + { + "epoch": 1.2066269980165676, + "grad_norm": 1.1818715333938599, + "learning_rate": 0.0002647082320741181, + "loss": 2.2076, + "step": 10342 + }, + { + "epoch": 1.2067436705168593, + "grad_norm": 1.0843361616134644, + "learning_rate": 0.00026469844628699993, + "loss": 2.1268, + "step": 10343 + }, + { + "epoch": 1.206860343017151, + "grad_norm": 1.308995246887207, + "learning_rate": 0.0002646886593263655, + "loss": 2.144, + "step": 10344 + }, + { + "epoch": 1.2069770155174426, + "grad_norm": 1.1297091245651245, + "learning_rate": 0.0002646788711923162, + "loss": 2.2539, + "step": 10345 + }, + { + "epoch": 1.2070936880177343, + "grad_norm": 1.4351621866226196, + "learning_rate": 0.00026466908188495366, + "loss": 2.1574, + "step": 10346 + }, + { + "epoch": 1.207210360518026, + "grad_norm": 1.0949838161468506, + "learning_rate": 0.00026465929140437935, + "loss": 2.1445, + "step": 10347 + }, + { + "epoch": 1.2073270330183177, + "grad_norm": 1.1224666833877563, + "learning_rate": 0.00026464949975069465, + "loss": 2.0935, + "step": 10348 + }, + { + "epoch": 1.2074437055186094, + "grad_norm": 1.0969489812850952, + "learning_rate": 0.00026463970692400105, + "loss": 1.9832, + "step": 10349 + }, + { + "epoch": 1.207560378018901, + "grad_norm": 1.031989336013794, + "learning_rate": 0.0002646299129244002, + "loss": 1.81, + "step": 10350 + }, + { + "epoch": 1.2076770505191927, + "grad_norm": 1.0834805965423584, + "learning_rate": 0.0002646201177519935, + "loss": 1.9524, + "step": 10351 + }, + { + "epoch": 1.2077937230194844, + "grad_norm": 1.2702282667160034, + "learning_rate": 0.00026461032140688264, + "loss": 2.1848, + "step": 10352 + }, + { + "epoch": 1.207910395519776, + "grad_norm": 1.1307473182678223, + "learning_rate": 0.0002646005238891691, + "loss": 1.9886, + "step": 10353 + }, + { + "epoch": 1.2080270680200678, + "grad_norm": 1.2172653675079346, + "learning_rate": 0.0002645907251989545, + "loss": 2.3418, + "step": 10354 + }, + { + "epoch": 1.2081437405203594, + "grad_norm": 1.1616847515106201, + "learning_rate": 0.00026458092533634026, + "loss": 1.9909, + "step": 10355 + }, + { + "epoch": 1.2082604130206511, + "grad_norm": 1.165776014328003, + "learning_rate": 0.0002645711243014282, + "loss": 1.9771, + "step": 10356 + }, + { + "epoch": 1.2083770855209428, + "grad_norm": 1.0632082223892212, + "learning_rate": 0.00026456132209431977, + "loss": 1.9722, + "step": 10357 + }, + { + "epoch": 1.2084937580212345, + "grad_norm": 1.106296420097351, + "learning_rate": 0.00026455151871511667, + "loss": 2.0124, + "step": 10358 + }, + { + "epoch": 1.2086104305215262, + "grad_norm": 1.4178879261016846, + "learning_rate": 0.00026454171416392047, + "loss": 2.0336, + "step": 10359 + }, + { + "epoch": 1.2087271030218179, + "grad_norm": 1.0539798736572266, + "learning_rate": 0.00026453190844083285, + "loss": 2.0516, + "step": 10360 + }, + { + "epoch": 1.2088437755221095, + "grad_norm": 1.1034719944000244, + "learning_rate": 0.00026452210154595545, + "loss": 1.9615, + "step": 10361 + }, + { + "epoch": 1.2089604480224012, + "grad_norm": 1.049289584159851, + "learning_rate": 0.00026451229347938994, + "loss": 2.1084, + "step": 10362 + }, + { + "epoch": 1.209077120522693, + "grad_norm": 1.1358225345611572, + "learning_rate": 0.000264502484241238, + "loss": 1.9554, + "step": 10363 + }, + { + "epoch": 1.2091937930229846, + "grad_norm": 1.3386834859848022, + "learning_rate": 0.0002644926738316013, + "loss": 2.1807, + "step": 10364 + }, + { + "epoch": 1.2093104655232763, + "grad_norm": 1.2672255039215088, + "learning_rate": 0.0002644828622505816, + "loss": 1.9925, + "step": 10365 + }, + { + "epoch": 1.209427138023568, + "grad_norm": 1.3638774156570435, + "learning_rate": 0.0002644730494982807, + "loss": 2.1605, + "step": 10366 + }, + { + "epoch": 1.2095438105238596, + "grad_norm": 1.369408369064331, + "learning_rate": 0.0002644632355748, + "loss": 2.1631, + "step": 10367 + }, + { + "epoch": 1.2096604830241513, + "grad_norm": 1.3507565259933472, + "learning_rate": 0.00026445342048024155, + "loss": 1.8682, + "step": 10368 + }, + { + "epoch": 1.209777155524443, + "grad_norm": 1.363718032836914, + "learning_rate": 0.00026444360421470697, + "loss": 2.1271, + "step": 10369 + }, + { + "epoch": 1.2098938280247347, + "grad_norm": 1.1618024110794067, + "learning_rate": 0.0002644337867782981, + "loss": 2.0817, + "step": 10370 + }, + { + "epoch": 1.2100105005250263, + "grad_norm": 1.1250460147857666, + "learning_rate": 0.00026442396817111664, + "loss": 2.2208, + "step": 10371 + }, + { + "epoch": 1.210127173025318, + "grad_norm": 1.2454115152359009, + "learning_rate": 0.00026441414839326445, + "loss": 2.1359, + "step": 10372 + }, + { + "epoch": 1.2102438455256097, + "grad_norm": 1.3091832399368286, + "learning_rate": 0.00026440432744484325, + "loss": 2.0731, + "step": 10373 + }, + { + "epoch": 1.2103605180259014, + "grad_norm": 1.1500251293182373, + "learning_rate": 0.0002643945053259549, + "loss": 2.2094, + "step": 10374 + }, + { + "epoch": 1.210477190526193, + "grad_norm": 1.3606822490692139, + "learning_rate": 0.00026438468203670125, + "loss": 2.19, + "step": 10375 + }, + { + "epoch": 1.2105938630264848, + "grad_norm": 0.9505560398101807, + "learning_rate": 0.000264374857577184, + "loss": 2.0159, + "step": 10376 + }, + { + "epoch": 1.2107105355267764, + "grad_norm": 1.2470324039459229, + "learning_rate": 0.00026436503194750526, + "loss": 2.006, + "step": 10377 + }, + { + "epoch": 1.2108272080270681, + "grad_norm": 1.095773458480835, + "learning_rate": 0.00026435520514776664, + "loss": 1.7627, + "step": 10378 + }, + { + "epoch": 1.2109438805273598, + "grad_norm": 1.1359976530075073, + "learning_rate": 0.0002643453771780701, + "loss": 2.1474, + "step": 10379 + }, + { + "epoch": 1.2110605530276515, + "grad_norm": 1.2259618043899536, + "learning_rate": 0.0002643355480385176, + "loss": 2.1562, + "step": 10380 + }, + { + "epoch": 1.2111772255279432, + "grad_norm": 1.2985838651657104, + "learning_rate": 0.00026432571772921096, + "loss": 2.0854, + "step": 10381 + }, + { + "epoch": 1.2112938980282348, + "grad_norm": 1.1092489957809448, + "learning_rate": 0.0002643158862502521, + "loss": 1.9431, + "step": 10382 + }, + { + "epoch": 1.2114105705285265, + "grad_norm": 1.2290374040603638, + "learning_rate": 0.0002643060536017429, + "loss": 2.2183, + "step": 10383 + }, + { + "epoch": 1.2115272430288182, + "grad_norm": 1.1273692846298218, + "learning_rate": 0.0002642962197837853, + "loss": 2.2067, + "step": 10384 + }, + { + "epoch": 1.2116439155291099, + "grad_norm": 1.171958565711975, + "learning_rate": 0.00026428638479648143, + "loss": 2.113, + "step": 10385 + }, + { + "epoch": 1.2117605880294016, + "grad_norm": 1.365695834159851, + "learning_rate": 0.000264276548639933, + "loss": 2.1985, + "step": 10386 + }, + { + "epoch": 1.2118772605296932, + "grad_norm": 1.3194196224212646, + "learning_rate": 0.00026426671131424213, + "loss": 2.2422, + "step": 10387 + }, + { + "epoch": 1.211993933029985, + "grad_norm": 1.2536588907241821, + "learning_rate": 0.0002642568728195107, + "loss": 2.1012, + "step": 10388 + }, + { + "epoch": 1.2121106055302766, + "grad_norm": 1.2620503902435303, + "learning_rate": 0.0002642470331558408, + "loss": 1.993, + "step": 10389 + }, + { + "epoch": 1.2122272780305683, + "grad_norm": 1.4674150943756104, + "learning_rate": 0.0002642371923233344, + "loss": 2.1015, + "step": 10390 + }, + { + "epoch": 1.21234395053086, + "grad_norm": 1.1542381048202515, + "learning_rate": 0.0002642273503220935, + "loss": 2.1287, + "step": 10391 + }, + { + "epoch": 1.2124606230311517, + "grad_norm": 1.1942663192749023, + "learning_rate": 0.00026421750715222014, + "loss": 2.1378, + "step": 10392 + }, + { + "epoch": 1.2125772955314433, + "grad_norm": 1.3204641342163086, + "learning_rate": 0.0002642076628138164, + "loss": 2.0834, + "step": 10393 + }, + { + "epoch": 1.212693968031735, + "grad_norm": 1.2273921966552734, + "learning_rate": 0.0002641978173069843, + "loss": 2.0583, + "step": 10394 + }, + { + "epoch": 1.2128106405320267, + "grad_norm": 1.1031808853149414, + "learning_rate": 0.00026418797063182593, + "loss": 2.0129, + "step": 10395 + }, + { + "epoch": 1.2129273130323184, + "grad_norm": 1.32085382938385, + "learning_rate": 0.0002641781227884433, + "loss": 2.1082, + "step": 10396 + }, + { + "epoch": 1.21304398553261, + "grad_norm": 1.1205253601074219, + "learning_rate": 0.0002641682737769386, + "loss": 2.0435, + "step": 10397 + }, + { + "epoch": 1.2131606580329017, + "grad_norm": 1.0947428941726685, + "learning_rate": 0.00026415842359741383, + "loss": 2.2188, + "step": 10398 + }, + { + "epoch": 1.2132773305331934, + "grad_norm": 1.266093134880066, + "learning_rate": 0.00026414857224997117, + "loss": 2.0233, + "step": 10399 + }, + { + "epoch": 1.213394003033485, + "grad_norm": 1.1702885627746582, + "learning_rate": 0.0002641387197347128, + "loss": 1.9271, + "step": 10400 + }, + { + "epoch": 1.2135106755337768, + "grad_norm": 1.461289405822754, + "learning_rate": 0.00026412886605174077, + "loss": 2.0131, + "step": 10401 + }, + { + "epoch": 1.2136273480340685, + "grad_norm": 1.4024549722671509, + "learning_rate": 0.0002641190112011573, + "loss": 2.1815, + "step": 10402 + }, + { + "epoch": 1.2137440205343601, + "grad_norm": 1.3243056535720825, + "learning_rate": 0.0002641091551830645, + "loss": 2.1674, + "step": 10403 + }, + { + "epoch": 1.2138606930346518, + "grad_norm": 1.0252115726470947, + "learning_rate": 0.0002640992979975645, + "loss": 1.8457, + "step": 10404 + }, + { + "epoch": 1.2139773655349435, + "grad_norm": 1.098596453666687, + "learning_rate": 0.0002640894396447596, + "loss": 2.0577, + "step": 10405 + }, + { + "epoch": 1.2140940380352352, + "grad_norm": 1.127016544342041, + "learning_rate": 0.00026407958012475196, + "loss": 2.0471, + "step": 10406 + }, + { + "epoch": 1.2142107105355269, + "grad_norm": 1.3077715635299683, + "learning_rate": 0.00026406971943764376, + "loss": 2.1727, + "step": 10407 + }, + { + "epoch": 1.2143273830358186, + "grad_norm": 1.1626931428909302, + "learning_rate": 0.0002640598575835373, + "loss": 2.1395, + "step": 10408 + }, + { + "epoch": 1.2144440555361102, + "grad_norm": 1.3727068901062012, + "learning_rate": 0.00026404999456253474, + "loss": 2.2039, + "step": 10409 + }, + { + "epoch": 1.214560728036402, + "grad_norm": 1.042460560798645, + "learning_rate": 0.00026404013037473835, + "loss": 1.9917, + "step": 10410 + }, + { + "epoch": 1.2146774005366936, + "grad_norm": 1.0500848293304443, + "learning_rate": 0.0002640302650202504, + "loss": 2.0453, + "step": 10411 + }, + { + "epoch": 1.2147940730369853, + "grad_norm": 1.2775390148162842, + "learning_rate": 0.0002640203984991731, + "loss": 1.9992, + "step": 10412 + }, + { + "epoch": 1.214910745537277, + "grad_norm": 1.1996822357177734, + "learning_rate": 0.0002640105308116088, + "loss": 2.1904, + "step": 10413 + }, + { + "epoch": 1.2150274180375686, + "grad_norm": 1.1487315893173218, + "learning_rate": 0.0002640006619576599, + "loss": 2.1475, + "step": 10414 + }, + { + "epoch": 1.2151440905378603, + "grad_norm": 1.3779819011688232, + "learning_rate": 0.0002639907919374286, + "loss": 2.1038, + "step": 10415 + }, + { + "epoch": 1.215260763038152, + "grad_norm": 1.2466926574707031, + "learning_rate": 0.00026398092075101713, + "loss": 2.0179, + "step": 10416 + }, + { + "epoch": 1.2153774355384437, + "grad_norm": 1.3120689392089844, + "learning_rate": 0.00026397104839852797, + "loss": 2.1516, + "step": 10417 + }, + { + "epoch": 1.2154941080387354, + "grad_norm": 1.2595428228378296, + "learning_rate": 0.0002639611748800634, + "loss": 2.0023, + "step": 10418 + }, + { + "epoch": 1.215610780539027, + "grad_norm": 1.2628241777420044, + "learning_rate": 0.00026395130019572576, + "loss": 2.138, + "step": 10419 + }, + { + "epoch": 1.2157274530393187, + "grad_norm": 1.230739951133728, + "learning_rate": 0.00026394142434561747, + "loss": 2.0416, + "step": 10420 + }, + { + "epoch": 1.2158441255396104, + "grad_norm": 1.1122004985809326, + "learning_rate": 0.0002639315473298409, + "loss": 1.8936, + "step": 10421 + }, + { + "epoch": 1.215960798039902, + "grad_norm": 1.1963801383972168, + "learning_rate": 0.0002639216691484984, + "loss": 2.1098, + "step": 10422 + }, + { + "epoch": 1.2160774705401938, + "grad_norm": 1.081950306892395, + "learning_rate": 0.00026391178980169245, + "loss": 2.0938, + "step": 10423 + }, + { + "epoch": 1.2161941430404855, + "grad_norm": 1.2509105205535889, + "learning_rate": 0.00026390190928952544, + "loss": 2.1407, + "step": 10424 + }, + { + "epoch": 1.2163108155407771, + "grad_norm": 1.1346098184585571, + "learning_rate": 0.00026389202761209973, + "loss": 1.906, + "step": 10425 + }, + { + "epoch": 1.2164274880410688, + "grad_norm": 1.3681575059890747, + "learning_rate": 0.0002638821447695179, + "loss": 2.2683, + "step": 10426 + }, + { + "epoch": 1.2165441605413605, + "grad_norm": 1.4534775018692017, + "learning_rate": 0.00026387226076188226, + "loss": 2.1497, + "step": 10427 + }, + { + "epoch": 1.2166608330416522, + "grad_norm": 1.2040477991104126, + "learning_rate": 0.00026386237558929536, + "loss": 2.2202, + "step": 10428 + }, + { + "epoch": 1.2167775055419439, + "grad_norm": 1.1129381656646729, + "learning_rate": 0.0002638524892518597, + "loss": 2.0207, + "step": 10429 + }, + { + "epoch": 1.2168941780422355, + "grad_norm": 1.279581069946289, + "learning_rate": 0.00026384260174967773, + "loss": 1.9894, + "step": 10430 + }, + { + "epoch": 1.2170108505425272, + "grad_norm": 1.1093356609344482, + "learning_rate": 0.0002638327130828519, + "loss": 2.0889, + "step": 10431 + }, + { + "epoch": 1.217127523042819, + "grad_norm": 1.4679778814315796, + "learning_rate": 0.0002638228232514848, + "loss": 2.1473, + "step": 10432 + }, + { + "epoch": 1.2172441955431106, + "grad_norm": 1.215448260307312, + "learning_rate": 0.00026381293225567895, + "loss": 2.1832, + "step": 10433 + }, + { + "epoch": 1.2173608680434023, + "grad_norm": 1.2135413885116577, + "learning_rate": 0.00026380304009553687, + "loss": 2.1141, + "step": 10434 + }, + { + "epoch": 1.217477540543694, + "grad_norm": 1.2629250288009644, + "learning_rate": 0.00026379314677116116, + "loss": 2.0282, + "step": 10435 + }, + { + "epoch": 1.2175942130439856, + "grad_norm": 1.0657559633255005, + "learning_rate": 0.00026378325228265426, + "loss": 2.0279, + "step": 10436 + }, + { + "epoch": 1.2177108855442773, + "grad_norm": 1.5072623491287231, + "learning_rate": 0.00026377335663011885, + "loss": 2.0529, + "step": 10437 + }, + { + "epoch": 1.217827558044569, + "grad_norm": 1.0945509672164917, + "learning_rate": 0.0002637634598136575, + "loss": 2.0743, + "step": 10438 + }, + { + "epoch": 1.2179442305448607, + "grad_norm": 1.0988560914993286, + "learning_rate": 0.0002637535618333728, + "loss": 2.0727, + "step": 10439 + }, + { + "epoch": 1.2180609030451524, + "grad_norm": 1.1689305305480957, + "learning_rate": 0.0002637436626893674, + "loss": 2.2627, + "step": 10440 + }, + { + "epoch": 1.218177575545444, + "grad_norm": 1.24563729763031, + "learning_rate": 0.00026373376238174377, + "loss": 2.1994, + "step": 10441 + }, + { + "epoch": 1.2182942480457357, + "grad_norm": 1.0785791873931885, + "learning_rate": 0.0002637238609106048, + "loss": 2.178, + "step": 10442 + }, + { + "epoch": 1.2184109205460274, + "grad_norm": 1.1459097862243652, + "learning_rate": 0.00026371395827605297, + "loss": 2.031, + "step": 10443 + }, + { + "epoch": 1.218527593046319, + "grad_norm": 1.1970343589782715, + "learning_rate": 0.0002637040544781909, + "loss": 2.2156, + "step": 10444 + }, + { + "epoch": 1.2186442655466108, + "grad_norm": 1.1201621294021606, + "learning_rate": 0.00026369414951712137, + "loss": 2.1198, + "step": 10445 + }, + { + "epoch": 1.2187609380469024, + "grad_norm": 1.157225489616394, + "learning_rate": 0.000263684243392947, + "loss": 2.0441, + "step": 10446 + }, + { + "epoch": 1.2188776105471941, + "grad_norm": 1.1431633234024048, + "learning_rate": 0.00026367433610577053, + "loss": 1.992, + "step": 10447 + }, + { + "epoch": 1.2189942830474858, + "grad_norm": 1.0860862731933594, + "learning_rate": 0.00026366442765569465, + "loss": 2.0654, + "step": 10448 + }, + { + "epoch": 1.2191109555477775, + "grad_norm": 1.1968300342559814, + "learning_rate": 0.0002636545180428221, + "loss": 2.0827, + "step": 10449 + }, + { + "epoch": 1.2192276280480692, + "grad_norm": 1.1737858057022095, + "learning_rate": 0.0002636446072672556, + "loss": 2.1642, + "step": 10450 + }, + { + "epoch": 1.2193443005483608, + "grad_norm": 1.1613298654556274, + "learning_rate": 0.0002636346953290978, + "loss": 2.1922, + "step": 10451 + }, + { + "epoch": 1.2194609730486525, + "grad_norm": 1.0551679134368896, + "learning_rate": 0.00026362478222845156, + "loss": 2.1057, + "step": 10452 + }, + { + "epoch": 1.2195776455489442, + "grad_norm": 1.1559211015701294, + "learning_rate": 0.00026361486796541966, + "loss": 2.0711, + "step": 10453 + }, + { + "epoch": 1.219694318049236, + "grad_norm": 1.3359946012496948, + "learning_rate": 0.00026360495254010486, + "loss": 2.1995, + "step": 10454 + }, + { + "epoch": 1.2198109905495276, + "grad_norm": 1.1163606643676758, + "learning_rate": 0.0002635950359526099, + "loss": 2.0988, + "step": 10455 + }, + { + "epoch": 1.2199276630498193, + "grad_norm": 1.0638248920440674, + "learning_rate": 0.0002635851182030376, + "loss": 2.1402, + "step": 10456 + }, + { + "epoch": 1.220044335550111, + "grad_norm": 1.2992812395095825, + "learning_rate": 0.00026357519929149086, + "loss": 2.1856, + "step": 10457 + }, + { + "epoch": 1.2201610080504026, + "grad_norm": 1.1567291021347046, + "learning_rate": 0.00026356527921807244, + "loss": 2.1032, + "step": 10458 + }, + { + "epoch": 1.2202776805506943, + "grad_norm": 1.1866307258605957, + "learning_rate": 0.00026355535798288515, + "loss": 2.0657, + "step": 10459 + }, + { + "epoch": 1.220394353050986, + "grad_norm": 1.1940919160842896, + "learning_rate": 0.0002635454355860319, + "loss": 2.1117, + "step": 10460 + }, + { + "epoch": 1.2205110255512777, + "grad_norm": 1.1011582612991333, + "learning_rate": 0.0002635355120276156, + "loss": 2.0145, + "step": 10461 + }, + { + "epoch": 1.2206276980515693, + "grad_norm": 1.307000994682312, + "learning_rate": 0.0002635255873077389, + "loss": 2.0256, + "step": 10462 + }, + { + "epoch": 1.220744370551861, + "grad_norm": 1.1033543348312378, + "learning_rate": 0.000263515661426505, + "loss": 2.0234, + "step": 10463 + }, + { + "epoch": 1.2208610430521527, + "grad_norm": 1.2668023109436035, + "learning_rate": 0.0002635057343840166, + "loss": 2.0994, + "step": 10464 + }, + { + "epoch": 1.2209777155524444, + "grad_norm": 0.9110881090164185, + "learning_rate": 0.00026349580618037667, + "loss": 2.1068, + "step": 10465 + }, + { + "epoch": 1.221094388052736, + "grad_norm": 1.183340311050415, + "learning_rate": 0.0002634858768156881, + "loss": 2.0798, + "step": 10466 + }, + { + "epoch": 1.2212110605530277, + "grad_norm": 1.461498498916626, + "learning_rate": 0.0002634759462900539, + "loss": 2.2654, + "step": 10467 + }, + { + "epoch": 1.2213277330533194, + "grad_norm": 1.252587080001831, + "learning_rate": 0.00026346601460357693, + "loss": 2.0928, + "step": 10468 + }, + { + "epoch": 1.2214444055536111, + "grad_norm": 1.1919533014297485, + "learning_rate": 0.0002634560817563602, + "loss": 2.1052, + "step": 10469 + }, + { + "epoch": 1.2215610780539028, + "grad_norm": 1.0416122674942017, + "learning_rate": 0.0002634461477485067, + "loss": 2.0322, + "step": 10470 + }, + { + "epoch": 1.2216777505541945, + "grad_norm": 1.162929654121399, + "learning_rate": 0.00026343621258011934, + "loss": 2.2659, + "step": 10471 + }, + { + "epoch": 1.2217944230544862, + "grad_norm": 1.2251256704330444, + "learning_rate": 0.0002634262762513013, + "loss": 1.9052, + "step": 10472 + }, + { + "epoch": 1.2219110955547778, + "grad_norm": 1.2028286457061768, + "learning_rate": 0.00026341633876215525, + "loss": 2.1276, + "step": 10473 + }, + { + "epoch": 1.2220277680550695, + "grad_norm": 1.22076416015625, + "learning_rate": 0.00026340640011278453, + "loss": 2.1736, + "step": 10474 + }, + { + "epoch": 1.2221444405553612, + "grad_norm": 1.0159921646118164, + "learning_rate": 0.0002633964603032921, + "loss": 2.0664, + "step": 10475 + }, + { + "epoch": 1.2222611130556529, + "grad_norm": 1.1759395599365234, + "learning_rate": 0.00026338651933378095, + "loss": 2.129, + "step": 10476 + }, + { + "epoch": 1.2223777855559446, + "grad_norm": 1.4091153144836426, + "learning_rate": 0.0002633765772043541, + "loss": 2.3485, + "step": 10477 + }, + { + "epoch": 1.2224944580562362, + "grad_norm": 1.150364637374878, + "learning_rate": 0.0002633666339151147, + "loss": 1.9199, + "step": 10478 + }, + { + "epoch": 1.222611130556528, + "grad_norm": 1.2663718461990356, + "learning_rate": 0.0002633566894661658, + "loss": 2.0652, + "step": 10479 + }, + { + "epoch": 1.2227278030568196, + "grad_norm": 1.2199519872665405, + "learning_rate": 0.0002633467438576105, + "loss": 2.1248, + "step": 10480 + }, + { + "epoch": 1.2228444755571113, + "grad_norm": 1.0502350330352783, + "learning_rate": 0.00026333679708955194, + "loss": 1.998, + "step": 10481 + }, + { + "epoch": 1.222961148057403, + "grad_norm": 1.1241034269332886, + "learning_rate": 0.00026332684916209316, + "loss": 2.0175, + "step": 10482 + }, + { + "epoch": 1.2230778205576947, + "grad_norm": 0.9527755379676819, + "learning_rate": 0.00026331690007533727, + "loss": 1.9034, + "step": 10483 + }, + { + "epoch": 1.2231944930579863, + "grad_norm": 1.2387219667434692, + "learning_rate": 0.00026330694982938755, + "loss": 2.0823, + "step": 10484 + }, + { + "epoch": 1.223311165558278, + "grad_norm": 1.1862844228744507, + "learning_rate": 0.00026329699842434704, + "loss": 1.9431, + "step": 10485 + }, + { + "epoch": 1.2234278380585697, + "grad_norm": 1.1086938381195068, + "learning_rate": 0.0002632870458603189, + "loss": 1.9879, + "step": 10486 + }, + { + "epoch": 1.2235445105588614, + "grad_norm": 1.1425223350524902, + "learning_rate": 0.0002632770921374064, + "loss": 1.8308, + "step": 10487 + }, + { + "epoch": 1.223661183059153, + "grad_norm": 1.0395324230194092, + "learning_rate": 0.0002632671372557127, + "loss": 1.8403, + "step": 10488 + }, + { + "epoch": 1.2237778555594447, + "grad_norm": 0.995579719543457, + "learning_rate": 0.0002632571812153409, + "loss": 1.9795, + "step": 10489 + }, + { + "epoch": 1.2238945280597364, + "grad_norm": 1.3898036479949951, + "learning_rate": 0.0002632472240163943, + "loss": 2.2073, + "step": 10490 + }, + { + "epoch": 1.224011200560028, + "grad_norm": 1.2860181331634521, + "learning_rate": 0.00026323726565897614, + "loss": 2.0263, + "step": 10491 + }, + { + "epoch": 1.2241278730603198, + "grad_norm": 1.1229403018951416, + "learning_rate": 0.0002632273061431896, + "loss": 2.1285, + "step": 10492 + }, + { + "epoch": 1.2242445455606115, + "grad_norm": 1.3544865846633911, + "learning_rate": 0.000263217345469138, + "loss": 2.231, + "step": 10493 + }, + { + "epoch": 1.2243612180609031, + "grad_norm": 1.144981026649475, + "learning_rate": 0.00026320738363692453, + "loss": 2.0741, + "step": 10494 + }, + { + "epoch": 1.2244778905611948, + "grad_norm": 1.2413636445999146, + "learning_rate": 0.0002631974206466525, + "loss": 2.1696, + "step": 10495 + }, + { + "epoch": 1.2245945630614865, + "grad_norm": 1.2866908311843872, + "learning_rate": 0.0002631874564984252, + "loss": 2.1686, + "step": 10496 + }, + { + "epoch": 1.2247112355617782, + "grad_norm": 1.296875, + "learning_rate": 0.0002631774911923459, + "loss": 1.8713, + "step": 10497 + }, + { + "epoch": 1.2248279080620699, + "grad_norm": 1.1287798881530762, + "learning_rate": 0.00026316752472851793, + "loss": 1.9134, + "step": 10498 + }, + { + "epoch": 1.2249445805623616, + "grad_norm": 1.0081278085708618, + "learning_rate": 0.0002631575571070446, + "loss": 1.9863, + "step": 10499 + }, + { + "epoch": 1.2250612530626532, + "grad_norm": 1.3777326345443726, + "learning_rate": 0.00026314758832802923, + "loss": 2.0995, + "step": 10500 + }, + { + "epoch": 1.225177925562945, + "grad_norm": 1.0406947135925293, + "learning_rate": 0.00026313761839157524, + "loss": 2.165, + "step": 10501 + }, + { + "epoch": 1.2252945980632366, + "grad_norm": 1.233040690422058, + "learning_rate": 0.0002631276472977859, + "loss": 2.0103, + "step": 10502 + }, + { + "epoch": 1.2254112705635283, + "grad_norm": 1.151428461074829, + "learning_rate": 0.00026311767504676463, + "loss": 2.0463, + "step": 10503 + }, + { + "epoch": 1.22552794306382, + "grad_norm": 1.1247669458389282, + "learning_rate": 0.0002631077016386148, + "loss": 2.1633, + "step": 10504 + }, + { + "epoch": 1.2256446155641116, + "grad_norm": 1.176518440246582, + "learning_rate": 0.00026309772707343976, + "loss": 2.0595, + "step": 10505 + }, + { + "epoch": 1.2257612880644033, + "grad_norm": 1.248246669769287, + "learning_rate": 0.00026308775135134297, + "loss": 2.1751, + "step": 10506 + }, + { + "epoch": 1.225877960564695, + "grad_norm": 1.2380748987197876, + "learning_rate": 0.00026307777447242783, + "loss": 2.0857, + "step": 10507 + }, + { + "epoch": 1.2259946330649867, + "grad_norm": 1.2063206434249878, + "learning_rate": 0.0002630677964367977, + "loss": 2.0303, + "step": 10508 + }, + { + "epoch": 1.2261113055652784, + "grad_norm": 1.1237516403198242, + "learning_rate": 0.0002630578172445562, + "loss": 2.0455, + "step": 10509 + }, + { + "epoch": 1.22622797806557, + "grad_norm": 1.2875620126724243, + "learning_rate": 0.0002630478368958066, + "loss": 2.1186, + "step": 10510 + }, + { + "epoch": 1.2263446505658617, + "grad_norm": 1.16062593460083, + "learning_rate": 0.0002630378553906525, + "loss": 2.0315, + "step": 10511 + }, + { + "epoch": 1.2264613230661534, + "grad_norm": 1.072212815284729, + "learning_rate": 0.0002630278727291972, + "loss": 2.1846, + "step": 10512 + }, + { + "epoch": 1.226577995566445, + "grad_norm": 1.1609759330749512, + "learning_rate": 0.00026301788891154446, + "loss": 2.3334, + "step": 10513 + }, + { + "epoch": 1.2266946680667368, + "grad_norm": 0.9497514367103577, + "learning_rate": 0.0002630079039377975, + "loss": 1.9911, + "step": 10514 + }, + { + "epoch": 1.2268113405670285, + "grad_norm": 1.1626057624816895, + "learning_rate": 0.00026299791780805997, + "loss": 1.9569, + "step": 10515 + }, + { + "epoch": 1.2269280130673201, + "grad_norm": 1.2160121202468872, + "learning_rate": 0.0002629879305224354, + "loss": 2.0499, + "step": 10516 + }, + { + "epoch": 1.2270446855676118, + "grad_norm": 1.2394129037857056, + "learning_rate": 0.00026297794208102735, + "loss": 2.0213, + "step": 10517 + }, + { + "epoch": 1.2271613580679035, + "grad_norm": 1.1630969047546387, + "learning_rate": 0.00026296795248393934, + "loss": 2.1564, + "step": 10518 + }, + { + "epoch": 1.2272780305681952, + "grad_norm": 1.123170256614685, + "learning_rate": 0.0002629579617312749, + "loss": 2.2514, + "step": 10519 + }, + { + "epoch": 1.2273947030684869, + "grad_norm": 1.1496080160140991, + "learning_rate": 0.00026294796982313757, + "loss": 2.1707, + "step": 10520 + }, + { + "epoch": 1.2275113755687785, + "grad_norm": 1.2208493947982788, + "learning_rate": 0.000262937976759631, + "loss": 2.0977, + "step": 10521 + }, + { + "epoch": 1.2276280480690702, + "grad_norm": 1.1271041631698608, + "learning_rate": 0.0002629279825408588, + "loss": 2.1521, + "step": 10522 + }, + { + "epoch": 1.227744720569362, + "grad_norm": 1.2483998537063599, + "learning_rate": 0.0002629179871669246, + "loss": 2.1137, + "step": 10523 + }, + { + "epoch": 1.2278613930696536, + "grad_norm": 1.3552874326705933, + "learning_rate": 0.0002629079906379319, + "loss": 2.1007, + "step": 10524 + }, + { + "epoch": 1.2279780655699453, + "grad_norm": 1.3976253271102905, + "learning_rate": 0.0002628979929539845, + "loss": 2.0579, + "step": 10525 + }, + { + "epoch": 1.228094738070237, + "grad_norm": 0.9871069192886353, + "learning_rate": 0.00026288799411518585, + "loss": 2.1222, + "step": 10526 + }, + { + "epoch": 1.2282114105705286, + "grad_norm": 1.3521913290023804, + "learning_rate": 0.0002628779941216398, + "loss": 2.0459, + "step": 10527 + }, + { + "epoch": 1.2283280830708203, + "grad_norm": 1.1124107837677002, + "learning_rate": 0.00026286799297344985, + "loss": 2.0705, + "step": 10528 + }, + { + "epoch": 1.228444755571112, + "grad_norm": 1.1105703115463257, + "learning_rate": 0.0002628579906707198, + "loss": 1.8742, + "step": 10529 + }, + { + "epoch": 1.2285614280714037, + "grad_norm": 1.107224464416504, + "learning_rate": 0.00026284798721355334, + "loss": 1.9404, + "step": 10530 + }, + { + "epoch": 1.2286781005716954, + "grad_norm": 1.1521553993225098, + "learning_rate": 0.00026283798260205406, + "loss": 2.1084, + "step": 10531 + }, + { + "epoch": 1.228794773071987, + "grad_norm": 1.3738481998443604, + "learning_rate": 0.00026282797683632584, + "loss": 2.1818, + "step": 10532 + }, + { + "epoch": 1.2289114455722787, + "grad_norm": 1.066184639930725, + "learning_rate": 0.0002628179699164722, + "loss": 1.9973, + "step": 10533 + }, + { + "epoch": 1.2290281180725704, + "grad_norm": 1.3154778480529785, + "learning_rate": 0.00026280796184259715, + "loss": 2.2944, + "step": 10534 + }, + { + "epoch": 1.229144790572862, + "grad_norm": 1.0987993478775024, + "learning_rate": 0.0002627979526148042, + "loss": 1.8389, + "step": 10535 + }, + { + "epoch": 1.2292614630731538, + "grad_norm": 1.1746357679367065, + "learning_rate": 0.00026278794223319723, + "loss": 2.0389, + "step": 10536 + }, + { + "epoch": 1.2293781355734454, + "grad_norm": 1.2051560878753662, + "learning_rate": 0.00026277793069788005, + "loss": 2.0518, + "step": 10537 + }, + { + "epoch": 1.2294948080737371, + "grad_norm": 1.201619267463684, + "learning_rate": 0.00026276791800895635, + "loss": 1.9738, + "step": 10538 + }, + { + "epoch": 1.2296114805740288, + "grad_norm": 1.3841216564178467, + "learning_rate": 0.00026275790416652997, + "loss": 2.1469, + "step": 10539 + }, + { + "epoch": 1.2297281530743205, + "grad_norm": 1.2533698081970215, + "learning_rate": 0.00026274788917070475, + "loss": 2.1084, + "step": 10540 + }, + { + "epoch": 1.2298448255746122, + "grad_norm": 1.166087031364441, + "learning_rate": 0.00026273787302158447, + "loss": 2.0204, + "step": 10541 + }, + { + "epoch": 1.2299614980749038, + "grad_norm": 1.2832468748092651, + "learning_rate": 0.000262727855719273, + "loss": 1.9822, + "step": 10542 + }, + { + "epoch": 1.2300781705751955, + "grad_norm": 1.2187047004699707, + "learning_rate": 0.00026271783726387416, + "loss": 2.1186, + "step": 10543 + }, + { + "epoch": 1.2301948430754872, + "grad_norm": 1.2229478359222412, + "learning_rate": 0.00026270781765549186, + "loss": 2.1121, + "step": 10544 + }, + { + "epoch": 1.230311515575779, + "grad_norm": 1.1879029273986816, + "learning_rate": 0.00026269779689422996, + "loss": 2.0311, + "step": 10545 + }, + { + "epoch": 1.2304281880760706, + "grad_norm": 1.2651631832122803, + "learning_rate": 0.00026268777498019227, + "loss": 1.9732, + "step": 10546 + }, + { + "epoch": 1.2305448605763623, + "grad_norm": 1.491563320159912, + "learning_rate": 0.00026267775191348276, + "loss": 2.1991, + "step": 10547 + }, + { + "epoch": 1.230661533076654, + "grad_norm": 1.1982210874557495, + "learning_rate": 0.0002626677276942053, + "loss": 2.1051, + "step": 10548 + }, + { + "epoch": 1.2307782055769456, + "grad_norm": 1.2178577184677124, + "learning_rate": 0.0002626577023224639, + "loss": 2.0691, + "step": 10549 + }, + { + "epoch": 1.2308948780772373, + "grad_norm": 1.13804292678833, + "learning_rate": 0.00026264767579836233, + "loss": 2.1205, + "step": 10550 + }, + { + "epoch": 1.231011550577529, + "grad_norm": 1.0502396821975708, + "learning_rate": 0.00026263764812200466, + "loss": 2.068, + "step": 10551 + }, + { + "epoch": 1.2311282230778207, + "grad_norm": 1.1574078798294067, + "learning_rate": 0.0002626276192934948, + "loss": 1.9685, + "step": 10552 + }, + { + "epoch": 1.2312448955781123, + "grad_norm": 1.138871192932129, + "learning_rate": 0.0002626175893129368, + "loss": 2.1957, + "step": 10553 + }, + { + "epoch": 1.231361568078404, + "grad_norm": 1.1615533828735352, + "learning_rate": 0.00026260755818043453, + "loss": 2.0677, + "step": 10554 + }, + { + "epoch": 1.2314782405786957, + "grad_norm": 1.4196075201034546, + "learning_rate": 0.00026259752589609197, + "loss": 2.2768, + "step": 10555 + }, + { + "epoch": 1.2315949130789874, + "grad_norm": 1.1743242740631104, + "learning_rate": 0.0002625874924600132, + "loss": 2.2271, + "step": 10556 + }, + { + "epoch": 1.231711585579279, + "grad_norm": 1.304281234741211, + "learning_rate": 0.0002625774578723022, + "loss": 2.1617, + "step": 10557 + }, + { + "epoch": 1.2318282580795707, + "grad_norm": 1.095207691192627, + "learning_rate": 0.0002625674221330631, + "loss": 2.1134, + "step": 10558 + }, + { + "epoch": 1.2319449305798624, + "grad_norm": 1.2328702211380005, + "learning_rate": 0.00026255738524239974, + "loss": 1.9787, + "step": 10559 + }, + { + "epoch": 1.232061603080154, + "grad_norm": 1.0949445962905884, + "learning_rate": 0.00026254734720041633, + "loss": 1.9307, + "step": 10560 + }, + { + "epoch": 1.2321782755804458, + "grad_norm": 1.1803498268127441, + "learning_rate": 0.0002625373080072168, + "loss": 2.0686, + "step": 10561 + }, + { + "epoch": 1.2322949480807375, + "grad_norm": 1.1576956510543823, + "learning_rate": 0.0002625272676629054, + "loss": 2.0926, + "step": 10562 + }, + { + "epoch": 1.2324116205810292, + "grad_norm": 1.2632534503936768, + "learning_rate": 0.0002625172261675861, + "loss": 1.9793, + "step": 10563 + }, + { + "epoch": 1.2325282930813208, + "grad_norm": 1.3907592296600342, + "learning_rate": 0.000262507183521363, + "loss": 2.024, + "step": 10564 + }, + { + "epoch": 1.2326449655816125, + "grad_norm": 1.1004444360733032, + "learning_rate": 0.00026249713972434027, + "loss": 2.0217, + "step": 10565 + }, + { + "epoch": 1.2327616380819042, + "grad_norm": 1.2042638063430786, + "learning_rate": 0.000262487094776622, + "loss": 2.1535, + "step": 10566 + }, + { + "epoch": 1.2328783105821959, + "grad_norm": 1.1910580396652222, + "learning_rate": 0.00026247704867831225, + "loss": 2.125, + "step": 10567 + }, + { + "epoch": 1.2329949830824876, + "grad_norm": 1.1065562963485718, + "learning_rate": 0.0002624670014295153, + "loss": 2.1684, + "step": 10568 + }, + { + "epoch": 1.2331116555827792, + "grad_norm": 1.2507792711257935, + "learning_rate": 0.00026245695303033525, + "loss": 2.0562, + "step": 10569 + }, + { + "epoch": 1.233228328083071, + "grad_norm": 1.1760804653167725, + "learning_rate": 0.0002624469034808762, + "loss": 2.217, + "step": 10570 + }, + { + "epoch": 1.2333450005833626, + "grad_norm": 1.4355249404907227, + "learning_rate": 0.00026243685278124244, + "loss": 1.9754, + "step": 10571 + }, + { + "epoch": 1.2334616730836543, + "grad_norm": 1.3761276006698608, + "learning_rate": 0.00026242680093153817, + "loss": 2.0451, + "step": 10572 + }, + { + "epoch": 1.233578345583946, + "grad_norm": 1.3848278522491455, + "learning_rate": 0.00026241674793186744, + "loss": 2.0891, + "step": 10573 + }, + { + "epoch": 1.2336950180842376, + "grad_norm": 1.3536523580551147, + "learning_rate": 0.0002624066937823347, + "loss": 2.2741, + "step": 10574 + }, + { + "epoch": 1.2338116905845293, + "grad_norm": 1.202457070350647, + "learning_rate": 0.000262396638483044, + "loss": 2.1292, + "step": 10575 + }, + { + "epoch": 1.233928363084821, + "grad_norm": 1.1848032474517822, + "learning_rate": 0.00026238658203409966, + "loss": 2.0539, + "step": 10576 + }, + { + "epoch": 1.2340450355851127, + "grad_norm": 1.041068196296692, + "learning_rate": 0.00026237652443560593, + "loss": 1.8775, + "step": 10577 + }, + { + "epoch": 1.2341617080854044, + "grad_norm": 1.2346751689910889, + "learning_rate": 0.0002623664656876671, + "loss": 2.0835, + "step": 10578 + }, + { + "epoch": 1.234278380585696, + "grad_norm": 1.202345609664917, + "learning_rate": 0.0002623564057903873, + "loss": 1.9618, + "step": 10579 + }, + { + "epoch": 1.2343950530859877, + "grad_norm": 1.4247466325759888, + "learning_rate": 0.000262346344743871, + "loss": 2.1196, + "step": 10580 + }, + { + "epoch": 1.2345117255862794, + "grad_norm": 1.205070972442627, + "learning_rate": 0.0002623362825482224, + "loss": 2.0105, + "step": 10581 + }, + { + "epoch": 1.234628398086571, + "grad_norm": 1.082365870475769, + "learning_rate": 0.00026232621920354584, + "loss": 2.0497, + "step": 10582 + }, + { + "epoch": 1.2347450705868628, + "grad_norm": 1.2003742456436157, + "learning_rate": 0.0002623161547099457, + "loss": 2.1442, + "step": 10583 + }, + { + "epoch": 1.2348617430871545, + "grad_norm": 1.175126552581787, + "learning_rate": 0.0002623060890675262, + "loss": 1.7279, + "step": 10584 + }, + { + "epoch": 1.2349784155874461, + "grad_norm": 1.1070879697799683, + "learning_rate": 0.00026229602227639186, + "loss": 2.0385, + "step": 10585 + }, + { + "epoch": 1.2350950880877378, + "grad_norm": 1.2043499946594238, + "learning_rate": 0.0002622859543366469, + "loss": 2.078, + "step": 10586 + }, + { + "epoch": 1.2352117605880295, + "grad_norm": 1.4470351934432983, + "learning_rate": 0.00026227588524839566, + "loss": 2.2445, + "step": 10587 + }, + { + "epoch": 1.2353284330883212, + "grad_norm": 1.2488263845443726, + "learning_rate": 0.0002622658150117427, + "loss": 2.0213, + "step": 10588 + }, + { + "epoch": 1.2354451055886129, + "grad_norm": 1.1685447692871094, + "learning_rate": 0.0002622557436267922, + "loss": 2.2078, + "step": 10589 + }, + { + "epoch": 1.2355617780889045, + "grad_norm": 1.2992441654205322, + "learning_rate": 0.0002622456710936488, + "loss": 2.1058, + "step": 10590 + }, + { + "epoch": 1.2356784505891962, + "grad_norm": 1.0678766965866089, + "learning_rate": 0.0002622355974124167, + "loss": 1.9633, + "step": 10591 + }, + { + "epoch": 1.235795123089488, + "grad_norm": 1.2960413694381714, + "learning_rate": 0.00026222552258320055, + "loss": 1.9943, + "step": 10592 + }, + { + "epoch": 1.2359117955897796, + "grad_norm": 1.2092782258987427, + "learning_rate": 0.00026221544660610463, + "loss": 2.0671, + "step": 10593 + }, + { + "epoch": 1.2360284680900713, + "grad_norm": 1.1884891986846924, + "learning_rate": 0.0002622053694812334, + "loss": 2.066, + "step": 10594 + }, + { + "epoch": 1.236145140590363, + "grad_norm": 1.1575658321380615, + "learning_rate": 0.00026219529120869144, + "loss": 1.9258, + "step": 10595 + }, + { + "epoch": 1.2362618130906546, + "grad_norm": 1.2568418979644775, + "learning_rate": 0.00026218521178858316, + "loss": 1.985, + "step": 10596 + }, + { + "epoch": 1.2363784855909463, + "grad_norm": 1.0624580383300781, + "learning_rate": 0.0002621751312210131, + "loss": 2.0336, + "step": 10597 + }, + { + "epoch": 1.236495158091238, + "grad_norm": 1.1747409105300903, + "learning_rate": 0.0002621650495060856, + "loss": 2.0568, + "step": 10598 + }, + { + "epoch": 1.2366118305915297, + "grad_norm": 1.1186699867248535, + "learning_rate": 0.0002621549666439054, + "loss": 2.2061, + "step": 10599 + }, + { + "epoch": 1.2367285030918214, + "grad_norm": 1.1398842334747314, + "learning_rate": 0.0002621448826345769, + "loss": 1.9668, + "step": 10600 + }, + { + "epoch": 1.236845175592113, + "grad_norm": 1.1940439939498901, + "learning_rate": 0.00026213479747820464, + "loss": 2.0873, + "step": 10601 + }, + { + "epoch": 1.2369618480924047, + "grad_norm": 1.2616454362869263, + "learning_rate": 0.0002621247111748933, + "loss": 2.1377, + "step": 10602 + }, + { + "epoch": 1.2370785205926964, + "grad_norm": 1.3915880918502808, + "learning_rate": 0.00026211462372474723, + "loss": 2.1226, + "step": 10603 + }, + { + "epoch": 1.237195193092988, + "grad_norm": 1.1979120969772339, + "learning_rate": 0.00026210453512787114, + "loss": 1.927, + "step": 10604 + }, + { + "epoch": 1.2373118655932798, + "grad_norm": 1.3827755451202393, + "learning_rate": 0.00026209444538436967, + "loss": 2.0377, + "step": 10605 + }, + { + "epoch": 1.2374285380935715, + "grad_norm": 1.2713814973831177, + "learning_rate": 0.0002620843544943473, + "loss": 2.1238, + "step": 10606 + }, + { + "epoch": 1.2375452105938631, + "grad_norm": 1.1706794500350952, + "learning_rate": 0.00026207426245790865, + "loss": 1.898, + "step": 10607 + }, + { + "epoch": 1.2376618830941548, + "grad_norm": 1.2265586853027344, + "learning_rate": 0.0002620641692751584, + "loss": 2.1198, + "step": 10608 + }, + { + "epoch": 1.2377785555944465, + "grad_norm": 1.2193013429641724, + "learning_rate": 0.0002620540749462012, + "loss": 1.862, + "step": 10609 + }, + { + "epoch": 1.2378952280947382, + "grad_norm": 1.1175535917282104, + "learning_rate": 0.0002620439794711416, + "loss": 2.1013, + "step": 10610 + }, + { + "epoch": 1.2380119005950299, + "grad_norm": 1.3488589525222778, + "learning_rate": 0.0002620338828500844, + "loss": 2.0845, + "step": 10611 + }, + { + "epoch": 1.2381285730953215, + "grad_norm": 1.1544864177703857, + "learning_rate": 0.00026202378508313414, + "loss": 2.0089, + "step": 10612 + }, + { + "epoch": 1.2382452455956132, + "grad_norm": 1.385679841041565, + "learning_rate": 0.0002620136861703956, + "loss": 2.3553, + "step": 10613 + }, + { + "epoch": 1.238361918095905, + "grad_norm": 1.3561196327209473, + "learning_rate": 0.0002620035861119734, + "loss": 2.1699, + "step": 10614 + }, + { + "epoch": 1.2384785905961966, + "grad_norm": 1.3122109174728394, + "learning_rate": 0.00026199348490797225, + "loss": 2.1132, + "step": 10615 + }, + { + "epoch": 1.2385952630964883, + "grad_norm": 1.2180370092391968, + "learning_rate": 0.00026198338255849695, + "loss": 2.1134, + "step": 10616 + }, + { + "epoch": 1.23871193559678, + "grad_norm": 1.1719856262207031, + "learning_rate": 0.0002619732790636521, + "loss": 2.0637, + "step": 10617 + }, + { + "epoch": 1.2388286080970716, + "grad_norm": 1.4447098970413208, + "learning_rate": 0.00026196317442354254, + "loss": 2.3223, + "step": 10618 + }, + { + "epoch": 1.2389452805973633, + "grad_norm": 1.0890854597091675, + "learning_rate": 0.000261953068638273, + "loss": 2.1243, + "step": 10619 + }, + { + "epoch": 1.239061953097655, + "grad_norm": 1.0550910234451294, + "learning_rate": 0.0002619429617079483, + "loss": 2.0731, + "step": 10620 + }, + { + "epoch": 1.2391786255979467, + "grad_norm": 1.1656192541122437, + "learning_rate": 0.00026193285363267305, + "loss": 1.9921, + "step": 10621 + }, + { + "epoch": 1.2392952980982384, + "grad_norm": 1.2642370462417603, + "learning_rate": 0.0002619227444125522, + "loss": 2.1356, + "step": 10622 + }, + { + "epoch": 1.23941197059853, + "grad_norm": 1.1902294158935547, + "learning_rate": 0.00026191263404769054, + "loss": 2.1634, + "step": 10623 + }, + { + "epoch": 1.2395286430988217, + "grad_norm": 1.2732512950897217, + "learning_rate": 0.0002619025225381928, + "loss": 2.1826, + "step": 10624 + }, + { + "epoch": 1.2396453155991134, + "grad_norm": 1.1992768049240112, + "learning_rate": 0.0002618924098841639, + "loss": 2.1769, + "step": 10625 + }, + { + "epoch": 1.239761988099405, + "grad_norm": 1.2299599647521973, + "learning_rate": 0.0002618822960857086, + "loss": 2.2559, + "step": 10626 + }, + { + "epoch": 1.2398786605996968, + "grad_norm": 1.2311489582061768, + "learning_rate": 0.0002618721811429317, + "loss": 1.9555, + "step": 10627 + }, + { + "epoch": 1.2399953330999884, + "grad_norm": 1.0880988836288452, + "learning_rate": 0.00026186206505593825, + "loss": 2.0452, + "step": 10628 + }, + { + "epoch": 1.2401120056002801, + "grad_norm": 1.2472268342971802, + "learning_rate": 0.00026185194782483295, + "loss": 2.0438, + "step": 10629 + }, + { + "epoch": 1.2402286781005718, + "grad_norm": 1.3941495418548584, + "learning_rate": 0.0002618418294497208, + "loss": 2.2669, + "step": 10630 + }, + { + "epoch": 1.2403453506008635, + "grad_norm": 1.1488697528839111, + "learning_rate": 0.00026183170993070653, + "loss": 2.0816, + "step": 10631 + }, + { + "epoch": 1.2404620231011552, + "grad_norm": 1.078881025314331, + "learning_rate": 0.00026182158926789525, + "loss": 2.1552, + "step": 10632 + }, + { + "epoch": 1.2405786956014468, + "grad_norm": 1.0332121849060059, + "learning_rate": 0.00026181146746139173, + "loss": 2.1208, + "step": 10633 + }, + { + "epoch": 1.2406953681017385, + "grad_norm": 1.2502926588058472, + "learning_rate": 0.00026180134451130103, + "loss": 2.0234, + "step": 10634 + }, + { + "epoch": 1.2408120406020302, + "grad_norm": 1.2349317073822021, + "learning_rate": 0.00026179122041772795, + "loss": 2.1168, + "step": 10635 + }, + { + "epoch": 1.240928713102322, + "grad_norm": 1.22657310962677, + "learning_rate": 0.00026178109518077753, + "loss": 2.1587, + "step": 10636 + }, + { + "epoch": 1.2410453856026136, + "grad_norm": 1.2771673202514648, + "learning_rate": 0.00026177096880055474, + "loss": 2.0798, + "step": 10637 + }, + { + "epoch": 1.2411620581029053, + "grad_norm": 1.1668990850448608, + "learning_rate": 0.0002617608412771645, + "loss": 2.1582, + "step": 10638 + }, + { + "epoch": 1.241278730603197, + "grad_norm": 1.3001201152801514, + "learning_rate": 0.0002617507126107119, + "loss": 2.0942, + "step": 10639 + }, + { + "epoch": 1.2413954031034886, + "grad_norm": 1.528394341468811, + "learning_rate": 0.0002617405828013019, + "loss": 2.1048, + "step": 10640 + }, + { + "epoch": 1.2415120756037803, + "grad_norm": 1.043615460395813, + "learning_rate": 0.00026173045184903947, + "loss": 1.8254, + "step": 10641 + }, + { + "epoch": 1.241628748104072, + "grad_norm": 1.1821812391281128, + "learning_rate": 0.00026172031975402963, + "loss": 2.1466, + "step": 10642 + }, + { + "epoch": 1.2417454206043637, + "grad_norm": 1.0109758377075195, + "learning_rate": 0.00026171018651637755, + "loss": 2.1487, + "step": 10643 + }, + { + "epoch": 1.2418620931046553, + "grad_norm": 1.2997618913650513, + "learning_rate": 0.0002617000521361881, + "loss": 1.9992, + "step": 10644 + }, + { + "epoch": 1.241978765604947, + "grad_norm": 1.1999222040176392, + "learning_rate": 0.0002616899166135664, + "loss": 1.9923, + "step": 10645 + }, + { + "epoch": 1.2420954381052387, + "grad_norm": 1.1080622673034668, + "learning_rate": 0.0002616797799486177, + "loss": 1.9113, + "step": 10646 + }, + { + "epoch": 1.2422121106055304, + "grad_norm": 1.2665807008743286, + "learning_rate": 0.00026166964214144686, + "loss": 2.0291, + "step": 10647 + }, + { + "epoch": 1.242328783105822, + "grad_norm": 1.2144763469696045, + "learning_rate": 0.000261659503192159, + "loss": 2.0451, + "step": 10648 + }, + { + "epoch": 1.2424454556061137, + "grad_norm": 1.129441261291504, + "learning_rate": 0.00026164936310085936, + "loss": 1.9955, + "step": 10649 + }, + { + "epoch": 1.2425621281064054, + "grad_norm": 1.1443434953689575, + "learning_rate": 0.00026163922186765297, + "loss": 2.2089, + "step": 10650 + }, + { + "epoch": 1.242678800606697, + "grad_norm": 1.195489525794983, + "learning_rate": 0.000261629079492645, + "loss": 1.9669, + "step": 10651 + }, + { + "epoch": 1.2427954731069888, + "grad_norm": 1.1876882314682007, + "learning_rate": 0.0002616189359759405, + "loss": 2.147, + "step": 10652 + }, + { + "epoch": 1.2429121456072805, + "grad_norm": 1.4282512664794922, + "learning_rate": 0.00026160879131764474, + "loss": 2.2096, + "step": 10653 + }, + { + "epoch": 1.2430288181075722, + "grad_norm": 1.1903057098388672, + "learning_rate": 0.00026159864551786283, + "loss": 2.1204, + "step": 10654 + }, + { + "epoch": 1.2431454906078638, + "grad_norm": 1.1533515453338623, + "learning_rate": 0.00026158849857670004, + "loss": 2.0269, + "step": 10655 + }, + { + "epoch": 1.2432621631081555, + "grad_norm": 1.1882110834121704, + "learning_rate": 0.00026157835049426143, + "loss": 1.9725, + "step": 10656 + }, + { + "epoch": 1.2433788356084472, + "grad_norm": 1.0211479663848877, + "learning_rate": 0.0002615682012706523, + "loss": 2.0493, + "step": 10657 + }, + { + "epoch": 1.2434955081087389, + "grad_norm": 1.1086558103561401, + "learning_rate": 0.00026155805090597784, + "loss": 2.0014, + "step": 10658 + }, + { + "epoch": 1.2436121806090306, + "grad_norm": 1.3956866264343262, + "learning_rate": 0.0002615478994003432, + "loss": 2.2187, + "step": 10659 + }, + { + "epoch": 1.2437288531093222, + "grad_norm": 1.3286898136138916, + "learning_rate": 0.00026153774675385374, + "loss": 2.2291, + "step": 10660 + }, + { + "epoch": 1.243845525609614, + "grad_norm": 1.1905626058578491, + "learning_rate": 0.00026152759296661467, + "loss": 2.132, + "step": 10661 + }, + { + "epoch": 1.2439621981099056, + "grad_norm": 1.0289785861968994, + "learning_rate": 0.0002615174380387313, + "loss": 1.8863, + "step": 10662 + }, + { + "epoch": 1.2440788706101973, + "grad_norm": 1.121028184890747, + "learning_rate": 0.0002615072819703087, + "loss": 2.0831, + "step": 10663 + }, + { + "epoch": 1.244195543110489, + "grad_norm": 1.2565680742263794, + "learning_rate": 0.0002614971247614524, + "loss": 2.1163, + "step": 10664 + }, + { + "epoch": 1.2443122156107806, + "grad_norm": 1.5481528043746948, + "learning_rate": 0.0002614869664122676, + "loss": 2.2402, + "step": 10665 + }, + { + "epoch": 1.2444288881110723, + "grad_norm": 1.3524558544158936, + "learning_rate": 0.0002614768069228596, + "loss": 2.1, + "step": 10666 + }, + { + "epoch": 1.244545560611364, + "grad_norm": 1.2864607572555542, + "learning_rate": 0.00026146664629333376, + "loss": 1.9907, + "step": 10667 + }, + { + "epoch": 1.2446622331116557, + "grad_norm": 1.2228336334228516, + "learning_rate": 0.0002614564845237954, + "loss": 2.1225, + "step": 10668 + }, + { + "epoch": 1.2447789056119474, + "grad_norm": 1.4829496145248413, + "learning_rate": 0.0002614463216143498, + "loss": 2.1027, + "step": 10669 + }, + { + "epoch": 1.244895578112239, + "grad_norm": 1.1487864255905151, + "learning_rate": 0.00026143615756510244, + "loss": 2.05, + "step": 10670 + }, + { + "epoch": 1.2450122506125307, + "grad_norm": 1.076981544494629, + "learning_rate": 0.0002614259923761586, + "loss": 1.9074, + "step": 10671 + }, + { + "epoch": 1.2451289231128224, + "grad_norm": 1.1936264038085938, + "learning_rate": 0.0002614158260476237, + "loss": 2.0628, + "step": 10672 + }, + { + "epoch": 1.245245595613114, + "grad_norm": 1.398506760597229, + "learning_rate": 0.0002614056585796031, + "loss": 2.0688, + "step": 10673 + }, + { + "epoch": 1.2453622681134058, + "grad_norm": 1.2662880420684814, + "learning_rate": 0.00026139548997220223, + "loss": 2.0514, + "step": 10674 + }, + { + "epoch": 1.2454789406136975, + "grad_norm": 1.1506011486053467, + "learning_rate": 0.0002613853202255265, + "loss": 2.0121, + "step": 10675 + }, + { + "epoch": 1.2455956131139891, + "grad_norm": 1.332770824432373, + "learning_rate": 0.0002613751493396813, + "loss": 2.1409, + "step": 10676 + }, + { + "epoch": 1.2457122856142808, + "grad_norm": 1.2667654752731323, + "learning_rate": 0.00026136497731477223, + "loss": 2.1594, + "step": 10677 + }, + { + "epoch": 1.2458289581145725, + "grad_norm": 1.091247797012329, + "learning_rate": 0.00026135480415090455, + "loss": 2.0678, + "step": 10678 + }, + { + "epoch": 1.2459456306148642, + "grad_norm": 1.1806386709213257, + "learning_rate": 0.00026134462984818375, + "loss": 2.1206, + "step": 10679 + }, + { + "epoch": 1.2460623031151559, + "grad_norm": 1.27056884765625, + "learning_rate": 0.00026133445440671544, + "loss": 2.0388, + "step": 10680 + }, + { + "epoch": 1.2461789756154475, + "grad_norm": 1.1483066082000732, + "learning_rate": 0.000261324277826605, + "loss": 1.9852, + "step": 10681 + }, + { + "epoch": 1.2462956481157392, + "grad_norm": 1.1709779500961304, + "learning_rate": 0.0002613141001079579, + "loss": 2.073, + "step": 10682 + }, + { + "epoch": 1.246412320616031, + "grad_norm": 1.2401037216186523, + "learning_rate": 0.00026130392125087973, + "loss": 2.1337, + "step": 10683 + }, + { + "epoch": 1.2465289931163226, + "grad_norm": 1.2375917434692383, + "learning_rate": 0.00026129374125547606, + "loss": 2.0461, + "step": 10684 + }, + { + "epoch": 1.2466456656166143, + "grad_norm": 1.0957947969436646, + "learning_rate": 0.0002612835601218523, + "loss": 1.8943, + "step": 10685 + }, + { + "epoch": 1.246762338116906, + "grad_norm": 1.2038413286209106, + "learning_rate": 0.00026127337785011395, + "loss": 1.9402, + "step": 10686 + }, + { + "epoch": 1.2468790106171976, + "grad_norm": 1.1065125465393066, + "learning_rate": 0.00026126319444036677, + "loss": 1.9712, + "step": 10687 + }, + { + "epoch": 1.2469956831174893, + "grad_norm": 1.306254506111145, + "learning_rate": 0.00026125300989271623, + "loss": 2.3742, + "step": 10688 + }, + { + "epoch": 1.247112355617781, + "grad_norm": 1.227242112159729, + "learning_rate": 0.0002612428242072679, + "loss": 2.1541, + "step": 10689 + }, + { + "epoch": 1.2472290281180727, + "grad_norm": 1.2887181043624878, + "learning_rate": 0.0002612326373841273, + "loss": 2.1647, + "step": 10690 + }, + { + "epoch": 1.2473457006183644, + "grad_norm": 1.3618513345718384, + "learning_rate": 0.0002612224494234002, + "loss": 2.2202, + "step": 10691 + }, + { + "epoch": 1.247462373118656, + "grad_norm": 1.2052520513534546, + "learning_rate": 0.0002612122603251921, + "loss": 1.9104, + "step": 10692 + }, + { + "epoch": 1.2475790456189477, + "grad_norm": 1.2579517364501953, + "learning_rate": 0.00026120207008960874, + "loss": 2.1153, + "step": 10693 + }, + { + "epoch": 1.2476957181192394, + "grad_norm": 1.291076898574829, + "learning_rate": 0.00026119187871675553, + "loss": 2.0581, + "step": 10694 + }, + { + "epoch": 1.247812390619531, + "grad_norm": 1.1778689622879028, + "learning_rate": 0.00026118168620673844, + "loss": 2.1701, + "step": 10695 + }, + { + "epoch": 1.2479290631198228, + "grad_norm": 1.0595444440841675, + "learning_rate": 0.0002611714925596629, + "loss": 2.1305, + "step": 10696 + }, + { + "epoch": 1.2480457356201144, + "grad_norm": 1.1302309036254883, + "learning_rate": 0.00026116129777563465, + "loss": 2.0949, + "step": 10697 + }, + { + "epoch": 1.2481624081204061, + "grad_norm": 1.392021894454956, + "learning_rate": 0.00026115110185475943, + "loss": 2.19, + "step": 10698 + }, + { + "epoch": 1.2482790806206978, + "grad_norm": 1.1596381664276123, + "learning_rate": 0.00026114090479714285, + "loss": 1.9858, + "step": 10699 + }, + { + "epoch": 1.2483957531209895, + "grad_norm": 1.20834481716156, + "learning_rate": 0.0002611307066028907, + "loss": 2.1023, + "step": 10700 + }, + { + "epoch": 1.2485124256212812, + "grad_norm": 1.1401653289794922, + "learning_rate": 0.0002611205072721087, + "loss": 2.111, + "step": 10701 + }, + { + "epoch": 1.2486290981215729, + "grad_norm": 1.0749551057815552, + "learning_rate": 0.0002611103068049025, + "loss": 2.0373, + "step": 10702 + }, + { + "epoch": 1.2487457706218645, + "grad_norm": 1.117417573928833, + "learning_rate": 0.00026110010520137794, + "loss": 1.944, + "step": 10703 + }, + { + "epoch": 1.2488624431221562, + "grad_norm": 1.5727885961532593, + "learning_rate": 0.00026108990246164073, + "loss": 2.1432, + "step": 10704 + }, + { + "epoch": 1.248979115622448, + "grad_norm": 1.2447190284729004, + "learning_rate": 0.00026107969858579663, + "loss": 2.1105, + "step": 10705 + }, + { + "epoch": 1.2490957881227396, + "grad_norm": 1.2748041152954102, + "learning_rate": 0.0002610694935739515, + "loss": 2.0976, + "step": 10706 + }, + { + "epoch": 1.2492124606230313, + "grad_norm": 1.430406093597412, + "learning_rate": 0.00026105928742621104, + "loss": 2.0238, + "step": 10707 + }, + { + "epoch": 1.249329133123323, + "grad_norm": 1.387062430381775, + "learning_rate": 0.00026104908014268113, + "loss": 2.1632, + "step": 10708 + }, + { + "epoch": 1.2494458056236146, + "grad_norm": 1.1792476177215576, + "learning_rate": 0.00026103887172346756, + "loss": 2.0127, + "step": 10709 + }, + { + "epoch": 1.2495624781239063, + "grad_norm": 1.0776454210281372, + "learning_rate": 0.0002610286621686762, + "loss": 2.0759, + "step": 10710 + }, + { + "epoch": 1.249679150624198, + "grad_norm": 1.2618378400802612, + "learning_rate": 0.0002610184514784128, + "loss": 2.1163, + "step": 10711 + }, + { + "epoch": 1.2497958231244897, + "grad_norm": 1.04510498046875, + "learning_rate": 0.0002610082396527832, + "loss": 2.0174, + "step": 10712 + }, + { + "epoch": 1.2499124956247814, + "grad_norm": 1.0541077852249146, + "learning_rate": 0.00026099802669189335, + "loss": 1.8141, + "step": 10713 + }, + { + "epoch": 1.250029168125073, + "grad_norm": 1.6095049381256104, + "learning_rate": 0.0002609878125958492, + "loss": 2.1269, + "step": 10714 + }, + { + "epoch": 1.2501458406253647, + "grad_norm": 1.4259604215621948, + "learning_rate": 0.0002609775973647564, + "loss": 2.0955, + "step": 10715 + }, + { + "epoch": 1.2502625131256564, + "grad_norm": 1.1661896705627441, + "learning_rate": 0.0002609673809987211, + "loss": 1.9652, + "step": 10716 + }, + { + "epoch": 1.250379185625948, + "grad_norm": 1.1477152109146118, + "learning_rate": 0.0002609571634978491, + "loss": 2.0178, + "step": 10717 + }, + { + "epoch": 1.2504958581262398, + "grad_norm": 1.312675952911377, + "learning_rate": 0.00026094694486224624, + "loss": 1.9739, + "step": 10718 + }, + { + "epoch": 1.2506125306265314, + "grad_norm": 1.0856728553771973, + "learning_rate": 0.00026093672509201857, + "loss": 2.1197, + "step": 10719 + }, + { + "epoch": 1.2507292031268231, + "grad_norm": 1.0180727243423462, + "learning_rate": 0.000260926504187272, + "loss": 2.072, + "step": 10720 + }, + { + "epoch": 1.2508458756271148, + "grad_norm": 1.2199229001998901, + "learning_rate": 0.0002609162821481125, + "loss": 2.1494, + "step": 10721 + }, + { + "epoch": 1.2509625481274065, + "grad_norm": 1.1993321180343628, + "learning_rate": 0.0002609060589746461, + "loss": 2.1011, + "step": 10722 + }, + { + "epoch": 1.2510792206276982, + "grad_norm": 1.2914597988128662, + "learning_rate": 0.0002608958346669787, + "loss": 2.0333, + "step": 10723 + }, + { + "epoch": 1.2511958931279898, + "grad_norm": 1.216982364654541, + "learning_rate": 0.0002608856092252163, + "loss": 2.1838, + "step": 10724 + }, + { + "epoch": 1.2513125656282815, + "grad_norm": 1.0991758108139038, + "learning_rate": 0.00026087538264946494, + "loss": 2.0453, + "step": 10725 + }, + { + "epoch": 1.2514292381285732, + "grad_norm": 1.1955665349960327, + "learning_rate": 0.0002608651549398306, + "loss": 1.9995, + "step": 10726 + }, + { + "epoch": 1.2515459106288649, + "grad_norm": 1.3293548822402954, + "learning_rate": 0.0002608549260964193, + "loss": 1.9504, + "step": 10727 + }, + { + "epoch": 1.2516625831291566, + "grad_norm": 1.1595213413238525, + "learning_rate": 0.0002608446961193371, + "loss": 2.0901, + "step": 10728 + }, + { + "epoch": 1.2517792556294483, + "grad_norm": 1.0878896713256836, + "learning_rate": 0.0002608344650086902, + "loss": 2.0156, + "step": 10729 + }, + { + "epoch": 1.25189592812974, + "grad_norm": 1.1530976295471191, + "learning_rate": 0.0002608242327645844, + "loss": 2.1262, + "step": 10730 + }, + { + "epoch": 1.2520126006300316, + "grad_norm": 1.0744528770446777, + "learning_rate": 0.00026081399938712594, + "loss": 1.9509, + "step": 10731 + }, + { + "epoch": 1.2521292731303233, + "grad_norm": 1.2461060285568237, + "learning_rate": 0.00026080376487642087, + "loss": 2.0856, + "step": 10732 + }, + { + "epoch": 1.252245945630615, + "grad_norm": 1.0895730257034302, + "learning_rate": 0.0002607935292325753, + "loss": 2.0988, + "step": 10733 + }, + { + "epoch": 1.2523626181309067, + "grad_norm": 1.283785343170166, + "learning_rate": 0.0002607832924556953, + "loss": 1.9887, + "step": 10734 + }, + { + "epoch": 1.2524792906311983, + "grad_norm": 1.0783007144927979, + "learning_rate": 0.00026077305454588707, + "loss": 2.0895, + "step": 10735 + }, + { + "epoch": 1.25259596313149, + "grad_norm": 1.2288172245025635, + "learning_rate": 0.0002607628155032567, + "loss": 2.1271, + "step": 10736 + }, + { + "epoch": 1.2527126356317817, + "grad_norm": 1.3490616083145142, + "learning_rate": 0.0002607525753279103, + "loss": 2.2178, + "step": 10737 + }, + { + "epoch": 1.2528293081320734, + "grad_norm": 1.1529031991958618, + "learning_rate": 0.00026074233401995417, + "loss": 2.0518, + "step": 10738 + }, + { + "epoch": 1.252945980632365, + "grad_norm": 1.258965015411377, + "learning_rate": 0.00026073209157949427, + "loss": 2.2299, + "step": 10739 + }, + { + "epoch": 1.2530626531326567, + "grad_norm": 1.337518334388733, + "learning_rate": 0.0002607218480066369, + "loss": 2.2568, + "step": 10740 + }, + { + "epoch": 1.2531793256329484, + "grad_norm": 1.0208592414855957, + "learning_rate": 0.00026071160330148836, + "loss": 2.183, + "step": 10741 + }, + { + "epoch": 1.25329599813324, + "grad_norm": 1.202248215675354, + "learning_rate": 0.00026070135746415464, + "loss": 2.0052, + "step": 10742 + }, + { + "epoch": 1.2534126706335318, + "grad_norm": 1.1451349258422852, + "learning_rate": 0.0002606911104947421, + "loss": 2.0608, + "step": 10743 + }, + { + "epoch": 1.2535293431338235, + "grad_norm": 1.1365870237350464, + "learning_rate": 0.0002606808623933569, + "loss": 2.149, + "step": 10744 + }, + { + "epoch": 1.2536460156341152, + "grad_norm": 1.1557070016860962, + "learning_rate": 0.0002606706131601054, + "loss": 2.1412, + "step": 10745 + }, + { + "epoch": 1.2537626881344068, + "grad_norm": 1.1301000118255615, + "learning_rate": 0.0002606603627950936, + "loss": 2.1525, + "step": 10746 + }, + { + "epoch": 1.2538793606346985, + "grad_norm": 1.1750726699829102, + "learning_rate": 0.000260650111298428, + "loss": 2.1504, + "step": 10747 + }, + { + "epoch": 1.2539960331349902, + "grad_norm": 1.044761061668396, + "learning_rate": 0.00026063985867021486, + "loss": 2.2293, + "step": 10748 + }, + { + "epoch": 1.2541127056352819, + "grad_norm": 1.2137538194656372, + "learning_rate": 0.0002606296049105604, + "loss": 2.0289, + "step": 10749 + }, + { + "epoch": 1.2542293781355736, + "grad_norm": 1.1065675020217896, + "learning_rate": 0.0002606193500195709, + "loss": 1.8998, + "step": 10750 + }, + { + "epoch": 1.2543460506358652, + "grad_norm": 1.0854698419570923, + "learning_rate": 0.00026060909399735273, + "loss": 2.1411, + "step": 10751 + }, + { + "epoch": 1.254462723136157, + "grad_norm": 1.1020382642745972, + "learning_rate": 0.0002605988368440122, + "loss": 2.1666, + "step": 10752 + }, + { + "epoch": 1.2545793956364486, + "grad_norm": 1.1995683908462524, + "learning_rate": 0.0002605885785596555, + "loss": 2.1573, + "step": 10753 + }, + { + "epoch": 1.2546960681367403, + "grad_norm": 1.3143680095672607, + "learning_rate": 0.0002605783191443892, + "loss": 2.234, + "step": 10754 + }, + { + "epoch": 1.254812740637032, + "grad_norm": 1.1536099910736084, + "learning_rate": 0.0002605680585983196, + "loss": 2.1715, + "step": 10755 + }, + { + "epoch": 1.2549294131373236, + "grad_norm": 1.112989902496338, + "learning_rate": 0.00026055779692155296, + "loss": 2.1007, + "step": 10756 + }, + { + "epoch": 1.2550460856376153, + "grad_norm": 1.2766891717910767, + "learning_rate": 0.0002605475341141958, + "loss": 2.0711, + "step": 10757 + }, + { + "epoch": 1.255162758137907, + "grad_norm": 1.246861219406128, + "learning_rate": 0.00026053727017635446, + "loss": 1.9736, + "step": 10758 + }, + { + "epoch": 1.2552794306381987, + "grad_norm": 1.1020618677139282, + "learning_rate": 0.0002605270051081353, + "loss": 2.1729, + "step": 10759 + }, + { + "epoch": 1.2553961031384904, + "grad_norm": 1.0179587602615356, + "learning_rate": 0.00026051673890964476, + "loss": 1.97, + "step": 10760 + }, + { + "epoch": 1.255512775638782, + "grad_norm": 1.3164390325546265, + "learning_rate": 0.0002605064715809893, + "loss": 2.107, + "step": 10761 + }, + { + "epoch": 1.2556294481390737, + "grad_norm": 1.206310749053955, + "learning_rate": 0.00026049620312227533, + "loss": 2.0176, + "step": 10762 + }, + { + "epoch": 1.2557461206393654, + "grad_norm": 1.356021761894226, + "learning_rate": 0.0002604859335336093, + "loss": 2.1408, + "step": 10763 + }, + { + "epoch": 1.255862793139657, + "grad_norm": 1.2449296712875366, + "learning_rate": 0.00026047566281509776, + "loss": 2.2484, + "step": 10764 + }, + { + "epoch": 1.2559794656399488, + "grad_norm": 1.2665609121322632, + "learning_rate": 0.00026046539096684704, + "loss": 2.2443, + "step": 10765 + }, + { + "epoch": 1.2560961381402405, + "grad_norm": 1.2901763916015625, + "learning_rate": 0.00026045511798896375, + "loss": 2.1151, + "step": 10766 + }, + { + "epoch": 1.2562128106405321, + "grad_norm": 1.1664025783538818, + "learning_rate": 0.0002604448438815543, + "loss": 1.944, + "step": 10767 + }, + { + "epoch": 1.2563294831408238, + "grad_norm": 1.1254932880401611, + "learning_rate": 0.00026043456864472525, + "loss": 2.174, + "step": 10768 + }, + { + "epoch": 1.2564461556411155, + "grad_norm": 1.2185553312301636, + "learning_rate": 0.0002604242922785831, + "loss": 2.1136, + "step": 10769 + }, + { + "epoch": 1.2565628281414072, + "grad_norm": 1.3371074199676514, + "learning_rate": 0.0002604140147832344, + "loss": 2.2033, + "step": 10770 + }, + { + "epoch": 1.2566795006416989, + "grad_norm": 1.3037548065185547, + "learning_rate": 0.0002604037361587857, + "loss": 2.156, + "step": 10771 + }, + { + "epoch": 1.2567961731419905, + "grad_norm": 1.1667850017547607, + "learning_rate": 0.00026039345640534357, + "loss": 2.2099, + "step": 10772 + }, + { + "epoch": 1.2569128456422822, + "grad_norm": 1.1905758380889893, + "learning_rate": 0.0002603831755230145, + "loss": 2.1625, + "step": 10773 + }, + { + "epoch": 1.257029518142574, + "grad_norm": 1.046967625617981, + "learning_rate": 0.0002603728935119052, + "loss": 2.16, + "step": 10774 + }, + { + "epoch": 1.2571461906428656, + "grad_norm": 1.2636334896087646, + "learning_rate": 0.00026036261037212213, + "loss": 2.3518, + "step": 10775 + }, + { + "epoch": 1.2572628631431573, + "grad_norm": 1.2120484113693237, + "learning_rate": 0.000260352326103772, + "loss": 2.2687, + "step": 10776 + }, + { + "epoch": 1.257379535643449, + "grad_norm": 1.3828502893447876, + "learning_rate": 0.00026034204070696134, + "loss": 2.3094, + "step": 10777 + }, + { + "epoch": 1.2574962081437406, + "grad_norm": 1.003517508506775, + "learning_rate": 0.00026033175418179686, + "loss": 1.8249, + "step": 10778 + }, + { + "epoch": 1.2576128806440323, + "grad_norm": 1.087824821472168, + "learning_rate": 0.00026032146652838514, + "loss": 2.1214, + "step": 10779 + }, + { + "epoch": 1.257729553144324, + "grad_norm": 1.1981841325759888, + "learning_rate": 0.0002603111777468328, + "loss": 1.9287, + "step": 10780 + }, + { + "epoch": 1.2578462256446157, + "grad_norm": 1.2094836235046387, + "learning_rate": 0.00026030088783724663, + "loss": 2.0791, + "step": 10781 + }, + { + "epoch": 1.2579628981449074, + "grad_norm": 1.2208324670791626, + "learning_rate": 0.0002602905967997332, + "loss": 2.0655, + "step": 10782 + }, + { + "epoch": 1.258079570645199, + "grad_norm": 1.2279399633407593, + "learning_rate": 0.00026028030463439924, + "loss": 2.2802, + "step": 10783 + }, + { + "epoch": 1.2581962431454907, + "grad_norm": 1.4103820323944092, + "learning_rate": 0.00026027001134135136, + "loss": 2.0623, + "step": 10784 + }, + { + "epoch": 1.2583129156457824, + "grad_norm": 1.34292471408844, + "learning_rate": 0.0002602597169206964, + "loss": 2.1728, + "step": 10785 + }, + { + "epoch": 1.258429588146074, + "grad_norm": 1.1241828203201294, + "learning_rate": 0.000260249421372541, + "loss": 1.9088, + "step": 10786 + }, + { + "epoch": 1.2585462606463658, + "grad_norm": 1.1820144653320312, + "learning_rate": 0.000260239124696992, + "loss": 2.1681, + "step": 10787 + }, + { + "epoch": 1.2586629331466574, + "grad_norm": 1.3277565240859985, + "learning_rate": 0.00026022882689415595, + "loss": 2.3153, + "step": 10788 + }, + { + "epoch": 1.2587796056469491, + "grad_norm": 1.198019027709961, + "learning_rate": 0.00026021852796413974, + "loss": 2.0756, + "step": 10789 + }, + { + "epoch": 1.2588962781472408, + "grad_norm": 1.3329499959945679, + "learning_rate": 0.00026020822790705006, + "loss": 2.2807, + "step": 10790 + }, + { + "epoch": 1.2590129506475325, + "grad_norm": 1.1590873003005981, + "learning_rate": 0.0002601979267229938, + "loss": 2.1921, + "step": 10791 + }, + { + "epoch": 1.2591296231478242, + "grad_norm": 1.088384747505188, + "learning_rate": 0.0002601876244120777, + "loss": 2.0129, + "step": 10792 + }, + { + "epoch": 1.2592462956481159, + "grad_norm": 1.0722259283065796, + "learning_rate": 0.00026017732097440856, + "loss": 2.1296, + "step": 10793 + }, + { + "epoch": 1.2593629681484075, + "grad_norm": 1.1079856157302856, + "learning_rate": 0.0002601670164100932, + "loss": 1.7872, + "step": 10794 + }, + { + "epoch": 1.2594796406486992, + "grad_norm": 1.063706874847412, + "learning_rate": 0.0002601567107192384, + "loss": 2.0488, + "step": 10795 + }, + { + "epoch": 1.259596313148991, + "grad_norm": 1.2366193532943726, + "learning_rate": 0.00026014640390195106, + "loss": 2.1146, + "step": 10796 + }, + { + "epoch": 1.2597129856492826, + "grad_norm": 1.0677671432495117, + "learning_rate": 0.00026013609595833796, + "loss": 2.0267, + "step": 10797 + }, + { + "epoch": 1.2598296581495743, + "grad_norm": 1.2167924642562866, + "learning_rate": 0.00026012578688850606, + "loss": 1.9331, + "step": 10798 + }, + { + "epoch": 1.259946330649866, + "grad_norm": 1.2503535747528076, + "learning_rate": 0.0002601154766925622, + "loss": 2.1879, + "step": 10799 + }, + { + "epoch": 1.2600630031501576, + "grad_norm": 1.2022547721862793, + "learning_rate": 0.0002601051653706132, + "loss": 2.0036, + "step": 10800 + }, + { + "epoch": 1.2601796756504493, + "grad_norm": 1.3334734439849854, + "learning_rate": 0.00026009485292276594, + "loss": 1.9556, + "step": 10801 + }, + { + "epoch": 1.260296348150741, + "grad_norm": 1.3212034702301025, + "learning_rate": 0.00026008453934912753, + "loss": 1.9896, + "step": 10802 + }, + { + "epoch": 1.2604130206510327, + "grad_norm": 1.4485646486282349, + "learning_rate": 0.0002600742246498047, + "loss": 2.0834, + "step": 10803 + }, + { + "epoch": 1.2605296931513243, + "grad_norm": 1.1546549797058105, + "learning_rate": 0.0002600639088249043, + "loss": 2.0919, + "step": 10804 + }, + { + "epoch": 1.260646365651616, + "grad_norm": 1.217184066772461, + "learning_rate": 0.00026005359187453354, + "loss": 2.0176, + "step": 10805 + }, + { + "epoch": 1.2607630381519077, + "grad_norm": 1.215211272239685, + "learning_rate": 0.00026004327379879923, + "loss": 2.1176, + "step": 10806 + }, + { + "epoch": 1.2608797106521994, + "grad_norm": 1.1835435628890991, + "learning_rate": 0.0002600329545978083, + "loss": 2.0928, + "step": 10807 + }, + { + "epoch": 1.260996383152491, + "grad_norm": 1.2766836881637573, + "learning_rate": 0.0002600226342716678, + "loss": 2.0854, + "step": 10808 + }, + { + "epoch": 1.2611130556527828, + "grad_norm": 1.2625411748886108, + "learning_rate": 0.00026001231282048463, + "loss": 2.0777, + "step": 10809 + }, + { + "epoch": 1.2612297281530744, + "grad_norm": 1.325623869895935, + "learning_rate": 0.0002600019902443659, + "loss": 2.1322, + "step": 10810 + }, + { + "epoch": 1.2613464006533661, + "grad_norm": 1.2060394287109375, + "learning_rate": 0.0002599916665434186, + "loss": 2.0703, + "step": 10811 + }, + { + "epoch": 1.2614630731536578, + "grad_norm": 1.3396209478378296, + "learning_rate": 0.00025998134171774964, + "loss": 2.1562, + "step": 10812 + }, + { + "epoch": 1.2615797456539495, + "grad_norm": 1.2146605253219604, + "learning_rate": 0.0002599710157674663, + "loss": 2.1127, + "step": 10813 + }, + { + "epoch": 1.2616964181542412, + "grad_norm": 1.4418563842773438, + "learning_rate": 0.00025996068869267537, + "loss": 2.1413, + "step": 10814 + }, + { + "epoch": 1.2618130906545328, + "grad_norm": 1.0401701927185059, + "learning_rate": 0.000259950360493484, + "loss": 2.0683, + "step": 10815 + }, + { + "epoch": 1.2619297631548245, + "grad_norm": 1.153834581375122, + "learning_rate": 0.0002599400311699993, + "loss": 1.9167, + "step": 10816 + }, + { + "epoch": 1.2620464356551162, + "grad_norm": 1.2620481252670288, + "learning_rate": 0.00025992970072232836, + "loss": 2.1485, + "step": 10817 + }, + { + "epoch": 1.2621631081554079, + "grad_norm": 1.4216679334640503, + "learning_rate": 0.0002599193691505782, + "loss": 2.2999, + "step": 10818 + }, + { + "epoch": 1.2622797806556996, + "grad_norm": 1.2059199810028076, + "learning_rate": 0.000259909036454856, + "loss": 2.4019, + "step": 10819 + }, + { + "epoch": 1.2623964531559912, + "grad_norm": 1.1589967012405396, + "learning_rate": 0.00025989870263526887, + "loss": 2.1374, + "step": 10820 + }, + { + "epoch": 1.262513125656283, + "grad_norm": 1.2822157144546509, + "learning_rate": 0.00025988836769192384, + "loss": 2.246, + "step": 10821 + }, + { + "epoch": 1.2626297981565746, + "grad_norm": 1.2427079677581787, + "learning_rate": 0.0002598780316249282, + "loss": 2.0886, + "step": 10822 + }, + { + "epoch": 1.2627464706568663, + "grad_norm": 1.2748817205429077, + "learning_rate": 0.000259867694434389, + "loss": 2.1122, + "step": 10823 + }, + { + "epoch": 1.262863143157158, + "grad_norm": 1.1776304244995117, + "learning_rate": 0.00025985735612041343, + "loss": 2.013, + "step": 10824 + }, + { + "epoch": 1.2629798156574497, + "grad_norm": 1.2269586324691772, + "learning_rate": 0.0002598470166831087, + "loss": 2.2098, + "step": 10825 + }, + { + "epoch": 1.2630964881577413, + "grad_norm": 1.4066178798675537, + "learning_rate": 0.0002598366761225819, + "loss": 2.2687, + "step": 10826 + }, + { + "epoch": 1.263213160658033, + "grad_norm": 1.1112250089645386, + "learning_rate": 0.00025982633443894036, + "loss": 2.0685, + "step": 10827 + }, + { + "epoch": 1.2633298331583247, + "grad_norm": 1.1078933477401733, + "learning_rate": 0.00025981599163229125, + "loss": 2.0417, + "step": 10828 + }, + { + "epoch": 1.2634465056586164, + "grad_norm": 1.1032394170761108, + "learning_rate": 0.00025980564770274175, + "loss": 2.0751, + "step": 10829 + }, + { + "epoch": 1.263563178158908, + "grad_norm": 1.3486500978469849, + "learning_rate": 0.0002597953026503991, + "loss": 1.882, + "step": 10830 + }, + { + "epoch": 1.2636798506591997, + "grad_norm": 1.1827335357666016, + "learning_rate": 0.00025978495647537063, + "loss": 2.1988, + "step": 10831 + }, + { + "epoch": 1.2637965231594914, + "grad_norm": 1.1669859886169434, + "learning_rate": 0.00025977460917776346, + "loss": 2.0877, + "step": 10832 + }, + { + "epoch": 1.263913195659783, + "grad_norm": 1.3701584339141846, + "learning_rate": 0.0002597642607576849, + "loss": 2.1877, + "step": 10833 + }, + { + "epoch": 1.2640298681600748, + "grad_norm": 1.2265188694000244, + "learning_rate": 0.00025975391121524236, + "loss": 2.0102, + "step": 10834 + }, + { + "epoch": 1.2641465406603665, + "grad_norm": 1.0614300966262817, + "learning_rate": 0.000259743560550543, + "loss": 1.9157, + "step": 10835 + }, + { + "epoch": 1.2642632131606582, + "grad_norm": 1.4815764427185059, + "learning_rate": 0.00025973320876369416, + "loss": 2.1357, + "step": 10836 + }, + { + "epoch": 1.2643798856609498, + "grad_norm": 1.1145131587982178, + "learning_rate": 0.00025972285585480314, + "loss": 2.1822, + "step": 10837 + }, + { + "epoch": 1.2644965581612415, + "grad_norm": 1.2318390607833862, + "learning_rate": 0.0002597125018239772, + "loss": 1.987, + "step": 10838 + }, + { + "epoch": 1.2646132306615332, + "grad_norm": 1.1953672170639038, + "learning_rate": 0.0002597021466713239, + "loss": 2.144, + "step": 10839 + }, + { + "epoch": 1.2647299031618249, + "grad_norm": 1.1634044647216797, + "learning_rate": 0.0002596917903969504, + "loss": 2.2141, + "step": 10840 + }, + { + "epoch": 1.2648465756621166, + "grad_norm": 1.3145564794540405, + "learning_rate": 0.00025968143300096413, + "loss": 2.1549, + "step": 10841 + }, + { + "epoch": 1.2649632481624082, + "grad_norm": 1.1640925407409668, + "learning_rate": 0.0002596710744834724, + "loss": 2.0807, + "step": 10842 + }, + { + "epoch": 1.2650799206627, + "grad_norm": 1.106727123260498, + "learning_rate": 0.00025966071484458266, + "loss": 1.8721, + "step": 10843 + }, + { + "epoch": 1.2651965931629916, + "grad_norm": 1.2609078884124756, + "learning_rate": 0.00025965035408440237, + "loss": 2.0777, + "step": 10844 + }, + { + "epoch": 1.2653132656632833, + "grad_norm": 1.0589138269424438, + "learning_rate": 0.0002596399922030388, + "loss": 2.037, + "step": 10845 + }, + { + "epoch": 1.265429938163575, + "grad_norm": 1.3852134943008423, + "learning_rate": 0.00025962962920059944, + "loss": 1.9309, + "step": 10846 + }, + { + "epoch": 1.2655466106638666, + "grad_norm": 1.2633693218231201, + "learning_rate": 0.0002596192650771917, + "loss": 2.3344, + "step": 10847 + }, + { + "epoch": 1.2656632831641583, + "grad_norm": 1.4251972436904907, + "learning_rate": 0.000259608899832923, + "loss": 2.1467, + "step": 10848 + }, + { + "epoch": 1.26577995566445, + "grad_norm": 1.1544307470321655, + "learning_rate": 0.00025959853346790087, + "loss": 2.0521, + "step": 10849 + }, + { + "epoch": 1.2658966281647417, + "grad_norm": 1.1614335775375366, + "learning_rate": 0.00025958816598223273, + "loss": 1.9078, + "step": 10850 + }, + { + "epoch": 1.2660133006650334, + "grad_norm": 1.1060467958450317, + "learning_rate": 0.0002595777973760261, + "loss": 1.9892, + "step": 10851 + }, + { + "epoch": 1.266129973165325, + "grad_norm": 1.3258744478225708, + "learning_rate": 0.00025956742764938844, + "loss": 2.0824, + "step": 10852 + }, + { + "epoch": 1.2662466456656167, + "grad_norm": 1.0629652738571167, + "learning_rate": 0.00025955705680242723, + "loss": 1.9273, + "step": 10853 + }, + { + "epoch": 1.2663633181659084, + "grad_norm": 1.1146125793457031, + "learning_rate": 0.00025954668483524996, + "loss": 1.9652, + "step": 10854 + }, + { + "epoch": 1.2664799906662, + "grad_norm": 1.1174362897872925, + "learning_rate": 0.00025953631174796424, + "loss": 2.0003, + "step": 10855 + }, + { + "epoch": 1.2665966631664918, + "grad_norm": 1.1447149515151978, + "learning_rate": 0.00025952593754067756, + "loss": 2.0251, + "step": 10856 + }, + { + "epoch": 1.2667133356667835, + "grad_norm": 1.283370852470398, + "learning_rate": 0.0002595155622134975, + "loss": 2.1862, + "step": 10857 + }, + { + "epoch": 1.2668300081670751, + "grad_norm": 1.3902214765548706, + "learning_rate": 0.00025950518576653154, + "loss": 2.1995, + "step": 10858 + }, + { + "epoch": 1.2669466806673668, + "grad_norm": 1.0774221420288086, + "learning_rate": 0.00025949480819988736, + "loss": 2.1506, + "step": 10859 + }, + { + "epoch": 1.2670633531676585, + "grad_norm": 1.2255946397781372, + "learning_rate": 0.00025948442951367244, + "loss": 2.199, + "step": 10860 + }, + { + "epoch": 1.2671800256679502, + "grad_norm": 1.310494065284729, + "learning_rate": 0.00025947404970799444, + "loss": 2.17, + "step": 10861 + }, + { + "epoch": 1.2672966981682419, + "grad_norm": 1.2088775634765625, + "learning_rate": 0.00025946366878296096, + "loss": 1.9978, + "step": 10862 + }, + { + "epoch": 1.2674133706685335, + "grad_norm": 1.1542192697525024, + "learning_rate": 0.00025945328673867966, + "loss": 2.0301, + "step": 10863 + }, + { + "epoch": 1.2675300431688252, + "grad_norm": 1.2167878150939941, + "learning_rate": 0.000259442903575258, + "loss": 2.078, + "step": 10864 + }, + { + "epoch": 1.267646715669117, + "grad_norm": 1.1708602905273438, + "learning_rate": 0.00025943251929280383, + "loss": 2.0715, + "step": 10865 + }, + { + "epoch": 1.2677633881694086, + "grad_norm": 1.069606065750122, + "learning_rate": 0.0002594221338914246, + "loss": 2.0738, + "step": 10866 + }, + { + "epoch": 1.2678800606697003, + "grad_norm": 1.3133573532104492, + "learning_rate": 0.00025941174737122825, + "loss": 2.2848, + "step": 10867 + }, + { + "epoch": 1.267996733169992, + "grad_norm": 1.2829831838607788, + "learning_rate": 0.0002594013597323222, + "loss": 2.0008, + "step": 10868 + }, + { + "epoch": 1.2681134056702836, + "grad_norm": 1.450797438621521, + "learning_rate": 0.0002593909709748142, + "loss": 2.1975, + "step": 10869 + }, + { + "epoch": 1.2682300781705753, + "grad_norm": 1.0968453884124756, + "learning_rate": 0.00025938058109881205, + "loss": 2.1097, + "step": 10870 + }, + { + "epoch": 1.268346750670867, + "grad_norm": 1.291693925857544, + "learning_rate": 0.00025937019010442334, + "loss": 2.1364, + "step": 10871 + }, + { + "epoch": 1.2684634231711587, + "grad_norm": 1.2487152814865112, + "learning_rate": 0.0002593597979917558, + "loss": 2.2287, + "step": 10872 + }, + { + "epoch": 1.2685800956714504, + "grad_norm": 1.1459777355194092, + "learning_rate": 0.00025934940476091725, + "loss": 2.0881, + "step": 10873 + }, + { + "epoch": 1.268696768171742, + "grad_norm": 1.2311618328094482, + "learning_rate": 0.0002593390104120154, + "loss": 2.1318, + "step": 10874 + }, + { + "epoch": 1.2688134406720337, + "grad_norm": 1.4066720008850098, + "learning_rate": 0.000259328614945158, + "loss": 2.0674, + "step": 10875 + }, + { + "epoch": 1.2689301131723254, + "grad_norm": 1.2916942834854126, + "learning_rate": 0.0002593182183604528, + "loss": 2.1184, + "step": 10876 + }, + { + "epoch": 1.269046785672617, + "grad_norm": 1.448833703994751, + "learning_rate": 0.0002593078206580076, + "loss": 2.0015, + "step": 10877 + }, + { + "epoch": 1.2691634581729088, + "grad_norm": 1.3295177221298218, + "learning_rate": 0.0002592974218379302, + "loss": 1.9939, + "step": 10878 + }, + { + "epoch": 1.2692801306732004, + "grad_norm": 1.0476195812225342, + "learning_rate": 0.0002592870219003284, + "loss": 2.0401, + "step": 10879 + }, + { + "epoch": 1.2693968031734921, + "grad_norm": 1.164620041847229, + "learning_rate": 0.00025927662084531, + "loss": 2.1081, + "step": 10880 + }, + { + "epoch": 1.2695134756737838, + "grad_norm": 1.2289166450500488, + "learning_rate": 0.00025926621867298285, + "loss": 2.0474, + "step": 10881 + }, + { + "epoch": 1.2696301481740755, + "grad_norm": 1.3807452917099, + "learning_rate": 0.0002592558153834547, + "loss": 1.9458, + "step": 10882 + }, + { + "epoch": 1.2697468206743672, + "grad_norm": 1.339490532875061, + "learning_rate": 0.0002592454109768335, + "loss": 2.0927, + "step": 10883 + }, + { + "epoch": 1.2698634931746589, + "grad_norm": 1.0927680730819702, + "learning_rate": 0.0002592350054532271, + "loss": 2.0585, + "step": 10884 + }, + { + "epoch": 1.2699801656749505, + "grad_norm": 1.4476802349090576, + "learning_rate": 0.0002592245988127434, + "loss": 2.297, + "step": 10885 + }, + { + "epoch": 1.2700968381752422, + "grad_norm": 1.1843403577804565, + "learning_rate": 0.0002592141910554902, + "loss": 2.0889, + "step": 10886 + }, + { + "epoch": 1.270213510675534, + "grad_norm": 1.0534558296203613, + "learning_rate": 0.00025920378218157543, + "loss": 1.8448, + "step": 10887 + }, + { + "epoch": 1.2703301831758256, + "grad_norm": 1.048046588897705, + "learning_rate": 0.00025919337219110694, + "loss": 2.0085, + "step": 10888 + }, + { + "epoch": 1.2704468556761173, + "grad_norm": 1.218740463256836, + "learning_rate": 0.0002591829610841928, + "loss": 2.0713, + "step": 10889 + }, + { + "epoch": 1.270563528176409, + "grad_norm": 1.2000203132629395, + "learning_rate": 0.00025917254886094086, + "loss": 2.0725, + "step": 10890 + }, + { + "epoch": 1.2706802006767006, + "grad_norm": 1.3546710014343262, + "learning_rate": 0.000259162135521459, + "loss": 2.2043, + "step": 10891 + }, + { + "epoch": 1.2707968731769923, + "grad_norm": 1.1443482637405396, + "learning_rate": 0.0002591517210658553, + "loss": 2.1344, + "step": 10892 + }, + { + "epoch": 1.270913545677284, + "grad_norm": 1.0884888172149658, + "learning_rate": 0.00025914130549423755, + "loss": 2.1279, + "step": 10893 + }, + { + "epoch": 1.2710302181775757, + "grad_norm": 1.143415927886963, + "learning_rate": 0.00025913088880671393, + "loss": 2.1444, + "step": 10894 + }, + { + "epoch": 1.2711468906778673, + "grad_norm": 1.1110819578170776, + "learning_rate": 0.00025912047100339227, + "loss": 2.024, + "step": 10895 + }, + { + "epoch": 1.271263563178159, + "grad_norm": 1.142222285270691, + "learning_rate": 0.00025911005208438065, + "loss": 2.0657, + "step": 10896 + }, + { + "epoch": 1.2713802356784507, + "grad_norm": 1.1864198446273804, + "learning_rate": 0.00025909963204978707, + "loss": 1.9479, + "step": 10897 + }, + { + "epoch": 1.2714969081787424, + "grad_norm": 1.505826711654663, + "learning_rate": 0.00025908921089971956, + "loss": 2.2535, + "step": 10898 + }, + { + "epoch": 1.271613580679034, + "grad_norm": 1.1910529136657715, + "learning_rate": 0.0002590787886342862, + "loss": 2.0298, + "step": 10899 + }, + { + "epoch": 1.2717302531793258, + "grad_norm": 1.0342925786972046, + "learning_rate": 0.00025906836525359483, + "loss": 2.0455, + "step": 10900 + }, + { + "epoch": 1.2718469256796174, + "grad_norm": 1.2626593112945557, + "learning_rate": 0.00025905794075775374, + "loss": 2.0273, + "step": 10901 + }, + { + "epoch": 1.2719635981799091, + "grad_norm": 1.1522905826568604, + "learning_rate": 0.00025904751514687095, + "loss": 2.0429, + "step": 10902 + }, + { + "epoch": 1.2720802706802008, + "grad_norm": 1.189144253730774, + "learning_rate": 0.00025903708842105443, + "loss": 1.935, + "step": 10903 + }, + { + "epoch": 1.2721969431804925, + "grad_norm": 1.3584617376327515, + "learning_rate": 0.00025902666058041246, + "loss": 2.1097, + "step": 10904 + }, + { + "epoch": 1.2723136156807842, + "grad_norm": 1.2994418144226074, + "learning_rate": 0.00025901623162505295, + "loss": 1.9535, + "step": 10905 + }, + { + "epoch": 1.2724302881810758, + "grad_norm": 1.2444558143615723, + "learning_rate": 0.0002590058015550841, + "loss": 2.0159, + "step": 10906 + }, + { + "epoch": 1.2725469606813675, + "grad_norm": 1.2249408960342407, + "learning_rate": 0.0002589953703706141, + "loss": 1.9038, + "step": 10907 + }, + { + "epoch": 1.2726636331816592, + "grad_norm": 1.2527432441711426, + "learning_rate": 0.0002589849380717509, + "loss": 2.0725, + "step": 10908 + }, + { + "epoch": 1.2727803056819509, + "grad_norm": 1.1597893238067627, + "learning_rate": 0.0002589745046586029, + "loss": 2.2926, + "step": 10909 + }, + { + "epoch": 1.2728969781822426, + "grad_norm": 1.0903905630111694, + "learning_rate": 0.0002589640701312781, + "loss": 2.1313, + "step": 10910 + }, + { + "epoch": 1.2730136506825342, + "grad_norm": 1.1263004541397095, + "learning_rate": 0.0002589536344898847, + "loss": 1.9187, + "step": 10911 + }, + { + "epoch": 1.273130323182826, + "grad_norm": 1.4319616556167603, + "learning_rate": 0.00025894319773453093, + "loss": 2.2441, + "step": 10912 + }, + { + "epoch": 1.2732469956831176, + "grad_norm": 1.042878270149231, + "learning_rate": 0.0002589327598653249, + "loss": 2.0708, + "step": 10913 + }, + { + "epoch": 1.2733636681834093, + "grad_norm": 1.3089011907577515, + "learning_rate": 0.0002589223208823749, + "loss": 2.1332, + "step": 10914 + }, + { + "epoch": 1.273480340683701, + "grad_norm": 1.0816572904586792, + "learning_rate": 0.0002589118807857892, + "loss": 2.1351, + "step": 10915 + }, + { + "epoch": 1.2735970131839927, + "grad_norm": 0.9533092379570007, + "learning_rate": 0.00025890143957567585, + "loss": 1.9949, + "step": 10916 + }, + { + "epoch": 1.2737136856842843, + "grad_norm": 1.2107771635055542, + "learning_rate": 0.0002588909972521433, + "loss": 1.9897, + "step": 10917 + }, + { + "epoch": 1.273830358184576, + "grad_norm": 1.2028968334197998, + "learning_rate": 0.00025888055381529957, + "loss": 2.038, + "step": 10918 + }, + { + "epoch": 1.2739470306848677, + "grad_norm": 1.1657699346542358, + "learning_rate": 0.00025887010926525317, + "loss": 2.0102, + "step": 10919 + }, + { + "epoch": 1.2740637031851594, + "grad_norm": 1.2599341869354248, + "learning_rate": 0.0002588596636021122, + "loss": 2.0309, + "step": 10920 + }, + { + "epoch": 1.274180375685451, + "grad_norm": 1.154956579208374, + "learning_rate": 0.000258849216825985, + "loss": 2.0062, + "step": 10921 + }, + { + "epoch": 1.2742970481857427, + "grad_norm": 1.264689564704895, + "learning_rate": 0.00025883876893697996, + "loss": 2.2823, + "step": 10922 + }, + { + "epoch": 1.2744137206860344, + "grad_norm": 1.161481261253357, + "learning_rate": 0.00025882831993520524, + "loss": 2.0793, + "step": 10923 + }, + { + "epoch": 1.274530393186326, + "grad_norm": 1.1602535247802734, + "learning_rate": 0.0002588178698207693, + "loss": 1.9133, + "step": 10924 + }, + { + "epoch": 1.2746470656866178, + "grad_norm": 1.1578538417816162, + "learning_rate": 0.00025880741859378036, + "loss": 2.1006, + "step": 10925 + }, + { + "epoch": 1.2747637381869095, + "grad_norm": 1.2714512348175049, + "learning_rate": 0.0002587969662543469, + "loss": 2.0566, + "step": 10926 + }, + { + "epoch": 1.2748804106872011, + "grad_norm": 1.181373119354248, + "learning_rate": 0.00025878651280257715, + "loss": 2.0384, + "step": 10927 + }, + { + "epoch": 1.2749970831874928, + "grad_norm": 1.1028110980987549, + "learning_rate": 0.00025877605823857954, + "loss": 2.0369, + "step": 10928 + }, + { + "epoch": 1.2751137556877845, + "grad_norm": 1.2224757671356201, + "learning_rate": 0.00025876560256246243, + "loss": 2.1237, + "step": 10929 + }, + { + "epoch": 1.2752304281880762, + "grad_norm": 1.2068063020706177, + "learning_rate": 0.0002587551457743343, + "loss": 2.0562, + "step": 10930 + }, + { + "epoch": 1.2753471006883679, + "grad_norm": 1.1961878538131714, + "learning_rate": 0.00025874468787430335, + "loss": 2.0243, + "step": 10931 + }, + { + "epoch": 1.2754637731886596, + "grad_norm": 1.1102590560913086, + "learning_rate": 0.0002587342288624782, + "loss": 2.1853, + "step": 10932 + }, + { + "epoch": 1.2755804456889512, + "grad_norm": 1.371931791305542, + "learning_rate": 0.0002587237687389672, + "loss": 2.1861, + "step": 10933 + }, + { + "epoch": 1.275697118189243, + "grad_norm": 1.2042964696884155, + "learning_rate": 0.0002587133075038788, + "loss": 1.9477, + "step": 10934 + }, + { + "epoch": 1.2758137906895346, + "grad_norm": 1.106960415840149, + "learning_rate": 0.00025870284515732144, + "loss": 1.9522, + "step": 10935 + }, + { + "epoch": 1.2759304631898263, + "grad_norm": 1.1636022329330444, + "learning_rate": 0.00025869238169940357, + "loss": 2.2027, + "step": 10936 + }, + { + "epoch": 1.276047135690118, + "grad_norm": 1.2873235940933228, + "learning_rate": 0.0002586819171302337, + "loss": 2.1585, + "step": 10937 + }, + { + "epoch": 1.2761638081904096, + "grad_norm": 1.1256550550460815, + "learning_rate": 0.00025867145144992023, + "loss": 2.0593, + "step": 10938 + }, + { + "epoch": 1.2762804806907013, + "grad_norm": 1.1375787258148193, + "learning_rate": 0.00025866098465857177, + "loss": 2.1448, + "step": 10939 + }, + { + "epoch": 1.276397153190993, + "grad_norm": 1.28468656539917, + "learning_rate": 0.0002586505167562968, + "loss": 2.0953, + "step": 10940 + }, + { + "epoch": 1.2765138256912847, + "grad_norm": 1.0312342643737793, + "learning_rate": 0.00025864004774320374, + "loss": 1.9261, + "step": 10941 + }, + { + "epoch": 1.2766304981915764, + "grad_norm": 1.1322147846221924, + "learning_rate": 0.00025862957761940133, + "loss": 2.153, + "step": 10942 + }, + { + "epoch": 1.276747170691868, + "grad_norm": 1.051142930984497, + "learning_rate": 0.0002586191063849979, + "loss": 2.0199, + "step": 10943 + }, + { + "epoch": 1.2768638431921597, + "grad_norm": 1.2785836458206177, + "learning_rate": 0.0002586086340401021, + "loss": 2.1469, + "step": 10944 + }, + { + "epoch": 1.2769805156924514, + "grad_norm": 1.172794222831726, + "learning_rate": 0.0002585981605848224, + "loss": 2.0453, + "step": 10945 + }, + { + "epoch": 1.277097188192743, + "grad_norm": 1.2908509969711304, + "learning_rate": 0.0002585876860192676, + "loss": 2.1702, + "step": 10946 + }, + { + "epoch": 1.2772138606930348, + "grad_norm": 1.0826970338821411, + "learning_rate": 0.00025857721034354604, + "loss": 2.0599, + "step": 10947 + }, + { + "epoch": 1.2773305331933265, + "grad_norm": 1.2258607149124146, + "learning_rate": 0.00025856673355776645, + "loss": 1.91, + "step": 10948 + }, + { + "epoch": 1.2774472056936181, + "grad_norm": 1.144421935081482, + "learning_rate": 0.00025855625566203745, + "loss": 2.1035, + "step": 10949 + }, + { + "epoch": 1.2775638781939098, + "grad_norm": 1.3353291749954224, + "learning_rate": 0.0002585457766564676, + "loss": 2.1595, + "step": 10950 + }, + { + "epoch": 1.2776805506942015, + "grad_norm": 1.149709939956665, + "learning_rate": 0.0002585352965411656, + "loss": 2.0036, + "step": 10951 + }, + { + "epoch": 1.2777972231944932, + "grad_norm": 1.2091504335403442, + "learning_rate": 0.00025852481531624, + "loss": 2.0703, + "step": 10952 + }, + { + "epoch": 1.2779138956947849, + "grad_norm": 1.0837056636810303, + "learning_rate": 0.00025851433298179963, + "loss": 2.1031, + "step": 10953 + }, + { + "epoch": 1.2780305681950765, + "grad_norm": 1.1568025350570679, + "learning_rate": 0.00025850384953795294, + "loss": 1.9145, + "step": 10954 + }, + { + "epoch": 1.2781472406953682, + "grad_norm": 1.1734040975570679, + "learning_rate": 0.0002584933649848088, + "loss": 2.1352, + "step": 10955 + }, + { + "epoch": 1.27826391319566, + "grad_norm": 1.2262822389602661, + "learning_rate": 0.0002584828793224758, + "loss": 2.17, + "step": 10956 + }, + { + "epoch": 1.2783805856959516, + "grad_norm": 1.2373199462890625, + "learning_rate": 0.0002584723925510627, + "loss": 1.9792, + "step": 10957 + }, + { + "epoch": 1.2784972581962433, + "grad_norm": 1.3222172260284424, + "learning_rate": 0.0002584619046706781, + "loss": 2.0424, + "step": 10958 + }, + { + "epoch": 1.278613930696535, + "grad_norm": 1.4100204706192017, + "learning_rate": 0.0002584514156814309, + "loss": 2.1705, + "step": 10959 + }, + { + "epoch": 1.2787306031968266, + "grad_norm": 1.1162197589874268, + "learning_rate": 0.00025844092558342964, + "loss": 2.0122, + "step": 10960 + }, + { + "epoch": 1.2788472756971183, + "grad_norm": 1.2839478254318237, + "learning_rate": 0.00025843043437678327, + "loss": 2.1573, + "step": 10961 + }, + { + "epoch": 1.27896394819741, + "grad_norm": 1.113491415977478, + "learning_rate": 0.0002584199420616004, + "loss": 2.2329, + "step": 10962 + }, + { + "epoch": 1.2790806206977017, + "grad_norm": 1.2139872312545776, + "learning_rate": 0.0002584094486379899, + "loss": 1.9145, + "step": 10963 + }, + { + "epoch": 1.2791972931979934, + "grad_norm": 1.3753503561019897, + "learning_rate": 0.00025839895410606045, + "loss": 2.1664, + "step": 10964 + }, + { + "epoch": 1.279313965698285, + "grad_norm": 1.00276517868042, + "learning_rate": 0.00025838845846592096, + "loss": 1.978, + "step": 10965 + }, + { + "epoch": 1.2794306381985767, + "grad_norm": 1.1981416940689087, + "learning_rate": 0.0002583779617176802, + "loss": 2.114, + "step": 10966 + }, + { + "epoch": 1.2795473106988684, + "grad_norm": 1.360906720161438, + "learning_rate": 0.000258367463861447, + "loss": 2.1047, + "step": 10967 + }, + { + "epoch": 1.27966398319916, + "grad_norm": 1.3026775121688843, + "learning_rate": 0.0002583569648973301, + "loss": 2.0874, + "step": 10968 + }, + { + "epoch": 1.2797806556994518, + "grad_norm": 1.1822541952133179, + "learning_rate": 0.00025834646482543846, + "loss": 2.0656, + "step": 10969 + }, + { + "epoch": 1.2798973281997434, + "grad_norm": 1.1692310571670532, + "learning_rate": 0.00025833596364588086, + "loss": 2.141, + "step": 10970 + }, + { + "epoch": 1.2800140007000351, + "grad_norm": 1.3128583431243896, + "learning_rate": 0.00025832546135876624, + "loss": 2.1402, + "step": 10971 + }, + { + "epoch": 1.2801306732003268, + "grad_norm": 1.0617375373840332, + "learning_rate": 0.00025831495796420335, + "loss": 2.1541, + "step": 10972 + }, + { + "epoch": 1.2802473457006185, + "grad_norm": 1.1163281202316284, + "learning_rate": 0.0002583044534623012, + "loss": 2.1441, + "step": 10973 + }, + { + "epoch": 1.2803640182009102, + "grad_norm": 1.0963972806930542, + "learning_rate": 0.00025829394785316855, + "loss": 2.0489, + "step": 10974 + }, + { + "epoch": 1.2804806907012019, + "grad_norm": 1.275307059288025, + "learning_rate": 0.0002582834411369145, + "loss": 2.0824, + "step": 10975 + }, + { + "epoch": 1.2805973632014935, + "grad_norm": 1.1132538318634033, + "learning_rate": 0.0002582729333136478, + "loss": 2.0418, + "step": 10976 + }, + { + "epoch": 1.2807140357017852, + "grad_norm": 1.2514748573303223, + "learning_rate": 0.0002582624243834775, + "loss": 2.313, + "step": 10977 + }, + { + "epoch": 1.280830708202077, + "grad_norm": 1.1176973581314087, + "learning_rate": 0.0002582519143465125, + "loss": 2.1353, + "step": 10978 + }, + { + "epoch": 1.2809473807023686, + "grad_norm": 1.2467012405395508, + "learning_rate": 0.00025824140320286176, + "loss": 2.0951, + "step": 10979 + }, + { + "epoch": 1.2810640532026603, + "grad_norm": 1.4200289249420166, + "learning_rate": 0.00025823089095263423, + "loss": 2.1914, + "step": 10980 + }, + { + "epoch": 1.281180725702952, + "grad_norm": 1.122018575668335, + "learning_rate": 0.00025822037759593887, + "loss": 1.8769, + "step": 10981 + }, + { + "epoch": 1.2812973982032436, + "grad_norm": 1.263465166091919, + "learning_rate": 0.00025820986313288474, + "loss": 2.2154, + "step": 10982 + }, + { + "epoch": 1.2814140707035353, + "grad_norm": 1.1820108890533447, + "learning_rate": 0.0002581993475635808, + "loss": 1.9707, + "step": 10983 + }, + { + "epoch": 1.281530743203827, + "grad_norm": 1.0975756645202637, + "learning_rate": 0.0002581888308881361, + "loss": 2.0861, + "step": 10984 + }, + { + "epoch": 1.2816474157041187, + "grad_norm": 1.1618149280548096, + "learning_rate": 0.0002581783131066596, + "loss": 1.9615, + "step": 10985 + }, + { + "epoch": 1.2817640882044103, + "grad_norm": 1.3323107957839966, + "learning_rate": 0.0002581677942192604, + "loss": 2.0837, + "step": 10986 + }, + { + "epoch": 1.281880760704702, + "grad_norm": 1.2154875993728638, + "learning_rate": 0.0002581572742260475, + "loss": 2.1274, + "step": 10987 + }, + { + "epoch": 1.2819974332049937, + "grad_norm": 1.1281965970993042, + "learning_rate": 0.00025814675312713, + "loss": 2.0804, + "step": 10988 + }, + { + "epoch": 1.2821141057052854, + "grad_norm": 1.0405112504959106, + "learning_rate": 0.00025813623092261687, + "loss": 2.1123, + "step": 10989 + }, + { + "epoch": 1.282230778205577, + "grad_norm": 1.189231038093567, + "learning_rate": 0.00025812570761261737, + "loss": 2.1915, + "step": 10990 + }, + { + "epoch": 1.2823474507058688, + "grad_norm": 1.1390472650527954, + "learning_rate": 0.0002581151831972405, + "loss": 2.1699, + "step": 10991 + }, + { + "epoch": 1.2824641232061604, + "grad_norm": 1.1092356443405151, + "learning_rate": 0.00025810465767659536, + "loss": 2.1593, + "step": 10992 + }, + { + "epoch": 1.2825807957064521, + "grad_norm": 1.1430140733718872, + "learning_rate": 0.000258094131050791, + "loss": 2.0813, + "step": 10993 + }, + { + "epoch": 1.2826974682067438, + "grad_norm": 1.0950801372528076, + "learning_rate": 0.00025808360331993664, + "loss": 2.0436, + "step": 10994 + }, + { + "epoch": 1.2828141407070355, + "grad_norm": 1.1899707317352295, + "learning_rate": 0.00025807307448414145, + "loss": 2.106, + "step": 10995 + }, + { + "epoch": 1.2829308132073272, + "grad_norm": 1.381303310394287, + "learning_rate": 0.00025806254454351454, + "loss": 2.0833, + "step": 10996 + }, + { + "epoch": 1.2830474857076188, + "grad_norm": 1.1174367666244507, + "learning_rate": 0.000258052013498165, + "loss": 2.1104, + "step": 10997 + }, + { + "epoch": 1.2831641582079105, + "grad_norm": 1.1395034790039062, + "learning_rate": 0.00025804148134820205, + "loss": 1.9628, + "step": 10998 + }, + { + "epoch": 1.2832808307082022, + "grad_norm": 1.0255677700042725, + "learning_rate": 0.00025803094809373494, + "loss": 2.1576, + "step": 10999 + }, + { + "epoch": 1.2833975032084939, + "grad_norm": 1.2924786806106567, + "learning_rate": 0.00025802041373487283, + "loss": 2.0612, + "step": 11000 + }, + { + "epoch": 1.2835141757087856, + "grad_norm": 1.2261440753936768, + "learning_rate": 0.0002580098782717249, + "loss": 2.091, + "step": 11001 + }, + { + "epoch": 1.2836308482090772, + "grad_norm": 1.2514523267745972, + "learning_rate": 0.00025799934170440036, + "loss": 2.0221, + "step": 11002 + }, + { + "epoch": 1.283747520709369, + "grad_norm": 1.2169928550720215, + "learning_rate": 0.00025798880403300853, + "loss": 2.0904, + "step": 11003 + }, + { + "epoch": 1.2838641932096606, + "grad_norm": 1.097341775894165, + "learning_rate": 0.0002579782652576586, + "loss": 2.1279, + "step": 11004 + }, + { + "epoch": 1.2839808657099523, + "grad_norm": 1.2977346181869507, + "learning_rate": 0.0002579677253784598, + "loss": 2.0726, + "step": 11005 + }, + { + "epoch": 1.284097538210244, + "grad_norm": 1.1438133716583252, + "learning_rate": 0.0002579571843955213, + "loss": 2.1373, + "step": 11006 + }, + { + "epoch": 1.2842142107105357, + "grad_norm": 1.1843104362487793, + "learning_rate": 0.00025794664230895266, + "loss": 2.1281, + "step": 11007 + }, + { + "epoch": 1.2843308832108273, + "grad_norm": 1.2156437635421753, + "learning_rate": 0.0002579360991188629, + "loss": 1.9988, + "step": 11008 + }, + { + "epoch": 1.284447555711119, + "grad_norm": 1.4136021137237549, + "learning_rate": 0.0002579255548253615, + "loss": 1.9891, + "step": 11009 + }, + { + "epoch": 1.2845642282114107, + "grad_norm": 1.1324480772018433, + "learning_rate": 0.0002579150094285576, + "loss": 2.0762, + "step": 11010 + }, + { + "epoch": 1.2846809007117024, + "grad_norm": 1.4937598705291748, + "learning_rate": 0.00025790446292856065, + "loss": 1.904, + "step": 11011 + }, + { + "epoch": 1.284797573211994, + "grad_norm": 1.216188669204712, + "learning_rate": 0.00025789391532547994, + "loss": 1.9703, + "step": 11012 + }, + { + "epoch": 1.2849142457122857, + "grad_norm": 0.9807082414627075, + "learning_rate": 0.0002578833666194248, + "loss": 1.8546, + "step": 11013 + }, + { + "epoch": 1.2850309182125774, + "grad_norm": 1.1236140727996826, + "learning_rate": 0.0002578728168105047, + "loss": 2.1131, + "step": 11014 + }, + { + "epoch": 1.285147590712869, + "grad_norm": 1.020220398902893, + "learning_rate": 0.0002578622658988288, + "loss": 2.0052, + "step": 11015 + }, + { + "epoch": 1.2852642632131608, + "grad_norm": 1.4420236349105835, + "learning_rate": 0.0002578517138845067, + "loss": 2.2058, + "step": 11016 + }, + { + "epoch": 1.2853809357134525, + "grad_norm": 1.2657145261764526, + "learning_rate": 0.0002578411607676477, + "loss": 1.983, + "step": 11017 + }, + { + "epoch": 1.2854976082137441, + "grad_norm": 1.0914061069488525, + "learning_rate": 0.00025783060654836114, + "loss": 1.9768, + "step": 11018 + }, + { + "epoch": 1.2856142807140358, + "grad_norm": 1.3879852294921875, + "learning_rate": 0.0002578200512267565, + "loss": 2.1691, + "step": 11019 + }, + { + "epoch": 1.2857309532143275, + "grad_norm": 1.2362910509109497, + "learning_rate": 0.00025780949480294315, + "loss": 1.9426, + "step": 11020 + }, + { + "epoch": 1.2858476257146192, + "grad_norm": 1.1107614040374756, + "learning_rate": 0.0002577989372770306, + "loss": 2.1792, + "step": 11021 + }, + { + "epoch": 1.2859642982149109, + "grad_norm": 1.267394781112671, + "learning_rate": 0.0002577883786491284, + "loss": 2.0629, + "step": 11022 + }, + { + "epoch": 1.2860809707152026, + "grad_norm": 1.1955616474151611, + "learning_rate": 0.00025777781891934574, + "loss": 2.0453, + "step": 11023 + }, + { + "epoch": 1.2861976432154942, + "grad_norm": 1.3401732444763184, + "learning_rate": 0.00025776725808779224, + "loss": 1.9708, + "step": 11024 + }, + { + "epoch": 1.286314315715786, + "grad_norm": 1.213603138923645, + "learning_rate": 0.00025775669615457744, + "loss": 2.2796, + "step": 11025 + }, + { + "epoch": 1.2864309882160776, + "grad_norm": 1.0659795999526978, + "learning_rate": 0.00025774613311981074, + "loss": 2.0543, + "step": 11026 + }, + { + "epoch": 1.2865476607163693, + "grad_norm": 1.304368019104004, + "learning_rate": 0.00025773556898360165, + "loss": 2.1478, + "step": 11027 + }, + { + "epoch": 1.286664333216661, + "grad_norm": 1.3227529525756836, + "learning_rate": 0.00025772500374605977, + "loss": 2.1439, + "step": 11028 + }, + { + "epoch": 1.2867810057169526, + "grad_norm": 1.111536979675293, + "learning_rate": 0.0002577144374072945, + "loss": 2.1044, + "step": 11029 + }, + { + "epoch": 1.2868976782172443, + "grad_norm": 1.2689603567123413, + "learning_rate": 0.0002577038699674155, + "loss": 1.9957, + "step": 11030 + }, + { + "epoch": 1.287014350717536, + "grad_norm": 1.058388352394104, + "learning_rate": 0.00025769330142653225, + "loss": 1.9745, + "step": 11031 + }, + { + "epoch": 1.2871310232178277, + "grad_norm": 1.0775691270828247, + "learning_rate": 0.00025768273178475434, + "loss": 1.9649, + "step": 11032 + }, + { + "epoch": 1.2872476957181194, + "grad_norm": 1.1944352388381958, + "learning_rate": 0.0002576721610421913, + "loss": 1.9441, + "step": 11033 + }, + { + "epoch": 1.287364368218411, + "grad_norm": 1.1390354633331299, + "learning_rate": 0.00025766158919895293, + "loss": 2.1506, + "step": 11034 + }, + { + "epoch": 1.2874810407187027, + "grad_norm": 1.240143895149231, + "learning_rate": 0.0002576510162551485, + "loss": 2.1742, + "step": 11035 + }, + { + "epoch": 1.2875977132189944, + "grad_norm": 1.0251171588897705, + "learning_rate": 0.0002576404422108878, + "loss": 2.1008, + "step": 11036 + }, + { + "epoch": 1.287714385719286, + "grad_norm": 1.0477237701416016, + "learning_rate": 0.0002576298670662804, + "loss": 1.9615, + "step": 11037 + }, + { + "epoch": 1.2878310582195778, + "grad_norm": 1.1816997528076172, + "learning_rate": 0.00025761929082143605, + "loss": 2.1582, + "step": 11038 + }, + { + "epoch": 1.2879477307198695, + "grad_norm": 1.1553014516830444, + "learning_rate": 0.0002576087134764642, + "loss": 2.1175, + "step": 11039 + }, + { + "epoch": 1.2880644032201611, + "grad_norm": 1.4649373292922974, + "learning_rate": 0.00025759813503147466, + "loss": 2.1173, + "step": 11040 + }, + { + "epoch": 1.2881810757204528, + "grad_norm": 1.1027119159698486, + "learning_rate": 0.000257587555486577, + "loss": 1.9111, + "step": 11041 + }, + { + "epoch": 1.2882977482207445, + "grad_norm": 1.2807775735855103, + "learning_rate": 0.00025757697484188097, + "loss": 2.0343, + "step": 11042 + }, + { + "epoch": 1.2884144207210362, + "grad_norm": 1.1536951065063477, + "learning_rate": 0.00025756639309749616, + "loss": 2.1205, + "step": 11043 + }, + { + "epoch": 1.2885310932213279, + "grad_norm": 1.3937468528747559, + "learning_rate": 0.00025755581025353237, + "loss": 2.0349, + "step": 11044 + }, + { + "epoch": 1.2886477657216195, + "grad_norm": 1.2007206678390503, + "learning_rate": 0.00025754522631009927, + "loss": 2.1643, + "step": 11045 + }, + { + "epoch": 1.2887644382219112, + "grad_norm": 1.0504670143127441, + "learning_rate": 0.0002575346412673066, + "loss": 2.2232, + "step": 11046 + }, + { + "epoch": 1.288881110722203, + "grad_norm": 1.2191630601882935, + "learning_rate": 0.00025752405512526413, + "loss": 2.0095, + "step": 11047 + }, + { + "epoch": 1.2889977832224946, + "grad_norm": 1.3424091339111328, + "learning_rate": 0.0002575134678840815, + "loss": 2.0133, + "step": 11048 + }, + { + "epoch": 1.2891144557227863, + "grad_norm": 1.2125996351242065, + "learning_rate": 0.0002575028795438685, + "loss": 2.0998, + "step": 11049 + }, + { + "epoch": 1.289231128223078, + "grad_norm": 1.2383095026016235, + "learning_rate": 0.0002574922901047349, + "loss": 2.0141, + "step": 11050 + }, + { + "epoch": 1.2893478007233696, + "grad_norm": 1.145959734916687, + "learning_rate": 0.00025748169956679056, + "loss": 1.9024, + "step": 11051 + }, + { + "epoch": 1.2894644732236613, + "grad_norm": 1.520267367362976, + "learning_rate": 0.0002574711079301452, + "loss": 2.2517, + "step": 11052 + }, + { + "epoch": 1.289581145723953, + "grad_norm": 1.1801398992538452, + "learning_rate": 0.0002574605151949086, + "loss": 2.1294, + "step": 11053 + }, + { + "epoch": 1.2896978182242447, + "grad_norm": 1.0509498119354248, + "learning_rate": 0.00025744992136119067, + "loss": 2.0495, + "step": 11054 + }, + { + "epoch": 1.2898144907245364, + "grad_norm": 1.1610521078109741, + "learning_rate": 0.00025743932642910113, + "loss": 2.0857, + "step": 11055 + }, + { + "epoch": 1.289931163224828, + "grad_norm": 1.1042388677597046, + "learning_rate": 0.0002574287303987498, + "loss": 1.9981, + "step": 11056 + }, + { + "epoch": 1.2900478357251197, + "grad_norm": 1.2366105318069458, + "learning_rate": 0.0002574181332702467, + "loss": 2.0886, + "step": 11057 + }, + { + "epoch": 1.2901645082254114, + "grad_norm": 1.1121995449066162, + "learning_rate": 0.00025740753504370153, + "loss": 2.0795, + "step": 11058 + }, + { + "epoch": 1.290281180725703, + "grad_norm": 1.239285945892334, + "learning_rate": 0.00025739693571922423, + "loss": 2.1538, + "step": 11059 + }, + { + "epoch": 1.2903978532259948, + "grad_norm": 1.0270119905471802, + "learning_rate": 0.00025738633529692463, + "loss": 2.0444, + "step": 11060 + }, + { + "epoch": 1.2905145257262864, + "grad_norm": 1.2142226696014404, + "learning_rate": 0.00025737573377691267, + "loss": 2.1243, + "step": 11061 + }, + { + "epoch": 1.2906311982265781, + "grad_norm": 1.142907738685608, + "learning_rate": 0.0002573651311592982, + "loss": 2.0357, + "step": 11062 + }, + { + "epoch": 1.2907478707268698, + "grad_norm": 1.0940470695495605, + "learning_rate": 0.0002573545274441912, + "loss": 1.9408, + "step": 11063 + }, + { + "epoch": 1.2908645432271615, + "grad_norm": 1.1551532745361328, + "learning_rate": 0.00025734392263170155, + "loss": 2.1977, + "step": 11064 + }, + { + "epoch": 1.2909812157274532, + "grad_norm": 1.292138695716858, + "learning_rate": 0.00025733331672193925, + "loss": 2.1934, + "step": 11065 + }, + { + "epoch": 1.2910978882277448, + "grad_norm": 1.170775294303894, + "learning_rate": 0.0002573227097150142, + "loss": 1.9166, + "step": 11066 + }, + { + "epoch": 1.2912145607280365, + "grad_norm": 1.2303932905197144, + "learning_rate": 0.0002573121016110363, + "loss": 2.2294, + "step": 11067 + }, + { + "epoch": 1.2913312332283282, + "grad_norm": 1.2184913158416748, + "learning_rate": 0.0002573014924101157, + "loss": 2.1369, + "step": 11068 + }, + { + "epoch": 1.29144790572862, + "grad_norm": 1.2339245080947876, + "learning_rate": 0.00025729088211236224, + "loss": 2.1465, + "step": 11069 + }, + { + "epoch": 1.2915645782289116, + "grad_norm": 1.0770050287246704, + "learning_rate": 0.00025728027071788596, + "loss": 2.1492, + "step": 11070 + }, + { + "epoch": 1.2916812507292033, + "grad_norm": 1.1530017852783203, + "learning_rate": 0.0002572696582267968, + "loss": 2.1007, + "step": 11071 + }, + { + "epoch": 1.291797923229495, + "grad_norm": 1.1213964223861694, + "learning_rate": 0.00025725904463920496, + "loss": 1.9929, + "step": 11072 + }, + { + "epoch": 1.2919145957297866, + "grad_norm": 1.1692700386047363, + "learning_rate": 0.00025724842995522026, + "loss": 1.9128, + "step": 11073 + }, + { + "epoch": 1.2920312682300783, + "grad_norm": 1.1312990188598633, + "learning_rate": 0.0002572378141749529, + "loss": 1.8756, + "step": 11074 + }, + { + "epoch": 1.29214794073037, + "grad_norm": 1.2809888124465942, + "learning_rate": 0.00025722719729851277, + "loss": 2.1347, + "step": 11075 + }, + { + "epoch": 1.2922646132306617, + "grad_norm": 1.1462825536727905, + "learning_rate": 0.00025721657932601007, + "loss": 2.0514, + "step": 11076 + }, + { + "epoch": 1.2923812857309533, + "grad_norm": 1.1436021327972412, + "learning_rate": 0.0002572059602575549, + "loss": 2.1248, + "step": 11077 + }, + { + "epoch": 1.292497958231245, + "grad_norm": 1.2371826171875, + "learning_rate": 0.0002571953400932572, + "loss": 2.0853, + "step": 11078 + }, + { + "epoch": 1.2926146307315367, + "grad_norm": 1.2419730424880981, + "learning_rate": 0.00025718471883322714, + "loss": 1.9332, + "step": 11079 + }, + { + "epoch": 1.2927313032318284, + "grad_norm": 1.249754548072815, + "learning_rate": 0.00025717409647757494, + "loss": 2.1631, + "step": 11080 + }, + { + "epoch": 1.29284797573212, + "grad_norm": 1.1404824256896973, + "learning_rate": 0.00025716347302641055, + "loss": 2.073, + "step": 11081 + }, + { + "epoch": 1.2929646482324118, + "grad_norm": 1.1424517631530762, + "learning_rate": 0.0002571528484798442, + "loss": 2.0203, + "step": 11082 + }, + { + "epoch": 1.2930813207327034, + "grad_norm": 1.237661361694336, + "learning_rate": 0.00025714222283798594, + "loss": 2.1973, + "step": 11083 + }, + { + "epoch": 1.2931979932329951, + "grad_norm": 1.2123863697052002, + "learning_rate": 0.000257131596100946, + "loss": 2.265, + "step": 11084 + }, + { + "epoch": 1.2933146657332868, + "grad_norm": 1.0898412466049194, + "learning_rate": 0.0002571209682688346, + "loss": 2.0033, + "step": 11085 + }, + { + "epoch": 1.2934313382335785, + "grad_norm": 1.3184781074523926, + "learning_rate": 0.00025711033934176184, + "loss": 2.1455, + "step": 11086 + }, + { + "epoch": 1.2935480107338702, + "grad_norm": 1.2315436601638794, + "learning_rate": 0.0002570997093198379, + "loss": 2.0217, + "step": 11087 + }, + { + "epoch": 1.2936646832341618, + "grad_norm": 1.0381221771240234, + "learning_rate": 0.000257089078203173, + "loss": 1.9793, + "step": 11088 + }, + { + "epoch": 1.2937813557344535, + "grad_norm": 1.1525108814239502, + "learning_rate": 0.0002570784459918774, + "loss": 2.0826, + "step": 11089 + }, + { + "epoch": 1.2938980282347452, + "grad_norm": 1.3960496187210083, + "learning_rate": 0.0002570678126860612, + "loss": 2.0664, + "step": 11090 + }, + { + "epoch": 1.2940147007350369, + "grad_norm": 1.359491229057312, + "learning_rate": 0.00025705717828583476, + "loss": 2.0795, + "step": 11091 + }, + { + "epoch": 1.2941313732353286, + "grad_norm": 1.16548752784729, + "learning_rate": 0.0002570465427913083, + "loss": 1.9162, + "step": 11092 + }, + { + "epoch": 1.2942480457356202, + "grad_norm": 1.2295869588851929, + "learning_rate": 0.00025703590620259205, + "loss": 2.0533, + "step": 11093 + }, + { + "epoch": 1.294364718235912, + "grad_norm": 1.2674378156661987, + "learning_rate": 0.00025702526851979627, + "loss": 2.0942, + "step": 11094 + }, + { + "epoch": 1.2944813907362036, + "grad_norm": 1.1031063795089722, + "learning_rate": 0.0002570146297430312, + "loss": 2.11, + "step": 11095 + }, + { + "epoch": 1.2945980632364953, + "grad_norm": 1.2859954833984375, + "learning_rate": 0.00025700398987240725, + "loss": 2.2209, + "step": 11096 + }, + { + "epoch": 1.294714735736787, + "grad_norm": 1.2093868255615234, + "learning_rate": 0.00025699334890803464, + "loss": 2.109, + "step": 11097 + }, + { + "epoch": 1.2948314082370787, + "grad_norm": 1.0884274244308472, + "learning_rate": 0.00025698270685002366, + "loss": 2.0948, + "step": 11098 + }, + { + "epoch": 1.2949480807373703, + "grad_norm": 1.1911709308624268, + "learning_rate": 0.00025697206369848475, + "loss": 1.9921, + "step": 11099 + }, + { + "epoch": 1.295064753237662, + "grad_norm": 1.2308036088943481, + "learning_rate": 0.0002569614194535281, + "loss": 2.0234, + "step": 11100 + }, + { + "epoch": 1.2951814257379537, + "grad_norm": 1.1841824054718018, + "learning_rate": 0.0002569507741152642, + "loss": 2.1013, + "step": 11101 + }, + { + "epoch": 1.2952980982382454, + "grad_norm": 1.161172866821289, + "learning_rate": 0.0002569401276838033, + "loss": 1.9675, + "step": 11102 + }, + { + "epoch": 1.295414770738537, + "grad_norm": 1.1005971431732178, + "learning_rate": 0.00025692948015925587, + "loss": 2.0684, + "step": 11103 + }, + { + "epoch": 1.2955314432388287, + "grad_norm": 1.2548106908798218, + "learning_rate": 0.00025691883154173214, + "loss": 2.3253, + "step": 11104 + }, + { + "epoch": 1.2956481157391204, + "grad_norm": 1.088679552078247, + "learning_rate": 0.00025690818183134265, + "loss": 2.0915, + "step": 11105 + }, + { + "epoch": 1.295764788239412, + "grad_norm": 1.2453581094741821, + "learning_rate": 0.00025689753102819767, + "loss": 2.1056, + "step": 11106 + }, + { + "epoch": 1.2958814607397038, + "grad_norm": 1.4074594974517822, + "learning_rate": 0.0002568868791324078, + "loss": 2.1752, + "step": 11107 + }, + { + "epoch": 1.2959981332399955, + "grad_norm": 1.008345127105713, + "learning_rate": 0.0002568762261440833, + "loss": 1.769, + "step": 11108 + }, + { + "epoch": 1.2961148057402871, + "grad_norm": 1.105207920074463, + "learning_rate": 0.0002568655720633347, + "loss": 2.3821, + "step": 11109 + }, + { + "epoch": 1.2962314782405788, + "grad_norm": 1.2542433738708496, + "learning_rate": 0.0002568549168902724, + "loss": 2.0388, + "step": 11110 + }, + { + "epoch": 1.2963481507408705, + "grad_norm": 1.168134093284607, + "learning_rate": 0.0002568442606250069, + "loss": 2.154, + "step": 11111 + }, + { + "epoch": 1.2964648232411622, + "grad_norm": 1.1861186027526855, + "learning_rate": 0.0002568336032676486, + "loss": 2.1028, + "step": 11112 + }, + { + "epoch": 1.2965814957414539, + "grad_norm": 1.3169267177581787, + "learning_rate": 0.00025682294481830804, + "loss": 2.047, + "step": 11113 + }, + { + "epoch": 1.2966981682417456, + "grad_norm": 1.129515290260315, + "learning_rate": 0.0002568122852770958, + "loss": 1.8718, + "step": 11114 + }, + { + "epoch": 1.2968148407420372, + "grad_norm": 1.1549072265625, + "learning_rate": 0.00025680162464412226, + "loss": 2.0687, + "step": 11115 + }, + { + "epoch": 1.296931513242329, + "grad_norm": 1.275722861289978, + "learning_rate": 0.00025679096291949795, + "loss": 2.1842, + "step": 11116 + }, + { + "epoch": 1.2970481857426206, + "grad_norm": 1.1023802757263184, + "learning_rate": 0.00025678030010333337, + "loss": 1.9006, + "step": 11117 + }, + { + "epoch": 1.2971648582429123, + "grad_norm": 1.1207852363586426, + "learning_rate": 0.0002567696361957392, + "loss": 2.1663, + "step": 11118 + }, + { + "epoch": 1.297281530743204, + "grad_norm": 1.3132915496826172, + "learning_rate": 0.0002567589711968259, + "loss": 2.1528, + "step": 11119 + }, + { + "epoch": 1.2973982032434956, + "grad_norm": 1.214728593826294, + "learning_rate": 0.000256748305106704, + "loss": 1.9605, + "step": 11120 + }, + { + "epoch": 1.2975148757437873, + "grad_norm": 1.011993408203125, + "learning_rate": 0.0002567376379254842, + "loss": 1.9475, + "step": 11121 + }, + { + "epoch": 1.297631548244079, + "grad_norm": 1.2469474077224731, + "learning_rate": 0.0002567269696532769, + "loss": 2.17, + "step": 11122 + }, + { + "epoch": 1.2977482207443707, + "grad_norm": 1.2406939268112183, + "learning_rate": 0.0002567163002901928, + "loss": 2.0332, + "step": 11123 + }, + { + "epoch": 1.2978648932446624, + "grad_norm": 1.2447110414505005, + "learning_rate": 0.00025670562983634256, + "loss": 2.1432, + "step": 11124 + }, + { + "epoch": 1.297981565744954, + "grad_norm": 1.1046956777572632, + "learning_rate": 0.00025669495829183674, + "loss": 2.106, + "step": 11125 + }, + { + "epoch": 1.2980982382452457, + "grad_norm": 1.152066946029663, + "learning_rate": 0.00025668428565678596, + "loss": 2.2334, + "step": 11126 + }, + { + "epoch": 1.2982149107455374, + "grad_norm": 1.1092948913574219, + "learning_rate": 0.0002566736119313009, + "loss": 2.0767, + "step": 11127 + }, + { + "epoch": 1.298331583245829, + "grad_norm": 1.4921929836273193, + "learning_rate": 0.00025666293711549214, + "loss": 2.1887, + "step": 11128 + }, + { + "epoch": 1.2984482557461208, + "grad_norm": 1.151198387145996, + "learning_rate": 0.0002566522612094704, + "loss": 2.1236, + "step": 11129 + }, + { + "epoch": 1.2985649282464125, + "grad_norm": 1.1989511251449585, + "learning_rate": 0.00025664158421334637, + "loss": 2.1128, + "step": 11130 + }, + { + "epoch": 1.2986816007467041, + "grad_norm": 1.1477090120315552, + "learning_rate": 0.00025663090612723074, + "loss": 1.9986, + "step": 11131 + }, + { + "epoch": 1.2987982732469958, + "grad_norm": 1.2557237148284912, + "learning_rate": 0.0002566202269512341, + "loss": 2.2766, + "step": 11132 + }, + { + "epoch": 1.2989149457472875, + "grad_norm": 1.1808396577835083, + "learning_rate": 0.00025660954668546733, + "loss": 2.1427, + "step": 11133 + }, + { + "epoch": 1.2990316182475792, + "grad_norm": 1.7062455415725708, + "learning_rate": 0.00025659886533004105, + "loss": 1.7863, + "step": 11134 + }, + { + "epoch": 1.2991482907478709, + "grad_norm": 1.287206768989563, + "learning_rate": 0.000256588182885066, + "loss": 2.0712, + "step": 11135 + }, + { + "epoch": 1.2992649632481625, + "grad_norm": 1.3870024681091309, + "learning_rate": 0.0002565774993506529, + "loss": 2.0893, + "step": 11136 + }, + { + "epoch": 1.2993816357484542, + "grad_norm": 1.2046408653259277, + "learning_rate": 0.0002565668147269126, + "loss": 2.158, + "step": 11137 + }, + { + "epoch": 1.299498308248746, + "grad_norm": 1.4384764432907104, + "learning_rate": 0.0002565561290139558, + "loss": 2.2262, + "step": 11138 + }, + { + "epoch": 1.2996149807490376, + "grad_norm": 1.1940768957138062, + "learning_rate": 0.00025654544221189325, + "loss": 2.0749, + "step": 11139 + }, + { + "epoch": 1.2997316532493293, + "grad_norm": 1.180048942565918, + "learning_rate": 0.00025653475432083577, + "loss": 1.993, + "step": 11140 + }, + { + "epoch": 1.299848325749621, + "grad_norm": 1.173736333847046, + "learning_rate": 0.0002565240653408942, + "loss": 1.9649, + "step": 11141 + }, + { + "epoch": 1.2999649982499126, + "grad_norm": 1.3888601064682007, + "learning_rate": 0.0002565133752721793, + "loss": 2.0643, + "step": 11142 + }, + { + "epoch": 1.3000816707502043, + "grad_norm": 1.111755132675171, + "learning_rate": 0.0002565026841148019, + "loss": 1.891, + "step": 11143 + }, + { + "epoch": 1.300198343250496, + "grad_norm": 1.0363377332687378, + "learning_rate": 0.0002564919918688728, + "loss": 1.9491, + "step": 11144 + }, + { + "epoch": 1.3003150157507877, + "grad_norm": 1.2180883884429932, + "learning_rate": 0.00025648129853450297, + "loss": 2.0631, + "step": 11145 + }, + { + "epoch": 1.3004316882510794, + "grad_norm": 1.2572243213653564, + "learning_rate": 0.00025647060411180313, + "loss": 2.0485, + "step": 11146 + }, + { + "epoch": 1.300548360751371, + "grad_norm": 1.192103624343872, + "learning_rate": 0.0002564599086008842, + "loss": 2.0862, + "step": 11147 + }, + { + "epoch": 1.3006650332516627, + "grad_norm": 1.1233543157577515, + "learning_rate": 0.00025644921200185705, + "loss": 2.2091, + "step": 11148 + }, + { + "epoch": 1.3007817057519544, + "grad_norm": 1.342734456062317, + "learning_rate": 0.0002564385143148326, + "loss": 1.9507, + "step": 11149 + }, + { + "epoch": 1.300898378252246, + "grad_norm": 1.2139033079147339, + "learning_rate": 0.00025642781553992177, + "loss": 2.1817, + "step": 11150 + }, + { + "epoch": 1.3010150507525378, + "grad_norm": 1.0868496894836426, + "learning_rate": 0.00025641711567723536, + "loss": 1.9921, + "step": 11151 + }, + { + "epoch": 1.3011317232528294, + "grad_norm": 1.2921535968780518, + "learning_rate": 0.00025640641472688444, + "loss": 2.2031, + "step": 11152 + }, + { + "epoch": 1.3012483957531211, + "grad_norm": 1.244962215423584, + "learning_rate": 0.0002563957126889798, + "loss": 2.0313, + "step": 11153 + }, + { + "epoch": 1.3013650682534128, + "grad_norm": 1.1670740842819214, + "learning_rate": 0.00025638500956363247, + "loss": 1.9691, + "step": 11154 + }, + { + "epoch": 1.3014817407537045, + "grad_norm": 1.1222015619277954, + "learning_rate": 0.0002563743053509534, + "loss": 2.2325, + "step": 11155 + }, + { + "epoch": 1.3015984132539962, + "grad_norm": 1.0554909706115723, + "learning_rate": 0.0002563636000510536, + "loss": 2.139, + "step": 11156 + }, + { + "epoch": 1.3017150857542878, + "grad_norm": 1.1401702165603638, + "learning_rate": 0.000256352893664044, + "loss": 2.156, + "step": 11157 + }, + { + "epoch": 1.3018317582545795, + "grad_norm": 1.2652699947357178, + "learning_rate": 0.0002563421861900356, + "loss": 2.2603, + "step": 11158 + }, + { + "epoch": 1.3019484307548712, + "grad_norm": 1.3559935092926025, + "learning_rate": 0.0002563314776291394, + "loss": 2.1599, + "step": 11159 + }, + { + "epoch": 1.302065103255163, + "grad_norm": 1.0832504034042358, + "learning_rate": 0.00025632076798146644, + "loss": 2.0848, + "step": 11160 + }, + { + "epoch": 1.3021817757554546, + "grad_norm": 1.3462209701538086, + "learning_rate": 0.0002563100572471277, + "loss": 2.1094, + "step": 11161 + }, + { + "epoch": 1.3022984482557463, + "grad_norm": 1.148258090019226, + "learning_rate": 0.00025629934542623425, + "loss": 2.0637, + "step": 11162 + }, + { + "epoch": 1.302415120756038, + "grad_norm": 1.171261191368103, + "learning_rate": 0.0002562886325188972, + "loss": 2.1396, + "step": 11163 + }, + { + "epoch": 1.3025317932563296, + "grad_norm": 1.106808066368103, + "learning_rate": 0.0002562779185252274, + "loss": 1.9535, + "step": 11164 + }, + { + "epoch": 1.3026484657566213, + "grad_norm": 1.517569661140442, + "learning_rate": 0.00025626720344533617, + "loss": 2.0919, + "step": 11165 + }, + { + "epoch": 1.302765138256913, + "grad_norm": 1.374820351600647, + "learning_rate": 0.0002562564872793344, + "loss": 2.1034, + "step": 11166 + }, + { + "epoch": 1.3028818107572047, + "grad_norm": 1.1794270277023315, + "learning_rate": 0.00025624577002733334, + "loss": 2.083, + "step": 11167 + }, + { + "epoch": 1.3029984832574963, + "grad_norm": 1.1862553358078003, + "learning_rate": 0.000256235051689444, + "loss": 2.0723, + "step": 11168 + }, + { + "epoch": 1.303115155757788, + "grad_norm": 1.1942309141159058, + "learning_rate": 0.0002562243322657775, + "loss": 2.2449, + "step": 11169 + }, + { + "epoch": 1.3032318282580797, + "grad_norm": 1.2128989696502686, + "learning_rate": 0.000256213611756445, + "loss": 2.2779, + "step": 11170 + }, + { + "epoch": 1.3033485007583714, + "grad_norm": 1.2697317600250244, + "learning_rate": 0.0002562028901615576, + "loss": 2.1325, + "step": 11171 + }, + { + "epoch": 1.303465173258663, + "grad_norm": 1.0720983743667603, + "learning_rate": 0.0002561921674812265, + "loss": 1.9342, + "step": 11172 + }, + { + "epoch": 1.3035818457589547, + "grad_norm": 1.115573763847351, + "learning_rate": 0.00025618144371556285, + "loss": 1.9166, + "step": 11173 + }, + { + "epoch": 1.3036985182592464, + "grad_norm": 1.0408461093902588, + "learning_rate": 0.00025617071886467775, + "loss": 1.9666, + "step": 11174 + }, + { + "epoch": 1.3038151907595381, + "grad_norm": 1.2446582317352295, + "learning_rate": 0.00025615999292868247, + "loss": 2.1978, + "step": 11175 + }, + { + "epoch": 1.3039318632598298, + "grad_norm": 1.3268524408340454, + "learning_rate": 0.00025614926590768823, + "loss": 2.1132, + "step": 11176 + }, + { + "epoch": 1.3040485357601215, + "grad_norm": 1.087984323501587, + "learning_rate": 0.00025613853780180607, + "loss": 2.1156, + "step": 11177 + }, + { + "epoch": 1.3041652082604132, + "grad_norm": 1.0946546792984009, + "learning_rate": 0.00025612780861114737, + "loss": 2.103, + "step": 11178 + }, + { + "epoch": 1.3042818807607048, + "grad_norm": 1.0763753652572632, + "learning_rate": 0.00025611707833582333, + "loss": 2.072, + "step": 11179 + }, + { + "epoch": 1.3043985532609965, + "grad_norm": 1.0833723545074463, + "learning_rate": 0.0002561063469759451, + "loss": 1.9685, + "step": 11180 + }, + { + "epoch": 1.3045152257612882, + "grad_norm": 1.4175479412078857, + "learning_rate": 0.000256095614531624, + "loss": 2.1289, + "step": 11181 + }, + { + "epoch": 1.3046318982615799, + "grad_norm": 1.177369475364685, + "learning_rate": 0.0002560848810029714, + "loss": 2.0346, + "step": 11182 + }, + { + "epoch": 1.3047485707618716, + "grad_norm": 1.2313029766082764, + "learning_rate": 0.0002560741463900983, + "loss": 2.0234, + "step": 11183 + }, + { + "epoch": 1.3048652432621632, + "grad_norm": 1.3771588802337646, + "learning_rate": 0.0002560634106931163, + "loss": 2.0888, + "step": 11184 + }, + { + "epoch": 1.304981915762455, + "grad_norm": 1.1105772256851196, + "learning_rate": 0.00025605267391213643, + "loss": 2.0629, + "step": 11185 + }, + { + "epoch": 1.3050985882627466, + "grad_norm": 1.1629419326782227, + "learning_rate": 0.0002560419360472701, + "loss": 2.3361, + "step": 11186 + }, + { + "epoch": 1.3052152607630383, + "grad_norm": 1.211350679397583, + "learning_rate": 0.0002560311970986287, + "loss": 2.095, + "step": 11187 + }, + { + "epoch": 1.30533193326333, + "grad_norm": 1.37654447555542, + "learning_rate": 0.00025602045706632346, + "loss": 2.1476, + "step": 11188 + }, + { + "epoch": 1.3054486057636217, + "grad_norm": 1.380738377571106, + "learning_rate": 0.0002560097159504658, + "loss": 2.1906, + "step": 11189 + }, + { + "epoch": 1.3055652782639133, + "grad_norm": 1.1948747634887695, + "learning_rate": 0.000255998973751167, + "loss": 2.0141, + "step": 11190 + }, + { + "epoch": 1.305681950764205, + "grad_norm": 1.1592929363250732, + "learning_rate": 0.0002559882304685385, + "loss": 1.8424, + "step": 11191 + }, + { + "epoch": 1.3057986232644967, + "grad_norm": 1.3253400325775146, + "learning_rate": 0.0002559774861026916, + "loss": 2.2262, + "step": 11192 + }, + { + "epoch": 1.3059152957647884, + "grad_norm": 1.3098617792129517, + "learning_rate": 0.0002559667406537376, + "loss": 2.1564, + "step": 11193 + }, + { + "epoch": 1.30603196826508, + "grad_norm": 1.457944631576538, + "learning_rate": 0.00025595599412178813, + "loss": 2.2149, + "step": 11194 + }, + { + "epoch": 1.3061486407653717, + "grad_norm": 1.2617518901824951, + "learning_rate": 0.00025594524650695443, + "loss": 2.1698, + "step": 11195 + }, + { + "epoch": 1.3062653132656634, + "grad_norm": 1.1955196857452393, + "learning_rate": 0.00025593449780934803, + "loss": 2.1214, + "step": 11196 + }, + { + "epoch": 1.306381985765955, + "grad_norm": 1.2354612350463867, + "learning_rate": 0.00025592374802908027, + "loss": 2.0143, + "step": 11197 + }, + { + "epoch": 1.3064986582662468, + "grad_norm": 1.1356252431869507, + "learning_rate": 0.0002559129971662626, + "loss": 2.1751, + "step": 11198 + }, + { + "epoch": 1.3066153307665385, + "grad_norm": 1.3318474292755127, + "learning_rate": 0.0002559022452210065, + "loss": 2.0838, + "step": 11199 + }, + { + "epoch": 1.3067320032668301, + "grad_norm": 1.3469969034194946, + "learning_rate": 0.00025589149219342345, + "loss": 2.2238, + "step": 11200 + }, + { + "epoch": 1.3068486757671218, + "grad_norm": 1.2735782861709595, + "learning_rate": 0.0002558807380836248, + "loss": 2.0405, + "step": 11201 + }, + { + "epoch": 1.3069653482674135, + "grad_norm": 1.0788981914520264, + "learning_rate": 0.00025586998289172226, + "loss": 1.9947, + "step": 11202 + }, + { + "epoch": 1.3070820207677052, + "grad_norm": 1.1008739471435547, + "learning_rate": 0.00025585922661782714, + "loss": 1.9519, + "step": 11203 + }, + { + "epoch": 1.3071986932679969, + "grad_norm": 1.1184321641921997, + "learning_rate": 0.00025584846926205107, + "loss": 2.2413, + "step": 11204 + }, + { + "epoch": 1.3073153657682886, + "grad_norm": 1.2909018993377686, + "learning_rate": 0.0002558377108245054, + "loss": 2.0033, + "step": 11205 + }, + { + "epoch": 1.3074320382685802, + "grad_norm": 1.319319725036621, + "learning_rate": 0.0002558269513053019, + "loss": 2.093, + "step": 11206 + }, + { + "epoch": 1.307548710768872, + "grad_norm": 1.0138858556747437, + "learning_rate": 0.00025581619070455194, + "loss": 2.1516, + "step": 11207 + }, + { + "epoch": 1.3076653832691636, + "grad_norm": 1.4235529899597168, + "learning_rate": 0.0002558054290223672, + "loss": 2.0906, + "step": 11208 + }, + { + "epoch": 1.3077820557694553, + "grad_norm": 1.213219404220581, + "learning_rate": 0.00025579466625885906, + "loss": 2.0246, + "step": 11209 + }, + { + "epoch": 1.307898728269747, + "grad_norm": 1.373543381690979, + "learning_rate": 0.0002557839024141393, + "loss": 2.0495, + "step": 11210 + }, + { + "epoch": 1.3080154007700386, + "grad_norm": 1.2827321290969849, + "learning_rate": 0.0002557731374883193, + "loss": 2.2157, + "step": 11211 + }, + { + "epoch": 1.3081320732703303, + "grad_norm": 1.1002378463745117, + "learning_rate": 0.00025576237148151084, + "loss": 1.9573, + "step": 11212 + }, + { + "epoch": 1.308248745770622, + "grad_norm": 1.1999197006225586, + "learning_rate": 0.00025575160439382543, + "loss": 2.1552, + "step": 11213 + }, + { + "epoch": 1.3083654182709137, + "grad_norm": 1.107131838798523, + "learning_rate": 0.00025574083622537477, + "loss": 2.1629, + "step": 11214 + }, + { + "epoch": 1.3084820907712054, + "grad_norm": 1.0977849960327148, + "learning_rate": 0.00025573006697627047, + "loss": 2.2853, + "step": 11215 + }, + { + "epoch": 1.308598763271497, + "grad_norm": 1.2057430744171143, + "learning_rate": 0.0002557192966466241, + "loss": 2.0719, + "step": 11216 + }, + { + "epoch": 1.3087154357717887, + "grad_norm": 1.1061625480651855, + "learning_rate": 0.00025570852523654734, + "loss": 1.9233, + "step": 11217 + }, + { + "epoch": 1.3088321082720804, + "grad_norm": 1.098464012145996, + "learning_rate": 0.0002556977527461519, + "loss": 2.0752, + "step": 11218 + }, + { + "epoch": 1.308948780772372, + "grad_norm": 1.5938692092895508, + "learning_rate": 0.00025568697917554943, + "loss": 2.4008, + "step": 11219 + }, + { + "epoch": 1.3090654532726638, + "grad_norm": 1.0819982290267944, + "learning_rate": 0.0002556762045248517, + "loss": 2.0378, + "step": 11220 + }, + { + "epoch": 1.3091821257729555, + "grad_norm": 1.2189496755599976, + "learning_rate": 0.0002556654287941703, + "loss": 2.0666, + "step": 11221 + }, + { + "epoch": 1.3092987982732471, + "grad_norm": 1.0874837636947632, + "learning_rate": 0.0002556546519836169, + "loss": 2.1773, + "step": 11222 + }, + { + "epoch": 1.3094154707735388, + "grad_norm": 1.278602123260498, + "learning_rate": 0.0002556438740933034, + "loss": 2.3003, + "step": 11223 + }, + { + "epoch": 1.3095321432738305, + "grad_norm": 1.387523889541626, + "learning_rate": 0.0002556330951233414, + "loss": 2.0165, + "step": 11224 + }, + { + "epoch": 1.3096488157741222, + "grad_norm": 1.2403233051300049, + "learning_rate": 0.00025562231507384267, + "loss": 2.0731, + "step": 11225 + }, + { + "epoch": 1.3097654882744139, + "grad_norm": 1.2155280113220215, + "learning_rate": 0.000255611533944919, + "loss": 2.0903, + "step": 11226 + }, + { + "epoch": 1.3098821607747055, + "grad_norm": 1.1661646366119385, + "learning_rate": 0.00025560075173668205, + "loss": 2.1406, + "step": 11227 + }, + { + "epoch": 1.3099988332749972, + "grad_norm": 1.1945827007293701, + "learning_rate": 0.00025558996844924373, + "loss": 2.0089, + "step": 11228 + }, + { + "epoch": 1.310115505775289, + "grad_norm": 1.3464967012405396, + "learning_rate": 0.0002555791840827158, + "loss": 2.1634, + "step": 11229 + }, + { + "epoch": 1.3102321782755806, + "grad_norm": 1.169306993484497, + "learning_rate": 0.00025556839863720996, + "loss": 2.0174, + "step": 11230 + }, + { + "epoch": 1.3103488507758723, + "grad_norm": 1.3291354179382324, + "learning_rate": 0.0002555576121128382, + "loss": 2.1007, + "step": 11231 + }, + { + "epoch": 1.310465523276164, + "grad_norm": 1.353952407836914, + "learning_rate": 0.0002555468245097122, + "loss": 2.0893, + "step": 11232 + }, + { + "epoch": 1.3105821957764556, + "grad_norm": 1.2496217489242554, + "learning_rate": 0.00025553603582794386, + "loss": 1.9011, + "step": 11233 + }, + { + "epoch": 1.3106988682767473, + "grad_norm": 1.609719157218933, + "learning_rate": 0.00025552524606764493, + "loss": 2.1593, + "step": 11234 + }, + { + "epoch": 1.310815540777039, + "grad_norm": 1.3963918685913086, + "learning_rate": 0.00025551445522892734, + "loss": 2.0099, + "step": 11235 + }, + { + "epoch": 1.3109322132773307, + "grad_norm": 1.1928967237472534, + "learning_rate": 0.000255503663311903, + "loss": 2.0785, + "step": 11236 + }, + { + "epoch": 1.3110488857776224, + "grad_norm": 1.4258694648742676, + "learning_rate": 0.00025549287031668375, + "loss": 2.0885, + "step": 11237 + }, + { + "epoch": 1.311165558277914, + "grad_norm": 1.2480957508087158, + "learning_rate": 0.0002554820762433814, + "loss": 2.0932, + "step": 11238 + }, + { + "epoch": 1.3112822307782057, + "grad_norm": 1.0289306640625, + "learning_rate": 0.00025547128109210804, + "loss": 1.8717, + "step": 11239 + }, + { + "epoch": 1.3113989032784974, + "grad_norm": 1.06467866897583, + "learning_rate": 0.0002554604848629754, + "loss": 2.0177, + "step": 11240 + }, + { + "epoch": 1.311515575778789, + "grad_norm": 1.1704853773117065, + "learning_rate": 0.0002554496875560955, + "loss": 1.9379, + "step": 11241 + }, + { + "epoch": 1.3116322482790808, + "grad_norm": 1.1577246189117432, + "learning_rate": 0.0002554388891715802, + "loss": 2.0915, + "step": 11242 + }, + { + "epoch": 1.3117489207793724, + "grad_norm": 1.2467447519302368, + "learning_rate": 0.0002554280897095415, + "loss": 2.0599, + "step": 11243 + }, + { + "epoch": 1.3118655932796641, + "grad_norm": 1.1806581020355225, + "learning_rate": 0.0002554172891700913, + "loss": 2.0695, + "step": 11244 + }, + { + "epoch": 1.3119822657799558, + "grad_norm": 1.0971647500991821, + "learning_rate": 0.0002554064875533417, + "loss": 2.0119, + "step": 11245 + }, + { + "epoch": 1.3120989382802475, + "grad_norm": 1.0562947988510132, + "learning_rate": 0.00025539568485940454, + "loss": 1.9533, + "step": 11246 + }, + { + "epoch": 1.3122156107805392, + "grad_norm": 1.1870852708816528, + "learning_rate": 0.0002553848810883919, + "loss": 2.114, + "step": 11247 + }, + { + "epoch": 1.3123322832808308, + "grad_norm": 1.0931988954544067, + "learning_rate": 0.0002553740762404157, + "loss": 1.9452, + "step": 11248 + }, + { + "epoch": 1.3124489557811225, + "grad_norm": 1.0446921586990356, + "learning_rate": 0.00025536327031558804, + "loss": 1.8963, + "step": 11249 + }, + { + "epoch": 1.312565628281414, + "grad_norm": 1.1508105993270874, + "learning_rate": 0.0002553524633140209, + "loss": 1.9411, + "step": 11250 + }, + { + "epoch": 1.3126823007817057, + "grad_norm": 1.3295166492462158, + "learning_rate": 0.0002553416552358263, + "loss": 1.939, + "step": 11251 + }, + { + "epoch": 1.3127989732819974, + "grad_norm": 1.0562622547149658, + "learning_rate": 0.0002553308460811163, + "loss": 1.9019, + "step": 11252 + }, + { + "epoch": 1.312915645782289, + "grad_norm": 1.2474644184112549, + "learning_rate": 0.000255320035850003, + "loss": 2.0271, + "step": 11253 + }, + { + "epoch": 1.3130323182825807, + "grad_norm": 1.0848520994186401, + "learning_rate": 0.0002553092245425984, + "loss": 1.9579, + "step": 11254 + }, + { + "epoch": 1.3131489907828724, + "grad_norm": 1.1144747734069824, + "learning_rate": 0.0002552984121590146, + "loss": 2.0396, + "step": 11255 + }, + { + "epoch": 1.313265663283164, + "grad_norm": 1.5419507026672363, + "learning_rate": 0.0002552875986993637, + "loss": 2.1762, + "step": 11256 + }, + { + "epoch": 1.3133823357834558, + "grad_norm": 1.0366597175598145, + "learning_rate": 0.0002552767841637578, + "loss": 2.0171, + "step": 11257 + }, + { + "epoch": 1.3134990082837474, + "grad_norm": 1.231966495513916, + "learning_rate": 0.000255265968552309, + "loss": 2.0648, + "step": 11258 + }, + { + "epoch": 1.3136156807840391, + "grad_norm": 1.0767632722854614, + "learning_rate": 0.0002552551518651295, + "loss": 2.0361, + "step": 11259 + }, + { + "epoch": 1.3137323532843308, + "grad_norm": 1.03428053855896, + "learning_rate": 0.0002552443341023313, + "loss": 2.0533, + "step": 11260 + }, + { + "epoch": 1.3138490257846225, + "grad_norm": 1.6068655252456665, + "learning_rate": 0.0002552335152640267, + "loss": 2.275, + "step": 11261 + }, + { + "epoch": 1.3139656982849142, + "grad_norm": 1.3024927377700806, + "learning_rate": 0.0002552226953503277, + "loss": 2.2096, + "step": 11262 + }, + { + "epoch": 1.3140823707852058, + "grad_norm": 1.2348865270614624, + "learning_rate": 0.0002552118743613466, + "loss": 2.0481, + "step": 11263 + }, + { + "epoch": 1.3141990432854975, + "grad_norm": 1.2042218446731567, + "learning_rate": 0.0002552010522971955, + "loss": 2.0187, + "step": 11264 + }, + { + "epoch": 1.3143157157857892, + "grad_norm": 1.0632790327072144, + "learning_rate": 0.0002551902291579866, + "loss": 1.9207, + "step": 11265 + }, + { + "epoch": 1.3144323882860809, + "grad_norm": 1.2986540794372559, + "learning_rate": 0.00025517940494383223, + "loss": 2.0406, + "step": 11266 + }, + { + "epoch": 1.3145490607863726, + "grad_norm": 1.1548908948898315, + "learning_rate": 0.0002551685796548444, + "loss": 2.0652, + "step": 11267 + }, + { + "epoch": 1.3146657332866643, + "grad_norm": 1.1383553743362427, + "learning_rate": 0.0002551577532911354, + "loss": 2.1054, + "step": 11268 + }, + { + "epoch": 1.314782405786956, + "grad_norm": 1.128766417503357, + "learning_rate": 0.0002551469258528176, + "loss": 1.907, + "step": 11269 + }, + { + "epoch": 1.3148990782872476, + "grad_norm": 1.2653030157089233, + "learning_rate": 0.0002551360973400031, + "loss": 2.0737, + "step": 11270 + }, + { + "epoch": 1.3150157507875393, + "grad_norm": 1.066852331161499, + "learning_rate": 0.0002551252677528042, + "loss": 1.9854, + "step": 11271 + }, + { + "epoch": 1.315132423287831, + "grad_norm": 1.178679347038269, + "learning_rate": 0.0002551144370913331, + "loss": 1.9842, + "step": 11272 + }, + { + "epoch": 1.3152490957881227, + "grad_norm": 1.2166904211044312, + "learning_rate": 0.00025510360535570216, + "loss": 2.0936, + "step": 11273 + }, + { + "epoch": 1.3153657682884143, + "grad_norm": 1.1695101261138916, + "learning_rate": 0.00025509277254602374, + "loss": 2.079, + "step": 11274 + }, + { + "epoch": 1.315482440788706, + "grad_norm": 1.2703458070755005, + "learning_rate": 0.00025508193866241, + "loss": 2.2624, + "step": 11275 + }, + { + "epoch": 1.3155991132889977, + "grad_norm": 1.1827569007873535, + "learning_rate": 0.00025507110370497337, + "loss": 2.0568, + "step": 11276 + }, + { + "epoch": 1.3157157857892894, + "grad_norm": 1.1838181018829346, + "learning_rate": 0.00025506026767382607, + "loss": 1.9235, + "step": 11277 + }, + { + "epoch": 1.315832458289581, + "grad_norm": 1.184999704360962, + "learning_rate": 0.00025504943056908053, + "loss": 2.1628, + "step": 11278 + }, + { + "epoch": 1.3159491307898727, + "grad_norm": 1.1684187650680542, + "learning_rate": 0.000255038592390849, + "loss": 2.0341, + "step": 11279 + }, + { + "epoch": 1.3160658032901644, + "grad_norm": 1.153696894645691, + "learning_rate": 0.00025502775313924396, + "loss": 2.0298, + "step": 11280 + }, + { + "epoch": 1.316182475790456, + "grad_norm": 1.3226683139801025, + "learning_rate": 0.0002550169128143777, + "loss": 2.1812, + "step": 11281 + }, + { + "epoch": 1.3162991482907478, + "grad_norm": 1.3859355449676514, + "learning_rate": 0.0002550060714163626, + "loss": 2.3071, + "step": 11282 + }, + { + "epoch": 1.3164158207910395, + "grad_norm": 1.2739322185516357, + "learning_rate": 0.00025499522894531105, + "loss": 2.2578, + "step": 11283 + }, + { + "epoch": 1.3165324932913312, + "grad_norm": 1.2693151235580444, + "learning_rate": 0.0002549843854013355, + "loss": 2.1673, + "step": 11284 + }, + { + "epoch": 1.3166491657916228, + "grad_norm": 1.043060541152954, + "learning_rate": 0.0002549735407845483, + "loss": 1.9206, + "step": 11285 + }, + { + "epoch": 1.3167658382919145, + "grad_norm": 1.0504367351531982, + "learning_rate": 0.00025496269509506193, + "loss": 2.0458, + "step": 11286 + }, + { + "epoch": 1.3168825107922062, + "grad_norm": 1.1484906673431396, + "learning_rate": 0.00025495184833298877, + "loss": 1.8649, + "step": 11287 + }, + { + "epoch": 1.3169991832924979, + "grad_norm": 1.0780370235443115, + "learning_rate": 0.00025494100049844136, + "loss": 2.0661, + "step": 11288 + }, + { + "epoch": 1.3171158557927896, + "grad_norm": 1.1951513290405273, + "learning_rate": 0.000254930151591532, + "loss": 2.087, + "step": 11289 + }, + { + "epoch": 1.3172325282930812, + "grad_norm": 1.3508970737457275, + "learning_rate": 0.00025491930161237336, + "loss": 2.1566, + "step": 11290 + }, + { + "epoch": 1.317349200793373, + "grad_norm": 1.3636149168014526, + "learning_rate": 0.00025490845056107774, + "loss": 2.1613, + "step": 11291 + }, + { + "epoch": 1.3174658732936646, + "grad_norm": 1.1671866178512573, + "learning_rate": 0.00025489759843775776, + "loss": 2.0783, + "step": 11292 + }, + { + "epoch": 1.3175825457939563, + "grad_norm": 1.386754035949707, + "learning_rate": 0.0002548867452425259, + "loss": 2.1014, + "step": 11293 + }, + { + "epoch": 1.317699218294248, + "grad_norm": 1.0512936115264893, + "learning_rate": 0.0002548758909754946, + "loss": 2.2375, + "step": 11294 + }, + { + "epoch": 1.3178158907945396, + "grad_norm": 1.1081547737121582, + "learning_rate": 0.0002548650356367764, + "loss": 2.0514, + "step": 11295 + }, + { + "epoch": 1.3179325632948313, + "grad_norm": 1.2859573364257812, + "learning_rate": 0.0002548541792264839, + "loss": 2.1212, + "step": 11296 + }, + { + "epoch": 1.318049235795123, + "grad_norm": 1.1847913265228271, + "learning_rate": 0.00025484332174472964, + "loss": 2.261, + "step": 11297 + }, + { + "epoch": 1.3181659082954147, + "grad_norm": 1.1781688928604126, + "learning_rate": 0.00025483246319162614, + "loss": 2.0055, + "step": 11298 + }, + { + "epoch": 1.3182825807957064, + "grad_norm": 1.1621202230453491, + "learning_rate": 0.00025482160356728596, + "loss": 2.142, + "step": 11299 + }, + { + "epoch": 1.318399253295998, + "grad_norm": 1.0866049528121948, + "learning_rate": 0.00025481074287182174, + "loss": 1.9471, + "step": 11300 + }, + { + "epoch": 1.3185159257962897, + "grad_norm": 1.1200990676879883, + "learning_rate": 0.000254799881105346, + "loss": 2.0481, + "step": 11301 + }, + { + "epoch": 1.3186325982965814, + "grad_norm": 0.9840235114097595, + "learning_rate": 0.0002547890182679714, + "loss": 1.7809, + "step": 11302 + }, + { + "epoch": 1.318749270796873, + "grad_norm": 1.428930640220642, + "learning_rate": 0.0002547781543598105, + "loss": 2.2516, + "step": 11303 + }, + { + "epoch": 1.3188659432971648, + "grad_norm": 1.1997429132461548, + "learning_rate": 0.00025476728938097603, + "loss": 1.8661, + "step": 11304 + }, + { + "epoch": 1.3189826157974565, + "grad_norm": 1.3556088209152222, + "learning_rate": 0.0002547564233315805, + "loss": 2.0516, + "step": 11305 + }, + { + "epoch": 1.3190992882977481, + "grad_norm": 1.2988289594650269, + "learning_rate": 0.0002547455562117366, + "loss": 2.0681, + "step": 11306 + }, + { + "epoch": 1.3192159607980398, + "grad_norm": 1.4932608604431152, + "learning_rate": 0.000254734688021557, + "loss": 2.1051, + "step": 11307 + }, + { + "epoch": 1.3193326332983315, + "grad_norm": 1.0402649641036987, + "learning_rate": 0.0002547238187611544, + "loss": 2.1827, + "step": 11308 + }, + { + "epoch": 1.3194493057986232, + "grad_norm": 1.254902720451355, + "learning_rate": 0.00025471294843064146, + "loss": 2.0482, + "step": 11309 + }, + { + "epoch": 1.3195659782989149, + "grad_norm": 1.270281195640564, + "learning_rate": 0.0002547020770301308, + "loss": 2.1881, + "step": 11310 + }, + { + "epoch": 1.3196826507992065, + "grad_norm": 1.0852649211883545, + "learning_rate": 0.0002546912045597352, + "loss": 2.1485, + "step": 11311 + }, + { + "epoch": 1.3197993232994982, + "grad_norm": 1.2303770780563354, + "learning_rate": 0.00025468033101956736, + "loss": 2.0361, + "step": 11312 + }, + { + "epoch": 1.31991599579979, + "grad_norm": 1.0121716260910034, + "learning_rate": 0.00025466945640974004, + "loss": 1.8919, + "step": 11313 + }, + { + "epoch": 1.3200326683000816, + "grad_norm": 1.1587212085723877, + "learning_rate": 0.0002546585807303659, + "loss": 2.0796, + "step": 11314 + }, + { + "epoch": 1.3201493408003733, + "grad_norm": 1.3669062852859497, + "learning_rate": 0.0002546477039815577, + "loss": 2.0113, + "step": 11315 + }, + { + "epoch": 1.320266013300665, + "grad_norm": 1.0729676485061646, + "learning_rate": 0.0002546368261634283, + "loss": 1.8968, + "step": 11316 + }, + { + "epoch": 1.3203826858009566, + "grad_norm": 1.1394652128219604, + "learning_rate": 0.00025462594727609033, + "loss": 2.052, + "step": 11317 + }, + { + "epoch": 1.3204993583012483, + "grad_norm": 1.1394715309143066, + "learning_rate": 0.0002546150673196566, + "loss": 1.9692, + "step": 11318 + }, + { + "epoch": 1.32061603080154, + "grad_norm": 1.1885409355163574, + "learning_rate": 0.00025460418629424003, + "loss": 2.0336, + "step": 11319 + }, + { + "epoch": 1.3207327033018317, + "grad_norm": 1.2621489763259888, + "learning_rate": 0.0002545933041999533, + "loss": 2.0399, + "step": 11320 + }, + { + "epoch": 1.3208493758021234, + "grad_norm": 1.1036255359649658, + "learning_rate": 0.00025458242103690916, + "loss": 1.9968, + "step": 11321 + }, + { + "epoch": 1.320966048302415, + "grad_norm": 1.3565363883972168, + "learning_rate": 0.00025457153680522065, + "loss": 2.0962, + "step": 11322 + }, + { + "epoch": 1.3210827208027067, + "grad_norm": 1.1296664476394653, + "learning_rate": 0.00025456065150500037, + "loss": 2.1851, + "step": 11323 + }, + { + "epoch": 1.3211993933029984, + "grad_norm": 1.1015374660491943, + "learning_rate": 0.0002545497651363613, + "loss": 2.1674, + "step": 11324 + }, + { + "epoch": 1.32131606580329, + "grad_norm": 1.0999846458435059, + "learning_rate": 0.0002545388776994163, + "loss": 2.0375, + "step": 11325 + }, + { + "epoch": 1.3214327383035818, + "grad_norm": 1.2576043605804443, + "learning_rate": 0.00025452798919427824, + "loss": 2.1466, + "step": 11326 + }, + { + "epoch": 1.3215494108038734, + "grad_norm": 1.095327615737915, + "learning_rate": 0.0002545170996210599, + "loss": 2.0581, + "step": 11327 + }, + { + "epoch": 1.3216660833041651, + "grad_norm": 1.3410096168518066, + "learning_rate": 0.0002545062089798743, + "loss": 2.0318, + "step": 11328 + }, + { + "epoch": 1.3217827558044568, + "grad_norm": 1.2055203914642334, + "learning_rate": 0.00025449531727083426, + "loss": 1.9189, + "step": 11329 + }, + { + "epoch": 1.3218994283047485, + "grad_norm": 0.9809430241584778, + "learning_rate": 0.0002544844244940528, + "loss": 2.0417, + "step": 11330 + }, + { + "epoch": 1.3220161008050402, + "grad_norm": 0.9843941926956177, + "learning_rate": 0.0002544735306496426, + "loss": 1.9202, + "step": 11331 + }, + { + "epoch": 1.3221327733053319, + "grad_norm": 1.113356351852417, + "learning_rate": 0.00025446263573771687, + "loss": 2.0701, + "step": 11332 + }, + { + "epoch": 1.3222494458056235, + "grad_norm": 1.1271916627883911, + "learning_rate": 0.00025445173975838844, + "loss": 2.2394, + "step": 11333 + }, + { + "epoch": 1.3223661183059152, + "grad_norm": 1.1512675285339355, + "learning_rate": 0.00025444084271177025, + "loss": 2.0487, + "step": 11334 + }, + { + "epoch": 1.322482790806207, + "grad_norm": 1.0546153783798218, + "learning_rate": 0.00025442994459797533, + "loss": 1.8459, + "step": 11335 + }, + { + "epoch": 1.3225994633064986, + "grad_norm": 1.2576464414596558, + "learning_rate": 0.00025441904541711655, + "loss": 2.2652, + "step": 11336 + }, + { + "epoch": 1.3227161358067903, + "grad_norm": 1.398413062095642, + "learning_rate": 0.000254408145169307, + "loss": 2.2532, + "step": 11337 + }, + { + "epoch": 1.322832808307082, + "grad_norm": 1.1796042919158936, + "learning_rate": 0.0002543972438546597, + "loss": 1.9544, + "step": 11338 + }, + { + "epoch": 1.3229494808073736, + "grad_norm": 1.123504877090454, + "learning_rate": 0.0002543863414732876, + "loss": 1.8584, + "step": 11339 + }, + { + "epoch": 1.3230661533076653, + "grad_norm": 1.137612223625183, + "learning_rate": 0.0002543754380253036, + "loss": 2.0293, + "step": 11340 + }, + { + "epoch": 1.323182825807957, + "grad_norm": 1.105818510055542, + "learning_rate": 0.000254364533510821, + "loss": 2.0438, + "step": 11341 + }, + { + "epoch": 1.3232994983082487, + "grad_norm": 1.0204781293869019, + "learning_rate": 0.00025435362792995274, + "loss": 1.9844, + "step": 11342 + }, + { + "epoch": 1.3234161708085403, + "grad_norm": 1.162802815437317, + "learning_rate": 0.00025434272128281186, + "loss": 2.0418, + "step": 11343 + }, + { + "epoch": 1.323532843308832, + "grad_norm": 1.2071857452392578, + "learning_rate": 0.0002543318135695114, + "loss": 2.0023, + "step": 11344 + }, + { + "epoch": 1.3236495158091237, + "grad_norm": 1.1129989624023438, + "learning_rate": 0.00025432090479016444, + "loss": 2.0686, + "step": 11345 + }, + { + "epoch": 1.3237661883094154, + "grad_norm": 1.251983404159546, + "learning_rate": 0.0002543099949448841, + "loss": 2.1004, + "step": 11346 + }, + { + "epoch": 1.323882860809707, + "grad_norm": 1.059016227722168, + "learning_rate": 0.0002542990840337835, + "loss": 2.1245, + "step": 11347 + }, + { + "epoch": 1.3239995333099988, + "grad_norm": 2.008098840713501, + "learning_rate": 0.00025428817205697574, + "loss": 2.2374, + "step": 11348 + }, + { + "epoch": 1.3241162058102904, + "grad_norm": 1.0873273611068726, + "learning_rate": 0.00025427725901457386, + "loss": 2.1306, + "step": 11349 + }, + { + "epoch": 1.3242328783105821, + "grad_norm": 1.1609413623809814, + "learning_rate": 0.0002542663449066911, + "loss": 2.1958, + "step": 11350 + }, + { + "epoch": 1.3243495508108738, + "grad_norm": 1.1736923456192017, + "learning_rate": 0.00025425542973344067, + "loss": 2.1177, + "step": 11351 + }, + { + "epoch": 1.3244662233111655, + "grad_norm": 1.0844491720199585, + "learning_rate": 0.00025424451349493547, + "loss": 2.0141, + "step": 11352 + }, + { + "epoch": 1.3245828958114572, + "grad_norm": 1.2893638610839844, + "learning_rate": 0.0002542335961912889, + "loss": 2.1189, + "step": 11353 + }, + { + "epoch": 1.3246995683117488, + "grad_norm": 1.0750640630722046, + "learning_rate": 0.0002542226778226141, + "loss": 1.7986, + "step": 11354 + }, + { + "epoch": 1.3248162408120405, + "grad_norm": 1.1377297639846802, + "learning_rate": 0.00025421175838902417, + "loss": 1.8832, + "step": 11355 + }, + { + "epoch": 1.3249329133123322, + "grad_norm": 1.189577579498291, + "learning_rate": 0.00025420083789063236, + "loss": 2.1728, + "step": 11356 + }, + { + "epoch": 1.3250495858126239, + "grad_norm": 1.1773689985275269, + "learning_rate": 0.0002541899163275519, + "loss": 1.9772, + "step": 11357 + }, + { + "epoch": 1.3251662583129156, + "grad_norm": 1.1970807313919067, + "learning_rate": 0.000254178993699896, + "loss": 2.0179, + "step": 11358 + }, + { + "epoch": 1.3252829308132072, + "grad_norm": 1.1147381067276, + "learning_rate": 0.0002541680700077779, + "loss": 1.9356, + "step": 11359 + }, + { + "epoch": 1.325399603313499, + "grad_norm": 1.1759757995605469, + "learning_rate": 0.0002541571452513108, + "loss": 1.8978, + "step": 11360 + }, + { + "epoch": 1.3255162758137906, + "grad_norm": 1.2648305892944336, + "learning_rate": 0.00025414621943060803, + "loss": 1.987, + "step": 11361 + }, + { + "epoch": 1.3256329483140823, + "grad_norm": 1.3460803031921387, + "learning_rate": 0.0002541352925457828, + "loss": 2.0874, + "step": 11362 + }, + { + "epoch": 1.325749620814374, + "grad_norm": 1.2820953130722046, + "learning_rate": 0.00025412436459694844, + "loss": 2.0326, + "step": 11363 + }, + { + "epoch": 1.3258662933146657, + "grad_norm": 1.165541410446167, + "learning_rate": 0.0002541134355842182, + "loss": 2.1678, + "step": 11364 + }, + { + "epoch": 1.3259829658149573, + "grad_norm": 1.0705444812774658, + "learning_rate": 0.00025410250550770533, + "loss": 2.0142, + "step": 11365 + }, + { + "epoch": 1.326099638315249, + "grad_norm": 1.5957978963851929, + "learning_rate": 0.00025409157436752323, + "loss": 2.0692, + "step": 11366 + }, + { + "epoch": 1.3262163108155407, + "grad_norm": 1.1463239192962646, + "learning_rate": 0.0002540806421637852, + "loss": 2.2197, + "step": 11367 + }, + { + "epoch": 1.3263329833158324, + "grad_norm": 1.228649377822876, + "learning_rate": 0.00025406970889660455, + "loss": 2.1841, + "step": 11368 + }, + { + "epoch": 1.326449655816124, + "grad_norm": 1.2201260328292847, + "learning_rate": 0.0002540587745660947, + "loss": 2.048, + "step": 11369 + }, + { + "epoch": 1.3265663283164157, + "grad_norm": 1.1549047231674194, + "learning_rate": 0.00025404783917236883, + "loss": 2.0885, + "step": 11370 + }, + { + "epoch": 1.3266830008167074, + "grad_norm": 1.2322980165481567, + "learning_rate": 0.0002540369027155405, + "loss": 2.0271, + "step": 11371 + }, + { + "epoch": 1.326799673316999, + "grad_norm": 1.2027888298034668, + "learning_rate": 0.00025402596519572294, + "loss": 2.0723, + "step": 11372 + }, + { + "epoch": 1.3269163458172908, + "grad_norm": 1.1827086210250854, + "learning_rate": 0.00025401502661302966, + "loss": 2.1593, + "step": 11373 + }, + { + "epoch": 1.3270330183175825, + "grad_norm": 1.1316884756088257, + "learning_rate": 0.00025400408696757393, + "loss": 2.1396, + "step": 11374 + }, + { + "epoch": 1.3271496908178742, + "grad_norm": 1.346949577331543, + "learning_rate": 0.0002539931462594693, + "loss": 2.1312, + "step": 11375 + }, + { + "epoch": 1.3272663633181658, + "grad_norm": 1.1932506561279297, + "learning_rate": 0.0002539822044888291, + "loss": 2.2268, + "step": 11376 + }, + { + "epoch": 1.3273830358184575, + "grad_norm": 1.1740546226501465, + "learning_rate": 0.0002539712616557668, + "loss": 2.1859, + "step": 11377 + }, + { + "epoch": 1.3274997083187492, + "grad_norm": 1.0746363401412964, + "learning_rate": 0.00025396031776039576, + "loss": 2.1199, + "step": 11378 + }, + { + "epoch": 1.3276163808190409, + "grad_norm": 1.135560393333435, + "learning_rate": 0.00025394937280282957, + "loss": 2.0717, + "step": 11379 + }, + { + "epoch": 1.3277330533193326, + "grad_norm": 1.300255537033081, + "learning_rate": 0.0002539384267831816, + "loss": 2.2355, + "step": 11380 + }, + { + "epoch": 1.3278497258196242, + "grad_norm": 1.3291330337524414, + "learning_rate": 0.0002539274797015654, + "loss": 2.1168, + "step": 11381 + }, + { + "epoch": 1.327966398319916, + "grad_norm": 1.1020828485488892, + "learning_rate": 0.0002539165315580944, + "loss": 1.96, + "step": 11382 + }, + { + "epoch": 1.3280830708202076, + "grad_norm": 1.2168041467666626, + "learning_rate": 0.000253905582352882, + "loss": 2.174, + "step": 11383 + }, + { + "epoch": 1.3281997433204993, + "grad_norm": 1.3174601793289185, + "learning_rate": 0.00025389463208604193, + "loss": 2.0653, + "step": 11384 + }, + { + "epoch": 1.328316415820791, + "grad_norm": 1.028438687324524, + "learning_rate": 0.0002538836807576876, + "loss": 1.9095, + "step": 11385 + }, + { + "epoch": 1.3284330883210826, + "grad_norm": 1.1176773309707642, + "learning_rate": 0.00025387272836793247, + "loss": 2.0279, + "step": 11386 + }, + { + "epoch": 1.3285497608213743, + "grad_norm": 1.0745656490325928, + "learning_rate": 0.0002538617749168902, + "loss": 1.9863, + "step": 11387 + }, + { + "epoch": 1.328666433321666, + "grad_norm": 1.2031704187393188, + "learning_rate": 0.0002538508204046743, + "loss": 2.0987, + "step": 11388 + }, + { + "epoch": 1.3287831058219577, + "grad_norm": 1.2566845417022705, + "learning_rate": 0.0002538398648313983, + "loss": 2.175, + "step": 11389 + }, + { + "epoch": 1.3288997783222494, + "grad_norm": 1.1264069080352783, + "learning_rate": 0.0002538289081971758, + "loss": 2.0735, + "step": 11390 + }, + { + "epoch": 1.329016450822541, + "grad_norm": 1.225925326347351, + "learning_rate": 0.0002538179505021204, + "loss": 2.0695, + "step": 11391 + }, + { + "epoch": 1.3291331233228327, + "grad_norm": 1.2089784145355225, + "learning_rate": 0.0002538069917463457, + "loss": 1.9127, + "step": 11392 + }, + { + "epoch": 1.3292497958231244, + "grad_norm": 1.2531728744506836, + "learning_rate": 0.0002537960319299653, + "loss": 2.0268, + "step": 11393 + }, + { + "epoch": 1.329366468323416, + "grad_norm": 1.1212115287780762, + "learning_rate": 0.0002537850710530928, + "loss": 2.0884, + "step": 11394 + }, + { + "epoch": 1.3294831408237078, + "grad_norm": 1.2579814195632935, + "learning_rate": 0.0002537741091158419, + "loss": 2.1584, + "step": 11395 + }, + { + "epoch": 1.3295998133239995, + "grad_norm": 1.0873180627822876, + "learning_rate": 0.0002537631461183261, + "loss": 1.9135, + "step": 11396 + }, + { + "epoch": 1.3297164858242911, + "grad_norm": 1.2171415090560913, + "learning_rate": 0.0002537521820606592, + "loss": 1.9177, + "step": 11397 + }, + { + "epoch": 1.3298331583245828, + "grad_norm": 1.1077678203582764, + "learning_rate": 0.0002537412169429548, + "loss": 1.8996, + "step": 11398 + }, + { + "epoch": 1.3299498308248745, + "grad_norm": 1.1624565124511719, + "learning_rate": 0.00025373025076532655, + "loss": 2.1265, + "step": 11399 + }, + { + "epoch": 1.3300665033251662, + "grad_norm": 1.2848455905914307, + "learning_rate": 0.0002537192835278882, + "loss": 2.103, + "step": 11400 + }, + { + "epoch": 1.3301831758254579, + "grad_norm": 1.2678675651550293, + "learning_rate": 0.00025370831523075334, + "loss": 2.111, + "step": 11401 + }, + { + "epoch": 1.3302998483257495, + "grad_norm": 0.9876028895378113, + "learning_rate": 0.0002536973458740358, + "loss": 2.0245, + "step": 11402 + }, + { + "epoch": 1.3304165208260412, + "grad_norm": 1.2357919216156006, + "learning_rate": 0.0002536863754578492, + "loss": 2.0285, + "step": 11403 + }, + { + "epoch": 1.330533193326333, + "grad_norm": 1.3445844650268555, + "learning_rate": 0.0002536754039823074, + "loss": 2.1186, + "step": 11404 + }, + { + "epoch": 1.3306498658266246, + "grad_norm": 1.1868352890014648, + "learning_rate": 0.00025366443144752394, + "loss": 2.1528, + "step": 11405 + }, + { + "epoch": 1.3307665383269163, + "grad_norm": 1.124165415763855, + "learning_rate": 0.0002536534578536127, + "loss": 2.0892, + "step": 11406 + }, + { + "epoch": 1.330883210827208, + "grad_norm": 1.2450031042099, + "learning_rate": 0.0002536424832006875, + "loss": 2.2448, + "step": 11407 + }, + { + "epoch": 1.3309998833274996, + "grad_norm": 1.146901249885559, + "learning_rate": 0.000253631507488862, + "loss": 2.0267, + "step": 11408 + }, + { + "epoch": 1.3311165558277913, + "grad_norm": 1.1346914768218994, + "learning_rate": 0.0002536205307182501, + "loss": 2.1827, + "step": 11409 + }, + { + "epoch": 1.331233228328083, + "grad_norm": 1.1670385599136353, + "learning_rate": 0.0002536095528889654, + "loss": 2.0344, + "step": 11410 + }, + { + "epoch": 1.3313499008283747, + "grad_norm": 1.0759259462356567, + "learning_rate": 0.00025359857400112187, + "loss": 2.2355, + "step": 11411 + }, + { + "epoch": 1.3314665733286664, + "grad_norm": 1.1493134498596191, + "learning_rate": 0.0002535875940548333, + "loss": 1.9287, + "step": 11412 + }, + { + "epoch": 1.331583245828958, + "grad_norm": 1.188701868057251, + "learning_rate": 0.00025357661305021344, + "loss": 2.3174, + "step": 11413 + }, + { + "epoch": 1.3316999183292497, + "grad_norm": 1.1609054803848267, + "learning_rate": 0.00025356563098737626, + "loss": 2.2537, + "step": 11414 + }, + { + "epoch": 1.3318165908295414, + "grad_norm": 1.2275516986846924, + "learning_rate": 0.0002535546478664355, + "loss": 2.0918, + "step": 11415 + }, + { + "epoch": 1.331933263329833, + "grad_norm": 1.1148735284805298, + "learning_rate": 0.00025354366368750503, + "loss": 1.9181, + "step": 11416 + }, + { + "epoch": 1.3320499358301248, + "grad_norm": 1.088014006614685, + "learning_rate": 0.0002535326784506988, + "loss": 2.1031, + "step": 11417 + }, + { + "epoch": 1.3321666083304164, + "grad_norm": 1.2227290868759155, + "learning_rate": 0.00025352169215613063, + "loss": 2.0961, + "step": 11418 + }, + { + "epoch": 1.3321666083304164, + "eval_train_loss": 2.0187792778015137, + "eval_train_mean_batch_perplexity": 8.654761856190383, + "eval_train_runtime": 11040.0451, + "eval_train_samples_per_second": 12.422, + "eval_train_steps_per_second": 0.776, + "step": 11418 + }, + { + "epoch": 1.3321666083304164, + "eval_test_loss": 2.1062510013580322, + "eval_test_mean_batch_perplexity": 9.520766290067291, + "eval_test_runtime": 2383.2736, + "eval_test_samples_per_second": 12.331, + "eval_test_steps_per_second": 0.771, + "step": 11418 + }, + { + "epoch": 1.3322832808307081, + "grad_norm": 1.1415737867355347, + "learning_rate": 0.0002535107048039144, + "loss": 1.995, + "step": 11419 + }, + { + "epoch": 1.3323999533309998, + "grad_norm": 1.1591417789459229, + "learning_rate": 0.00025349971639416406, + "loss": 2.2243, + "step": 11420 + }, + { + "epoch": 1.3325166258312915, + "grad_norm": 1.0837905406951904, + "learning_rate": 0.00025348872692699354, + "loss": 2.0291, + "step": 11421 + }, + { + "epoch": 1.3326332983315832, + "grad_norm": 1.1747814416885376, + "learning_rate": 0.0002534777364025167, + "loss": 2.0104, + "step": 11422 + }, + { + "epoch": 1.3327499708318749, + "grad_norm": 1.101151943206787, + "learning_rate": 0.0002534667448208475, + "loss": 2.1625, + "step": 11423 + }, + { + "epoch": 1.3328666433321665, + "grad_norm": 1.1680444478988647, + "learning_rate": 0.00025345575218209995, + "loss": 1.9569, + "step": 11424 + }, + { + "epoch": 1.3329833158324582, + "grad_norm": 1.0578988790512085, + "learning_rate": 0.000253444758486388, + "loss": 1.9758, + "step": 11425 + }, + { + "epoch": 1.33309998833275, + "grad_norm": 1.2763234376907349, + "learning_rate": 0.00025343376373382543, + "loss": 2.0316, + "step": 11426 + }, + { + "epoch": 1.3332166608330416, + "grad_norm": 1.2335089445114136, + "learning_rate": 0.0002534227679245265, + "loss": 2.1107, + "step": 11427 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.147104263305664, + "learning_rate": 0.000253411771058605, + "loss": 2.1249, + "step": 11428 + }, + { + "epoch": 1.333450005833625, + "grad_norm": 1.1182446479797363, + "learning_rate": 0.000253400773136175, + "loss": 2.0237, + "step": 11429 + }, + { + "epoch": 1.3335666783339166, + "grad_norm": 1.0576080083847046, + "learning_rate": 0.00025338977415735055, + "loss": 1.8566, + "step": 11430 + }, + { + "epoch": 1.3336833508342083, + "grad_norm": 1.1321009397506714, + "learning_rate": 0.00025337877412224565, + "loss": 2.1294, + "step": 11431 + }, + { + "epoch": 1.3338000233345, + "grad_norm": 1.5691882371902466, + "learning_rate": 0.0002533677730309743, + "loss": 2.2895, + "step": 11432 + }, + { + "epoch": 1.3339166958347917, + "grad_norm": 1.1648412942886353, + "learning_rate": 0.0002533567708836507, + "loss": 2.1477, + "step": 11433 + }, + { + "epoch": 1.3340333683350833, + "grad_norm": 1.270260214805603, + "learning_rate": 0.00025334576768038864, + "loss": 1.8936, + "step": 11434 + }, + { + "epoch": 1.334150040835375, + "grad_norm": 1.2516520023345947, + "learning_rate": 0.00025333476342130244, + "loss": 2.2687, + "step": 11435 + }, + { + "epoch": 1.3342667133356667, + "grad_norm": 1.0867538452148438, + "learning_rate": 0.000253323758106506, + "loss": 2.1934, + "step": 11436 + }, + { + "epoch": 1.3343833858359584, + "grad_norm": 1.0918841361999512, + "learning_rate": 0.0002533127517361135, + "loss": 2.0753, + "step": 11437 + }, + { + "epoch": 1.33450005833625, + "grad_norm": 1.4410067796707153, + "learning_rate": 0.0002533017443102391, + "loss": 2.0348, + "step": 11438 + }, + { + "epoch": 1.3346167308365418, + "grad_norm": 1.2179232835769653, + "learning_rate": 0.0002532907358289968, + "loss": 2.1521, + "step": 11439 + }, + { + "epoch": 1.3347334033368334, + "grad_norm": 1.2189325094223022, + "learning_rate": 0.0002532797262925008, + "loss": 2.2047, + "step": 11440 + }, + { + "epoch": 1.3348500758371251, + "grad_norm": 1.2328662872314453, + "learning_rate": 0.0002532687157008651, + "loss": 2.1354, + "step": 11441 + }, + { + "epoch": 1.3349667483374168, + "grad_norm": 1.201682686805725, + "learning_rate": 0.000253257704054204, + "loss": 1.9827, + "step": 11442 + }, + { + "epoch": 1.3350834208377085, + "grad_norm": 1.30399489402771, + "learning_rate": 0.0002532466913526316, + "loss": 1.9842, + "step": 11443 + }, + { + "epoch": 1.3352000933380002, + "grad_norm": 1.6234318017959595, + "learning_rate": 0.00025323567759626207, + "loss": 2.0751, + "step": 11444 + }, + { + "epoch": 1.3353167658382918, + "grad_norm": 1.1404505968093872, + "learning_rate": 0.0002532246627852096, + "loss": 2.1252, + "step": 11445 + }, + { + "epoch": 1.3354334383385835, + "grad_norm": 1.1501953601837158, + "learning_rate": 0.00025321364691958844, + "loss": 2.0225, + "step": 11446 + }, + { + "epoch": 1.3355501108388752, + "grad_norm": 1.073019027709961, + "learning_rate": 0.00025320262999951263, + "loss": 1.9289, + "step": 11447 + }, + { + "epoch": 1.3356667833391669, + "grad_norm": 1.1190235614776611, + "learning_rate": 0.0002531916120250965, + "loss": 2.1348, + "step": 11448 + }, + { + "epoch": 1.3357834558394586, + "grad_norm": 1.6726226806640625, + "learning_rate": 0.0002531805929964542, + "loss": 2.0143, + "step": 11449 + }, + { + "epoch": 1.3359001283397502, + "grad_norm": 1.303475022315979, + "learning_rate": 0.00025316957291370007, + "loss": 2.1965, + "step": 11450 + }, + { + "epoch": 1.336016800840042, + "grad_norm": 1.218296766281128, + "learning_rate": 0.00025315855177694826, + "loss": 2.0444, + "step": 11451 + }, + { + "epoch": 1.3361334733403336, + "grad_norm": 1.0651510953903198, + "learning_rate": 0.000253147529586313, + "loss": 2.0289, + "step": 11452 + }, + { + "epoch": 1.3362501458406253, + "grad_norm": 1.0953426361083984, + "learning_rate": 0.0002531365063419088, + "loss": 1.9584, + "step": 11453 + }, + { + "epoch": 1.336366818340917, + "grad_norm": 1.4111237525939941, + "learning_rate": 0.00025312548204384954, + "loss": 1.9672, + "step": 11454 + }, + { + "epoch": 1.3364834908412087, + "grad_norm": 1.204437017440796, + "learning_rate": 0.00025311445669224975, + "loss": 2.0758, + "step": 11455 + }, + { + "epoch": 1.3366001633415003, + "grad_norm": 1.2986634969711304, + "learning_rate": 0.00025310343028722373, + "loss": 2.0166, + "step": 11456 + }, + { + "epoch": 1.336716835841792, + "grad_norm": 1.0876028537750244, + "learning_rate": 0.00025309240282888576, + "loss": 2.0145, + "step": 11457 + }, + { + "epoch": 1.3368335083420837, + "grad_norm": 1.1747500896453857, + "learning_rate": 0.00025308137431735017, + "loss": 2.3602, + "step": 11458 + }, + { + "epoch": 1.3369501808423754, + "grad_norm": 1.2684701681137085, + "learning_rate": 0.00025307034475273123, + "loss": 2.2336, + "step": 11459 + }, + { + "epoch": 1.337066853342667, + "grad_norm": 1.2204264402389526, + "learning_rate": 0.0002530593141351434, + "loss": 2.0316, + "step": 11460 + }, + { + "epoch": 1.3371835258429587, + "grad_norm": 1.023687481880188, + "learning_rate": 0.00025304828246470085, + "loss": 2.0048, + "step": 11461 + }, + { + "epoch": 1.3373001983432504, + "grad_norm": 1.177816390991211, + "learning_rate": 0.0002530372497415181, + "loss": 2.1352, + "step": 11462 + }, + { + "epoch": 1.337416870843542, + "grad_norm": 1.0760068893432617, + "learning_rate": 0.00025302621596570956, + "loss": 1.8656, + "step": 11463 + }, + { + "epoch": 1.3375335433438338, + "grad_norm": 1.2201952934265137, + "learning_rate": 0.0002530151811373895, + "loss": 2.0819, + "step": 11464 + }, + { + "epoch": 1.3376502158441255, + "grad_norm": 1.0954149961471558, + "learning_rate": 0.0002530041452566723, + "loss": 1.8307, + "step": 11465 + }, + { + "epoch": 1.3377668883444171, + "grad_norm": 1.1265023946762085, + "learning_rate": 0.00025299310832367243, + "loss": 1.9164, + "step": 11466 + }, + { + "epoch": 1.3378835608447088, + "grad_norm": 1.3328814506530762, + "learning_rate": 0.0002529820703385043, + "loss": 2.1924, + "step": 11467 + }, + { + "epoch": 1.3380002333450005, + "grad_norm": 1.2512203454971313, + "learning_rate": 0.00025297103130128237, + "loss": 2.1505, + "step": 11468 + }, + { + "epoch": 1.3381169058452922, + "grad_norm": 1.1916697025299072, + "learning_rate": 0.00025295999121212104, + "loss": 2.0669, + "step": 11469 + }, + { + "epoch": 1.3382335783455839, + "grad_norm": 1.2039247751235962, + "learning_rate": 0.00025294895007113474, + "loss": 2.1402, + "step": 11470 + }, + { + "epoch": 1.3383502508458756, + "grad_norm": 1.1515756845474243, + "learning_rate": 0.00025293790787843796, + "loss": 2.0809, + "step": 11471 + }, + { + "epoch": 1.3384669233461672, + "grad_norm": 1.4004982709884644, + "learning_rate": 0.00025292686463414526, + "loss": 2.0537, + "step": 11472 + }, + { + "epoch": 1.338583595846459, + "grad_norm": 1.0727142095565796, + "learning_rate": 0.00025291582033837095, + "loss": 2.097, + "step": 11473 + }, + { + "epoch": 1.3387002683467506, + "grad_norm": 1.2503169775009155, + "learning_rate": 0.00025290477499122964, + "loss": 2.0708, + "step": 11474 + }, + { + "epoch": 1.3388169408470423, + "grad_norm": 1.1779314279556274, + "learning_rate": 0.0002528937285928358, + "loss": 2.1762, + "step": 11475 + }, + { + "epoch": 1.338933613347334, + "grad_norm": 1.1483253240585327, + "learning_rate": 0.00025288268114330404, + "loss": 1.9149, + "step": 11476 + }, + { + "epoch": 1.3390502858476256, + "grad_norm": 1.2318942546844482, + "learning_rate": 0.00025287163264274873, + "loss": 1.9367, + "step": 11477 + }, + { + "epoch": 1.3391669583479173, + "grad_norm": 1.343727469444275, + "learning_rate": 0.0002528605830912845, + "loss": 2.1634, + "step": 11478 + }, + { + "epoch": 1.339283630848209, + "grad_norm": 1.157818078994751, + "learning_rate": 0.0002528495324890259, + "loss": 1.9668, + "step": 11479 + }, + { + "epoch": 1.3394003033485007, + "grad_norm": 1.3056831359863281, + "learning_rate": 0.00025283848083608744, + "loss": 2.0831, + "step": 11480 + }, + { + "epoch": 1.3395169758487924, + "grad_norm": 1.2150338888168335, + "learning_rate": 0.00025282742813258376, + "loss": 1.9946, + "step": 11481 + }, + { + "epoch": 1.339633648349084, + "grad_norm": 1.2332710027694702, + "learning_rate": 0.0002528163743786294, + "loss": 2.1214, + "step": 11482 + }, + { + "epoch": 1.3397503208493757, + "grad_norm": 1.1370059251785278, + "learning_rate": 0.00025280531957433893, + "loss": 2.0206, + "step": 11483 + }, + { + "epoch": 1.3398669933496674, + "grad_norm": 1.1986818313598633, + "learning_rate": 0.00025279426371982704, + "loss": 2.1213, + "step": 11484 + }, + { + "epoch": 1.339983665849959, + "grad_norm": 1.2822223901748657, + "learning_rate": 0.0002527832068152082, + "loss": 2.0893, + "step": 11485 + }, + { + "epoch": 1.3401003383502508, + "grad_norm": 1.204923391342163, + "learning_rate": 0.00025277214886059725, + "loss": 2.1151, + "step": 11486 + }, + { + "epoch": 1.3402170108505425, + "grad_norm": 1.466282844543457, + "learning_rate": 0.0002527610898561086, + "loss": 2.2032, + "step": 11487 + }, + { + "epoch": 1.3403336833508341, + "grad_norm": 1.0766009092330933, + "learning_rate": 0.00025275002980185704, + "loss": 2.1327, + "step": 11488 + }, + { + "epoch": 1.3404503558511258, + "grad_norm": 1.2945626974105835, + "learning_rate": 0.0002527389686979572, + "loss": 2.1478, + "step": 11489 + }, + { + "epoch": 1.3405670283514175, + "grad_norm": 1.054618000984192, + "learning_rate": 0.00025272790654452376, + "loss": 1.9382, + "step": 11490 + }, + { + "epoch": 1.3406837008517092, + "grad_norm": 1.2358407974243164, + "learning_rate": 0.00025271684334167134, + "loss": 1.9988, + "step": 11491 + }, + { + "epoch": 1.3408003733520009, + "grad_norm": 1.1152604818344116, + "learning_rate": 0.0002527057790895146, + "loss": 2.0521, + "step": 11492 + }, + { + "epoch": 1.3409170458522925, + "grad_norm": 1.3187214136123657, + "learning_rate": 0.0002526947137881684, + "loss": 2.1843, + "step": 11493 + }, + { + "epoch": 1.3410337183525842, + "grad_norm": 1.0747959613800049, + "learning_rate": 0.00025268364743774737, + "loss": 2.0659, + "step": 11494 + }, + { + "epoch": 1.341150390852876, + "grad_norm": 1.1923596858978271, + "learning_rate": 0.00025267258003836614, + "loss": 2.1298, + "step": 11495 + }, + { + "epoch": 1.3412670633531676, + "grad_norm": 1.1970349550247192, + "learning_rate": 0.0002526615115901396, + "loss": 2.0499, + "step": 11496 + }, + { + "epoch": 1.3413837358534593, + "grad_norm": 1.2322723865509033, + "learning_rate": 0.00025265044209318247, + "loss": 2.022, + "step": 11497 + }, + { + "epoch": 1.341500408353751, + "grad_norm": 1.1935195922851562, + "learning_rate": 0.00025263937154760944, + "loss": 2.0452, + "step": 11498 + }, + { + "epoch": 1.3416170808540426, + "grad_norm": 1.0967878103256226, + "learning_rate": 0.0002526282999535353, + "loss": 2.0864, + "step": 11499 + }, + { + "epoch": 1.3417337533543343, + "grad_norm": 1.0941320657730103, + "learning_rate": 0.00025261722731107476, + "loss": 2.1953, + "step": 11500 + }, + { + "epoch": 1.341850425854626, + "grad_norm": 1.185551404953003, + "learning_rate": 0.0002526061536203427, + "loss": 2.0588, + "step": 11501 + }, + { + "epoch": 1.3419670983549177, + "grad_norm": 1.0177208185195923, + "learning_rate": 0.00025259507888145393, + "loss": 2.1202, + "step": 11502 + }, + { + "epoch": 1.3420837708552094, + "grad_norm": 0.9766907095909119, + "learning_rate": 0.0002525840030945232, + "loss": 1.9423, + "step": 11503 + }, + { + "epoch": 1.342200443355501, + "grad_norm": 1.1727120876312256, + "learning_rate": 0.00025257292625966544, + "loss": 1.9998, + "step": 11504 + }, + { + "epoch": 1.3423171158557927, + "grad_norm": 1.169298529624939, + "learning_rate": 0.0002525618483769953, + "loss": 2.1154, + "step": 11505 + }, + { + "epoch": 1.3424337883560844, + "grad_norm": 1.2540419101715088, + "learning_rate": 0.00025255076944662776, + "loss": 2.0705, + "step": 11506 + }, + { + "epoch": 1.342550460856376, + "grad_norm": 1.1848275661468506, + "learning_rate": 0.00025253968946867766, + "loss": 2.0663, + "step": 11507 + }, + { + "epoch": 1.3426671333566678, + "grad_norm": 1.0426806211471558, + "learning_rate": 0.0002525286084432598, + "loss": 2.1352, + "step": 11508 + }, + { + "epoch": 1.3427838058569594, + "grad_norm": 1.223834753036499, + "learning_rate": 0.00025251752637048914, + "loss": 2.0658, + "step": 11509 + }, + { + "epoch": 1.3429004783572511, + "grad_norm": 1.0286670923233032, + "learning_rate": 0.00025250644325048053, + "loss": 1.8483, + "step": 11510 + }, + { + "epoch": 1.3430171508575428, + "grad_norm": 1.3817124366760254, + "learning_rate": 0.00025249535908334887, + "loss": 2.0777, + "step": 11511 + }, + { + "epoch": 1.3431338233578345, + "grad_norm": 1.1729774475097656, + "learning_rate": 0.000252484273869209, + "loss": 2.1282, + "step": 11512 + }, + { + "epoch": 1.3432504958581262, + "grad_norm": 1.2160030603408813, + "learning_rate": 0.000252473187608176, + "loss": 2.0939, + "step": 11513 + }, + { + "epoch": 1.3433671683584179, + "grad_norm": 1.2138046026229858, + "learning_rate": 0.00025246210030036465, + "loss": 2.2414, + "step": 11514 + }, + { + "epoch": 1.3434838408587095, + "grad_norm": 1.2323437929153442, + "learning_rate": 0.0002524510119458899, + "loss": 2.1272, + "step": 11515 + }, + { + "epoch": 1.3436005133590012, + "grad_norm": 1.0841903686523438, + "learning_rate": 0.0002524399225448668, + "loss": 2.033, + "step": 11516 + }, + { + "epoch": 1.343717185859293, + "grad_norm": 1.1212955713272095, + "learning_rate": 0.0002524288320974103, + "loss": 2.031, + "step": 11517 + }, + { + "epoch": 1.3438338583595846, + "grad_norm": 1.1298753023147583, + "learning_rate": 0.00025241774060363523, + "loss": 1.9373, + "step": 11518 + }, + { + "epoch": 1.3439505308598763, + "grad_norm": 1.0602262020111084, + "learning_rate": 0.0002524066480636567, + "loss": 1.9922, + "step": 11519 + }, + { + "epoch": 1.344067203360168, + "grad_norm": 1.1752029657363892, + "learning_rate": 0.0002523955544775897, + "loss": 2.0244, + "step": 11520 + }, + { + "epoch": 1.3441838758604596, + "grad_norm": 1.3089369535446167, + "learning_rate": 0.00025238445984554925, + "loss": 2.0741, + "step": 11521 + }, + { + "epoch": 1.3443005483607513, + "grad_norm": 1.1287254095077515, + "learning_rate": 0.00025237336416765026, + "loss": 2.1201, + "step": 11522 + }, + { + "epoch": 1.344417220861043, + "grad_norm": 1.1579704284667969, + "learning_rate": 0.0002523622674440079, + "loss": 2.1094, + "step": 11523 + }, + { + "epoch": 1.3445338933613347, + "grad_norm": 1.1220965385437012, + "learning_rate": 0.00025235116967473715, + "loss": 2.1881, + "step": 11524 + }, + { + "epoch": 1.3446505658616263, + "grad_norm": 1.0758082866668701, + "learning_rate": 0.000252340070859953, + "loss": 1.804, + "step": 11525 + }, + { + "epoch": 1.344767238361918, + "grad_norm": 1.1622810363769531, + "learning_rate": 0.00025232897099977057, + "loss": 1.9774, + "step": 11526 + }, + { + "epoch": 1.3448839108622097, + "grad_norm": 1.1402132511138916, + "learning_rate": 0.0002523178700943049, + "loss": 2.1932, + "step": 11527 + }, + { + "epoch": 1.3450005833625014, + "grad_norm": 1.1004571914672852, + "learning_rate": 0.00025230676814367106, + "loss": 2.0081, + "step": 11528 + }, + { + "epoch": 1.345117255862793, + "grad_norm": 1.0650490522384644, + "learning_rate": 0.00025229566514798425, + "loss": 1.9132, + "step": 11529 + }, + { + "epoch": 1.3452339283630848, + "grad_norm": 1.07913076877594, + "learning_rate": 0.0002522845611073595, + "loss": 1.9156, + "step": 11530 + }, + { + "epoch": 1.3453506008633764, + "grad_norm": 1.2675869464874268, + "learning_rate": 0.00025227345602191187, + "loss": 1.9503, + "step": 11531 + }, + { + "epoch": 1.3454672733636681, + "grad_norm": 1.238257646560669, + "learning_rate": 0.0002522623498917566, + "loss": 2.1424, + "step": 11532 + }, + { + "epoch": 1.3455839458639598, + "grad_norm": 1.3833743333816528, + "learning_rate": 0.0002522512427170087, + "loss": 2.0004, + "step": 11533 + }, + { + "epoch": 1.3457006183642515, + "grad_norm": 1.1553869247436523, + "learning_rate": 0.00025224013449778345, + "loss": 1.9466, + "step": 11534 + }, + { + "epoch": 1.3458172908645432, + "grad_norm": 1.1625977754592896, + "learning_rate": 0.00025222902523419584, + "loss": 2.0336, + "step": 11535 + }, + { + "epoch": 1.3459339633648348, + "grad_norm": 1.292183518409729, + "learning_rate": 0.00025221791492636124, + "loss": 2.1193, + "step": 11536 + }, + { + "epoch": 1.3460506358651265, + "grad_norm": 1.3347145318984985, + "learning_rate": 0.0002522068035743947, + "loss": 2.0876, + "step": 11537 + }, + { + "epoch": 1.3461673083654182, + "grad_norm": 1.1963921785354614, + "learning_rate": 0.00025219569117841136, + "loss": 2.1009, + "step": 11538 + }, + { + "epoch": 1.3462839808657099, + "grad_norm": 1.3432995080947876, + "learning_rate": 0.0002521845777385266, + "loss": 2.0754, + "step": 11539 + }, + { + "epoch": 1.3464006533660016, + "grad_norm": 1.2946903705596924, + "learning_rate": 0.0002521734632548555, + "loss": 2.0878, + "step": 11540 + }, + { + "epoch": 1.3465173258662932, + "grad_norm": 1.1550647020339966, + "learning_rate": 0.0002521623477275133, + "loss": 2.2175, + "step": 11541 + }, + { + "epoch": 1.346633998366585, + "grad_norm": 1.3081564903259277, + "learning_rate": 0.0002521512311566152, + "loss": 2.0831, + "step": 11542 + }, + { + "epoch": 1.3467506708668766, + "grad_norm": 1.0443429946899414, + "learning_rate": 0.0002521401135422766, + "loss": 2.1305, + "step": 11543 + }, + { + "epoch": 1.3468673433671683, + "grad_norm": 1.5088739395141602, + "learning_rate": 0.00025212899488461253, + "loss": 2.0905, + "step": 11544 + }, + { + "epoch": 1.34698401586746, + "grad_norm": 1.411560297012329, + "learning_rate": 0.00025211787518373847, + "loss": 2.1917, + "step": 11545 + }, + { + "epoch": 1.3471006883677517, + "grad_norm": 1.0642452239990234, + "learning_rate": 0.00025210675443976954, + "loss": 2.0779, + "step": 11546 + }, + { + "epoch": 1.3472173608680433, + "grad_norm": 1.3134897947311401, + "learning_rate": 0.0002520956326528211, + "loss": 2.1116, + "step": 11547 + }, + { + "epoch": 1.347334033368335, + "grad_norm": 1.099448323249817, + "learning_rate": 0.0002520845098230084, + "loss": 2.1213, + "step": 11548 + }, + { + "epoch": 1.3474507058686267, + "grad_norm": 1.1606199741363525, + "learning_rate": 0.00025207338595044685, + "loss": 1.9182, + "step": 11549 + }, + { + "epoch": 1.3475673783689184, + "grad_norm": 1.2500123977661133, + "learning_rate": 0.0002520622610352517, + "loss": 2.0594, + "step": 11550 + }, + { + "epoch": 1.34768405086921, + "grad_norm": 1.0531052350997925, + "learning_rate": 0.00025205113507753823, + "loss": 2.2848, + "step": 11551 + }, + { + "epoch": 1.3478007233695017, + "grad_norm": 1.1846020221710205, + "learning_rate": 0.00025204000807742185, + "loss": 2.108, + "step": 11552 + }, + { + "epoch": 1.3479173958697934, + "grad_norm": 1.1762360334396362, + "learning_rate": 0.0002520288800350179, + "loss": 2.1042, + "step": 11553 + }, + { + "epoch": 1.348034068370085, + "grad_norm": 1.0901124477386475, + "learning_rate": 0.00025201775095044175, + "loss": 1.9958, + "step": 11554 + }, + { + "epoch": 1.3481507408703768, + "grad_norm": 1.0213148593902588, + "learning_rate": 0.00025200662082380874, + "loss": 2.0486, + "step": 11555 + }, + { + "epoch": 1.3482674133706685, + "grad_norm": 1.336321234703064, + "learning_rate": 0.00025199548965523436, + "loss": 1.9393, + "step": 11556 + }, + { + "epoch": 1.3483840858709601, + "grad_norm": 1.1868648529052734, + "learning_rate": 0.00025198435744483384, + "loss": 2.1152, + "step": 11557 + }, + { + "epoch": 1.3485007583712518, + "grad_norm": 1.0805786848068237, + "learning_rate": 0.0002519732241927227, + "loss": 2.0384, + "step": 11558 + }, + { + "epoch": 1.3486174308715435, + "grad_norm": 1.2332391738891602, + "learning_rate": 0.00025196208989901634, + "loss": 1.99, + "step": 11559 + }, + { + "epoch": 1.3487341033718352, + "grad_norm": 1.115372896194458, + "learning_rate": 0.00025195095456383013, + "loss": 2.0565, + "step": 11560 + }, + { + "epoch": 1.3488507758721269, + "grad_norm": 1.6655539274215698, + "learning_rate": 0.0002519398181872796, + "loss": 2.0689, + "step": 11561 + }, + { + "epoch": 1.3489674483724186, + "grad_norm": 1.1949748992919922, + "learning_rate": 0.0002519286807694802, + "loss": 2.1727, + "step": 11562 + }, + { + "epoch": 1.3490841208727102, + "grad_norm": 1.356663703918457, + "learning_rate": 0.0002519175423105473, + "loss": 2.2093, + "step": 11563 + }, + { + "epoch": 1.349200793373002, + "grad_norm": 1.059037208557129, + "learning_rate": 0.0002519064028105964, + "loss": 2.0476, + "step": 11564 + }, + { + "epoch": 1.3493174658732936, + "grad_norm": 1.2088688611984253, + "learning_rate": 0.000251895262269743, + "loss": 1.9166, + "step": 11565 + }, + { + "epoch": 1.3494341383735853, + "grad_norm": 1.1557906866073608, + "learning_rate": 0.0002518841206881026, + "loss": 1.9982, + "step": 11566 + }, + { + "epoch": 1.349550810873877, + "grad_norm": 1.0326305627822876, + "learning_rate": 0.00025187297806579073, + "loss": 1.8868, + "step": 11567 + }, + { + "epoch": 1.3496674833741686, + "grad_norm": 1.1221178770065308, + "learning_rate": 0.0002518618344029228, + "loss": 1.8897, + "step": 11568 + }, + { + "epoch": 1.3497841558744603, + "grad_norm": 1.232086420059204, + "learning_rate": 0.0002518506896996145, + "loss": 2.1846, + "step": 11569 + }, + { + "epoch": 1.349900828374752, + "grad_norm": 1.2242732048034668, + "learning_rate": 0.00025183954395598124, + "loss": 2.1191, + "step": 11570 + }, + { + "epoch": 1.3500175008750437, + "grad_norm": 1.112083911895752, + "learning_rate": 0.00025182839717213855, + "loss": 1.9072, + "step": 11571 + }, + { + "epoch": 1.3501341733753354, + "grad_norm": 1.1529563665390015, + "learning_rate": 0.00025181724934820207, + "loss": 2.0037, + "step": 11572 + }, + { + "epoch": 1.350250845875627, + "grad_norm": 1.474420428276062, + "learning_rate": 0.0002518061004842873, + "loss": 2.1578, + "step": 11573 + }, + { + "epoch": 1.3503675183759187, + "grad_norm": 1.0290414094924927, + "learning_rate": 0.00025179495058050993, + "loss": 1.9591, + "step": 11574 + }, + { + "epoch": 1.3504841908762104, + "grad_norm": 1.2823010683059692, + "learning_rate": 0.0002517837996369854, + "loss": 2.0789, + "step": 11575 + }, + { + "epoch": 1.350600863376502, + "grad_norm": 1.353561282157898, + "learning_rate": 0.0002517726476538294, + "loss": 2.2381, + "step": 11576 + }, + { + "epoch": 1.3507175358767938, + "grad_norm": 1.2687397003173828, + "learning_rate": 0.00025176149463115757, + "loss": 2.1487, + "step": 11577 + }, + { + "epoch": 1.3508342083770855, + "grad_norm": 1.1822963953018188, + "learning_rate": 0.0002517503405690855, + "loss": 2.1318, + "step": 11578 + }, + { + "epoch": 1.3509508808773771, + "grad_norm": 1.1456568241119385, + "learning_rate": 0.00025173918546772875, + "loss": 2.0846, + "step": 11579 + }, + { + "epoch": 1.3510675533776688, + "grad_norm": 1.1118172407150269, + "learning_rate": 0.00025172802932720305, + "loss": 2.0296, + "step": 11580 + }, + { + "epoch": 1.3511842258779605, + "grad_norm": 1.1463080644607544, + "learning_rate": 0.000251716872147624, + "loss": 2.0675, + "step": 11581 + }, + { + "epoch": 1.3513008983782522, + "grad_norm": 1.177580714225769, + "learning_rate": 0.00025170571392910726, + "loss": 2.1884, + "step": 11582 + }, + { + "epoch": 1.3514175708785439, + "grad_norm": 1.1045873165130615, + "learning_rate": 0.00025169455467176865, + "loss": 2.0747, + "step": 11583 + }, + { + "epoch": 1.3515342433788355, + "grad_norm": 1.1343070268630981, + "learning_rate": 0.0002516833943757237, + "loss": 2.081, + "step": 11584 + }, + { + "epoch": 1.3516509158791272, + "grad_norm": 1.2426536083221436, + "learning_rate": 0.0002516722330410881, + "loss": 2.0515, + "step": 11585 + }, + { + "epoch": 1.351767588379419, + "grad_norm": 1.3604071140289307, + "learning_rate": 0.00025166107066797763, + "loss": 2.1582, + "step": 11586 + }, + { + "epoch": 1.3518842608797106, + "grad_norm": 1.323179006576538, + "learning_rate": 0.00025164990725650804, + "loss": 1.9002, + "step": 11587 + }, + { + "epoch": 1.3520009333800023, + "grad_norm": 1.0560486316680908, + "learning_rate": 0.000251638742806795, + "loss": 2.0073, + "step": 11588 + }, + { + "epoch": 1.352117605880294, + "grad_norm": 1.2227352857589722, + "learning_rate": 0.0002516275773189543, + "loss": 2.0326, + "step": 11589 + }, + { + "epoch": 1.3522342783805856, + "grad_norm": 1.2444202899932861, + "learning_rate": 0.0002516164107931016, + "loss": 2.1452, + "step": 11590 + }, + { + "epoch": 1.3523509508808773, + "grad_norm": 1.0958712100982666, + "learning_rate": 0.0002516052432293527, + "loss": 1.9029, + "step": 11591 + }, + { + "epoch": 1.352467623381169, + "grad_norm": 1.134212613105774, + "learning_rate": 0.0002515940746278234, + "loss": 1.9841, + "step": 11592 + }, + { + "epoch": 1.3525842958814607, + "grad_norm": 1.0904914140701294, + "learning_rate": 0.0002515829049886295, + "loss": 2.0827, + "step": 11593 + }, + { + "epoch": 1.3527009683817524, + "grad_norm": 1.0864795446395874, + "learning_rate": 0.00025157173431188676, + "loss": 2.0915, + "step": 11594 + }, + { + "epoch": 1.352817640882044, + "grad_norm": 1.1835007667541504, + "learning_rate": 0.00025156056259771094, + "loss": 2.179, + "step": 11595 + }, + { + "epoch": 1.3529343133823357, + "grad_norm": 1.0774433612823486, + "learning_rate": 0.00025154938984621797, + "loss": 2.1587, + "step": 11596 + }, + { + "epoch": 1.3530509858826274, + "grad_norm": 1.2172414064407349, + "learning_rate": 0.00025153821605752364, + "loss": 1.9945, + "step": 11597 + }, + { + "epoch": 1.353167658382919, + "grad_norm": 1.160760521888733, + "learning_rate": 0.0002515270412317437, + "loss": 2.0085, + "step": 11598 + }, + { + "epoch": 1.3532843308832108, + "grad_norm": 1.2602030038833618, + "learning_rate": 0.0002515158653689941, + "loss": 2.178, + "step": 11599 + }, + { + "epoch": 1.3534010033835024, + "grad_norm": 1.1380382776260376, + "learning_rate": 0.0002515046884693906, + "loss": 2.1122, + "step": 11600 + }, + { + "epoch": 1.3535176758837941, + "grad_norm": 1.2485766410827637, + "learning_rate": 0.0002514935105330492, + "loss": 2.1305, + "step": 11601 + }, + { + "epoch": 1.3536343483840858, + "grad_norm": 1.2179049253463745, + "learning_rate": 0.0002514823315600857, + "loss": 1.9072, + "step": 11602 + }, + { + "epoch": 1.3537510208843775, + "grad_norm": 1.1224510669708252, + "learning_rate": 0.00025147115155061593, + "loss": 2.0683, + "step": 11603 + }, + { + "epoch": 1.3538676933846692, + "grad_norm": 1.1024260520935059, + "learning_rate": 0.0002514599705047559, + "loss": 2.0699, + "step": 11604 + }, + { + "epoch": 1.3539843658849608, + "grad_norm": 1.2108875513076782, + "learning_rate": 0.00025144878842262155, + "loss": 2.2462, + "step": 11605 + }, + { + "epoch": 1.3541010383852525, + "grad_norm": 1.2008424997329712, + "learning_rate": 0.0002514376053043287, + "loss": 2.2696, + "step": 11606 + }, + { + "epoch": 1.3542177108855442, + "grad_norm": 1.143336296081543, + "learning_rate": 0.0002514264211499933, + "loss": 2.1348, + "step": 11607 + }, + { + "epoch": 1.354334383385836, + "grad_norm": 1.2701963186264038, + "learning_rate": 0.0002514152359597313, + "loss": 2.2574, + "step": 11608 + }, + { + "epoch": 1.3544510558861276, + "grad_norm": 1.297270655632019, + "learning_rate": 0.00025140404973365875, + "loss": 2.0961, + "step": 11609 + }, + { + "epoch": 1.3545677283864193, + "grad_norm": 1.1380265951156616, + "learning_rate": 0.0002513928624718915, + "loss": 2.1541, + "step": 11610 + }, + { + "epoch": 1.354684400886711, + "grad_norm": 1.2700493335723877, + "learning_rate": 0.0002513816741745456, + "loss": 2.1273, + "step": 11611 + }, + { + "epoch": 1.3548010733870026, + "grad_norm": 1.2291204929351807, + "learning_rate": 0.0002513704848417369, + "loss": 2.1905, + "step": 11612 + }, + { + "epoch": 1.3549177458872943, + "grad_norm": 1.1834534406661987, + "learning_rate": 0.0002513592944735816, + "loss": 2.0437, + "step": 11613 + }, + { + "epoch": 1.355034418387586, + "grad_norm": 1.0366569757461548, + "learning_rate": 0.0002513481030701956, + "loss": 1.887, + "step": 11614 + }, + { + "epoch": 1.3551510908878777, + "grad_norm": 1.0999962091445923, + "learning_rate": 0.00025133691063169495, + "loss": 1.9582, + "step": 11615 + }, + { + "epoch": 1.3552677633881693, + "grad_norm": 1.491745948791504, + "learning_rate": 0.00025132571715819556, + "loss": 2.1904, + "step": 11616 + }, + { + "epoch": 1.355384435888461, + "grad_norm": 1.0541960000991821, + "learning_rate": 0.0002513145226498137, + "loss": 2.0183, + "step": 11617 + }, + { + "epoch": 1.3555011083887527, + "grad_norm": 1.1425323486328125, + "learning_rate": 0.0002513033271066653, + "loss": 2.1995, + "step": 11618 + }, + { + "epoch": 1.3556177808890444, + "grad_norm": 1.010902762413025, + "learning_rate": 0.0002512921305288663, + "loss": 1.9088, + "step": 11619 + }, + { + "epoch": 1.355734453389336, + "grad_norm": 1.124630093574524, + "learning_rate": 0.000251280932916533, + "loss": 2.0453, + "step": 11620 + }, + { + "epoch": 1.3558511258896278, + "grad_norm": 1.2069220542907715, + "learning_rate": 0.00025126973426978135, + "loss": 2.283, + "step": 11621 + }, + { + "epoch": 1.3559677983899194, + "grad_norm": 1.2130411863327026, + "learning_rate": 0.0002512585345887275, + "loss": 2.2846, + "step": 11622 + }, + { + "epoch": 1.3560844708902111, + "grad_norm": 1.1619820594787598, + "learning_rate": 0.0002512473338734875, + "loss": 2.0441, + "step": 11623 + }, + { + "epoch": 1.3562011433905028, + "grad_norm": 1.4690288305282593, + "learning_rate": 0.0002512361321241775, + "loss": 2.2089, + "step": 11624 + }, + { + "epoch": 1.3563178158907945, + "grad_norm": 1.184746503829956, + "learning_rate": 0.0002512249293409136, + "loss": 2.0373, + "step": 11625 + }, + { + "epoch": 1.3564344883910862, + "grad_norm": 1.223212718963623, + "learning_rate": 0.000251213725523812, + "loss": 2.0138, + "step": 11626 + }, + { + "epoch": 1.3565511608913778, + "grad_norm": 1.2422871589660645, + "learning_rate": 0.0002512025206729888, + "loss": 2.0423, + "step": 11627 + }, + { + "epoch": 1.3566678333916695, + "grad_norm": 1.197709560394287, + "learning_rate": 0.0002511913147885602, + "loss": 1.9974, + "step": 11628 + }, + { + "epoch": 1.3567845058919612, + "grad_norm": 1.197331190109253, + "learning_rate": 0.0002511801078706423, + "loss": 2.0576, + "step": 11629 + }, + { + "epoch": 1.3569011783922529, + "grad_norm": 1.162990927696228, + "learning_rate": 0.0002511688999193513, + "loss": 2.1238, + "step": 11630 + }, + { + "epoch": 1.3570178508925446, + "grad_norm": 1.3419277667999268, + "learning_rate": 0.0002511576909348035, + "loss": 2.1734, + "step": 11631 + }, + { + "epoch": 1.3571345233928362, + "grad_norm": 1.2068794965744019, + "learning_rate": 0.000251146480917115, + "loss": 2.1431, + "step": 11632 + }, + { + "epoch": 1.357251195893128, + "grad_norm": 1.3014893531799316, + "learning_rate": 0.000251135269866402, + "loss": 2.28, + "step": 11633 + }, + { + "epoch": 1.3573678683934196, + "grad_norm": 1.1744484901428223, + "learning_rate": 0.0002511240577827807, + "loss": 2.0004, + "step": 11634 + }, + { + "epoch": 1.3574845408937113, + "grad_norm": 1.2679959535598755, + "learning_rate": 0.00025111284466636745, + "loss": 2.117, + "step": 11635 + }, + { + "epoch": 1.357601213394003, + "grad_norm": 1.209822177886963, + "learning_rate": 0.0002511016305172784, + "loss": 1.9585, + "step": 11636 + }, + { + "epoch": 1.3577178858942947, + "grad_norm": 1.3313058614730835, + "learning_rate": 0.00025109041533562986, + "loss": 1.9931, + "step": 11637 + }, + { + "epoch": 1.3578345583945863, + "grad_norm": 1.1068814992904663, + "learning_rate": 0.0002510791991215381, + "loss": 2.2149, + "step": 11638 + }, + { + "epoch": 1.357951230894878, + "grad_norm": 1.2906924486160278, + "learning_rate": 0.00025106798187511935, + "loss": 1.9057, + "step": 11639 + }, + { + "epoch": 1.3580679033951697, + "grad_norm": 1.1687582731246948, + "learning_rate": 0.0002510567635964899, + "loss": 2.1624, + "step": 11640 + }, + { + "epoch": 1.3581845758954614, + "grad_norm": 1.1317206621170044, + "learning_rate": 0.0002510455442857661, + "loss": 2.1251, + "step": 11641 + }, + { + "epoch": 1.358301248395753, + "grad_norm": 1.2573068141937256, + "learning_rate": 0.0002510343239430642, + "loss": 2.2056, + "step": 11642 + }, + { + "epoch": 1.3584179208960447, + "grad_norm": 1.1045132875442505, + "learning_rate": 0.0002510231025685005, + "loss": 2.113, + "step": 11643 + }, + { + "epoch": 1.3585345933963364, + "grad_norm": 1.1589807271957397, + "learning_rate": 0.0002510118801621915, + "loss": 2.0419, + "step": 11644 + }, + { + "epoch": 1.358651265896628, + "grad_norm": 1.3553073406219482, + "learning_rate": 0.00025100065672425335, + "loss": 1.8556, + "step": 11645 + }, + { + "epoch": 1.3587679383969198, + "grad_norm": 1.1679097414016724, + "learning_rate": 0.0002509894322548025, + "loss": 2.0555, + "step": 11646 + }, + { + "epoch": 1.3588846108972115, + "grad_norm": 1.1612354516983032, + "learning_rate": 0.0002509782067539553, + "loss": 2.004, + "step": 11647 + }, + { + "epoch": 1.3590012833975031, + "grad_norm": 1.1696962118148804, + "learning_rate": 0.000250966980221828, + "loss": 2.0055, + "step": 11648 + }, + { + "epoch": 1.3591179558977948, + "grad_norm": 1.0908901691436768, + "learning_rate": 0.0002509557526585372, + "loss": 1.9458, + "step": 11649 + }, + { + "epoch": 1.3592346283980865, + "grad_norm": 1.2163679599761963, + "learning_rate": 0.00025094452406419917, + "loss": 2.0728, + "step": 11650 + }, + { + "epoch": 1.3593513008983782, + "grad_norm": 1.1475164890289307, + "learning_rate": 0.00025093329443893033, + "loss": 1.9842, + "step": 11651 + }, + { + "epoch": 1.3594679733986699, + "grad_norm": 1.208106517791748, + "learning_rate": 0.0002509220637828471, + "loss": 2.1424, + "step": 11652 + }, + { + "epoch": 1.3595846458989616, + "grad_norm": 1.1756832599639893, + "learning_rate": 0.0002509108320960659, + "loss": 2.1439, + "step": 11653 + }, + { + "epoch": 1.3597013183992532, + "grad_norm": 1.0618581771850586, + "learning_rate": 0.0002508995993787032, + "loss": 2.0478, + "step": 11654 + }, + { + "epoch": 1.359817990899545, + "grad_norm": 1.1119917631149292, + "learning_rate": 0.0002508883656308754, + "loss": 2.1544, + "step": 11655 + }, + { + "epoch": 1.3599346633998366, + "grad_norm": 1.3480467796325684, + "learning_rate": 0.000250877130852699, + "loss": 2.2411, + "step": 11656 + }, + { + "epoch": 1.3600513359001283, + "grad_norm": 1.108888864517212, + "learning_rate": 0.00025086589504429044, + "loss": 1.9836, + "step": 11657 + }, + { + "epoch": 1.36016800840042, + "grad_norm": 1.3492581844329834, + "learning_rate": 0.0002508546582057662, + "loss": 2.0902, + "step": 11658 + }, + { + "epoch": 1.3602846809007116, + "grad_norm": 1.1208468675613403, + "learning_rate": 0.0002508434203372428, + "loss": 1.9935, + "step": 11659 + }, + { + "epoch": 1.3604013534010033, + "grad_norm": 1.1660828590393066, + "learning_rate": 0.0002508321814388367, + "loss": 2.0975, + "step": 11660 + }, + { + "epoch": 1.360518025901295, + "grad_norm": 1.1106595993041992, + "learning_rate": 0.00025082094151066453, + "loss": 2.0791, + "step": 11661 + }, + { + "epoch": 1.3606346984015867, + "grad_norm": 1.2085375785827637, + "learning_rate": 0.0002508097005528426, + "loss": 2.0038, + "step": 11662 + }, + { + "epoch": 1.3607513709018784, + "grad_norm": 1.2576649188995361, + "learning_rate": 0.00025079845856548766, + "loss": 2.1946, + "step": 11663 + }, + { + "epoch": 1.36086804340217, + "grad_norm": 1.3220601081848145, + "learning_rate": 0.00025078721554871607, + "loss": 2.2174, + "step": 11664 + }, + { + "epoch": 1.3609847159024617, + "grad_norm": 1.1976063251495361, + "learning_rate": 0.00025077597150264456, + "loss": 1.7035, + "step": 11665 + }, + { + "epoch": 1.3611013884027534, + "grad_norm": 1.2107343673706055, + "learning_rate": 0.0002507647264273896, + "loss": 2.0931, + "step": 11666 + }, + { + "epoch": 1.361218060903045, + "grad_norm": 1.2607638835906982, + "learning_rate": 0.0002507534803230677, + "loss": 2.2498, + "step": 11667 + }, + { + "epoch": 1.3613347334033368, + "grad_norm": 1.4405553340911865, + "learning_rate": 0.0002507422331897956, + "loss": 2.1402, + "step": 11668 + }, + { + "epoch": 1.3614514059036285, + "grad_norm": 1.1588486433029175, + "learning_rate": 0.0002507309850276898, + "loss": 2.1643, + "step": 11669 + }, + { + "epoch": 1.3615680784039201, + "grad_norm": 1.213918924331665, + "learning_rate": 0.00025071973583686697, + "loss": 2.0831, + "step": 11670 + }, + { + "epoch": 1.3616847509042118, + "grad_norm": 1.2850054502487183, + "learning_rate": 0.0002507084856174436, + "loss": 2.1728, + "step": 11671 + }, + { + "epoch": 1.3618014234045035, + "grad_norm": 1.1596508026123047, + "learning_rate": 0.0002506972343695365, + "loss": 2.102, + "step": 11672 + }, + { + "epoch": 1.3619180959047952, + "grad_norm": 1.0873271226882935, + "learning_rate": 0.0002506859820932622, + "loss": 2.009, + "step": 11673 + }, + { + "epoch": 1.3620347684050869, + "grad_norm": 1.0881589651107788, + "learning_rate": 0.00025067472878873736, + "loss": 2.0578, + "step": 11674 + }, + { + "epoch": 1.3621514409053785, + "grad_norm": 1.1202292442321777, + "learning_rate": 0.0002506634744560786, + "loss": 2.0245, + "step": 11675 + }, + { + "epoch": 1.3622681134056702, + "grad_norm": 1.0721198320388794, + "learning_rate": 0.0002506522190954027, + "loss": 1.978, + "step": 11676 + }, + { + "epoch": 1.362384785905962, + "grad_norm": 1.1572209596633911, + "learning_rate": 0.00025064096270682626, + "loss": 2.1655, + "step": 11677 + }, + { + "epoch": 1.3625014584062536, + "grad_norm": 1.1702260971069336, + "learning_rate": 0.00025062970529046605, + "loss": 2.1693, + "step": 11678 + }, + { + "epoch": 1.3626181309065453, + "grad_norm": 1.117017149925232, + "learning_rate": 0.00025061844684643874, + "loss": 1.9465, + "step": 11679 + }, + { + "epoch": 1.362734803406837, + "grad_norm": 1.3498423099517822, + "learning_rate": 0.000250607187374861, + "loss": 2.1031, + "step": 11680 + }, + { + "epoch": 1.3628514759071286, + "grad_norm": 1.231913447380066, + "learning_rate": 0.0002505959268758495, + "loss": 2.1561, + "step": 11681 + }, + { + "epoch": 1.3629681484074203, + "grad_norm": 1.1217464208602905, + "learning_rate": 0.0002505846653495212, + "loss": 2.0832, + "step": 11682 + }, + { + "epoch": 1.363084820907712, + "grad_norm": 0.9700160622596741, + "learning_rate": 0.00025057340279599266, + "loss": 1.9511, + "step": 11683 + }, + { + "epoch": 1.3632014934080037, + "grad_norm": 0.985559344291687, + "learning_rate": 0.00025056213921538067, + "loss": 2.1275, + "step": 11684 + }, + { + "epoch": 1.3633181659082954, + "grad_norm": 1.2643753290176392, + "learning_rate": 0.000250550874607802, + "loss": 2.0197, + "step": 11685 + }, + { + "epoch": 1.363434838408587, + "grad_norm": 1.2601858377456665, + "learning_rate": 0.0002505396089733735, + "loss": 2.3077, + "step": 11686 + }, + { + "epoch": 1.3635515109088787, + "grad_norm": 1.3749504089355469, + "learning_rate": 0.0002505283423122119, + "loss": 2.2008, + "step": 11687 + }, + { + "epoch": 1.3636681834091704, + "grad_norm": 1.3226146697998047, + "learning_rate": 0.00025051707462443403, + "loss": 2.0211, + "step": 11688 + }, + { + "epoch": 1.363784855909462, + "grad_norm": 1.2853344678878784, + "learning_rate": 0.00025050580591015663, + "loss": 2.2326, + "step": 11689 + }, + { + "epoch": 1.3639015284097538, + "grad_norm": 1.1066690683364868, + "learning_rate": 0.0002504945361694966, + "loss": 2.1013, + "step": 11690 + }, + { + "epoch": 1.3640182009100454, + "grad_norm": 1.1854134798049927, + "learning_rate": 0.00025048326540257066, + "loss": 2.1274, + "step": 11691 + }, + { + "epoch": 1.3641348734103371, + "grad_norm": 1.1987957954406738, + "learning_rate": 0.0002504719936094958, + "loss": 2.002, + "step": 11692 + }, + { + "epoch": 1.3642515459106288, + "grad_norm": 1.218513011932373, + "learning_rate": 0.00025046072079038884, + "loss": 2.1271, + "step": 11693 + }, + { + "epoch": 1.3643682184109205, + "grad_norm": 1.400848627090454, + "learning_rate": 0.00025044944694536654, + "loss": 2.1238, + "step": 11694 + }, + { + "epoch": 1.3644848909112122, + "grad_norm": 1.207202672958374, + "learning_rate": 0.00025043817207454587, + "loss": 1.9814, + "step": 11695 + }, + { + "epoch": 1.3646015634115038, + "grad_norm": 1.424712061882019, + "learning_rate": 0.0002504268961780437, + "loss": 2.029, + "step": 11696 + }, + { + "epoch": 1.3647182359117955, + "grad_norm": 1.2445372343063354, + "learning_rate": 0.00025041561925597693, + "loss": 2.0868, + "step": 11697 + }, + { + "epoch": 1.3648349084120872, + "grad_norm": 1.123969554901123, + "learning_rate": 0.0002504043413084625, + "loss": 2.1327, + "step": 11698 + }, + { + "epoch": 1.364951580912379, + "grad_norm": 1.009229063987732, + "learning_rate": 0.0002503930623356172, + "loss": 1.7835, + "step": 11699 + }, + { + "epoch": 1.3650682534126706, + "grad_norm": 1.1170735359191895, + "learning_rate": 0.00025038178233755806, + "loss": 2.137, + "step": 11700 + }, + { + "epoch": 1.3651849259129623, + "grad_norm": 1.1257092952728271, + "learning_rate": 0.000250370501314402, + "loss": 2.1718, + "step": 11701 + }, + { + "epoch": 1.365301598413254, + "grad_norm": 1.0337368249893188, + "learning_rate": 0.0002503592192662659, + "loss": 2.1363, + "step": 11702 + }, + { + "epoch": 1.3654182709135456, + "grad_norm": 1.317059874534607, + "learning_rate": 0.0002503479361932669, + "loss": 2.1665, + "step": 11703 + }, + { + "epoch": 1.3655349434138373, + "grad_norm": 1.1297516822814941, + "learning_rate": 0.0002503366520955218, + "loss": 1.98, + "step": 11704 + }, + { + "epoch": 1.365651615914129, + "grad_norm": 1.3037652969360352, + "learning_rate": 0.00025032536697314764, + "loss": 2.1898, + "step": 11705 + }, + { + "epoch": 1.3657682884144207, + "grad_norm": 1.1057707071304321, + "learning_rate": 0.00025031408082626144, + "loss": 1.9635, + "step": 11706 + }, + { + "epoch": 1.3658849609147123, + "grad_norm": 1.137495756149292, + "learning_rate": 0.00025030279365498016, + "loss": 1.9566, + "step": 11707 + }, + { + "epoch": 1.366001633415004, + "grad_norm": 1.24565851688385, + "learning_rate": 0.0002502915054594208, + "loss": 2.1619, + "step": 11708 + }, + { + "epoch": 1.3661183059152957, + "grad_norm": 1.2587010860443115, + "learning_rate": 0.0002502802162397005, + "loss": 2.2292, + "step": 11709 + }, + { + "epoch": 1.3662349784155874, + "grad_norm": 1.128503680229187, + "learning_rate": 0.0002502689259959361, + "loss": 1.8346, + "step": 11710 + }, + { + "epoch": 1.366351650915879, + "grad_norm": 1.140813946723938, + "learning_rate": 0.0002502576347282448, + "loss": 2.189, + "step": 11711 + }, + { + "epoch": 1.3664683234161707, + "grad_norm": 1.4140292406082153, + "learning_rate": 0.00025024634243674354, + "loss": 2.1422, + "step": 11712 + }, + { + "epoch": 1.3665849959164624, + "grad_norm": 1.1226271390914917, + "learning_rate": 0.0002502350491215495, + "loss": 1.9716, + "step": 11713 + }, + { + "epoch": 1.3667016684167541, + "grad_norm": 1.1138195991516113, + "learning_rate": 0.0002502237547827797, + "loss": 1.9372, + "step": 11714 + }, + { + "epoch": 1.3668183409170458, + "grad_norm": 1.2039055824279785, + "learning_rate": 0.0002502124594205512, + "loss": 2.06, + "step": 11715 + }, + { + "epoch": 1.3669350134173375, + "grad_norm": 1.2444199323654175, + "learning_rate": 0.0002502011630349812, + "loss": 2.054, + "step": 11716 + }, + { + "epoch": 1.3670516859176292, + "grad_norm": 1.259663701057434, + "learning_rate": 0.00025018986562618667, + "loss": 2.1756, + "step": 11717 + }, + { + "epoch": 1.3671683584179208, + "grad_norm": 1.3845881223678589, + "learning_rate": 0.00025017856719428486, + "loss": 2.1397, + "step": 11718 + }, + { + "epoch": 1.3672850309182125, + "grad_norm": 1.2622071504592896, + "learning_rate": 0.0002501672677393928, + "loss": 2.1261, + "step": 11719 + }, + { + "epoch": 1.3674017034185042, + "grad_norm": 1.156272053718567, + "learning_rate": 0.0002501559672616277, + "loss": 2.1194, + "step": 11720 + }, + { + "epoch": 1.3675183759187959, + "grad_norm": 1.287375807762146, + "learning_rate": 0.00025014466576110667, + "loss": 2.1301, + "step": 11721 + }, + { + "epoch": 1.3676350484190876, + "grad_norm": 0.9879615902900696, + "learning_rate": 0.00025013336323794687, + "loss": 1.9209, + "step": 11722 + }, + { + "epoch": 1.3677517209193792, + "grad_norm": 1.1283255815505981, + "learning_rate": 0.00025012205969226543, + "loss": 2.0022, + "step": 11723 + }, + { + "epoch": 1.367868393419671, + "grad_norm": 1.1753475666046143, + "learning_rate": 0.0002501107551241796, + "loss": 2.0524, + "step": 11724 + }, + { + "epoch": 1.3679850659199626, + "grad_norm": 1.1707385778427124, + "learning_rate": 0.0002500994495338067, + "loss": 2.1491, + "step": 11725 + }, + { + "epoch": 1.3681017384202543, + "grad_norm": 1.3070416450500488, + "learning_rate": 0.00025008814292126367, + "loss": 2.3115, + "step": 11726 + }, + { + "epoch": 1.368218410920546, + "grad_norm": 1.1742908954620361, + "learning_rate": 0.00025007683528666786, + "loss": 2.2001, + "step": 11727 + }, + { + "epoch": 1.3683350834208377, + "grad_norm": 1.1364500522613525, + "learning_rate": 0.00025006552663013647, + "loss": 2.0462, + "step": 11728 + }, + { + "epoch": 1.3684517559211293, + "grad_norm": 1.3369520902633667, + "learning_rate": 0.00025005421695178676, + "loss": 2.1518, + "step": 11729 + }, + { + "epoch": 1.368568428421421, + "grad_norm": 1.1459200382232666, + "learning_rate": 0.00025004290625173594, + "loss": 2.0153, + "step": 11730 + }, + { + "epoch": 1.3686851009217127, + "grad_norm": 1.5043485164642334, + "learning_rate": 0.0002500315945301013, + "loss": 2.2137, + "step": 11731 + }, + { + "epoch": 1.3688017734220044, + "grad_norm": 1.3429731130599976, + "learning_rate": 0.00025002028178700014, + "loss": 2.1722, + "step": 11732 + }, + { + "epoch": 1.368918445922296, + "grad_norm": 1.1071912050247192, + "learning_rate": 0.0002500089680225497, + "loss": 1.9242, + "step": 11733 + }, + { + "epoch": 1.3690351184225877, + "grad_norm": 1.0660685300827026, + "learning_rate": 0.0002499976532368672, + "loss": 2.0749, + "step": 11734 + }, + { + "epoch": 1.3691517909228794, + "grad_norm": 0.9942983388900757, + "learning_rate": 0.00024998633743007, + "loss": 1.9054, + "step": 11735 + }, + { + "epoch": 1.369268463423171, + "grad_norm": 1.3788081407546997, + "learning_rate": 0.00024997502060227546, + "loss": 2.1135, + "step": 11736 + }, + { + "epoch": 1.3693851359234628, + "grad_norm": 1.147401213645935, + "learning_rate": 0.0002499637027536009, + "loss": 2.0732, + "step": 11737 + }, + { + "epoch": 1.3695018084237545, + "grad_norm": 1.2565919160842896, + "learning_rate": 0.0002499523838841635, + "loss": 2.1258, + "step": 11738 + }, + { + "epoch": 1.3696184809240461, + "grad_norm": 1.091339349746704, + "learning_rate": 0.00024994106399408076, + "loss": 2.0558, + "step": 11739 + }, + { + "epoch": 1.3697351534243378, + "grad_norm": 1.1642603874206543, + "learning_rate": 0.00024992974308347, + "loss": 2.1269, + "step": 11740 + }, + { + "epoch": 1.3698518259246295, + "grad_norm": 1.0546525716781616, + "learning_rate": 0.0002499184211524485, + "loss": 2.0179, + "step": 11741 + }, + { + "epoch": 1.3699684984249212, + "grad_norm": 1.125800609588623, + "learning_rate": 0.00024990709820113375, + "loss": 1.8992, + "step": 11742 + }, + { + "epoch": 1.3700851709252129, + "grad_norm": 1.2514671087265015, + "learning_rate": 0.00024989577422964305, + "loss": 2.1977, + "step": 11743 + }, + { + "epoch": 1.3702018434255046, + "grad_norm": 1.0900986194610596, + "learning_rate": 0.0002498844492380938, + "loss": 2.0779, + "step": 11744 + }, + { + "epoch": 1.3703185159257962, + "grad_norm": 1.1316697597503662, + "learning_rate": 0.00024987312322660347, + "loss": 2.0086, + "step": 11745 + }, + { + "epoch": 1.370435188426088, + "grad_norm": 1.1584036350250244, + "learning_rate": 0.0002498617961952894, + "loss": 2.093, + "step": 11746 + }, + { + "epoch": 1.3705518609263796, + "grad_norm": 1.2392956018447876, + "learning_rate": 0.0002498504681442691, + "loss": 1.8736, + "step": 11747 + }, + { + "epoch": 1.3706685334266713, + "grad_norm": 1.4379849433898926, + "learning_rate": 0.0002498391390736599, + "loss": 2.3207, + "step": 11748 + }, + { + "epoch": 1.370785205926963, + "grad_norm": 1.2074745893478394, + "learning_rate": 0.00024982780898357934, + "loss": 2.1475, + "step": 11749 + }, + { + "epoch": 1.3709018784272546, + "grad_norm": 1.0245908498764038, + "learning_rate": 0.00024981647787414475, + "loss": 2.0308, + "step": 11750 + }, + { + "epoch": 1.3710185509275463, + "grad_norm": 1.1878714561462402, + "learning_rate": 0.0002498051457454738, + "loss": 2.2044, + "step": 11751 + }, + { + "epoch": 1.371135223427838, + "grad_norm": 1.2408674955368042, + "learning_rate": 0.00024979381259768386, + "loss": 2.1857, + "step": 11752 + }, + { + "epoch": 1.3712518959281297, + "grad_norm": 1.020683765411377, + "learning_rate": 0.0002497824784308924, + "loss": 1.8327, + "step": 11753 + }, + { + "epoch": 1.3713685684284214, + "grad_norm": 1.0456269979476929, + "learning_rate": 0.00024977114324521694, + "loss": 2.0754, + "step": 11754 + }, + { + "epoch": 1.371485240928713, + "grad_norm": 1.1735732555389404, + "learning_rate": 0.0002497598070407749, + "loss": 2.0312, + "step": 11755 + }, + { + "epoch": 1.3716019134290047, + "grad_norm": 1.0193403959274292, + "learning_rate": 0.00024974846981768394, + "loss": 2.0478, + "step": 11756 + }, + { + "epoch": 1.3717185859292964, + "grad_norm": 1.2369426488876343, + "learning_rate": 0.00024973713157606155, + "loss": 1.9709, + "step": 11757 + }, + { + "epoch": 1.371835258429588, + "grad_norm": 1.2196452617645264, + "learning_rate": 0.0002497257923160253, + "loss": 2.1973, + "step": 11758 + }, + { + "epoch": 1.3719519309298798, + "grad_norm": 1.2317025661468506, + "learning_rate": 0.0002497144520376927, + "loss": 1.9919, + "step": 11759 + }, + { + "epoch": 1.3720686034301715, + "grad_norm": 1.1477783918380737, + "learning_rate": 0.0002497031107411813, + "loss": 2.1494, + "step": 11760 + }, + { + "epoch": 1.3721852759304631, + "grad_norm": 1.1215087175369263, + "learning_rate": 0.0002496917684266087, + "loss": 2.174, + "step": 11761 + }, + { + "epoch": 1.3723019484307548, + "grad_norm": 1.2264047861099243, + "learning_rate": 0.0002496804250940925, + "loss": 2.0816, + "step": 11762 + }, + { + "epoch": 1.3724186209310465, + "grad_norm": 1.1524789333343506, + "learning_rate": 0.0002496690807437502, + "loss": 2.091, + "step": 11763 + }, + { + "epoch": 1.3725352934313382, + "grad_norm": 1.0902323722839355, + "learning_rate": 0.0002496577353756996, + "loss": 2.0736, + "step": 11764 + }, + { + "epoch": 1.3726519659316299, + "grad_norm": 1.186883807182312, + "learning_rate": 0.00024964638899005806, + "loss": 2.051, + "step": 11765 + }, + { + "epoch": 1.3727686384319215, + "grad_norm": 1.1940586566925049, + "learning_rate": 0.0002496350415869434, + "loss": 2.1839, + "step": 11766 + }, + { + "epoch": 1.3728853109322132, + "grad_norm": 1.1146291494369507, + "learning_rate": 0.00024962369316647325, + "loss": 2.0118, + "step": 11767 + }, + { + "epoch": 1.373001983432505, + "grad_norm": 1.0707319974899292, + "learning_rate": 0.00024961234372876514, + "loss": 2.0084, + "step": 11768 + }, + { + "epoch": 1.3731186559327966, + "grad_norm": 1.3973701000213623, + "learning_rate": 0.0002496009932739368, + "loss": 2.0593, + "step": 11769 + }, + { + "epoch": 1.3732353284330883, + "grad_norm": 1.2547760009765625, + "learning_rate": 0.00024958964180210597, + "loss": 2.1782, + "step": 11770 + }, + { + "epoch": 1.37335200093338, + "grad_norm": 1.3018684387207031, + "learning_rate": 0.00024957828931339017, + "loss": 2.1533, + "step": 11771 + }, + { + "epoch": 1.3734686734336716, + "grad_norm": 1.1711093187332153, + "learning_rate": 0.0002495669358079072, + "loss": 2.2069, + "step": 11772 + }, + { + "epoch": 1.3735853459339633, + "grad_norm": 1.1760730743408203, + "learning_rate": 0.00024955558128577477, + "loss": 2.1266, + "step": 11773 + }, + { + "epoch": 1.373702018434255, + "grad_norm": 1.0901604890823364, + "learning_rate": 0.0002495442257471105, + "loss": 2.0296, + "step": 11774 + }, + { + "epoch": 1.3738186909345467, + "grad_norm": 1.178957462310791, + "learning_rate": 0.0002495328691920322, + "loss": 2.0458, + "step": 11775 + }, + { + "epoch": 1.3739353634348384, + "grad_norm": 1.1157586574554443, + "learning_rate": 0.00024952151162065757, + "loss": 2.1459, + "step": 11776 + }, + { + "epoch": 1.37405203593513, + "grad_norm": 1.2659289836883545, + "learning_rate": 0.00024951015303310435, + "loss": 2.0189, + "step": 11777 + }, + { + "epoch": 1.3741687084354217, + "grad_norm": 1.3178330659866333, + "learning_rate": 0.0002494987934294903, + "loss": 2.1436, + "step": 11778 + }, + { + "epoch": 1.3742853809357134, + "grad_norm": 1.0952363014221191, + "learning_rate": 0.00024948743280993313, + "loss": 1.8913, + "step": 11779 + }, + { + "epoch": 1.374402053436005, + "grad_norm": 1.09627103805542, + "learning_rate": 0.00024947607117455073, + "loss": 1.9602, + "step": 11780 + }, + { + "epoch": 1.3745187259362968, + "grad_norm": 1.19074547290802, + "learning_rate": 0.0002494647085234608, + "loss": 2.011, + "step": 11781 + }, + { + "epoch": 1.3746353984365884, + "grad_norm": 1.0701112747192383, + "learning_rate": 0.00024945334485678116, + "loss": 1.9391, + "step": 11782 + }, + { + "epoch": 1.3747520709368801, + "grad_norm": 1.2547965049743652, + "learning_rate": 0.0002494419801746296, + "loss": 2.0649, + "step": 11783 + }, + { + "epoch": 1.3748687434371718, + "grad_norm": 1.180050015449524, + "learning_rate": 0.00024943061447712394, + "loss": 2.0865, + "step": 11784 + }, + { + "epoch": 1.3749854159374635, + "grad_norm": 1.1272549629211426, + "learning_rate": 0.000249419247764382, + "loss": 2.1193, + "step": 11785 + }, + { + "epoch": 1.3751020884377552, + "grad_norm": 1.044912576675415, + "learning_rate": 0.00024940788003652173, + "loss": 1.9964, + "step": 11786 + }, + { + "epoch": 1.3752187609380468, + "grad_norm": 1.4109958410263062, + "learning_rate": 0.0002493965112936608, + "loss": 2.2907, + "step": 11787 + }, + { + "epoch": 1.3753354334383385, + "grad_norm": 1.1465826034545898, + "learning_rate": 0.0002493851415359171, + "loss": 2.0092, + "step": 11788 + }, + { + "epoch": 1.3754521059386302, + "grad_norm": 1.1979568004608154, + "learning_rate": 0.00024937377076340866, + "loss": 1.9608, + "step": 11789 + }, + { + "epoch": 1.375568778438922, + "grad_norm": 1.147674560546875, + "learning_rate": 0.00024936239897625323, + "loss": 1.9786, + "step": 11790 + }, + { + "epoch": 1.3756854509392136, + "grad_norm": 1.2935450077056885, + "learning_rate": 0.0002493510261745686, + "loss": 2.09, + "step": 11791 + }, + { + "epoch": 1.3758021234395053, + "grad_norm": 1.0543503761291504, + "learning_rate": 0.0002493396523584729, + "loss": 2.1201, + "step": 11792 + }, + { + "epoch": 1.375918795939797, + "grad_norm": 1.099236011505127, + "learning_rate": 0.00024932827752808384, + "loss": 2.0049, + "step": 11793 + }, + { + "epoch": 1.3760354684400886, + "grad_norm": 1.1412131786346436, + "learning_rate": 0.00024931690168351954, + "loss": 2.0823, + "step": 11794 + }, + { + "epoch": 1.3761521409403803, + "grad_norm": 0.9233871102333069, + "learning_rate": 0.00024930552482489775, + "loss": 1.6035, + "step": 11795 + }, + { + "epoch": 1.376268813440672, + "grad_norm": 1.3711684942245483, + "learning_rate": 0.0002492941469523365, + "loss": 2.1919, + "step": 11796 + }, + { + "epoch": 1.3763854859409637, + "grad_norm": 1.1071151494979858, + "learning_rate": 0.0002492827680659537, + "loss": 1.9754, + "step": 11797 + }, + { + "epoch": 1.3765021584412553, + "grad_norm": 1.2911014556884766, + "learning_rate": 0.0002492713881658674, + "loss": 2.0771, + "step": 11798 + }, + { + "epoch": 1.376618830941547, + "grad_norm": 1.1619772911071777, + "learning_rate": 0.0002492600072521955, + "loss": 2.205, + "step": 11799 + }, + { + "epoch": 1.3767355034418387, + "grad_norm": 1.310835361480713, + "learning_rate": 0.000249248625325056, + "loss": 2.1901, + "step": 11800 + }, + { + "epoch": 1.3768521759421304, + "grad_norm": 1.303938388824463, + "learning_rate": 0.00024923724238456687, + "loss": 2.0814, + "step": 11801 + }, + { + "epoch": 1.376968848442422, + "grad_norm": 1.333976149559021, + "learning_rate": 0.00024922585843084615, + "loss": 1.8683, + "step": 11802 + }, + { + "epoch": 1.3770855209427137, + "grad_norm": 1.3268566131591797, + "learning_rate": 0.0002492144734640119, + "loss": 2.2107, + "step": 11803 + }, + { + "epoch": 1.3772021934430054, + "grad_norm": 1.1954946517944336, + "learning_rate": 0.00024920308748418204, + "loss": 1.9098, + "step": 11804 + }, + { + "epoch": 1.377318865943297, + "grad_norm": 1.2516347169876099, + "learning_rate": 0.0002491917004914747, + "loss": 2.0559, + "step": 11805 + }, + { + "epoch": 1.3774355384435888, + "grad_norm": 1.1870334148406982, + "learning_rate": 0.0002491803124860078, + "loss": 1.9328, + "step": 11806 + }, + { + "epoch": 1.3775522109438805, + "grad_norm": 1.1974689960479736, + "learning_rate": 0.00024916892346789956, + "loss": 2.0204, + "step": 11807 + }, + { + "epoch": 1.3776688834441722, + "grad_norm": 1.0781514644622803, + "learning_rate": 0.00024915753343726795, + "loss": 2.0688, + "step": 11808 + }, + { + "epoch": 1.3777855559444638, + "grad_norm": 1.22158682346344, + "learning_rate": 0.0002491461423942311, + "loss": 2.0108, + "step": 11809 + }, + { + "epoch": 1.3779022284447555, + "grad_norm": 1.193375825881958, + "learning_rate": 0.0002491347503389071, + "loss": 2.0356, + "step": 11810 + }, + { + "epoch": 1.3780189009450472, + "grad_norm": 1.194874882698059, + "learning_rate": 0.00024912335727141397, + "loss": 2.098, + "step": 11811 + }, + { + "epoch": 1.3781355734453389, + "grad_norm": 1.172039270401001, + "learning_rate": 0.0002491119631918699, + "loss": 2.2032, + "step": 11812 + }, + { + "epoch": 1.3782522459456306, + "grad_norm": 1.1392221450805664, + "learning_rate": 0.000249100568100393, + "loss": 2.0973, + "step": 11813 + }, + { + "epoch": 1.3783689184459222, + "grad_norm": 1.059020757675171, + "learning_rate": 0.00024908917199710143, + "loss": 2.1222, + "step": 11814 + }, + { + "epoch": 1.378485590946214, + "grad_norm": 1.029970645904541, + "learning_rate": 0.00024907777488211323, + "loss": 1.8996, + "step": 11815 + }, + { + "epoch": 1.3786022634465056, + "grad_norm": 1.2269816398620605, + "learning_rate": 0.0002490663767555466, + "loss": 2.2061, + "step": 11816 + }, + { + "epoch": 1.3787189359467973, + "grad_norm": 1.403018832206726, + "learning_rate": 0.0002490549776175198, + "loss": 2.2591, + "step": 11817 + }, + { + "epoch": 1.378835608447089, + "grad_norm": 1.1470885276794434, + "learning_rate": 0.00024904357746815083, + "loss": 2.0409, + "step": 11818 + }, + { + "epoch": 1.3789522809473806, + "grad_norm": 1.2080891132354736, + "learning_rate": 0.000249032176307558, + "loss": 2.05, + "step": 11819 + }, + { + "epoch": 1.3790689534476723, + "grad_norm": 1.596090316772461, + "learning_rate": 0.0002490207741358595, + "loss": 2.0086, + "step": 11820 + }, + { + "epoch": 1.379185625947964, + "grad_norm": 1.3470135927200317, + "learning_rate": 0.0002490093709531734, + "loss": 2.2622, + "step": 11821 + }, + { + "epoch": 1.3793022984482557, + "grad_norm": 1.1449663639068604, + "learning_rate": 0.00024899796675961815, + "loss": 1.937, + "step": 11822 + }, + { + "epoch": 1.3794189709485474, + "grad_norm": 1.133290410041809, + "learning_rate": 0.0002489865615553118, + "loss": 2.0011, + "step": 11823 + }, + { + "epoch": 1.379535643448839, + "grad_norm": 0.8730303645133972, + "learning_rate": 0.00024897515534037264, + "loss": 1.8525, + "step": 11824 + }, + { + "epoch": 1.3796523159491307, + "grad_norm": 1.5840728282928467, + "learning_rate": 0.0002489637481149189, + "loss": 2.1736, + "step": 11825 + }, + { + "epoch": 1.3797689884494224, + "grad_norm": 1.1385284662246704, + "learning_rate": 0.00024895233987906883, + "loss": 2.0156, + "step": 11826 + }, + { + "epoch": 1.379885660949714, + "grad_norm": 1.0790013074874878, + "learning_rate": 0.0002489409306329408, + "loss": 2.1023, + "step": 11827 + }, + { + "epoch": 1.3800023334500058, + "grad_norm": 1.1488512754440308, + "learning_rate": 0.0002489295203766529, + "loss": 2.0829, + "step": 11828 + }, + { + "epoch": 1.3801190059502975, + "grad_norm": 1.2147241830825806, + "learning_rate": 0.0002489181091103235, + "loss": 2.2156, + "step": 11829 + }, + { + "epoch": 1.3802356784505891, + "grad_norm": 1.2221676111221313, + "learning_rate": 0.00024890669683407104, + "loss": 2.222, + "step": 11830 + }, + { + "epoch": 1.3803523509508808, + "grad_norm": 1.1559886932373047, + "learning_rate": 0.0002488952835480137, + "loss": 1.9473, + "step": 11831 + }, + { + "epoch": 1.3804690234511725, + "grad_norm": 1.0475555658340454, + "learning_rate": 0.00024888386925226976, + "loss": 2.03, + "step": 11832 + }, + { + "epoch": 1.3805856959514642, + "grad_norm": 1.1133722066879272, + "learning_rate": 0.0002488724539469576, + "loss": 2.1137, + "step": 11833 + }, + { + "epoch": 1.3807023684517559, + "grad_norm": 1.1661226749420166, + "learning_rate": 0.00024886103763219555, + "loss": 2.1005, + "step": 11834 + }, + { + "epoch": 1.3808190409520475, + "grad_norm": 1.142117977142334, + "learning_rate": 0.000248849620308102, + "loss": 1.976, + "step": 11835 + }, + { + "epoch": 1.3809357134523392, + "grad_norm": 1.3053019046783447, + "learning_rate": 0.0002488382019747953, + "loss": 2.3023, + "step": 11836 + }, + { + "epoch": 1.381052385952631, + "grad_norm": 1.4525710344314575, + "learning_rate": 0.0002488267826323938, + "loss": 2.4555, + "step": 11837 + }, + { + "epoch": 1.3811690584529226, + "grad_norm": 1.259462833404541, + "learning_rate": 0.00024881536228101595, + "loss": 2.0885, + "step": 11838 + }, + { + "epoch": 1.3812857309532143, + "grad_norm": 1.5916857719421387, + "learning_rate": 0.00024880394092078, + "loss": 2.1107, + "step": 11839 + }, + { + "epoch": 1.381402403453506, + "grad_norm": 1.1564645767211914, + "learning_rate": 0.00024879251855180454, + "loss": 1.9926, + "step": 11840 + }, + { + "epoch": 1.3815190759537976, + "grad_norm": 1.0278513431549072, + "learning_rate": 0.0002487810951742078, + "loss": 1.922, + "step": 11841 + }, + { + "epoch": 1.3816357484540893, + "grad_norm": 1.1021398305892944, + "learning_rate": 0.00024876967078810836, + "loss": 1.9997, + "step": 11842 + }, + { + "epoch": 1.381752420954381, + "grad_norm": 1.3243502378463745, + "learning_rate": 0.0002487582453936245, + "loss": 2.3012, + "step": 11843 + }, + { + "epoch": 1.3818690934546727, + "grad_norm": 1.34896719455719, + "learning_rate": 0.00024874681899087484, + "loss": 2.0545, + "step": 11844 + }, + { + "epoch": 1.3819857659549644, + "grad_norm": 1.1024302244186401, + "learning_rate": 0.0002487353915799777, + "loss": 2.227, + "step": 11845 + }, + { + "epoch": 1.382102438455256, + "grad_norm": 1.2385494709014893, + "learning_rate": 0.0002487239631610516, + "loss": 2.1167, + "step": 11846 + }, + { + "epoch": 1.3822191109555477, + "grad_norm": 1.198029637336731, + "learning_rate": 0.00024871253373421503, + "loss": 2.0702, + "step": 11847 + }, + { + "epoch": 1.3823357834558394, + "grad_norm": 1.115834355354309, + "learning_rate": 0.0002487011032995864, + "loss": 2.106, + "step": 11848 + }, + { + "epoch": 1.382452455956131, + "grad_norm": 1.2102662324905396, + "learning_rate": 0.00024868967185728434, + "loss": 2.1022, + "step": 11849 + }, + { + "epoch": 1.3825691284564228, + "grad_norm": 1.230945110321045, + "learning_rate": 0.0002486782394074273, + "loss": 2.1584, + "step": 11850 + }, + { + "epoch": 1.3826858009567145, + "grad_norm": 0.9654670357704163, + "learning_rate": 0.00024866680595013376, + "loss": 1.9416, + "step": 11851 + }, + { + "epoch": 1.3828024734570061, + "grad_norm": 1.105645775794983, + "learning_rate": 0.00024865537148552227, + "loss": 2.1746, + "step": 11852 + }, + { + "epoch": 1.3829191459572978, + "grad_norm": 1.1804959774017334, + "learning_rate": 0.0002486439360137113, + "loss": 2.1539, + "step": 11853 + }, + { + "epoch": 1.3830358184575895, + "grad_norm": 1.1852607727050781, + "learning_rate": 0.0002486324995348196, + "loss": 2.0518, + "step": 11854 + }, + { + "epoch": 1.3831524909578812, + "grad_norm": 1.1406729221343994, + "learning_rate": 0.00024862106204896555, + "loss": 1.9878, + "step": 11855 + }, + { + "epoch": 1.3832691634581729, + "grad_norm": 1.2361586093902588, + "learning_rate": 0.00024860962355626777, + "loss": 2.1319, + "step": 11856 + }, + { + "epoch": 1.3833858359584645, + "grad_norm": 1.3213911056518555, + "learning_rate": 0.0002485981840568449, + "loss": 2.0632, + "step": 11857 + }, + { + "epoch": 1.3835025084587562, + "grad_norm": 1.2363998889923096, + "learning_rate": 0.00024858674355081545, + "loss": 2.102, + "step": 11858 + }, + { + "epoch": 1.383619180959048, + "grad_norm": 1.324581503868103, + "learning_rate": 0.00024857530203829803, + "loss": 2.2581, + "step": 11859 + }, + { + "epoch": 1.3837358534593396, + "grad_norm": 1.2207989692687988, + "learning_rate": 0.0002485638595194113, + "loss": 1.9227, + "step": 11860 + }, + { + "epoch": 1.3838525259596313, + "grad_norm": 1.1319159269332886, + "learning_rate": 0.00024855241599427383, + "loss": 2.0346, + "step": 11861 + }, + { + "epoch": 1.383969198459923, + "grad_norm": 1.062819004058838, + "learning_rate": 0.00024854097146300433, + "loss": 2.0024, + "step": 11862 + }, + { + "epoch": 1.3840858709602146, + "grad_norm": 1.1542401313781738, + "learning_rate": 0.00024852952592572137, + "loss": 2.0867, + "step": 11863 + }, + { + "epoch": 1.3842025434605063, + "grad_norm": 1.14168381690979, + "learning_rate": 0.0002485180793825437, + "loss": 2.2549, + "step": 11864 + }, + { + "epoch": 1.384319215960798, + "grad_norm": 1.2976667881011963, + "learning_rate": 0.00024850663183358985, + "loss": 2.1284, + "step": 11865 + }, + { + "epoch": 1.3844358884610897, + "grad_norm": 1.3306139707565308, + "learning_rate": 0.0002484951832789786, + "loss": 2.066, + "step": 11866 + }, + { + "epoch": 1.3845525609613814, + "grad_norm": 1.2303260564804077, + "learning_rate": 0.00024848373371882856, + "loss": 2.1224, + "step": 11867 + }, + { + "epoch": 1.384669233461673, + "grad_norm": 1.0685031414031982, + "learning_rate": 0.00024847228315325845, + "loss": 1.8531, + "step": 11868 + }, + { + "epoch": 1.3847859059619647, + "grad_norm": 1.0548099279403687, + "learning_rate": 0.00024846083158238703, + "loss": 1.8473, + "step": 11869 + }, + { + "epoch": 1.3849025784622564, + "grad_norm": 1.108578085899353, + "learning_rate": 0.000248449379006333, + "loss": 2.1236, + "step": 11870 + }, + { + "epoch": 1.385019250962548, + "grad_norm": 1.112761378288269, + "learning_rate": 0.0002484379254252151, + "loss": 2.0887, + "step": 11871 + }, + { + "epoch": 1.3851359234628398, + "grad_norm": 1.056383490562439, + "learning_rate": 0.00024842647083915195, + "loss": 1.96, + "step": 11872 + }, + { + "epoch": 1.3852525959631314, + "grad_norm": 1.12296724319458, + "learning_rate": 0.0002484150152482624, + "loss": 1.9811, + "step": 11873 + }, + { + "epoch": 1.3853692684634231, + "grad_norm": 1.1475988626480103, + "learning_rate": 0.00024840355865266516, + "loss": 2.1647, + "step": 11874 + }, + { + "epoch": 1.3854859409637148, + "grad_norm": 1.2220900058746338, + "learning_rate": 0.0002483921010524791, + "loss": 1.9932, + "step": 11875 + }, + { + "epoch": 1.3856026134640065, + "grad_norm": 1.185305118560791, + "learning_rate": 0.00024838064244782294, + "loss": 2.2265, + "step": 11876 + }, + { + "epoch": 1.3857192859642982, + "grad_norm": 1.1124166250228882, + "learning_rate": 0.00024836918283881545, + "loss": 2.1732, + "step": 11877 + }, + { + "epoch": 1.3858359584645898, + "grad_norm": 1.35872483253479, + "learning_rate": 0.00024835772222557545, + "loss": 2.2791, + "step": 11878 + }, + { + "epoch": 1.3859526309648815, + "grad_norm": 1.1533513069152832, + "learning_rate": 0.00024834626060822167, + "loss": 1.9261, + "step": 11879 + }, + { + "epoch": 1.3860693034651732, + "grad_norm": 1.29282546043396, + "learning_rate": 0.0002483347979868731, + "loss": 2.1782, + "step": 11880 + }, + { + "epoch": 1.386185975965465, + "grad_norm": 1.1479151248931885, + "learning_rate": 0.00024832333436164845, + "loss": 2.0942, + "step": 11881 + }, + { + "epoch": 1.3863026484657566, + "grad_norm": 1.1222385168075562, + "learning_rate": 0.00024831186973266656, + "loss": 2.1535, + "step": 11882 + }, + { + "epoch": 1.3864193209660483, + "grad_norm": 1.4276634454727173, + "learning_rate": 0.0002483004041000463, + "loss": 2.0112, + "step": 11883 + }, + { + "epoch": 1.38653599346634, + "grad_norm": 1.1764936447143555, + "learning_rate": 0.0002482889374639066, + "loss": 1.9972, + "step": 11884 + }, + { + "epoch": 1.3866526659666316, + "grad_norm": 1.0202018022537231, + "learning_rate": 0.00024827746982436624, + "loss": 1.9912, + "step": 11885 + }, + { + "epoch": 1.3867693384669233, + "grad_norm": 1.2075607776641846, + "learning_rate": 0.00024826600118154415, + "loss": 2.1658, + "step": 11886 + }, + { + "epoch": 1.386886010967215, + "grad_norm": 1.4772284030914307, + "learning_rate": 0.0002482545315355592, + "loss": 2.1827, + "step": 11887 + }, + { + "epoch": 1.3870026834675067, + "grad_norm": 1.319198489189148, + "learning_rate": 0.00024824306088653034, + "loss": 1.9954, + "step": 11888 + }, + { + "epoch": 1.3871193559677983, + "grad_norm": 1.1825438737869263, + "learning_rate": 0.00024823158923457646, + "loss": 2.1297, + "step": 11889 + }, + { + "epoch": 1.38723602846809, + "grad_norm": 1.112741470336914, + "learning_rate": 0.0002482201165798165, + "loss": 2.0096, + "step": 11890 + }, + { + "epoch": 1.3873527009683817, + "grad_norm": 1.1613199710845947, + "learning_rate": 0.0002482086429223693, + "loss": 2.0579, + "step": 11891 + }, + { + "epoch": 1.3874693734686734, + "grad_norm": 1.08023202419281, + "learning_rate": 0.00024819716826235394, + "loss": 2.1509, + "step": 11892 + }, + { + "epoch": 1.387586045968965, + "grad_norm": 1.135652780532837, + "learning_rate": 0.0002481856925998893, + "loss": 1.8638, + "step": 11893 + }, + { + "epoch": 1.3877027184692567, + "grad_norm": 1.2933043241500854, + "learning_rate": 0.0002481742159350944, + "loss": 1.8346, + "step": 11894 + }, + { + "epoch": 1.3878193909695484, + "grad_norm": 1.0466690063476562, + "learning_rate": 0.00024816273826808817, + "loss": 1.9577, + "step": 11895 + }, + { + "epoch": 1.38793606346984, + "grad_norm": 0.9961144328117371, + "learning_rate": 0.00024815125959898955, + "loss": 1.8953, + "step": 11896 + }, + { + "epoch": 1.3880527359701318, + "grad_norm": 1.201258897781372, + "learning_rate": 0.0002481397799279177, + "loss": 2.1039, + "step": 11897 + }, + { + "epoch": 1.3881694084704235, + "grad_norm": 1.2976369857788086, + "learning_rate": 0.00024812829925499147, + "loss": 2.1695, + "step": 11898 + }, + { + "epoch": 1.3882860809707152, + "grad_norm": 1.012108564376831, + "learning_rate": 0.00024811681758032993, + "loss": 1.9869, + "step": 11899 + }, + { + "epoch": 1.3884027534710068, + "grad_norm": 1.1808825731277466, + "learning_rate": 0.0002481053349040521, + "loss": 2.3211, + "step": 11900 + }, + { + "epoch": 1.3885194259712985, + "grad_norm": 1.1947141885757446, + "learning_rate": 0.0002480938512262771, + "loss": 2.1706, + "step": 11901 + }, + { + "epoch": 1.3886360984715902, + "grad_norm": 0.9746079444885254, + "learning_rate": 0.0002480823665471239, + "loss": 1.965, + "step": 11902 + }, + { + "epoch": 1.3887527709718819, + "grad_norm": 1.1628516912460327, + "learning_rate": 0.0002480708808667116, + "loss": 1.9532, + "step": 11903 + }, + { + "epoch": 1.3888694434721736, + "grad_norm": 1.2148234844207764, + "learning_rate": 0.00024805939418515914, + "loss": 2.2815, + "step": 11904 + }, + { + "epoch": 1.3889861159724652, + "grad_norm": 1.3651282787322998, + "learning_rate": 0.0002480479065025858, + "loss": 2.1795, + "step": 11905 + }, + { + "epoch": 1.389102788472757, + "grad_norm": 1.1826273202896118, + "learning_rate": 0.00024803641781911056, + "loss": 1.8785, + "step": 11906 + }, + { + "epoch": 1.3892194609730486, + "grad_norm": 1.2002450227737427, + "learning_rate": 0.00024802492813485257, + "loss": 2.0209, + "step": 11907 + }, + { + "epoch": 1.3893361334733403, + "grad_norm": 1.3476576805114746, + "learning_rate": 0.0002480134374499309, + "loss": 2.3537, + "step": 11908 + }, + { + "epoch": 1.389452805973632, + "grad_norm": 1.6427178382873535, + "learning_rate": 0.00024800194576446465, + "loss": 2.2791, + "step": 11909 + }, + { + "epoch": 1.3895694784739236, + "grad_norm": 1.3483977317810059, + "learning_rate": 0.00024799045307857305, + "loss": 2.1777, + "step": 11910 + }, + { + "epoch": 1.3896861509742153, + "grad_norm": 1.1392704248428345, + "learning_rate": 0.00024797895939237516, + "loss": 1.9781, + "step": 11911 + }, + { + "epoch": 1.389802823474507, + "grad_norm": 1.1806615591049194, + "learning_rate": 0.00024796746470599014, + "loss": 1.9899, + "step": 11912 + }, + { + "epoch": 1.3899194959747987, + "grad_norm": 0.9893964529037476, + "learning_rate": 0.00024795596901953714, + "loss": 2.1016, + "step": 11913 + }, + { + "epoch": 1.3900361684750904, + "grad_norm": 1.152222990989685, + "learning_rate": 0.00024794447233313544, + "loss": 2.0155, + "step": 11914 + }, + { + "epoch": 1.390152840975382, + "grad_norm": 1.2550709247589111, + "learning_rate": 0.00024793297464690406, + "loss": 2.1619, + "step": 11915 + }, + { + "epoch": 1.3902695134756737, + "grad_norm": 1.0755174160003662, + "learning_rate": 0.0002479214759609624, + "loss": 2.0906, + "step": 11916 + }, + { + "epoch": 1.3903861859759654, + "grad_norm": 1.1379153728485107, + "learning_rate": 0.00024790997627542946, + "loss": 2.0197, + "step": 11917 + }, + { + "epoch": 1.390502858476257, + "grad_norm": 1.1873031854629517, + "learning_rate": 0.0002478984755904246, + "loss": 2.0348, + "step": 11918 + }, + { + "epoch": 1.3906195309765488, + "grad_norm": 1.3119179010391235, + "learning_rate": 0.000247886973906067, + "loss": 2.0759, + "step": 11919 + }, + { + "epoch": 1.3907362034768405, + "grad_norm": 1.3189806938171387, + "learning_rate": 0.0002478754712224759, + "loss": 2.299, + "step": 11920 + }, + { + "epoch": 1.3908528759771321, + "grad_norm": 1.2068719863891602, + "learning_rate": 0.0002478639675397705, + "loss": 2.1645, + "step": 11921 + }, + { + "epoch": 1.3909695484774238, + "grad_norm": 1.1814886331558228, + "learning_rate": 0.00024785246285807013, + "loss": 2.0853, + "step": 11922 + }, + { + "epoch": 1.3910862209777155, + "grad_norm": 1.3764264583587646, + "learning_rate": 0.00024784095717749404, + "loss": 2.0592, + "step": 11923 + }, + { + "epoch": 1.3912028934780072, + "grad_norm": 1.020490050315857, + "learning_rate": 0.0002478294504981614, + "loss": 1.9881, + "step": 11924 + }, + { + "epoch": 1.3913195659782989, + "grad_norm": 1.1038157939910889, + "learning_rate": 0.00024781794282019166, + "loss": 2.1242, + "step": 11925 + }, + { + "epoch": 1.3914362384785905, + "grad_norm": 1.2260197401046753, + "learning_rate": 0.00024780643414370405, + "loss": 2.1496, + "step": 11926 + }, + { + "epoch": 1.3915529109788822, + "grad_norm": 1.3028610944747925, + "learning_rate": 0.00024779492446881787, + "loss": 2.1546, + "step": 11927 + }, + { + "epoch": 1.391669583479174, + "grad_norm": 1.0427124500274658, + "learning_rate": 0.0002477834137956525, + "loss": 1.8744, + "step": 11928 + }, + { + "epoch": 1.3917862559794656, + "grad_norm": 1.1550172567367554, + "learning_rate": 0.0002477719021243271, + "loss": 2.0915, + "step": 11929 + }, + { + "epoch": 1.3919029284797573, + "grad_norm": 1.1274863481521606, + "learning_rate": 0.0002477603894549612, + "loss": 2.0486, + "step": 11930 + }, + { + "epoch": 1.392019600980049, + "grad_norm": 1.2618677616119385, + "learning_rate": 0.00024774887578767404, + "loss": 1.9337, + "step": 11931 + }, + { + "epoch": 1.3921362734803406, + "grad_norm": 1.169382929801941, + "learning_rate": 0.0002477373611225851, + "loss": 2.1646, + "step": 11932 + }, + { + "epoch": 1.3922529459806323, + "grad_norm": 1.007964015007019, + "learning_rate": 0.0002477258454598136, + "loss": 2.2476, + "step": 11933 + }, + { + "epoch": 1.392369618480924, + "grad_norm": 1.1920043230056763, + "learning_rate": 0.000247714328799479, + "loss": 2.02, + "step": 11934 + }, + { + "epoch": 1.3924862909812157, + "grad_norm": 1.273535966873169, + "learning_rate": 0.0002477028111417007, + "loss": 2.1526, + "step": 11935 + }, + { + "epoch": 1.3926029634815074, + "grad_norm": 1.0916727781295776, + "learning_rate": 0.0002476912924865981, + "loss": 2.0436, + "step": 11936 + }, + { + "epoch": 1.392719635981799, + "grad_norm": 1.3954572677612305, + "learning_rate": 0.0002476797728342906, + "loss": 2.1839, + "step": 11937 + }, + { + "epoch": 1.3928363084820907, + "grad_norm": 1.0662628412246704, + "learning_rate": 0.0002476682521848976, + "loss": 1.9354, + "step": 11938 + }, + { + "epoch": 1.3929529809823824, + "grad_norm": 1.2852281332015991, + "learning_rate": 0.0002476567305385386, + "loss": 2.1331, + "step": 11939 + }, + { + "epoch": 1.393069653482674, + "grad_norm": 1.2382025718688965, + "learning_rate": 0.00024764520789533295, + "loss": 1.9677, + "step": 11940 + }, + { + "epoch": 1.3931863259829658, + "grad_norm": 1.1623855829238892, + "learning_rate": 0.0002476336842554002, + "loss": 2.0878, + "step": 11941 + }, + { + "epoch": 1.3933029984832574, + "grad_norm": 1.244797945022583, + "learning_rate": 0.0002476221596188597, + "loss": 2.0363, + "step": 11942 + }, + { + "epoch": 1.3934196709835491, + "grad_norm": 1.1585808992385864, + "learning_rate": 0.000247610633985831, + "loss": 2.2092, + "step": 11943 + }, + { + "epoch": 1.3935363434838408, + "grad_norm": 1.150971531867981, + "learning_rate": 0.0002475991073564337, + "loss": 2.1082, + "step": 11944 + }, + { + "epoch": 1.3936530159841325, + "grad_norm": 1.4876861572265625, + "learning_rate": 0.0002475875797307871, + "loss": 2.2423, + "step": 11945 + }, + { + "epoch": 1.3937696884844242, + "grad_norm": 1.0982415676116943, + "learning_rate": 0.00024757605110901074, + "loss": 2.0383, + "step": 11946 + }, + { + "epoch": 1.3938863609847159, + "grad_norm": 1.139672875404358, + "learning_rate": 0.0002475645214912242, + "loss": 2.0126, + "step": 11947 + }, + { + "epoch": 1.3940030334850075, + "grad_norm": 1.0371825695037842, + "learning_rate": 0.000247552990877547, + "loss": 2.0595, + "step": 11948 + }, + { + "epoch": 1.3941197059852992, + "grad_norm": 1.1466859579086304, + "learning_rate": 0.00024754145926809864, + "loss": 2.0772, + "step": 11949 + }, + { + "epoch": 1.394236378485591, + "grad_norm": 1.1467641592025757, + "learning_rate": 0.0002475299266629987, + "loss": 1.9218, + "step": 11950 + }, + { + "epoch": 1.3943530509858826, + "grad_norm": 1.1565570831298828, + "learning_rate": 0.0002475183930623667, + "loss": 2.0211, + "step": 11951 + }, + { + "epoch": 1.3944697234861743, + "grad_norm": 1.1174685955047607, + "learning_rate": 0.0002475068584663222, + "loss": 2.1782, + "step": 11952 + }, + { + "epoch": 1.394586395986466, + "grad_norm": 1.0400346517562866, + "learning_rate": 0.00024749532287498486, + "loss": 2.1181, + "step": 11953 + }, + { + "epoch": 1.3947030684867576, + "grad_norm": 1.3086961507797241, + "learning_rate": 0.0002474837862884742, + "loss": 2.0208, + "step": 11954 + }, + { + "epoch": 1.3948197409870493, + "grad_norm": 1.2959697246551514, + "learning_rate": 0.0002474722487069098, + "loss": 2.1347, + "step": 11955 + }, + { + "epoch": 1.394936413487341, + "grad_norm": 1.2020015716552734, + "learning_rate": 0.0002474607101304114, + "loss": 1.9139, + "step": 11956 + }, + { + "epoch": 1.3950530859876327, + "grad_norm": 0.9938914775848389, + "learning_rate": 0.0002474491705590985, + "loss": 1.9177, + "step": 11957 + }, + { + "epoch": 1.3951697584879243, + "grad_norm": 1.1669747829437256, + "learning_rate": 0.0002474376299930907, + "loss": 2.1538, + "step": 11958 + }, + { + "epoch": 1.395286430988216, + "grad_norm": 1.2017490863800049, + "learning_rate": 0.0002474260884325077, + "loss": 2.0921, + "step": 11959 + }, + { + "epoch": 1.3954031034885077, + "grad_norm": 1.256534457206726, + "learning_rate": 0.00024741454587746915, + "loss": 2.2363, + "step": 11960 + }, + { + "epoch": 1.3955197759887994, + "grad_norm": 1.2374606132507324, + "learning_rate": 0.00024740300232809474, + "loss": 2.1027, + "step": 11961 + }, + { + "epoch": 1.395636448489091, + "grad_norm": 1.2224740982055664, + "learning_rate": 0.00024739145778450406, + "loss": 1.928, + "step": 11962 + }, + { + "epoch": 1.3957531209893828, + "grad_norm": 1.261396884918213, + "learning_rate": 0.00024737991224681687, + "loss": 2.0467, + "step": 11963 + }, + { + "epoch": 1.3958697934896744, + "grad_norm": 1.1398829221725464, + "learning_rate": 0.00024736836571515275, + "loss": 1.9941, + "step": 11964 + }, + { + "epoch": 1.3959864659899661, + "grad_norm": 1.2716871500015259, + "learning_rate": 0.00024735681818963156, + "loss": 1.9726, + "step": 11965 + }, + { + "epoch": 1.3961031384902578, + "grad_norm": 1.3011714220046997, + "learning_rate": 0.00024734526967037284, + "loss": 2.2038, + "step": 11966 + }, + { + "epoch": 1.3962198109905495, + "grad_norm": 1.1417735815048218, + "learning_rate": 0.0002473337201574965, + "loss": 2.0311, + "step": 11967 + }, + { + "epoch": 1.3963364834908412, + "grad_norm": 1.1865953207015991, + "learning_rate": 0.00024732216965112216, + "loss": 2.1112, + "step": 11968 + }, + { + "epoch": 1.3964531559911328, + "grad_norm": 1.2833331823349, + "learning_rate": 0.00024731061815136954, + "loss": 2.2034, + "step": 11969 + }, + { + "epoch": 1.3965698284914245, + "grad_norm": 1.2602216005325317, + "learning_rate": 0.00024729906565835845, + "loss": 2.1092, + "step": 11970 + }, + { + "epoch": 1.3966865009917162, + "grad_norm": 1.1567074060440063, + "learning_rate": 0.0002472875121722086, + "loss": 1.9429, + "step": 11971 + }, + { + "epoch": 1.3968031734920079, + "grad_norm": 1.0384864807128906, + "learning_rate": 0.00024727595769303987, + "loss": 2.0279, + "step": 11972 + }, + { + "epoch": 1.3969198459922996, + "grad_norm": 1.2243555784225464, + "learning_rate": 0.00024726440222097194, + "loss": 1.9767, + "step": 11973 + }, + { + "epoch": 1.3970365184925913, + "grad_norm": 1.4056708812713623, + "learning_rate": 0.0002472528457561246, + "loss": 2.0137, + "step": 11974 + }, + { + "epoch": 1.397153190992883, + "grad_norm": 1.0458794832229614, + "learning_rate": 0.00024724128829861777, + "loss": 1.9478, + "step": 11975 + }, + { + "epoch": 1.3972698634931746, + "grad_norm": 1.133620023727417, + "learning_rate": 0.0002472297298485711, + "loss": 2.1302, + "step": 11976 + }, + { + "epoch": 1.3973865359934663, + "grad_norm": 1.4190224409103394, + "learning_rate": 0.0002472181704061045, + "loss": 2.0349, + "step": 11977 + }, + { + "epoch": 1.397503208493758, + "grad_norm": 1.317465901374817, + "learning_rate": 0.0002472066099713378, + "loss": 2.256, + "step": 11978 + }, + { + "epoch": 1.3976198809940497, + "grad_norm": 1.1181223392486572, + "learning_rate": 0.0002471950485443909, + "loss": 2.0132, + "step": 11979 + }, + { + "epoch": 1.3977365534943413, + "grad_norm": 1.2048530578613281, + "learning_rate": 0.0002471834861253836, + "loss": 2.0841, + "step": 11980 + }, + { + "epoch": 1.397853225994633, + "grad_norm": 1.1982066631317139, + "learning_rate": 0.0002471719227144358, + "loss": 2.2146, + "step": 11981 + }, + { + "epoch": 1.3979698984949247, + "grad_norm": 1.220300316810608, + "learning_rate": 0.00024716035831166735, + "loss": 2.0475, + "step": 11982 + }, + { + "epoch": 1.3980865709952164, + "grad_norm": 1.0291805267333984, + "learning_rate": 0.0002471487929171981, + "loss": 2.0756, + "step": 11983 + }, + { + "epoch": 1.398203243495508, + "grad_norm": 1.232949137687683, + "learning_rate": 0.000247137226531148, + "loss": 2.0311, + "step": 11984 + }, + { + "epoch": 1.3983199159957997, + "grad_norm": 1.388102412223816, + "learning_rate": 0.0002471256591536369, + "loss": 2.0434, + "step": 11985 + }, + { + "epoch": 1.3984365884960914, + "grad_norm": 1.100110650062561, + "learning_rate": 0.00024711409078478485, + "loss": 1.8607, + "step": 11986 + }, + { + "epoch": 1.398553260996383, + "grad_norm": 1.3592793941497803, + "learning_rate": 0.00024710252142471164, + "loss": 2.326, + "step": 11987 + }, + { + "epoch": 1.3986699334966748, + "grad_norm": 1.2051931619644165, + "learning_rate": 0.00024709095107353725, + "loss": 2.2903, + "step": 11988 + }, + { + "epoch": 1.3987866059969665, + "grad_norm": 1.320364236831665, + "learning_rate": 0.00024707937973138163, + "loss": 2.1373, + "step": 11989 + }, + { + "epoch": 1.3989032784972582, + "grad_norm": 1.2943049669265747, + "learning_rate": 0.00024706780739836476, + "loss": 2.152, + "step": 11990 + }, + { + "epoch": 1.3990199509975498, + "grad_norm": 1.2058956623077393, + "learning_rate": 0.0002470562340746066, + "loss": 2.1019, + "step": 11991 + }, + { + "epoch": 1.3991366234978415, + "grad_norm": 1.2798986434936523, + "learning_rate": 0.0002470446597602271, + "loss": 2.1054, + "step": 11992 + }, + { + "epoch": 1.3992532959981332, + "grad_norm": 1.1634317636489868, + "learning_rate": 0.00024703308445534627, + "loss": 2.0176, + "step": 11993 + }, + { + "epoch": 1.3993699684984249, + "grad_norm": 1.304067611694336, + "learning_rate": 0.00024702150816008415, + "loss": 1.9792, + "step": 11994 + }, + { + "epoch": 1.3994866409987166, + "grad_norm": 1.10985267162323, + "learning_rate": 0.0002470099308745607, + "loss": 2.0548, + "step": 11995 + }, + { + "epoch": 1.3996033134990082, + "grad_norm": 1.509903907775879, + "learning_rate": 0.000246998352598896, + "loss": 2.1044, + "step": 11996 + }, + { + "epoch": 1.3997199859993, + "grad_norm": 1.1928590536117554, + "learning_rate": 0.00024698677333320995, + "loss": 2.0207, + "step": 11997 + }, + { + "epoch": 1.3998366584995916, + "grad_norm": 1.2663546800613403, + "learning_rate": 0.0002469751930776227, + "loss": 2.0229, + "step": 11998 + }, + { + "epoch": 1.3999533309998833, + "grad_norm": 1.0913792848587036, + "learning_rate": 0.00024696361183225435, + "loss": 1.9165, + "step": 11999 + }, + { + "epoch": 1.400070003500175, + "grad_norm": 1.1880983114242554, + "learning_rate": 0.0002469520295972248, + "loss": 2.1872, + "step": 12000 + }, + { + "epoch": 1.4001866760004666, + "grad_norm": 1.0840998888015747, + "learning_rate": 0.00024694044637265424, + "loss": 1.9382, + "step": 12001 + }, + { + "epoch": 1.4003033485007583, + "grad_norm": 1.3074125051498413, + "learning_rate": 0.0002469288621586627, + "loss": 2.1304, + "step": 12002 + }, + { + "epoch": 1.40042002100105, + "grad_norm": 1.0664198398590088, + "learning_rate": 0.0002469172769553703, + "loss": 1.8973, + "step": 12003 + }, + { + "epoch": 1.4005366935013417, + "grad_norm": 1.15361487865448, + "learning_rate": 0.00024690569076289714, + "loss": 1.9925, + "step": 12004 + }, + { + "epoch": 1.4006533660016334, + "grad_norm": 1.2003191709518433, + "learning_rate": 0.0002468941035813633, + "loss": 2.0968, + "step": 12005 + }, + { + "epoch": 1.400770038501925, + "grad_norm": 1.1198253631591797, + "learning_rate": 0.00024688251541088896, + "loss": 2.0179, + "step": 12006 + }, + { + "epoch": 1.4008867110022167, + "grad_norm": 1.2847448587417603, + "learning_rate": 0.0002468709262515943, + "loss": 2.116, + "step": 12007 + }, + { + "epoch": 1.4010033835025084, + "grad_norm": 1.1251295804977417, + "learning_rate": 0.0002468593361035993, + "loss": 1.9445, + "step": 12008 + }, + { + "epoch": 1.4011200560028, + "grad_norm": 1.2869805097579956, + "learning_rate": 0.00024684774496702423, + "loss": 2.214, + "step": 12009 + }, + { + "epoch": 1.4012367285030918, + "grad_norm": 1.206695556640625, + "learning_rate": 0.00024683615284198923, + "loss": 2.1082, + "step": 12010 + }, + { + "epoch": 1.4013534010033835, + "grad_norm": 1.1328706741333008, + "learning_rate": 0.0002468245597286145, + "loss": 2.0217, + "step": 12011 + }, + { + "epoch": 1.4014700735036751, + "grad_norm": 1.0536566972732544, + "learning_rate": 0.00024681296562702014, + "loss": 2.0961, + "step": 12012 + }, + { + "epoch": 1.4015867460039668, + "grad_norm": 1.194703459739685, + "learning_rate": 0.00024680137053732646, + "loss": 1.969, + "step": 12013 + }, + { + "epoch": 1.4017034185042585, + "grad_norm": 1.1666622161865234, + "learning_rate": 0.00024678977445965355, + "loss": 2.1704, + "step": 12014 + }, + { + "epoch": 1.4018200910045502, + "grad_norm": 1.1777746677398682, + "learning_rate": 0.0002467781773941217, + "loss": 2.1186, + "step": 12015 + }, + { + "epoch": 1.4019367635048419, + "grad_norm": 1.2008744478225708, + "learning_rate": 0.00024676657934085116, + "loss": 2.1011, + "step": 12016 + }, + { + "epoch": 1.4020534360051335, + "grad_norm": 1.3932920694351196, + "learning_rate": 0.00024675498029996204, + "loss": 2.0861, + "step": 12017 + }, + { + "epoch": 1.4021701085054252, + "grad_norm": 1.1231801509857178, + "learning_rate": 0.0002467433802715747, + "loss": 2.3079, + "step": 12018 + }, + { + "epoch": 1.402286781005717, + "grad_norm": 1.1037483215332031, + "learning_rate": 0.0002467317792558094, + "loss": 1.9505, + "step": 12019 + }, + { + "epoch": 1.4024034535060086, + "grad_norm": 1.3645879030227661, + "learning_rate": 0.00024672017725278636, + "loss": 2.0732, + "step": 12020 + }, + { + "epoch": 1.4025201260063003, + "grad_norm": 1.066557765007019, + "learning_rate": 0.00024670857426262584, + "loss": 2.0887, + "step": 12021 + }, + { + "epoch": 1.402636798506592, + "grad_norm": 1.190115213394165, + "learning_rate": 0.0002466969702854482, + "loss": 2.0509, + "step": 12022 + }, + { + "epoch": 1.4027534710068836, + "grad_norm": 1.1802396774291992, + "learning_rate": 0.0002466853653213737, + "loss": 2.0977, + "step": 12023 + }, + { + "epoch": 1.4028701435071753, + "grad_norm": 1.2559547424316406, + "learning_rate": 0.0002466737593705226, + "loss": 2.1026, + "step": 12024 + }, + { + "epoch": 1.402986816007467, + "grad_norm": 1.1221274137496948, + "learning_rate": 0.00024666215243301524, + "loss": 2.0377, + "step": 12025 + }, + { + "epoch": 1.4031034885077587, + "grad_norm": 1.4063911437988281, + "learning_rate": 0.000246650544508972, + "loss": 2.2107, + "step": 12026 + }, + { + "epoch": 1.4032201610080504, + "grad_norm": 1.0455164909362793, + "learning_rate": 0.0002466389355985132, + "loss": 2.0554, + "step": 12027 + }, + { + "epoch": 1.403336833508342, + "grad_norm": 1.1287872791290283, + "learning_rate": 0.00024662732570175915, + "loss": 2.094, + "step": 12028 + }, + { + "epoch": 1.4034535060086337, + "grad_norm": 1.1876624822616577, + "learning_rate": 0.0002466157148188302, + "loss": 2.0573, + "step": 12029 + }, + { + "epoch": 1.4035701785089254, + "grad_norm": 1.1617408990859985, + "learning_rate": 0.0002466041029498468, + "loss": 1.9999, + "step": 12030 + }, + { + "epoch": 1.403686851009217, + "grad_norm": 1.2073400020599365, + "learning_rate": 0.0002465924900949292, + "loss": 2.1124, + "step": 12031 + }, + { + "epoch": 1.4038035235095088, + "grad_norm": 1.212573766708374, + "learning_rate": 0.000246580876254198, + "loss": 2.298, + "step": 12032 + }, + { + "epoch": 1.4039201960098004, + "grad_norm": 1.2604879140853882, + "learning_rate": 0.00024656926142777334, + "loss": 2.0137, + "step": 12033 + }, + { + "epoch": 1.4040368685100921, + "grad_norm": 1.2539883852005005, + "learning_rate": 0.00024655764561577585, + "loss": 2.2223, + "step": 12034 + }, + { + "epoch": 1.4041535410103838, + "grad_norm": 1.2663979530334473, + "learning_rate": 0.0002465460288183258, + "loss": 2.0013, + "step": 12035 + }, + { + "epoch": 1.4042702135106755, + "grad_norm": 1.2864940166473389, + "learning_rate": 0.0002465344110355437, + "loss": 2.1929, + "step": 12036 + }, + { + "epoch": 1.4043868860109672, + "grad_norm": 1.2745070457458496, + "learning_rate": 0.00024652279226754995, + "loss": 2.0524, + "step": 12037 + }, + { + "epoch": 1.4045035585112589, + "grad_norm": 1.1252658367156982, + "learning_rate": 0.00024651117251446505, + "loss": 2.1106, + "step": 12038 + }, + { + "epoch": 1.4046202310115505, + "grad_norm": 1.0740065574645996, + "learning_rate": 0.0002464995517764094, + "loss": 2.0532, + "step": 12039 + }, + { + "epoch": 1.4047369035118422, + "grad_norm": 1.228193759918213, + "learning_rate": 0.0002464879300535035, + "loss": 2.037, + "step": 12040 + }, + { + "epoch": 1.404853576012134, + "grad_norm": 1.0596644878387451, + "learning_rate": 0.00024647630734586786, + "loss": 2.1012, + "step": 12041 + }, + { + "epoch": 1.4049702485124256, + "grad_norm": 1.337007761001587, + "learning_rate": 0.00024646468365362287, + "loss": 2.0024, + "step": 12042 + }, + { + "epoch": 1.4050869210127173, + "grad_norm": 1.2746188640594482, + "learning_rate": 0.0002464530589768892, + "loss": 2.0385, + "step": 12043 + }, + { + "epoch": 1.405203593513009, + "grad_norm": 1.081925630569458, + "learning_rate": 0.0002464414333157872, + "loss": 2.0773, + "step": 12044 + }, + { + "epoch": 1.4053202660133006, + "grad_norm": 1.1682525873184204, + "learning_rate": 0.0002464298066704375, + "loss": 2.0157, + "step": 12045 + }, + { + "epoch": 1.4054369385135923, + "grad_norm": 1.229115605354309, + "learning_rate": 0.00024641817904096054, + "loss": 2.253, + "step": 12046 + }, + { + "epoch": 1.405553611013884, + "grad_norm": 1.0241526365280151, + "learning_rate": 0.00024640655042747694, + "loss": 1.9033, + "step": 12047 + }, + { + "epoch": 1.4056702835141757, + "grad_norm": 0.9601776599884033, + "learning_rate": 0.0002463949208301072, + "loss": 1.7663, + "step": 12048 + }, + { + "epoch": 1.4057869560144673, + "grad_norm": 1.1588537693023682, + "learning_rate": 0.00024638329024897197, + "loss": 2.0512, + "step": 12049 + }, + { + "epoch": 1.405903628514759, + "grad_norm": 1.0211994647979736, + "learning_rate": 0.00024637165868419167, + "loss": 2.2108, + "step": 12050 + }, + { + "epoch": 1.4060203010150507, + "grad_norm": 1.3671742677688599, + "learning_rate": 0.00024636002613588707, + "loss": 2.1804, + "step": 12051 + }, + { + "epoch": 1.4061369735153424, + "grad_norm": 1.2099767923355103, + "learning_rate": 0.00024634839260417857, + "loss": 2.0851, + "step": 12052 + }, + { + "epoch": 1.406253646015634, + "grad_norm": 1.1159802675247192, + "learning_rate": 0.00024633675808918696, + "loss": 2.0737, + "step": 12053 + }, + { + "epoch": 1.4063703185159258, + "grad_norm": 1.3821125030517578, + "learning_rate": 0.00024632512259103274, + "loss": 2.2611, + "step": 12054 + }, + { + "epoch": 1.4064869910162174, + "grad_norm": 1.2694437503814697, + "learning_rate": 0.00024631348610983653, + "loss": 1.987, + "step": 12055 + }, + { + "epoch": 1.4066036635165091, + "grad_norm": 1.3285279273986816, + "learning_rate": 0.00024630184864571895, + "loss": 2.0866, + "step": 12056 + }, + { + "epoch": 1.4067203360168008, + "grad_norm": 1.126963496208191, + "learning_rate": 0.0002462902101988008, + "loss": 2.0011, + "step": 12057 + }, + { + "epoch": 1.4068370085170925, + "grad_norm": 1.145933985710144, + "learning_rate": 0.0002462785707692026, + "loss": 2.1039, + "step": 12058 + }, + { + "epoch": 1.4069536810173842, + "grad_norm": 1.119769811630249, + "learning_rate": 0.00024626693035704496, + "loss": 2.1053, + "step": 12059 + }, + { + "epoch": 1.4070703535176758, + "grad_norm": 1.099799394607544, + "learning_rate": 0.00024625528896244866, + "loss": 2.0968, + "step": 12060 + }, + { + "epoch": 1.4071870260179675, + "grad_norm": 1.0681570768356323, + "learning_rate": 0.00024624364658553435, + "loss": 2.0327, + "step": 12061 + }, + { + "epoch": 1.4073036985182592, + "grad_norm": 1.19357168674469, + "learning_rate": 0.00024623200322642275, + "loss": 2.024, + "step": 12062 + }, + { + "epoch": 1.4074203710185509, + "grad_norm": 1.1928755044937134, + "learning_rate": 0.00024622035888523453, + "loss": 2.1173, + "step": 12063 + }, + { + "epoch": 1.4075370435188426, + "grad_norm": 1.0189152956008911, + "learning_rate": 0.0002462087135620905, + "loss": 2.0163, + "step": 12064 + }, + { + "epoch": 1.4076537160191342, + "grad_norm": 1.1057159900665283, + "learning_rate": 0.00024619706725711126, + "loss": 2.0895, + "step": 12065 + }, + { + "epoch": 1.407770388519426, + "grad_norm": 1.0634526014328003, + "learning_rate": 0.0002461854199704175, + "loss": 1.9557, + "step": 12066 + }, + { + "epoch": 1.4078870610197176, + "grad_norm": 1.2625280618667603, + "learning_rate": 0.0002461737717021302, + "loss": 2.2119, + "step": 12067 + }, + { + "epoch": 1.4080037335200093, + "grad_norm": 1.0542922019958496, + "learning_rate": 0.0002461621224523699, + "loss": 2.099, + "step": 12068 + }, + { + "epoch": 1.408120406020301, + "grad_norm": 1.1858947277069092, + "learning_rate": 0.0002461504722212574, + "loss": 1.9225, + "step": 12069 + }, + { + "epoch": 1.4082370785205927, + "grad_norm": 1.1663236618041992, + "learning_rate": 0.0002461388210089136, + "loss": 1.9783, + "step": 12070 + }, + { + "epoch": 1.4083537510208843, + "grad_norm": 1.1291755437850952, + "learning_rate": 0.00024612716881545923, + "loss": 2.1832, + "step": 12071 + }, + { + "epoch": 1.408470423521176, + "grad_norm": 1.0671157836914062, + "learning_rate": 0.000246115515641015, + "loss": 1.9224, + "step": 12072 + }, + { + "epoch": 1.4085870960214677, + "grad_norm": 1.1661045551300049, + "learning_rate": 0.0002461038614857018, + "loss": 1.9824, + "step": 12073 + }, + { + "epoch": 1.4087037685217594, + "grad_norm": 1.1227785348892212, + "learning_rate": 0.0002460922063496404, + "loss": 2.0527, + "step": 12074 + }, + { + "epoch": 1.408820441022051, + "grad_norm": 1.214709758758545, + "learning_rate": 0.00024608055023295173, + "loss": 2.2347, + "step": 12075 + }, + { + "epoch": 1.4089371135223427, + "grad_norm": 1.1603604555130005, + "learning_rate": 0.0002460688931357565, + "loss": 2.0635, + "step": 12076 + }, + { + "epoch": 1.4090537860226344, + "grad_norm": 1.1747337579727173, + "learning_rate": 0.00024605723505817566, + "loss": 2.0701, + "step": 12077 + }, + { + "epoch": 1.409170458522926, + "grad_norm": 1.1408617496490479, + "learning_rate": 0.00024604557600032997, + "loss": 2.2456, + "step": 12078 + }, + { + "epoch": 1.4092871310232178, + "grad_norm": 1.1677525043487549, + "learning_rate": 0.00024603391596234037, + "loss": 2.1319, + "step": 12079 + }, + { + "epoch": 1.4094038035235095, + "grad_norm": 1.0074059963226318, + "learning_rate": 0.0002460222549443277, + "loss": 1.9902, + "step": 12080 + }, + { + "epoch": 1.4095204760238011, + "grad_norm": 1.1924210786819458, + "learning_rate": 0.00024601059294641285, + "loss": 2.1172, + "step": 12081 + }, + { + "epoch": 1.4096371485240928, + "grad_norm": 1.1329090595245361, + "learning_rate": 0.00024599892996871677, + "loss": 1.9717, + "step": 12082 + }, + { + "epoch": 1.4097538210243845, + "grad_norm": 0.9972872138023376, + "learning_rate": 0.00024598726601136027, + "loss": 2.0789, + "step": 12083 + }, + { + "epoch": 1.4098704935246762, + "grad_norm": 1.185747504234314, + "learning_rate": 0.00024597560107446435, + "loss": 2.3036, + "step": 12084 + }, + { + "epoch": 1.4099871660249679, + "grad_norm": 1.0591537952423096, + "learning_rate": 0.00024596393515815, + "loss": 1.8845, + "step": 12085 + }, + { + "epoch": 1.4101038385252596, + "grad_norm": 1.1621564626693726, + "learning_rate": 0.000245952268262538, + "loss": 2.1956, + "step": 12086 + }, + { + "epoch": 1.4102205110255512, + "grad_norm": 1.1614813804626465, + "learning_rate": 0.0002459406003877495, + "loss": 1.9453, + "step": 12087 + }, + { + "epoch": 1.410337183525843, + "grad_norm": 1.1297372579574585, + "learning_rate": 0.00024592893153390516, + "loss": 2.1458, + "step": 12088 + }, + { + "epoch": 1.4104538560261346, + "grad_norm": 1.0738563537597656, + "learning_rate": 0.0002459172617011263, + "loss": 1.9679, + "step": 12089 + }, + { + "epoch": 1.4105705285264263, + "grad_norm": 1.144080400466919, + "learning_rate": 0.00024590559088953363, + "loss": 2.0535, + "step": 12090 + }, + { + "epoch": 1.410687201026718, + "grad_norm": 1.3459843397140503, + "learning_rate": 0.00024589391909924827, + "loss": 2.0087, + "step": 12091 + }, + { + "epoch": 1.4108038735270096, + "grad_norm": 1.1207211017608643, + "learning_rate": 0.00024588224633039115, + "loss": 2.0784, + "step": 12092 + }, + { + "epoch": 1.4109205460273013, + "grad_norm": 1.0907915830612183, + "learning_rate": 0.0002458705725830833, + "loss": 1.8689, + "step": 12093 + }, + { + "epoch": 1.411037218527593, + "grad_norm": 1.1233224868774414, + "learning_rate": 0.00024585889785744584, + "loss": 2.0504, + "step": 12094 + }, + { + "epoch": 1.4111538910278847, + "grad_norm": 1.114882469177246, + "learning_rate": 0.00024584722215359965, + "loss": 2.0551, + "step": 12095 + }, + { + "epoch": 1.4112705635281764, + "grad_norm": 1.4350041151046753, + "learning_rate": 0.0002458355454716659, + "loss": 2.4271, + "step": 12096 + }, + { + "epoch": 1.411387236028468, + "grad_norm": 1.2895994186401367, + "learning_rate": 0.00024582386781176553, + "loss": 2.059, + "step": 12097 + }, + { + "epoch": 1.4115039085287597, + "grad_norm": 1.1546188592910767, + "learning_rate": 0.0002458121891740197, + "loss": 2.0115, + "step": 12098 + }, + { + "epoch": 1.4116205810290514, + "grad_norm": 1.1592482328414917, + "learning_rate": 0.00024580050955854945, + "loss": 2.2045, + "step": 12099 + }, + { + "epoch": 1.411737253529343, + "grad_norm": 1.1336870193481445, + "learning_rate": 0.0002457888289654758, + "loss": 1.8846, + "step": 12100 + }, + { + "epoch": 1.4118539260296348, + "grad_norm": 0.9931747317314148, + "learning_rate": 0.00024577714739491987, + "loss": 2.0072, + "step": 12101 + }, + { + "epoch": 1.4119705985299265, + "grad_norm": 1.284371018409729, + "learning_rate": 0.00024576546484700285, + "loss": 2.2177, + "step": 12102 + }, + { + "epoch": 1.4120872710302181, + "grad_norm": 1.2487542629241943, + "learning_rate": 0.00024575378132184574, + "loss": 2.1273, + "step": 12103 + }, + { + "epoch": 1.4122039435305098, + "grad_norm": 1.2177976369857788, + "learning_rate": 0.0002457420968195697, + "loss": 2.2577, + "step": 12104 + }, + { + "epoch": 1.4123206160308015, + "grad_norm": 1.0294790267944336, + "learning_rate": 0.00024573041134029586, + "loss": 2.1446, + "step": 12105 + }, + { + "epoch": 1.4124372885310932, + "grad_norm": 1.1504653692245483, + "learning_rate": 0.00024571872488414535, + "loss": 2.0868, + "step": 12106 + }, + { + "epoch": 1.4125539610313849, + "grad_norm": 1.1946371793746948, + "learning_rate": 0.00024570703745123933, + "loss": 2.193, + "step": 12107 + }, + { + "epoch": 1.4126706335316765, + "grad_norm": 1.2568999528884888, + "learning_rate": 0.00024569534904169907, + "loss": 1.9454, + "step": 12108 + }, + { + "epoch": 1.4127873060319682, + "grad_norm": 1.2128760814666748, + "learning_rate": 0.00024568365965564555, + "loss": 2.0903, + "step": 12109 + }, + { + "epoch": 1.41290397853226, + "grad_norm": 1.2180095911026, + "learning_rate": 0.00024567196929320005, + "loss": 2.0918, + "step": 12110 + }, + { + "epoch": 1.4130206510325516, + "grad_norm": 1.170528531074524, + "learning_rate": 0.0002456602779544838, + "loss": 1.9929, + "step": 12111 + }, + { + "epoch": 1.4131373235328433, + "grad_norm": 1.4284740686416626, + "learning_rate": 0.0002456485856396179, + "loss": 1.9624, + "step": 12112 + }, + { + "epoch": 1.413253996033135, + "grad_norm": 1.1538898944854736, + "learning_rate": 0.0002456368923487237, + "loss": 2.0694, + "step": 12113 + }, + { + "epoch": 1.4133706685334266, + "grad_norm": 1.2514313459396362, + "learning_rate": 0.0002456251980819223, + "loss": 2.0662, + "step": 12114 + }, + { + "epoch": 1.4134873410337183, + "grad_norm": 1.2656422853469849, + "learning_rate": 0.000245613502839335, + "loss": 2.1686, + "step": 12115 + }, + { + "epoch": 1.41360401353401, + "grad_norm": 1.1539584398269653, + "learning_rate": 0.000245601806621083, + "loss": 2.027, + "step": 12116 + }, + { + "epoch": 1.4137206860343017, + "grad_norm": 1.1514651775360107, + "learning_rate": 0.00024559010942728766, + "loss": 1.9873, + "step": 12117 + }, + { + "epoch": 1.4138373585345934, + "grad_norm": 1.2427747249603271, + "learning_rate": 0.00024557841125807006, + "loss": 2.1772, + "step": 12118 + }, + { + "epoch": 1.413954031034885, + "grad_norm": 1.2808644771575928, + "learning_rate": 0.0002455667121135516, + "loss": 2.0821, + "step": 12119 + }, + { + "epoch": 1.4140707035351767, + "grad_norm": 1.0455453395843506, + "learning_rate": 0.0002455550119938536, + "loss": 1.9938, + "step": 12120 + }, + { + "epoch": 1.4141873760354684, + "grad_norm": 1.184077262878418, + "learning_rate": 0.0002455433108990972, + "loss": 2.1717, + "step": 12121 + }, + { + "epoch": 1.41430404853576, + "grad_norm": 1.1272200345993042, + "learning_rate": 0.00024553160882940383, + "loss": 2.0936, + "step": 12122 + }, + { + "epoch": 1.4144207210360518, + "grad_norm": 1.043756365776062, + "learning_rate": 0.0002455199057848948, + "loss": 2.0527, + "step": 12123 + }, + { + "epoch": 1.4145373935363434, + "grad_norm": 1.5571482181549072, + "learning_rate": 0.0002455082017656914, + "loss": 2.2524, + "step": 12124 + }, + { + "epoch": 1.4146540660366351, + "grad_norm": 1.3162668943405151, + "learning_rate": 0.000245496496771915, + "loss": 1.9869, + "step": 12125 + }, + { + "epoch": 1.4147707385369268, + "grad_norm": 1.2485729455947876, + "learning_rate": 0.0002454847908036868, + "loss": 2.1839, + "step": 12126 + }, + { + "epoch": 1.4148874110372185, + "grad_norm": 1.257918119430542, + "learning_rate": 0.0002454730838611284, + "loss": 2.1412, + "step": 12127 + }, + { + "epoch": 1.4150040835375102, + "grad_norm": 1.1306750774383545, + "learning_rate": 0.00024546137594436097, + "loss": 2.1244, + "step": 12128 + }, + { + "epoch": 1.4151207560378019, + "grad_norm": 1.2111001014709473, + "learning_rate": 0.000245449667053506, + "loss": 2.078, + "step": 12129 + }, + { + "epoch": 1.4152374285380935, + "grad_norm": 1.188607096672058, + "learning_rate": 0.0002454379571886847, + "loss": 1.9956, + "step": 12130 + }, + { + "epoch": 1.4153541010383852, + "grad_norm": 1.1267505884170532, + "learning_rate": 0.0002454262463500187, + "loss": 2.0252, + "step": 12131 + }, + { + "epoch": 1.415470773538677, + "grad_norm": 1.0334053039550781, + "learning_rate": 0.0002454145345376292, + "loss": 1.8831, + "step": 12132 + }, + { + "epoch": 1.4155874460389686, + "grad_norm": 1.09062922000885, + "learning_rate": 0.00024540282175163785, + "loss": 2.1237, + "step": 12133 + }, + { + "epoch": 1.4157041185392603, + "grad_norm": 1.1293812990188599, + "learning_rate": 0.0002453911079921658, + "loss": 2.1566, + "step": 12134 + }, + { + "epoch": 1.415820791039552, + "grad_norm": 1.4078713655471802, + "learning_rate": 0.00024537939325933465, + "loss": 2.0368, + "step": 12135 + }, + { + "epoch": 1.4159374635398436, + "grad_norm": 1.033121943473816, + "learning_rate": 0.00024536767755326585, + "loss": 1.8161, + "step": 12136 + }, + { + "epoch": 1.4160541360401353, + "grad_norm": 1.2850446701049805, + "learning_rate": 0.0002453559608740808, + "loss": 2.0984, + "step": 12137 + }, + { + "epoch": 1.416170808540427, + "grad_norm": 1.3096154928207397, + "learning_rate": 0.000245344243221901, + "loss": 2.1217, + "step": 12138 + }, + { + "epoch": 1.4162874810407187, + "grad_norm": 1.13384211063385, + "learning_rate": 0.00024533252459684786, + "loss": 1.7936, + "step": 12139 + }, + { + "epoch": 1.4164041535410103, + "grad_norm": 1.190670132637024, + "learning_rate": 0.00024532080499904295, + "loss": 2.0589, + "step": 12140 + }, + { + "epoch": 1.416520826041302, + "grad_norm": 1.1706013679504395, + "learning_rate": 0.00024530908442860775, + "loss": 2.0387, + "step": 12141 + }, + { + "epoch": 1.4166374985415937, + "grad_norm": 1.0785305500030518, + "learning_rate": 0.00024529736288566374, + "loss": 2.1066, + "step": 12142 + }, + { + "epoch": 1.4167541710418854, + "grad_norm": 1.2408852577209473, + "learning_rate": 0.0002452856403703324, + "loss": 2.258, + "step": 12143 + }, + { + "epoch": 1.416870843542177, + "grad_norm": 1.1425297260284424, + "learning_rate": 0.0002452739168827354, + "loss": 1.9987, + "step": 12144 + }, + { + "epoch": 1.4169875160424688, + "grad_norm": 1.196015477180481, + "learning_rate": 0.00024526219242299404, + "loss": 2.2322, + "step": 12145 + }, + { + "epoch": 1.4171041885427604, + "grad_norm": 1.058168888092041, + "learning_rate": 0.0002452504669912301, + "loss": 2.0114, + "step": 12146 + }, + { + "epoch": 1.4172208610430521, + "grad_norm": 1.1189340353012085, + "learning_rate": 0.000245238740587565, + "loss": 1.9362, + "step": 12147 + }, + { + "epoch": 1.4173375335433438, + "grad_norm": 0.938681960105896, + "learning_rate": 0.00024522701321212035, + "loss": 2.0624, + "step": 12148 + }, + { + "epoch": 1.4174542060436355, + "grad_norm": 1.22098708152771, + "learning_rate": 0.0002452152848650177, + "loss": 2.0449, + "step": 12149 + }, + { + "epoch": 1.4175708785439272, + "grad_norm": 1.1435327529907227, + "learning_rate": 0.00024520355554637875, + "loss": 2.22, + "step": 12150 + }, + { + "epoch": 1.4176875510442188, + "grad_norm": 1.2006208896636963, + "learning_rate": 0.00024519182525632494, + "loss": 2.0362, + "step": 12151 + }, + { + "epoch": 1.4178042235445105, + "grad_norm": 1.1120960712432861, + "learning_rate": 0.0002451800939949779, + "loss": 1.9062, + "step": 12152 + }, + { + "epoch": 1.4179208960448022, + "grad_norm": 1.1205469369888306, + "learning_rate": 0.0002451683617624593, + "loss": 2.1308, + "step": 12153 + }, + { + "epoch": 1.4180375685450939, + "grad_norm": 0.9774135947227478, + "learning_rate": 0.0002451566285588908, + "loss": 2.1012, + "step": 12154 + }, + { + "epoch": 1.4181542410453856, + "grad_norm": 1.021591305732727, + "learning_rate": 0.00024514489438439406, + "loss": 1.9582, + "step": 12155 + }, + { + "epoch": 1.4182709135456772, + "grad_norm": 0.9727099537849426, + "learning_rate": 0.0002451331592390905, + "loss": 2.0954, + "step": 12156 + }, + { + "epoch": 1.418387586045969, + "grad_norm": 1.0801290273666382, + "learning_rate": 0.000245121423123102, + "loss": 1.8268, + "step": 12157 + }, + { + "epoch": 1.4185042585462606, + "grad_norm": 1.2905209064483643, + "learning_rate": 0.00024510968603655024, + "loss": 2.1453, + "step": 12158 + }, + { + "epoch": 1.4186209310465523, + "grad_norm": 1.2813174724578857, + "learning_rate": 0.0002450979479795567, + "loss": 2.255, + "step": 12159 + }, + { + "epoch": 1.418737603546844, + "grad_norm": 1.2050130367279053, + "learning_rate": 0.00024508620895224325, + "loss": 1.9863, + "step": 12160 + }, + { + "epoch": 1.4188542760471357, + "grad_norm": 1.266812801361084, + "learning_rate": 0.0002450744689547315, + "loss": 2.1242, + "step": 12161 + }, + { + "epoch": 1.4189709485474273, + "grad_norm": 1.1738686561584473, + "learning_rate": 0.0002450627279871432, + "loss": 2.0921, + "step": 12162 + }, + { + "epoch": 1.419087621047719, + "grad_norm": 1.1990952491760254, + "learning_rate": 0.00024505098604960006, + "loss": 2.0238, + "step": 12163 + }, + { + "epoch": 1.4192042935480107, + "grad_norm": 1.199942708015442, + "learning_rate": 0.00024503924314222376, + "loss": 1.9449, + "step": 12164 + }, + { + "epoch": 1.4193209660483024, + "grad_norm": 1.1102784872055054, + "learning_rate": 0.0002450274992651361, + "loss": 2.1892, + "step": 12165 + }, + { + "epoch": 1.419437638548594, + "grad_norm": 1.2733027935028076, + "learning_rate": 0.00024501575441845876, + "loss": 2.0482, + "step": 12166 + }, + { + "epoch": 1.4195543110488857, + "grad_norm": 1.0632704496383667, + "learning_rate": 0.00024500400860231363, + "loss": 1.9597, + "step": 12167 + }, + { + "epoch": 1.4196709835491774, + "grad_norm": 1.1236158609390259, + "learning_rate": 0.0002449922618168223, + "loss": 2.0705, + "step": 12168 + }, + { + "epoch": 1.419787656049469, + "grad_norm": 1.1617194414138794, + "learning_rate": 0.00024498051406210663, + "loss": 2.0536, + "step": 12169 + }, + { + "epoch": 1.4199043285497608, + "grad_norm": 1.2147352695465088, + "learning_rate": 0.0002449687653382885, + "loss": 2.1474, + "step": 12170 + }, + { + "epoch": 1.4200210010500525, + "grad_norm": 1.2769991159439087, + "learning_rate": 0.0002449570156454896, + "loss": 2.3337, + "step": 12171 + }, + { + "epoch": 1.4201376735503441, + "grad_norm": 1.330310583114624, + "learning_rate": 0.0002449452649838317, + "loss": 2.2414, + "step": 12172 + }, + { + "epoch": 1.4202543460506358, + "grad_norm": 1.210325837135315, + "learning_rate": 0.00024493351335343666, + "loss": 2.0632, + "step": 12173 + }, + { + "epoch": 1.4203710185509275, + "grad_norm": 1.0265361070632935, + "learning_rate": 0.00024492176075442635, + "loss": 2.0702, + "step": 12174 + }, + { + "epoch": 1.4204876910512192, + "grad_norm": 1.091615080833435, + "learning_rate": 0.0002449100071869226, + "loss": 2.1259, + "step": 12175 + }, + { + "epoch": 1.4206043635515109, + "grad_norm": 1.1291406154632568, + "learning_rate": 0.00024489825265104723, + "loss": 2.1549, + "step": 12176 + }, + { + "epoch": 1.4207210360518026, + "grad_norm": 1.1390560865402222, + "learning_rate": 0.00024488649714692214, + "loss": 2.0168, + "step": 12177 + }, + { + "epoch": 1.4208377085520942, + "grad_norm": 1.1012459993362427, + "learning_rate": 0.0002448747406746691, + "loss": 2.0125, + "step": 12178 + }, + { + "epoch": 1.420954381052386, + "grad_norm": 1.1763650178909302, + "learning_rate": 0.0002448629832344101, + "loss": 1.9822, + "step": 12179 + }, + { + "epoch": 1.4210710535526776, + "grad_norm": 1.0637640953063965, + "learning_rate": 0.000244851224826267, + "loss": 1.9526, + "step": 12180 + }, + { + "epoch": 1.4211877260529693, + "grad_norm": 1.1942486763000488, + "learning_rate": 0.00024483946545036163, + "loss": 2.0149, + "step": 12181 + }, + { + "epoch": 1.421304398553261, + "grad_norm": 1.4455244541168213, + "learning_rate": 0.00024482770510681596, + "loss": 2.1589, + "step": 12182 + }, + { + "epoch": 1.4214210710535526, + "grad_norm": 1.0501495599746704, + "learning_rate": 0.0002448159437957519, + "loss": 2.0316, + "step": 12183 + }, + { + "epoch": 1.4215377435538443, + "grad_norm": 1.0650956630706787, + "learning_rate": 0.0002448041815172914, + "loss": 2.1689, + "step": 12184 + }, + { + "epoch": 1.421654416054136, + "grad_norm": 1.128487229347229, + "learning_rate": 0.00024479241827155636, + "loss": 2.0909, + "step": 12185 + }, + { + "epoch": 1.4217710885544277, + "grad_norm": 1.1323665380477905, + "learning_rate": 0.0002447806540586687, + "loss": 2.1016, + "step": 12186 + }, + { + "epoch": 1.4218877610547194, + "grad_norm": 1.1190903186798096, + "learning_rate": 0.00024476888887875047, + "loss": 2.0741, + "step": 12187 + }, + { + "epoch": 1.422004433555011, + "grad_norm": 1.4167959690093994, + "learning_rate": 0.00024475712273192354, + "loss": 2.1206, + "step": 12188 + }, + { + "epoch": 1.4221211060553027, + "grad_norm": 1.1597765684127808, + "learning_rate": 0.00024474535561830996, + "loss": 2.1118, + "step": 12189 + }, + { + "epoch": 1.4222377785555944, + "grad_norm": 1.1263402700424194, + "learning_rate": 0.0002447335875380317, + "loss": 2.1131, + "step": 12190 + }, + { + "epoch": 1.422354451055886, + "grad_norm": 1.1847732067108154, + "learning_rate": 0.0002447218184912108, + "loss": 1.7561, + "step": 12191 + }, + { + "epoch": 1.4224711235561778, + "grad_norm": 1.1709948778152466, + "learning_rate": 0.0002447100484779691, + "loss": 1.9159, + "step": 12192 + }, + { + "epoch": 1.4225877960564695, + "grad_norm": 1.2584221363067627, + "learning_rate": 0.00024469827749842887, + "loss": 2.2049, + "step": 12193 + }, + { + "epoch": 1.4227044685567611, + "grad_norm": 1.1755039691925049, + "learning_rate": 0.0002446865055527119, + "loss": 2.2533, + "step": 12194 + }, + { + "epoch": 1.4228211410570528, + "grad_norm": 1.2647565603256226, + "learning_rate": 0.00024467473264094044, + "loss": 1.979, + "step": 12195 + }, + { + "epoch": 1.4229378135573445, + "grad_norm": 1.1999680995941162, + "learning_rate": 0.0002446629587632364, + "loss": 2.0954, + "step": 12196 + }, + { + "epoch": 1.4230544860576362, + "grad_norm": 1.207420825958252, + "learning_rate": 0.0002446511839197218, + "loss": 2.2792, + "step": 12197 + }, + { + "epoch": 1.4231711585579279, + "grad_norm": 1.1852093935012817, + "learning_rate": 0.00024463940811051885, + "loss": 2.0931, + "step": 12198 + }, + { + "epoch": 1.4232878310582195, + "grad_norm": 1.1688191890716553, + "learning_rate": 0.0002446276313357495, + "loss": 2.2007, + "step": 12199 + }, + { + "epoch": 1.4234045035585112, + "grad_norm": 1.2020829916000366, + "learning_rate": 0.00024461585359553594, + "loss": 2.1147, + "step": 12200 + }, + { + "epoch": 1.423521176058803, + "grad_norm": 1.069047212600708, + "learning_rate": 0.0002446040748900003, + "loss": 1.8489, + "step": 12201 + }, + { + "epoch": 1.4236378485590946, + "grad_norm": 1.2754809856414795, + "learning_rate": 0.0002445922952192645, + "loss": 2.0975, + "step": 12202 + }, + { + "epoch": 1.4237545210593863, + "grad_norm": 1.0494004487991333, + "learning_rate": 0.00024458051458345093, + "loss": 2.049, + "step": 12203 + }, + { + "epoch": 1.423871193559678, + "grad_norm": 1.0475200414657593, + "learning_rate": 0.0002445687329826814, + "loss": 2.1821, + "step": 12204 + }, + { + "epoch": 1.4239878660599696, + "grad_norm": 1.1859592199325562, + "learning_rate": 0.00024455695041707826, + "loss": 1.9803, + "step": 12205 + }, + { + "epoch": 1.4241045385602613, + "grad_norm": 1.2034797668457031, + "learning_rate": 0.00024454516688676367, + "loss": 2.1025, + "step": 12206 + }, + { + "epoch": 1.424221211060553, + "grad_norm": 1.161128044128418, + "learning_rate": 0.00024453338239185967, + "loss": 2.0734, + "step": 12207 + }, + { + "epoch": 1.4243378835608447, + "grad_norm": 1.2352036237716675, + "learning_rate": 0.0002445215969324885, + "loss": 2.12, + "step": 12208 + }, + { + "epoch": 1.4244545560611364, + "grad_norm": 1.2164679765701294, + "learning_rate": 0.00024450981050877236, + "loss": 2.0086, + "step": 12209 + }, + { + "epoch": 1.424571228561428, + "grad_norm": 1.1862602233886719, + "learning_rate": 0.0002444980231208334, + "loss": 2.046, + "step": 12210 + }, + { + "epoch": 1.4246879010617197, + "grad_norm": 1.2208210229873657, + "learning_rate": 0.0002444862347687938, + "loss": 2.0967, + "step": 12211 + }, + { + "epoch": 1.4248045735620114, + "grad_norm": 1.0001407861709595, + "learning_rate": 0.00024447444545277574, + "loss": 1.8136, + "step": 12212 + }, + { + "epoch": 1.424921246062303, + "grad_norm": 1.1637024879455566, + "learning_rate": 0.00024446265517290153, + "loss": 1.8662, + "step": 12213 + }, + { + "epoch": 1.4250379185625948, + "grad_norm": 1.1269237995147705, + "learning_rate": 0.0002444508639292934, + "loss": 2.0764, + "step": 12214 + }, + { + "epoch": 1.4251545910628864, + "grad_norm": 1.1630561351776123, + "learning_rate": 0.00024443907172207353, + "loss": 2.1803, + "step": 12215 + }, + { + "epoch": 1.4252712635631781, + "grad_norm": 1.2415730953216553, + "learning_rate": 0.0002444272785513642, + "loss": 2.1199, + "step": 12216 + }, + { + "epoch": 1.4253879360634698, + "grad_norm": 1.1968379020690918, + "learning_rate": 0.00024441548441728764, + "loss": 2.1393, + "step": 12217 + }, + { + "epoch": 1.4255046085637615, + "grad_norm": 1.1098384857177734, + "learning_rate": 0.00024440368931996615, + "loss": 2.0778, + "step": 12218 + }, + { + "epoch": 1.4256212810640532, + "grad_norm": 1.1432433128356934, + "learning_rate": 0.00024439189325952196, + "loss": 2.3477, + "step": 12219 + }, + { + "epoch": 1.4257379535643449, + "grad_norm": 1.4723480939865112, + "learning_rate": 0.00024438009623607736, + "loss": 2.0285, + "step": 12220 + }, + { + "epoch": 1.4258546260646365, + "grad_norm": 1.1886945962905884, + "learning_rate": 0.00024436829824975477, + "loss": 2.1831, + "step": 12221 + }, + { + "epoch": 1.4259712985649282, + "grad_norm": 1.0847853422164917, + "learning_rate": 0.0002443564993006763, + "loss": 1.9725, + "step": 12222 + }, + { + "epoch": 1.42608797106522, + "grad_norm": 1.0108270645141602, + "learning_rate": 0.0002443446993889645, + "loss": 2.0545, + "step": 12223 + }, + { + "epoch": 1.4262046435655116, + "grad_norm": 0.9896931648254395, + "learning_rate": 0.0002443328985147415, + "loss": 2.0808, + "step": 12224 + }, + { + "epoch": 1.4263213160658033, + "grad_norm": 1.1282809972763062, + "learning_rate": 0.00024432109667812973, + "loss": 2.0808, + "step": 12225 + }, + { + "epoch": 1.426437988566095, + "grad_norm": 1.2145404815673828, + "learning_rate": 0.0002443092938792515, + "loss": 2.1404, + "step": 12226 + }, + { + "epoch": 1.4265546610663866, + "grad_norm": 1.1857483386993408, + "learning_rate": 0.0002442974901182292, + "loss": 2.2747, + "step": 12227 + }, + { + "epoch": 1.4266713335666783, + "grad_norm": 1.0479085445404053, + "learning_rate": 0.00024428568539518517, + "loss": 1.9943, + "step": 12228 + }, + { + "epoch": 1.42678800606697, + "grad_norm": 1.0156810283660889, + "learning_rate": 0.0002442738797102419, + "loss": 1.9529, + "step": 12229 + }, + { + "epoch": 1.4269046785672617, + "grad_norm": 1.2373554706573486, + "learning_rate": 0.0002442620730635216, + "loss": 2.0495, + "step": 12230 + }, + { + "epoch": 1.4270213510675533, + "grad_norm": 1.2846308946609497, + "learning_rate": 0.0002442502654551468, + "loss": 2.1441, + "step": 12231 + }, + { + "epoch": 1.427138023567845, + "grad_norm": 1.1569157838821411, + "learning_rate": 0.0002442384568852398, + "loss": 1.9915, + "step": 12232 + }, + { + "epoch": 1.4272546960681367, + "grad_norm": 1.256556510925293, + "learning_rate": 0.0002442266473539231, + "loss": 2.1201, + "step": 12233 + }, + { + "epoch": 1.4273713685684284, + "grad_norm": 1.1594911813735962, + "learning_rate": 0.00024421483686131915, + "loss": 2.0342, + "step": 12234 + }, + { + "epoch": 1.42748804106872, + "grad_norm": 1.1732933521270752, + "learning_rate": 0.0002442030254075503, + "loss": 2.1716, + "step": 12235 + }, + { + "epoch": 1.4276047135690118, + "grad_norm": 1.1245124340057373, + "learning_rate": 0.0002441912129927391, + "loss": 2.1229, + "step": 12236 + }, + { + "epoch": 1.4277213860693034, + "grad_norm": 1.3796049356460571, + "learning_rate": 0.0002441793996170079, + "loss": 2.0105, + "step": 12237 + }, + { + "epoch": 1.4278380585695951, + "grad_norm": 1.2335776090621948, + "learning_rate": 0.00024416758528047924, + "loss": 2.0457, + "step": 12238 + }, + { + "epoch": 1.4279547310698868, + "grad_norm": 1.0644009113311768, + "learning_rate": 0.00024415576998327557, + "loss": 1.975, + "step": 12239 + }, + { + "epoch": 1.4280714035701785, + "grad_norm": 1.542765498161316, + "learning_rate": 0.00024414395372551935, + "loss": 2.2376, + "step": 12240 + }, + { + "epoch": 1.4281880760704702, + "grad_norm": 1.061065673828125, + "learning_rate": 0.00024413213650733317, + "loss": 2.0879, + "step": 12241 + }, + { + "epoch": 1.4283047485707618, + "grad_norm": 1.3777178525924683, + "learning_rate": 0.00024412031832883948, + "loss": 2.0417, + "step": 12242 + }, + { + "epoch": 1.4284214210710535, + "grad_norm": 1.07462739944458, + "learning_rate": 0.0002441084991901608, + "loss": 1.955, + "step": 12243 + }, + { + "epoch": 1.4285380935713452, + "grad_norm": 1.1430943012237549, + "learning_rate": 0.00024409667909141964, + "loss": 2.0802, + "step": 12244 + }, + { + "epoch": 1.4286547660716369, + "grad_norm": 1.2664592266082764, + "learning_rate": 0.00024408485803273857, + "loss": 1.9099, + "step": 12245 + }, + { + "epoch": 1.4287714385719286, + "grad_norm": 1.0865083932876587, + "learning_rate": 0.0002440730360142401, + "loss": 2.0663, + "step": 12246 + }, + { + "epoch": 1.4288881110722202, + "grad_norm": 1.2252697944641113, + "learning_rate": 0.00024406121303604682, + "loss": 2.2121, + "step": 12247 + }, + { + "epoch": 1.429004783572512, + "grad_norm": 1.1172305345535278, + "learning_rate": 0.0002440493890982813, + "loss": 2.0729, + "step": 12248 + }, + { + "epoch": 1.4291214560728036, + "grad_norm": 1.1417616605758667, + "learning_rate": 0.00024403756420106606, + "loss": 2.0324, + "step": 12249 + }, + { + "epoch": 1.4292381285730953, + "grad_norm": 1.1442573070526123, + "learning_rate": 0.0002440257383445238, + "loss": 2.0799, + "step": 12250 + }, + { + "epoch": 1.429354801073387, + "grad_norm": 1.1357396841049194, + "learning_rate": 0.000244013911528777, + "loss": 1.8523, + "step": 12251 + }, + { + "epoch": 1.4294714735736787, + "grad_norm": 1.0935426950454712, + "learning_rate": 0.00024400208375394833, + "loss": 2.1592, + "step": 12252 + }, + { + "epoch": 1.4295881460739703, + "grad_norm": 0.9849005937576294, + "learning_rate": 0.00024399025502016037, + "loss": 1.9533, + "step": 12253 + }, + { + "epoch": 1.429704818574262, + "grad_norm": 1.15227210521698, + "learning_rate": 0.00024397842532753583, + "loss": 2.0957, + "step": 12254 + }, + { + "epoch": 1.4298214910745537, + "grad_norm": 1.2741358280181885, + "learning_rate": 0.00024396659467619726, + "loss": 2.0599, + "step": 12255 + }, + { + "epoch": 1.4299381635748454, + "grad_norm": 1.2090178728103638, + "learning_rate": 0.00024395476306626732, + "loss": 2.1192, + "step": 12256 + }, + { + "epoch": 1.430054836075137, + "grad_norm": 1.1448453664779663, + "learning_rate": 0.00024394293049786873, + "loss": 2.1993, + "step": 12257 + }, + { + "epoch": 1.4301715085754287, + "grad_norm": 1.0350587368011475, + "learning_rate": 0.00024393109697112408, + "loss": 1.9705, + "step": 12258 + }, + { + "epoch": 1.4302881810757204, + "grad_norm": 1.1321579217910767, + "learning_rate": 0.00024391926248615607, + "loss": 2.1558, + "step": 12259 + }, + { + "epoch": 1.430404853576012, + "grad_norm": 1.2016383409500122, + "learning_rate": 0.00024390742704308736, + "loss": 2.2204, + "step": 12260 + }, + { + "epoch": 1.4305215260763038, + "grad_norm": 1.046506404876709, + "learning_rate": 0.00024389559064204078, + "loss": 2.1564, + "step": 12261 + }, + { + "epoch": 1.4306381985765955, + "grad_norm": 1.096591591835022, + "learning_rate": 0.00024388375328313888, + "loss": 1.9599, + "step": 12262 + }, + { + "epoch": 1.4307548710768871, + "grad_norm": 1.0169787406921387, + "learning_rate": 0.00024387191496650444, + "loss": 2.1106, + "step": 12263 + }, + { + "epoch": 1.4308715435771788, + "grad_norm": 1.2139023542404175, + "learning_rate": 0.00024386007569226012, + "loss": 2.0567, + "step": 12264 + }, + { + "epoch": 1.4309882160774705, + "grad_norm": 1.339241862297058, + "learning_rate": 0.0002438482354605288, + "loss": 2.2228, + "step": 12265 + }, + { + "epoch": 1.4311048885777622, + "grad_norm": 1.292906641960144, + "learning_rate": 0.00024383639427143312, + "loss": 2.1505, + "step": 12266 + }, + { + "epoch": 1.4312215610780539, + "grad_norm": 1.1772514581680298, + "learning_rate": 0.00024382455212509585, + "loss": 2.1418, + "step": 12267 + }, + { + "epoch": 1.4313382335783456, + "grad_norm": 1.195281982421875, + "learning_rate": 0.00024381270902163978, + "loss": 2.1313, + "step": 12268 + }, + { + "epoch": 1.4314549060786372, + "grad_norm": 1.0799616575241089, + "learning_rate": 0.00024380086496118765, + "loss": 2.0151, + "step": 12269 + }, + { + "epoch": 1.431571578578929, + "grad_norm": 1.2461111545562744, + "learning_rate": 0.0002437890199438623, + "loss": 1.9998, + "step": 12270 + }, + { + "epoch": 1.4316882510792206, + "grad_norm": 1.220537781715393, + "learning_rate": 0.00024377717396978647, + "loss": 2.1322, + "step": 12271 + }, + { + "epoch": 1.4318049235795123, + "grad_norm": 1.2407511472702026, + "learning_rate": 0.000243765327039083, + "loss": 2.1511, + "step": 12272 + }, + { + "epoch": 1.431921596079804, + "grad_norm": 1.1446905136108398, + "learning_rate": 0.00024375347915187468, + "loss": 1.9127, + "step": 12273 + }, + { + "epoch": 1.4320382685800956, + "grad_norm": 1.2435435056686401, + "learning_rate": 0.0002437416303082844, + "loss": 1.9729, + "step": 12274 + }, + { + "epoch": 1.4321549410803873, + "grad_norm": 1.2337234020233154, + "learning_rate": 0.00024372978050843488, + "loss": 2.1137, + "step": 12275 + }, + { + "epoch": 1.432271613580679, + "grad_norm": 1.0662062168121338, + "learning_rate": 0.00024371792975244911, + "loss": 1.9177, + "step": 12276 + }, + { + "epoch": 1.4323882860809707, + "grad_norm": 1.253875970840454, + "learning_rate": 0.00024370607804044984, + "loss": 1.9913, + "step": 12277 + }, + { + "epoch": 1.4325049585812624, + "grad_norm": 1.1931346654891968, + "learning_rate": 0.0002436942253725599, + "loss": 2.0446, + "step": 12278 + }, + { + "epoch": 1.432621631081554, + "grad_norm": 1.1933767795562744, + "learning_rate": 0.00024368237174890233, + "loss": 2.1608, + "step": 12279 + }, + { + "epoch": 1.4327383035818457, + "grad_norm": 1.122648000717163, + "learning_rate": 0.00024367051716959987, + "loss": 2.1799, + "step": 12280 + }, + { + "epoch": 1.4328549760821374, + "grad_norm": 1.24635648727417, + "learning_rate": 0.0002436586616347754, + "loss": 2.1676, + "step": 12281 + }, + { + "epoch": 1.432971648582429, + "grad_norm": 1.193487286567688, + "learning_rate": 0.00024364680514455197, + "loss": 2.1936, + "step": 12282 + }, + { + "epoch": 1.4330883210827208, + "grad_norm": 1.237878680229187, + "learning_rate": 0.00024363494769905238, + "loss": 2.1769, + "step": 12283 + }, + { + "epoch": 1.4332049935830125, + "grad_norm": 1.0527583360671997, + "learning_rate": 0.0002436230892983996, + "loss": 2.0853, + "step": 12284 + }, + { + "epoch": 1.4333216660833041, + "grad_norm": 1.1477949619293213, + "learning_rate": 0.00024361122994271656, + "loss": 2.0636, + "step": 12285 + }, + { + "epoch": 1.4334383385835958, + "grad_norm": 1.0961763858795166, + "learning_rate": 0.00024359936963212617, + "loss": 2.0752, + "step": 12286 + }, + { + "epoch": 1.4335550110838875, + "grad_norm": 1.2110313177108765, + "learning_rate": 0.00024358750836675142, + "loss": 2.0203, + "step": 12287 + }, + { + "epoch": 1.4336716835841792, + "grad_norm": 1.280568242073059, + "learning_rate": 0.00024357564614671526, + "loss": 2.1365, + "step": 12288 + }, + { + "epoch": 1.4337883560844709, + "grad_norm": 1.351973056793213, + "learning_rate": 0.00024356378297214065, + "loss": 2.0404, + "step": 12289 + }, + { + "epoch": 1.4339050285847625, + "grad_norm": 1.114176869392395, + "learning_rate": 0.0002435519188431506, + "loss": 2.0833, + "step": 12290 + }, + { + "epoch": 1.4340217010850542, + "grad_norm": 1.0844006538391113, + "learning_rate": 0.00024354005375986813, + "loss": 2.0583, + "step": 12291 + }, + { + "epoch": 1.434138373585346, + "grad_norm": 1.0905044078826904, + "learning_rate": 0.00024352818772241616, + "loss": 2.0119, + "step": 12292 + }, + { + "epoch": 1.4342550460856376, + "grad_norm": 1.0816726684570312, + "learning_rate": 0.00024351632073091778, + "loss": 1.9774, + "step": 12293 + }, + { + "epoch": 1.4343717185859293, + "grad_norm": 1.3614833354949951, + "learning_rate": 0.000243504452785496, + "loss": 2.0192, + "step": 12294 + }, + { + "epoch": 1.434488391086221, + "grad_norm": 0.9895538091659546, + "learning_rate": 0.00024349258388627382, + "loss": 1.8935, + "step": 12295 + }, + { + "epoch": 1.4346050635865126, + "grad_norm": 1.244184970855713, + "learning_rate": 0.00024348071403337433, + "loss": 2.1476, + "step": 12296 + }, + { + "epoch": 1.4347217360868043, + "grad_norm": 1.090089201927185, + "learning_rate": 0.00024346884322692052, + "loss": 2.0256, + "step": 12297 + }, + { + "epoch": 1.434838408587096, + "grad_norm": 1.2747167348861694, + "learning_rate": 0.0002434569714670355, + "loss": 2.0679, + "step": 12298 + }, + { + "epoch": 1.4349550810873877, + "grad_norm": 1.120068907737732, + "learning_rate": 0.0002434450987538424, + "loss": 1.9799, + "step": 12299 + }, + { + "epoch": 1.4350717535876794, + "grad_norm": 1.1620644330978394, + "learning_rate": 0.0002434332250874642, + "loss": 2.2138, + "step": 12300 + }, + { + "epoch": 1.435188426087971, + "grad_norm": 1.1476690769195557, + "learning_rate": 0.00024342135046802403, + "loss": 2.2023, + "step": 12301 + }, + { + "epoch": 1.4353050985882627, + "grad_norm": 1.2009490728378296, + "learning_rate": 0.00024340947489564498, + "loss": 1.9538, + "step": 12302 + }, + { + "epoch": 1.4354217710885544, + "grad_norm": 1.253745198249817, + "learning_rate": 0.00024339759837045018, + "loss": 2.1921, + "step": 12303 + }, + { + "epoch": 1.435538443588846, + "grad_norm": 1.2488113641738892, + "learning_rate": 0.00024338572089256275, + "loss": 1.8861, + "step": 12304 + }, + { + "epoch": 1.4356551160891378, + "grad_norm": 1.1783947944641113, + "learning_rate": 0.00024337384246210584, + "loss": 2.2598, + "step": 12305 + }, + { + "epoch": 1.4357717885894294, + "grad_norm": 1.2149103879928589, + "learning_rate": 0.00024336196307920256, + "loss": 2.3964, + "step": 12306 + }, + { + "epoch": 1.4358884610897211, + "grad_norm": 1.1719331741333008, + "learning_rate": 0.0002433500827439761, + "loss": 2.0157, + "step": 12307 + }, + { + "epoch": 1.4360051335900128, + "grad_norm": 1.3660645484924316, + "learning_rate": 0.00024333820145654956, + "loss": 2.1196, + "step": 12308 + }, + { + "epoch": 1.4361218060903045, + "grad_norm": 1.3369578123092651, + "learning_rate": 0.0002433263192170462, + "loss": 2.2369, + "step": 12309 + }, + { + "epoch": 1.4362384785905962, + "grad_norm": 1.1162573099136353, + "learning_rate": 0.00024331443602558912, + "loss": 2.0386, + "step": 12310 + }, + { + "epoch": 1.4363551510908878, + "grad_norm": 1.1466782093048096, + "learning_rate": 0.00024330255188230156, + "loss": 1.874, + "step": 12311 + }, + { + "epoch": 1.4364718235911795, + "grad_norm": 1.2330365180969238, + "learning_rate": 0.00024329066678730672, + "loss": 2.0502, + "step": 12312 + }, + { + "epoch": 1.4365884960914712, + "grad_norm": 1.0510823726654053, + "learning_rate": 0.00024327878074072778, + "loss": 2.0046, + "step": 12313 + }, + { + "epoch": 1.436705168591763, + "grad_norm": 1.144925594329834, + "learning_rate": 0.00024326689374268802, + "loss": 2.2239, + "step": 12314 + }, + { + "epoch": 1.4368218410920546, + "grad_norm": 1.465470552444458, + "learning_rate": 0.00024325500579331058, + "loss": 2.1462, + "step": 12315 + }, + { + "epoch": 1.4369385135923463, + "grad_norm": 1.098763108253479, + "learning_rate": 0.00024324311689271876, + "loss": 2.0743, + "step": 12316 + }, + { + "epoch": 1.437055186092638, + "grad_norm": 1.158150315284729, + "learning_rate": 0.00024323122704103577, + "loss": 2.182, + "step": 12317 + }, + { + "epoch": 1.4371718585929296, + "grad_norm": 0.9875715374946594, + "learning_rate": 0.00024321933623838493, + "loss": 1.8207, + "step": 12318 + }, + { + "epoch": 1.4372885310932213, + "grad_norm": 1.2612046003341675, + "learning_rate": 0.00024320744448488955, + "loss": 2.063, + "step": 12319 + }, + { + "epoch": 1.437405203593513, + "grad_norm": 1.023984670639038, + "learning_rate": 0.00024319555178067278, + "loss": 1.7725, + "step": 12320 + }, + { + "epoch": 1.4375218760938047, + "grad_norm": 1.1605758666992188, + "learning_rate": 0.00024318365812585802, + "loss": 2.0251, + "step": 12321 + }, + { + "epoch": 1.4376385485940963, + "grad_norm": 1.1175904273986816, + "learning_rate": 0.00024317176352056847, + "loss": 2.2675, + "step": 12322 + }, + { + "epoch": 1.437755221094388, + "grad_norm": 1.2221834659576416, + "learning_rate": 0.00024315986796492753, + "loss": 2.0309, + "step": 12323 + }, + { + "epoch": 1.4378718935946797, + "grad_norm": 1.0933901071548462, + "learning_rate": 0.0002431479714590585, + "loss": 1.929, + "step": 12324 + }, + { + "epoch": 1.4379885660949714, + "grad_norm": 1.2284542322158813, + "learning_rate": 0.0002431360740030847, + "loss": 1.9436, + "step": 12325 + }, + { + "epoch": 1.438105238595263, + "grad_norm": 1.3465404510498047, + "learning_rate": 0.00024312417559712945, + "loss": 2.2304, + "step": 12326 + }, + { + "epoch": 1.4382219110955548, + "grad_norm": 1.170372486114502, + "learning_rate": 0.0002431122762413161, + "loss": 2.057, + "step": 12327 + }, + { + "epoch": 1.4383385835958464, + "grad_norm": 1.3693078756332397, + "learning_rate": 0.00024310037593576803, + "loss": 2.1599, + "step": 12328 + }, + { + "epoch": 1.4384552560961381, + "grad_norm": 1.19704008102417, + "learning_rate": 0.0002430884746806086, + "loss": 1.9693, + "step": 12329 + }, + { + "epoch": 1.4385719285964298, + "grad_norm": 1.067096471786499, + "learning_rate": 0.00024307657247596117, + "loss": 2.0664, + "step": 12330 + }, + { + "epoch": 1.4386886010967215, + "grad_norm": 1.1573361158370972, + "learning_rate": 0.0002430646693219492, + "loss": 2.0464, + "step": 12331 + }, + { + "epoch": 1.4388052735970132, + "grad_norm": 1.1675875186920166, + "learning_rate": 0.000243052765218696, + "loss": 2.1281, + "step": 12332 + }, + { + "epoch": 1.4389219460973048, + "grad_norm": 1.0781042575836182, + "learning_rate": 0.00024304086016632505, + "loss": 2.1212, + "step": 12333 + }, + { + "epoch": 1.4390386185975965, + "grad_norm": 1.2685846090316772, + "learning_rate": 0.00024302895416495975, + "loss": 2.0498, + "step": 12334 + }, + { + "epoch": 1.4391552910978882, + "grad_norm": 1.0212732553482056, + "learning_rate": 0.0002430170472147235, + "loss": 2.0836, + "step": 12335 + }, + { + "epoch": 1.4392719635981799, + "grad_norm": 1.2505593299865723, + "learning_rate": 0.00024300513931573974, + "loss": 1.9661, + "step": 12336 + }, + { + "epoch": 1.4393886360984716, + "grad_norm": 1.1859298944473267, + "learning_rate": 0.00024299323046813196, + "loss": 2.07, + "step": 12337 + }, + { + "epoch": 1.4395053085987632, + "grad_norm": 1.1413753032684326, + "learning_rate": 0.00024298132067202357, + "loss": 1.9997, + "step": 12338 + }, + { + "epoch": 1.439621981099055, + "grad_norm": 1.112102746963501, + "learning_rate": 0.00024296940992753804, + "loss": 2.0699, + "step": 12339 + }, + { + "epoch": 1.4397386535993466, + "grad_norm": 1.180192470550537, + "learning_rate": 0.00024295749823479888, + "loss": 2.2998, + "step": 12340 + }, + { + "epoch": 1.4398553260996383, + "grad_norm": 1.1331894397735596, + "learning_rate": 0.00024294558559392957, + "loss": 1.9828, + "step": 12341 + }, + { + "epoch": 1.43997199859993, + "grad_norm": 1.1614431142807007, + "learning_rate": 0.00024293367200505363, + "loss": 2.0012, + "step": 12342 + }, + { + "epoch": 1.4400886711002217, + "grad_norm": 1.158982753753662, + "learning_rate": 0.00024292175746829447, + "loss": 2.0337, + "step": 12343 + }, + { + "epoch": 1.4402053436005133, + "grad_norm": 1.17058527469635, + "learning_rate": 0.0002429098419837757, + "loss": 2.0239, + "step": 12344 + }, + { + "epoch": 1.440322016100805, + "grad_norm": 1.0378860235214233, + "learning_rate": 0.0002428979255516208, + "loss": 2.029, + "step": 12345 + }, + { + "epoch": 1.4404386886010967, + "grad_norm": 1.0090340375900269, + "learning_rate": 0.0002428860081719534, + "loss": 1.9114, + "step": 12346 + }, + { + "epoch": 1.4405553611013884, + "grad_norm": 1.1661309003829956, + "learning_rate": 0.0002428740898448969, + "loss": 2.1421, + "step": 12347 + }, + { + "epoch": 1.44067203360168, + "grad_norm": 1.1298121213912964, + "learning_rate": 0.00024286217057057496, + "loss": 2.1221, + "step": 12348 + }, + { + "epoch": 1.4407887061019717, + "grad_norm": 1.0218466520309448, + "learning_rate": 0.00024285025034911106, + "loss": 1.9137, + "step": 12349 + }, + { + "epoch": 1.4409053786022634, + "grad_norm": 1.3441681861877441, + "learning_rate": 0.00024283832918062888, + "loss": 2.1292, + "step": 12350 + }, + { + "epoch": 1.441022051102555, + "grad_norm": 1.249208688735962, + "learning_rate": 0.00024282640706525192, + "loss": 2.0599, + "step": 12351 + }, + { + "epoch": 1.4411387236028468, + "grad_norm": 1.169252634048462, + "learning_rate": 0.00024281448400310387, + "loss": 1.951, + "step": 12352 + }, + { + "epoch": 1.4412553961031385, + "grad_norm": 1.320989966392517, + "learning_rate": 0.00024280255999430825, + "loss": 2.1205, + "step": 12353 + }, + { + "epoch": 1.4413720686034301, + "grad_norm": 1.3027199506759644, + "learning_rate": 0.00024279063503898868, + "loss": 2.2312, + "step": 12354 + }, + { + "epoch": 1.4414887411037218, + "grad_norm": 1.1879796981811523, + "learning_rate": 0.00024277870913726882, + "loss": 2.0874, + "step": 12355 + }, + { + "epoch": 1.4416054136040135, + "grad_norm": 1.164067268371582, + "learning_rate": 0.00024276678228927226, + "loss": 2.0512, + "step": 12356 + }, + { + "epoch": 1.4417220861043052, + "grad_norm": 1.1405607461929321, + "learning_rate": 0.00024275485449512272, + "loss": 2.2864, + "step": 12357 + }, + { + "epoch": 1.4418387586045969, + "grad_norm": 1.339751958847046, + "learning_rate": 0.00024274292575494373, + "loss": 2.273, + "step": 12358 + }, + { + "epoch": 1.4419554311048886, + "grad_norm": 1.3839281797409058, + "learning_rate": 0.0002427309960688591, + "loss": 2.2158, + "step": 12359 + }, + { + "epoch": 1.4420721036051802, + "grad_norm": 1.1005038022994995, + "learning_rate": 0.0002427190654369924, + "loss": 2.0445, + "step": 12360 + }, + { + "epoch": 1.442188776105472, + "grad_norm": 1.238350510597229, + "learning_rate": 0.00024270713385946742, + "loss": 2.0051, + "step": 12361 + }, + { + "epoch": 1.4423054486057636, + "grad_norm": 1.1505802869796753, + "learning_rate": 0.00024269520133640774, + "loss": 2.2337, + "step": 12362 + }, + { + "epoch": 1.4424221211060553, + "grad_norm": 1.2561063766479492, + "learning_rate": 0.00024268326786793708, + "loss": 2.181, + "step": 12363 + }, + { + "epoch": 1.442538793606347, + "grad_norm": 1.2024421691894531, + "learning_rate": 0.00024267133345417925, + "loss": 2.1635, + "step": 12364 + }, + { + "epoch": 1.4426554661066386, + "grad_norm": 1.1449244022369385, + "learning_rate": 0.00024265939809525782, + "loss": 2.1375, + "step": 12365 + }, + { + "epoch": 1.4427721386069303, + "grad_norm": 1.0970053672790527, + "learning_rate": 0.00024264746179129666, + "loss": 2.0891, + "step": 12366 + }, + { + "epoch": 1.442888811107222, + "grad_norm": 1.061383605003357, + "learning_rate": 0.0002426355245424194, + "loss": 2.0277, + "step": 12367 + }, + { + "epoch": 1.4430054836075137, + "grad_norm": 1.1767983436584473, + "learning_rate": 0.00024262358634874988, + "loss": 1.8486, + "step": 12368 + }, + { + "epoch": 1.4431221561078054, + "grad_norm": 1.354383945465088, + "learning_rate": 0.00024261164721041184, + "loss": 2.0564, + "step": 12369 + }, + { + "epoch": 1.443238828608097, + "grad_norm": 1.1594890356063843, + "learning_rate": 0.00024259970712752903, + "loss": 2.0589, + "step": 12370 + }, + { + "epoch": 1.4433555011083887, + "grad_norm": 1.088486909866333, + "learning_rate": 0.00024258776610022525, + "loss": 2.0908, + "step": 12371 + }, + { + "epoch": 1.4434721736086804, + "grad_norm": 1.059393048286438, + "learning_rate": 0.00024257582412862429, + "loss": 2.025, + "step": 12372 + }, + { + "epoch": 1.443588846108972, + "grad_norm": 1.2577303647994995, + "learning_rate": 0.00024256388121284993, + "loss": 1.9547, + "step": 12373 + }, + { + "epoch": 1.4437055186092638, + "grad_norm": 1.2294254302978516, + "learning_rate": 0.00024255193735302598, + "loss": 2.2205, + "step": 12374 + }, + { + "epoch": 1.4438221911095555, + "grad_norm": 0.9986647963523865, + "learning_rate": 0.00024253999254927633, + "loss": 1.9367, + "step": 12375 + }, + { + "epoch": 1.4439388636098471, + "grad_norm": 1.2007087469100952, + "learning_rate": 0.00024252804680172473, + "loss": 2.0437, + "step": 12376 + }, + { + "epoch": 1.4440555361101388, + "grad_norm": 1.1323387622833252, + "learning_rate": 0.00024251610011049508, + "loss": 1.9357, + "step": 12377 + }, + { + "epoch": 1.4441722086104305, + "grad_norm": 1.2006042003631592, + "learning_rate": 0.00024250415247571117, + "loss": 2.3137, + "step": 12378 + }, + { + "epoch": 1.4442888811107222, + "grad_norm": 1.020058512687683, + "learning_rate": 0.00024249220389749692, + "loss": 1.8479, + "step": 12379 + }, + { + "epoch": 1.4444055536110139, + "grad_norm": 1.306944489479065, + "learning_rate": 0.0002424802543759761, + "loss": 2.1138, + "step": 12380 + }, + { + "epoch": 1.4445222261113055, + "grad_norm": 1.1882610321044922, + "learning_rate": 0.0002424683039112727, + "loss": 2.0782, + "step": 12381 + }, + { + "epoch": 1.4446388986115972, + "grad_norm": 1.1140061616897583, + "learning_rate": 0.00024245635250351056, + "loss": 2.0628, + "step": 12382 + }, + { + "epoch": 1.444755571111889, + "grad_norm": 1.2897766828536987, + "learning_rate": 0.0002424444001528136, + "loss": 2.2066, + "step": 12383 + }, + { + "epoch": 1.4448722436121806, + "grad_norm": 1.1924054622650146, + "learning_rate": 0.0002424324468593057, + "loss": 2.1533, + "step": 12384 + }, + { + "epoch": 1.4449889161124723, + "grad_norm": 1.2057299613952637, + "learning_rate": 0.00024242049262311084, + "loss": 2.1144, + "step": 12385 + }, + { + "epoch": 1.445105588612764, + "grad_norm": 1.0575422048568726, + "learning_rate": 0.00024240853744435286, + "loss": 2.0001, + "step": 12386 + }, + { + "epoch": 1.4452222611130556, + "grad_norm": 1.1869843006134033, + "learning_rate": 0.00024239658132315574, + "loss": 1.8763, + "step": 12387 + }, + { + "epoch": 1.4453389336133473, + "grad_norm": 1.0573346614837646, + "learning_rate": 0.0002423846242596434, + "loss": 2.0973, + "step": 12388 + }, + { + "epoch": 1.445455606113639, + "grad_norm": 1.2064498662948608, + "learning_rate": 0.00024237266625393985, + "loss": 2.1223, + "step": 12389 + }, + { + "epoch": 1.4455722786139307, + "grad_norm": 1.22520112991333, + "learning_rate": 0.00024236070730616907, + "loss": 1.8852, + "step": 12390 + }, + { + "epoch": 1.4456889511142224, + "grad_norm": 1.1485077142715454, + "learning_rate": 0.00024234874741645498, + "loss": 2.1147, + "step": 12391 + }, + { + "epoch": 1.445805623614514, + "grad_norm": 1.194191813468933, + "learning_rate": 0.00024233678658492158, + "loss": 2.1571, + "step": 12392 + }, + { + "epoch": 1.4459222961148057, + "grad_norm": 1.1175475120544434, + "learning_rate": 0.00024232482481169288, + "loss": 2.1955, + "step": 12393 + }, + { + "epoch": 1.4460389686150974, + "grad_norm": 1.1096524000167847, + "learning_rate": 0.00024231286209689285, + "loss": 2.0606, + "step": 12394 + }, + { + "epoch": 1.446155641115389, + "grad_norm": 1.2669217586517334, + "learning_rate": 0.00024230089844064556, + "loss": 2.2297, + "step": 12395 + }, + { + "epoch": 1.4462723136156808, + "grad_norm": 1.221541404724121, + "learning_rate": 0.000242288933843075, + "loss": 2.018, + "step": 12396 + }, + { + "epoch": 1.4463889861159724, + "grad_norm": 1.2922470569610596, + "learning_rate": 0.0002422769683043052, + "loss": 2.2765, + "step": 12397 + }, + { + "epoch": 1.4465056586162641, + "grad_norm": 1.2956640720367432, + "learning_rate": 0.0002422650018244603, + "loss": 2.1181, + "step": 12398 + }, + { + "epoch": 1.4466223311165558, + "grad_norm": 1.2687968015670776, + "learning_rate": 0.00024225303440366422, + "loss": 2.0656, + "step": 12399 + }, + { + "epoch": 1.4467390036168475, + "grad_norm": 1.276180624961853, + "learning_rate": 0.00024224106604204109, + "loss": 2.0258, + "step": 12400 + }, + { + "epoch": 1.4468556761171392, + "grad_norm": 1.0887831449508667, + "learning_rate": 0.00024222909673971498, + "loss": 1.8083, + "step": 12401 + }, + { + "epoch": 1.4469723486174308, + "grad_norm": 1.1694890260696411, + "learning_rate": 0.00024221712649680996, + "loss": 2.1022, + "step": 12402 + }, + { + "epoch": 1.4470890211177225, + "grad_norm": 1.2195106744766235, + "learning_rate": 0.00024220515531345015, + "loss": 2.0229, + "step": 12403 + }, + { + "epoch": 1.4472056936180142, + "grad_norm": 1.3163608312606812, + "learning_rate": 0.00024219318318975967, + "loss": 1.9923, + "step": 12404 + }, + { + "epoch": 1.447322366118306, + "grad_norm": 1.2945787906646729, + "learning_rate": 0.00024218121012586257, + "loss": 2.1217, + "step": 12405 + }, + { + "epoch": 1.4474390386185976, + "grad_norm": 1.2715288400650024, + "learning_rate": 0.00024216923612188298, + "loss": 2.1947, + "step": 12406 + }, + { + "epoch": 1.4475557111188893, + "grad_norm": 1.229345440864563, + "learning_rate": 0.0002421572611779451, + "loss": 2.1399, + "step": 12407 + }, + { + "epoch": 1.447672383619181, + "grad_norm": 1.1422327756881714, + "learning_rate": 0.00024214528529417303, + "loss": 2.2529, + "step": 12408 + }, + { + "epoch": 1.4477890561194726, + "grad_norm": 1.2563525438308716, + "learning_rate": 0.0002421333084706909, + "loss": 1.9925, + "step": 12409 + }, + { + "epoch": 1.4479057286197643, + "grad_norm": 1.0579081773757935, + "learning_rate": 0.00024212133070762292, + "loss": 2.0399, + "step": 12410 + }, + { + "epoch": 1.448022401120056, + "grad_norm": 1.1865010261535645, + "learning_rate": 0.0002421093520050933, + "loss": 2.06, + "step": 12411 + }, + { + "epoch": 1.4481390736203477, + "grad_norm": 1.1762630939483643, + "learning_rate": 0.0002420973723632261, + "loss": 2.0335, + "step": 12412 + }, + { + "epoch": 1.4482557461206393, + "grad_norm": 1.200181007385254, + "learning_rate": 0.00024208539178214555, + "loss": 2.0671, + "step": 12413 + }, + { + "epoch": 1.448372418620931, + "grad_norm": 1.2274868488311768, + "learning_rate": 0.00024207341026197593, + "loss": 2.279, + "step": 12414 + }, + { + "epoch": 1.4484890911212227, + "grad_norm": 1.3450500965118408, + "learning_rate": 0.0002420614278028414, + "loss": 2.2159, + "step": 12415 + }, + { + "epoch": 1.4486057636215144, + "grad_norm": 1.1541321277618408, + "learning_rate": 0.00024204944440486616, + "loss": 1.824, + "step": 12416 + }, + { + "epoch": 1.448722436121806, + "grad_norm": 1.4146510362625122, + "learning_rate": 0.00024203746006817446, + "loss": 2.2026, + "step": 12417 + }, + { + "epoch": 1.4488391086220977, + "grad_norm": 1.324271321296692, + "learning_rate": 0.00024202547479289054, + "loss": 2.0735, + "step": 12418 + }, + { + "epoch": 1.4489557811223894, + "grad_norm": 1.0831786394119263, + "learning_rate": 0.00024201348857913862, + "loss": 2.1089, + "step": 12419 + }, + { + "epoch": 1.4490724536226811, + "grad_norm": 1.129828929901123, + "learning_rate": 0.00024200150142704308, + "loss": 2.1648, + "step": 12420 + }, + { + "epoch": 1.4491891261229728, + "grad_norm": 1.1816134452819824, + "learning_rate": 0.00024198951333672801, + "loss": 1.9975, + "step": 12421 + }, + { + "epoch": 1.4493057986232645, + "grad_norm": 1.0486681461334229, + "learning_rate": 0.00024197752430831778, + "loss": 2.0327, + "step": 12422 + }, + { + "epoch": 1.4494224711235562, + "grad_norm": 1.1474531888961792, + "learning_rate": 0.00024196553434193668, + "loss": 2.2984, + "step": 12423 + }, + { + "epoch": 1.4495391436238478, + "grad_norm": 1.1756963729858398, + "learning_rate": 0.00024195354343770902, + "loss": 2.1004, + "step": 12424 + }, + { + "epoch": 1.4496558161241395, + "grad_norm": 1.1936639547348022, + "learning_rate": 0.00024194155159575913, + "loss": 2.1877, + "step": 12425 + }, + { + "epoch": 1.4497724886244312, + "grad_norm": 1.1055363416671753, + "learning_rate": 0.0002419295588162113, + "loss": 2.0466, + "step": 12426 + }, + { + "epoch": 1.4498891611247229, + "grad_norm": 1.3098475933074951, + "learning_rate": 0.00024191756509918977, + "loss": 2.2224, + "step": 12427 + }, + { + "epoch": 1.4500058336250146, + "grad_norm": 1.1154862642288208, + "learning_rate": 0.00024190557044481904, + "loss": 2.0033, + "step": 12428 + }, + { + "epoch": 1.4501225061253062, + "grad_norm": 1.2531447410583496, + "learning_rate": 0.00024189357485322331, + "loss": 2.0997, + "step": 12429 + }, + { + "epoch": 1.450239178625598, + "grad_norm": 1.254991054534912, + "learning_rate": 0.00024188157832452707, + "loss": 2.0954, + "step": 12430 + }, + { + "epoch": 1.4503558511258896, + "grad_norm": 1.277886986732483, + "learning_rate": 0.00024186958085885456, + "loss": 2.1661, + "step": 12431 + }, + { + "epoch": 1.4504725236261813, + "grad_norm": 1.3049641847610474, + "learning_rate": 0.00024185758245633024, + "loss": 2.1347, + "step": 12432 + }, + { + "epoch": 1.450589196126473, + "grad_norm": 1.1818820238113403, + "learning_rate": 0.00024184558311707844, + "loss": 2.0803, + "step": 12433 + }, + { + "epoch": 1.4507058686267646, + "grad_norm": 1.1049525737762451, + "learning_rate": 0.00024183358284122364, + "loss": 2.0176, + "step": 12434 + }, + { + "epoch": 1.4508225411270563, + "grad_norm": 1.1943717002868652, + "learning_rate": 0.00024182158162889015, + "loss": 2.1297, + "step": 12435 + }, + { + "epoch": 1.450939213627348, + "grad_norm": 1.0498493909835815, + "learning_rate": 0.00024180957948020243, + "loss": 1.9802, + "step": 12436 + }, + { + "epoch": 1.4510558861276397, + "grad_norm": 1.1235320568084717, + "learning_rate": 0.00024179757639528494, + "loss": 2.1288, + "step": 12437 + }, + { + "epoch": 1.4511725586279314, + "grad_norm": 1.0061571598052979, + "learning_rate": 0.00024178557237426209, + "loss": 1.9409, + "step": 12438 + }, + { + "epoch": 1.451289231128223, + "grad_norm": 0.9913708567619324, + "learning_rate": 0.00024177356741725826, + "loss": 2.1381, + "step": 12439 + }, + { + "epoch": 1.4514059036285147, + "grad_norm": 1.2311166524887085, + "learning_rate": 0.00024176156152439798, + "loss": 2.025, + "step": 12440 + }, + { + "epoch": 1.4515225761288064, + "grad_norm": 1.1900302171707153, + "learning_rate": 0.00024174955469580565, + "loss": 2.2456, + "step": 12441 + }, + { + "epoch": 1.451639248629098, + "grad_norm": 1.06264328956604, + "learning_rate": 0.0002417375469316059, + "loss": 1.9231, + "step": 12442 + }, + { + "epoch": 1.4517559211293898, + "grad_norm": 1.1065928936004639, + "learning_rate": 0.000241725538231923, + "loss": 2.0655, + "step": 12443 + }, + { + "epoch": 1.4518725936296815, + "grad_norm": 1.0567326545715332, + "learning_rate": 0.00024171352859688152, + "loss": 1.8997, + "step": 12444 + }, + { + "epoch": 1.4519892661299731, + "grad_norm": 1.2808523178100586, + "learning_rate": 0.00024170151802660602, + "loss": 2.1834, + "step": 12445 + }, + { + "epoch": 1.4521059386302648, + "grad_norm": 1.2505896091461182, + "learning_rate": 0.000241689506521221, + "loss": 2.2174, + "step": 12446 + }, + { + "epoch": 1.4522226111305565, + "grad_norm": 1.3225535154342651, + "learning_rate": 0.00024167749408085095, + "loss": 2.0186, + "step": 12447 + }, + { + "epoch": 1.4523392836308482, + "grad_norm": 1.2293407917022705, + "learning_rate": 0.00024166548070562034, + "loss": 2.1325, + "step": 12448 + }, + { + "epoch": 1.4524559561311399, + "grad_norm": 1.3376635313034058, + "learning_rate": 0.00024165346639565385, + "loss": 2.0622, + "step": 12449 + }, + { + "epoch": 1.4525726286314316, + "grad_norm": 1.044122338294983, + "learning_rate": 0.00024164145115107596, + "loss": 1.9924, + "step": 12450 + }, + { + "epoch": 1.4526893011317232, + "grad_norm": 1.1003800630569458, + "learning_rate": 0.00024162943497201122, + "loss": 2.1511, + "step": 12451 + }, + { + "epoch": 1.452805973632015, + "grad_norm": 1.1784509420394897, + "learning_rate": 0.00024161741785858423, + "loss": 2.1702, + "step": 12452 + }, + { + "epoch": 1.4529226461323066, + "grad_norm": 1.4927313327789307, + "learning_rate": 0.00024160539981091952, + "loss": 2.2308, + "step": 12453 + }, + { + "epoch": 1.4530393186325983, + "grad_norm": 1.1944730281829834, + "learning_rate": 0.00024159338082914173, + "loss": 2.0525, + "step": 12454 + }, + { + "epoch": 1.45315599113289, + "grad_norm": 1.2664378881454468, + "learning_rate": 0.00024158136091337546, + "loss": 2.1774, + "step": 12455 + }, + { + "epoch": 1.4532726636331816, + "grad_norm": 1.3075518608093262, + "learning_rate": 0.00024156934006374527, + "loss": 2.1488, + "step": 12456 + }, + { + "epoch": 1.4533893361334733, + "grad_norm": 1.0664552450180054, + "learning_rate": 0.0002415573182803758, + "loss": 1.8742, + "step": 12457 + }, + { + "epoch": 1.453506008633765, + "grad_norm": 1.2027256488800049, + "learning_rate": 0.0002415452955633917, + "loss": 2.1369, + "step": 12458 + }, + { + "epoch": 1.4536226811340567, + "grad_norm": 1.1736817359924316, + "learning_rate": 0.0002415332719129176, + "loss": 2.0024, + "step": 12459 + }, + { + "epoch": 1.4537393536343484, + "grad_norm": 1.1652841567993164, + "learning_rate": 0.0002415212473290782, + "loss": 2.0314, + "step": 12460 + }, + { + "epoch": 1.45385602613464, + "grad_norm": 1.211491584777832, + "learning_rate": 0.00024150922181199802, + "loss": 2.0602, + "step": 12461 + }, + { + "epoch": 1.4539726986349317, + "grad_norm": 1.170800805091858, + "learning_rate": 0.00024149719536180176, + "loss": 2.1863, + "step": 12462 + }, + { + "epoch": 1.4540893711352234, + "grad_norm": 1.2606472969055176, + "learning_rate": 0.0002414851679786143, + "loss": 2.0216, + "step": 12463 + }, + { + "epoch": 1.454206043635515, + "grad_norm": 1.125150442123413, + "learning_rate": 0.00024147313966256008, + "loss": 1.9758, + "step": 12464 + }, + { + "epoch": 1.4543227161358068, + "grad_norm": 1.0328593254089355, + "learning_rate": 0.00024146111041376393, + "loss": 2.052, + "step": 12465 + }, + { + "epoch": 1.4544393886360985, + "grad_norm": 1.2238445281982422, + "learning_rate": 0.0002414490802323505, + "loss": 2.2167, + "step": 12466 + }, + { + "epoch": 1.4545560611363901, + "grad_norm": 1.2943087816238403, + "learning_rate": 0.0002414370491184445, + "loss": 2.1937, + "step": 12467 + }, + { + "epoch": 1.4546727336366818, + "grad_norm": 1.108556866645813, + "learning_rate": 0.0002414250170721707, + "loss": 1.867, + "step": 12468 + }, + { + "epoch": 1.4547894061369735, + "grad_norm": 1.2199128866195679, + "learning_rate": 0.00024141298409365375, + "loss": 2.0215, + "step": 12469 + }, + { + "epoch": 1.4549060786372652, + "grad_norm": 1.249144434928894, + "learning_rate": 0.00024140095018301854, + "loss": 2.0522, + "step": 12470 + }, + { + "epoch": 1.4550227511375569, + "grad_norm": 1.346558928489685, + "learning_rate": 0.0002413889153403897, + "loss": 2.1089, + "step": 12471 + }, + { + "epoch": 1.4551394236378485, + "grad_norm": 1.1358884572982788, + "learning_rate": 0.00024137687956589206, + "loss": 2.1397, + "step": 12472 + }, + { + "epoch": 1.4552560961381402, + "grad_norm": 1.3228789567947388, + "learning_rate": 0.00024136484285965033, + "loss": 2.1198, + "step": 12473 + }, + { + "epoch": 1.455372768638432, + "grad_norm": 1.1296870708465576, + "learning_rate": 0.00024135280522178933, + "loss": 2.0549, + "step": 12474 + }, + { + "epoch": 1.4554894411387236, + "grad_norm": 1.123869776725769, + "learning_rate": 0.0002413407666524339, + "loss": 2.1625, + "step": 12475 + }, + { + "epoch": 1.4556061136390153, + "grad_norm": 1.1569702625274658, + "learning_rate": 0.00024132872715170874, + "loss": 2.1748, + "step": 12476 + }, + { + "epoch": 1.455722786139307, + "grad_norm": 1.2657071352005005, + "learning_rate": 0.00024131668671973876, + "loss": 2.2516, + "step": 12477 + }, + { + "epoch": 1.4558394586395986, + "grad_norm": 1.1992378234863281, + "learning_rate": 0.00024130464535664876, + "loss": 2.087, + "step": 12478 + }, + { + "epoch": 1.4559561311398903, + "grad_norm": 1.0595643520355225, + "learning_rate": 0.00024129260306256348, + "loss": 1.9578, + "step": 12479 + }, + { + "epoch": 1.456072803640182, + "grad_norm": 1.3753541707992554, + "learning_rate": 0.00024128055983760793, + "loss": 2.2193, + "step": 12480 + }, + { + "epoch": 1.4561894761404737, + "grad_norm": 1.1582287549972534, + "learning_rate": 0.00024126851568190678, + "loss": 2.0614, + "step": 12481 + }, + { + "epoch": 1.4563061486407654, + "grad_norm": 1.1133555173873901, + "learning_rate": 0.00024125647059558496, + "loss": 1.8548, + "step": 12482 + }, + { + "epoch": 1.456422821141057, + "grad_norm": 1.2339184284210205, + "learning_rate": 0.00024124442457876743, + "loss": 2.2046, + "step": 12483 + }, + { + "epoch": 1.4565394936413487, + "grad_norm": 1.1914303302764893, + "learning_rate": 0.00024123237763157897, + "loss": 2.0477, + "step": 12484 + }, + { + "epoch": 1.4566561661416404, + "grad_norm": 1.0639126300811768, + "learning_rate": 0.00024122032975414448, + "loss": 2.1404, + "step": 12485 + }, + { + "epoch": 1.456772838641932, + "grad_norm": 1.319421410560608, + "learning_rate": 0.00024120828094658888, + "loss": 2.3131, + "step": 12486 + }, + { + "epoch": 1.4568895111422238, + "grad_norm": 1.139578938484192, + "learning_rate": 0.00024119623120903702, + "loss": 2.079, + "step": 12487 + }, + { + "epoch": 1.4570061836425154, + "grad_norm": 1.1016725301742554, + "learning_rate": 0.00024118418054161394, + "loss": 1.9418, + "step": 12488 + }, + { + "epoch": 1.4571228561428071, + "grad_norm": 1.1036498546600342, + "learning_rate": 0.0002411721289444445, + "loss": 1.9364, + "step": 12489 + }, + { + "epoch": 1.4572395286430988, + "grad_norm": 1.0837960243225098, + "learning_rate": 0.00024116007641765357, + "loss": 1.9944, + "step": 12490 + }, + { + "epoch": 1.4573562011433905, + "grad_norm": 1.262612223625183, + "learning_rate": 0.00024114802296136624, + "loss": 2.1086, + "step": 12491 + }, + { + "epoch": 1.4574728736436822, + "grad_norm": 1.2413945198059082, + "learning_rate": 0.00024113596857570734, + "loss": 2.169, + "step": 12492 + }, + { + "epoch": 1.4575895461439738, + "grad_norm": 1.3520464897155762, + "learning_rate": 0.0002411239132608019, + "loss": 2.0553, + "step": 12493 + }, + { + "epoch": 1.4577062186442655, + "grad_norm": 1.400681495666504, + "learning_rate": 0.0002411118570167749, + "loss": 2.034, + "step": 12494 + }, + { + "epoch": 1.4578228911445572, + "grad_norm": 1.0735687017440796, + "learning_rate": 0.00024109979984375125, + "loss": 2.1, + "step": 12495 + }, + { + "epoch": 1.457939563644849, + "grad_norm": 1.4222822189331055, + "learning_rate": 0.00024108774174185605, + "loss": 2.3436, + "step": 12496 + }, + { + "epoch": 1.4580562361451406, + "grad_norm": 1.147943139076233, + "learning_rate": 0.00024107568271121427, + "loss": 1.963, + "step": 12497 + }, + { + "epoch": 1.4581729086454323, + "grad_norm": 1.261902928352356, + "learning_rate": 0.0002410636227519509, + "loss": 2.1583, + "step": 12498 + }, + { + "epoch": 1.458289581145724, + "grad_norm": 1.264065146446228, + "learning_rate": 0.00024105156186419096, + "loss": 2.2514, + "step": 12499 + }, + { + "epoch": 1.4584062536460156, + "grad_norm": 1.3454819917678833, + "learning_rate": 0.00024103950004805949, + "loss": 2.1018, + "step": 12500 + }, + { + "epoch": 1.4585229261463073, + "grad_norm": 1.08320152759552, + "learning_rate": 0.0002410274373036816, + "loss": 1.897, + "step": 12501 + }, + { + "epoch": 1.458639598646599, + "grad_norm": 1.1804211139678955, + "learning_rate": 0.00024101537363118225, + "loss": 2.084, + "step": 12502 + }, + { + "epoch": 1.4587562711468907, + "grad_norm": 1.097793698310852, + "learning_rate": 0.00024100330903068653, + "loss": 2.076, + "step": 12503 + }, + { + "epoch": 1.4588729436471823, + "grad_norm": 1.1508870124816895, + "learning_rate": 0.00024099124350231957, + "loss": 1.9601, + "step": 12504 + }, + { + "epoch": 1.458989616147474, + "grad_norm": 1.1009197235107422, + "learning_rate": 0.00024097917704620633, + "loss": 1.9353, + "step": 12505 + }, + { + "epoch": 1.4591062886477657, + "grad_norm": 1.1708104610443115, + "learning_rate": 0.00024096710966247208, + "loss": 2.2298, + "step": 12506 + }, + { + "epoch": 1.4592229611480574, + "grad_norm": 1.0466817617416382, + "learning_rate": 0.0002409550413512417, + "loss": 2.0715, + "step": 12507 + }, + { + "epoch": 1.459339633648349, + "grad_norm": 1.2328660488128662, + "learning_rate": 0.00024094297211264053, + "loss": 2.1929, + "step": 12508 + }, + { + "epoch": 1.4594563061486407, + "grad_norm": 1.2867677211761475, + "learning_rate": 0.0002409309019467935, + "loss": 2.0576, + "step": 12509 + }, + { + "epoch": 1.4595729786489324, + "grad_norm": 1.1479649543762207, + "learning_rate": 0.00024091883085382588, + "loss": 2.0743, + "step": 12510 + }, + { + "epoch": 1.459689651149224, + "grad_norm": 1.0914983749389648, + "learning_rate": 0.0002409067588338627, + "loss": 2.1488, + "step": 12511 + }, + { + "epoch": 1.4598063236495158, + "grad_norm": 1.1328566074371338, + "learning_rate": 0.00024089468588702918, + "loss": 2.0278, + "step": 12512 + }, + { + "epoch": 1.4599229961498075, + "grad_norm": 1.1167067289352417, + "learning_rate": 0.00024088261201345043, + "loss": 2.1062, + "step": 12513 + }, + { + "epoch": 1.4600396686500992, + "grad_norm": 1.1135259866714478, + "learning_rate": 0.00024087053721325174, + "loss": 2.2703, + "step": 12514 + }, + { + "epoch": 1.4601563411503908, + "grad_norm": 1.088565707206726, + "learning_rate": 0.00024085846148655813, + "loss": 1.9317, + "step": 12515 + }, + { + "epoch": 1.4602730136506825, + "grad_norm": 1.2399342060089111, + "learning_rate": 0.00024084638483349484, + "loss": 2.129, + "step": 12516 + }, + { + "epoch": 1.4603896861509742, + "grad_norm": 1.240850806236267, + "learning_rate": 0.0002408343072541871, + "loss": 2.1503, + "step": 12517 + }, + { + "epoch": 1.4605063586512659, + "grad_norm": 1.091507077217102, + "learning_rate": 0.00024082222874876005, + "loss": 2.2036, + "step": 12518 + }, + { + "epoch": 1.4606230311515576, + "grad_norm": 1.195219874382019, + "learning_rate": 0.00024081014931733903, + "loss": 2.1713, + "step": 12519 + }, + { + "epoch": 1.4607397036518492, + "grad_norm": 1.2711411714553833, + "learning_rate": 0.00024079806896004917, + "loss": 2.0611, + "step": 12520 + }, + { + "epoch": 1.460856376152141, + "grad_norm": 1.0774489641189575, + "learning_rate": 0.00024078598767701567, + "loss": 2.1165, + "step": 12521 + }, + { + "epoch": 1.4609730486524326, + "grad_norm": 1.3957901000976562, + "learning_rate": 0.00024077390546836384, + "loss": 2.1575, + "step": 12522 + }, + { + "epoch": 1.4610897211527243, + "grad_norm": 1.2085256576538086, + "learning_rate": 0.00024076182233421896, + "loss": 2.245, + "step": 12523 + }, + { + "epoch": 1.461206393653016, + "grad_norm": 1.4041985273361206, + "learning_rate": 0.0002407497382747063, + "loss": 2.3377, + "step": 12524 + }, + { + "epoch": 1.4613230661533076, + "grad_norm": 1.2923791408538818, + "learning_rate": 0.00024073765328995106, + "loss": 2.1092, + "step": 12525 + }, + { + "epoch": 1.4614397386535993, + "grad_norm": 1.2194066047668457, + "learning_rate": 0.00024072556738007853, + "loss": 2.0446, + "step": 12526 + }, + { + "epoch": 1.461556411153891, + "grad_norm": 1.1900627613067627, + "learning_rate": 0.00024071348054521406, + "loss": 2.0318, + "step": 12527 + }, + { + "epoch": 1.4616730836541827, + "grad_norm": 1.0357868671417236, + "learning_rate": 0.0002407013927854829, + "loss": 2.0495, + "step": 12528 + }, + { + "epoch": 1.4617897561544744, + "grad_norm": 1.306951880455017, + "learning_rate": 0.00024068930410101042, + "loss": 2.0081, + "step": 12529 + }, + { + "epoch": 1.461906428654766, + "grad_norm": 1.3684653043746948, + "learning_rate": 0.00024067721449192185, + "loss": 2.0541, + "step": 12530 + }, + { + "epoch": 1.4620231011550577, + "grad_norm": 1.2520817518234253, + "learning_rate": 0.00024066512395834266, + "loss": 2.2312, + "step": 12531 + }, + { + "epoch": 1.4621397736553494, + "grad_norm": 1.2913109064102173, + "learning_rate": 0.00024065303250039809, + "loss": 2.0607, + "step": 12532 + }, + { + "epoch": 1.462256446155641, + "grad_norm": 1.1954176425933838, + "learning_rate": 0.00024064094011821347, + "loss": 2.1914, + "step": 12533 + }, + { + "epoch": 1.4623731186559328, + "grad_norm": 1.2066128253936768, + "learning_rate": 0.0002406288468119142, + "loss": 2.0588, + "step": 12534 + }, + { + "epoch": 1.4624897911562245, + "grad_norm": 1.2024861574172974, + "learning_rate": 0.0002406167525816257, + "loss": 2.1353, + "step": 12535 + }, + { + "epoch": 1.4626064636565161, + "grad_norm": 1.3087118864059448, + "learning_rate": 0.0002406046574274733, + "loss": 1.9919, + "step": 12536 + }, + { + "epoch": 1.4627231361568078, + "grad_norm": 1.235211730003357, + "learning_rate": 0.00024059256134958238, + "loss": 2.0617, + "step": 12537 + }, + { + "epoch": 1.4628398086570995, + "grad_norm": 1.2043365240097046, + "learning_rate": 0.00024058046434807836, + "loss": 2.0961, + "step": 12538 + }, + { + "epoch": 1.4629564811573912, + "grad_norm": 1.074709415435791, + "learning_rate": 0.00024056836642308665, + "loss": 2.0311, + "step": 12539 + }, + { + "epoch": 1.4630731536576829, + "grad_norm": 1.3763507604599, + "learning_rate": 0.0002405562675747326, + "loss": 1.8455, + "step": 12540 + }, + { + "epoch": 1.4631898261579745, + "grad_norm": 1.0804930925369263, + "learning_rate": 0.00024054416780314175, + "loss": 1.885, + "step": 12541 + }, + { + "epoch": 1.4633064986582662, + "grad_norm": 1.2305415868759155, + "learning_rate": 0.00024053206710843942, + "loss": 2.2105, + "step": 12542 + }, + { + "epoch": 1.463423171158558, + "grad_norm": 1.2230173349380493, + "learning_rate": 0.0002405199654907512, + "loss": 2.0609, + "step": 12543 + }, + { + "epoch": 1.4635398436588496, + "grad_norm": 1.1014667749404907, + "learning_rate": 0.00024050786295020245, + "loss": 2.027, + "step": 12544 + }, + { + "epoch": 1.4636565161591413, + "grad_norm": 1.3569427728652954, + "learning_rate": 0.00024049575948691863, + "loss": 2.1365, + "step": 12545 + }, + { + "epoch": 1.463773188659433, + "grad_norm": 1.2189092636108398, + "learning_rate": 0.0002404836551010252, + "loss": 2.0632, + "step": 12546 + }, + { + "epoch": 1.4638898611597246, + "grad_norm": 1.127941608428955, + "learning_rate": 0.0002404715497926477, + "loss": 2.0896, + "step": 12547 + }, + { + "epoch": 1.4640065336600163, + "grad_norm": 1.0654675960540771, + "learning_rate": 0.00024045944356191167, + "loss": 2.0576, + "step": 12548 + }, + { + "epoch": 1.464123206160308, + "grad_norm": 1.1984381675720215, + "learning_rate": 0.00024044733640894242, + "loss": 2.0425, + "step": 12549 + }, + { + "epoch": 1.4642398786605997, + "grad_norm": 1.0365970134735107, + "learning_rate": 0.00024043522833386573, + "loss": 1.9858, + "step": 12550 + }, + { + "epoch": 1.4643565511608914, + "grad_norm": 1.1124399900436401, + "learning_rate": 0.00024042311933680693, + "loss": 2.1376, + "step": 12551 + }, + { + "epoch": 1.464473223661183, + "grad_norm": 1.1573853492736816, + "learning_rate": 0.0002404110094178916, + "loss": 2.0334, + "step": 12552 + }, + { + "epoch": 1.4645898961614747, + "grad_norm": 1.2246404886245728, + "learning_rate": 0.00024039889857724528, + "loss": 2.1326, + "step": 12553 + }, + { + "epoch": 1.4647065686617664, + "grad_norm": 1.026012659072876, + "learning_rate": 0.00024038678681499351, + "loss": 2.0679, + "step": 12554 + }, + { + "epoch": 1.464823241162058, + "grad_norm": 1.2167165279388428, + "learning_rate": 0.00024037467413126195, + "loss": 2.0068, + "step": 12555 + }, + { + "epoch": 1.4649399136623498, + "grad_norm": 1.1313273906707764, + "learning_rate": 0.00024036256052617608, + "loss": 2.0481, + "step": 12556 + }, + { + "epoch": 1.4650565861626414, + "grad_norm": 0.9872554540634155, + "learning_rate": 0.00024035044599986146, + "loss": 1.8534, + "step": 12557 + }, + { + "epoch": 1.4651732586629331, + "grad_norm": 1.0659174919128418, + "learning_rate": 0.0002403383305524437, + "loss": 2.0369, + "step": 12558 + }, + { + "epoch": 1.4652899311632248, + "grad_norm": 1.292487621307373, + "learning_rate": 0.00024032621418404842, + "loss": 2.1583, + "step": 12559 + }, + { + "epoch": 1.4654066036635165, + "grad_norm": 1.1496758460998535, + "learning_rate": 0.00024031409689480125, + "loss": 1.8683, + "step": 12560 + }, + { + "epoch": 1.4655232761638082, + "grad_norm": 1.1741751432418823, + "learning_rate": 0.0002403019786848278, + "loss": 2.0498, + "step": 12561 + }, + { + "epoch": 1.4656399486640999, + "grad_norm": 1.0500972270965576, + "learning_rate": 0.0002402898595542537, + "loss": 1.9585, + "step": 12562 + }, + { + "epoch": 1.4657566211643915, + "grad_norm": 1.2023892402648926, + "learning_rate": 0.00024027773950320453, + "loss": 2.0954, + "step": 12563 + }, + { + "epoch": 1.4658732936646832, + "grad_norm": 1.2664226293563843, + "learning_rate": 0.0002402656185318059, + "loss": 2.192, + "step": 12564 + }, + { + "epoch": 1.465989966164975, + "grad_norm": 1.0637069940567017, + "learning_rate": 0.00024025349664018363, + "loss": 2.042, + "step": 12565 + }, + { + "epoch": 1.4661066386652666, + "grad_norm": 1.1145201921463013, + "learning_rate": 0.00024024137382846328, + "loss": 2.02, + "step": 12566 + }, + { + "epoch": 1.4662233111655583, + "grad_norm": 1.170838713645935, + "learning_rate": 0.00024022925009677056, + "loss": 2.0852, + "step": 12567 + }, + { + "epoch": 1.46633998366585, + "grad_norm": 1.1084563732147217, + "learning_rate": 0.00024021712544523111, + "loss": 1.8724, + "step": 12568 + }, + { + "epoch": 1.4664566561661416, + "grad_norm": 1.1454471349716187, + "learning_rate": 0.0002402049998739707, + "loss": 2.0239, + "step": 12569 + }, + { + "epoch": 1.4665733286664333, + "grad_norm": 1.433587908744812, + "learning_rate": 0.00024019287338311496, + "loss": 2.0818, + "step": 12570 + }, + { + "epoch": 1.466690001166725, + "grad_norm": 1.0886343717575073, + "learning_rate": 0.00024018074597278964, + "loss": 1.9955, + "step": 12571 + }, + { + "epoch": 1.4668066736670167, + "grad_norm": 1.0784868001937866, + "learning_rate": 0.00024016861764312045, + "loss": 2.1014, + "step": 12572 + }, + { + "epoch": 1.4669233461673084, + "grad_norm": 1.2679766416549683, + "learning_rate": 0.0002401564883942331, + "loss": 2.1212, + "step": 12573 + }, + { + "epoch": 1.4670400186676, + "grad_norm": 1.2903690338134766, + "learning_rate": 0.0002401443582262534, + "loss": 2.1242, + "step": 12574 + }, + { + "epoch": 1.4671566911678917, + "grad_norm": 1.287184238433838, + "learning_rate": 0.0002401322271393071, + "loss": 2.1277, + "step": 12575 + }, + { + "epoch": 1.4672733636681834, + "grad_norm": 1.2445961236953735, + "learning_rate": 0.00024012009513351988, + "loss": 2.136, + "step": 12576 + }, + { + "epoch": 1.467390036168475, + "grad_norm": 1.1559693813323975, + "learning_rate": 0.00024010796220901755, + "loss": 2.1489, + "step": 12577 + }, + { + "epoch": 1.4675067086687668, + "grad_norm": 1.368701457977295, + "learning_rate": 0.00024009582836592594, + "loss": 2.1277, + "step": 12578 + }, + { + "epoch": 1.4676233811690584, + "grad_norm": 1.2062166929244995, + "learning_rate": 0.0002400836936043707, + "loss": 2.1243, + "step": 12579 + }, + { + "epoch": 1.4677400536693501, + "grad_norm": 1.6116081476211548, + "learning_rate": 0.00024007155792447784, + "loss": 2.062, + "step": 12580 + }, + { + "epoch": 1.4678567261696418, + "grad_norm": 1.0993640422821045, + "learning_rate": 0.00024005942132637303, + "loss": 2.0443, + "step": 12581 + }, + { + "epoch": 1.4679733986699335, + "grad_norm": 1.2576260566711426, + "learning_rate": 0.0002400472838101821, + "loss": 2.1061, + "step": 12582 + }, + { + "epoch": 1.4680900711702252, + "grad_norm": 1.0102263689041138, + "learning_rate": 0.0002400351453760309, + "loss": 2.1111, + "step": 12583 + }, + { + "epoch": 1.4682067436705168, + "grad_norm": 1.1982131004333496, + "learning_rate": 0.00024002300602404524, + "loss": 2.0331, + "step": 12584 + }, + { + "epoch": 1.4683234161708085, + "grad_norm": 1.1167999505996704, + "learning_rate": 0.00024001086575435108, + "loss": 2.0244, + "step": 12585 + }, + { + "epoch": 1.4684400886711002, + "grad_norm": 1.193231463432312, + "learning_rate": 0.00023999872456707412, + "loss": 2.0813, + "step": 12586 + }, + { + "epoch": 1.4685567611713919, + "grad_norm": 1.1642777919769287, + "learning_rate": 0.0002399865824623403, + "loss": 2.0622, + "step": 12587 + }, + { + "epoch": 1.4686734336716836, + "grad_norm": 0.9906080365180969, + "learning_rate": 0.00023997443944027549, + "loss": 2.1267, + "step": 12588 + }, + { + "epoch": 1.4687901061719753, + "grad_norm": 1.332050085067749, + "learning_rate": 0.00023996229550100553, + "loss": 2.1465, + "step": 12589 + }, + { + "epoch": 1.468906778672267, + "grad_norm": 1.11050546169281, + "learning_rate": 0.00023995015064465645, + "loss": 1.9302, + "step": 12590 + }, + { + "epoch": 1.4690234511725586, + "grad_norm": 1.2157073020935059, + "learning_rate": 0.00023993800487135398, + "loss": 2.1537, + "step": 12591 + }, + { + "epoch": 1.4691401236728503, + "grad_norm": 1.1642451286315918, + "learning_rate": 0.00023992585818122415, + "loss": 2.3264, + "step": 12592 + }, + { + "epoch": 1.469256796173142, + "grad_norm": 1.4191220998764038, + "learning_rate": 0.00023991371057439286, + "loss": 2.0189, + "step": 12593 + }, + { + "epoch": 1.4693734686734337, + "grad_norm": 1.1790528297424316, + "learning_rate": 0.00023990156205098604, + "loss": 2.0141, + "step": 12594 + }, + { + "epoch": 1.4694901411737253, + "grad_norm": 1.2335160970687866, + "learning_rate": 0.00023988941261112958, + "loss": 2.1949, + "step": 12595 + }, + { + "epoch": 1.469606813674017, + "grad_norm": 1.204961895942688, + "learning_rate": 0.00023987726225494958, + "loss": 1.9468, + "step": 12596 + }, + { + "epoch": 1.4697234861743087, + "grad_norm": 1.218312382698059, + "learning_rate": 0.00023986511098257184, + "loss": 1.9269, + "step": 12597 + }, + { + "epoch": 1.4698401586746004, + "grad_norm": 1.3307514190673828, + "learning_rate": 0.00023985295879412236, + "loss": 2.1397, + "step": 12598 + }, + { + "epoch": 1.469956831174892, + "grad_norm": 1.2016761302947998, + "learning_rate": 0.00023984080568972718, + "loss": 2.025, + "step": 12599 + }, + { + "epoch": 1.4700735036751837, + "grad_norm": 1.1240296363830566, + "learning_rate": 0.00023982865166951225, + "loss": 1.9026, + "step": 12600 + }, + { + "epoch": 1.4701901761754754, + "grad_norm": 1.248589277267456, + "learning_rate": 0.00023981649673360364, + "loss": 2.0651, + "step": 12601 + }, + { + "epoch": 1.470306848675767, + "grad_norm": 1.1360896825790405, + "learning_rate": 0.00023980434088212722, + "loss": 2.1028, + "step": 12602 + }, + { + "epoch": 1.4704235211760588, + "grad_norm": 1.0545175075531006, + "learning_rate": 0.00023979218411520915, + "loss": 2.0456, + "step": 12603 + }, + { + "epoch": 1.4705401936763505, + "grad_norm": 1.1762529611587524, + "learning_rate": 0.00023978002643297537, + "loss": 1.9709, + "step": 12604 + }, + { + "epoch": 1.4706568661766422, + "grad_norm": 1.1142687797546387, + "learning_rate": 0.00023976786783555193, + "loss": 2.1738, + "step": 12605 + }, + { + "epoch": 1.4707735386769338, + "grad_norm": 1.25674569606781, + "learning_rate": 0.0002397557083230649, + "loss": 2.0606, + "step": 12606 + }, + { + "epoch": 1.4708902111772255, + "grad_norm": 1.2582509517669678, + "learning_rate": 0.00023974354789564034, + "loss": 2.3196, + "step": 12607 + }, + { + "epoch": 1.4710068836775172, + "grad_norm": 1.1369106769561768, + "learning_rate": 0.00023973138655340427, + "loss": 1.9386, + "step": 12608 + }, + { + "epoch": 1.4711235561778089, + "grad_norm": 1.2990697622299194, + "learning_rate": 0.00023971922429648286, + "loss": 2.067, + "step": 12609 + }, + { + "epoch": 1.4712402286781006, + "grad_norm": 1.1464662551879883, + "learning_rate": 0.0002397070611250021, + "loss": 1.872, + "step": 12610 + }, + { + "epoch": 1.4713569011783922, + "grad_norm": 1.2546905279159546, + "learning_rate": 0.00023969489703908812, + "loss": 2.1569, + "step": 12611 + }, + { + "epoch": 1.471473573678684, + "grad_norm": 1.2619194984436035, + "learning_rate": 0.00023968273203886704, + "loss": 2.0577, + "step": 12612 + }, + { + "epoch": 1.4715902461789756, + "grad_norm": 1.0788650512695312, + "learning_rate": 0.0002396705661244649, + "loss": 2.0953, + "step": 12613 + }, + { + "epoch": 1.4717069186792673, + "grad_norm": 1.0335266590118408, + "learning_rate": 0.00023965839929600795, + "loss": 1.924, + "step": 12614 + }, + { + "epoch": 1.471823591179559, + "grad_norm": 1.0226982831954956, + "learning_rate": 0.00023964623155362219, + "loss": 1.9341, + "step": 12615 + }, + { + "epoch": 1.4719402636798506, + "grad_norm": 1.0276464223861694, + "learning_rate": 0.00023963406289743383, + "loss": 1.9489, + "step": 12616 + }, + { + "epoch": 1.4720569361801423, + "grad_norm": 1.0303494930267334, + "learning_rate": 0.00023962189332756904, + "loss": 2.0427, + "step": 12617 + }, + { + "epoch": 1.472173608680434, + "grad_norm": 1.235327959060669, + "learning_rate": 0.0002396097228441539, + "loss": 2.1219, + "step": 12618 + }, + { + "epoch": 1.4722902811807257, + "grad_norm": 1.217961072921753, + "learning_rate": 0.0002395975514473147, + "loss": 2.2509, + "step": 12619 + }, + { + "epoch": 1.4724069536810174, + "grad_norm": 1.234645962715149, + "learning_rate": 0.00023958537913717752, + "loss": 2.1584, + "step": 12620 + }, + { + "epoch": 1.472523626181309, + "grad_norm": 1.1518155336380005, + "learning_rate": 0.0002395732059138686, + "loss": 2.2392, + "step": 12621 + }, + { + "epoch": 1.4726402986816007, + "grad_norm": 1.0813711881637573, + "learning_rate": 0.00023956103177751406, + "loss": 1.9774, + "step": 12622 + }, + { + "epoch": 1.4727569711818924, + "grad_norm": 1.2347455024719238, + "learning_rate": 0.0002395488567282403, + "loss": 1.9989, + "step": 12623 + }, + { + "epoch": 1.472873643682184, + "grad_norm": 1.2444946765899658, + "learning_rate": 0.00023953668076617324, + "loss": 2.302, + "step": 12624 + }, + { + "epoch": 1.4729903161824758, + "grad_norm": 1.1521759033203125, + "learning_rate": 0.0002395245038914394, + "loss": 2.1047, + "step": 12625 + }, + { + "epoch": 1.4731069886827675, + "grad_norm": 1.1887181997299194, + "learning_rate": 0.0002395123261041648, + "loss": 2.0614, + "step": 12626 + }, + { + "epoch": 1.4732236611830591, + "grad_norm": 1.1909834146499634, + "learning_rate": 0.00023950014740447588, + "loss": 2.2491, + "step": 12627 + }, + { + "epoch": 1.4733403336833508, + "grad_norm": 1.1136082410812378, + "learning_rate": 0.0002394879677924987, + "loss": 2.0293, + "step": 12628 + }, + { + "epoch": 1.4734570061836425, + "grad_norm": 1.1880005598068237, + "learning_rate": 0.00023947578726835964, + "loss": 2.0097, + "step": 12629 + }, + { + "epoch": 1.4735736786839342, + "grad_norm": 1.2319732904434204, + "learning_rate": 0.000239463605832185, + "loss": 2.0625, + "step": 12630 + }, + { + "epoch": 1.4736903511842259, + "grad_norm": 1.0663505792617798, + "learning_rate": 0.0002394514234841009, + "loss": 2.0617, + "step": 12631 + }, + { + "epoch": 1.4738070236845175, + "grad_norm": 1.065268874168396, + "learning_rate": 0.00023943924022423382, + "loss": 1.9348, + "step": 12632 + }, + { + "epoch": 1.4739236961848092, + "grad_norm": 1.1070330142974854, + "learning_rate": 0.00023942705605270998, + "loss": 2.1, + "step": 12633 + }, + { + "epoch": 1.474040368685101, + "grad_norm": 1.262018084526062, + "learning_rate": 0.00023941487096965569, + "loss": 2.0233, + "step": 12634 + }, + { + "epoch": 1.4741570411853926, + "grad_norm": 1.1797088384628296, + "learning_rate": 0.0002394026849751973, + "loss": 2.1324, + "step": 12635 + }, + { + "epoch": 1.4742737136856843, + "grad_norm": 1.1425459384918213, + "learning_rate": 0.00023939049806946113, + "loss": 1.9856, + "step": 12636 + }, + { + "epoch": 1.474390386185976, + "grad_norm": 1.4400174617767334, + "learning_rate": 0.00023937831025257344, + "loss": 2.3378, + "step": 12637 + }, + { + "epoch": 1.4745070586862676, + "grad_norm": 1.3858389854431152, + "learning_rate": 0.00023936612152466078, + "loss": 2.0627, + "step": 12638 + }, + { + "epoch": 1.4746237311865593, + "grad_norm": 1.3462210893630981, + "learning_rate": 0.00023935393188584934, + "loss": 2.1346, + "step": 12639 + }, + { + "epoch": 1.474740403686851, + "grad_norm": 1.1020638942718506, + "learning_rate": 0.00023934174133626544, + "loss": 2.0385, + "step": 12640 + }, + { + "epoch": 1.4748570761871427, + "grad_norm": 1.0744218826293945, + "learning_rate": 0.00023932954987603559, + "loss": 2.0977, + "step": 12641 + }, + { + "epoch": 1.4749737486874344, + "grad_norm": 1.1572085618972778, + "learning_rate": 0.0002393173575052862, + "loss": 2.103, + "step": 12642 + }, + { + "epoch": 1.475090421187726, + "grad_norm": 1.1818917989730835, + "learning_rate": 0.00023930516422414355, + "loss": 2.2209, + "step": 12643 + }, + { + "epoch": 1.4752070936880177, + "grad_norm": 1.250959873199463, + "learning_rate": 0.00023929297003273412, + "loss": 2.1566, + "step": 12644 + }, + { + "epoch": 1.4753237661883094, + "grad_norm": 1.3237392902374268, + "learning_rate": 0.0002392807749311843, + "loss": 2.1106, + "step": 12645 + }, + { + "epoch": 1.475440438688601, + "grad_norm": 1.3368701934814453, + "learning_rate": 0.0002392685789196205, + "loss": 2.0257, + "step": 12646 + }, + { + "epoch": 1.4755571111888928, + "grad_norm": 1.3767131567001343, + "learning_rate": 0.00023925638199816924, + "loss": 2.1948, + "step": 12647 + }, + { + "epoch": 1.4756737836891844, + "grad_norm": 1.1361236572265625, + "learning_rate": 0.00023924418416695683, + "loss": 2.1435, + "step": 12648 + }, + { + "epoch": 1.4757904561894761, + "grad_norm": 1.1008834838867188, + "learning_rate": 0.00023923198542610984, + "loss": 2.0963, + "step": 12649 + }, + { + "epoch": 1.4759071286897678, + "grad_norm": 0.9976339340209961, + "learning_rate": 0.00023921978577575467, + "loss": 2.0447, + "step": 12650 + }, + { + "epoch": 1.4760238011900595, + "grad_norm": 1.442771315574646, + "learning_rate": 0.00023920758521601778, + "loss": 2.1677, + "step": 12651 + }, + { + "epoch": 1.4761404736903512, + "grad_norm": 1.4260419607162476, + "learning_rate": 0.0002391953837470257, + "loss": 2.0492, + "step": 12652 + }, + { + "epoch": 1.4762571461906429, + "grad_norm": 1.2174725532531738, + "learning_rate": 0.00023918318136890496, + "loss": 2.0241, + "step": 12653 + }, + { + "epoch": 1.4763738186909345, + "grad_norm": 1.2133780717849731, + "learning_rate": 0.0002391709780817819, + "loss": 2.1137, + "step": 12654 + }, + { + "epoch": 1.4764904911912262, + "grad_norm": 1.1507117748260498, + "learning_rate": 0.00023915877388578323, + "loss": 2.0699, + "step": 12655 + }, + { + "epoch": 1.476607163691518, + "grad_norm": 1.1098321676254272, + "learning_rate": 0.00023914656878103532, + "loss": 1.8733, + "step": 12656 + }, + { + "epoch": 1.4767238361918096, + "grad_norm": 1.3229873180389404, + "learning_rate": 0.00023913436276766476, + "loss": 1.976, + "step": 12657 + }, + { + "epoch": 1.4768405086921013, + "grad_norm": 1.2487040758132935, + "learning_rate": 0.00023912215584579808, + "loss": 2.1848, + "step": 12658 + }, + { + "epoch": 1.476957181192393, + "grad_norm": 1.130354404449463, + "learning_rate": 0.00023910994801556185, + "loss": 2.0282, + "step": 12659 + }, + { + "epoch": 1.4770738536926846, + "grad_norm": 1.2153562307357788, + "learning_rate": 0.00023909773927708258, + "loss": 2.3374, + "step": 12660 + }, + { + "epoch": 1.4771905261929763, + "grad_norm": 1.231713056564331, + "learning_rate": 0.00023908552963048687, + "loss": 1.8753, + "step": 12661 + }, + { + "epoch": 1.477307198693268, + "grad_norm": 1.2176905870437622, + "learning_rate": 0.00023907331907590127, + "loss": 1.9893, + "step": 12662 + }, + { + "epoch": 1.4774238711935597, + "grad_norm": 1.1432029008865356, + "learning_rate": 0.00023906110761345243, + "loss": 2.141, + "step": 12663 + }, + { + "epoch": 1.4775405436938513, + "grad_norm": 1.1565183401107788, + "learning_rate": 0.00023904889524326687, + "loss": 2.0856, + "step": 12664 + }, + { + "epoch": 1.477657216194143, + "grad_norm": 1.3179590702056885, + "learning_rate": 0.00023903668196547124, + "loss": 2.1357, + "step": 12665 + }, + { + "epoch": 1.4777738886944347, + "grad_norm": 1.3230210542678833, + "learning_rate": 0.00023902446778019214, + "loss": 2.1814, + "step": 12666 + }, + { + "epoch": 1.4778905611947264, + "grad_norm": 1.2626885175704956, + "learning_rate": 0.00023901225268755616, + "loss": 2.1132, + "step": 12667 + }, + { + "epoch": 1.478007233695018, + "grad_norm": 1.2649545669555664, + "learning_rate": 0.00023900003668768995, + "loss": 2.1411, + "step": 12668 + }, + { + "epoch": 1.4781239061953098, + "grad_norm": 1.0437548160552979, + "learning_rate": 0.0002389878197807202, + "loss": 2.1003, + "step": 12669 + }, + { + "epoch": 1.4782405786956014, + "grad_norm": 1.2035191059112549, + "learning_rate": 0.0002389756019667735, + "loss": 2.0985, + "step": 12670 + }, + { + "epoch": 1.4783572511958931, + "grad_norm": 1.1121761798858643, + "learning_rate": 0.00023896338324597653, + "loss": 2.0121, + "step": 12671 + }, + { + "epoch": 1.4784739236961848, + "grad_norm": 1.0788191556930542, + "learning_rate": 0.000238951163618456, + "loss": 2.0547, + "step": 12672 + }, + { + "epoch": 1.4785905961964765, + "grad_norm": 1.1170949935913086, + "learning_rate": 0.00023893894308433855, + "loss": 2.0171, + "step": 12673 + }, + { + "epoch": 1.4787072686967682, + "grad_norm": 1.3124688863754272, + "learning_rate": 0.0002389267216437509, + "loss": 2.037, + "step": 12674 + }, + { + "epoch": 1.4788239411970598, + "grad_norm": 1.1357088088989258, + "learning_rate": 0.00023891449929681967, + "loss": 2.0709, + "step": 12675 + }, + { + "epoch": 1.4789406136973515, + "grad_norm": 1.0960142612457275, + "learning_rate": 0.00023890227604367168, + "loss": 1.9161, + "step": 12676 + }, + { + "epoch": 1.4790572861976432, + "grad_norm": 1.2946417331695557, + "learning_rate": 0.00023889005188443356, + "loss": 2.1297, + "step": 12677 + }, + { + "epoch": 1.4791739586979349, + "grad_norm": 1.1106444597244263, + "learning_rate": 0.00023887782681923208, + "loss": 1.9549, + "step": 12678 + }, + { + "epoch": 1.4792906311982266, + "grad_norm": 1.2571617364883423, + "learning_rate": 0.00023886560084819395, + "loss": 2.2136, + "step": 12679 + }, + { + "epoch": 1.4794073036985183, + "grad_norm": 1.2020180225372314, + "learning_rate": 0.0002388533739714459, + "loss": 2.0929, + "step": 12680 + }, + { + "epoch": 1.47952397619881, + "grad_norm": 1.3933460712432861, + "learning_rate": 0.00023884114618911474, + "loss": 2.208, + "step": 12681 + }, + { + "epoch": 1.4796406486991016, + "grad_norm": 1.2099683284759521, + "learning_rate": 0.00023882891750132723, + "loss": 2.0037, + "step": 12682 + }, + { + "epoch": 1.4797573211993933, + "grad_norm": 1.1684540510177612, + "learning_rate": 0.00023881668790821006, + "loss": 1.9352, + "step": 12683 + }, + { + "epoch": 1.479873993699685, + "grad_norm": 1.249952793121338, + "learning_rate": 0.00023880445740989012, + "loss": 2.0871, + "step": 12684 + }, + { + "epoch": 1.4799906661999767, + "grad_norm": 1.158970594406128, + "learning_rate": 0.0002387922260064941, + "loss": 2.0768, + "step": 12685 + }, + { + "epoch": 1.4801073387002683, + "grad_norm": 1.270797848701477, + "learning_rate": 0.0002387799936981489, + "loss": 2.1119, + "step": 12686 + }, + { + "epoch": 1.48022401120056, + "grad_norm": 1.290972352027893, + "learning_rate": 0.0002387677604849813, + "loss": 2.1209, + "step": 12687 + }, + { + "epoch": 1.4803406837008517, + "grad_norm": 1.1722743511199951, + "learning_rate": 0.00023875552636711808, + "loss": 2.1328, + "step": 12688 + }, + { + "epoch": 1.4804573562011434, + "grad_norm": 1.2769246101379395, + "learning_rate": 0.00023874329134468612, + "loss": 1.8786, + "step": 12689 + }, + { + "epoch": 1.480574028701435, + "grad_norm": 1.2949104309082031, + "learning_rate": 0.0002387310554178122, + "loss": 1.9548, + "step": 12690 + }, + { + "epoch": 1.4806907012017267, + "grad_norm": 1.1909040212631226, + "learning_rate": 0.00023871881858662325, + "loss": 2.065, + "step": 12691 + }, + { + "epoch": 1.4808073737020184, + "grad_norm": 1.043463110923767, + "learning_rate": 0.00023870658085124608, + "loss": 2.0939, + "step": 12692 + }, + { + "epoch": 1.48092404620231, + "grad_norm": 1.1445220708847046, + "learning_rate": 0.0002386943422118075, + "loss": 2.1464, + "step": 12693 + }, + { + "epoch": 1.4810407187026018, + "grad_norm": 1.110805630683899, + "learning_rate": 0.00023868210266843448, + "loss": 2.0292, + "step": 12694 + }, + { + "epoch": 1.4811573912028935, + "grad_norm": 1.1980493068695068, + "learning_rate": 0.0002386698622212539, + "loss": 1.9953, + "step": 12695 + }, + { + "epoch": 1.4812740637031852, + "grad_norm": 0.9338337182998657, + "learning_rate": 0.0002386576208703926, + "loss": 1.9775, + "step": 12696 + }, + { + "epoch": 1.4813907362034768, + "grad_norm": 1.3180763721466064, + "learning_rate": 0.00023864537861597752, + "loss": 1.9767, + "step": 12697 + }, + { + "epoch": 1.4815074087037685, + "grad_norm": 1.1587802171707153, + "learning_rate": 0.00023863313545813557, + "loss": 2.1985, + "step": 12698 + }, + { + "epoch": 1.4816240812040602, + "grad_norm": 1.187589406967163, + "learning_rate": 0.00023862089139699367, + "loss": 2.156, + "step": 12699 + }, + { + "epoch": 1.4817407537043519, + "grad_norm": 1.1901087760925293, + "learning_rate": 0.00023860864643267875, + "loss": 2.1832, + "step": 12700 + }, + { + "epoch": 1.4818574262046436, + "grad_norm": 1.2500791549682617, + "learning_rate": 0.00023859640056531778, + "loss": 2.0205, + "step": 12701 + }, + { + "epoch": 1.4819740987049352, + "grad_norm": 1.2741793394088745, + "learning_rate": 0.00023858415379503762, + "loss": 2.1583, + "step": 12702 + }, + { + "epoch": 1.482090771205227, + "grad_norm": 1.1056594848632812, + "learning_rate": 0.00023857190612196536, + "loss": 2.0273, + "step": 12703 + }, + { + "epoch": 1.4822074437055186, + "grad_norm": 1.1864930391311646, + "learning_rate": 0.00023855965754622792, + "loss": 2.1035, + "step": 12704 + }, + { + "epoch": 1.4823241162058103, + "grad_norm": 1.248866081237793, + "learning_rate": 0.00023854740806795226, + "loss": 2.2421, + "step": 12705 + }, + { + "epoch": 1.482440788706102, + "grad_norm": 1.135063648223877, + "learning_rate": 0.0002385351576872653, + "loss": 2.0393, + "step": 12706 + }, + { + "epoch": 1.4825574612063936, + "grad_norm": 1.0268089771270752, + "learning_rate": 0.0002385229064042942, + "loss": 2.1052, + "step": 12707 + }, + { + "epoch": 1.4826741337066853, + "grad_norm": 1.162642240524292, + "learning_rate": 0.00023851065421916587, + "loss": 2.0602, + "step": 12708 + }, + { + "epoch": 1.482790806206977, + "grad_norm": 1.1637765169143677, + "learning_rate": 0.00023849840113200732, + "loss": 2.03, + "step": 12709 + }, + { + "epoch": 1.4829074787072687, + "grad_norm": 1.3537440299987793, + "learning_rate": 0.00023848614714294558, + "loss": 2.075, + "step": 12710 + }, + { + "epoch": 1.4830241512075604, + "grad_norm": 1.1438108682632446, + "learning_rate": 0.00023847389225210773, + "loss": 1.7501, + "step": 12711 + }, + { + "epoch": 1.483140823707852, + "grad_norm": 1.1374270915985107, + "learning_rate": 0.00023846163645962085, + "loss": 2.1607, + "step": 12712 + }, + { + "epoch": 1.4832574962081437, + "grad_norm": 1.1001914739608765, + "learning_rate": 0.00023844937976561184, + "loss": 1.8311, + "step": 12713 + }, + { + "epoch": 1.4833741687084354, + "grad_norm": 1.2659318447113037, + "learning_rate": 0.0002384371221702079, + "loss": 1.93, + "step": 12714 + }, + { + "epoch": 1.483490841208727, + "grad_norm": 1.3484008312225342, + "learning_rate": 0.000238424863673536, + "loss": 2.0475, + "step": 12715 + }, + { + "epoch": 1.4836075137090188, + "grad_norm": 1.0951719284057617, + "learning_rate": 0.00023841260427572337, + "loss": 2.028, + "step": 12716 + }, + { + "epoch": 1.4837241862093105, + "grad_norm": 1.1134949922561646, + "learning_rate": 0.00023840034397689695, + "loss": 1.917, + "step": 12717 + }, + { + "epoch": 1.4838408587096021, + "grad_norm": 1.0856387615203857, + "learning_rate": 0.00023838808277718397, + "loss": 1.9225, + "step": 12718 + }, + { + "epoch": 1.4839575312098938, + "grad_norm": 1.177894949913025, + "learning_rate": 0.00023837582067671138, + "loss": 1.9842, + "step": 12719 + }, + { + "epoch": 1.4840742037101855, + "grad_norm": 1.272458553314209, + "learning_rate": 0.0002383635576756065, + "loss": 1.9922, + "step": 12720 + }, + { + "epoch": 1.4841908762104772, + "grad_norm": 1.1734347343444824, + "learning_rate": 0.00023835129377399632, + "loss": 1.9617, + "step": 12721 + }, + { + "epoch": 1.4843075487107689, + "grad_norm": 1.3550093173980713, + "learning_rate": 0.000238339028972008, + "loss": 1.9758, + "step": 12722 + }, + { + "epoch": 1.4844242212110605, + "grad_norm": 1.1303460597991943, + "learning_rate": 0.00023832676326976867, + "loss": 2.0322, + "step": 12723 + }, + { + "epoch": 1.4845408937113522, + "grad_norm": 1.273656964302063, + "learning_rate": 0.00023831449666740558, + "loss": 2.0432, + "step": 12724 + }, + { + "epoch": 1.484657566211644, + "grad_norm": 1.1822651624679565, + "learning_rate": 0.0002383022291650458, + "loss": 2.0829, + "step": 12725 + }, + { + "epoch": 1.4847742387119356, + "grad_norm": 1.1945161819458008, + "learning_rate": 0.00023828996076281653, + "loss": 2.076, + "step": 12726 + }, + { + "epoch": 1.4848909112122273, + "grad_norm": 1.2152460813522339, + "learning_rate": 0.00023827769146084497, + "loss": 1.9275, + "step": 12727 + }, + { + "epoch": 1.485007583712519, + "grad_norm": 1.167536973953247, + "learning_rate": 0.00023826542125925833, + "loss": 2.0158, + "step": 12728 + }, + { + "epoch": 1.4851242562128106, + "grad_norm": 1.169323444366455, + "learning_rate": 0.00023825315015818383, + "loss": 2.0438, + "step": 12729 + }, + { + "epoch": 1.4852409287131023, + "grad_norm": 1.168581247329712, + "learning_rate": 0.0002382408781577486, + "loss": 2.2103, + "step": 12730 + }, + { + "epoch": 1.485357601213394, + "grad_norm": 1.1188228130340576, + "learning_rate": 0.0002382286052580799, + "loss": 2.2309, + "step": 12731 + }, + { + "epoch": 1.4854742737136857, + "grad_norm": 1.32307767868042, + "learning_rate": 0.00023821633145930504, + "loss": 2.1623, + "step": 12732 + }, + { + "epoch": 1.4855909462139774, + "grad_norm": 1.2450019121170044, + "learning_rate": 0.0002382040567615511, + "loss": 1.8969, + "step": 12733 + }, + { + "epoch": 1.485707618714269, + "grad_norm": 1.273409128189087, + "learning_rate": 0.00023819178116494546, + "loss": 2.0803, + "step": 12734 + }, + { + "epoch": 1.4858242912145607, + "grad_norm": 1.0129683017730713, + "learning_rate": 0.00023817950466961535, + "loss": 1.9274, + "step": 12735 + }, + { + "epoch": 1.4859409637148524, + "grad_norm": 1.060649037361145, + "learning_rate": 0.000238167227275688, + "loss": 1.926, + "step": 12736 + }, + { + "epoch": 1.486057636215144, + "grad_norm": 1.0263500213623047, + "learning_rate": 0.00023815494898329082, + "loss": 1.9405, + "step": 12737 + }, + { + "epoch": 1.4861743087154358, + "grad_norm": 1.154455542564392, + "learning_rate": 0.0002381426697925509, + "loss": 2.0097, + "step": 12738 + }, + { + "epoch": 1.4862909812157274, + "grad_norm": 1.3072680234909058, + "learning_rate": 0.0002381303897035957, + "loss": 1.9839, + "step": 12739 + }, + { + "epoch": 1.4864076537160191, + "grad_norm": 1.1296700239181519, + "learning_rate": 0.00023811810871655242, + "loss": 2.1754, + "step": 12740 + }, + { + "epoch": 1.4865243262163108, + "grad_norm": 1.1546872854232788, + "learning_rate": 0.00023810582683154842, + "loss": 1.9513, + "step": 12741 + }, + { + "epoch": 1.4866409987166025, + "grad_norm": 1.0686627626419067, + "learning_rate": 0.00023809354404871104, + "loss": 1.9411, + "step": 12742 + }, + { + "epoch": 1.4867576712168942, + "grad_norm": 1.2713810205459595, + "learning_rate": 0.0002380812603681676, + "loss": 2.3102, + "step": 12743 + }, + { + "epoch": 1.4868743437171859, + "grad_norm": 1.2527635097503662, + "learning_rate": 0.00023806897579004547, + "loss": 2.01, + "step": 12744 + }, + { + "epoch": 1.4869910162174775, + "grad_norm": 1.129123568534851, + "learning_rate": 0.0002380566903144719, + "loss": 1.9103, + "step": 12745 + }, + { + "epoch": 1.4871076887177692, + "grad_norm": 1.311671495437622, + "learning_rate": 0.0002380444039415744, + "loss": 2.2197, + "step": 12746 + }, + { + "epoch": 1.487224361218061, + "grad_norm": 1.1076756715774536, + "learning_rate": 0.00023803211667148023, + "loss": 2.0556, + "step": 12747 + }, + { + "epoch": 1.4873410337183526, + "grad_norm": 1.0408415794372559, + "learning_rate": 0.0002380198285043168, + "loss": 1.9211, + "step": 12748 + }, + { + "epoch": 1.4874577062186443, + "grad_norm": 1.0564019680023193, + "learning_rate": 0.00023800753944021148, + "loss": 2.0891, + "step": 12749 + }, + { + "epoch": 1.487574378718936, + "grad_norm": 1.044613242149353, + "learning_rate": 0.00023799524947929176, + "loss": 1.9322, + "step": 12750 + }, + { + "epoch": 1.4876910512192276, + "grad_norm": 1.310209035873413, + "learning_rate": 0.000237982958621685, + "loss": 2.2247, + "step": 12751 + }, + { + "epoch": 1.4878077237195193, + "grad_norm": 1.0702539682388306, + "learning_rate": 0.0002379706668675186, + "loss": 2.0113, + "step": 12752 + }, + { + "epoch": 1.487924396219811, + "grad_norm": 1.209908127784729, + "learning_rate": 0.00023795837421691993, + "loss": 2.04, + "step": 12753 + }, + { + "epoch": 1.4880410687201027, + "grad_norm": 1.2485411167144775, + "learning_rate": 0.00023794608067001649, + "loss": 2.3164, + "step": 12754 + }, + { + "epoch": 1.4881577412203943, + "grad_norm": 1.3143949508666992, + "learning_rate": 0.00023793378622693578, + "loss": 2.1363, + "step": 12755 + }, + { + "epoch": 1.488274413720686, + "grad_norm": 1.0167183876037598, + "learning_rate": 0.0002379214908878052, + "loss": 1.9325, + "step": 12756 + }, + { + "epoch": 1.4883910862209777, + "grad_norm": 1.1516791582107544, + "learning_rate": 0.0002379091946527522, + "loss": 2.1718, + "step": 12757 + }, + { + "epoch": 1.4885077587212694, + "grad_norm": 1.1160595417022705, + "learning_rate": 0.00023789689752190424, + "loss": 2.1233, + "step": 12758 + }, + { + "epoch": 1.488624431221561, + "grad_norm": 1.4816733598709106, + "learning_rate": 0.00023788459949538885, + "loss": 2.2437, + "step": 12759 + }, + { + "epoch": 1.4887411037218528, + "grad_norm": 1.1119413375854492, + "learning_rate": 0.00023787230057333353, + "loss": 2.1851, + "step": 12760 + }, + { + "epoch": 1.4888577762221444, + "grad_norm": 1.1256136894226074, + "learning_rate": 0.0002378600007558657, + "loss": 2.3082, + "step": 12761 + }, + { + "epoch": 1.4889744487224361, + "grad_norm": 1.218597650527954, + "learning_rate": 0.00023784770004311296, + "loss": 2.0839, + "step": 12762 + }, + { + "epoch": 1.4890911212227278, + "grad_norm": 1.3847784996032715, + "learning_rate": 0.0002378353984352028, + "loss": 2.2282, + "step": 12763 + }, + { + "epoch": 1.4892077937230195, + "grad_norm": 1.2448656558990479, + "learning_rate": 0.00023782309593226275, + "loss": 2.0557, + "step": 12764 + }, + { + "epoch": 1.4893244662233112, + "grad_norm": 0.9876543879508972, + "learning_rate": 0.00023781079253442034, + "loss": 1.893, + "step": 12765 + }, + { + "epoch": 1.4894411387236028, + "grad_norm": 1.1448040008544922, + "learning_rate": 0.00023779848824180308, + "loss": 2.1425, + "step": 12766 + }, + { + "epoch": 1.4895578112238945, + "grad_norm": 1.2964982986450195, + "learning_rate": 0.00023778618305453857, + "loss": 2.0301, + "step": 12767 + }, + { + "epoch": 1.4896744837241862, + "grad_norm": 1.1221024990081787, + "learning_rate": 0.00023777387697275444, + "loss": 2.0901, + "step": 12768 + }, + { + "epoch": 1.4897911562244779, + "grad_norm": 1.3611780405044556, + "learning_rate": 0.0002377615699965782, + "loss": 2.0764, + "step": 12769 + }, + { + "epoch": 1.4899078287247696, + "grad_norm": 1.2294983863830566, + "learning_rate": 0.0002377492621261374, + "loss": 2.0366, + "step": 12770 + }, + { + "epoch": 1.4900245012250612, + "grad_norm": 1.2142996788024902, + "learning_rate": 0.00023773695336155967, + "loss": 2.0975, + "step": 12771 + }, + { + "epoch": 1.490141173725353, + "grad_norm": 1.1631217002868652, + "learning_rate": 0.00023772464370297263, + "loss": 1.9554, + "step": 12772 + }, + { + "epoch": 1.4902578462256446, + "grad_norm": 1.4043225049972534, + "learning_rate": 0.0002377123331505039, + "loss": 2.2605, + "step": 12773 + }, + { + "epoch": 1.4903745187259363, + "grad_norm": 1.283718466758728, + "learning_rate": 0.0002377000217042811, + "loss": 2.2347, + "step": 12774 + }, + { + "epoch": 1.490491191226228, + "grad_norm": 1.2738685607910156, + "learning_rate": 0.00023768770936443177, + "loss": 2.1945, + "step": 12775 + }, + { + "epoch": 1.4906078637265197, + "grad_norm": 1.1638869047164917, + "learning_rate": 0.00023767539613108363, + "loss": 2.0042, + "step": 12776 + }, + { + "epoch": 1.4907245362268113, + "grad_norm": 1.0295485258102417, + "learning_rate": 0.00023766308200436434, + "loss": 1.902, + "step": 12777 + }, + { + "epoch": 1.490841208727103, + "grad_norm": 1.2688803672790527, + "learning_rate": 0.00023765076698440153, + "loss": 2.2056, + "step": 12778 + }, + { + "epoch": 1.4909578812273947, + "grad_norm": 1.2524375915527344, + "learning_rate": 0.00023763845107132293, + "loss": 2.2471, + "step": 12779 + }, + { + "epoch": 1.4910745537276864, + "grad_norm": 1.0607707500457764, + "learning_rate": 0.00023762613426525614, + "loss": 1.9826, + "step": 12780 + }, + { + "epoch": 1.491191226227978, + "grad_norm": 1.1876490116119385, + "learning_rate": 0.00023761381656632884, + "loss": 1.9501, + "step": 12781 + }, + { + "epoch": 1.4913078987282697, + "grad_norm": 1.192920207977295, + "learning_rate": 0.00023760149797466884, + "loss": 2.1554, + "step": 12782 + }, + { + "epoch": 1.4914245712285614, + "grad_norm": 1.1485857963562012, + "learning_rate": 0.00023758917849040366, + "loss": 1.9483, + "step": 12783 + }, + { + "epoch": 1.491541243728853, + "grad_norm": 1.2237484455108643, + "learning_rate": 0.0002375768581136612, + "loss": 2.0117, + "step": 12784 + }, + { + "epoch": 1.4916579162291448, + "grad_norm": 1.2426278591156006, + "learning_rate": 0.00023756453684456907, + "loss": 2.0722, + "step": 12785 + }, + { + "epoch": 1.4917745887294365, + "grad_norm": 1.2523572444915771, + "learning_rate": 0.0002375522146832551, + "loss": 2.0255, + "step": 12786 + }, + { + "epoch": 1.4918912612297281, + "grad_norm": 1.2058113813400269, + "learning_rate": 0.0002375398916298469, + "loss": 1.8552, + "step": 12787 + }, + { + "epoch": 1.4920079337300198, + "grad_norm": 1.1639565229415894, + "learning_rate": 0.0002375275676844723, + "loss": 2.0068, + "step": 12788 + }, + { + "epoch": 1.4921246062303115, + "grad_norm": 1.1139047145843506, + "learning_rate": 0.00023751524284725906, + "loss": 2.1235, + "step": 12789 + }, + { + "epoch": 1.4922412787306032, + "grad_norm": 1.346511960029602, + "learning_rate": 0.000237502917118335, + "loss": 2.187, + "step": 12790 + }, + { + "epoch": 1.4923579512308949, + "grad_norm": 1.1713693141937256, + "learning_rate": 0.00023749059049782778, + "loss": 1.9336, + "step": 12791 + }, + { + "epoch": 1.4924746237311866, + "grad_norm": 1.1829968690872192, + "learning_rate": 0.00023747826298586524, + "loss": 2.0397, + "step": 12792 + }, + { + "epoch": 1.4925912962314782, + "grad_norm": 1.1053467988967896, + "learning_rate": 0.00023746593458257527, + "loss": 1.9041, + "step": 12793 + }, + { + "epoch": 1.49270796873177, + "grad_norm": 1.019209623336792, + "learning_rate": 0.00023745360528808557, + "loss": 1.984, + "step": 12794 + }, + { + "epoch": 1.4928246412320616, + "grad_norm": 1.0836290121078491, + "learning_rate": 0.00023744127510252398, + "loss": 1.9618, + "step": 12795 + }, + { + "epoch": 1.4929413137323533, + "grad_norm": 1.1695947647094727, + "learning_rate": 0.00023742894402601833, + "loss": 2.2034, + "step": 12796 + }, + { + "epoch": 1.493057986232645, + "grad_norm": 1.2816002368927002, + "learning_rate": 0.00023741661205869645, + "loss": 2.2356, + "step": 12797 + }, + { + "epoch": 1.4931746587329366, + "grad_norm": 1.2140963077545166, + "learning_rate": 0.00023740427920068615, + "loss": 2.039, + "step": 12798 + }, + { + "epoch": 1.4932913312332283, + "grad_norm": 1.1680558919906616, + "learning_rate": 0.00023739194545211538, + "loss": 2.0921, + "step": 12799 + }, + { + "epoch": 1.49340800373352, + "grad_norm": 1.0440555810928345, + "learning_rate": 0.0002373796108131119, + "loss": 1.9229, + "step": 12800 + }, + { + "epoch": 1.4935246762338117, + "grad_norm": 1.1269872188568115, + "learning_rate": 0.0002373672752838036, + "loss": 2.1332, + "step": 12801 + }, + { + "epoch": 1.4936413487341034, + "grad_norm": 1.0519592761993408, + "learning_rate": 0.0002373549388643185, + "loss": 1.9446, + "step": 12802 + }, + { + "epoch": 1.493758021234395, + "grad_norm": 1.124184250831604, + "learning_rate": 0.00023734260155478432, + "loss": 2.2297, + "step": 12803 + }, + { + "epoch": 1.4938746937346867, + "grad_norm": 1.092888355255127, + "learning_rate": 0.00023733026335532903, + "loss": 2.0947, + "step": 12804 + }, + { + "epoch": 1.4939913662349784, + "grad_norm": 1.1629011631011963, + "learning_rate": 0.00023731792426608052, + "loss": 2.1238, + "step": 12805 + }, + { + "epoch": 1.49410803873527, + "grad_norm": 1.0490916967391968, + "learning_rate": 0.0002373055842871667, + "loss": 2.0803, + "step": 12806 + }, + { + "epoch": 1.4942247112355618, + "grad_norm": 1.056817889213562, + "learning_rate": 0.00023729324341871552, + "loss": 1.95, + "step": 12807 + }, + { + "epoch": 1.4943413837358535, + "grad_norm": 0.9637038111686707, + "learning_rate": 0.0002372809016608549, + "loss": 2.026, + "step": 12808 + }, + { + "epoch": 1.4944580562361451, + "grad_norm": 1.1337788105010986, + "learning_rate": 0.0002372685590137128, + "loss": 1.9454, + "step": 12809 + }, + { + "epoch": 1.4945747287364368, + "grad_norm": 1.1906415224075317, + "learning_rate": 0.00023725621547741722, + "loss": 2.1747, + "step": 12810 + }, + { + "epoch": 1.4946914012367285, + "grad_norm": 1.164568305015564, + "learning_rate": 0.000237243871052096, + "loss": 2.0492, + "step": 12811 + }, + { + "epoch": 1.4948080737370202, + "grad_norm": 1.2068501710891724, + "learning_rate": 0.00023723152573787724, + "loss": 2.2715, + "step": 12812 + }, + { + "epoch": 1.4949247462373119, + "grad_norm": 1.2308919429779053, + "learning_rate": 0.00023721917953488882, + "loss": 2.1796, + "step": 12813 + }, + { + "epoch": 1.4950414187376035, + "grad_norm": 1.117760181427002, + "learning_rate": 0.0002372068324432588, + "loss": 2.1714, + "step": 12814 + }, + { + "epoch": 1.4951580912378952, + "grad_norm": 1.2081843614578247, + "learning_rate": 0.00023719448446311517, + "loss": 2.096, + "step": 12815 + }, + { + "epoch": 1.495274763738187, + "grad_norm": 0.9903899431228638, + "learning_rate": 0.00023718213559458594, + "loss": 2.0461, + "step": 12816 + }, + { + "epoch": 1.4953914362384786, + "grad_norm": 1.1452083587646484, + "learning_rate": 0.00023716978583779912, + "loss": 2.1777, + "step": 12817 + }, + { + "epoch": 1.4955081087387703, + "grad_norm": 1.1284759044647217, + "learning_rate": 0.00023715743519288275, + "loss": 2.1943, + "step": 12818 + }, + { + "epoch": 1.495624781239062, + "grad_norm": 1.1743453741073608, + "learning_rate": 0.00023714508365996483, + "loss": 2.1928, + "step": 12819 + }, + { + "epoch": 1.4957414537393536, + "grad_norm": 1.0598928928375244, + "learning_rate": 0.00023713273123917347, + "loss": 2.0082, + "step": 12820 + }, + { + "epoch": 1.4958581262396453, + "grad_norm": 1.1165701150894165, + "learning_rate": 0.00023712037793063666, + "loss": 1.9852, + "step": 12821 + }, + { + "epoch": 1.495974798739937, + "grad_norm": 1.261854887008667, + "learning_rate": 0.0002371080237344825, + "loss": 1.9962, + "step": 12822 + }, + { + "epoch": 1.4960914712402287, + "grad_norm": 1.2502471208572388, + "learning_rate": 0.00023709566865083912, + "loss": 2.2403, + "step": 12823 + }, + { + "epoch": 1.4962081437405204, + "grad_norm": 1.0636664628982544, + "learning_rate": 0.0002370833126798345, + "loss": 2.0293, + "step": 12824 + }, + { + "epoch": 1.496324816240812, + "grad_norm": 1.204843521118164, + "learning_rate": 0.00023707095582159676, + "loss": 2.0866, + "step": 12825 + }, + { + "epoch": 1.4964414887411037, + "grad_norm": 1.307477355003357, + "learning_rate": 0.00023705859807625403, + "loss": 2.0892, + "step": 12826 + }, + { + "epoch": 1.4965581612413954, + "grad_norm": 1.2359775304794312, + "learning_rate": 0.0002370462394439344, + "loss": 2.073, + "step": 12827 + }, + { + "epoch": 1.496674833741687, + "grad_norm": 1.344340443611145, + "learning_rate": 0.00023703387992476605, + "loss": 2.1876, + "step": 12828 + }, + { + "epoch": 1.4967915062419788, + "grad_norm": 1.1187827587127686, + "learning_rate": 0.00023702151951887705, + "loss": 2.1335, + "step": 12829 + }, + { + "epoch": 1.4969081787422704, + "grad_norm": 0.9612862467765808, + "learning_rate": 0.0002370091582263955, + "loss": 2.0041, + "step": 12830 + }, + { + "epoch": 1.4970248512425621, + "grad_norm": 1.11970055103302, + "learning_rate": 0.00023699679604744962, + "loss": 2.0989, + "step": 12831 + }, + { + "epoch": 1.4971415237428538, + "grad_norm": 1.0889732837677002, + "learning_rate": 0.0002369844329821676, + "loss": 2.0938, + "step": 12832 + }, + { + "epoch": 1.4972581962431455, + "grad_norm": 1.0633870363235474, + "learning_rate": 0.0002369720690306775, + "loss": 2.0377, + "step": 12833 + }, + { + "epoch": 1.4973748687434372, + "grad_norm": 1.3344261646270752, + "learning_rate": 0.00023695970419310757, + "loss": 2.0424, + "step": 12834 + }, + { + "epoch": 1.4974915412437289, + "grad_norm": 1.086992859840393, + "learning_rate": 0.000236947338469586, + "loss": 2.1014, + "step": 12835 + }, + { + "epoch": 1.4976082137440205, + "grad_norm": 1.2632640600204468, + "learning_rate": 0.00023693497186024094, + "loss": 2.0541, + "step": 12836 + }, + { + "epoch": 1.4977248862443122, + "grad_norm": 1.1513161659240723, + "learning_rate": 0.00023692260436520063, + "loss": 2.2716, + "step": 12837 + }, + { + "epoch": 1.497841558744604, + "grad_norm": 1.0385146141052246, + "learning_rate": 0.0002369102359845932, + "loss": 2.0705, + "step": 12838 + }, + { + "epoch": 1.4979582312448956, + "grad_norm": 1.3385794162750244, + "learning_rate": 0.00023689786671854697, + "loss": 2.0608, + "step": 12839 + }, + { + "epoch": 1.4980749037451873, + "grad_norm": 1.1400487422943115, + "learning_rate": 0.00023688549656719014, + "loss": 2.2259, + "step": 12840 + }, + { + "epoch": 1.498191576245479, + "grad_norm": 1.2208683490753174, + "learning_rate": 0.00023687312553065097, + "loss": 2.0177, + "step": 12841 + }, + { + "epoch": 1.4983082487457706, + "grad_norm": 1.5976170301437378, + "learning_rate": 0.00023686075360905765, + "loss": 2.1028, + "step": 12842 + }, + { + "epoch": 1.4984249212460623, + "grad_norm": 1.1103943586349487, + "learning_rate": 0.0002368483808025385, + "loss": 1.8926, + "step": 12843 + }, + { + "epoch": 1.498541593746354, + "grad_norm": 1.0924060344696045, + "learning_rate": 0.00023683600711122175, + "loss": 1.9189, + "step": 12844 + }, + { + "epoch": 1.4986582662466457, + "grad_norm": 1.3056124448776245, + "learning_rate": 0.00023682363253523572, + "loss": 2.0614, + "step": 12845 + }, + { + "epoch": 1.4987749387469373, + "grad_norm": 1.3072364330291748, + "learning_rate": 0.00023681125707470868, + "loss": 2.1665, + "step": 12846 + }, + { + "epoch": 1.498891611247229, + "grad_norm": 1.251145601272583, + "learning_rate": 0.00023679888072976885, + "loss": 2.162, + "step": 12847 + }, + { + "epoch": 1.4990082837475207, + "grad_norm": 1.331475853919983, + "learning_rate": 0.00023678650350054462, + "loss": 1.8989, + "step": 12848 + }, + { + "epoch": 1.4991249562478124, + "grad_norm": 1.03056800365448, + "learning_rate": 0.00023677412538716425, + "loss": 1.9928, + "step": 12849 + }, + { + "epoch": 1.499241628748104, + "grad_norm": 1.2556493282318115, + "learning_rate": 0.00023676174638975612, + "loss": 2.2829, + "step": 12850 + }, + { + "epoch": 1.4993583012483958, + "grad_norm": 1.2843503952026367, + "learning_rate": 0.00023674936650844853, + "loss": 1.8758, + "step": 12851 + }, + { + "epoch": 1.4994749737486874, + "grad_norm": 1.3275389671325684, + "learning_rate": 0.00023673698574336983, + "loss": 2.1188, + "step": 12852 + }, + { + "epoch": 1.4995916462489791, + "grad_norm": 1.1209440231323242, + "learning_rate": 0.00023672460409464836, + "loss": 2.1709, + "step": 12853 + }, + { + "epoch": 1.4997083187492708, + "grad_norm": 1.2115691900253296, + "learning_rate": 0.00023671222156241244, + "loss": 2.039, + "step": 12854 + }, + { + "epoch": 1.4998249912495625, + "grad_norm": 1.184156060218811, + "learning_rate": 0.00023669983814679052, + "loss": 1.9813, + "step": 12855 + }, + { + "epoch": 1.4999416637498542, + "grad_norm": 1.1809399127960205, + "learning_rate": 0.00023668745384791093, + "loss": 2.15, + "step": 12856 + }, + { + "epoch": 1.5000583362501458, + "grad_norm": 1.2757258415222168, + "learning_rate": 0.00023667506866590209, + "loss": 2.1386, + "step": 12857 + }, + { + "epoch": 1.5001750087504375, + "grad_norm": 1.1683305501937866, + "learning_rate": 0.00023666268260089235, + "loss": 2.1894, + "step": 12858 + }, + { + "epoch": 1.5002916812507292, + "grad_norm": 1.0910295248031616, + "learning_rate": 0.00023665029565301014, + "loss": 1.9449, + "step": 12859 + }, + { + "epoch": 1.5004083537510209, + "grad_norm": 1.0818288326263428, + "learning_rate": 0.00023663790782238385, + "loss": 1.9692, + "step": 12860 + }, + { + "epoch": 1.5005250262513126, + "grad_norm": 1.1610934734344482, + "learning_rate": 0.00023662551910914198, + "loss": 2.0981, + "step": 12861 + }, + { + "epoch": 1.5006416987516042, + "grad_norm": 1.158176064491272, + "learning_rate": 0.00023661312951341284, + "loss": 2.0582, + "step": 12862 + }, + { + "epoch": 1.500758371251896, + "grad_norm": 1.16142737865448, + "learning_rate": 0.00023660073903532497, + "loss": 2.0358, + "step": 12863 + }, + { + "epoch": 1.5008750437521876, + "grad_norm": 1.1961932182312012, + "learning_rate": 0.00023658834767500682, + "loss": 2.0985, + "step": 12864 + }, + { + "epoch": 1.5009917162524793, + "grad_norm": 1.0981320142745972, + "learning_rate": 0.00023657595543258674, + "loss": 2.0572, + "step": 12865 + }, + { + "epoch": 1.501108388752771, + "grad_norm": 1.0573740005493164, + "learning_rate": 0.00023656356230819333, + "loss": 1.9956, + "step": 12866 + }, + { + "epoch": 1.5012250612530627, + "grad_norm": 1.3524937629699707, + "learning_rate": 0.000236551168301955, + "loss": 2.1935, + "step": 12867 + }, + { + "epoch": 1.5013417337533543, + "grad_norm": 1.2883424758911133, + "learning_rate": 0.00023653877341400027, + "loss": 2.2944, + "step": 12868 + }, + { + "epoch": 1.501458406253646, + "grad_norm": 1.4410419464111328, + "learning_rate": 0.0002365263776444576, + "loss": 2.1863, + "step": 12869 + }, + { + "epoch": 1.5015750787539377, + "grad_norm": 0.9399113059043884, + "learning_rate": 0.00023651398099345549, + "loss": 1.8585, + "step": 12870 + }, + { + "epoch": 1.5016917512542294, + "grad_norm": 1.2271840572357178, + "learning_rate": 0.00023650158346112253, + "loss": 2.052, + "step": 12871 + }, + { + "epoch": 1.501808423754521, + "grad_norm": 1.2596515417099, + "learning_rate": 0.0002364891850475872, + "loss": 2.0182, + "step": 12872 + }, + { + "epoch": 1.5019250962548127, + "grad_norm": 1.147079586982727, + "learning_rate": 0.00023647678575297801, + "loss": 1.929, + "step": 12873 + }, + { + "epoch": 1.5020417687551044, + "grad_norm": 0.9986235499382019, + "learning_rate": 0.0002364643855774235, + "loss": 2.0113, + "step": 12874 + }, + { + "epoch": 1.502158441255396, + "grad_norm": 1.259164571762085, + "learning_rate": 0.00023645198452105224, + "loss": 2.153, + "step": 12875 + }, + { + "epoch": 1.5022751137556878, + "grad_norm": 1.1377477645874023, + "learning_rate": 0.00023643958258399287, + "loss": 2.0151, + "step": 12876 + }, + { + "epoch": 1.5023917862559795, + "grad_norm": 1.1758842468261719, + "learning_rate": 0.00023642717976637376, + "loss": 2.0392, + "step": 12877 + }, + { + "epoch": 1.5025084587562711, + "grad_norm": 1.0989850759506226, + "learning_rate": 0.00023641477606832365, + "loss": 1.9541, + "step": 12878 + }, + { + "epoch": 1.5026251312565628, + "grad_norm": 1.190143346786499, + "learning_rate": 0.0002364023714899711, + "loss": 2.1264, + "step": 12879 + }, + { + "epoch": 1.5027418037568545, + "grad_norm": 1.1978389024734497, + "learning_rate": 0.00023638996603144471, + "loss": 2.0772, + "step": 12880 + }, + { + "epoch": 1.5028584762571462, + "grad_norm": 1.0815069675445557, + "learning_rate": 0.000236377559692873, + "loss": 2.1751, + "step": 12881 + }, + { + "epoch": 1.5029751487574379, + "grad_norm": 1.117041826248169, + "learning_rate": 0.00023636515247438475, + "loss": 2.1107, + "step": 12882 + }, + { + "epoch": 1.5030918212577296, + "grad_norm": 1.1975698471069336, + "learning_rate": 0.00023635274437610842, + "loss": 2.1596, + "step": 12883 + }, + { + "epoch": 1.5032084937580212, + "grad_norm": 1.1150230169296265, + "learning_rate": 0.00023634033539817275, + "loss": 2.1314, + "step": 12884 + }, + { + "epoch": 1.503325166258313, + "grad_norm": 1.5084284543991089, + "learning_rate": 0.00023632792554070635, + "loss": 2.2857, + "step": 12885 + }, + { + "epoch": 1.5034418387586046, + "grad_norm": 1.1169581413269043, + "learning_rate": 0.00023631551480383787, + "loss": 2.1, + "step": 12886 + }, + { + "epoch": 1.5035585112588963, + "grad_norm": 1.1860404014587402, + "learning_rate": 0.0002363031031876959, + "loss": 2.0596, + "step": 12887 + }, + { + "epoch": 1.503675183759188, + "grad_norm": 1.2756335735321045, + "learning_rate": 0.00023629069069240927, + "loss": 2.1163, + "step": 12888 + }, + { + "epoch": 1.5037918562594796, + "grad_norm": 1.1028635501861572, + "learning_rate": 0.00023627827731810652, + "loss": 2.0039, + "step": 12889 + }, + { + "epoch": 1.5039085287597713, + "grad_norm": 1.2083879709243774, + "learning_rate": 0.00023626586306491637, + "loss": 1.9483, + "step": 12890 + }, + { + "epoch": 1.504025201260063, + "grad_norm": 1.2234524488449097, + "learning_rate": 0.0002362534479329676, + "loss": 2.059, + "step": 12891 + }, + { + "epoch": 1.5041418737603547, + "grad_norm": 1.1116431951522827, + "learning_rate": 0.00023624103192238877, + "loss": 2.0895, + "step": 12892 + }, + { + "epoch": 1.5042585462606464, + "grad_norm": 1.0658739805221558, + "learning_rate": 0.0002362286150333087, + "loss": 2.1151, + "step": 12893 + }, + { + "epoch": 1.504375218760938, + "grad_norm": 1.2346380949020386, + "learning_rate": 0.00023621619726585608, + "loss": 2.1044, + "step": 12894 + }, + { + "epoch": 1.5044918912612297, + "grad_norm": 1.1632407903671265, + "learning_rate": 0.00023620377862015968, + "loss": 2.1274, + "step": 12895 + }, + { + "epoch": 1.5046085637615214, + "grad_norm": 1.0802925825119019, + "learning_rate": 0.00023619135909634819, + "loss": 2.0284, + "step": 12896 + }, + { + "epoch": 1.504725236261813, + "grad_norm": 1.2319998741149902, + "learning_rate": 0.0002361789386945504, + "loss": 2.0166, + "step": 12897 + }, + { + "epoch": 1.5048419087621048, + "grad_norm": 1.343641996383667, + "learning_rate": 0.00023616651741489506, + "loss": 2.2415, + "step": 12898 + }, + { + "epoch": 1.5049585812623965, + "grad_norm": 1.1760056018829346, + "learning_rate": 0.00023615409525751092, + "loss": 2.0253, + "step": 12899 + }, + { + "epoch": 1.5050752537626881, + "grad_norm": 1.3459477424621582, + "learning_rate": 0.0002361416722225268, + "loss": 2.0719, + "step": 12900 + }, + { + "epoch": 1.5051919262629798, + "grad_norm": 1.0357389450073242, + "learning_rate": 0.00023612924831007144, + "loss": 1.8556, + "step": 12901 + }, + { + "epoch": 1.5053085987632715, + "grad_norm": 1.0175055265426636, + "learning_rate": 0.0002361168235202737, + "loss": 2.2106, + "step": 12902 + }, + { + "epoch": 1.5054252712635632, + "grad_norm": 1.1781721115112305, + "learning_rate": 0.00023610439785326233, + "loss": 1.904, + "step": 12903 + }, + { + "epoch": 1.5055419437638549, + "grad_norm": 1.039893627166748, + "learning_rate": 0.00023609197130916614, + "loss": 2.0201, + "step": 12904 + }, + { + "epoch": 1.5056586162641465, + "grad_norm": 1.2547340393066406, + "learning_rate": 0.000236079543888114, + "loss": 1.9651, + "step": 12905 + }, + { + "epoch": 1.5057752887644382, + "grad_norm": 1.0991359949111938, + "learning_rate": 0.0002360671155902347, + "loss": 2.0389, + "step": 12906 + }, + { + "epoch": 1.50589196126473, + "grad_norm": 1.1679595708847046, + "learning_rate": 0.0002360546864156571, + "loss": 2.1565, + "step": 12907 + }, + { + "epoch": 1.5060086337650216, + "grad_norm": 1.0336912870407104, + "learning_rate": 0.00023604225636451013, + "loss": 2.1668, + "step": 12908 + }, + { + "epoch": 1.5061253062653133, + "grad_norm": 1.0616743564605713, + "learning_rate": 0.00023602982543692255, + "loss": 2.0315, + "step": 12909 + }, + { + "epoch": 1.506241978765605, + "grad_norm": 1.3342305421829224, + "learning_rate": 0.00023601739363302324, + "loss": 2.3764, + "step": 12910 + }, + { + "epoch": 1.5063586512658966, + "grad_norm": 1.2179646492004395, + "learning_rate": 0.00023600496095294105, + "loss": 2.0311, + "step": 12911 + }, + { + "epoch": 1.5064753237661883, + "grad_norm": 1.2255147695541382, + "learning_rate": 0.00023599252739680502, + "loss": 2.0004, + "step": 12912 + }, + { + "epoch": 1.50659199626648, + "grad_norm": 1.134616732597351, + "learning_rate": 0.00023598009296474387, + "loss": 1.9986, + "step": 12913 + }, + { + "epoch": 1.5067086687667717, + "grad_norm": 1.3750355243682861, + "learning_rate": 0.00023596765765688664, + "loss": 2.0293, + "step": 12914 + }, + { + "epoch": 1.5068253412670634, + "grad_norm": 1.2092170715332031, + "learning_rate": 0.00023595522147336212, + "loss": 2.0604, + "step": 12915 + }, + { + "epoch": 1.506942013767355, + "grad_norm": 1.4487473964691162, + "learning_rate": 0.00023594278441429933, + "loss": 2.1411, + "step": 12916 + }, + { + "epoch": 1.5070586862676467, + "grad_norm": 1.2080830335617065, + "learning_rate": 0.00023593034647982714, + "loss": 2.1244, + "step": 12917 + }, + { + "epoch": 1.5071753587679384, + "grad_norm": 1.2610867023468018, + "learning_rate": 0.00023591790767007458, + "loss": 2.0135, + "step": 12918 + }, + { + "epoch": 1.50729203126823, + "grad_norm": 1.0129821300506592, + "learning_rate": 0.00023590546798517054, + "loss": 1.81, + "step": 12919 + }, + { + "epoch": 1.5074087037685218, + "grad_norm": 1.0796492099761963, + "learning_rate": 0.00023589302742524397, + "loss": 2.0906, + "step": 12920 + }, + { + "epoch": 1.5075253762688134, + "grad_norm": 1.0860174894332886, + "learning_rate": 0.00023588058599042385, + "loss": 2.0095, + "step": 12921 + }, + { + "epoch": 1.5076420487691051, + "grad_norm": 1.209524393081665, + "learning_rate": 0.00023586814368083923, + "loss": 1.8968, + "step": 12922 + }, + { + "epoch": 1.5077587212693968, + "grad_norm": 1.2206380367279053, + "learning_rate": 0.00023585570049661897, + "loss": 2.0976, + "step": 12923 + }, + { + "epoch": 1.5078753937696885, + "grad_norm": 1.20325767993927, + "learning_rate": 0.00023584325643789215, + "loss": 2.0996, + "step": 12924 + }, + { + "epoch": 1.5079920662699802, + "grad_norm": 1.1152560710906982, + "learning_rate": 0.00023583081150478776, + "loss": 1.991, + "step": 12925 + }, + { + "epoch": 1.5081087387702719, + "grad_norm": 1.2331411838531494, + "learning_rate": 0.00023581836569743482, + "loss": 2.0591, + "step": 12926 + }, + { + "epoch": 1.5082254112705635, + "grad_norm": 1.3084769248962402, + "learning_rate": 0.00023580591901596235, + "loss": 2.0619, + "step": 12927 + }, + { + "epoch": 1.5083420837708552, + "grad_norm": 1.1447649002075195, + "learning_rate": 0.00023579347146049942, + "loss": 2.0561, + "step": 12928 + }, + { + "epoch": 1.508458756271147, + "grad_norm": 1.2617453336715698, + "learning_rate": 0.00023578102303117496, + "loss": 2.133, + "step": 12929 + }, + { + "epoch": 1.5085754287714386, + "grad_norm": 1.1573455333709717, + "learning_rate": 0.00023576857372811812, + "loss": 1.9758, + "step": 12930 + }, + { + "epoch": 1.5086921012717303, + "grad_norm": 1.1156381368637085, + "learning_rate": 0.000235756123551458, + "loss": 2.0505, + "step": 12931 + }, + { + "epoch": 1.508808773772022, + "grad_norm": 1.463678240776062, + "learning_rate": 0.00023574367250132352, + "loss": 2.1535, + "step": 12932 + }, + { + "epoch": 1.5089254462723136, + "grad_norm": 1.1231032609939575, + "learning_rate": 0.0002357312205778439, + "loss": 2.163, + "step": 12933 + }, + { + "epoch": 1.5090421187726053, + "grad_norm": 1.2608627080917358, + "learning_rate": 0.00023571876778114814, + "loss": 2.1015, + "step": 12934 + }, + { + "epoch": 1.509158791272897, + "grad_norm": 1.1447172164916992, + "learning_rate": 0.0002357063141113654, + "loss": 2.0972, + "step": 12935 + }, + { + "epoch": 1.5092754637731887, + "grad_norm": 1.0807386636734009, + "learning_rate": 0.00023569385956862476, + "loss": 1.9976, + "step": 12936 + }, + { + "epoch": 1.5093921362734803, + "grad_norm": 1.0804672241210938, + "learning_rate": 0.00023568140415305532, + "loss": 2.0108, + "step": 12937 + }, + { + "epoch": 1.509508808773772, + "grad_norm": 1.1637952327728271, + "learning_rate": 0.00023566894786478622, + "loss": 2.1861, + "step": 12938 + }, + { + "epoch": 1.5096254812740637, + "grad_norm": 1.0685045719146729, + "learning_rate": 0.00023565649070394655, + "loss": 2.0253, + "step": 12939 + }, + { + "epoch": 1.5097421537743554, + "grad_norm": 1.1752861738204956, + "learning_rate": 0.00023564403267066554, + "loss": 1.9459, + "step": 12940 + }, + { + "epoch": 1.509858826274647, + "grad_norm": 0.8996789455413818, + "learning_rate": 0.00023563157376507233, + "loss": 1.7911, + "step": 12941 + }, + { + "epoch": 1.5099754987749388, + "grad_norm": 1.022010087966919, + "learning_rate": 0.00023561911398729593, + "loss": 2.0102, + "step": 12942 + }, + { + "epoch": 1.5100921712752304, + "grad_norm": 1.1375280618667603, + "learning_rate": 0.0002356066533374657, + "loss": 2.1427, + "step": 12943 + }, + { + "epoch": 1.5102088437755221, + "grad_norm": 1.2035865783691406, + "learning_rate": 0.00023559419181571072, + "loss": 1.9743, + "step": 12944 + }, + { + "epoch": 1.5103255162758138, + "grad_norm": 1.2907330989837646, + "learning_rate": 0.00023558172942216018, + "loss": 2.1854, + "step": 12945 + }, + { + "epoch": 1.5104421887761055, + "grad_norm": 1.2869096994400024, + "learning_rate": 0.00023556926615694328, + "loss": 2.107, + "step": 12946 + }, + { + "epoch": 1.5105588612763972, + "grad_norm": 1.2749779224395752, + "learning_rate": 0.0002355568020201893, + "loss": 2.0595, + "step": 12947 + }, + { + "epoch": 1.5106755337766888, + "grad_norm": 1.1778684854507446, + "learning_rate": 0.00023554433701202735, + "loss": 2.213, + "step": 12948 + }, + { + "epoch": 1.5107922062769805, + "grad_norm": 1.250720500946045, + "learning_rate": 0.0002355318711325867, + "loss": 2.1326, + "step": 12949 + }, + { + "epoch": 1.5109088787772722, + "grad_norm": 1.2523027658462524, + "learning_rate": 0.0002355194043819966, + "loss": 2.2797, + "step": 12950 + }, + { + "epoch": 1.5110255512775639, + "grad_norm": 1.0902283191680908, + "learning_rate": 0.0002355069367603862, + "loss": 2.2451, + "step": 12951 + }, + { + "epoch": 1.5111422237778556, + "grad_norm": 1.1438214778900146, + "learning_rate": 0.00023549446826788488, + "loss": 2.1016, + "step": 12952 + }, + { + "epoch": 1.5112588962781472, + "grad_norm": 1.5188037157058716, + "learning_rate": 0.0002354819989046219, + "loss": 2.2242, + "step": 12953 + }, + { + "epoch": 1.511375568778439, + "grad_norm": 1.4433821439743042, + "learning_rate": 0.00023546952867072637, + "loss": 2.2271, + "step": 12954 + }, + { + "epoch": 1.5114922412787306, + "grad_norm": 1.3794485330581665, + "learning_rate": 0.0002354570575663277, + "loss": 1.8796, + "step": 12955 + }, + { + "epoch": 1.5116089137790223, + "grad_norm": 1.1428544521331787, + "learning_rate": 0.00023544458559155514, + "loss": 2.1082, + "step": 12956 + }, + { + "epoch": 1.511725586279314, + "grad_norm": 1.1673797369003296, + "learning_rate": 0.00023543211274653795, + "loss": 2.0226, + "step": 12957 + }, + { + "epoch": 1.5118422587796057, + "grad_norm": 1.1535152196884155, + "learning_rate": 0.0002354196390314055, + "loss": 2.168, + "step": 12958 + }, + { + "epoch": 1.5119589312798973, + "grad_norm": 1.1176254749298096, + "learning_rate": 0.0002354071644462871, + "loss": 2.1261, + "step": 12959 + }, + { + "epoch": 1.512075603780189, + "grad_norm": 1.070093035697937, + "learning_rate": 0.00023539468899131202, + "loss": 1.9313, + "step": 12960 + }, + { + "epoch": 1.5121922762804807, + "grad_norm": 0.9812670350074768, + "learning_rate": 0.00023538221266660968, + "loss": 1.9556, + "step": 12961 + }, + { + "epoch": 1.5123089487807724, + "grad_norm": 1.2056161165237427, + "learning_rate": 0.00023536973547230928, + "loss": 1.9567, + "step": 12962 + }, + { + "epoch": 1.512425621281064, + "grad_norm": 1.0903105735778809, + "learning_rate": 0.00023535725740854031, + "loss": 2.118, + "step": 12963 + }, + { + "epoch": 1.5125422937813557, + "grad_norm": 0.9750627875328064, + "learning_rate": 0.00023534477847543205, + "loss": 1.9823, + "step": 12964 + }, + { + "epoch": 1.5126589662816474, + "grad_norm": 1.065739393234253, + "learning_rate": 0.0002353322986731139, + "loss": 2.0246, + "step": 12965 + }, + { + "epoch": 1.512775638781939, + "grad_norm": 1.062783122062683, + "learning_rate": 0.00023531981800171522, + "loss": 2.1465, + "step": 12966 + }, + { + "epoch": 1.5128923112822308, + "grad_norm": 1.2632122039794922, + "learning_rate": 0.0002353073364613654, + "loss": 1.9858, + "step": 12967 + }, + { + "epoch": 1.5130089837825225, + "grad_norm": 1.133787751197815, + "learning_rate": 0.00023529485405219383, + "loss": 2.0489, + "step": 12968 + }, + { + "epoch": 1.5131256562828141, + "grad_norm": 1.0761088132858276, + "learning_rate": 0.00023528237077432987, + "loss": 1.9337, + "step": 12969 + }, + { + "epoch": 1.5132423287831058, + "grad_norm": 0.974772572517395, + "learning_rate": 0.00023526988662790307, + "loss": 2.0154, + "step": 12970 + }, + { + "epoch": 1.5133590012833975, + "grad_norm": 1.2854281663894653, + "learning_rate": 0.0002352574016130427, + "loss": 2.0357, + "step": 12971 + }, + { + "epoch": 1.5134756737836892, + "grad_norm": 1.297935962677002, + "learning_rate": 0.0002352449157298783, + "loss": 2.219, + "step": 12972 + }, + { + "epoch": 1.5135923462839809, + "grad_norm": 1.2300783395767212, + "learning_rate": 0.00023523242897853926, + "loss": 2.0491, + "step": 12973 + }, + { + "epoch": 1.5137090187842726, + "grad_norm": 1.2861227989196777, + "learning_rate": 0.00023521994135915499, + "loss": 2.3509, + "step": 12974 + }, + { + "epoch": 1.5138256912845642, + "grad_norm": 1.0914556980133057, + "learning_rate": 0.000235207452871855, + "loss": 2.0382, + "step": 12975 + }, + { + "epoch": 1.513942363784856, + "grad_norm": 1.2669881582260132, + "learning_rate": 0.0002351949635167688, + "loss": 2.2945, + "step": 12976 + }, + { + "epoch": 1.5140590362851476, + "grad_norm": 1.0301533937454224, + "learning_rate": 0.00023518247329402573, + "loss": 1.9655, + "step": 12977 + }, + { + "epoch": 1.5141757087854393, + "grad_norm": 1.195143699645996, + "learning_rate": 0.00023516998220375545, + "loss": 2.1483, + "step": 12978 + }, + { + "epoch": 1.514292381285731, + "grad_norm": 1.0521671772003174, + "learning_rate": 0.0002351574902460873, + "loss": 1.9967, + "step": 12979 + }, + { + "epoch": 1.5144090537860226, + "grad_norm": 1.186318039894104, + "learning_rate": 0.0002351449974211509, + "loss": 2.1162, + "step": 12980 + }, + { + "epoch": 1.5145257262863143, + "grad_norm": 1.081233263015747, + "learning_rate": 0.00023513250372907563, + "loss": 2.0023, + "step": 12981 + }, + { + "epoch": 1.514642398786606, + "grad_norm": 1.1371431350708008, + "learning_rate": 0.00023512000916999114, + "loss": 1.9906, + "step": 12982 + }, + { + "epoch": 1.5147590712868977, + "grad_norm": 1.0250935554504395, + "learning_rate": 0.00023510751374402694, + "loss": 2.0555, + "step": 12983 + }, + { + "epoch": 1.5148757437871894, + "grad_norm": 1.216640830039978, + "learning_rate": 0.00023509501745131242, + "loss": 2.126, + "step": 12984 + }, + { + "epoch": 1.514992416287481, + "grad_norm": 1.2222589254379272, + "learning_rate": 0.00023508252029197732, + "loss": 2.2196, + "step": 12985 + }, + { + "epoch": 1.5151090887877727, + "grad_norm": 1.2160078287124634, + "learning_rate": 0.00023507002226615115, + "loss": 2.1051, + "step": 12986 + }, + { + "epoch": 1.5152257612880644, + "grad_norm": 1.2509608268737793, + "learning_rate": 0.0002350575233739634, + "loss": 2.1732, + "step": 12987 + }, + { + "epoch": 1.515342433788356, + "grad_norm": 1.262382984161377, + "learning_rate": 0.00023504502361554373, + "loss": 2.0089, + "step": 12988 + }, + { + "epoch": 1.5154591062886478, + "grad_norm": 1.2071939706802368, + "learning_rate": 0.00023503252299102165, + "loss": 2.0102, + "step": 12989 + }, + { + "epoch": 1.5155757787889395, + "grad_norm": 1.281389594078064, + "learning_rate": 0.0002350200215005268, + "loss": 2.0416, + "step": 12990 + }, + { + "epoch": 1.5156924512892311, + "grad_norm": 1.0847746133804321, + "learning_rate": 0.0002350075191441888, + "loss": 2.1675, + "step": 12991 + }, + { + "epoch": 1.5158091237895228, + "grad_norm": 1.053358793258667, + "learning_rate": 0.00023499501592213717, + "loss": 2.1037, + "step": 12992 + }, + { + "epoch": 1.5159257962898145, + "grad_norm": 1.09786057472229, + "learning_rate": 0.0002349825118345016, + "loss": 1.9493, + "step": 12993 + }, + { + "epoch": 1.5160424687901062, + "grad_norm": 1.1375967264175415, + "learning_rate": 0.0002349700068814117, + "loss": 2.0967, + "step": 12994 + }, + { + "epoch": 1.5161591412903979, + "grad_norm": 1.1120212078094482, + "learning_rate": 0.00023495750106299716, + "loss": 2.0449, + "step": 12995 + }, + { + "epoch": 1.5162758137906895, + "grad_norm": 1.1229342222213745, + "learning_rate": 0.00023494499437938757, + "loss": 1.9425, + "step": 12996 + }, + { + "epoch": 1.5163924862909812, + "grad_norm": 1.3636835813522339, + "learning_rate": 0.00023493248683071258, + "loss": 1.9714, + "step": 12997 + }, + { + "epoch": 1.516509158791273, + "grad_norm": 1.1970231533050537, + "learning_rate": 0.00023491997841710186, + "loss": 1.9319, + "step": 12998 + }, + { + "epoch": 1.5166258312915646, + "grad_norm": 1.2698286771774292, + "learning_rate": 0.00023490746913868514, + "loss": 2.0275, + "step": 12999 + }, + { + "epoch": 1.5167425037918563, + "grad_norm": 1.1336040496826172, + "learning_rate": 0.00023489495899559206, + "loss": 2.0924, + "step": 13000 + }, + { + "epoch": 1.516859176292148, + "grad_norm": 1.024540662765503, + "learning_rate": 0.00023488244798795221, + "loss": 2.2322, + "step": 13001 + }, + { + "epoch": 1.5169758487924396, + "grad_norm": 1.082262396812439, + "learning_rate": 0.00023486993611589546, + "loss": 2.0677, + "step": 13002 + }, + { + "epoch": 1.5170925212927313, + "grad_norm": 1.0727037191390991, + "learning_rate": 0.0002348574233795515, + "loss": 2.0572, + "step": 13003 + }, + { + "epoch": 1.517209193793023, + "grad_norm": 1.197798490524292, + "learning_rate": 0.00023484490977904992, + "loss": 2.0178, + "step": 13004 + }, + { + "epoch": 1.5173258662933147, + "grad_norm": 1.325905203819275, + "learning_rate": 0.00023483239531452057, + "loss": 2.1655, + "step": 13005 + }, + { + "epoch": 1.5174425387936064, + "grad_norm": 1.2010334730148315, + "learning_rate": 0.00023481987998609312, + "loss": 2.0895, + "step": 13006 + }, + { + "epoch": 1.517559211293898, + "grad_norm": 1.0334481000900269, + "learning_rate": 0.0002348073637938973, + "loss": 2.0299, + "step": 13007 + }, + { + "epoch": 1.5176758837941897, + "grad_norm": 1.1518312692642212, + "learning_rate": 0.00023479484673806297, + "loss": 2.1444, + "step": 13008 + }, + { + "epoch": 1.5177925562944814, + "grad_norm": 1.094919204711914, + "learning_rate": 0.00023478232881871981, + "loss": 2.0171, + "step": 13009 + }, + { + "epoch": 1.517909228794773, + "grad_norm": 1.0769628286361694, + "learning_rate": 0.0002347698100359975, + "loss": 1.996, + "step": 13010 + }, + { + "epoch": 1.5180259012950648, + "grad_norm": 1.1303770542144775, + "learning_rate": 0.000234757290390026, + "loss": 1.8856, + "step": 13011 + }, + { + "epoch": 1.5181425737953564, + "grad_norm": 1.174399495124817, + "learning_rate": 0.00023474476988093507, + "loss": 2.0777, + "step": 13012 + }, + { + "epoch": 1.5182592462956481, + "grad_norm": 1.1150833368301392, + "learning_rate": 0.0002347322485088544, + "loss": 1.867, + "step": 13013 + }, + { + "epoch": 1.5183759187959398, + "grad_norm": 1.0857727527618408, + "learning_rate": 0.00023471972627391388, + "loss": 2.0501, + "step": 13014 + }, + { + "epoch": 1.5184925912962315, + "grad_norm": 1.276915192604065, + "learning_rate": 0.0002347072031762433, + "loss": 2.0814, + "step": 13015 + }, + { + "epoch": 1.5186092637965232, + "grad_norm": 1.2229645252227783, + "learning_rate": 0.0002346946792159725, + "loss": 2.204, + "step": 13016 + }, + { + "epoch": 1.5187259362968148, + "grad_norm": 1.0546919107437134, + "learning_rate": 0.00023468215439323133, + "loss": 2.0058, + "step": 13017 + }, + { + "epoch": 1.5188426087971065, + "grad_norm": 1.0938071012496948, + "learning_rate": 0.00023466962870814957, + "loss": 2.0102, + "step": 13018 + }, + { + "epoch": 1.5189592812973982, + "grad_norm": 1.2616558074951172, + "learning_rate": 0.00023465710216085713, + "loss": 2.162, + "step": 13019 + }, + { + "epoch": 1.51907595379769, + "grad_norm": 1.238309383392334, + "learning_rate": 0.00023464457475148384, + "loss": 2.0724, + "step": 13020 + }, + { + "epoch": 1.5191926262979816, + "grad_norm": 1.1168057918548584, + "learning_rate": 0.00023463204648015965, + "loss": 2.0006, + "step": 13021 + }, + { + "epoch": 1.5193092987982733, + "grad_norm": 1.307214617729187, + "learning_rate": 0.00023461951734701433, + "loss": 2.161, + "step": 13022 + }, + { + "epoch": 1.519425971298565, + "grad_norm": 1.1487174034118652, + "learning_rate": 0.00023460698735217777, + "loss": 2.0777, + "step": 13023 + }, + { + "epoch": 1.5195426437988566, + "grad_norm": 1.3880236148834229, + "learning_rate": 0.00023459445649577995, + "loss": 2.1924, + "step": 13024 + }, + { + "epoch": 1.5196593162991483, + "grad_norm": 1.1708519458770752, + "learning_rate": 0.00023458192477795076, + "loss": 2.0712, + "step": 13025 + }, + { + "epoch": 1.51977598879944, + "grad_norm": 1.1416223049163818, + "learning_rate": 0.00023456939219882012, + "loss": 2.257, + "step": 13026 + }, + { + "epoch": 1.5198926612997317, + "grad_norm": 1.0961484909057617, + "learning_rate": 0.00023455685875851782, + "loss": 2.0287, + "step": 13027 + }, + { + "epoch": 1.5200093338000233, + "grad_norm": 1.049004077911377, + "learning_rate": 0.000234544324457174, + "loss": 2.0246, + "step": 13028 + }, + { + "epoch": 1.520126006300315, + "grad_norm": 1.2184154987335205, + "learning_rate": 0.0002345317892949184, + "loss": 2.1608, + "step": 13029 + }, + { + "epoch": 1.5202426788006067, + "grad_norm": 1.2351871728897095, + "learning_rate": 0.00023451925327188115, + "loss": 2.1483, + "step": 13030 + }, + { + "epoch": 1.5203593513008984, + "grad_norm": 1.1888916492462158, + "learning_rate": 0.00023450671638819212, + "loss": 2.1586, + "step": 13031 + }, + { + "epoch": 1.52047602380119, + "grad_norm": 1.1600549221038818, + "learning_rate": 0.00023449417864398128, + "loss": 2.1517, + "step": 13032 + }, + { + "epoch": 1.5205926963014817, + "grad_norm": 1.301664113998413, + "learning_rate": 0.0002344816400393786, + "loss": 2.0839, + "step": 13033 + }, + { + "epoch": 1.5207093688017734, + "grad_norm": 1.0618464946746826, + "learning_rate": 0.00023446910057451408, + "loss": 1.9173, + "step": 13034 + }, + { + "epoch": 1.5208260413020651, + "grad_norm": 1.0437778234481812, + "learning_rate": 0.00023445656024951775, + "loss": 1.9664, + "step": 13035 + }, + { + "epoch": 1.5209427138023568, + "grad_norm": 1.1931116580963135, + "learning_rate": 0.00023444401906451957, + "loss": 2.0867, + "step": 13036 + }, + { + "epoch": 1.5210593863026485, + "grad_norm": 1.1523332595825195, + "learning_rate": 0.00023443147701964948, + "loss": 2.1121, + "step": 13037 + }, + { + "epoch": 1.5211760588029402, + "grad_norm": 1.2824180126190186, + "learning_rate": 0.00023441893411503767, + "loss": 2.1206, + "step": 13038 + }, + { + "epoch": 1.5212927313032318, + "grad_norm": 1.1465622186660767, + "learning_rate": 0.0002344063903508141, + "loss": 2.1839, + "step": 13039 + }, + { + "epoch": 1.5214094038035235, + "grad_norm": 1.1414308547973633, + "learning_rate": 0.00023439384572710882, + "loss": 1.9402, + "step": 13040 + }, + { + "epoch": 1.5215260763038152, + "grad_norm": 1.0314671993255615, + "learning_rate": 0.00023438130024405174, + "loss": 2.0694, + "step": 13041 + }, + { + "epoch": 1.5216427488041069, + "grad_norm": 1.1687556505203247, + "learning_rate": 0.00023436875390177314, + "loss": 1.9716, + "step": 13042 + }, + { + "epoch": 1.5217594213043986, + "grad_norm": 1.2012959718704224, + "learning_rate": 0.00023435620670040296, + "loss": 2.0991, + "step": 13043 + }, + { + "epoch": 1.5218760938046902, + "grad_norm": 1.1103696823120117, + "learning_rate": 0.00023434365864007128, + "loss": 1.9757, + "step": 13044 + }, + { + "epoch": 1.521992766304982, + "grad_norm": 1.140961766242981, + "learning_rate": 0.00023433110972090822, + "loss": 2.1931, + "step": 13045 + }, + { + "epoch": 1.5221094388052736, + "grad_norm": 1.1464996337890625, + "learning_rate": 0.00023431855994304382, + "loss": 1.9706, + "step": 13046 + }, + { + "epoch": 1.5222261113055653, + "grad_norm": 1.1675893068313599, + "learning_rate": 0.0002343060093066083, + "loss": 2.0077, + "step": 13047 + }, + { + "epoch": 1.522342783805857, + "grad_norm": 1.173864722251892, + "learning_rate": 0.0002342934578117316, + "loss": 2.0385, + "step": 13048 + }, + { + "epoch": 1.5224594563061487, + "grad_norm": 1.2376141548156738, + "learning_rate": 0.00023428090545854393, + "loss": 2.0534, + "step": 13049 + }, + { + "epoch": 1.5225761288064403, + "grad_norm": 1.064356803894043, + "learning_rate": 0.00023426835224717545, + "loss": 2.1564, + "step": 13050 + }, + { + "epoch": 1.522692801306732, + "grad_norm": 1.2315387725830078, + "learning_rate": 0.0002342557981777562, + "loss": 2.1897, + "step": 13051 + }, + { + "epoch": 1.5228094738070237, + "grad_norm": 1.1852329969406128, + "learning_rate": 0.00023424324325041645, + "loss": 2.1761, + "step": 13052 + }, + { + "epoch": 1.5229261463073154, + "grad_norm": 1.0772095918655396, + "learning_rate": 0.00023423068746528632, + "loss": 1.9652, + "step": 13053 + }, + { + "epoch": 1.523042818807607, + "grad_norm": 1.2617658376693726, + "learning_rate": 0.00023421813082249592, + "loss": 2.0263, + "step": 13054 + }, + { + "epoch": 1.5231594913078987, + "grad_norm": 1.02657949924469, + "learning_rate": 0.0002342055733221755, + "loss": 2.119, + "step": 13055 + }, + { + "epoch": 1.5232761638081904, + "grad_norm": 1.2759028673171997, + "learning_rate": 0.0002341930149644551, + "loss": 2.163, + "step": 13056 + }, + { + "epoch": 1.523392836308482, + "grad_norm": 1.097219467163086, + "learning_rate": 0.00023418045574946511, + "loss": 2.1191, + "step": 13057 + }, + { + "epoch": 1.5235095088087738, + "grad_norm": 1.0317362546920776, + "learning_rate": 0.00023416789567733555, + "loss": 1.9076, + "step": 13058 + }, + { + "epoch": 1.5236261813090655, + "grad_norm": 1.2806400060653687, + "learning_rate": 0.00023415533474819674, + "loss": 2.1918, + "step": 13059 + }, + { + "epoch": 1.5237428538093571, + "grad_norm": 1.0804616212844849, + "learning_rate": 0.0002341427729621789, + "loss": 2.1615, + "step": 13060 + }, + { + "epoch": 1.5238595263096488, + "grad_norm": 0.978942334651947, + "learning_rate": 0.00023413021031941215, + "loss": 2.0627, + "step": 13061 + }, + { + "epoch": 1.5239761988099405, + "grad_norm": 1.0103188753128052, + "learning_rate": 0.00023411764682002681, + "loss": 2.1777, + "step": 13062 + }, + { + "epoch": 1.5240928713102322, + "grad_norm": 1.2679052352905273, + "learning_rate": 0.00023410508246415312, + "loss": 1.9148, + "step": 13063 + }, + { + "epoch": 1.5242095438105239, + "grad_norm": 1.0163694620132446, + "learning_rate": 0.00023409251725192134, + "loss": 1.957, + "step": 13064 + }, + { + "epoch": 1.5243262163108156, + "grad_norm": 1.2040157318115234, + "learning_rate": 0.0002340799511834617, + "loss": 2.1641, + "step": 13065 + }, + { + "epoch": 1.5244428888111072, + "grad_norm": 1.1979506015777588, + "learning_rate": 0.00023406738425890453, + "loss": 2.0968, + "step": 13066 + }, + { + "epoch": 1.524559561311399, + "grad_norm": 1.1460098028182983, + "learning_rate": 0.00023405481647838, + "loss": 2.0414, + "step": 13067 + }, + { + "epoch": 1.5246762338116906, + "grad_norm": 1.3511759042739868, + "learning_rate": 0.00023404224784201853, + "loss": 1.968, + "step": 13068 + }, + { + "epoch": 1.5247929063119823, + "grad_norm": 1.1467887163162231, + "learning_rate": 0.0002340296783499503, + "loss": 2.0752, + "step": 13069 + }, + { + "epoch": 1.524909578812274, + "grad_norm": 1.2521865367889404, + "learning_rate": 0.00023401710800230577, + "loss": 2.0897, + "step": 13070 + }, + { + "epoch": 1.5250262513125656, + "grad_norm": 1.2386724948883057, + "learning_rate": 0.00023400453679921505, + "loss": 1.9966, + "step": 13071 + }, + { + "epoch": 1.5251429238128573, + "grad_norm": 1.0677071809768677, + "learning_rate": 0.0002339919647408086, + "loss": 1.9957, + "step": 13072 + }, + { + "epoch": 1.525259596313149, + "grad_norm": 1.3000580072402954, + "learning_rate": 0.00023397939182721677, + "loss": 2.0325, + "step": 13073 + }, + { + "epoch": 1.5253762688134407, + "grad_norm": 1.063611388206482, + "learning_rate": 0.00023396681805856978, + "loss": 2.1545, + "step": 13074 + }, + { + "epoch": 1.5254929413137324, + "grad_norm": 1.1452590227127075, + "learning_rate": 0.00023395424343499807, + "loss": 2.0788, + "step": 13075 + }, + { + "epoch": 1.525609613814024, + "grad_norm": 1.2652324438095093, + "learning_rate": 0.00023394166795663205, + "loss": 2.1871, + "step": 13076 + }, + { + "epoch": 1.5257262863143157, + "grad_norm": 1.0954158306121826, + "learning_rate": 0.00023392909162360195, + "loss": 1.9529, + "step": 13077 + }, + { + "epoch": 1.5258429588146074, + "grad_norm": 1.0868302583694458, + "learning_rate": 0.0002339165144360383, + "loss": 2.0239, + "step": 13078 + }, + { + "epoch": 1.525959631314899, + "grad_norm": 1.2381598949432373, + "learning_rate": 0.00023390393639407137, + "loss": 2.034, + "step": 13079 + }, + { + "epoch": 1.5260763038151908, + "grad_norm": 1.0760773420333862, + "learning_rate": 0.0002338913574978316, + "loss": 2.1825, + "step": 13080 + }, + { + "epoch": 1.5261929763154825, + "grad_norm": 0.9781418442726135, + "learning_rate": 0.00023387877774744937, + "loss": 1.9263, + "step": 13081 + }, + { + "epoch": 1.5263096488157741, + "grad_norm": 1.243344783782959, + "learning_rate": 0.00023386619714305514, + "loss": 1.9854, + "step": 13082 + }, + { + "epoch": 1.5264263213160658, + "grad_norm": 1.3006256818771362, + "learning_rate": 0.0002338536156847793, + "loss": 2.1121, + "step": 13083 + }, + { + "epoch": 1.5265429938163575, + "grad_norm": 1.4571163654327393, + "learning_rate": 0.00023384103337275227, + "loss": 2.0209, + "step": 13084 + }, + { + "epoch": 1.5266596663166492, + "grad_norm": 1.1940925121307373, + "learning_rate": 0.00023382845020710448, + "loss": 1.9094, + "step": 13085 + }, + { + "epoch": 1.5267763388169409, + "grad_norm": 1.1351299285888672, + "learning_rate": 0.00023381586618796642, + "loss": 1.739, + "step": 13086 + }, + { + "epoch": 1.5268930113172325, + "grad_norm": 1.2027779817581177, + "learning_rate": 0.00023380328131546854, + "loss": 2.0455, + "step": 13087 + }, + { + "epoch": 1.5270096838175242, + "grad_norm": 1.2408912181854248, + "learning_rate": 0.0002337906955897413, + "loss": 1.9949, + "step": 13088 + }, + { + "epoch": 1.527126356317816, + "grad_norm": 1.174608826637268, + "learning_rate": 0.00023377810901091517, + "loss": 2.2796, + "step": 13089 + }, + { + "epoch": 1.5272430288181076, + "grad_norm": 1.135837197303772, + "learning_rate": 0.00023376552157912061, + "loss": 1.8903, + "step": 13090 + }, + { + "epoch": 1.5273597013183993, + "grad_norm": 1.1834282875061035, + "learning_rate": 0.00023375293329448815, + "loss": 2.0844, + "step": 13091 + }, + { + "epoch": 1.527476373818691, + "grad_norm": 1.0804811716079712, + "learning_rate": 0.0002337403441571483, + "loss": 2.1634, + "step": 13092 + }, + { + "epoch": 1.5275930463189826, + "grad_norm": 1.1334598064422607, + "learning_rate": 0.00023372775416723152, + "loss": 1.9626, + "step": 13093 + }, + { + "epoch": 1.5277097188192743, + "grad_norm": 1.1830546855926514, + "learning_rate": 0.00023371516332486834, + "loss": 2.0847, + "step": 13094 + }, + { + "epoch": 1.527826391319566, + "grad_norm": 1.2807753086090088, + "learning_rate": 0.00023370257163018936, + "loss": 2.0789, + "step": 13095 + }, + { + "epoch": 1.5279430638198577, + "grad_norm": 1.2279936075210571, + "learning_rate": 0.00023368997908332503, + "loss": 2.0502, + "step": 13096 + }, + { + "epoch": 1.5280597363201494, + "grad_norm": 1.0970051288604736, + "learning_rate": 0.0002336773856844059, + "loss": 1.9907, + "step": 13097 + }, + { + "epoch": 1.528176408820441, + "grad_norm": 1.2610572576522827, + "learning_rate": 0.0002336647914335626, + "loss": 1.9863, + "step": 13098 + }, + { + "epoch": 1.5282930813207327, + "grad_norm": 1.1682698726654053, + "learning_rate": 0.00023365219633092566, + "loss": 2.0444, + "step": 13099 + }, + { + "epoch": 1.5284097538210244, + "grad_norm": 1.086746335029602, + "learning_rate": 0.00023363960037662556, + "loss": 1.9755, + "step": 13100 + }, + { + "epoch": 1.528526426321316, + "grad_norm": 1.2599200010299683, + "learning_rate": 0.00023362700357079303, + "loss": 2.1331, + "step": 13101 + }, + { + "epoch": 1.5286430988216078, + "grad_norm": 1.1894142627716064, + "learning_rate": 0.00023361440591355856, + "loss": 1.9258, + "step": 13102 + }, + { + "epoch": 1.5287597713218994, + "grad_norm": 1.2410173416137695, + "learning_rate": 0.00023360180740505276, + "loss": 2.0858, + "step": 13103 + }, + { + "epoch": 1.5288764438221911, + "grad_norm": 1.1878927946090698, + "learning_rate": 0.00023358920804540626, + "loss": 2.2124, + "step": 13104 + }, + { + "epoch": 1.5289931163224828, + "grad_norm": 1.145504355430603, + "learning_rate": 0.00023357660783474967, + "loss": 1.8318, + "step": 13105 + }, + { + "epoch": 1.5291097888227745, + "grad_norm": 1.2139042615890503, + "learning_rate": 0.00023356400677321362, + "loss": 1.8029, + "step": 13106 + }, + { + "epoch": 1.5292264613230662, + "grad_norm": 1.2522797584533691, + "learning_rate": 0.00023355140486092877, + "loss": 2.1283, + "step": 13107 + }, + { + "epoch": 1.5293431338233578, + "grad_norm": 1.336523413658142, + "learning_rate": 0.00023353880209802568, + "loss": 2.0222, + "step": 13108 + }, + { + "epoch": 1.5294598063236495, + "grad_norm": 1.2235430479049683, + "learning_rate": 0.00023352619848463507, + "loss": 2.1151, + "step": 13109 + }, + { + "epoch": 1.5295764788239412, + "grad_norm": 1.220244288444519, + "learning_rate": 0.00023351359402088765, + "loss": 2.079, + "step": 13110 + }, + { + "epoch": 1.529693151324233, + "grad_norm": 1.0825462341308594, + "learning_rate": 0.0002335009887069139, + "loss": 1.9233, + "step": 13111 + }, + { + "epoch": 1.5298098238245246, + "grad_norm": 1.2018147706985474, + "learning_rate": 0.00023348838254284474, + "loss": 2.178, + "step": 13112 + }, + { + "epoch": 1.5299264963248163, + "grad_norm": 1.2598680257797241, + "learning_rate": 0.00023347577552881068, + "loss": 1.8993, + "step": 13113 + }, + { + "epoch": 1.530043168825108, + "grad_norm": 1.15961754322052, + "learning_rate": 0.00023346316766494242, + "loss": 2.1647, + "step": 13114 + }, + { + "epoch": 1.5301598413253996, + "grad_norm": 1.1155047416687012, + "learning_rate": 0.0002334505589513708, + "loss": 1.9139, + "step": 13115 + }, + { + "epoch": 1.5302765138256913, + "grad_norm": 1.2943003177642822, + "learning_rate": 0.00023343794938822642, + "loss": 2.2549, + "step": 13116 + }, + { + "epoch": 1.530393186325983, + "grad_norm": 1.10310697555542, + "learning_rate": 0.00023342533897564003, + "loss": 2.0163, + "step": 13117 + }, + { + "epoch": 1.5305098588262747, + "grad_norm": 1.0971605777740479, + "learning_rate": 0.0002334127277137424, + "loss": 2.1378, + "step": 13118 + }, + { + "epoch": 1.5306265313265663, + "grad_norm": 1.130365252494812, + "learning_rate": 0.00023340011560266417, + "loss": 1.9691, + "step": 13119 + }, + { + "epoch": 1.530743203826858, + "grad_norm": 1.3633719682693481, + "learning_rate": 0.0002333875026425362, + "loss": 2.3804, + "step": 13120 + }, + { + "epoch": 1.5308598763271497, + "grad_norm": 1.1549872159957886, + "learning_rate": 0.00023337488883348913, + "loss": 2.0246, + "step": 13121 + }, + { + "epoch": 1.5309765488274414, + "grad_norm": 1.3120441436767578, + "learning_rate": 0.00023336227417565384, + "loss": 2.0812, + "step": 13122 + }, + { + "epoch": 1.531093221327733, + "grad_norm": 1.0309442281723022, + "learning_rate": 0.00023334965866916114, + "loss": 1.9653, + "step": 13123 + }, + { + "epoch": 1.5312098938280247, + "grad_norm": 1.3499408960342407, + "learning_rate": 0.00023333704231414163, + "loss": 2.2345, + "step": 13124 + }, + { + "epoch": 1.5313265663283164, + "grad_norm": 1.2138192653656006, + "learning_rate": 0.00023332442511072625, + "loss": 1.9399, + "step": 13125 + }, + { + "epoch": 1.5314432388286081, + "grad_norm": 1.2451978921890259, + "learning_rate": 0.0002333118070590457, + "loss": 1.8171, + "step": 13126 + }, + { + "epoch": 1.5315599113288998, + "grad_norm": 1.057404637336731, + "learning_rate": 0.0002332991881592309, + "loss": 2.0078, + "step": 13127 + }, + { + "epoch": 1.5316765838291915, + "grad_norm": 1.3011126518249512, + "learning_rate": 0.00023328656841141259, + "loss": 2.1433, + "step": 13128 + }, + { + "epoch": 1.5317932563294832, + "grad_norm": 1.0646305084228516, + "learning_rate": 0.00023327394781572162, + "loss": 2.1306, + "step": 13129 + }, + { + "epoch": 1.5319099288297748, + "grad_norm": 1.1913975477218628, + "learning_rate": 0.00023326132637228888, + "loss": 2.0593, + "step": 13130 + }, + { + "epoch": 1.5320266013300665, + "grad_norm": 1.3018972873687744, + "learning_rate": 0.0002332487040812451, + "loss": 2.1009, + "step": 13131 + }, + { + "epoch": 1.5321432738303582, + "grad_norm": 1.271836519241333, + "learning_rate": 0.00023323608094272117, + "loss": 2.219, + "step": 13132 + }, + { + "epoch": 1.5322599463306499, + "grad_norm": 1.413377285003662, + "learning_rate": 0.00023322345695684808, + "loss": 1.9359, + "step": 13133 + }, + { + "epoch": 1.5323766188309416, + "grad_norm": 1.1505906581878662, + "learning_rate": 0.00023321083212375656, + "loss": 1.9711, + "step": 13134 + }, + { + "epoch": 1.5324932913312332, + "grad_norm": 1.5394935607910156, + "learning_rate": 0.00023319820644357754, + "loss": 2.2201, + "step": 13135 + }, + { + "epoch": 1.532609963831525, + "grad_norm": 1.035797357559204, + "learning_rate": 0.0002331855799164419, + "loss": 2.1304, + "step": 13136 + }, + { + "epoch": 1.5327266363318166, + "grad_norm": 1.1869690418243408, + "learning_rate": 0.00023317295254248052, + "loss": 2.0702, + "step": 13137 + }, + { + "epoch": 1.5328433088321083, + "grad_norm": 1.1410963535308838, + "learning_rate": 0.00023316032432182433, + "loss": 2.0003, + "step": 13138 + }, + { + "epoch": 1.5329599813324, + "grad_norm": 1.3189088106155396, + "learning_rate": 0.00023314769525460428, + "loss": 2.1621, + "step": 13139 + }, + { + "epoch": 1.5330766538326916, + "grad_norm": 1.1655762195587158, + "learning_rate": 0.00023313506534095122, + "loss": 2.2171, + "step": 13140 + }, + { + "epoch": 1.5331933263329833, + "grad_norm": 1.1438829898834229, + "learning_rate": 0.00023312243458099613, + "loss": 2.1683, + "step": 13141 + }, + { + "epoch": 1.533309998833275, + "grad_norm": 1.1720399856567383, + "learning_rate": 0.0002331098029748699, + "loss": 2.129, + "step": 13142 + }, + { + "epoch": 1.5334266713335667, + "grad_norm": 1.2529675960540771, + "learning_rate": 0.00023309717052270355, + "loss": 2.0188, + "step": 13143 + }, + { + "epoch": 1.5335433438338584, + "grad_norm": 1.2438883781433105, + "learning_rate": 0.00023308453722462803, + "loss": 2.1664, + "step": 13144 + }, + { + "epoch": 1.53366001633415, + "grad_norm": 1.1644728183746338, + "learning_rate": 0.00023307190308077428, + "loss": 2.0441, + "step": 13145 + }, + { + "epoch": 1.5337766888344417, + "grad_norm": 1.0771929025650024, + "learning_rate": 0.0002330592680912733, + "loss": 2.1499, + "step": 13146 + }, + { + "epoch": 1.5338933613347334, + "grad_norm": 1.0464404821395874, + "learning_rate": 0.00023304663225625605, + "loss": 1.9981, + "step": 13147 + }, + { + "epoch": 1.534010033835025, + "grad_norm": 1.1658183336257935, + "learning_rate": 0.0002330339955758535, + "loss": 2.223, + "step": 13148 + }, + { + "epoch": 1.5341267063353168, + "grad_norm": 1.1160167455673218, + "learning_rate": 0.00023302135805019673, + "loss": 2.0348, + "step": 13149 + }, + { + "epoch": 1.5342433788356085, + "grad_norm": 1.1671092510223389, + "learning_rate": 0.00023300871967941674, + "loss": 2.1456, + "step": 13150 + }, + { + "epoch": 1.5343600513359001, + "grad_norm": 1.1424062252044678, + "learning_rate": 0.00023299608046364444, + "loss": 2.0624, + "step": 13151 + }, + { + "epoch": 1.5344767238361918, + "grad_norm": 0.9669429659843445, + "learning_rate": 0.00023298344040301102, + "loss": 1.7562, + "step": 13152 + }, + { + "epoch": 1.5345933963364835, + "grad_norm": 1.1659928560256958, + "learning_rate": 0.00023297079949764738, + "loss": 2.1523, + "step": 13153 + }, + { + "epoch": 1.5347100688367752, + "grad_norm": 1.1783385276794434, + "learning_rate": 0.00023295815774768464, + "loss": 2.0439, + "step": 13154 + }, + { + "epoch": 1.5348267413370669, + "grad_norm": 1.219567060470581, + "learning_rate": 0.00023294551515325382, + "loss": 1.9884, + "step": 13155 + }, + { + "epoch": 1.5349434138373586, + "grad_norm": 1.2532434463500977, + "learning_rate": 0.000232932871714486, + "loss": 2.1716, + "step": 13156 + }, + { + "epoch": 1.5350600863376502, + "grad_norm": 1.2820587158203125, + "learning_rate": 0.00023292022743151233, + "loss": 1.952, + "step": 13157 + }, + { + "epoch": 1.535176758837942, + "grad_norm": 1.140818476676941, + "learning_rate": 0.00023290758230446376, + "loss": 1.9441, + "step": 13158 + }, + { + "epoch": 1.5352934313382336, + "grad_norm": 1.061798334121704, + "learning_rate": 0.0002328949363334715, + "loss": 1.9486, + "step": 13159 + }, + { + "epoch": 1.5354101038385253, + "grad_norm": 1.1094067096710205, + "learning_rate": 0.00023288228951866657, + "loss": 2.1283, + "step": 13160 + }, + { + "epoch": 1.535526776338817, + "grad_norm": 1.2411473989486694, + "learning_rate": 0.00023286964186018007, + "loss": 2.1212, + "step": 13161 + }, + { + "epoch": 1.5356434488391086, + "grad_norm": 0.9880355000495911, + "learning_rate": 0.00023285699335814317, + "loss": 1.8513, + "step": 13162 + }, + { + "epoch": 1.5357601213394003, + "grad_norm": 1.1203229427337646, + "learning_rate": 0.00023284434401268694, + "loss": 2.1351, + "step": 13163 + }, + { + "epoch": 1.535876793839692, + "grad_norm": 1.309325933456421, + "learning_rate": 0.0002328316938239426, + "loss": 2.1739, + "step": 13164 + }, + { + "epoch": 1.5359934663399837, + "grad_norm": 1.1788629293441772, + "learning_rate": 0.0002328190427920412, + "loss": 2.236, + "step": 13165 + }, + { + "epoch": 1.5361101388402754, + "grad_norm": 1.3597232103347778, + "learning_rate": 0.0002328063909171139, + "loss": 2.146, + "step": 13166 + }, + { + "epoch": 1.536226811340567, + "grad_norm": 0.9503220319747925, + "learning_rate": 0.00023279373819929194, + "loss": 1.912, + "step": 13167 + }, + { + "epoch": 1.5363434838408587, + "grad_norm": 1.3223100900650024, + "learning_rate": 0.00023278108463870644, + "loss": 1.8523, + "step": 13168 + }, + { + "epoch": 1.5364601563411504, + "grad_norm": 1.2377246618270874, + "learning_rate": 0.00023276843023548854, + "loss": 2.2049, + "step": 13169 + }, + { + "epoch": 1.536576828841442, + "grad_norm": 1.2747905254364014, + "learning_rate": 0.00023275577498976948, + "loss": 1.8059, + "step": 13170 + }, + { + "epoch": 1.5366935013417338, + "grad_norm": 1.298970103263855, + "learning_rate": 0.00023274311890168048, + "loss": 2.0537, + "step": 13171 + }, + { + "epoch": 1.5368101738420255, + "grad_norm": 1.2888556718826294, + "learning_rate": 0.00023273046197135272, + "loss": 2.1728, + "step": 13172 + }, + { + "epoch": 1.5369268463423171, + "grad_norm": 1.5161023139953613, + "learning_rate": 0.00023271780419891733, + "loss": 2.0664, + "step": 13173 + }, + { + "epoch": 1.5370435188426088, + "grad_norm": 1.218732476234436, + "learning_rate": 0.00023270514558450565, + "loss": 2.1236, + "step": 13174 + }, + { + "epoch": 1.5371601913429005, + "grad_norm": 1.0431119203567505, + "learning_rate": 0.00023269248612824882, + "loss": 1.9004, + "step": 13175 + }, + { + "epoch": 1.5372768638431922, + "grad_norm": 1.2186918258666992, + "learning_rate": 0.00023267982583027814, + "loss": 2.0484, + "step": 13176 + }, + { + "epoch": 1.5373935363434839, + "grad_norm": 1.1808747053146362, + "learning_rate": 0.00023266716469072488, + "loss": 2.1763, + "step": 13177 + }, + { + "epoch": 1.5375102088437755, + "grad_norm": 0.9985096454620361, + "learning_rate": 0.00023265450270972026, + "loss": 1.76, + "step": 13178 + }, + { + "epoch": 1.5376268813440672, + "grad_norm": 0.9234044551849365, + "learning_rate": 0.00023264183988739548, + "loss": 1.8827, + "step": 13179 + }, + { + "epoch": 1.537743553844359, + "grad_norm": 1.3375003337860107, + "learning_rate": 0.00023262917622388188, + "loss": 2.2592, + "step": 13180 + }, + { + "epoch": 1.5378602263446506, + "grad_norm": 1.152559757232666, + "learning_rate": 0.00023261651171931077, + "loss": 2.2109, + "step": 13181 + }, + { + "epoch": 1.5379768988449423, + "grad_norm": 1.1741963624954224, + "learning_rate": 0.00023260384637381336, + "loss": 1.963, + "step": 13182 + }, + { + "epoch": 1.538093571345234, + "grad_norm": 1.04847252368927, + "learning_rate": 0.00023259118018752106, + "loss": 2.2874, + "step": 13183 + }, + { + "epoch": 1.5382102438455256, + "grad_norm": 1.3540983200073242, + "learning_rate": 0.00023257851316056506, + "loss": 2.2758, + "step": 13184 + }, + { + "epoch": 1.5383269163458173, + "grad_norm": 1.2103441953659058, + "learning_rate": 0.00023256584529307678, + "loss": 2.0801, + "step": 13185 + }, + { + "epoch": 1.538443588846109, + "grad_norm": 1.1420433521270752, + "learning_rate": 0.0002325531765851875, + "loss": 2.0075, + "step": 13186 + }, + { + "epoch": 1.5385602613464007, + "grad_norm": 1.0674163103103638, + "learning_rate": 0.00023254050703702857, + "loss": 2.0395, + "step": 13187 + }, + { + "epoch": 1.5386769338466924, + "grad_norm": 1.2510662078857422, + "learning_rate": 0.00023252783664873128, + "loss": 2.149, + "step": 13188 + }, + { + "epoch": 1.538793606346984, + "grad_norm": 1.1337311267852783, + "learning_rate": 0.00023251516542042706, + "loss": 2.1371, + "step": 13189 + }, + { + "epoch": 1.5389102788472757, + "grad_norm": 1.1841001510620117, + "learning_rate": 0.00023250249335224718, + "loss": 2.191, + "step": 13190 + }, + { + "epoch": 1.5390269513475674, + "grad_norm": 1.2449493408203125, + "learning_rate": 0.00023248982044432316, + "loss": 2.086, + "step": 13191 + }, + { + "epoch": 1.539143623847859, + "grad_norm": 1.1227585077285767, + "learning_rate": 0.00023247714669678618, + "loss": 2.0712, + "step": 13192 + }, + { + "epoch": 1.5392602963481508, + "grad_norm": 1.2251285314559937, + "learning_rate": 0.0002324644721097678, + "loss": 2.1544, + "step": 13193 + }, + { + "epoch": 1.5393769688484424, + "grad_norm": 1.2659657001495361, + "learning_rate": 0.00023245179668339932, + "loss": 2.2571, + "step": 13194 + }, + { + "epoch": 1.5394936413487341, + "grad_norm": 1.0960173606872559, + "learning_rate": 0.00023243912041781217, + "loss": 2.0568, + "step": 13195 + }, + { + "epoch": 1.5396103138490258, + "grad_norm": 1.1126039028167725, + "learning_rate": 0.0002324264433131378, + "loss": 2.0208, + "step": 13196 + }, + { + "epoch": 1.5397269863493175, + "grad_norm": 1.04450523853302, + "learning_rate": 0.00023241376536950763, + "loss": 1.8989, + "step": 13197 + }, + { + "epoch": 1.5398436588496092, + "grad_norm": 1.0049035549163818, + "learning_rate": 0.00023240108658705296, + "loss": 2.0648, + "step": 13198 + }, + { + "epoch": 1.5399603313499008, + "grad_norm": 1.2972919940948486, + "learning_rate": 0.00023238840696590535, + "loss": 1.9197, + "step": 13199 + }, + { + "epoch": 1.5400770038501925, + "grad_norm": 1.2808440923690796, + "learning_rate": 0.00023237572650619626, + "loss": 1.9975, + "step": 13200 + }, + { + "epoch": 1.5401936763504842, + "grad_norm": 1.2648351192474365, + "learning_rate": 0.00023236304520805716, + "loss": 2.3471, + "step": 13201 + }, + { + "epoch": 1.540310348850776, + "grad_norm": 1.1252806186676025, + "learning_rate": 0.0002323503630716194, + "loss": 2.0171, + "step": 13202 + }, + { + "epoch": 1.5404270213510676, + "grad_norm": 1.2482002973556519, + "learning_rate": 0.00023233768009701456, + "loss": 2.1658, + "step": 13203 + }, + { + "epoch": 1.5405436938513593, + "grad_norm": 1.1664553880691528, + "learning_rate": 0.00023232499628437404, + "loss": 2.1382, + "step": 13204 + }, + { + "epoch": 1.540660366351651, + "grad_norm": 1.2807326316833496, + "learning_rate": 0.0002323123116338294, + "loss": 2.1973, + "step": 13205 + }, + { + "epoch": 1.5407770388519426, + "grad_norm": 0.988340437412262, + "learning_rate": 0.00023229962614551211, + "loss": 2.0364, + "step": 13206 + }, + { + "epoch": 1.5408937113522343, + "grad_norm": 1.087856650352478, + "learning_rate": 0.0002322869398195537, + "loss": 1.9792, + "step": 13207 + }, + { + "epoch": 1.541010383852526, + "grad_norm": 1.3810983896255493, + "learning_rate": 0.00023227425265608565, + "loss": 2.0896, + "step": 13208 + }, + { + "epoch": 1.5411270563528177, + "grad_norm": 1.0744032859802246, + "learning_rate": 0.00023226156465523958, + "loss": 2.06, + "step": 13209 + }, + { + "epoch": 1.5412437288531093, + "grad_norm": 1.293162226676941, + "learning_rate": 0.00023224887581714694, + "loss": 1.9693, + "step": 13210 + }, + { + "epoch": 1.541360401353401, + "grad_norm": 1.1030397415161133, + "learning_rate": 0.00023223618614193924, + "loss": 2.0575, + "step": 13211 + }, + { + "epoch": 1.5414770738536927, + "grad_norm": 1.173793911933899, + "learning_rate": 0.00023222349562974812, + "loss": 2.2348, + "step": 13212 + }, + { + "epoch": 1.5415937463539844, + "grad_norm": 1.2849364280700684, + "learning_rate": 0.00023221080428070508, + "loss": 2.2452, + "step": 13213 + }, + { + "epoch": 1.541710418854276, + "grad_norm": 1.2545642852783203, + "learning_rate": 0.00023219811209494176, + "loss": 2.0783, + "step": 13214 + }, + { + "epoch": 1.5418270913545677, + "grad_norm": 1.260913610458374, + "learning_rate": 0.0002321854190725897, + "loss": 2.1928, + "step": 13215 + }, + { + "epoch": 1.5419437638548594, + "grad_norm": 1.1580414772033691, + "learning_rate": 0.00023217272521378047, + "loss": 1.9866, + "step": 13216 + }, + { + "epoch": 1.542060436355151, + "grad_norm": 0.9820929169654846, + "learning_rate": 0.00023216003051864563, + "loss": 2.136, + "step": 13217 + }, + { + "epoch": 1.5421771088554428, + "grad_norm": 1.111788272857666, + "learning_rate": 0.00023214733498731681, + "loss": 2.1598, + "step": 13218 + }, + { + "epoch": 1.5422937813557345, + "grad_norm": 1.2964369058609009, + "learning_rate": 0.00023213463861992576, + "loss": 2.1299, + "step": 13219 + }, + { + "epoch": 1.5424104538560262, + "grad_norm": 1.1362558603286743, + "learning_rate": 0.00023212194141660383, + "loss": 2.1504, + "step": 13220 + }, + { + "epoch": 1.5425271263563178, + "grad_norm": 1.1302732229232788, + "learning_rate": 0.00023210924337748294, + "loss": 2.0094, + "step": 13221 + }, + { + "epoch": 1.5426437988566095, + "grad_norm": 1.1458579301834106, + "learning_rate": 0.00023209654450269458, + "loss": 1.9959, + "step": 13222 + }, + { + "epoch": 1.5427604713569012, + "grad_norm": 1.3445848226547241, + "learning_rate": 0.00023208384479237037, + "loss": 2.0693, + "step": 13223 + }, + { + "epoch": 1.5428771438571929, + "grad_norm": 1.2093034982681274, + "learning_rate": 0.00023207114424664204, + "loss": 2.1264, + "step": 13224 + }, + { + "epoch": 1.5429938163574846, + "grad_norm": 1.1375242471694946, + "learning_rate": 0.0002320584428656412, + "loss": 2.1258, + "step": 13225 + }, + { + "epoch": 1.5431104888577762, + "grad_norm": 1.069517731666565, + "learning_rate": 0.00023204574064949955, + "loss": 2.0776, + "step": 13226 + }, + { + "epoch": 1.543227161358068, + "grad_norm": 1.2646023035049438, + "learning_rate": 0.00023203303759834884, + "loss": 2.0107, + "step": 13227 + }, + { + "epoch": 1.5433438338583596, + "grad_norm": 1.1973851919174194, + "learning_rate": 0.00023202033371232065, + "loss": 2.0022, + "step": 13228 + }, + { + "epoch": 1.5434605063586513, + "grad_norm": 1.3467124700546265, + "learning_rate": 0.00023200762899154668, + "loss": 2.2917, + "step": 13229 + }, + { + "epoch": 1.543577178858943, + "grad_norm": 1.1161755323410034, + "learning_rate": 0.00023199492343615872, + "loss": 2.1321, + "step": 13230 + }, + { + "epoch": 1.5436938513592346, + "grad_norm": 1.4052460193634033, + "learning_rate": 0.00023198221704628843, + "loss": 2.0649, + "step": 13231 + }, + { + "epoch": 1.5438105238595263, + "grad_norm": 1.1299610137939453, + "learning_rate": 0.00023196950982206757, + "loss": 2.1605, + "step": 13232 + }, + { + "epoch": 1.543927196359818, + "grad_norm": 1.2612159252166748, + "learning_rate": 0.0002319568017636279, + "loss": 2.0525, + "step": 13233 + }, + { + "epoch": 1.5440438688601097, + "grad_norm": 1.0444844961166382, + "learning_rate": 0.00023194409287110101, + "loss": 2.0707, + "step": 13234 + }, + { + "epoch": 1.5441605413604014, + "grad_norm": 1.2100045680999756, + "learning_rate": 0.0002319313831446189, + "loss": 2.1247, + "step": 13235 + }, + { + "epoch": 1.544277213860693, + "grad_norm": 1.0354498624801636, + "learning_rate": 0.00023191867258431307, + "loss": 2.0426, + "step": 13236 + }, + { + "epoch": 1.5443938863609847, + "grad_norm": 1.0798518657684326, + "learning_rate": 0.0002319059611903155, + "loss": 2.0555, + "step": 13237 + }, + { + "epoch": 1.5445105588612764, + "grad_norm": 1.2458240985870361, + "learning_rate": 0.00023189324896275782, + "loss": 1.9707, + "step": 13238 + }, + { + "epoch": 1.544627231361568, + "grad_norm": 1.3143126964569092, + "learning_rate": 0.00023188053590177194, + "loss": 2.1479, + "step": 13239 + }, + { + "epoch": 1.5447439038618598, + "grad_norm": 1.2501312494277954, + "learning_rate": 0.00023186782200748958, + "loss": 2.088, + "step": 13240 + }, + { + "epoch": 1.5448605763621515, + "grad_norm": 1.2401074171066284, + "learning_rate": 0.0002318551072800425, + "loss": 2.0313, + "step": 13241 + }, + { + "epoch": 1.5449772488624431, + "grad_norm": 1.140363335609436, + "learning_rate": 0.00023184239171956266, + "loss": 2.1038, + "step": 13242 + }, + { + "epoch": 1.5450939213627348, + "grad_norm": 1.1466020345687866, + "learning_rate": 0.0002318296753261817, + "loss": 2.0696, + "step": 13243 + }, + { + "epoch": 1.5452105938630265, + "grad_norm": 1.2117547988891602, + "learning_rate": 0.00023181695810003158, + "loss": 2.1282, + "step": 13244 + }, + { + "epoch": 1.5453272663633182, + "grad_norm": 1.2125059366226196, + "learning_rate": 0.0002318042400412441, + "loss": 2.1691, + "step": 13245 + }, + { + "epoch": 1.5454439388636099, + "grad_norm": 1.1657075881958008, + "learning_rate": 0.00023179152114995106, + "loss": 2.1824, + "step": 13246 + }, + { + "epoch": 1.5455606113639015, + "grad_norm": 1.1176512241363525, + "learning_rate": 0.00023177880142628441, + "loss": 1.9963, + "step": 13247 + }, + { + "epoch": 1.5456772838641932, + "grad_norm": 1.264962077140808, + "learning_rate": 0.00023176608087037596, + "loss": 2.0667, + "step": 13248 + }, + { + "epoch": 1.545793956364485, + "grad_norm": 1.25925612449646, + "learning_rate": 0.00023175335948235758, + "loss": 2.149, + "step": 13249 + }, + { + "epoch": 1.5459106288647766, + "grad_norm": 1.0945810079574585, + "learning_rate": 0.00023174063726236115, + "loss": 2.0174, + "step": 13250 + }, + { + "epoch": 1.5460273013650683, + "grad_norm": 1.1657437086105347, + "learning_rate": 0.00023172791421051855, + "loss": 2.0864, + "step": 13251 + }, + { + "epoch": 1.54614397386536, + "grad_norm": 1.072753667831421, + "learning_rate": 0.00023171519032696172, + "loss": 1.9781, + "step": 13252 + }, + { + "epoch": 1.5462606463656516, + "grad_norm": 1.049385905265808, + "learning_rate": 0.0002317024656118225, + "loss": 1.9618, + "step": 13253 + }, + { + "epoch": 1.5463773188659433, + "grad_norm": 1.1584391593933105, + "learning_rate": 0.0002316897400652329, + "loss": 2.221, + "step": 13254 + }, + { + "epoch": 1.546493991366235, + "grad_norm": 1.4805724620819092, + "learning_rate": 0.0002316770136873248, + "loss": 1.9287, + "step": 13255 + }, + { + "epoch": 1.5466106638665267, + "grad_norm": 1.056564211845398, + "learning_rate": 0.0002316642864782301, + "loss": 1.9023, + "step": 13256 + }, + { + "epoch": 1.5467273363668184, + "grad_norm": 1.0387344360351562, + "learning_rate": 0.00023165155843808075, + "loss": 2.007, + "step": 13257 + }, + { + "epoch": 1.54684400886711, + "grad_norm": 1.257959246635437, + "learning_rate": 0.00023163882956700875, + "loss": 2.0178, + "step": 13258 + }, + { + "epoch": 1.5469606813674017, + "grad_norm": 1.3668144941329956, + "learning_rate": 0.000231626099865146, + "loss": 2.038, + "step": 13259 + }, + { + "epoch": 1.5470773538676934, + "grad_norm": 1.267348051071167, + "learning_rate": 0.0002316133693326245, + "loss": 2.005, + "step": 13260 + }, + { + "epoch": 1.547194026367985, + "grad_norm": 1.128320336341858, + "learning_rate": 0.0002316006379695762, + "loss": 2.1909, + "step": 13261 + }, + { + "epoch": 1.5473106988682768, + "grad_norm": 1.1049025058746338, + "learning_rate": 0.00023158790577613314, + "loss": 1.9507, + "step": 13262 + }, + { + "epoch": 1.5474273713685684, + "grad_norm": 1.2579001188278198, + "learning_rate": 0.00023157517275242728, + "loss": 2.0086, + "step": 13263 + }, + { + "epoch": 1.5475440438688601, + "grad_norm": 1.1826058626174927, + "learning_rate": 0.0002315624388985906, + "loss": 2.1513, + "step": 13264 + }, + { + "epoch": 1.5476607163691518, + "grad_norm": 1.2446410655975342, + "learning_rate": 0.00023154970421475517, + "loss": 2.1101, + "step": 13265 + }, + { + "epoch": 1.5477773888694435, + "grad_norm": 1.3535749912261963, + "learning_rate": 0.0002315369687010529, + "loss": 2.0795, + "step": 13266 + }, + { + "epoch": 1.5478940613697352, + "grad_norm": 1.4296154975891113, + "learning_rate": 0.00023152423235761594, + "loss": 2.1933, + "step": 13267 + }, + { + "epoch": 1.5480107338700269, + "grad_norm": 1.3723424673080444, + "learning_rate": 0.00023151149518457626, + "loss": 2.0463, + "step": 13268 + }, + { + "epoch": 1.5481274063703185, + "grad_norm": 1.0854356288909912, + "learning_rate": 0.0002314987571820659, + "loss": 1.929, + "step": 13269 + }, + { + "epoch": 1.5482440788706102, + "grad_norm": 1.1235281229019165, + "learning_rate": 0.00023148601835021695, + "loss": 2.1112, + "step": 13270 + }, + { + "epoch": 1.548360751370902, + "grad_norm": 1.1345773935317993, + "learning_rate": 0.00023147327868916143, + "loss": 2.2117, + "step": 13271 + }, + { + "epoch": 1.5484774238711936, + "grad_norm": 1.1934528350830078, + "learning_rate": 0.00023146053819903144, + "loss": 2.1268, + "step": 13272 + }, + { + "epoch": 1.5485940963714853, + "grad_norm": 1.1722865104675293, + "learning_rate": 0.00023144779687995907, + "loss": 2.1993, + "step": 13273 + }, + { + "epoch": 1.548710768871777, + "grad_norm": 1.2039034366607666, + "learning_rate": 0.0002314350547320764, + "loss": 2.0824, + "step": 13274 + }, + { + "epoch": 1.5488274413720686, + "grad_norm": 0.9841431379318237, + "learning_rate": 0.0002314223117555155, + "loss": 1.9046, + "step": 13275 + }, + { + "epoch": 1.5489441138723603, + "grad_norm": 1.0721652507781982, + "learning_rate": 0.0002314095679504085, + "loss": 2.0788, + "step": 13276 + }, + { + "epoch": 1.549060786372652, + "grad_norm": 1.0611335039138794, + "learning_rate": 0.00023139682331688752, + "loss": 2.2609, + "step": 13277 + }, + { + "epoch": 1.5491774588729437, + "grad_norm": 1.1092849969863892, + "learning_rate": 0.00023138407785508464, + "loss": 2.0357, + "step": 13278 + }, + { + "epoch": 1.5492941313732354, + "grad_norm": 1.0948008298873901, + "learning_rate": 0.00023137133156513203, + "loss": 2.0161, + "step": 13279 + }, + { + "epoch": 1.549410803873527, + "grad_norm": 1.1691471338272095, + "learning_rate": 0.00023135858444716187, + "loss": 2.0856, + "step": 13280 + }, + { + "epoch": 1.5495274763738187, + "grad_norm": 1.2391453981399536, + "learning_rate": 0.00023134583650130623, + "loss": 2.172, + "step": 13281 + }, + { + "epoch": 1.5496441488741104, + "grad_norm": 1.0593668222427368, + "learning_rate": 0.00023133308772769727, + "loss": 1.9587, + "step": 13282 + }, + { + "epoch": 1.549760821374402, + "grad_norm": 1.0639528036117554, + "learning_rate": 0.00023132033812646717, + "loss": 1.9798, + "step": 13283 + }, + { + "epoch": 1.5498774938746938, + "grad_norm": 1.087868094444275, + "learning_rate": 0.00023130758769774817, + "loss": 2.0791, + "step": 13284 + }, + { + "epoch": 1.5499941663749854, + "grad_norm": 1.2503468990325928, + "learning_rate": 0.00023129483644167227, + "loss": 2.3528, + "step": 13285 + }, + { + "epoch": 1.5501108388752771, + "grad_norm": 1.1365838050842285, + "learning_rate": 0.00023128208435837193, + "loss": 1.9411, + "step": 13286 + }, + { + "epoch": 1.5502275113755688, + "grad_norm": 1.1194300651550293, + "learning_rate": 0.00023126933144797915, + "loss": 2.1803, + "step": 13287 + }, + { + "epoch": 1.5503441838758605, + "grad_norm": 1.1178423166275024, + "learning_rate": 0.00023125657771062621, + "loss": 2.2815, + "step": 13288 + }, + { + "epoch": 1.5504608563761522, + "grad_norm": 1.1453627347946167, + "learning_rate": 0.00023124382314644527, + "loss": 2.0357, + "step": 13289 + }, + { + "epoch": 1.5505775288764438, + "grad_norm": 1.2594050168991089, + "learning_rate": 0.00023123106775556858, + "loss": 2.2196, + "step": 13290 + }, + { + "epoch": 1.5506942013767355, + "grad_norm": 1.2781481742858887, + "learning_rate": 0.0002312183115381285, + "loss": 1.9972, + "step": 13291 + }, + { + "epoch": 1.5508108738770272, + "grad_norm": 1.3092042207717896, + "learning_rate": 0.00023120555449425708, + "loss": 2.0371, + "step": 13292 + }, + { + "epoch": 1.5509275463773189, + "grad_norm": 1.135198712348938, + "learning_rate": 0.0002311927966240866, + "loss": 2.0601, + "step": 13293 + }, + { + "epoch": 1.5510442188776106, + "grad_norm": 1.3276969194412231, + "learning_rate": 0.00023118003792774948, + "loss": 2.0694, + "step": 13294 + }, + { + "epoch": 1.5511608913779023, + "grad_norm": 1.1597185134887695, + "learning_rate": 0.00023116727840537783, + "loss": 2.1186, + "step": 13295 + }, + { + "epoch": 1.551277563878194, + "grad_norm": 1.1697696447372437, + "learning_rate": 0.000231154518057104, + "loss": 2.2077, + "step": 13296 + }, + { + "epoch": 1.5513942363784856, + "grad_norm": 1.1176234483718872, + "learning_rate": 0.00023114175688306022, + "loss": 2.121, + "step": 13297 + }, + { + "epoch": 1.5515109088787773, + "grad_norm": 1.120800495147705, + "learning_rate": 0.00023112899488337885, + "loss": 1.9803, + "step": 13298 + }, + { + "epoch": 1.551627581379069, + "grad_norm": 1.021613359451294, + "learning_rate": 0.00023111623205819215, + "loss": 2.1552, + "step": 13299 + }, + { + "epoch": 1.5517442538793607, + "grad_norm": 1.1238988637924194, + "learning_rate": 0.00023110346840763237, + "loss": 2.0401, + "step": 13300 + }, + { + "epoch": 1.5518609263796523, + "grad_norm": 1.0456552505493164, + "learning_rate": 0.00023109070393183197, + "loss": 1.9255, + "step": 13301 + }, + { + "epoch": 1.551977598879944, + "grad_norm": 1.1745941638946533, + "learning_rate": 0.0002310779386309232, + "loss": 1.9544, + "step": 13302 + }, + { + "epoch": 1.5520942713802357, + "grad_norm": 1.103397011756897, + "learning_rate": 0.0002310651725050384, + "loss": 2.1635, + "step": 13303 + }, + { + "epoch": 1.5522109438805274, + "grad_norm": 1.2370636463165283, + "learning_rate": 0.00023105240555430993, + "loss": 2.1843, + "step": 13304 + }, + { + "epoch": 1.552327616380819, + "grad_norm": 1.0780001878738403, + "learning_rate": 0.0002310396377788701, + "loss": 2.1272, + "step": 13305 + }, + { + "epoch": 1.5524442888811107, + "grad_norm": 1.6022652387619019, + "learning_rate": 0.00023102686917885135, + "loss": 2.1133, + "step": 13306 + }, + { + "epoch": 1.5525609613814024, + "grad_norm": 1.1226260662078857, + "learning_rate": 0.00023101409975438597, + "loss": 2.0791, + "step": 13307 + }, + { + "epoch": 1.552677633881694, + "grad_norm": 1.1058847904205322, + "learning_rate": 0.0002310013295056064, + "loss": 1.8935, + "step": 13308 + }, + { + "epoch": 1.5527943063819858, + "grad_norm": 1.234940528869629, + "learning_rate": 0.00023098855843264494, + "loss": 2.0158, + "step": 13309 + }, + { + "epoch": 1.5529109788822775, + "grad_norm": 1.1277098655700684, + "learning_rate": 0.0002309757865356341, + "loss": 2.1194, + "step": 13310 + }, + { + "epoch": 1.5530276513825692, + "grad_norm": 1.2380375862121582, + "learning_rate": 0.00023096301381470618, + "loss": 1.9583, + "step": 13311 + }, + { + "epoch": 1.5531443238828608, + "grad_norm": 1.0171680450439453, + "learning_rate": 0.00023095024026999368, + "loss": 2.0049, + "step": 13312 + }, + { + "epoch": 1.5532609963831525, + "grad_norm": 1.0794543027877808, + "learning_rate": 0.000230937465901629, + "loss": 1.805, + "step": 13313 + }, + { + "epoch": 1.5533776688834442, + "grad_norm": 1.5230013132095337, + "learning_rate": 0.00023092469070974455, + "loss": 2.1692, + "step": 13314 + }, + { + "epoch": 1.5534943413837359, + "grad_norm": 1.205031156539917, + "learning_rate": 0.00023091191469447285, + "loss": 2.0761, + "step": 13315 + }, + { + "epoch": 1.5536110138840276, + "grad_norm": 0.9825060367584229, + "learning_rate": 0.00023089913785594615, + "loss": 1.8735, + "step": 13316 + }, + { + "epoch": 1.5537276863843192, + "grad_norm": 1.3211207389831543, + "learning_rate": 0.00023088636019429713, + "loss": 2.1879, + "step": 13317 + }, + { + "epoch": 1.553844358884611, + "grad_norm": 1.0762797594070435, + "learning_rate": 0.0002308735817096581, + "loss": 1.9703, + "step": 13318 + }, + { + "epoch": 1.5539610313849026, + "grad_norm": 1.2428576946258545, + "learning_rate": 0.0002308608024021616, + "loss": 1.9423, + "step": 13319 + }, + { + "epoch": 1.5540777038851943, + "grad_norm": 1.2548072338104248, + "learning_rate": 0.0002308480222719401, + "loss": 2.1079, + "step": 13320 + }, + { + "epoch": 1.554194376385486, + "grad_norm": 1.2403154373168945, + "learning_rate": 0.00023083524131912612, + "loss": 2.1262, + "step": 13321 + }, + { + "epoch": 1.5543110488857776, + "grad_norm": 1.146108627319336, + "learning_rate": 0.0002308224595438521, + "loss": 1.9748, + "step": 13322 + }, + { + "epoch": 1.5544277213860693, + "grad_norm": 1.1245266199111938, + "learning_rate": 0.00023080967694625058, + "loss": 1.9921, + "step": 13323 + }, + { + "epoch": 1.554544393886361, + "grad_norm": 1.1529309749603271, + "learning_rate": 0.00023079689352645407, + "loss": 2.2134, + "step": 13324 + }, + { + "epoch": 1.5546610663866527, + "grad_norm": 1.3625712394714355, + "learning_rate": 0.00023078410928459512, + "loss": 2.098, + "step": 13325 + }, + { + "epoch": 1.5547777388869444, + "grad_norm": 1.243772029876709, + "learning_rate": 0.0002307713242208062, + "loss": 2.1031, + "step": 13326 + }, + { + "epoch": 1.554894411387236, + "grad_norm": 1.2037875652313232, + "learning_rate": 0.0002307585383352199, + "loss": 2.0622, + "step": 13327 + }, + { + "epoch": 1.5550110838875277, + "grad_norm": 1.2865142822265625, + "learning_rate": 0.00023074575162796882, + "loss": 1.8981, + "step": 13328 + }, + { + "epoch": 1.5551277563878194, + "grad_norm": 1.1241439580917358, + "learning_rate": 0.00023073296409918537, + "loss": 2.1059, + "step": 13329 + }, + { + "epoch": 1.555244428888111, + "grad_norm": 1.198325276374817, + "learning_rate": 0.0002307201757490023, + "loss": 2.0721, + "step": 13330 + }, + { + "epoch": 1.5553611013884028, + "grad_norm": 1.1101295948028564, + "learning_rate": 0.00023070738657755203, + "loss": 2.1569, + "step": 13331 + }, + { + "epoch": 1.5554777738886945, + "grad_norm": 1.147587776184082, + "learning_rate": 0.0002306945965849672, + "loss": 2.0551, + "step": 13332 + }, + { + "epoch": 1.5555944463889861, + "grad_norm": 1.12086820602417, + "learning_rate": 0.00023068180577138052, + "loss": 2.0305, + "step": 13333 + }, + { + "epoch": 1.5557111188892778, + "grad_norm": 1.0736194849014282, + "learning_rate": 0.00023066901413692435, + "loss": 2.2259, + "step": 13334 + }, + { + "epoch": 1.5558277913895695, + "grad_norm": 0.9597567319869995, + "learning_rate": 0.0002306562216817315, + "loss": 1.916, + "step": 13335 + }, + { + "epoch": 1.5559444638898612, + "grad_norm": 1.122233510017395, + "learning_rate": 0.00023064342840593447, + "loss": 2.1933, + "step": 13336 + }, + { + "epoch": 1.5560611363901529, + "grad_norm": 1.0465831756591797, + "learning_rate": 0.00023063063430966594, + "loss": 2.0809, + "step": 13337 + }, + { + "epoch": 1.5561778088904445, + "grad_norm": 0.9977936148643494, + "learning_rate": 0.00023061783939305852, + "loss": 2.0773, + "step": 13338 + }, + { + "epoch": 1.5562944813907362, + "grad_norm": 1.0386260747909546, + "learning_rate": 0.00023060504365624495, + "loss": 1.9985, + "step": 13339 + }, + { + "epoch": 1.556411153891028, + "grad_norm": 1.1024655103683472, + "learning_rate": 0.00023059224709935774, + "loss": 2.083, + "step": 13340 + }, + { + "epoch": 1.5565278263913196, + "grad_norm": 1.1287287473678589, + "learning_rate": 0.00023057944972252969, + "loss": 2.1187, + "step": 13341 + }, + { + "epoch": 1.5566444988916113, + "grad_norm": 1.0441163778305054, + "learning_rate": 0.0002305666515258933, + "loss": 2.2299, + "step": 13342 + }, + { + "epoch": 1.556761171391903, + "grad_norm": 1.1626378297805786, + "learning_rate": 0.0002305538525095814, + "loss": 2.1315, + "step": 13343 + }, + { + "epoch": 1.5568778438921946, + "grad_norm": 1.1932830810546875, + "learning_rate": 0.00023054105267372661, + "loss": 2.1214, + "step": 13344 + }, + { + "epoch": 1.5569945163924863, + "grad_norm": 1.1270732879638672, + "learning_rate": 0.00023052825201846163, + "loss": 2.2273, + "step": 13345 + }, + { + "epoch": 1.557111188892778, + "grad_norm": 1.074304461479187, + "learning_rate": 0.0002305154505439192, + "loss": 2.1032, + "step": 13346 + }, + { + "epoch": 1.5572278613930697, + "grad_norm": 1.170267939567566, + "learning_rate": 0.0002305026482502319, + "loss": 2.2107, + "step": 13347 + }, + { + "epoch": 1.5573445338933614, + "grad_norm": 1.0970464944839478, + "learning_rate": 0.00023048984513753264, + "loss": 2.1715, + "step": 13348 + }, + { + "epoch": 1.557461206393653, + "grad_norm": 1.1712549924850464, + "learning_rate": 0.00023047704120595397, + "loss": 2.0363, + "step": 13349 + }, + { + "epoch": 1.5575778788939447, + "grad_norm": 1.1583144664764404, + "learning_rate": 0.00023046423645562878, + "loss": 2.088, + "step": 13350 + }, + { + "epoch": 1.5576945513942364, + "grad_norm": 1.1876240968704224, + "learning_rate": 0.00023045143088668972, + "loss": 1.9828, + "step": 13351 + }, + { + "epoch": 1.557811223894528, + "grad_norm": 1.0263029336929321, + "learning_rate": 0.0002304386244992696, + "loss": 2.1495, + "step": 13352 + }, + { + "epoch": 1.5579278963948198, + "grad_norm": 0.9529525637626648, + "learning_rate": 0.00023042581729350114, + "loss": 2.0529, + "step": 13353 + }, + { + "epoch": 1.5580445688951114, + "grad_norm": 1.414016604423523, + "learning_rate": 0.0002304130092695171, + "loss": 2.1074, + "step": 13354 + }, + { + "epoch": 1.5581612413954031, + "grad_norm": 1.0545109510421753, + "learning_rate": 0.00023040020042745034, + "loss": 2.0542, + "step": 13355 + }, + { + "epoch": 1.5582779138956948, + "grad_norm": 1.1284219026565552, + "learning_rate": 0.00023038739076743355, + "loss": 2.0846, + "step": 13356 + }, + { + "epoch": 1.5583945863959865, + "grad_norm": 1.1516755819320679, + "learning_rate": 0.0002303745802895996, + "loss": 2.0796, + "step": 13357 + }, + { + "epoch": 1.5585112588962782, + "grad_norm": 1.3671478033065796, + "learning_rate": 0.00023036176899408125, + "loss": 2.2304, + "step": 13358 + }, + { + "epoch": 1.5586279313965699, + "grad_norm": 0.9977447986602783, + "learning_rate": 0.00023034895688101134, + "loss": 1.9425, + "step": 13359 + }, + { + "epoch": 1.5587446038968615, + "grad_norm": 1.2354743480682373, + "learning_rate": 0.00023033614395052266, + "loss": 1.9771, + "step": 13360 + }, + { + "epoch": 1.5588612763971532, + "grad_norm": 1.295853853225708, + "learning_rate": 0.00023032333020274804, + "loss": 2.1057, + "step": 13361 + }, + { + "epoch": 1.558977948897445, + "grad_norm": 1.2731212377548218, + "learning_rate": 0.00023031051563782035, + "loss": 2.1497, + "step": 13362 + }, + { + "epoch": 1.5590946213977366, + "grad_norm": 1.1852822303771973, + "learning_rate": 0.00023029770025587246, + "loss": 2.0341, + "step": 13363 + }, + { + "epoch": 1.5592112938980283, + "grad_norm": 1.049129843711853, + "learning_rate": 0.00023028488405703714, + "loss": 1.9422, + "step": 13364 + }, + { + "epoch": 1.55932796639832, + "grad_norm": 1.3707131147384644, + "learning_rate": 0.0002302720670414473, + "loss": 2.425, + "step": 13365 + }, + { + "epoch": 1.5594446388986116, + "grad_norm": 0.9550833702087402, + "learning_rate": 0.00023025924920923587, + "loss": 1.9954, + "step": 13366 + }, + { + "epoch": 1.5595613113989033, + "grad_norm": 1.129913568496704, + "learning_rate": 0.0002302464305605357, + "loss": 1.9729, + "step": 13367 + }, + { + "epoch": 1.559677983899195, + "grad_norm": 1.2125072479248047, + "learning_rate": 0.0002302336110954796, + "loss": 2.1186, + "step": 13368 + }, + { + "epoch": 1.5597946563994867, + "grad_norm": 1.0708637237548828, + "learning_rate": 0.00023022079081420053, + "loss": 2.0901, + "step": 13369 + }, + { + "epoch": 1.5599113288997783, + "grad_norm": 1.371662974357605, + "learning_rate": 0.0002302079697168314, + "loss": 2.354, + "step": 13370 + }, + { + "epoch": 1.56002800140007, + "grad_norm": 1.0536017417907715, + "learning_rate": 0.00023019514780350515, + "loss": 2.0336, + "step": 13371 + }, + { + "epoch": 1.5601446739003617, + "grad_norm": 1.3543975353240967, + "learning_rate": 0.00023018232507435467, + "loss": 2.2819, + "step": 13372 + }, + { + "epoch": 1.5602613464006534, + "grad_norm": 0.9604460000991821, + "learning_rate": 0.0002301695015295128, + "loss": 1.8923, + "step": 13373 + }, + { + "epoch": 1.560378018900945, + "grad_norm": 1.0343074798583984, + "learning_rate": 0.0002301566771691127, + "loss": 1.9149, + "step": 13374 + }, + { + "epoch": 1.5604946914012368, + "grad_norm": 1.1279908418655396, + "learning_rate": 0.0002301438519932871, + "loss": 2.1271, + "step": 13375 + }, + { + "epoch": 1.5606113639015284, + "grad_norm": 1.1021244525909424, + "learning_rate": 0.00023013102600216915, + "loss": 2.125, + "step": 13376 + }, + { + "epoch": 1.5607280364018201, + "grad_norm": 1.1278446912765503, + "learning_rate": 0.00023011819919589163, + "loss": 2.1811, + "step": 13377 + }, + { + "epoch": 1.5608447089021118, + "grad_norm": 1.2496778964996338, + "learning_rate": 0.0002301053715745876, + "loss": 1.9052, + "step": 13378 + }, + { + "epoch": 1.5609613814024035, + "grad_norm": 1.2291423082351685, + "learning_rate": 0.00023009254313839002, + "loss": 2.0931, + "step": 13379 + }, + { + "epoch": 1.5610780539026952, + "grad_norm": 1.2582403421401978, + "learning_rate": 0.00023007971388743195, + "loss": 2.1697, + "step": 13380 + }, + { + "epoch": 1.5611947264029868, + "grad_norm": 1.4063307046890259, + "learning_rate": 0.00023006688382184637, + "loss": 2.0336, + "step": 13381 + }, + { + "epoch": 1.5613113989032785, + "grad_norm": 1.2462897300720215, + "learning_rate": 0.00023005405294176622, + "loss": 2.1155, + "step": 13382 + }, + { + "epoch": 1.5614280714035702, + "grad_norm": 1.2087260484695435, + "learning_rate": 0.00023004122124732458, + "loss": 2.0459, + "step": 13383 + }, + { + "epoch": 1.5615447439038619, + "grad_norm": 1.1606472730636597, + "learning_rate": 0.00023002838873865445, + "loss": 2.1496, + "step": 13384 + }, + { + "epoch": 1.5616614164041536, + "grad_norm": 1.1496655941009521, + "learning_rate": 0.00023001555541588884, + "loss": 2.0836, + "step": 13385 + }, + { + "epoch": 1.5617780889044452, + "grad_norm": 1.1263474225997925, + "learning_rate": 0.00023000272127916084, + "loss": 1.9588, + "step": 13386 + }, + { + "epoch": 1.561894761404737, + "grad_norm": 1.2795060873031616, + "learning_rate": 0.00022998988632860347, + "loss": 2.264, + "step": 13387 + }, + { + "epoch": 1.5620114339050286, + "grad_norm": 1.226662039756775, + "learning_rate": 0.00022997705056434974, + "loss": 2.0435, + "step": 13388 + }, + { + "epoch": 1.5621281064053203, + "grad_norm": 1.3880060911178589, + "learning_rate": 0.00022996421398653288, + "loss": 2.1629, + "step": 13389 + }, + { + "epoch": 1.562244778905612, + "grad_norm": 1.1331042051315308, + "learning_rate": 0.00022995137659528577, + "loss": 2.0343, + "step": 13390 + }, + { + "epoch": 1.5623614514059037, + "grad_norm": 1.0987465381622314, + "learning_rate": 0.0002299385383907416, + "loss": 2.0465, + "step": 13391 + }, + { + "epoch": 1.5624781239061953, + "grad_norm": 1.1696553230285645, + "learning_rate": 0.0002299256993730335, + "loss": 2.1524, + "step": 13392 + }, + { + "epoch": 1.562594796406487, + "grad_norm": 1.1549561023712158, + "learning_rate": 0.00022991285954229447, + "loss": 1.9586, + "step": 13393 + }, + { + "epoch": 1.5627114689067787, + "grad_norm": 1.2116341590881348, + "learning_rate": 0.00022990001889865765, + "loss": 1.9587, + "step": 13394 + }, + { + "epoch": 1.5628281414070704, + "grad_norm": 1.0496554374694824, + "learning_rate": 0.00022988717744225616, + "loss": 2.0476, + "step": 13395 + }, + { + "epoch": 1.562944813907362, + "grad_norm": 1.091362714767456, + "learning_rate": 0.00022987433517322322, + "loss": 2.2148, + "step": 13396 + }, + { + "epoch": 1.5630614864076537, + "grad_norm": 1.1421879529953003, + "learning_rate": 0.00022986149209169182, + "loss": 2.0856, + "step": 13397 + }, + { + "epoch": 1.5631781589079454, + "grad_norm": 1.0762863159179688, + "learning_rate": 0.00022984864819779516, + "loss": 2.2912, + "step": 13398 + }, + { + "epoch": 1.563294831408237, + "grad_norm": 1.305516004562378, + "learning_rate": 0.00022983580349166641, + "loss": 2.1373, + "step": 13399 + }, + { + "epoch": 1.5634115039085288, + "grad_norm": 1.3262925148010254, + "learning_rate": 0.00022982295797343874, + "loss": 2.1228, + "step": 13400 + }, + { + "epoch": 1.5635281764088205, + "grad_norm": 1.020814061164856, + "learning_rate": 0.00022981011164324524, + "loss": 2.0541, + "step": 13401 + }, + { + "epoch": 1.5636448489091122, + "grad_norm": 0.9821900129318237, + "learning_rate": 0.0002297972645012192, + "loss": 1.8401, + "step": 13402 + }, + { + "epoch": 1.5637615214094038, + "grad_norm": 1.0030781030654907, + "learning_rate": 0.00022978441654749368, + "loss": 1.902, + "step": 13403 + }, + { + "epoch": 1.5638781939096955, + "grad_norm": 1.124819040298462, + "learning_rate": 0.00022977156778220198, + "loss": 2.0449, + "step": 13404 + }, + { + "epoch": 1.5639948664099872, + "grad_norm": 1.1816661357879639, + "learning_rate": 0.00022975871820547726, + "loss": 2.1462, + "step": 13405 + }, + { + "epoch": 1.5641115389102789, + "grad_norm": 1.2485809326171875, + "learning_rate": 0.00022974586781745272, + "loss": 2.1841, + "step": 13406 + }, + { + "epoch": 1.5642282114105706, + "grad_norm": 1.1315689086914062, + "learning_rate": 0.00022973301661826157, + "loss": 2.0937, + "step": 13407 + }, + { + "epoch": 1.5643448839108622, + "grad_norm": 1.1673362255096436, + "learning_rate": 0.0002297201646080371, + "loss": 2.0773, + "step": 13408 + }, + { + "epoch": 1.564461556411154, + "grad_norm": 1.2378909587860107, + "learning_rate": 0.00022970731178691245, + "loss": 2.2078, + "step": 13409 + }, + { + "epoch": 1.5645782289114456, + "grad_norm": 1.1223927736282349, + "learning_rate": 0.00022969445815502102, + "loss": 2.0483, + "step": 13410 + }, + { + "epoch": 1.5646949014117373, + "grad_norm": 1.2724865674972534, + "learning_rate": 0.00022968160371249587, + "loss": 2.1605, + "step": 13411 + }, + { + "epoch": 1.564811573912029, + "grad_norm": 1.087273359298706, + "learning_rate": 0.00022966874845947038, + "loss": 2.0664, + "step": 13412 + }, + { + "epoch": 1.5649282464123206, + "grad_norm": 1.105924129486084, + "learning_rate": 0.00022965589239607774, + "loss": 2.1023, + "step": 13413 + }, + { + "epoch": 1.5650449189126123, + "grad_norm": 1.2668616771697998, + "learning_rate": 0.00022964303552245132, + "loss": 2.2316, + "step": 13414 + }, + { + "epoch": 1.565161591412904, + "grad_norm": 1.1688458919525146, + "learning_rate": 0.00022963017783872436, + "loss": 2.0772, + "step": 13415 + }, + { + "epoch": 1.5652782639131957, + "grad_norm": 1.2570167779922485, + "learning_rate": 0.0002296173193450301, + "loss": 2.2415, + "step": 13416 + }, + { + "epoch": 1.5653949364134874, + "grad_norm": 1.3013386726379395, + "learning_rate": 0.00022960446004150191, + "loss": 2.1241, + "step": 13417 + }, + { + "epoch": 1.565511608913779, + "grad_norm": 1.2937148809432983, + "learning_rate": 0.00022959159992827317, + "loss": 2.0617, + "step": 13418 + }, + { + "epoch": 1.5656282814140707, + "grad_norm": 1.309726357460022, + "learning_rate": 0.00022957873900547707, + "loss": 2.0861, + "step": 13419 + }, + { + "epoch": 1.5657449539143624, + "grad_norm": 1.286783218383789, + "learning_rate": 0.00022956587727324695, + "loss": 2.2025, + "step": 13420 + }, + { + "epoch": 1.565861626414654, + "grad_norm": 1.0420693159103394, + "learning_rate": 0.00022955301473171624, + "loss": 1.8495, + "step": 13421 + }, + { + "epoch": 1.5659782989149458, + "grad_norm": 1.3487178087234497, + "learning_rate": 0.0002295401513810182, + "loss": 2.2191, + "step": 13422 + }, + { + "epoch": 1.5660949714152375, + "grad_norm": 1.1860467195510864, + "learning_rate": 0.0002295272872212862, + "loss": 2.0669, + "step": 13423 + }, + { + "epoch": 1.5662116439155291, + "grad_norm": 1.2094494104385376, + "learning_rate": 0.00022951442225265363, + "loss": 2.032, + "step": 13424 + }, + { + "epoch": 1.5663283164158208, + "grad_norm": 1.242326259613037, + "learning_rate": 0.00022950155647525383, + "loss": 2.1922, + "step": 13425 + }, + { + "epoch": 1.5664449889161125, + "grad_norm": 1.327633023262024, + "learning_rate": 0.0002294886898892202, + "loss": 2.1118, + "step": 13426 + }, + { + "epoch": 1.5665616614164042, + "grad_norm": 1.1791545152664185, + "learning_rate": 0.0002294758224946861, + "loss": 2.1145, + "step": 13427 + }, + { + "epoch": 1.5666783339166959, + "grad_norm": 1.1714327335357666, + "learning_rate": 0.000229462954291785, + "loss": 2.0482, + "step": 13428 + }, + { + "epoch": 1.5667950064169875, + "grad_norm": 1.2639451026916504, + "learning_rate": 0.00022945008528065016, + "loss": 2.0019, + "step": 13429 + }, + { + "epoch": 1.5669116789172792, + "grad_norm": 1.204681634902954, + "learning_rate": 0.00022943721546141513, + "loss": 2.1637, + "step": 13430 + }, + { + "epoch": 1.567028351417571, + "grad_norm": 1.1779264211654663, + "learning_rate": 0.00022942434483421323, + "loss": 2.1009, + "step": 13431 + }, + { + "epoch": 1.5671450239178626, + "grad_norm": 1.12730073928833, + "learning_rate": 0.00022941147339917796, + "loss": 2.0588, + "step": 13432 + }, + { + "epoch": 1.5672616964181543, + "grad_norm": 1.2069686651229858, + "learning_rate": 0.00022939860115644273, + "loss": 1.9733, + "step": 13433 + }, + { + "epoch": 1.567378368918446, + "grad_norm": 1.1808967590332031, + "learning_rate": 0.00022938572810614097, + "loss": 2.1768, + "step": 13434 + }, + { + "epoch": 1.5674950414187376, + "grad_norm": 1.132504940032959, + "learning_rate": 0.00022937285424840624, + "loss": 2.069, + "step": 13435 + }, + { + "epoch": 1.5676117139190293, + "grad_norm": 1.109128713607788, + "learning_rate": 0.0002293599795833718, + "loss": 2.1092, + "step": 13436 + }, + { + "epoch": 1.567728386419321, + "grad_norm": 1.0463751554489136, + "learning_rate": 0.00022934710411117126, + "loss": 2.0321, + "step": 13437 + }, + { + "epoch": 1.5678450589196127, + "grad_norm": 1.2445220947265625, + "learning_rate": 0.00022933422783193809, + "loss": 2.2324, + "step": 13438 + }, + { + "epoch": 1.5679617314199044, + "grad_norm": 1.296747088432312, + "learning_rate": 0.00022932135074580574, + "loss": 2.1267, + "step": 13439 + }, + { + "epoch": 1.568078403920196, + "grad_norm": 1.2924120426177979, + "learning_rate": 0.00022930847285290772, + "loss": 2.012, + "step": 13440 + }, + { + "epoch": 1.5681950764204877, + "grad_norm": 1.1297248601913452, + "learning_rate": 0.00022929559415337756, + "loss": 2.1038, + "step": 13441 + }, + { + "epoch": 1.5683117489207794, + "grad_norm": 1.3327947854995728, + "learning_rate": 0.0002292827146473487, + "loss": 2.0909, + "step": 13442 + }, + { + "epoch": 1.568428421421071, + "grad_norm": 1.167776107788086, + "learning_rate": 0.00022926983433495477, + "loss": 2.049, + "step": 13443 + }, + { + "epoch": 1.5685450939213628, + "grad_norm": 1.0431252717971802, + "learning_rate": 0.00022925695321632922, + "loss": 2.1687, + "step": 13444 + }, + { + "epoch": 1.5686617664216544, + "grad_norm": 1.182384729385376, + "learning_rate": 0.00022924407129160557, + "loss": 2.1145, + "step": 13445 + }, + { + "epoch": 1.5687784389219461, + "grad_norm": 1.0643140077590942, + "learning_rate": 0.00022923118856091746, + "loss": 2.0859, + "step": 13446 + }, + { + "epoch": 1.5688951114222378, + "grad_norm": 1.0010334253311157, + "learning_rate": 0.00022921830502439834, + "loss": 2.0703, + "step": 13447 + }, + { + "epoch": 1.5690117839225295, + "grad_norm": 1.0995497703552246, + "learning_rate": 0.0002292054206821818, + "loss": 2.1959, + "step": 13448 + }, + { + "epoch": 1.5691284564228212, + "grad_norm": 1.1808325052261353, + "learning_rate": 0.00022919253553440146, + "loss": 1.9874, + "step": 13449 + }, + { + "epoch": 1.5692451289231129, + "grad_norm": 1.1568734645843506, + "learning_rate": 0.00022917964958119085, + "loss": 2.1247, + "step": 13450 + }, + { + "epoch": 1.5693618014234045, + "grad_norm": 1.046219825744629, + "learning_rate": 0.00022916676282268362, + "loss": 2.0411, + "step": 13451 + }, + { + "epoch": 1.5694784739236962, + "grad_norm": 1.2450885772705078, + "learning_rate": 0.00022915387525901325, + "loss": 2.0567, + "step": 13452 + }, + { + "epoch": 1.569595146423988, + "grad_norm": 1.0490963459014893, + "learning_rate": 0.0002291409868903135, + "loss": 2.1, + "step": 13453 + }, + { + "epoch": 1.5697118189242796, + "grad_norm": 1.2995277643203735, + "learning_rate": 0.00022912809771671782, + "loss": 2.0644, + "step": 13454 + }, + { + "epoch": 1.5698284914245713, + "grad_norm": 1.1268465518951416, + "learning_rate": 0.00022911520773835994, + "loss": 2.0209, + "step": 13455 + }, + { + "epoch": 1.569945163924863, + "grad_norm": 1.1673415899276733, + "learning_rate": 0.0002291023169553734, + "loss": 2.0523, + "step": 13456 + }, + { + "epoch": 1.5700618364251546, + "grad_norm": 1.322920560836792, + "learning_rate": 0.00022908942536789195, + "loss": 2.2651, + "step": 13457 + }, + { + "epoch": 1.5701785089254463, + "grad_norm": 1.3587650060653687, + "learning_rate": 0.00022907653297604916, + "loss": 2.2054, + "step": 13458 + }, + { + "epoch": 1.570295181425738, + "grad_norm": 1.1924524307250977, + "learning_rate": 0.00022906363977997872, + "loss": 2.2427, + "step": 13459 + }, + { + "epoch": 1.5704118539260297, + "grad_norm": 1.3220155239105225, + "learning_rate": 0.00022905074577981427, + "loss": 2.2732, + "step": 13460 + }, + { + "epoch": 1.5705285264263213, + "grad_norm": 1.0873875617980957, + "learning_rate": 0.0002290378509756895, + "loss": 2.0165, + "step": 13461 + }, + { + "epoch": 1.570645198926613, + "grad_norm": 1.4953601360321045, + "learning_rate": 0.00022902495536773803, + "loss": 2.1569, + "step": 13462 + }, + { + "epoch": 1.5707618714269047, + "grad_norm": 1.0872788429260254, + "learning_rate": 0.00022901205895609362, + "loss": 2.1025, + "step": 13463 + }, + { + "epoch": 1.5708785439271964, + "grad_norm": 1.083024263381958, + "learning_rate": 0.0002289991617408899, + "loss": 2.0767, + "step": 13464 + }, + { + "epoch": 1.570995216427488, + "grad_norm": 1.0301854610443115, + "learning_rate": 0.00022898626372226067, + "loss": 1.8707, + "step": 13465 + }, + { + "epoch": 1.5711118889277798, + "grad_norm": 1.1140564680099487, + "learning_rate": 0.00022897336490033956, + "loss": 2.0029, + "step": 13466 + }, + { + "epoch": 1.5712285614280714, + "grad_norm": 1.1632726192474365, + "learning_rate": 0.00022896046527526033, + "loss": 2.205, + "step": 13467 + }, + { + "epoch": 1.5713452339283631, + "grad_norm": 1.1331766843795776, + "learning_rate": 0.00022894756484715664, + "loss": 2.0818, + "step": 13468 + }, + { + "epoch": 1.5714619064286548, + "grad_norm": 1.0720757246017456, + "learning_rate": 0.00022893466361616238, + "loss": 2.0556, + "step": 13469 + }, + { + "epoch": 1.5715785789289465, + "grad_norm": 1.0597057342529297, + "learning_rate": 0.00022892176158241108, + "loss": 1.9984, + "step": 13470 + }, + { + "epoch": 1.5716952514292382, + "grad_norm": 1.2296123504638672, + "learning_rate": 0.00022890885874603668, + "loss": 2.1766, + "step": 13471 + }, + { + "epoch": 1.5718119239295298, + "grad_norm": 1.2005834579467773, + "learning_rate": 0.00022889595510717286, + "loss": 2.027, + "step": 13472 + }, + { + "epoch": 1.5719285964298215, + "grad_norm": 1.1753062009811401, + "learning_rate": 0.0002288830506659534, + "loss": 2.0769, + "step": 13473 + }, + { + "epoch": 1.5720452689301132, + "grad_norm": 1.1645435094833374, + "learning_rate": 0.00022887014542251208, + "loss": 2.1037, + "step": 13474 + }, + { + "epoch": 1.5721619414304049, + "grad_norm": 1.1210596561431885, + "learning_rate": 0.0002288572393769827, + "loss": 2.0511, + "step": 13475 + }, + { + "epoch": 1.5722786139306966, + "grad_norm": 1.1128283739089966, + "learning_rate": 0.00022884433252949905, + "loss": 1.8911, + "step": 13476 + }, + { + "epoch": 1.5723952864309882, + "grad_norm": 1.1936277151107788, + "learning_rate": 0.00022883142488019492, + "loss": 1.957, + "step": 13477 + }, + { + "epoch": 1.57251195893128, + "grad_norm": 1.0591800212860107, + "learning_rate": 0.0002288185164292042, + "loss": 2.0217, + "step": 13478 + }, + { + "epoch": 1.5726286314315716, + "grad_norm": 1.057716965675354, + "learning_rate": 0.00022880560717666057, + "loss": 2.0576, + "step": 13479 + }, + { + "epoch": 1.5727453039318633, + "grad_norm": 1.2900030612945557, + "learning_rate": 0.0002287926971226979, + "loss": 2.1443, + "step": 13480 + }, + { + "epoch": 1.572861976432155, + "grad_norm": 1.103912353515625, + "learning_rate": 0.0002287797862674501, + "loss": 2.1966, + "step": 13481 + }, + { + "epoch": 1.5729786489324467, + "grad_norm": 1.016985535621643, + "learning_rate": 0.00022876687461105097, + "loss": 1.9505, + "step": 13482 + }, + { + "epoch": 1.5730953214327383, + "grad_norm": 1.3388588428497314, + "learning_rate": 0.00022875396215363435, + "loss": 2.1281, + "step": 13483 + }, + { + "epoch": 1.57321199393303, + "grad_norm": 1.2615941762924194, + "learning_rate": 0.00022874104889533415, + "loss": 2.2296, + "step": 13484 + }, + { + "epoch": 1.5733286664333217, + "grad_norm": 1.1594491004943848, + "learning_rate": 0.00022872813483628424, + "loss": 2.1338, + "step": 13485 + }, + { + "epoch": 1.5734453389336134, + "grad_norm": 1.1526130437850952, + "learning_rate": 0.0002287152199766184, + "loss": 2.1459, + "step": 13486 + }, + { + "epoch": 1.573562011433905, + "grad_norm": 1.2146421670913696, + "learning_rate": 0.00022870230431647063, + "loss": 2.0755, + "step": 13487 + }, + { + "epoch": 1.5736786839341967, + "grad_norm": 0.9489529132843018, + "learning_rate": 0.00022868938785597476, + "loss": 1.9529, + "step": 13488 + }, + { + "epoch": 1.5737953564344884, + "grad_norm": 1.2020164728164673, + "learning_rate": 0.00022867647059526475, + "loss": 2.0463, + "step": 13489 + }, + { + "epoch": 1.57391202893478, + "grad_norm": 1.0836386680603027, + "learning_rate": 0.00022866355253447443, + "loss": 2.1983, + "step": 13490 + }, + { + "epoch": 1.5740287014350718, + "grad_norm": 1.0542430877685547, + "learning_rate": 0.0002286506336737378, + "loss": 2.1356, + "step": 13491 + }, + { + "epoch": 1.5741453739353635, + "grad_norm": 1.2394956350326538, + "learning_rate": 0.00022863771401318873, + "loss": 2.2152, + "step": 13492 + }, + { + "epoch": 1.5742620464356551, + "grad_norm": 0.9818066954612732, + "learning_rate": 0.0002286247935529612, + "loss": 2.0083, + "step": 13493 + }, + { + "epoch": 1.5743787189359468, + "grad_norm": 1.3350563049316406, + "learning_rate": 0.0002286118722931891, + "loss": 2.2722, + "step": 13494 + }, + { + "epoch": 1.5744953914362385, + "grad_norm": 1.1064155101776123, + "learning_rate": 0.00022859895023400644, + "loss": 2.0686, + "step": 13495 + }, + { + "epoch": 1.5746120639365302, + "grad_norm": 1.0719013214111328, + "learning_rate": 0.0002285860273755472, + "loss": 1.9143, + "step": 13496 + }, + { + "epoch": 1.5747287364368219, + "grad_norm": 1.1504487991333008, + "learning_rate": 0.00022857310371794527, + "loss": 2.1353, + "step": 13497 + }, + { + "epoch": 1.5748454089371136, + "grad_norm": 1.1535379886627197, + "learning_rate": 0.0002285601792613347, + "loss": 2.207, + "step": 13498 + }, + { + "epoch": 1.5749620814374052, + "grad_norm": 1.3701684474945068, + "learning_rate": 0.00022854725400584943, + "loss": 2.1941, + "step": 13499 + }, + { + "epoch": 1.575078753937697, + "grad_norm": 1.0829027891159058, + "learning_rate": 0.00022853432795162348, + "loss": 1.8939, + "step": 13500 + }, + { + "epoch": 1.5751954264379886, + "grad_norm": 1.148366928100586, + "learning_rate": 0.00022852140109879085, + "loss": 2.1091, + "step": 13501 + }, + { + "epoch": 1.5753120989382803, + "grad_norm": 1.00237238407135, + "learning_rate": 0.0002285084734474855, + "loss": 1.9714, + "step": 13502 + }, + { + "epoch": 1.575428771438572, + "grad_norm": 1.2966424226760864, + "learning_rate": 0.0002284955449978415, + "loss": 2.1976, + "step": 13503 + }, + { + "epoch": 1.5755454439388636, + "grad_norm": 1.01051664352417, + "learning_rate": 0.00022848261574999294, + "loss": 2.1897, + "step": 13504 + }, + { + "epoch": 1.5756621164391553, + "grad_norm": 1.0733990669250488, + "learning_rate": 0.0002284696857040738, + "loss": 2.1512, + "step": 13505 + }, + { + "epoch": 1.575778788939447, + "grad_norm": 1.1977627277374268, + "learning_rate": 0.00022845675486021804, + "loss": 2.0108, + "step": 13506 + }, + { + "epoch": 1.5758954614397387, + "grad_norm": 1.2297358512878418, + "learning_rate": 0.00022844382321855978, + "loss": 2.1087, + "step": 13507 + }, + { + "epoch": 1.5760121339400304, + "grad_norm": 1.1232203245162964, + "learning_rate": 0.00022843089077923307, + "loss": 2.0488, + "step": 13508 + }, + { + "epoch": 1.576128806440322, + "grad_norm": 1.0266238451004028, + "learning_rate": 0.00022841795754237207, + "loss": 2.0117, + "step": 13509 + }, + { + "epoch": 1.5762454789406137, + "grad_norm": 1.1519190073013306, + "learning_rate": 0.0002284050235081108, + "loss": 2.0287, + "step": 13510 + }, + { + "epoch": 1.5763621514409054, + "grad_norm": 1.2618159055709839, + "learning_rate": 0.00022839208867658323, + "loss": 2.169, + "step": 13511 + }, + { + "epoch": 1.576478823941197, + "grad_norm": 1.3238557577133179, + "learning_rate": 0.0002283791530479236, + "loss": 2.0509, + "step": 13512 + }, + { + "epoch": 1.5765954964414888, + "grad_norm": 1.0669410228729248, + "learning_rate": 0.00022836621662226605, + "loss": 2.0738, + "step": 13513 + }, + { + "epoch": 1.5767121689417805, + "grad_norm": 1.1582345962524414, + "learning_rate": 0.00022835327939974454, + "loss": 1.8909, + "step": 13514 + }, + { + "epoch": 1.5768288414420721, + "grad_norm": 1.174597978591919, + "learning_rate": 0.0002283403413804932, + "loss": 2.1173, + "step": 13515 + }, + { + "epoch": 1.5769455139423638, + "grad_norm": 1.2034674882888794, + "learning_rate": 0.00022832740256464632, + "loss": 2.1213, + "step": 13516 + }, + { + "epoch": 1.5770621864426555, + "grad_norm": 1.3380248546600342, + "learning_rate": 0.00022831446295233784, + "loss": 2.0062, + "step": 13517 + }, + { + "epoch": 1.5771788589429472, + "grad_norm": 1.2956427335739136, + "learning_rate": 0.00022830152254370203, + "loss": 2.1978, + "step": 13518 + }, + { + "epoch": 1.5772955314432389, + "grad_norm": 1.2483148574829102, + "learning_rate": 0.00022828858133887298, + "loss": 2.1626, + "step": 13519 + }, + { + "epoch": 1.5774122039435305, + "grad_norm": 1.221229076385498, + "learning_rate": 0.00022827563933798488, + "loss": 2.1344, + "step": 13520 + }, + { + "epoch": 1.5775288764438222, + "grad_norm": 1.018725872039795, + "learning_rate": 0.00022826269654117187, + "loss": 2.1119, + "step": 13521 + }, + { + "epoch": 1.577645548944114, + "grad_norm": 1.144673228263855, + "learning_rate": 0.00022824975294856822, + "loss": 2.1344, + "step": 13522 + }, + { + "epoch": 1.5777622214444056, + "grad_norm": 1.1770128011703491, + "learning_rate": 0.00022823680856030797, + "loss": 2.1696, + "step": 13523 + }, + { + "epoch": 1.5778788939446973, + "grad_norm": 1.1670141220092773, + "learning_rate": 0.00022822386337652544, + "loss": 1.9552, + "step": 13524 + }, + { + "epoch": 1.577995566444989, + "grad_norm": 1.0901728868484497, + "learning_rate": 0.00022821091739735474, + "loss": 1.8726, + "step": 13525 + }, + { + "epoch": 1.5781122389452806, + "grad_norm": 1.2214150428771973, + "learning_rate": 0.00022819797062293017, + "loss": 2.0851, + "step": 13526 + }, + { + "epoch": 1.5782289114455723, + "grad_norm": 1.1550633907318115, + "learning_rate": 0.00022818502305338583, + "loss": 2.1749, + "step": 13527 + }, + { + "epoch": 1.578345583945864, + "grad_norm": 1.1234077215194702, + "learning_rate": 0.00022817207468885605, + "loss": 2.1066, + "step": 13528 + }, + { + "epoch": 1.5784622564461557, + "grad_norm": 1.263655185699463, + "learning_rate": 0.00022815912552947497, + "loss": 2.1674, + "step": 13529 + }, + { + "epoch": 1.5785789289464474, + "grad_norm": 1.099238634109497, + "learning_rate": 0.000228146175575377, + "loss": 1.8652, + "step": 13530 + }, + { + "epoch": 1.578695601446739, + "grad_norm": 1.1330897808074951, + "learning_rate": 0.00022813322482669618, + "loss": 2.0017, + "step": 13531 + }, + { + "epoch": 1.5788122739470307, + "grad_norm": 1.3812793493270874, + "learning_rate": 0.00022812027328356687, + "loss": 2.1218, + "step": 13532 + }, + { + "epoch": 1.5789289464473224, + "grad_norm": 1.188448429107666, + "learning_rate": 0.0002281073209461234, + "loss": 2.0208, + "step": 13533 + }, + { + "epoch": 1.579045618947614, + "grad_norm": 1.2062541246414185, + "learning_rate": 0.00022809436781449992, + "loss": 2.0913, + "step": 13534 + }, + { + "epoch": 1.5791622914479058, + "grad_norm": 1.2592922449111938, + "learning_rate": 0.0002280814138888308, + "loss": 2.1032, + "step": 13535 + }, + { + "epoch": 1.5792789639481974, + "grad_norm": 1.1675125360488892, + "learning_rate": 0.00022806845916925028, + "loss": 2.1751, + "step": 13536 + }, + { + "epoch": 1.5793956364484891, + "grad_norm": 1.272215723991394, + "learning_rate": 0.0002280555036558927, + "loss": 2.2327, + "step": 13537 + }, + { + "epoch": 1.5795123089487808, + "grad_norm": 1.389318585395813, + "learning_rate": 0.00022804254734889244, + "loss": 2.129, + "step": 13538 + }, + { + "epoch": 1.5796289814490725, + "grad_norm": 1.1317106485366821, + "learning_rate": 0.0002280295902483836, + "loss": 2.0966, + "step": 13539 + }, + { + "epoch": 1.5797456539493642, + "grad_norm": 1.35916268825531, + "learning_rate": 0.0002280166323545007, + "loss": 2.1366, + "step": 13540 + }, + { + "epoch": 1.5798623264496559, + "grad_norm": 1.1414401531219482, + "learning_rate": 0.00022800367366737802, + "loss": 1.9239, + "step": 13541 + }, + { + "epoch": 1.5799789989499475, + "grad_norm": 1.0983840227127075, + "learning_rate": 0.00022799071418714987, + "loss": 2.1904, + "step": 13542 + }, + { + "epoch": 1.5800956714502392, + "grad_norm": 1.1991056203842163, + "learning_rate": 0.0002279777539139506, + "loss": 2.1437, + "step": 13543 + }, + { + "epoch": 1.580212343950531, + "grad_norm": 1.2071497440338135, + "learning_rate": 0.00022796479284791464, + "loss": 2.1931, + "step": 13544 + }, + { + "epoch": 1.5803290164508226, + "grad_norm": 1.005617618560791, + "learning_rate": 0.00022795183098917626, + "loss": 2.1274, + "step": 13545 + }, + { + "epoch": 1.5804456889511143, + "grad_norm": 1.1050949096679688, + "learning_rate": 0.00022793886833786988, + "loss": 2.2601, + "step": 13546 + }, + { + "epoch": 1.580562361451406, + "grad_norm": 1.2491176128387451, + "learning_rate": 0.0002279259048941299, + "loss": 2.0, + "step": 13547 + }, + { + "epoch": 1.5806790339516976, + "grad_norm": 1.130761981010437, + "learning_rate": 0.00022791294065809074, + "loss": 2.1263, + "step": 13548 + }, + { + "epoch": 1.5807957064519893, + "grad_norm": 1.2058637142181396, + "learning_rate": 0.00022789997562988666, + "loss": 1.8912, + "step": 13549 + }, + { + "epoch": 1.580912378952281, + "grad_norm": 1.3233689069747925, + "learning_rate": 0.0002278870098096522, + "loss": 1.989, + "step": 13550 + }, + { + "epoch": 1.5810290514525727, + "grad_norm": 1.209342360496521, + "learning_rate": 0.00022787404319752168, + "loss": 2.0083, + "step": 13551 + }, + { + "epoch": 1.5811457239528643, + "grad_norm": 1.2089861631393433, + "learning_rate": 0.00022786107579362964, + "loss": 2.0213, + "step": 13552 + }, + { + "epoch": 1.581262396453156, + "grad_norm": 1.1936360597610474, + "learning_rate": 0.00022784810759811044, + "loss": 1.9834, + "step": 13553 + }, + { + "epoch": 1.5813790689534477, + "grad_norm": 1.0460307598114014, + "learning_rate": 0.00022783513861109853, + "loss": 2.1772, + "step": 13554 + }, + { + "epoch": 1.5814957414537394, + "grad_norm": 1.3776090145111084, + "learning_rate": 0.00022782216883272834, + "loss": 2.2126, + "step": 13555 + }, + { + "epoch": 1.581612413954031, + "grad_norm": 1.0639158487319946, + "learning_rate": 0.00022780919826313439, + "loss": 2.1803, + "step": 13556 + }, + { + "epoch": 1.5817290864543228, + "grad_norm": 1.1430234909057617, + "learning_rate": 0.00022779622690245104, + "loss": 2.1994, + "step": 13557 + }, + { + "epoch": 1.5818457589546144, + "grad_norm": 1.0026212930679321, + "learning_rate": 0.00022778325475081285, + "loss": 2.0363, + "step": 13558 + }, + { + "epoch": 1.5819624314549061, + "grad_norm": 1.1159716844558716, + "learning_rate": 0.00022777028180835424, + "loss": 2.1574, + "step": 13559 + }, + { + "epoch": 1.5820791039551978, + "grad_norm": 1.0266162157058716, + "learning_rate": 0.0002277573080752097, + "loss": 2.1036, + "step": 13560 + }, + { + "epoch": 1.5821957764554895, + "grad_norm": 1.1930320262908936, + "learning_rate": 0.00022774433355151382, + "loss": 1.8985, + "step": 13561 + }, + { + "epoch": 1.5823124489557812, + "grad_norm": 1.1767041683197021, + "learning_rate": 0.00022773135823740104, + "loss": 2.1492, + "step": 13562 + }, + { + "epoch": 1.5824291214560728, + "grad_norm": 1.381470799446106, + "learning_rate": 0.0002277183821330058, + "loss": 1.8334, + "step": 13563 + }, + { + "epoch": 1.5825457939563645, + "grad_norm": 1.177081823348999, + "learning_rate": 0.0002277054052384628, + "loss": 1.9599, + "step": 13564 + }, + { + "epoch": 1.5826624664566562, + "grad_norm": 1.1763814687728882, + "learning_rate": 0.0002276924275539064, + "loss": 1.9614, + "step": 13565 + }, + { + "epoch": 1.5827791389569479, + "grad_norm": 1.095568299293518, + "learning_rate": 0.00022767944907947125, + "loss": 2.0471, + "step": 13566 + }, + { + "epoch": 1.5828958114572396, + "grad_norm": 1.040682315826416, + "learning_rate": 0.00022766646981529183, + "loss": 1.9218, + "step": 13567 + }, + { + "epoch": 1.5830124839575312, + "grad_norm": 1.0691739320755005, + "learning_rate": 0.00022765348976150273, + "loss": 2.0338, + "step": 13568 + }, + { + "epoch": 1.583129156457823, + "grad_norm": 1.0334151983261108, + "learning_rate": 0.00022764050891823848, + "loss": 1.9764, + "step": 13569 + }, + { + "epoch": 1.5832458289581146, + "grad_norm": 1.0719292163848877, + "learning_rate": 0.0002276275272856337, + "loss": 2.0794, + "step": 13570 + }, + { + "epoch": 1.5833625014584063, + "grad_norm": 1.2448115348815918, + "learning_rate": 0.00022761454486382288, + "loss": 1.9724, + "step": 13571 + }, + { + "epoch": 1.583479173958698, + "grad_norm": 1.1156812906265259, + "learning_rate": 0.00022760156165294075, + "loss": 2.0516, + "step": 13572 + }, + { + "epoch": 1.5835958464589897, + "grad_norm": 1.1169289350509644, + "learning_rate": 0.0002275885776531218, + "loss": 2.1332, + "step": 13573 + }, + { + "epoch": 1.5837125189592813, + "grad_norm": 1.1927493810653687, + "learning_rate": 0.0002275755928645006, + "loss": 1.9196, + "step": 13574 + }, + { + "epoch": 1.583829191459573, + "grad_norm": 1.2453454732894897, + "learning_rate": 0.00022756260728721187, + "loss": 1.9654, + "step": 13575 + }, + { + "epoch": 1.5839458639598647, + "grad_norm": 1.0537351369857788, + "learning_rate": 0.00022754962092139016, + "loss": 2.0503, + "step": 13576 + }, + { + "epoch": 1.5840625364601564, + "grad_norm": 1.4503189325332642, + "learning_rate": 0.00022753663376717012, + "loss": 2.1281, + "step": 13577 + }, + { + "epoch": 1.584179208960448, + "grad_norm": 1.1290569305419922, + "learning_rate": 0.00022752364582468638, + "loss": 1.9234, + "step": 13578 + }, + { + "epoch": 1.5842958814607397, + "grad_norm": 1.2328603267669678, + "learning_rate": 0.00022751065709407362, + "loss": 2.1859, + "step": 13579 + }, + { + "epoch": 1.5844125539610314, + "grad_norm": 1.0857384204864502, + "learning_rate": 0.00022749766757546643, + "loss": 2.059, + "step": 13580 + }, + { + "epoch": 1.584529226461323, + "grad_norm": 1.1763395071029663, + "learning_rate": 0.00022748467726899953, + "loss": 1.9577, + "step": 13581 + }, + { + "epoch": 1.5846458989616148, + "grad_norm": 1.0788416862487793, + "learning_rate": 0.00022747168617480752, + "loss": 1.9081, + "step": 13582 + }, + { + "epoch": 1.5847625714619065, + "grad_norm": 1.3357945680618286, + "learning_rate": 0.00022745869429302513, + "loss": 2.2155, + "step": 13583 + }, + { + "epoch": 1.5848792439621981, + "grad_norm": 1.3481539487838745, + "learning_rate": 0.0002274457016237871, + "loss": 2.2278, + "step": 13584 + }, + { + "epoch": 1.5849959164624898, + "grad_norm": 1.0888416767120361, + "learning_rate": 0.00022743270816722794, + "loss": 1.9768, + "step": 13585 + }, + { + "epoch": 1.5851125889627815, + "grad_norm": 1.1805107593536377, + "learning_rate": 0.00022741971392348253, + "loss": 2.1084, + "step": 13586 + }, + { + "epoch": 1.5852292614630732, + "grad_norm": 1.3585083484649658, + "learning_rate": 0.0002274067188926855, + "loss": 2.0338, + "step": 13587 + }, + { + "epoch": 1.5853459339633649, + "grad_norm": 1.302901268005371, + "learning_rate": 0.00022739372307497164, + "loss": 2.2934, + "step": 13588 + }, + { + "epoch": 1.5854626064636566, + "grad_norm": 1.2026329040527344, + "learning_rate": 0.00022738072647047558, + "loss": 1.9972, + "step": 13589 + }, + { + "epoch": 1.5855792789639482, + "grad_norm": 1.1975821256637573, + "learning_rate": 0.00022736772907933216, + "loss": 2.1717, + "step": 13590 + }, + { + "epoch": 1.58569595146424, + "grad_norm": 1.3014694452285767, + "learning_rate": 0.000227354730901676, + "loss": 1.981, + "step": 13591 + }, + { + "epoch": 1.5858126239645316, + "grad_norm": 1.1732192039489746, + "learning_rate": 0.00022734173193764192, + "loss": 2.1182, + "step": 13592 + }, + { + "epoch": 1.5859292964648233, + "grad_norm": 1.1367543935775757, + "learning_rate": 0.00022732873218736464, + "loss": 2.1183, + "step": 13593 + }, + { + "epoch": 1.586045968965115, + "grad_norm": 1.07009756565094, + "learning_rate": 0.00022731573165097902, + "loss": 1.9458, + "step": 13594 + }, + { + "epoch": 1.5861626414654066, + "grad_norm": 1.3106787204742432, + "learning_rate": 0.00022730273032861976, + "loss": 2.2404, + "step": 13595 + }, + { + "epoch": 1.5862793139656983, + "grad_norm": 1.199149489402771, + "learning_rate": 0.00022728972822042164, + "loss": 2.0909, + "step": 13596 + }, + { + "epoch": 1.58639598646599, + "grad_norm": 1.1283856630325317, + "learning_rate": 0.00022727672532651944, + "loss": 2.1457, + "step": 13597 + }, + { + "epoch": 1.5865126589662817, + "grad_norm": 1.235194206237793, + "learning_rate": 0.000227263721647048, + "loss": 1.9522, + "step": 13598 + }, + { + "epoch": 1.5866293314665734, + "grad_norm": 1.1861900091171265, + "learning_rate": 0.00022725071718214217, + "loss": 2.0496, + "step": 13599 + }, + { + "epoch": 1.586746003966865, + "grad_norm": 1.2907394170761108, + "learning_rate": 0.00022723771193193664, + "loss": 2.1574, + "step": 13600 + }, + { + "epoch": 1.5868626764671567, + "grad_norm": 1.0556248426437378, + "learning_rate": 0.00022722470589656632, + "loss": 2.0701, + "step": 13601 + }, + { + "epoch": 1.5869793489674484, + "grad_norm": 1.1167962551116943, + "learning_rate": 0.00022721169907616607, + "loss": 1.996, + "step": 13602 + }, + { + "epoch": 1.58709602146774, + "grad_norm": 1.2702727317810059, + "learning_rate": 0.00022719869147087065, + "loss": 2.1524, + "step": 13603 + }, + { + "epoch": 1.5872126939680318, + "grad_norm": 1.0726345777511597, + "learning_rate": 0.00022718568308081495, + "loss": 2.0598, + "step": 13604 + }, + { + "epoch": 1.5873293664683235, + "grad_norm": 1.037846565246582, + "learning_rate": 0.00022717267390613384, + "loss": 2.0872, + "step": 13605 + }, + { + "epoch": 1.5874460389686151, + "grad_norm": 1.0492204427719116, + "learning_rate": 0.00022715966394696213, + "loss": 2.0884, + "step": 13606 + }, + { + "epoch": 1.5875627114689068, + "grad_norm": 1.2677110433578491, + "learning_rate": 0.00022714665320343482, + "loss": 2.0998, + "step": 13607 + }, + { + "epoch": 1.5876793839691985, + "grad_norm": 1.3831253051757812, + "learning_rate": 0.00022713364167568662, + "loss": 2.2115, + "step": 13608 + }, + { + "epoch": 1.5877960564694902, + "grad_norm": 1.1726621389389038, + "learning_rate": 0.00022712062936385252, + "loss": 2.1269, + "step": 13609 + }, + { + "epoch": 1.5879127289697819, + "grad_norm": 1.0830495357513428, + "learning_rate": 0.00022710761626806736, + "loss": 1.9349, + "step": 13610 + }, + { + "epoch": 1.5880294014700735, + "grad_norm": 1.1568045616149902, + "learning_rate": 0.00022709460238846613, + "loss": 2.0783, + "step": 13611 + }, + { + "epoch": 1.5881460739703652, + "grad_norm": 1.3783222436904907, + "learning_rate": 0.00022708158772518364, + "loss": 2.2511, + "step": 13612 + }, + { + "epoch": 1.588262746470657, + "grad_norm": 1.2020189762115479, + "learning_rate": 0.00022706857227835491, + "loss": 2.1226, + "step": 13613 + }, + { + "epoch": 1.5883794189709486, + "grad_norm": 1.1254866123199463, + "learning_rate": 0.0002270555560481148, + "loss": 2.0775, + "step": 13614 + }, + { + "epoch": 1.5884960914712403, + "grad_norm": 1.1458406448364258, + "learning_rate": 0.00022704253903459837, + "loss": 1.94, + "step": 13615 + }, + { + "epoch": 1.588612763971532, + "grad_norm": 1.143085241317749, + "learning_rate": 0.00022702952123794038, + "loss": 2.097, + "step": 13616 + }, + { + "epoch": 1.5887294364718236, + "grad_norm": 1.1154224872589111, + "learning_rate": 0.0002270165026582759, + "loss": 2.018, + "step": 13617 + }, + { + "epoch": 1.5888461089721153, + "grad_norm": 1.0869311094284058, + "learning_rate": 0.00022700348329573986, + "loss": 1.9909, + "step": 13618 + }, + { + "epoch": 1.588962781472407, + "grad_norm": 1.229114055633545, + "learning_rate": 0.00022699046315046727, + "loss": 2.1512, + "step": 13619 + }, + { + "epoch": 1.5890794539726987, + "grad_norm": 1.1444129943847656, + "learning_rate": 0.00022697744222259302, + "loss": 2.0619, + "step": 13620 + }, + { + "epoch": 1.5891961264729904, + "grad_norm": 0.9617262482643127, + "learning_rate": 0.00022696442051225218, + "loss": 1.9297, + "step": 13621 + }, + { + "epoch": 1.589312798973282, + "grad_norm": 1.1164007186889648, + "learning_rate": 0.00022695139801957973, + "loss": 2.059, + "step": 13622 + }, + { + "epoch": 1.5894294714735737, + "grad_norm": 1.1139909029006958, + "learning_rate": 0.00022693837474471067, + "loss": 1.9707, + "step": 13623 + }, + { + "epoch": 1.5895461439738654, + "grad_norm": 1.1980599164962769, + "learning_rate": 0.00022692535068777998, + "loss": 2.3151, + "step": 13624 + }, + { + "epoch": 1.589662816474157, + "grad_norm": 1.1821744441986084, + "learning_rate": 0.0002269123258489227, + "loss": 2.2548, + "step": 13625 + }, + { + "epoch": 1.5897794889744488, + "grad_norm": 1.1790282726287842, + "learning_rate": 0.00022689930022827385, + "loss": 2.0801, + "step": 13626 + }, + { + "epoch": 1.5898961614747404, + "grad_norm": 1.1877310276031494, + "learning_rate": 0.00022688627382596847, + "loss": 2.1513, + "step": 13627 + }, + { + "epoch": 1.5900128339750321, + "grad_norm": 1.0725955963134766, + "learning_rate": 0.00022687324664214163, + "loss": 2.0013, + "step": 13628 + }, + { + "epoch": 1.5901295064753238, + "grad_norm": 1.3151535987854004, + "learning_rate": 0.00022686021867692835, + "loss": 1.9971, + "step": 13629 + }, + { + "epoch": 1.5902461789756155, + "grad_norm": 1.0767830610275269, + "learning_rate": 0.00022684718993046368, + "loss": 2.058, + "step": 13630 + }, + { + "epoch": 1.5903628514759072, + "grad_norm": 1.014422059059143, + "learning_rate": 0.0002268341604028827, + "loss": 1.9721, + "step": 13631 + }, + { + "epoch": 1.5904795239761989, + "grad_norm": 0.975307047367096, + "learning_rate": 0.00022682113009432056, + "loss": 1.8841, + "step": 13632 + }, + { + "epoch": 1.5905961964764905, + "grad_norm": 1.3167035579681396, + "learning_rate": 0.00022680809900491223, + "loss": 2.1457, + "step": 13633 + }, + { + "epoch": 1.5907128689767822, + "grad_norm": 1.1515474319458008, + "learning_rate": 0.00022679506713479283, + "loss": 1.9718, + "step": 13634 + }, + { + "epoch": 1.590829541477074, + "grad_norm": 1.2839804887771606, + "learning_rate": 0.0002267820344840975, + "loss": 2.231, + "step": 13635 + }, + { + "epoch": 1.5909462139773656, + "grad_norm": 1.1612355709075928, + "learning_rate": 0.00022676900105296133, + "loss": 2.145, + "step": 13636 + }, + { + "epoch": 1.5910628864776573, + "grad_norm": 1.3549487590789795, + "learning_rate": 0.0002267559668415194, + "loss": 2.1362, + "step": 13637 + }, + { + "epoch": 1.591179558977949, + "grad_norm": 1.0762420892715454, + "learning_rate": 0.00022674293184990683, + "loss": 1.9235, + "step": 13638 + }, + { + "epoch": 1.5912962314782406, + "grad_norm": 1.280908465385437, + "learning_rate": 0.00022672989607825885, + "loss": 2.1839, + "step": 13639 + }, + { + "epoch": 1.5914129039785323, + "grad_norm": 1.3027713298797607, + "learning_rate": 0.00022671685952671054, + "loss": 2.0798, + "step": 13640 + }, + { + "epoch": 1.591529576478824, + "grad_norm": 1.0600476264953613, + "learning_rate": 0.000226703822195397, + "loss": 2.1821, + "step": 13641 + }, + { + "epoch": 1.5916462489791157, + "grad_norm": 1.2894448041915894, + "learning_rate": 0.0002266907840844535, + "loss": 2.2655, + "step": 13642 + }, + { + "epoch": 1.5917629214794073, + "grad_norm": 1.043695092201233, + "learning_rate": 0.00022667774519401508, + "loss": 2.0213, + "step": 13643 + }, + { + "epoch": 1.591879593979699, + "grad_norm": 1.1455872058868408, + "learning_rate": 0.00022666470552421697, + "loss": 2.0493, + "step": 13644 + }, + { + "epoch": 1.5919962664799907, + "grad_norm": 1.0108240842819214, + "learning_rate": 0.00022665166507519438, + "loss": 2.0181, + "step": 13645 + }, + { + "epoch": 1.5921129389802824, + "grad_norm": 1.2099502086639404, + "learning_rate": 0.00022663862384708245, + "loss": 1.9946, + "step": 13646 + }, + { + "epoch": 1.592229611480574, + "grad_norm": 1.1924943923950195, + "learning_rate": 0.00022662558184001643, + "loss": 2.1316, + "step": 13647 + }, + { + "epoch": 1.5923462839808658, + "grad_norm": 1.303093671798706, + "learning_rate": 0.00022661253905413144, + "loss": 1.9852, + "step": 13648 + }, + { + "epoch": 1.5924629564811574, + "grad_norm": 1.3279978036880493, + "learning_rate": 0.0002265994954895628, + "loss": 2.0168, + "step": 13649 + }, + { + "epoch": 1.5925796289814491, + "grad_norm": 1.2780884504318237, + "learning_rate": 0.00022658645114644564, + "loss": 2.2695, + "step": 13650 + }, + { + "epoch": 1.5926963014817408, + "grad_norm": 1.163678526878357, + "learning_rate": 0.00022657340602491523, + "loss": 2.1326, + "step": 13651 + }, + { + "epoch": 1.5928129739820325, + "grad_norm": 1.3308745622634888, + "learning_rate": 0.00022656036012510684, + "loss": 2.0388, + "step": 13652 + }, + { + "epoch": 1.5929296464823242, + "grad_norm": 1.3094605207443237, + "learning_rate": 0.0002265473134471556, + "loss": 2.2467, + "step": 13653 + }, + { + "epoch": 1.5930463189826158, + "grad_norm": 1.0988359451293945, + "learning_rate": 0.00022653426599119692, + "loss": 2.1321, + "step": 13654 + }, + { + "epoch": 1.5931629914829075, + "grad_norm": 1.1476331949234009, + "learning_rate": 0.00022652121775736597, + "loss": 2.1773, + "step": 13655 + }, + { + "epoch": 1.5932796639831992, + "grad_norm": 1.0293960571289062, + "learning_rate": 0.000226508168745798, + "loss": 1.9242, + "step": 13656 + }, + { + "epoch": 1.5933963364834909, + "grad_norm": 1.1097968816757202, + "learning_rate": 0.00022649511895662833, + "loss": 2.1378, + "step": 13657 + }, + { + "epoch": 1.5935130089837826, + "grad_norm": 1.0779147148132324, + "learning_rate": 0.00022648206838999225, + "loss": 1.9855, + "step": 13658 + }, + { + "epoch": 1.5936296814840742, + "grad_norm": 1.1523511409759521, + "learning_rate": 0.0002264690170460251, + "loss": 2.196, + "step": 13659 + }, + { + "epoch": 1.593746353984366, + "grad_norm": 1.0792505741119385, + "learning_rate": 0.00022645596492486204, + "loss": 1.9975, + "step": 13660 + }, + { + "epoch": 1.5938630264846576, + "grad_norm": 1.0716476440429688, + "learning_rate": 0.00022644291202663847, + "loss": 2.0025, + "step": 13661 + }, + { + "epoch": 1.5939796989849493, + "grad_norm": 1.3262568712234497, + "learning_rate": 0.00022642985835148974, + "loss": 2.1275, + "step": 13662 + }, + { + "epoch": 1.594096371485241, + "grad_norm": 1.1073403358459473, + "learning_rate": 0.00022641680389955113, + "loss": 2.0469, + "step": 13663 + }, + { + "epoch": 1.5942130439855327, + "grad_norm": 1.106185793876648, + "learning_rate": 0.0002264037486709579, + "loss": 1.9414, + "step": 13664 + }, + { + "epoch": 1.5943297164858243, + "grad_norm": 1.1211755275726318, + "learning_rate": 0.00022639069266584553, + "loss": 2.098, + "step": 13665 + }, + { + "epoch": 1.594446388986116, + "grad_norm": 1.271363615989685, + "learning_rate": 0.00022637763588434934, + "loss": 2.0401, + "step": 13666 + }, + { + "epoch": 1.5945630614864077, + "grad_norm": 1.1321165561676025, + "learning_rate": 0.00022636457832660465, + "loss": 2.1986, + "step": 13667 + }, + { + "epoch": 1.5946797339866994, + "grad_norm": 1.180892825126648, + "learning_rate": 0.00022635151999274687, + "loss": 1.9375, + "step": 13668 + }, + { + "epoch": 1.594796406486991, + "grad_norm": 1.1765353679656982, + "learning_rate": 0.0002263384608829113, + "loss": 2.1708, + "step": 13669 + }, + { + "epoch": 1.5949130789872827, + "grad_norm": 1.2567099332809448, + "learning_rate": 0.00022632540099723334, + "loss": 2.0748, + "step": 13670 + }, + { + "epoch": 1.5950297514875744, + "grad_norm": 1.253240704536438, + "learning_rate": 0.00022631234033584845, + "loss": 2.0908, + "step": 13671 + }, + { + "epoch": 1.595146423987866, + "grad_norm": 1.1831547021865845, + "learning_rate": 0.00022629927889889195, + "loss": 2.067, + "step": 13672 + }, + { + "epoch": 1.5952630964881578, + "grad_norm": 1.1319597959518433, + "learning_rate": 0.00022628621668649928, + "loss": 2.216, + "step": 13673 + }, + { + "epoch": 1.5953797689884495, + "grad_norm": 1.1538599729537964, + "learning_rate": 0.0002262731536988059, + "loss": 2.2506, + "step": 13674 + }, + { + "epoch": 1.5954964414887411, + "grad_norm": 0.990452766418457, + "learning_rate": 0.00022626008993594716, + "loss": 2.0377, + "step": 13675 + }, + { + "epoch": 1.5956131139890328, + "grad_norm": 1.2697652578353882, + "learning_rate": 0.00022624702539805847, + "loss": 2.1249, + "step": 13676 + }, + { + "epoch": 1.5957297864893245, + "grad_norm": 1.1770051717758179, + "learning_rate": 0.00022623396008527541, + "loss": 2.0613, + "step": 13677 + }, + { + "epoch": 1.5958464589896162, + "grad_norm": 0.9679543972015381, + "learning_rate": 0.00022622089399773328, + "loss": 1.9387, + "step": 13678 + }, + { + "epoch": 1.5959631314899079, + "grad_norm": 1.3358983993530273, + "learning_rate": 0.0002262078271355676, + "loss": 2.2285, + "step": 13679 + }, + { + "epoch": 1.5960798039901996, + "grad_norm": 1.111818552017212, + "learning_rate": 0.0002261947594989138, + "loss": 2.0453, + "step": 13680 + }, + { + "epoch": 1.5961964764904912, + "grad_norm": 1.2739818096160889, + "learning_rate": 0.00022618169108790735, + "loss": 2.1547, + "step": 13681 + }, + { + "epoch": 1.596313148990783, + "grad_norm": 1.2665839195251465, + "learning_rate": 0.00022616862190268382, + "loss": 2.1931, + "step": 13682 + }, + { + "epoch": 1.5964298214910746, + "grad_norm": 1.2274060249328613, + "learning_rate": 0.00022615555194337858, + "loss": 2.0103, + "step": 13683 + }, + { + "epoch": 1.5965464939913663, + "grad_norm": 1.0986601114273071, + "learning_rate": 0.00022614248121012717, + "loss": 2.0057, + "step": 13684 + }, + { + "epoch": 1.596663166491658, + "grad_norm": 1.176774501800537, + "learning_rate": 0.00022612940970306516, + "loss": 2.1337, + "step": 13685 + }, + { + "epoch": 1.5967798389919496, + "grad_norm": 1.0868947505950928, + "learning_rate": 0.00022611633742232795, + "loss": 1.9048, + "step": 13686 + }, + { + "epoch": 1.5968965114922413, + "grad_norm": 1.0275400876998901, + "learning_rate": 0.00022610326436805112, + "loss": 2.0339, + "step": 13687 + }, + { + "epoch": 1.597013183992533, + "grad_norm": 1.2515252828598022, + "learning_rate": 0.00022609019054037013, + "loss": 1.9953, + "step": 13688 + }, + { + "epoch": 1.5971298564928247, + "grad_norm": 1.0316994190216064, + "learning_rate": 0.00022607711593942062, + "loss": 2.0761, + "step": 13689 + }, + { + "epoch": 1.5972465289931164, + "grad_norm": 1.0345560312271118, + "learning_rate": 0.00022606404056533806, + "loss": 2.1199, + "step": 13690 + }, + { + "epoch": 1.597363201493408, + "grad_norm": 1.0858261585235596, + "learning_rate": 0.00022605096441825795, + "loss": 1.934, + "step": 13691 + }, + { + "epoch": 1.5974798739936997, + "grad_norm": 1.2549983263015747, + "learning_rate": 0.00022603788749831605, + "loss": 2.1137, + "step": 13692 + }, + { + "epoch": 1.5975965464939914, + "grad_norm": 1.2168365716934204, + "learning_rate": 0.00022602480980564777, + "loss": 2.025, + "step": 13693 + }, + { + "epoch": 1.597713218994283, + "grad_norm": 0.9313831925392151, + "learning_rate": 0.00022601173134038867, + "loss": 1.8646, + "step": 13694 + }, + { + "epoch": 1.5978298914945748, + "grad_norm": 1.2002975940704346, + "learning_rate": 0.0002259986521026744, + "loss": 2.126, + "step": 13695 + }, + { + "epoch": 1.5979465639948665, + "grad_norm": 1.1805299520492554, + "learning_rate": 0.00022598557209264046, + "loss": 2.0731, + "step": 13696 + }, + { + "epoch": 1.5980632364951581, + "grad_norm": 1.3105032444000244, + "learning_rate": 0.0002259724913104226, + "loss": 2.1808, + "step": 13697 + }, + { + "epoch": 1.5981799089954498, + "grad_norm": 1.1127785444259644, + "learning_rate": 0.0002259594097561563, + "loss": 1.9769, + "step": 13698 + }, + { + "epoch": 1.5982965814957415, + "grad_norm": 1.1801327466964722, + "learning_rate": 0.0002259463274299772, + "loss": 2.049, + "step": 13699 + }, + { + "epoch": 1.5984132539960332, + "grad_norm": 1.0048776865005493, + "learning_rate": 0.00022593324433202103, + "loss": 2.1046, + "step": 13700 + }, + { + "epoch": 1.5985299264963249, + "grad_norm": 1.2106021642684937, + "learning_rate": 0.00022592016046242322, + "loss": 2.0915, + "step": 13701 + }, + { + "epoch": 1.5986465989966165, + "grad_norm": 1.2654002904891968, + "learning_rate": 0.00022590707582131962, + "loss": 2.0958, + "step": 13702 + }, + { + "epoch": 1.5987632714969082, + "grad_norm": 1.1412419080734253, + "learning_rate": 0.00022589399040884574, + "loss": 2.1896, + "step": 13703 + }, + { + "epoch": 1.5988799439972, + "grad_norm": 1.2759907245635986, + "learning_rate": 0.00022588090422513717, + "loss": 2.1608, + "step": 13704 + }, + { + "epoch": 1.5989966164974916, + "grad_norm": 1.1049809455871582, + "learning_rate": 0.00022586781727032982, + "loss": 2.1466, + "step": 13705 + }, + { + "epoch": 1.5991132889977833, + "grad_norm": 1.113665223121643, + "learning_rate": 0.00022585472954455915, + "loss": 2.1203, + "step": 13706 + }, + { + "epoch": 1.599229961498075, + "grad_norm": 1.1437240839004517, + "learning_rate": 0.00022584164104796095, + "loss": 1.9744, + "step": 13707 + }, + { + "epoch": 1.5993466339983666, + "grad_norm": 1.3640011548995972, + "learning_rate": 0.0002258285517806708, + "loss": 2.3993, + "step": 13708 + }, + { + "epoch": 1.5994633064986583, + "grad_norm": 1.0860803127288818, + "learning_rate": 0.00022581546174282454, + "loss": 1.9987, + "step": 13709 + }, + { + "epoch": 1.59957997899895, + "grad_norm": 1.1015419960021973, + "learning_rate": 0.00022580237093455775, + "loss": 1.9818, + "step": 13710 + }, + { + "epoch": 1.5996966514992417, + "grad_norm": 0.9186866879463196, + "learning_rate": 0.00022578927935600619, + "loss": 1.8958, + "step": 13711 + }, + { + "epoch": 1.5998133239995334, + "grad_norm": 0.9869712591171265, + "learning_rate": 0.00022577618700730554, + "loss": 2.0889, + "step": 13712 + }, + { + "epoch": 1.599929996499825, + "grad_norm": 1.3541755676269531, + "learning_rate": 0.00022576309388859156, + "loss": 2.083, + "step": 13713 + }, + { + "epoch": 1.6000466690001167, + "grad_norm": 1.0714887380599976, + "learning_rate": 0.00022574999999999996, + "loss": 2.4001, + "step": 13714 + }, + { + "epoch": 1.6001633415004084, + "grad_norm": 1.152688980102539, + "learning_rate": 0.00022573690534166658, + "loss": 2.2414, + "step": 13715 + }, + { + "epoch": 1.6002800140007, + "grad_norm": 1.0076490640640259, + "learning_rate": 0.00022572380991372706, + "loss": 1.9711, + "step": 13716 + }, + { + "epoch": 1.6003966865009918, + "grad_norm": 1.0665628910064697, + "learning_rate": 0.00022571071371631717, + "loss": 2.076, + "step": 13717 + }, + { + "epoch": 1.6005133590012834, + "grad_norm": 1.2931662797927856, + "learning_rate": 0.00022569761674957276, + "loss": 2.0527, + "step": 13718 + }, + { + "epoch": 1.6006300315015751, + "grad_norm": 1.110041856765747, + "learning_rate": 0.0002256845190136295, + "loss": 2.0569, + "step": 13719 + }, + { + "epoch": 1.6007467040018668, + "grad_norm": 1.3557045459747314, + "learning_rate": 0.00022567142050862324, + "loss": 2.0599, + "step": 13720 + }, + { + "epoch": 1.6008633765021585, + "grad_norm": 1.1290736198425293, + "learning_rate": 0.00022565832123468978, + "loss": 2.0783, + "step": 13721 + }, + { + "epoch": 1.6009800490024502, + "grad_norm": 1.0301618576049805, + "learning_rate": 0.00022564522119196485, + "loss": 1.9483, + "step": 13722 + }, + { + "epoch": 1.6010967215027418, + "grad_norm": 1.0846326351165771, + "learning_rate": 0.0002256321203805843, + "loss": 1.9931, + "step": 13723 + }, + { + "epoch": 1.6012133940030335, + "grad_norm": 1.190772294998169, + "learning_rate": 0.00022561901880068395, + "loss": 2.0137, + "step": 13724 + }, + { + "epoch": 1.6013300665033252, + "grad_norm": 1.1727442741394043, + "learning_rate": 0.00022560591645239962, + "loss": 2.1795, + "step": 13725 + }, + { + "epoch": 1.601446739003617, + "grad_norm": 1.1596949100494385, + "learning_rate": 0.00022559281333586713, + "loss": 2.0319, + "step": 13726 + }, + { + "epoch": 1.6015634115039086, + "grad_norm": 1.2477473020553589, + "learning_rate": 0.00022557970945122227, + "loss": 1.9909, + "step": 13727 + }, + { + "epoch": 1.6016800840042003, + "grad_norm": 1.1000932455062866, + "learning_rate": 0.00022556660479860105, + "loss": 2.0528, + "step": 13728 + }, + { + "epoch": 1.601796756504492, + "grad_norm": 1.087254285812378, + "learning_rate": 0.00022555349937813916, + "loss": 2.0341, + "step": 13729 + }, + { + "epoch": 1.6019134290047836, + "grad_norm": 1.044813632965088, + "learning_rate": 0.00022554039318997246, + "loss": 2.068, + "step": 13730 + }, + { + "epoch": 1.6020301015050753, + "grad_norm": 1.4032381772994995, + "learning_rate": 0.00022552728623423694, + "loss": 2.1091, + "step": 13731 + }, + { + "epoch": 1.602146774005367, + "grad_norm": 1.1855530738830566, + "learning_rate": 0.0002255141785110684, + "loss": 2.0518, + "step": 13732 + }, + { + "epoch": 1.6022634465056587, + "grad_norm": 1.0757300853729248, + "learning_rate": 0.00022550107002060275, + "loss": 2.1633, + "step": 13733 + }, + { + "epoch": 1.6023801190059503, + "grad_norm": 1.1897906064987183, + "learning_rate": 0.0002254879607629759, + "loss": 2.1885, + "step": 13734 + }, + { + "epoch": 1.602496791506242, + "grad_norm": 1.084963083267212, + "learning_rate": 0.00022547485073832368, + "loss": 1.8023, + "step": 13735 + }, + { + "epoch": 1.6026134640065337, + "grad_norm": 1.2164742946624756, + "learning_rate": 0.00022546173994678208, + "loss": 1.9916, + "step": 13736 + }, + { + "epoch": 1.6027301365068254, + "grad_norm": 1.1668860912322998, + "learning_rate": 0.000225448628388487, + "loss": 2.0337, + "step": 13737 + }, + { + "epoch": 1.602846809007117, + "grad_norm": 1.181406855583191, + "learning_rate": 0.00022543551606357432, + "loss": 1.8423, + "step": 13738 + }, + { + "epoch": 1.6029634815074087, + "grad_norm": 1.0034359693527222, + "learning_rate": 0.00022542240297218004, + "loss": 1.9917, + "step": 13739 + }, + { + "epoch": 1.6030801540077004, + "grad_norm": 1.1265363693237305, + "learning_rate": 0.00022540928911444005, + "loss": 1.9752, + "step": 13740 + }, + { + "epoch": 1.6031968265079921, + "grad_norm": 1.0830034017562866, + "learning_rate": 0.00022539617449049033, + "loss": 2.0809, + "step": 13741 + }, + { + "epoch": 1.6033134990082838, + "grad_norm": 1.1327471733093262, + "learning_rate": 0.00022538305910046682, + "loss": 2.1058, + "step": 13742 + }, + { + "epoch": 1.6034301715085755, + "grad_norm": 1.1432373523712158, + "learning_rate": 0.00022536994294450544, + "loss": 2.0553, + "step": 13743 + }, + { + "epoch": 1.6035468440088672, + "grad_norm": 1.3163301944732666, + "learning_rate": 0.0002253568260227423, + "loss": 1.9612, + "step": 13744 + }, + { + "epoch": 1.6036635165091588, + "grad_norm": 1.0640461444854736, + "learning_rate": 0.00022534370833531326, + "loss": 2.1057, + "step": 13745 + }, + { + "epoch": 1.6037801890094505, + "grad_norm": 1.1563301086425781, + "learning_rate": 0.00022533058988235437, + "loss": 2.2102, + "step": 13746 + }, + { + "epoch": 1.6038968615097422, + "grad_norm": 1.1801385879516602, + "learning_rate": 0.00022531747066400159, + "loss": 2.0523, + "step": 13747 + }, + { + "epoch": 1.6040135340100339, + "grad_norm": 1.0349133014678955, + "learning_rate": 0.00022530435068039093, + "loss": 2.1437, + "step": 13748 + }, + { + "epoch": 1.6041302065103256, + "grad_norm": 1.2293381690979004, + "learning_rate": 0.00022529122993165848, + "loss": 2.0468, + "step": 13749 + }, + { + "epoch": 1.6042468790106172, + "grad_norm": 1.2070820331573486, + "learning_rate": 0.00022527810841794013, + "loss": 2.1965, + "step": 13750 + }, + { + "epoch": 1.604363551510909, + "grad_norm": 1.168502688407898, + "learning_rate": 0.00022526498613937196, + "loss": 1.8354, + "step": 13751 + }, + { + "epoch": 1.6044802240112006, + "grad_norm": 1.0787111520767212, + "learning_rate": 0.00022525186309609006, + "loss": 1.986, + "step": 13752 + }, + { + "epoch": 1.6045968965114923, + "grad_norm": 1.107251763343811, + "learning_rate": 0.00022523873928823043, + "loss": 2.1534, + "step": 13753 + }, + { + "epoch": 1.604713569011784, + "grad_norm": 1.2317726612091064, + "learning_rate": 0.0002252256147159292, + "loss": 2.0347, + "step": 13754 + }, + { + "epoch": 1.6048302415120757, + "grad_norm": 1.1602482795715332, + "learning_rate": 0.00022521248937932227, + "loss": 2.2306, + "step": 13755 + }, + { + "epoch": 1.6049469140123673, + "grad_norm": 1.1718286275863647, + "learning_rate": 0.0002251993632785458, + "loss": 2.1077, + "step": 13756 + }, + { + "epoch": 1.605063586512659, + "grad_norm": 1.083462119102478, + "learning_rate": 0.0002251862364137359, + "loss": 2.1661, + "step": 13757 + }, + { + "epoch": 1.6051802590129507, + "grad_norm": 1.0529135465621948, + "learning_rate": 0.00022517310878502863, + "loss": 2.056, + "step": 13758 + }, + { + "epoch": 1.6052969315132424, + "grad_norm": 1.198569893836975, + "learning_rate": 0.00022515998039256005, + "loss": 2.2074, + "step": 13759 + }, + { + "epoch": 1.605413604013534, + "grad_norm": 1.0040591955184937, + "learning_rate": 0.0002251468512364663, + "loss": 2.0541, + "step": 13760 + }, + { + "epoch": 1.6055302765138257, + "grad_norm": 1.2820099592208862, + "learning_rate": 0.0002251337213168835, + "loss": 2.3373, + "step": 13761 + }, + { + "epoch": 1.6056469490141174, + "grad_norm": 1.4164544343948364, + "learning_rate": 0.00022512059063394771, + "loss": 2.2761, + "step": 13762 + }, + { + "epoch": 1.605763621514409, + "grad_norm": 1.2504054307937622, + "learning_rate": 0.00022510745918779513, + "loss": 2.0973, + "step": 13763 + }, + { + "epoch": 1.6058802940147008, + "grad_norm": 1.2242900133132935, + "learning_rate": 0.00022509432697856178, + "loss": 2.1207, + "step": 13764 + }, + { + "epoch": 1.6059969665149925, + "grad_norm": 0.9567497372627258, + "learning_rate": 0.00022508119400638398, + "loss": 1.7996, + "step": 13765 + }, + { + "epoch": 1.6061136390152841, + "grad_norm": 1.1720359325408936, + "learning_rate": 0.00022506806027139766, + "loss": 2.1879, + "step": 13766 + }, + { + "epoch": 1.6062303115155758, + "grad_norm": 1.1461873054504395, + "learning_rate": 0.0002250549257737391, + "loss": 1.9828, + "step": 13767 + }, + { + "epoch": 1.6063469840158675, + "grad_norm": 1.2780365943908691, + "learning_rate": 0.00022504179051354445, + "loss": 2.2878, + "step": 13768 + }, + { + "epoch": 1.6064636565161592, + "grad_norm": 1.2316160202026367, + "learning_rate": 0.0002250286544909499, + "loss": 1.9802, + "step": 13769 + }, + { + "epoch": 1.6065803290164509, + "grad_norm": 1.2890284061431885, + "learning_rate": 0.00022501551770609158, + "loss": 2.3056, + "step": 13770 + }, + { + "epoch": 1.6066970015167426, + "grad_norm": 1.213487148284912, + "learning_rate": 0.00022500238015910574, + "loss": 2.1043, + "step": 13771 + }, + { + "epoch": 1.6068136740170342, + "grad_norm": 1.2131531238555908, + "learning_rate": 0.0002249892418501285, + "loss": 2.021, + "step": 13772 + }, + { + "epoch": 1.606930346517326, + "grad_norm": 1.3651111125946045, + "learning_rate": 0.00022497610277929616, + "loss": 2.1045, + "step": 13773 + }, + { + "epoch": 1.6070470190176176, + "grad_norm": 1.4945182800292969, + "learning_rate": 0.00022496296294674485, + "loss": 2.184, + "step": 13774 + }, + { + "epoch": 1.6071636915179093, + "grad_norm": 1.0776594877243042, + "learning_rate": 0.0002249498223526108, + "loss": 2.1207, + "step": 13775 + }, + { + "epoch": 1.607280364018201, + "grad_norm": 1.19923734664917, + "learning_rate": 0.00022493668099703024, + "loss": 2.1343, + "step": 13776 + }, + { + "epoch": 1.6073970365184926, + "grad_norm": 1.2793627977371216, + "learning_rate": 0.00022492353888013946, + "loss": 2.0151, + "step": 13777 + }, + { + "epoch": 1.6075137090187843, + "grad_norm": 1.1431204080581665, + "learning_rate": 0.00022491039600207462, + "loss": 2.1215, + "step": 13778 + }, + { + "epoch": 1.607630381519076, + "grad_norm": 1.0454435348510742, + "learning_rate": 0.00022489725236297207, + "loss": 1.7977, + "step": 13779 + }, + { + "epoch": 1.6077470540193677, + "grad_norm": 1.1602030992507935, + "learning_rate": 0.00022488410796296798, + "loss": 2.1443, + "step": 13780 + }, + { + "epoch": 1.6078637265196594, + "grad_norm": 1.2235344648361206, + "learning_rate": 0.00022487096280219862, + "loss": 2.0619, + "step": 13781 + }, + { + "epoch": 1.607980399019951, + "grad_norm": 1.109796404838562, + "learning_rate": 0.00022485781688080027, + "loss": 1.9928, + "step": 13782 + }, + { + "epoch": 1.6080970715202427, + "grad_norm": 1.1696261167526245, + "learning_rate": 0.00022484467019890929, + "loss": 2.0414, + "step": 13783 + }, + { + "epoch": 1.6082137440205344, + "grad_norm": 1.1180799007415771, + "learning_rate": 0.0002248315227566619, + "loss": 2.0215, + "step": 13784 + }, + { + "epoch": 1.608330416520826, + "grad_norm": 1.1665654182434082, + "learning_rate": 0.0002248183745541944, + "loss": 2.1503, + "step": 13785 + }, + { + "epoch": 1.6084470890211178, + "grad_norm": 1.1393290758132935, + "learning_rate": 0.00022480522559164314, + "loss": 2.0054, + "step": 13786 + }, + { + "epoch": 1.6085637615214095, + "grad_norm": 1.0791646242141724, + "learning_rate": 0.0002247920758691444, + "loss": 2.0635, + "step": 13787 + }, + { + "epoch": 1.6086804340217011, + "grad_norm": 1.1435487270355225, + "learning_rate": 0.00022477892538683448, + "loss": 2.0261, + "step": 13788 + }, + { + "epoch": 1.6087971065219928, + "grad_norm": 1.2309784889221191, + "learning_rate": 0.00022476577414484977, + "loss": 2.2616, + "step": 13789 + }, + { + "epoch": 1.6089137790222845, + "grad_norm": 1.2071033716201782, + "learning_rate": 0.00022475262214332654, + "loss": 2.0296, + "step": 13790 + }, + { + "epoch": 1.6090304515225762, + "grad_norm": 1.1513947248458862, + "learning_rate": 0.00022473946938240114, + "loss": 2.2462, + "step": 13791 + }, + { + "epoch": 1.6091471240228679, + "grad_norm": 0.9625357985496521, + "learning_rate": 0.00022472631586221, + "loss": 1.9706, + "step": 13792 + }, + { + "epoch": 1.6092637965231595, + "grad_norm": 1.142519235610962, + "learning_rate": 0.0002247131615828894, + "loss": 2.0782, + "step": 13793 + }, + { + "epoch": 1.6093804690234512, + "grad_norm": 1.0612136125564575, + "learning_rate": 0.00022470000654457574, + "loss": 2.1599, + "step": 13794 + }, + { + "epoch": 1.609497141523743, + "grad_norm": 1.094974160194397, + "learning_rate": 0.00022468685074740542, + "loss": 1.9543, + "step": 13795 + }, + { + "epoch": 1.6096138140240346, + "grad_norm": 1.120152473449707, + "learning_rate": 0.0002246736941915148, + "loss": 2.1281, + "step": 13796 + }, + { + "epoch": 1.6097304865243263, + "grad_norm": 1.1042051315307617, + "learning_rate": 0.00022466053687704025, + "loss": 1.9212, + "step": 13797 + }, + { + "epoch": 1.609847159024618, + "grad_norm": 1.0202752351760864, + "learning_rate": 0.00022464737880411817, + "loss": 2.091, + "step": 13798 + }, + { + "epoch": 1.6099638315249096, + "grad_norm": 1.2341721057891846, + "learning_rate": 0.00022463421997288502, + "loss": 2.2157, + "step": 13799 + }, + { + "epoch": 1.6100805040252013, + "grad_norm": 1.1218584775924683, + "learning_rate": 0.0002246210603834772, + "loss": 2.06, + "step": 13800 + }, + { + "epoch": 1.610197176525493, + "grad_norm": 1.1034455299377441, + "learning_rate": 0.00022460790003603115, + "loss": 1.984, + "step": 13801 + }, + { + "epoch": 1.6103138490257847, + "grad_norm": 1.2555538415908813, + "learning_rate": 0.0002245947389306832, + "loss": 2.0237, + "step": 13802 + }, + { + "epoch": 1.6104305215260764, + "grad_norm": 1.102327585220337, + "learning_rate": 0.0002245815770675699, + "loss": 2.1479, + "step": 13803 + }, + { + "epoch": 1.610547194026368, + "grad_norm": 1.22594153881073, + "learning_rate": 0.00022456841444682768, + "loss": 2.1622, + "step": 13804 + }, + { + "epoch": 1.6106638665266597, + "grad_norm": 0.9489760398864746, + "learning_rate": 0.000224555251068593, + "loss": 2.1311, + "step": 13805 + }, + { + "epoch": 1.6107805390269514, + "grad_norm": 1.2594572305679321, + "learning_rate": 0.00022454208693300227, + "loss": 2.1746, + "step": 13806 + }, + { + "epoch": 1.610897211527243, + "grad_norm": 1.116502285003662, + "learning_rate": 0.000224528922040192, + "loss": 2.0115, + "step": 13807 + }, + { + "epoch": 1.6110138840275348, + "grad_norm": 1.0715819597244263, + "learning_rate": 0.00022451575639029862, + "loss": 1.9724, + "step": 13808 + }, + { + "epoch": 1.6111305565278264, + "grad_norm": 1.172145962715149, + "learning_rate": 0.00022450258998345867, + "loss": 1.9547, + "step": 13809 + }, + { + "epoch": 1.6112472290281181, + "grad_norm": 1.1723015308380127, + "learning_rate": 0.00022448942281980862, + "loss": 2.3329, + "step": 13810 + }, + { + "epoch": 1.6113639015284098, + "grad_norm": 1.0233486890792847, + "learning_rate": 0.00022447625489948507, + "loss": 2.1476, + "step": 13811 + }, + { + "epoch": 1.6114805740287015, + "grad_norm": 1.1934740543365479, + "learning_rate": 0.0002244630862226244, + "loss": 1.9255, + "step": 13812 + }, + { + "epoch": 1.6115972465289932, + "grad_norm": 1.1486637592315674, + "learning_rate": 0.0002244499167893632, + "loss": 2.0154, + "step": 13813 + }, + { + "epoch": 1.6117139190292848, + "grad_norm": 1.0560792684555054, + "learning_rate": 0.00022443674659983795, + "loss": 2.08, + "step": 13814 + }, + { + "epoch": 1.6118305915295765, + "grad_norm": 1.326797366142273, + "learning_rate": 0.00022442357565418514, + "loss": 2.067, + "step": 13815 + }, + { + "epoch": 1.6119472640298682, + "grad_norm": 1.044120192527771, + "learning_rate": 0.00022441040395254148, + "loss": 1.9505, + "step": 13816 + }, + { + "epoch": 1.61206393653016, + "grad_norm": 1.0879409313201904, + "learning_rate": 0.00022439723149504334, + "loss": 1.9542, + "step": 13817 + }, + { + "epoch": 1.6121806090304516, + "grad_norm": 1.1262964010238647, + "learning_rate": 0.00022438405828182736, + "loss": 2.0995, + "step": 13818 + }, + { + "epoch": 1.6122972815307433, + "grad_norm": 1.175624966621399, + "learning_rate": 0.00022437088431303013, + "loss": 2.0046, + "step": 13819 + }, + { + "epoch": 1.612413954031035, + "grad_norm": 1.1458405256271362, + "learning_rate": 0.00022435770958878818, + "loss": 2.0116, + "step": 13820 + }, + { + "epoch": 1.6125306265313266, + "grad_norm": 1.1373707056045532, + "learning_rate": 0.00022434453410923808, + "loss": 1.9692, + "step": 13821 + }, + { + "epoch": 1.6126472990316183, + "grad_norm": 0.91275954246521, + "learning_rate": 0.00022433135787451648, + "loss": 1.9523, + "step": 13822 + }, + { + "epoch": 1.61276397153191, + "grad_norm": 1.2528737783432007, + "learning_rate": 0.00022431818088475996, + "loss": 2.036, + "step": 13823 + }, + { + "epoch": 1.6128806440322017, + "grad_norm": 1.1030148267745972, + "learning_rate": 0.00022430500314010504, + "loss": 2.0778, + "step": 13824 + }, + { + "epoch": 1.6129973165324933, + "grad_norm": 1.227201223373413, + "learning_rate": 0.00022429182464068844, + "loss": 2.2405, + "step": 13825 + }, + { + "epoch": 1.613113989032785, + "grad_norm": 1.1613084077835083, + "learning_rate": 0.0002242786453866467, + "loss": 2.0562, + "step": 13826 + }, + { + "epoch": 1.6132306615330767, + "grad_norm": 1.1032360792160034, + "learning_rate": 0.00022426546537811652, + "loss": 2.0241, + "step": 13827 + }, + { + "epoch": 1.6133473340333684, + "grad_norm": 1.2569410800933838, + "learning_rate": 0.00022425228461523446, + "loss": 2.007, + "step": 13828 + }, + { + "epoch": 1.61346400653366, + "grad_norm": 1.0985643863677979, + "learning_rate": 0.00022423910309813722, + "loss": 2.1172, + "step": 13829 + }, + { + "epoch": 1.6135806790339517, + "grad_norm": 1.222724199295044, + "learning_rate": 0.0002242259208269614, + "loss": 2.2337, + "step": 13830 + }, + { + "epoch": 1.6136973515342434, + "grad_norm": 1.2382491827011108, + "learning_rate": 0.00022421273780184376, + "loss": 2.1968, + "step": 13831 + }, + { + "epoch": 1.613814024034535, + "grad_norm": 1.0403088331222534, + "learning_rate": 0.00022419955402292087, + "loss": 2.0739, + "step": 13832 + }, + { + "epoch": 1.6139306965348268, + "grad_norm": 1.1165642738342285, + "learning_rate": 0.00022418636949032944, + "loss": 1.9136, + "step": 13833 + }, + { + "epoch": 1.6140473690351185, + "grad_norm": 1.1317578554153442, + "learning_rate": 0.00022417318420420612, + "loss": 2.0056, + "step": 13834 + }, + { + "epoch": 1.6141640415354102, + "grad_norm": 1.1425297260284424, + "learning_rate": 0.00022415999816468763, + "loss": 2.0123, + "step": 13835 + }, + { + "epoch": 1.6142807140357018, + "grad_norm": 1.0659329891204834, + "learning_rate": 0.00022414681137191068, + "loss": 2.0312, + "step": 13836 + }, + { + "epoch": 1.6143973865359935, + "grad_norm": 1.0558052062988281, + "learning_rate": 0.00022413362382601198, + "loss": 2.0567, + "step": 13837 + }, + { + "epoch": 1.6145140590362852, + "grad_norm": 1.1933681964874268, + "learning_rate": 0.0002241204355271282, + "loss": 2.2841, + "step": 13838 + }, + { + "epoch": 1.6146307315365769, + "grad_norm": 1.053118348121643, + "learning_rate": 0.0002241072464753961, + "loss": 2.0585, + "step": 13839 + }, + { + "epoch": 1.6147474040368686, + "grad_norm": 1.0995677709579468, + "learning_rate": 0.0002240940566709524, + "loss": 2.043, + "step": 13840 + }, + { + "epoch": 1.6148640765371602, + "grad_norm": 0.9989785552024841, + "learning_rate": 0.00022408086611393386, + "loss": 1.993, + "step": 13841 + }, + { + "epoch": 1.614980749037452, + "grad_norm": 1.1510534286499023, + "learning_rate": 0.00022406767480447716, + "loss": 2.0984, + "step": 13842 + }, + { + "epoch": 1.6150974215377436, + "grad_norm": 1.196241021156311, + "learning_rate": 0.00022405448274271907, + "loss": 2.1187, + "step": 13843 + }, + { + "epoch": 1.6152140940380353, + "grad_norm": 1.0185548067092896, + "learning_rate": 0.0002240412899287964, + "loss": 2.0234, + "step": 13844 + }, + { + "epoch": 1.615330766538327, + "grad_norm": 1.1008100509643555, + "learning_rate": 0.0002240280963628459, + "loss": 2.1874, + "step": 13845 + }, + { + "epoch": 1.6154474390386186, + "grad_norm": 1.071565866470337, + "learning_rate": 0.00022401490204500434, + "loss": 2.0287, + "step": 13846 + }, + { + "epoch": 1.6155641115389103, + "grad_norm": 1.190983533859253, + "learning_rate": 0.0002240017069754085, + "loss": 2.0646, + "step": 13847 + }, + { + "epoch": 1.615680784039202, + "grad_norm": 1.1765623092651367, + "learning_rate": 0.00022398851115419514, + "loss": 2.068, + "step": 13848 + }, + { + "epoch": 1.6157974565394937, + "grad_norm": 1.2465039491653442, + "learning_rate": 0.00022397531458150118, + "loss": 2.1697, + "step": 13849 + }, + { + "epoch": 1.6159141290397854, + "grad_norm": 1.4108768701553345, + "learning_rate": 0.00022396211725746326, + "loss": 1.9673, + "step": 13850 + }, + { + "epoch": 1.616030801540077, + "grad_norm": 1.3219739198684692, + "learning_rate": 0.00022394891918221829, + "loss": 1.9613, + "step": 13851 + }, + { + "epoch": 1.6161474740403687, + "grad_norm": 1.1251174211502075, + "learning_rate": 0.00022393572035590313, + "loss": 2.1044, + "step": 13852 + }, + { + "epoch": 1.6162641465406604, + "grad_norm": 1.2002389430999756, + "learning_rate": 0.00022392252077865448, + "loss": 2.0849, + "step": 13853 + }, + { + "epoch": 1.616380819040952, + "grad_norm": 1.1536732912063599, + "learning_rate": 0.0002239093204506093, + "loss": 2.0507, + "step": 13854 + }, + { + "epoch": 1.6164974915412438, + "grad_norm": 1.2174855470657349, + "learning_rate": 0.0002238961193719044, + "loss": 2.2875, + "step": 13855 + }, + { + "epoch": 1.6166141640415355, + "grad_norm": 0.9963514804840088, + "learning_rate": 0.0002238829175426766, + "loss": 1.9883, + "step": 13856 + }, + { + "epoch": 1.6167308365418271, + "grad_norm": 1.230025053024292, + "learning_rate": 0.0002238697149630629, + "loss": 2.1186, + "step": 13857 + }, + { + "epoch": 1.6168475090421188, + "grad_norm": 1.1419174671173096, + "learning_rate": 0.00022385651163319995, + "loss": 2.0767, + "step": 13858 + }, + { + "epoch": 1.6169641815424105, + "grad_norm": 1.0411648750305176, + "learning_rate": 0.00022384330755322474, + "loss": 2.2164, + "step": 13859 + }, + { + "epoch": 1.6170808540427022, + "grad_norm": 1.3078373670578003, + "learning_rate": 0.0002238301027232742, + "loss": 2.1629, + "step": 13860 + }, + { + "epoch": 1.6171975265429939, + "grad_norm": 1.0078853368759155, + "learning_rate": 0.00022381689714348512, + "loss": 2.0597, + "step": 13861 + }, + { + "epoch": 1.6173141990432855, + "grad_norm": 1.1000523567199707, + "learning_rate": 0.0002238036908139945, + "loss": 2.0679, + "step": 13862 + }, + { + "epoch": 1.6174308715435772, + "grad_norm": 1.211632490158081, + "learning_rate": 0.00022379048373493917, + "loss": 2.0851, + "step": 13863 + }, + { + "epoch": 1.617547544043869, + "grad_norm": 1.2238409519195557, + "learning_rate": 0.00022377727590645617, + "loss": 2.0701, + "step": 13864 + }, + { + "epoch": 1.6176642165441606, + "grad_norm": 1.0112004280090332, + "learning_rate": 0.00022376406732868228, + "loss": 1.9618, + "step": 13865 + }, + { + "epoch": 1.6177808890444523, + "grad_norm": 0.9888989329338074, + "learning_rate": 0.0002237508580017545, + "loss": 2.0726, + "step": 13866 + }, + { + "epoch": 1.617897561544744, + "grad_norm": 1.1197463274002075, + "learning_rate": 0.00022373764792580971, + "loss": 2.018, + "step": 13867 + }, + { + "epoch": 1.6180142340450356, + "grad_norm": 1.2396339178085327, + "learning_rate": 0.00022372443710098495, + "loss": 2.076, + "step": 13868 + }, + { + "epoch": 1.6181309065453273, + "grad_norm": 1.1070932149887085, + "learning_rate": 0.00022371122552741714, + "loss": 1.956, + "step": 13869 + }, + { + "epoch": 1.618247579045619, + "grad_norm": 1.1345465183258057, + "learning_rate": 0.0002236980132052432, + "loss": 2.2686, + "step": 13870 + }, + { + "epoch": 1.6183642515459107, + "grad_norm": 1.02924382686615, + "learning_rate": 0.00022368480013460019, + "loss": 2.0615, + "step": 13871 + }, + { + "epoch": 1.6184809240462024, + "grad_norm": 1.0426545143127441, + "learning_rate": 0.00022367158631562495, + "loss": 2.0147, + "step": 13872 + }, + { + "epoch": 1.618597596546494, + "grad_norm": 1.3190160989761353, + "learning_rate": 0.0002236583717484546, + "loss": 2.1119, + "step": 13873 + }, + { + "epoch": 1.6187142690467857, + "grad_norm": 1.355954647064209, + "learning_rate": 0.0002236451564332261, + "loss": 2.1356, + "step": 13874 + }, + { + "epoch": 1.6188309415470774, + "grad_norm": 1.1228207349777222, + "learning_rate": 0.00022363194037007642, + "loss": 2.0881, + "step": 13875 + }, + { + "epoch": 1.618947614047369, + "grad_norm": 1.0516420602798462, + "learning_rate": 0.0002236187235591426, + "loss": 2.1261, + "step": 13876 + }, + { + "epoch": 1.6190642865476608, + "grad_norm": 1.2626203298568726, + "learning_rate": 0.0002236055060005616, + "loss": 2.1168, + "step": 13877 + }, + { + "epoch": 1.6191809590479525, + "grad_norm": 1.2261465787887573, + "learning_rate": 0.00022359228769447048, + "loss": 2.0698, + "step": 13878 + }, + { + "epoch": 1.6192976315482441, + "grad_norm": 1.2673838138580322, + "learning_rate": 0.0002235790686410063, + "loss": 2.1716, + "step": 13879 + }, + { + "epoch": 1.6194143040485358, + "grad_norm": 1.2298336029052734, + "learning_rate": 0.00022356584884030616, + "loss": 1.9777, + "step": 13880 + }, + { + "epoch": 1.6195309765488275, + "grad_norm": 1.2453843355178833, + "learning_rate": 0.00022355262829250693, + "loss": 2.0971, + "step": 13881 + }, + { + "epoch": 1.6196476490491192, + "grad_norm": 1.1883200407028198, + "learning_rate": 0.00022353940699774584, + "loss": 2.0538, + "step": 13882 + }, + { + "epoch": 1.6197643215494109, + "grad_norm": 1.1297510862350464, + "learning_rate": 0.00022352618495615983, + "loss": 1.9897, + "step": 13883 + }, + { + "epoch": 1.6198809940497025, + "grad_norm": 1.2645093202590942, + "learning_rate": 0.000223512962167886, + "loss": 1.8877, + "step": 13884 + }, + { + "epoch": 1.6199976665499942, + "grad_norm": 1.2307016849517822, + "learning_rate": 0.00022349973863306148, + "loss": 1.9951, + "step": 13885 + }, + { + "epoch": 1.620114339050286, + "grad_norm": 1.0825008153915405, + "learning_rate": 0.00022348651435182332, + "loss": 1.8052, + "step": 13886 + }, + { + "epoch": 1.6202310115505776, + "grad_norm": 1.2084249258041382, + "learning_rate": 0.00022347328932430862, + "loss": 2.1738, + "step": 13887 + }, + { + "epoch": 1.6203476840508693, + "grad_norm": 1.097549557685852, + "learning_rate": 0.0002234600635506545, + "loss": 2.0125, + "step": 13888 + }, + { + "epoch": 1.620464356551161, + "grad_norm": 1.3337006568908691, + "learning_rate": 0.000223446837030998, + "loss": 2.019, + "step": 13889 + }, + { + "epoch": 1.6205810290514526, + "grad_norm": 1.0454472303390503, + "learning_rate": 0.00022343360976547635, + "loss": 2.2408, + "step": 13890 + }, + { + "epoch": 1.6206977015517443, + "grad_norm": 1.1622155904769897, + "learning_rate": 0.00022342038175422662, + "loss": 2.0889, + "step": 13891 + }, + { + "epoch": 1.620814374052036, + "grad_norm": 1.1795859336853027, + "learning_rate": 0.0002234071529973859, + "loss": 1.9525, + "step": 13892 + }, + { + "epoch": 1.6209310465523277, + "grad_norm": 1.3059033155441284, + "learning_rate": 0.00022339392349509141, + "loss": 2.1617, + "step": 13893 + }, + { + "epoch": 1.6210477190526194, + "grad_norm": 1.1260462999343872, + "learning_rate": 0.00022338069324748025, + "loss": 2.1082, + "step": 13894 + }, + { + "epoch": 1.621164391552911, + "grad_norm": 1.098652720451355, + "learning_rate": 0.00022336746225468955, + "loss": 2.0724, + "step": 13895 + }, + { + "epoch": 1.6212810640532027, + "grad_norm": 1.1523560285568237, + "learning_rate": 0.00022335423051685657, + "loss": 2.2012, + "step": 13896 + }, + { + "epoch": 1.6213977365534944, + "grad_norm": 1.2321486473083496, + "learning_rate": 0.0002233409980341184, + "loss": 2.2197, + "step": 13897 + }, + { + "epoch": 1.621514409053786, + "grad_norm": 1.1118680238723755, + "learning_rate": 0.0002233277648066122, + "loss": 2.0071, + "step": 13898 + }, + { + "epoch": 1.6216310815540778, + "grad_norm": 1.039467692375183, + "learning_rate": 0.00022331453083447526, + "loss": 2.0215, + "step": 13899 + }, + { + "epoch": 1.6217477540543694, + "grad_norm": 1.1303212642669678, + "learning_rate": 0.00022330129611784468, + "loss": 2.1676, + "step": 13900 + }, + { + "epoch": 1.6218644265546611, + "grad_norm": 1.3683117628097534, + "learning_rate": 0.0002232880606568577, + "loss": 2.0594, + "step": 13901 + }, + { + "epoch": 1.6219810990549528, + "grad_norm": 1.2829184532165527, + "learning_rate": 0.0002232748244516516, + "loss": 2.1808, + "step": 13902 + }, + { + "epoch": 1.6220977715552445, + "grad_norm": 1.2323251962661743, + "learning_rate": 0.00022326158750236342, + "loss": 2.0638, + "step": 13903 + }, + { + "epoch": 1.6222144440555362, + "grad_norm": 1.0212109088897705, + "learning_rate": 0.00022324834980913056, + "loss": 1.9979, + "step": 13904 + }, + { + "epoch": 1.6223311165558278, + "grad_norm": 0.9928629994392395, + "learning_rate": 0.00022323511137209017, + "loss": 1.9195, + "step": 13905 + }, + { + "epoch": 1.6224477890561195, + "grad_norm": 1.0591243505477905, + "learning_rate": 0.0002232218721913795, + "loss": 2.041, + "step": 13906 + }, + { + "epoch": 1.6225644615564112, + "grad_norm": 1.171968936920166, + "learning_rate": 0.0002232086322671358, + "loss": 2.0046, + "step": 13907 + }, + { + "epoch": 1.622681134056703, + "grad_norm": 1.1638755798339844, + "learning_rate": 0.0002231953915994964, + "loss": 1.9553, + "step": 13908 + }, + { + "epoch": 1.6227978065569946, + "grad_norm": 1.121382236480713, + "learning_rate": 0.00022318215018859845, + "loss": 2.181, + "step": 13909 + }, + { + "epoch": 1.6229144790572863, + "grad_norm": 1.0954056978225708, + "learning_rate": 0.00022316890803457927, + "loss": 2.0376, + "step": 13910 + }, + { + "epoch": 1.623031151557578, + "grad_norm": 1.1133490800857544, + "learning_rate": 0.00022315566513757612, + "loss": 2.1557, + "step": 13911 + }, + { + "epoch": 1.6231478240578696, + "grad_norm": 1.0782164335250854, + "learning_rate": 0.00022314242149772636, + "loss": 2.0029, + "step": 13912 + }, + { + "epoch": 1.6232644965581613, + "grad_norm": 1.1995843648910522, + "learning_rate": 0.00022312917711516718, + "loss": 2.0956, + "step": 13913 + }, + { + "epoch": 1.623381169058453, + "grad_norm": 1.2602477073669434, + "learning_rate": 0.000223115931990036, + "loss": 1.9492, + "step": 13914 + }, + { + "epoch": 1.6234978415587447, + "grad_norm": 1.179869532585144, + "learning_rate": 0.00022310268612247002, + "loss": 1.9823, + "step": 13915 + }, + { + "epoch": 1.6236145140590363, + "grad_norm": 1.1351503133773804, + "learning_rate": 0.00022308943951260667, + "loss": 2.0089, + "step": 13916 + }, + { + "epoch": 1.623731186559328, + "grad_norm": 1.245121717453003, + "learning_rate": 0.00022307619216058316, + "loss": 2.03, + "step": 13917 + }, + { + "epoch": 1.6238478590596197, + "grad_norm": 1.0649664402008057, + "learning_rate": 0.00022306294406653689, + "loss": 2.0897, + "step": 13918 + }, + { + "epoch": 1.6239645315599114, + "grad_norm": 1.0114566087722778, + "learning_rate": 0.00022304969523060517, + "loss": 1.8244, + "step": 13919 + }, + { + "epoch": 1.624081204060203, + "grad_norm": 1.167981505393982, + "learning_rate": 0.0002230364456529254, + "loss": 1.9243, + "step": 13920 + }, + { + "epoch": 1.6241978765604947, + "grad_norm": 1.2497178316116333, + "learning_rate": 0.00022302319533363488, + "loss": 2.1381, + "step": 13921 + }, + { + "epoch": 1.6243145490607864, + "grad_norm": 1.2421026229858398, + "learning_rate": 0.00022300994427287103, + "loss": 1.9595, + "step": 13922 + }, + { + "epoch": 1.624431221561078, + "grad_norm": 1.158666968345642, + "learning_rate": 0.00022299669247077117, + "loss": 2.0586, + "step": 13923 + }, + { + "epoch": 1.6245478940613698, + "grad_norm": 1.239445447921753, + "learning_rate": 0.0002229834399274727, + "loss": 2.0072, + "step": 13924 + }, + { + "epoch": 1.6246645665616615, + "grad_norm": 1.2834856510162354, + "learning_rate": 0.000222970186643113, + "loss": 2.2866, + "step": 13925 + }, + { + "epoch": 1.6247812390619532, + "grad_norm": 1.2584048509597778, + "learning_rate": 0.00022295693261782952, + "loss": 2.2294, + "step": 13926 + }, + { + "epoch": 1.6248979115622448, + "grad_norm": 1.1985135078430176, + "learning_rate": 0.00022294367785175956, + "loss": 2.209, + "step": 13927 + }, + { + "epoch": 1.6250145840625363, + "grad_norm": 0.9594308137893677, + "learning_rate": 0.00022293042234504065, + "loss": 1.8176, + "step": 13928 + }, + { + "epoch": 1.625131256562828, + "grad_norm": 1.0360904932022095, + "learning_rate": 0.0002229171660978101, + "loss": 2.0549, + "step": 13929 + }, + { + "epoch": 1.6252479290631197, + "grad_norm": 1.3089807033538818, + "learning_rate": 0.00022290390911020543, + "loss": 2.025, + "step": 13930 + }, + { + "epoch": 1.6253646015634113, + "grad_norm": 1.0035998821258545, + "learning_rate": 0.00022289065138236396, + "loss": 2.0087, + "step": 13931 + }, + { + "epoch": 1.625481274063703, + "grad_norm": 1.1170858144760132, + "learning_rate": 0.00022287739291442326, + "loss": 2.1195, + "step": 13932 + }, + { + "epoch": 1.6255979465639947, + "grad_norm": 1.2143573760986328, + "learning_rate": 0.0002228641337065207, + "loss": 2.0259, + "step": 13933 + }, + { + "epoch": 1.6257146190642864, + "grad_norm": 0.9577794671058655, + "learning_rate": 0.0002228508737587938, + "loss": 1.8841, + "step": 13934 + }, + { + "epoch": 1.625831291564578, + "grad_norm": 1.0242855548858643, + "learning_rate": 0.00022283761307137988, + "loss": 2.0943, + "step": 13935 + }, + { + "epoch": 1.6259479640648697, + "grad_norm": 1.0804824829101562, + "learning_rate": 0.00022282435164441657, + "loss": 1.8236, + "step": 13936 + }, + { + "epoch": 1.6260646365651614, + "grad_norm": 1.1270205974578857, + "learning_rate": 0.0002228110894780413, + "loss": 2.0727, + "step": 13937 + }, + { + "epoch": 1.626181309065453, + "grad_norm": 1.1359785795211792, + "learning_rate": 0.00022279782657239153, + "loss": 2.0181, + "step": 13938 + }, + { + "epoch": 1.6262979815657448, + "grad_norm": 1.3478059768676758, + "learning_rate": 0.00022278456292760474, + "loss": 2.2537, + "step": 13939 + }, + { + "epoch": 1.6264146540660365, + "grad_norm": 1.1321430206298828, + "learning_rate": 0.00022277129854381854, + "loss": 2.1085, + "step": 13940 + }, + { + "epoch": 1.6265313265663282, + "grad_norm": 0.9713317155838013, + "learning_rate": 0.00022275803342117036, + "loss": 1.7072, + "step": 13941 + }, + { + "epoch": 1.6266479990666198, + "grad_norm": 1.0202198028564453, + "learning_rate": 0.00022274476755979768, + "loss": 2.0038, + "step": 13942 + }, + { + "epoch": 1.6267646715669115, + "grad_norm": 1.0850439071655273, + "learning_rate": 0.00022273150095983808, + "loss": 2.0598, + "step": 13943 + }, + { + "epoch": 1.6268813440672032, + "grad_norm": 1.1648060083389282, + "learning_rate": 0.0002227182336214291, + "loss": 1.9815, + "step": 13944 + }, + { + "epoch": 1.6269980165674949, + "grad_norm": 1.2643611431121826, + "learning_rate": 0.0002227049655447083, + "loss": 2.1671, + "step": 13945 + }, + { + "epoch": 1.6271146890677866, + "grad_norm": 1.0551596879959106, + "learning_rate": 0.0002226916967298131, + "loss": 1.9683, + "step": 13946 + }, + { + "epoch": 1.6272313615680782, + "grad_norm": 1.111077070236206, + "learning_rate": 0.00022267842717688121, + "loss": 2.0362, + "step": 13947 + }, + { + "epoch": 1.62734803406837, + "grad_norm": 1.0209707021713257, + "learning_rate": 0.00022266515688605011, + "loss": 2.0551, + "step": 13948 + }, + { + "epoch": 1.6274647065686616, + "grad_norm": 1.1118453741073608, + "learning_rate": 0.00022265188585745744, + "loss": 1.9565, + "step": 13949 + }, + { + "epoch": 1.6275813790689533, + "grad_norm": 1.0555142164230347, + "learning_rate": 0.00022263861409124069, + "loss": 2.0713, + "step": 13950 + }, + { + "epoch": 1.627698051569245, + "grad_norm": 1.348197102546692, + "learning_rate": 0.00022262534158753756, + "loss": 2.1374, + "step": 13951 + }, + { + "epoch": 1.6278147240695366, + "grad_norm": 1.1352789402008057, + "learning_rate": 0.0002226120683464855, + "loss": 2.0294, + "step": 13952 + }, + { + "epoch": 1.6279313965698283, + "grad_norm": 0.9988720417022705, + "learning_rate": 0.00022259879436822223, + "loss": 1.8652, + "step": 13953 + }, + { + "epoch": 1.62804806907012, + "grad_norm": 1.0593010187149048, + "learning_rate": 0.0002225855196528853, + "loss": 2.0896, + "step": 13954 + }, + { + "epoch": 1.6281647415704117, + "grad_norm": 1.2554821968078613, + "learning_rate": 0.00022257224420061235, + "loss": 1.9461, + "step": 13955 + }, + { + "epoch": 1.6282814140707034, + "grad_norm": 1.06217622756958, + "learning_rate": 0.00022255896801154102, + "loss": 2.0521, + "step": 13956 + }, + { + "epoch": 1.628398086570995, + "grad_norm": 1.4173575639724731, + "learning_rate": 0.00022254569108580893, + "loss": 2.1933, + "step": 13957 + }, + { + "epoch": 1.6285147590712867, + "grad_norm": 1.330931305885315, + "learning_rate": 0.0002225324134235537, + "loss": 2.2194, + "step": 13958 + }, + { + "epoch": 1.6286314315715784, + "grad_norm": 1.0809706449508667, + "learning_rate": 0.000222519135024913, + "loss": 2.0118, + "step": 13959 + }, + { + "epoch": 1.62874810407187, + "grad_norm": 1.2411408424377441, + "learning_rate": 0.0002225058558900245, + "loss": 2.0301, + "step": 13960 + }, + { + "epoch": 1.6288647765721618, + "grad_norm": 1.1540412902832031, + "learning_rate": 0.00022249257601902582, + "loss": 2.0081, + "step": 13961 + }, + { + "epoch": 1.6289814490724535, + "grad_norm": 1.3489255905151367, + "learning_rate": 0.00022247929541205466, + "loss": 2.0833, + "step": 13962 + }, + { + "epoch": 1.6290981215727451, + "grad_norm": 1.1489694118499756, + "learning_rate": 0.00022246601406924866, + "loss": 1.9296, + "step": 13963 + }, + { + "epoch": 1.6292147940730368, + "grad_norm": 1.0812461376190186, + "learning_rate": 0.00022245273199074557, + "loss": 2.0408, + "step": 13964 + }, + { + "epoch": 1.6293314665733285, + "grad_norm": 1.207362413406372, + "learning_rate": 0.00022243944917668305, + "loss": 2.1188, + "step": 13965 + }, + { + "epoch": 1.6294481390736202, + "grad_norm": 1.2229670286178589, + "learning_rate": 0.0002224261656271988, + "loss": 2.1626, + "step": 13966 + }, + { + "epoch": 1.6295648115739119, + "grad_norm": 1.321239709854126, + "learning_rate": 0.00022241288134243055, + "loss": 2.1632, + "step": 13967 + }, + { + "epoch": 1.6296814840742035, + "grad_norm": 1.0888240337371826, + "learning_rate": 0.00022239959632251597, + "loss": 1.9504, + "step": 13968 + }, + { + "epoch": 1.6297981565744952, + "grad_norm": 1.0746233463287354, + "learning_rate": 0.00022238631056759286, + "loss": 1.9461, + "step": 13969 + }, + { + "epoch": 1.629914829074787, + "grad_norm": 1.335167646408081, + "learning_rate": 0.00022237302407779888, + "loss": 2.1722, + "step": 13970 + }, + { + "epoch": 1.6300315015750786, + "grad_norm": 1.02924382686615, + "learning_rate": 0.00022235973685327178, + "loss": 1.8436, + "step": 13971 + }, + { + "epoch": 1.6301481740753703, + "grad_norm": 1.2098807096481323, + "learning_rate": 0.00022234644889414934, + "loss": 2.1554, + "step": 13972 + }, + { + "epoch": 1.630264846575662, + "grad_norm": 1.0671719312667847, + "learning_rate": 0.00022233316020056927, + "loss": 2.0838, + "step": 13973 + }, + { + "epoch": 1.6303815190759536, + "grad_norm": 1.1347662210464478, + "learning_rate": 0.0002223198707726694, + "loss": 1.9897, + "step": 13974 + }, + { + "epoch": 1.6304981915762453, + "grad_norm": 1.032935380935669, + "learning_rate": 0.00022230658061058746, + "loss": 1.9792, + "step": 13975 + }, + { + "epoch": 1.630614864076537, + "grad_norm": 1.032335877418518, + "learning_rate": 0.00022229328971446122, + "loss": 2.1051, + "step": 13976 + }, + { + "epoch": 1.6307315365768287, + "grad_norm": 1.3206219673156738, + "learning_rate": 0.0002222799980844285, + "loss": 1.9729, + "step": 13977 + }, + { + "epoch": 1.6308482090771204, + "grad_norm": 1.1187647581100464, + "learning_rate": 0.000222266705720627, + "loss": 2.1783, + "step": 13978 + }, + { + "epoch": 1.630964881577412, + "grad_norm": 1.1305696964263916, + "learning_rate": 0.00022225341262319465, + "loss": 2.1052, + "step": 13979 + }, + { + "epoch": 1.6310815540777037, + "grad_norm": 1.377785325050354, + "learning_rate": 0.00022224011879226914, + "loss": 2.2042, + "step": 13980 + }, + { + "epoch": 1.6311982265779954, + "grad_norm": 1.2046977281570435, + "learning_rate": 0.0002222268242279884, + "loss": 2.031, + "step": 13981 + }, + { + "epoch": 1.631314899078287, + "grad_norm": 1.092545986175537, + "learning_rate": 0.00022221352893049017, + "loss": 2.1704, + "step": 13982 + }, + { + "epoch": 1.6314315715785788, + "grad_norm": 0.9917227029800415, + "learning_rate": 0.00022220023289991228, + "loss": 1.9864, + "step": 13983 + }, + { + "epoch": 1.6315482440788704, + "grad_norm": 1.1537553071975708, + "learning_rate": 0.00022218693613639264, + "loss": 2.1846, + "step": 13984 + }, + { + "epoch": 1.6316649165791621, + "grad_norm": 1.2428812980651855, + "learning_rate": 0.00022217363864006904, + "loss": 2.1422, + "step": 13985 + }, + { + "epoch": 1.6317815890794538, + "grad_norm": 1.2754558324813843, + "learning_rate": 0.00022216034041107934, + "loss": 2.0455, + "step": 13986 + }, + { + "epoch": 1.6318982615797455, + "grad_norm": 1.3027820587158203, + "learning_rate": 0.00022214704144956138, + "loss": 1.94, + "step": 13987 + }, + { + "epoch": 1.6320149340800372, + "grad_norm": 1.2510225772857666, + "learning_rate": 0.00022213374175565308, + "loss": 1.9079, + "step": 13988 + }, + { + "epoch": 1.6321316065803289, + "grad_norm": 1.153568983078003, + "learning_rate": 0.00022212044132949231, + "loss": 2.0574, + "step": 13989 + }, + { + "epoch": 1.6322482790806205, + "grad_norm": 1.1222507953643799, + "learning_rate": 0.00022210714017121687, + "loss": 1.8543, + "step": 13990 + }, + { + "epoch": 1.6323649515809122, + "grad_norm": 1.1925493478775024, + "learning_rate": 0.00022209383828096472, + "loss": 2.0514, + "step": 13991 + }, + { + "epoch": 1.632481624081204, + "grad_norm": 0.9643850922584534, + "learning_rate": 0.0002220805356588738, + "loss": 1.8329, + "step": 13992 + }, + { + "epoch": 1.6325982965814956, + "grad_norm": 1.0790561437606812, + "learning_rate": 0.00022206723230508202, + "loss": 2.0516, + "step": 13993 + }, + { + "epoch": 1.6327149690817873, + "grad_norm": 1.1109663248062134, + "learning_rate": 0.00022205392821972715, + "loss": 2.2148, + "step": 13994 + }, + { + "epoch": 1.632831641582079, + "grad_norm": 1.1726194620132446, + "learning_rate": 0.00022204062340294727, + "loss": 2.1669, + "step": 13995 + }, + { + "epoch": 1.6329483140823706, + "grad_norm": 1.0762665271759033, + "learning_rate": 0.00022202731785488024, + "loss": 2.0911, + "step": 13996 + }, + { + "epoch": 1.6330649865826623, + "grad_norm": 1.1229698657989502, + "learning_rate": 0.00022201401157566396, + "loss": 2.1069, + "step": 13997 + }, + { + "epoch": 1.633181659082954, + "grad_norm": 1.1267472505569458, + "learning_rate": 0.00022200070456543646, + "loss": 1.9241, + "step": 13998 + }, + { + "epoch": 1.6332983315832457, + "grad_norm": 1.1004347801208496, + "learning_rate": 0.00022198739682433564, + "loss": 2.0738, + "step": 13999 + }, + { + "epoch": 1.6334150040835373, + "grad_norm": 1.2352406978607178, + "learning_rate": 0.0002219740883524995, + "loss": 2.2124, + "step": 14000 + }, + { + "epoch": 1.633531676583829, + "grad_norm": 1.0130119323730469, + "learning_rate": 0.00022196077915006594, + "loss": 2.0227, + "step": 14001 + }, + { + "epoch": 1.6336483490841207, + "grad_norm": 1.3906069993972778, + "learning_rate": 0.00022194746921717302, + "loss": 2.25, + "step": 14002 + }, + { + "epoch": 1.6337650215844124, + "grad_norm": 1.2288589477539062, + "learning_rate": 0.00022193415855395867, + "loss": 2.1553, + "step": 14003 + }, + { + "epoch": 1.633881694084704, + "grad_norm": 1.167365550994873, + "learning_rate": 0.00022192084716056085, + "loss": 2.2656, + "step": 14004 + }, + { + "epoch": 1.6339983665849958, + "grad_norm": 1.0966194868087769, + "learning_rate": 0.0002219075350371176, + "loss": 2.097, + "step": 14005 + }, + { + "epoch": 1.6341150390852874, + "grad_norm": 1.148797631263733, + "learning_rate": 0.00022189422218376698, + "loss": 2.1922, + "step": 14006 + }, + { + "epoch": 1.6342317115855791, + "grad_norm": 0.9666908383369446, + "learning_rate": 0.00022188090860064688, + "loss": 2.0169, + "step": 14007 + }, + { + "epoch": 1.6343483840858708, + "grad_norm": 1.1701114177703857, + "learning_rate": 0.00022186759428789545, + "loss": 2.1575, + "step": 14008 + }, + { + "epoch": 1.6344650565861625, + "grad_norm": 1.2103915214538574, + "learning_rate": 0.0002218542792456506, + "loss": 1.9909, + "step": 14009 + }, + { + "epoch": 1.6345817290864542, + "grad_norm": 1.1210646629333496, + "learning_rate": 0.00022184096347405044, + "loss": 2.1611, + "step": 14010 + }, + { + "epoch": 1.6346984015867458, + "grad_norm": 1.0099393129348755, + "learning_rate": 0.000221827646973233, + "loss": 2.0149, + "step": 14011 + }, + { + "epoch": 1.6348150740870375, + "grad_norm": 1.2372877597808838, + "learning_rate": 0.00022181432974333632, + "loss": 2.0955, + "step": 14012 + }, + { + "epoch": 1.6349317465873292, + "grad_norm": 1.1370658874511719, + "learning_rate": 0.0002218010117844984, + "loss": 2.2131, + "step": 14013 + }, + { + "epoch": 1.6350484190876209, + "grad_norm": 1.1491286754608154, + "learning_rate": 0.00022178769309685742, + "loss": 2.0999, + "step": 14014 + }, + { + "epoch": 1.6351650915879126, + "grad_norm": 1.0527164936065674, + "learning_rate": 0.00022177437368055142, + "loss": 2.0866, + "step": 14015 + }, + { + "epoch": 1.6352817640882042, + "grad_norm": 1.1851574182510376, + "learning_rate": 0.00022176105353571843, + "loss": 1.9904, + "step": 14016 + }, + { + "epoch": 1.635398436588496, + "grad_norm": 1.2648299932479858, + "learning_rate": 0.00022174773266249657, + "loss": 2.0726, + "step": 14017 + }, + { + "epoch": 1.6355151090887876, + "grad_norm": 1.2834593057632446, + "learning_rate": 0.00022173441106102397, + "loss": 2.058, + "step": 14018 + }, + { + "epoch": 1.6356317815890793, + "grad_norm": 1.245243787765503, + "learning_rate": 0.00022172108873143867, + "loss": 2.2618, + "step": 14019 + }, + { + "epoch": 1.635748454089371, + "grad_norm": 1.1708868741989136, + "learning_rate": 0.0002217077656738788, + "loss": 2.0278, + "step": 14020 + }, + { + "epoch": 1.6358651265896627, + "grad_norm": 1.2249910831451416, + "learning_rate": 0.00022169444188848252, + "loss": 2.1137, + "step": 14021 + }, + { + "epoch": 1.6359817990899543, + "grad_norm": 1.1299209594726562, + "learning_rate": 0.00022168111737538789, + "loss": 2.1256, + "step": 14022 + }, + { + "epoch": 1.636098471590246, + "grad_norm": 1.208276629447937, + "learning_rate": 0.00022166779213473312, + "loss": 2.0755, + "step": 14023 + }, + { + "epoch": 1.6362151440905377, + "grad_norm": 1.1406737565994263, + "learning_rate": 0.00022165446616665628, + "loss": 2.0978, + "step": 14024 + }, + { + "epoch": 1.6363318165908294, + "grad_norm": 1.0157908201217651, + "learning_rate": 0.00022164113947129556, + "loss": 2.0106, + "step": 14025 + }, + { + "epoch": 1.636448489091121, + "grad_norm": 1.1939116716384888, + "learning_rate": 0.00022162781204878912, + "loss": 1.9608, + "step": 14026 + }, + { + "epoch": 1.6365651615914127, + "grad_norm": 1.3078280687332153, + "learning_rate": 0.00022161448389927508, + "loss": 2.2205, + "step": 14027 + }, + { + "epoch": 1.6366818340917044, + "grad_norm": 1.134570598602295, + "learning_rate": 0.0002216011550228917, + "loss": 2.0904, + "step": 14028 + }, + { + "epoch": 1.636798506591996, + "grad_norm": 1.4544239044189453, + "learning_rate": 0.00022158782541977702, + "loss": 2.2018, + "step": 14029 + }, + { + "epoch": 1.6369151790922878, + "grad_norm": 1.4014500379562378, + "learning_rate": 0.00022157449509006935, + "loss": 1.9672, + "step": 14030 + }, + { + "epoch": 1.6370318515925795, + "grad_norm": 1.1442551612854004, + "learning_rate": 0.00022156116403390682, + "loss": 2.0399, + "step": 14031 + }, + { + "epoch": 1.6371485240928711, + "grad_norm": 1.3735381364822388, + "learning_rate": 0.00022154783225142768, + "loss": 2.2431, + "step": 14032 + }, + { + "epoch": 1.6372651965931628, + "grad_norm": 1.2165106534957886, + "learning_rate": 0.0002215344997427701, + "loss": 2.009, + "step": 14033 + }, + { + "epoch": 1.6373818690934545, + "grad_norm": 1.0865275859832764, + "learning_rate": 0.0002215211665080723, + "loss": 2.1195, + "step": 14034 + }, + { + "epoch": 1.6374985415937462, + "grad_norm": 1.285845160484314, + "learning_rate": 0.00022150783254747255, + "loss": 2.0169, + "step": 14035 + }, + { + "epoch": 1.6376152140940379, + "grad_norm": 1.1083264350891113, + "learning_rate": 0.00022149449786110903, + "loss": 2.0993, + "step": 14036 + }, + { + "epoch": 1.6377318865943296, + "grad_norm": 1.0880403518676758, + "learning_rate": 0.00022148116244912, + "loss": 2.0997, + "step": 14037 + }, + { + "epoch": 1.6378485590946212, + "grad_norm": 1.3383105993270874, + "learning_rate": 0.0002214678263116437, + "loss": 1.9802, + "step": 14038 + }, + { + "epoch": 1.637965231594913, + "grad_norm": 1.1031984090805054, + "learning_rate": 0.00022145448944881833, + "loss": 2.0376, + "step": 14039 + }, + { + "epoch": 1.6380819040952046, + "grad_norm": 1.3679225444793701, + "learning_rate": 0.0002214411518607823, + "loss": 2.1478, + "step": 14040 + }, + { + "epoch": 1.6381985765954963, + "grad_norm": 1.2210708856582642, + "learning_rate": 0.0002214278135476737, + "loss": 1.8485, + "step": 14041 + }, + { + "epoch": 1.638315249095788, + "grad_norm": 1.1919976472854614, + "learning_rate": 0.00022141447450963093, + "loss": 2.1728, + "step": 14042 + }, + { + "epoch": 1.6384319215960796, + "grad_norm": 1.185835599899292, + "learning_rate": 0.00022140113474679223, + "loss": 2.005, + "step": 14043 + }, + { + "epoch": 1.6385485940963713, + "grad_norm": 1.208877682685852, + "learning_rate": 0.0002213877942592959, + "loss": 2.0949, + "step": 14044 + }, + { + "epoch": 1.638665266596663, + "grad_norm": 1.0729039907455444, + "learning_rate": 0.00022137445304728026, + "loss": 2.0819, + "step": 14045 + }, + { + "epoch": 1.6387819390969547, + "grad_norm": 1.1332836151123047, + "learning_rate": 0.0002213611111108836, + "loss": 2.0092, + "step": 14046 + }, + { + "epoch": 1.6388986115972464, + "grad_norm": 1.2558276653289795, + "learning_rate": 0.0002213477684502442, + "loss": 1.9791, + "step": 14047 + }, + { + "epoch": 1.639015284097538, + "grad_norm": 1.38021719455719, + "learning_rate": 0.0002213344250655004, + "loss": 2.1507, + "step": 14048 + }, + { + "epoch": 1.6391319565978297, + "grad_norm": 1.0861958265304565, + "learning_rate": 0.00022132108095679055, + "loss": 2.1427, + "step": 14049 + }, + { + "epoch": 1.6392486290981214, + "grad_norm": 1.1987501382827759, + "learning_rate": 0.000221307736124253, + "loss": 1.9575, + "step": 14050 + }, + { + "epoch": 1.639365301598413, + "grad_norm": 1.3418207168579102, + "learning_rate": 0.0002212943905680261, + "loss": 2.2547, + "step": 14051 + }, + { + "epoch": 1.6394819740987048, + "grad_norm": 1.146583914756775, + "learning_rate": 0.0002212810442882481, + "loss": 2.1581, + "step": 14052 + }, + { + "epoch": 1.6395986465989965, + "grad_norm": 1.0391709804534912, + "learning_rate": 0.0002212676972850575, + "loss": 2.1266, + "step": 14053 + }, + { + "epoch": 1.6397153190992881, + "grad_norm": 1.1560059785842896, + "learning_rate": 0.00022125434955859262, + "loss": 2.0121, + "step": 14054 + }, + { + "epoch": 1.6398319915995798, + "grad_norm": 1.2284951210021973, + "learning_rate": 0.00022124100110899176, + "loss": 2.1526, + "step": 14055 + }, + { + "epoch": 1.6399486640998715, + "grad_norm": 1.150354266166687, + "learning_rate": 0.00022122765193639338, + "loss": 2.1568, + "step": 14056 + }, + { + "epoch": 1.6400653366001632, + "grad_norm": 1.0886470079421997, + "learning_rate": 0.0002212143020409358, + "loss": 2.1441, + "step": 14057 + }, + { + "epoch": 1.6401820091004549, + "grad_norm": 1.117344856262207, + "learning_rate": 0.0002212009514227575, + "loss": 1.9662, + "step": 14058 + }, + { + "epoch": 1.6402986816007465, + "grad_norm": 1.1256524324417114, + "learning_rate": 0.00022118760008199686, + "loss": 2.083, + "step": 14059 + }, + { + "epoch": 1.6404153541010382, + "grad_norm": 1.2437959909439087, + "learning_rate": 0.0002211742480187923, + "loss": 2.0426, + "step": 14060 + }, + { + "epoch": 1.64053202660133, + "grad_norm": 0.9597336649894714, + "learning_rate": 0.00022116089523328218, + "loss": 2.0055, + "step": 14061 + }, + { + "epoch": 1.6406486991016216, + "grad_norm": 1.2073384523391724, + "learning_rate": 0.00022114754172560504, + "loss": 2.221, + "step": 14062 + }, + { + "epoch": 1.6407653716019133, + "grad_norm": 1.226536750793457, + "learning_rate": 0.00022113418749589918, + "loss": 2.0833, + "step": 14063 + }, + { + "epoch": 1.640882044102205, + "grad_norm": 1.1412134170532227, + "learning_rate": 0.0002211208325443031, + "loss": 1.9727, + "step": 14064 + }, + { + "epoch": 1.6409987166024966, + "grad_norm": 1.24349045753479, + "learning_rate": 0.00022110747687095525, + "loss": 2.1677, + "step": 14065 + }, + { + "epoch": 1.6411153891027883, + "grad_norm": 1.131854772567749, + "learning_rate": 0.0002210941204759941, + "loss": 2.1417, + "step": 14066 + }, + { + "epoch": 1.64123206160308, + "grad_norm": 1.2304167747497559, + "learning_rate": 0.00022108076335955812, + "loss": 2.2747, + "step": 14067 + }, + { + "epoch": 1.6413487341033717, + "grad_norm": 0.9782321453094482, + "learning_rate": 0.00022106740552178575, + "loss": 2.0223, + "step": 14068 + }, + { + "epoch": 1.6414654066036634, + "grad_norm": 1.1912949085235596, + "learning_rate": 0.0002210540469628155, + "loss": 2.0862, + "step": 14069 + }, + { + "epoch": 1.641582079103955, + "grad_norm": 1.199400544166565, + "learning_rate": 0.0002210406876827858, + "loss": 2.0873, + "step": 14070 + }, + { + "epoch": 1.6416987516042467, + "grad_norm": 1.1456031799316406, + "learning_rate": 0.0002210273276818352, + "loss": 2.025, + "step": 14071 + }, + { + "epoch": 1.6418154241045384, + "grad_norm": 2.3586084842681885, + "learning_rate": 0.0002210139669601022, + "loss": 2.1405, + "step": 14072 + }, + { + "epoch": 1.64193209660483, + "grad_norm": 1.1606321334838867, + "learning_rate": 0.00022100060551772532, + "loss": 2.0215, + "step": 14073 + }, + { + "epoch": 1.6420487691051218, + "grad_norm": 1.0252898931503296, + "learning_rate": 0.00022098724335484302, + "loss": 2.1339, + "step": 14074 + }, + { + "epoch": 1.6421654416054134, + "grad_norm": 1.2804559469223022, + "learning_rate": 0.00022097388047159387, + "loss": 1.9209, + "step": 14075 + }, + { + "epoch": 1.6422821141057051, + "grad_norm": 1.1506110429763794, + "learning_rate": 0.0002209605168681164, + "loss": 2.1387, + "step": 14076 + }, + { + "epoch": 1.6423987866059968, + "grad_norm": 1.1127270460128784, + "learning_rate": 0.00022094715254454913, + "loss": 2.1257, + "step": 14077 + }, + { + "epoch": 1.6425154591062885, + "grad_norm": 1.120223879814148, + "learning_rate": 0.0002209337875010306, + "loss": 2.1952, + "step": 14078 + }, + { + "epoch": 1.6426321316065802, + "grad_norm": 1.000333547592163, + "learning_rate": 0.0002209204217376994, + "loss": 1.8988, + "step": 14079 + }, + { + "epoch": 1.6427488041068719, + "grad_norm": 1.0822279453277588, + "learning_rate": 0.00022090705525469405, + "loss": 2.0413, + "step": 14080 + }, + { + "epoch": 1.6428654766071635, + "grad_norm": 1.0320258140563965, + "learning_rate": 0.00022089368805215318, + "loss": 1.903, + "step": 14081 + }, + { + "epoch": 1.6429821491074552, + "grad_norm": 1.1389715671539307, + "learning_rate": 0.0002208803201302153, + "loss": 2.0804, + "step": 14082 + }, + { + "epoch": 1.643098821607747, + "grad_norm": 1.4687162637710571, + "learning_rate": 0.00022086695148901899, + "loss": 2.2264, + "step": 14083 + }, + { + "epoch": 1.6432154941080386, + "grad_norm": 1.0391337871551514, + "learning_rate": 0.0002208535821287029, + "loss": 1.829, + "step": 14084 + }, + { + "epoch": 1.6433321666083303, + "grad_norm": 1.1618496179580688, + "learning_rate": 0.00022084021204940557, + "loss": 2.0411, + "step": 14085 + }, + { + "epoch": 1.643448839108622, + "grad_norm": 1.137627363204956, + "learning_rate": 0.00022082684125126567, + "loss": 2.1307, + "step": 14086 + }, + { + "epoch": 1.6435655116089136, + "grad_norm": 1.2886836528778076, + "learning_rate": 0.00022081346973442178, + "loss": 2.0771, + "step": 14087 + }, + { + "epoch": 1.6436821841092053, + "grad_norm": 1.223515510559082, + "learning_rate": 0.00022080009749901255, + "loss": 2.0037, + "step": 14088 + }, + { + "epoch": 1.643798856609497, + "grad_norm": 1.1048527956008911, + "learning_rate": 0.00022078672454517654, + "loss": 2.0579, + "step": 14089 + }, + { + "epoch": 1.6439155291097887, + "grad_norm": 1.0900564193725586, + "learning_rate": 0.0002207733508730524, + "loss": 2.2196, + "step": 14090 + }, + { + "epoch": 1.6440322016100803, + "grad_norm": 0.9657742381095886, + "learning_rate": 0.00022075997648277884, + "loss": 2.0314, + "step": 14091 + }, + { + "epoch": 1.644148874110372, + "grad_norm": 1.111646056175232, + "learning_rate": 0.00022074660137449444, + "loss": 1.9911, + "step": 14092 + }, + { + "epoch": 1.6442655466106637, + "grad_norm": 1.1386215686798096, + "learning_rate": 0.0002207332255483379, + "loss": 2.2037, + "step": 14093 + }, + { + "epoch": 1.6443822191109554, + "grad_norm": 1.089468002319336, + "learning_rate": 0.00022071984900444788, + "loss": 2.0389, + "step": 14094 + }, + { + "epoch": 1.644498891611247, + "grad_norm": 1.2369866371154785, + "learning_rate": 0.000220706471742963, + "loss": 1.9671, + "step": 14095 + }, + { + "epoch": 1.6446155641115388, + "grad_norm": 1.1315057277679443, + "learning_rate": 0.0002206930937640221, + "loss": 2.0304, + "step": 14096 + }, + { + "epoch": 1.6447322366118304, + "grad_norm": 1.3821427822113037, + "learning_rate": 0.00022067971506776365, + "loss": 2.2946, + "step": 14097 + }, + { + "epoch": 1.6448489091121221, + "grad_norm": 1.265270709991455, + "learning_rate": 0.0002206663356543265, + "loss": 2.2416, + "step": 14098 + }, + { + "epoch": 1.6449655816124138, + "grad_norm": 1.1159676313400269, + "learning_rate": 0.00022065295552384928, + "loss": 2.0828, + "step": 14099 + }, + { + "epoch": 1.6450822541127055, + "grad_norm": 1.0342003107070923, + "learning_rate": 0.0002206395746764707, + "loss": 1.8708, + "step": 14100 + }, + { + "epoch": 1.6451989266129972, + "grad_norm": 1.1372281312942505, + "learning_rate": 0.00022062619311232957, + "loss": 2.025, + "step": 14101 + }, + { + "epoch": 1.6453155991132888, + "grad_norm": 1.0740832090377808, + "learning_rate": 0.0002206128108315645, + "loss": 2.0202, + "step": 14102 + }, + { + "epoch": 1.6454322716135805, + "grad_norm": 1.0278323888778687, + "learning_rate": 0.00022059942783431426, + "loss": 2.0747, + "step": 14103 + }, + { + "epoch": 1.6455489441138722, + "grad_norm": 0.9820451140403748, + "learning_rate": 0.00022058604412071765, + "loss": 1.987, + "step": 14104 + }, + { + "epoch": 1.6456656166141639, + "grad_norm": 1.0403107404708862, + "learning_rate": 0.00022057265969091335, + "loss": 2.0868, + "step": 14105 + }, + { + "epoch": 1.6457822891144556, + "grad_norm": 1.2518397569656372, + "learning_rate": 0.00022055927454504013, + "loss": 2.039, + "step": 14106 + }, + { + "epoch": 1.6458989616147472, + "grad_norm": 0.9626978039741516, + "learning_rate": 0.00022054588868323675, + "loss": 1.9093, + "step": 14107 + }, + { + "epoch": 1.646015634115039, + "grad_norm": 1.0661438703536987, + "learning_rate": 0.00022053250210564199, + "loss": 1.8024, + "step": 14108 + }, + { + "epoch": 1.6461323066153306, + "grad_norm": 1.1292821168899536, + "learning_rate": 0.00022051911481239456, + "loss": 2.1042, + "step": 14109 + }, + { + "epoch": 1.6462489791156223, + "grad_norm": 1.0398061275482178, + "learning_rate": 0.00022050572680363338, + "loss": 2.1294, + "step": 14110 + }, + { + "epoch": 1.646365651615914, + "grad_norm": 1.0452296733856201, + "learning_rate": 0.00022049233807949713, + "loss": 1.9678, + "step": 14111 + }, + { + "epoch": 1.6464823241162057, + "grad_norm": 1.0985361337661743, + "learning_rate": 0.00022047894864012468, + "loss": 2.0545, + "step": 14112 + }, + { + "epoch": 1.6465989966164973, + "grad_norm": 1.3498820066452026, + "learning_rate": 0.00022046555848565481, + "loss": 2.1299, + "step": 14113 + }, + { + "epoch": 1.646715669116789, + "grad_norm": 1.2483075857162476, + "learning_rate": 0.0002204521676162263, + "loss": 2.0776, + "step": 14114 + }, + { + "epoch": 1.6468323416170807, + "grad_norm": 1.1343144178390503, + "learning_rate": 0.000220438776031978, + "loss": 2.1179, + "step": 14115 + }, + { + "epoch": 1.6469490141173724, + "grad_norm": 1.1815454959869385, + "learning_rate": 0.00022042538373304876, + "loss": 1.94, + "step": 14116 + }, + { + "epoch": 1.647065686617664, + "grad_norm": 1.1452780961990356, + "learning_rate": 0.00022041199071957737, + "loss": 2.1073, + "step": 14117 + }, + { + "epoch": 1.6471823591179557, + "grad_norm": 1.113983154296875, + "learning_rate": 0.00022039859699170275, + "loss": 2.1758, + "step": 14118 + }, + { + "epoch": 1.6472990316182474, + "grad_norm": 1.1741374731063843, + "learning_rate": 0.00022038520254956368, + "loss": 2.0134, + "step": 14119 + }, + { + "epoch": 1.647415704118539, + "grad_norm": 1.143011450767517, + "learning_rate": 0.00022037180739329902, + "loss": 2.1304, + "step": 14120 + }, + { + "epoch": 1.6475323766188308, + "grad_norm": 1.1529946327209473, + "learning_rate": 0.00022035841152304766, + "loss": 1.9428, + "step": 14121 + }, + { + "epoch": 1.6476490491191225, + "grad_norm": 1.2773958444595337, + "learning_rate": 0.00022034501493894848, + "loss": 2.0757, + "step": 14122 + }, + { + "epoch": 1.6477657216194141, + "grad_norm": 1.140628695487976, + "learning_rate": 0.00022033161764114034, + "loss": 2.1261, + "step": 14123 + }, + { + "epoch": 1.6478823941197058, + "grad_norm": 1.165804386138916, + "learning_rate": 0.00022031821962976216, + "loss": 2.1182, + "step": 14124 + }, + { + "epoch": 1.6479990666199975, + "grad_norm": 1.7188223600387573, + "learning_rate": 0.00022030482090495282, + "loss": 2.2625, + "step": 14125 + }, + { + "epoch": 1.6481157391202892, + "grad_norm": 1.1360251903533936, + "learning_rate": 0.00022029142146685115, + "loss": 2.0826, + "step": 14126 + }, + { + "epoch": 1.6482324116205809, + "grad_norm": 1.084973692893982, + "learning_rate": 0.0002202780213155962, + "loss": 2.0553, + "step": 14127 + }, + { + "epoch": 1.6483490841208726, + "grad_norm": 1.085761308670044, + "learning_rate": 0.00022026462045132684, + "loss": 1.9961, + "step": 14128 + }, + { + "epoch": 1.6484657566211642, + "grad_norm": 1.1154588460922241, + "learning_rate": 0.0002202512188741819, + "loss": 2.0864, + "step": 14129 + }, + { + "epoch": 1.648582429121456, + "grad_norm": 1.3318579196929932, + "learning_rate": 0.00022023781658430042, + "loss": 2.0475, + "step": 14130 + }, + { + "epoch": 1.6486991016217476, + "grad_norm": 1.1584748029708862, + "learning_rate": 0.00022022441358182136, + "loss": 1.8989, + "step": 14131 + }, + { + "epoch": 1.6488157741220393, + "grad_norm": 1.2061810493469238, + "learning_rate": 0.00022021100986688354, + "loss": 2.1405, + "step": 14132 + }, + { + "epoch": 1.648932446622331, + "grad_norm": 1.1012777090072632, + "learning_rate": 0.00022019760543962602, + "loss": 2.0022, + "step": 14133 + }, + { + "epoch": 1.6490491191226226, + "grad_norm": 1.140083909034729, + "learning_rate": 0.0002201842003001877, + "loss": 2.0777, + "step": 14134 + }, + { + "epoch": 1.6491657916229143, + "grad_norm": 1.1798985004425049, + "learning_rate": 0.00022017079444870756, + "loss": 1.9071, + "step": 14135 + }, + { + "epoch": 1.649282464123206, + "grad_norm": 1.2475852966308594, + "learning_rate": 0.00022015738788532465, + "loss": 2.0694, + "step": 14136 + }, + { + "epoch": 1.6493991366234977, + "grad_norm": 1.2024025917053223, + "learning_rate": 0.00022014398061017787, + "loss": 2.0831, + "step": 14137 + }, + { + "epoch": 1.6495158091237894, + "grad_norm": 1.0728240013122559, + "learning_rate": 0.00022013057262340623, + "loss": 1.9024, + "step": 14138 + }, + { + "epoch": 1.649632481624081, + "grad_norm": 1.1382521390914917, + "learning_rate": 0.00022011716392514883, + "loss": 2.0304, + "step": 14139 + }, + { + "epoch": 1.6497491541243727, + "grad_norm": 1.2274261713027954, + "learning_rate": 0.00022010375451554453, + "loss": 2.0324, + "step": 14140 + }, + { + "epoch": 1.6498658266246644, + "grad_norm": 1.1676374673843384, + "learning_rate": 0.00022009034439473237, + "loss": 2.0861, + "step": 14141 + }, + { + "epoch": 1.649982499124956, + "grad_norm": 1.2231849431991577, + "learning_rate": 0.00022007693356285147, + "loss": 2.1014, + "step": 14142 + }, + { + "epoch": 1.6500991716252478, + "grad_norm": 1.153775691986084, + "learning_rate": 0.00022006352202004074, + "loss": 2.0496, + "step": 14143 + }, + { + "epoch": 1.6502158441255395, + "grad_norm": 1.3688381910324097, + "learning_rate": 0.00022005010976643927, + "loss": 2.0888, + "step": 14144 + }, + { + "epoch": 1.6503325166258311, + "grad_norm": 1.1526179313659668, + "learning_rate": 0.0002200366968021861, + "loss": 2.2243, + "step": 14145 + }, + { + "epoch": 1.6504491891261228, + "grad_norm": 1.0898877382278442, + "learning_rate": 0.00022002328312742032, + "loss": 2.1632, + "step": 14146 + }, + { + "epoch": 1.6505658616264145, + "grad_norm": 1.1380480527877808, + "learning_rate": 0.00022000986874228093, + "loss": 2.0928, + "step": 14147 + }, + { + "epoch": 1.6506825341267062, + "grad_norm": 1.1596168279647827, + "learning_rate": 0.000219996453646907, + "loss": 2.1046, + "step": 14148 + }, + { + "epoch": 1.6507992066269979, + "grad_norm": 1.2409402132034302, + "learning_rate": 0.00021998303784143765, + "loss": 2.2902, + "step": 14149 + }, + { + "epoch": 1.6509158791272895, + "grad_norm": 1.1375696659088135, + "learning_rate": 0.00021996962132601192, + "loss": 2.1879, + "step": 14150 + }, + { + "epoch": 1.6510325516275812, + "grad_norm": 1.1141539812088013, + "learning_rate": 0.00021995620410076888, + "loss": 2.2557, + "step": 14151 + }, + { + "epoch": 1.651149224127873, + "grad_norm": 1.2248815298080444, + "learning_rate": 0.00021994278616584766, + "loss": 2.0851, + "step": 14152 + }, + { + "epoch": 1.6512658966281646, + "grad_norm": 1.0995861291885376, + "learning_rate": 0.0002199293675213874, + "loss": 2.2089, + "step": 14153 + }, + { + "epoch": 1.6513825691284563, + "grad_norm": 1.0057746171951294, + "learning_rate": 0.00021991594816752713, + "loss": 1.8889, + "step": 14154 + }, + { + "epoch": 1.651499241628748, + "grad_norm": 1.0055418014526367, + "learning_rate": 0.000219902528104406, + "loss": 1.9553, + "step": 14155 + }, + { + "epoch": 1.6516159141290396, + "grad_norm": 1.0822945833206177, + "learning_rate": 0.00021988910733216312, + "loss": 2.0617, + "step": 14156 + }, + { + "epoch": 1.6517325866293313, + "grad_norm": 1.2375155687332153, + "learning_rate": 0.0002198756858509377, + "loss": 2.2285, + "step": 14157 + }, + { + "epoch": 1.651849259129623, + "grad_norm": 1.072981357574463, + "learning_rate": 0.00021986226366086878, + "loss": 2.1318, + "step": 14158 + }, + { + "epoch": 1.6519659316299147, + "grad_norm": 1.11165452003479, + "learning_rate": 0.00021984884076209552, + "loss": 2.1057, + "step": 14159 + }, + { + "epoch": 1.6520826041302064, + "grad_norm": 1.1661722660064697, + "learning_rate": 0.00021983541715475712, + "loss": 2.0422, + "step": 14160 + }, + { + "epoch": 1.652199276630498, + "grad_norm": 1.1008845567703247, + "learning_rate": 0.00021982199283899267, + "loss": 2.0073, + "step": 14161 + }, + { + "epoch": 1.6523159491307897, + "grad_norm": 1.3257594108581543, + "learning_rate": 0.0002198085678149414, + "loss": 2.2624, + "step": 14162 + }, + { + "epoch": 1.6524326216310814, + "grad_norm": 1.2613261938095093, + "learning_rate": 0.0002197951420827425, + "loss": 2.1248, + "step": 14163 + }, + { + "epoch": 1.652549294131373, + "grad_norm": 1.0556446313858032, + "learning_rate": 0.00021978171564253512, + "loss": 2.0408, + "step": 14164 + }, + { + "epoch": 1.6526659666316648, + "grad_norm": 1.0601290464401245, + "learning_rate": 0.0002197682884944585, + "loss": 2.1476, + "step": 14165 + }, + { + "epoch": 1.6527826391319564, + "grad_norm": 1.1015877723693848, + "learning_rate": 0.00021975486063865172, + "loss": 2.0052, + "step": 14166 + }, + { + "epoch": 1.6528993116322481, + "grad_norm": 1.0588092803955078, + "learning_rate": 0.0002197414320752541, + "loss": 2.0332, + "step": 14167 + }, + { + "epoch": 1.6530159841325398, + "grad_norm": 1.1132317781448364, + "learning_rate": 0.00021972800280440483, + "loss": 2.0182, + "step": 14168 + }, + { + "epoch": 1.6531326566328315, + "grad_norm": 1.1015231609344482, + "learning_rate": 0.00021971457282624307, + "loss": 1.9912, + "step": 14169 + }, + { + "epoch": 1.6532493291331232, + "grad_norm": 1.0533839464187622, + "learning_rate": 0.00021970114214090808, + "loss": 1.8995, + "step": 14170 + }, + { + "epoch": 1.6533660016334149, + "grad_norm": 1.008480429649353, + "learning_rate": 0.00021968771074853914, + "loss": 1.9429, + "step": 14171 + }, + { + "epoch": 1.6534826741337065, + "grad_norm": 1.166344165802002, + "learning_rate": 0.00021967427864927542, + "loss": 2.0443, + "step": 14172 + }, + { + "epoch": 1.6535993466339982, + "grad_norm": 1.3442554473876953, + "learning_rate": 0.00021966084584325622, + "loss": 2.1409, + "step": 14173 + }, + { + "epoch": 1.65371601913429, + "grad_norm": 1.136348009109497, + "learning_rate": 0.00021964741233062077, + "loss": 2.2202, + "step": 14174 + }, + { + "epoch": 1.6538326916345816, + "grad_norm": 1.0287593603134155, + "learning_rate": 0.00021963397811150835, + "loss": 1.9643, + "step": 14175 + }, + { + "epoch": 1.6539493641348733, + "grad_norm": 1.2455943822860718, + "learning_rate": 0.00021962054318605821, + "loss": 2.0663, + "step": 14176 + }, + { + "epoch": 1.654066036635165, + "grad_norm": 1.15375554561615, + "learning_rate": 0.0002196071075544097, + "loss": 1.9875, + "step": 14177 + }, + { + "epoch": 1.6541827091354566, + "grad_norm": 1.1452155113220215, + "learning_rate": 0.00021959367121670198, + "loss": 2.0761, + "step": 14178 + }, + { + "epoch": 1.6542993816357483, + "grad_norm": 1.0102777481079102, + "learning_rate": 0.00021958023417307444, + "loss": 1.8986, + "step": 14179 + }, + { + "epoch": 1.65441605413604, + "grad_norm": 1.2099047899246216, + "learning_rate": 0.00021956679642366633, + "loss": 2.074, + "step": 14180 + }, + { + "epoch": 1.6545327266363317, + "grad_norm": 1.1797813177108765, + "learning_rate": 0.000219553357968617, + "loss": 1.9974, + "step": 14181 + }, + { + "epoch": 1.6546493991366233, + "grad_norm": 1.3897727727890015, + "learning_rate": 0.00021953991880806578, + "loss": 2.019, + "step": 14182 + }, + { + "epoch": 1.654766071636915, + "grad_norm": 1.0932832956314087, + "learning_rate": 0.00021952647894215194, + "loss": 1.9684, + "step": 14183 + }, + { + "epoch": 1.6548827441372067, + "grad_norm": 1.1995699405670166, + "learning_rate": 0.0002195130383710148, + "loss": 2.1101, + "step": 14184 + }, + { + "epoch": 1.6549994166374984, + "grad_norm": 1.2713487148284912, + "learning_rate": 0.00021949959709479373, + "loss": 2.0757, + "step": 14185 + }, + { + "epoch": 1.65511608913779, + "grad_norm": 1.111384391784668, + "learning_rate": 0.00021948615511362807, + "loss": 1.953, + "step": 14186 + }, + { + "epoch": 1.6552327616380818, + "grad_norm": 1.2331515550613403, + "learning_rate": 0.00021947271242765717, + "loss": 2.0386, + "step": 14187 + }, + { + "epoch": 1.6553494341383734, + "grad_norm": 1.28336763381958, + "learning_rate": 0.0002194592690370204, + "loss": 2.2112, + "step": 14188 + }, + { + "epoch": 1.6554661066386651, + "grad_norm": 1.161152958869934, + "learning_rate": 0.00021944582494185706, + "loss": 2.0181, + "step": 14189 + }, + { + "epoch": 1.6555827791389568, + "grad_norm": 1.1163231134414673, + "learning_rate": 0.00021943238014230667, + "loss": 1.9949, + "step": 14190 + }, + { + "epoch": 1.6556994516392485, + "grad_norm": 1.0908859968185425, + "learning_rate": 0.00021941893463850847, + "loss": 2.0941, + "step": 14191 + }, + { + "epoch": 1.6558161241395402, + "grad_norm": 1.0896004438400269, + "learning_rate": 0.0002194054884306019, + "loss": 2.1404, + "step": 14192 + }, + { + "epoch": 1.6559327966398318, + "grad_norm": 1.2192353010177612, + "learning_rate": 0.00021939204151872637, + "loss": 2.051, + "step": 14193 + }, + { + "epoch": 1.6560494691401235, + "grad_norm": 1.1105201244354248, + "learning_rate": 0.00021937859390302122, + "loss": 2.0153, + "step": 14194 + }, + { + "epoch": 1.6561661416404152, + "grad_norm": 1.2047449350357056, + "learning_rate": 0.00021936514558362595, + "loss": 2.0977, + "step": 14195 + }, + { + "epoch": 1.6562828141407069, + "grad_norm": 1.1898528337478638, + "learning_rate": 0.00021935169656067994, + "loss": 2.2477, + "step": 14196 + }, + { + "epoch": 1.6563994866409986, + "grad_norm": 1.239677906036377, + "learning_rate": 0.00021933824683432256, + "loss": 2.1147, + "step": 14197 + }, + { + "epoch": 1.6565161591412902, + "grad_norm": 1.2612154483795166, + "learning_rate": 0.00021932479640469332, + "loss": 1.9928, + "step": 14198 + }, + { + "epoch": 1.656632831641582, + "grad_norm": 1.112892746925354, + "learning_rate": 0.00021931134527193163, + "loss": 2.1341, + "step": 14199 + }, + { + "epoch": 1.6567495041418736, + "grad_norm": 1.152862787246704, + "learning_rate": 0.00021929789343617693, + "loss": 2.0265, + "step": 14200 + }, + { + "epoch": 1.6568661766421653, + "grad_norm": 1.1201775074005127, + "learning_rate": 0.00021928444089756865, + "loss": 2.0689, + "step": 14201 + }, + { + "epoch": 1.656982849142457, + "grad_norm": 1.0469201803207397, + "learning_rate": 0.0002192709876562463, + "loss": 1.9955, + "step": 14202 + }, + { + "epoch": 1.6570995216427487, + "grad_norm": 1.2734092473983765, + "learning_rate": 0.00021925753371234937, + "loss": 2.0644, + "step": 14203 + }, + { + "epoch": 1.6572161941430403, + "grad_norm": 1.0549980401992798, + "learning_rate": 0.00021924407906601725, + "loss": 1.9816, + "step": 14204 + }, + { + "epoch": 1.657332866643332, + "grad_norm": 1.0978814363479614, + "learning_rate": 0.00021923062371738945, + "loss": 2.0218, + "step": 14205 + }, + { + "epoch": 1.6574495391436237, + "grad_norm": 1.141951322555542, + "learning_rate": 0.00021921716766660552, + "loss": 2.1639, + "step": 14206 + }, + { + "epoch": 1.6575662116439154, + "grad_norm": 1.1877495050430298, + "learning_rate": 0.0002192037109138049, + "loss": 2.097, + "step": 14207 + }, + { + "epoch": 1.657682884144207, + "grad_norm": 1.2241249084472656, + "learning_rate": 0.00021919025345912713, + "loss": 2.2553, + "step": 14208 + }, + { + "epoch": 1.6577995566444987, + "grad_norm": 1.0862922668457031, + "learning_rate": 0.00021917679530271164, + "loss": 2.0918, + "step": 14209 + }, + { + "epoch": 1.6579162291447904, + "grad_norm": 1.209216833114624, + "learning_rate": 0.00021916333644469809, + "loss": 2.0768, + "step": 14210 + }, + { + "epoch": 1.658032901645082, + "grad_norm": 1.4103754758834839, + "learning_rate": 0.0002191498768852259, + "loss": 2.0623, + "step": 14211 + }, + { + "epoch": 1.6581495741453738, + "grad_norm": 1.3314317464828491, + "learning_rate": 0.0002191364166244346, + "loss": 2.2245, + "step": 14212 + }, + { + "epoch": 1.6582662466456655, + "grad_norm": 1.1147069931030273, + "learning_rate": 0.00021912295566246378, + "loss": 2.0993, + "step": 14213 + }, + { + "epoch": 1.6583829191459571, + "grad_norm": 1.2202568054199219, + "learning_rate": 0.0002191094939994529, + "loss": 2.0778, + "step": 14214 + }, + { + "epoch": 1.6584995916462488, + "grad_norm": 1.1897517442703247, + "learning_rate": 0.00021909603163554166, + "loss": 2.1498, + "step": 14215 + }, + { + "epoch": 1.6586162641465405, + "grad_norm": 1.3772982358932495, + "learning_rate": 0.00021908256857086956, + "loss": 2.1326, + "step": 14216 + }, + { + "epoch": 1.6587329366468322, + "grad_norm": 1.1842323541641235, + "learning_rate": 0.00021906910480557615, + "loss": 2.1893, + "step": 14217 + }, + { + "epoch": 1.6588496091471239, + "grad_norm": 1.1781796216964722, + "learning_rate": 0.00021905564033980102, + "loss": 2.1832, + "step": 14218 + }, + { + "epoch": 1.6589662816474156, + "grad_norm": 1.0558929443359375, + "learning_rate": 0.0002190421751736837, + "loss": 1.9585, + "step": 14219 + }, + { + "epoch": 1.6590829541477072, + "grad_norm": 1.1070054769515991, + "learning_rate": 0.00021902870930736387, + "loss": 2.064, + "step": 14220 + }, + { + "epoch": 1.659199626647999, + "grad_norm": 1.1155213117599487, + "learning_rate": 0.0002190152427409811, + "loss": 2.1719, + "step": 14221 + }, + { + "epoch": 1.6593162991482906, + "grad_norm": 1.2558631896972656, + "learning_rate": 0.00021900177547467498, + "loss": 2.295, + "step": 14222 + }, + { + "epoch": 1.6594329716485823, + "grad_norm": 1.0688557624816895, + "learning_rate": 0.00021898830750858512, + "loss": 1.9593, + "step": 14223 + }, + { + "epoch": 1.659549644148874, + "grad_norm": 1.0020148754119873, + "learning_rate": 0.00021897483884285115, + "loss": 1.8878, + "step": 14224 + }, + { + "epoch": 1.6596663166491656, + "grad_norm": 1.1203056573867798, + "learning_rate": 0.00021896136947761272, + "loss": 2.0698, + "step": 14225 + }, + { + "epoch": 1.6597829891494573, + "grad_norm": 1.2583158016204834, + "learning_rate": 0.00021894789941300947, + "loss": 2.0961, + "step": 14226 + }, + { + "epoch": 1.659899661649749, + "grad_norm": 1.048824667930603, + "learning_rate": 0.00021893442864918098, + "loss": 2.1415, + "step": 14227 + }, + { + "epoch": 1.6600163341500407, + "grad_norm": 1.2145743370056152, + "learning_rate": 0.00021892095718626693, + "loss": 2.232, + "step": 14228 + }, + { + "epoch": 1.6601330066503324, + "grad_norm": 1.2115687131881714, + "learning_rate": 0.00021890748502440704, + "loss": 2.0807, + "step": 14229 + }, + { + "epoch": 1.660249679150624, + "grad_norm": 1.0179568529129028, + "learning_rate": 0.00021889401216374085, + "loss": 2.1581, + "step": 14230 + }, + { + "epoch": 1.6603663516509157, + "grad_norm": 1.2054895162582397, + "learning_rate": 0.00021888053860440816, + "loss": 1.9459, + "step": 14231 + }, + { + "epoch": 1.6604830241512074, + "grad_norm": 1.260221242904663, + "learning_rate": 0.00021886706434654855, + "loss": 2.1341, + "step": 14232 + }, + { + "epoch": 1.660599696651499, + "grad_norm": 1.0633577108383179, + "learning_rate": 0.00021885358939030178, + "loss": 2.0358, + "step": 14233 + }, + { + "epoch": 1.6607163691517908, + "grad_norm": 1.0409231185913086, + "learning_rate": 0.00021884011373580754, + "loss": 1.8587, + "step": 14234 + }, + { + "epoch": 1.6608330416520825, + "grad_norm": 1.0725244283676147, + "learning_rate": 0.0002188266373832055, + "loss": 2.0045, + "step": 14235 + }, + { + "epoch": 1.6609497141523741, + "grad_norm": 1.207695484161377, + "learning_rate": 0.0002188131603326353, + "loss": 1.8869, + "step": 14236 + }, + { + "epoch": 1.6610663866526658, + "grad_norm": 1.1770765781402588, + "learning_rate": 0.00021879968258423676, + "loss": 2.1841, + "step": 14237 + }, + { + "epoch": 1.6611830591529575, + "grad_norm": 1.090320110321045, + "learning_rate": 0.00021878620413814958, + "loss": 1.9452, + "step": 14238 + }, + { + "epoch": 1.6612997316532492, + "grad_norm": 1.0244157314300537, + "learning_rate": 0.00021877272499451345, + "loss": 2.1261, + "step": 14239 + }, + { + "epoch": 1.6614164041535409, + "grad_norm": 1.211936354637146, + "learning_rate": 0.00021875924515346812, + "loss": 2.1432, + "step": 14240 + }, + { + "epoch": 1.6615330766538325, + "grad_norm": 1.0745136737823486, + "learning_rate": 0.0002187457646151534, + "loss": 1.9947, + "step": 14241 + }, + { + "epoch": 1.6616497491541242, + "grad_norm": 1.098189115524292, + "learning_rate": 0.00021873228337970896, + "loss": 2.1933, + "step": 14242 + }, + { + "epoch": 1.661766421654416, + "grad_norm": 1.1548364162445068, + "learning_rate": 0.00021871880144727458, + "loss": 2.0518, + "step": 14243 + }, + { + "epoch": 1.6618830941547076, + "grad_norm": 1.250321865081787, + "learning_rate": 0.00021870531881799005, + "loss": 2.1746, + "step": 14244 + }, + { + "epoch": 1.6619997666549993, + "grad_norm": 1.167453408241272, + "learning_rate": 0.00021869183549199514, + "loss": 2.2493, + "step": 14245 + }, + { + "epoch": 1.662116439155291, + "grad_norm": 1.2576954364776611, + "learning_rate": 0.00021867835146942958, + "loss": 2.1889, + "step": 14246 + }, + { + "epoch": 1.6622331116555826, + "grad_norm": 1.0132097005844116, + "learning_rate": 0.00021866486675043323, + "loss": 1.993, + "step": 14247 + }, + { + "epoch": 1.6623497841558743, + "grad_norm": 0.9717087149620056, + "learning_rate": 0.00021865138133514585, + "loss": 1.8655, + "step": 14248 + }, + { + "epoch": 1.662466456656166, + "grad_norm": 1.2018356323242188, + "learning_rate": 0.00021863789522370722, + "loss": 2.2933, + "step": 14249 + }, + { + "epoch": 1.6625831291564577, + "grad_norm": 1.0447605848312378, + "learning_rate": 0.00021862440841625717, + "loss": 1.9444, + "step": 14250 + }, + { + "epoch": 1.6626998016567494, + "grad_norm": 1.1973944902420044, + "learning_rate": 0.00021861092091293553, + "loss": 2.0956, + "step": 14251 + }, + { + "epoch": 1.662816474157041, + "grad_norm": 1.187045693397522, + "learning_rate": 0.00021859743271388215, + "loss": 1.9526, + "step": 14252 + }, + { + "epoch": 1.6629331466573327, + "grad_norm": 1.0825048685073853, + "learning_rate": 0.00021858394381923672, + "loss": 2.2624, + "step": 14253 + }, + { + "epoch": 1.6630498191576244, + "grad_norm": 1.2460484504699707, + "learning_rate": 0.00021857045422913924, + "loss": 2.075, + "step": 14254 + }, + { + "epoch": 1.663166491657916, + "grad_norm": 1.0478754043579102, + "learning_rate": 0.00021855696394372946, + "loss": 2.0824, + "step": 14255 + }, + { + "epoch": 1.6632831641582078, + "grad_norm": 1.2473808526992798, + "learning_rate": 0.00021854347296314732, + "loss": 2.1623, + "step": 14256 + }, + { + "epoch": 1.6633998366584994, + "grad_norm": 1.3000714778900146, + "learning_rate": 0.00021852998128753256, + "loss": 2.1548, + "step": 14257 + }, + { + "epoch": 1.6635165091587911, + "grad_norm": 1.1578260660171509, + "learning_rate": 0.00021851648891702517, + "loss": 2.1128, + "step": 14258 + }, + { + "epoch": 1.6636331816590828, + "grad_norm": 1.3078001737594604, + "learning_rate": 0.00021850299585176493, + "loss": 2.0799, + "step": 14259 + }, + { + "epoch": 1.6637498541593745, + "grad_norm": 1.2011566162109375, + "learning_rate": 0.0002184895020918918, + "loss": 2.0095, + "step": 14260 + }, + { + "epoch": 1.6638665266596662, + "grad_norm": 1.2393139600753784, + "learning_rate": 0.0002184760076375456, + "loss": 2.0949, + "step": 14261 + }, + { + "epoch": 1.6639831991599578, + "grad_norm": 1.1308228969573975, + "learning_rate": 0.00021846251248886626, + "loss": 2.094, + "step": 14262 + }, + { + "epoch": 1.6640998716602495, + "grad_norm": 1.5063908100128174, + "learning_rate": 0.00021844901664599367, + "loss": 2.1262, + "step": 14263 + }, + { + "epoch": 1.6642165441605412, + "grad_norm": 1.2130911350250244, + "learning_rate": 0.00021843552010906774, + "loss": 2.3096, + "step": 14264 + }, + { + "epoch": 1.664333216660833, + "grad_norm": 0.9570263028144836, + "learning_rate": 0.00021842202287822835, + "loss": 1.9658, + "step": 14265 + }, + { + "epoch": 1.6644498891611246, + "grad_norm": 1.3356987237930298, + "learning_rate": 0.0002184085249536155, + "loss": 2.093, + "step": 14266 + }, + { + "epoch": 1.6645665616614163, + "grad_norm": 1.1155778169631958, + "learning_rate": 0.0002183950263353691, + "loss": 1.9867, + "step": 14267 + }, + { + "epoch": 1.664683234161708, + "grad_norm": 1.1564791202545166, + "learning_rate": 0.00021838152702362913, + "loss": 2.0453, + "step": 14268 + }, + { + "epoch": 1.6647999066619996, + "grad_norm": 1.1319464445114136, + "learning_rate": 0.00021836802701853537, + "loss": 2.1872, + "step": 14269 + }, + { + "epoch": 1.6649165791622913, + "grad_norm": 0.9922872185707092, + "learning_rate": 0.00021835452632022794, + "loss": 2.0484, + "step": 14270 + }, + { + "epoch": 1.665033251662583, + "grad_norm": 1.375396728515625, + "learning_rate": 0.00021834102492884674, + "loss": 2.1089, + "step": 14271 + }, + { + "epoch": 1.6651499241628747, + "grad_norm": 1.0719685554504395, + "learning_rate": 0.00021832752284453173, + "loss": 2.0792, + "step": 14272 + }, + { + "epoch": 1.6652665966631663, + "grad_norm": 1.0892293453216553, + "learning_rate": 0.00021831402006742293, + "loss": 2.0888, + "step": 14273 + }, + { + "epoch": 1.665383269163458, + "grad_norm": 1.370322346687317, + "learning_rate": 0.00021830051659766026, + "loss": 2.1795, + "step": 14274 + }, + { + "epoch": 1.6654999416637497, + "grad_norm": 1.1241347789764404, + "learning_rate": 0.00021828701243538376, + "loss": 1.9705, + "step": 14275 + }, + { + "epoch": 1.6656166141640414, + "grad_norm": 1.2321401834487915, + "learning_rate": 0.0002182735075807334, + "loss": 2.1089, + "step": 14276 + }, + { + "epoch": 1.665733286664333, + "grad_norm": 1.0913397073745728, + "learning_rate": 0.00021826000203384918, + "loss": 1.9574, + "step": 14277 + }, + { + "epoch": 1.6658499591646247, + "grad_norm": 1.115099549293518, + "learning_rate": 0.00021824649579487112, + "loss": 1.9489, + "step": 14278 + }, + { + "epoch": 1.6659666316649164, + "grad_norm": 1.3896182775497437, + "learning_rate": 0.0002182329888639392, + "loss": 2.0775, + "step": 14279 + }, + { + "epoch": 1.6660833041652081, + "grad_norm": 1.1724773645401, + "learning_rate": 0.00021821948124119356, + "loss": 2.1795, + "step": 14280 + }, + { + "epoch": 1.6661999766654998, + "grad_norm": 1.4277594089508057, + "learning_rate": 0.0002182059729267741, + "loss": 2.1835, + "step": 14281 + }, + { + "epoch": 1.6663166491657915, + "grad_norm": 1.00634765625, + "learning_rate": 0.0002181924639208209, + "loss": 1.9252, + "step": 14282 + }, + { + "epoch": 1.6664333216660832, + "grad_norm": 1.2855557203292847, + "learning_rate": 0.00021817895422347405, + "loss": 2.1477, + "step": 14283 + }, + { + "epoch": 1.6665499941663748, + "grad_norm": 1.2375564575195312, + "learning_rate": 0.00021816544383487357, + "loss": 2.1549, + "step": 14284 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1379072666168213, + "learning_rate": 0.00021815193275515952, + "loss": 2.0425, + "step": 14285 + }, + { + "epoch": 1.6667833391669582, + "grad_norm": 0.9841390252113342, + "learning_rate": 0.00021813842098447196, + "loss": 2.0923, + "step": 14286 + }, + { + "epoch": 1.6669000116672499, + "grad_norm": 1.103806734085083, + "learning_rate": 0.000218124908522951, + "loss": 2.053, + "step": 14287 + }, + { + "epoch": 1.6670166841675416, + "grad_norm": 1.0617949962615967, + "learning_rate": 0.0002181113953707367, + "loss": 2.0817, + "step": 14288 + }, + { + "epoch": 1.6671333566678332, + "grad_norm": 1.2284647226333618, + "learning_rate": 0.00021809788152796913, + "loss": 2.0481, + "step": 14289 + }, + { + "epoch": 1.667250029168125, + "grad_norm": 1.0459988117218018, + "learning_rate": 0.0002180843669947884, + "loss": 2.0905, + "step": 14290 + }, + { + "epoch": 1.6673667016684166, + "grad_norm": 1.1565165519714355, + "learning_rate": 0.00021807085177133457, + "loss": 2.173, + "step": 14291 + }, + { + "epoch": 1.6674833741687083, + "grad_norm": 1.2081642150878906, + "learning_rate": 0.00021805733585774785, + "loss": 2.1149, + "step": 14292 + }, + { + "epoch": 1.667600046669, + "grad_norm": 1.0283571481704712, + "learning_rate": 0.00021804381925416834, + "loss": 1.9622, + "step": 14293 + }, + { + "epoch": 1.6677167191692917, + "grad_norm": 1.3009707927703857, + "learning_rate": 0.0002180303019607361, + "loss": 2.0947, + "step": 14294 + }, + { + "epoch": 1.6678333916695833, + "grad_norm": 1.1011227369308472, + "learning_rate": 0.0002180167839775913, + "loss": 2.011, + "step": 14295 + }, + { + "epoch": 1.667950064169875, + "grad_norm": 1.2720413208007812, + "learning_rate": 0.00021800326530487408, + "loss": 2.2261, + "step": 14296 + }, + { + "epoch": 1.6680667366701667, + "grad_norm": 1.2066301107406616, + "learning_rate": 0.00021798974594272453, + "loss": 1.9954, + "step": 14297 + }, + { + "epoch": 1.6681834091704584, + "grad_norm": 1.1218215227127075, + "learning_rate": 0.00021797622589128293, + "loss": 2.0599, + "step": 14298 + }, + { + "epoch": 1.66830008167075, + "grad_norm": 1.241207242012024, + "learning_rate": 0.00021796270515068934, + "loss": 2.128, + "step": 14299 + }, + { + "epoch": 1.6684167541710417, + "grad_norm": 1.1721463203430176, + "learning_rate": 0.00021794918372108397, + "loss": 2.0434, + "step": 14300 + }, + { + "epoch": 1.6685334266713334, + "grad_norm": 1.1378133296966553, + "learning_rate": 0.00021793566160260695, + "loss": 2.0916, + "step": 14301 + }, + { + "epoch": 1.668650099171625, + "grad_norm": 1.1212087869644165, + "learning_rate": 0.0002179221387953985, + "loss": 2.0895, + "step": 14302 + }, + { + "epoch": 1.6687667716719168, + "grad_norm": 1.5838068723678589, + "learning_rate": 0.0002179086152995988, + "loss": 2.231, + "step": 14303 + }, + { + "epoch": 1.6688834441722085, + "grad_norm": 1.1110032796859741, + "learning_rate": 0.00021789509111534803, + "loss": 2.0477, + "step": 14304 + }, + { + "epoch": 1.6690001166725001, + "grad_norm": 1.2366867065429688, + "learning_rate": 0.00021788156624278642, + "loss": 2.2425, + "step": 14305 + }, + { + "epoch": 1.6691167891727918, + "grad_norm": 1.1265699863433838, + "learning_rate": 0.00021786804068205416, + "loss": 1.9773, + "step": 14306 + }, + { + "epoch": 1.6692334616730835, + "grad_norm": 1.005571722984314, + "learning_rate": 0.0002178545144332915, + "loss": 1.9722, + "step": 14307 + }, + { + "epoch": 1.6693501341733752, + "grad_norm": 1.3388110399246216, + "learning_rate": 0.00021784098749663862, + "loss": 2.1519, + "step": 14308 + }, + { + "epoch": 1.6694668066736669, + "grad_norm": 1.2650591135025024, + "learning_rate": 0.00021782745987223583, + "loss": 2.1005, + "step": 14309 + }, + { + "epoch": 1.6695834791739586, + "grad_norm": 1.068779468536377, + "learning_rate": 0.0002178139315602233, + "loss": 1.9817, + "step": 14310 + }, + { + "epoch": 1.6697001516742502, + "grad_norm": 0.9688642024993896, + "learning_rate": 0.0002178004025607413, + "loss": 1.8906, + "step": 14311 + }, + { + "epoch": 1.669816824174542, + "grad_norm": 1.3202770948410034, + "learning_rate": 0.00021778687287393004, + "loss": 2.1912, + "step": 14312 + }, + { + "epoch": 1.6699334966748336, + "grad_norm": 1.1519418954849243, + "learning_rate": 0.00021777334249992986, + "loss": 1.945, + "step": 14313 + }, + { + "epoch": 1.6700501691751253, + "grad_norm": 1.399878740310669, + "learning_rate": 0.00021775981143888092, + "loss": 2.0868, + "step": 14314 + }, + { + "epoch": 1.670166841675417, + "grad_norm": 1.1500543355941772, + "learning_rate": 0.00021774627969092363, + "loss": 1.9727, + "step": 14315 + }, + { + "epoch": 1.6702835141757086, + "grad_norm": 1.2402936220169067, + "learning_rate": 0.00021773274725619818, + "loss": 2.0595, + "step": 14316 + }, + { + "epoch": 1.6704001866760003, + "grad_norm": 1.261497974395752, + "learning_rate": 0.0002177192141348449, + "loss": 2.0808, + "step": 14317 + }, + { + "epoch": 1.670516859176292, + "grad_norm": 1.1696707010269165, + "learning_rate": 0.000217705680327004, + "loss": 2.2441, + "step": 14318 + }, + { + "epoch": 1.6706335316765837, + "grad_norm": 1.1130752563476562, + "learning_rate": 0.00021769214583281592, + "loss": 2.1178, + "step": 14319 + }, + { + "epoch": 1.6707502041768754, + "grad_norm": 0.9624467492103577, + "learning_rate": 0.00021767861065242087, + "loss": 2.0181, + "step": 14320 + }, + { + "epoch": 1.670866876677167, + "grad_norm": 1.0654088258743286, + "learning_rate": 0.00021766507478595922, + "loss": 1.9434, + "step": 14321 + }, + { + "epoch": 1.6709835491774587, + "grad_norm": 1.1581429243087769, + "learning_rate": 0.00021765153823357129, + "loss": 2.0774, + "step": 14322 + }, + { + "epoch": 1.6711002216777504, + "grad_norm": 1.2001392841339111, + "learning_rate": 0.00021763800099539735, + "loss": 2.0797, + "step": 14323 + }, + { + "epoch": 1.671216894178042, + "grad_norm": 1.000004529953003, + "learning_rate": 0.00021762446307157783, + "loss": 1.9525, + "step": 14324 + }, + { + "epoch": 1.6713335666783338, + "grad_norm": 1.1420567035675049, + "learning_rate": 0.00021761092446225303, + "loss": 2.1348, + "step": 14325 + }, + { + "epoch": 1.6714502391786255, + "grad_norm": 1.1612592935562134, + "learning_rate": 0.00021759738516756333, + "loss": 2.1321, + "step": 14326 + }, + { + "epoch": 1.6715669116789171, + "grad_norm": 1.1898318529129028, + "learning_rate": 0.000217583845187649, + "loss": 2.1281, + "step": 14327 + }, + { + "epoch": 1.6716835841792088, + "grad_norm": 1.0885623693466187, + "learning_rate": 0.0002175703045226505, + "loss": 2.1781, + "step": 14328 + }, + { + "epoch": 1.6718002566795005, + "grad_norm": 1.1801059246063232, + "learning_rate": 0.00021755676317270823, + "loss": 2.1712, + "step": 14329 + }, + { + "epoch": 1.6719169291797922, + "grad_norm": 1.2581292390823364, + "learning_rate": 0.00021754322113796246, + "loss": 2.1716, + "step": 14330 + }, + { + "epoch": 1.6720336016800839, + "grad_norm": 1.0950322151184082, + "learning_rate": 0.00021752967841855367, + "loss": 2.2003, + "step": 14331 + }, + { + "epoch": 1.6721502741803755, + "grad_norm": 1.248071551322937, + "learning_rate": 0.00021751613501462219, + "loss": 2.2691, + "step": 14332 + }, + { + "epoch": 1.6722669466806672, + "grad_norm": 1.171906590461731, + "learning_rate": 0.00021750259092630846, + "loss": 2.0572, + "step": 14333 + }, + { + "epoch": 1.672383619180959, + "grad_norm": 1.2042959928512573, + "learning_rate": 0.00021748904615375296, + "loss": 1.9669, + "step": 14334 + }, + { + "epoch": 1.6725002916812506, + "grad_norm": 1.2616114616394043, + "learning_rate": 0.00021747550069709597, + "loss": 2.1149, + "step": 14335 + }, + { + "epoch": 1.6726169641815423, + "grad_norm": 1.1034901142120361, + "learning_rate": 0.000217461954556478, + "loss": 2.1221, + "step": 14336 + }, + { + "epoch": 1.672733636681834, + "grad_norm": 1.0739516019821167, + "learning_rate": 0.0002174484077320395, + "loss": 1.8837, + "step": 14337 + }, + { + "epoch": 1.6728503091821256, + "grad_norm": 1.153535008430481, + "learning_rate": 0.00021743486022392084, + "loss": 1.9837, + "step": 14338 + }, + { + "epoch": 1.6729669816824173, + "grad_norm": 1.15143620967865, + "learning_rate": 0.00021742131203226247, + "loss": 1.9138, + "step": 14339 + }, + { + "epoch": 1.673083654182709, + "grad_norm": 1.142630696296692, + "learning_rate": 0.00021740776315720486, + "loss": 2.1659, + "step": 14340 + }, + { + "epoch": 1.6732003266830007, + "grad_norm": 1.2407691478729248, + "learning_rate": 0.00021739421359888854, + "loss": 2.1646, + "step": 14341 + }, + { + "epoch": 1.6733169991832924, + "grad_norm": 1.141194224357605, + "learning_rate": 0.00021738066335745386, + "loss": 1.9997, + "step": 14342 + }, + { + "epoch": 1.673433671683584, + "grad_norm": 1.0253973007202148, + "learning_rate": 0.0002173671124330414, + "loss": 1.987, + "step": 14343 + }, + { + "epoch": 1.6735503441838757, + "grad_norm": 1.234035611152649, + "learning_rate": 0.0002173535608257915, + "loss": 2.0395, + "step": 14344 + }, + { + "epoch": 1.6736670166841674, + "grad_norm": 1.0001686811447144, + "learning_rate": 0.00021734000853584483, + "loss": 1.9761, + "step": 14345 + }, + { + "epoch": 1.673783689184459, + "grad_norm": 1.5090965032577515, + "learning_rate": 0.0002173264555633418, + "loss": 2.1457, + "step": 14346 + }, + { + "epoch": 1.6739003616847508, + "grad_norm": 1.1343085765838623, + "learning_rate": 0.00021731290190842283, + "loss": 2.1652, + "step": 14347 + }, + { + "epoch": 1.6740170341850424, + "grad_norm": 1.3599210977554321, + "learning_rate": 0.00021729934757122856, + "loss": 2.1346, + "step": 14348 + }, + { + "epoch": 1.6741337066853341, + "grad_norm": 1.2267720699310303, + "learning_rate": 0.0002172857925518994, + "loss": 2.0155, + "step": 14349 + }, + { + "epoch": 1.6742503791856258, + "grad_norm": 1.158136248588562, + "learning_rate": 0.00021727223685057595, + "loss": 2.1403, + "step": 14350 + }, + { + "epoch": 1.6743670516859175, + "grad_norm": 1.1652480363845825, + "learning_rate": 0.00021725868046739875, + "loss": 2.0731, + "step": 14351 + }, + { + "epoch": 1.6744837241862092, + "grad_norm": 1.1629527807235718, + "learning_rate": 0.00021724512340250823, + "loss": 1.9777, + "step": 14352 + }, + { + "epoch": 1.6746003966865008, + "grad_norm": 1.2474544048309326, + "learning_rate": 0.00021723156565604508, + "loss": 2.107, + "step": 14353 + }, + { + "epoch": 1.6747170691867925, + "grad_norm": 1.2016844749450684, + "learning_rate": 0.00021721800722814973, + "loss": 1.9842, + "step": 14354 + }, + { + "epoch": 1.6748337416870842, + "grad_norm": 1.0637941360473633, + "learning_rate": 0.00021720444811896287, + "loss": 2.0646, + "step": 14355 + }, + { + "epoch": 1.674950414187376, + "grad_norm": 1.0640615224838257, + "learning_rate": 0.00021719088832862488, + "loss": 1.883, + "step": 14356 + }, + { + "epoch": 1.6750670866876676, + "grad_norm": 0.993187665939331, + "learning_rate": 0.0002171773278572765, + "loss": 1.992, + "step": 14357 + }, + { + "epoch": 1.6751837591879593, + "grad_norm": 0.9526501893997192, + "learning_rate": 0.00021716376670505817, + "loss": 1.8946, + "step": 14358 + }, + { + "epoch": 1.675300431688251, + "grad_norm": 1.2065606117248535, + "learning_rate": 0.00021715020487211062, + "loss": 2.0112, + "step": 14359 + }, + { + "epoch": 1.6754171041885426, + "grad_norm": 1.0286403894424438, + "learning_rate": 0.0002171366423585744, + "loss": 2.05, + "step": 14360 + }, + { + "epoch": 1.6755337766888343, + "grad_norm": 1.201654076576233, + "learning_rate": 0.00021712307916459006, + "loss": 2.0507, + "step": 14361 + }, + { + "epoch": 1.675650449189126, + "grad_norm": 1.139143466949463, + "learning_rate": 0.00021710951529029822, + "loss": 1.8567, + "step": 14362 + }, + { + "epoch": 1.6757671216894177, + "grad_norm": 1.26543390750885, + "learning_rate": 0.00021709595073583955, + "loss": 2.1276, + "step": 14363 + }, + { + "epoch": 1.6758837941897093, + "grad_norm": 1.1089727878570557, + "learning_rate": 0.0002170823855013546, + "loss": 2.1296, + "step": 14364 + }, + { + "epoch": 1.676000466690001, + "grad_norm": 1.0354119539260864, + "learning_rate": 0.00021706881958698403, + "loss": 2.1081, + "step": 14365 + }, + { + "epoch": 1.6761171391902927, + "grad_norm": 1.133561134338379, + "learning_rate": 0.00021705525299286848, + "loss": 2.1247, + "step": 14366 + }, + { + "epoch": 1.6762338116905844, + "grad_norm": 1.427865743637085, + "learning_rate": 0.00021704168571914863, + "loss": 2.2715, + "step": 14367 + }, + { + "epoch": 1.676350484190876, + "grad_norm": 1.03889000415802, + "learning_rate": 0.00021702811776596505, + "loss": 1.8016, + "step": 14368 + }, + { + "epoch": 1.6764671566911677, + "grad_norm": 1.2031147480010986, + "learning_rate": 0.0002170145491334585, + "loss": 2.1771, + "step": 14369 + }, + { + "epoch": 1.6765838291914594, + "grad_norm": 1.3101922273635864, + "learning_rate": 0.00021700097982176953, + "loss": 2.3496, + "step": 14370 + }, + { + "epoch": 1.676700501691751, + "grad_norm": 1.1899863481521606, + "learning_rate": 0.00021698740983103887, + "loss": 2.0506, + "step": 14371 + }, + { + "epoch": 1.6768171741920428, + "grad_norm": 1.2471132278442383, + "learning_rate": 0.00021697383916140722, + "loss": 2.0956, + "step": 14372 + }, + { + "epoch": 1.6769338466923345, + "grad_norm": 1.0025919675827026, + "learning_rate": 0.00021696026781301524, + "loss": 2.1498, + "step": 14373 + }, + { + "epoch": 1.6770505191926262, + "grad_norm": 1.2102296352386475, + "learning_rate": 0.00021694669578600358, + "loss": 2.2391, + "step": 14374 + }, + { + "epoch": 1.6771671916929178, + "grad_norm": 1.04759681224823, + "learning_rate": 0.00021693312308051303, + "loss": 2.0319, + "step": 14375 + }, + { + "epoch": 1.6772838641932095, + "grad_norm": 1.1268773078918457, + "learning_rate": 0.00021691954969668426, + "loss": 1.9682, + "step": 14376 + }, + { + "epoch": 1.6774005366935012, + "grad_norm": 1.095768928527832, + "learning_rate": 0.00021690597563465798, + "loss": 1.9663, + "step": 14377 + }, + { + "epoch": 1.6775172091937929, + "grad_norm": 1.050695538520813, + "learning_rate": 0.00021689240089457486, + "loss": 2.2426, + "step": 14378 + }, + { + "epoch": 1.6776338816940846, + "grad_norm": 1.1156636476516724, + "learning_rate": 0.00021687882547657573, + "loss": 1.8407, + "step": 14379 + }, + { + "epoch": 1.6777505541943762, + "grad_norm": 1.1251606941223145, + "learning_rate": 0.00021686524938080128, + "loss": 2.1256, + "step": 14380 + }, + { + "epoch": 1.677867226694668, + "grad_norm": 1.2078659534454346, + "learning_rate": 0.00021685167260739217, + "loss": 2.2616, + "step": 14381 + }, + { + "epoch": 1.6779838991949596, + "grad_norm": 1.2517414093017578, + "learning_rate": 0.00021683809515648928, + "loss": 2.0268, + "step": 14382 + }, + { + "epoch": 1.6781005716952513, + "grad_norm": 1.0095950365066528, + "learning_rate": 0.00021682451702823328, + "loss": 1.9892, + "step": 14383 + }, + { + "epoch": 1.678217244195543, + "grad_norm": 1.2536091804504395, + "learning_rate": 0.0002168109382227649, + "loss": 2.2343, + "step": 14384 + }, + { + "epoch": 1.6783339166958346, + "grad_norm": 1.2151435613632202, + "learning_rate": 0.00021679735874022502, + "loss": 2.1606, + "step": 14385 + }, + { + "epoch": 1.6784505891961263, + "grad_norm": 1.0620771646499634, + "learning_rate": 0.00021678377858075438, + "loss": 1.9124, + "step": 14386 + }, + { + "epoch": 1.678567261696418, + "grad_norm": 1.1327366828918457, + "learning_rate": 0.00021677019774449374, + "loss": 2.0551, + "step": 14387 + }, + { + "epoch": 1.6786839341967097, + "grad_norm": 1.037712574005127, + "learning_rate": 0.00021675661623158393, + "loss": 2.1211, + "step": 14388 + }, + { + "epoch": 1.6788006066970014, + "grad_norm": 1.2668607234954834, + "learning_rate": 0.0002167430340421657, + "loss": 2.2181, + "step": 14389 + }, + { + "epoch": 1.678917279197293, + "grad_norm": 1.2217844724655151, + "learning_rate": 0.00021672945117637987, + "loss": 2.126, + "step": 14390 + }, + { + "epoch": 1.6790339516975847, + "grad_norm": 1.1874306201934814, + "learning_rate": 0.00021671586763436724, + "loss": 2.0495, + "step": 14391 + }, + { + "epoch": 1.6791506241978764, + "grad_norm": 1.1926518678665161, + "learning_rate": 0.00021670228341626866, + "loss": 1.9959, + "step": 14392 + }, + { + "epoch": 1.679267296698168, + "grad_norm": 1.0555031299591064, + "learning_rate": 0.0002166886985222249, + "loss": 1.9867, + "step": 14393 + }, + { + "epoch": 1.6793839691984598, + "grad_norm": 1.2459062337875366, + "learning_rate": 0.0002166751129523769, + "loss": 2.0908, + "step": 14394 + }, + { + "epoch": 1.6795006416987515, + "grad_norm": 1.0638989210128784, + "learning_rate": 0.00021666152670686542, + "loss": 2.1166, + "step": 14395 + }, + { + "epoch": 1.6796173141990431, + "grad_norm": 1.1346399784088135, + "learning_rate": 0.00021664793978583134, + "loss": 2.2609, + "step": 14396 + }, + { + "epoch": 1.6797339866993348, + "grad_norm": 1.144239068031311, + "learning_rate": 0.00021663435218941543, + "loss": 2.119, + "step": 14397 + }, + { + "epoch": 1.6798506591996265, + "grad_norm": 1.0285148620605469, + "learning_rate": 0.00021662076391775866, + "loss": 1.8659, + "step": 14398 + }, + { + "epoch": 1.6799673316999182, + "grad_norm": 1.2373931407928467, + "learning_rate": 0.00021660717497100183, + "loss": 2.2224, + "step": 14399 + }, + { + "epoch": 1.6800840042002099, + "grad_norm": 1.2052488327026367, + "learning_rate": 0.0002165935853492859, + "loss": 2.1423, + "step": 14400 + }, + { + "epoch": 1.6802006767005015, + "grad_norm": 1.0535980463027954, + "learning_rate": 0.00021657999505275168, + "loss": 1.8169, + "step": 14401 + }, + { + "epoch": 1.6803173492007932, + "grad_norm": 1.1284326314926147, + "learning_rate": 0.00021656640408154005, + "loss": 2.0865, + "step": 14402 + }, + { + "epoch": 1.680434021701085, + "grad_norm": 1.1827974319458008, + "learning_rate": 0.0002165528124357919, + "loss": 1.9718, + "step": 14403 + }, + { + "epoch": 1.6805506942013766, + "grad_norm": 1.0880911350250244, + "learning_rate": 0.00021653922011564817, + "loss": 2.0766, + "step": 14404 + }, + { + "epoch": 1.6806673667016683, + "grad_norm": 1.1615554094314575, + "learning_rate": 0.00021652562712124978, + "loss": 1.9349, + "step": 14405 + }, + { + "epoch": 1.68078403920196, + "grad_norm": 1.1723589897155762, + "learning_rate": 0.00021651203345273768, + "loss": 1.8446, + "step": 14406 + }, + { + "epoch": 1.6809007117022516, + "grad_norm": 1.4109653234481812, + "learning_rate": 0.00021649843911025266, + "loss": 2.2301, + "step": 14407 + }, + { + "epoch": 1.6810173842025433, + "grad_norm": 1.1075725555419922, + "learning_rate": 0.00021648484409393575, + "loss": 1.8765, + "step": 14408 + }, + { + "epoch": 1.681134056702835, + "grad_norm": 1.2877379655838013, + "learning_rate": 0.00021647124840392791, + "loss": 2.2245, + "step": 14409 + }, + { + "epoch": 1.6812507292031267, + "grad_norm": 1.0092891454696655, + "learning_rate": 0.00021645765204037, + "loss": 1.905, + "step": 14410 + }, + { + "epoch": 1.6813674017034184, + "grad_norm": 1.2690105438232422, + "learning_rate": 0.00021644405500340303, + "loss": 2.189, + "step": 14411 + }, + { + "epoch": 1.68148407420371, + "grad_norm": 1.2377276420593262, + "learning_rate": 0.000216430457293168, + "loss": 2.0597, + "step": 14412 + }, + { + "epoch": 1.6816007467040017, + "grad_norm": 1.177775263786316, + "learning_rate": 0.00021641685890980576, + "loss": 2.0917, + "step": 14413 + }, + { + "epoch": 1.6817174192042934, + "grad_norm": 1.3253653049468994, + "learning_rate": 0.0002164032598534574, + "loss": 2.1219, + "step": 14414 + }, + { + "epoch": 1.681834091704585, + "grad_norm": 1.1795053482055664, + "learning_rate": 0.00021638966012426384, + "loss": 1.9369, + "step": 14415 + }, + { + "epoch": 1.6819507642048768, + "grad_norm": 1.2319380044937134, + "learning_rate": 0.00021637605972236605, + "loss": 2.1842, + "step": 14416 + }, + { + "epoch": 1.6820674367051685, + "grad_norm": 1.1579030752182007, + "learning_rate": 0.00021636245864790503, + "loss": 1.9308, + "step": 14417 + }, + { + "epoch": 1.6821841092054601, + "grad_norm": 1.1564990282058716, + "learning_rate": 0.00021634885690102182, + "loss": 2.0401, + "step": 14418 + }, + { + "epoch": 1.6823007817057518, + "grad_norm": 1.1801668405532837, + "learning_rate": 0.00021633525448185742, + "loss": 2.1654, + "step": 14419 + }, + { + "epoch": 1.6824174542060435, + "grad_norm": 1.1060786247253418, + "learning_rate": 0.0002163216513905528, + "loss": 2.0672, + "step": 14420 + }, + { + "epoch": 1.6825341267063352, + "grad_norm": 0.9950454235076904, + "learning_rate": 0.00021630804762724906, + "loss": 2.1358, + "step": 14421 + }, + { + "epoch": 1.6826507992066269, + "grad_norm": 1.2614299058914185, + "learning_rate": 0.00021629444319208713, + "loss": 1.9179, + "step": 14422 + }, + { + "epoch": 1.6827674717069185, + "grad_norm": 1.0792393684387207, + "learning_rate": 0.00021628083808520815, + "loss": 2.0887, + "step": 14423 + }, + { + "epoch": 1.6828841442072102, + "grad_norm": 1.0373961925506592, + "learning_rate": 0.00021626723230675307, + "loss": 2.1008, + "step": 14424 + }, + { + "epoch": 1.683000816707502, + "grad_norm": 1.5634135007858276, + "learning_rate": 0.00021625362585686296, + "loss": 2.1932, + "step": 14425 + }, + { + "epoch": 1.6831174892077936, + "grad_norm": 1.3208330869674683, + "learning_rate": 0.000216240018735679, + "loss": 2.0211, + "step": 14426 + }, + { + "epoch": 1.6832341617080853, + "grad_norm": 1.0672216415405273, + "learning_rate": 0.00021622641094334204, + "loss": 1.9615, + "step": 14427 + }, + { + "epoch": 1.683350834208377, + "grad_norm": 1.0854121446609497, + "learning_rate": 0.0002162128024799933, + "loss": 2.1368, + "step": 14428 + }, + { + "epoch": 1.6834675067086686, + "grad_norm": 1.0880377292633057, + "learning_rate": 0.00021619919334577383, + "loss": 2.156, + "step": 14429 + }, + { + "epoch": 1.6835841792089603, + "grad_norm": 1.0193320512771606, + "learning_rate": 0.00021618558354082472, + "loss": 2.1327, + "step": 14430 + }, + { + "epoch": 1.683700851709252, + "grad_norm": 1.0929615497589111, + "learning_rate": 0.000216171973065287, + "loss": 2.057, + "step": 14431 + }, + { + "epoch": 1.6838175242095437, + "grad_norm": 1.1103817224502563, + "learning_rate": 0.00021615836191930186, + "loss": 2.0806, + "step": 14432 + }, + { + "epoch": 1.6839341967098354, + "grad_norm": 1.2315559387207031, + "learning_rate": 0.00021614475010301031, + "loss": 2.1159, + "step": 14433 + }, + { + "epoch": 1.684050869210127, + "grad_norm": 1.0538907051086426, + "learning_rate": 0.00021613113761655356, + "loss": 2.1224, + "step": 14434 + }, + { + "epoch": 1.6841675417104187, + "grad_norm": 1.1258151531219482, + "learning_rate": 0.0002161175244600726, + "loss": 2.1121, + "step": 14435 + }, + { + "epoch": 1.6842842142107104, + "grad_norm": 0.998998761177063, + "learning_rate": 0.0002161039106337087, + "loss": 2.0693, + "step": 14436 + }, + { + "epoch": 1.684400886711002, + "grad_norm": 1.021385669708252, + "learning_rate": 0.00021609029613760294, + "loss": 1.9655, + "step": 14437 + }, + { + "epoch": 1.6845175592112938, + "grad_norm": 1.1641943454742432, + "learning_rate": 0.00021607668097189638, + "loss": 2.081, + "step": 14438 + }, + { + "epoch": 1.6846342317115854, + "grad_norm": 1.2536216974258423, + "learning_rate": 0.00021606306513673027, + "loss": 2.1057, + "step": 14439 + }, + { + "epoch": 1.6847509042118771, + "grad_norm": 1.16157865524292, + "learning_rate": 0.0002160494486322458, + "loss": 2.1922, + "step": 14440 + }, + { + "epoch": 1.6848675767121688, + "grad_norm": 1.192366361618042, + "learning_rate": 0.00021603583145858398, + "loss": 1.9777, + "step": 14441 + }, + { + "epoch": 1.6849842492124605, + "grad_norm": 1.2089632749557495, + "learning_rate": 0.0002160222136158861, + "loss": 2.2577, + "step": 14442 + }, + { + "epoch": 1.6851009217127522, + "grad_norm": 1.1305609941482544, + "learning_rate": 0.00021600859510429325, + "loss": 2.1618, + "step": 14443 + }, + { + "epoch": 1.6852175942130438, + "grad_norm": 0.9894979000091553, + "learning_rate": 0.0002159949759239467, + "loss": 2.0494, + "step": 14444 + }, + { + "epoch": 1.6853342667133355, + "grad_norm": 1.0757087469100952, + "learning_rate": 0.00021598135607498755, + "loss": 2.1923, + "step": 14445 + }, + { + "epoch": 1.6854509392136272, + "grad_norm": 1.2827950716018677, + "learning_rate": 0.00021596773555755704, + "loss": 2.2578, + "step": 14446 + }, + { + "epoch": 1.685567611713919, + "grad_norm": 1.2702373266220093, + "learning_rate": 0.0002159541143717964, + "loss": 2.2303, + "step": 14447 + }, + { + "epoch": 1.6856842842142106, + "grad_norm": 1.1518222093582153, + "learning_rate": 0.0002159404925178468, + "loss": 2.0384, + "step": 14448 + }, + { + "epoch": 1.6858009567145023, + "grad_norm": 1.187975525856018, + "learning_rate": 0.00021592686999584947, + "loss": 2.0502, + "step": 14449 + }, + { + "epoch": 1.685917629214794, + "grad_norm": 1.3854485750198364, + "learning_rate": 0.0002159132468059456, + "loss": 2.2085, + "step": 14450 + }, + { + "epoch": 1.6860343017150856, + "grad_norm": 1.0583487749099731, + "learning_rate": 0.00021589962294827651, + "loss": 2.0918, + "step": 14451 + }, + { + "epoch": 1.6861509742153773, + "grad_norm": 1.1050729751586914, + "learning_rate": 0.0002158859984229833, + "loss": 2.0219, + "step": 14452 + }, + { + "epoch": 1.686267646715669, + "grad_norm": 1.1058450937271118, + "learning_rate": 0.00021587237323020738, + "loss": 2.0585, + "step": 14453 + }, + { + "epoch": 1.6863843192159607, + "grad_norm": 1.1648083925247192, + "learning_rate": 0.0002158587473700899, + "loss": 2.0579, + "step": 14454 + }, + { + "epoch": 1.6865009917162523, + "grad_norm": 1.0530431270599365, + "learning_rate": 0.00021584512084277209, + "loss": 1.9101, + "step": 14455 + }, + { + "epoch": 1.686617664216544, + "grad_norm": 1.0812652111053467, + "learning_rate": 0.00021583149364839523, + "loss": 2.1244, + "step": 14456 + }, + { + "epoch": 1.6867343367168357, + "grad_norm": 1.3023009300231934, + "learning_rate": 0.0002158178657871007, + "loss": 2.1794, + "step": 14457 + }, + { + "epoch": 1.6868510092171274, + "grad_norm": 1.196776270866394, + "learning_rate": 0.00021580423725902964, + "loss": 2.068, + "step": 14458 + }, + { + "epoch": 1.686967681717419, + "grad_norm": 1.1310017108917236, + "learning_rate": 0.00021579060806432339, + "loss": 1.9622, + "step": 14459 + }, + { + "epoch": 1.6870843542177107, + "grad_norm": 1.2236652374267578, + "learning_rate": 0.00021577697820312328, + "loss": 2.1174, + "step": 14460 + }, + { + "epoch": 1.6872010267180024, + "grad_norm": 1.0100849866867065, + "learning_rate": 0.00021576334767557052, + "loss": 1.9842, + "step": 14461 + }, + { + "epoch": 1.687317699218294, + "grad_norm": 1.2485949993133545, + "learning_rate": 0.00021574971648180644, + "loss": 2.0939, + "step": 14462 + }, + { + "epoch": 1.6874343717185858, + "grad_norm": 1.3983148336410522, + "learning_rate": 0.00021573608462197244, + "loss": 2.2855, + "step": 14463 + }, + { + "epoch": 1.6875510442188775, + "grad_norm": 1.1328480243682861, + "learning_rate": 0.00021572245209620979, + "loss": 2.0191, + "step": 14464 + }, + { + "epoch": 1.6876677167191692, + "grad_norm": 1.081538438796997, + "learning_rate": 0.00021570881890465977, + "loss": 2.0047, + "step": 14465 + }, + { + "epoch": 1.6877843892194608, + "grad_norm": 1.1855427026748657, + "learning_rate": 0.0002156951850474638, + "loss": 2.1045, + "step": 14466 + }, + { + "epoch": 1.6879010617197525, + "grad_norm": 1.0163841247558594, + "learning_rate": 0.00021568155052476316, + "loss": 2.0347, + "step": 14467 + }, + { + "epoch": 1.6880177342200442, + "grad_norm": 1.4094712734222412, + "learning_rate": 0.00021566791533669924, + "loss": 2.1394, + "step": 14468 + }, + { + "epoch": 1.6881344067203359, + "grad_norm": 1.194118857383728, + "learning_rate": 0.0002156542794834133, + "loss": 2.1372, + "step": 14469 + }, + { + "epoch": 1.6882510792206276, + "grad_norm": 1.2458857297897339, + "learning_rate": 0.00021564064296504678, + "loss": 2.0377, + "step": 14470 + }, + { + "epoch": 1.6883677517209192, + "grad_norm": 1.2105482816696167, + "learning_rate": 0.00021562700578174105, + "loss": 1.9508, + "step": 14471 + }, + { + "epoch": 1.688484424221211, + "grad_norm": 1.230635643005371, + "learning_rate": 0.00021561336793363747, + "loss": 1.8618, + "step": 14472 + }, + { + "epoch": 1.6886010967215026, + "grad_norm": 1.112723469734192, + "learning_rate": 0.0002155997294208774, + "loss": 2.1762, + "step": 14473 + }, + { + "epoch": 1.6887177692217943, + "grad_norm": 1.0034910440444946, + "learning_rate": 0.00021558609024360228, + "loss": 1.9709, + "step": 14474 + }, + { + "epoch": 1.688834441722086, + "grad_norm": 1.2160193920135498, + "learning_rate": 0.00021557245040195347, + "loss": 2.1384, + "step": 14475 + }, + { + "epoch": 1.6889511142223776, + "grad_norm": 1.0709147453308105, + "learning_rate": 0.00021555880989607238, + "loss": 2.0464, + "step": 14476 + }, + { + "epoch": 1.6890677867226693, + "grad_norm": 0.991669774055481, + "learning_rate": 0.0002155451687261004, + "loss": 2.0275, + "step": 14477 + }, + { + "epoch": 1.689184459222961, + "grad_norm": 1.2062615156173706, + "learning_rate": 0.000215531526892179, + "loss": 2.1051, + "step": 14478 + }, + { + "epoch": 1.6893011317232527, + "grad_norm": 1.0949718952178955, + "learning_rate": 0.00021551788439444958, + "loss": 2.2602, + "step": 14479 + }, + { + "epoch": 1.6894178042235444, + "grad_norm": 1.185819387435913, + "learning_rate": 0.00021550424123305354, + "loss": 2.0677, + "step": 14480 + }, + { + "epoch": 1.689534476723836, + "grad_norm": 1.3428641557693481, + "learning_rate": 0.00021549059740813236, + "loss": 1.9527, + "step": 14481 + }, + { + "epoch": 1.6896511492241277, + "grad_norm": 1.2289952039718628, + "learning_rate": 0.00021547695291982742, + "loss": 2.0468, + "step": 14482 + }, + { + "epoch": 1.6897678217244194, + "grad_norm": 1.2058416604995728, + "learning_rate": 0.00021546330776828026, + "loss": 2.1178, + "step": 14483 + }, + { + "epoch": 1.689884494224711, + "grad_norm": 1.2320419549942017, + "learning_rate": 0.00021544966195363228, + "loss": 2.1477, + "step": 14484 + }, + { + "epoch": 1.6900011667250028, + "grad_norm": 1.0908254384994507, + "learning_rate": 0.0002154360154760249, + "loss": 1.8191, + "step": 14485 + }, + { + "epoch": 1.6901178392252945, + "grad_norm": 1.1532797813415527, + "learning_rate": 0.0002154223683355997, + "loss": 2.0535, + "step": 14486 + }, + { + "epoch": 1.6902345117255861, + "grad_norm": 1.1928210258483887, + "learning_rate": 0.00021540872053249808, + "loss": 2.0887, + "step": 14487 + }, + { + "epoch": 1.6903511842258778, + "grad_norm": 1.1439244747161865, + "learning_rate": 0.00021539507206686155, + "loss": 2.0304, + "step": 14488 + }, + { + "epoch": 1.6904678567261695, + "grad_norm": 1.1673506498336792, + "learning_rate": 0.0002153814229388316, + "loss": 2.1717, + "step": 14489 + }, + { + "epoch": 1.6905845292264612, + "grad_norm": 1.0788874626159668, + "learning_rate": 0.00021536777314854974, + "loss": 2.1083, + "step": 14490 + }, + { + "epoch": 1.6907012017267529, + "grad_norm": 1.2367500066757202, + "learning_rate": 0.0002153541226961575, + "loss": 2.1542, + "step": 14491 + }, + { + "epoch": 1.6908178742270445, + "grad_norm": 1.147425889968872, + "learning_rate": 0.00021534047158179635, + "loss": 2.241, + "step": 14492 + }, + { + "epoch": 1.6909345467273362, + "grad_norm": 1.4532122611999512, + "learning_rate": 0.0002153268198056078, + "loss": 2.0435, + "step": 14493 + }, + { + "epoch": 1.691051219227628, + "grad_norm": 1.1449236869812012, + "learning_rate": 0.0002153131673677334, + "loss": 2.1073, + "step": 14494 + }, + { + "epoch": 1.6911678917279196, + "grad_norm": 1.218445897102356, + "learning_rate": 0.00021529951426831463, + "loss": 1.9756, + "step": 14495 + }, + { + "epoch": 1.6912845642282113, + "grad_norm": 1.1090461015701294, + "learning_rate": 0.00021528586050749318, + "loss": 1.9597, + "step": 14496 + }, + { + "epoch": 1.691401236728503, + "grad_norm": 1.275091290473938, + "learning_rate": 0.00021527220608541045, + "loss": 2.1791, + "step": 14497 + }, + { + "epoch": 1.6915179092287946, + "grad_norm": 1.033004879951477, + "learning_rate": 0.00021525855100220804, + "loss": 1.917, + "step": 14498 + }, + { + "epoch": 1.6916345817290863, + "grad_norm": 1.1036756038665771, + "learning_rate": 0.0002152448952580275, + "loss": 2.0293, + "step": 14499 + }, + { + "epoch": 1.691751254229378, + "grad_norm": 1.1064049005508423, + "learning_rate": 0.0002152312388530104, + "loss": 2.113, + "step": 14500 + }, + { + "epoch": 1.6918679267296697, + "grad_norm": 1.0235353708267212, + "learning_rate": 0.00021521758178729837, + "loss": 2.0214, + "step": 14501 + }, + { + "epoch": 1.6919845992299614, + "grad_norm": 1.2951011657714844, + "learning_rate": 0.00021520392406103295, + "loss": 2.2417, + "step": 14502 + }, + { + "epoch": 1.692101271730253, + "grad_norm": 1.1162896156311035, + "learning_rate": 0.00021519026567435568, + "loss": 2.0866, + "step": 14503 + }, + { + "epoch": 1.6922179442305447, + "grad_norm": 0.9777102470397949, + "learning_rate": 0.00021517660662740822, + "loss": 2.0253, + "step": 14504 + }, + { + "epoch": 1.6923346167308364, + "grad_norm": 1.0085529088974, + "learning_rate": 0.00021516294692033213, + "loss": 1.9624, + "step": 14505 + }, + { + "epoch": 1.692451289231128, + "grad_norm": 1.176064133644104, + "learning_rate": 0.00021514928655326905, + "loss": 2.1187, + "step": 14506 + }, + { + "epoch": 1.6925679617314198, + "grad_norm": 1.0586636066436768, + "learning_rate": 0.0002151356255263606, + "loss": 2.0029, + "step": 14507 + }, + { + "epoch": 1.6926846342317114, + "grad_norm": 1.1409716606140137, + "learning_rate": 0.00021512196383974836, + "loss": 1.9386, + "step": 14508 + }, + { + "epoch": 1.6928013067320031, + "grad_norm": 1.1051517724990845, + "learning_rate": 0.000215108301493574, + "loss": 2.0968, + "step": 14509 + }, + { + "epoch": 1.6929179792322948, + "grad_norm": 1.1684240102767944, + "learning_rate": 0.0002150946384879792, + "loss": 2.0744, + "step": 14510 + }, + { + "epoch": 1.6930346517325865, + "grad_norm": 0.9848664402961731, + "learning_rate": 0.0002150809748231055, + "loss": 1.972, + "step": 14511 + }, + { + "epoch": 1.6931513242328782, + "grad_norm": 1.4022314548492432, + "learning_rate": 0.00021506731049909455, + "loss": 2.059, + "step": 14512 + }, + { + "epoch": 1.6932679967331699, + "grad_norm": 1.1104227304458618, + "learning_rate": 0.00021505364551608808, + "loss": 1.9998, + "step": 14513 + }, + { + "epoch": 1.6933846692334615, + "grad_norm": 1.2751259803771973, + "learning_rate": 0.00021503997987422775, + "loss": 2.226, + "step": 14514 + }, + { + "epoch": 1.6935013417337532, + "grad_norm": 1.1396183967590332, + "learning_rate": 0.00021502631357365513, + "loss": 1.8766, + "step": 14515 + }, + { + "epoch": 1.693618014234045, + "grad_norm": 1.1216906309127808, + "learning_rate": 0.00021501264661451206, + "loss": 2.0705, + "step": 14516 + }, + { + "epoch": 1.6937346867343366, + "grad_norm": 1.1324330568313599, + "learning_rate": 0.00021499897899694015, + "loss": 1.9955, + "step": 14517 + }, + { + "epoch": 1.6938513592346283, + "grad_norm": 0.942024290561676, + "learning_rate": 0.00021498531072108106, + "loss": 2.0575, + "step": 14518 + }, + { + "epoch": 1.69396803173492, + "grad_norm": 1.1071661710739136, + "learning_rate": 0.00021497164178707644, + "loss": 2.0113, + "step": 14519 + }, + { + "epoch": 1.6940847042352116, + "grad_norm": 1.1786699295043945, + "learning_rate": 0.00021495797219506812, + "loss": 2.1696, + "step": 14520 + }, + { + "epoch": 1.6942013767355033, + "grad_norm": 1.3647135496139526, + "learning_rate": 0.00021494430194519772, + "loss": 2.068, + "step": 14521 + }, + { + "epoch": 1.694318049235795, + "grad_norm": 1.2881231307983398, + "learning_rate": 0.000214930631037607, + "loss": 2.1301, + "step": 14522 + }, + { + "epoch": 1.6944347217360867, + "grad_norm": 1.152665138244629, + "learning_rate": 0.0002149169594724377, + "loss": 2.0089, + "step": 14523 + }, + { + "epoch": 1.6945513942363783, + "grad_norm": 1.1577966213226318, + "learning_rate": 0.0002149032872498315, + "loss": 2.0355, + "step": 14524 + }, + { + "epoch": 1.69466806673667, + "grad_norm": 1.2372983694076538, + "learning_rate": 0.00021488961436993016, + "loss": 2.1877, + "step": 14525 + }, + { + "epoch": 1.6947847392369617, + "grad_norm": 1.1065995693206787, + "learning_rate": 0.00021487594083287542, + "loss": 1.9888, + "step": 14526 + }, + { + "epoch": 1.6949014117372534, + "grad_norm": 1.076187014579773, + "learning_rate": 0.00021486226663880909, + "loss": 2.2503, + "step": 14527 + }, + { + "epoch": 1.695018084237545, + "grad_norm": 1.1797302961349487, + "learning_rate": 0.00021484859178787282, + "loss": 1.9063, + "step": 14528 + }, + { + "epoch": 1.6951347567378368, + "grad_norm": 0.9735510945320129, + "learning_rate": 0.00021483491628020846, + "loss": 2.0674, + "step": 14529 + }, + { + "epoch": 1.6952514292381284, + "grad_norm": 1.2295427322387695, + "learning_rate": 0.00021482124011595777, + "loss": 2.1359, + "step": 14530 + }, + { + "epoch": 1.6953681017384201, + "grad_norm": 1.2746130228042603, + "learning_rate": 0.00021480756329526248, + "loss": 2.1889, + "step": 14531 + }, + { + "epoch": 1.6954847742387118, + "grad_norm": 1.1031837463378906, + "learning_rate": 0.00021479388581826443, + "loss": 2.082, + "step": 14532 + }, + { + "epoch": 1.6956014467390035, + "grad_norm": 1.102675437927246, + "learning_rate": 0.00021478020768510541, + "loss": 2.1107, + "step": 14533 + }, + { + "epoch": 1.6957181192392952, + "grad_norm": 1.1757395267486572, + "learning_rate": 0.00021476652889592724, + "loss": 2.2008, + "step": 14534 + }, + { + "epoch": 1.6958347917395868, + "grad_norm": 1.2351516485214233, + "learning_rate": 0.00021475284945087167, + "loss": 1.9535, + "step": 14535 + }, + { + "epoch": 1.6959514642398785, + "grad_norm": 1.1342215538024902, + "learning_rate": 0.00021473916935008049, + "loss": 2.2076, + "step": 14536 + }, + { + "epoch": 1.6960681367401702, + "grad_norm": 1.132418155670166, + "learning_rate": 0.00021472548859369561, + "loss": 2.0447, + "step": 14537 + }, + { + "epoch": 1.6961848092404619, + "grad_norm": 1.3959145545959473, + "learning_rate": 0.0002147118071818588, + "loss": 2.306, + "step": 14538 + }, + { + "epoch": 1.6963014817407536, + "grad_norm": 1.1613435745239258, + "learning_rate": 0.00021469812511471192, + "loss": 2.0344, + "step": 14539 + }, + { + "epoch": 1.6964181542410453, + "grad_norm": 1.1614779233932495, + "learning_rate": 0.00021468444239239678, + "loss": 2.0469, + "step": 14540 + }, + { + "epoch": 1.696534826741337, + "grad_norm": 1.328782081604004, + "learning_rate": 0.00021467075901505524, + "loss": 2.1925, + "step": 14541 + }, + { + "epoch": 1.6966514992416286, + "grad_norm": 1.1462464332580566, + "learning_rate": 0.00021465707498282918, + "loss": 2.175, + "step": 14542 + }, + { + "epoch": 1.6967681717419203, + "grad_norm": 1.0810939073562622, + "learning_rate": 0.00021464339029586045, + "loss": 1.9641, + "step": 14543 + }, + { + "epoch": 1.696884844242212, + "grad_norm": 1.2876720428466797, + "learning_rate": 0.0002146297049542909, + "loss": 2.2385, + "step": 14544 + }, + { + "epoch": 1.6970015167425037, + "grad_norm": 1.1491267681121826, + "learning_rate": 0.00021461601895826236, + "loss": 2.2263, + "step": 14545 + }, + { + "epoch": 1.6971181892427953, + "grad_norm": 1.0623277425765991, + "learning_rate": 0.00021460233230791682, + "loss": 1.9234, + "step": 14546 + }, + { + "epoch": 1.697234861743087, + "grad_norm": 0.9576266407966614, + "learning_rate": 0.0002145886450033961, + "loss": 1.9619, + "step": 14547 + }, + { + "epoch": 1.6973515342433787, + "grad_norm": 1.1405655145645142, + "learning_rate": 0.0002145749570448421, + "loss": 2.1374, + "step": 14548 + }, + { + "epoch": 1.6974682067436704, + "grad_norm": 1.2110495567321777, + "learning_rate": 0.00021456126843239672, + "loss": 2.046, + "step": 14549 + }, + { + "epoch": 1.697584879243962, + "grad_norm": 1.0366952419281006, + "learning_rate": 0.0002145475791662019, + "loss": 2.0624, + "step": 14550 + }, + { + "epoch": 1.6977015517442537, + "grad_norm": 0.9774922728538513, + "learning_rate": 0.0002145338892463995, + "loss": 1.978, + "step": 14551 + }, + { + "epoch": 1.6978182242445454, + "grad_norm": 1.0479259490966797, + "learning_rate": 0.00021452019867313155, + "loss": 2.1705, + "step": 14552 + }, + { + "epoch": 1.697934896744837, + "grad_norm": 1.1000735759735107, + "learning_rate": 0.00021450650744653983, + "loss": 2.1331, + "step": 14553 + }, + { + "epoch": 1.6980515692451288, + "grad_norm": 1.0112481117248535, + "learning_rate": 0.00021449281556676635, + "loss": 1.9346, + "step": 14554 + }, + { + "epoch": 1.6981682417454205, + "grad_norm": 1.179235816001892, + "learning_rate": 0.00021447912303395307, + "loss": 2.2384, + "step": 14555 + }, + { + "epoch": 1.6982849142457122, + "grad_norm": 1.1089941263198853, + "learning_rate": 0.0002144654298482419, + "loss": 1.998, + "step": 14556 + }, + { + "epoch": 1.6984015867460038, + "grad_norm": 0.9571640491485596, + "learning_rate": 0.0002144517360097748, + "loss": 1.8784, + "step": 14557 + }, + { + "epoch": 1.6985182592462955, + "grad_norm": 1.1514554023742676, + "learning_rate": 0.0002144380415186938, + "loss": 2.0163, + "step": 14558 + }, + { + "epoch": 1.6986349317465872, + "grad_norm": 1.252350091934204, + "learning_rate": 0.0002144243463751408, + "loss": 2.1601, + "step": 14559 + }, + { + "epoch": 1.6987516042468789, + "grad_norm": 1.1474437713623047, + "learning_rate": 0.00021441065057925779, + "loss": 2.1077, + "step": 14560 + }, + { + "epoch": 1.6988682767471706, + "grad_norm": 1.195837378501892, + "learning_rate": 0.00021439695413118674, + "loss": 2.1224, + "step": 14561 + }, + { + "epoch": 1.6989849492474622, + "grad_norm": 1.2535290718078613, + "learning_rate": 0.0002143832570310697, + "loss": 1.966, + "step": 14562 + }, + { + "epoch": 1.699101621747754, + "grad_norm": 1.1970467567443848, + "learning_rate": 0.0002143695592790486, + "loss": 2.1632, + "step": 14563 + }, + { + "epoch": 1.6992182942480456, + "grad_norm": 1.1644794940948486, + "learning_rate": 0.00021435586087526542, + "loss": 2.0826, + "step": 14564 + }, + { + "epoch": 1.6993349667483373, + "grad_norm": 1.1297858953475952, + "learning_rate": 0.00021434216181986224, + "loss": 2.1626, + "step": 14565 + }, + { + "epoch": 1.699451639248629, + "grad_norm": 1.1481822729110718, + "learning_rate": 0.00021432846211298106, + "loss": 2.0602, + "step": 14566 + }, + { + "epoch": 1.6995683117489206, + "grad_norm": 1.0658258199691772, + "learning_rate": 0.00021431476175476385, + "loss": 2.0875, + "step": 14567 + }, + { + "epoch": 1.6996849842492123, + "grad_norm": 1.1560084819793701, + "learning_rate": 0.00021430106074535278, + "loss": 2.0248, + "step": 14568 + }, + { + "epoch": 1.699801656749504, + "grad_norm": 1.2095263004302979, + "learning_rate": 0.00021428735908488974, + "loss": 2.0694, + "step": 14569 + }, + { + "epoch": 1.6999183292497957, + "grad_norm": 1.1100215911865234, + "learning_rate": 0.0002142736567735168, + "loss": 2.0546, + "step": 14570 + }, + { + "epoch": 1.7000350017500874, + "grad_norm": 1.0195319652557373, + "learning_rate": 0.00021425995381137608, + "loss": 1.9839, + "step": 14571 + }, + { + "epoch": 1.700151674250379, + "grad_norm": 1.0362565517425537, + "learning_rate": 0.00021424625019860957, + "loss": 1.9399, + "step": 14572 + }, + { + "epoch": 1.7002683467506707, + "grad_norm": 1.1904550790786743, + "learning_rate": 0.00021423254593535933, + "loss": 2.0335, + "step": 14573 + }, + { + "epoch": 1.7003850192509624, + "grad_norm": 1.213263750076294, + "learning_rate": 0.0002142188410217675, + "loss": 2.1734, + "step": 14574 + }, + { + "epoch": 1.700501691751254, + "grad_norm": 1.1636265516281128, + "learning_rate": 0.0002142051354579761, + "loss": 2.1207, + "step": 14575 + }, + { + "epoch": 1.7006183642515458, + "grad_norm": 1.2645595073699951, + "learning_rate": 0.00021419142924412724, + "loss": 1.9298, + "step": 14576 + }, + { + "epoch": 1.7007350367518375, + "grad_norm": 1.1140305995941162, + "learning_rate": 0.00021417772238036299, + "loss": 2.0962, + "step": 14577 + }, + { + "epoch": 1.7008517092521291, + "grad_norm": 1.2140229940414429, + "learning_rate": 0.0002141640148668255, + "loss": 2.0593, + "step": 14578 + }, + { + "epoch": 1.7009683817524208, + "grad_norm": 1.2035170793533325, + "learning_rate": 0.00021415030670365678, + "loss": 2.085, + "step": 14579 + }, + { + "epoch": 1.7010850542527125, + "grad_norm": 1.1305779218673706, + "learning_rate": 0.00021413659789099897, + "loss": 2.0991, + "step": 14580 + }, + { + "epoch": 1.7012017267530042, + "grad_norm": 1.2194405794143677, + "learning_rate": 0.00021412288842899424, + "loss": 2.0953, + "step": 14581 + }, + { + "epoch": 1.7013183992532959, + "grad_norm": 1.0909737348556519, + "learning_rate": 0.00021410917831778473, + "loss": 2.191, + "step": 14582 + }, + { + "epoch": 1.7014350717535875, + "grad_norm": 1.315737009048462, + "learning_rate": 0.00021409546755751246, + "loss": 2.0196, + "step": 14583 + }, + { + "epoch": 1.7015517442538792, + "grad_norm": 1.1861345767974854, + "learning_rate": 0.00021408175614831966, + "loss": 2.2611, + "step": 14584 + }, + { + "epoch": 1.701668416754171, + "grad_norm": 1.2746779918670654, + "learning_rate": 0.00021406804409034844, + "loss": 2.0564, + "step": 14585 + }, + { + "epoch": 1.7017850892544626, + "grad_norm": 1.139814853668213, + "learning_rate": 0.00021405433138374098, + "loss": 2.0379, + "step": 14586 + }, + { + "epoch": 1.7019017617547543, + "grad_norm": 1.2180383205413818, + "learning_rate": 0.00021404061802863942, + "loss": 2.1989, + "step": 14587 + }, + { + "epoch": 1.702018434255046, + "grad_norm": 1.0421162843704224, + "learning_rate": 0.00021402690402518593, + "loss": 1.9407, + "step": 14588 + }, + { + "epoch": 1.7021351067553376, + "grad_norm": 1.2382880449295044, + "learning_rate": 0.00021401318937352265, + "loss": 2.0415, + "step": 14589 + }, + { + "epoch": 1.7022517792556293, + "grad_norm": 1.2548409700393677, + "learning_rate": 0.00021399947407379181, + "loss": 2.0627, + "step": 14590 + }, + { + "epoch": 1.702368451755921, + "grad_norm": 1.228577971458435, + "learning_rate": 0.00021398575812613552, + "loss": 2.037, + "step": 14591 + }, + { + "epoch": 1.7024851242562127, + "grad_norm": 1.1415412425994873, + "learning_rate": 0.00021397204153069605, + "loss": 2.0091, + "step": 14592 + }, + { + "epoch": 1.7026017967565044, + "grad_norm": 1.2003589868545532, + "learning_rate": 0.00021395832428761557, + "loss": 2.0003, + "step": 14593 + }, + { + "epoch": 1.702718469256796, + "grad_norm": 1.0539768934249878, + "learning_rate": 0.0002139446063970363, + "loss": 2.1841, + "step": 14594 + }, + { + "epoch": 1.7028351417570877, + "grad_norm": 1.1665500402450562, + "learning_rate": 0.00021393088785910043, + "loss": 2.0846, + "step": 14595 + }, + { + "epoch": 1.7029518142573794, + "grad_norm": 1.1201926469802856, + "learning_rate": 0.00021391716867395018, + "loss": 2.0618, + "step": 14596 + }, + { + "epoch": 1.703068486757671, + "grad_norm": 1.0846747159957886, + "learning_rate": 0.00021390344884172774, + "loss": 2.0285, + "step": 14597 + }, + { + "epoch": 1.7031851592579628, + "grad_norm": 1.2034050226211548, + "learning_rate": 0.00021388972836257542, + "loss": 2.1449, + "step": 14598 + }, + { + "epoch": 1.7033018317582544, + "grad_norm": 0.9652242064476013, + "learning_rate": 0.00021387600723663544, + "loss": 2.0309, + "step": 14599 + }, + { + "epoch": 1.7034185042585461, + "grad_norm": 1.2734177112579346, + "learning_rate": 0.00021386228546405002, + "loss": 2.2657, + "step": 14600 + }, + { + "epoch": 1.7035351767588378, + "grad_norm": 1.063171148300171, + "learning_rate": 0.0002138485630449614, + "loss": 1.9555, + "step": 14601 + }, + { + "epoch": 1.7036518492591295, + "grad_norm": 1.074082374572754, + "learning_rate": 0.00021383483997951185, + "loss": 2.1932, + "step": 14602 + }, + { + "epoch": 1.7037685217594212, + "grad_norm": 1.1652846336364746, + "learning_rate": 0.00021382111626784369, + "loss": 1.9802, + "step": 14603 + }, + { + "epoch": 1.7038851942597129, + "grad_norm": 1.182118535041809, + "learning_rate": 0.00021380739191009907, + "loss": 2.2214, + "step": 14604 + }, + { + "epoch": 1.7040018667600045, + "grad_norm": 1.2746567726135254, + "learning_rate": 0.00021379366690642043, + "loss": 2.1406, + "step": 14605 + }, + { + "epoch": 1.7041185392602962, + "grad_norm": 1.113720178604126, + "learning_rate": 0.0002137799412569499, + "loss": 2.214, + "step": 14606 + }, + { + "epoch": 1.704235211760588, + "grad_norm": 1.0313392877578735, + "learning_rate": 0.00021376621496182986, + "loss": 1.9416, + "step": 14607 + }, + { + "epoch": 1.7043518842608796, + "grad_norm": 1.1115195751190186, + "learning_rate": 0.00021375248802120257, + "loss": 2.1042, + "step": 14608 + }, + { + "epoch": 1.7044685567611713, + "grad_norm": 1.2304623126983643, + "learning_rate": 0.0002137387604352104, + "loss": 2.0568, + "step": 14609 + }, + { + "epoch": 1.704585229261463, + "grad_norm": 1.1797139644622803, + "learning_rate": 0.0002137250322039956, + "loss": 2.0202, + "step": 14610 + }, + { + "epoch": 1.7047019017617546, + "grad_norm": 1.2369436025619507, + "learning_rate": 0.00021371130332770052, + "loss": 2.2236, + "step": 14611 + }, + { + "epoch": 1.7048185742620463, + "grad_norm": 1.1807498931884766, + "learning_rate": 0.00021369757380646748, + "loss": 2.1014, + "step": 14612 + }, + { + "epoch": 1.704935246762338, + "grad_norm": 1.178468942642212, + "learning_rate": 0.00021368384364043878, + "loss": 2.0326, + "step": 14613 + }, + { + "epoch": 1.7050519192626297, + "grad_norm": 1.2189842462539673, + "learning_rate": 0.00021367011282975683, + "loss": 1.9237, + "step": 14614 + }, + { + "epoch": 1.7051685917629213, + "grad_norm": 1.0927857160568237, + "learning_rate": 0.0002136563813745639, + "loss": 2.1004, + "step": 14615 + }, + { + "epoch": 1.705285264263213, + "grad_norm": 1.052696704864502, + "learning_rate": 0.00021364264927500233, + "loss": 1.8089, + "step": 14616 + }, + { + "epoch": 1.7054019367635047, + "grad_norm": 1.1127066612243652, + "learning_rate": 0.00021362891653121458, + "loss": 1.9572, + "step": 14617 + }, + { + "epoch": 1.7055186092637964, + "grad_norm": 1.242498517036438, + "learning_rate": 0.00021361518314334297, + "loss": 2.0932, + "step": 14618 + }, + { + "epoch": 1.705635281764088, + "grad_norm": 1.1735507249832153, + "learning_rate": 0.00021360144911152982, + "loss": 1.9968, + "step": 14619 + }, + { + "epoch": 1.7057519542643798, + "grad_norm": 1.025414228439331, + "learning_rate": 0.00021358771443591763, + "loss": 2.0603, + "step": 14620 + }, + { + "epoch": 1.7058686267646714, + "grad_norm": 1.1563547849655151, + "learning_rate": 0.00021357397911664862, + "loss": 2.0993, + "step": 14621 + }, + { + "epoch": 1.7059852992649631, + "grad_norm": 1.1240323781967163, + "learning_rate": 0.0002135602431538653, + "loss": 1.8048, + "step": 14622 + }, + { + "epoch": 1.7061019717652548, + "grad_norm": 1.0490506887435913, + "learning_rate": 0.00021354650654771, + "loss": 2.1833, + "step": 14623 + }, + { + "epoch": 1.7062186442655465, + "grad_norm": 1.0902894735336304, + "learning_rate": 0.00021353276929832524, + "loss": 1.9694, + "step": 14624 + }, + { + "epoch": 1.7063353167658382, + "grad_norm": 0.9861480593681335, + "learning_rate": 0.0002135190314058533, + "loss": 1.9998, + "step": 14625 + }, + { + "epoch": 1.7064519892661298, + "grad_norm": 1.2762104272842407, + "learning_rate": 0.00021350529287043665, + "loss": 2.239, + "step": 14626 + }, + { + "epoch": 1.7065686617664215, + "grad_norm": 1.191332459449768, + "learning_rate": 0.00021349155369221776, + "loss": 2.2085, + "step": 14627 + }, + { + "epoch": 1.7066853342667132, + "grad_norm": 1.1297048330307007, + "learning_rate": 0.00021347781387133898, + "loss": 2.1074, + "step": 14628 + }, + { + "epoch": 1.7068020067670049, + "grad_norm": 1.1358356475830078, + "learning_rate": 0.0002134640734079429, + "loss": 2.1784, + "step": 14629 + }, + { + "epoch": 1.7069186792672966, + "grad_norm": 1.311775803565979, + "learning_rate": 0.0002134503323021718, + "loss": 2.0575, + "step": 14630 + }, + { + "epoch": 1.7070353517675882, + "grad_norm": 0.9465835690498352, + "learning_rate": 0.00021343659055416816, + "loss": 1.8433, + "step": 14631 + }, + { + "epoch": 1.70715202426788, + "grad_norm": 1.0828940868377686, + "learning_rate": 0.0002134228481640745, + "loss": 2.0297, + "step": 14632 + }, + { + "epoch": 1.7072686967681716, + "grad_norm": 1.1822184324264526, + "learning_rate": 0.00021340910513203322, + "loss": 2.0814, + "step": 14633 + }, + { + "epoch": 1.7073853692684633, + "grad_norm": 1.1840341091156006, + "learning_rate": 0.00021339536145818685, + "loss": 1.9404, + "step": 14634 + }, + { + "epoch": 1.707502041768755, + "grad_norm": 1.1353970766067505, + "learning_rate": 0.00021338161714267787, + "loss": 2.1429, + "step": 14635 + }, + { + "epoch": 1.7076187142690467, + "grad_norm": 0.9848516583442688, + "learning_rate": 0.00021336787218564874, + "loss": 1.9522, + "step": 14636 + }, + { + "epoch": 1.7077353867693383, + "grad_norm": 1.076546311378479, + "learning_rate": 0.000213354126587242, + "loss": 2.0373, + "step": 14637 + }, + { + "epoch": 1.70785205926963, + "grad_norm": 1.0230063199996948, + "learning_rate": 0.00021334038034760008, + "loss": 1.9623, + "step": 14638 + }, + { + "epoch": 1.7079687317699217, + "grad_norm": 1.1580791473388672, + "learning_rate": 0.0002133266334668655, + "loss": 2.0213, + "step": 14639 + }, + { + "epoch": 1.7080854042702134, + "grad_norm": 0.9786444306373596, + "learning_rate": 0.00021331288594518078, + "loss": 2.0145, + "step": 14640 + }, + { + "epoch": 1.708202076770505, + "grad_norm": 1.2790075540542603, + "learning_rate": 0.00021329913778268844, + "loss": 2.1181, + "step": 14641 + }, + { + "epoch": 1.7083187492707967, + "grad_norm": 1.1280330419540405, + "learning_rate": 0.00021328538897953103, + "loss": 1.9759, + "step": 14642 + }, + { + "epoch": 1.7084354217710884, + "grad_norm": 1.0338428020477295, + "learning_rate": 0.00021327163953585102, + "loss": 2.0081, + "step": 14643 + }, + { + "epoch": 1.70855209427138, + "grad_norm": 0.9661173820495605, + "learning_rate": 0.00021325788945179104, + "loss": 2.0337, + "step": 14644 + }, + { + "epoch": 1.7086687667716718, + "grad_norm": 1.3226436376571655, + "learning_rate": 0.0002132441387274936, + "loss": 2.2204, + "step": 14645 + }, + { + "epoch": 1.7087854392719635, + "grad_norm": 2.126462459564209, + "learning_rate": 0.00021323038736310122, + "loss": 1.9859, + "step": 14646 + }, + { + "epoch": 1.7089021117722552, + "grad_norm": 1.2062422037124634, + "learning_rate": 0.00021321663535875645, + "loss": 2.1521, + "step": 14647 + }, + { + "epoch": 1.7090187842725468, + "grad_norm": 1.0775593519210815, + "learning_rate": 0.00021320288271460186, + "loss": 2.0048, + "step": 14648 + }, + { + "epoch": 1.7091354567728385, + "grad_norm": 1.099003791809082, + "learning_rate": 0.00021318912943078013, + "loss": 1.7604, + "step": 14649 + }, + { + "epoch": 1.7092521292731302, + "grad_norm": 1.104344367980957, + "learning_rate": 0.00021317537550743373, + "loss": 2.1609, + "step": 14650 + }, + { + "epoch": 1.7093688017734219, + "grad_norm": 1.159111499786377, + "learning_rate": 0.00021316162094470526, + "loss": 2.1301, + "step": 14651 + }, + { + "epoch": 1.7094854742737136, + "grad_norm": 1.1212087869644165, + "learning_rate": 0.0002131478657427373, + "loss": 2.0235, + "step": 14652 + }, + { + "epoch": 1.7096021467740052, + "grad_norm": 0.9106534719467163, + "learning_rate": 0.00021313410990167248, + "loss": 1.9369, + "step": 14653 + }, + { + "epoch": 1.709718819274297, + "grad_norm": 1.0667741298675537, + "learning_rate": 0.00021312035342165343, + "loss": 2.0241, + "step": 14654 + }, + { + "epoch": 1.7098354917745886, + "grad_norm": 1.2545270919799805, + "learning_rate": 0.00021310659630282273, + "loss": 2.2295, + "step": 14655 + }, + { + "epoch": 1.7099521642748803, + "grad_norm": 1.2934470176696777, + "learning_rate": 0.00021309283854532298, + "loss": 1.8776, + "step": 14656 + }, + { + "epoch": 1.710068836775172, + "grad_norm": 1.1035714149475098, + "learning_rate": 0.0002130790801492968, + "loss": 2.1069, + "step": 14657 + }, + { + "epoch": 1.7101855092754636, + "grad_norm": 1.312710165977478, + "learning_rate": 0.00021306532111488684, + "loss": 2.3248, + "step": 14658 + }, + { + "epoch": 1.7103021817757553, + "grad_norm": 1.1020623445510864, + "learning_rate": 0.00021305156144223576, + "loss": 2.1912, + "step": 14659 + }, + { + "epoch": 1.710418854276047, + "grad_norm": 1.3239620923995972, + "learning_rate": 0.00021303780113148622, + "loss": 2.2062, + "step": 14660 + }, + { + "epoch": 1.7105355267763387, + "grad_norm": 1.1835280656814575, + "learning_rate": 0.00021302404018278082, + "loss": 2.2881, + "step": 14661 + }, + { + "epoch": 1.7106521992766304, + "grad_norm": 1.152077555656433, + "learning_rate": 0.00021301027859626224, + "loss": 2.0183, + "step": 14662 + }, + { + "epoch": 1.710768871776922, + "grad_norm": 1.1760740280151367, + "learning_rate": 0.00021299651637207316, + "loss": 2.0482, + "step": 14663 + }, + { + "epoch": 1.7108855442772137, + "grad_norm": 1.0958631038665771, + "learning_rate": 0.00021298275351035623, + "loss": 2.1124, + "step": 14664 + }, + { + "epoch": 1.7110022167775054, + "grad_norm": 1.0782737731933594, + "learning_rate": 0.0002129689900112541, + "loss": 2.0422, + "step": 14665 + }, + { + "epoch": 1.711118889277797, + "grad_norm": 1.1007156372070312, + "learning_rate": 0.00021295522587490948, + "loss": 2.084, + "step": 14666 + }, + { + "epoch": 1.7112355617780888, + "grad_norm": 1.1036876440048218, + "learning_rate": 0.00021294146110146514, + "loss": 2.0518, + "step": 14667 + }, + { + "epoch": 1.7113522342783805, + "grad_norm": 1.0591427087783813, + "learning_rate": 0.00021292769569106364, + "loss": 2.0944, + "step": 14668 + }, + { + "epoch": 1.7114689067786721, + "grad_norm": 1.1039648056030273, + "learning_rate": 0.00021291392964384773, + "loss": 2.0468, + "step": 14669 + }, + { + "epoch": 1.7115855792789638, + "grad_norm": 1.2239307165145874, + "learning_rate": 0.00021290016295996022, + "loss": 2.0835, + "step": 14670 + }, + { + "epoch": 1.7117022517792555, + "grad_norm": 1.0828412771224976, + "learning_rate": 0.0002128863956395437, + "loss": 2.1237, + "step": 14671 + }, + { + "epoch": 1.7118189242795472, + "grad_norm": 1.4243149757385254, + "learning_rate": 0.00021287262768274095, + "loss": 2.1245, + "step": 14672 + }, + { + "epoch": 1.7119355967798389, + "grad_norm": 1.0317364931106567, + "learning_rate": 0.00021285885908969471, + "loss": 1.9567, + "step": 14673 + }, + { + "epoch": 1.7120522692801305, + "grad_norm": 1.0066179037094116, + "learning_rate": 0.00021284508986054769, + "loss": 1.9588, + "step": 14674 + }, + { + "epoch": 1.7121689417804222, + "grad_norm": 1.0832037925720215, + "learning_rate": 0.00021283131999544263, + "loss": 2.1006, + "step": 14675 + }, + { + "epoch": 1.712285614280714, + "grad_norm": 1.3539013862609863, + "learning_rate": 0.00021281754949452236, + "loss": 1.8934, + "step": 14676 + }, + { + "epoch": 1.7124022867810056, + "grad_norm": 1.206597924232483, + "learning_rate": 0.00021280377835792953, + "loss": 1.9886, + "step": 14677 + }, + { + "epoch": 1.7125189592812973, + "grad_norm": 1.0893393754959106, + "learning_rate": 0.00021279000658580696, + "loss": 2.0394, + "step": 14678 + }, + { + "epoch": 1.712635631781589, + "grad_norm": 1.0750610828399658, + "learning_rate": 0.0002127762341782974, + "loss": 1.8478, + "step": 14679 + }, + { + "epoch": 1.7127523042818806, + "grad_norm": 1.241211175918579, + "learning_rate": 0.00021276246113554362, + "loss": 1.9699, + "step": 14680 + }, + { + "epoch": 1.7128689767821723, + "grad_norm": 1.0620601177215576, + "learning_rate": 0.0002127486874576885, + "loss": 2.0021, + "step": 14681 + }, + { + "epoch": 1.712985649282464, + "grad_norm": 1.1633299589157104, + "learning_rate": 0.00021273491314487467, + "loss": 2.0327, + "step": 14682 + }, + { + "epoch": 1.7131023217827557, + "grad_norm": 1.1444487571716309, + "learning_rate": 0.00021272113819724507, + "loss": 2.184, + "step": 14683 + }, + { + "epoch": 1.7132189942830474, + "grad_norm": 1.1615873575210571, + "learning_rate": 0.00021270736261494238, + "loss": 2.0794, + "step": 14684 + }, + { + "epoch": 1.713335666783339, + "grad_norm": 1.1193820238113403, + "learning_rate": 0.00021269358639810942, + "loss": 2.0373, + "step": 14685 + }, + { + "epoch": 1.7134523392836307, + "grad_norm": 1.1547735929489136, + "learning_rate": 0.00021267980954688912, + "loss": 1.9258, + "step": 14686 + }, + { + "epoch": 1.7135690117839224, + "grad_norm": 1.4043360948562622, + "learning_rate": 0.0002126660320614243, + "loss": 2.2248, + "step": 14687 + }, + { + "epoch": 1.713685684284214, + "grad_norm": 1.1533527374267578, + "learning_rate": 0.00021265225394185766, + "loss": 1.9994, + "step": 14688 + }, + { + "epoch": 1.7138023567845058, + "grad_norm": 1.1386542320251465, + "learning_rate": 0.00021263847518833217, + "loss": 2.2738, + "step": 14689 + }, + { + "epoch": 1.7139190292847974, + "grad_norm": 1.0213418006896973, + "learning_rate": 0.00021262469580099052, + "loss": 2.0582, + "step": 14690 + }, + { + "epoch": 1.7140357017850891, + "grad_norm": 1.1200273036956787, + "learning_rate": 0.00021261091577997574, + "loss": 2.1351, + "step": 14691 + }, + { + "epoch": 1.7141523742853808, + "grad_norm": 1.0729891061782837, + "learning_rate": 0.00021259713512543054, + "loss": 2.0146, + "step": 14692 + }, + { + "epoch": 1.7142690467856725, + "grad_norm": 1.0065734386444092, + "learning_rate": 0.00021258335383749784, + "loss": 2.0162, + "step": 14693 + }, + { + "epoch": 1.7143857192859642, + "grad_norm": 1.1371779441833496, + "learning_rate": 0.00021256957191632053, + "loss": 2.211, + "step": 14694 + }, + { + "epoch": 1.7145023917862559, + "grad_norm": 1.2398587465286255, + "learning_rate": 0.00021255578936204146, + "loss": 1.9222, + "step": 14695 + }, + { + "epoch": 1.7146190642865475, + "grad_norm": 1.194726824760437, + "learning_rate": 0.00021254200617480348, + "loss": 1.9852, + "step": 14696 + }, + { + "epoch": 1.7147357367868392, + "grad_norm": 1.2283238172531128, + "learning_rate": 0.00021252822235474952, + "loss": 2.1868, + "step": 14697 + }, + { + "epoch": 1.714852409287131, + "grad_norm": 0.9403799772262573, + "learning_rate": 0.0002125144379020225, + "loss": 2.067, + "step": 14698 + }, + { + "epoch": 1.7149690817874226, + "grad_norm": 1.0988234281539917, + "learning_rate": 0.00021250065281676526, + "loss": 2.0006, + "step": 14699 + }, + { + "epoch": 1.7150857542877143, + "grad_norm": 1.42811119556427, + "learning_rate": 0.00021248686709912073, + "loss": 2.0693, + "step": 14700 + }, + { + "epoch": 1.715202426788006, + "grad_norm": 1.1691983938217163, + "learning_rate": 0.0002124730807492319, + "loss": 2.1988, + "step": 14701 + }, + { + "epoch": 1.7153190992882976, + "grad_norm": 1.0923048257827759, + "learning_rate": 0.00021245929376724156, + "loss": 1.862, + "step": 14702 + }, + { + "epoch": 1.7154357717885893, + "grad_norm": 1.1593241691589355, + "learning_rate": 0.00021244550615329274, + "loss": 1.9095, + "step": 14703 + }, + { + "epoch": 1.715552444288881, + "grad_norm": 1.176123023033142, + "learning_rate": 0.00021243171790752833, + "loss": 2.1743, + "step": 14704 + }, + { + "epoch": 1.7156691167891727, + "grad_norm": 1.0783567428588867, + "learning_rate": 0.00021241792903009128, + "loss": 2.0025, + "step": 14705 + }, + { + "epoch": 1.7157857892894643, + "grad_norm": 1.1550965309143066, + "learning_rate": 0.00021240413952112455, + "loss": 2.112, + "step": 14706 + }, + { + "epoch": 1.715902461789756, + "grad_norm": 1.0437469482421875, + "learning_rate": 0.00021239034938077108, + "loss": 2.0637, + "step": 14707 + }, + { + "epoch": 1.7160191342900477, + "grad_norm": 1.0240452289581299, + "learning_rate": 0.0002123765586091738, + "loss": 2.0638, + "step": 14708 + }, + { + "epoch": 1.7161358067903394, + "grad_norm": 1.2448186874389648, + "learning_rate": 0.00021236276720647574, + "loss": 2.0422, + "step": 14709 + }, + { + "epoch": 1.716252479290631, + "grad_norm": 1.1926747560501099, + "learning_rate": 0.00021234897517281983, + "loss": 2.0734, + "step": 14710 + }, + { + "epoch": 1.7163691517909228, + "grad_norm": 1.035101056098938, + "learning_rate": 0.00021233518250834908, + "loss": 2.1431, + "step": 14711 + }, + { + "epoch": 1.7164858242912144, + "grad_norm": 1.064809799194336, + "learning_rate": 0.00021232138921320644, + "loss": 2.0692, + "step": 14712 + }, + { + "epoch": 1.7166024967915061, + "grad_norm": 0.9801211357116699, + "learning_rate": 0.00021230759528753494, + "loss": 1.8769, + "step": 14713 + }, + { + "epoch": 1.7167191692917978, + "grad_norm": 1.1596003770828247, + "learning_rate": 0.0002122938007314776, + "loss": 2.002, + "step": 14714 + }, + { + "epoch": 1.7168358417920895, + "grad_norm": 1.316321611404419, + "learning_rate": 0.00021228000554517732, + "loss": 2.0172, + "step": 14715 + }, + { + "epoch": 1.7169525142923812, + "grad_norm": 1.083441972732544, + "learning_rate": 0.00021226620972877725, + "loss": 1.9523, + "step": 14716 + }, + { + "epoch": 1.7170691867926728, + "grad_norm": 1.1460691690444946, + "learning_rate": 0.00021225241328242032, + "loss": 2.0928, + "step": 14717 + }, + { + "epoch": 1.7171858592929645, + "grad_norm": 1.1766875982284546, + "learning_rate": 0.0002122386162062496, + "loss": 2.1543, + "step": 14718 + }, + { + "epoch": 1.7173025317932562, + "grad_norm": 1.0650349855422974, + "learning_rate": 0.00021222481850040806, + "loss": 1.9779, + "step": 14719 + }, + { + "epoch": 1.7174192042935479, + "grad_norm": 1.1843397617340088, + "learning_rate": 0.0002122110201650388, + "loss": 2.1275, + "step": 14720 + }, + { + "epoch": 1.7175358767938396, + "grad_norm": 1.0708343982696533, + "learning_rate": 0.00021219722120028488, + "loss": 1.9908, + "step": 14721 + }, + { + "epoch": 1.7176525492941312, + "grad_norm": 1.1601226329803467, + "learning_rate": 0.00021218342160628933, + "loss": 2.1938, + "step": 14722 + }, + { + "epoch": 1.717769221794423, + "grad_norm": 1.0658761262893677, + "learning_rate": 0.00021216962138319515, + "loss": 2.0552, + "step": 14723 + }, + { + "epoch": 1.7178858942947146, + "grad_norm": 1.1016831398010254, + "learning_rate": 0.00021215582053114553, + "loss": 1.9518, + "step": 14724 + }, + { + "epoch": 1.7180025667950063, + "grad_norm": 0.9741917848587036, + "learning_rate": 0.0002121420190502834, + "loss": 1.9465, + "step": 14725 + }, + { + "epoch": 1.718119239295298, + "grad_norm": 1.0076504945755005, + "learning_rate": 0.00021212821694075192, + "loss": 1.8771, + "step": 14726 + }, + { + "epoch": 1.7182359117955897, + "grad_norm": 1.0103896856307983, + "learning_rate": 0.00021211441420269418, + "loss": 1.928, + "step": 14727 + }, + { + "epoch": 1.7183525842958813, + "grad_norm": 1.1268638372421265, + "learning_rate": 0.00021210061083625325, + "loss": 2.3145, + "step": 14728 + }, + { + "epoch": 1.718469256796173, + "grad_norm": 1.1987812519073486, + "learning_rate": 0.00021208680684157223, + "loss": 2.1968, + "step": 14729 + }, + { + "epoch": 1.7185859292964647, + "grad_norm": 1.0154348611831665, + "learning_rate": 0.0002120730022187942, + "loss": 2.06, + "step": 14730 + }, + { + "epoch": 1.7187026017967564, + "grad_norm": 1.1590749025344849, + "learning_rate": 0.00021205919696806234, + "loss": 2.0396, + "step": 14731 + }, + { + "epoch": 1.718819274297048, + "grad_norm": 1.0664540529251099, + "learning_rate": 0.00021204539108951974, + "loss": 2.0788, + "step": 14732 + }, + { + "epoch": 1.7189359467973397, + "grad_norm": 1.1292616128921509, + "learning_rate": 0.00021203158458330947, + "loss": 2.0891, + "step": 14733 + }, + { + "epoch": 1.7190526192976314, + "grad_norm": 1.1747193336486816, + "learning_rate": 0.00021201777744957472, + "loss": 2.0714, + "step": 14734 + }, + { + "epoch": 1.719169291797923, + "grad_norm": 1.1153743267059326, + "learning_rate": 0.0002120039696884586, + "loss": 2.1706, + "step": 14735 + }, + { + "epoch": 1.7192859642982148, + "grad_norm": 0.8985786437988281, + "learning_rate": 0.00021199016130010425, + "loss": 1.8869, + "step": 14736 + }, + { + "epoch": 1.7194026367985065, + "grad_norm": 1.1967878341674805, + "learning_rate": 0.00021197635228465477, + "loss": 2.1952, + "step": 14737 + }, + { + "epoch": 1.7195193092987981, + "grad_norm": 1.1915451288223267, + "learning_rate": 0.00021196254264225345, + "loss": 2.0782, + "step": 14738 + }, + { + "epoch": 1.7196359817990898, + "grad_norm": 1.2930539846420288, + "learning_rate": 0.00021194873237304342, + "loss": 2.088, + "step": 14739 + }, + { + "epoch": 1.7197526542993815, + "grad_norm": 1.2786500453948975, + "learning_rate": 0.00021193492147716776, + "loss": 2.0731, + "step": 14740 + }, + { + "epoch": 1.7198693267996732, + "grad_norm": 1.1022309064865112, + "learning_rate": 0.0002119211099547697, + "loss": 2.1558, + "step": 14741 + }, + { + "epoch": 1.7199859992999649, + "grad_norm": 1.2020224332809448, + "learning_rate": 0.00021190729780599243, + "loss": 2.1748, + "step": 14742 + }, + { + "epoch": 1.7201026718002566, + "grad_norm": 1.1784555912017822, + "learning_rate": 0.00021189348503097914, + "loss": 2.1147, + "step": 14743 + }, + { + "epoch": 1.7202193443005482, + "grad_norm": 1.0343554019927979, + "learning_rate": 0.00021187967162987297, + "loss": 1.8836, + "step": 14744 + }, + { + "epoch": 1.72033601680084, + "grad_norm": 1.1974188089370728, + "learning_rate": 0.0002118658576028172, + "loss": 1.9944, + "step": 14745 + }, + { + "epoch": 1.7204526893011316, + "grad_norm": 1.2873984575271606, + "learning_rate": 0.00021185204294995496, + "loss": 2.2365, + "step": 14746 + }, + { + "epoch": 1.7205693618014233, + "grad_norm": 1.222235083580017, + "learning_rate": 0.00021183822767142956, + "loss": 2.1164, + "step": 14747 + }, + { + "epoch": 1.720686034301715, + "grad_norm": 1.0977035760879517, + "learning_rate": 0.00021182441176738416, + "loss": 2.115, + "step": 14748 + }, + { + "epoch": 1.7208027068020066, + "grad_norm": 1.3553600311279297, + "learning_rate": 0.00021181059523796197, + "loss": 2.2127, + "step": 14749 + }, + { + "epoch": 1.7209193793022983, + "grad_norm": 1.0795536041259766, + "learning_rate": 0.00021179677808330627, + "loss": 2.1102, + "step": 14750 + }, + { + "epoch": 1.72103605180259, + "grad_norm": 1.143198847770691, + "learning_rate": 0.0002117829603035603, + "loss": 1.9653, + "step": 14751 + }, + { + "epoch": 1.7211527243028817, + "grad_norm": 1.163653016090393, + "learning_rate": 0.00021176914189886725, + "loss": 2.2004, + "step": 14752 + }, + { + "epoch": 1.7212693968031734, + "grad_norm": 1.4298758506774902, + "learning_rate": 0.00021175532286937042, + "loss": 2.4032, + "step": 14753 + }, + { + "epoch": 1.721386069303465, + "grad_norm": 1.3222432136535645, + "learning_rate": 0.0002117415032152131, + "loss": 2.0201, + "step": 14754 + }, + { + "epoch": 1.7215027418037567, + "grad_norm": 1.3214422464370728, + "learning_rate": 0.0002117276829365385, + "loss": 2.2378, + "step": 14755 + }, + { + "epoch": 1.7216194143040484, + "grad_norm": 1.0845681428909302, + "learning_rate": 0.00021171386203348993, + "loss": 1.9803, + "step": 14756 + }, + { + "epoch": 1.72173608680434, + "grad_norm": 1.1440287828445435, + "learning_rate": 0.00021170004050621063, + "loss": 2.0101, + "step": 14757 + }, + { + "epoch": 1.7218527593046318, + "grad_norm": 1.1257798671722412, + "learning_rate": 0.00021168621835484396, + "loss": 2.0656, + "step": 14758 + }, + { + "epoch": 1.7219694318049235, + "grad_norm": 1.3628082275390625, + "learning_rate": 0.00021167239557953312, + "loss": 2.0421, + "step": 14759 + }, + { + "epoch": 1.7220861043052151, + "grad_norm": 1.0812840461730957, + "learning_rate": 0.0002116585721804215, + "loss": 2.0217, + "step": 14760 + }, + { + "epoch": 1.7222027768055068, + "grad_norm": 1.203890085220337, + "learning_rate": 0.00021164474815765232, + "loss": 2.1627, + "step": 14761 + }, + { + "epoch": 1.7223194493057985, + "grad_norm": 1.0942978858947754, + "learning_rate": 0.00021163092351136895, + "loss": 2.0627, + "step": 14762 + }, + { + "epoch": 1.7224361218060902, + "grad_norm": 1.3006536960601807, + "learning_rate": 0.00021161709824171467, + "loss": 2.1635, + "step": 14763 + }, + { + "epoch": 1.7225527943063819, + "grad_norm": 1.1431154012680054, + "learning_rate": 0.00021160327234883284, + "loss": 2.0337, + "step": 14764 + }, + { + "epoch": 1.7226694668066735, + "grad_norm": 1.0617650747299194, + "learning_rate": 0.00021158944583286676, + "loss": 2.1881, + "step": 14765 + }, + { + "epoch": 1.7227861393069652, + "grad_norm": 1.0169693231582642, + "learning_rate": 0.0002115756186939598, + "loss": 2.0442, + "step": 14766 + }, + { + "epoch": 1.722902811807257, + "grad_norm": 1.182148814201355, + "learning_rate": 0.00021156179093225532, + "loss": 2.1571, + "step": 14767 + }, + { + "epoch": 1.7230194843075486, + "grad_norm": 1.2748171091079712, + "learning_rate": 0.0002115479625478966, + "loss": 2.0859, + "step": 14768 + }, + { + "epoch": 1.7231361568078403, + "grad_norm": 1.2574115991592407, + "learning_rate": 0.00021153413354102707, + "loss": 1.944, + "step": 14769 + }, + { + "epoch": 1.723252829308132, + "grad_norm": 1.217231035232544, + "learning_rate": 0.00021152030391179003, + "loss": 2.0603, + "step": 14770 + }, + { + "epoch": 1.7233695018084236, + "grad_norm": 1.0197697877883911, + "learning_rate": 0.00021150647366032887, + "loss": 2.0678, + "step": 14771 + }, + { + "epoch": 1.7234861743087153, + "grad_norm": 1.1696709394454956, + "learning_rate": 0.00021149264278678701, + "loss": 2.0781, + "step": 14772 + }, + { + "epoch": 1.723602846809007, + "grad_norm": 1.0976179838180542, + "learning_rate": 0.00021147881129130779, + "loss": 2.1569, + "step": 14773 + }, + { + "epoch": 1.7237195193092987, + "grad_norm": 1.0996013879776, + "learning_rate": 0.00021146497917403463, + "loss": 2.0517, + "step": 14774 + }, + { + "epoch": 1.7238361918095904, + "grad_norm": 1.1510928869247437, + "learning_rate": 0.0002114511464351109, + "loss": 2.0223, + "step": 14775 + }, + { + "epoch": 1.723952864309882, + "grad_norm": 1.2569161653518677, + "learning_rate": 0.00021143731307467998, + "loss": 2.119, + "step": 14776 + }, + { + "epoch": 1.7240695368101737, + "grad_norm": 1.1639665365219116, + "learning_rate": 0.00021142347909288537, + "loss": 2.1388, + "step": 14777 + }, + { + "epoch": 1.7241862093104654, + "grad_norm": 1.0724401473999023, + "learning_rate": 0.00021140964448987037, + "loss": 2.1191, + "step": 14778 + }, + { + "epoch": 1.724302881810757, + "grad_norm": 1.133887529373169, + "learning_rate": 0.00021139580926577846, + "loss": 1.8868, + "step": 14779 + }, + { + "epoch": 1.7244195543110488, + "grad_norm": 1.0171563625335693, + "learning_rate": 0.0002113819734207531, + "loss": 1.9884, + "step": 14780 + }, + { + "epoch": 1.7245362268113404, + "grad_norm": 1.0205408334732056, + "learning_rate": 0.00021136813695493765, + "loss": 2.0525, + "step": 14781 + }, + { + "epoch": 1.7246528993116321, + "grad_norm": 1.236926555633545, + "learning_rate": 0.0002113542998684756, + "loss": 2.2492, + "step": 14782 + }, + { + "epoch": 1.7247695718119238, + "grad_norm": 1.0628480911254883, + "learning_rate": 0.00021134046216151044, + "loss": 2.1333, + "step": 14783 + }, + { + "epoch": 1.7248862443122155, + "grad_norm": 1.0609252452850342, + "learning_rate": 0.00021132662383418555, + "loss": 2.1756, + "step": 14784 + }, + { + "epoch": 1.7250029168125072, + "grad_norm": 0.9264306426048279, + "learning_rate": 0.00021131278488664438, + "loss": 1.8665, + "step": 14785 + }, + { + "epoch": 1.7251195893127989, + "grad_norm": 1.196976661682129, + "learning_rate": 0.00021129894531903047, + "loss": 2.1325, + "step": 14786 + }, + { + "epoch": 1.7252362618130905, + "grad_norm": 1.095808744430542, + "learning_rate": 0.0002112851051314872, + "loss": 1.9963, + "step": 14787 + }, + { + "epoch": 1.7253529343133822, + "grad_norm": 1.187860131263733, + "learning_rate": 0.00021127126432415817, + "loss": 2.1365, + "step": 14788 + }, + { + "epoch": 1.725469606813674, + "grad_norm": 1.2813807725906372, + "learning_rate": 0.00021125742289718668, + "loss": 2.3245, + "step": 14789 + }, + { + "epoch": 1.7255862793139656, + "grad_norm": 1.055713176727295, + "learning_rate": 0.00021124358085071644, + "loss": 1.8843, + "step": 14790 + }, + { + "epoch": 1.7257029518142573, + "grad_norm": 1.1051552295684814, + "learning_rate": 0.00021122973818489085, + "loss": 2.023, + "step": 14791 + }, + { + "epoch": 1.725819624314549, + "grad_norm": 1.0372743606567383, + "learning_rate": 0.00021121589489985342, + "loss": 2.1447, + "step": 14792 + }, + { + "epoch": 1.7259362968148406, + "grad_norm": 1.0333540439605713, + "learning_rate": 0.00021120205099574765, + "loss": 2.0512, + "step": 14793 + }, + { + "epoch": 1.7260529693151323, + "grad_norm": 1.2380682229995728, + "learning_rate": 0.00021118820647271703, + "loss": 2.2521, + "step": 14794 + }, + { + "epoch": 1.726169641815424, + "grad_norm": 1.1656053066253662, + "learning_rate": 0.00021117436133090512, + "loss": 2.0409, + "step": 14795 + }, + { + "epoch": 1.7262863143157157, + "grad_norm": 1.1453373432159424, + "learning_rate": 0.00021116051557045547, + "loss": 2.0229, + "step": 14796 + }, + { + "epoch": 1.7264029868160073, + "grad_norm": 1.1639872789382935, + "learning_rate": 0.00021114666919151164, + "loss": 2.1106, + "step": 14797 + }, + { + "epoch": 1.726519659316299, + "grad_norm": 1.1483306884765625, + "learning_rate": 0.00021113282219421706, + "loss": 1.9644, + "step": 14798 + }, + { + "epoch": 1.7266363318165907, + "grad_norm": 1.132930874824524, + "learning_rate": 0.00021111897457871538, + "loss": 1.9746, + "step": 14799 + }, + { + "epoch": 1.7267530043168824, + "grad_norm": 1.2426621913909912, + "learning_rate": 0.0002111051263451502, + "loss": 2.1279, + "step": 14800 + }, + { + "epoch": 1.726869676817174, + "grad_norm": 1.192627191543579, + "learning_rate": 0.00021109127749366496, + "loss": 2.0366, + "step": 14801 + }, + { + "epoch": 1.7269863493174658, + "grad_norm": 1.297097086906433, + "learning_rate": 0.00021107742802440327, + "loss": 1.9537, + "step": 14802 + }, + { + "epoch": 1.7271030218177574, + "grad_norm": 1.1061099767684937, + "learning_rate": 0.00021106357793750872, + "loss": 1.9652, + "step": 14803 + }, + { + "epoch": 1.7272196943180491, + "grad_norm": 1.252763032913208, + "learning_rate": 0.0002110497272331249, + "loss": 2.1168, + "step": 14804 + }, + { + "epoch": 1.7273363668183408, + "grad_norm": 1.0434306859970093, + "learning_rate": 0.0002110358759113954, + "loss": 2.0256, + "step": 14805 + }, + { + "epoch": 1.7274530393186325, + "grad_norm": 1.0312466621398926, + "learning_rate": 0.00021102202397246377, + "loss": 1.996, + "step": 14806 + }, + { + "epoch": 1.7275697118189242, + "grad_norm": 1.1553362607955933, + "learning_rate": 0.0002110081714164737, + "loss": 2.193, + "step": 14807 + }, + { + "epoch": 1.7276863843192158, + "grad_norm": 1.098556637763977, + "learning_rate": 0.00021099431824356872, + "loss": 2.1682, + "step": 14808 + }, + { + "epoch": 1.7278030568195075, + "grad_norm": 1.0770965814590454, + "learning_rate": 0.00021098046445389252, + "loss": 2.0184, + "step": 14809 + }, + { + "epoch": 1.7279197293197992, + "grad_norm": 1.1504732370376587, + "learning_rate": 0.00021096661004758858, + "loss": 1.9919, + "step": 14810 + }, + { + "epoch": 1.7280364018200909, + "grad_norm": 1.2060697078704834, + "learning_rate": 0.0002109527550248007, + "loss": 2.0687, + "step": 14811 + }, + { + "epoch": 1.7281530743203826, + "grad_norm": 1.1591756343841553, + "learning_rate": 0.00021093889938567236, + "loss": 2.0764, + "step": 14812 + }, + { + "epoch": 1.7282697468206742, + "grad_norm": 1.0969074964523315, + "learning_rate": 0.00021092504313034732, + "loss": 2.1311, + "step": 14813 + }, + { + "epoch": 1.728386419320966, + "grad_norm": 1.245343804359436, + "learning_rate": 0.00021091118625896912, + "loss": 2.0795, + "step": 14814 + }, + { + "epoch": 1.7285030918212576, + "grad_norm": 1.0030573606491089, + "learning_rate": 0.0002108973287716815, + "loss": 1.9516, + "step": 14815 + }, + { + "epoch": 1.7286197643215493, + "grad_norm": 1.2355371713638306, + "learning_rate": 0.0002108834706686281, + "loss": 2.1296, + "step": 14816 + }, + { + "epoch": 1.728736436821841, + "grad_norm": 1.0826243162155151, + "learning_rate": 0.0002108696119499526, + "loss": 1.8914, + "step": 14817 + }, + { + "epoch": 1.7288531093221327, + "grad_norm": 1.1525654792785645, + "learning_rate": 0.00021085575261579862, + "loss": 2.0742, + "step": 14818 + }, + { + "epoch": 1.7289697818224243, + "grad_norm": 1.1659173965454102, + "learning_rate": 0.0002108418926663099, + "loss": 2.0513, + "step": 14819 + }, + { + "epoch": 1.729086454322716, + "grad_norm": 1.0547728538513184, + "learning_rate": 0.00021082803210163, + "loss": 2.116, + "step": 14820 + }, + { + "epoch": 1.7292031268230077, + "grad_norm": 1.1577109098434448, + "learning_rate": 0.00021081417092190278, + "loss": 2.1049, + "step": 14821 + }, + { + "epoch": 1.7293197993232994, + "grad_norm": 1.144708514213562, + "learning_rate": 0.00021080030912727182, + "loss": 2.0272, + "step": 14822 + }, + { + "epoch": 1.729436471823591, + "grad_norm": 1.177838921546936, + "learning_rate": 0.00021078644671788092, + "loss": 2.0666, + "step": 14823 + }, + { + "epoch": 1.7295531443238827, + "grad_norm": 1.1230822801589966, + "learning_rate": 0.00021077258369387366, + "loss": 2.0562, + "step": 14824 + }, + { + "epoch": 1.7296698168241744, + "grad_norm": 1.0461374521255493, + "learning_rate": 0.00021075872005539385, + "loss": 1.9243, + "step": 14825 + }, + { + "epoch": 1.729786489324466, + "grad_norm": 1.0064129829406738, + "learning_rate": 0.0002107448558025852, + "loss": 2.0427, + "step": 14826 + }, + { + "epoch": 1.7299031618247578, + "grad_norm": 1.0394591093063354, + "learning_rate": 0.00021073099093559143, + "loss": 1.8906, + "step": 14827 + }, + { + "epoch": 1.7300198343250495, + "grad_norm": 1.1312577724456787, + "learning_rate": 0.00021071712545455624, + "loss": 2.1535, + "step": 14828 + }, + { + "epoch": 1.7301365068253411, + "grad_norm": 1.1075913906097412, + "learning_rate": 0.00021070325935962345, + "loss": 2.1222, + "step": 14829 + }, + { + "epoch": 1.7302531793256328, + "grad_norm": 1.1307346820831299, + "learning_rate": 0.0002106893926509367, + "loss": 1.9594, + "step": 14830 + }, + { + "epoch": 1.7303698518259245, + "grad_norm": 1.092448115348816, + "learning_rate": 0.0002106755253286399, + "loss": 1.9707, + "step": 14831 + }, + { + "epoch": 1.7304865243262162, + "grad_norm": 1.0326635837554932, + "learning_rate": 0.00021066165739287672, + "loss": 1.9418, + "step": 14832 + }, + { + "epoch": 1.7306031968265079, + "grad_norm": 1.0834527015686035, + "learning_rate": 0.00021064778884379086, + "loss": 2.0987, + "step": 14833 + }, + { + "epoch": 1.7307198693267996, + "grad_norm": 1.1746500730514526, + "learning_rate": 0.00021063391968152622, + "loss": 2.0379, + "step": 14834 + }, + { + "epoch": 1.7308365418270912, + "grad_norm": 1.1817271709442139, + "learning_rate": 0.00021062004990622652, + "loss": 1.8216, + "step": 14835 + }, + { + "epoch": 1.730953214327383, + "grad_norm": 1.1052159070968628, + "learning_rate": 0.00021060617951803554, + "loss": 2.1777, + "step": 14836 + }, + { + "epoch": 1.7310698868276746, + "grad_norm": 1.0967044830322266, + "learning_rate": 0.00021059230851709707, + "loss": 2.1437, + "step": 14837 + }, + { + "epoch": 1.7311865593279663, + "grad_norm": 1.000797986984253, + "learning_rate": 0.00021057843690355492, + "loss": 2.0124, + "step": 14838 + }, + { + "epoch": 1.731303231828258, + "grad_norm": 1.0734697580337524, + "learning_rate": 0.00021056456467755292, + "loss": 1.866, + "step": 14839 + }, + { + "epoch": 1.7314199043285496, + "grad_norm": 1.1879676580429077, + "learning_rate": 0.00021055069183923485, + "loss": 1.9651, + "step": 14840 + }, + { + "epoch": 1.7315365768288413, + "grad_norm": 1.0159657001495361, + "learning_rate": 0.0002105368183887445, + "loss": 2.0089, + "step": 14841 + }, + { + "epoch": 1.731653249329133, + "grad_norm": 1.2150191068649292, + "learning_rate": 0.00021052294432622576, + "loss": 1.9684, + "step": 14842 + }, + { + "epoch": 1.7317699218294247, + "grad_norm": 1.057521104812622, + "learning_rate": 0.00021050906965182245, + "loss": 2.0846, + "step": 14843 + }, + { + "epoch": 1.7318865943297164, + "grad_norm": 1.2937610149383545, + "learning_rate": 0.00021049519436567836, + "loss": 1.8779, + "step": 14844 + }, + { + "epoch": 1.732003266830008, + "grad_norm": 1.1126199960708618, + "learning_rate": 0.00021048131846793737, + "loss": 2.0195, + "step": 14845 + }, + { + "epoch": 1.7321199393302997, + "grad_norm": 1.0981595516204834, + "learning_rate": 0.0002104674419587434, + "loss": 2.1178, + "step": 14846 + }, + { + "epoch": 1.7322366118305914, + "grad_norm": 1.1876598596572876, + "learning_rate": 0.00021045356483824013, + "loss": 2.213, + "step": 14847 + }, + { + "epoch": 1.732353284330883, + "grad_norm": 1.3628908395767212, + "learning_rate": 0.00021043968710657155, + "loss": 2.2172, + "step": 14848 + }, + { + "epoch": 1.7324699568311748, + "grad_norm": 1.1594353914260864, + "learning_rate": 0.00021042580876388153, + "loss": 2.2801, + "step": 14849 + }, + { + "epoch": 1.7325866293314665, + "grad_norm": 1.080692172050476, + "learning_rate": 0.0002104119298103139, + "loss": 2.0884, + "step": 14850 + }, + { + "epoch": 1.7327033018317581, + "grad_norm": 1.2648009061813354, + "learning_rate": 0.00021039805024601252, + "loss": 2.0812, + "step": 14851 + }, + { + "epoch": 1.7328199743320498, + "grad_norm": 1.2265098094940186, + "learning_rate": 0.00021038417007112137, + "loss": 2.3311, + "step": 14852 + }, + { + "epoch": 1.7329366468323415, + "grad_norm": 1.175098180770874, + "learning_rate": 0.0002103702892857843, + "loss": 1.9776, + "step": 14853 + }, + { + "epoch": 1.7330533193326332, + "grad_norm": 1.0605928897857666, + "learning_rate": 0.00021035640789014517, + "loss": 1.9987, + "step": 14854 + }, + { + "epoch": 1.7331699918329249, + "grad_norm": 1.3227804899215698, + "learning_rate": 0.00021034252588434787, + "loss": 2.1888, + "step": 14855 + }, + { + "epoch": 1.7332866643332165, + "grad_norm": 1.1989418268203735, + "learning_rate": 0.00021032864326853647, + "loss": 2.122, + "step": 14856 + }, + { + "epoch": 1.7334033368335082, + "grad_norm": 1.186477541923523, + "learning_rate": 0.00021031476004285468, + "loss": 2.0802, + "step": 14857 + }, + { + "epoch": 1.7335200093338, + "grad_norm": 1.431248426437378, + "learning_rate": 0.00021030087620744657, + "loss": 2.1464, + "step": 14858 + }, + { + "epoch": 1.7336366818340916, + "grad_norm": 1.1170969009399414, + "learning_rate": 0.00021028699176245603, + "loss": 1.9664, + "step": 14859 + }, + { + "epoch": 1.7337533543343833, + "grad_norm": 1.0759012699127197, + "learning_rate": 0.00021027310670802701, + "loss": 2.193, + "step": 14860 + }, + { + "epoch": 1.733870026834675, + "grad_norm": 1.1464890241622925, + "learning_rate": 0.00021025922104430346, + "loss": 2.1601, + "step": 14861 + }, + { + "epoch": 1.7339866993349666, + "grad_norm": 1.185364842414856, + "learning_rate": 0.0002102453347714293, + "loss": 2.1864, + "step": 14862 + }, + { + "epoch": 1.7341033718352583, + "grad_norm": 1.2004389762878418, + "learning_rate": 0.00021023144788954846, + "loss": 2.2194, + "step": 14863 + }, + { + "epoch": 1.73422004433555, + "grad_norm": 1.044775366783142, + "learning_rate": 0.00021021756039880498, + "loss": 1.9771, + "step": 14864 + }, + { + "epoch": 1.7343367168358417, + "grad_norm": 1.3413721323013306, + "learning_rate": 0.0002102036722993428, + "loss": 2.0483, + "step": 14865 + }, + { + "epoch": 1.7344533893361334, + "grad_norm": 1.062008261680603, + "learning_rate": 0.00021018978359130589, + "loss": 2.0546, + "step": 14866 + }, + { + "epoch": 1.734570061836425, + "grad_norm": 1.1010346412658691, + "learning_rate": 0.00021017589427483824, + "loss": 2.0987, + "step": 14867 + }, + { + "epoch": 1.7346867343367167, + "grad_norm": 1.1668850183486938, + "learning_rate": 0.00021016200435008378, + "loss": 2.0916, + "step": 14868 + }, + { + "epoch": 1.7348034068370084, + "grad_norm": 1.0394082069396973, + "learning_rate": 0.0002101481138171866, + "loss": 2.0643, + "step": 14869 + }, + { + "epoch": 1.7349200793373, + "grad_norm": 1.0336161851882935, + "learning_rate": 0.00021013422267629066, + "loss": 2.027, + "step": 14870 + }, + { + "epoch": 1.7350367518375918, + "grad_norm": 1.1965688467025757, + "learning_rate": 0.00021012033092753998, + "loss": 2.1135, + "step": 14871 + }, + { + "epoch": 1.7351534243378834, + "grad_norm": 1.185316801071167, + "learning_rate": 0.0002101064385710786, + "loss": 2.2261, + "step": 14872 + }, + { + "epoch": 1.7352700968381751, + "grad_norm": 1.0074149370193481, + "learning_rate": 0.00021009254560705042, + "loss": 2.1329, + "step": 14873 + }, + { + "epoch": 1.7353867693384668, + "grad_norm": 1.0555943250656128, + "learning_rate": 0.0002100786520355996, + "loss": 2.09, + "step": 14874 + }, + { + "epoch": 1.7355034418387585, + "grad_norm": 1.2901424169540405, + "learning_rate": 0.00021006475785687007, + "loss": 2.1953, + "step": 14875 + }, + { + "epoch": 1.7356201143390502, + "grad_norm": 1.0870999097824097, + "learning_rate": 0.000210050863071006, + "loss": 1.9178, + "step": 14876 + }, + { + "epoch": 1.7357367868393418, + "grad_norm": 1.278326153755188, + "learning_rate": 0.00021003696767815132, + "loss": 1.922, + "step": 14877 + }, + { + "epoch": 1.7358534593396335, + "grad_norm": 1.0512137413024902, + "learning_rate": 0.00021002307167845014, + "loss": 2.0309, + "step": 14878 + }, + { + "epoch": 1.7359701318399252, + "grad_norm": 1.105742335319519, + "learning_rate": 0.00021000917507204653, + "loss": 1.9802, + "step": 14879 + }, + { + "epoch": 1.736086804340217, + "grad_norm": 1.2000067234039307, + "learning_rate": 0.0002099952778590844, + "loss": 2.1236, + "step": 14880 + }, + { + "epoch": 1.7362034768405086, + "grad_norm": 1.0417227745056152, + "learning_rate": 0.00020998138003970804, + "loss": 1.893, + "step": 14881 + }, + { + "epoch": 1.7363201493408003, + "grad_norm": 1.0292638540267944, + "learning_rate": 0.0002099674816140614, + "loss": 2.0232, + "step": 14882 + }, + { + "epoch": 1.736436821841092, + "grad_norm": 1.1638883352279663, + "learning_rate": 0.0002099535825822886, + "loss": 2.1132, + "step": 14883 + }, + { + "epoch": 1.7365534943413836, + "grad_norm": 1.1220508813858032, + "learning_rate": 0.00020993968294453377, + "loss": 1.9202, + "step": 14884 + }, + { + "epoch": 1.7366701668416753, + "grad_norm": 1.090781331062317, + "learning_rate": 0.0002099257827009409, + "loss": 2.1504, + "step": 14885 + }, + { + "epoch": 1.736786839341967, + "grad_norm": 1.1105457544326782, + "learning_rate": 0.00020991188185165424, + "loss": 1.9919, + "step": 14886 + }, + { + "epoch": 1.7369035118422587, + "grad_norm": 1.0467944145202637, + "learning_rate": 0.00020989798039681772, + "loss": 1.822, + "step": 14887 + }, + { + "epoch": 1.7370201843425503, + "grad_norm": 1.0597478151321411, + "learning_rate": 0.0002098840783365756, + "loss": 2.0628, + "step": 14888 + }, + { + "epoch": 1.737136856842842, + "grad_norm": 1.1576836109161377, + "learning_rate": 0.00020987017567107195, + "loss": 2.1245, + "step": 14889 + }, + { + "epoch": 1.7372535293431337, + "grad_norm": 1.1985926628112793, + "learning_rate": 0.00020985627240045087, + "loss": 2.114, + "step": 14890 + }, + { + "epoch": 1.7373702018434254, + "grad_norm": 1.1898958683013916, + "learning_rate": 0.0002098423685248565, + "loss": 2.2113, + "step": 14891 + }, + { + "epoch": 1.737486874343717, + "grad_norm": 1.2052793502807617, + "learning_rate": 0.00020982846404443304, + "loss": 2.0315, + "step": 14892 + }, + { + "epoch": 1.7376035468440088, + "grad_norm": 1.1761999130249023, + "learning_rate": 0.0002098145589593246, + "loss": 2.122, + "step": 14893 + }, + { + "epoch": 1.7377202193443004, + "grad_norm": 0.9770725965499878, + "learning_rate": 0.0002098006532696753, + "loss": 2.076, + "step": 14894 + }, + { + "epoch": 1.7378368918445921, + "grad_norm": 1.3515968322753906, + "learning_rate": 0.00020978674697562935, + "loss": 2.1123, + "step": 14895 + }, + { + "epoch": 1.7379535643448838, + "grad_norm": 1.2316457033157349, + "learning_rate": 0.0002097728400773309, + "loss": 2.0385, + "step": 14896 + }, + { + "epoch": 1.7380702368451755, + "grad_norm": 1.219012975692749, + "learning_rate": 0.00020975893257492407, + "loss": 1.9973, + "step": 14897 + }, + { + "epoch": 1.7381869093454672, + "grad_norm": 1.2095704078674316, + "learning_rate": 0.0002097450244685531, + "loss": 2.1242, + "step": 14898 + }, + { + "epoch": 1.7383035818457588, + "grad_norm": 1.3278099298477173, + "learning_rate": 0.00020973111575836218, + "loss": 2.1166, + "step": 14899 + }, + { + "epoch": 1.7384202543460505, + "grad_norm": 1.0303385257720947, + "learning_rate": 0.00020971720644449548, + "loss": 2.0006, + "step": 14900 + }, + { + "epoch": 1.7385369268463422, + "grad_norm": 1.247766375541687, + "learning_rate": 0.00020970329652709717, + "loss": 2.0175, + "step": 14901 + }, + { + "epoch": 1.7386535993466339, + "grad_norm": 1.0970019102096558, + "learning_rate": 0.0002096893860063115, + "loss": 2.1163, + "step": 14902 + }, + { + "epoch": 1.7387702718469256, + "grad_norm": 1.3694044351577759, + "learning_rate": 0.00020967547488228257, + "loss": 2.1215, + "step": 14903 + }, + { + "epoch": 1.7388869443472172, + "grad_norm": 1.140128254890442, + "learning_rate": 0.0002096615631551548, + "loss": 2.1105, + "step": 14904 + }, + { + "epoch": 1.739003616847509, + "grad_norm": 1.2861186265945435, + "learning_rate": 0.00020964765082507223, + "loss": 2.2502, + "step": 14905 + }, + { + "epoch": 1.7391202893478006, + "grad_norm": 1.1320801973342896, + "learning_rate": 0.0002096337378921791, + "loss": 2.1006, + "step": 14906 + }, + { + "epoch": 1.7392369618480923, + "grad_norm": 1.2476962804794312, + "learning_rate": 0.00020961982435661975, + "loss": 2.1007, + "step": 14907 + }, + { + "epoch": 1.739353634348384, + "grad_norm": 1.1773394346237183, + "learning_rate": 0.00020960591021853831, + "loss": 2.0895, + "step": 14908 + }, + { + "epoch": 1.7394703068486757, + "grad_norm": 1.2230093479156494, + "learning_rate": 0.0002095919954780791, + "loss": 2.2062, + "step": 14909 + }, + { + "epoch": 1.7395869793489673, + "grad_norm": 1.1479644775390625, + "learning_rate": 0.00020957808013538642, + "loss": 2.1764, + "step": 14910 + }, + { + "epoch": 1.739703651849259, + "grad_norm": 1.3681615591049194, + "learning_rate": 0.0002095641641906044, + "loss": 2.0217, + "step": 14911 + }, + { + "epoch": 1.7398203243495507, + "grad_norm": 0.9301050305366516, + "learning_rate": 0.00020955024764387738, + "loss": 1.825, + "step": 14912 + }, + { + "epoch": 1.7399369968498424, + "grad_norm": 1.2595964670181274, + "learning_rate": 0.00020953633049534964, + "loss": 2.0432, + "step": 14913 + }, + { + "epoch": 1.740053669350134, + "grad_norm": 1.3524460792541504, + "learning_rate": 0.00020952241274516537, + "loss": 1.988, + "step": 14914 + }, + { + "epoch": 1.7401703418504257, + "grad_norm": 1.3149538040161133, + "learning_rate": 0.00020950849439346897, + "loss": 2.0262, + "step": 14915 + }, + { + "epoch": 1.7402870143507174, + "grad_norm": 1.0373003482818604, + "learning_rate": 0.00020949457544040465, + "loss": 2.1533, + "step": 14916 + }, + { + "epoch": 1.740403686851009, + "grad_norm": 1.1644452810287476, + "learning_rate": 0.00020948065588611675, + "loss": 2.0682, + "step": 14917 + }, + { + "epoch": 1.7405203593513008, + "grad_norm": 1.1170802116394043, + "learning_rate": 0.00020946673573074952, + "loss": 2.0666, + "step": 14918 + }, + { + "epoch": 1.7406370318515925, + "grad_norm": 1.1095914840698242, + "learning_rate": 0.00020945281497444736, + "loss": 2.0879, + "step": 14919 + }, + { + "epoch": 1.7407537043518841, + "grad_norm": 1.0453578233718872, + "learning_rate": 0.00020943889361735445, + "loss": 2.0192, + "step": 14920 + }, + { + "epoch": 1.7408703768521758, + "grad_norm": 1.1975280046463013, + "learning_rate": 0.00020942497165961525, + "loss": 2.0322, + "step": 14921 + }, + { + "epoch": 1.7409870493524675, + "grad_norm": 1.227210521697998, + "learning_rate": 0.00020941104910137404, + "loss": 2.0743, + "step": 14922 + }, + { + "epoch": 1.7411037218527592, + "grad_norm": 1.1249889135360718, + "learning_rate": 0.00020939712594277514, + "loss": 2.0523, + "step": 14923 + }, + { + "epoch": 1.7412203943530509, + "grad_norm": 1.1610205173492432, + "learning_rate": 0.00020938320218396287, + "loss": 2.1854, + "step": 14924 + }, + { + "epoch": 1.7413370668533426, + "grad_norm": 1.2713919878005981, + "learning_rate": 0.0002093692778250816, + "loss": 2.2262, + "step": 14925 + }, + { + "epoch": 1.7414537393536342, + "grad_norm": 1.0803020000457764, + "learning_rate": 0.0002093553528662757, + "loss": 2.3124, + "step": 14926 + }, + { + "epoch": 1.741570411853926, + "grad_norm": 1.1417897939682007, + "learning_rate": 0.0002093414273076895, + "loss": 1.9798, + "step": 14927 + }, + { + "epoch": 1.7416870843542176, + "grad_norm": 1.3138704299926758, + "learning_rate": 0.0002093275011494674, + "loss": 1.9954, + "step": 14928 + }, + { + "epoch": 1.7418037568545093, + "grad_norm": 1.1354013681411743, + "learning_rate": 0.00020931357439175365, + "loss": 2.1178, + "step": 14929 + }, + { + "epoch": 1.741920429354801, + "grad_norm": 0.9195387959480286, + "learning_rate": 0.00020929964703469284, + "loss": 2.02, + "step": 14930 + }, + { + "epoch": 1.7420371018550926, + "grad_norm": 1.1685974597930908, + "learning_rate": 0.00020928571907842918, + "loss": 2.1221, + "step": 14931 + }, + { + "epoch": 1.7421537743553843, + "grad_norm": 1.2187258005142212, + "learning_rate": 0.00020927179052310712, + "loss": 2.1277, + "step": 14932 + }, + { + "epoch": 1.742270446855676, + "grad_norm": 1.0510704517364502, + "learning_rate": 0.00020925786136887102, + "loss": 2.1116, + "step": 14933 + }, + { + "epoch": 1.7423871193559677, + "grad_norm": 1.0091884136199951, + "learning_rate": 0.00020924393161586533, + "loss": 1.9142, + "step": 14934 + }, + { + "epoch": 1.7425037918562594, + "grad_norm": 1.0293246507644653, + "learning_rate": 0.0002092300012642345, + "loss": 2.0302, + "step": 14935 + }, + { + "epoch": 1.742620464356551, + "grad_norm": 1.2563374042510986, + "learning_rate": 0.00020921607031412282, + "loss": 2.1635, + "step": 14936 + }, + { + "epoch": 1.7427371368568427, + "grad_norm": 1.110595464706421, + "learning_rate": 0.00020920213876567478, + "loss": 2.2139, + "step": 14937 + }, + { + "epoch": 1.7428538093571344, + "grad_norm": 1.2269446849822998, + "learning_rate": 0.00020918820661903486, + "loss": 2.0971, + "step": 14938 + }, + { + "epoch": 1.742970481857426, + "grad_norm": 1.1814924478530884, + "learning_rate": 0.00020917427387434742, + "loss": 2.0424, + "step": 14939 + }, + { + "epoch": 1.7430871543577178, + "grad_norm": 1.0170536041259766, + "learning_rate": 0.00020916034053175688, + "loss": 2.0043, + "step": 14940 + }, + { + "epoch": 1.7432038268580095, + "grad_norm": 1.2250795364379883, + "learning_rate": 0.00020914640659140774, + "loss": 2.0859, + "step": 14941 + }, + { + "epoch": 1.7433204993583011, + "grad_norm": 1.2031440734863281, + "learning_rate": 0.0002091324720534444, + "loss": 1.9363, + "step": 14942 + }, + { + "epoch": 1.7434371718585928, + "grad_norm": 1.3161109685897827, + "learning_rate": 0.0002091185369180114, + "loss": 2.0843, + "step": 14943 + }, + { + "epoch": 1.7435538443588845, + "grad_norm": 1.0920389890670776, + "learning_rate": 0.00020910460118525313, + "loss": 2.112, + "step": 14944 + }, + { + "epoch": 1.7436705168591762, + "grad_norm": 1.2131303548812866, + "learning_rate": 0.0002090906648553141, + "loss": 1.9811, + "step": 14945 + }, + { + "epoch": 1.7437871893594679, + "grad_norm": 1.3480737209320068, + "learning_rate": 0.00020907672792833877, + "loss": 2.2883, + "step": 14946 + }, + { + "epoch": 1.7439038618597595, + "grad_norm": 1.0581141710281372, + "learning_rate": 0.00020906279040447162, + "loss": 1.9946, + "step": 14947 + }, + { + "epoch": 1.7440205343600512, + "grad_norm": 1.1295082569122314, + "learning_rate": 0.00020904885228385716, + "loss": 2.1175, + "step": 14948 + }, + { + "epoch": 1.744137206860343, + "grad_norm": 1.21942138671875, + "learning_rate": 0.0002090349135666398, + "loss": 2.0554, + "step": 14949 + }, + { + "epoch": 1.7442538793606346, + "grad_norm": 1.093847393989563, + "learning_rate": 0.0002090209742529642, + "loss": 1.9934, + "step": 14950 + }, + { + "epoch": 1.7443705518609263, + "grad_norm": 1.202554702758789, + "learning_rate": 0.00020900703434297477, + "loss": 1.9573, + "step": 14951 + }, + { + "epoch": 1.744487224361218, + "grad_norm": 1.05198073387146, + "learning_rate": 0.00020899309383681596, + "loss": 2.0268, + "step": 14952 + }, + { + "epoch": 1.7446038968615096, + "grad_norm": 1.2146390676498413, + "learning_rate": 0.00020897915273463243, + "loss": 2.1962, + "step": 14953 + }, + { + "epoch": 1.7447205693618013, + "grad_norm": 1.1693676710128784, + "learning_rate": 0.00020896521103656857, + "loss": 2.3213, + "step": 14954 + }, + { + "epoch": 1.744837241862093, + "grad_norm": 1.1358951330184937, + "learning_rate": 0.00020895126874276906, + "loss": 2.0166, + "step": 14955 + }, + { + "epoch": 1.7449539143623847, + "grad_norm": 1.313210368156433, + "learning_rate": 0.0002089373258533783, + "loss": 2.0889, + "step": 14956 + }, + { + "epoch": 1.7450705868626764, + "grad_norm": 1.098242998123169, + "learning_rate": 0.00020892338236854087, + "loss": 2.1608, + "step": 14957 + }, + { + "epoch": 1.745187259362968, + "grad_norm": 1.1762816905975342, + "learning_rate": 0.00020890943828840138, + "loss": 2.1959, + "step": 14958 + }, + { + "epoch": 1.7453039318632597, + "grad_norm": 1.0498496294021606, + "learning_rate": 0.00020889549361310431, + "loss": 1.979, + "step": 14959 + }, + { + "epoch": 1.7454206043635514, + "grad_norm": 1.1475934982299805, + "learning_rate": 0.0002088815483427943, + "loss": 2.0334, + "step": 14960 + }, + { + "epoch": 1.745537276863843, + "grad_norm": 1.352053165435791, + "learning_rate": 0.00020886760247761587, + "loss": 2.2041, + "step": 14961 + }, + { + "epoch": 1.7456539493641348, + "grad_norm": 1.127570390701294, + "learning_rate": 0.0002088536560177136, + "loss": 2.1154, + "step": 14962 + }, + { + "epoch": 1.7457706218644264, + "grad_norm": 1.0905613899230957, + "learning_rate": 0.0002088397089632321, + "loss": 1.9612, + "step": 14963 + }, + { + "epoch": 1.7458872943647181, + "grad_norm": 0.9685848355293274, + "learning_rate": 0.00020882576131431592, + "loss": 1.9567, + "step": 14964 + }, + { + "epoch": 1.7460039668650098, + "grad_norm": 1.0089948177337646, + "learning_rate": 0.00020881181307110967, + "loss": 1.9292, + "step": 14965 + }, + { + "epoch": 1.7461206393653015, + "grad_norm": 1.1143457889556885, + "learning_rate": 0.00020879786423375798, + "loss": 1.9505, + "step": 14966 + }, + { + "epoch": 1.7462373118655932, + "grad_norm": 1.2028216123580933, + "learning_rate": 0.00020878391480240536, + "loss": 1.8897, + "step": 14967 + }, + { + "epoch": 1.7463539843658848, + "grad_norm": 1.093466877937317, + "learning_rate": 0.00020876996477719653, + "loss": 1.9647, + "step": 14968 + }, + { + "epoch": 1.7464706568661765, + "grad_norm": 1.1707864999771118, + "learning_rate": 0.00020875601415827605, + "loss": 2.0993, + "step": 14969 + }, + { + "epoch": 1.7465873293664682, + "grad_norm": 1.1789411306381226, + "learning_rate": 0.00020874206294578857, + "loss": 2.0436, + "step": 14970 + }, + { + "epoch": 1.74670400186676, + "grad_norm": 1.192944884300232, + "learning_rate": 0.0002087281111398787, + "loss": 1.9356, + "step": 14971 + }, + { + "epoch": 1.7468206743670516, + "grad_norm": 1.0980350971221924, + "learning_rate": 0.00020871415874069112, + "loss": 1.9659, + "step": 14972 + }, + { + "epoch": 1.7469373468673433, + "grad_norm": 1.0650924444198608, + "learning_rate": 0.00020870020574837039, + "loss": 1.9123, + "step": 14973 + }, + { + "epoch": 1.747054019367635, + "grad_norm": 1.1376279592514038, + "learning_rate": 0.00020868625216306124, + "loss": 2.0933, + "step": 14974 + }, + { + "epoch": 1.7471706918679266, + "grad_norm": 1.026285171508789, + "learning_rate": 0.00020867229798490828, + "loss": 2.0508, + "step": 14975 + }, + { + "epoch": 1.7472873643682183, + "grad_norm": 1.1254489421844482, + "learning_rate": 0.0002086583432140562, + "loss": 2.016, + "step": 14976 + }, + { + "epoch": 1.74740403686851, + "grad_norm": 1.2567837238311768, + "learning_rate": 0.00020864438785064967, + "loss": 2.2996, + "step": 14977 + }, + { + "epoch": 1.7475207093688017, + "grad_norm": 1.2747858762741089, + "learning_rate": 0.0002086304318948333, + "loss": 2.2748, + "step": 14978 + }, + { + "epoch": 1.7476373818690933, + "grad_norm": 1.1178295612335205, + "learning_rate": 0.00020861647534675185, + "loss": 2.1418, + "step": 14979 + }, + { + "epoch": 1.747754054369385, + "grad_norm": 1.05564284324646, + "learning_rate": 0.00020860251820655, + "loss": 1.9364, + "step": 14980 + }, + { + "epoch": 1.7478707268696767, + "grad_norm": 1.129490852355957, + "learning_rate": 0.0002085885604743724, + "loss": 1.9047, + "step": 14981 + }, + { + "epoch": 1.7479873993699684, + "grad_norm": 1.1080429553985596, + "learning_rate": 0.00020857460215036375, + "loss": 2.2345, + "step": 14982 + }, + { + "epoch": 1.74810407187026, + "grad_norm": 1.4284955263137817, + "learning_rate": 0.00020856064323466879, + "loss": 2.2178, + "step": 14983 + }, + { + "epoch": 1.7482207443705517, + "grad_norm": 1.1508824825286865, + "learning_rate": 0.00020854668372743217, + "loss": 2.1822, + "step": 14984 + }, + { + "epoch": 1.7483374168708434, + "grad_norm": 1.1701539754867554, + "learning_rate": 0.00020853272362879866, + "loss": 2.1733, + "step": 14985 + }, + { + "epoch": 1.7484540893711351, + "grad_norm": 1.0889235734939575, + "learning_rate": 0.00020851876293891296, + "loss": 2.055, + "step": 14986 + }, + { + "epoch": 1.7485707618714268, + "grad_norm": 1.092807412147522, + "learning_rate": 0.0002085048016579198, + "loss": 1.9873, + "step": 14987 + }, + { + "epoch": 1.7486874343717185, + "grad_norm": 1.1147040128707886, + "learning_rate": 0.00020849083978596397, + "loss": 2.2495, + "step": 14988 + }, + { + "epoch": 1.7488041068720102, + "grad_norm": 1.2173539400100708, + "learning_rate": 0.00020847687732319017, + "loss": 1.8586, + "step": 14989 + }, + { + "epoch": 1.7489207793723018, + "grad_norm": 0.9560040235519409, + "learning_rate": 0.00020846291426974314, + "loss": 1.7923, + "step": 14990 + }, + { + "epoch": 1.7490374518725935, + "grad_norm": 1.252889633178711, + "learning_rate": 0.0002084489506257676, + "loss": 2.1724, + "step": 14991 + }, + { + "epoch": 1.7491541243728852, + "grad_norm": 0.9567563533782959, + "learning_rate": 0.00020843498639140835, + "loss": 1.8396, + "step": 14992 + }, + { + "epoch": 1.7492707968731769, + "grad_norm": 1.1803838014602661, + "learning_rate": 0.00020842102156681016, + "loss": 2.029, + "step": 14993 + }, + { + "epoch": 1.7493874693734686, + "grad_norm": 1.2197471857070923, + "learning_rate": 0.00020840705615211778, + "loss": 2.0364, + "step": 14994 + }, + { + "epoch": 1.7495041418737602, + "grad_norm": 1.258655309677124, + "learning_rate": 0.000208393090147476, + "loss": 2.0693, + "step": 14995 + }, + { + "epoch": 1.749620814374052, + "grad_norm": 1.160464882850647, + "learning_rate": 0.00020837912355302962, + "loss": 2.0734, + "step": 14996 + }, + { + "epoch": 1.7497374868743436, + "grad_norm": 1.018418312072754, + "learning_rate": 0.0002083651563689234, + "loss": 1.9533, + "step": 14997 + }, + { + "epoch": 1.7498541593746353, + "grad_norm": 1.1153117418289185, + "learning_rate": 0.00020835118859530217, + "loss": 2.0536, + "step": 14998 + }, + { + "epoch": 1.749970831874927, + "grad_norm": 1.1527540683746338, + "learning_rate": 0.00020833722023231072, + "loss": 2.2217, + "step": 14999 + }, + { + "epoch": 1.7500875043752186, + "grad_norm": 1.0143566131591797, + "learning_rate": 0.0002083232512800938, + "loss": 2.031, + "step": 15000 + }, + { + "epoch": 1.7502041768755103, + "grad_norm": 1.1856870651245117, + "learning_rate": 0.00020830928173879633, + "loss": 2.0128, + "step": 15001 + }, + { + "epoch": 1.750320849375802, + "grad_norm": 1.3222450017929077, + "learning_rate": 0.0002082953116085631, + "loss": 2.2715, + "step": 15002 + }, + { + "epoch": 1.7504375218760937, + "grad_norm": 1.287526249885559, + "learning_rate": 0.00020828134088953884, + "loss": 2.0254, + "step": 15003 + }, + { + "epoch": 1.7505541943763854, + "grad_norm": 0.9818697571754456, + "learning_rate": 0.00020826736958186852, + "loss": 2.0334, + "step": 15004 + }, + { + "epoch": 1.750670866876677, + "grad_norm": 1.0922837257385254, + "learning_rate": 0.00020825339768569686, + "loss": 2.1791, + "step": 15005 + }, + { + "epoch": 1.7507875393769687, + "grad_norm": 1.070106029510498, + "learning_rate": 0.0002082394252011688, + "loss": 1.8716, + "step": 15006 + }, + { + "epoch": 1.7509042118772604, + "grad_norm": 1.0656380653381348, + "learning_rate": 0.00020822545212842918, + "loss": 2.1649, + "step": 15007 + }, + { + "epoch": 1.751020884377552, + "grad_norm": 1.1812361478805542, + "learning_rate": 0.00020821147846762278, + "loss": 2.1134, + "step": 15008 + }, + { + "epoch": 1.7511375568778438, + "grad_norm": 1.1305272579193115, + "learning_rate": 0.00020819750421889453, + "loss": 2.1589, + "step": 15009 + }, + { + "epoch": 1.7512542293781355, + "grad_norm": 1.2094353437423706, + "learning_rate": 0.0002081835293823893, + "loss": 2.2477, + "step": 15010 + }, + { + "epoch": 1.7513709018784271, + "grad_norm": 0.999411940574646, + "learning_rate": 0.0002081695539582519, + "loss": 2.0525, + "step": 15011 + }, + { + "epoch": 1.7514875743787188, + "grad_norm": 1.0922746658325195, + "learning_rate": 0.00020815557794662726, + "loss": 2.0718, + "step": 15012 + }, + { + "epoch": 1.7516042468790105, + "grad_norm": 1.2691601514816284, + "learning_rate": 0.0002081416013476603, + "loss": 2.143, + "step": 15013 + }, + { + "epoch": 1.7517209193793022, + "grad_norm": 1.1554689407348633, + "learning_rate": 0.00020812762416149588, + "loss": 1.9706, + "step": 15014 + }, + { + "epoch": 1.7518375918795939, + "grad_norm": 1.148353934288025, + "learning_rate": 0.00020811364638827893, + "loss": 2.2752, + "step": 15015 + }, + { + "epoch": 1.7519542643798856, + "grad_norm": 1.0361765623092651, + "learning_rate": 0.0002080996680281543, + "loss": 1.9705, + "step": 15016 + }, + { + "epoch": 1.7520709368801772, + "grad_norm": 1.0999183654785156, + "learning_rate": 0.00020808568908126692, + "loss": 1.8913, + "step": 15017 + }, + { + "epoch": 1.752187609380469, + "grad_norm": 1.022518277168274, + "learning_rate": 0.00020807170954776177, + "loss": 1.9054, + "step": 15018 + }, + { + "epoch": 1.7523042818807606, + "grad_norm": 1.2080084085464478, + "learning_rate": 0.00020805772942778368, + "loss": 2.0899, + "step": 15019 + }, + { + "epoch": 1.7524209543810523, + "grad_norm": 1.1702125072479248, + "learning_rate": 0.00020804374872147763, + "loss": 2.0646, + "step": 15020 + }, + { + "epoch": 1.752537626881344, + "grad_norm": 1.0137604475021362, + "learning_rate": 0.00020802976742898853, + "loss": 2.1202, + "step": 15021 + }, + { + "epoch": 1.7526542993816356, + "grad_norm": 1.07744300365448, + "learning_rate": 0.00020801578555046136, + "loss": 2.1774, + "step": 15022 + }, + { + "epoch": 1.7527709718819273, + "grad_norm": 1.1294142007827759, + "learning_rate": 0.00020800180308604106, + "loss": 2.1325, + "step": 15023 + }, + { + "epoch": 1.752887644382219, + "grad_norm": 1.100127935409546, + "learning_rate": 0.00020798782003587257, + "loss": 1.9859, + "step": 15024 + }, + { + "epoch": 1.7530043168825107, + "grad_norm": 1.1182454824447632, + "learning_rate": 0.00020797383640010087, + "loss": 2.0538, + "step": 15025 + }, + { + "epoch": 1.7531209893828024, + "grad_norm": 1.1494828462600708, + "learning_rate": 0.0002079598521788709, + "loss": 2.2162, + "step": 15026 + }, + { + "epoch": 1.753237661883094, + "grad_norm": 1.0840851068496704, + "learning_rate": 0.00020794586737232762, + "loss": 1.8625, + "step": 15027 + }, + { + "epoch": 1.7533543343833857, + "grad_norm": 1.1399226188659668, + "learning_rate": 0.00020793188198061608, + "loss": 2.0804, + "step": 15028 + }, + { + "epoch": 1.7534710068836774, + "grad_norm": 1.1915258169174194, + "learning_rate": 0.00020791789600388118, + "loss": 2.041, + "step": 15029 + }, + { + "epoch": 1.753587679383969, + "grad_norm": 1.201654076576233, + "learning_rate": 0.00020790390944226797, + "loss": 2.1762, + "step": 15030 + }, + { + "epoch": 1.7537043518842608, + "grad_norm": 1.101855754852295, + "learning_rate": 0.00020788992229592144, + "loss": 1.9581, + "step": 15031 + }, + { + "epoch": 1.7538210243845525, + "grad_norm": 1.1525819301605225, + "learning_rate": 0.00020787593456498657, + "loss": 2.0853, + "step": 15032 + }, + { + "epoch": 1.7539376968848441, + "grad_norm": 0.9596691131591797, + "learning_rate": 0.00020786194624960845, + "loss": 1.9986, + "step": 15033 + }, + { + "epoch": 1.7540543693851358, + "grad_norm": 1.07615327835083, + "learning_rate": 0.00020784795734993196, + "loss": 2.0637, + "step": 15034 + }, + { + "epoch": 1.7541710418854275, + "grad_norm": 1.0651676654815674, + "learning_rate": 0.00020783396786610217, + "loss": 1.8467, + "step": 15035 + }, + { + "epoch": 1.7542877143857192, + "grad_norm": 1.0872610807418823, + "learning_rate": 0.00020781997779826416, + "loss": 2.0778, + "step": 15036 + }, + { + "epoch": 1.7544043868860109, + "grad_norm": 1.1569650173187256, + "learning_rate": 0.00020780598714656293, + "loss": 2.2895, + "step": 15037 + }, + { + "epoch": 1.7545210593863025, + "grad_norm": 1.101141333580017, + "learning_rate": 0.00020779199591114343, + "loss": 2.1285, + "step": 15038 + }, + { + "epoch": 1.7546377318865942, + "grad_norm": 1.049960732460022, + "learning_rate": 0.00020777800409215092, + "loss": 2.0737, + "step": 15039 + }, + { + "epoch": 1.754754404386886, + "grad_norm": 1.1042442321777344, + "learning_rate": 0.0002077640116897303, + "loss": 2.0679, + "step": 15040 + }, + { + "epoch": 1.7548710768871776, + "grad_norm": 1.2460285425186157, + "learning_rate": 0.00020775001870402664, + "loss": 2.0572, + "step": 15041 + }, + { + "epoch": 1.7549877493874693, + "grad_norm": 1.0596370697021484, + "learning_rate": 0.00020773602513518503, + "loss": 2.0234, + "step": 15042 + }, + { + "epoch": 1.755104421887761, + "grad_norm": 1.0534346103668213, + "learning_rate": 0.0002077220309833505, + "loss": 2.1373, + "step": 15043 + }, + { + "epoch": 1.7552210943880526, + "grad_norm": 1.0828088521957397, + "learning_rate": 0.0002077080362486681, + "loss": 2.075, + "step": 15044 + }, + { + "epoch": 1.7553377668883443, + "grad_norm": 1.149765133857727, + "learning_rate": 0.00020769404093128307, + "loss": 2.1194, + "step": 15045 + }, + { + "epoch": 1.755454439388636, + "grad_norm": 1.0258897542953491, + "learning_rate": 0.00020768004503134035, + "loss": 2.0861, + "step": 15046 + }, + { + "epoch": 1.7555711118889277, + "grad_norm": 0.9756130576133728, + "learning_rate": 0.0002076660485489851, + "loss": 1.9911, + "step": 15047 + }, + { + "epoch": 1.7556877843892194, + "grad_norm": 1.0787094831466675, + "learning_rate": 0.00020765205148436236, + "loss": 2.0208, + "step": 15048 + }, + { + "epoch": 1.755804456889511, + "grad_norm": 0.9559562802314758, + "learning_rate": 0.0002076380538376173, + "loss": 2.0948, + "step": 15049 + }, + { + "epoch": 1.7559211293898027, + "grad_norm": 0.9976707100868225, + "learning_rate": 0.00020762405560889501, + "loss": 2.126, + "step": 15050 + }, + { + "epoch": 1.7560378018900944, + "grad_norm": 1.2445261478424072, + "learning_rate": 0.00020761005679834053, + "loss": 1.9454, + "step": 15051 + }, + { + "epoch": 1.756154474390386, + "grad_norm": 1.0110381841659546, + "learning_rate": 0.00020759605740609916, + "loss": 2.0352, + "step": 15052 + }, + { + "epoch": 1.7562711468906778, + "grad_norm": 1.0115216970443726, + "learning_rate": 0.0002075820574323159, + "loss": 1.9928, + "step": 15053 + }, + { + "epoch": 1.7563878193909694, + "grad_norm": 0.9544082880020142, + "learning_rate": 0.00020756805687713588, + "loss": 1.9418, + "step": 15054 + }, + { + "epoch": 1.7565044918912611, + "grad_norm": 1.1908464431762695, + "learning_rate": 0.00020755405574070433, + "loss": 2.0117, + "step": 15055 + }, + { + "epoch": 1.7566211643915528, + "grad_norm": 1.1074819564819336, + "learning_rate": 0.00020754005402316632, + "loss": 2.1314, + "step": 15056 + }, + { + "epoch": 1.7567378368918445, + "grad_norm": 1.2763952016830444, + "learning_rate": 0.00020752605172466695, + "loss": 2.0196, + "step": 15057 + }, + { + "epoch": 1.7568545093921362, + "grad_norm": 1.1084915399551392, + "learning_rate": 0.0002075120488453516, + "loss": 2.1274, + "step": 15058 + }, + { + "epoch": 1.7569711818924278, + "grad_norm": 1.2075111865997314, + "learning_rate": 0.00020749804538536522, + "loss": 1.9472, + "step": 15059 + }, + { + "epoch": 1.7570878543927195, + "grad_norm": 1.1458966732025146, + "learning_rate": 0.00020748404134485307, + "loss": 2.1627, + "step": 15060 + }, + { + "epoch": 1.7572045268930112, + "grad_norm": 0.9692326188087463, + "learning_rate": 0.00020747003672396032, + "loss": 2.1572, + "step": 15061 + }, + { + "epoch": 1.757321199393303, + "grad_norm": 1.068756103515625, + "learning_rate": 0.00020745603152283216, + "loss": 2.0606, + "step": 15062 + }, + { + "epoch": 1.7574378718935946, + "grad_norm": 1.190824270248413, + "learning_rate": 0.00020744202574161374, + "loss": 2.1199, + "step": 15063 + }, + { + "epoch": 1.7575545443938863, + "grad_norm": 1.0264068841934204, + "learning_rate": 0.0002074280193804503, + "loss": 2.0574, + "step": 15064 + }, + { + "epoch": 1.757671216894178, + "grad_norm": 1.0948853492736816, + "learning_rate": 0.00020741401243948703, + "loss": 2.0553, + "step": 15065 + }, + { + "epoch": 1.7577878893944696, + "grad_norm": 1.0888618230819702, + "learning_rate": 0.00020740000491886916, + "loss": 1.805, + "step": 15066 + }, + { + "epoch": 1.7579045618947613, + "grad_norm": 1.1883536577224731, + "learning_rate": 0.00020738599681874187, + "loss": 2.1698, + "step": 15067 + }, + { + "epoch": 1.758021234395053, + "grad_norm": 1.0814396142959595, + "learning_rate": 0.0002073719881392504, + "loss": 2.0795, + "step": 15068 + }, + { + "epoch": 1.7581379068953447, + "grad_norm": 1.1474463939666748, + "learning_rate": 0.00020735797888053997, + "loss": 2.0388, + "step": 15069 + }, + { + "epoch": 1.7582545793956363, + "grad_norm": 1.2306355237960815, + "learning_rate": 0.0002073439690427558, + "loss": 2.2024, + "step": 15070 + }, + { + "epoch": 1.758371251895928, + "grad_norm": 1.1764392852783203, + "learning_rate": 0.00020732995862604318, + "loss": 2.0382, + "step": 15071 + }, + { + "epoch": 1.7584879243962197, + "grad_norm": 1.0647072792053223, + "learning_rate": 0.00020731594763054726, + "loss": 1.8988, + "step": 15072 + }, + { + "epoch": 1.7586045968965114, + "grad_norm": 1.0594511032104492, + "learning_rate": 0.00020730193605641336, + "loss": 2.0867, + "step": 15073 + }, + { + "epoch": 1.758721269396803, + "grad_norm": 0.9944509267807007, + "learning_rate": 0.00020728792390378676, + "loss": 2.1087, + "step": 15074 + }, + { + "epoch": 1.7588379418970947, + "grad_norm": 1.2463679313659668, + "learning_rate": 0.00020727391117281264, + "loss": 2.1594, + "step": 15075 + }, + { + "epoch": 1.7589546143973864, + "grad_norm": 1.0685436725616455, + "learning_rate": 0.00020725989786363642, + "loss": 1.991, + "step": 15076 + }, + { + "epoch": 1.759071286897678, + "grad_norm": 1.2043132781982422, + "learning_rate": 0.00020724588397640312, + "loss": 2.0479, + "step": 15077 + }, + { + "epoch": 1.7591879593979698, + "grad_norm": 1.2086858749389648, + "learning_rate": 0.0002072318695112582, + "loss": 2.0447, + "step": 15078 + }, + { + "epoch": 1.7593046318982615, + "grad_norm": 1.1060172319412231, + "learning_rate": 0.00020721785446834695, + "loss": 2.1918, + "step": 15079 + }, + { + "epoch": 1.7594213043985532, + "grad_norm": 1.0184218883514404, + "learning_rate": 0.00020720383884781463, + "loss": 2.0302, + "step": 15080 + }, + { + "epoch": 1.7595379768988448, + "grad_norm": 1.1169483661651611, + "learning_rate": 0.00020718982264980658, + "loss": 2.0156, + "step": 15081 + }, + { + "epoch": 1.7596546493991365, + "grad_norm": 1.0049545764923096, + "learning_rate": 0.00020717580587446797, + "loss": 1.8349, + "step": 15082 + }, + { + "epoch": 1.7597713218994282, + "grad_norm": 1.0594476461410522, + "learning_rate": 0.00020716178852194425, + "loss": 2.0145, + "step": 15083 + }, + { + "epoch": 1.7598879943997199, + "grad_norm": 1.0463918447494507, + "learning_rate": 0.0002071477705923807, + "loss": 1.9941, + "step": 15084 + }, + { + "epoch": 1.7600046669000116, + "grad_norm": 1.0842838287353516, + "learning_rate": 0.00020713375208592262, + "loss": 2.1136, + "step": 15085 + }, + { + "epoch": 1.7601213394003032, + "grad_norm": 1.1079461574554443, + "learning_rate": 0.00020711973300271538, + "loss": 2.1018, + "step": 15086 + }, + { + "epoch": 1.760238011900595, + "grad_norm": 1.1952745914459229, + "learning_rate": 0.0002071057133429042, + "loss": 2.2292, + "step": 15087 + }, + { + "epoch": 1.7603546844008866, + "grad_norm": 1.2215149402618408, + "learning_rate": 0.00020709169310663457, + "loss": 1.9813, + "step": 15088 + }, + { + "epoch": 1.7604713569011783, + "grad_norm": 1.0113162994384766, + "learning_rate": 0.00020707767229405174, + "loss": 2.0069, + "step": 15089 + }, + { + "epoch": 1.76058802940147, + "grad_norm": 1.189348816871643, + "learning_rate": 0.00020706365090530107, + "loss": 2.069, + "step": 15090 + }, + { + "epoch": 1.7607047019017616, + "grad_norm": 1.0044337511062622, + "learning_rate": 0.00020704962894052797, + "loss": 2.0172, + "step": 15091 + }, + { + "epoch": 1.7608213744020533, + "grad_norm": 1.0895085334777832, + "learning_rate": 0.00020703560639987782, + "loss": 2.1235, + "step": 15092 + }, + { + "epoch": 1.760938046902345, + "grad_norm": 1.0941399335861206, + "learning_rate": 0.0002070215832834959, + "loss": 2.1568, + "step": 15093 + }, + { + "epoch": 1.7610547194026367, + "grad_norm": 1.2131576538085938, + "learning_rate": 0.0002070075595915276, + "loss": 2.1404, + "step": 15094 + }, + { + "epoch": 1.7611713919029284, + "grad_norm": 1.1643353700637817, + "learning_rate": 0.00020699353532411838, + "loss": 2.3003, + "step": 15095 + }, + { + "epoch": 1.76128806440322, + "grad_norm": 1.2229318618774414, + "learning_rate": 0.00020697951048141353, + "loss": 2.2082, + "step": 15096 + }, + { + "epoch": 1.7614047369035117, + "grad_norm": 1.0724852085113525, + "learning_rate": 0.00020696548506355856, + "loss": 2.0725, + "step": 15097 + }, + { + "epoch": 1.7615214094038034, + "grad_norm": 1.265248417854309, + "learning_rate": 0.00020695145907069878, + "loss": 2.1246, + "step": 15098 + }, + { + "epoch": 1.761638081904095, + "grad_norm": 1.1132642030715942, + "learning_rate": 0.00020693743250297958, + "loss": 2.2224, + "step": 15099 + }, + { + "epoch": 1.7617547544043868, + "grad_norm": 1.06089186668396, + "learning_rate": 0.00020692340536054645, + "loss": 1.951, + "step": 15100 + }, + { + "epoch": 1.7618714269046785, + "grad_norm": 1.0394327640533447, + "learning_rate": 0.00020690937764354474, + "loss": 1.8858, + "step": 15101 + }, + { + "epoch": 1.7619880994049701, + "grad_norm": 1.1297005414962769, + "learning_rate": 0.00020689534935212, + "loss": 2.0032, + "step": 15102 + }, + { + "epoch": 1.7621047719052618, + "grad_norm": 1.120957374572754, + "learning_rate": 0.00020688132048641745, + "loss": 2.0434, + "step": 15103 + }, + { + "epoch": 1.7622214444055535, + "grad_norm": 1.1447758674621582, + "learning_rate": 0.0002068672910465827, + "loss": 1.814, + "step": 15104 + }, + { + "epoch": 1.7623381169058452, + "grad_norm": 1.1004992723464966, + "learning_rate": 0.00020685326103276111, + "loss": 1.9412, + "step": 15105 + }, + { + "epoch": 1.7624547894061369, + "grad_norm": 0.9370938539505005, + "learning_rate": 0.00020683923044509817, + "loss": 2.0391, + "step": 15106 + }, + { + "epoch": 1.7625714619064285, + "grad_norm": 1.0220112800598145, + "learning_rate": 0.0002068251992837393, + "loss": 1.9078, + "step": 15107 + }, + { + "epoch": 1.7626881344067202, + "grad_norm": 1.0879319906234741, + "learning_rate": 0.00020681116754882995, + "loss": 2.0937, + "step": 15108 + }, + { + "epoch": 1.762804806907012, + "grad_norm": 1.1839728355407715, + "learning_rate": 0.0002067971352405156, + "loss": 2.1492, + "step": 15109 + }, + { + "epoch": 1.7629214794073036, + "grad_norm": 1.0430408716201782, + "learning_rate": 0.00020678310235894179, + "loss": 1.9671, + "step": 15110 + }, + { + "epoch": 1.7630381519075953, + "grad_norm": 1.3211238384246826, + "learning_rate": 0.00020676906890425395, + "loss": 2.0795, + "step": 15111 + }, + { + "epoch": 1.763154824407887, + "grad_norm": 1.2872223854064941, + "learning_rate": 0.00020675503487659747, + "loss": 2.0869, + "step": 15112 + }, + { + "epoch": 1.7632714969081786, + "grad_norm": 1.0316622257232666, + "learning_rate": 0.000206741000276118, + "loss": 2.1058, + "step": 15113 + }, + { + "epoch": 1.7633881694084703, + "grad_norm": 1.2141114473342896, + "learning_rate": 0.00020672696510296089, + "loss": 2.0462, + "step": 15114 + }, + { + "epoch": 1.763504841908762, + "grad_norm": 1.2691789865493774, + "learning_rate": 0.0002067129293572717, + "loss": 1.9389, + "step": 15115 + }, + { + "epoch": 1.7636215144090537, + "grad_norm": 1.2679606676101685, + "learning_rate": 0.00020669889303919598, + "loss": 2.0409, + "step": 15116 + }, + { + "epoch": 1.7637381869093454, + "grad_norm": 1.1747193336486816, + "learning_rate": 0.0002066848561488792, + "loss": 2.2255, + "step": 15117 + }, + { + "epoch": 1.763854859409637, + "grad_norm": 1.0103027820587158, + "learning_rate": 0.0002066708186864669, + "loss": 2.0136, + "step": 15118 + }, + { + "epoch": 1.7639715319099287, + "grad_norm": 1.0013729333877563, + "learning_rate": 0.00020665678065210456, + "loss": 1.9513, + "step": 15119 + }, + { + "epoch": 1.7640882044102204, + "grad_norm": 1.0678807497024536, + "learning_rate": 0.00020664274204593774, + "loss": 1.7829, + "step": 15120 + }, + { + "epoch": 1.764204876910512, + "grad_norm": 1.1312609910964966, + "learning_rate": 0.000206628702868112, + "loss": 2.0454, + "step": 15121 + }, + { + "epoch": 1.7643215494108038, + "grad_norm": 1.2268054485321045, + "learning_rate": 0.00020661466311877285, + "loss": 1.9582, + "step": 15122 + }, + { + "epoch": 1.7644382219110955, + "grad_norm": 1.1281334161758423, + "learning_rate": 0.00020660062279806584, + "loss": 2.2501, + "step": 15123 + }, + { + "epoch": 1.7645548944113871, + "grad_norm": 1.1972566843032837, + "learning_rate": 0.00020658658190613653, + "loss": 2.1963, + "step": 15124 + }, + { + "epoch": 1.7646715669116788, + "grad_norm": 1.1509443521499634, + "learning_rate": 0.00020657254044313051, + "loss": 2.0328, + "step": 15125 + }, + { + "epoch": 1.7647882394119705, + "grad_norm": 1.1300525665283203, + "learning_rate": 0.00020655849840919328, + "loss": 2.1159, + "step": 15126 + }, + { + "epoch": 1.7649049119122622, + "grad_norm": 1.2317192554473877, + "learning_rate": 0.00020654445580447052, + "loss": 2.0635, + "step": 15127 + }, + { + "epoch": 1.7650215844125539, + "grad_norm": 1.1639022827148438, + "learning_rate": 0.00020653041262910766, + "loss": 2.1432, + "step": 15128 + }, + { + "epoch": 1.7651382569128455, + "grad_norm": 1.0597078800201416, + "learning_rate": 0.00020651636888325037, + "loss": 2.0365, + "step": 15129 + }, + { + "epoch": 1.7652549294131372, + "grad_norm": 1.209232211112976, + "learning_rate": 0.00020650232456704428, + "loss": 2.067, + "step": 15130 + }, + { + "epoch": 1.765371601913429, + "grad_norm": 1.099298357963562, + "learning_rate": 0.00020648827968063497, + "loss": 1.9872, + "step": 15131 + }, + { + "epoch": 1.7654882744137206, + "grad_norm": 1.1244758367538452, + "learning_rate": 0.00020647423422416795, + "loss": 2.1973, + "step": 15132 + }, + { + "epoch": 1.7656049469140123, + "grad_norm": 1.1755601167678833, + "learning_rate": 0.00020646018819778887, + "loss": 2.1573, + "step": 15133 + }, + { + "epoch": 1.765721619414304, + "grad_norm": 1.101746916770935, + "learning_rate": 0.0002064461416016434, + "loss": 2.0068, + "step": 15134 + }, + { + "epoch": 1.7658382919145956, + "grad_norm": 1.063907504081726, + "learning_rate": 0.0002064320944358772, + "loss": 2.1012, + "step": 15135 + }, + { + "epoch": 1.7659549644148873, + "grad_norm": 0.9731101393699646, + "learning_rate": 0.0002064180467006357, + "loss": 1.8937, + "step": 15136 + }, + { + "epoch": 1.766071636915179, + "grad_norm": 1.114364743232727, + "learning_rate": 0.0002064039983960647, + "loss": 1.963, + "step": 15137 + }, + { + "epoch": 1.7661883094154707, + "grad_norm": 1.0932241678237915, + "learning_rate": 0.0002063899495223098, + "loss": 2.06, + "step": 15138 + }, + { + "epoch": 1.7663049819157624, + "grad_norm": 1.0946463346481323, + "learning_rate": 0.00020637590007951663, + "loss": 2.0955, + "step": 15139 + }, + { + "epoch": 1.766421654416054, + "grad_norm": 1.04124915599823, + "learning_rate": 0.0002063618500678308, + "loss": 2.0055, + "step": 15140 + }, + { + "epoch": 1.7665383269163457, + "grad_norm": 1.1294784545898438, + "learning_rate": 0.000206347799487398, + "loss": 1.9434, + "step": 15141 + }, + { + "epoch": 1.7666549994166374, + "grad_norm": 1.0280826091766357, + "learning_rate": 0.0002063337483383639, + "loss": 2.2139, + "step": 15142 + }, + { + "epoch": 1.766771671916929, + "grad_norm": 1.3789746761322021, + "learning_rate": 0.0002063196966208742, + "loss": 2.2958, + "step": 15143 + }, + { + "epoch": 1.7668883444172208, + "grad_norm": 1.4093635082244873, + "learning_rate": 0.00020630564433507455, + "loss": 1.9767, + "step": 15144 + }, + { + "epoch": 1.7670050169175124, + "grad_norm": 1.1343764066696167, + "learning_rate": 0.00020629159148111058, + "loss": 1.996, + "step": 15145 + }, + { + "epoch": 1.7671216894178041, + "grad_norm": 1.220569133758545, + "learning_rate": 0.000206277538059128, + "loss": 2.1231, + "step": 15146 + }, + { + "epoch": 1.7672383619180958, + "grad_norm": 1.3226962089538574, + "learning_rate": 0.00020626348406927253, + "loss": 2.0865, + "step": 15147 + }, + { + "epoch": 1.7673550344183875, + "grad_norm": 1.192833662033081, + "learning_rate": 0.00020624942951168982, + "loss": 1.9754, + "step": 15148 + }, + { + "epoch": 1.7674717069186792, + "grad_norm": 1.2768373489379883, + "learning_rate": 0.00020623537438652564, + "loss": 2.0483, + "step": 15149 + }, + { + "epoch": 1.7675883794189708, + "grad_norm": 1.1388264894485474, + "learning_rate": 0.00020622131869392558, + "loss": 1.8217, + "step": 15150 + }, + { + "epoch": 1.7677050519192625, + "grad_norm": 1.0543067455291748, + "learning_rate": 0.00020620726243403547, + "loss": 1.9526, + "step": 15151 + }, + { + "epoch": 1.7678217244195542, + "grad_norm": 1.2088563442230225, + "learning_rate": 0.000206193205607001, + "loss": 2.1184, + "step": 15152 + }, + { + "epoch": 1.767938396919846, + "grad_norm": 1.138213038444519, + "learning_rate": 0.0002061791482129679, + "loss": 1.9247, + "step": 15153 + }, + { + "epoch": 1.7680550694201376, + "grad_norm": 1.109326958656311, + "learning_rate": 0.00020616509025208187, + "loss": 1.9671, + "step": 15154 + }, + { + "epoch": 1.7681717419204293, + "grad_norm": 0.9716981649398804, + "learning_rate": 0.00020615103172448864, + "loss": 2.0054, + "step": 15155 + }, + { + "epoch": 1.768288414420721, + "grad_norm": 1.307568907737732, + "learning_rate": 0.00020613697263033393, + "loss": 2.0028, + "step": 15156 + }, + { + "epoch": 1.7684050869210126, + "grad_norm": 1.0553553104400635, + "learning_rate": 0.00020612291296976363, + "loss": 2.0967, + "step": 15157 + }, + { + "epoch": 1.7685217594213043, + "grad_norm": 1.0579677820205688, + "learning_rate": 0.00020610885274292332, + "loss": 2.0504, + "step": 15158 + }, + { + "epoch": 1.768638431921596, + "grad_norm": 1.0591384172439575, + "learning_rate": 0.0002060947919499589, + "loss": 1.9607, + "step": 15159 + }, + { + "epoch": 1.7687551044218877, + "grad_norm": 1.268674373626709, + "learning_rate": 0.00020608073059101608, + "loss": 2.0096, + "step": 15160 + }, + { + "epoch": 1.7688717769221793, + "grad_norm": 1.0939308404922485, + "learning_rate": 0.00020606666866624065, + "loss": 2.0795, + "step": 15161 + }, + { + "epoch": 1.768988449422471, + "grad_norm": 0.9853312969207764, + "learning_rate": 0.00020605260617577833, + "loss": 1.9211, + "step": 15162 + }, + { + "epoch": 1.7691051219227627, + "grad_norm": 1.3738526105880737, + "learning_rate": 0.00020603854311977495, + "loss": 2.237, + "step": 15163 + }, + { + "epoch": 1.7692217944230544, + "grad_norm": 1.258175253868103, + "learning_rate": 0.00020602447949837627, + "loss": 2.0237, + "step": 15164 + }, + { + "epoch": 1.769338466923346, + "grad_norm": 1.0159966945648193, + "learning_rate": 0.0002060104153117281, + "loss": 1.947, + "step": 15165 + }, + { + "epoch": 1.7694551394236377, + "grad_norm": 1.079939365386963, + "learning_rate": 0.0002059963505599763, + "loss": 2.0147, + "step": 15166 + }, + { + "epoch": 1.7695718119239294, + "grad_norm": 1.0281472206115723, + "learning_rate": 0.0002059822852432666, + "loss": 1.9713, + "step": 15167 + }, + { + "epoch": 1.769688484424221, + "grad_norm": 1.1468298435211182, + "learning_rate": 0.00020596821936174486, + "loss": 2.0966, + "step": 15168 + }, + { + "epoch": 1.7698051569245128, + "grad_norm": 1.1851729154586792, + "learning_rate": 0.00020595415291555684, + "loss": 1.827, + "step": 15169 + }, + { + "epoch": 1.7699218294248045, + "grad_norm": 1.1987919807434082, + "learning_rate": 0.00020594008590484844, + "loss": 2.05, + "step": 15170 + }, + { + "epoch": 1.7700385019250962, + "grad_norm": 1.460571527481079, + "learning_rate": 0.0002059260183297655, + "loss": 2.1694, + "step": 15171 + }, + { + "epoch": 1.7701551744253878, + "grad_norm": 1.1322323083877563, + "learning_rate": 0.00020591195019045377, + "loss": 1.9783, + "step": 15172 + }, + { + "epoch": 1.7702718469256795, + "grad_norm": 1.2704987525939941, + "learning_rate": 0.00020589788148705912, + "loss": 2.2077, + "step": 15173 + }, + { + "epoch": 1.7703885194259712, + "grad_norm": 1.233902931213379, + "learning_rate": 0.00020588381221972745, + "loss": 2.0617, + "step": 15174 + }, + { + "epoch": 1.7705051919262629, + "grad_norm": 1.1366686820983887, + "learning_rate": 0.0002058697423886046, + "loss": 2.1826, + "step": 15175 + }, + { + "epoch": 1.7706218644265546, + "grad_norm": 1.0001046657562256, + "learning_rate": 0.00020585567199383638, + "loss": 1.9955, + "step": 15176 + }, + { + "epoch": 1.7707385369268462, + "grad_norm": 1.0508774518966675, + "learning_rate": 0.00020584160103556868, + "loss": 2.008, + "step": 15177 + }, + { + "epoch": 1.770855209427138, + "grad_norm": 1.2019416093826294, + "learning_rate": 0.0002058275295139474, + "loss": 2.1507, + "step": 15178 + }, + { + "epoch": 1.7709718819274296, + "grad_norm": 1.273311972618103, + "learning_rate": 0.0002058134574291184, + "loss": 2.1402, + "step": 15179 + }, + { + "epoch": 1.7710885544277213, + "grad_norm": 1.2067089080810547, + "learning_rate": 0.00020579938478122758, + "loss": 2.1043, + "step": 15180 + }, + { + "epoch": 1.771205226928013, + "grad_norm": 1.0669668912887573, + "learning_rate": 0.00020578531157042077, + "loss": 2.0239, + "step": 15181 + }, + { + "epoch": 1.7713218994283046, + "grad_norm": 1.0573642253875732, + "learning_rate": 0.00020577123779684392, + "loss": 2.1601, + "step": 15182 + }, + { + "epoch": 1.7714385719285963, + "grad_norm": 1.0583860874176025, + "learning_rate": 0.0002057571634606429, + "loss": 2.0281, + "step": 15183 + }, + { + "epoch": 1.771555244428888, + "grad_norm": 1.0196335315704346, + "learning_rate": 0.00020574308856196368, + "loss": 2.0268, + "step": 15184 + }, + { + "epoch": 1.7716719169291797, + "grad_norm": 1.1236441135406494, + "learning_rate": 0.00020572901310095213, + "loss": 2.0509, + "step": 15185 + }, + { + "epoch": 1.7717885894294714, + "grad_norm": 1.0712664127349854, + "learning_rate": 0.00020571493707775414, + "loss": 2.0608, + "step": 15186 + }, + { + "epoch": 1.771905261929763, + "grad_norm": 1.1012439727783203, + "learning_rate": 0.00020570086049251571, + "loss": 2.213, + "step": 15187 + }, + { + "epoch": 1.7720219344300547, + "grad_norm": 1.0597823858261108, + "learning_rate": 0.00020568678334538268, + "loss": 2.0356, + "step": 15188 + }, + { + "epoch": 1.7721386069303464, + "grad_norm": 1.0433858633041382, + "learning_rate": 0.00020567270563650106, + "loss": 1.9966, + "step": 15189 + }, + { + "epoch": 1.772255279430638, + "grad_norm": 1.1305748224258423, + "learning_rate": 0.00020565862736601675, + "loss": 2.1942, + "step": 15190 + }, + { + "epoch": 1.7723719519309298, + "grad_norm": 0.9825367331504822, + "learning_rate": 0.00020564454853407566, + "loss": 1.9867, + "step": 15191 + }, + { + "epoch": 1.7724886244312215, + "grad_norm": 1.1743031740188599, + "learning_rate": 0.0002056304691408238, + "loss": 1.9412, + "step": 15192 + }, + { + "epoch": 1.7726052969315131, + "grad_norm": 1.0264071226119995, + "learning_rate": 0.0002056163891864072, + "loss": 1.9259, + "step": 15193 + }, + { + "epoch": 1.7727219694318048, + "grad_norm": 1.0913074016571045, + "learning_rate": 0.0002056023086709717, + "loss": 2.0651, + "step": 15194 + }, + { + "epoch": 1.7728386419320965, + "grad_norm": 1.1986286640167236, + "learning_rate": 0.0002055882275946633, + "loss": 2.142, + "step": 15195 + }, + { + "epoch": 1.7729553144323882, + "grad_norm": 1.1280109882354736, + "learning_rate": 0.00020557414595762804, + "loss": 2.0612, + "step": 15196 + }, + { + "epoch": 1.7730719869326799, + "grad_norm": 1.009284496307373, + "learning_rate": 0.0002055600637600118, + "loss": 1.838, + "step": 15197 + }, + { + "epoch": 1.7731886594329715, + "grad_norm": 1.208463191986084, + "learning_rate": 0.0002055459810019607, + "loss": 2.2204, + "step": 15198 + }, + { + "epoch": 1.7733053319332632, + "grad_norm": 1.1455730199813843, + "learning_rate": 0.00020553189768362062, + "loss": 1.9288, + "step": 15199 + }, + { + "epoch": 1.773422004433555, + "grad_norm": 1.0786281824111938, + "learning_rate": 0.0002055178138051376, + "loss": 1.912, + "step": 15200 + }, + { + "epoch": 1.7735386769338466, + "grad_norm": 1.0884813070297241, + "learning_rate": 0.00020550372936665765, + "loss": 2.0008, + "step": 15201 + }, + { + "epoch": 1.7736553494341383, + "grad_norm": 1.3546079397201538, + "learning_rate": 0.0002054896443683268, + "loss": 2.0224, + "step": 15202 + }, + { + "epoch": 1.77377202193443, + "grad_norm": 1.1029778718948364, + "learning_rate": 0.00020547555881029103, + "loss": 2.0471, + "step": 15203 + }, + { + "epoch": 1.7738886944347216, + "grad_norm": 1.1042060852050781, + "learning_rate": 0.00020546147269269642, + "loss": 2.0287, + "step": 15204 + }, + { + "epoch": 1.7740053669350133, + "grad_norm": 1.2824994325637817, + "learning_rate": 0.00020544738601568896, + "loss": 1.9626, + "step": 15205 + }, + { + "epoch": 1.774122039435305, + "grad_norm": 1.3363888263702393, + "learning_rate": 0.00020543329877941463, + "loss": 2.1475, + "step": 15206 + }, + { + "epoch": 1.7742387119355967, + "grad_norm": 1.2218959331512451, + "learning_rate": 0.00020541921098401957, + "loss": 2.112, + "step": 15207 + }, + { + "epoch": 1.7743553844358884, + "grad_norm": 1.156833529472351, + "learning_rate": 0.00020540512262964974, + "loss": 2.1311, + "step": 15208 + }, + { + "epoch": 1.77447205693618, + "grad_norm": 1.2141531705856323, + "learning_rate": 0.00020539103371645126, + "loss": 2.0243, + "step": 15209 + }, + { + "epoch": 1.7745887294364717, + "grad_norm": 1.1380603313446045, + "learning_rate": 0.00020537694424457015, + "loss": 1.9011, + "step": 15210 + }, + { + "epoch": 1.7747054019367634, + "grad_norm": 1.0260483026504517, + "learning_rate": 0.00020536285421415254, + "loss": 2.1048, + "step": 15211 + }, + { + "epoch": 1.774822074437055, + "grad_norm": 1.166671872138977, + "learning_rate": 0.00020534876362534444, + "loss": 2.0652, + "step": 15212 + }, + { + "epoch": 1.7749387469373468, + "grad_norm": 1.1566805839538574, + "learning_rate": 0.0002053346724782919, + "loss": 2.1991, + "step": 15213 + }, + { + "epoch": 1.7750554194376384, + "grad_norm": 1.0773396492004395, + "learning_rate": 0.00020532058077314105, + "loss": 1.9476, + "step": 15214 + }, + { + "epoch": 1.7751720919379301, + "grad_norm": 1.2887510061264038, + "learning_rate": 0.00020530648851003794, + "loss": 2.1017, + "step": 15215 + }, + { + "epoch": 1.7752887644382218, + "grad_norm": 1.0982623100280762, + "learning_rate": 0.0002052923956891287, + "loss": 2.2342, + "step": 15216 + }, + { + "epoch": 1.7754054369385135, + "grad_norm": 1.1442402601242065, + "learning_rate": 0.00020527830231055946, + "loss": 2.0605, + "step": 15217 + }, + { + "epoch": 1.7755221094388052, + "grad_norm": 1.023774266242981, + "learning_rate": 0.00020526420837447618, + "loss": 2.0535, + "step": 15218 + }, + { + "epoch": 1.7756387819390969, + "grad_norm": 1.000547170639038, + "learning_rate": 0.00020525011388102516, + "loss": 2.1603, + "step": 15219 + }, + { + "epoch": 1.7757554544393885, + "grad_norm": 1.0670089721679688, + "learning_rate": 0.00020523601883035242, + "loss": 2.0298, + "step": 15220 + }, + { + "epoch": 1.7758721269396802, + "grad_norm": 0.9636197090148926, + "learning_rate": 0.000205221923222604, + "loss": 1.9569, + "step": 15221 + }, + { + "epoch": 1.775988799439972, + "grad_norm": 1.0654025077819824, + "learning_rate": 0.00020520782705792622, + "loss": 1.9576, + "step": 15222 + }, + { + "epoch": 1.7761054719402636, + "grad_norm": 1.0623234510421753, + "learning_rate": 0.00020519373033646502, + "loss": 1.909, + "step": 15223 + }, + { + "epoch": 1.7762221444405553, + "grad_norm": 1.115945816040039, + "learning_rate": 0.00020517963305836668, + "loss": 2.0237, + "step": 15224 + }, + { + "epoch": 1.7762221444405553, + "eval_train_loss": 1.9815627336502075, + "eval_train_mean_batch_perplexity": 8.295781583044203, + "eval_train_runtime": 11043.6816, + "eval_train_samples_per_second": 12.418, + "eval_train_steps_per_second": 0.776, + "step": 15224 + }, + { + "epoch": 1.7762221444405553, + "eval_test_loss": 2.084994077682495, + "eval_test_mean_batch_perplexity": 9.28730432862353, + "eval_test_runtime": 2383.1152, + "eval_test_samples_per_second": 12.331, + "eval_test_steps_per_second": 0.771, + "step": 15224 + }, + { + "epoch": 1.776338816940847, + "grad_norm": 1.155945062637329, + "learning_rate": 0.0002051655352237773, + "loss": 1.9954, + "step": 15225 + }, + { + "epoch": 1.7764554894411386, + "grad_norm": 1.1410330533981323, + "learning_rate": 0.000205151436832843, + "loss": 2.0873, + "step": 15226 + }, + { + "epoch": 1.7765721619414303, + "grad_norm": 0.971274197101593, + "learning_rate": 0.00020513733788570996, + "loss": 1.9878, + "step": 15227 + }, + { + "epoch": 1.776688834441722, + "grad_norm": 0.9875081181526184, + "learning_rate": 0.00020512323838252437, + "loss": 2.0274, + "step": 15228 + }, + { + "epoch": 1.7768055069420137, + "grad_norm": 1.2879053354263306, + "learning_rate": 0.0002051091383234323, + "loss": 2.2346, + "step": 15229 + }, + { + "epoch": 1.7769221794423053, + "grad_norm": 1.2291532754898071, + "learning_rate": 0.00020509503770858013, + "loss": 1.9173, + "step": 15230 + }, + { + "epoch": 1.777038851942597, + "grad_norm": 1.2359106540679932, + "learning_rate": 0.0002050809365381138, + "loss": 2.1232, + "step": 15231 + }, + { + "epoch": 1.7771555244428887, + "grad_norm": 1.0983909368515015, + "learning_rate": 0.00020506683481217965, + "loss": 2.0997, + "step": 15232 + }, + { + "epoch": 1.7772721969431804, + "grad_norm": 1.0275384187698364, + "learning_rate": 0.00020505273253092375, + "loss": 2.0592, + "step": 15233 + }, + { + "epoch": 1.777388869443472, + "grad_norm": 1.180981159210205, + "learning_rate": 0.00020503862969449237, + "loss": 2.2953, + "step": 15234 + }, + { + "epoch": 1.7775055419437638, + "grad_norm": 1.2322300672531128, + "learning_rate": 0.00020502452630303172, + "loss": 2.0608, + "step": 15235 + }, + { + "epoch": 1.7776222144440554, + "grad_norm": 1.0972548723220825, + "learning_rate": 0.00020501042235668805, + "loss": 2.1583, + "step": 15236 + }, + { + "epoch": 1.7777388869443471, + "grad_norm": 1.037230134010315, + "learning_rate": 0.0002049963178556075, + "loss": 2.0308, + "step": 15237 + }, + { + "epoch": 1.7778555594446388, + "grad_norm": 1.3063687086105347, + "learning_rate": 0.00020498221279993635, + "loss": 2.1221, + "step": 15238 + }, + { + "epoch": 1.7779722319449305, + "grad_norm": 1.2923628091812134, + "learning_rate": 0.0002049681071898207, + "loss": 1.943, + "step": 15239 + }, + { + "epoch": 1.7780889044452222, + "grad_norm": 1.1383212804794312, + "learning_rate": 0.00020495400102540692, + "loss": 2.0889, + "step": 15240 + }, + { + "epoch": 1.7782055769455138, + "grad_norm": 1.1137925386428833, + "learning_rate": 0.0002049398943068412, + "loss": 2.1631, + "step": 15241 + }, + { + "epoch": 1.7783222494458055, + "grad_norm": 0.9937662482261658, + "learning_rate": 0.00020492578703426974, + "loss": 2.1012, + "step": 15242 + }, + { + "epoch": 1.7784389219460972, + "grad_norm": 1.1506038904190063, + "learning_rate": 0.00020491167920783885, + "loss": 2.042, + "step": 15243 + }, + { + "epoch": 1.7785555944463889, + "grad_norm": 1.0986144542694092, + "learning_rate": 0.0002048975708276948, + "loss": 2.0427, + "step": 15244 + }, + { + "epoch": 1.7786722669466806, + "grad_norm": 1.0264009237289429, + "learning_rate": 0.00020488346189398375, + "loss": 2.0394, + "step": 15245 + }, + { + "epoch": 1.7787889394469723, + "grad_norm": 1.0378837585449219, + "learning_rate": 0.00020486935240685207, + "loss": 2.0517, + "step": 15246 + }, + { + "epoch": 1.778905611947264, + "grad_norm": 1.1487140655517578, + "learning_rate": 0.00020485524236644594, + "loss": 2.1222, + "step": 15247 + }, + { + "epoch": 1.7790222844475556, + "grad_norm": 1.0392082929611206, + "learning_rate": 0.00020484113177291177, + "loss": 2.105, + "step": 15248 + }, + { + "epoch": 1.7791389569478473, + "grad_norm": 1.0514343976974487, + "learning_rate": 0.00020482702062639565, + "loss": 1.7886, + "step": 15249 + }, + { + "epoch": 1.779255629448139, + "grad_norm": 1.1525421142578125, + "learning_rate": 0.00020481290892704405, + "loss": 2.1588, + "step": 15250 + }, + { + "epoch": 1.7793723019484307, + "grad_norm": 1.220241904258728, + "learning_rate": 0.0002047987966750032, + "loss": 2.0802, + "step": 15251 + }, + { + "epoch": 1.7794889744487223, + "grad_norm": 1.2587319612503052, + "learning_rate": 0.00020478468387041937, + "loss": 2.1412, + "step": 15252 + }, + { + "epoch": 1.779605646949014, + "grad_norm": 1.3582806587219238, + "learning_rate": 0.00020477057051343888, + "loss": 2.0051, + "step": 15253 + }, + { + "epoch": 1.7797223194493057, + "grad_norm": 1.3553117513656616, + "learning_rate": 0.00020475645660420806, + "loss": 2.0976, + "step": 15254 + }, + { + "epoch": 1.7798389919495974, + "grad_norm": 1.182678461074829, + "learning_rate": 0.00020474234214287322, + "loss": 1.9545, + "step": 15255 + }, + { + "epoch": 1.779955664449889, + "grad_norm": 1.0874364376068115, + "learning_rate": 0.0002047282271295807, + "loss": 2.0002, + "step": 15256 + }, + { + "epoch": 1.7800723369501807, + "grad_norm": 1.1006437540054321, + "learning_rate": 0.0002047141115644768, + "loss": 2.0095, + "step": 15257 + }, + { + "epoch": 1.7801890094504724, + "grad_norm": 1.065765619277954, + "learning_rate": 0.00020469999544770788, + "loss": 2.1349, + "step": 15258 + }, + { + "epoch": 1.780305681950764, + "grad_norm": 1.0973049402236938, + "learning_rate": 0.00020468587877942021, + "loss": 1.8575, + "step": 15259 + }, + { + "epoch": 1.7804223544510558, + "grad_norm": 1.0628578662872314, + "learning_rate": 0.0002046717615597602, + "loss": 1.8169, + "step": 15260 + }, + { + "epoch": 1.7805390269513475, + "grad_norm": 1.1237045526504517, + "learning_rate": 0.00020465764378887422, + "loss": 2.0457, + "step": 15261 + }, + { + "epoch": 1.7806556994516392, + "grad_norm": 1.1429109573364258, + "learning_rate": 0.0002046435254669086, + "loss": 1.9286, + "step": 15262 + }, + { + "epoch": 1.7807723719519308, + "grad_norm": 1.1252570152282715, + "learning_rate": 0.0002046294065940097, + "loss": 2.0236, + "step": 15263 + }, + { + "epoch": 1.7808890444522225, + "grad_norm": 1.0010071992874146, + "learning_rate": 0.00020461528717032388, + "loss": 1.929, + "step": 15264 + }, + { + "epoch": 1.7810057169525142, + "grad_norm": 1.2756083011627197, + "learning_rate": 0.00020460116719599755, + "loss": 2.0496, + "step": 15265 + }, + { + "epoch": 1.7811223894528059, + "grad_norm": 0.9876391291618347, + "learning_rate": 0.00020458704667117704, + "loss": 1.847, + "step": 15266 + }, + { + "epoch": 1.7812390619530976, + "grad_norm": 1.2144414186477661, + "learning_rate": 0.00020457292559600878, + "loss": 2.0735, + "step": 15267 + }, + { + "epoch": 1.7813557344533892, + "grad_norm": 1.2105820178985596, + "learning_rate": 0.00020455880397063912, + "loss": 2.0763, + "step": 15268 + }, + { + "epoch": 1.781472406953681, + "grad_norm": 1.2403818368911743, + "learning_rate": 0.00020454468179521452, + "loss": 2.1773, + "step": 15269 + }, + { + "epoch": 1.7815890794539726, + "grad_norm": 1.0341418981552124, + "learning_rate": 0.00020453055906988134, + "loss": 1.9131, + "step": 15270 + }, + { + "epoch": 1.7817057519542643, + "grad_norm": 1.0894083976745605, + "learning_rate": 0.00020451643579478596, + "loss": 2.1966, + "step": 15271 + }, + { + "epoch": 1.781822424454556, + "grad_norm": 1.0385394096374512, + "learning_rate": 0.00020450231197007487, + "loss": 2.0166, + "step": 15272 + }, + { + "epoch": 1.7819390969548476, + "grad_norm": 1.3345928192138672, + "learning_rate": 0.00020448818759589437, + "loss": 2.0505, + "step": 15273 + }, + { + "epoch": 1.7820557694551393, + "grad_norm": 1.2521930932998657, + "learning_rate": 0.00020447406267239105, + "loss": 2.1314, + "step": 15274 + }, + { + "epoch": 1.782172441955431, + "grad_norm": 1.0637239217758179, + "learning_rate": 0.00020445993719971122, + "loss": 2.0852, + "step": 15275 + }, + { + "epoch": 1.7822891144557227, + "grad_norm": 1.1784403324127197, + "learning_rate": 0.00020444581117800136, + "loss": 2.0792, + "step": 15276 + }, + { + "epoch": 1.7824057869560144, + "grad_norm": 1.292323350906372, + "learning_rate": 0.0002044316846074079, + "loss": 2.0823, + "step": 15277 + }, + { + "epoch": 1.782522459456306, + "grad_norm": 1.1100324392318726, + "learning_rate": 0.0002044175574880773, + "loss": 2.007, + "step": 15278 + }, + { + "epoch": 1.7826391319565977, + "grad_norm": 1.1038776636123657, + "learning_rate": 0.000204403429820156, + "loss": 2.005, + "step": 15279 + }, + { + "epoch": 1.7827558044568894, + "grad_norm": 1.1476917266845703, + "learning_rate": 0.0002043893016037905, + "loss": 2.0943, + "step": 15280 + }, + { + "epoch": 1.782872476957181, + "grad_norm": 1.184214472770691, + "learning_rate": 0.0002043751728391272, + "loss": 2.0843, + "step": 15281 + }, + { + "epoch": 1.7829891494574728, + "grad_norm": 1.063123106956482, + "learning_rate": 0.00020436104352631264, + "loss": 2.0033, + "step": 15282 + }, + { + "epoch": 1.7831058219577645, + "grad_norm": 1.3124289512634277, + "learning_rate": 0.00020434691366549325, + "loss": 2.2466, + "step": 15283 + }, + { + "epoch": 1.7832224944580561, + "grad_norm": 0.975212574005127, + "learning_rate": 0.00020433278325681552, + "loss": 2.0353, + "step": 15284 + }, + { + "epoch": 1.7833391669583478, + "grad_norm": 0.9602452516555786, + "learning_rate": 0.00020431865230042596, + "loss": 2.0154, + "step": 15285 + }, + { + "epoch": 1.7834558394586395, + "grad_norm": 1.0147778987884521, + "learning_rate": 0.00020430452079647096, + "loss": 2.1231, + "step": 15286 + }, + { + "epoch": 1.7835725119589312, + "grad_norm": 1.1345268487930298, + "learning_rate": 0.0002042903887450972, + "loss": 2.0476, + "step": 15287 + }, + { + "epoch": 1.7836891844592229, + "grad_norm": 1.1186293363571167, + "learning_rate": 0.0002042762561464511, + "loss": 2.1756, + "step": 15288 + }, + { + "epoch": 1.7838058569595145, + "grad_norm": 1.0375081300735474, + "learning_rate": 0.00020426212300067912, + "loss": 2.0932, + "step": 15289 + }, + { + "epoch": 1.7839225294598062, + "grad_norm": 1.2975640296936035, + "learning_rate": 0.00020424798930792785, + "loss": 2.2426, + "step": 15290 + }, + { + "epoch": 1.784039201960098, + "grad_norm": 1.069836139678955, + "learning_rate": 0.00020423385506834377, + "loss": 2.15, + "step": 15291 + }, + { + "epoch": 1.7841558744603896, + "grad_norm": 1.098077654838562, + "learning_rate": 0.00020421972028207346, + "loss": 1.84, + "step": 15292 + }, + { + "epoch": 1.7842725469606813, + "grad_norm": 1.078798770904541, + "learning_rate": 0.00020420558494926337, + "loss": 1.9078, + "step": 15293 + }, + { + "epoch": 1.784389219460973, + "grad_norm": 1.324875831604004, + "learning_rate": 0.00020419144907006008, + "loss": 2.3172, + "step": 15294 + }, + { + "epoch": 1.7845058919612646, + "grad_norm": 0.9772500991821289, + "learning_rate": 0.00020417731264461015, + "loss": 2.1376, + "step": 15295 + }, + { + "epoch": 1.7846225644615563, + "grad_norm": 0.9470217227935791, + "learning_rate": 0.00020416317567306015, + "loss": 1.9855, + "step": 15296 + }, + { + "epoch": 1.784739236961848, + "grad_norm": 1.0963764190673828, + "learning_rate": 0.00020414903815555658, + "loss": 1.9763, + "step": 15297 + }, + { + "epoch": 1.7848559094621397, + "grad_norm": 1.0805913209915161, + "learning_rate": 0.00020413490009224604, + "loss": 2.1072, + "step": 15298 + }, + { + "epoch": 1.7849725819624314, + "grad_norm": 1.1260141134262085, + "learning_rate": 0.0002041207614832751, + "loss": 2.0404, + "step": 15299 + }, + { + "epoch": 1.785089254462723, + "grad_norm": 1.0958918333053589, + "learning_rate": 0.00020410662232879032, + "loss": 2.0479, + "step": 15300 + }, + { + "epoch": 1.7852059269630147, + "grad_norm": 1.280928373336792, + "learning_rate": 0.00020409248262893824, + "loss": 2.0973, + "step": 15301 + }, + { + "epoch": 1.7853225994633064, + "grad_norm": 1.3146661520004272, + "learning_rate": 0.00020407834238386554, + "loss": 2.0054, + "step": 15302 + }, + { + "epoch": 1.785439271963598, + "grad_norm": 1.0132184028625488, + "learning_rate": 0.00020406420159371875, + "loss": 2.0177, + "step": 15303 + }, + { + "epoch": 1.7855559444638898, + "grad_norm": 1.1343307495117188, + "learning_rate": 0.00020405006025864443, + "loss": 2.0942, + "step": 15304 + }, + { + "epoch": 1.7856726169641814, + "grad_norm": 1.1539859771728516, + "learning_rate": 0.00020403591837878923, + "loss": 2.1201, + "step": 15305 + }, + { + "epoch": 1.7857892894644731, + "grad_norm": 1.276559829711914, + "learning_rate": 0.0002040217759542998, + "loss": 2.0165, + "step": 15306 + }, + { + "epoch": 1.7859059619647648, + "grad_norm": 1.0625369548797607, + "learning_rate": 0.00020400763298532266, + "loss": 2.1027, + "step": 15307 + }, + { + "epoch": 1.7860226344650565, + "grad_norm": 1.1456499099731445, + "learning_rate": 0.0002039934894720045, + "loss": 2.047, + "step": 15308 + }, + { + "epoch": 1.7861393069653482, + "grad_norm": 1.2697421312332153, + "learning_rate": 0.0002039793454144919, + "loss": 2.1432, + "step": 15309 + }, + { + "epoch": 1.7862559794656399, + "grad_norm": 1.2185335159301758, + "learning_rate": 0.0002039652008129315, + "loss": 1.9688, + "step": 15310 + }, + { + "epoch": 1.7863726519659315, + "grad_norm": 1.3401941061019897, + "learning_rate": 0.00020395105566746997, + "loss": 2.0525, + "step": 15311 + }, + { + "epoch": 1.7864893244662232, + "grad_norm": 1.0987128019332886, + "learning_rate": 0.00020393690997825385, + "loss": 1.9221, + "step": 15312 + }, + { + "epoch": 1.786605996966515, + "grad_norm": 1.2187610864639282, + "learning_rate": 0.00020392276374542995, + "loss": 2.0245, + "step": 15313 + }, + { + "epoch": 1.7867226694668066, + "grad_norm": 1.2682867050170898, + "learning_rate": 0.0002039086169691448, + "loss": 2.1015, + "step": 15314 + }, + { + "epoch": 1.7868393419670983, + "grad_norm": 1.1560871601104736, + "learning_rate": 0.0002038944696495451, + "loss": 2.1966, + "step": 15315 + }, + { + "epoch": 1.78695601446739, + "grad_norm": 1.0835723876953125, + "learning_rate": 0.0002038803217867775, + "loss": 2.2489, + "step": 15316 + }, + { + "epoch": 1.7870726869676816, + "grad_norm": 1.1466830968856812, + "learning_rate": 0.0002038661733809887, + "loss": 2.0614, + "step": 15317 + }, + { + "epoch": 1.7871893594679733, + "grad_norm": 1.1570686101913452, + "learning_rate": 0.0002038520244323253, + "loss": 2.1655, + "step": 15318 + }, + { + "epoch": 1.787306031968265, + "grad_norm": 1.2595230340957642, + "learning_rate": 0.00020383787494093406, + "loss": 2.0875, + "step": 15319 + }, + { + "epoch": 1.7874227044685567, + "grad_norm": 1.067148208618164, + "learning_rate": 0.00020382372490696164, + "loss": 1.9618, + "step": 15320 + }, + { + "epoch": 1.7875393769688483, + "grad_norm": 1.050590991973877, + "learning_rate": 0.00020380957433055469, + "loss": 1.9004, + "step": 15321 + }, + { + "epoch": 1.78765604946914, + "grad_norm": 1.1247503757476807, + "learning_rate": 0.00020379542321185995, + "loss": 2.0854, + "step": 15322 + }, + { + "epoch": 1.7877727219694317, + "grad_norm": 1.189670205116272, + "learning_rate": 0.00020378127155102412, + "loss": 2.0891, + "step": 15323 + }, + { + "epoch": 1.7878893944697234, + "grad_norm": 0.9997778534889221, + "learning_rate": 0.00020376711934819394, + "loss": 1.8651, + "step": 15324 + }, + { + "epoch": 1.788006066970015, + "grad_norm": 1.10139000415802, + "learning_rate": 0.0002037529666035161, + "loss": 2.1022, + "step": 15325 + }, + { + "epoch": 1.7881227394703068, + "grad_norm": 1.210526704788208, + "learning_rate": 0.0002037388133171373, + "loss": 2.1505, + "step": 15326 + }, + { + "epoch": 1.7882394119705984, + "grad_norm": 1.2131768465042114, + "learning_rate": 0.00020372465948920423, + "loss": 2.2408, + "step": 15327 + }, + { + "epoch": 1.7883560844708901, + "grad_norm": 1.1935359239578247, + "learning_rate": 0.00020371050511986373, + "loss": 2.0765, + "step": 15328 + }, + { + "epoch": 1.7884727569711818, + "grad_norm": 1.2000731229782104, + "learning_rate": 0.00020369635020926245, + "loss": 1.9129, + "step": 15329 + }, + { + "epoch": 1.7885894294714735, + "grad_norm": 1.0552128553390503, + "learning_rate": 0.00020368219475754715, + "loss": 2.0214, + "step": 15330 + }, + { + "epoch": 1.7887061019717652, + "grad_norm": 1.2716572284698486, + "learning_rate": 0.0002036680387648646, + "loss": 2.131, + "step": 15331 + }, + { + "epoch": 1.7888227744720568, + "grad_norm": 1.3598624467849731, + "learning_rate": 0.00020365388223136154, + "loss": 2.082, + "step": 15332 + }, + { + "epoch": 1.7889394469723485, + "grad_norm": 1.0839568376541138, + "learning_rate": 0.00020363972515718477, + "loss": 2.0978, + "step": 15333 + }, + { + "epoch": 1.7890561194726402, + "grad_norm": 0.9330839514732361, + "learning_rate": 0.00020362556754248095, + "loss": 1.8681, + "step": 15334 + }, + { + "epoch": 1.7891727919729319, + "grad_norm": 1.4266501665115356, + "learning_rate": 0.00020361140938739697, + "loss": 2.0816, + "step": 15335 + }, + { + "epoch": 1.7892894644732236, + "grad_norm": 1.1858415603637695, + "learning_rate": 0.00020359725069207948, + "loss": 1.9988, + "step": 15336 + }, + { + "epoch": 1.7894061369735152, + "grad_norm": 1.0829752683639526, + "learning_rate": 0.0002035830914566754, + "loss": 2.0377, + "step": 15337 + }, + { + "epoch": 1.789522809473807, + "grad_norm": 1.0161718130111694, + "learning_rate": 0.00020356893168133142, + "loss": 2.0729, + "step": 15338 + }, + { + "epoch": 1.7896394819740986, + "grad_norm": 1.0262361764907837, + "learning_rate": 0.00020355477136619433, + "loss": 2.1216, + "step": 15339 + }, + { + "epoch": 1.7897561544743903, + "grad_norm": 1.481928825378418, + "learning_rate": 0.00020354061051141098, + "loss": 2.2573, + "step": 15340 + }, + { + "epoch": 1.789872826974682, + "grad_norm": 1.0387359857559204, + "learning_rate": 0.0002035264491171282, + "loss": 1.9482, + "step": 15341 + }, + { + "epoch": 1.7899894994749737, + "grad_norm": 1.1740455627441406, + "learning_rate": 0.0002035122871834927, + "loss": 2.0798, + "step": 15342 + }, + { + "epoch": 1.7901061719752653, + "grad_norm": 0.9951997399330139, + "learning_rate": 0.00020349812471065137, + "loss": 2.0955, + "step": 15343 + }, + { + "epoch": 1.790222844475557, + "grad_norm": 1.0657148361206055, + "learning_rate": 0.000203483961698751, + "loss": 2.056, + "step": 15344 + }, + { + "epoch": 1.7903395169758487, + "grad_norm": 1.0553386211395264, + "learning_rate": 0.00020346979814793846, + "loss": 1.8408, + "step": 15345 + }, + { + "epoch": 1.7904561894761404, + "grad_norm": 0.9633587598800659, + "learning_rate": 0.00020345563405836054, + "loss": 1.8754, + "step": 15346 + }, + { + "epoch": 1.790572861976432, + "grad_norm": 1.0633842945098877, + "learning_rate": 0.00020344146943016407, + "loss": 1.9874, + "step": 15347 + }, + { + "epoch": 1.7906895344767237, + "grad_norm": 1.0545852184295654, + "learning_rate": 0.0002034273042634959, + "loss": 2.0709, + "step": 15348 + }, + { + "epoch": 1.7908062069770154, + "grad_norm": 1.0505363941192627, + "learning_rate": 0.0002034131385585029, + "loss": 2.0777, + "step": 15349 + }, + { + "epoch": 1.790922879477307, + "grad_norm": 1.1529096364974976, + "learning_rate": 0.00020339897231533193, + "loss": 1.955, + "step": 15350 + }, + { + "epoch": 1.7910395519775988, + "grad_norm": 1.0894966125488281, + "learning_rate": 0.00020338480553412985, + "loss": 2.1365, + "step": 15351 + }, + { + "epoch": 1.7911562244778905, + "grad_norm": 1.1175678968429565, + "learning_rate": 0.00020337063821504345, + "loss": 1.9766, + "step": 15352 + }, + { + "epoch": 1.7912728969781821, + "grad_norm": 1.076361894607544, + "learning_rate": 0.00020335647035821968, + "loss": 1.9978, + "step": 15353 + }, + { + "epoch": 1.7913895694784738, + "grad_norm": 1.1384791135787964, + "learning_rate": 0.0002033423019638054, + "loss": 1.9301, + "step": 15354 + }, + { + "epoch": 1.7915062419787655, + "grad_norm": 0.9573400020599365, + "learning_rate": 0.00020332813303194748, + "loss": 2.0214, + "step": 15355 + }, + { + "epoch": 1.7916229144790572, + "grad_norm": 1.1930131912231445, + "learning_rate": 0.00020331396356279284, + "loss": 2.0609, + "step": 15356 + }, + { + "epoch": 1.7917395869793489, + "grad_norm": 1.3567038774490356, + "learning_rate": 0.00020329979355648836, + "loss": 2.2559, + "step": 15357 + }, + { + "epoch": 1.7918562594796406, + "grad_norm": 1.1164851188659668, + "learning_rate": 0.0002032856230131809, + "loss": 1.9378, + "step": 15358 + }, + { + "epoch": 1.7919729319799322, + "grad_norm": 1.2175579071044922, + "learning_rate": 0.00020327145193301747, + "loss": 2.1957, + "step": 15359 + }, + { + "epoch": 1.792089604480224, + "grad_norm": 1.177046775817871, + "learning_rate": 0.00020325728031614482, + "loss": 2.0424, + "step": 15360 + }, + { + "epoch": 1.7922062769805156, + "grad_norm": 0.9426882266998291, + "learning_rate": 0.00020324310816270997, + "loss": 2.0005, + "step": 15361 + }, + { + "epoch": 1.7923229494808073, + "grad_norm": 1.2441445589065552, + "learning_rate": 0.0002032289354728598, + "loss": 1.9673, + "step": 15362 + }, + { + "epoch": 1.792439621981099, + "grad_norm": 1.0151925086975098, + "learning_rate": 0.00020321476224674127, + "loss": 2.1327, + "step": 15363 + }, + { + "epoch": 1.7925562944813906, + "grad_norm": 1.0197227001190186, + "learning_rate": 0.00020320058848450132, + "loss": 1.8408, + "step": 15364 + }, + { + "epoch": 1.7926729669816823, + "grad_norm": 1.1750181913375854, + "learning_rate": 0.00020318641418628683, + "loss": 2.1111, + "step": 15365 + }, + { + "epoch": 1.792789639481974, + "grad_norm": 1.2744861841201782, + "learning_rate": 0.00020317223935224484, + "loss": 2.1301, + "step": 15366 + }, + { + "epoch": 1.7929063119822657, + "grad_norm": 1.1093708276748657, + "learning_rate": 0.00020315806398252218, + "loss": 2.282, + "step": 15367 + }, + { + "epoch": 1.7930229844825574, + "grad_norm": 1.197956919670105, + "learning_rate": 0.00020314388807726586, + "loss": 2.0448, + "step": 15368 + }, + { + "epoch": 1.793139656982849, + "grad_norm": 1.1647696495056152, + "learning_rate": 0.0002031297116366229, + "loss": 1.9156, + "step": 15369 + }, + { + "epoch": 1.7932563294831407, + "grad_norm": 0.9805497527122498, + "learning_rate": 0.00020311553466074018, + "loss": 2.1398, + "step": 15370 + }, + { + "epoch": 1.7933730019834324, + "grad_norm": 1.1752820014953613, + "learning_rate": 0.00020310135714976467, + "loss": 2.1074, + "step": 15371 + }, + { + "epoch": 1.793489674483724, + "grad_norm": 1.2736343145370483, + "learning_rate": 0.0002030871791038434, + "loss": 2.082, + "step": 15372 + }, + { + "epoch": 1.7936063469840158, + "grad_norm": 1.2126909494400024, + "learning_rate": 0.00020307300052312336, + "loss": 2.1929, + "step": 15373 + }, + { + "epoch": 1.7937230194843075, + "grad_norm": 1.155612587928772, + "learning_rate": 0.00020305882140775152, + "loss": 2.0024, + "step": 15374 + }, + { + "epoch": 1.7938396919845991, + "grad_norm": 1.027565598487854, + "learning_rate": 0.0002030446417578748, + "loss": 1.8215, + "step": 15375 + }, + { + "epoch": 1.7939563644848908, + "grad_norm": 1.1586875915527344, + "learning_rate": 0.00020303046157364032, + "loss": 2.0268, + "step": 15376 + }, + { + "epoch": 1.7940730369851825, + "grad_norm": 1.1427110433578491, + "learning_rate": 0.00020301628085519499, + "loss": 2.2282, + "step": 15377 + }, + { + "epoch": 1.7941897094854742, + "grad_norm": 1.1117031574249268, + "learning_rate": 0.00020300209960268586, + "loss": 1.953, + "step": 15378 + }, + { + "epoch": 1.7943063819857659, + "grad_norm": 1.3679224252700806, + "learning_rate": 0.0002029879178162599, + "loss": 2.0359, + "step": 15379 + }, + { + "epoch": 1.7944230544860575, + "grad_norm": 0.9943884015083313, + "learning_rate": 0.0002029737354960642, + "loss": 2.0595, + "step": 15380 + }, + { + "epoch": 1.7945397269863492, + "grad_norm": 0.9133491516113281, + "learning_rate": 0.00020295955264224577, + "loss": 2.0218, + "step": 15381 + }, + { + "epoch": 1.794656399486641, + "grad_norm": 1.2555807828903198, + "learning_rate": 0.00020294536925495165, + "loss": 2.0542, + "step": 15382 + }, + { + "epoch": 1.7947730719869326, + "grad_norm": 1.444470763206482, + "learning_rate": 0.0002029311853343288, + "loss": 2.2243, + "step": 15383 + }, + { + "epoch": 1.7948897444872243, + "grad_norm": 1.0323991775512695, + "learning_rate": 0.0002029170008805244, + "loss": 1.8539, + "step": 15384 + }, + { + "epoch": 1.795006416987516, + "grad_norm": 1.0012195110321045, + "learning_rate": 0.0002029028158936854, + "loss": 2.0423, + "step": 15385 + }, + { + "epoch": 1.7951230894878076, + "grad_norm": 1.119674801826477, + "learning_rate": 0.00020288863037395884, + "loss": 2.0745, + "step": 15386 + }, + { + "epoch": 1.7952397619880993, + "grad_norm": 1.1054067611694336, + "learning_rate": 0.0002028744443214918, + "loss": 2.1155, + "step": 15387 + }, + { + "epoch": 1.795356434488391, + "grad_norm": 1.1449273824691772, + "learning_rate": 0.00020286025773643142, + "loss": 2.4109, + "step": 15388 + }, + { + "epoch": 1.7954731069886827, + "grad_norm": 1.2600561380386353, + "learning_rate": 0.00020284607061892468, + "loss": 2.1141, + "step": 15389 + }, + { + "epoch": 1.7955897794889744, + "grad_norm": 1.1068072319030762, + "learning_rate": 0.00020283188296911866, + "loss": 2.1002, + "step": 15390 + }, + { + "epoch": 1.795706451989266, + "grad_norm": 1.3824197053909302, + "learning_rate": 0.00020281769478716044, + "loss": 2.0077, + "step": 15391 + }, + { + "epoch": 1.7958231244895577, + "grad_norm": 1.2046202421188354, + "learning_rate": 0.00020280350607319723, + "loss": 2.1272, + "step": 15392 + }, + { + "epoch": 1.7959397969898494, + "grad_norm": 1.0561656951904297, + "learning_rate": 0.00020278931682737598, + "loss": 2.1004, + "step": 15393 + }, + { + "epoch": 1.796056469490141, + "grad_norm": 1.0570154190063477, + "learning_rate": 0.00020277512704984388, + "loss": 2.0939, + "step": 15394 + }, + { + "epoch": 1.7961731419904328, + "grad_norm": 1.12161386013031, + "learning_rate": 0.0002027609367407479, + "loss": 2.0611, + "step": 15395 + }, + { + "epoch": 1.7962898144907244, + "grad_norm": 1.1190308332443237, + "learning_rate": 0.00020274674590023528, + "loss": 2.0072, + "step": 15396 + }, + { + "epoch": 1.7964064869910161, + "grad_norm": 1.1418005228042603, + "learning_rate": 0.00020273255452845311, + "loss": 2.0756, + "step": 15397 + }, + { + "epoch": 1.7965231594913078, + "grad_norm": 1.0386089086532593, + "learning_rate": 0.0002027183626255485, + "loss": 2.158, + "step": 15398 + }, + { + "epoch": 1.7966398319915995, + "grad_norm": 1.1117486953735352, + "learning_rate": 0.0002027041701916686, + "loss": 2.0482, + "step": 15399 + }, + { + "epoch": 1.7967565044918912, + "grad_norm": 1.2490469217300415, + "learning_rate": 0.00020268997722696044, + "loss": 2.2199, + "step": 15400 + }, + { + "epoch": 1.7968731769921829, + "grad_norm": 1.107810378074646, + "learning_rate": 0.0002026757837315713, + "loss": 2.1715, + "step": 15401 + }, + { + "epoch": 1.7969898494924745, + "grad_norm": 1.039018154144287, + "learning_rate": 0.0002026615897056482, + "loss": 2.0952, + "step": 15402 + }, + { + "epoch": 1.7971065219927662, + "grad_norm": 1.0367740392684937, + "learning_rate": 0.0002026473951493384, + "loss": 2.15, + "step": 15403 + }, + { + "epoch": 1.797223194493058, + "grad_norm": 0.9857138991355896, + "learning_rate": 0.0002026332000627889, + "loss": 2.0066, + "step": 15404 + }, + { + "epoch": 1.7973398669933496, + "grad_norm": 1.0475713014602661, + "learning_rate": 0.000202619004446147, + "loss": 1.6933, + "step": 15405 + }, + { + "epoch": 1.7974565394936413, + "grad_norm": 1.0050415992736816, + "learning_rate": 0.00020260480829955983, + "loss": 2.1021, + "step": 15406 + }, + { + "epoch": 1.797573211993933, + "grad_norm": 1.1720972061157227, + "learning_rate": 0.00020259061162317455, + "loss": 2.1883, + "step": 15407 + }, + { + "epoch": 1.7976898844942246, + "grad_norm": 1.0235868692398071, + "learning_rate": 0.00020257641441713832, + "loss": 2.0443, + "step": 15408 + }, + { + "epoch": 1.7978065569945163, + "grad_norm": 1.189128041267395, + "learning_rate": 0.00020256221668159833, + "loss": 2.0625, + "step": 15409 + }, + { + "epoch": 1.797923229494808, + "grad_norm": 1.1321301460266113, + "learning_rate": 0.0002025480184167018, + "loss": 1.9665, + "step": 15410 + }, + { + "epoch": 1.7980399019950997, + "grad_norm": 1.0399627685546875, + "learning_rate": 0.00020253381962259583, + "loss": 1.991, + "step": 15411 + }, + { + "epoch": 1.7981565744953913, + "grad_norm": 1.1434595584869385, + "learning_rate": 0.00020251962029942778, + "loss": 2.0043, + "step": 15412 + }, + { + "epoch": 1.798273246995683, + "grad_norm": 1.1183199882507324, + "learning_rate": 0.00020250542044734462, + "loss": 2.1503, + "step": 15413 + }, + { + "epoch": 1.7983899194959747, + "grad_norm": 1.168701171875, + "learning_rate": 0.0002024912200664938, + "loss": 2.0605, + "step": 15414 + }, + { + "epoch": 1.7985065919962664, + "grad_norm": 1.0773189067840576, + "learning_rate": 0.00020247701915702238, + "loss": 2.1699, + "step": 15415 + }, + { + "epoch": 1.798623264496558, + "grad_norm": 0.982257604598999, + "learning_rate": 0.0002024628177190776, + "loss": 1.95, + "step": 15416 + }, + { + "epoch": 1.7987399369968498, + "grad_norm": 1.1843295097351074, + "learning_rate": 0.0002024486157528067, + "loss": 2.0425, + "step": 15417 + }, + { + "epoch": 1.7988566094971414, + "grad_norm": 1.0585110187530518, + "learning_rate": 0.00020243441325835699, + "loss": 2.0511, + "step": 15418 + }, + { + "epoch": 1.7989732819974331, + "grad_norm": 1.0928173065185547, + "learning_rate": 0.00020242021023587561, + "loss": 2.1057, + "step": 15419 + }, + { + "epoch": 1.7990899544977248, + "grad_norm": 1.1863726377487183, + "learning_rate": 0.00020240600668550978, + "loss": 1.9993, + "step": 15420 + }, + { + "epoch": 1.7992066269980165, + "grad_norm": 1.1321038007736206, + "learning_rate": 0.00020239180260740684, + "loss": 2.0706, + "step": 15421 + }, + { + "epoch": 1.7993232994983082, + "grad_norm": 1.1192023754119873, + "learning_rate": 0.00020237759800171401, + "loss": 2.0536, + "step": 15422 + }, + { + "epoch": 1.7994399719985998, + "grad_norm": 1.2240517139434814, + "learning_rate": 0.00020236339286857846, + "loss": 2.071, + "step": 15423 + }, + { + "epoch": 1.7995566444988915, + "grad_norm": 1.1292383670806885, + "learning_rate": 0.00020234918720814758, + "loss": 2.1668, + "step": 15424 + }, + { + "epoch": 1.7996733169991832, + "grad_norm": 1.0577419996261597, + "learning_rate": 0.00020233498102056858, + "loss": 2.0294, + "step": 15425 + }, + { + "epoch": 1.7997899894994749, + "grad_norm": 1.2492058277130127, + "learning_rate": 0.0002023207743059887, + "loss": 2.3724, + "step": 15426 + }, + { + "epoch": 1.7999066619997666, + "grad_norm": 1.2037025690078735, + "learning_rate": 0.0002023065670645553, + "loss": 2.0438, + "step": 15427 + }, + { + "epoch": 1.8000233345000582, + "grad_norm": 1.1342231035232544, + "learning_rate": 0.00020229235929641564, + "loss": 1.9228, + "step": 15428 + }, + { + "epoch": 1.80014000700035, + "grad_norm": 1.1599658727645874, + "learning_rate": 0.00020227815100171698, + "loss": 2.1022, + "step": 15429 + }, + { + "epoch": 1.8002566795006416, + "grad_norm": 1.061711072921753, + "learning_rate": 0.00020226394218060656, + "loss": 1.9573, + "step": 15430 + }, + { + "epoch": 1.8003733520009333, + "grad_norm": 1.1471396684646606, + "learning_rate": 0.00020224973283323178, + "loss": 1.9728, + "step": 15431 + }, + { + "epoch": 1.800490024501225, + "grad_norm": 1.2281756401062012, + "learning_rate": 0.00020223552295973998, + "loss": 2.0202, + "step": 15432 + }, + { + "epoch": 1.8006066970015167, + "grad_norm": 1.052262783050537, + "learning_rate": 0.00020222131256027835, + "loss": 2.0579, + "step": 15433 + }, + { + "epoch": 1.8007233695018083, + "grad_norm": 1.0333991050720215, + "learning_rate": 0.00020220710163499427, + "loss": 1.9766, + "step": 15434 + }, + { + "epoch": 1.8008400420021, + "grad_norm": 0.995790421962738, + "learning_rate": 0.00020219289018403506, + "loss": 2.0913, + "step": 15435 + }, + { + "epoch": 1.8009567145023917, + "grad_norm": 1.2797194719314575, + "learning_rate": 0.0002021786782075481, + "loss": 2.04, + "step": 15436 + }, + { + "epoch": 1.8010733870026834, + "grad_norm": 1.188070297241211, + "learning_rate": 0.00020216446570568065, + "loss": 2.133, + "step": 15437 + }, + { + "epoch": 1.801190059502975, + "grad_norm": 1.2098970413208008, + "learning_rate": 0.00020215025267858005, + "loss": 1.9534, + "step": 15438 + }, + { + "epoch": 1.8013067320032667, + "grad_norm": 1.0218724012374878, + "learning_rate": 0.00020213603912639367, + "loss": 2.145, + "step": 15439 + }, + { + "epoch": 1.8014234045035584, + "grad_norm": 1.1917721033096313, + "learning_rate": 0.00020212182504926885, + "loss": 2.073, + "step": 15440 + }, + { + "epoch": 1.80154007700385, + "grad_norm": 1.158881664276123, + "learning_rate": 0.00020210761044735294, + "loss": 2.1321, + "step": 15441 + }, + { + "epoch": 1.8016567495041418, + "grad_norm": 1.1505053043365479, + "learning_rate": 0.00020209339532079336, + "loss": 2.1727, + "step": 15442 + }, + { + "epoch": 1.8017734220044335, + "grad_norm": 1.1353917121887207, + "learning_rate": 0.0002020791796697374, + "loss": 1.8351, + "step": 15443 + }, + { + "epoch": 1.8018900945047251, + "grad_norm": 1.252411961555481, + "learning_rate": 0.00020206496349433245, + "loss": 2.0289, + "step": 15444 + }, + { + "epoch": 1.8020067670050168, + "grad_norm": 1.164570927619934, + "learning_rate": 0.00020205074679472594, + "loss": 2.1048, + "step": 15445 + }, + { + "epoch": 1.8021234395053085, + "grad_norm": 1.1208478212356567, + "learning_rate": 0.0002020365295710652, + "loss": 2.0673, + "step": 15446 + }, + { + "epoch": 1.8022401120056002, + "grad_norm": 1.0714455842971802, + "learning_rate": 0.00020202231182349763, + "loss": 2.0552, + "step": 15447 + }, + { + "epoch": 1.8023567845058919, + "grad_norm": 1.1708717346191406, + "learning_rate": 0.0002020080935521706, + "loss": 2.0701, + "step": 15448 + }, + { + "epoch": 1.8024734570061836, + "grad_norm": 1.0531294345855713, + "learning_rate": 0.00020199387475723155, + "loss": 2.034, + "step": 15449 + }, + { + "epoch": 1.8025901295064752, + "grad_norm": 1.1708091497421265, + "learning_rate": 0.00020197965543882789, + "loss": 1.9516, + "step": 15450 + }, + { + "epoch": 1.802706802006767, + "grad_norm": 1.0693119764328003, + "learning_rate": 0.00020196543559710702, + "loss": 2.0803, + "step": 15451 + }, + { + "epoch": 1.8028234745070586, + "grad_norm": 1.024138331413269, + "learning_rate": 0.0002019512152322163, + "loss": 2.0387, + "step": 15452 + }, + { + "epoch": 1.8029401470073503, + "grad_norm": 1.2036737203598022, + "learning_rate": 0.00020193699434430324, + "loss": 1.9372, + "step": 15453 + }, + { + "epoch": 1.803056819507642, + "grad_norm": 1.1192725896835327, + "learning_rate": 0.00020192277293351523, + "loss": 2.1696, + "step": 15454 + }, + { + "epoch": 1.8031734920079336, + "grad_norm": 1.1188989877700806, + "learning_rate": 0.00020190855099999963, + "loss": 2.0605, + "step": 15455 + }, + { + "epoch": 1.8032901645082253, + "grad_norm": 1.24661123752594, + "learning_rate": 0.00020189432854390398, + "loss": 2.1768, + "step": 15456 + }, + { + "epoch": 1.803406837008517, + "grad_norm": 1.0475354194641113, + "learning_rate": 0.00020188010556537568, + "loss": 2.0503, + "step": 15457 + }, + { + "epoch": 1.8035235095088087, + "grad_norm": 0.9987559914588928, + "learning_rate": 0.0002018658820645622, + "loss": 2.1066, + "step": 15458 + }, + { + "epoch": 1.8036401820091004, + "grad_norm": 1.1289169788360596, + "learning_rate": 0.00020185165804161096, + "loss": 2.1129, + "step": 15459 + }, + { + "epoch": 1.803756854509392, + "grad_norm": 0.9505793452262878, + "learning_rate": 0.0002018374334966694, + "loss": 1.9875, + "step": 15460 + }, + { + "epoch": 1.8038735270096837, + "grad_norm": 1.0520691871643066, + "learning_rate": 0.00020182320842988515, + "loss": 2.1149, + "step": 15461 + }, + { + "epoch": 1.8039901995099754, + "grad_norm": 1.235302209854126, + "learning_rate": 0.00020180898284140542, + "loss": 2.1748, + "step": 15462 + }, + { + "epoch": 1.804106872010267, + "grad_norm": 1.2922388315200806, + "learning_rate": 0.0002017947567313779, + "loss": 2.1583, + "step": 15463 + }, + { + "epoch": 1.8042235445105588, + "grad_norm": 1.2158358097076416, + "learning_rate": 0.00020178053009994995, + "loss": 1.9235, + "step": 15464 + }, + { + "epoch": 1.8043402170108505, + "grad_norm": 1.292656421661377, + "learning_rate": 0.00020176630294726908, + "loss": 1.9679, + "step": 15465 + }, + { + "epoch": 1.8044568895111421, + "grad_norm": 1.0840606689453125, + "learning_rate": 0.00020175207527348282, + "loss": 2.0604, + "step": 15466 + }, + { + "epoch": 1.8045735620114338, + "grad_norm": 1.0887529850006104, + "learning_rate": 0.00020173784707873857, + "loss": 2.0242, + "step": 15467 + }, + { + "epoch": 1.8046902345117255, + "grad_norm": 1.1203014850616455, + "learning_rate": 0.00020172361836318396, + "loss": 2.1449, + "step": 15468 + }, + { + "epoch": 1.8048069070120172, + "grad_norm": 1.1832524538040161, + "learning_rate": 0.00020170938912696644, + "loss": 1.9378, + "step": 15469 + }, + { + "epoch": 1.8049235795123089, + "grad_norm": 1.0271873474121094, + "learning_rate": 0.0002016951593702335, + "loss": 2.0025, + "step": 15470 + }, + { + "epoch": 1.8050402520126005, + "grad_norm": 1.2613757848739624, + "learning_rate": 0.00020168092909313268, + "loss": 2.0596, + "step": 15471 + }, + { + "epoch": 1.8051569245128922, + "grad_norm": 1.1383614540100098, + "learning_rate": 0.00020166669829581153, + "loss": 2.0812, + "step": 15472 + }, + { + "epoch": 1.805273597013184, + "grad_norm": 1.1121833324432373, + "learning_rate": 0.00020165246697841755, + "loss": 2.025, + "step": 15473 + }, + { + "epoch": 1.8053902695134756, + "grad_norm": 1.0124751329421997, + "learning_rate": 0.00020163823514109824, + "loss": 2.0878, + "step": 15474 + }, + { + "epoch": 1.8055069420137673, + "grad_norm": 1.116491436958313, + "learning_rate": 0.0002016240027840012, + "loss": 1.951, + "step": 15475 + }, + { + "epoch": 1.805623614514059, + "grad_norm": 0.9986028075218201, + "learning_rate": 0.00020160976990727395, + "loss": 1.9764, + "step": 15476 + }, + { + "epoch": 1.8057402870143506, + "grad_norm": 1.1353222131729126, + "learning_rate": 0.00020159553651106406, + "loss": 2.0064, + "step": 15477 + }, + { + "epoch": 1.8058569595146423, + "grad_norm": 1.2494815587997437, + "learning_rate": 0.00020158130259551908, + "loss": 2.2591, + "step": 15478 + }, + { + "epoch": 1.805973632014934, + "grad_norm": 1.16073477268219, + "learning_rate": 0.00020156706816078655, + "loss": 2.1618, + "step": 15479 + }, + { + "epoch": 1.8060903045152257, + "grad_norm": 1.1336555480957031, + "learning_rate": 0.00020155283320701407, + "loss": 2.0764, + "step": 15480 + }, + { + "epoch": 1.8062069770155174, + "grad_norm": 1.2037723064422607, + "learning_rate": 0.00020153859773434918, + "loss": 2.0779, + "step": 15481 + }, + { + "epoch": 1.806323649515809, + "grad_norm": 1.0475245714187622, + "learning_rate": 0.0002015243617429394, + "loss": 2.0687, + "step": 15482 + }, + { + "epoch": 1.8064403220161007, + "grad_norm": 1.184355616569519, + "learning_rate": 0.00020151012523293246, + "loss": 2.1506, + "step": 15483 + }, + { + "epoch": 1.8065569945163924, + "grad_norm": 1.0808069705963135, + "learning_rate": 0.00020149588820447584, + "loss": 2.0286, + "step": 15484 + }, + { + "epoch": 1.806673667016684, + "grad_norm": 1.1195406913757324, + "learning_rate": 0.00020148165065771718, + "loss": 2.0449, + "step": 15485 + }, + { + "epoch": 1.8067903395169758, + "grad_norm": 1.117789626121521, + "learning_rate": 0.00020146741259280404, + "loss": 1.9597, + "step": 15486 + }, + { + "epoch": 1.8069070120172674, + "grad_norm": 1.0722334384918213, + "learning_rate": 0.0002014531740098841, + "loss": 2.0508, + "step": 15487 + }, + { + "epoch": 1.8070236845175591, + "grad_norm": 1.1263471841812134, + "learning_rate": 0.00020143893490910488, + "loss": 2.0342, + "step": 15488 + }, + { + "epoch": 1.8071403570178508, + "grad_norm": 1.118713617324829, + "learning_rate": 0.00020142469529061403, + "loss": 2.0162, + "step": 15489 + }, + { + "epoch": 1.8072570295181425, + "grad_norm": 1.2205256223678589, + "learning_rate": 0.00020141045515455918, + "loss": 2.0415, + "step": 15490 + }, + { + "epoch": 1.8073737020184342, + "grad_norm": 1.3444150686264038, + "learning_rate": 0.00020139621450108796, + "loss": 2.3447, + "step": 15491 + }, + { + "epoch": 1.8074903745187259, + "grad_norm": 1.3185019493103027, + "learning_rate": 0.00020138197333034795, + "loss": 2.1145, + "step": 15492 + }, + { + "epoch": 1.8076070470190175, + "grad_norm": 1.0626240968704224, + "learning_rate": 0.00020136773164248684, + "loss": 2.0429, + "step": 15493 + }, + { + "epoch": 1.8077237195193092, + "grad_norm": 1.1864464282989502, + "learning_rate": 0.00020135348943765225, + "loss": 2.0618, + "step": 15494 + }, + { + "epoch": 1.807840392019601, + "grad_norm": 1.0050950050354004, + "learning_rate": 0.00020133924671599188, + "loss": 1.9344, + "step": 15495 + }, + { + "epoch": 1.8079570645198926, + "grad_norm": 1.0174404382705688, + "learning_rate": 0.00020132500347765333, + "loss": 1.8236, + "step": 15496 + }, + { + "epoch": 1.8080737370201843, + "grad_norm": 1.1517727375030518, + "learning_rate": 0.00020131075972278423, + "loss": 2.1201, + "step": 15497 + }, + { + "epoch": 1.808190409520476, + "grad_norm": 1.1855632066726685, + "learning_rate": 0.00020129651545153232, + "loss": 2.0193, + "step": 15498 + }, + { + "epoch": 1.8083070820207676, + "grad_norm": 1.0089949369430542, + "learning_rate": 0.0002012822706640452, + "loss": 2.1916, + "step": 15499 + }, + { + "epoch": 1.8084237545210593, + "grad_norm": 1.275961995124817, + "learning_rate": 0.0002012680253604706, + "loss": 2.1291, + "step": 15500 + }, + { + "epoch": 1.808540427021351, + "grad_norm": 1.1165187358856201, + "learning_rate": 0.00020125377954095618, + "loss": 2.1262, + "step": 15501 + }, + { + "epoch": 1.8086570995216427, + "grad_norm": 1.225630283355713, + "learning_rate": 0.00020123953320564956, + "loss": 2.2091, + "step": 15502 + }, + { + "epoch": 1.8087737720219343, + "grad_norm": 1.1164681911468506, + "learning_rate": 0.00020122528635469855, + "loss": 1.9582, + "step": 15503 + }, + { + "epoch": 1.808890444522226, + "grad_norm": 1.1937835216522217, + "learning_rate": 0.00020121103898825075, + "loss": 2.0907, + "step": 15504 + }, + { + "epoch": 1.8090071170225177, + "grad_norm": 1.0822515487670898, + "learning_rate": 0.00020119679110645395, + "loss": 2.1665, + "step": 15505 + }, + { + "epoch": 1.8091237895228094, + "grad_norm": 1.1089510917663574, + "learning_rate": 0.00020118254270945577, + "loss": 2.0268, + "step": 15506 + }, + { + "epoch": 1.809240462023101, + "grad_norm": 0.9781625866889954, + "learning_rate": 0.00020116829379740396, + "loss": 1.9916, + "step": 15507 + }, + { + "epoch": 1.8093571345233928, + "grad_norm": 0.9633251428604126, + "learning_rate": 0.00020115404437044622, + "loss": 1.8918, + "step": 15508 + }, + { + "epoch": 1.8094738070236844, + "grad_norm": 1.1120156049728394, + "learning_rate": 0.00020113979442873024, + "loss": 1.9631, + "step": 15509 + }, + { + "epoch": 1.8095904795239761, + "grad_norm": 1.0832146406173706, + "learning_rate": 0.0002011255439724038, + "loss": 2.1104, + "step": 15510 + }, + { + "epoch": 1.8097071520242678, + "grad_norm": 1.1260802745819092, + "learning_rate": 0.00020111129300161466, + "loss": 1.8817, + "step": 15511 + }, + { + "epoch": 1.8098238245245595, + "grad_norm": 1.109361171722412, + "learning_rate": 0.00020109704151651051, + "loss": 2.0325, + "step": 15512 + }, + { + "epoch": 1.8099404970248512, + "grad_norm": 1.1997995376586914, + "learning_rate": 0.00020108278951723915, + "loss": 1.8986, + "step": 15513 + }, + { + "epoch": 1.8100571695251428, + "grad_norm": 1.143687129020691, + "learning_rate": 0.00020106853700394825, + "loss": 2.0814, + "step": 15514 + }, + { + "epoch": 1.8101738420254345, + "grad_norm": 1.1187903881072998, + "learning_rate": 0.00020105428397678557, + "loss": 2.0601, + "step": 15515 + }, + { + "epoch": 1.8102905145257262, + "grad_norm": 1.2192895412445068, + "learning_rate": 0.00020104003043589893, + "loss": 2.0796, + "step": 15516 + }, + { + "epoch": 1.8104071870260179, + "grad_norm": 1.2392362356185913, + "learning_rate": 0.00020102577638143602, + "loss": 1.9863, + "step": 15517 + }, + { + "epoch": 1.8105238595263096, + "grad_norm": 0.9811788201332092, + "learning_rate": 0.0002010115218135447, + "loss": 1.9435, + "step": 15518 + }, + { + "epoch": 1.8106405320266012, + "grad_norm": 1.0380628108978271, + "learning_rate": 0.00020099726673237267, + "loss": 2.0142, + "step": 15519 + }, + { + "epoch": 1.810757204526893, + "grad_norm": 1.2730770111083984, + "learning_rate": 0.00020098301113806772, + "loss": 1.9388, + "step": 15520 + }, + { + "epoch": 1.8108738770271846, + "grad_norm": 1.2309297323226929, + "learning_rate": 0.00020096875503077772, + "loss": 2.0307, + "step": 15521 + }, + { + "epoch": 1.8109905495274763, + "grad_norm": 1.1128737926483154, + "learning_rate": 0.00020095449841065039, + "loss": 2.0238, + "step": 15522 + }, + { + "epoch": 1.811107222027768, + "grad_norm": 1.1320761442184448, + "learning_rate": 0.00020094024127783347, + "loss": 2.2088, + "step": 15523 + }, + { + "epoch": 1.8112238945280597, + "grad_norm": 1.2607343196868896, + "learning_rate": 0.00020092598363247486, + "loss": 2.2355, + "step": 15524 + }, + { + "epoch": 1.8113405670283513, + "grad_norm": 1.0811991691589355, + "learning_rate": 0.00020091172547472233, + "loss": 1.9673, + "step": 15525 + }, + { + "epoch": 1.811457239528643, + "grad_norm": 1.1030503511428833, + "learning_rate": 0.00020089746680472374, + "loss": 2.1315, + "step": 15526 + }, + { + "epoch": 1.8115739120289347, + "grad_norm": 1.121105670928955, + "learning_rate": 0.0002008832076226268, + "loss": 2.1846, + "step": 15527 + }, + { + "epoch": 1.8116905845292264, + "grad_norm": 1.0583423376083374, + "learning_rate": 0.00020086894792857942, + "loss": 2.125, + "step": 15528 + }, + { + "epoch": 1.811807257029518, + "grad_norm": 1.0112171173095703, + "learning_rate": 0.00020085468772272944, + "loss": 2.0693, + "step": 15529 + }, + { + "epoch": 1.8119239295298097, + "grad_norm": 0.9881126880645752, + "learning_rate": 0.00020084042700522462, + "loss": 1.8541, + "step": 15530 + }, + { + "epoch": 1.8120406020301014, + "grad_norm": 1.1934595108032227, + "learning_rate": 0.0002008261657762129, + "loss": 2.0069, + "step": 15531 + }, + { + "epoch": 1.812157274530393, + "grad_norm": 1.097212791442871, + "learning_rate": 0.00020081190403584205, + "loss": 2.2258, + "step": 15532 + }, + { + "epoch": 1.8122739470306848, + "grad_norm": 1.0353546142578125, + "learning_rate": 0.00020079764178425986, + "loss": 1.9896, + "step": 15533 + }, + { + "epoch": 1.8123906195309765, + "grad_norm": 1.224259614944458, + "learning_rate": 0.00020078337902161434, + "loss": 2.1255, + "step": 15534 + }, + { + "epoch": 1.8125072920312681, + "grad_norm": 1.0906270742416382, + "learning_rate": 0.00020076911574805318, + "loss": 1.9376, + "step": 15535 + }, + { + "epoch": 1.8126239645315598, + "grad_norm": 1.4732539653778076, + "learning_rate": 0.0002007548519637244, + "loss": 2.3844, + "step": 15536 + }, + { + "epoch": 1.8127406370318515, + "grad_norm": 1.1786322593688965, + "learning_rate": 0.0002007405876687758, + "loss": 1.9706, + "step": 15537 + }, + { + "epoch": 1.8128573095321432, + "grad_norm": 1.1199138164520264, + "learning_rate": 0.00020072632286335528, + "loss": 2.0814, + "step": 15538 + }, + { + "epoch": 1.8129739820324349, + "grad_norm": 1.3419748544692993, + "learning_rate": 0.00020071205754761072, + "loss": 2.0745, + "step": 15539 + }, + { + "epoch": 1.8130906545327266, + "grad_norm": 1.2103230953216553, + "learning_rate": 0.00020069779172168998, + "loss": 2.1571, + "step": 15540 + }, + { + "epoch": 1.8132073270330182, + "grad_norm": 1.2831699848175049, + "learning_rate": 0.00020068352538574094, + "loss": 2.1056, + "step": 15541 + }, + { + "epoch": 1.81332399953331, + "grad_norm": 1.0233722925186157, + "learning_rate": 0.0002006692585399115, + "loss": 1.7877, + "step": 15542 + }, + { + "epoch": 1.8134406720336016, + "grad_norm": 1.0754475593566895, + "learning_rate": 0.0002006549911843496, + "loss": 2.0055, + "step": 15543 + }, + { + "epoch": 1.8135573445338933, + "grad_norm": 1.2855701446533203, + "learning_rate": 0.0002006407233192032, + "loss": 2.0651, + "step": 15544 + }, + { + "epoch": 1.813674017034185, + "grad_norm": 1.0364595651626587, + "learning_rate": 0.0002006264549446201, + "loss": 2.0446, + "step": 15545 + }, + { + "epoch": 1.8137906895344766, + "grad_norm": 1.1659483909606934, + "learning_rate": 0.00020061218606074827, + "loss": 2.0858, + "step": 15546 + }, + { + "epoch": 1.8139073620347683, + "grad_norm": 1.1788500547409058, + "learning_rate": 0.0002005979166677356, + "loss": 2.1659, + "step": 15547 + }, + { + "epoch": 1.81402403453506, + "grad_norm": 1.4193944931030273, + "learning_rate": 0.0002005836467657301, + "loss": 1.9972, + "step": 15548 + }, + { + "epoch": 1.8141407070353517, + "grad_norm": 1.1426295042037964, + "learning_rate": 0.00020056937635487963, + "loss": 2.1565, + "step": 15549 + }, + { + "epoch": 1.8142573795356434, + "grad_norm": 1.2913928031921387, + "learning_rate": 0.00020055510543533212, + "loss": 2.1127, + "step": 15550 + }, + { + "epoch": 1.814374052035935, + "grad_norm": 1.1972355842590332, + "learning_rate": 0.0002005408340072356, + "loss": 2.0311, + "step": 15551 + }, + { + "epoch": 1.8144907245362267, + "grad_norm": 1.053562879562378, + "learning_rate": 0.00020052656207073793, + "loss": 2.0072, + "step": 15552 + }, + { + "epoch": 1.8146073970365184, + "grad_norm": 0.9846097826957703, + "learning_rate": 0.00020051228962598711, + "loss": 1.9819, + "step": 15553 + }, + { + "epoch": 1.81472406953681, + "grad_norm": 1.0734038352966309, + "learning_rate": 0.0002004980166731311, + "loss": 2.1762, + "step": 15554 + }, + { + "epoch": 1.8148407420371018, + "grad_norm": 1.1629513502120972, + "learning_rate": 0.00020048374321231789, + "loss": 2.1265, + "step": 15555 + }, + { + "epoch": 1.8149574145373935, + "grad_norm": 1.1374412775039673, + "learning_rate": 0.0002004694692436954, + "loss": 2.2227, + "step": 15556 + }, + { + "epoch": 1.8150740870376851, + "grad_norm": 1.2078405618667603, + "learning_rate": 0.00020045519476741166, + "loss": 2.069, + "step": 15557 + }, + { + "epoch": 1.8151907595379768, + "grad_norm": 1.0307825803756714, + "learning_rate": 0.00020044091978361464, + "loss": 2.0273, + "step": 15558 + }, + { + "epoch": 1.8153074320382685, + "grad_norm": 0.8884561061859131, + "learning_rate": 0.00020042664429245226, + "loss": 2.1076, + "step": 15559 + }, + { + "epoch": 1.8154241045385602, + "grad_norm": 1.1687300205230713, + "learning_rate": 0.00020041236829407256, + "loss": 2.2115, + "step": 15560 + }, + { + "epoch": 1.8155407770388519, + "grad_norm": 1.1096268892288208, + "learning_rate": 0.0002003980917886235, + "loss": 2.2408, + "step": 15561 + }, + { + "epoch": 1.8156574495391435, + "grad_norm": 0.979780375957489, + "learning_rate": 0.00020038381477625316, + "loss": 1.933, + "step": 15562 + }, + { + "epoch": 1.8157741220394352, + "grad_norm": 1.2816962003707886, + "learning_rate": 0.00020036953725710953, + "loss": 2.1433, + "step": 15563 + }, + { + "epoch": 1.815890794539727, + "grad_norm": 1.0859260559082031, + "learning_rate": 0.00020035525923134062, + "loss": 2.2476, + "step": 15564 + }, + { + "epoch": 1.8160074670400186, + "grad_norm": 1.1196626424789429, + "learning_rate": 0.00020034098069909444, + "loss": 2.0237, + "step": 15565 + }, + { + "epoch": 1.8161241395403103, + "grad_norm": 1.2231979370117188, + "learning_rate": 0.00020032670166051893, + "loss": 2.2539, + "step": 15566 + }, + { + "epoch": 1.816240812040602, + "grad_norm": 1.0448758602142334, + "learning_rate": 0.00020031242211576225, + "loss": 2.0398, + "step": 15567 + }, + { + "epoch": 1.8163574845408936, + "grad_norm": 1.2972121238708496, + "learning_rate": 0.0002002981420649724, + "loss": 2.273, + "step": 15568 + }, + { + "epoch": 1.8164741570411853, + "grad_norm": 0.958639919757843, + "learning_rate": 0.00020028386150829737, + "loss": 2.1263, + "step": 15569 + }, + { + "epoch": 1.816590829541477, + "grad_norm": 1.0846307277679443, + "learning_rate": 0.00020026958044588522, + "loss": 2.0438, + "step": 15570 + }, + { + "epoch": 1.8167075020417687, + "grad_norm": 1.179321050643921, + "learning_rate": 0.00020025529887788404, + "loss": 2.0103, + "step": 15571 + }, + { + "epoch": 1.8168241745420604, + "grad_norm": 1.098060131072998, + "learning_rate": 0.00020024101680444185, + "loss": 1.904, + "step": 15572 + }, + { + "epoch": 1.816940847042352, + "grad_norm": 1.1533193588256836, + "learning_rate": 0.00020022673422570673, + "loss": 2.2644, + "step": 15573 + }, + { + "epoch": 1.8170575195426437, + "grad_norm": 1.1649409532546997, + "learning_rate": 0.00020021245114182674, + "loss": 2.0377, + "step": 15574 + }, + { + "epoch": 1.8171741920429354, + "grad_norm": 1.140601396560669, + "learning_rate": 0.00020019816755294996, + "loss": 2.1122, + "step": 15575 + }, + { + "epoch": 1.817290864543227, + "grad_norm": 1.0023185014724731, + "learning_rate": 0.00020018388345922441, + "loss": 2.0212, + "step": 15576 + }, + { + "epoch": 1.8174075370435188, + "grad_norm": 1.184443473815918, + "learning_rate": 0.00020016959886079826, + "loss": 2.1239, + "step": 15577 + }, + { + "epoch": 1.8175242095438104, + "grad_norm": 1.2924896478652954, + "learning_rate": 0.00020015531375781954, + "loss": 1.9879, + "step": 15578 + }, + { + "epoch": 1.8176408820441021, + "grad_norm": 1.249354362487793, + "learning_rate": 0.00020014102815043633, + "loss": 2.1391, + "step": 15579 + }, + { + "epoch": 1.8177575545443938, + "grad_norm": 0.961320698261261, + "learning_rate": 0.0002001267420387968, + "loss": 2.0143, + "step": 15580 + }, + { + "epoch": 1.8178742270446855, + "grad_norm": 1.4784826040267944, + "learning_rate": 0.00020011245542304897, + "loss": 2.2644, + "step": 15581 + }, + { + "epoch": 1.8179908995449772, + "grad_norm": 1.2156705856323242, + "learning_rate": 0.00020009816830334108, + "loss": 2.0281, + "step": 15582 + }, + { + "epoch": 1.8181075720452688, + "grad_norm": 1.1450769901275635, + "learning_rate": 0.00020008388067982105, + "loss": 2.1633, + "step": 15583 + }, + { + "epoch": 1.8182242445455605, + "grad_norm": 1.2252554893493652, + "learning_rate": 0.00020006959255263712, + "loss": 1.8077, + "step": 15584 + }, + { + "epoch": 1.8183409170458522, + "grad_norm": 1.054352045059204, + "learning_rate": 0.00020005530392193738, + "loss": 2.1266, + "step": 15585 + }, + { + "epoch": 1.818457589546144, + "grad_norm": 1.2937066555023193, + "learning_rate": 0.00020004101478786996, + "loss": 2.1008, + "step": 15586 + }, + { + "epoch": 1.8185742620464356, + "grad_norm": 1.2399630546569824, + "learning_rate": 0.000200026725150583, + "loss": 2.1693, + "step": 15587 + }, + { + "epoch": 1.8186909345467273, + "grad_norm": 1.1922200918197632, + "learning_rate": 0.00020001243501022466, + "loss": 2.0838, + "step": 15588 + }, + { + "epoch": 1.818807607047019, + "grad_norm": 1.0101617574691772, + "learning_rate": 0.0001999981443669431, + "loss": 1.8898, + "step": 15589 + }, + { + "epoch": 1.8189242795473106, + "grad_norm": 1.0666407346725464, + "learning_rate": 0.00019998385322088637, + "loss": 1.997, + "step": 15590 + }, + { + "epoch": 1.8190409520476023, + "grad_norm": 1.0888454914093018, + "learning_rate": 0.0001999695615722027, + "loss": 1.9911, + "step": 15591 + }, + { + "epoch": 1.819157624547894, + "grad_norm": 1.2538249492645264, + "learning_rate": 0.00019995526942104028, + "loss": 2.0438, + "step": 15592 + }, + { + "epoch": 1.8192742970481857, + "grad_norm": 1.1915708780288696, + "learning_rate": 0.00019994097676754723, + "loss": 2.3076, + "step": 15593 + }, + { + "epoch": 1.8193909695484773, + "grad_norm": 1.1227591037750244, + "learning_rate": 0.00019992668361187168, + "loss": 2.1016, + "step": 15594 + }, + { + "epoch": 1.819507642048769, + "grad_norm": 1.2873821258544922, + "learning_rate": 0.00019991238995416188, + "loss": 2.0388, + "step": 15595 + }, + { + "epoch": 1.8196243145490607, + "grad_norm": 1.140362024307251, + "learning_rate": 0.000199898095794566, + "loss": 1.9846, + "step": 15596 + }, + { + "epoch": 1.8197409870493524, + "grad_norm": 1.256148099899292, + "learning_rate": 0.0001998838011332322, + "loss": 2.1467, + "step": 15597 + }, + { + "epoch": 1.819857659549644, + "grad_norm": 1.1601577997207642, + "learning_rate": 0.00019986950597030865, + "loss": 2.2412, + "step": 15598 + }, + { + "epoch": 1.8199743320499358, + "grad_norm": 1.0327153205871582, + "learning_rate": 0.0001998552103059436, + "loss": 2.0127, + "step": 15599 + }, + { + "epoch": 1.8200910045502274, + "grad_norm": 1.300339937210083, + "learning_rate": 0.00019984091414028518, + "loss": 2.1756, + "step": 15600 + }, + { + "epoch": 1.8202076770505191, + "grad_norm": 1.0766772031784058, + "learning_rate": 0.00019982661747348172, + "loss": 2.1895, + "step": 15601 + }, + { + "epoch": 1.8203243495508108, + "grad_norm": 1.219219446182251, + "learning_rate": 0.0001998123203056813, + "loss": 2.0076, + "step": 15602 + }, + { + "epoch": 1.8204410220511025, + "grad_norm": 0.9801347255706787, + "learning_rate": 0.00019979802263703222, + "loss": 1.9035, + "step": 15603 + }, + { + "epoch": 1.8205576945513942, + "grad_norm": 1.0562105178833008, + "learning_rate": 0.00019978372446768263, + "loss": 1.951, + "step": 15604 + }, + { + "epoch": 1.8206743670516858, + "grad_norm": 1.1016453504562378, + "learning_rate": 0.00019976942579778085, + "loss": 2.2089, + "step": 15605 + }, + { + "epoch": 1.8207910395519775, + "grad_norm": 1.4028651714324951, + "learning_rate": 0.00019975512662747508, + "loss": 2.1871, + "step": 15606 + }, + { + "epoch": 1.8209077120522692, + "grad_norm": 1.0352232456207275, + "learning_rate": 0.00019974082695691352, + "loss": 2.0042, + "step": 15607 + }, + { + "epoch": 1.8210243845525609, + "grad_norm": 1.1959853172302246, + "learning_rate": 0.00019972652678624446, + "loss": 2.0825, + "step": 15608 + }, + { + "epoch": 1.8211410570528526, + "grad_norm": 1.3352372646331787, + "learning_rate": 0.00019971222611561611, + "loss": 2.1792, + "step": 15609 + }, + { + "epoch": 1.8212577295531442, + "grad_norm": 0.9905586242675781, + "learning_rate": 0.00019969792494517675, + "loss": 2.0628, + "step": 15610 + }, + { + "epoch": 1.821374402053436, + "grad_norm": 1.1882535219192505, + "learning_rate": 0.00019968362327507457, + "loss": 2.0968, + "step": 15611 + }, + { + "epoch": 1.8214910745537276, + "grad_norm": 1.278110146522522, + "learning_rate": 0.00019966932110545795, + "loss": 2.0852, + "step": 15612 + }, + { + "epoch": 1.8216077470540193, + "grad_norm": 1.240712285041809, + "learning_rate": 0.00019965501843647506, + "loss": 2.1814, + "step": 15613 + }, + { + "epoch": 1.821724419554311, + "grad_norm": 1.234556794166565, + "learning_rate": 0.00019964071526827422, + "loss": 2.2133, + "step": 15614 + }, + { + "epoch": 1.8218410920546027, + "grad_norm": 1.0597089529037476, + "learning_rate": 0.00019962641160100374, + "loss": 2.0492, + "step": 15615 + }, + { + "epoch": 1.8219577645548943, + "grad_norm": 1.1246790885925293, + "learning_rate": 0.00019961210743481183, + "loss": 1.8616, + "step": 15616 + }, + { + "epoch": 1.822074437055186, + "grad_norm": 1.0972959995269775, + "learning_rate": 0.00019959780276984686, + "loss": 2.0254, + "step": 15617 + }, + { + "epoch": 1.8221911095554777, + "grad_norm": 1.0964397192001343, + "learning_rate": 0.00019958349760625707, + "loss": 2.179, + "step": 15618 + }, + { + "epoch": 1.8223077820557694, + "grad_norm": 0.9953850507736206, + "learning_rate": 0.00019956919194419077, + "loss": 2.0636, + "step": 15619 + }, + { + "epoch": 1.822424454556061, + "grad_norm": 1.141104817390442, + "learning_rate": 0.00019955488578379625, + "loss": 2.1281, + "step": 15620 + }, + { + "epoch": 1.8225411270563527, + "grad_norm": 1.1484980583190918, + "learning_rate": 0.00019954057912522188, + "loss": 2.1975, + "step": 15621 + }, + { + "epoch": 1.8226577995566444, + "grad_norm": 1.1149014234542847, + "learning_rate": 0.0001995262719686159, + "loss": 2.036, + "step": 15622 + }, + { + "epoch": 1.822774472056936, + "grad_norm": 1.470829725265503, + "learning_rate": 0.00019951196431412666, + "loss": 2.1324, + "step": 15623 + }, + { + "epoch": 1.8228911445572278, + "grad_norm": 0.993452250957489, + "learning_rate": 0.0001994976561619025, + "loss": 2.0141, + "step": 15624 + }, + { + "epoch": 1.8230078170575195, + "grad_norm": 1.0464674234390259, + "learning_rate": 0.00019948334751209175, + "loss": 2.0313, + "step": 15625 + }, + { + "epoch": 1.8231244895578111, + "grad_norm": 1.0465099811553955, + "learning_rate": 0.0001994690383648427, + "loss": 2.0571, + "step": 15626 + }, + { + "epoch": 1.8232411620581028, + "grad_norm": 1.1502585411071777, + "learning_rate": 0.00019945472872030383, + "loss": 2.0389, + "step": 15627 + }, + { + "epoch": 1.8233578345583945, + "grad_norm": 1.0546696186065674, + "learning_rate": 0.00019944041857862332, + "loss": 1.9454, + "step": 15628 + }, + { + "epoch": 1.8234745070586862, + "grad_norm": 1.183820128440857, + "learning_rate": 0.0001994261079399496, + "loss": 2.077, + "step": 15629 + }, + { + "epoch": 1.8235911795589779, + "grad_norm": 1.4159653186798096, + "learning_rate": 0.00019941179680443098, + "loss": 1.9668, + "step": 15630 + }, + { + "epoch": 1.8237078520592696, + "grad_norm": 0.9951744079589844, + "learning_rate": 0.0001993974851722159, + "loss": 1.8623, + "step": 15631 + }, + { + "epoch": 1.8238245245595612, + "grad_norm": 1.1788859367370605, + "learning_rate": 0.0001993831730434527, + "loss": 2.183, + "step": 15632 + }, + { + "epoch": 1.823941197059853, + "grad_norm": 1.1644654273986816, + "learning_rate": 0.0001993688604182897, + "loss": 2.2837, + "step": 15633 + }, + { + "epoch": 1.8240578695601446, + "grad_norm": 1.4266833066940308, + "learning_rate": 0.00019935454729687536, + "loss": 2.1559, + "step": 15634 + }, + { + "epoch": 1.8241745420604363, + "grad_norm": 1.0503133535385132, + "learning_rate": 0.000199340233679358, + "loss": 1.8944, + "step": 15635 + }, + { + "epoch": 1.824291214560728, + "grad_norm": 1.0448495149612427, + "learning_rate": 0.00019932591956588598, + "loss": 1.9475, + "step": 15636 + }, + { + "epoch": 1.8244078870610196, + "grad_norm": 1.0771406888961792, + "learning_rate": 0.00019931160495660776, + "loss": 2.0367, + "step": 15637 + }, + { + "epoch": 1.8245245595613113, + "grad_norm": 1.02121102809906, + "learning_rate": 0.00019929728985167172, + "loss": 1.8457, + "step": 15638 + }, + { + "epoch": 1.824641232061603, + "grad_norm": 0.9720343351364136, + "learning_rate": 0.00019928297425122622, + "loss": 1.9295, + "step": 15639 + }, + { + "epoch": 1.8247579045618947, + "grad_norm": 1.0811493396759033, + "learning_rate": 0.00019926865815541976, + "loss": 2.109, + "step": 15640 + }, + { + "epoch": 1.8248745770621864, + "grad_norm": 1.0025827884674072, + "learning_rate": 0.00019925434156440073, + "loss": 2.0128, + "step": 15641 + }, + { + "epoch": 1.824991249562478, + "grad_norm": 1.200869083404541, + "learning_rate": 0.00019924002447831745, + "loss": 1.9288, + "step": 15642 + }, + { + "epoch": 1.8251079220627697, + "grad_norm": 0.979673445224762, + "learning_rate": 0.00019922570689731846, + "loss": 1.681, + "step": 15643 + }, + { + "epoch": 1.8252245945630614, + "grad_norm": 1.043320655822754, + "learning_rate": 0.00019921138882155213, + "loss": 1.9712, + "step": 15644 + }, + { + "epoch": 1.825341267063353, + "grad_norm": 1.2025001049041748, + "learning_rate": 0.0001991970702511669, + "loss": 2.0971, + "step": 15645 + }, + { + "epoch": 1.8254579395636448, + "grad_norm": 1.149627685546875, + "learning_rate": 0.00019918275118631118, + "loss": 2.1316, + "step": 15646 + }, + { + "epoch": 1.8255746120639365, + "grad_norm": 1.0877727270126343, + "learning_rate": 0.0001991684316271335, + "loss": 2.0847, + "step": 15647 + }, + { + "epoch": 1.8256912845642281, + "grad_norm": 1.0118498802185059, + "learning_rate": 0.00019915411157378225, + "loss": 1.9356, + "step": 15648 + }, + { + "epoch": 1.8258079570645198, + "grad_norm": 0.8941608667373657, + "learning_rate": 0.00019913979102640587, + "loss": 1.7455, + "step": 15649 + }, + { + "epoch": 1.8259246295648115, + "grad_norm": 1.0526604652404785, + "learning_rate": 0.00019912546998515288, + "loss": 1.9049, + "step": 15650 + }, + { + "epoch": 1.8260413020651032, + "grad_norm": 1.103309988975525, + "learning_rate": 0.0001991111484501717, + "loss": 1.9661, + "step": 15651 + }, + { + "epoch": 1.8261579745653949, + "grad_norm": 1.06355881690979, + "learning_rate": 0.00019909682642161078, + "loss": 1.9186, + "step": 15652 + }, + { + "epoch": 1.8262746470656865, + "grad_norm": 1.1829638481140137, + "learning_rate": 0.00019908250389961862, + "loss": 2.0849, + "step": 15653 + }, + { + "epoch": 1.8263913195659782, + "grad_norm": 1.222639560699463, + "learning_rate": 0.00019906818088434374, + "loss": 2.0597, + "step": 15654 + }, + { + "epoch": 1.82650799206627, + "grad_norm": 1.0768698453903198, + "learning_rate": 0.00019905385737593455, + "loss": 2.1264, + "step": 15655 + }, + { + "epoch": 1.8266246645665616, + "grad_norm": 1.0298750400543213, + "learning_rate": 0.00019903953337453958, + "loss": 2.2733, + "step": 15656 + }, + { + "epoch": 1.8267413370668533, + "grad_norm": 1.0192127227783203, + "learning_rate": 0.00019902520888030734, + "loss": 1.9706, + "step": 15657 + }, + { + "epoch": 1.826858009567145, + "grad_norm": 1.081323266029358, + "learning_rate": 0.0001990108838933863, + "loss": 2.0503, + "step": 15658 + }, + { + "epoch": 1.8269746820674366, + "grad_norm": 1.147857666015625, + "learning_rate": 0.00019899655841392505, + "loss": 2.2145, + "step": 15659 + }, + { + "epoch": 1.8270913545677283, + "grad_norm": 1.1877824068069458, + "learning_rate": 0.00019898223244207196, + "loss": 2.1349, + "step": 15660 + }, + { + "epoch": 1.82720802706802, + "grad_norm": 0.9399847984313965, + "learning_rate": 0.00019896790597797562, + "loss": 1.93, + "step": 15661 + }, + { + "epoch": 1.8273246995683117, + "grad_norm": 1.0616785287857056, + "learning_rate": 0.00019895357902178452, + "loss": 2.0434, + "step": 15662 + }, + { + "epoch": 1.8274413720686034, + "grad_norm": 1.0176308155059814, + "learning_rate": 0.00019893925157364725, + "loss": 2.05, + "step": 15663 + }, + { + "epoch": 1.827558044568895, + "grad_norm": 1.0956367254257202, + "learning_rate": 0.0001989249236337123, + "loss": 2.0501, + "step": 15664 + }, + { + "epoch": 1.8276747170691867, + "grad_norm": 1.2277443408966064, + "learning_rate": 0.0001989105952021282, + "loss": 2.17, + "step": 15665 + }, + { + "epoch": 1.8277913895694784, + "grad_norm": 1.2089214324951172, + "learning_rate": 0.00019889626627904352, + "loss": 2.0177, + "step": 15666 + }, + { + "epoch": 1.82790806206977, + "grad_norm": 1.0563892126083374, + "learning_rate": 0.0001988819368646068, + "loss": 2.042, + "step": 15667 + }, + { + "epoch": 1.8280247345700618, + "grad_norm": 1.1663947105407715, + "learning_rate": 0.00019886760695896652, + "loss": 2.0725, + "step": 15668 + }, + { + "epoch": 1.8281414070703534, + "grad_norm": 1.1761378049850464, + "learning_rate": 0.00019885327656227138, + "loss": 2.2295, + "step": 15669 + }, + { + "epoch": 1.8282580795706451, + "grad_norm": 1.2400075197219849, + "learning_rate": 0.0001988389456746698, + "loss": 2.1251, + "step": 15670 + }, + { + "epoch": 1.8283747520709368, + "grad_norm": 1.1404931545257568, + "learning_rate": 0.0001988246142963104, + "loss": 2.0891, + "step": 15671 + }, + { + "epoch": 1.8284914245712285, + "grad_norm": 1.0023757219314575, + "learning_rate": 0.00019881028242734174, + "loss": 2.0047, + "step": 15672 + }, + { + "epoch": 1.8286080970715202, + "grad_norm": 1.1190860271453857, + "learning_rate": 0.00019879595006791248, + "loss": 1.9742, + "step": 15673 + }, + { + "epoch": 1.8287247695718118, + "grad_norm": 1.11064875125885, + "learning_rate": 0.00019878161721817108, + "loss": 2.1103, + "step": 15674 + }, + { + "epoch": 1.8288414420721035, + "grad_norm": 1.1998087167739868, + "learning_rate": 0.00019876728387826622, + "loss": 2.0729, + "step": 15675 + }, + { + "epoch": 1.8289581145723952, + "grad_norm": 1.017905592918396, + "learning_rate": 0.00019875295004834643, + "loss": 2.0997, + "step": 15676 + }, + { + "epoch": 1.829074787072687, + "grad_norm": 1.322292685508728, + "learning_rate": 0.00019873861572856038, + "loss": 2.2927, + "step": 15677 + }, + { + "epoch": 1.8291914595729786, + "grad_norm": 1.238145351409912, + "learning_rate": 0.00019872428091905655, + "loss": 1.9791, + "step": 15678 + }, + { + "epoch": 1.8293081320732703, + "grad_norm": 1.1881071329116821, + "learning_rate": 0.00019870994561998367, + "loss": 2.0959, + "step": 15679 + }, + { + "epoch": 1.829424804573562, + "grad_norm": 1.0130491256713867, + "learning_rate": 0.00019869560983149025, + "loss": 2.1306, + "step": 15680 + }, + { + "epoch": 1.8295414770738536, + "grad_norm": 1.0691776275634766, + "learning_rate": 0.000198681273553725, + "loss": 2.0595, + "step": 15681 + }, + { + "epoch": 1.8296581495741453, + "grad_norm": 1.4721109867095947, + "learning_rate": 0.00019866693678683648, + "loss": 2.3326, + "step": 15682 + }, + { + "epoch": 1.829774822074437, + "grad_norm": 1.015737771987915, + "learning_rate": 0.0001986525995309734, + "loss": 1.9355, + "step": 15683 + }, + { + "epoch": 1.8298914945747287, + "grad_norm": 1.2584973573684692, + "learning_rate": 0.0001986382617862843, + "loss": 2.07, + "step": 15684 + }, + { + "epoch": 1.8300081670750203, + "grad_norm": 1.084733009338379, + "learning_rate": 0.00019862392355291788, + "loss": 2.0388, + "step": 15685 + }, + { + "epoch": 1.830124839575312, + "grad_norm": 1.0280416011810303, + "learning_rate": 0.00019860958483102273, + "loss": 1.8653, + "step": 15686 + }, + { + "epoch": 1.8302415120756037, + "grad_norm": 1.0607507228851318, + "learning_rate": 0.00019859524562074754, + "loss": 2.1112, + "step": 15687 + }, + { + "epoch": 1.8303581845758954, + "grad_norm": 1.118338704109192, + "learning_rate": 0.0001985809059222409, + "loss": 2.2219, + "step": 15688 + }, + { + "epoch": 1.830474857076187, + "grad_norm": 1.1735605001449585, + "learning_rate": 0.00019856656573565156, + "loss": 1.996, + "step": 15689 + }, + { + "epoch": 1.8305915295764787, + "grad_norm": 1.101800799369812, + "learning_rate": 0.00019855222506112813, + "loss": 1.9968, + "step": 15690 + }, + { + "epoch": 1.8307082020767704, + "grad_norm": 1.09415864944458, + "learning_rate": 0.00019853788389881928, + "loss": 1.9701, + "step": 15691 + }, + { + "epoch": 1.8308248745770621, + "grad_norm": 1.0466279983520508, + "learning_rate": 0.00019852354224887366, + "loss": 1.9044, + "step": 15692 + }, + { + "epoch": 1.8309415470773538, + "grad_norm": 1.1603912115097046, + "learning_rate": 0.00019850920011144004, + "loss": 2.0122, + "step": 15693 + }, + { + "epoch": 1.8310582195776455, + "grad_norm": 1.1852680444717407, + "learning_rate": 0.000198494857486667, + "loss": 2.1977, + "step": 15694 + }, + { + "epoch": 1.8311748920779372, + "grad_norm": 1.0460765361785889, + "learning_rate": 0.00019848051437470327, + "loss": 1.8734, + "step": 15695 + }, + { + "epoch": 1.8312915645782288, + "grad_norm": 1.1486358642578125, + "learning_rate": 0.0001984661707756976, + "loss": 2.2049, + "step": 15696 + }, + { + "epoch": 1.8314082370785205, + "grad_norm": 1.0475010871887207, + "learning_rate": 0.00019845182668979855, + "loss": 2.1026, + "step": 15697 + }, + { + "epoch": 1.8315249095788122, + "grad_norm": 1.132030963897705, + "learning_rate": 0.00019843748211715497, + "loss": 2.1289, + "step": 15698 + }, + { + "epoch": 1.8316415820791039, + "grad_norm": 0.9924505949020386, + "learning_rate": 0.00019842313705791545, + "loss": 2.0083, + "step": 15699 + }, + { + "epoch": 1.8317582545793956, + "grad_norm": 1.060895323753357, + "learning_rate": 0.0001984087915122288, + "loss": 2.0502, + "step": 15700 + }, + { + "epoch": 1.8318749270796872, + "grad_norm": 0.9757795929908752, + "learning_rate": 0.00019839444548024372, + "loss": 2.0905, + "step": 15701 + }, + { + "epoch": 1.831991599579979, + "grad_norm": 1.1058757305145264, + "learning_rate": 0.00019838009896210883, + "loss": 2.2987, + "step": 15702 + }, + { + "epoch": 1.8321082720802706, + "grad_norm": 1.0834542512893677, + "learning_rate": 0.000198365751957973, + "loss": 1.9034, + "step": 15703 + }, + { + "epoch": 1.8322249445805623, + "grad_norm": 1.2797915935516357, + "learning_rate": 0.0001983514044679849, + "loss": 2.1012, + "step": 15704 + }, + { + "epoch": 1.832341617080854, + "grad_norm": 1.0900808572769165, + "learning_rate": 0.00019833705649229325, + "loss": 2.0399, + "step": 15705 + }, + { + "epoch": 1.8324582895811456, + "grad_norm": 1.0008338689804077, + "learning_rate": 0.00019832270803104679, + "loss": 1.9672, + "step": 15706 + }, + { + "epoch": 1.8325749620814373, + "grad_norm": 1.2088652849197388, + "learning_rate": 0.00019830835908439437, + "loss": 2.0753, + "step": 15707 + }, + { + "epoch": 1.832691634581729, + "grad_norm": 1.1933584213256836, + "learning_rate": 0.0001982940096524846, + "loss": 2.1108, + "step": 15708 + }, + { + "epoch": 1.8328083070820207, + "grad_norm": 1.4738154411315918, + "learning_rate": 0.00019827965973546633, + "loss": 2.2738, + "step": 15709 + }, + { + "epoch": 1.8329249795823124, + "grad_norm": 0.9795435070991516, + "learning_rate": 0.00019826530933348836, + "loss": 2.0111, + "step": 15710 + }, + { + "epoch": 1.833041652082604, + "grad_norm": 0.9660933017730713, + "learning_rate": 0.00019825095844669933, + "loss": 1.9938, + "step": 15711 + }, + { + "epoch": 1.8331583245828957, + "grad_norm": 1.1103692054748535, + "learning_rate": 0.0001982366070752481, + "loss": 2.0592, + "step": 15712 + }, + { + "epoch": 1.8332749970831874, + "grad_norm": 1.003990650177002, + "learning_rate": 0.00019822225521928347, + "loss": 2.0504, + "step": 15713 + }, + { + "epoch": 1.833391669583479, + "grad_norm": 1.0333322286605835, + "learning_rate": 0.00019820790287895416, + "loss": 1.9886, + "step": 15714 + }, + { + "epoch": 1.8335083420837708, + "grad_norm": 1.1088874340057373, + "learning_rate": 0.00019819355005440897, + "loss": 2.0938, + "step": 15715 + }, + { + "epoch": 1.8336250145840625, + "grad_norm": 1.0899264812469482, + "learning_rate": 0.00019817919674579673, + "loss": 2.0532, + "step": 15716 + }, + { + "epoch": 1.8337416870843541, + "grad_norm": 1.132809042930603, + "learning_rate": 0.0001981648429532662, + "loss": 2.1392, + "step": 15717 + }, + { + "epoch": 1.8338583595846458, + "grad_norm": 1.0906906127929688, + "learning_rate": 0.00019815048867696626, + "loss": 2.0185, + "step": 15718 + }, + { + "epoch": 1.8339750320849375, + "grad_norm": 1.0716713666915894, + "learning_rate": 0.00019813613391704563, + "loss": 2.069, + "step": 15719 + }, + { + "epoch": 1.8340917045852292, + "grad_norm": 0.9474628567695618, + "learning_rate": 0.00019812177867365314, + "loss": 2.0944, + "step": 15720 + }, + { + "epoch": 1.8342083770855209, + "grad_norm": 1.142272710800171, + "learning_rate": 0.00019810742294693766, + "loss": 2.0866, + "step": 15721 + }, + { + "epoch": 1.8343250495858126, + "grad_norm": 1.1894489526748657, + "learning_rate": 0.00019809306673704795, + "loss": 2.0929, + "step": 15722 + }, + { + "epoch": 1.8344417220861042, + "grad_norm": 1.096695899963379, + "learning_rate": 0.00019807871004413292, + "loss": 2.0905, + "step": 15723 + }, + { + "epoch": 1.834558394586396, + "grad_norm": 1.110616683959961, + "learning_rate": 0.00019806435286834134, + "loss": 1.9446, + "step": 15724 + }, + { + "epoch": 1.8346750670866876, + "grad_norm": 1.1359962224960327, + "learning_rate": 0.00019804999520982202, + "loss": 2.135, + "step": 15725 + }, + { + "epoch": 1.8347917395869793, + "grad_norm": 1.0229105949401855, + "learning_rate": 0.00019803563706872392, + "loss": 1.8164, + "step": 15726 + }, + { + "epoch": 1.834908412087271, + "grad_norm": 1.2007445096969604, + "learning_rate": 0.0001980212784451958, + "loss": 2.2464, + "step": 15727 + }, + { + "epoch": 1.8350250845875626, + "grad_norm": 1.1290202140808105, + "learning_rate": 0.00019800691933938655, + "loss": 2.0205, + "step": 15728 + }, + { + "epoch": 1.8351417570878543, + "grad_norm": 1.1163325309753418, + "learning_rate": 0.00019799255975144498, + "loss": 1.8896, + "step": 15729 + }, + { + "epoch": 1.835258429588146, + "grad_norm": 0.9338106513023376, + "learning_rate": 0.00019797819968152, + "loss": 1.9351, + "step": 15730 + }, + { + "epoch": 1.8353751020884377, + "grad_norm": 1.1735297441482544, + "learning_rate": 0.00019796383912976042, + "loss": 2.0715, + "step": 15731 + }, + { + "epoch": 1.8354917745887294, + "grad_norm": 1.0750620365142822, + "learning_rate": 0.0001979494780963152, + "loss": 1.9385, + "step": 15732 + }, + { + "epoch": 1.835608447089021, + "grad_norm": 1.0651658773422241, + "learning_rate": 0.00019793511658133315, + "loss": 1.7183, + "step": 15733 + }, + { + "epoch": 1.8357251195893127, + "grad_norm": 1.0319730043411255, + "learning_rate": 0.0001979207545849632, + "loss": 2.0445, + "step": 15734 + }, + { + "epoch": 1.8358417920896044, + "grad_norm": 1.1035139560699463, + "learning_rate": 0.00019790639210735426, + "loss": 2.0301, + "step": 15735 + }, + { + "epoch": 1.835958464589896, + "grad_norm": 1.150307297706604, + "learning_rate": 0.00019789202914865522, + "loss": 2.1011, + "step": 15736 + }, + { + "epoch": 1.8360751370901878, + "grad_norm": 1.1088695526123047, + "learning_rate": 0.00019787766570901485, + "loss": 2.0578, + "step": 15737 + }, + { + "epoch": 1.8361918095904795, + "grad_norm": 0.9770944118499756, + "learning_rate": 0.00019786330178858222, + "loss": 1.8753, + "step": 15738 + }, + { + "epoch": 1.8363084820907711, + "grad_norm": 1.1786919832229614, + "learning_rate": 0.0001978489373875061, + "loss": 2.0802, + "step": 15739 + }, + { + "epoch": 1.8364251545910628, + "grad_norm": 1.1823164224624634, + "learning_rate": 0.00019783457250593557, + "loss": 1.8268, + "step": 15740 + }, + { + "epoch": 1.8365418270913545, + "grad_norm": 1.2396657466888428, + "learning_rate": 0.00019782020714401939, + "loss": 2.1845, + "step": 15741 + }, + { + "epoch": 1.8366584995916462, + "grad_norm": 1.2030466794967651, + "learning_rate": 0.00019780584130190657, + "loss": 2.093, + "step": 15742 + }, + { + "epoch": 1.8367751720919379, + "grad_norm": 1.2828627824783325, + "learning_rate": 0.000197791474979746, + "loss": 2.2291, + "step": 15743 + }, + { + "epoch": 1.8368918445922295, + "grad_norm": 1.3603867292404175, + "learning_rate": 0.00019777710817768664, + "loss": 2.1109, + "step": 15744 + }, + { + "epoch": 1.8370085170925212, + "grad_norm": 1.0647550821304321, + "learning_rate": 0.00019776274089587743, + "loss": 1.9887, + "step": 15745 + }, + { + "epoch": 1.837125189592813, + "grad_norm": 1.1601494550704956, + "learning_rate": 0.00019774837313446729, + "loss": 2.1011, + "step": 15746 + }, + { + "epoch": 1.8372418620931046, + "grad_norm": 1.1589856147766113, + "learning_rate": 0.0001977340048936052, + "loss": 2.1445, + "step": 15747 + }, + { + "epoch": 1.8373585345933963, + "grad_norm": 1.0278196334838867, + "learning_rate": 0.00019771963617344008, + "loss": 1.8831, + "step": 15748 + }, + { + "epoch": 1.837475207093688, + "grad_norm": 1.120967149734497, + "learning_rate": 0.00019770526697412093, + "loss": 1.9347, + "step": 15749 + }, + { + "epoch": 1.8375918795939796, + "grad_norm": 1.1276086568832397, + "learning_rate": 0.00019769089729579665, + "loss": 1.9571, + "step": 15750 + }, + { + "epoch": 1.8377085520942713, + "grad_norm": 1.112043857574463, + "learning_rate": 0.0001976765271386163, + "loss": 1.9518, + "step": 15751 + }, + { + "epoch": 1.837825224594563, + "grad_norm": 1.1886216402053833, + "learning_rate": 0.00019766215650272875, + "loss": 2.0795, + "step": 15752 + }, + { + "epoch": 1.8379418970948547, + "grad_norm": 1.4659790992736816, + "learning_rate": 0.00019764778538828304, + "loss": 2.0518, + "step": 15753 + }, + { + "epoch": 1.8380585695951464, + "grad_norm": 1.097679615020752, + "learning_rate": 0.00019763341379542824, + "loss": 2.0455, + "step": 15754 + }, + { + "epoch": 1.838175242095438, + "grad_norm": 1.2193714380264282, + "learning_rate": 0.00019761904172431317, + "loss": 2.0431, + "step": 15755 + }, + { + "epoch": 1.8382919145957297, + "grad_norm": 1.1060011386871338, + "learning_rate": 0.0001976046691750869, + "loss": 2.0082, + "step": 15756 + }, + { + "epoch": 1.8384085870960214, + "grad_norm": 1.1155669689178467, + "learning_rate": 0.0001975902961478984, + "loss": 2.1339, + "step": 15757 + }, + { + "epoch": 1.838525259596313, + "grad_norm": 1.293083667755127, + "learning_rate": 0.00019757592264289675, + "loss": 2.2105, + "step": 15758 + }, + { + "epoch": 1.8386419320966048, + "grad_norm": 1.1361325979232788, + "learning_rate": 0.0001975615486602309, + "loss": 2.0972, + "step": 15759 + }, + { + "epoch": 1.8387586045968964, + "grad_norm": 1.1969972848892212, + "learning_rate": 0.00019754717420004985, + "loss": 1.9368, + "step": 15760 + }, + { + "epoch": 1.8388752770971881, + "grad_norm": 1.1450731754302979, + "learning_rate": 0.00019753279926250266, + "loss": 2.2889, + "step": 15761 + }, + { + "epoch": 1.8389919495974798, + "grad_norm": 1.126548171043396, + "learning_rate": 0.00019751842384773836, + "loss": 2.0868, + "step": 15762 + }, + { + "epoch": 1.8391086220977715, + "grad_norm": 1.013347864151001, + "learning_rate": 0.00019750404795590591, + "loss": 2.0463, + "step": 15763 + }, + { + "epoch": 1.8392252945980632, + "grad_norm": 1.235353946685791, + "learning_rate": 0.00019748967158715442, + "loss": 2.1787, + "step": 15764 + }, + { + "epoch": 1.8393419670983548, + "grad_norm": 1.2388980388641357, + "learning_rate": 0.0001974752947416329, + "loss": 2.095, + "step": 15765 + }, + { + "epoch": 1.8394586395986465, + "grad_norm": 1.1598576307296753, + "learning_rate": 0.00019746091741949036, + "loss": 2.1135, + "step": 15766 + }, + { + "epoch": 1.8395753120989382, + "grad_norm": 1.1774495840072632, + "learning_rate": 0.0001974465396208759, + "loss": 2.0504, + "step": 15767 + }, + { + "epoch": 1.83969198459923, + "grad_norm": 1.1221264600753784, + "learning_rate": 0.0001974321613459385, + "loss": 2.1128, + "step": 15768 + }, + { + "epoch": 1.8398086570995216, + "grad_norm": 1.3303325176239014, + "learning_rate": 0.00019741778259482735, + "loss": 2.189, + "step": 15769 + }, + { + "epoch": 1.8399253295998133, + "grad_norm": 1.0476018190383911, + "learning_rate": 0.00019740340336769132, + "loss": 2.0874, + "step": 15770 + }, + { + "epoch": 1.840042002100105, + "grad_norm": 1.1425886154174805, + "learning_rate": 0.00019738902366467966, + "loss": 2.2168, + "step": 15771 + }, + { + "epoch": 1.8401586746003966, + "grad_norm": 1.1176692247390747, + "learning_rate": 0.0001973746434859414, + "loss": 2.1623, + "step": 15772 + }, + { + "epoch": 1.8402753471006883, + "grad_norm": 1.0451363325119019, + "learning_rate": 0.00019736026283162557, + "loss": 1.9725, + "step": 15773 + }, + { + "epoch": 1.84039201960098, + "grad_norm": 1.1111421585083008, + "learning_rate": 0.00019734588170188124, + "loss": 2.2155, + "step": 15774 + }, + { + "epoch": 1.8405086921012717, + "grad_norm": 1.055235505104065, + "learning_rate": 0.00019733150009685753, + "loss": 2.1437, + "step": 15775 + }, + { + "epoch": 1.8406253646015633, + "grad_norm": 1.200207233428955, + "learning_rate": 0.00019731711801670356, + "loss": 2.1753, + "step": 15776 + }, + { + "epoch": 1.840742037101855, + "grad_norm": 1.1177740097045898, + "learning_rate": 0.00019730273546156838, + "loss": 1.9083, + "step": 15777 + }, + { + "epoch": 1.8408587096021467, + "grad_norm": 0.9900450706481934, + "learning_rate": 0.00019728835243160113, + "loss": 1.9963, + "step": 15778 + }, + { + "epoch": 1.8409753821024384, + "grad_norm": 1.0743640661239624, + "learning_rate": 0.0001972739689269509, + "loss": 2.1602, + "step": 15779 + }, + { + "epoch": 1.84109205460273, + "grad_norm": 1.2709347009658813, + "learning_rate": 0.0001972595849477668, + "loss": 2.2247, + "step": 15780 + }, + { + "epoch": 1.8412087271030217, + "grad_norm": 1.123620629310608, + "learning_rate": 0.00019724520049419796, + "loss": 2.1276, + "step": 15781 + }, + { + "epoch": 1.8413253996033134, + "grad_norm": 1.104706883430481, + "learning_rate": 0.00019723081556639345, + "loss": 2.1045, + "step": 15782 + }, + { + "epoch": 1.841442072103605, + "grad_norm": 1.3289155960083008, + "learning_rate": 0.00019721643016450247, + "loss": 2.0895, + "step": 15783 + }, + { + "epoch": 1.8415587446038968, + "grad_norm": 1.333052158355713, + "learning_rate": 0.00019720204428867405, + "loss": 2.2083, + "step": 15784 + }, + { + "epoch": 1.8416754171041885, + "grad_norm": 1.084774136543274, + "learning_rate": 0.0001971876579390575, + "loss": 1.9504, + "step": 15785 + }, + { + "epoch": 1.8417920896044802, + "grad_norm": 1.126585602760315, + "learning_rate": 0.00019717327111580182, + "loss": 2.2682, + "step": 15786 + }, + { + "epoch": 1.8419087621047718, + "grad_norm": 0.9209029078483582, + "learning_rate": 0.0001971588838190562, + "loss": 2.0724, + "step": 15787 + }, + { + "epoch": 1.8420254346050635, + "grad_norm": 1.2564500570297241, + "learning_rate": 0.00019714449604896979, + "loss": 2.1155, + "step": 15788 + }, + { + "epoch": 1.8421421071053552, + "grad_norm": 1.3315705060958862, + "learning_rate": 0.00019713010780569176, + "loss": 2.1986, + "step": 15789 + }, + { + "epoch": 1.8422587796056469, + "grad_norm": 1.07728910446167, + "learning_rate": 0.0001971157190893712, + "loss": 2.1554, + "step": 15790 + }, + { + "epoch": 1.8423754521059386, + "grad_norm": 1.1044856309890747, + "learning_rate": 0.00019710132990015736, + "loss": 2.0623, + "step": 15791 + }, + { + "epoch": 1.8424921246062302, + "grad_norm": 1.209755301475525, + "learning_rate": 0.0001970869402381994, + "loss": 2.188, + "step": 15792 + }, + { + "epoch": 1.842608797106522, + "grad_norm": 1.2461638450622559, + "learning_rate": 0.00019707255010364643, + "loss": 1.984, + "step": 15793 + }, + { + "epoch": 1.8427254696068136, + "grad_norm": 1.0497403144836426, + "learning_rate": 0.00019705815949664775, + "loss": 2.2615, + "step": 15794 + }, + { + "epoch": 1.8428421421071053, + "grad_norm": 1.1706327199935913, + "learning_rate": 0.00019704376841735244, + "loss": 1.9417, + "step": 15795 + }, + { + "epoch": 1.842958814607397, + "grad_norm": 1.030310869216919, + "learning_rate": 0.0001970293768659097, + "loss": 1.9164, + "step": 15796 + }, + { + "epoch": 1.8430754871076886, + "grad_norm": 1.2063117027282715, + "learning_rate": 0.00019701498484246877, + "loss": 2.3166, + "step": 15797 + }, + { + "epoch": 1.8431921596079803, + "grad_norm": 1.1500760316848755, + "learning_rate": 0.00019700059234717884, + "loss": 2.1429, + "step": 15798 + }, + { + "epoch": 1.843308832108272, + "grad_norm": 1.023737907409668, + "learning_rate": 0.00019698619938018908, + "loss": 2.0701, + "step": 15799 + }, + { + "epoch": 1.8434255046085637, + "grad_norm": 1.0999122858047485, + "learning_rate": 0.00019697180594164875, + "loss": 2.0453, + "step": 15800 + }, + { + "epoch": 1.8435421771088554, + "grad_norm": 1.4136900901794434, + "learning_rate": 0.00019695741203170703, + "loss": 1.9956, + "step": 15801 + }, + { + "epoch": 1.843658849609147, + "grad_norm": 1.1082178354263306, + "learning_rate": 0.0001969430176505132, + "loss": 1.9991, + "step": 15802 + }, + { + "epoch": 1.8437755221094387, + "grad_norm": 1.09379243850708, + "learning_rate": 0.00019692862279821642, + "loss": 2.0446, + "step": 15803 + }, + { + "epoch": 1.8438921946097304, + "grad_norm": 1.125246286392212, + "learning_rate": 0.00019691422747496594, + "loss": 2.1122, + "step": 15804 + }, + { + "epoch": 1.844008867110022, + "grad_norm": 1.05366849899292, + "learning_rate": 0.00019689983168091102, + "loss": 2.0141, + "step": 15805 + }, + { + "epoch": 1.8441255396103138, + "grad_norm": 1.1993519067764282, + "learning_rate": 0.0001968854354162008, + "loss": 2.085, + "step": 15806 + }, + { + "epoch": 1.8442422121106055, + "grad_norm": 1.1283634901046753, + "learning_rate": 0.00019687103868098465, + "loss": 2.0583, + "step": 15807 + }, + { + "epoch": 1.8443588846108971, + "grad_norm": 1.076792597770691, + "learning_rate": 0.00019685664147541174, + "loss": 2.0244, + "step": 15808 + }, + { + "epoch": 1.8444755571111888, + "grad_norm": 1.0612846612930298, + "learning_rate": 0.0001968422437996313, + "loss": 1.9978, + "step": 15809 + }, + { + "epoch": 1.8445922296114805, + "grad_norm": 1.0661746263504028, + "learning_rate": 0.00019682784565379274, + "loss": 1.9668, + "step": 15810 + }, + { + "epoch": 1.8447089021117722, + "grad_norm": 1.0988160371780396, + "learning_rate": 0.00019681344703804515, + "loss": 2.1455, + "step": 15811 + }, + { + "epoch": 1.8448255746120639, + "grad_norm": 1.0547114610671997, + "learning_rate": 0.00019679904795253793, + "loss": 1.9627, + "step": 15812 + }, + { + "epoch": 1.8449422471123555, + "grad_norm": 1.1116276979446411, + "learning_rate": 0.00019678464839742031, + "loss": 1.966, + "step": 15813 + }, + { + "epoch": 1.8450589196126472, + "grad_norm": 1.0722042322158813, + "learning_rate": 0.0001967702483728415, + "loss": 2.1354, + "step": 15814 + }, + { + "epoch": 1.845175592112939, + "grad_norm": 0.9244295358657837, + "learning_rate": 0.00019675584787895084, + "loss": 1.9449, + "step": 15815 + }, + { + "epoch": 1.8452922646132306, + "grad_norm": 1.1208102703094482, + "learning_rate": 0.00019674144691589766, + "loss": 2.0817, + "step": 15816 + }, + { + "epoch": 1.8454089371135223, + "grad_norm": 1.0442067384719849, + "learning_rate": 0.00019672704548383117, + "loss": 1.9156, + "step": 15817 + }, + { + "epoch": 1.845525609613814, + "grad_norm": 1.051240086555481, + "learning_rate": 0.00019671264358290074, + "loss": 2.0285, + "step": 15818 + }, + { + "epoch": 1.8456422821141056, + "grad_norm": 1.0344096422195435, + "learning_rate": 0.00019669824121325562, + "loss": 2.0311, + "step": 15819 + }, + { + "epoch": 1.8457589546143973, + "grad_norm": 1.064158320426941, + "learning_rate": 0.00019668383837504516, + "loss": 2.1371, + "step": 15820 + }, + { + "epoch": 1.845875627114689, + "grad_norm": 1.1838208436965942, + "learning_rate": 0.0001966694350684186, + "loss": 2.0273, + "step": 15821 + }, + { + "epoch": 1.8459922996149807, + "grad_norm": 1.0804314613342285, + "learning_rate": 0.00019665503129352542, + "loss": 2.209, + "step": 15822 + }, + { + "epoch": 1.8461089721152724, + "grad_norm": 1.234552264213562, + "learning_rate": 0.00019664062705051472, + "loss": 2.0227, + "step": 15823 + }, + { + "epoch": 1.846225644615564, + "grad_norm": 1.21317458152771, + "learning_rate": 0.000196626222339536, + "loss": 2.0821, + "step": 15824 + }, + { + "epoch": 1.8463423171158557, + "grad_norm": 1.1642590761184692, + "learning_rate": 0.00019661181716073848, + "loss": 2.2362, + "step": 15825 + }, + { + "epoch": 1.8464589896161474, + "grad_norm": 1.046035885810852, + "learning_rate": 0.00019659741151427162, + "loss": 2.1743, + "step": 15826 + }, + { + "epoch": 1.846575662116439, + "grad_norm": 1.1430485248565674, + "learning_rate": 0.00019658300540028464, + "loss": 2.0215, + "step": 15827 + }, + { + "epoch": 1.8466923346167308, + "grad_norm": 1.1910686492919922, + "learning_rate": 0.00019656859881892695, + "loss": 2.0416, + "step": 15828 + }, + { + "epoch": 1.8468090071170224, + "grad_norm": 1.4857593774795532, + "learning_rate": 0.0001965541917703479, + "loss": 2.0529, + "step": 15829 + }, + { + "epoch": 1.8469256796173141, + "grad_norm": 1.117864727973938, + "learning_rate": 0.00019653978425469682, + "loss": 2.23, + "step": 15830 + }, + { + "epoch": 1.8470423521176058, + "grad_norm": 1.080359935760498, + "learning_rate": 0.00019652537627212318, + "loss": 2.0875, + "step": 15831 + }, + { + "epoch": 1.8471590246178975, + "grad_norm": 1.2068192958831787, + "learning_rate": 0.0001965109678227761, + "loss": 2.0967, + "step": 15832 + }, + { + "epoch": 1.8472756971181892, + "grad_norm": 0.9780882596969604, + "learning_rate": 0.0001964965589068052, + "loss": 1.9414, + "step": 15833 + }, + { + "epoch": 1.8473923696184809, + "grad_norm": 1.1635534763336182, + "learning_rate": 0.00019648214952435974, + "loss": 2.215, + "step": 15834 + }, + { + "epoch": 1.8475090421187725, + "grad_norm": 1.2650800943374634, + "learning_rate": 0.0001964677396755891, + "loss": 2.078, + "step": 15835 + }, + { + "epoch": 1.8476257146190642, + "grad_norm": 1.1458780765533447, + "learning_rate": 0.00019645332936064268, + "loss": 2.0251, + "step": 15836 + }, + { + "epoch": 1.847742387119356, + "grad_norm": 1.140524983406067, + "learning_rate": 0.00019643891857966988, + "loss": 2.0642, + "step": 15837 + }, + { + "epoch": 1.8478590596196476, + "grad_norm": 1.1130242347717285, + "learning_rate": 0.00019642450733282007, + "loss": 2.0884, + "step": 15838 + }, + { + "epoch": 1.8479757321199393, + "grad_norm": 1.0881311893463135, + "learning_rate": 0.00019641009562024273, + "loss": 2.0654, + "step": 15839 + }, + { + "epoch": 1.848092404620231, + "grad_norm": 1.1867772340774536, + "learning_rate": 0.0001963956834420872, + "loss": 2.1303, + "step": 15840 + }, + { + "epoch": 1.8482090771205226, + "grad_norm": 1.007053256034851, + "learning_rate": 0.00019638127079850282, + "loss": 1.984, + "step": 15841 + }, + { + "epoch": 1.8483257496208143, + "grad_norm": 1.3030012845993042, + "learning_rate": 0.00019636685768963916, + "loss": 2.0936, + "step": 15842 + }, + { + "epoch": 1.848442422121106, + "grad_norm": 1.0015366077423096, + "learning_rate": 0.00019635244411564552, + "loss": 2.0832, + "step": 15843 + }, + { + "epoch": 1.8485590946213977, + "grad_norm": 1.0346804857254028, + "learning_rate": 0.00019633803007667135, + "loss": 2.0973, + "step": 15844 + }, + { + "epoch": 1.8486757671216894, + "grad_norm": 1.2326436042785645, + "learning_rate": 0.0001963236155728661, + "loss": 2.1435, + "step": 15845 + }, + { + "epoch": 1.848792439621981, + "grad_norm": 1.3280110359191895, + "learning_rate": 0.00019630920060437923, + "loss": 2.0711, + "step": 15846 + }, + { + "epoch": 1.8489091121222727, + "grad_norm": 0.9798626899719238, + "learning_rate": 0.0001962947851713601, + "loss": 1.8408, + "step": 15847 + }, + { + "epoch": 1.8490257846225644, + "grad_norm": 1.428849697113037, + "learning_rate": 0.00019628036927395824, + "loss": 2.0837, + "step": 15848 + }, + { + "epoch": 1.849142457122856, + "grad_norm": 1.1304173469543457, + "learning_rate": 0.000196265952912323, + "loss": 2.0682, + "step": 15849 + }, + { + "epoch": 1.8492591296231478, + "grad_norm": 1.0451701879501343, + "learning_rate": 0.0001962515360866039, + "loss": 1.9222, + "step": 15850 + }, + { + "epoch": 1.8493758021234394, + "grad_norm": 0.9868384003639221, + "learning_rate": 0.0001962371187969504, + "loss": 1.8814, + "step": 15851 + }, + { + "epoch": 1.8494924746237311, + "grad_norm": 1.2208008766174316, + "learning_rate": 0.00019622270104351194, + "loss": 2.1488, + "step": 15852 + }, + { + "epoch": 1.8496091471240228, + "grad_norm": 1.04853355884552, + "learning_rate": 0.00019620828282643799, + "loss": 1.9116, + "step": 15853 + }, + { + "epoch": 1.8497258196243145, + "grad_norm": 1.0464588403701782, + "learning_rate": 0.00019619386414587803, + "loss": 1.8899, + "step": 15854 + }, + { + "epoch": 1.8498424921246062, + "grad_norm": 0.9705399870872498, + "learning_rate": 0.00019617944500198155, + "loss": 1.7852, + "step": 15855 + }, + { + "epoch": 1.8499591646248978, + "grad_norm": 1.056857943534851, + "learning_rate": 0.00019616502539489798, + "loss": 2.1371, + "step": 15856 + }, + { + "epoch": 1.8500758371251895, + "grad_norm": 1.236233115196228, + "learning_rate": 0.0001961506053247769, + "loss": 2.1669, + "step": 15857 + }, + { + "epoch": 1.8501925096254812, + "grad_norm": 1.1054373979568481, + "learning_rate": 0.00019613618479176768, + "loss": 2.2109, + "step": 15858 + }, + { + "epoch": 1.8503091821257729, + "grad_norm": 1.1045690774917603, + "learning_rate": 0.00019612176379601995, + "loss": 1.8401, + "step": 15859 + }, + { + "epoch": 1.8504258546260646, + "grad_norm": 1.0421507358551025, + "learning_rate": 0.00019610734233768305, + "loss": 2.1141, + "step": 15860 + }, + { + "epoch": 1.8505425271263563, + "grad_norm": 1.085472822189331, + "learning_rate": 0.00019609292041690665, + "loss": 2.1324, + "step": 15861 + }, + { + "epoch": 1.850659199626648, + "grad_norm": 1.0533212423324585, + "learning_rate": 0.00019607849803384014, + "loss": 2.0382, + "step": 15862 + }, + { + "epoch": 1.8507758721269396, + "grad_norm": 1.1702536344528198, + "learning_rate": 0.00019606407518863304, + "loss": 2.1902, + "step": 15863 + }, + { + "epoch": 1.8508925446272313, + "grad_norm": 1.0584052801132202, + "learning_rate": 0.00019604965188143496, + "loss": 1.981, + "step": 15864 + }, + { + "epoch": 1.851009217127523, + "grad_norm": 0.9550347924232483, + "learning_rate": 0.00019603522811239546, + "loss": 1.8554, + "step": 15865 + }, + { + "epoch": 1.8511258896278147, + "grad_norm": 1.2434031963348389, + "learning_rate": 0.00019602080388166388, + "loss": 2.174, + "step": 15866 + }, + { + "epoch": 1.8512425621281063, + "grad_norm": 1.1718007326126099, + "learning_rate": 0.0001960063791893899, + "loss": 2.1594, + "step": 15867 + }, + { + "epoch": 1.851359234628398, + "grad_norm": 1.0557812452316284, + "learning_rate": 0.00019599195403572302, + "loss": 2.0284, + "step": 15868 + }, + { + "epoch": 1.8514759071286897, + "grad_norm": 1.1601641178131104, + "learning_rate": 0.00019597752842081276, + "loss": 2.105, + "step": 15869 + }, + { + "epoch": 1.8515925796289814, + "grad_norm": 1.1605826616287231, + "learning_rate": 0.0001959631023448087, + "loss": 2.0326, + "step": 15870 + }, + { + "epoch": 1.851709252129273, + "grad_norm": 1.0843361616134644, + "learning_rate": 0.00019594867580786045, + "loss": 1.9765, + "step": 15871 + }, + { + "epoch": 1.8518259246295647, + "grad_norm": 1.1023552417755127, + "learning_rate": 0.00019593424881011743, + "loss": 2.144, + "step": 15872 + }, + { + "epoch": 1.8519425971298564, + "grad_norm": 1.2565891742706299, + "learning_rate": 0.0001959198213517293, + "loss": 2.0041, + "step": 15873 + }, + { + "epoch": 1.852059269630148, + "grad_norm": 1.2416259050369263, + "learning_rate": 0.00019590539343284556, + "loss": 2.0501, + "step": 15874 + }, + { + "epoch": 1.8521759421304398, + "grad_norm": 1.2649372816085815, + "learning_rate": 0.00019589096505361587, + "loss": 2.1676, + "step": 15875 + }, + { + "epoch": 1.8522926146307315, + "grad_norm": 1.205300211906433, + "learning_rate": 0.0001958765362141898, + "loss": 2.1158, + "step": 15876 + }, + { + "epoch": 1.8524092871310232, + "grad_norm": 1.1368073225021362, + "learning_rate": 0.00019586210691471684, + "loss": 2.0269, + "step": 15877 + }, + { + "epoch": 1.8525259596313148, + "grad_norm": 1.1509958505630493, + "learning_rate": 0.00019584767715534669, + "loss": 2.0812, + "step": 15878 + }, + { + "epoch": 1.8526426321316065, + "grad_norm": 1.170457124710083, + "learning_rate": 0.00019583324693622884, + "loss": 1.9647, + "step": 15879 + }, + { + "epoch": 1.8527593046318982, + "grad_norm": 1.0386195182800293, + "learning_rate": 0.00019581881625751297, + "loss": 2.0673, + "step": 15880 + }, + { + "epoch": 1.8528759771321899, + "grad_norm": 1.034377932548523, + "learning_rate": 0.00019580438511934863, + "loss": 1.9708, + "step": 15881 + }, + { + "epoch": 1.8529926496324816, + "grad_norm": 1.107712984085083, + "learning_rate": 0.00019578995352188548, + "loss": 2.0263, + "step": 15882 + }, + { + "epoch": 1.8531093221327732, + "grad_norm": 1.0022377967834473, + "learning_rate": 0.00019577552146527304, + "loss": 1.9089, + "step": 15883 + }, + { + "epoch": 1.853225994633065, + "grad_norm": 1.143233060836792, + "learning_rate": 0.00019576108894966095, + "loss": 1.8949, + "step": 15884 + }, + { + "epoch": 1.8533426671333566, + "grad_norm": 1.1381186246871948, + "learning_rate": 0.00019574665597519892, + "loss": 2.2256, + "step": 15885 + }, + { + "epoch": 1.8534593396336483, + "grad_norm": 1.0286189317703247, + "learning_rate": 0.00019573222254203648, + "loss": 2.1103, + "step": 15886 + }, + { + "epoch": 1.85357601213394, + "grad_norm": 1.1002198457717896, + "learning_rate": 0.00019571778865032333, + "loss": 2.045, + "step": 15887 + }, + { + "epoch": 1.8536926846342316, + "grad_norm": 1.0996872186660767, + "learning_rate": 0.000195703354300209, + "loss": 1.8885, + "step": 15888 + }, + { + "epoch": 1.8538093571345233, + "grad_norm": 1.4197455644607544, + "learning_rate": 0.00019568891949184322, + "loss": 2.0182, + "step": 15889 + }, + { + "epoch": 1.853926029634815, + "grad_norm": 1.1984977722167969, + "learning_rate": 0.00019567448422537568, + "loss": 2.1535, + "step": 15890 + }, + { + "epoch": 1.8540427021351067, + "grad_norm": 1.072653889656067, + "learning_rate": 0.0001956600485009559, + "loss": 2.0812, + "step": 15891 + }, + { + "epoch": 1.8541593746353984, + "grad_norm": 1.02570378780365, + "learning_rate": 0.00019564561231873357, + "loss": 1.9774, + "step": 15892 + }, + { + "epoch": 1.85427604713569, + "grad_norm": 1.098435878753662, + "learning_rate": 0.0001956311756788584, + "loss": 1.8947, + "step": 15893 + }, + { + "epoch": 1.8543927196359817, + "grad_norm": 1.0659974813461304, + "learning_rate": 0.00019561673858148004, + "loss": 1.9595, + "step": 15894 + }, + { + "epoch": 1.8545093921362734, + "grad_norm": 1.0867100954055786, + "learning_rate": 0.00019560230102674814, + "loss": 2.1376, + "step": 15895 + }, + { + "epoch": 1.854626064636565, + "grad_norm": 1.20731520652771, + "learning_rate": 0.0001955878630148124, + "loss": 2.0225, + "step": 15896 + }, + { + "epoch": 1.8547427371368568, + "grad_norm": 1.0601043701171875, + "learning_rate": 0.0001955734245458224, + "loss": 1.9931, + "step": 15897 + }, + { + "epoch": 1.8548594096371485, + "grad_norm": 1.2789579629898071, + "learning_rate": 0.00019555898561992793, + "loss": 2.0751, + "step": 15898 + }, + { + "epoch": 1.8549760821374401, + "grad_norm": 1.2373970746994019, + "learning_rate": 0.00019554454623727863, + "loss": 2.257, + "step": 15899 + }, + { + "epoch": 1.8550927546377318, + "grad_norm": 1.093785285949707, + "learning_rate": 0.00019553010639802425, + "loss": 1.9564, + "step": 15900 + }, + { + "epoch": 1.8552094271380235, + "grad_norm": 1.1092536449432373, + "learning_rate": 0.00019551566610231437, + "loss": 2.0297, + "step": 15901 + }, + { + "epoch": 1.8553260996383152, + "grad_norm": 1.0853939056396484, + "learning_rate": 0.0001955012253502988, + "loss": 2.0325, + "step": 15902 + }, + { + "epoch": 1.8554427721386069, + "grad_norm": 1.061214566230774, + "learning_rate": 0.0001954867841421272, + "loss": 2.0286, + "step": 15903 + }, + { + "epoch": 1.8555594446388985, + "grad_norm": 1.0508852005004883, + "learning_rate": 0.0001954723424779493, + "loss": 1.932, + "step": 15904 + }, + { + "epoch": 1.8556761171391902, + "grad_norm": 1.1695582866668701, + "learning_rate": 0.00019545790035791478, + "loss": 2.0629, + "step": 15905 + }, + { + "epoch": 1.855792789639482, + "grad_norm": 1.1859904527664185, + "learning_rate": 0.0001954434577821734, + "loss": 2.1672, + "step": 15906 + }, + { + "epoch": 1.8559094621397736, + "grad_norm": 0.98724764585495, + "learning_rate": 0.00019542901475087487, + "loss": 1.9133, + "step": 15907 + }, + { + "epoch": 1.8560261346400653, + "grad_norm": 1.2329394817352295, + "learning_rate": 0.00019541457126416893, + "loss": 2.0915, + "step": 15908 + }, + { + "epoch": 1.856142807140357, + "grad_norm": 1.3633018732070923, + "learning_rate": 0.00019540012732220525, + "loss": 2.0066, + "step": 15909 + }, + { + "epoch": 1.8562594796406486, + "grad_norm": 1.0789426565170288, + "learning_rate": 0.00019538568292513367, + "loss": 2.1823, + "step": 15910 + }, + { + "epoch": 1.8563761521409403, + "grad_norm": 1.0863653421401978, + "learning_rate": 0.00019537123807310383, + "loss": 2.0966, + "step": 15911 + }, + { + "epoch": 1.856492824641232, + "grad_norm": 1.0601428747177124, + "learning_rate": 0.00019535679276626555, + "loss": 2.2269, + "step": 15912 + }, + { + "epoch": 1.8566094971415237, + "grad_norm": 1.1389989852905273, + "learning_rate": 0.00019534234700476858, + "loss": 2.093, + "step": 15913 + }, + { + "epoch": 1.8567261696418154, + "grad_norm": 1.0662628412246704, + "learning_rate": 0.00019532790078876261, + "loss": 1.9613, + "step": 15914 + }, + { + "epoch": 1.856842842142107, + "grad_norm": 1.017547607421875, + "learning_rate": 0.00019531345411839753, + "loss": 1.9287, + "step": 15915 + }, + { + "epoch": 1.8569595146423987, + "grad_norm": 1.078158974647522, + "learning_rate": 0.00019529900699382302, + "loss": 2.1345, + "step": 15916 + }, + { + "epoch": 1.8570761871426904, + "grad_norm": 1.1761268377304077, + "learning_rate": 0.00019528455941518878, + "loss": 2.1399, + "step": 15917 + }, + { + "epoch": 1.857192859642982, + "grad_norm": 1.1159727573394775, + "learning_rate": 0.00019527011138264477, + "loss": 1.9789, + "step": 15918 + }, + { + "epoch": 1.8573095321432738, + "grad_norm": 1.1309311389923096, + "learning_rate": 0.0001952556628963406, + "loss": 1.9698, + "step": 15919 + }, + { + "epoch": 1.8574262046435654, + "grad_norm": 1.0930746793746948, + "learning_rate": 0.00019524121395642614, + "loss": 2.022, + "step": 15920 + }, + { + "epoch": 1.8575428771438571, + "grad_norm": 1.0555241107940674, + "learning_rate": 0.00019522676456305122, + "loss": 2.1402, + "step": 15921 + }, + { + "epoch": 1.8576595496441488, + "grad_norm": 1.0242934226989746, + "learning_rate": 0.00019521231471636552, + "loss": 2.0412, + "step": 15922 + }, + { + "epoch": 1.8577762221444405, + "grad_norm": 1.1015713214874268, + "learning_rate": 0.00019519786441651892, + "loss": 2.2641, + "step": 15923 + }, + { + "epoch": 1.8578928946447322, + "grad_norm": 1.0277708768844604, + "learning_rate": 0.0001951834136636612, + "loss": 2.1521, + "step": 15924 + }, + { + "epoch": 1.8580095671450239, + "grad_norm": 1.0339365005493164, + "learning_rate": 0.00019516896245794216, + "loss": 1.9562, + "step": 15925 + }, + { + "epoch": 1.8581262396453155, + "grad_norm": 1.0851613283157349, + "learning_rate": 0.0001951545107995117, + "loss": 2.0139, + "step": 15926 + }, + { + "epoch": 1.8582429121456072, + "grad_norm": 1.0795091390609741, + "learning_rate": 0.0001951400586885195, + "loss": 2.1064, + "step": 15927 + }, + { + "epoch": 1.858359584645899, + "grad_norm": 0.9901224374771118, + "learning_rate": 0.00019512560612511547, + "loss": 1.9112, + "step": 15928 + }, + { + "epoch": 1.8584762571461906, + "grad_norm": 1.3446177244186401, + "learning_rate": 0.00019511115310944944, + "loss": 2.0658, + "step": 15929 + }, + { + "epoch": 1.8585929296464823, + "grad_norm": 1.0854216814041138, + "learning_rate": 0.00019509669964167122, + "loss": 1.9749, + "step": 15930 + }, + { + "epoch": 1.858709602146774, + "grad_norm": 1.0231411457061768, + "learning_rate": 0.00019508224572193064, + "loss": 1.9528, + "step": 15931 + }, + { + "epoch": 1.8588262746470656, + "grad_norm": 1.2972849607467651, + "learning_rate": 0.00019506779135037753, + "loss": 2.0975, + "step": 15932 + }, + { + "epoch": 1.8589429471473573, + "grad_norm": 1.1522327661514282, + "learning_rate": 0.0001950533365271618, + "loss": 2.0985, + "step": 15933 + }, + { + "epoch": 1.859059619647649, + "grad_norm": 1.0617128610610962, + "learning_rate": 0.00019503888125243328, + "loss": 2.2936, + "step": 15934 + }, + { + "epoch": 1.8591762921479407, + "grad_norm": 1.3026589155197144, + "learning_rate": 0.00019502442552634175, + "loss": 2.1113, + "step": 15935 + }, + { + "epoch": 1.8592929646482323, + "grad_norm": 1.0131841897964478, + "learning_rate": 0.00019500996934903716, + "loss": 1.9075, + "step": 15936 + }, + { + "epoch": 1.859409637148524, + "grad_norm": 1.3227168321609497, + "learning_rate": 0.0001949955127206693, + "loss": 2.1146, + "step": 15937 + }, + { + "epoch": 1.8595263096488157, + "grad_norm": 1.3033647537231445, + "learning_rate": 0.00019498105564138817, + "loss": 2.1437, + "step": 15938 + }, + { + "epoch": 1.8596429821491074, + "grad_norm": 1.2326831817626953, + "learning_rate": 0.0001949665981113435, + "loss": 2.0551, + "step": 15939 + }, + { + "epoch": 1.859759654649399, + "grad_norm": 1.2053759098052979, + "learning_rate": 0.00019495214013068519, + "loss": 2.1182, + "step": 15940 + }, + { + "epoch": 1.8598763271496908, + "grad_norm": 1.0673391819000244, + "learning_rate": 0.00019493768169956323, + "loss": 2.0398, + "step": 15941 + }, + { + "epoch": 1.8599929996499824, + "grad_norm": 1.0544602870941162, + "learning_rate": 0.0001949232228181275, + "loss": 1.8805, + "step": 15942 + }, + { + "epoch": 1.8601096721502741, + "grad_norm": 1.1640443801879883, + "learning_rate": 0.00019490876348652775, + "loss": 2.1307, + "step": 15943 + }, + { + "epoch": 1.8602263446505658, + "grad_norm": 1.3111532926559448, + "learning_rate": 0.00019489430370491395, + "loss": 2.2027, + "step": 15944 + }, + { + "epoch": 1.8603430171508575, + "grad_norm": 1.1338722705841064, + "learning_rate": 0.00019487984347343605, + "loss": 1.9471, + "step": 15945 + }, + { + "epoch": 1.8604596896511492, + "grad_norm": 1.212143898010254, + "learning_rate": 0.00019486538279224393, + "loss": 2.1473, + "step": 15946 + }, + { + "epoch": 1.8605763621514408, + "grad_norm": 1.3051819801330566, + "learning_rate": 0.0001948509216614875, + "loss": 2.2, + "step": 15947 + }, + { + "epoch": 1.8606930346517325, + "grad_norm": 1.276701807975769, + "learning_rate": 0.0001948364600813167, + "loss": 2.117, + "step": 15948 + }, + { + "epoch": 1.8608097071520242, + "grad_norm": 1.2067567110061646, + "learning_rate": 0.0001948219980518814, + "loss": 2.0173, + "step": 15949 + }, + { + "epoch": 1.8609263796523159, + "grad_norm": 1.1501245498657227, + "learning_rate": 0.00019480753557333155, + "loss": 1.9856, + "step": 15950 + }, + { + "epoch": 1.8610430521526076, + "grad_norm": 1.1085357666015625, + "learning_rate": 0.00019479307264581706, + "loss": 1.8651, + "step": 15951 + }, + { + "epoch": 1.8611597246528992, + "grad_norm": 1.353581428527832, + "learning_rate": 0.00019477860926948794, + "loss": 2.1101, + "step": 15952 + }, + { + "epoch": 1.861276397153191, + "grad_norm": 1.1811575889587402, + "learning_rate": 0.00019476414544449403, + "loss": 2.1329, + "step": 15953 + }, + { + "epoch": 1.8613930696534826, + "grad_norm": 1.0788376331329346, + "learning_rate": 0.00019474968117098531, + "loss": 1.9044, + "step": 15954 + }, + { + "epoch": 1.8615097421537743, + "grad_norm": 1.1742703914642334, + "learning_rate": 0.0001947352164491118, + "loss": 1.9544, + "step": 15955 + }, + { + "epoch": 1.861626414654066, + "grad_norm": 1.0326920747756958, + "learning_rate": 0.0001947207512790234, + "loss": 1.989, + "step": 15956 + }, + { + "epoch": 1.8617430871543577, + "grad_norm": 1.0166531801223755, + "learning_rate": 0.00019470628566087004, + "loss": 1.7113, + "step": 15957 + }, + { + "epoch": 1.8618597596546493, + "grad_norm": 1.2475169897079468, + "learning_rate": 0.00019469181959480176, + "loss": 2.0361, + "step": 15958 + }, + { + "epoch": 1.861976432154941, + "grad_norm": 1.330242395401001, + "learning_rate": 0.00019467735308096847, + "loss": 2.1406, + "step": 15959 + }, + { + "epoch": 1.8620931046552327, + "grad_norm": 0.99131840467453, + "learning_rate": 0.00019466288611952011, + "loss": 2.1027, + "step": 15960 + }, + { + "epoch": 1.8622097771555244, + "grad_norm": 1.2188726663589478, + "learning_rate": 0.0001946484187106067, + "loss": 2.0809, + "step": 15961 + }, + { + "epoch": 1.862326449655816, + "grad_norm": 1.172927737236023, + "learning_rate": 0.00019463395085437824, + "loss": 2.151, + "step": 15962 + }, + { + "epoch": 1.8624431221561077, + "grad_norm": 1.1351511478424072, + "learning_rate": 0.0001946194825509847, + "loss": 1.9396, + "step": 15963 + }, + { + "epoch": 1.8625597946563994, + "grad_norm": 1.190091848373413, + "learning_rate": 0.00019460501380057604, + "loss": 2.1439, + "step": 15964 + }, + { + "epoch": 1.862676467156691, + "grad_norm": 1.2427836656570435, + "learning_rate": 0.00019459054460330233, + "loss": 2.1317, + "step": 15965 + }, + { + "epoch": 1.8627931396569828, + "grad_norm": 1.1133266687393188, + "learning_rate": 0.00019457607495931346, + "loss": 1.9559, + "step": 15966 + }, + { + "epoch": 1.8629098121572745, + "grad_norm": 1.1541295051574707, + "learning_rate": 0.0001945616048687596, + "loss": 2.0557, + "step": 15967 + }, + { + "epoch": 1.8630264846575662, + "grad_norm": 1.137763261795044, + "learning_rate": 0.0001945471343317906, + "loss": 2.0492, + "step": 15968 + }, + { + "epoch": 1.8631431571578578, + "grad_norm": 1.0641392469406128, + "learning_rate": 0.00019453266334855653, + "loss": 2.0772, + "step": 15969 + }, + { + "epoch": 1.8632598296581495, + "grad_norm": 1.0564162731170654, + "learning_rate": 0.00019451819191920747, + "loss": 2.1035, + "step": 15970 + }, + { + "epoch": 1.8633765021584412, + "grad_norm": 1.0959898233413696, + "learning_rate": 0.0001945037200438933, + "loss": 2.1269, + "step": 15971 + }, + { + "epoch": 1.8634931746587329, + "grad_norm": 1.0454378128051758, + "learning_rate": 0.0001944892477227642, + "loss": 1.9877, + "step": 15972 + }, + { + "epoch": 1.8636098471590246, + "grad_norm": 1.2980122566223145, + "learning_rate": 0.00019447477495597017, + "loss": 2.0294, + "step": 15973 + }, + { + "epoch": 1.8637265196593162, + "grad_norm": 1.2071729898452759, + "learning_rate": 0.00019446030174366118, + "loss": 2.1815, + "step": 15974 + }, + { + "epoch": 1.863843192159608, + "grad_norm": 1.4341425895690918, + "learning_rate": 0.00019444582808598727, + "loss": 2.0519, + "step": 15975 + }, + { + "epoch": 1.8639598646598996, + "grad_norm": 1.120940089225769, + "learning_rate": 0.0001944313539830986, + "loss": 1.8955, + "step": 15976 + }, + { + "epoch": 1.8640765371601913, + "grad_norm": 1.1281942129135132, + "learning_rate": 0.0001944168794351451, + "loss": 2.2467, + "step": 15977 + }, + { + "epoch": 1.864193209660483, + "grad_norm": 1.2294028997421265, + "learning_rate": 0.00019440240444227694, + "loss": 2.2922, + "step": 15978 + }, + { + "epoch": 1.8643098821607746, + "grad_norm": 1.1935635805130005, + "learning_rate": 0.000194387929004644, + "loss": 1.9854, + "step": 15979 + }, + { + "epoch": 1.8644265546610663, + "grad_norm": 1.2435390949249268, + "learning_rate": 0.00019437345312239653, + "loss": 2.2012, + "step": 15980 + }, + { + "epoch": 1.864543227161358, + "grad_norm": 1.16478431224823, + "learning_rate": 0.0001943589767956845, + "loss": 2.1588, + "step": 15981 + }, + { + "epoch": 1.8646598996616497, + "grad_norm": 1.093734860420227, + "learning_rate": 0.00019434450002465807, + "loss": 1.9948, + "step": 15982 + }, + { + "epoch": 1.8647765721619414, + "grad_norm": 1.0122004747390747, + "learning_rate": 0.00019433002280946725, + "loss": 1.9998, + "step": 15983 + }, + { + "epoch": 1.864893244662233, + "grad_norm": 1.0532959699630737, + "learning_rate": 0.0001943155451502621, + "loss": 2.0048, + "step": 15984 + }, + { + "epoch": 1.8650099171625247, + "grad_norm": 0.9590617418289185, + "learning_rate": 0.0001943010670471928, + "loss": 1.9732, + "step": 15985 + }, + { + "epoch": 1.8651265896628164, + "grad_norm": 1.0144915580749512, + "learning_rate": 0.00019428658850040938, + "loss": 1.9577, + "step": 15986 + }, + { + "epoch": 1.865243262163108, + "grad_norm": 1.041671633720398, + "learning_rate": 0.00019427210951006193, + "loss": 2.2318, + "step": 15987 + }, + { + "epoch": 1.8653599346633998, + "grad_norm": 1.165808916091919, + "learning_rate": 0.00019425763007630055, + "loss": 2.2075, + "step": 15988 + }, + { + "epoch": 1.8654766071636915, + "grad_norm": 1.1208394765853882, + "learning_rate": 0.00019424315019927538, + "loss": 1.976, + "step": 15989 + }, + { + "epoch": 1.8655932796639831, + "grad_norm": 1.2208454608917236, + "learning_rate": 0.00019422866987913652, + "loss": 1.9656, + "step": 15990 + }, + { + "epoch": 1.8657099521642748, + "grad_norm": 1.1612545251846313, + "learning_rate": 0.0001942141891160341, + "loss": 2.0962, + "step": 15991 + }, + { + "epoch": 1.8658266246645665, + "grad_norm": 1.1080563068389893, + "learning_rate": 0.0001941997079101182, + "loss": 2.1931, + "step": 15992 + }, + { + "epoch": 1.8659432971648582, + "grad_norm": 1.1830297708511353, + "learning_rate": 0.00019418522626153898, + "loss": 2.2414, + "step": 15993 + }, + { + "epoch": 1.8660599696651499, + "grad_norm": 1.085391640663147, + "learning_rate": 0.00019417074417044656, + "loss": 2.0991, + "step": 15994 + }, + { + "epoch": 1.8661766421654415, + "grad_norm": 1.1682102680206299, + "learning_rate": 0.00019415626163699106, + "loss": 2.2834, + "step": 15995 + }, + { + "epoch": 1.8662933146657332, + "grad_norm": 1.1814510822296143, + "learning_rate": 0.00019414177866132265, + "loss": 2.0711, + "step": 15996 + }, + { + "epoch": 1.866409987166025, + "grad_norm": 1.2294658422470093, + "learning_rate": 0.00019412729524359145, + "loss": 2.1774, + "step": 15997 + }, + { + "epoch": 1.8665266596663166, + "grad_norm": 1.105846881866455, + "learning_rate": 0.0001941128113839476, + "loss": 2.0002, + "step": 15998 + }, + { + "epoch": 1.8666433321666083, + "grad_norm": 1.1023831367492676, + "learning_rate": 0.00019409832708254125, + "loss": 1.986, + "step": 15999 + }, + { + "epoch": 1.8667600046669, + "grad_norm": 1.043100357055664, + "learning_rate": 0.0001940838423395226, + "loss": 2.1276, + "step": 16000 + }, + { + "epoch": 1.8668766771671916, + "grad_norm": 1.204458236694336, + "learning_rate": 0.0001940693571550418, + "loss": 1.9813, + "step": 16001 + }, + { + "epoch": 1.8669933496674833, + "grad_norm": 1.0777530670166016, + "learning_rate": 0.00019405487152924898, + "loss": 1.867, + "step": 16002 + }, + { + "epoch": 1.867110022167775, + "grad_norm": 1.2303122282028198, + "learning_rate": 0.00019404038546229439, + "loss": 2.1127, + "step": 16003 + }, + { + "epoch": 1.8672266946680667, + "grad_norm": 1.0573731660842896, + "learning_rate": 0.00019402589895432804, + "loss": 2.1376, + "step": 16004 + }, + { + "epoch": 1.8673433671683584, + "grad_norm": 1.124709963798523, + "learning_rate": 0.0001940114120055003, + "loss": 2.0307, + "step": 16005 + }, + { + "epoch": 1.86746003966865, + "grad_norm": 1.2852801084518433, + "learning_rate": 0.00019399692461596123, + "loss": 2.0192, + "step": 16006 + }, + { + "epoch": 1.8675767121689417, + "grad_norm": 1.2415802478790283, + "learning_rate": 0.00019398243678586107, + "loss": 2.1615, + "step": 16007 + }, + { + "epoch": 1.8676933846692334, + "grad_norm": 1.01693594455719, + "learning_rate": 0.00019396794851535003, + "loss": 1.8773, + "step": 16008 + }, + { + "epoch": 1.867810057169525, + "grad_norm": 1.2281912565231323, + "learning_rate": 0.00019395345980457827, + "loss": 2.0975, + "step": 16009 + }, + { + "epoch": 1.8679267296698168, + "grad_norm": 1.1304082870483398, + "learning_rate": 0.00019393897065369598, + "loss": 1.9518, + "step": 16010 + }, + { + "epoch": 1.8680434021701084, + "grad_norm": 1.0759289264678955, + "learning_rate": 0.00019392448106285345, + "loss": 2.0055, + "step": 16011 + }, + { + "epoch": 1.8681600746704001, + "grad_norm": 1.040110468864441, + "learning_rate": 0.0001939099910322008, + "loss": 1.9893, + "step": 16012 + }, + { + "epoch": 1.8682767471706918, + "grad_norm": 1.136073350906372, + "learning_rate": 0.00019389550056188833, + "loss": 2.406, + "step": 16013 + }, + { + "epoch": 1.8683934196709835, + "grad_norm": 1.4368035793304443, + "learning_rate": 0.00019388100965206614, + "loss": 2.0458, + "step": 16014 + }, + { + "epoch": 1.8685100921712752, + "grad_norm": 1.1065064668655396, + "learning_rate": 0.00019386651830288456, + "loss": 2.1633, + "step": 16015 + }, + { + "epoch": 1.8686267646715669, + "grad_norm": 1.303727388381958, + "learning_rate": 0.0001938520265144938, + "loss": 2.0898, + "step": 16016 + }, + { + "epoch": 1.8687434371718585, + "grad_norm": 1.0823720693588257, + "learning_rate": 0.00019383753428704406, + "loss": 2.0824, + "step": 16017 + }, + { + "epoch": 1.8688601096721502, + "grad_norm": 1.1078481674194336, + "learning_rate": 0.0001938230416206856, + "loss": 2.1535, + "step": 16018 + }, + { + "epoch": 1.868976782172442, + "grad_norm": 1.240092158317566, + "learning_rate": 0.0001938085485155687, + "loss": 2.1002, + "step": 16019 + }, + { + "epoch": 1.8690934546727336, + "grad_norm": 1.118402361869812, + "learning_rate": 0.0001937940549718436, + "loss": 2.1077, + "step": 16020 + }, + { + "epoch": 1.8692101271730253, + "grad_norm": 1.1607085466384888, + "learning_rate": 0.00019377956098966048, + "loss": 2.2164, + "step": 16021 + }, + { + "epoch": 1.869326799673317, + "grad_norm": 1.1567683219909668, + "learning_rate": 0.00019376506656916962, + "loss": 2.0092, + "step": 16022 + }, + { + "epoch": 1.8694434721736086, + "grad_norm": 1.0752061605453491, + "learning_rate": 0.0001937505717105214, + "loss": 1.9748, + "step": 16023 + }, + { + "epoch": 1.8695601446739003, + "grad_norm": 1.1438993215560913, + "learning_rate": 0.00019373607641386595, + "loss": 2.1238, + "step": 16024 + }, + { + "epoch": 1.869676817174192, + "grad_norm": 1.20003342628479, + "learning_rate": 0.0001937215806793536, + "loss": 2.0819, + "step": 16025 + }, + { + "epoch": 1.8697934896744837, + "grad_norm": 1.1577636003494263, + "learning_rate": 0.0001937070845071346, + "loss": 2.0585, + "step": 16026 + }, + { + "epoch": 1.8699101621747753, + "grad_norm": 1.0335403680801392, + "learning_rate": 0.00019369258789735924, + "loss": 2.1189, + "step": 16027 + }, + { + "epoch": 1.870026834675067, + "grad_norm": 1.0152997970581055, + "learning_rate": 0.0001936780908501778, + "loss": 2.0457, + "step": 16028 + }, + { + "epoch": 1.8701435071753587, + "grad_norm": 1.0654568672180176, + "learning_rate": 0.00019366359336574064, + "loss": 2.027, + "step": 16029 + }, + { + "epoch": 1.8702601796756504, + "grad_norm": 1.034930944442749, + "learning_rate": 0.0001936490954441979, + "loss": 2.0005, + "step": 16030 + }, + { + "epoch": 1.870376852175942, + "grad_norm": 1.157734751701355, + "learning_rate": 0.0001936345970857, + "loss": 2.2377, + "step": 16031 + }, + { + "epoch": 1.8704935246762338, + "grad_norm": 1.1445778608322144, + "learning_rate": 0.00019362009829039723, + "loss": 2.0541, + "step": 16032 + }, + { + "epoch": 1.8706101971765254, + "grad_norm": 1.1623753309249878, + "learning_rate": 0.00019360559905843986, + "loss": 1.9768, + "step": 16033 + }, + { + "epoch": 1.8707268696768171, + "grad_norm": 1.0261881351470947, + "learning_rate": 0.00019359109938997825, + "loss": 2.1843, + "step": 16034 + }, + { + "epoch": 1.8708435421771088, + "grad_norm": 0.9979092478752136, + "learning_rate": 0.0001935765992851627, + "loss": 2.1733, + "step": 16035 + }, + { + "epoch": 1.8709602146774005, + "grad_norm": 1.1325864791870117, + "learning_rate": 0.00019356209874414346, + "loss": 2.0712, + "step": 16036 + }, + { + "epoch": 1.8710768871776922, + "grad_norm": 1.0343313217163086, + "learning_rate": 0.000193547597767071, + "loss": 2.1853, + "step": 16037 + }, + { + "epoch": 1.8711935596779838, + "grad_norm": 1.1476625204086304, + "learning_rate": 0.00019353309635409548, + "loss": 2.1457, + "step": 16038 + }, + { + "epoch": 1.8713102321782755, + "grad_norm": 1.0818713903427124, + "learning_rate": 0.00019351859450536738, + "loss": 1.9999, + "step": 16039 + }, + { + "epoch": 1.8714269046785672, + "grad_norm": 1.143976092338562, + "learning_rate": 0.00019350409222103696, + "loss": 2.152, + "step": 16040 + }, + { + "epoch": 1.8715435771788589, + "grad_norm": 1.2234395742416382, + "learning_rate": 0.0001934895895012546, + "loss": 2.0485, + "step": 16041 + }, + { + "epoch": 1.8716602496791506, + "grad_norm": 1.1398615837097168, + "learning_rate": 0.00019347508634617058, + "loss": 2.2058, + "step": 16042 + }, + { + "epoch": 1.8717769221794422, + "grad_norm": 1.2112997770309448, + "learning_rate": 0.00019346058275593534, + "loss": 2.0051, + "step": 16043 + }, + { + "epoch": 1.871893594679734, + "grad_norm": 1.2306398153305054, + "learning_rate": 0.00019344607873069918, + "loss": 2.1124, + "step": 16044 + }, + { + "epoch": 1.8720102671800256, + "grad_norm": 1.0543663501739502, + "learning_rate": 0.0001934315742706125, + "loss": 2.0223, + "step": 16045 + }, + { + "epoch": 1.8721269396803173, + "grad_norm": 1.1679801940917969, + "learning_rate": 0.00019341706937582562, + "loss": 2.1235, + "step": 16046 + }, + { + "epoch": 1.872243612180609, + "grad_norm": 1.0271974802017212, + "learning_rate": 0.000193402564046489, + "loss": 1.925, + "step": 16047 + }, + { + "epoch": 1.8723602846809007, + "grad_norm": 1.051498532295227, + "learning_rate": 0.00019338805828275288, + "loss": 2.1092, + "step": 16048 + }, + { + "epoch": 1.8724769571811923, + "grad_norm": 1.2493889331817627, + "learning_rate": 0.0001933735520847678, + "loss": 2.1635, + "step": 16049 + }, + { + "epoch": 1.872593629681484, + "grad_norm": 1.165692925453186, + "learning_rate": 0.00019335904545268398, + "loss": 2.0613, + "step": 16050 + }, + { + "epoch": 1.8727103021817757, + "grad_norm": 1.1209179162979126, + "learning_rate": 0.00019334453838665194, + "loss": 2.044, + "step": 16051 + }, + { + "epoch": 1.8728269746820674, + "grad_norm": 1.217624545097351, + "learning_rate": 0.000193330030886822, + "loss": 2.0335, + "step": 16052 + }, + { + "epoch": 1.872943647182359, + "grad_norm": 1.056164264678955, + "learning_rate": 0.00019331552295334458, + "loss": 2.0427, + "step": 16053 + }, + { + "epoch": 1.8730603196826507, + "grad_norm": 1.0872762203216553, + "learning_rate": 0.0001933010145863701, + "loss": 2.0594, + "step": 16054 + }, + { + "epoch": 1.8731769921829424, + "grad_norm": 1.1610748767852783, + "learning_rate": 0.00019328650578604893, + "loss": 2.1252, + "step": 16055 + }, + { + "epoch": 1.873293664683234, + "grad_norm": 1.0756285190582275, + "learning_rate": 0.00019327199655253148, + "loss": 2.074, + "step": 16056 + }, + { + "epoch": 1.8734103371835258, + "grad_norm": 1.1338387727737427, + "learning_rate": 0.00019325748688596824, + "loss": 1.9273, + "step": 16057 + }, + { + "epoch": 1.8735270096838175, + "grad_norm": 1.1237834692001343, + "learning_rate": 0.00019324297678650944, + "loss": 2.113, + "step": 16058 + }, + { + "epoch": 1.8736436821841091, + "grad_norm": 1.1819170713424683, + "learning_rate": 0.00019322846625430577, + "loss": 2.0307, + "step": 16059 + }, + { + "epoch": 1.8737603546844008, + "grad_norm": 1.0262120962142944, + "learning_rate": 0.00019321395528950743, + "loss": 1.9826, + "step": 16060 + }, + { + "epoch": 1.8738770271846925, + "grad_norm": 1.0033738613128662, + "learning_rate": 0.00019319944389226503, + "loss": 1.9749, + "step": 16061 + }, + { + "epoch": 1.8739936996849842, + "grad_norm": 1.1670818328857422, + "learning_rate": 0.00019318493206272894, + "loss": 2.0476, + "step": 16062 + }, + { + "epoch": 1.8741103721852759, + "grad_norm": 0.962527871131897, + "learning_rate": 0.00019317041980104958, + "loss": 2.0423, + "step": 16063 + }, + { + "epoch": 1.8742270446855676, + "grad_norm": 1.0213375091552734, + "learning_rate": 0.00019315590710737742, + "loss": 1.954, + "step": 16064 + }, + { + "epoch": 1.8743437171858592, + "grad_norm": 1.0006964206695557, + "learning_rate": 0.00019314139398186284, + "loss": 1.9382, + "step": 16065 + }, + { + "epoch": 1.874460389686151, + "grad_norm": 1.001328706741333, + "learning_rate": 0.00019312688042465642, + "loss": 1.9423, + "step": 16066 + }, + { + "epoch": 1.8745770621864426, + "grad_norm": 1.0638670921325684, + "learning_rate": 0.00019311236643590853, + "loss": 2.234, + "step": 16067 + }, + { + "epoch": 1.8746937346867343, + "grad_norm": 1.11000394821167, + "learning_rate": 0.00019309785201576966, + "loss": 2.1474, + "step": 16068 + }, + { + "epoch": 1.874810407187026, + "grad_norm": 1.3613284826278687, + "learning_rate": 0.00019308333716439022, + "loss": 1.9212, + "step": 16069 + }, + { + "epoch": 1.8749270796873176, + "grad_norm": 0.9841364026069641, + "learning_rate": 0.00019306882188192085, + "loss": 2.0464, + "step": 16070 + }, + { + "epoch": 1.8750437521876093, + "grad_norm": 1.0017809867858887, + "learning_rate": 0.00019305430616851186, + "loss": 2.0547, + "step": 16071 + }, + { + "epoch": 1.875160424687901, + "grad_norm": 1.1351956129074097, + "learning_rate": 0.00019303979002431384, + "loss": 2.1101, + "step": 16072 + }, + { + "epoch": 1.8752770971881927, + "grad_norm": 1.0332999229431152, + "learning_rate": 0.0001930252734494772, + "loss": 2.1368, + "step": 16073 + }, + { + "epoch": 1.8753937696884844, + "grad_norm": 1.1573020219802856, + "learning_rate": 0.00019301075644415248, + "loss": 2.1185, + "step": 16074 + }, + { + "epoch": 1.875510442188776, + "grad_norm": 1.3040657043457031, + "learning_rate": 0.00019299623900849016, + "loss": 2.0002, + "step": 16075 + }, + { + "epoch": 1.8756271146890677, + "grad_norm": 1.109914779663086, + "learning_rate": 0.00019298172114264072, + "loss": 1.8965, + "step": 16076 + }, + { + "epoch": 1.8757437871893594, + "grad_norm": 1.1020299196243286, + "learning_rate": 0.00019296720284675472, + "loss": 2.1474, + "step": 16077 + }, + { + "epoch": 1.875860459689651, + "grad_norm": 1.3647888898849487, + "learning_rate": 0.00019295268412098266, + "loss": 2.0792, + "step": 16078 + }, + { + "epoch": 1.8759771321899428, + "grad_norm": 1.1457513570785522, + "learning_rate": 0.00019293816496547498, + "loss": 2.0414, + "step": 16079 + }, + { + "epoch": 1.8760938046902345, + "grad_norm": 1.0989128351211548, + "learning_rate": 0.0001929236453803823, + "loss": 2.0313, + "step": 16080 + }, + { + "epoch": 1.8762104771905261, + "grad_norm": 1.0303272008895874, + "learning_rate": 0.00019290912536585507, + "loss": 2.0084, + "step": 16081 + }, + { + "epoch": 1.8763271496908178, + "grad_norm": 1.1336597204208374, + "learning_rate": 0.00019289460492204384, + "loss": 2.0713, + "step": 16082 + }, + { + "epoch": 1.8764438221911095, + "grad_norm": 1.1054043769836426, + "learning_rate": 0.00019288008404909912, + "loss": 1.9748, + "step": 16083 + }, + { + "epoch": 1.8765604946914012, + "grad_norm": 1.21847403049469, + "learning_rate": 0.0001928655627471715, + "loss": 2.3285, + "step": 16084 + }, + { + "epoch": 1.8766771671916929, + "grad_norm": 1.26167893409729, + "learning_rate": 0.00019285104101641147, + "loss": 2.1367, + "step": 16085 + }, + { + "epoch": 1.8767938396919845, + "grad_norm": 1.4395110607147217, + "learning_rate": 0.0001928365188569696, + "loss": 2.0934, + "step": 16086 + }, + { + "epoch": 1.8769105121922762, + "grad_norm": 1.145086407661438, + "learning_rate": 0.00019282199626899645, + "loss": 2.157, + "step": 16087 + }, + { + "epoch": 1.877027184692568, + "grad_norm": 1.1256470680236816, + "learning_rate": 0.00019280747325264256, + "loss": 2.1202, + "step": 16088 + }, + { + "epoch": 1.8771438571928596, + "grad_norm": 0.9735117554664612, + "learning_rate": 0.0001927929498080585, + "loss": 1.9945, + "step": 16089 + }, + { + "epoch": 1.8772605296931513, + "grad_norm": 1.1867976188659668, + "learning_rate": 0.00019277842593539479, + "loss": 2.1021, + "step": 16090 + }, + { + "epoch": 1.877377202193443, + "grad_norm": 1.049802541732788, + "learning_rate": 0.00019276390163480204, + "loss": 2.0357, + "step": 16091 + }, + { + "epoch": 1.8774938746937346, + "grad_norm": 1.0419694185256958, + "learning_rate": 0.00019274937690643077, + "loss": 2.0208, + "step": 16092 + }, + { + "epoch": 1.8776105471940263, + "grad_norm": 1.1790646314620972, + "learning_rate": 0.00019273485175043165, + "loss": 2.1165, + "step": 16093 + }, + { + "epoch": 1.877727219694318, + "grad_norm": 1.3078974485397339, + "learning_rate": 0.0001927203261669552, + "loss": 2.1136, + "step": 16094 + }, + { + "epoch": 1.8778438921946097, + "grad_norm": 1.0883302688598633, + "learning_rate": 0.00019270580015615202, + "loss": 2.0529, + "step": 16095 + }, + { + "epoch": 1.8779605646949014, + "grad_norm": 1.3289387226104736, + "learning_rate": 0.00019269127371817265, + "loss": 2.1851, + "step": 16096 + }, + { + "epoch": 1.878077237195193, + "grad_norm": 1.132307529449463, + "learning_rate": 0.00019267674685316774, + "loss": 2.0986, + "step": 16097 + }, + { + "epoch": 1.8781939096954847, + "grad_norm": 1.253553032875061, + "learning_rate": 0.00019266221956128787, + "loss": 2.2852, + "step": 16098 + }, + { + "epoch": 1.8783105821957764, + "grad_norm": 1.1662694215774536, + "learning_rate": 0.00019264769184268365, + "loss": 1.9098, + "step": 16099 + }, + { + "epoch": 1.878427254696068, + "grad_norm": 1.037658929824829, + "learning_rate": 0.00019263316369750572, + "loss": 2.0751, + "step": 16100 + }, + { + "epoch": 1.8785439271963598, + "grad_norm": 1.2533732652664185, + "learning_rate": 0.00019261863512590462, + "loss": 2.1586, + "step": 16101 + }, + { + "epoch": 1.8786605996966514, + "grad_norm": 1.2234363555908203, + "learning_rate": 0.000192604106128031, + "loss": 1.969, + "step": 16102 + }, + { + "epoch": 1.8787772721969431, + "grad_norm": 1.2029235363006592, + "learning_rate": 0.0001925895767040355, + "loss": 2.0151, + "step": 16103 + }, + { + "epoch": 1.8788939446972348, + "grad_norm": 1.2303956747055054, + "learning_rate": 0.00019257504685406873, + "loss": 2.2916, + "step": 16104 + }, + { + "epoch": 1.8790106171975265, + "grad_norm": 1.0953564643859863, + "learning_rate": 0.0001925605165782813, + "loss": 2.0477, + "step": 16105 + }, + { + "epoch": 1.8791272896978182, + "grad_norm": 1.1500608921051025, + "learning_rate": 0.0001925459858768239, + "loss": 2.1687, + "step": 16106 + }, + { + "epoch": 1.8792439621981099, + "grad_norm": 1.0517407655715942, + "learning_rate": 0.00019253145474984712, + "loss": 2.0019, + "step": 16107 + }, + { + "epoch": 1.8793606346984015, + "grad_norm": 1.1879631280899048, + "learning_rate": 0.00019251692319750157, + "loss": 1.9269, + "step": 16108 + }, + { + "epoch": 1.8794773071986932, + "grad_norm": 1.3254033327102661, + "learning_rate": 0.00019250239121993798, + "loss": 2.0519, + "step": 16109 + }, + { + "epoch": 1.879593979698985, + "grad_norm": 1.2046163082122803, + "learning_rate": 0.0001924878588173069, + "loss": 2.1448, + "step": 16110 + }, + { + "epoch": 1.8797106521992766, + "grad_norm": 1.303213119506836, + "learning_rate": 0.0001924733259897591, + "loss": 2.2866, + "step": 16111 + }, + { + "epoch": 1.8798273246995683, + "grad_norm": 1.1183812618255615, + "learning_rate": 0.00019245879273744517, + "loss": 2.0668, + "step": 16112 + }, + { + "epoch": 1.87994399719986, + "grad_norm": 1.0724903345108032, + "learning_rate": 0.0001924442590605158, + "loss": 2.1142, + "step": 16113 + }, + { + "epoch": 1.8800606697001516, + "grad_norm": 1.0337989330291748, + "learning_rate": 0.0001924297249591217, + "loss": 1.9068, + "step": 16114 + }, + { + "epoch": 1.8801773422004433, + "grad_norm": 1.0747195482254028, + "learning_rate": 0.0001924151904334134, + "loss": 1.9636, + "step": 16115 + }, + { + "epoch": 1.880294014700735, + "grad_norm": 1.0437978506088257, + "learning_rate": 0.00019240065548354175, + "loss": 2.0924, + "step": 16116 + }, + { + "epoch": 1.8804106872010267, + "grad_norm": 1.0698881149291992, + "learning_rate": 0.0001923861201096573, + "loss": 2.1687, + "step": 16117 + }, + { + "epoch": 1.8805273597013183, + "grad_norm": 1.0275969505310059, + "learning_rate": 0.00019237158431191083, + "loss": 1.9042, + "step": 16118 + }, + { + "epoch": 1.88064403220161, + "grad_norm": 0.9721900224685669, + "learning_rate": 0.00019235704809045297, + "loss": 1.9951, + "step": 16119 + }, + { + "epoch": 1.8807607047019017, + "grad_norm": 1.2647831439971924, + "learning_rate": 0.00019234251144543442, + "loss": 2.0505, + "step": 16120 + }, + { + "epoch": 1.8808773772021934, + "grad_norm": 1.3673906326293945, + "learning_rate": 0.00019232797437700592, + "loss": 2.1409, + "step": 16121 + }, + { + "epoch": 1.880994049702485, + "grad_norm": 1.1855140924453735, + "learning_rate": 0.00019231343688531815, + "loss": 1.9459, + "step": 16122 + }, + { + "epoch": 1.8811107222027768, + "grad_norm": 1.3264182806015015, + "learning_rate": 0.00019229889897052184, + "loss": 2.0634, + "step": 16123 + }, + { + "epoch": 1.8812273947030684, + "grad_norm": 1.0960273742675781, + "learning_rate": 0.00019228436063276766, + "loss": 2.0332, + "step": 16124 + }, + { + "epoch": 1.8813440672033601, + "grad_norm": 1.105263113975525, + "learning_rate": 0.00019226982187220637, + "loss": 1.9742, + "step": 16125 + }, + { + "epoch": 1.8814607397036518, + "grad_norm": 1.015877366065979, + "learning_rate": 0.0001922552826889886, + "loss": 2.0627, + "step": 16126 + }, + { + "epoch": 1.8815774122039435, + "grad_norm": 1.0373783111572266, + "learning_rate": 0.0001922407430832652, + "loss": 2.1002, + "step": 16127 + }, + { + "epoch": 1.8816940847042352, + "grad_norm": 1.2135534286499023, + "learning_rate": 0.0001922262030551869, + "loss": 2.1204, + "step": 16128 + }, + { + "epoch": 1.8818107572045268, + "grad_norm": 1.151658535003662, + "learning_rate": 0.00019221166260490431, + "loss": 2.0326, + "step": 16129 + }, + { + "epoch": 1.8819274297048185, + "grad_norm": 1.1915502548217773, + "learning_rate": 0.00019219712173256827, + "loss": 2.1738, + "step": 16130 + }, + { + "epoch": 1.8820441022051102, + "grad_norm": 1.1663941144943237, + "learning_rate": 0.0001921825804383295, + "loss": 2.1349, + "step": 16131 + }, + { + "epoch": 1.8821607747054019, + "grad_norm": 1.4320040941238403, + "learning_rate": 0.00019216803872233875, + "loss": 2.1878, + "step": 16132 + }, + { + "epoch": 1.8822774472056936, + "grad_norm": 1.2714999914169312, + "learning_rate": 0.00019215349658474677, + "loss": 2.0568, + "step": 16133 + }, + { + "epoch": 1.8823941197059852, + "grad_norm": 1.14512300491333, + "learning_rate": 0.00019213895402570434, + "loss": 2.3072, + "step": 16134 + }, + { + "epoch": 1.882510792206277, + "grad_norm": 1.0664172172546387, + "learning_rate": 0.00019212441104536211, + "loss": 1.9101, + "step": 16135 + }, + { + "epoch": 1.8826274647065686, + "grad_norm": 1.4453003406524658, + "learning_rate": 0.000192109867643871, + "loss": 2.1851, + "step": 16136 + }, + { + "epoch": 1.8827441372068603, + "grad_norm": 0.9838145971298218, + "learning_rate": 0.00019209532382138167, + "loss": 2.0713, + "step": 16137 + }, + { + "epoch": 1.882860809707152, + "grad_norm": 1.1587401628494263, + "learning_rate": 0.00019208077957804493, + "loss": 2.0654, + "step": 16138 + }, + { + "epoch": 1.8829774822074437, + "grad_norm": 1.1198334693908691, + "learning_rate": 0.00019206623491401162, + "loss": 2.2162, + "step": 16139 + }, + { + "epoch": 1.8830941547077353, + "grad_norm": 1.2701375484466553, + "learning_rate": 0.00019205168982943248, + "loss": 1.9927, + "step": 16140 + }, + { + "epoch": 1.883210827208027, + "grad_norm": 1.1638916730880737, + "learning_rate": 0.00019203714432445822, + "loss": 2.0513, + "step": 16141 + }, + { + "epoch": 1.8833274997083187, + "grad_norm": 1.0360139608383179, + "learning_rate": 0.0001920225983992397, + "loss": 2.0299, + "step": 16142 + }, + { + "epoch": 1.8834441722086104, + "grad_norm": 1.0800997018814087, + "learning_rate": 0.0001920080520539277, + "loss": 2.0052, + "step": 16143 + }, + { + "epoch": 1.883560844708902, + "grad_norm": 1.0552161931991577, + "learning_rate": 0.00019199350528867307, + "loss": 1.9613, + "step": 16144 + }, + { + "epoch": 1.8836775172091937, + "grad_norm": 1.3364887237548828, + "learning_rate": 0.00019197895810362658, + "loss": 2.1384, + "step": 16145 + }, + { + "epoch": 1.8837941897094854, + "grad_norm": 1.040906310081482, + "learning_rate": 0.00019196441049893902, + "loss": 2.0087, + "step": 16146 + }, + { + "epoch": 1.883910862209777, + "grad_norm": 1.1517380475997925, + "learning_rate": 0.00019194986247476122, + "loss": 2.086, + "step": 16147 + }, + { + "epoch": 1.8840275347100688, + "grad_norm": 1.1944904327392578, + "learning_rate": 0.000191935314031244, + "loss": 2.1285, + "step": 16148 + }, + { + "epoch": 1.8841442072103605, + "grad_norm": 1.200201392173767, + "learning_rate": 0.0001919207651685382, + "loss": 2.0, + "step": 16149 + }, + { + "epoch": 1.8842608797106521, + "grad_norm": 1.3034851551055908, + "learning_rate": 0.00019190621588679457, + "loss": 2.1187, + "step": 16150 + }, + { + "epoch": 1.8843775522109438, + "grad_norm": 1.2065608501434326, + "learning_rate": 0.00019189166618616404, + "loss": 2.102, + "step": 16151 + }, + { + "epoch": 1.8844942247112355, + "grad_norm": 1.144492506980896, + "learning_rate": 0.0001918771160667974, + "loss": 2.0308, + "step": 16152 + }, + { + "epoch": 1.8846108972115272, + "grad_norm": 1.1345796585083008, + "learning_rate": 0.00019186256552884546, + "loss": 1.9733, + "step": 16153 + }, + { + "epoch": 1.8847275697118189, + "grad_norm": 1.257743239402771, + "learning_rate": 0.00019184801457245912, + "loss": 2.0162, + "step": 16154 + }, + { + "epoch": 1.8848442422121106, + "grad_norm": 1.1798123121261597, + "learning_rate": 0.0001918334631977892, + "loss": 1.9466, + "step": 16155 + }, + { + "epoch": 1.8849609147124022, + "grad_norm": 1.22186279296875, + "learning_rate": 0.00019181891140498654, + "loss": 2.1539, + "step": 16156 + }, + { + "epoch": 1.885077587212694, + "grad_norm": 1.064946174621582, + "learning_rate": 0.0001918043591942021, + "loss": 2.1451, + "step": 16157 + }, + { + "epoch": 1.8851942597129856, + "grad_norm": 1.0110154151916504, + "learning_rate": 0.00019178980656558658, + "loss": 1.9187, + "step": 16158 + }, + { + "epoch": 1.8853109322132773, + "grad_norm": 1.0024526119232178, + "learning_rate": 0.00019177525351929094, + "loss": 1.8278, + "step": 16159 + }, + { + "epoch": 1.885427604713569, + "grad_norm": 1.1442841291427612, + "learning_rate": 0.000191760700055466, + "loss": 2.039, + "step": 16160 + }, + { + "epoch": 1.8855442772138606, + "grad_norm": 1.1828988790512085, + "learning_rate": 0.00019174614617426268, + "loss": 2.1117, + "step": 16161 + }, + { + "epoch": 1.8856609497141523, + "grad_norm": 1.0050827264785767, + "learning_rate": 0.00019173159187583185, + "loss": 2.1161, + "step": 16162 + }, + { + "epoch": 1.885777622214444, + "grad_norm": 1.3033071756362915, + "learning_rate": 0.00019171703716032435, + "loss": 1.9619, + "step": 16163 + }, + { + "epoch": 1.8858942947147357, + "grad_norm": 1.0331469774246216, + "learning_rate": 0.00019170248202789113, + "loss": 2.0597, + "step": 16164 + }, + { + "epoch": 1.8860109672150274, + "grad_norm": 1.3478049039840698, + "learning_rate": 0.00019168792647868306, + "loss": 2.1727, + "step": 16165 + }, + { + "epoch": 1.886127639715319, + "grad_norm": 1.1009637117385864, + "learning_rate": 0.00019167337051285104, + "loss": 2.0628, + "step": 16166 + }, + { + "epoch": 1.8862443122156107, + "grad_norm": 1.0748178958892822, + "learning_rate": 0.00019165881413054598, + "loss": 1.9714, + "step": 16167 + }, + { + "epoch": 1.8863609847159024, + "grad_norm": 1.1445834636688232, + "learning_rate": 0.0001916442573319187, + "loss": 2.1264, + "step": 16168 + }, + { + "epoch": 1.886477657216194, + "grad_norm": 1.1333496570587158, + "learning_rate": 0.00019162970011712022, + "loss": 2.0072, + "step": 16169 + }, + { + "epoch": 1.8865943297164858, + "grad_norm": 1.198533296585083, + "learning_rate": 0.00019161514248630136, + "loss": 2.045, + "step": 16170 + }, + { + "epoch": 1.8867110022167775, + "grad_norm": 1.1218773126602173, + "learning_rate": 0.00019160058443961313, + "loss": 2.0951, + "step": 16171 + }, + { + "epoch": 1.8868276747170691, + "grad_norm": 1.119018793106079, + "learning_rate": 0.00019158602597720638, + "loss": 2.1407, + "step": 16172 + }, + { + "epoch": 1.8869443472173608, + "grad_norm": 1.1008199453353882, + "learning_rate": 0.0001915714670992321, + "loss": 2.0765, + "step": 16173 + }, + { + "epoch": 1.8870610197176525, + "grad_norm": 1.0551750659942627, + "learning_rate": 0.00019155690780584114, + "loss": 2.0401, + "step": 16174 + }, + { + "epoch": 1.8871776922179442, + "grad_norm": 1.207930088043213, + "learning_rate": 0.0001915423480971845, + "loss": 1.9911, + "step": 16175 + }, + { + "epoch": 1.8872943647182359, + "grad_norm": 1.2232261896133423, + "learning_rate": 0.0001915277879734131, + "loss": 1.9724, + "step": 16176 + }, + { + "epoch": 1.8874110372185275, + "grad_norm": 1.1253403425216675, + "learning_rate": 0.00019151322743467784, + "loss": 2.3058, + "step": 16177 + }, + { + "epoch": 1.8875277097188192, + "grad_norm": 0.9788912534713745, + "learning_rate": 0.00019149866648112974, + "loss": 2.0318, + "step": 16178 + }, + { + "epoch": 1.887644382219111, + "grad_norm": 1.265804409980774, + "learning_rate": 0.0001914841051129197, + "loss": 2.1315, + "step": 16179 + }, + { + "epoch": 1.8877610547194026, + "grad_norm": 0.975077748298645, + "learning_rate": 0.0001914695433301987, + "loss": 1.9556, + "step": 16180 + }, + { + "epoch": 1.8878777272196943, + "grad_norm": 1.2412126064300537, + "learning_rate": 0.0001914549811331177, + "loss": 2.213, + "step": 16181 + }, + { + "epoch": 1.887994399719986, + "grad_norm": 1.1151847839355469, + "learning_rate": 0.00019144041852182768, + "loss": 1.9417, + "step": 16182 + }, + { + "epoch": 1.8881110722202776, + "grad_norm": 1.1078824996948242, + "learning_rate": 0.00019142585549647957, + "loss": 1.9647, + "step": 16183 + }, + { + "epoch": 1.8882277447205693, + "grad_norm": 1.2552193403244019, + "learning_rate": 0.00019141129205722435, + "loss": 1.9927, + "step": 16184 + }, + { + "epoch": 1.888344417220861, + "grad_norm": 1.410455584526062, + "learning_rate": 0.00019139672820421303, + "loss": 2.0951, + "step": 16185 + }, + { + "epoch": 1.8884610897211527, + "grad_norm": 1.3416895866394043, + "learning_rate": 0.00019138216393759658, + "loss": 2.2547, + "step": 16186 + }, + { + "epoch": 1.8885777622214444, + "grad_norm": 0.9821380972862244, + "learning_rate": 0.00019136759925752594, + "loss": 1.8561, + "step": 16187 + }, + { + "epoch": 1.888694434721736, + "grad_norm": 1.0456089973449707, + "learning_rate": 0.00019135303416415215, + "loss": 1.9617, + "step": 16188 + }, + { + "epoch": 1.8888111072220277, + "grad_norm": 1.006136178970337, + "learning_rate": 0.00019133846865762618, + "loss": 1.9542, + "step": 16189 + }, + { + "epoch": 1.8889277797223194, + "grad_norm": 1.172096610069275, + "learning_rate": 0.00019132390273809906, + "loss": 2.0543, + "step": 16190 + }, + { + "epoch": 1.889044452222611, + "grad_norm": 1.0706837177276611, + "learning_rate": 0.00019130933640572182, + "loss": 2.0414, + "step": 16191 + }, + { + "epoch": 1.8891611247229028, + "grad_norm": 1.0741182565689087, + "learning_rate": 0.00019129476966064539, + "loss": 1.9736, + "step": 16192 + }, + { + "epoch": 1.8892777972231944, + "grad_norm": 1.1090813875198364, + "learning_rate": 0.00019128020250302078, + "loss": 2.1094, + "step": 16193 + }, + { + "epoch": 1.8893944697234861, + "grad_norm": 1.0911002159118652, + "learning_rate": 0.0001912656349329991, + "loss": 1.863, + "step": 16194 + }, + { + "epoch": 1.8895111422237778, + "grad_norm": 1.1594382524490356, + "learning_rate": 0.00019125106695073123, + "loss": 2.1593, + "step": 16195 + }, + { + "epoch": 1.8896278147240695, + "grad_norm": 1.153157114982605, + "learning_rate": 0.00019123649855636835, + "loss": 2.3749, + "step": 16196 + }, + { + "epoch": 1.8897444872243612, + "grad_norm": 1.0551484823226929, + "learning_rate": 0.00019122192975006135, + "loss": 1.8344, + "step": 16197 + }, + { + "epoch": 1.8898611597246529, + "grad_norm": 1.1573823690414429, + "learning_rate": 0.00019120736053196137, + "loss": 2.1143, + "step": 16198 + }, + { + "epoch": 1.8899778322249445, + "grad_norm": 1.1474767923355103, + "learning_rate": 0.00019119279090221944, + "loss": 1.8651, + "step": 16199 + }, + { + "epoch": 1.8900945047252362, + "grad_norm": 1.0681753158569336, + "learning_rate": 0.0001911782208609865, + "loss": 2.0866, + "step": 16200 + }, + { + "epoch": 1.890211177225528, + "grad_norm": 1.00837242603302, + "learning_rate": 0.00019116365040841373, + "loss": 1.8138, + "step": 16201 + }, + { + "epoch": 1.8903278497258196, + "grad_norm": 1.0777643918991089, + "learning_rate": 0.00019114907954465203, + "loss": 2.0151, + "step": 16202 + }, + { + "epoch": 1.8904445222261113, + "grad_norm": 1.2480019330978394, + "learning_rate": 0.00019113450826985261, + "loss": 2.1307, + "step": 16203 + }, + { + "epoch": 1.890561194726403, + "grad_norm": 1.1232917308807373, + "learning_rate": 0.00019111993658416643, + "loss": 1.9187, + "step": 16204 + }, + { + "epoch": 1.8906778672266946, + "grad_norm": 1.1642485857009888, + "learning_rate": 0.0001911053644877446, + "loss": 1.8417, + "step": 16205 + }, + { + "epoch": 1.8907945397269863, + "grad_norm": 1.17887544631958, + "learning_rate": 0.00019109079198073814, + "loss": 2.2299, + "step": 16206 + }, + { + "epoch": 1.890911212227278, + "grad_norm": 0.9875811338424683, + "learning_rate": 0.0001910762190632982, + "loss": 1.9274, + "step": 16207 + }, + { + "epoch": 1.8910278847275697, + "grad_norm": 1.1476211547851562, + "learning_rate": 0.00019106164573557573, + "loss": 2.0022, + "step": 16208 + }, + { + "epoch": 1.8911445572278613, + "grad_norm": 1.2666904926300049, + "learning_rate": 0.00019104707199772195, + "loss": 2.131, + "step": 16209 + }, + { + "epoch": 1.891261229728153, + "grad_norm": 1.2516798973083496, + "learning_rate": 0.00019103249784988788, + "loss": 2.2236, + "step": 16210 + }, + { + "epoch": 1.8913779022284447, + "grad_norm": 1.0463999509811401, + "learning_rate": 0.00019101792329222458, + "loss": 2.0739, + "step": 16211 + }, + { + "epoch": 1.8914945747287364, + "grad_norm": 1.1012402772903442, + "learning_rate": 0.00019100334832488318, + "loss": 2.0824, + "step": 16212 + }, + { + "epoch": 1.891611247229028, + "grad_norm": 1.2346631288528442, + "learning_rate": 0.0001909887729480148, + "loss": 2.3282, + "step": 16213 + }, + { + "epoch": 1.8917279197293198, + "grad_norm": 1.087430715560913, + "learning_rate": 0.00019097419716177045, + "loss": 2.1918, + "step": 16214 + }, + { + "epoch": 1.8918445922296114, + "grad_norm": 1.0627198219299316, + "learning_rate": 0.0001909596209663013, + "loss": 1.9287, + "step": 16215 + }, + { + "epoch": 1.8919612647299031, + "grad_norm": 1.0397592782974243, + "learning_rate": 0.00019094504436175847, + "loss": 1.9378, + "step": 16216 + }, + { + "epoch": 1.8920779372301948, + "grad_norm": 1.1965478658676147, + "learning_rate": 0.0001909304673482931, + "loss": 2.0945, + "step": 16217 + }, + { + "epoch": 1.8921946097304865, + "grad_norm": 1.006968379020691, + "learning_rate": 0.0001909158899260562, + "loss": 1.8218, + "step": 16218 + }, + { + "epoch": 1.8923112822307782, + "grad_norm": 0.919048547744751, + "learning_rate": 0.000190901312095199, + "loss": 2.1269, + "step": 16219 + }, + { + "epoch": 1.8924279547310698, + "grad_norm": 0.938515305519104, + "learning_rate": 0.0001908867338558726, + "loss": 1.9935, + "step": 16220 + }, + { + "epoch": 1.8925446272313615, + "grad_norm": 0.9813210368156433, + "learning_rate": 0.0001908721552082281, + "loss": 1.9705, + "step": 16221 + }, + { + "epoch": 1.8926612997316532, + "grad_norm": 0.971466064453125, + "learning_rate": 0.00019085757615241666, + "loss": 1.9095, + "step": 16222 + }, + { + "epoch": 1.8927779722319449, + "grad_norm": 1.1384189128875732, + "learning_rate": 0.0001908429966885894, + "loss": 2.1852, + "step": 16223 + }, + { + "epoch": 1.8928946447322366, + "grad_norm": 1.1795634031295776, + "learning_rate": 0.0001908284168168975, + "loss": 2.1381, + "step": 16224 + }, + { + "epoch": 1.8930113172325282, + "grad_norm": 1.176713466644287, + "learning_rate": 0.00019081383653749207, + "loss": 2.0293, + "step": 16225 + }, + { + "epoch": 1.89312798973282, + "grad_norm": 1.1242396831512451, + "learning_rate": 0.00019079925585052432, + "loss": 2.0799, + "step": 16226 + }, + { + "epoch": 1.8932446622331116, + "grad_norm": 1.0156757831573486, + "learning_rate": 0.0001907846747561453, + "loss": 2.1248, + "step": 16227 + }, + { + "epoch": 1.8933613347334033, + "grad_norm": 1.0728205442428589, + "learning_rate": 0.00019077009325450624, + "loss": 1.9867, + "step": 16228 + }, + { + "epoch": 1.893478007233695, + "grad_norm": 1.1480083465576172, + "learning_rate": 0.00019075551134575832, + "loss": 2.1835, + "step": 16229 + }, + { + "epoch": 1.8935946797339867, + "grad_norm": 0.9921936988830566, + "learning_rate": 0.00019074092903005268, + "loss": 2.1108, + "step": 16230 + }, + { + "epoch": 1.8937113522342783, + "grad_norm": 1.05055570602417, + "learning_rate": 0.00019072634630754048, + "loss": 2.2044, + "step": 16231 + }, + { + "epoch": 1.89382802473457, + "grad_norm": 0.9665073156356812, + "learning_rate": 0.00019071176317837297, + "loss": 2.0315, + "step": 16232 + }, + { + "epoch": 1.8939446972348617, + "grad_norm": 1.0013929605484009, + "learning_rate": 0.0001906971796427013, + "loss": 1.9972, + "step": 16233 + }, + { + "epoch": 1.8940613697351534, + "grad_norm": 1.0730403661727905, + "learning_rate": 0.0001906825957006766, + "loss": 1.7911, + "step": 16234 + }, + { + "epoch": 1.894178042235445, + "grad_norm": 1.2848789691925049, + "learning_rate": 0.00019066801135245008, + "loss": 2.0129, + "step": 16235 + }, + { + "epoch": 1.8942947147357367, + "grad_norm": 1.0504010915756226, + "learning_rate": 0.00019065342659817293, + "loss": 2.2717, + "step": 16236 + }, + { + "epoch": 1.8944113872360284, + "grad_norm": 1.0417919158935547, + "learning_rate": 0.00019063884143799643, + "loss": 1.9925, + "step": 16237 + }, + { + "epoch": 1.89452805973632, + "grad_norm": 1.247825264930725, + "learning_rate": 0.00019062425587207174, + "loss": 2.2571, + "step": 16238 + }, + { + "epoch": 1.8946447322366118, + "grad_norm": 1.280718445777893, + "learning_rate": 0.00019060966990055, + "loss": 2.1653, + "step": 16239 + }, + { + "epoch": 1.8947614047369035, + "grad_norm": 1.302517294883728, + "learning_rate": 0.00019059508352358245, + "loss": 2.1943, + "step": 16240 + }, + { + "epoch": 1.8948780772371951, + "grad_norm": 1.2185391187667847, + "learning_rate": 0.00019058049674132037, + "loss": 2.2193, + "step": 16241 + }, + { + "epoch": 1.8949947497374868, + "grad_norm": 1.1918529272079468, + "learning_rate": 0.0001905659095539149, + "loss": 2.1731, + "step": 16242 + }, + { + "epoch": 1.8951114222377785, + "grad_norm": 1.1591365337371826, + "learning_rate": 0.00019055132196151733, + "loss": 2.127, + "step": 16243 + }, + { + "epoch": 1.8952280947380702, + "grad_norm": 1.1807594299316406, + "learning_rate": 0.0001905367339642789, + "loss": 2.1467, + "step": 16244 + }, + { + "epoch": 1.8953447672383619, + "grad_norm": 1.1067193746566772, + "learning_rate": 0.0001905221455623507, + "loss": 2.1587, + "step": 16245 + }, + { + "epoch": 1.8954614397386536, + "grad_norm": 1.0037791728973389, + "learning_rate": 0.00019050755675588412, + "loss": 2.1216, + "step": 16246 + }, + { + "epoch": 1.8955781122389452, + "grad_norm": 1.1314436197280884, + "learning_rate": 0.00019049296754503032, + "loss": 1.9925, + "step": 16247 + }, + { + "epoch": 1.895694784739237, + "grad_norm": 1.2112007141113281, + "learning_rate": 0.00019047837792994063, + "loss": 1.9744, + "step": 16248 + }, + { + "epoch": 1.8958114572395286, + "grad_norm": 1.0886090993881226, + "learning_rate": 0.0001904637879107662, + "loss": 1.9015, + "step": 16249 + }, + { + "epoch": 1.8959281297398203, + "grad_norm": 1.0373104810714722, + "learning_rate": 0.00019044919748765834, + "loss": 1.9597, + "step": 16250 + }, + { + "epoch": 1.896044802240112, + "grad_norm": 1.1176068782806396, + "learning_rate": 0.00019043460666076827, + "loss": 2.0118, + "step": 16251 + }, + { + "epoch": 1.8961614747404036, + "grad_norm": 1.194708228111267, + "learning_rate": 0.0001904200154302473, + "loss": 2.0893, + "step": 16252 + }, + { + "epoch": 1.8962781472406953, + "grad_norm": 1.0607953071594238, + "learning_rate": 0.0001904054237962467, + "loss": 1.9749, + "step": 16253 + }, + { + "epoch": 1.896394819740987, + "grad_norm": 0.9354615211486816, + "learning_rate": 0.00019039083175891763, + "loss": 1.9511, + "step": 16254 + }, + { + "epoch": 1.8965114922412787, + "grad_norm": 1.1699435710906982, + "learning_rate": 0.0001903762393184115, + "loss": 2.123, + "step": 16255 + }, + { + "epoch": 1.8966281647415704, + "grad_norm": 1.2270222902297974, + "learning_rate": 0.00019036164647487946, + "loss": 1.9509, + "step": 16256 + }, + { + "epoch": 1.896744837241862, + "grad_norm": 1.097324252128601, + "learning_rate": 0.00019034705322847292, + "loss": 2.0484, + "step": 16257 + }, + { + "epoch": 1.8968615097421537, + "grad_norm": 1.0573954582214355, + "learning_rate": 0.00019033245957934312, + "loss": 2.0574, + "step": 16258 + }, + { + "epoch": 1.8969781822424454, + "grad_norm": 1.071494460105896, + "learning_rate": 0.0001903178655276413, + "loss": 2.0999, + "step": 16259 + }, + { + "epoch": 1.897094854742737, + "grad_norm": 1.0393773317337036, + "learning_rate": 0.00019030327107351887, + "loss": 2.0347, + "step": 16260 + }, + { + "epoch": 1.8972115272430288, + "grad_norm": 1.2383649349212646, + "learning_rate": 0.000190288676217127, + "loss": 2.1068, + "step": 16261 + }, + { + "epoch": 1.8973281997433205, + "grad_norm": 1.1260532140731812, + "learning_rate": 0.00019027408095861708, + "loss": 2.1545, + "step": 16262 + }, + { + "epoch": 1.8974448722436121, + "grad_norm": 1.1432567834854126, + "learning_rate": 0.00019025948529814035, + "loss": 2.1515, + "step": 16263 + }, + { + "epoch": 1.8975615447439038, + "grad_norm": 1.1809728145599365, + "learning_rate": 0.00019024488923584815, + "loss": 2.0403, + "step": 16264 + }, + { + "epoch": 1.8976782172441955, + "grad_norm": 1.0827280282974243, + "learning_rate": 0.0001902302927718918, + "loss": 2.176, + "step": 16265 + }, + { + "epoch": 1.8977948897444872, + "grad_norm": 1.0576823949813843, + "learning_rate": 0.00019021569590642266, + "loss": 2.0073, + "step": 16266 + }, + { + "epoch": 1.8979115622447789, + "grad_norm": 1.1527408361434937, + "learning_rate": 0.000190201098639592, + "loss": 2.1459, + "step": 16267 + }, + { + "epoch": 1.8980282347450705, + "grad_norm": 1.0357853174209595, + "learning_rate": 0.00019018650097155117, + "loss": 2.0002, + "step": 16268 + }, + { + "epoch": 1.8981449072453622, + "grad_norm": 1.1636728048324585, + "learning_rate": 0.0001901719029024515, + "loss": 2.0284, + "step": 16269 + }, + { + "epoch": 1.898261579745654, + "grad_norm": 1.318917989730835, + "learning_rate": 0.00019015730443244433, + "loss": 2.0785, + "step": 16270 + }, + { + "epoch": 1.8983782522459456, + "grad_norm": 1.1666796207427979, + "learning_rate": 0.000190142705561681, + "loss": 2.1295, + "step": 16271 + }, + { + "epoch": 1.8984949247462373, + "grad_norm": 1.1158748865127563, + "learning_rate": 0.00019012810629031283, + "loss": 2.1928, + "step": 16272 + }, + { + "epoch": 1.898611597246529, + "grad_norm": 1.1847426891326904, + "learning_rate": 0.00019011350661849122, + "loss": 1.9604, + "step": 16273 + }, + { + "epoch": 1.8987282697468206, + "grad_norm": 1.1856974363327026, + "learning_rate": 0.00019009890654636747, + "loss": 2.0315, + "step": 16274 + }, + { + "epoch": 1.8988449422471123, + "grad_norm": 1.000550389289856, + "learning_rate": 0.00019008430607409297, + "loss": 1.8194, + "step": 16275 + }, + { + "epoch": 1.898961614747404, + "grad_norm": 1.1485391855239868, + "learning_rate": 0.00019006970520181907, + "loss": 2.0922, + "step": 16276 + }, + { + "epoch": 1.8990782872476957, + "grad_norm": 1.09098482131958, + "learning_rate": 0.0001900551039296971, + "loss": 2.1373, + "step": 16277 + }, + { + "epoch": 1.8991949597479874, + "grad_norm": 0.9777517318725586, + "learning_rate": 0.00019004050225787853, + "loss": 1.8185, + "step": 16278 + }, + { + "epoch": 1.899311632248279, + "grad_norm": 1.1392817497253418, + "learning_rate": 0.00019002590018651466, + "loss": 2.1932, + "step": 16279 + }, + { + "epoch": 1.8994283047485707, + "grad_norm": 1.0962324142456055, + "learning_rate": 0.00019001129771575683, + "loss": 2.0385, + "step": 16280 + }, + { + "epoch": 1.8995449772488624, + "grad_norm": 1.1184495687484741, + "learning_rate": 0.0001899966948457565, + "loss": 2.0061, + "step": 16281 + }, + { + "epoch": 1.899661649749154, + "grad_norm": 1.2185331583023071, + "learning_rate": 0.00018998209157666503, + "loss": 2.0068, + "step": 16282 + }, + { + "epoch": 1.8997783222494458, + "grad_norm": 1.1075726747512817, + "learning_rate": 0.00018996748790863386, + "loss": 2.0115, + "step": 16283 + }, + { + "epoch": 1.8998949947497374, + "grad_norm": 1.1480014324188232, + "learning_rate": 0.00018995288384181426, + "loss": 1.9928, + "step": 16284 + }, + { + "epoch": 1.9000116672500291, + "grad_norm": 1.205704927444458, + "learning_rate": 0.00018993827937635775, + "loss": 2.1112, + "step": 16285 + }, + { + "epoch": 1.9001283397503208, + "grad_norm": 1.182483434677124, + "learning_rate": 0.00018992367451241567, + "loss": 2.2369, + "step": 16286 + }, + { + "epoch": 1.9002450122506125, + "grad_norm": 0.9799625277519226, + "learning_rate": 0.00018990906925013944, + "loss": 1.9445, + "step": 16287 + }, + { + "epoch": 1.9003616847509042, + "grad_norm": 1.2122774124145508, + "learning_rate": 0.00018989446358968045, + "loss": 1.9332, + "step": 16288 + }, + { + "epoch": 1.9004783572511958, + "grad_norm": 1.2077056169509888, + "learning_rate": 0.00018987985753119018, + "loss": 2.2983, + "step": 16289 + }, + { + "epoch": 1.9005950297514875, + "grad_norm": 1.0882947444915771, + "learning_rate": 0.00018986525107481998, + "loss": 2.1273, + "step": 16290 + }, + { + "epoch": 1.9007117022517792, + "grad_norm": 1.245848298072815, + "learning_rate": 0.0001898506442207213, + "loss": 2.2984, + "step": 16291 + }, + { + "epoch": 1.900828374752071, + "grad_norm": 1.1884804964065552, + "learning_rate": 0.0001898360369690456, + "loss": 2.0747, + "step": 16292 + }, + { + "epoch": 1.9009450472523626, + "grad_norm": 0.9814680218696594, + "learning_rate": 0.00018982142931994427, + "loss": 1.9328, + "step": 16293 + }, + { + "epoch": 1.9010617197526543, + "grad_norm": 1.1583887338638306, + "learning_rate": 0.00018980682127356873, + "loss": 2.2266, + "step": 16294 + }, + { + "epoch": 1.901178392252946, + "grad_norm": 0.9709891080856323, + "learning_rate": 0.00018979221283007045, + "loss": 2.0453, + "step": 16295 + }, + { + "epoch": 1.9012950647532376, + "grad_norm": 1.0130170583724976, + "learning_rate": 0.00018977760398960087, + "loss": 1.8456, + "step": 16296 + }, + { + "epoch": 1.9014117372535293, + "grad_norm": 1.3842681646347046, + "learning_rate": 0.0001897629947523115, + "loss": 2.0528, + "step": 16297 + }, + { + "epoch": 1.901528409753821, + "grad_norm": 1.0622256994247437, + "learning_rate": 0.00018974838511835368, + "loss": 2.0423, + "step": 16298 + }, + { + "epoch": 1.9016450822541127, + "grad_norm": 1.1386055946350098, + "learning_rate": 0.00018973377508787892, + "loss": 1.9871, + "step": 16299 + }, + { + "epoch": 1.9017617547544043, + "grad_norm": 1.1715449094772339, + "learning_rate": 0.00018971916466103871, + "loss": 1.9907, + "step": 16300 + }, + { + "epoch": 1.901878427254696, + "grad_norm": 0.9923451542854309, + "learning_rate": 0.00018970455383798446, + "loss": 1.9777, + "step": 16301 + }, + { + "epoch": 1.9019950997549877, + "grad_norm": 1.0542550086975098, + "learning_rate": 0.00018968994261886766, + "loss": 1.905, + "step": 16302 + }, + { + "epoch": 1.9021117722552794, + "grad_norm": 1.0996439456939697, + "learning_rate": 0.0001896753310038398, + "loss": 2.0707, + "step": 16303 + }, + { + "epoch": 1.902228444755571, + "grad_norm": 1.1985225677490234, + "learning_rate": 0.00018966071899305238, + "loss": 2.0702, + "step": 16304 + }, + { + "epoch": 1.9023451172558627, + "grad_norm": 1.3219327926635742, + "learning_rate": 0.00018964610658665678, + "loss": 2.1004, + "step": 16305 + }, + { + "epoch": 1.9024617897561544, + "grad_norm": 1.1511520147323608, + "learning_rate": 0.00018963149378480452, + "loss": 2.1564, + "step": 16306 + }, + { + "epoch": 1.9025784622564461, + "grad_norm": 1.0274385213851929, + "learning_rate": 0.0001896168805876472, + "loss": 2.0752, + "step": 16307 + }, + { + "epoch": 1.9026951347567378, + "grad_norm": 1.0911028385162354, + "learning_rate": 0.00018960226699533622, + "loss": 2.1419, + "step": 16308 + }, + { + "epoch": 1.9028118072570295, + "grad_norm": 1.1170850992202759, + "learning_rate": 0.00018958765300802305, + "loss": 2.0428, + "step": 16309 + }, + { + "epoch": 1.9029284797573212, + "grad_norm": 1.141568899154663, + "learning_rate": 0.00018957303862585927, + "loss": 2.1483, + "step": 16310 + }, + { + "epoch": 1.9030451522576128, + "grad_norm": 1.1396452188491821, + "learning_rate": 0.00018955842384899634, + "loss": 2.1232, + "step": 16311 + }, + { + "epoch": 1.9031618247579045, + "grad_norm": 1.1091995239257812, + "learning_rate": 0.00018954380867758578, + "loss": 1.8832, + "step": 16312 + }, + { + "epoch": 1.9032784972581962, + "grad_norm": 1.1809403896331787, + "learning_rate": 0.00018952919311177908, + "loss": 2.1465, + "step": 16313 + }, + { + "epoch": 1.9033951697584879, + "grad_norm": 1.0701290369033813, + "learning_rate": 0.0001895145771517278, + "loss": 2.1395, + "step": 16314 + }, + { + "epoch": 1.9035118422587796, + "grad_norm": 1.1733958721160889, + "learning_rate": 0.0001894999607975834, + "loss": 1.832, + "step": 16315 + }, + { + "epoch": 1.9036285147590712, + "grad_norm": 1.0210447311401367, + "learning_rate": 0.0001894853440494975, + "loss": 2.0161, + "step": 16316 + }, + { + "epoch": 1.903745187259363, + "grad_norm": 1.1834882497787476, + "learning_rate": 0.00018947072690762157, + "loss": 2.0832, + "step": 16317 + }, + { + "epoch": 1.9038618597596546, + "grad_norm": 0.9985252022743225, + "learning_rate": 0.00018945610937210714, + "loss": 2.121, + "step": 16318 + }, + { + "epoch": 1.9039785322599463, + "grad_norm": 1.0914833545684814, + "learning_rate": 0.0001894414914431057, + "loss": 2.0349, + "step": 16319 + }, + { + "epoch": 1.904095204760238, + "grad_norm": 0.9395394921302795, + "learning_rate": 0.00018942687312076896, + "loss": 2.1073, + "step": 16320 + }, + { + "epoch": 1.9042118772605297, + "grad_norm": 1.0938388109207153, + "learning_rate": 0.00018941225440524833, + "loss": 1.8744, + "step": 16321 + }, + { + "epoch": 1.9043285497608213, + "grad_norm": 1.2332040071487427, + "learning_rate": 0.00018939763529669533, + "loss": 1.9702, + "step": 16322 + }, + { + "epoch": 1.904445222261113, + "grad_norm": 1.118796706199646, + "learning_rate": 0.00018938301579526164, + "loss": 2.0981, + "step": 16323 + }, + { + "epoch": 1.9045618947614047, + "grad_norm": 1.2309627532958984, + "learning_rate": 0.0001893683959010987, + "loss": 2.0179, + "step": 16324 + }, + { + "epoch": 1.9046785672616964, + "grad_norm": 1.1747524738311768, + "learning_rate": 0.0001893537756143582, + "loss": 1.9662, + "step": 16325 + }, + { + "epoch": 1.904795239761988, + "grad_norm": 1.198053002357483, + "learning_rate": 0.0001893391549351916, + "loss": 2.1612, + "step": 16326 + }, + { + "epoch": 1.9049119122622797, + "grad_norm": 1.109986424446106, + "learning_rate": 0.00018932453386375048, + "loss": 1.9557, + "step": 16327 + }, + { + "epoch": 1.9050285847625714, + "grad_norm": 1.131568431854248, + "learning_rate": 0.00018930991240018644, + "loss": 2.1588, + "step": 16328 + }, + { + "epoch": 1.905145257262863, + "grad_norm": 1.2759206295013428, + "learning_rate": 0.00018929529054465113, + "loss": 2.0709, + "step": 16329 + }, + { + "epoch": 1.9052619297631548, + "grad_norm": 1.311997413635254, + "learning_rate": 0.00018928066829729598, + "loss": 2.12, + "step": 16330 + }, + { + "epoch": 1.9053786022634465, + "grad_norm": 1.1264863014221191, + "learning_rate": 0.00018926604565827268, + "loss": 2.0905, + "step": 16331 + }, + { + "epoch": 1.9054952747637381, + "grad_norm": 1.2349950075149536, + "learning_rate": 0.00018925142262773284, + "loss": 2.1348, + "step": 16332 + }, + { + "epoch": 1.9056119472640298, + "grad_norm": 1.3745274543762207, + "learning_rate": 0.00018923679920582794, + "loss": 2.2248, + "step": 16333 + }, + { + "epoch": 1.9057286197643215, + "grad_norm": 1.014575481414795, + "learning_rate": 0.0001892221753927097, + "loss": 2.0395, + "step": 16334 + }, + { + "epoch": 1.9058452922646132, + "grad_norm": 1.0249156951904297, + "learning_rate": 0.00018920755118852965, + "loss": 2.0372, + "step": 16335 + }, + { + "epoch": 1.9059619647649049, + "grad_norm": 1.1185221672058105, + "learning_rate": 0.00018919292659343948, + "loss": 1.8862, + "step": 16336 + }, + { + "epoch": 1.9060786372651966, + "grad_norm": 1.3191688060760498, + "learning_rate": 0.00018917830160759076, + "loss": 2.1393, + "step": 16337 + }, + { + "epoch": 1.9061953097654882, + "grad_norm": 1.252410650253296, + "learning_rate": 0.00018916367623113503, + "loss": 2.1746, + "step": 16338 + }, + { + "epoch": 1.90631198226578, + "grad_norm": 1.1555826663970947, + "learning_rate": 0.00018914905046422403, + "loss": 2.1717, + "step": 16339 + }, + { + "epoch": 1.9064286547660716, + "grad_norm": 1.041569709777832, + "learning_rate": 0.0001891344243070093, + "loss": 1.925, + "step": 16340 + }, + { + "epoch": 1.9065453272663633, + "grad_norm": 1.1084216833114624, + "learning_rate": 0.0001891197977596425, + "loss": 1.9424, + "step": 16341 + }, + { + "epoch": 1.906661999766655, + "grad_norm": 1.0755910873413086, + "learning_rate": 0.00018910517082227523, + "loss": 2.1434, + "step": 16342 + }, + { + "epoch": 1.9067786722669466, + "grad_norm": 1.0755125284194946, + "learning_rate": 0.00018909054349505917, + "loss": 1.8602, + "step": 16343 + }, + { + "epoch": 1.9068953447672383, + "grad_norm": 1.2350050210952759, + "learning_rate": 0.00018907591577814597, + "loss": 2.1267, + "step": 16344 + }, + { + "epoch": 1.90701201726753, + "grad_norm": 1.0771548748016357, + "learning_rate": 0.0001890612876716872, + "loss": 2.0245, + "step": 16345 + }, + { + "epoch": 1.9071286897678217, + "grad_norm": 1.1183521747589111, + "learning_rate": 0.00018904665917583458, + "loss": 1.9594, + "step": 16346 + }, + { + "epoch": 1.9072453622681134, + "grad_norm": 1.0370627641677856, + "learning_rate": 0.00018903203029073975, + "loss": 1.8574, + "step": 16347 + }, + { + "epoch": 1.907362034768405, + "grad_norm": 1.1284234523773193, + "learning_rate": 0.00018901740101655434, + "loss": 1.9491, + "step": 16348 + }, + { + "epoch": 1.9074787072686967, + "grad_norm": 1.1692266464233398, + "learning_rate": 0.00018900277135343004, + "loss": 2.1335, + "step": 16349 + }, + { + "epoch": 1.9075953797689884, + "grad_norm": 1.1690224409103394, + "learning_rate": 0.00018898814130151846, + "loss": 2.1578, + "step": 16350 + }, + { + "epoch": 1.90771205226928, + "grad_norm": 1.1950337886810303, + "learning_rate": 0.00018897351086097133, + "loss": 1.9867, + "step": 16351 + }, + { + "epoch": 1.9078287247695718, + "grad_norm": 1.0714348554611206, + "learning_rate": 0.0001889588800319403, + "loss": 2.0554, + "step": 16352 + }, + { + "epoch": 1.9079453972698635, + "grad_norm": 0.9425806999206543, + "learning_rate": 0.00018894424881457701, + "loss": 1.8682, + "step": 16353 + }, + { + "epoch": 1.9080620697701551, + "grad_norm": 1.2431364059448242, + "learning_rate": 0.00018892961720903327, + "loss": 1.949, + "step": 16354 + }, + { + "epoch": 1.9081787422704468, + "grad_norm": 1.1483596563339233, + "learning_rate": 0.0001889149852154606, + "loss": 2.1481, + "step": 16355 + }, + { + "epoch": 1.9082954147707385, + "grad_norm": 1.0637714862823486, + "learning_rate": 0.00018890035283401074, + "loss": 2.056, + "step": 16356 + }, + { + "epoch": 1.9084120872710302, + "grad_norm": 1.0574357509613037, + "learning_rate": 0.00018888572006483547, + "loss": 2.0834, + "step": 16357 + }, + { + "epoch": 1.9085287597713219, + "grad_norm": 1.2047679424285889, + "learning_rate": 0.00018887108690808632, + "loss": 2.1436, + "step": 16358 + }, + { + "epoch": 1.9086454322716135, + "grad_norm": 1.0886842012405396, + "learning_rate": 0.00018885645336391514, + "loss": 2.2024, + "step": 16359 + }, + { + "epoch": 1.9087621047719052, + "grad_norm": 1.0447242259979248, + "learning_rate": 0.00018884181943247354, + "loss": 1.9964, + "step": 16360 + }, + { + "epoch": 1.908878777272197, + "grad_norm": 1.2599629163742065, + "learning_rate": 0.00018882718511391335, + "loss": 2.1942, + "step": 16361 + }, + { + "epoch": 1.9089954497724886, + "grad_norm": 1.1609883308410645, + "learning_rate": 0.00018881255040838617, + "loss": 2.1583, + "step": 16362 + }, + { + "epoch": 1.9091121222727803, + "grad_norm": 1.2032101154327393, + "learning_rate": 0.0001887979153160437, + "loss": 2.0008, + "step": 16363 + }, + { + "epoch": 1.909228794773072, + "grad_norm": 1.036718726158142, + "learning_rate": 0.00018878327983703778, + "loss": 2.0533, + "step": 16364 + }, + { + "epoch": 1.9093454672733636, + "grad_norm": 1.0179836750030518, + "learning_rate": 0.00018876864397152, + "loss": 2.0424, + "step": 16365 + }, + { + "epoch": 1.9094621397736553, + "grad_norm": 1.0577914714813232, + "learning_rate": 0.00018875400771964216, + "loss": 2.0457, + "step": 16366 + }, + { + "epoch": 1.909578812273947, + "grad_norm": 1.0962623357772827, + "learning_rate": 0.00018873937108155602, + "loss": 1.9623, + "step": 16367 + }, + { + "epoch": 1.9096954847742387, + "grad_norm": 1.0738730430603027, + "learning_rate": 0.00018872473405741327, + "loss": 1.9838, + "step": 16368 + }, + { + "epoch": 1.9098121572745304, + "grad_norm": 1.1838001012802124, + "learning_rate": 0.00018871009664736562, + "loss": 2.0501, + "step": 16369 + }, + { + "epoch": 1.909928829774822, + "grad_norm": 1.1869487762451172, + "learning_rate": 0.0001886954588515649, + "loss": 2.041, + "step": 16370 + }, + { + "epoch": 1.9100455022751137, + "grad_norm": 1.0342421531677246, + "learning_rate": 0.00018868082067016278, + "loss": 2.0351, + "step": 16371 + }, + { + "epoch": 1.9101621747754054, + "grad_norm": 1.1099776029586792, + "learning_rate": 0.00018866618210331107, + "loss": 2.2156, + "step": 16372 + }, + { + "epoch": 1.910278847275697, + "grad_norm": 0.9511376619338989, + "learning_rate": 0.00018865154315116152, + "loss": 1.9466, + "step": 16373 + }, + { + "epoch": 1.9103955197759888, + "grad_norm": 1.1731082201004028, + "learning_rate": 0.00018863690381386582, + "loss": 2.1022, + "step": 16374 + }, + { + "epoch": 1.9105121922762804, + "grad_norm": 1.2015239000320435, + "learning_rate": 0.0001886222640915758, + "loss": 2.2223, + "step": 16375 + }, + { + "epoch": 1.9106288647765721, + "grad_norm": 0.9569002985954285, + "learning_rate": 0.00018860762398444325, + "loss": 1.9774, + "step": 16376 + }, + { + "epoch": 1.9107455372768638, + "grad_norm": 1.2211016416549683, + "learning_rate": 0.00018859298349261988, + "loss": 1.8582, + "step": 16377 + }, + { + "epoch": 1.9108622097771555, + "grad_norm": 1.0310567617416382, + "learning_rate": 0.0001885783426162575, + "loss": 1.8545, + "step": 16378 + }, + { + "epoch": 1.9109788822774472, + "grad_norm": 0.90913325548172, + "learning_rate": 0.00018856370135550792, + "loss": 1.8098, + "step": 16379 + }, + { + "epoch": 1.9110955547777388, + "grad_norm": 1.109019160270691, + "learning_rate": 0.0001885490597105229, + "loss": 2.2523, + "step": 16380 + }, + { + "epoch": 1.9112122272780305, + "grad_norm": 1.1352213621139526, + "learning_rate": 0.0001885344176814542, + "loss": 2.0168, + "step": 16381 + }, + { + "epoch": 1.9113288997783222, + "grad_norm": 1.1558358669281006, + "learning_rate": 0.0001885197752684536, + "loss": 2.2133, + "step": 16382 + }, + { + "epoch": 1.911445572278614, + "grad_norm": 1.154754877090454, + "learning_rate": 0.0001885051324716729, + "loss": 1.985, + "step": 16383 + }, + { + "epoch": 1.9115622447789056, + "grad_norm": 1.2213605642318726, + "learning_rate": 0.00018849048929126398, + "loss": 2.0966, + "step": 16384 + }, + { + "epoch": 1.9116789172791973, + "grad_norm": 0.974761962890625, + "learning_rate": 0.00018847584572737855, + "loss": 1.9866, + "step": 16385 + }, + { + "epoch": 1.911795589779489, + "grad_norm": 0.9653862714767456, + "learning_rate": 0.00018846120178016846, + "loss": 2.1044, + "step": 16386 + }, + { + "epoch": 1.9119122622797806, + "grad_norm": 1.012160062789917, + "learning_rate": 0.00018844655744978554, + "loss": 1.9895, + "step": 16387 + }, + { + "epoch": 1.9120289347800723, + "grad_norm": 1.0428229570388794, + "learning_rate": 0.00018843191273638167, + "loss": 1.9068, + "step": 16388 + }, + { + "epoch": 1.912145607280364, + "grad_norm": 1.202952265739441, + "learning_rate": 0.00018841726764010848, + "loss": 2.1399, + "step": 16389 + }, + { + "epoch": 1.9122622797806557, + "grad_norm": 1.0004454851150513, + "learning_rate": 0.00018840262216111793, + "loss": 2.0102, + "step": 16390 + }, + { + "epoch": 1.9123789522809473, + "grad_norm": 1.0952677726745605, + "learning_rate": 0.00018838797629956182, + "loss": 1.9111, + "step": 16391 + }, + { + "epoch": 1.912495624781239, + "grad_norm": 1.070655345916748, + "learning_rate": 0.000188373330055592, + "loss": 1.8378, + "step": 16392 + }, + { + "epoch": 1.9126122972815307, + "grad_norm": 1.3767826557159424, + "learning_rate": 0.0001883586834293603, + "loss": 2.1208, + "step": 16393 + }, + { + "epoch": 1.9127289697818224, + "grad_norm": 1.0819406509399414, + "learning_rate": 0.00018834403642101853, + "loss": 1.917, + "step": 16394 + }, + { + "epoch": 1.912845642282114, + "grad_norm": 1.062126636505127, + "learning_rate": 0.00018832938903071852, + "loss": 1.9172, + "step": 16395 + }, + { + "epoch": 1.9129623147824057, + "grad_norm": 1.1646207571029663, + "learning_rate": 0.00018831474125861214, + "loss": 2.0855, + "step": 16396 + }, + { + "epoch": 1.9130789872826974, + "grad_norm": 1.1311180591583252, + "learning_rate": 0.00018830009310485133, + "loss": 2.093, + "step": 16397 + }, + { + "epoch": 1.913195659782989, + "grad_norm": 1.0644911527633667, + "learning_rate": 0.00018828544456958785, + "loss": 2.0901, + "step": 16398 + }, + { + "epoch": 1.9133123322832808, + "grad_norm": 1.0954633951187134, + "learning_rate": 0.0001882707956529735, + "loss": 2.0868, + "step": 16399 + }, + { + "epoch": 1.9134290047835725, + "grad_norm": 1.071653962135315, + "learning_rate": 0.00018825614635516029, + "loss": 1.9578, + "step": 16400 + }, + { + "epoch": 1.9135456772838642, + "grad_norm": 1.3379024267196655, + "learning_rate": 0.00018824149667629998, + "loss": 2.0222, + "step": 16401 + }, + { + "epoch": 1.9136623497841558, + "grad_norm": 1.139136791229248, + "learning_rate": 0.00018822684661654455, + "loss": 2.0981, + "step": 16402 + }, + { + "epoch": 1.9137790222844475, + "grad_norm": 1.272792935371399, + "learning_rate": 0.0001882121961760458, + "loss": 2.2113, + "step": 16403 + }, + { + "epoch": 1.9138956947847392, + "grad_norm": 1.3020495176315308, + "learning_rate": 0.00018819754535495556, + "loss": 2.3288, + "step": 16404 + }, + { + "epoch": 1.9140123672850309, + "grad_norm": 1.0099701881408691, + "learning_rate": 0.0001881828941534258, + "loss": 1.9426, + "step": 16405 + }, + { + "epoch": 1.9141290397853226, + "grad_norm": 1.2468265295028687, + "learning_rate": 0.0001881682425716084, + "loss": 2.1866, + "step": 16406 + }, + { + "epoch": 1.9142457122856142, + "grad_norm": 1.074587106704712, + "learning_rate": 0.0001881535906096552, + "loss": 2.0459, + "step": 16407 + }, + { + "epoch": 1.914362384785906, + "grad_norm": 1.1252586841583252, + "learning_rate": 0.0001881389382677181, + "loss": 2.2701, + "step": 16408 + }, + { + "epoch": 1.9144790572861976, + "grad_norm": 0.9791412353515625, + "learning_rate": 0.00018812428554594902, + "loss": 1.8864, + "step": 16409 + }, + { + "epoch": 1.9145957297864893, + "grad_norm": 1.0605902671813965, + "learning_rate": 0.00018810963244449993, + "loss": 1.9634, + "step": 16410 + }, + { + "epoch": 1.914712402286781, + "grad_norm": 1.0416392087936401, + "learning_rate": 0.00018809497896352256, + "loss": 2.2248, + "step": 16411 + }, + { + "epoch": 1.9148290747870726, + "grad_norm": 1.1359758377075195, + "learning_rate": 0.00018808032510316905, + "loss": 2.1519, + "step": 16412 + }, + { + "epoch": 1.9149457472873643, + "grad_norm": 1.0643953084945679, + "learning_rate": 0.00018806567086359114, + "loss": 2.0955, + "step": 16413 + }, + { + "epoch": 1.915062419787656, + "grad_norm": 1.021471381187439, + "learning_rate": 0.0001880510162449409, + "loss": 1.9679, + "step": 16414 + }, + { + "epoch": 1.9151790922879477, + "grad_norm": 1.059412956237793, + "learning_rate": 0.00018803636124737008, + "loss": 2.1232, + "step": 16415 + }, + { + "epoch": 1.9152957647882394, + "grad_norm": 1.1213048696517944, + "learning_rate": 0.0001880217058710307, + "loss": 2.0912, + "step": 16416 + }, + { + "epoch": 1.915412437288531, + "grad_norm": 0.973455548286438, + "learning_rate": 0.00018800705011607467, + "loss": 1.8991, + "step": 16417 + }, + { + "epoch": 1.9155291097888227, + "grad_norm": 1.2009297609329224, + "learning_rate": 0.00018799239398265395, + "loss": 2.1634, + "step": 16418 + }, + { + "epoch": 1.9156457822891144, + "grad_norm": 1.175310730934143, + "learning_rate": 0.0001879777374709204, + "loss": 2.1743, + "step": 16419 + }, + { + "epoch": 1.915762454789406, + "grad_norm": 1.1907527446746826, + "learning_rate": 0.0001879630805810261, + "loss": 2.0215, + "step": 16420 + }, + { + "epoch": 1.9158791272896978, + "grad_norm": 1.0018665790557861, + "learning_rate": 0.0001879484233131229, + "loss": 1.9282, + "step": 16421 + }, + { + "epoch": 1.9159957997899895, + "grad_norm": 1.2723571062088013, + "learning_rate": 0.00018793376566736276, + "loss": 2.2307, + "step": 16422 + }, + { + "epoch": 1.9161124722902811, + "grad_norm": 1.1506903171539307, + "learning_rate": 0.0001879191076438977, + "loss": 1.993, + "step": 16423 + }, + { + "epoch": 1.9162291447905728, + "grad_norm": 1.1558022499084473, + "learning_rate": 0.00018790444924287952, + "loss": 2.0714, + "step": 16424 + }, + { + "epoch": 1.9163458172908645, + "grad_norm": 1.3340935707092285, + "learning_rate": 0.00018788979046446038, + "loss": 2.1803, + "step": 16425 + }, + { + "epoch": 1.9164624897911562, + "grad_norm": 1.15876042842865, + "learning_rate": 0.00018787513130879213, + "loss": 2.0895, + "step": 16426 + }, + { + "epoch": 1.9165791622914479, + "grad_norm": 1.0447216033935547, + "learning_rate": 0.00018786047177602675, + "loss": 2.062, + "step": 16427 + }, + { + "epoch": 1.9166958347917395, + "grad_norm": 1.0034986734390259, + "learning_rate": 0.00018784581186631621, + "loss": 1.9496, + "step": 16428 + }, + { + "epoch": 1.9168125072920312, + "grad_norm": 1.201600193977356, + "learning_rate": 0.00018783115157981252, + "loss": 2.1028, + "step": 16429 + }, + { + "epoch": 1.916929179792323, + "grad_norm": 1.0712053775787354, + "learning_rate": 0.00018781649091666766, + "loss": 2.0175, + "step": 16430 + }, + { + "epoch": 1.9170458522926146, + "grad_norm": 1.0950220823287964, + "learning_rate": 0.00018780182987703357, + "loss": 2.1068, + "step": 16431 + }, + { + "epoch": 1.9171625247929063, + "grad_norm": 1.132776141166687, + "learning_rate": 0.00018778716846106234, + "loss": 1.9982, + "step": 16432 + }, + { + "epoch": 1.917279197293198, + "grad_norm": 1.0495600700378418, + "learning_rate": 0.00018777250666890584, + "loss": 1.815, + "step": 16433 + }, + { + "epoch": 1.9173958697934896, + "grad_norm": 1.1151087284088135, + "learning_rate": 0.00018775784450071611, + "loss": 1.9745, + "step": 16434 + }, + { + "epoch": 1.9175125422937813, + "grad_norm": 1.0508372783660889, + "learning_rate": 0.0001877431819566452, + "loss": 2.1628, + "step": 16435 + }, + { + "epoch": 1.917629214794073, + "grad_norm": 1.0787556171417236, + "learning_rate": 0.00018772851903684504, + "loss": 2.1277, + "step": 16436 + }, + { + "epoch": 1.9177458872943647, + "grad_norm": 1.15290367603302, + "learning_rate": 0.00018771385574146765, + "loss": 2.0703, + "step": 16437 + }, + { + "epoch": 1.9178625597946564, + "grad_norm": 1.0187774896621704, + "learning_rate": 0.00018769919207066514, + "loss": 1.8066, + "step": 16438 + }, + { + "epoch": 1.917979232294948, + "grad_norm": 1.0563524961471558, + "learning_rate": 0.00018768452802458939, + "loss": 2.0054, + "step": 16439 + }, + { + "epoch": 1.9180959047952397, + "grad_norm": 1.1603095531463623, + "learning_rate": 0.00018766986360339256, + "loss": 1.9703, + "step": 16440 + }, + { + "epoch": 1.9182125772955314, + "grad_norm": 1.0331823825836182, + "learning_rate": 0.00018765519880722655, + "loss": 1.8382, + "step": 16441 + }, + { + "epoch": 1.918329249795823, + "grad_norm": 1.1313732862472534, + "learning_rate": 0.00018764053363624342, + "loss": 2.1326, + "step": 16442 + }, + { + "epoch": 1.9184459222961148, + "grad_norm": 1.0009546279907227, + "learning_rate": 0.00018762586809059528, + "loss": 1.9903, + "step": 16443 + }, + { + "epoch": 1.9185625947964065, + "grad_norm": 1.126704216003418, + "learning_rate": 0.00018761120217043403, + "loss": 2.0738, + "step": 16444 + }, + { + "epoch": 1.9186792672966981, + "grad_norm": 1.071020483970642, + "learning_rate": 0.00018759653587591187, + "loss": 1.8928, + "step": 16445 + }, + { + "epoch": 1.9187959397969898, + "grad_norm": 1.1662898063659668, + "learning_rate": 0.0001875818692071807, + "loss": 2.0354, + "step": 16446 + }, + { + "epoch": 1.9189126122972815, + "grad_norm": 1.1097288131713867, + "learning_rate": 0.00018756720216439265, + "loss": 2.0605, + "step": 16447 + }, + { + "epoch": 1.9190292847975732, + "grad_norm": 1.4553946256637573, + "learning_rate": 0.00018755253474769977, + "loss": 2.1686, + "step": 16448 + }, + { + "epoch": 1.9191459572978649, + "grad_norm": 1.1073310375213623, + "learning_rate": 0.00018753786695725408, + "loss": 1.9304, + "step": 16449 + }, + { + "epoch": 1.9192626297981565, + "grad_norm": 0.956197202205658, + "learning_rate": 0.00018752319879320766, + "loss": 1.9598, + "step": 16450 + }, + { + "epoch": 1.9193793022984482, + "grad_norm": 1.0044186115264893, + "learning_rate": 0.00018750853025571257, + "loss": 1.977, + "step": 16451 + }, + { + "epoch": 1.91949597479874, + "grad_norm": 1.101757526397705, + "learning_rate": 0.00018749386134492085, + "loss": 2.1084, + "step": 16452 + }, + { + "epoch": 1.9196126472990316, + "grad_norm": 1.137589693069458, + "learning_rate": 0.00018747919206098463, + "loss": 2.1228, + "step": 16453 + }, + { + "epoch": 1.9197293197993233, + "grad_norm": 1.004876971244812, + "learning_rate": 0.00018746452240405592, + "loss": 1.7984, + "step": 16454 + }, + { + "epoch": 1.919845992299615, + "grad_norm": 0.9734475612640381, + "learning_rate": 0.00018744985237428686, + "loss": 1.9701, + "step": 16455 + }, + { + "epoch": 1.9199626647999066, + "grad_norm": 0.9695495367050171, + "learning_rate": 0.0001874351819718295, + "loss": 1.992, + "step": 16456 + }, + { + "epoch": 1.9200793373001983, + "grad_norm": 1.1080716848373413, + "learning_rate": 0.0001874205111968359, + "loss": 2.0997, + "step": 16457 + }, + { + "epoch": 1.92019600980049, + "grad_norm": 1.0862107276916504, + "learning_rate": 0.00018740584004945824, + "loss": 1.8596, + "step": 16458 + }, + { + "epoch": 1.9203126823007817, + "grad_norm": 1.1395432949066162, + "learning_rate": 0.00018739116852984855, + "loss": 2.0524, + "step": 16459 + }, + { + "epoch": 1.9204293548010734, + "grad_norm": 1.4697233438491821, + "learning_rate": 0.0001873764966381589, + "loss": 2.1135, + "step": 16460 + }, + { + "epoch": 1.920546027301365, + "grad_norm": 1.1042617559432983, + "learning_rate": 0.0001873618243745414, + "loss": 2.0561, + "step": 16461 + }, + { + "epoch": 1.9206626998016567, + "grad_norm": 1.110310673713684, + "learning_rate": 0.00018734715173914822, + "loss": 2.1912, + "step": 16462 + }, + { + "epoch": 1.9207793723019484, + "grad_norm": 1.0349444150924683, + "learning_rate": 0.0001873324787321314, + "loss": 1.9425, + "step": 16463 + }, + { + "epoch": 1.92089604480224, + "grad_norm": 0.9964301586151123, + "learning_rate": 0.00018731780535364306, + "loss": 1.9021, + "step": 16464 + }, + { + "epoch": 1.9210127173025318, + "grad_norm": 1.199849009513855, + "learning_rate": 0.00018730313160383546, + "loss": 1.9002, + "step": 16465 + }, + { + "epoch": 1.9211293898028234, + "grad_norm": 1.0701420307159424, + "learning_rate": 0.00018728845748286049, + "loss": 2.0608, + "step": 16466 + }, + { + "epoch": 1.9212460623031151, + "grad_norm": 1.0830990076065063, + "learning_rate": 0.00018727378299087044, + "loss": 1.9985, + "step": 16467 + }, + { + "epoch": 1.9213627348034068, + "grad_norm": 1.0520521402359009, + "learning_rate": 0.00018725910812801734, + "loss": 2.0964, + "step": 16468 + }, + { + "epoch": 1.9214794073036985, + "grad_norm": 1.2493103742599487, + "learning_rate": 0.0001872444328944534, + "loss": 2.2032, + "step": 16469 + }, + { + "epoch": 1.9215960798039902, + "grad_norm": 1.1451236009597778, + "learning_rate": 0.00018722975729033075, + "loss": 2.1071, + "step": 16470 + }, + { + "epoch": 1.9217127523042818, + "grad_norm": 0.9701276421546936, + "learning_rate": 0.00018721508131580144, + "loss": 1.8885, + "step": 16471 + }, + { + "epoch": 1.9218294248045735, + "grad_norm": 1.072128415107727, + "learning_rate": 0.00018720040497101774, + "loss": 1.9522, + "step": 16472 + }, + { + "epoch": 1.9219460973048652, + "grad_norm": 1.2050938606262207, + "learning_rate": 0.00018718572825613173, + "loss": 2.0092, + "step": 16473 + }, + { + "epoch": 1.922062769805157, + "grad_norm": 1.1848773956298828, + "learning_rate": 0.00018717105117129553, + "loss": 2.0375, + "step": 16474 + }, + { + "epoch": 1.9221794423054486, + "grad_norm": 1.32135009765625, + "learning_rate": 0.00018715637371666139, + "loss": 2.203, + "step": 16475 + }, + { + "epoch": 1.9222961148057403, + "grad_norm": 1.5415281057357788, + "learning_rate": 0.0001871416958923814, + "loss": 2.0471, + "step": 16476 + }, + { + "epoch": 1.922412787306032, + "grad_norm": 1.1737338304519653, + "learning_rate": 0.00018712701769860767, + "loss": 2.2413, + "step": 16477 + }, + { + "epoch": 1.9225294598063236, + "grad_norm": 1.0591827630996704, + "learning_rate": 0.00018711233913549252, + "loss": 1.9955, + "step": 16478 + }, + { + "epoch": 1.9226461323066153, + "grad_norm": 1.141416311264038, + "learning_rate": 0.000187097660203188, + "loss": 1.9331, + "step": 16479 + }, + { + "epoch": 1.922762804806907, + "grad_norm": 1.2876332998275757, + "learning_rate": 0.00018708298090184633, + "loss": 2.103, + "step": 16480 + }, + { + "epoch": 1.9228794773071987, + "grad_norm": 1.0220240354537964, + "learning_rate": 0.00018706830123161968, + "loss": 2.1103, + "step": 16481 + }, + { + "epoch": 1.9229961498074903, + "grad_norm": 1.0162882804870605, + "learning_rate": 0.00018705362119266023, + "loss": 2.0384, + "step": 16482 + }, + { + "epoch": 1.923112822307782, + "grad_norm": 1.0463494062423706, + "learning_rate": 0.00018703894078512024, + "loss": 1.9892, + "step": 16483 + }, + { + "epoch": 1.9232294948080737, + "grad_norm": 1.0781090259552002, + "learning_rate": 0.00018702426000915173, + "loss": 2.0975, + "step": 16484 + }, + { + "epoch": 1.9233461673083654, + "grad_norm": 1.1692652702331543, + "learning_rate": 0.00018700957886490704, + "loss": 2.1995, + "step": 16485 + }, + { + "epoch": 1.923462839808657, + "grad_norm": 1.1217336654663086, + "learning_rate": 0.0001869948973525383, + "loss": 1.9738, + "step": 16486 + }, + { + "epoch": 1.9235795123089487, + "grad_norm": 1.3998165130615234, + "learning_rate": 0.00018698021547219776, + "loss": 2.0415, + "step": 16487 + }, + { + "epoch": 1.9236961848092404, + "grad_norm": 1.287787675857544, + "learning_rate": 0.0001869655332240376, + "loss": 2.1439, + "step": 16488 + }, + { + "epoch": 1.923812857309532, + "grad_norm": 1.1233940124511719, + "learning_rate": 0.00018695085060821002, + "loss": 2.0196, + "step": 16489 + }, + { + "epoch": 1.9239295298098238, + "grad_norm": 1.1004326343536377, + "learning_rate": 0.00018693616762486718, + "loss": 2.0995, + "step": 16490 + }, + { + "epoch": 1.9240462023101155, + "grad_norm": 1.184768795967102, + "learning_rate": 0.00018692148427416146, + "loss": 2.0897, + "step": 16491 + }, + { + "epoch": 1.9241628748104072, + "grad_norm": 1.1965351104736328, + "learning_rate": 0.00018690680055624491, + "loss": 2.0419, + "step": 16492 + }, + { + "epoch": 1.9242795473106988, + "grad_norm": 1.1695843935012817, + "learning_rate": 0.00018689211647126988, + "loss": 2.0777, + "step": 16493 + }, + { + "epoch": 1.9243962198109905, + "grad_norm": 1.1842479705810547, + "learning_rate": 0.0001868774320193885, + "loss": 2.005, + "step": 16494 + }, + { + "epoch": 1.9245128923112822, + "grad_norm": 1.2220253944396973, + "learning_rate": 0.00018686274720075303, + "loss": 2.1087, + "step": 16495 + }, + { + "epoch": 1.9246295648115739, + "grad_norm": 1.1133054494857788, + "learning_rate": 0.00018684806201551575, + "loss": 1.9345, + "step": 16496 + }, + { + "epoch": 1.9247462373118656, + "grad_norm": 1.1516754627227783, + "learning_rate": 0.00018683337646382885, + "loss": 2.1525, + "step": 16497 + }, + { + "epoch": 1.9248629098121572, + "grad_norm": 1.0580366849899292, + "learning_rate": 0.00018681869054584463, + "loss": 1.9418, + "step": 16498 + }, + { + "epoch": 1.924979582312449, + "grad_norm": 1.083680510520935, + "learning_rate": 0.00018680400426171527, + "loss": 2.133, + "step": 16499 + }, + { + "epoch": 1.9250962548127406, + "grad_norm": 1.1083570718765259, + "learning_rate": 0.0001867893176115931, + "loss": 2.04, + "step": 16500 + }, + { + "epoch": 1.9252129273130323, + "grad_norm": 1.0010793209075928, + "learning_rate": 0.0001867746305956303, + "loss": 1.8356, + "step": 16501 + }, + { + "epoch": 1.925329599813324, + "grad_norm": 1.1575475931167603, + "learning_rate": 0.00018675994321397914, + "loss": 2.1213, + "step": 16502 + }, + { + "epoch": 1.9254462723136156, + "grad_norm": 1.0797747373580933, + "learning_rate": 0.0001867452554667919, + "loss": 2.1137, + "step": 16503 + }, + { + "epoch": 1.9255629448139073, + "grad_norm": 1.217187523841858, + "learning_rate": 0.00018673056735422084, + "loss": 2.0429, + "step": 16504 + }, + { + "epoch": 1.925679617314199, + "grad_norm": 1.0642038583755493, + "learning_rate": 0.00018671587887641823, + "loss": 2.0467, + "step": 16505 + }, + { + "epoch": 1.9257962898144907, + "grad_norm": 1.2031630277633667, + "learning_rate": 0.00018670119003353638, + "loss": 1.9377, + "step": 16506 + }, + { + "epoch": 1.9259129623147824, + "grad_norm": 1.0556222200393677, + "learning_rate": 0.0001866865008257275, + "loss": 2.0019, + "step": 16507 + }, + { + "epoch": 1.926029634815074, + "grad_norm": 1.1780734062194824, + "learning_rate": 0.00018667181125314394, + "loss": 1.9834, + "step": 16508 + }, + { + "epoch": 1.9261463073153657, + "grad_norm": 1.2100602388381958, + "learning_rate": 0.00018665712131593794, + "loss": 2.2261, + "step": 16509 + }, + { + "epoch": 1.9262629798156574, + "grad_norm": 1.1381114721298218, + "learning_rate": 0.00018664243101426182, + "loss": 2.0928, + "step": 16510 + }, + { + "epoch": 1.926379652315949, + "grad_norm": 1.0618664026260376, + "learning_rate": 0.00018662774034826783, + "loss": 2.0257, + "step": 16511 + }, + { + "epoch": 1.9264963248162408, + "grad_norm": 1.1469649076461792, + "learning_rate": 0.00018661304931810832, + "loss": 1.9162, + "step": 16512 + }, + { + "epoch": 1.9266129973165325, + "grad_norm": 0.9460930228233337, + "learning_rate": 0.00018659835792393553, + "loss": 2.0423, + "step": 16513 + }, + { + "epoch": 1.9267296698168241, + "grad_norm": 1.074278712272644, + "learning_rate": 0.0001865836661659018, + "loss": 2.132, + "step": 16514 + }, + { + "epoch": 1.9268463423171158, + "grad_norm": 1.2566667795181274, + "learning_rate": 0.00018656897404415942, + "loss": 2.0485, + "step": 16515 + }, + { + "epoch": 1.9269630148174075, + "grad_norm": 1.169319748878479, + "learning_rate": 0.00018655428155886074, + "loss": 2.1521, + "step": 16516 + }, + { + "epoch": 1.9270796873176992, + "grad_norm": 1.0174734592437744, + "learning_rate": 0.00018653958871015807, + "loss": 2.0779, + "step": 16517 + }, + { + "epoch": 1.9271963598179909, + "grad_norm": 1.1736809015274048, + "learning_rate": 0.0001865248954982037, + "loss": 2.106, + "step": 16518 + }, + { + "epoch": 1.9273130323182825, + "grad_norm": 1.0125732421875, + "learning_rate": 0.00018651020192314997, + "loss": 2.17, + "step": 16519 + }, + { + "epoch": 1.9274297048185742, + "grad_norm": 1.076285481452942, + "learning_rate": 0.00018649550798514915, + "loss": 2.0285, + "step": 16520 + }, + { + "epoch": 1.927546377318866, + "grad_norm": 1.3226816654205322, + "learning_rate": 0.00018648081368435366, + "loss": 2.0712, + "step": 16521 + }, + { + "epoch": 1.9276630498191576, + "grad_norm": 0.9189196228981018, + "learning_rate": 0.00018646611902091575, + "loss": 1.9865, + "step": 16522 + }, + { + "epoch": 1.9277797223194493, + "grad_norm": 1.050674557685852, + "learning_rate": 0.00018645142399498785, + "loss": 1.9087, + "step": 16523 + }, + { + "epoch": 1.927896394819741, + "grad_norm": 1.1492518186569214, + "learning_rate": 0.00018643672860672227, + "loss": 2.1806, + "step": 16524 + }, + { + "epoch": 1.9280130673200326, + "grad_norm": 1.045770525932312, + "learning_rate": 0.00018642203285627128, + "loss": 1.911, + "step": 16525 + }, + { + "epoch": 1.9281297398203243, + "grad_norm": 1.0196865797042847, + "learning_rate": 0.0001864073367437873, + "loss": 2.1886, + "step": 16526 + }, + { + "epoch": 1.928246412320616, + "grad_norm": 1.0150363445281982, + "learning_rate": 0.00018639264026942273, + "loss": 1.8662, + "step": 16527 + }, + { + "epoch": 1.9283630848209077, + "grad_norm": 1.073909878730774, + "learning_rate": 0.00018637794343332988, + "loss": 1.9477, + "step": 16528 + }, + { + "epoch": 1.9284797573211994, + "grad_norm": 1.0821713209152222, + "learning_rate": 0.000186363246235661, + "loss": 2.0436, + "step": 16529 + }, + { + "epoch": 1.928596429821491, + "grad_norm": 1.2616360187530518, + "learning_rate": 0.00018634854867656861, + "loss": 2.0682, + "step": 16530 + }, + { + "epoch": 1.9287131023217827, + "grad_norm": 1.004542350769043, + "learning_rate": 0.00018633385075620502, + "loss": 1.8619, + "step": 16531 + }, + { + "epoch": 1.9288297748220744, + "grad_norm": 0.8645080327987671, + "learning_rate": 0.00018631915247472258, + "loss": 1.8075, + "step": 16532 + }, + { + "epoch": 1.928946447322366, + "grad_norm": 1.0274864435195923, + "learning_rate": 0.0001863044538322737, + "loss": 2.0084, + "step": 16533 + }, + { + "epoch": 1.9290631198226578, + "grad_norm": 1.0973050594329834, + "learning_rate": 0.00018628975482901078, + "loss": 1.9374, + "step": 16534 + }, + { + "epoch": 1.9291797923229494, + "grad_norm": 1.1072584390640259, + "learning_rate": 0.00018627505546508618, + "loss": 2.0623, + "step": 16535 + }, + { + "epoch": 1.9292964648232411, + "grad_norm": 1.155329942703247, + "learning_rate": 0.00018626035574065224, + "loss": 2.2031, + "step": 16536 + }, + { + "epoch": 1.9294131373235328, + "grad_norm": 1.105916142463684, + "learning_rate": 0.0001862456556558614, + "loss": 2.0425, + "step": 16537 + }, + { + "epoch": 1.9295298098238245, + "grad_norm": 1.0116212368011475, + "learning_rate": 0.00018623095521086604, + "loss": 2.0616, + "step": 16538 + }, + { + "epoch": 1.9296464823241162, + "grad_norm": 1.2338706254959106, + "learning_rate": 0.00018621625440581855, + "loss": 1.9998, + "step": 16539 + }, + { + "epoch": 1.9297631548244079, + "grad_norm": 0.9965884685516357, + "learning_rate": 0.00018620155324087137, + "loss": 2.0537, + "step": 16540 + }, + { + "epoch": 1.9298798273246995, + "grad_norm": 1.2132551670074463, + "learning_rate": 0.00018618685171617686, + "loss": 2.1708, + "step": 16541 + }, + { + "epoch": 1.9299964998249912, + "grad_norm": 1.0898346900939941, + "learning_rate": 0.00018617214983188745, + "loss": 2.0809, + "step": 16542 + }, + { + "epoch": 1.930113172325283, + "grad_norm": 1.104244351387024, + "learning_rate": 0.0001861574475881555, + "loss": 1.9347, + "step": 16543 + }, + { + "epoch": 1.9302298448255746, + "grad_norm": 1.1709978580474854, + "learning_rate": 0.00018614274498513357, + "loss": 2.0437, + "step": 16544 + }, + { + "epoch": 1.9303465173258663, + "grad_norm": 1.0894901752471924, + "learning_rate": 0.00018612804202297392, + "loss": 1.9982, + "step": 16545 + }, + { + "epoch": 1.930463189826158, + "grad_norm": 1.0135616064071655, + "learning_rate": 0.00018611333870182907, + "loss": 1.8507, + "step": 16546 + }, + { + "epoch": 1.9305798623264496, + "grad_norm": 1.0952085256576538, + "learning_rate": 0.0001860986350218514, + "loss": 2.0616, + "step": 16547 + }, + { + "epoch": 1.9306965348267413, + "grad_norm": 1.3183273077011108, + "learning_rate": 0.00018608393098319337, + "loss": 2.026, + "step": 16548 + }, + { + "epoch": 1.930813207327033, + "grad_norm": 1.2449932098388672, + "learning_rate": 0.0001860692265860074, + "loss": 1.9709, + "step": 16549 + }, + { + "epoch": 1.9309298798273247, + "grad_norm": 1.207923173904419, + "learning_rate": 0.00018605452183044596, + "loss": 1.9814, + "step": 16550 + }, + { + "epoch": 1.9310465523276164, + "grad_norm": 1.0827279090881348, + "learning_rate": 0.00018603981671666144, + "loss": 2.1241, + "step": 16551 + }, + { + "epoch": 1.931163224827908, + "grad_norm": 1.2842581272125244, + "learning_rate": 0.00018602511124480634, + "loss": 1.9877, + "step": 16552 + }, + { + "epoch": 1.9312798973281997, + "grad_norm": 1.3152154684066772, + "learning_rate": 0.00018601040541503305, + "loss": 2.1733, + "step": 16553 + }, + { + "epoch": 1.9313965698284914, + "grad_norm": 1.220472812652588, + "learning_rate": 0.0001859956992274941, + "loss": 2.0477, + "step": 16554 + }, + { + "epoch": 1.931513242328783, + "grad_norm": 1.1010594367980957, + "learning_rate": 0.00018598099268234182, + "loss": 2.0457, + "step": 16555 + }, + { + "epoch": 1.9316299148290748, + "grad_norm": 1.1134389638900757, + "learning_rate": 0.0001859662857797288, + "loss": 2.0351, + "step": 16556 + }, + { + "epoch": 1.9317465873293664, + "grad_norm": 1.234587550163269, + "learning_rate": 0.00018595157851980748, + "loss": 2.1123, + "step": 16557 + }, + { + "epoch": 1.9318632598296581, + "grad_norm": 1.0561964511871338, + "learning_rate": 0.00018593687090273032, + "loss": 2.2132, + "step": 16558 + }, + { + "epoch": 1.9319799323299498, + "grad_norm": 1.3142642974853516, + "learning_rate": 0.00018592216292864975, + "loss": 2.2705, + "step": 16559 + }, + { + "epoch": 1.9320966048302415, + "grad_norm": 1.0522123575210571, + "learning_rate": 0.0001859074545977183, + "loss": 1.8841, + "step": 16560 + }, + { + "epoch": 1.9322132773305332, + "grad_norm": 1.0714757442474365, + "learning_rate": 0.00018589274591008844, + "loss": 1.8381, + "step": 16561 + }, + { + "epoch": 1.9323299498308248, + "grad_norm": 1.0802351236343384, + "learning_rate": 0.0001858780368659126, + "loss": 2.0727, + "step": 16562 + }, + { + "epoch": 1.9324466223311165, + "grad_norm": 1.1812942028045654, + "learning_rate": 0.00018586332746534337, + "loss": 1.9707, + "step": 16563 + }, + { + "epoch": 1.9325632948314082, + "grad_norm": 1.1409083604812622, + "learning_rate": 0.0001858486177085331, + "loss": 2.0601, + "step": 16564 + }, + { + "epoch": 1.9326799673316999, + "grad_norm": 1.0869946479797363, + "learning_rate": 0.0001858339075956344, + "loss": 2.1408, + "step": 16565 + }, + { + "epoch": 1.9327966398319916, + "grad_norm": 1.3054065704345703, + "learning_rate": 0.0001858191971267997, + "loss": 2.0436, + "step": 16566 + }, + { + "epoch": 1.9329133123322833, + "grad_norm": 0.9626932144165039, + "learning_rate": 0.0001858044863021816, + "loss": 2.0058, + "step": 16567 + }, + { + "epoch": 1.933029984832575, + "grad_norm": 0.9839900732040405, + "learning_rate": 0.0001857897751219325, + "loss": 1.9653, + "step": 16568 + }, + { + "epoch": 1.9331466573328666, + "grad_norm": 1.1844276189804077, + "learning_rate": 0.00018577506358620493, + "loss": 1.9714, + "step": 16569 + }, + { + "epoch": 1.9332633298331583, + "grad_norm": 1.0301363468170166, + "learning_rate": 0.00018576035169515145, + "loss": 2.1066, + "step": 16570 + }, + { + "epoch": 1.93338000233345, + "grad_norm": 1.120015263557434, + "learning_rate": 0.00018574563944892452, + "loss": 2.1057, + "step": 16571 + }, + { + "epoch": 1.9334966748337417, + "grad_norm": 1.1809303760528564, + "learning_rate": 0.00018573092684767673, + "loss": 2.0126, + "step": 16572 + }, + { + "epoch": 1.9336133473340333, + "grad_norm": 1.2272197008132935, + "learning_rate": 0.0001857162138915605, + "loss": 2.1031, + "step": 16573 + }, + { + "epoch": 1.933730019834325, + "grad_norm": 1.1255916357040405, + "learning_rate": 0.00018570150058072846, + "loss": 2.0657, + "step": 16574 + }, + { + "epoch": 1.9338466923346167, + "grad_norm": 1.0862572193145752, + "learning_rate": 0.00018568678691533312, + "loss": 1.9508, + "step": 16575 + }, + { + "epoch": 1.9339633648349084, + "grad_norm": 1.135966181755066, + "learning_rate": 0.00018567207289552694, + "loss": 2.1762, + "step": 16576 + }, + { + "epoch": 1.9340800373352, + "grad_norm": 1.1315994262695312, + "learning_rate": 0.00018565735852146256, + "loss": 2.1145, + "step": 16577 + }, + { + "epoch": 1.9341967098354917, + "grad_norm": 1.0638772249221802, + "learning_rate": 0.0001856426437932925, + "loss": 2.2443, + "step": 16578 + }, + { + "epoch": 1.9343133823357834, + "grad_norm": 0.9755063056945801, + "learning_rate": 0.00018562792871116925, + "loss": 1.9358, + "step": 16579 + }, + { + "epoch": 1.934430054836075, + "grad_norm": 1.0617289543151855, + "learning_rate": 0.00018561321327524534, + "loss": 2.0899, + "step": 16580 + }, + { + "epoch": 1.9345467273363668, + "grad_norm": 0.9086790680885315, + "learning_rate": 0.00018559849748567343, + "loss": 1.9506, + "step": 16581 + }, + { + "epoch": 1.9346633998366585, + "grad_norm": 1.1355677843093872, + "learning_rate": 0.000185583781342606, + "loss": 2.0706, + "step": 16582 + }, + { + "epoch": 1.9347800723369502, + "grad_norm": 1.024545669555664, + "learning_rate": 0.00018556906484619564, + "loss": 2.0273, + "step": 16583 + }, + { + "epoch": 1.9348967448372418, + "grad_norm": 1.2848230600357056, + "learning_rate": 0.0001855543479965949, + "loss": 2.1188, + "step": 16584 + }, + { + "epoch": 1.9350134173375335, + "grad_norm": 1.2880425453186035, + "learning_rate": 0.0001855396307939564, + "loss": 2.0713, + "step": 16585 + }, + { + "epoch": 1.9351300898378252, + "grad_norm": 1.1434340476989746, + "learning_rate": 0.00018552491323843266, + "loss": 2.0399, + "step": 16586 + }, + { + "epoch": 1.9352467623381169, + "grad_norm": 1.1311872005462646, + "learning_rate": 0.00018551019533017627, + "loss": 2.2096, + "step": 16587 + }, + { + "epoch": 1.9353634348384086, + "grad_norm": 0.9787843823432922, + "learning_rate": 0.00018549547706933977, + "loss": 1.8898, + "step": 16588 + }, + { + "epoch": 1.9354801073387002, + "grad_norm": 1.0162056684494019, + "learning_rate": 0.0001854807584560758, + "loss": 1.8355, + "step": 16589 + }, + { + "epoch": 1.935596779838992, + "grad_norm": 1.0599448680877686, + "learning_rate": 0.00018546603949053686, + "loss": 2.2057, + "step": 16590 + }, + { + "epoch": 1.9357134523392836, + "grad_norm": 1.235030174255371, + "learning_rate": 0.00018545132017287562, + "loss": 1.9805, + "step": 16591 + }, + { + "epoch": 1.9358301248395753, + "grad_norm": 1.0790022611618042, + "learning_rate": 0.0001854366005032447, + "loss": 2.0206, + "step": 16592 + }, + { + "epoch": 1.935946797339867, + "grad_norm": 1.1774117946624756, + "learning_rate": 0.0001854218804817966, + "loss": 2.2006, + "step": 16593 + }, + { + "epoch": 1.9360634698401586, + "grad_norm": 1.120369553565979, + "learning_rate": 0.000185407160108684, + "loss": 2.1027, + "step": 16594 + }, + { + "epoch": 1.9361801423404503, + "grad_norm": 1.144593358039856, + "learning_rate": 0.0001853924393840595, + "loss": 2.0728, + "step": 16595 + }, + { + "epoch": 1.936296814840742, + "grad_norm": 1.0115525722503662, + "learning_rate": 0.00018537771830807564, + "loss": 2.0035, + "step": 16596 + }, + { + "epoch": 1.9364134873410337, + "grad_norm": 1.1514922380447388, + "learning_rate": 0.0001853629968808851, + "loss": 1.9526, + "step": 16597 + }, + { + "epoch": 1.9365301598413254, + "grad_norm": 1.102591872215271, + "learning_rate": 0.00018534827510264048, + "loss": 1.9119, + "step": 16598 + }, + { + "epoch": 1.936646832341617, + "grad_norm": 1.3692145347595215, + "learning_rate": 0.00018533355297349437, + "loss": 2.1142, + "step": 16599 + }, + { + "epoch": 1.9367635048419087, + "grad_norm": 1.0742864608764648, + "learning_rate": 0.0001853188304935994, + "loss": 1.9321, + "step": 16600 + }, + { + "epoch": 1.9368801773422004, + "grad_norm": 1.1962038278579712, + "learning_rate": 0.0001853041076631083, + "loss": 2.1145, + "step": 16601 + }, + { + "epoch": 1.936996849842492, + "grad_norm": 1.0871918201446533, + "learning_rate": 0.00018528938448217352, + "loss": 1.9946, + "step": 16602 + }, + { + "epoch": 1.9371135223427838, + "grad_norm": 1.011837363243103, + "learning_rate": 0.00018527466095094778, + "loss": 1.8199, + "step": 16603 + }, + { + "epoch": 1.9372301948430755, + "grad_norm": 1.0941190719604492, + "learning_rate": 0.00018525993706958383, + "loss": 1.9213, + "step": 16604 + }, + { + "epoch": 1.9373468673433671, + "grad_norm": 1.192366123199463, + "learning_rate": 0.0001852452128382341, + "loss": 2.0696, + "step": 16605 + }, + { + "epoch": 1.9374635398436588, + "grad_norm": 1.1568882465362549, + "learning_rate": 0.00018523048825705137, + "loss": 2.0526, + "step": 16606 + }, + { + "epoch": 1.9375802123439505, + "grad_norm": 1.3536478281021118, + "learning_rate": 0.0001852157633261882, + "loss": 2.1862, + "step": 16607 + }, + { + "epoch": 1.9376968848442422, + "grad_norm": 1.1166590452194214, + "learning_rate": 0.00018520103804579738, + "loss": 2.1029, + "step": 16608 + }, + { + "epoch": 1.9378135573445339, + "grad_norm": 1.1188733577728271, + "learning_rate": 0.0001851863124160314, + "loss": 1.9092, + "step": 16609 + }, + { + "epoch": 1.9379302298448255, + "grad_norm": 1.1568506956100464, + "learning_rate": 0.00018517158643704308, + "loss": 1.9505, + "step": 16610 + }, + { + "epoch": 1.9380469023451172, + "grad_norm": 1.090771198272705, + "learning_rate": 0.00018515686010898497, + "loss": 2.1517, + "step": 16611 + }, + { + "epoch": 1.938163574845409, + "grad_norm": 1.1265404224395752, + "learning_rate": 0.0001851421334320098, + "loss": 1.9649, + "step": 16612 + }, + { + "epoch": 1.9382802473457006, + "grad_norm": 1.174798607826233, + "learning_rate": 0.0001851274064062702, + "loss": 2.0004, + "step": 16613 + }, + { + "epoch": 1.9383969198459923, + "grad_norm": 1.230013132095337, + "learning_rate": 0.00018511267903191885, + "loss": 2.0935, + "step": 16614 + }, + { + "epoch": 1.938513592346284, + "grad_norm": 1.1658934354782104, + "learning_rate": 0.00018509795130910837, + "loss": 2.0849, + "step": 16615 + }, + { + "epoch": 1.9386302648465756, + "grad_norm": 1.1111253499984741, + "learning_rate": 0.0001850832232379916, + "loss": 1.996, + "step": 16616 + }, + { + "epoch": 1.9387469373468673, + "grad_norm": 1.081645131111145, + "learning_rate": 0.00018506849481872106, + "loss": 2.0266, + "step": 16617 + }, + { + "epoch": 1.938863609847159, + "grad_norm": 1.0317261219024658, + "learning_rate": 0.0001850537660514495, + "loss": 2.0604, + "step": 16618 + }, + { + "epoch": 1.9389802823474507, + "grad_norm": 1.132942795753479, + "learning_rate": 0.00018503903693632963, + "loss": 2.0662, + "step": 16619 + }, + { + "epoch": 1.9390969548477424, + "grad_norm": 1.0330296754837036, + "learning_rate": 0.00018502430747351417, + "loss": 2.0377, + "step": 16620 + }, + { + "epoch": 1.939213627348034, + "grad_norm": 1.128577709197998, + "learning_rate": 0.00018500957766315576, + "loss": 2.1346, + "step": 16621 + }, + { + "epoch": 1.9393302998483257, + "grad_norm": 1.1777263879776, + "learning_rate": 0.00018499484750540712, + "loss": 2.0952, + "step": 16622 + }, + { + "epoch": 1.9394469723486174, + "grad_norm": 1.0468206405639648, + "learning_rate": 0.00018498011700042092, + "loss": 1.9986, + "step": 16623 + }, + { + "epoch": 1.939563644848909, + "grad_norm": 1.2755153179168701, + "learning_rate": 0.00018496538614834994, + "loss": 2.1758, + "step": 16624 + }, + { + "epoch": 1.9396803173492008, + "grad_norm": 1.0862209796905518, + "learning_rate": 0.00018495065494934681, + "loss": 2.0123, + "step": 16625 + }, + { + "epoch": 1.9397969898494924, + "grad_norm": 1.0834718942642212, + "learning_rate": 0.0001849359234035644, + "loss": 2.0862, + "step": 16626 + }, + { + "epoch": 1.9399136623497841, + "grad_norm": 1.090560793876648, + "learning_rate": 0.00018492119151115524, + "loss": 2.1202, + "step": 16627 + }, + { + "epoch": 1.9400303348500758, + "grad_norm": 1.089306116104126, + "learning_rate": 0.00018490645927227219, + "loss": 2.1088, + "step": 16628 + }, + { + "epoch": 1.9401470073503675, + "grad_norm": 1.2876155376434326, + "learning_rate": 0.00018489172668706793, + "loss": 2.1365, + "step": 16629 + }, + { + "epoch": 1.9402636798506592, + "grad_norm": 1.3375039100646973, + "learning_rate": 0.0001848769937556952, + "loss": 2.1848, + "step": 16630 + }, + { + "epoch": 1.9403803523509509, + "grad_norm": 1.0437822341918945, + "learning_rate": 0.00018486226047830667, + "loss": 1.907, + "step": 16631 + }, + { + "epoch": 1.9404970248512425, + "grad_norm": 1.0476927757263184, + "learning_rate": 0.00018484752685505516, + "loss": 2.054, + "step": 16632 + }, + { + "epoch": 1.9406136973515342, + "grad_norm": 1.2792198657989502, + "learning_rate": 0.00018483279288609338, + "loss": 1.8251, + "step": 16633 + }, + { + "epoch": 1.940730369851826, + "grad_norm": 1.1232110261917114, + "learning_rate": 0.00018481805857157408, + "loss": 2.1157, + "step": 16634 + }, + { + "epoch": 1.9408470423521176, + "grad_norm": 1.050219178199768, + "learning_rate": 0.00018480332391165, + "loss": 2.0473, + "step": 16635 + }, + { + "epoch": 1.9409637148524093, + "grad_norm": 1.0485690832138062, + "learning_rate": 0.00018478858890647393, + "loss": 2.1529, + "step": 16636 + }, + { + "epoch": 1.941080387352701, + "grad_norm": 1.1396722793579102, + "learning_rate": 0.00018477385355619862, + "loss": 1.9732, + "step": 16637 + }, + { + "epoch": 1.9411970598529926, + "grad_norm": 1.18445885181427, + "learning_rate": 0.00018475911786097675, + "loss": 1.9835, + "step": 16638 + }, + { + "epoch": 1.9413137323532843, + "grad_norm": 1.2165782451629639, + "learning_rate": 0.0001847443818209612, + "loss": 1.9395, + "step": 16639 + }, + { + "epoch": 1.941430404853576, + "grad_norm": 1.092485785484314, + "learning_rate": 0.00018472964543630465, + "loss": 2.1526, + "step": 16640 + }, + { + "epoch": 1.9415470773538677, + "grad_norm": 0.9775098562240601, + "learning_rate": 0.00018471490870715986, + "loss": 2.0316, + "step": 16641 + }, + { + "epoch": 1.9416637498541593, + "grad_norm": 1.0059269666671753, + "learning_rate": 0.00018470017163367973, + "loss": 1.8004, + "step": 16642 + }, + { + "epoch": 1.941780422354451, + "grad_norm": 1.1475720405578613, + "learning_rate": 0.0001846854342160169, + "loss": 1.8192, + "step": 16643 + }, + { + "epoch": 1.9418970948547427, + "grad_norm": 1.1093302965164185, + "learning_rate": 0.00018467069645432418, + "loss": 1.9783, + "step": 16644 + }, + { + "epoch": 1.9420137673550344, + "grad_norm": 1.1916958093643188, + "learning_rate": 0.00018465595834875443, + "loss": 2.007, + "step": 16645 + }, + { + "epoch": 1.942130439855326, + "grad_norm": 1.239334225654602, + "learning_rate": 0.00018464121989946036, + "loss": 2.2586, + "step": 16646 + }, + { + "epoch": 1.9422471123556178, + "grad_norm": 0.9942759275436401, + "learning_rate": 0.00018462648110659477, + "loss": 1.9198, + "step": 16647 + }, + { + "epoch": 1.9423637848559094, + "grad_norm": 0.9128613471984863, + "learning_rate": 0.0001846117419703105, + "loss": 1.8964, + "step": 16648 + }, + { + "epoch": 1.9424804573562011, + "grad_norm": 1.3897006511688232, + "learning_rate": 0.0001845970024907603, + "loss": 1.9302, + "step": 16649 + }, + { + "epoch": 1.9425971298564928, + "grad_norm": 1.1321167945861816, + "learning_rate": 0.000184582262668097, + "loss": 2.2412, + "step": 16650 + }, + { + "epoch": 1.9427138023567845, + "grad_norm": 1.0496654510498047, + "learning_rate": 0.0001845675225024734, + "loss": 1.9369, + "step": 16651 + }, + { + "epoch": 1.9428304748570762, + "grad_norm": 1.2086111307144165, + "learning_rate": 0.00018455278199404235, + "loss": 2.0569, + "step": 16652 + }, + { + "epoch": 1.9429471473573678, + "grad_norm": 1.0673567056655884, + "learning_rate": 0.00018453804114295657, + "loss": 2.2147, + "step": 16653 + }, + { + "epoch": 1.9430638198576595, + "grad_norm": 1.0678393840789795, + "learning_rate": 0.00018452329994936897, + "loss": 1.9581, + "step": 16654 + }, + { + "epoch": 1.9431804923579512, + "grad_norm": 1.127425193786621, + "learning_rate": 0.00018450855841343236, + "loss": 2.0692, + "step": 16655 + }, + { + "epoch": 1.9432971648582429, + "grad_norm": 1.1036927700042725, + "learning_rate": 0.00018449381653529948, + "loss": 2.1787, + "step": 16656 + }, + { + "epoch": 1.9434138373585346, + "grad_norm": 1.0936932563781738, + "learning_rate": 0.0001844790743151232, + "loss": 2.0985, + "step": 16657 + }, + { + "epoch": 1.9435305098588262, + "grad_norm": 1.1504844427108765, + "learning_rate": 0.0001844643317530564, + "loss": 1.9743, + "step": 16658 + }, + { + "epoch": 1.943647182359118, + "grad_norm": 1.3600361347198486, + "learning_rate": 0.00018444958884925188, + "loss": 2.2409, + "step": 16659 + }, + { + "epoch": 1.9437638548594096, + "grad_norm": 1.1592040061950684, + "learning_rate": 0.0001844348456038624, + "loss": 1.958, + "step": 16660 + }, + { + "epoch": 1.9438805273597013, + "grad_norm": 1.1543127298355103, + "learning_rate": 0.00018442010201704095, + "loss": 1.9132, + "step": 16661 + }, + { + "epoch": 1.943997199859993, + "grad_norm": 1.3795608282089233, + "learning_rate": 0.0001844053580889403, + "loss": 2.168, + "step": 16662 + }, + { + "epoch": 1.9441138723602847, + "grad_norm": 1.1087837219238281, + "learning_rate": 0.0001843906138197133, + "loss": 2.073, + "step": 16663 + }, + { + "epoch": 1.9442305448605763, + "grad_norm": 1.1398776769638062, + "learning_rate": 0.0001843758692095128, + "loss": 2.1283, + "step": 16664 + }, + { + "epoch": 1.944347217360868, + "grad_norm": 0.9777557849884033, + "learning_rate": 0.00018436112425849167, + "loss": 1.9274, + "step": 16665 + }, + { + "epoch": 1.9444638898611597, + "grad_norm": 1.255272626876831, + "learning_rate": 0.00018434637896680274, + "loss": 2.2252, + "step": 16666 + }, + { + "epoch": 1.9445805623614514, + "grad_norm": 1.074183464050293, + "learning_rate": 0.00018433163333459894, + "loss": 2.1057, + "step": 16667 + }, + { + "epoch": 1.944697234861743, + "grad_norm": 1.0535567998886108, + "learning_rate": 0.00018431688736203302, + "loss": 2.1106, + "step": 16668 + }, + { + "epoch": 1.9448139073620347, + "grad_norm": 1.1604771614074707, + "learning_rate": 0.00018430214104925795, + "loss": 2.1375, + "step": 16669 + }, + { + "epoch": 1.9449305798623264, + "grad_norm": 0.9914947152137756, + "learning_rate": 0.00018428739439642656, + "loss": 1.9705, + "step": 16670 + }, + { + "epoch": 1.945047252362618, + "grad_norm": 1.053472876548767, + "learning_rate": 0.00018427264740369177, + "loss": 2.1063, + "step": 16671 + }, + { + "epoch": 1.9451639248629098, + "grad_norm": 1.1390337944030762, + "learning_rate": 0.00018425790007120638, + "loss": 2.0964, + "step": 16672 + }, + { + "epoch": 1.9452805973632015, + "grad_norm": 1.2077637910842896, + "learning_rate": 0.00018424315239912338, + "loss": 2.0797, + "step": 16673 + }, + { + "epoch": 1.9453972698634932, + "grad_norm": 1.1254346370697021, + "learning_rate": 0.00018422840438759558, + "loss": 2.0485, + "step": 16674 + }, + { + "epoch": 1.9455139423637848, + "grad_norm": 1.3259057998657227, + "learning_rate": 0.00018421365603677587, + "loss": 2.2041, + "step": 16675 + }, + { + "epoch": 1.9456306148640765, + "grad_norm": 1.0377217531204224, + "learning_rate": 0.00018419890734681717, + "loss": 1.9298, + "step": 16676 + }, + { + "epoch": 1.9457472873643682, + "grad_norm": 1.1340503692626953, + "learning_rate": 0.0001841841583178724, + "loss": 2.022, + "step": 16677 + }, + { + "epoch": 1.9458639598646599, + "grad_norm": 1.188040018081665, + "learning_rate": 0.00018416940895009444, + "loss": 2.0246, + "step": 16678 + }, + { + "epoch": 1.9459806323649516, + "grad_norm": 1.2054545879364014, + "learning_rate": 0.00018415465924363615, + "loss": 2.165, + "step": 16679 + }, + { + "epoch": 1.9460973048652432, + "grad_norm": 1.2587735652923584, + "learning_rate": 0.00018413990919865052, + "loss": 2.327, + "step": 16680 + }, + { + "epoch": 1.946213977365535, + "grad_norm": 1.2148032188415527, + "learning_rate": 0.00018412515881529044, + "loss": 1.9868, + "step": 16681 + }, + { + "epoch": 1.9463306498658266, + "grad_norm": 1.105147361755371, + "learning_rate": 0.00018411040809370878, + "loss": 2.0676, + "step": 16682 + }, + { + "epoch": 1.9464473223661183, + "grad_norm": 1.1933976411819458, + "learning_rate": 0.00018409565703405849, + "loss": 2.0848, + "step": 16683 + }, + { + "epoch": 1.94656399486641, + "grad_norm": 0.9880884289741516, + "learning_rate": 0.0001840809056364925, + "loss": 2.1221, + "step": 16684 + }, + { + "epoch": 1.9466806673667016, + "grad_norm": 1.116085171699524, + "learning_rate": 0.0001840661539011637, + "loss": 2.1469, + "step": 16685 + }, + { + "epoch": 1.9467973398669933, + "grad_norm": 1.1494306325912476, + "learning_rate": 0.000184051401828225, + "loss": 2.0343, + "step": 16686 + }, + { + "epoch": 1.946914012367285, + "grad_norm": 1.2246372699737549, + "learning_rate": 0.00018403664941782946, + "loss": 1.9865, + "step": 16687 + }, + { + "epoch": 1.9470306848675767, + "grad_norm": 1.1818586587905884, + "learning_rate": 0.0001840218966701299, + "loss": 2.0615, + "step": 16688 + }, + { + "epoch": 1.9471473573678684, + "grad_norm": 1.0183579921722412, + "learning_rate": 0.00018400714358527936, + "loss": 1.9859, + "step": 16689 + }, + { + "epoch": 1.94726402986816, + "grad_norm": 1.1278793811798096, + "learning_rate": 0.00018399239016343063, + "loss": 1.9884, + "step": 16690 + }, + { + "epoch": 1.9473807023684517, + "grad_norm": 1.112099051475525, + "learning_rate": 0.00018397763640473676, + "loss": 2.1707, + "step": 16691 + }, + { + "epoch": 1.9474973748687434, + "grad_norm": 1.1286364793777466, + "learning_rate": 0.00018396288230935071, + "loss": 1.9145, + "step": 16692 + }, + { + "epoch": 1.947614047369035, + "grad_norm": 1.1336324214935303, + "learning_rate": 0.0001839481278774254, + "loss": 2.0844, + "step": 16693 + }, + { + "epoch": 1.9477307198693268, + "grad_norm": 0.9762246608734131, + "learning_rate": 0.00018393337310911384, + "loss": 2.0489, + "step": 16694 + }, + { + "epoch": 1.9478473923696185, + "grad_norm": 1.288443684577942, + "learning_rate": 0.00018391861800456892, + "loss": 2.1394, + "step": 16695 + }, + { + "epoch": 1.9479640648699101, + "grad_norm": 1.066887378692627, + "learning_rate": 0.00018390386256394363, + "loss": 2.1539, + "step": 16696 + }, + { + "epoch": 1.9480807373702018, + "grad_norm": 0.8845151662826538, + "learning_rate": 0.00018388910678739098, + "loss": 1.8179, + "step": 16697 + }, + { + "epoch": 1.9481974098704935, + "grad_norm": 1.1351383924484253, + "learning_rate": 0.00018387435067506382, + "loss": 2.0392, + "step": 16698 + }, + { + "epoch": 1.9483140823707852, + "grad_norm": 1.0099295377731323, + "learning_rate": 0.0001838595942271153, + "loss": 1.9112, + "step": 16699 + }, + { + "epoch": 1.9484307548710769, + "grad_norm": 1.0218814611434937, + "learning_rate": 0.00018384483744369825, + "loss": 2.0313, + "step": 16700 + }, + { + "epoch": 1.9485474273713685, + "grad_norm": 1.2902843952178955, + "learning_rate": 0.00018383008032496573, + "loss": 2.0918, + "step": 16701 + }, + { + "epoch": 1.9486640998716602, + "grad_norm": 1.135340690612793, + "learning_rate": 0.00018381532287107071, + "loss": 2.1671, + "step": 16702 + }, + { + "epoch": 1.948780772371952, + "grad_norm": 1.0800912380218506, + "learning_rate": 0.00018380056508216614, + "loss": 2.2504, + "step": 16703 + }, + { + "epoch": 1.9488974448722436, + "grad_norm": 1.1787385940551758, + "learning_rate": 0.0001837858069584051, + "loss": 2.0832, + "step": 16704 + }, + { + "epoch": 1.9490141173725353, + "grad_norm": 0.9652170538902283, + "learning_rate": 0.00018377104849994053, + "loss": 2.0091, + "step": 16705 + }, + { + "epoch": 1.949130789872827, + "grad_norm": 1.095196008682251, + "learning_rate": 0.0001837562897069254, + "loss": 1.9681, + "step": 16706 + }, + { + "epoch": 1.9492474623731186, + "grad_norm": 1.2115933895111084, + "learning_rate": 0.00018374153057951277, + "loss": 2.1262, + "step": 16707 + }, + { + "epoch": 1.9493641348734103, + "grad_norm": 1.1308417320251465, + "learning_rate": 0.0001837267711178556, + "loss": 2.0701, + "step": 16708 + }, + { + "epoch": 1.949480807373702, + "grad_norm": 1.1188455820083618, + "learning_rate": 0.00018371201132210693, + "loss": 2.1431, + "step": 16709 + }, + { + "epoch": 1.9495974798739937, + "grad_norm": 1.2597757577896118, + "learning_rate": 0.00018369725119241976, + "loss": 2.1327, + "step": 16710 + }, + { + "epoch": 1.9497141523742854, + "grad_norm": 1.284466028213501, + "learning_rate": 0.00018368249072894712, + "loss": 2.2663, + "step": 16711 + }, + { + "epoch": 1.949830824874577, + "grad_norm": 0.9366382956504822, + "learning_rate": 0.00018366772993184195, + "loss": 2.0567, + "step": 16712 + }, + { + "epoch": 1.9499474973748687, + "grad_norm": 1.0803978443145752, + "learning_rate": 0.00018365296880125745, + "loss": 2.0106, + "step": 16713 + }, + { + "epoch": 1.9500641698751604, + "grad_norm": 1.0417929887771606, + "learning_rate": 0.00018363820733734648, + "loss": 2.129, + "step": 16714 + }, + { + "epoch": 1.950180842375452, + "grad_norm": 1.1862565279006958, + "learning_rate": 0.00018362344554026218, + "loss": 2.1302, + "step": 16715 + }, + { + "epoch": 1.9502975148757438, + "grad_norm": 1.2907605171203613, + "learning_rate": 0.00018360868341015752, + "loss": 2.1203, + "step": 16716 + }, + { + "epoch": 1.9504141873760354, + "grad_norm": 1.1715495586395264, + "learning_rate": 0.00018359392094718555, + "loss": 2.2468, + "step": 16717 + }, + { + "epoch": 1.9505308598763271, + "grad_norm": 1.0557639598846436, + "learning_rate": 0.00018357915815149931, + "loss": 2.0133, + "step": 16718 + }, + { + "epoch": 1.9506475323766188, + "grad_norm": 1.1738550662994385, + "learning_rate": 0.0001835643950232518, + "loss": 2.2209, + "step": 16719 + }, + { + "epoch": 1.9507642048769105, + "grad_norm": 1.2724204063415527, + "learning_rate": 0.00018354963156259617, + "loss": 1.9203, + "step": 16720 + }, + { + "epoch": 1.9508808773772022, + "grad_norm": 1.1141163110733032, + "learning_rate": 0.00018353486776968544, + "loss": 2.0953, + "step": 16721 + }, + { + "epoch": 1.9509975498774939, + "grad_norm": 1.228601336479187, + "learning_rate": 0.0001835201036446726, + "loss": 2.2382, + "step": 16722 + }, + { + "epoch": 1.9511142223777855, + "grad_norm": 1.1262109279632568, + "learning_rate": 0.00018350533918771078, + "loss": 2.2263, + "step": 16723 + }, + { + "epoch": 1.9512308948780772, + "grad_norm": 0.9644824862480164, + "learning_rate": 0.00018349057439895304, + "loss": 2.0316, + "step": 16724 + }, + { + "epoch": 1.951347567378369, + "grad_norm": 1.032230257987976, + "learning_rate": 0.00018347580927855232, + "loss": 1.9773, + "step": 16725 + }, + { + "epoch": 1.9514642398786606, + "grad_norm": 1.1625778675079346, + "learning_rate": 0.00018346104382666183, + "loss": 2.0665, + "step": 16726 + }, + { + "epoch": 1.9515809123789523, + "grad_norm": 1.1099766492843628, + "learning_rate": 0.00018344627804343458, + "loss": 1.9956, + "step": 16727 + }, + { + "epoch": 1.951697584879244, + "grad_norm": 1.0817835330963135, + "learning_rate": 0.00018343151192902368, + "loss": 2.0269, + "step": 16728 + }, + { + "epoch": 1.9518142573795356, + "grad_norm": 1.1221003532409668, + "learning_rate": 0.00018341674548358225, + "loss": 2.2351, + "step": 16729 + }, + { + "epoch": 1.9519309298798273, + "grad_norm": 1.073898196220398, + "learning_rate": 0.00018340197870726324, + "loss": 2.0356, + "step": 16730 + }, + { + "epoch": 1.952047602380119, + "grad_norm": 1.374686598777771, + "learning_rate": 0.00018338721160021982, + "loss": 2.1378, + "step": 16731 + }, + { + "epoch": 1.9521642748804107, + "grad_norm": 1.0865312814712524, + "learning_rate": 0.00018337244416260514, + "loss": 1.9969, + "step": 16732 + }, + { + "epoch": 1.9522809473807023, + "grad_norm": 0.986150324344635, + "learning_rate": 0.00018335767639457215, + "loss": 1.9807, + "step": 16733 + }, + { + "epoch": 1.952397619880994, + "grad_norm": 1.0718894004821777, + "learning_rate": 0.00018334290829627402, + "loss": 1.9412, + "step": 16734 + }, + { + "epoch": 1.9525142923812857, + "grad_norm": 1.1542572975158691, + "learning_rate": 0.00018332813986786386, + "loss": 2.1376, + "step": 16735 + }, + { + "epoch": 1.9526309648815774, + "grad_norm": 1.119385838508606, + "learning_rate": 0.00018331337110949478, + "loss": 2.3224, + "step": 16736 + }, + { + "epoch": 1.952747637381869, + "grad_norm": 0.9724710583686829, + "learning_rate": 0.0001832986020213198, + "loss": 1.9521, + "step": 16737 + }, + { + "epoch": 1.9528643098821608, + "grad_norm": 1.115501046180725, + "learning_rate": 0.00018328383260349213, + "loss": 2.0491, + "step": 16738 + }, + { + "epoch": 1.9529809823824524, + "grad_norm": 1.1952687501907349, + "learning_rate": 0.00018326906285616486, + "loss": 2.166, + "step": 16739 + }, + { + "epoch": 1.9530976548827441, + "grad_norm": 1.1195943355560303, + "learning_rate": 0.0001832542927794911, + "loss": 2.2884, + "step": 16740 + }, + { + "epoch": 1.9532143273830358, + "grad_norm": 1.17789626121521, + "learning_rate": 0.00018323952237362393, + "loss": 2.0106, + "step": 16741 + }, + { + "epoch": 1.9533309998833275, + "grad_norm": 0.9762994647026062, + "learning_rate": 0.00018322475163871652, + "loss": 1.9436, + "step": 16742 + }, + { + "epoch": 1.9534476723836192, + "grad_norm": 1.0828667879104614, + "learning_rate": 0.00018320998057492197, + "loss": 2.0663, + "step": 16743 + }, + { + "epoch": 1.9535643448839108, + "grad_norm": 1.142960786819458, + "learning_rate": 0.0001831952091823935, + "loss": 2.0251, + "step": 16744 + }, + { + "epoch": 1.9536810173842025, + "grad_norm": 1.0995965003967285, + "learning_rate": 0.0001831804374612841, + "loss": 1.9007, + "step": 16745 + }, + { + "epoch": 1.9537976898844942, + "grad_norm": 1.0232515335083008, + "learning_rate": 0.00018316566541174697, + "loss": 1.9945, + "step": 16746 + }, + { + "epoch": 1.9539143623847859, + "grad_norm": 1.3129786252975464, + "learning_rate": 0.0001831508930339353, + "loss": 2.1481, + "step": 16747 + }, + { + "epoch": 1.9540310348850776, + "grad_norm": 1.1486155986785889, + "learning_rate": 0.0001831361203280021, + "loss": 2.2417, + "step": 16748 + }, + { + "epoch": 1.9541477073853692, + "grad_norm": 1.2193219661712646, + "learning_rate": 0.0001831213472941007, + "loss": 1.969, + "step": 16749 + }, + { + "epoch": 1.954264379885661, + "grad_norm": 0.980659544467926, + "learning_rate": 0.00018310657393238414, + "loss": 2.1576, + "step": 16750 + }, + { + "epoch": 1.9543810523859526, + "grad_norm": 1.0699735879898071, + "learning_rate": 0.00018309180024300551, + "loss": 2.0657, + "step": 16751 + }, + { + "epoch": 1.9544977248862443, + "grad_norm": 1.1302745342254639, + "learning_rate": 0.00018307702622611812, + "loss": 2.0545, + "step": 16752 + }, + { + "epoch": 1.954614397386536, + "grad_norm": 1.1974948644638062, + "learning_rate": 0.000183062251881875, + "loss": 2.1732, + "step": 16753 + }, + { + "epoch": 1.9547310698868277, + "grad_norm": 1.169275164604187, + "learning_rate": 0.00018304747721042941, + "loss": 2.2621, + "step": 16754 + }, + { + "epoch": 1.9548477423871193, + "grad_norm": 1.087007761001587, + "learning_rate": 0.00018303270221193453, + "loss": 2.1826, + "step": 16755 + }, + { + "epoch": 1.954964414887411, + "grad_norm": 1.126813530921936, + "learning_rate": 0.0001830179268865434, + "loss": 2.105, + "step": 16756 + }, + { + "epoch": 1.9550810873877027, + "grad_norm": 1.0942000150680542, + "learning_rate": 0.0001830031512344093, + "loss": 2.0773, + "step": 16757 + }, + { + "epoch": 1.9551977598879944, + "grad_norm": 1.2481848001480103, + "learning_rate": 0.00018298837525568542, + "loss": 2.1525, + "step": 16758 + }, + { + "epoch": 1.955314432388286, + "grad_norm": 1.0831068754196167, + "learning_rate": 0.00018297359895052488, + "loss": 2.0276, + "step": 16759 + }, + { + "epoch": 1.9554311048885777, + "grad_norm": 1.1285414695739746, + "learning_rate": 0.00018295882231908088, + "loss": 1.9486, + "step": 16760 + }, + { + "epoch": 1.9555477773888694, + "grad_norm": 1.2640161514282227, + "learning_rate": 0.00018294404536150659, + "loss": 2.0764, + "step": 16761 + }, + { + "epoch": 1.955664449889161, + "grad_norm": 1.0208232402801514, + "learning_rate": 0.0001829292680779552, + "loss": 2.055, + "step": 16762 + }, + { + "epoch": 1.9557811223894528, + "grad_norm": 1.1415189504623413, + "learning_rate": 0.00018291449046858, + "loss": 2.0432, + "step": 16763 + }, + { + "epoch": 1.9558977948897445, + "grad_norm": 1.0394299030303955, + "learning_rate": 0.00018289971253353406, + "loss": 1.8954, + "step": 16764 + }, + { + "epoch": 1.9560144673900361, + "grad_norm": 1.3044019937515259, + "learning_rate": 0.00018288493427297067, + "loss": 2.0227, + "step": 16765 + }, + { + "epoch": 1.9561311398903278, + "grad_norm": 1.1058557033538818, + "learning_rate": 0.00018287015568704303, + "loss": 1.9502, + "step": 16766 + }, + { + "epoch": 1.9562478123906195, + "grad_norm": 1.3188838958740234, + "learning_rate": 0.0001828553767759043, + "loss": 2.0594, + "step": 16767 + }, + { + "epoch": 1.9563644848909112, + "grad_norm": 1.0904357433319092, + "learning_rate": 0.00018284059753970771, + "loss": 2.0775, + "step": 16768 + }, + { + "epoch": 1.9564811573912029, + "grad_norm": 0.9784189462661743, + "learning_rate": 0.0001828258179786065, + "loss": 2.0725, + "step": 16769 + }, + { + "epoch": 1.9565978298914946, + "grad_norm": 1.2592661380767822, + "learning_rate": 0.0001828110380927538, + "loss": 2.0406, + "step": 16770 + }, + { + "epoch": 1.9567145023917862, + "grad_norm": 1.198673963546753, + "learning_rate": 0.00018279625788230296, + "loss": 2.1332, + "step": 16771 + }, + { + "epoch": 1.956831174892078, + "grad_norm": 1.1547775268554688, + "learning_rate": 0.00018278147734740715, + "loss": 1.956, + "step": 16772 + }, + { + "epoch": 1.9569478473923696, + "grad_norm": 1.3887497186660767, + "learning_rate": 0.00018276669648821953, + "loss": 2.0367, + "step": 16773 + }, + { + "epoch": 1.9570645198926613, + "grad_norm": 1.1725404262542725, + "learning_rate": 0.00018275191530489347, + "loss": 2.1011, + "step": 16774 + }, + { + "epoch": 1.957181192392953, + "grad_norm": 1.3148622512817383, + "learning_rate": 0.00018273713379758205, + "loss": 2.1992, + "step": 16775 + }, + { + "epoch": 1.9572978648932446, + "grad_norm": 1.1549618244171143, + "learning_rate": 0.00018272235196643865, + "loss": 2.059, + "step": 16776 + }, + { + "epoch": 1.9574145373935363, + "grad_norm": 1.0663992166519165, + "learning_rate": 0.00018270756981161637, + "loss": 2.1872, + "step": 16777 + }, + { + "epoch": 1.957531209893828, + "grad_norm": 1.1866066455841064, + "learning_rate": 0.0001826927873332686, + "loss": 1.8769, + "step": 16778 + }, + { + "epoch": 1.9576478823941197, + "grad_norm": 1.2144315242767334, + "learning_rate": 0.0001826780045315485, + "loss": 2.0616, + "step": 16779 + }, + { + "epoch": 1.9577645548944114, + "grad_norm": 1.1974855661392212, + "learning_rate": 0.00018266322140660934, + "loss": 1.965, + "step": 16780 + }, + { + "epoch": 1.957881227394703, + "grad_norm": 1.2490180730819702, + "learning_rate": 0.00018264843795860438, + "loss": 2.1058, + "step": 16781 + }, + { + "epoch": 1.9579978998949947, + "grad_norm": 1.0680334568023682, + "learning_rate": 0.0001826336541876869, + "loss": 1.9521, + "step": 16782 + }, + { + "epoch": 1.9581145723952864, + "grad_norm": 1.0482097864151, + "learning_rate": 0.0001826188700940101, + "loss": 1.9675, + "step": 16783 + }, + { + "epoch": 1.958231244895578, + "grad_norm": 1.1413980722427368, + "learning_rate": 0.00018260408567772737, + "loss": 2.0864, + "step": 16784 + }, + { + "epoch": 1.9583479173958698, + "grad_norm": 1.0906857252120972, + "learning_rate": 0.0001825893009389918, + "loss": 2.1464, + "step": 16785 + }, + { + "epoch": 1.9584645898961615, + "grad_norm": 1.0856037139892578, + "learning_rate": 0.00018257451587795677, + "loss": 2.1214, + "step": 16786 + }, + { + "epoch": 1.9585812623964531, + "grad_norm": 1.1304289102554321, + "learning_rate": 0.00018255973049477556, + "loss": 2.1483, + "step": 16787 + }, + { + "epoch": 1.9586979348967448, + "grad_norm": 1.1220048666000366, + "learning_rate": 0.0001825449447896014, + "loss": 1.9557, + "step": 16788 + }, + { + "epoch": 1.9588146073970365, + "grad_norm": 1.0976922512054443, + "learning_rate": 0.0001825301587625876, + "loss": 2.2169, + "step": 16789 + }, + { + "epoch": 1.9589312798973282, + "grad_norm": 1.1073919534683228, + "learning_rate": 0.00018251537241388742, + "loss": 2.0339, + "step": 16790 + }, + { + "epoch": 1.9590479523976199, + "grad_norm": 1.1720651388168335, + "learning_rate": 0.00018250058574365417, + "loss": 2.0449, + "step": 16791 + }, + { + "epoch": 1.9591646248979115, + "grad_norm": 1.2764660120010376, + "learning_rate": 0.00018248579875204123, + "loss": 2.0968, + "step": 16792 + }, + { + "epoch": 1.9592812973982032, + "grad_norm": 1.2064968347549438, + "learning_rate": 0.0001824710114392018, + "loss": 2.1835, + "step": 16793 + }, + { + "epoch": 1.959397969898495, + "grad_norm": 1.157912254333496, + "learning_rate": 0.0001824562238052891, + "loss": 2.1379, + "step": 16794 + }, + { + "epoch": 1.9595146423987866, + "grad_norm": 1.1266567707061768, + "learning_rate": 0.00018244143585045657, + "loss": 2.195, + "step": 16795 + }, + { + "epoch": 1.9596313148990783, + "grad_norm": 1.141322135925293, + "learning_rate": 0.0001824266475748575, + "loss": 2.0664, + "step": 16796 + }, + { + "epoch": 1.95974798739937, + "grad_norm": 1.0818235874176025, + "learning_rate": 0.0001824118589786451, + "loss": 2.0799, + "step": 16797 + }, + { + "epoch": 1.9598646598996616, + "grad_norm": 1.1236369609832764, + "learning_rate": 0.0001823970700619728, + "loss": 2.1708, + "step": 16798 + }, + { + "epoch": 1.9599813323999533, + "grad_norm": 1.2179181575775146, + "learning_rate": 0.00018238228082499382, + "loss": 2.1668, + "step": 16799 + }, + { + "epoch": 1.960098004900245, + "grad_norm": 0.9386318922042847, + "learning_rate": 0.00018236749126786153, + "loss": 1.9079, + "step": 16800 + }, + { + "epoch": 1.9602146774005367, + "grad_norm": 1.198997139930725, + "learning_rate": 0.00018235270139072926, + "loss": 1.9705, + "step": 16801 + }, + { + "epoch": 1.9603313499008284, + "grad_norm": 1.0881386995315552, + "learning_rate": 0.0001823379111937503, + "loss": 2.1531, + "step": 16802 + }, + { + "epoch": 1.96044802240112, + "grad_norm": 1.2284269332885742, + "learning_rate": 0.00018232312067707797, + "loss": 2.326, + "step": 16803 + }, + { + "epoch": 1.9605646949014117, + "grad_norm": 1.198481559753418, + "learning_rate": 0.00018230832984086564, + "loss": 2.1961, + "step": 16804 + }, + { + "epoch": 1.9606813674017034, + "grad_norm": 1.0626187324523926, + "learning_rate": 0.00018229353868526662, + "loss": 2.1439, + "step": 16805 + }, + { + "epoch": 1.960798039901995, + "grad_norm": 1.116105318069458, + "learning_rate": 0.0001822787472104343, + "loss": 2.2195, + "step": 16806 + }, + { + "epoch": 1.9609147124022868, + "grad_norm": 1.0805126428604126, + "learning_rate": 0.00018226395541652197, + "loss": 2.1554, + "step": 16807 + }, + { + "epoch": 1.9610313849025784, + "grad_norm": 1.0434143543243408, + "learning_rate": 0.00018224916330368294, + "loss": 2.1066, + "step": 16808 + }, + { + "epoch": 1.9611480574028701, + "grad_norm": 1.3491299152374268, + "learning_rate": 0.0001822343708720707, + "loss": 2.0738, + "step": 16809 + }, + { + "epoch": 1.9612647299031618, + "grad_norm": 1.1284271478652954, + "learning_rate": 0.00018221957812183842, + "loss": 2.0537, + "step": 16810 + }, + { + "epoch": 1.9613814024034535, + "grad_norm": 1.058053970336914, + "learning_rate": 0.00018220478505313957, + "loss": 2.0577, + "step": 16811 + }, + { + "epoch": 1.9614980749037452, + "grad_norm": 1.115045428276062, + "learning_rate": 0.00018218999166612747, + "loss": 2.1745, + "step": 16812 + }, + { + "epoch": 1.9616147474040369, + "grad_norm": 1.0199823379516602, + "learning_rate": 0.00018217519796095549, + "loss": 2.0852, + "step": 16813 + }, + { + "epoch": 1.9617314199043285, + "grad_norm": 1.3210433721542358, + "learning_rate": 0.000182160403937777, + "loss": 2.1851, + "step": 16814 + }, + { + "epoch": 1.9618480924046202, + "grad_norm": 1.1624376773834229, + "learning_rate": 0.00018214560959674535, + "loss": 2.0466, + "step": 16815 + }, + { + "epoch": 1.961964764904912, + "grad_norm": 1.0467015504837036, + "learning_rate": 0.00018213081493801395, + "loss": 1.8691, + "step": 16816 + }, + { + "epoch": 1.9620814374052036, + "grad_norm": 1.1371772289276123, + "learning_rate": 0.00018211601996173614, + "loss": 2.0788, + "step": 16817 + }, + { + "epoch": 1.9621981099054953, + "grad_norm": 1.075446605682373, + "learning_rate": 0.00018210122466806532, + "loss": 2.0844, + "step": 16818 + }, + { + "epoch": 1.962314782405787, + "grad_norm": 1.216558575630188, + "learning_rate": 0.0001820864290571548, + "loss": 1.9621, + "step": 16819 + }, + { + "epoch": 1.9624314549060786, + "grad_norm": 1.192785382270813, + "learning_rate": 0.00018207163312915805, + "loss": 2.0412, + "step": 16820 + }, + { + "epoch": 1.9625481274063703, + "grad_norm": 1.2318098545074463, + "learning_rate": 0.00018205683688422847, + "loss": 2.1561, + "step": 16821 + }, + { + "epoch": 1.962664799906662, + "grad_norm": 1.0816714763641357, + "learning_rate": 0.00018204204032251935, + "loss": 2.0335, + "step": 16822 + }, + { + "epoch": 1.9627814724069537, + "grad_norm": 1.1230586767196655, + "learning_rate": 0.00018202724344418422, + "loss": 2.1852, + "step": 16823 + }, + { + "epoch": 1.9628981449072453, + "grad_norm": 1.031478762626648, + "learning_rate": 0.00018201244624937635, + "loss": 2.0834, + "step": 16824 + }, + { + "epoch": 1.963014817407537, + "grad_norm": 1.0990406274795532, + "learning_rate": 0.0001819976487382492, + "loss": 2.0482, + "step": 16825 + }, + { + "epoch": 1.9631314899078287, + "grad_norm": 0.9844846129417419, + "learning_rate": 0.00018198285091095616, + "loss": 1.7968, + "step": 16826 + }, + { + "epoch": 1.9632481624081204, + "grad_norm": 1.129286527633667, + "learning_rate": 0.0001819680527676507, + "loss": 2.2248, + "step": 16827 + }, + { + "epoch": 1.963364834908412, + "grad_norm": 1.2076795101165771, + "learning_rate": 0.00018195325430848616, + "loss": 2.1345, + "step": 16828 + }, + { + "epoch": 1.9634815074087038, + "grad_norm": 1.2321908473968506, + "learning_rate": 0.00018193845553361593, + "loss": 2.3326, + "step": 16829 + }, + { + "epoch": 1.9635981799089954, + "grad_norm": 1.100555181503296, + "learning_rate": 0.00018192365644319346, + "loss": 1.9721, + "step": 16830 + }, + { + "epoch": 1.9637148524092871, + "grad_norm": 0.9524139165878296, + "learning_rate": 0.0001819088570373722, + "loss": 2.0854, + "step": 16831 + }, + { + "epoch": 1.9638315249095788, + "grad_norm": 0.9947860240936279, + "learning_rate": 0.00018189405731630555, + "loss": 1.8972, + "step": 16832 + }, + { + "epoch": 1.9639481974098705, + "grad_norm": 1.2017133235931396, + "learning_rate": 0.00018187925728014697, + "loss": 2.1017, + "step": 16833 + }, + { + "epoch": 1.9640648699101622, + "grad_norm": 1.085099697113037, + "learning_rate": 0.00018186445692904983, + "loss": 2.015, + "step": 16834 + }, + { + "epoch": 1.9641815424104538, + "grad_norm": 1.1835157871246338, + "learning_rate": 0.00018184965626316766, + "loss": 2.0469, + "step": 16835 + }, + { + "epoch": 1.9642982149107455, + "grad_norm": 1.1174206733703613, + "learning_rate": 0.00018183485528265372, + "loss": 1.9336, + "step": 16836 + }, + { + "epoch": 1.9644148874110372, + "grad_norm": 1.2453912496566772, + "learning_rate": 0.00018182005398766166, + "loss": 1.9395, + "step": 16837 + }, + { + "epoch": 1.9645315599113289, + "grad_norm": 1.1280221939086914, + "learning_rate": 0.0001818052523783448, + "loss": 2.0535, + "step": 16838 + }, + { + "epoch": 1.9646482324116206, + "grad_norm": 1.1389732360839844, + "learning_rate": 0.00018179045045485655, + "loss": 1.8991, + "step": 16839 + }, + { + "epoch": 1.9647649049119122, + "grad_norm": 1.0602144002914429, + "learning_rate": 0.00018177564821735047, + "loss": 2.2024, + "step": 16840 + }, + { + "epoch": 1.964881577412204, + "grad_norm": 1.1445927619934082, + "learning_rate": 0.00018176084566597997, + "loss": 2.0338, + "step": 16841 + }, + { + "epoch": 1.9649982499124956, + "grad_norm": 1.0143840312957764, + "learning_rate": 0.0001817460428008985, + "loss": 2.1697, + "step": 16842 + }, + { + "epoch": 1.9651149224127873, + "grad_norm": 0.9962614178657532, + "learning_rate": 0.0001817312396222595, + "loss": 1.8006, + "step": 16843 + }, + { + "epoch": 1.965231594913079, + "grad_norm": 1.0738565921783447, + "learning_rate": 0.00018171643613021647, + "loss": 1.9921, + "step": 16844 + }, + { + "epoch": 1.9653482674133707, + "grad_norm": 1.0534071922302246, + "learning_rate": 0.00018170163232492285, + "loss": 2.0336, + "step": 16845 + }, + { + "epoch": 1.9654649399136623, + "grad_norm": 1.004110336303711, + "learning_rate": 0.00018168682820653216, + "loss": 2.0801, + "step": 16846 + }, + { + "epoch": 1.965581612413954, + "grad_norm": 1.077197790145874, + "learning_rate": 0.00018167202377519778, + "loss": 2.0017, + "step": 16847 + }, + { + "epoch": 1.9656982849142457, + "grad_norm": 1.125078558921814, + "learning_rate": 0.00018165721903107323, + "loss": 1.9752, + "step": 16848 + }, + { + "epoch": 1.9658149574145374, + "grad_norm": 1.137770175933838, + "learning_rate": 0.00018164241397431203, + "loss": 1.932, + "step": 16849 + }, + { + "epoch": 1.965931629914829, + "grad_norm": 1.1433279514312744, + "learning_rate": 0.00018162760860506762, + "loss": 2.1119, + "step": 16850 + }, + { + "epoch": 1.9660483024151207, + "grad_norm": 1.2072476148605347, + "learning_rate": 0.0001816128029234935, + "loss": 2.0607, + "step": 16851 + }, + { + "epoch": 1.9661649749154124, + "grad_norm": 1.1605923175811768, + "learning_rate": 0.00018159799692974316, + "loss": 2.0317, + "step": 16852 + }, + { + "epoch": 1.966281647415704, + "grad_norm": 1.2497990131378174, + "learning_rate": 0.00018158319062397014, + "loss": 2.0464, + "step": 16853 + }, + { + "epoch": 1.9663983199159958, + "grad_norm": 1.1042587757110596, + "learning_rate": 0.0001815683840063278, + "loss": 2.198, + "step": 16854 + }, + { + "epoch": 1.9665149924162875, + "grad_norm": 1.1373412609100342, + "learning_rate": 0.00018155357707696975, + "loss": 2.0755, + "step": 16855 + }, + { + "epoch": 1.9666316649165791, + "grad_norm": 1.067691445350647, + "learning_rate": 0.0001815387698360494, + "loss": 2.0968, + "step": 16856 + }, + { + "epoch": 1.9667483374168708, + "grad_norm": 1.0196983814239502, + "learning_rate": 0.00018152396228372038, + "loss": 1.9086, + "step": 16857 + }, + { + "epoch": 1.9668650099171625, + "grad_norm": 1.0143399238586426, + "learning_rate": 0.00018150915442013613, + "loss": 1.9563, + "step": 16858 + }, + { + "epoch": 1.9669816824174542, + "grad_norm": 0.9150398969650269, + "learning_rate": 0.00018149434624545018, + "loss": 2.0654, + "step": 16859 + }, + { + "epoch": 1.9670983549177459, + "grad_norm": 1.2587754726409912, + "learning_rate": 0.000181479537759816, + "loss": 2.1804, + "step": 16860 + }, + { + "epoch": 1.9672150274180376, + "grad_norm": 1.0619062185287476, + "learning_rate": 0.00018146472896338715, + "loss": 1.9746, + "step": 16861 + }, + { + "epoch": 1.9673316999183292, + "grad_norm": 1.1777607202529907, + "learning_rate": 0.00018144991985631715, + "loss": 2.1286, + "step": 16862 + }, + { + "epoch": 1.967448372418621, + "grad_norm": 0.976177453994751, + "learning_rate": 0.00018143511043875956, + "loss": 2.0044, + "step": 16863 + }, + { + "epoch": 1.9675650449189126, + "grad_norm": 1.1613014936447144, + "learning_rate": 0.00018142030071086784, + "loss": 2.0011, + "step": 16864 + }, + { + "epoch": 1.9676817174192043, + "grad_norm": 1.1050655841827393, + "learning_rate": 0.00018140549067279551, + "loss": 1.9497, + "step": 16865 + }, + { + "epoch": 1.967798389919496, + "grad_norm": 1.1114881038665771, + "learning_rate": 0.00018139068032469612, + "loss": 1.9014, + "step": 16866 + }, + { + "epoch": 1.9679150624197876, + "grad_norm": 1.0137568712234497, + "learning_rate": 0.00018137586966672327, + "loss": 1.8973, + "step": 16867 + }, + { + "epoch": 1.9680317349200793, + "grad_norm": 1.1604515314102173, + "learning_rate": 0.00018136105869903045, + "loss": 2.1428, + "step": 16868 + }, + { + "epoch": 1.968148407420371, + "grad_norm": 1.2769323587417603, + "learning_rate": 0.0001813462474217712, + "loss": 2.2352, + "step": 16869 + }, + { + "epoch": 1.9682650799206627, + "grad_norm": 1.2222150564193726, + "learning_rate": 0.0001813314358350991, + "loss": 2.2118, + "step": 16870 + }, + { + "epoch": 1.9683817524209544, + "grad_norm": 1.1992342472076416, + "learning_rate": 0.00018131662393916764, + "loss": 2.0805, + "step": 16871 + }, + { + "epoch": 1.968498424921246, + "grad_norm": 0.9721747040748596, + "learning_rate": 0.0001813018117341304, + "loss": 1.9903, + "step": 16872 + }, + { + "epoch": 1.9686150974215377, + "grad_norm": 1.1474854946136475, + "learning_rate": 0.00018128699922014097, + "loss": 1.9164, + "step": 16873 + }, + { + "epoch": 1.9687317699218294, + "grad_norm": 1.1437650918960571, + "learning_rate": 0.0001812721863973529, + "loss": 1.8784, + "step": 16874 + }, + { + "epoch": 1.968848442422121, + "grad_norm": 1.1204129457473755, + "learning_rate": 0.00018125737326591976, + "loss": 1.9947, + "step": 16875 + }, + { + "epoch": 1.9689651149224128, + "grad_norm": 1.0038446187973022, + "learning_rate": 0.00018124255982599505, + "loss": 1.9477, + "step": 16876 + }, + { + "epoch": 1.9690817874227045, + "grad_norm": 1.1336435079574585, + "learning_rate": 0.00018122774607773242, + "loss": 1.9913, + "step": 16877 + }, + { + "epoch": 1.9691984599229961, + "grad_norm": 0.9363910555839539, + "learning_rate": 0.00018121293202128538, + "loss": 1.9939, + "step": 16878 + }, + { + "epoch": 1.9693151324232878, + "grad_norm": 1.2319453954696655, + "learning_rate": 0.00018119811765680756, + "loss": 2.0272, + "step": 16879 + }, + { + "epoch": 1.9694318049235795, + "grad_norm": 1.0407278537750244, + "learning_rate": 0.00018118330298445249, + "loss": 1.9198, + "step": 16880 + }, + { + "epoch": 1.9695484774238712, + "grad_norm": 1.0311267375946045, + "learning_rate": 0.0001811684880043738, + "loss": 2.0429, + "step": 16881 + }, + { + "epoch": 1.9696651499241629, + "grad_norm": 1.210342288017273, + "learning_rate": 0.000181153672716725, + "loss": 2.1342, + "step": 16882 + }, + { + "epoch": 1.9697818224244545, + "grad_norm": 1.0801695585250854, + "learning_rate": 0.00018113885712165977, + "loss": 1.9797, + "step": 16883 + }, + { + "epoch": 1.9698984949247462, + "grad_norm": 1.0957821607589722, + "learning_rate": 0.00018112404121933167, + "loss": 2.0475, + "step": 16884 + }, + { + "epoch": 1.970015167425038, + "grad_norm": 1.1392570734024048, + "learning_rate": 0.00018110922500989422, + "loss": 2.0137, + "step": 16885 + }, + { + "epoch": 1.9701318399253296, + "grad_norm": 1.3162328004837036, + "learning_rate": 0.00018109440849350118, + "loss": 2.2589, + "step": 16886 + }, + { + "epoch": 1.9702485124256213, + "grad_norm": 1.0569158792495728, + "learning_rate": 0.000181079591670306, + "loss": 1.8144, + "step": 16887 + }, + { + "epoch": 1.970365184925913, + "grad_norm": 1.3800410032272339, + "learning_rate": 0.00018106477454046233, + "loss": 1.9288, + "step": 16888 + }, + { + "epoch": 1.9704818574262046, + "grad_norm": 1.1453850269317627, + "learning_rate": 0.00018104995710412381, + "loss": 2.0033, + "step": 16889 + }, + { + "epoch": 1.9705985299264963, + "grad_norm": 0.9503206610679626, + "learning_rate": 0.000181035139361444, + "loss": 2.1276, + "step": 16890 + }, + { + "epoch": 1.970715202426788, + "grad_norm": 1.1578965187072754, + "learning_rate": 0.00018102032131257653, + "loss": 2.1505, + "step": 16891 + }, + { + "epoch": 1.9708318749270797, + "grad_norm": 0.9487391114234924, + "learning_rate": 0.0001810055029576751, + "loss": 1.9607, + "step": 16892 + }, + { + "epoch": 1.9709485474273714, + "grad_norm": 1.101391077041626, + "learning_rate": 0.0001809906842968932, + "loss": 2.0232, + "step": 16893 + }, + { + "epoch": 1.971065219927663, + "grad_norm": 1.2598751783370972, + "learning_rate": 0.00018097586533038452, + "loss": 2.163, + "step": 16894 + }, + { + "epoch": 1.9711818924279547, + "grad_norm": 1.2405335903167725, + "learning_rate": 0.00018096104605830267, + "loss": 2.0335, + "step": 16895 + }, + { + "epoch": 1.9712985649282464, + "grad_norm": 1.1090004444122314, + "learning_rate": 0.00018094622648080132, + "loss": 1.9424, + "step": 16896 + }, + { + "epoch": 1.971415237428538, + "grad_norm": 1.1172175407409668, + "learning_rate": 0.00018093140659803406, + "loss": 1.9368, + "step": 16897 + }, + { + "epoch": 1.9715319099288298, + "grad_norm": 1.151105523109436, + "learning_rate": 0.00018091658641015453, + "loss": 2.1344, + "step": 16898 + }, + { + "epoch": 1.9716485824291214, + "grad_norm": 1.0676548480987549, + "learning_rate": 0.00018090176591731638, + "loss": 1.9284, + "step": 16899 + }, + { + "epoch": 1.9717652549294131, + "grad_norm": 0.9865688681602478, + "learning_rate": 0.00018088694511967325, + "loss": 2.0512, + "step": 16900 + }, + { + "epoch": 1.9718819274297048, + "grad_norm": 1.1231210231781006, + "learning_rate": 0.00018087212401737875, + "loss": 2.0921, + "step": 16901 + }, + { + "epoch": 1.9719985999299965, + "grad_norm": 0.9572242498397827, + "learning_rate": 0.00018085730261058658, + "loss": 2.0169, + "step": 16902 + }, + { + "epoch": 1.9721152724302882, + "grad_norm": 1.0142980813980103, + "learning_rate": 0.00018084248089945043, + "loss": 2.173, + "step": 16903 + }, + { + "epoch": 1.9722319449305798, + "grad_norm": 1.1724759340286255, + "learning_rate": 0.0001808276588841239, + "loss": 2.0839, + "step": 16904 + }, + { + "epoch": 1.9723486174308715, + "grad_norm": 1.1112264394760132, + "learning_rate": 0.0001808128365647606, + "loss": 1.9155, + "step": 16905 + }, + { + "epoch": 1.9724652899311632, + "grad_norm": 1.0974518060684204, + "learning_rate": 0.00018079801394151428, + "loss": 1.9896, + "step": 16906 + }, + { + "epoch": 1.972581962431455, + "grad_norm": 1.1883915662765503, + "learning_rate": 0.00018078319101453853, + "loss": 2.1532, + "step": 16907 + }, + { + "epoch": 1.9726986349317466, + "grad_norm": 1.1309648752212524, + "learning_rate": 0.000180768367783987, + "loss": 2.2053, + "step": 16908 + }, + { + "epoch": 1.9728153074320383, + "grad_norm": 1.3297088146209717, + "learning_rate": 0.00018075354425001346, + "loss": 2.1592, + "step": 16909 + }, + { + "epoch": 1.97293197993233, + "grad_norm": 1.1413112878799438, + "learning_rate": 0.0001807387204127716, + "loss": 2.1956, + "step": 16910 + }, + { + "epoch": 1.9730486524326216, + "grad_norm": 1.1476577520370483, + "learning_rate": 0.00018072389627241497, + "loss": 2.084, + "step": 16911 + }, + { + "epoch": 1.9731653249329133, + "grad_norm": 1.10170578956604, + "learning_rate": 0.00018070907182909736, + "loss": 2.0961, + "step": 16912 + }, + { + "epoch": 1.973281997433205, + "grad_norm": 0.9557125568389893, + "learning_rate": 0.00018069424708297237, + "loss": 2.0286, + "step": 16913 + }, + { + "epoch": 1.9733986699334967, + "grad_norm": 1.0097142457962036, + "learning_rate": 0.00018067942203419375, + "loss": 2.14, + "step": 16914 + }, + { + "epoch": 1.9735153424337883, + "grad_norm": 1.157116413116455, + "learning_rate": 0.00018066459668291513, + "loss": 1.9782, + "step": 16915 + }, + { + "epoch": 1.97363201493408, + "grad_norm": 1.1799805164337158, + "learning_rate": 0.00018064977102929028, + "loss": 2.1352, + "step": 16916 + }, + { + "epoch": 1.9737486874343717, + "grad_norm": 0.9933177828788757, + "learning_rate": 0.00018063494507347284, + "loss": 2.1008, + "step": 16917 + }, + { + "epoch": 1.9738653599346634, + "grad_norm": 1.2999470233917236, + "learning_rate": 0.0001806201188156165, + "loss": 2.1671, + "step": 16918 + }, + { + "epoch": 1.973982032434955, + "grad_norm": 1.0716572999954224, + "learning_rate": 0.00018060529225587501, + "loss": 1.9883, + "step": 16919 + }, + { + "epoch": 1.9740987049352468, + "grad_norm": 1.2999709844589233, + "learning_rate": 0.00018059046539440203, + "loss": 2.1813, + "step": 16920 + }, + { + "epoch": 1.9742153774355384, + "grad_norm": 1.0056397914886475, + "learning_rate": 0.00018057563823135133, + "loss": 1.9454, + "step": 16921 + }, + { + "epoch": 1.9743320499358301, + "grad_norm": 1.2809314727783203, + "learning_rate": 0.00018056081076687656, + "loss": 2.0008, + "step": 16922 + }, + { + "epoch": 1.9744487224361218, + "grad_norm": 1.1223787069320679, + "learning_rate": 0.00018054598300113142, + "loss": 2.1409, + "step": 16923 + }, + { + "epoch": 1.9745653949364135, + "grad_norm": 0.9880968332290649, + "learning_rate": 0.0001805311549342697, + "loss": 1.9743, + "step": 16924 + }, + { + "epoch": 1.9746820674367052, + "grad_norm": 1.0386455059051514, + "learning_rate": 0.00018051632656644506, + "loss": 1.997, + "step": 16925 + }, + { + "epoch": 1.9747987399369968, + "grad_norm": 1.0653661489486694, + "learning_rate": 0.00018050149789781124, + "loss": 1.9874, + "step": 16926 + }, + { + "epoch": 1.9749154124372885, + "grad_norm": 0.9444254636764526, + "learning_rate": 0.000180486668928522, + "loss": 1.9019, + "step": 16927 + }, + { + "epoch": 1.9750320849375802, + "grad_norm": 1.0998514890670776, + "learning_rate": 0.00018047183965873108, + "loss": 2.1201, + "step": 16928 + }, + { + "epoch": 1.9751487574378719, + "grad_norm": 1.1047881841659546, + "learning_rate": 0.0001804570100885921, + "loss": 1.8949, + "step": 16929 + }, + { + "epoch": 1.9752654299381636, + "grad_norm": 1.049920678138733, + "learning_rate": 0.00018044218021825897, + "loss": 2.0878, + "step": 16930 + }, + { + "epoch": 1.9753821024384552, + "grad_norm": 1.1774235963821411, + "learning_rate": 0.00018042735004788528, + "loss": 2.1665, + "step": 16931 + }, + { + "epoch": 1.975498774938747, + "grad_norm": 1.1153088808059692, + "learning_rate": 0.00018041251957762481, + "loss": 2.0115, + "step": 16932 + }, + { + "epoch": 1.9756154474390386, + "grad_norm": 1.1077097654342651, + "learning_rate": 0.00018039768880763138, + "loss": 2.1021, + "step": 16933 + }, + { + "epoch": 1.9757321199393303, + "grad_norm": 1.1427488327026367, + "learning_rate": 0.0001803828577380586, + "loss": 2.1355, + "step": 16934 + }, + { + "epoch": 1.975848792439622, + "grad_norm": 1.0724050998687744, + "learning_rate": 0.00018036802636906031, + "loss": 2.2398, + "step": 16935 + }, + { + "epoch": 1.9759654649399137, + "grad_norm": 0.8840580582618713, + "learning_rate": 0.00018035319470079036, + "loss": 1.9518, + "step": 16936 + }, + { + "epoch": 1.9760821374402053, + "grad_norm": 1.1469635963439941, + "learning_rate": 0.00018033836273340234, + "loss": 2.0832, + "step": 16937 + }, + { + "epoch": 1.976198809940497, + "grad_norm": 1.144127368927002, + "learning_rate": 0.00018032353046705012, + "loss": 1.9607, + "step": 16938 + }, + { + "epoch": 1.9763154824407887, + "grad_norm": 1.1645153760910034, + "learning_rate": 0.0001803086979018874, + "loss": 2.0346, + "step": 16939 + }, + { + "epoch": 1.9764321549410804, + "grad_norm": 1.3139268159866333, + "learning_rate": 0.00018029386503806797, + "loss": 2.1541, + "step": 16940 + }, + { + "epoch": 1.976548827441372, + "grad_norm": 1.006485939025879, + "learning_rate": 0.00018027903187574563, + "loss": 1.9714, + "step": 16941 + }, + { + "epoch": 1.9766654999416637, + "grad_norm": 1.174461007118225, + "learning_rate": 0.00018026419841507408, + "loss": 1.9713, + "step": 16942 + }, + { + "epoch": 1.9767821724419554, + "grad_norm": 1.016455054283142, + "learning_rate": 0.00018024936465620717, + "loss": 2.1062, + "step": 16943 + }, + { + "epoch": 1.976898844942247, + "grad_norm": 1.1598924398422241, + "learning_rate": 0.00018023453059929864, + "loss": 2.1224, + "step": 16944 + }, + { + "epoch": 1.9770155174425388, + "grad_norm": 1.232783555984497, + "learning_rate": 0.0001802196962445023, + "loss": 2.0733, + "step": 16945 + }, + { + "epoch": 1.9771321899428305, + "grad_norm": 1.316551923751831, + "learning_rate": 0.00018020486159197192, + "loss": 2.2709, + "step": 16946 + }, + { + "epoch": 1.9772488624431221, + "grad_norm": 1.0558602809906006, + "learning_rate": 0.00018019002664186126, + "loss": 2.03, + "step": 16947 + }, + { + "epoch": 1.9773655349434138, + "grad_norm": 1.1983197927474976, + "learning_rate": 0.0001801751913943242, + "loss": 2.0567, + "step": 16948 + }, + { + "epoch": 1.9774822074437055, + "grad_norm": 1.2197998762130737, + "learning_rate": 0.00018016035584951444, + "loss": 2.2704, + "step": 16949 + }, + { + "epoch": 1.9775988799439972, + "grad_norm": 1.0700490474700928, + "learning_rate": 0.00018014552000758586, + "loss": 1.9755, + "step": 16950 + }, + { + "epoch": 1.9777155524442889, + "grad_norm": 1.2207481861114502, + "learning_rate": 0.0001801306838686922, + "loss": 2.3, + "step": 16951 + }, + { + "epoch": 1.9778322249445806, + "grad_norm": 1.16315495967865, + "learning_rate": 0.0001801158474329873, + "loss": 2.0802, + "step": 16952 + }, + { + "epoch": 1.9779488974448722, + "grad_norm": 1.1000070571899414, + "learning_rate": 0.0001801010107006249, + "loss": 2.195, + "step": 16953 + }, + { + "epoch": 1.978065569945164, + "grad_norm": 1.212246060371399, + "learning_rate": 0.00018008617367175892, + "loss": 2.0707, + "step": 16954 + }, + { + "epoch": 1.9781822424454556, + "grad_norm": 1.142834186553955, + "learning_rate": 0.00018007133634654308, + "loss": 2.1156, + "step": 16955 + }, + { + "epoch": 1.9782989149457473, + "grad_norm": 1.0113095045089722, + "learning_rate": 0.00018005649872513131, + "loss": 2.018, + "step": 16956 + }, + { + "epoch": 1.978415587446039, + "grad_norm": 1.1297204494476318, + "learning_rate": 0.0001800416608076773, + "loss": 2.0496, + "step": 16957 + }, + { + "epoch": 1.9785322599463306, + "grad_norm": 1.2124698162078857, + "learning_rate": 0.0001800268225943349, + "loss": 2.0283, + "step": 16958 + }, + { + "epoch": 1.9786489324466223, + "grad_norm": 1.0445003509521484, + "learning_rate": 0.00018001198408525797, + "loss": 2.1512, + "step": 16959 + }, + { + "epoch": 1.978765604946914, + "grad_norm": 1.1435710191726685, + "learning_rate": 0.00017999714528060034, + "loss": 2.0681, + "step": 16960 + }, + { + "epoch": 1.9788822774472057, + "grad_norm": 1.0664421319961548, + "learning_rate": 0.00017998230618051584, + "loss": 1.7999, + "step": 16961 + }, + { + "epoch": 1.9789989499474974, + "grad_norm": 1.0966540575027466, + "learning_rate": 0.00017996746678515828, + "loss": 2.0573, + "step": 16962 + }, + { + "epoch": 1.979115622447789, + "grad_norm": 1.0075833797454834, + "learning_rate": 0.00017995262709468162, + "loss": 2.1626, + "step": 16963 + }, + { + "epoch": 1.9792322949480807, + "grad_norm": 1.1241241693496704, + "learning_rate": 0.0001799377871092395, + "loss": 2.2764, + "step": 16964 + }, + { + "epoch": 1.9793489674483724, + "grad_norm": 1.2627220153808594, + "learning_rate": 0.00017992294682898588, + "loss": 2.0708, + "step": 16965 + }, + { + "epoch": 1.979465639948664, + "grad_norm": 1.2127821445465088, + "learning_rate": 0.0001799081062540746, + "loss": 2.1078, + "step": 16966 + }, + { + "epoch": 1.9795823124489558, + "grad_norm": 1.0799405574798584, + "learning_rate": 0.00017989326538465949, + "loss": 2.0715, + "step": 16967 + }, + { + "epoch": 1.9796989849492475, + "grad_norm": 1.0471410751342773, + "learning_rate": 0.00017987842422089446, + "loss": 1.9829, + "step": 16968 + }, + { + "epoch": 1.9798156574495391, + "grad_norm": 1.09068763256073, + "learning_rate": 0.00017986358276293328, + "loss": 2.0796, + "step": 16969 + }, + { + "epoch": 1.9799323299498308, + "grad_norm": 1.0726385116577148, + "learning_rate": 0.0001798487410109299, + "loss": 2.1275, + "step": 16970 + }, + { + "epoch": 1.9800490024501225, + "grad_norm": 0.990466833114624, + "learning_rate": 0.00017983389896503814, + "loss": 1.8427, + "step": 16971 + }, + { + "epoch": 1.9801656749504142, + "grad_norm": 1.013250708580017, + "learning_rate": 0.0001798190566254118, + "loss": 2.0159, + "step": 16972 + }, + { + "epoch": 1.9802823474507059, + "grad_norm": 0.9739141464233398, + "learning_rate": 0.00017980421399220488, + "loss": 2.0191, + "step": 16973 + }, + { + "epoch": 1.9803990199509975, + "grad_norm": 0.9907062649726868, + "learning_rate": 0.0001797893710655711, + "loss": 2.0693, + "step": 16974 + }, + { + "epoch": 1.9805156924512892, + "grad_norm": 1.2629313468933105, + "learning_rate": 0.00017977452784566445, + "loss": 2.2552, + "step": 16975 + }, + { + "epoch": 1.980632364951581, + "grad_norm": 1.2275307178497314, + "learning_rate": 0.00017975968433263883, + "loss": 2.0503, + "step": 16976 + }, + { + "epoch": 1.9807490374518726, + "grad_norm": 1.134162425994873, + "learning_rate": 0.000179744840526648, + "loss": 2.0981, + "step": 16977 + }, + { + "epoch": 1.9808657099521643, + "grad_norm": 1.2187728881835938, + "learning_rate": 0.00017972999642784594, + "loss": 2.0246, + "step": 16978 + }, + { + "epoch": 1.980982382452456, + "grad_norm": 1.0506373643875122, + "learning_rate": 0.00017971515203638651, + "loss": 2.153, + "step": 16979 + }, + { + "epoch": 1.9810990549527476, + "grad_norm": 1.023240089416504, + "learning_rate": 0.00017970030735242363, + "loss": 1.9011, + "step": 16980 + }, + { + "epoch": 1.9812157274530393, + "grad_norm": 1.0498546361923218, + "learning_rate": 0.00017968546237611115, + "loss": 2.0754, + "step": 16981 + }, + { + "epoch": 1.981332399953331, + "grad_norm": 1.1307822465896606, + "learning_rate": 0.00017967061710760297, + "loss": 2.2645, + "step": 16982 + }, + { + "epoch": 1.9814490724536227, + "grad_norm": 0.9795345067977905, + "learning_rate": 0.00017965577154705297, + "loss": 2.2004, + "step": 16983 + }, + { + "epoch": 1.9815657449539144, + "grad_norm": 1.070223331451416, + "learning_rate": 0.00017964092569461507, + "loss": 2.1595, + "step": 16984 + }, + { + "epoch": 1.981682417454206, + "grad_norm": 0.9616339206695557, + "learning_rate": 0.00017962607955044322, + "loss": 2.0396, + "step": 16985 + }, + { + "epoch": 1.9817990899544977, + "grad_norm": 1.0632023811340332, + "learning_rate": 0.0001796112331146913, + "loss": 2.0763, + "step": 16986 + }, + { + "epoch": 1.9819157624547894, + "grad_norm": 1.0599961280822754, + "learning_rate": 0.00017959638638751318, + "loss": 2.1077, + "step": 16987 + }, + { + "epoch": 1.982032434955081, + "grad_norm": 0.9225784540176392, + "learning_rate": 0.00017958153936906284, + "loss": 2.0188, + "step": 16988 + }, + { + "epoch": 1.9821491074553728, + "grad_norm": 1.3173903226852417, + "learning_rate": 0.00017956669205949422, + "loss": 2.1622, + "step": 16989 + }, + { + "epoch": 1.9822657799556644, + "grad_norm": 1.137609601020813, + "learning_rate": 0.00017955184445896113, + "loss": 2.1297, + "step": 16990 + }, + { + "epoch": 1.9823824524559561, + "grad_norm": 1.0550483465194702, + "learning_rate": 0.00017953699656761754, + "loss": 2.1812, + "step": 16991 + }, + { + "epoch": 1.9824991249562478, + "grad_norm": 1.0502229928970337, + "learning_rate": 0.0001795221483856174, + "loss": 2.0884, + "step": 16992 + }, + { + "epoch": 1.9826157974565395, + "grad_norm": 1.142579197883606, + "learning_rate": 0.00017950729991311466, + "loss": 2.1454, + "step": 16993 + }, + { + "epoch": 1.9827324699568312, + "grad_norm": 1.0349665880203247, + "learning_rate": 0.00017949245115026315, + "loss": 2.1304, + "step": 16994 + }, + { + "epoch": 1.9828491424571228, + "grad_norm": 1.1704306602478027, + "learning_rate": 0.00017947760209721694, + "loss": 2.155, + "step": 16995 + }, + { + "epoch": 1.9829658149574145, + "grad_norm": 1.182161808013916, + "learning_rate": 0.0001794627527541299, + "loss": 2.1369, + "step": 16996 + }, + { + "epoch": 1.9830824874577062, + "grad_norm": 1.2091658115386963, + "learning_rate": 0.0001794479031211559, + "loss": 2.0698, + "step": 16997 + }, + { + "epoch": 1.983199159957998, + "grad_norm": 1.1023484468460083, + "learning_rate": 0.00017943305319844904, + "loss": 2.0685, + "step": 16998 + }, + { + "epoch": 1.9833158324582896, + "grad_norm": 1.2002557516098022, + "learning_rate": 0.00017941820298616318, + "loss": 2.0823, + "step": 16999 + }, + { + "epoch": 1.9834325049585813, + "grad_norm": 1.1403801441192627, + "learning_rate": 0.00017940335248445226, + "loss": 2.0175, + "step": 17000 + }, + { + "epoch": 1.983549177458873, + "grad_norm": 1.0854787826538086, + "learning_rate": 0.00017938850169347024, + "loss": 2.188, + "step": 17001 + }, + { + "epoch": 1.9836658499591646, + "grad_norm": 1.061981439590454, + "learning_rate": 0.00017937365061337107, + "loss": 2.1903, + "step": 17002 + }, + { + "epoch": 1.9837825224594563, + "grad_norm": 1.1329435110092163, + "learning_rate": 0.0001793587992443088, + "loss": 2.0732, + "step": 17003 + }, + { + "epoch": 1.983899194959748, + "grad_norm": 1.2227282524108887, + "learning_rate": 0.0001793439475864373, + "loss": 2.0902, + "step": 17004 + }, + { + "epoch": 1.9840158674600397, + "grad_norm": 1.0759942531585693, + "learning_rate": 0.0001793290956399105, + "loss": 2.0196, + "step": 17005 + }, + { + "epoch": 1.9841325399603313, + "grad_norm": 1.0868676900863647, + "learning_rate": 0.00017931424340488248, + "loss": 2.082, + "step": 17006 + }, + { + "epoch": 1.984249212460623, + "grad_norm": 1.0214093923568726, + "learning_rate": 0.00017929939088150718, + "loss": 1.8295, + "step": 17007 + }, + { + "epoch": 1.9843658849609147, + "grad_norm": 1.0505746603012085, + "learning_rate": 0.0001792845380699385, + "loss": 2.0195, + "step": 17008 + }, + { + "epoch": 1.9844825574612064, + "grad_norm": 1.0526480674743652, + "learning_rate": 0.00017926968497033048, + "loss": 1.9513, + "step": 17009 + }, + { + "epoch": 1.984599229961498, + "grad_norm": 1.1567952632904053, + "learning_rate": 0.0001792548315828371, + "loss": 2.1097, + "step": 17010 + }, + { + "epoch": 1.9847159024617897, + "grad_norm": 1.2390209436416626, + "learning_rate": 0.00017923997790761235, + "loss": 2.2952, + "step": 17011 + }, + { + "epoch": 1.9848325749620814, + "grad_norm": 1.002118706703186, + "learning_rate": 0.00017922512394481015, + "loss": 1.9696, + "step": 17012 + }, + { + "epoch": 1.9849492474623731, + "grad_norm": 1.0655945539474487, + "learning_rate": 0.0001792102696945845, + "loss": 2.0134, + "step": 17013 + }, + { + "epoch": 1.9850659199626648, + "grad_norm": 1.3838746547698975, + "learning_rate": 0.0001791954151570895, + "loss": 2.003, + "step": 17014 + }, + { + "epoch": 1.9851825924629565, + "grad_norm": 1.092419147491455, + "learning_rate": 0.00017918056033247912, + "loss": 2.0621, + "step": 17015 + }, + { + "epoch": 1.9852992649632482, + "grad_norm": 1.0940587520599365, + "learning_rate": 0.00017916570522090726, + "loss": 2.0956, + "step": 17016 + }, + { + "epoch": 1.9854159374635398, + "grad_norm": 1.05238938331604, + "learning_rate": 0.00017915084982252796, + "loss": 2.0595, + "step": 17017 + }, + { + "epoch": 1.9855326099638315, + "grad_norm": 1.1538336277008057, + "learning_rate": 0.00017913599413749528, + "loss": 2.0571, + "step": 17018 + }, + { + "epoch": 1.9856492824641232, + "grad_norm": 1.3972088098526, + "learning_rate": 0.00017912113816596315, + "loss": 2.1939, + "step": 17019 + }, + { + "epoch": 1.9857659549644149, + "grad_norm": 1.165591835975647, + "learning_rate": 0.00017910628190808566, + "loss": 2.0001, + "step": 17020 + }, + { + "epoch": 1.9858826274647066, + "grad_norm": 1.2009471654891968, + "learning_rate": 0.00017909142536401673, + "loss": 2.1956, + "step": 17021 + }, + { + "epoch": 1.9859992999649982, + "grad_norm": 1.0676220655441284, + "learning_rate": 0.00017907656853391048, + "loss": 2.0412, + "step": 17022 + }, + { + "epoch": 1.98611597246529, + "grad_norm": 0.9872285723686218, + "learning_rate": 0.00017906171141792083, + "loss": 2.0439, + "step": 17023 + }, + { + "epoch": 1.9862326449655816, + "grad_norm": 1.1824183464050293, + "learning_rate": 0.00017904685401620184, + "loss": 2.0492, + "step": 17024 + }, + { + "epoch": 1.9863493174658733, + "grad_norm": 1.3300634622573853, + "learning_rate": 0.00017903199632890763, + "loss": 2.1944, + "step": 17025 + }, + { + "epoch": 1.986465989966165, + "grad_norm": 1.080477237701416, + "learning_rate": 0.00017901713835619205, + "loss": 2.2395, + "step": 17026 + }, + { + "epoch": 1.9865826624664567, + "grad_norm": 0.9831546545028687, + "learning_rate": 0.00017900228009820924, + "loss": 1.8237, + "step": 17027 + }, + { + "epoch": 1.9866993349667483, + "grad_norm": 1.17428719997406, + "learning_rate": 0.00017898742155511324, + "loss": 2.1207, + "step": 17028 + }, + { + "epoch": 1.98681600746704, + "grad_norm": 1.3194602727890015, + "learning_rate": 0.00017897256272705805, + "loss": 2.047, + "step": 17029 + }, + { + "epoch": 1.9869326799673317, + "grad_norm": 0.9877644777297974, + "learning_rate": 0.00017895770361419774, + "loss": 1.9259, + "step": 17030 + }, + { + "epoch": 1.9870493524676234, + "grad_norm": 1.2076717615127563, + "learning_rate": 0.00017894284421668633, + "loss": 2.2774, + "step": 17031 + }, + { + "epoch": 1.987166024967915, + "grad_norm": 1.4080424308776855, + "learning_rate": 0.00017892798453467788, + "loss": 2.3013, + "step": 17032 + }, + { + "epoch": 1.9872826974682067, + "grad_norm": 1.272104024887085, + "learning_rate": 0.00017891312456832644, + "loss": 2.1187, + "step": 17033 + }, + { + "epoch": 1.9873993699684984, + "grad_norm": 1.0843786001205444, + "learning_rate": 0.00017889826431778604, + "loss": 2.0691, + "step": 17034 + }, + { + "epoch": 1.98751604246879, + "grad_norm": 1.497360348701477, + "learning_rate": 0.00017888340378321077, + "loss": 2.1286, + "step": 17035 + }, + { + "epoch": 1.9876327149690818, + "grad_norm": 1.071419358253479, + "learning_rate": 0.00017886854296475461, + "loss": 2.1778, + "step": 17036 + }, + { + "epoch": 1.9877493874693735, + "grad_norm": 1.226971983909607, + "learning_rate": 0.0001788536818625717, + "loss": 2.1234, + "step": 17037 + }, + { + "epoch": 1.9878660599696651, + "grad_norm": 1.169569492340088, + "learning_rate": 0.0001788388204768161, + "loss": 1.94, + "step": 17038 + }, + { + "epoch": 1.9879827324699568, + "grad_norm": 1.3621501922607422, + "learning_rate": 0.0001788239588076418, + "loss": 2.2994, + "step": 17039 + }, + { + "epoch": 1.9880994049702485, + "grad_norm": 1.1349363327026367, + "learning_rate": 0.00017880909685520296, + "loss": 2.0286, + "step": 17040 + }, + { + "epoch": 1.9882160774705402, + "grad_norm": 1.034186601638794, + "learning_rate": 0.00017879423461965364, + "loss": 1.8423, + "step": 17041 + }, + { + "epoch": 1.9883327499708319, + "grad_norm": 1.1099988222122192, + "learning_rate": 0.0001787793721011479, + "loss": 2.0037, + "step": 17042 + }, + { + "epoch": 1.9884494224711236, + "grad_norm": 1.1000088453292847, + "learning_rate": 0.00017876450929983975, + "loss": 1.8446, + "step": 17043 + }, + { + "epoch": 1.9885660949714152, + "grad_norm": 1.0225422382354736, + "learning_rate": 0.00017874964621588337, + "loss": 1.8153, + "step": 17044 + }, + { + "epoch": 1.988682767471707, + "grad_norm": 0.9995349645614624, + "learning_rate": 0.00017873478284943282, + "loss": 1.9207, + "step": 17045 + }, + { + "epoch": 1.9887994399719986, + "grad_norm": 1.1433379650115967, + "learning_rate": 0.00017871991920064215, + "loss": 2.1297, + "step": 17046 + }, + { + "epoch": 1.9889161124722903, + "grad_norm": 1.125812292098999, + "learning_rate": 0.00017870505526966548, + "loss": 2.1047, + "step": 17047 + }, + { + "epoch": 1.989032784972582, + "grad_norm": 1.056897521018982, + "learning_rate": 0.0001786901910566569, + "loss": 2.0646, + "step": 17048 + }, + { + "epoch": 1.9891494574728736, + "grad_norm": 1.1132067441940308, + "learning_rate": 0.00017867532656177048, + "loss": 2.1882, + "step": 17049 + }, + { + "epoch": 1.9892661299731653, + "grad_norm": 1.120937466621399, + "learning_rate": 0.0001786604617851603, + "loss": 2.0103, + "step": 17050 + }, + { + "epoch": 1.989382802473457, + "grad_norm": 1.0462396144866943, + "learning_rate": 0.00017864559672698064, + "loss": 1.9562, + "step": 17051 + }, + { + "epoch": 1.9894994749737487, + "grad_norm": 1.0821807384490967, + "learning_rate": 0.00017863073138738532, + "loss": 2.2485, + "step": 17052 + }, + { + "epoch": 1.9896161474740404, + "grad_norm": 1.086373209953308, + "learning_rate": 0.00017861586576652864, + "loss": 2.0932, + "step": 17053 + }, + { + "epoch": 1.989732819974332, + "grad_norm": 1.0555028915405273, + "learning_rate": 0.00017860099986456468, + "loss": 2.1397, + "step": 17054 + }, + { + "epoch": 1.9898494924746237, + "grad_norm": 0.9581751227378845, + "learning_rate": 0.00017858613368164757, + "loss": 2.0604, + "step": 17055 + }, + { + "epoch": 1.9899661649749154, + "grad_norm": 1.4043787717819214, + "learning_rate": 0.00017857126721793133, + "loss": 2.0359, + "step": 17056 + }, + { + "epoch": 1.990082837475207, + "grad_norm": 1.0068180561065674, + "learning_rate": 0.00017855640047357015, + "loss": 2.0457, + "step": 17057 + }, + { + "epoch": 1.9901995099754988, + "grad_norm": 1.074988603591919, + "learning_rate": 0.00017854153344871817, + "loss": 2.0212, + "step": 17058 + }, + { + "epoch": 1.9903161824757905, + "grad_norm": 1.2405248880386353, + "learning_rate": 0.00017852666614352948, + "loss": 2.0348, + "step": 17059 + }, + { + "epoch": 1.9904328549760821, + "grad_norm": 1.0147454738616943, + "learning_rate": 0.00017851179855815822, + "loss": 2.0469, + "step": 17060 + }, + { + "epoch": 1.9905495274763738, + "grad_norm": 0.9780880212783813, + "learning_rate": 0.00017849693069275848, + "loss": 2.0886, + "step": 17061 + }, + { + "epoch": 1.9906661999766655, + "grad_norm": 1.0043944120407104, + "learning_rate": 0.0001784820625474845, + "loss": 1.8036, + "step": 17062 + }, + { + "epoch": 1.9907828724769572, + "grad_norm": 1.0181083679199219, + "learning_rate": 0.00017846719412249027, + "loss": 2.0211, + "step": 17063 + }, + { + "epoch": 1.9908995449772489, + "grad_norm": 1.0454826354980469, + "learning_rate": 0.00017845232541793008, + "loss": 2.0219, + "step": 17064 + }, + { + "epoch": 1.9910162174775405, + "grad_norm": 1.1227152347564697, + "learning_rate": 0.0001784374564339579, + "loss": 2.0078, + "step": 17065 + }, + { + "epoch": 1.9911328899778322, + "grad_norm": 1.009281039237976, + "learning_rate": 0.00017842258717072808, + "loss": 2.015, + "step": 17066 + }, + { + "epoch": 1.991249562478124, + "grad_norm": 1.07469642162323, + "learning_rate": 0.00017840771762839462, + "loss": 1.8168, + "step": 17067 + }, + { + "epoch": 1.9913662349784156, + "grad_norm": 1.124880075454712, + "learning_rate": 0.0001783928478071117, + "loss": 2.1008, + "step": 17068 + }, + { + "epoch": 1.9914829074787073, + "grad_norm": 1.0942827463150024, + "learning_rate": 0.00017837797770703351, + "loss": 2.079, + "step": 17069 + }, + { + "epoch": 1.991599579978999, + "grad_norm": 1.271971583366394, + "learning_rate": 0.00017836310732831414, + "loss": 1.9664, + "step": 17070 + }, + { + "epoch": 1.9917162524792906, + "grad_norm": 1.0734127759933472, + "learning_rate": 0.00017834823667110784, + "loss": 2.0938, + "step": 17071 + }, + { + "epoch": 1.9918329249795823, + "grad_norm": 1.045218586921692, + "learning_rate": 0.00017833336573556868, + "loss": 2.0208, + "step": 17072 + }, + { + "epoch": 1.991949597479874, + "grad_norm": 1.1507139205932617, + "learning_rate": 0.00017831849452185091, + "loss": 1.8521, + "step": 17073 + }, + { + "epoch": 1.9920662699801657, + "grad_norm": 0.9859468936920166, + "learning_rate": 0.00017830362303010867, + "loss": 2.0428, + "step": 17074 + }, + { + "epoch": 1.9921829424804574, + "grad_norm": 1.3303388357162476, + "learning_rate": 0.0001782887512604961, + "loss": 1.9736, + "step": 17075 + }, + { + "epoch": 1.992299614980749, + "grad_norm": 1.1950968503952026, + "learning_rate": 0.0001782738792131674, + "loss": 2.0076, + "step": 17076 + }, + { + "epoch": 1.9924162874810407, + "grad_norm": 1.0967087745666504, + "learning_rate": 0.0001782590068882768, + "loss": 2.1694, + "step": 17077 + }, + { + "epoch": 1.9925329599813324, + "grad_norm": 1.038357138633728, + "learning_rate": 0.00017824413428597826, + "loss": 1.957, + "step": 17078 + }, + { + "epoch": 1.992649632481624, + "grad_norm": 1.1900599002838135, + "learning_rate": 0.00017822926140642623, + "loss": 1.9626, + "step": 17079 + }, + { + "epoch": 1.9927663049819158, + "grad_norm": 1.0900479555130005, + "learning_rate": 0.0001782143882497748, + "loss": 1.9695, + "step": 17080 + }, + { + "epoch": 1.9928829774822074, + "grad_norm": 1.007014513015747, + "learning_rate": 0.00017819951481617808, + "loss": 2.0094, + "step": 17081 + }, + { + "epoch": 1.9929996499824991, + "grad_norm": 1.100284457206726, + "learning_rate": 0.0001781846411057904, + "loss": 2.1785, + "step": 17082 + }, + { + "epoch": 1.9931163224827908, + "grad_norm": 1.1434502601623535, + "learning_rate": 0.00017816976711876586, + "loss": 2.0991, + "step": 17083 + }, + { + "epoch": 1.9932329949830825, + "grad_norm": 1.0475256443023682, + "learning_rate": 0.00017815489285525872, + "loss": 2.0682, + "step": 17084 + }, + { + "epoch": 1.9933496674833742, + "grad_norm": 1.178869605064392, + "learning_rate": 0.0001781400183154231, + "loss": 2.2193, + "step": 17085 + }, + { + "epoch": 1.9934663399836658, + "grad_norm": 0.9989344477653503, + "learning_rate": 0.00017812514349941323, + "loss": 1.9677, + "step": 17086 + }, + { + "epoch": 1.9935830124839575, + "grad_norm": 1.1997557878494263, + "learning_rate": 0.00017811026840738333, + "loss": 1.9887, + "step": 17087 + }, + { + "epoch": 1.9936996849842492, + "grad_norm": 1.049796223640442, + "learning_rate": 0.00017809539303948764, + "loss": 1.9312, + "step": 17088 + }, + { + "epoch": 1.993816357484541, + "grad_norm": 1.0111558437347412, + "learning_rate": 0.0001780805173958803, + "loss": 1.923, + "step": 17089 + }, + { + "epoch": 1.9939330299848326, + "grad_norm": 1.1415358781814575, + "learning_rate": 0.00017806564147671561, + "loss": 2.0677, + "step": 17090 + }, + { + "epoch": 1.9940497024851243, + "grad_norm": 1.1723811626434326, + "learning_rate": 0.00017805076528214767, + "loss": 2.0468, + "step": 17091 + }, + { + "epoch": 1.994166374985416, + "grad_norm": 1.0170567035675049, + "learning_rate": 0.00017803588881233086, + "loss": 1.9038, + "step": 17092 + }, + { + "epoch": 1.9942830474857076, + "grad_norm": 0.9443439245223999, + "learning_rate": 0.00017802101206741927, + "loss": 1.9235, + "step": 17093 + }, + { + "epoch": 1.9943997199859993, + "grad_norm": 1.2967809438705444, + "learning_rate": 0.00017800613504756713, + "loss": 2.1534, + "step": 17094 + }, + { + "epoch": 1.994516392486291, + "grad_norm": 1.1652148962020874, + "learning_rate": 0.00017799125775292877, + "loss": 1.9706, + "step": 17095 + }, + { + "epoch": 1.9946330649865827, + "grad_norm": 1.1026158332824707, + "learning_rate": 0.00017797638018365828, + "loss": 2.0296, + "step": 17096 + }, + { + "epoch": 1.9947497374868743, + "grad_norm": 1.055753469467163, + "learning_rate": 0.00017796150233991007, + "loss": 1.9457, + "step": 17097 + }, + { + "epoch": 1.994866409987166, + "grad_norm": 1.1683471202850342, + "learning_rate": 0.00017794662422183823, + "loss": 2.1448, + "step": 17098 + }, + { + "epoch": 1.9949830824874577, + "grad_norm": 0.9755557775497437, + "learning_rate": 0.00017793174582959702, + "loss": 2.0268, + "step": 17099 + }, + { + "epoch": 1.9950997549877494, + "grad_norm": 1.125749111175537, + "learning_rate": 0.00017791686716334075, + "loss": 2.0542, + "step": 17100 + }, + { + "epoch": 1.995216427488041, + "grad_norm": 0.9726249575614929, + "learning_rate": 0.0001779019882232236, + "loss": 1.9943, + "step": 17101 + }, + { + "epoch": 1.9953330999883327, + "grad_norm": 1.1606825590133667, + "learning_rate": 0.00017788710900939992, + "loss": 1.9919, + "step": 17102 + }, + { + "epoch": 1.9954497724886244, + "grad_norm": 0.9825003147125244, + "learning_rate": 0.00017787222952202382, + "loss": 1.9451, + "step": 17103 + }, + { + "epoch": 1.995566444988916, + "grad_norm": 1.2073712348937988, + "learning_rate": 0.0001778573497612496, + "loss": 2.0456, + "step": 17104 + }, + { + "epoch": 1.9956831174892078, + "grad_norm": 1.0060312747955322, + "learning_rate": 0.00017784246972723158, + "loss": 2.0689, + "step": 17105 + }, + { + "epoch": 1.9957997899894995, + "grad_norm": 1.0380969047546387, + "learning_rate": 0.00017782758942012399, + "loss": 1.858, + "step": 17106 + }, + { + "epoch": 1.9959164624897912, + "grad_norm": 1.1374123096466064, + "learning_rate": 0.000177812708840081, + "loss": 2.0753, + "step": 17107 + }, + { + "epoch": 1.9960331349900828, + "grad_norm": 1.2153096199035645, + "learning_rate": 0.00017779782798725704, + "loss": 2.1855, + "step": 17108 + }, + { + "epoch": 1.9961498074903745, + "grad_norm": 1.1569230556488037, + "learning_rate": 0.00017778294686180627, + "loss": 1.9829, + "step": 17109 + }, + { + "epoch": 1.9962664799906662, + "grad_norm": 1.0817856788635254, + "learning_rate": 0.00017776806546388298, + "loss": 2.043, + "step": 17110 + }, + { + "epoch": 1.9963831524909579, + "grad_norm": 1.0380977392196655, + "learning_rate": 0.0001777531837936415, + "loss": 2.0938, + "step": 17111 + }, + { + "epoch": 1.9964998249912496, + "grad_norm": 1.2046887874603271, + "learning_rate": 0.000177738301851236, + "loss": 2.2697, + "step": 17112 + }, + { + "epoch": 1.9966164974915412, + "grad_norm": 1.2238818407058716, + "learning_rate": 0.0001777234196368208, + "loss": 1.9335, + "step": 17113 + }, + { + "epoch": 1.996733169991833, + "grad_norm": 1.1305750608444214, + "learning_rate": 0.00017770853715055022, + "loss": 2.0513, + "step": 17114 + }, + { + "epoch": 1.9968498424921246, + "grad_norm": 1.1825000047683716, + "learning_rate": 0.00017769365439257852, + "loss": 1.9438, + "step": 17115 + }, + { + "epoch": 1.9969665149924163, + "grad_norm": 1.3849278688430786, + "learning_rate": 0.00017767877136306, + "loss": 2.1128, + "step": 17116 + }, + { + "epoch": 1.997083187492708, + "grad_norm": 1.0039501190185547, + "learning_rate": 0.00017766388806214888, + "loss": 1.8904, + "step": 17117 + }, + { + "epoch": 1.9971998599929996, + "grad_norm": 1.1964163780212402, + "learning_rate": 0.0001776490044899996, + "loss": 2.1139, + "step": 17118 + }, + { + "epoch": 1.9973165324932913, + "grad_norm": 1.211160659790039, + "learning_rate": 0.00017763412064676634, + "loss": 2.0494, + "step": 17119 + }, + { + "epoch": 1.997433204993583, + "grad_norm": 1.085986852645874, + "learning_rate": 0.00017761923653260343, + "loss": 2.0328, + "step": 17120 + }, + { + "epoch": 1.9975498774938747, + "grad_norm": 1.0232090950012207, + "learning_rate": 0.00017760435214766518, + "loss": 2.088, + "step": 17121 + }, + { + "epoch": 1.9976665499941664, + "grad_norm": 0.9926271438598633, + "learning_rate": 0.00017758946749210587, + "loss": 2.1033, + "step": 17122 + }, + { + "epoch": 1.997783222494458, + "grad_norm": 1.1231749057769775, + "learning_rate": 0.00017757458256607985, + "loss": 2.0608, + "step": 17123 + }, + { + "epoch": 1.9978998949947497, + "grad_norm": 1.0902552604675293, + "learning_rate": 0.0001775596973697414, + "loss": 2.1109, + "step": 17124 + }, + { + "epoch": 1.9980165674950414, + "grad_norm": 1.0895304679870605, + "learning_rate": 0.00017754481190324482, + "loss": 1.9897, + "step": 17125 + }, + { + "epoch": 1.998133239995333, + "grad_norm": 1.0865299701690674, + "learning_rate": 0.00017752992616674445, + "loss": 1.9679, + "step": 17126 + }, + { + "epoch": 1.9982499124956248, + "grad_norm": 1.0947871208190918, + "learning_rate": 0.0001775150401603946, + "loss": 2.125, + "step": 17127 + }, + { + "epoch": 1.9983665849959165, + "grad_norm": 0.9781670570373535, + "learning_rate": 0.00017750015388434966, + "loss": 2.0208, + "step": 17128 + }, + { + "epoch": 1.9984832574962081, + "grad_norm": 1.0584574937820435, + "learning_rate": 0.00017748526733876382, + "loss": 1.8417, + "step": 17129 + }, + { + "epoch": 1.9985999299964998, + "grad_norm": 1.0511329174041748, + "learning_rate": 0.00017747038052379143, + "loss": 1.7712, + "step": 17130 + }, + { + "epoch": 1.9987166024967915, + "grad_norm": 1.1802284717559814, + "learning_rate": 0.00017745549343958696, + "loss": 2.0538, + "step": 17131 + }, + { + "epoch": 1.9988332749970832, + "grad_norm": 1.0747933387756348, + "learning_rate": 0.0001774406060863046, + "loss": 2.1639, + "step": 17132 + }, + { + "epoch": 1.9989499474973749, + "grad_norm": 1.0556609630584717, + "learning_rate": 0.00017742571846409875, + "loss": 2.0843, + "step": 17133 + }, + { + "epoch": 1.9990666199976665, + "grad_norm": 1.0890485048294067, + "learning_rate": 0.0001774108305731237, + "loss": 1.9848, + "step": 17134 + }, + { + "epoch": 1.9991832924979582, + "grad_norm": 1.1513868570327759, + "learning_rate": 0.0001773959424135339, + "loss": 1.9293, + "step": 17135 + }, + { + "epoch": 1.99929996499825, + "grad_norm": 0.9552210569381714, + "learning_rate": 0.0001773810539854836, + "loss": 1.9832, + "step": 17136 + }, + { + "epoch": 1.9994166374985416, + "grad_norm": 1.3051334619522095, + "learning_rate": 0.00017736616528912712, + "loss": 2.3254, + "step": 17137 + }, + { + "epoch": 1.9995333099988333, + "grad_norm": 1.122915506362915, + "learning_rate": 0.00017735127632461888, + "loss": 2.0415, + "step": 17138 + }, + { + "epoch": 1.999649982499125, + "grad_norm": 1.1673825979232788, + "learning_rate": 0.00017733638709211324, + "loss": 2.0224, + "step": 17139 + }, + { + "epoch": 1.9997666549994166, + "grad_norm": 1.0411651134490967, + "learning_rate": 0.00017732149759176448, + "loss": 2.0406, + "step": 17140 + }, + { + "epoch": 1.9998833274997083, + "grad_norm": 1.0017940998077393, + "learning_rate": 0.000177306607823727, + "loss": 2.029, + "step": 17141 + }, + { + "epoch": 2.0, + "grad_norm": 1.0611298084259033, + "learning_rate": 0.00017729171778815516, + "loss": 1.9864, + "step": 17142 + }, + { + "epoch": 2.0001166725002917, + "grad_norm": 1.1490713357925415, + "learning_rate": 0.00017727682748520335, + "loss": 2.1033, + "step": 17143 + }, + { + "epoch": 2.0002333450005834, + "grad_norm": 1.1711500883102417, + "learning_rate": 0.00017726193691502588, + "loss": 2.002, + "step": 17144 + }, + { + "epoch": 2.000350017500875, + "grad_norm": 1.2354850769042969, + "learning_rate": 0.00017724704607777718, + "loss": 1.8623, + "step": 17145 + }, + { + "epoch": 2.0004666900011667, + "grad_norm": 1.0603541135787964, + "learning_rate": 0.00017723215497361157, + "loss": 1.8996, + "step": 17146 + }, + { + "epoch": 2.0005833625014584, + "grad_norm": 1.1081397533416748, + "learning_rate": 0.00017721726360268346, + "loss": 1.9077, + "step": 17147 + }, + { + "epoch": 2.00070003500175, + "grad_norm": 1.1112680435180664, + "learning_rate": 0.00017720237196514722, + "loss": 2.0405, + "step": 17148 + }, + { + "epoch": 2.0008167075020418, + "grad_norm": 1.147074818611145, + "learning_rate": 0.00017718748006115715, + "loss": 1.9587, + "step": 17149 + }, + { + "epoch": 2.0009333800023335, + "grad_norm": 1.0759987831115723, + "learning_rate": 0.00017717258789086779, + "loss": 1.8292, + "step": 17150 + }, + { + "epoch": 2.001050052502625, + "grad_norm": 1.2541917562484741, + "learning_rate": 0.0001771576954544334, + "loss": 1.9453, + "step": 17151 + }, + { + "epoch": 2.001166725002917, + "grad_norm": 1.2294681072235107, + "learning_rate": 0.00017714280275200843, + "loss": 2.1777, + "step": 17152 + }, + { + "epoch": 2.0012833975032085, + "grad_norm": 1.18545663356781, + "learning_rate": 0.00017712790978374727, + "loss": 1.9728, + "step": 17153 + }, + { + "epoch": 2.0014000700035, + "grad_norm": 1.2745156288146973, + "learning_rate": 0.00017711301654980432, + "loss": 1.8754, + "step": 17154 + }, + { + "epoch": 2.001516742503792, + "grad_norm": 1.1677424907684326, + "learning_rate": 0.00017709812305033388, + "loss": 1.9839, + "step": 17155 + }, + { + "epoch": 2.0016334150040835, + "grad_norm": 1.2366794347763062, + "learning_rate": 0.00017708322928549044, + "loss": 1.9952, + "step": 17156 + }, + { + "epoch": 2.001750087504375, + "grad_norm": 1.2466639280319214, + "learning_rate": 0.00017706833525542835, + "loss": 2.0945, + "step": 17157 + }, + { + "epoch": 2.001866760004667, + "grad_norm": 1.0739015340805054, + "learning_rate": 0.00017705344096030206, + "loss": 1.7623, + "step": 17158 + }, + { + "epoch": 2.0019834325049586, + "grad_norm": 1.1531440019607544, + "learning_rate": 0.00017703854640026598, + "loss": 2.0107, + "step": 17159 + }, + { + "epoch": 2.0021001050052503, + "grad_norm": 1.0771620273590088, + "learning_rate": 0.00017702365157547454, + "loss": 2.0391, + "step": 17160 + }, + { + "epoch": 2.002216777505542, + "grad_norm": 1.293487787246704, + "learning_rate": 0.00017700875648608215, + "loss": 1.9336, + "step": 17161 + }, + { + "epoch": 2.0023334500058336, + "grad_norm": 1.0837939977645874, + "learning_rate": 0.0001769938611322431, + "loss": 1.7105, + "step": 17162 + }, + { + "epoch": 2.0024501225061253, + "grad_norm": 1.2581509351730347, + "learning_rate": 0.000176978965514112, + "loss": 2.1117, + "step": 17163 + }, + { + "epoch": 2.002566795006417, + "grad_norm": 1.112501621246338, + "learning_rate": 0.0001769640696318431, + "loss": 1.8783, + "step": 17164 + }, + { + "epoch": 2.0026834675067087, + "grad_norm": 1.3213247060775757, + "learning_rate": 0.0001769491734855909, + "loss": 1.9038, + "step": 17165 + }, + { + "epoch": 2.0028001400070004, + "grad_norm": 1.093896746635437, + "learning_rate": 0.00017693427707550984, + "loss": 1.9044, + "step": 17166 + }, + { + "epoch": 2.002916812507292, + "grad_norm": 1.2878508567810059, + "learning_rate": 0.0001769193804017543, + "loss": 2.0731, + "step": 17167 + }, + { + "epoch": 2.0030334850075837, + "grad_norm": 1.1129487752914429, + "learning_rate": 0.0001769044834644788, + "loss": 1.9355, + "step": 17168 + }, + { + "epoch": 2.0031501575078754, + "grad_norm": 1.2198905944824219, + "learning_rate": 0.0001768895862638377, + "loss": 2.1974, + "step": 17169 + }, + { + "epoch": 2.003266830008167, + "grad_norm": 1.0617338418960571, + "learning_rate": 0.00017687468879998543, + "loss": 1.8018, + "step": 17170 + }, + { + "epoch": 2.0033835025084588, + "grad_norm": 1.170455813407898, + "learning_rate": 0.0001768597910730765, + "loss": 1.9188, + "step": 17171 + }, + { + "epoch": 2.0035001750087504, + "grad_norm": 0.9659020304679871, + "learning_rate": 0.0001768448930832653, + "loss": 1.8702, + "step": 17172 + }, + { + "epoch": 2.003616847509042, + "grad_norm": 1.1635154485702515, + "learning_rate": 0.0001768299948307063, + "loss": 2.0154, + "step": 17173 + }, + { + "epoch": 2.003733520009334, + "grad_norm": 1.0802353620529175, + "learning_rate": 0.0001768150963155539, + "loss": 1.888, + "step": 17174 + }, + { + "epoch": 2.0038501925096255, + "grad_norm": 0.9957510232925415, + "learning_rate": 0.00017680019753796265, + "loss": 2.1011, + "step": 17175 + }, + { + "epoch": 2.003966865009917, + "grad_norm": 1.1926345825195312, + "learning_rate": 0.00017678529849808692, + "loss": 2.0036, + "step": 17176 + }, + { + "epoch": 2.004083537510209, + "grad_norm": 1.088306188583374, + "learning_rate": 0.00017677039919608112, + "loss": 1.9373, + "step": 17177 + }, + { + "epoch": 2.0042002100105005, + "grad_norm": 1.100399374961853, + "learning_rate": 0.00017675549963209986, + "loss": 1.6893, + "step": 17178 + }, + { + "epoch": 2.004316882510792, + "grad_norm": 1.165921688079834, + "learning_rate": 0.0001767405998062975, + "loss": 1.9805, + "step": 17179 + }, + { + "epoch": 2.004433555011084, + "grad_norm": 0.9855747818946838, + "learning_rate": 0.00017672569971882856, + "loss": 1.9144, + "step": 17180 + }, + { + "epoch": 2.0045502275113756, + "grad_norm": 1.1617599725723267, + "learning_rate": 0.00017671079936984742, + "loss": 2.0422, + "step": 17181 + }, + { + "epoch": 2.0046669000116673, + "grad_norm": 1.1136317253112793, + "learning_rate": 0.00017669589875950862, + "loss": 1.9815, + "step": 17182 + }, + { + "epoch": 2.004783572511959, + "grad_norm": 1.084306001663208, + "learning_rate": 0.0001766809978879666, + "loss": 1.994, + "step": 17183 + }, + { + "epoch": 2.0049002450122506, + "grad_norm": 1.135345220565796, + "learning_rate": 0.00017666609675537587, + "loss": 1.9044, + "step": 17184 + }, + { + "epoch": 2.0050169175125423, + "grad_norm": 1.018147349357605, + "learning_rate": 0.0001766511953618909, + "loss": 1.6323, + "step": 17185 + }, + { + "epoch": 2.005133590012834, + "grad_norm": 1.3916363716125488, + "learning_rate": 0.00017663629370766613, + "loss": 1.9645, + "step": 17186 + }, + { + "epoch": 2.0052502625131257, + "grad_norm": 1.1410181522369385, + "learning_rate": 0.00017662139179285615, + "loss": 1.8792, + "step": 17187 + }, + { + "epoch": 2.0053669350134173, + "grad_norm": 1.2505011558532715, + "learning_rate": 0.0001766064896176153, + "loss": 2.0473, + "step": 17188 + }, + { + "epoch": 2.005483607513709, + "grad_norm": 1.2134367227554321, + "learning_rate": 0.00017659158718209817, + "loss": 2.0325, + "step": 17189 + }, + { + "epoch": 2.0056002800140007, + "grad_norm": 1.1490181684494019, + "learning_rate": 0.0001765766844864592, + "loss": 2.0601, + "step": 17190 + }, + { + "epoch": 2.0057169525142924, + "grad_norm": 1.0920376777648926, + "learning_rate": 0.00017656178153085297, + "loss": 1.8372, + "step": 17191 + }, + { + "epoch": 2.005833625014584, + "grad_norm": 1.2366570234298706, + "learning_rate": 0.0001765468783154339, + "loss": 2.059, + "step": 17192 + }, + { + "epoch": 2.0059502975148757, + "grad_norm": 1.1684305667877197, + "learning_rate": 0.00017653197484035648, + "loss": 1.8244, + "step": 17193 + }, + { + "epoch": 2.0060669700151674, + "grad_norm": 1.138700246810913, + "learning_rate": 0.00017651707110577522, + "loss": 1.9251, + "step": 17194 + }, + { + "epoch": 2.006183642515459, + "grad_norm": 1.156785011291504, + "learning_rate": 0.00017650216711184464, + "loss": 1.9005, + "step": 17195 + }, + { + "epoch": 2.006300315015751, + "grad_norm": 1.119043231010437, + "learning_rate": 0.00017648726285871927, + "loss": 2.0158, + "step": 17196 + }, + { + "epoch": 2.0064169875160425, + "grad_norm": 1.1478111743927002, + "learning_rate": 0.00017647235834655366, + "loss": 2.061, + "step": 17197 + }, + { + "epoch": 2.006533660016334, + "grad_norm": 1.0535738468170166, + "learning_rate": 0.00017645745357550217, + "loss": 2.0176, + "step": 17198 + }, + { + "epoch": 2.006650332516626, + "grad_norm": 1.1080808639526367, + "learning_rate": 0.00017644254854571947, + "loss": 1.8736, + "step": 17199 + }, + { + "epoch": 2.0067670050169175, + "grad_norm": 1.1174898147583008, + "learning_rate": 0.00017642764325736003, + "loss": 1.9148, + "step": 17200 + }, + { + "epoch": 2.006883677517209, + "grad_norm": 1.4732290506362915, + "learning_rate": 0.00017641273771057834, + "loss": 2.0314, + "step": 17201 + }, + { + "epoch": 2.007000350017501, + "grad_norm": 1.2599101066589355, + "learning_rate": 0.00017639783190552893, + "loss": 2.0339, + "step": 17202 + }, + { + "epoch": 2.0071170225177926, + "grad_norm": 1.0397950410842896, + "learning_rate": 0.00017638292584236636, + "loss": 2.1039, + "step": 17203 + }, + { + "epoch": 2.0072336950180842, + "grad_norm": 1.0389200448989868, + "learning_rate": 0.00017636801952124515, + "loss": 2.0318, + "step": 17204 + }, + { + "epoch": 2.007350367518376, + "grad_norm": 1.1516461372375488, + "learning_rate": 0.00017635311294231985, + "loss": 2.0909, + "step": 17205 + }, + { + "epoch": 2.0074670400186676, + "grad_norm": 1.081581950187683, + "learning_rate": 0.00017633820610574494, + "loss": 1.8805, + "step": 17206 + }, + { + "epoch": 2.0075837125189593, + "grad_norm": 1.0936219692230225, + "learning_rate": 0.000176323299011675, + "loss": 1.9551, + "step": 17207 + }, + { + "epoch": 2.007700385019251, + "grad_norm": 1.1752856969833374, + "learning_rate": 0.00017630839166026457, + "loss": 2.0117, + "step": 17208 + }, + { + "epoch": 2.0078170575195426, + "grad_norm": 1.2061396837234497, + "learning_rate": 0.00017629348405166813, + "loss": 1.8038, + "step": 17209 + }, + { + "epoch": 2.0079337300198343, + "grad_norm": 1.1676019430160522, + "learning_rate": 0.00017627857618604032, + "loss": 1.8263, + "step": 17210 + }, + { + "epoch": 2.008050402520126, + "grad_norm": 1.4547102451324463, + "learning_rate": 0.00017626366806353563, + "loss": 2.0719, + "step": 17211 + }, + { + "epoch": 2.0081670750204177, + "grad_norm": 1.0545110702514648, + "learning_rate": 0.00017624875968430865, + "loss": 2.0363, + "step": 17212 + }, + { + "epoch": 2.0082837475207094, + "grad_norm": 1.304064154624939, + "learning_rate": 0.00017623385104851395, + "loss": 1.9165, + "step": 17213 + }, + { + "epoch": 2.008400420021001, + "grad_norm": 1.1982327699661255, + "learning_rate": 0.00017621894215630595, + "loss": 1.9386, + "step": 17214 + }, + { + "epoch": 2.0085170925212927, + "grad_norm": 1.0566555261611938, + "learning_rate": 0.0001762040330078394, + "loss": 1.9955, + "step": 17215 + }, + { + "epoch": 2.0086337650215844, + "grad_norm": 1.3259994983673096, + "learning_rate": 0.00017618912360326867, + "loss": 2.0653, + "step": 17216 + }, + { + "epoch": 2.008750437521876, + "grad_norm": 1.115475058555603, + "learning_rate": 0.00017617421394274847, + "loss": 1.6372, + "step": 17217 + }, + { + "epoch": 2.008867110022168, + "grad_norm": 1.097494125366211, + "learning_rate": 0.00017615930402643332, + "loss": 2.0351, + "step": 17218 + }, + { + "epoch": 2.0089837825224595, + "grad_norm": 1.1368930339813232, + "learning_rate": 0.00017614439385447777, + "loss": 1.8701, + "step": 17219 + }, + { + "epoch": 2.009100455022751, + "grad_norm": 1.2730334997177124, + "learning_rate": 0.00017612948342703643, + "loss": 1.8988, + "step": 17220 + }, + { + "epoch": 2.009217127523043, + "grad_norm": 1.3386443853378296, + "learning_rate": 0.00017611457274426386, + "loss": 2.021, + "step": 17221 + }, + { + "epoch": 2.0093338000233345, + "grad_norm": 1.2253860235214233, + "learning_rate": 0.00017609966180631462, + "loss": 2.0984, + "step": 17222 + }, + { + "epoch": 2.009450472523626, + "grad_norm": 1.1143335103988647, + "learning_rate": 0.00017608475061334328, + "loss": 2.0166, + "step": 17223 + }, + { + "epoch": 2.009567145023918, + "grad_norm": 1.213422179222107, + "learning_rate": 0.0001760698391655045, + "loss": 2.2285, + "step": 17224 + }, + { + "epoch": 2.0096838175242095, + "grad_norm": 1.1087568998336792, + "learning_rate": 0.00017605492746295277, + "loss": 2.0005, + "step": 17225 + }, + { + "epoch": 2.0098004900245012, + "grad_norm": 1.1274826526641846, + "learning_rate": 0.00017604001550584268, + "loss": 2.126, + "step": 17226 + }, + { + "epoch": 2.009917162524793, + "grad_norm": 1.1962193250656128, + "learning_rate": 0.0001760251032943289, + "loss": 2.0082, + "step": 17227 + }, + { + "epoch": 2.0100338350250846, + "grad_norm": 1.3497637510299683, + "learning_rate": 0.000176010190828566, + "loss": 2.1897, + "step": 17228 + }, + { + "epoch": 2.0101505075253763, + "grad_norm": 1.0879424810409546, + "learning_rate": 0.00017599527810870856, + "loss": 1.8266, + "step": 17229 + }, + { + "epoch": 2.010267180025668, + "grad_norm": 1.2372455596923828, + "learning_rate": 0.0001759803651349112, + "loss": 2.0134, + "step": 17230 + }, + { + "epoch": 2.0103838525259596, + "grad_norm": 0.9781936407089233, + "learning_rate": 0.00017596545190732844, + "loss": 1.7805, + "step": 17231 + }, + { + "epoch": 2.0105005250262513, + "grad_norm": 1.080270528793335, + "learning_rate": 0.00017595053842611494, + "loss": 1.9567, + "step": 17232 + }, + { + "epoch": 2.010617197526543, + "grad_norm": 1.0685666799545288, + "learning_rate": 0.00017593562469142535, + "loss": 1.8707, + "step": 17233 + }, + { + "epoch": 2.0107338700268347, + "grad_norm": 1.3872296810150146, + "learning_rate": 0.0001759207107034142, + "loss": 2.1196, + "step": 17234 + }, + { + "epoch": 2.0108505425271264, + "grad_norm": 1.1395559310913086, + "learning_rate": 0.00017590579646223616, + "loss": 2.0867, + "step": 17235 + }, + { + "epoch": 2.010967215027418, + "grad_norm": 1.3013324737548828, + "learning_rate": 0.0001758908819680458, + "loss": 2.0305, + "step": 17236 + }, + { + "epoch": 2.0110838875277097, + "grad_norm": 1.0293556451797485, + "learning_rate": 0.0001758759672209978, + "loss": 2.0032, + "step": 17237 + }, + { + "epoch": 2.0112005600280014, + "grad_norm": 1.0890856981277466, + "learning_rate": 0.00017586105222124672, + "loss": 1.9839, + "step": 17238 + }, + { + "epoch": 2.011317232528293, + "grad_norm": 1.2674039602279663, + "learning_rate": 0.00017584613696894723, + "loss": 1.9085, + "step": 17239 + }, + { + "epoch": 2.0114339050285848, + "grad_norm": 1.091006875038147, + "learning_rate": 0.00017583122146425388, + "loss": 2.0362, + "step": 17240 + }, + { + "epoch": 2.0115505775288764, + "grad_norm": 1.065615177154541, + "learning_rate": 0.00017581630570732138, + "loss": 1.891, + "step": 17241 + }, + { + "epoch": 2.011667250029168, + "grad_norm": 1.1792540550231934, + "learning_rate": 0.00017580138969830429, + "loss": 1.8481, + "step": 17242 + }, + { + "epoch": 2.01178392252946, + "grad_norm": 1.2097777128219604, + "learning_rate": 0.0001757864734373573, + "loss": 1.9483, + "step": 17243 + }, + { + "epoch": 2.0119005950297515, + "grad_norm": 1.0322874784469604, + "learning_rate": 0.00017577155692463498, + "loss": 1.803, + "step": 17244 + }, + { + "epoch": 2.012017267530043, + "grad_norm": 1.2541906833648682, + "learning_rate": 0.00017575664016029207, + "loss": 1.9569, + "step": 17245 + }, + { + "epoch": 2.012133940030335, + "grad_norm": 1.276663899421692, + "learning_rate": 0.0001757417231444831, + "loss": 2.0679, + "step": 17246 + }, + { + "epoch": 2.0122506125306265, + "grad_norm": 1.3551969528198242, + "learning_rate": 0.00017572680587736283, + "loss": 1.903, + "step": 17247 + }, + { + "epoch": 2.012367285030918, + "grad_norm": 1.132157564163208, + "learning_rate": 0.0001757118883590858, + "loss": 1.8589, + "step": 17248 + }, + { + "epoch": 2.01248395753121, + "grad_norm": 1.004747748374939, + "learning_rate": 0.0001756969705898067, + "loss": 1.9916, + "step": 17249 + }, + { + "epoch": 2.0126006300315016, + "grad_norm": 1.157936692237854, + "learning_rate": 0.00017568205256968017, + "loss": 2.0039, + "step": 17250 + }, + { + "epoch": 2.0127173025317933, + "grad_norm": 1.1466952562332153, + "learning_rate": 0.0001756671342988609, + "loss": 1.6826, + "step": 17251 + }, + { + "epoch": 2.012833975032085, + "grad_norm": 1.1392191648483276, + "learning_rate": 0.0001756522157775035, + "loss": 1.9867, + "step": 17252 + }, + { + "epoch": 2.0129506475323766, + "grad_norm": 1.2610362768173218, + "learning_rate": 0.00017563729700576265, + "loss": 1.9541, + "step": 17253 + }, + { + "epoch": 2.0130673200326683, + "grad_norm": 1.217177391052246, + "learning_rate": 0.000175622377983793, + "loss": 2.0074, + "step": 17254 + }, + { + "epoch": 2.01318399253296, + "grad_norm": 1.1286523342132568, + "learning_rate": 0.00017560745871174924, + "loss": 1.8163, + "step": 17255 + }, + { + "epoch": 2.0133006650332517, + "grad_norm": 1.3139772415161133, + "learning_rate": 0.00017559253918978602, + "loss": 1.9318, + "step": 17256 + }, + { + "epoch": 2.0134173375335433, + "grad_norm": 1.0768043994903564, + "learning_rate": 0.00017557761941805803, + "loss": 1.7485, + "step": 17257 + }, + { + "epoch": 2.013534010033835, + "grad_norm": 1.1728401184082031, + "learning_rate": 0.00017556269939671988, + "loss": 2.094, + "step": 17258 + }, + { + "epoch": 2.0136506825341267, + "grad_norm": 1.1591094732284546, + "learning_rate": 0.00017554777912592628, + "loss": 1.988, + "step": 17259 + }, + { + "epoch": 2.0137673550344184, + "grad_norm": 1.0496656894683838, + "learning_rate": 0.00017553285860583194, + "loss": 1.871, + "step": 17260 + }, + { + "epoch": 2.01388402753471, + "grad_norm": 1.179181694984436, + "learning_rate": 0.00017551793783659146, + "loss": 1.9833, + "step": 17261 + }, + { + "epoch": 2.0140007000350018, + "grad_norm": 1.017535924911499, + "learning_rate": 0.0001755030168183596, + "loss": 1.8622, + "step": 17262 + }, + { + "epoch": 2.0141173725352934, + "grad_norm": 1.2915273904800415, + "learning_rate": 0.00017548809555129104, + "loss": 2.0753, + "step": 17263 + }, + { + "epoch": 2.014234045035585, + "grad_norm": 1.4043668508529663, + "learning_rate": 0.00017547317403554045, + "loss": 2.0062, + "step": 17264 + }, + { + "epoch": 2.014350717535877, + "grad_norm": 1.1013226509094238, + "learning_rate": 0.00017545825227126247, + "loss": 2.0083, + "step": 17265 + }, + { + "epoch": 2.0144673900361685, + "grad_norm": 1.2108922004699707, + "learning_rate": 0.0001754433302586119, + "loss": 2.1117, + "step": 17266 + }, + { + "epoch": 2.01458406253646, + "grad_norm": 1.1356432437896729, + "learning_rate": 0.00017542840799774333, + "loss": 1.9923, + "step": 17267 + }, + { + "epoch": 2.014700735036752, + "grad_norm": 1.1638065576553345, + "learning_rate": 0.0001754134854888115, + "loss": 2.0951, + "step": 17268 + }, + { + "epoch": 2.0148174075370435, + "grad_norm": 1.129414439201355, + "learning_rate": 0.0001753985627319711, + "loss": 2.0185, + "step": 17269 + }, + { + "epoch": 2.014934080037335, + "grad_norm": 0.9983961582183838, + "learning_rate": 0.00017538363972737685, + "loss": 1.9284, + "step": 17270 + }, + { + "epoch": 2.015050752537627, + "grad_norm": 1.0587246417999268, + "learning_rate": 0.00017536871647518345, + "loss": 1.8282, + "step": 17271 + }, + { + "epoch": 2.0151674250379186, + "grad_norm": 1.273529291152954, + "learning_rate": 0.0001753537929755456, + "loss": 2.0006, + "step": 17272 + }, + { + "epoch": 2.0152840975382103, + "grad_norm": 0.9812563061714172, + "learning_rate": 0.000175338869228618, + "loss": 1.8209, + "step": 17273 + }, + { + "epoch": 2.015400770038502, + "grad_norm": 1.1631622314453125, + "learning_rate": 0.0001753239452345554, + "loss": 1.9089, + "step": 17274 + }, + { + "epoch": 2.0155174425387936, + "grad_norm": 1.153010606765747, + "learning_rate": 0.0001753090209935125, + "loss": 1.9524, + "step": 17275 + }, + { + "epoch": 2.0156341150390853, + "grad_norm": 1.08797287940979, + "learning_rate": 0.000175294096505644, + "loss": 1.9277, + "step": 17276 + }, + { + "epoch": 2.015750787539377, + "grad_norm": 1.151557207107544, + "learning_rate": 0.00017527917177110464, + "loss": 1.9795, + "step": 17277 + }, + { + "epoch": 2.0158674600396687, + "grad_norm": 1.0890008211135864, + "learning_rate": 0.0001752642467900491, + "loss": 1.9111, + "step": 17278 + }, + { + "epoch": 2.0159841325399603, + "grad_norm": 1.1534303426742554, + "learning_rate": 0.0001752493215626322, + "loss": 1.8603, + "step": 17279 + }, + { + "epoch": 2.016100805040252, + "grad_norm": 1.057554006576538, + "learning_rate": 0.00017523439608900857, + "loss": 1.7083, + "step": 17280 + }, + { + "epoch": 2.0162174775405437, + "grad_norm": 1.3970595598220825, + "learning_rate": 0.00017521947036933298, + "loss": 1.9953, + "step": 17281 + }, + { + "epoch": 2.0163341500408354, + "grad_norm": 1.4656732082366943, + "learning_rate": 0.0001752045444037602, + "loss": 2.1285, + "step": 17282 + }, + { + "epoch": 2.016450822541127, + "grad_norm": 1.0479954481124878, + "learning_rate": 0.0001751896181924449, + "loss": 1.8272, + "step": 17283 + }, + { + "epoch": 2.0165674950414187, + "grad_norm": 1.231373906135559, + "learning_rate": 0.00017517469173554184, + "loss": 1.7582, + "step": 17284 + }, + { + "epoch": 2.0166841675417104, + "grad_norm": 1.1043143272399902, + "learning_rate": 0.00017515976503320577, + "loss": 1.9714, + "step": 17285 + }, + { + "epoch": 2.016800840042002, + "grad_norm": 1.043727159500122, + "learning_rate": 0.00017514483808559142, + "loss": 1.8521, + "step": 17286 + }, + { + "epoch": 2.016917512542294, + "grad_norm": 1.1285650730133057, + "learning_rate": 0.00017512991089285355, + "loss": 2.1459, + "step": 17287 + }, + { + "epoch": 2.0170341850425855, + "grad_norm": 1.399453043937683, + "learning_rate": 0.00017511498345514694, + "loss": 2.1561, + "step": 17288 + }, + { + "epoch": 2.017150857542877, + "grad_norm": 1.209006667137146, + "learning_rate": 0.00017510005577262626, + "loss": 1.8836, + "step": 17289 + }, + { + "epoch": 2.017267530043169, + "grad_norm": 1.2875745296478271, + "learning_rate": 0.00017508512784544637, + "loss": 2.1917, + "step": 17290 + }, + { + "epoch": 2.0173842025434605, + "grad_norm": 1.0781971216201782, + "learning_rate": 0.00017507019967376194, + "loss": 1.8046, + "step": 17291 + }, + { + "epoch": 2.017500875043752, + "grad_norm": 1.081507921218872, + "learning_rate": 0.00017505527125772776, + "loss": 1.7925, + "step": 17292 + }, + { + "epoch": 2.017617547544044, + "grad_norm": 1.2762234210968018, + "learning_rate": 0.0001750403425974986, + "loss": 2.092, + "step": 17293 + }, + { + "epoch": 2.0177342200443356, + "grad_norm": 1.1609629392623901, + "learning_rate": 0.00017502541369322915, + "loss": 1.84, + "step": 17294 + }, + { + "epoch": 2.0178508925446272, + "grad_norm": 1.2415645122528076, + "learning_rate": 0.0001750104845450743, + "loss": 1.9915, + "step": 17295 + }, + { + "epoch": 2.017967565044919, + "grad_norm": 1.1382362842559814, + "learning_rate": 0.00017499555515318875, + "loss": 1.9804, + "step": 17296 + }, + { + "epoch": 2.0180842375452106, + "grad_norm": 1.1759817600250244, + "learning_rate": 0.00017498062551772725, + "loss": 2.1732, + "step": 17297 + }, + { + "epoch": 2.0182009100455023, + "grad_norm": 1.3651602268218994, + "learning_rate": 0.00017496569563884464, + "loss": 1.9645, + "step": 17298 + }, + { + "epoch": 2.018317582545794, + "grad_norm": 1.0403845310211182, + "learning_rate": 0.00017495076551669563, + "loss": 2.0799, + "step": 17299 + }, + { + "epoch": 2.0184342550460856, + "grad_norm": 1.1441712379455566, + "learning_rate": 0.00017493583515143507, + "loss": 1.9964, + "step": 17300 + }, + { + "epoch": 2.0185509275463773, + "grad_norm": 1.3116556406021118, + "learning_rate": 0.0001749209045432176, + "loss": 1.9785, + "step": 17301 + }, + { + "epoch": 2.018667600046669, + "grad_norm": 1.1737996339797974, + "learning_rate": 0.0001749059736921982, + "loss": 1.9949, + "step": 17302 + }, + { + "epoch": 2.0187842725469607, + "grad_norm": 1.1812046766281128, + "learning_rate": 0.00017489104259853154, + "loss": 1.9548, + "step": 17303 + }, + { + "epoch": 2.0189009450472524, + "grad_norm": 1.1461488008499146, + "learning_rate": 0.0001748761112623724, + "loss": 2.002, + "step": 17304 + }, + { + "epoch": 2.019017617547544, + "grad_norm": 1.1778128147125244, + "learning_rate": 0.00017486117968387559, + "loss": 2.0288, + "step": 17305 + }, + { + "epoch": 2.0191342900478357, + "grad_norm": 1.2580610513687134, + "learning_rate": 0.00017484624786319596, + "loss": 1.9207, + "step": 17306 + }, + { + "epoch": 2.0192509625481274, + "grad_norm": 1.2940452098846436, + "learning_rate": 0.0001748313158004883, + "loss": 1.9711, + "step": 17307 + }, + { + "epoch": 2.019367635048419, + "grad_norm": 1.1081982851028442, + "learning_rate": 0.0001748163834959073, + "loss": 2.0985, + "step": 17308 + }, + { + "epoch": 2.0194843075487108, + "grad_norm": 1.0378128290176392, + "learning_rate": 0.00017480145094960788, + "loss": 1.88, + "step": 17309 + }, + { + "epoch": 2.0196009800490025, + "grad_norm": 1.0874276161193848, + "learning_rate": 0.00017478651816174473, + "loss": 1.8386, + "step": 17310 + }, + { + "epoch": 2.019717652549294, + "grad_norm": 1.1120882034301758, + "learning_rate": 0.00017477158513247277, + "loss": 1.9855, + "step": 17311 + }, + { + "epoch": 2.019834325049586, + "grad_norm": 1.2827014923095703, + "learning_rate": 0.00017475665186194675, + "loss": 1.8347, + "step": 17312 + }, + { + "epoch": 2.0199509975498775, + "grad_norm": 1.189776062965393, + "learning_rate": 0.00017474171835032146, + "loss": 1.9945, + "step": 17313 + }, + { + "epoch": 2.020067670050169, + "grad_norm": 1.1186922788619995, + "learning_rate": 0.00017472678459775178, + "loss": 1.9549, + "step": 17314 + }, + { + "epoch": 2.020184342550461, + "grad_norm": 1.3934224843978882, + "learning_rate": 0.00017471185060439248, + "loss": 1.9517, + "step": 17315 + }, + { + "epoch": 2.0203010150507525, + "grad_norm": 1.065629243850708, + "learning_rate": 0.00017469691637039844, + "loss": 1.9608, + "step": 17316 + }, + { + "epoch": 2.0204176875510442, + "grad_norm": 1.3072530031204224, + "learning_rate": 0.00017468198189592437, + "loss": 1.9793, + "step": 17317 + }, + { + "epoch": 2.020534360051336, + "grad_norm": 1.115960955619812, + "learning_rate": 0.00017466704718112523, + "loss": 1.8947, + "step": 17318 + }, + { + "epoch": 2.0206510325516276, + "grad_norm": 0.9417558908462524, + "learning_rate": 0.00017465211222615572, + "loss": 1.8895, + "step": 17319 + }, + { + "epoch": 2.0207677050519193, + "grad_norm": 1.2226873636245728, + "learning_rate": 0.00017463717703117073, + "loss": 1.9059, + "step": 17320 + }, + { + "epoch": 2.020884377552211, + "grad_norm": 1.3734954595565796, + "learning_rate": 0.0001746222415963251, + "loss": 2.0331, + "step": 17321 + }, + { + "epoch": 2.0210010500525026, + "grad_norm": 1.086093544960022, + "learning_rate": 0.00017460730592177366, + "loss": 1.9596, + "step": 17322 + }, + { + "epoch": 2.0211177225527943, + "grad_norm": 1.3094768524169922, + "learning_rate": 0.0001745923700076712, + "loss": 1.9472, + "step": 17323 + }, + { + "epoch": 2.021234395053086, + "grad_norm": 1.2804723978042603, + "learning_rate": 0.00017457743385417265, + "loss": 1.898, + "step": 17324 + }, + { + "epoch": 2.0213510675533777, + "grad_norm": 1.177069067955017, + "learning_rate": 0.00017456249746143276, + "loss": 2.0182, + "step": 17325 + }, + { + "epoch": 2.0214677400536694, + "grad_norm": 1.2269082069396973, + "learning_rate": 0.0001745475608296064, + "loss": 1.9953, + "step": 17326 + }, + { + "epoch": 2.021584412553961, + "grad_norm": 1.3133846521377563, + "learning_rate": 0.0001745326239588484, + "loss": 2.1907, + "step": 17327 + }, + { + "epoch": 2.0217010850542527, + "grad_norm": 1.2267870903015137, + "learning_rate": 0.00017451768684931368, + "loss": 1.9878, + "step": 17328 + }, + { + "epoch": 2.0218177575545444, + "grad_norm": 1.1772129535675049, + "learning_rate": 0.00017450274950115706, + "loss": 1.9488, + "step": 17329 + }, + { + "epoch": 2.021934430054836, + "grad_norm": 1.1439021825790405, + "learning_rate": 0.0001744878119145334, + "loss": 1.9622, + "step": 17330 + }, + { + "epoch": 2.0220511025551278, + "grad_norm": 1.1838765144348145, + "learning_rate": 0.0001744728740895975, + "loss": 1.9122, + "step": 17331 + }, + { + "epoch": 2.0221677750554194, + "grad_norm": 1.1422759294509888, + "learning_rate": 0.00017445793602650428, + "loss": 1.8572, + "step": 17332 + }, + { + "epoch": 2.022284447555711, + "grad_norm": 1.288913607597351, + "learning_rate": 0.0001744429977254086, + "loss": 2.0405, + "step": 17333 + }, + { + "epoch": 2.022401120056003, + "grad_norm": 1.1183654069900513, + "learning_rate": 0.00017442805918646526, + "loss": 1.8495, + "step": 17334 + }, + { + "epoch": 2.0225177925562945, + "grad_norm": 1.152724027633667, + "learning_rate": 0.00017441312040982916, + "loss": 1.9093, + "step": 17335 + }, + { + "epoch": 2.022634465056586, + "grad_norm": 1.2662357091903687, + "learning_rate": 0.0001743981813956552, + "loss": 1.8017, + "step": 17336 + }, + { + "epoch": 2.022751137556878, + "grad_norm": 1.1799848079681396, + "learning_rate": 0.00017438324214409824, + "loss": 1.9326, + "step": 17337 + }, + { + "epoch": 2.0228678100571695, + "grad_norm": 1.1482717990875244, + "learning_rate": 0.00017436830265531313, + "loss": 1.8457, + "step": 17338 + }, + { + "epoch": 2.022984482557461, + "grad_norm": 1.1163822412490845, + "learning_rate": 0.00017435336292945475, + "loss": 2.006, + "step": 17339 + }, + { + "epoch": 2.023101155057753, + "grad_norm": 1.1640444993972778, + "learning_rate": 0.00017433842296667797, + "loss": 1.9736, + "step": 17340 + }, + { + "epoch": 2.0232178275580446, + "grad_norm": 1.1509486436843872, + "learning_rate": 0.00017432348276713773, + "loss": 1.7933, + "step": 17341 + }, + { + "epoch": 2.0233345000583363, + "grad_norm": 1.2459020614624023, + "learning_rate": 0.00017430854233098885, + "loss": 1.8403, + "step": 17342 + }, + { + "epoch": 2.023451172558628, + "grad_norm": 1.1179336309432983, + "learning_rate": 0.00017429360165838623, + "loss": 1.8377, + "step": 17343 + }, + { + "epoch": 2.0235678450589196, + "grad_norm": 1.1338249444961548, + "learning_rate": 0.0001742786607494848, + "loss": 1.964, + "step": 17344 + }, + { + "epoch": 2.0236845175592113, + "grad_norm": 1.299942970275879, + "learning_rate": 0.00017426371960443937, + "loss": 2.1834, + "step": 17345 + }, + { + "epoch": 2.023801190059503, + "grad_norm": 1.1999000310897827, + "learning_rate": 0.00017424877822340488, + "loss": 2.0376, + "step": 17346 + }, + { + "epoch": 2.0239178625597947, + "grad_norm": 1.335165023803711, + "learning_rate": 0.00017423383660653626, + "loss": 2.0041, + "step": 17347 + }, + { + "epoch": 2.0240345350600863, + "grad_norm": 1.0425463914871216, + "learning_rate": 0.00017421889475398837, + "loss": 1.8696, + "step": 17348 + }, + { + "epoch": 2.024151207560378, + "grad_norm": 1.2723076343536377, + "learning_rate": 0.0001742039526659161, + "loss": 1.8851, + "step": 17349 + }, + { + "epoch": 2.0242678800606697, + "grad_norm": 1.20689857006073, + "learning_rate": 0.00017418901034247435, + "loss": 1.8799, + "step": 17350 + }, + { + "epoch": 2.0243845525609614, + "grad_norm": 1.1994328498840332, + "learning_rate": 0.0001741740677838181, + "loss": 2.0552, + "step": 17351 + }, + { + "epoch": 2.024501225061253, + "grad_norm": 1.0255911350250244, + "learning_rate": 0.00017415912499010216, + "loss": 1.8423, + "step": 17352 + }, + { + "epoch": 2.0246178975615448, + "grad_norm": 1.2658400535583496, + "learning_rate": 0.00017414418196148146, + "loss": 2.0696, + "step": 17353 + }, + { + "epoch": 2.0247345700618364, + "grad_norm": 1.1683186292648315, + "learning_rate": 0.00017412923869811092, + "loss": 2.0095, + "step": 17354 + }, + { + "epoch": 2.024851242562128, + "grad_norm": 1.1948343515396118, + "learning_rate": 0.0001741142952001455, + "loss": 2.0447, + "step": 17355 + }, + { + "epoch": 2.02496791506242, + "grad_norm": 1.1453155279159546, + "learning_rate": 0.00017409935146774013, + "loss": 1.9539, + "step": 17356 + }, + { + "epoch": 2.0250845875627115, + "grad_norm": 1.0620722770690918, + "learning_rate": 0.00017408440750104964, + "loss": 2.0429, + "step": 17357 + }, + { + "epoch": 2.025201260063003, + "grad_norm": 1.023351788520813, + "learning_rate": 0.000174069463300229, + "loss": 1.972, + "step": 17358 + }, + { + "epoch": 2.025317932563295, + "grad_norm": 1.1280947923660278, + "learning_rate": 0.0001740545188654332, + "loss": 1.9902, + "step": 17359 + }, + { + "epoch": 2.0254346050635865, + "grad_norm": 1.1325910091400146, + "learning_rate": 0.000174039574196817, + "loss": 1.9013, + "step": 17360 + }, + { + "epoch": 2.025551277563878, + "grad_norm": 1.19563889503479, + "learning_rate": 0.0001740246292945355, + "loss": 2.0215, + "step": 17361 + }, + { + "epoch": 2.02566795006417, + "grad_norm": 1.2809288501739502, + "learning_rate": 0.00017400968415874353, + "loss": 2.0394, + "step": 17362 + }, + { + "epoch": 2.0257846225644616, + "grad_norm": 1.0724385976791382, + "learning_rate": 0.0001739947387895961, + "loss": 2.0954, + "step": 17363 + }, + { + "epoch": 2.0259012950647532, + "grad_norm": 1.1039732694625854, + "learning_rate": 0.00017397979318724806, + "loss": 2.0232, + "step": 17364 + }, + { + "epoch": 2.026017967565045, + "grad_norm": 1.116710901260376, + "learning_rate": 0.00017396484735185443, + "loss": 1.8383, + "step": 17365 + }, + { + "epoch": 2.0261346400653366, + "grad_norm": 1.3664131164550781, + "learning_rate": 0.00017394990128357, + "loss": 1.9202, + "step": 17366 + }, + { + "epoch": 2.0262513125656283, + "grad_norm": 1.2133969068527222, + "learning_rate": 0.00017393495498254998, + "loss": 1.9651, + "step": 17367 + }, + { + "epoch": 2.02636798506592, + "grad_norm": 1.3446309566497803, + "learning_rate": 0.00017392000844894913, + "loss": 1.9797, + "step": 17368 + }, + { + "epoch": 2.0264846575662117, + "grad_norm": 1.176297664642334, + "learning_rate": 0.00017390506168292238, + "loss": 1.9685, + "step": 17369 + }, + { + "epoch": 2.0266013300665033, + "grad_norm": 1.0504645109176636, + "learning_rate": 0.0001738901146846248, + "loss": 1.8444, + "step": 17370 + }, + { + "epoch": 2.026718002566795, + "grad_norm": 1.2343260049819946, + "learning_rate": 0.00017387516745421126, + "loss": 1.872, + "step": 17371 + }, + { + "epoch": 2.0268346750670867, + "grad_norm": 1.1863127946853638, + "learning_rate": 0.00017386021999183672, + "loss": 2.0442, + "step": 17372 + }, + { + "epoch": 2.0269513475673784, + "grad_norm": 1.0252232551574707, + "learning_rate": 0.00017384527229765618, + "loss": 1.9153, + "step": 17373 + }, + { + "epoch": 2.02706802006767, + "grad_norm": 1.094791293144226, + "learning_rate": 0.0001738303243718246, + "loss": 1.8125, + "step": 17374 + }, + { + "epoch": 2.0271846925679617, + "grad_norm": 1.3894188404083252, + "learning_rate": 0.00017381537621449688, + "loss": 1.9313, + "step": 17375 + }, + { + "epoch": 2.0273013650682534, + "grad_norm": 1.4788540601730347, + "learning_rate": 0.00017380042782582806, + "loss": 2.1783, + "step": 17376 + }, + { + "epoch": 2.027418037568545, + "grad_norm": 1.078558325767517, + "learning_rate": 0.00017378547920597306, + "loss": 2.0, + "step": 17377 + }, + { + "epoch": 2.027534710068837, + "grad_norm": 1.3261923789978027, + "learning_rate": 0.00017377053035508687, + "loss": 1.9413, + "step": 17378 + }, + { + "epoch": 2.0276513825691285, + "grad_norm": 1.1232492923736572, + "learning_rate": 0.00017375558127332444, + "loss": 1.9516, + "step": 17379 + }, + { + "epoch": 2.02776805506942, + "grad_norm": 1.2787672281265259, + "learning_rate": 0.00017374063196084075, + "loss": 1.864, + "step": 17380 + }, + { + "epoch": 2.027884727569712, + "grad_norm": 1.1279963254928589, + "learning_rate": 0.00017372568241779085, + "loss": 2.0021, + "step": 17381 + }, + { + "epoch": 2.0280014000700035, + "grad_norm": 1.4724081754684448, + "learning_rate": 0.0001737107326443296, + "loss": 1.8552, + "step": 17382 + }, + { + "epoch": 2.028118072570295, + "grad_norm": 1.1950218677520752, + "learning_rate": 0.0001736957826406121, + "loss": 1.8747, + "step": 17383 + }, + { + "epoch": 2.028234745070587, + "grad_norm": 1.0075757503509521, + "learning_rate": 0.0001736808324067933, + "loss": 1.6856, + "step": 17384 + }, + { + "epoch": 2.0283514175708786, + "grad_norm": 1.1614034175872803, + "learning_rate": 0.00017366588194302814, + "loss": 2.0624, + "step": 17385 + }, + { + "epoch": 2.0284680900711702, + "grad_norm": 1.2388300895690918, + "learning_rate": 0.00017365093124947165, + "loss": 1.9425, + "step": 17386 + }, + { + "epoch": 2.028584762571462, + "grad_norm": 1.2133573293685913, + "learning_rate": 0.0001736359803262788, + "loss": 2.1052, + "step": 17387 + }, + { + "epoch": 2.0287014350717536, + "grad_norm": 1.2490699291229248, + "learning_rate": 0.00017362102917360458, + "loss": 2.0937, + "step": 17388 + }, + { + "epoch": 2.0288181075720453, + "grad_norm": 1.2096794843673706, + "learning_rate": 0.00017360607779160405, + "loss": 1.9834, + "step": 17389 + }, + { + "epoch": 2.028934780072337, + "grad_norm": 1.1541757583618164, + "learning_rate": 0.00017359112618043208, + "loss": 1.9431, + "step": 17390 + }, + { + "epoch": 2.0290514525726286, + "grad_norm": 1.1998568773269653, + "learning_rate": 0.0001735761743402438, + "loss": 2.1619, + "step": 17391 + }, + { + "epoch": 2.0291681250729203, + "grad_norm": 1.0679528713226318, + "learning_rate": 0.00017356122227119418, + "loss": 1.7757, + "step": 17392 + }, + { + "epoch": 2.029284797573212, + "grad_norm": 1.3140270709991455, + "learning_rate": 0.0001735462699734382, + "loss": 1.9877, + "step": 17393 + }, + { + "epoch": 2.0294014700735037, + "grad_norm": 1.1685998439788818, + "learning_rate": 0.0001735313174471309, + "loss": 1.8947, + "step": 17394 + }, + { + "epoch": 2.0295181425737954, + "grad_norm": 1.2250579595565796, + "learning_rate": 0.0001735163646924273, + "loss": 1.929, + "step": 17395 + }, + { + "epoch": 2.029634815074087, + "grad_norm": 1.0686147212982178, + "learning_rate": 0.00017350141170948232, + "loss": 1.9244, + "step": 17396 + }, + { + "epoch": 2.0297514875743787, + "grad_norm": 1.132218599319458, + "learning_rate": 0.00017348645849845108, + "loss": 1.8726, + "step": 17397 + }, + { + "epoch": 2.0298681600746704, + "grad_norm": 1.412967324256897, + "learning_rate": 0.00017347150505948858, + "loss": 1.859, + "step": 17398 + }, + { + "epoch": 2.029984832574962, + "grad_norm": 1.047875165939331, + "learning_rate": 0.00017345655139274977, + "loss": 1.8674, + "step": 17399 + }, + { + "epoch": 2.0301015050752538, + "grad_norm": 1.1924854516983032, + "learning_rate": 0.00017344159749838978, + "loss": 2.1438, + "step": 17400 + }, + { + "epoch": 2.0302181775755455, + "grad_norm": 1.157677412033081, + "learning_rate": 0.00017342664337656358, + "loss": 1.8385, + "step": 17401 + }, + { + "epoch": 2.030334850075837, + "grad_norm": 1.130258321762085, + "learning_rate": 0.0001734116890274262, + "loss": 2.0438, + "step": 17402 + }, + { + "epoch": 2.030451522576129, + "grad_norm": 1.4274177551269531, + "learning_rate": 0.00017339673445113265, + "loss": 2.2864, + "step": 17403 + }, + { + "epoch": 2.0305681950764205, + "grad_norm": 1.1484992504119873, + "learning_rate": 0.000173381779647838, + "loss": 2.0589, + "step": 17404 + }, + { + "epoch": 2.030684867576712, + "grad_norm": 1.2564655542373657, + "learning_rate": 0.0001733668246176972, + "loss": 1.9949, + "step": 17405 + }, + { + "epoch": 2.030801540077004, + "grad_norm": 1.2792707681655884, + "learning_rate": 0.00017335186936086542, + "loss": 1.9708, + "step": 17406 + }, + { + "epoch": 2.0309182125772955, + "grad_norm": 1.3995200395584106, + "learning_rate": 0.00017333691387749764, + "loss": 1.9845, + "step": 17407 + }, + { + "epoch": 2.0310348850775872, + "grad_norm": 1.301720142364502, + "learning_rate": 0.00017332195816774885, + "loss": 1.9561, + "step": 17408 + }, + { + "epoch": 2.031151557577879, + "grad_norm": 1.255037546157837, + "learning_rate": 0.00017330700223177416, + "loss": 2.0727, + "step": 17409 + }, + { + "epoch": 2.0312682300781706, + "grad_norm": 1.0845856666564941, + "learning_rate": 0.00017329204606972867, + "loss": 1.9632, + "step": 17410 + }, + { + "epoch": 2.0313849025784623, + "grad_norm": 1.4134842157363892, + "learning_rate": 0.00017327708968176728, + "loss": 1.9891, + "step": 17411 + }, + { + "epoch": 2.031501575078754, + "grad_norm": 1.4395513534545898, + "learning_rate": 0.00017326213306804512, + "loss": 1.9203, + "step": 17412 + }, + { + "epoch": 2.0316182475790456, + "grad_norm": 1.157034993171692, + "learning_rate": 0.00017324717622871726, + "loss": 2.0122, + "step": 17413 + }, + { + "epoch": 2.0317349200793373, + "grad_norm": 1.0385998487472534, + "learning_rate": 0.0001732322191639387, + "loss": 1.8489, + "step": 17414 + }, + { + "epoch": 2.031851592579629, + "grad_norm": 1.1999123096466064, + "learning_rate": 0.00017321726187386456, + "loss": 2.0569, + "step": 17415 + }, + { + "epoch": 2.0319682650799207, + "grad_norm": 1.339416265487671, + "learning_rate": 0.0001732023043586499, + "loss": 2.2182, + "step": 17416 + }, + { + "epoch": 2.0320849375802124, + "grad_norm": 1.178586721420288, + "learning_rate": 0.00017318734661844973, + "loss": 2.0957, + "step": 17417 + }, + { + "epoch": 2.032201610080504, + "grad_norm": 1.187853217124939, + "learning_rate": 0.00017317238865341913, + "loss": 1.8043, + "step": 17418 + }, + { + "epoch": 2.0323182825807957, + "grad_norm": 1.3733932971954346, + "learning_rate": 0.00017315743046371321, + "loss": 1.9782, + "step": 17419 + }, + { + "epoch": 2.0324349550810874, + "grad_norm": 1.2607972621917725, + "learning_rate": 0.000173142472049487, + "loss": 1.9752, + "step": 17420 + }, + { + "epoch": 2.032551627581379, + "grad_norm": 1.3733155727386475, + "learning_rate": 0.00017312751341089558, + "loss": 2.0847, + "step": 17421 + }, + { + "epoch": 2.0326683000816708, + "grad_norm": 1.067628264427185, + "learning_rate": 0.00017311255454809406, + "loss": 1.8725, + "step": 17422 + }, + { + "epoch": 2.0327849725819624, + "grad_norm": 1.158402681350708, + "learning_rate": 0.00017309759546123743, + "loss": 1.9851, + "step": 17423 + }, + { + "epoch": 2.032901645082254, + "grad_norm": 1.2619309425354004, + "learning_rate": 0.0001730826361504809, + "loss": 1.939, + "step": 17424 + }, + { + "epoch": 2.033018317582546, + "grad_norm": 1.2927964925765991, + "learning_rate": 0.0001730676766159794, + "loss": 2.0357, + "step": 17425 + }, + { + "epoch": 2.0331349900828375, + "grad_norm": 1.2502119541168213, + "learning_rate": 0.0001730527168578881, + "loss": 2.1149, + "step": 17426 + }, + { + "epoch": 2.033251662583129, + "grad_norm": 1.0546575784683228, + "learning_rate": 0.0001730377568763621, + "loss": 1.8234, + "step": 17427 + }, + { + "epoch": 2.033368335083421, + "grad_norm": 1.2234039306640625, + "learning_rate": 0.00017302279667155652, + "loss": 2.0754, + "step": 17428 + }, + { + "epoch": 2.0334850075837125, + "grad_norm": 1.376697063446045, + "learning_rate": 0.00017300783624362635, + "loss": 2.095, + "step": 17429 + }, + { + "epoch": 2.033601680084004, + "grad_norm": 1.1838843822479248, + "learning_rate": 0.0001729928755927267, + "loss": 1.9945, + "step": 17430 + }, + { + "epoch": 2.033718352584296, + "grad_norm": 1.0596016645431519, + "learning_rate": 0.00017297791471901266, + "loss": 2.0, + "step": 17431 + }, + { + "epoch": 2.0338350250845876, + "grad_norm": 1.1641052961349487, + "learning_rate": 0.00017296295362263943, + "loss": 1.8617, + "step": 17432 + }, + { + "epoch": 2.0339516975848793, + "grad_norm": 1.1316229104995728, + "learning_rate": 0.00017294799230376204, + "loss": 1.8862, + "step": 17433 + }, + { + "epoch": 2.034068370085171, + "grad_norm": 1.1748560667037964, + "learning_rate": 0.00017293303076253556, + "loss": 1.9957, + "step": 17434 + }, + { + "epoch": 2.0341850425854626, + "grad_norm": 1.140100359916687, + "learning_rate": 0.00017291806899911514, + "loss": 1.8925, + "step": 17435 + }, + { + "epoch": 2.0343017150857543, + "grad_norm": 1.144425868988037, + "learning_rate": 0.0001729031070136559, + "loss": 2.0046, + "step": 17436 + }, + { + "epoch": 2.034418387586046, + "grad_norm": 1.0788934230804443, + "learning_rate": 0.00017288814480631297, + "loss": 1.7923, + "step": 17437 + }, + { + "epoch": 2.0345350600863377, + "grad_norm": 1.231398344039917, + "learning_rate": 0.00017287318237724133, + "loss": 2.0217, + "step": 17438 + }, + { + "epoch": 2.0346517325866293, + "grad_norm": 1.1006972789764404, + "learning_rate": 0.00017285821972659623, + "loss": 1.9967, + "step": 17439 + }, + { + "epoch": 2.034768405086921, + "grad_norm": 1.4249470233917236, + "learning_rate": 0.0001728432568545327, + "loss": 1.932, + "step": 17440 + }, + { + "epoch": 2.0348850775872127, + "grad_norm": 1.1597895622253418, + "learning_rate": 0.0001728282937612059, + "loss": 2.0, + "step": 17441 + }, + { + "epoch": 2.0350017500875044, + "grad_norm": 1.3263473510742188, + "learning_rate": 0.00017281333044677098, + "loss": 2.0029, + "step": 17442 + }, + { + "epoch": 2.035118422587796, + "grad_norm": 1.1198958158493042, + "learning_rate": 0.000172798366911383, + "loss": 2.0017, + "step": 17443 + }, + { + "epoch": 2.0352350950880878, + "grad_norm": 1.2863503694534302, + "learning_rate": 0.0001727834031551971, + "loss": 2.0224, + "step": 17444 + }, + { + "epoch": 2.0353517675883794, + "grad_norm": 1.271155834197998, + "learning_rate": 0.00017276843917836846, + "loss": 2.0283, + "step": 17445 + }, + { + "epoch": 2.035468440088671, + "grad_norm": 1.234521746635437, + "learning_rate": 0.00017275347498105217, + "loss": 2.0964, + "step": 17446 + }, + { + "epoch": 2.035585112588963, + "grad_norm": 1.2877568006515503, + "learning_rate": 0.00017273851056340335, + "loss": 1.8534, + "step": 17447 + }, + { + "epoch": 2.0357017850892545, + "grad_norm": 1.267322301864624, + "learning_rate": 0.00017272354592557716, + "loss": 1.9764, + "step": 17448 + }, + { + "epoch": 2.035818457589546, + "grad_norm": 1.1246986389160156, + "learning_rate": 0.00017270858106772872, + "loss": 1.9836, + "step": 17449 + }, + { + "epoch": 2.035935130089838, + "grad_norm": 1.0931037664413452, + "learning_rate": 0.00017269361599001317, + "loss": 1.979, + "step": 17450 + }, + { + "epoch": 2.0360518025901295, + "grad_norm": 1.0502198934555054, + "learning_rate": 0.00017267865069258567, + "loss": 1.9301, + "step": 17451 + }, + { + "epoch": 2.036168475090421, + "grad_norm": 1.1772197484970093, + "learning_rate": 0.00017266368517560132, + "loss": 1.9727, + "step": 17452 + }, + { + "epoch": 2.036285147590713, + "grad_norm": 1.3541430234909058, + "learning_rate": 0.00017264871943921533, + "loss": 2.0073, + "step": 17453 + }, + { + "epoch": 2.0364018200910046, + "grad_norm": 1.1057765483856201, + "learning_rate": 0.00017263375348358284, + "loss": 1.8403, + "step": 17454 + }, + { + "epoch": 2.0365184925912962, + "grad_norm": 1.1702723503112793, + "learning_rate": 0.00017261878730885896, + "loss": 1.9308, + "step": 17455 + }, + { + "epoch": 2.036635165091588, + "grad_norm": 1.322190761566162, + "learning_rate": 0.00017260382091519884, + "loss": 1.9115, + "step": 17456 + }, + { + "epoch": 2.0367518375918796, + "grad_norm": 1.1372485160827637, + "learning_rate": 0.00017258885430275763, + "loss": 1.8418, + "step": 17457 + }, + { + "epoch": 2.0368685100921713, + "grad_norm": 1.1219614744186401, + "learning_rate": 0.0001725738874716905, + "loss": 2.0859, + "step": 17458 + }, + { + "epoch": 2.036985182592463, + "grad_norm": 1.2646864652633667, + "learning_rate": 0.00017255892042215268, + "loss": 2.0687, + "step": 17459 + }, + { + "epoch": 2.0371018550927547, + "grad_norm": 1.210574746131897, + "learning_rate": 0.00017254395315429924, + "loss": 2.0303, + "step": 17460 + }, + { + "epoch": 2.0372185275930463, + "grad_norm": 1.1598899364471436, + "learning_rate": 0.00017252898566828538, + "loss": 2.0326, + "step": 17461 + }, + { + "epoch": 2.037335200093338, + "grad_norm": 1.2239996194839478, + "learning_rate": 0.00017251401796426627, + "loss": 1.9706, + "step": 17462 + }, + { + "epoch": 2.0374518725936297, + "grad_norm": 1.2187771797180176, + "learning_rate": 0.0001724990500423971, + "loss": 1.9906, + "step": 17463 + }, + { + "epoch": 2.0375685450939214, + "grad_norm": 1.2283791303634644, + "learning_rate": 0.00017248408190283294, + "loss": 1.8692, + "step": 17464 + }, + { + "epoch": 2.037685217594213, + "grad_norm": 1.1400755643844604, + "learning_rate": 0.00017246911354572904, + "loss": 2.1336, + "step": 17465 + }, + { + "epoch": 2.0378018900945047, + "grad_norm": 1.0598804950714111, + "learning_rate": 0.00017245414497124066, + "loss": 1.9287, + "step": 17466 + }, + { + "epoch": 2.0379185625947964, + "grad_norm": 1.0855520963668823, + "learning_rate": 0.0001724391761795228, + "loss": 2.0392, + "step": 17467 + }, + { + "epoch": 2.038035235095088, + "grad_norm": 1.1695001125335693, + "learning_rate": 0.00017242420717073075, + "loss": 1.9916, + "step": 17468 + }, + { + "epoch": 2.03815190759538, + "grad_norm": 1.2574710845947266, + "learning_rate": 0.00017240923794501967, + "loss": 2.0723, + "step": 17469 + }, + { + "epoch": 2.0382685800956715, + "grad_norm": 1.1334651708602905, + "learning_rate": 0.00017239426850254475, + "loss": 1.9507, + "step": 17470 + }, + { + "epoch": 2.038385252595963, + "grad_norm": 1.2307592630386353, + "learning_rate": 0.00017237929884346116, + "loss": 2.0271, + "step": 17471 + }, + { + "epoch": 2.038501925096255, + "grad_norm": 1.1283221244812012, + "learning_rate": 0.00017236432896792413, + "loss": 1.8464, + "step": 17472 + }, + { + "epoch": 2.0386185975965465, + "grad_norm": 1.6180232763290405, + "learning_rate": 0.00017234935887608876, + "loss": 1.9021, + "step": 17473 + }, + { + "epoch": 2.038735270096838, + "grad_norm": 1.1908138990402222, + "learning_rate": 0.00017233438856811037, + "loss": 2.0344, + "step": 17474 + }, + { + "epoch": 2.03885194259713, + "grad_norm": 1.132026195526123, + "learning_rate": 0.00017231941804414402, + "loss": 1.8582, + "step": 17475 + }, + { + "epoch": 2.0389686150974216, + "grad_norm": 1.0967131853103638, + "learning_rate": 0.000172304447304345, + "loss": 1.9673, + "step": 17476 + }, + { + "epoch": 2.0390852875977132, + "grad_norm": 1.242741584777832, + "learning_rate": 0.00017228947634886853, + "loss": 2.1032, + "step": 17477 + }, + { + "epoch": 2.039201960098005, + "grad_norm": 1.3813060522079468, + "learning_rate": 0.00017227450517786976, + "loss": 2.0386, + "step": 17478 + }, + { + "epoch": 2.0393186325982966, + "grad_norm": 1.1296799182891846, + "learning_rate": 0.00017225953379150388, + "loss": 1.8999, + "step": 17479 + }, + { + "epoch": 2.0394353050985883, + "grad_norm": 1.2385765314102173, + "learning_rate": 0.00017224456218992618, + "loss": 2.0789, + "step": 17480 + }, + { + "epoch": 2.03955197759888, + "grad_norm": 1.0741850137710571, + "learning_rate": 0.0001722295903732917, + "loss": 1.9617, + "step": 17481 + }, + { + "epoch": 2.0396686500991716, + "grad_norm": 1.2381550073623657, + "learning_rate": 0.00017221461834175585, + "loss": 1.8213, + "step": 17482 + }, + { + "epoch": 2.0397853225994633, + "grad_norm": 1.475364089012146, + "learning_rate": 0.0001721996460954737, + "loss": 2.0325, + "step": 17483 + }, + { + "epoch": 2.039901995099755, + "grad_norm": 1.1533387899398804, + "learning_rate": 0.00017218467363460052, + "loss": 2.0228, + "step": 17484 + }, + { + "epoch": 2.0400186676000467, + "grad_norm": 1.2937705516815186, + "learning_rate": 0.00017216970095929154, + "loss": 2.0084, + "step": 17485 + }, + { + "epoch": 2.0401353401003384, + "grad_norm": 1.2941656112670898, + "learning_rate": 0.00017215472806970199, + "loss": 2.0537, + "step": 17486 + }, + { + "epoch": 2.04025201260063, + "grad_norm": 1.1709574460983276, + "learning_rate": 0.00017213975496598705, + "loss": 1.9019, + "step": 17487 + }, + { + "epoch": 2.0403686851009217, + "grad_norm": 1.238289475440979, + "learning_rate": 0.000172124781648302, + "loss": 1.988, + "step": 17488 + }, + { + "epoch": 2.0404853576012134, + "grad_norm": 1.5629066228866577, + "learning_rate": 0.00017210980811680206, + "loss": 1.9923, + "step": 17489 + }, + { + "epoch": 2.040602030101505, + "grad_norm": 1.1747123003005981, + "learning_rate": 0.00017209483437164233, + "loss": 1.8643, + "step": 17490 + }, + { + "epoch": 2.0407187026017968, + "grad_norm": 1.1894049644470215, + "learning_rate": 0.00017207986041297817, + "loss": 2.0175, + "step": 17491 + }, + { + "epoch": 2.0408353751020885, + "grad_norm": 1.0677381753921509, + "learning_rate": 0.00017206488624096482, + "loss": 1.8315, + "step": 17492 + }, + { + "epoch": 2.04095204760238, + "grad_norm": 1.0717742443084717, + "learning_rate": 0.00017204991185575748, + "loss": 1.9487, + "step": 17493 + }, + { + "epoch": 2.041068720102672, + "grad_norm": 1.1669682264328003, + "learning_rate": 0.00017203493725751138, + "loss": 2.0595, + "step": 17494 + }, + { + "epoch": 2.0411853926029635, + "grad_norm": 1.2084523439407349, + "learning_rate": 0.00017201996244638174, + "loss": 1.9671, + "step": 17495 + }, + { + "epoch": 2.041302065103255, + "grad_norm": 1.0982744693756104, + "learning_rate": 0.0001720049874225239, + "loss": 1.9172, + "step": 17496 + }, + { + "epoch": 2.041418737603547, + "grad_norm": 1.15816068649292, + "learning_rate": 0.00017199001218609297, + "loss": 1.9541, + "step": 17497 + }, + { + "epoch": 2.0415354101038385, + "grad_norm": 1.1891217231750488, + "learning_rate": 0.00017197503673724432, + "loss": 2.0131, + "step": 17498 + }, + { + "epoch": 2.0416520826041302, + "grad_norm": 1.0925785303115845, + "learning_rate": 0.00017196006107613308, + "loss": 1.9055, + "step": 17499 + }, + { + "epoch": 2.041768755104422, + "grad_norm": 1.3030321598052979, + "learning_rate": 0.00017194508520291458, + "loss": 2.2549, + "step": 17500 + }, + { + "epoch": 2.0418854276047136, + "grad_norm": 1.2398560047149658, + "learning_rate": 0.00017193010911774407, + "loss": 2.093, + "step": 17501 + }, + { + "epoch": 2.0420021001050053, + "grad_norm": 1.298370122909546, + "learning_rate": 0.00017191513282077678, + "loss": 1.9604, + "step": 17502 + }, + { + "epoch": 2.042118772605297, + "grad_norm": 1.0502654314041138, + "learning_rate": 0.000171900156312168, + "loss": 1.893, + "step": 17503 + }, + { + "epoch": 2.0422354451055886, + "grad_norm": 1.1402473449707031, + "learning_rate": 0.00017188517959207294, + "loss": 1.8172, + "step": 17504 + }, + { + "epoch": 2.0423521176058803, + "grad_norm": 1.2838973999023438, + "learning_rate": 0.00017187020266064694, + "loss": 2.0928, + "step": 17505 + }, + { + "epoch": 2.042468790106172, + "grad_norm": 1.1872860193252563, + "learning_rate": 0.00017185522551804518, + "loss": 1.9146, + "step": 17506 + }, + { + "epoch": 2.0425854626064637, + "grad_norm": 1.1803346872329712, + "learning_rate": 0.00017184024816442297, + "loss": 2.118, + "step": 17507 + }, + { + "epoch": 2.0427021351067554, + "grad_norm": 1.1998279094696045, + "learning_rate": 0.00017182527059993554, + "loss": 2.1514, + "step": 17508 + }, + { + "epoch": 2.042818807607047, + "grad_norm": 1.1674480438232422, + "learning_rate": 0.00017181029282473825, + "loss": 2.0257, + "step": 17509 + }, + { + "epoch": 2.0429354801073387, + "grad_norm": 1.2373236417770386, + "learning_rate": 0.00017179531483898624, + "loss": 1.7747, + "step": 17510 + }, + { + "epoch": 2.0430521526076304, + "grad_norm": 1.2417089939117432, + "learning_rate": 0.0001717803366428349, + "loss": 2.0279, + "step": 17511 + }, + { + "epoch": 2.043168825107922, + "grad_norm": 1.1328840255737305, + "learning_rate": 0.00017176535823643947, + "loss": 1.8544, + "step": 17512 + }, + { + "epoch": 2.0432854976082138, + "grad_norm": 1.180524230003357, + "learning_rate": 0.00017175037961995526, + "loss": 1.9991, + "step": 17513 + }, + { + "epoch": 2.0434021701085054, + "grad_norm": 1.2758980989456177, + "learning_rate": 0.00017173540079353745, + "loss": 1.9126, + "step": 17514 + }, + { + "epoch": 2.043518842608797, + "grad_norm": 1.293660283088684, + "learning_rate": 0.00017172042175734142, + "loss": 1.9803, + "step": 17515 + }, + { + "epoch": 2.043635515109089, + "grad_norm": 1.368306040763855, + "learning_rate": 0.00017170544251152243, + "loss": 2.0887, + "step": 17516 + }, + { + "epoch": 2.0437521876093805, + "grad_norm": 1.0660799741744995, + "learning_rate": 0.00017169046305623575, + "loss": 1.7266, + "step": 17517 + }, + { + "epoch": 2.043868860109672, + "grad_norm": 1.0961334705352783, + "learning_rate": 0.0001716754833916367, + "loss": 2.1381, + "step": 17518 + }, + { + "epoch": 2.043985532609964, + "grad_norm": 1.0419620275497437, + "learning_rate": 0.00017166050351788057, + "loss": 1.9587, + "step": 17519 + }, + { + "epoch": 2.0441022051102555, + "grad_norm": 1.259543538093567, + "learning_rate": 0.00017164552343512264, + "loss": 2.0641, + "step": 17520 + }, + { + "epoch": 2.044218877610547, + "grad_norm": 1.1420389413833618, + "learning_rate": 0.00017163054314351823, + "loss": 2.0277, + "step": 17521 + }, + { + "epoch": 2.044335550110839, + "grad_norm": 1.1544073820114136, + "learning_rate": 0.00017161556264322258, + "loss": 2.0652, + "step": 17522 + }, + { + "epoch": 2.0444522226111306, + "grad_norm": 1.1884503364562988, + "learning_rate": 0.00017160058193439105, + "loss": 1.9593, + "step": 17523 + }, + { + "epoch": 2.0445688951114223, + "grad_norm": 1.1971994638442993, + "learning_rate": 0.0001715856010171789, + "loss": 1.8979, + "step": 17524 + }, + { + "epoch": 2.044685567611714, + "grad_norm": 1.2522133588790894, + "learning_rate": 0.00017157061989174147, + "loss": 2.0884, + "step": 17525 + }, + { + "epoch": 2.0448022401120056, + "grad_norm": 1.0401216745376587, + "learning_rate": 0.0001715556385582341, + "loss": 1.8166, + "step": 17526 + }, + { + "epoch": 2.0449189126122973, + "grad_norm": 1.2857797145843506, + "learning_rate": 0.00017154065701681202, + "loss": 1.8668, + "step": 17527 + }, + { + "epoch": 2.045035585112589, + "grad_norm": 1.0941895246505737, + "learning_rate": 0.00017152567526763058, + "loss": 1.9731, + "step": 17528 + }, + { + "epoch": 2.0451522576128807, + "grad_norm": 1.2010070085525513, + "learning_rate": 0.0001715106933108451, + "loss": 1.872, + "step": 17529 + }, + { + "epoch": 2.0452689301131723, + "grad_norm": 1.021754264831543, + "learning_rate": 0.00017149571114661092, + "loss": 1.9926, + "step": 17530 + }, + { + "epoch": 2.045385602613464, + "grad_norm": 1.3742969036102295, + "learning_rate": 0.00017148072877508334, + "loss": 1.9011, + "step": 17531 + }, + { + "epoch": 2.0455022751137557, + "grad_norm": 1.137384057044983, + "learning_rate": 0.0001714657461964176, + "loss": 2.0083, + "step": 17532 + }, + { + "epoch": 2.0456189476140474, + "grad_norm": 1.164903998374939, + "learning_rate": 0.00017145076341076916, + "loss": 1.8439, + "step": 17533 + }, + { + "epoch": 2.045735620114339, + "grad_norm": 1.2232916355133057, + "learning_rate": 0.00017143578041829322, + "loss": 1.8002, + "step": 17534 + }, + { + "epoch": 2.0458522926146308, + "grad_norm": 1.1116775274276733, + "learning_rate": 0.0001714207972191452, + "loss": 1.8496, + "step": 17535 + }, + { + "epoch": 2.0459689651149224, + "grad_norm": 1.1021125316619873, + "learning_rate": 0.00017140581381348033, + "loss": 1.9897, + "step": 17536 + }, + { + "epoch": 2.046085637615214, + "grad_norm": 1.2971816062927246, + "learning_rate": 0.00017139083020145408, + "loss": 2.0203, + "step": 17537 + }, + { + "epoch": 2.046202310115506, + "grad_norm": 1.2836048603057861, + "learning_rate": 0.00017137584638322168, + "loss": 1.8795, + "step": 17538 + }, + { + "epoch": 2.0463189826157975, + "grad_norm": 1.1812961101531982, + "learning_rate": 0.00017136086235893853, + "loss": 1.8803, + "step": 17539 + }, + { + "epoch": 2.046435655116089, + "grad_norm": 1.1291663646697998, + "learning_rate": 0.00017134587812875989, + "loss": 2.1568, + "step": 17540 + }, + { + "epoch": 2.046552327616381, + "grad_norm": 1.4264631271362305, + "learning_rate": 0.00017133089369284114, + "loss": 2.05, + "step": 17541 + }, + { + "epoch": 2.0466690001166725, + "grad_norm": 1.16372811794281, + "learning_rate": 0.00017131590905133763, + "loss": 2.0592, + "step": 17542 + }, + { + "epoch": 2.046785672616964, + "grad_norm": 1.0833003520965576, + "learning_rate": 0.0001713009242044047, + "loss": 1.8297, + "step": 17543 + }, + { + "epoch": 2.046902345117256, + "grad_norm": 1.1384655237197876, + "learning_rate": 0.00017128593915219768, + "loss": 2.04, + "step": 17544 + }, + { + "epoch": 2.0470190176175476, + "grad_norm": 1.169385552406311, + "learning_rate": 0.00017127095389487198, + "loss": 1.8996, + "step": 17545 + }, + { + "epoch": 2.0471356901178392, + "grad_norm": 1.1623495817184448, + "learning_rate": 0.00017125596843258286, + "loss": 1.8154, + "step": 17546 + }, + { + "epoch": 2.047252362618131, + "grad_norm": 1.2433909177780151, + "learning_rate": 0.00017124098276548576, + "loss": 1.9681, + "step": 17547 + }, + { + "epoch": 2.0473690351184226, + "grad_norm": 1.1756733655929565, + "learning_rate": 0.00017122599689373594, + "loss": 1.8936, + "step": 17548 + }, + { + "epoch": 2.0474857076187143, + "grad_norm": 1.1193149089813232, + "learning_rate": 0.00017121101081748883, + "loss": 1.9781, + "step": 17549 + }, + { + "epoch": 2.047602380119006, + "grad_norm": 1.1169767379760742, + "learning_rate": 0.00017119602453689978, + "loss": 1.8653, + "step": 17550 + }, + { + "epoch": 2.0477190526192977, + "grad_norm": 1.1253825426101685, + "learning_rate": 0.0001711810380521241, + "loss": 2.0048, + "step": 17551 + }, + { + "epoch": 2.0478357251195893, + "grad_norm": 1.4296776056289673, + "learning_rate": 0.0001711660513633172, + "loss": 2.0274, + "step": 17552 + }, + { + "epoch": 2.047952397619881, + "grad_norm": 1.1275392770767212, + "learning_rate": 0.00017115106447063448, + "loss": 1.9735, + "step": 17553 + }, + { + "epoch": 2.0480690701201727, + "grad_norm": 1.3075189590454102, + "learning_rate": 0.00017113607737423124, + "loss": 1.9483, + "step": 17554 + }, + { + "epoch": 2.0481857426204644, + "grad_norm": 1.0840119123458862, + "learning_rate": 0.00017112109007426284, + "loss": 1.8291, + "step": 17555 + }, + { + "epoch": 2.048302415120756, + "grad_norm": 1.0852335691452026, + "learning_rate": 0.00017110610257088478, + "loss": 1.7212, + "step": 17556 + }, + { + "epoch": 2.0484190876210477, + "grad_norm": 1.1937470436096191, + "learning_rate": 0.00017109111486425228, + "loss": 1.9676, + "step": 17557 + }, + { + "epoch": 2.0485357601213394, + "grad_norm": 1.1907727718353271, + "learning_rate": 0.00017107612695452078, + "loss": 1.8202, + "step": 17558 + }, + { + "epoch": 2.048652432621631, + "grad_norm": 1.1185479164123535, + "learning_rate": 0.00017106113884184565, + "loss": 2.0217, + "step": 17559 + }, + { + "epoch": 2.048769105121923, + "grad_norm": 1.2339084148406982, + "learning_rate": 0.00017104615052638224, + "loss": 1.9803, + "step": 17560 + }, + { + "epoch": 2.0488857776222145, + "grad_norm": 1.2570767402648926, + "learning_rate": 0.00017103116200828603, + "loss": 1.9237, + "step": 17561 + }, + { + "epoch": 2.049002450122506, + "grad_norm": 1.065102219581604, + "learning_rate": 0.00017101617328771227, + "loss": 1.918, + "step": 17562 + }, + { + "epoch": 2.049119122622798, + "grad_norm": 1.2673530578613281, + "learning_rate": 0.00017100118436481643, + "loss": 1.8859, + "step": 17563 + }, + { + "epoch": 2.0492357951230895, + "grad_norm": 1.1334664821624756, + "learning_rate": 0.00017098619523975395, + "loss": 1.8062, + "step": 17564 + }, + { + "epoch": 2.049352467623381, + "grad_norm": 1.255239486694336, + "learning_rate": 0.00017097120591268012, + "loss": 1.9993, + "step": 17565 + }, + { + "epoch": 2.049469140123673, + "grad_norm": 1.3705617189407349, + "learning_rate": 0.00017095621638375038, + "loss": 2.0555, + "step": 17566 + }, + { + "epoch": 2.0495858126239646, + "grad_norm": 1.1624469757080078, + "learning_rate": 0.0001709412266531201, + "loss": 1.9058, + "step": 17567 + }, + { + "epoch": 2.0497024851242562, + "grad_norm": 1.1260861158370972, + "learning_rate": 0.00017092623672094467, + "loss": 2.1325, + "step": 17568 + }, + { + "epoch": 2.049819157624548, + "grad_norm": 1.1239231824874878, + "learning_rate": 0.0001709112465873795, + "loss": 1.7999, + "step": 17569 + }, + { + "epoch": 2.0499358301248396, + "grad_norm": 1.4095295667648315, + "learning_rate": 0.00017089625625258008, + "loss": 2.0354, + "step": 17570 + }, + { + "epoch": 2.0500525026251313, + "grad_norm": 1.1808608770370483, + "learning_rate": 0.00017088126571670166, + "loss": 1.871, + "step": 17571 + }, + { + "epoch": 2.050169175125423, + "grad_norm": 1.251423954963684, + "learning_rate": 0.00017086627497989973, + "loss": 2.0289, + "step": 17572 + }, + { + "epoch": 2.0502858476257146, + "grad_norm": 1.1300112009048462, + "learning_rate": 0.00017085128404232972, + "loss": 1.7611, + "step": 17573 + }, + { + "epoch": 2.0504025201260063, + "grad_norm": 1.1681592464447021, + "learning_rate": 0.00017083629290414704, + "loss": 2.1663, + "step": 17574 + }, + { + "epoch": 2.050519192626298, + "grad_norm": 1.1647348403930664, + "learning_rate": 0.00017082130156550692, + "loss": 1.8215, + "step": 17575 + }, + { + "epoch": 2.0506358651265897, + "grad_norm": 1.2193974256515503, + "learning_rate": 0.00017080631002656504, + "loss": 1.9876, + "step": 17576 + }, + { + "epoch": 2.0507525376268814, + "grad_norm": 1.159178376197815, + "learning_rate": 0.0001707913182874767, + "loss": 1.9536, + "step": 17577 + }, + { + "epoch": 2.050869210127173, + "grad_norm": 1.0849274396896362, + "learning_rate": 0.0001707763263483973, + "loss": 2.0071, + "step": 17578 + }, + { + "epoch": 2.0509858826274647, + "grad_norm": 1.448137640953064, + "learning_rate": 0.00017076133420948223, + "loss": 2.0181, + "step": 17579 + }, + { + "epoch": 2.0511025551277564, + "grad_norm": 1.0377333164215088, + "learning_rate": 0.00017074634187088698, + "loss": 1.9198, + "step": 17580 + }, + { + "epoch": 2.051219227628048, + "grad_norm": 1.2437009811401367, + "learning_rate": 0.00017073134933276695, + "loss": 1.8138, + "step": 17581 + }, + { + "epoch": 2.0513359001283398, + "grad_norm": 1.2810988426208496, + "learning_rate": 0.00017071635659527765, + "loss": 2.0059, + "step": 17582 + }, + { + "epoch": 2.0514525726286315, + "grad_norm": 1.1315902471542358, + "learning_rate": 0.00017070136365857435, + "loss": 1.9832, + "step": 17583 + }, + { + "epoch": 2.051569245128923, + "grad_norm": 1.3077161312103271, + "learning_rate": 0.00017068637052281254, + "loss": 2.0452, + "step": 17584 + }, + { + "epoch": 2.051685917629215, + "grad_norm": 1.3203214406967163, + "learning_rate": 0.0001706713771881477, + "loss": 2.0477, + "step": 17585 + }, + { + "epoch": 2.0518025901295065, + "grad_norm": 1.0484507083892822, + "learning_rate": 0.0001706563836547352, + "loss": 1.7515, + "step": 17586 + }, + { + "epoch": 2.051919262629798, + "grad_norm": 1.2403392791748047, + "learning_rate": 0.00017064138992273052, + "loss": 1.8707, + "step": 17587 + }, + { + "epoch": 2.05203593513009, + "grad_norm": 1.2677981853485107, + "learning_rate": 0.00017062639599228904, + "loss": 1.9776, + "step": 17588 + }, + { + "epoch": 2.0521526076303815, + "grad_norm": 1.0719785690307617, + "learning_rate": 0.0001706114018635663, + "loss": 1.9367, + "step": 17589 + }, + { + "epoch": 2.0522692801306732, + "grad_norm": 1.2162281274795532, + "learning_rate": 0.0001705964075367177, + "loss": 1.8686, + "step": 17590 + }, + { + "epoch": 2.052385952630965, + "grad_norm": 1.318765640258789, + "learning_rate": 0.00017058141301189866, + "loss": 1.9529, + "step": 17591 + }, + { + "epoch": 2.0525026251312566, + "grad_norm": 1.2770631313323975, + "learning_rate": 0.0001705664182892647, + "loss": 2.0247, + "step": 17592 + }, + { + "epoch": 2.0526192976315483, + "grad_norm": 1.0991671085357666, + "learning_rate": 0.0001705514233689711, + "loss": 1.963, + "step": 17593 + }, + { + "epoch": 2.05273597013184, + "grad_norm": 1.1800477504730225, + "learning_rate": 0.00017053642825117345, + "loss": 1.8734, + "step": 17594 + }, + { + "epoch": 2.0528526426321316, + "grad_norm": 1.1884082555770874, + "learning_rate": 0.00017052143293602723, + "loss": 2.018, + "step": 17595 + }, + { + "epoch": 2.0529693151324233, + "grad_norm": 1.0742852687835693, + "learning_rate": 0.00017050643742368778, + "loss": 1.8606, + "step": 17596 + }, + { + "epoch": 2.053085987632715, + "grad_norm": 1.1423616409301758, + "learning_rate": 0.00017049144171431063, + "loss": 1.88, + "step": 17597 + }, + { + "epoch": 2.0532026601330067, + "grad_norm": 1.264299988746643, + "learning_rate": 0.00017047644580805125, + "loss": 1.9881, + "step": 17598 + }, + { + "epoch": 2.0533193326332984, + "grad_norm": 1.2013826370239258, + "learning_rate": 0.00017046144970506502, + "loss": 1.8818, + "step": 17599 + }, + { + "epoch": 2.05343600513359, + "grad_norm": 1.1601053476333618, + "learning_rate": 0.00017044645340550752, + "loss": 1.8733, + "step": 17600 + }, + { + "epoch": 2.0535526776338817, + "grad_norm": 1.4962685108184814, + "learning_rate": 0.0001704314569095341, + "loss": 1.9212, + "step": 17601 + }, + { + "epoch": 2.0536693501341734, + "grad_norm": 1.1417655944824219, + "learning_rate": 0.0001704164602173003, + "loss": 1.9082, + "step": 17602 + }, + { + "epoch": 2.053786022634465, + "grad_norm": 1.1661429405212402, + "learning_rate": 0.00017040146332896158, + "loss": 1.9705, + "step": 17603 + }, + { + "epoch": 2.0539026951347568, + "grad_norm": 1.042108416557312, + "learning_rate": 0.00017038646624467337, + "loss": 1.9058, + "step": 17604 + }, + { + "epoch": 2.0540193676350484, + "grad_norm": 1.056638240814209, + "learning_rate": 0.0001703714689645912, + "loss": 2.0102, + "step": 17605 + }, + { + "epoch": 2.05413604013534, + "grad_norm": 1.2436726093292236, + "learning_rate": 0.0001703564714888705, + "loss": 1.9972, + "step": 17606 + }, + { + "epoch": 2.054252712635632, + "grad_norm": 1.293363332748413, + "learning_rate": 0.00017034147381766678, + "loss": 1.9164, + "step": 17607 + }, + { + "epoch": 2.0543693851359235, + "grad_norm": 1.1368002891540527, + "learning_rate": 0.00017032647595113557, + "loss": 1.9789, + "step": 17608 + }, + { + "epoch": 2.054486057636215, + "grad_norm": 1.3209065198898315, + "learning_rate": 0.00017031147788943219, + "loss": 1.9497, + "step": 17609 + }, + { + "epoch": 2.054602730136507, + "grad_norm": 1.1216922998428345, + "learning_rate": 0.00017029647963271225, + "loss": 2.0223, + "step": 17610 + }, + { + "epoch": 2.0547194026367985, + "grad_norm": 1.492936134338379, + "learning_rate": 0.0001702814811811312, + "loss": 2.0885, + "step": 17611 + }, + { + "epoch": 2.05483607513709, + "grad_norm": 1.162959098815918, + "learning_rate": 0.00017026648253484452, + "loss": 1.9197, + "step": 17612 + }, + { + "epoch": 2.054952747637382, + "grad_norm": 1.102778673171997, + "learning_rate": 0.0001702514836940077, + "loss": 2.0031, + "step": 17613 + }, + { + "epoch": 2.0550694201376736, + "grad_norm": 1.1101030111312866, + "learning_rate": 0.00017023648465877623, + "loss": 1.9977, + "step": 17614 + }, + { + "epoch": 2.0551860926379653, + "grad_norm": 1.169111728668213, + "learning_rate": 0.00017022148542930566, + "loss": 1.8162, + "step": 17615 + }, + { + "epoch": 2.055302765138257, + "grad_norm": 1.3186930418014526, + "learning_rate": 0.00017020648600575146, + "loss": 2.0461, + "step": 17616 + }, + { + "epoch": 2.0554194376385486, + "grad_norm": 1.32343590259552, + "learning_rate": 0.00017019148638826908, + "loss": 1.9715, + "step": 17617 + }, + { + "epoch": 2.0555361101388403, + "grad_norm": 1.085974931716919, + "learning_rate": 0.00017017648657701408, + "loss": 1.7836, + "step": 17618 + }, + { + "epoch": 2.055652782639132, + "grad_norm": 1.260448694229126, + "learning_rate": 0.00017016148657214187, + "loss": 1.8437, + "step": 17619 + }, + { + "epoch": 2.0557694551394237, + "grad_norm": 1.2356191873550415, + "learning_rate": 0.00017014648637380807, + "loss": 2.2652, + "step": 17620 + }, + { + "epoch": 2.0558861276397153, + "grad_norm": 1.0872670412063599, + "learning_rate": 0.00017013148598216807, + "loss": 2.0343, + "step": 17621 + }, + { + "epoch": 2.056002800140007, + "grad_norm": 1.2148404121398926, + "learning_rate": 0.00017011648539737748, + "loss": 1.9548, + "step": 17622 + }, + { + "epoch": 2.0561194726402987, + "grad_norm": 1.1926591396331787, + "learning_rate": 0.00017010148461959177, + "loss": 1.9741, + "step": 17623 + }, + { + "epoch": 2.0562361451405904, + "grad_norm": 1.1439788341522217, + "learning_rate": 0.00017008648364896643, + "loss": 1.9924, + "step": 17624 + }, + { + "epoch": 2.056352817640882, + "grad_norm": 2.0336806774139404, + "learning_rate": 0.00017007148248565694, + "loss": 2.0292, + "step": 17625 + }, + { + "epoch": 2.0564694901411738, + "grad_norm": 1.0969659090042114, + "learning_rate": 0.000170056481129819, + "loss": 1.9372, + "step": 17626 + }, + { + "epoch": 2.0565861626414654, + "grad_norm": 1.2155663967132568, + "learning_rate": 0.0001700414795816079, + "loss": 2.0905, + "step": 17627 + }, + { + "epoch": 2.056702835141757, + "grad_norm": 1.239682674407959, + "learning_rate": 0.00017002647784117924, + "loss": 2.0468, + "step": 17628 + }, + { + "epoch": 2.056819507642049, + "grad_norm": 1.2942819595336914, + "learning_rate": 0.00017001147590868854, + "loss": 2.0721, + "step": 17629 + }, + { + "epoch": 2.0569361801423405, + "grad_norm": 1.0972751379013062, + "learning_rate": 0.00016999647378429143, + "loss": 1.906, + "step": 17630 + }, + { + "epoch": 2.057052852642632, + "grad_norm": 1.2347558736801147, + "learning_rate": 0.00016998147146814324, + "loss": 1.8955, + "step": 17631 + }, + { + "epoch": 2.057169525142924, + "grad_norm": 1.0323026180267334, + "learning_rate": 0.00016996646896039967, + "loss": 1.874, + "step": 17632 + }, + { + "epoch": 2.0572861976432155, + "grad_norm": 0.9841123819351196, + "learning_rate": 0.00016995146626121616, + "loss": 1.8387, + "step": 17633 + }, + { + "epoch": 2.057402870143507, + "grad_norm": 1.3758972883224487, + "learning_rate": 0.0001699364633707483, + "loss": 2.0251, + "step": 17634 + }, + { + "epoch": 2.057519542643799, + "grad_norm": 1.336616039276123, + "learning_rate": 0.00016992146028915156, + "loss": 2.0644, + "step": 17635 + }, + { + "epoch": 2.0576362151440906, + "grad_norm": 1.3084336519241333, + "learning_rate": 0.00016990645701658145, + "loss": 2.0454, + "step": 17636 + }, + { + "epoch": 2.0577528876443822, + "grad_norm": 1.262505054473877, + "learning_rate": 0.0001698914535531936, + "loss": 1.9897, + "step": 17637 + }, + { + "epoch": 2.057869560144674, + "grad_norm": 1.246622920036316, + "learning_rate": 0.00016987644989914348, + "loss": 2.0219, + "step": 17638 + }, + { + "epoch": 2.0579862326449656, + "grad_norm": 1.2356791496276855, + "learning_rate": 0.0001698614460545867, + "loss": 1.9892, + "step": 17639 + }, + { + "epoch": 2.0581029051452573, + "grad_norm": 1.1278828382492065, + "learning_rate": 0.00016984644201967873, + "loss": 2.0206, + "step": 17640 + }, + { + "epoch": 2.058219577645549, + "grad_norm": 0.9880744218826294, + "learning_rate": 0.00016983143779457518, + "loss": 1.9202, + "step": 17641 + }, + { + "epoch": 2.0583362501458407, + "grad_norm": 1.2385644912719727, + "learning_rate": 0.0001698164333794315, + "loss": 2.1741, + "step": 17642 + }, + { + "epoch": 2.0584529226461323, + "grad_norm": 1.3422733545303345, + "learning_rate": 0.00016980142877440335, + "loss": 2.061, + "step": 17643 + }, + { + "epoch": 2.058569595146424, + "grad_norm": 1.1768770217895508, + "learning_rate": 0.00016978642397964626, + "loss": 1.9696, + "step": 17644 + }, + { + "epoch": 2.0586862676467157, + "grad_norm": 1.0912209749221802, + "learning_rate": 0.00016977141899531572, + "loss": 1.9039, + "step": 17645 + }, + { + "epoch": 2.0588029401470074, + "grad_norm": 1.1608800888061523, + "learning_rate": 0.00016975641382156734, + "loss": 1.971, + "step": 17646 + }, + { + "epoch": 2.058919612647299, + "grad_norm": 1.159785509109497, + "learning_rate": 0.00016974140845855664, + "loss": 1.9618, + "step": 17647 + }, + { + "epoch": 2.0590362851475907, + "grad_norm": 1.2050288915634155, + "learning_rate": 0.0001697264029064392, + "loss": 1.9712, + "step": 17648 + }, + { + "epoch": 2.0591529576478824, + "grad_norm": 1.2545514106750488, + "learning_rate": 0.00016971139716537057, + "loss": 1.911, + "step": 17649 + }, + { + "epoch": 2.059269630148174, + "grad_norm": 1.1847200393676758, + "learning_rate": 0.00016969639123550634, + "loss": 2.1277, + "step": 17650 + }, + { + "epoch": 2.059386302648466, + "grad_norm": 1.2711598873138428, + "learning_rate": 0.00016968138511700212, + "loss": 1.918, + "step": 17651 + }, + { + "epoch": 2.0595029751487575, + "grad_norm": 1.1230859756469727, + "learning_rate": 0.00016966637881001332, + "loss": 1.8305, + "step": 17652 + }, + { + "epoch": 2.059619647649049, + "grad_norm": 1.2395416498184204, + "learning_rate": 0.00016965137231469564, + "loss": 1.8649, + "step": 17653 + }, + { + "epoch": 2.059736320149341, + "grad_norm": 1.2545174360275269, + "learning_rate": 0.00016963636563120454, + "loss": 2.1061, + "step": 17654 + }, + { + "epoch": 2.0598529926496325, + "grad_norm": 1.2045363187789917, + "learning_rate": 0.00016962135875969576, + "loss": 1.9924, + "step": 17655 + }, + { + "epoch": 2.059969665149924, + "grad_norm": 1.1898778676986694, + "learning_rate": 0.00016960635170032472, + "loss": 1.9359, + "step": 17656 + }, + { + "epoch": 2.060086337650216, + "grad_norm": 1.2104570865631104, + "learning_rate": 0.00016959134445324712, + "loss": 2.0869, + "step": 17657 + }, + { + "epoch": 2.0602030101505076, + "grad_norm": 1.3630471229553223, + "learning_rate": 0.00016957633701861842, + "loss": 1.9719, + "step": 17658 + }, + { + "epoch": 2.0603196826507992, + "grad_norm": 1.2024985551834106, + "learning_rate": 0.00016956132939659435, + "loss": 1.9623, + "step": 17659 + }, + { + "epoch": 2.060436355151091, + "grad_norm": 1.2844154834747314, + "learning_rate": 0.0001695463215873303, + "loss": 2.0265, + "step": 17660 + }, + { + "epoch": 2.0605530276513826, + "grad_norm": 1.1765486001968384, + "learning_rate": 0.00016953131359098199, + "loss": 1.9454, + "step": 17661 + }, + { + "epoch": 2.0606697001516743, + "grad_norm": 1.208932876586914, + "learning_rate": 0.00016951630540770497, + "loss": 1.932, + "step": 17662 + }, + { + "epoch": 2.060786372651966, + "grad_norm": 1.1307014226913452, + "learning_rate": 0.00016950129703765482, + "loss": 1.935, + "step": 17663 + }, + { + "epoch": 2.0609030451522576, + "grad_norm": 1.466382622718811, + "learning_rate": 0.00016948628848098714, + "loss": 2.2366, + "step": 17664 + }, + { + "epoch": 2.0610197176525493, + "grad_norm": 1.0666548013687134, + "learning_rate": 0.0001694712797378575, + "loss": 1.7932, + "step": 17665 + }, + { + "epoch": 2.061136390152841, + "grad_norm": 1.1422069072723389, + "learning_rate": 0.00016945627080842154, + "loss": 2.0952, + "step": 17666 + }, + { + "epoch": 2.0612530626531327, + "grad_norm": 1.2899507284164429, + "learning_rate": 0.00016944126169283482, + "loss": 2.1956, + "step": 17667 + }, + { + "epoch": 2.0613697351534244, + "grad_norm": 1.2364743947982788, + "learning_rate": 0.00016942625239125294, + "loss": 1.86, + "step": 17668 + }, + { + "epoch": 2.061486407653716, + "grad_norm": 1.2093801498413086, + "learning_rate": 0.00016941124290383152, + "loss": 1.9105, + "step": 17669 + }, + { + "epoch": 2.0616030801540077, + "grad_norm": 1.2251900434494019, + "learning_rate": 0.00016939623323072614, + "loss": 2.0219, + "step": 17670 + }, + { + "epoch": 2.0617197526542994, + "grad_norm": 1.1949647665023804, + "learning_rate": 0.00016938122337209244, + "loss": 1.896, + "step": 17671 + }, + { + "epoch": 2.061836425154591, + "grad_norm": 1.2563592195510864, + "learning_rate": 0.000169366213328086, + "loss": 1.8654, + "step": 17672 + }, + { + "epoch": 2.0619530976548828, + "grad_norm": 1.050897479057312, + "learning_rate": 0.0001693512030988624, + "loss": 1.7291, + "step": 17673 + }, + { + "epoch": 2.0620697701551745, + "grad_norm": 1.0286283493041992, + "learning_rate": 0.0001693361926845773, + "loss": 1.9293, + "step": 17674 + }, + { + "epoch": 2.062186442655466, + "grad_norm": 1.086572289466858, + "learning_rate": 0.00016932118208538624, + "loss": 2.0158, + "step": 17675 + }, + { + "epoch": 2.062303115155758, + "grad_norm": 1.042858362197876, + "learning_rate": 0.00016930617130144493, + "loss": 1.857, + "step": 17676 + }, + { + "epoch": 2.0624197876560495, + "grad_norm": 1.185302972793579, + "learning_rate": 0.0001692911603329089, + "loss": 1.9035, + "step": 17677 + }, + { + "epoch": 2.062536460156341, + "grad_norm": 1.1906931400299072, + "learning_rate": 0.00016927614917993387, + "loss": 1.895, + "step": 17678 + }, + { + "epoch": 2.062653132656633, + "grad_norm": 1.0690335035324097, + "learning_rate": 0.00016926113784267532, + "loss": 1.8853, + "step": 17679 + }, + { + "epoch": 2.0627698051569245, + "grad_norm": 1.1820310354232788, + "learning_rate": 0.00016924612632128894, + "loss": 1.9324, + "step": 17680 + }, + { + "epoch": 2.0628864776572162, + "grad_norm": 1.2399283647537231, + "learning_rate": 0.00016923111461593035, + "loss": 2.0559, + "step": 17681 + }, + { + "epoch": 2.063003150157508, + "grad_norm": 1.3039414882659912, + "learning_rate": 0.00016921610272675522, + "loss": 2.0491, + "step": 17682 + }, + { + "epoch": 2.0631198226577996, + "grad_norm": 1.2308404445648193, + "learning_rate": 0.00016920109065391914, + "loss": 1.8922, + "step": 17683 + }, + { + "epoch": 2.0632364951580913, + "grad_norm": 1.2295666933059692, + "learning_rate": 0.0001691860783975777, + "loss": 1.9334, + "step": 17684 + }, + { + "epoch": 2.063353167658383, + "grad_norm": 1.0535920858383179, + "learning_rate": 0.00016917106595788666, + "loss": 1.8451, + "step": 17685 + }, + { + "epoch": 2.0634698401586746, + "grad_norm": 1.2745671272277832, + "learning_rate": 0.00016915605333500146, + "loss": 2.0903, + "step": 17686 + }, + { + "epoch": 2.0635865126589663, + "grad_norm": 1.34523606300354, + "learning_rate": 0.00016914104052907788, + "loss": 2.1925, + "step": 17687 + }, + { + "epoch": 2.063703185159258, + "grad_norm": 1.4732933044433594, + "learning_rate": 0.0001691260275402715, + "loss": 2.0464, + "step": 17688 + }, + { + "epoch": 2.0638198576595497, + "grad_norm": 1.0431991815567017, + "learning_rate": 0.00016911101436873792, + "loss": 1.7535, + "step": 17689 + }, + { + "epoch": 2.0639365301598414, + "grad_norm": 1.1988599300384521, + "learning_rate": 0.00016909600101463286, + "loss": 2.0971, + "step": 17690 + }, + { + "epoch": 2.064053202660133, + "grad_norm": 1.1733620166778564, + "learning_rate": 0.00016908098747811192, + "loss": 2.2606, + "step": 17691 + }, + { + "epoch": 2.0641698751604247, + "grad_norm": 1.0906306505203247, + "learning_rate": 0.00016906597375933077, + "loss": 1.9177, + "step": 17692 + }, + { + "epoch": 2.0642865476607164, + "grad_norm": 1.2662705183029175, + "learning_rate": 0.00016905095985844505, + "loss": 2.0599, + "step": 17693 + }, + { + "epoch": 2.064403220161008, + "grad_norm": 1.2492389678955078, + "learning_rate": 0.00016903594577561039, + "loss": 1.8166, + "step": 17694 + }, + { + "epoch": 2.0645198926612998, + "grad_norm": 1.3468964099884033, + "learning_rate": 0.00016902093151098245, + "loss": 2.0722, + "step": 17695 + }, + { + "epoch": 2.0646365651615914, + "grad_norm": 1.067662239074707, + "learning_rate": 0.00016900591706471682, + "loss": 1.8155, + "step": 17696 + }, + { + "epoch": 2.064753237661883, + "grad_norm": 1.0916264057159424, + "learning_rate": 0.00016899090243696928, + "loss": 2.1275, + "step": 17697 + }, + { + "epoch": 2.064869910162175, + "grad_norm": 1.2765820026397705, + "learning_rate": 0.00016897588762789534, + "loss": 2.0695, + "step": 17698 + }, + { + "epoch": 2.0649865826624665, + "grad_norm": 1.3094453811645508, + "learning_rate": 0.00016896087263765081, + "loss": 2.0954, + "step": 17699 + }, + { + "epoch": 2.065103255162758, + "grad_norm": 1.0263407230377197, + "learning_rate": 0.00016894585746639117, + "loss": 1.848, + "step": 17700 + }, + { + "epoch": 2.06521992766305, + "grad_norm": 1.1639317274093628, + "learning_rate": 0.0001689308421142723, + "loss": 1.895, + "step": 17701 + }, + { + "epoch": 2.0653366001633415, + "grad_norm": 1.2808387279510498, + "learning_rate": 0.00016891582658144964, + "loss": 1.7429, + "step": 17702 + }, + { + "epoch": 2.065453272663633, + "grad_norm": 1.217510461807251, + "learning_rate": 0.00016890081086807905, + "loss": 2.0877, + "step": 17703 + }, + { + "epoch": 2.065569945163925, + "grad_norm": 1.1585569381713867, + "learning_rate": 0.0001688857949743161, + "loss": 1.8279, + "step": 17704 + }, + { + "epoch": 2.0656866176642166, + "grad_norm": 1.254397988319397, + "learning_rate": 0.00016887077890031642, + "loss": 2.0645, + "step": 17705 + }, + { + "epoch": 2.0658032901645083, + "grad_norm": 1.2636007070541382, + "learning_rate": 0.00016885576264623567, + "loss": 2.19, + "step": 17706 + }, + { + "epoch": 2.0659199626648, + "grad_norm": 1.2411524057388306, + "learning_rate": 0.0001688407462122296, + "loss": 1.9927, + "step": 17707 + }, + { + "epoch": 2.0660366351650916, + "grad_norm": 1.2156039476394653, + "learning_rate": 0.00016882572959845394, + "loss": 1.9299, + "step": 17708 + }, + { + "epoch": 2.0661533076653833, + "grad_norm": 1.1839255094528198, + "learning_rate": 0.00016881071280506422, + "loss": 1.9608, + "step": 17709 + }, + { + "epoch": 2.066269980165675, + "grad_norm": 1.2431706190109253, + "learning_rate": 0.0001687956958322162, + "loss": 1.8217, + "step": 17710 + }, + { + "epoch": 2.0663866526659667, + "grad_norm": 1.2889982461929321, + "learning_rate": 0.00016878067868006556, + "loss": 2.0316, + "step": 17711 + }, + { + "epoch": 2.0665033251662583, + "grad_norm": 1.207854151725769, + "learning_rate": 0.00016876566134876793, + "loss": 2.0123, + "step": 17712 + }, + { + "epoch": 2.06661999766655, + "grad_norm": 1.0935263633728027, + "learning_rate": 0.00016875064383847904, + "loss": 1.907, + "step": 17713 + }, + { + "epoch": 2.0667366701668417, + "grad_norm": 1.277813196182251, + "learning_rate": 0.00016873562614935455, + "loss": 2.066, + "step": 17714 + }, + { + "epoch": 2.0668533426671334, + "grad_norm": 1.2610526084899902, + "learning_rate": 0.00016872060828155015, + "loss": 1.9897, + "step": 17715 + }, + { + "epoch": 2.066970015167425, + "grad_norm": 1.2435246706008911, + "learning_rate": 0.00016870559023522152, + "loss": 1.8452, + "step": 17716 + }, + { + "epoch": 2.0670866876677167, + "grad_norm": 1.2627276182174683, + "learning_rate": 0.0001686905720105244, + "loss": 1.9586, + "step": 17717 + }, + { + "epoch": 2.0672033601680084, + "grad_norm": 1.4801567792892456, + "learning_rate": 0.00016867555360761444, + "loss": 2.1511, + "step": 17718 + }, + { + "epoch": 2.0673200326683, + "grad_norm": 1.2590973377227783, + "learning_rate": 0.00016866053502664734, + "loss": 1.8379, + "step": 17719 + }, + { + "epoch": 2.067436705168592, + "grad_norm": 1.2543443441390991, + "learning_rate": 0.00016864551626777878, + "loss": 1.9376, + "step": 17720 + }, + { + "epoch": 2.0675533776688835, + "grad_norm": 1.1181715726852417, + "learning_rate": 0.00016863049733116452, + "loss": 1.7776, + "step": 17721 + }, + { + "epoch": 2.067670050169175, + "grad_norm": 1.2004741430282593, + "learning_rate": 0.00016861547821696017, + "loss": 2.0509, + "step": 17722 + }, + { + "epoch": 2.067786722669467, + "grad_norm": 1.2590237855911255, + "learning_rate": 0.00016860045892532152, + "loss": 2.0468, + "step": 17723 + }, + { + "epoch": 2.0679033951697585, + "grad_norm": 1.1206239461898804, + "learning_rate": 0.00016858543945640417, + "loss": 2.1073, + "step": 17724 + }, + { + "epoch": 2.06802006767005, + "grad_norm": 1.240452766418457, + "learning_rate": 0.00016857041981036396, + "loss": 2.0009, + "step": 17725 + }, + { + "epoch": 2.068136740170342, + "grad_norm": 1.1524763107299805, + "learning_rate": 0.00016855539998735647, + "loss": 2.0287, + "step": 17726 + }, + { + "epoch": 2.0682534126706336, + "grad_norm": 1.344435453414917, + "learning_rate": 0.00016854037998753748, + "loss": 2.0, + "step": 17727 + }, + { + "epoch": 2.0683700851709252, + "grad_norm": 1.1767563819885254, + "learning_rate": 0.00016852535981106272, + "loss": 1.993, + "step": 17728 + }, + { + "epoch": 2.068486757671217, + "grad_norm": 1.3192462921142578, + "learning_rate": 0.00016851033945808783, + "loss": 2.001, + "step": 17729 + }, + { + "epoch": 2.0686034301715086, + "grad_norm": 1.5833104848861694, + "learning_rate": 0.00016849531892876855, + "loss": 1.9525, + "step": 17730 + }, + { + "epoch": 2.0687201026718003, + "grad_norm": 1.1913172006607056, + "learning_rate": 0.0001684802982232606, + "loss": 1.8992, + "step": 17731 + }, + { + "epoch": 2.068836775172092, + "grad_norm": 1.0475971698760986, + "learning_rate": 0.00016846527734171968, + "loss": 2.0081, + "step": 17732 + }, + { + "epoch": 2.0689534476723836, + "grad_norm": 1.263627290725708, + "learning_rate": 0.00016845025628430156, + "loss": 1.9458, + "step": 17733 + }, + { + "epoch": 2.0690701201726753, + "grad_norm": 1.0959599018096924, + "learning_rate": 0.00016843523505116197, + "loss": 2.0167, + "step": 17734 + }, + { + "epoch": 2.069186792672967, + "grad_norm": 1.2736531496047974, + "learning_rate": 0.00016842021364245658, + "loss": 2.0265, + "step": 17735 + }, + { + "epoch": 2.0693034651732587, + "grad_norm": 1.0595530271530151, + "learning_rate": 0.00016840519205834115, + "loss": 1.9935, + "step": 17736 + }, + { + "epoch": 2.0694201376735504, + "grad_norm": 1.0676450729370117, + "learning_rate": 0.00016839017029897136, + "loss": 1.8671, + "step": 17737 + }, + { + "epoch": 2.069536810173842, + "grad_norm": 1.3227355480194092, + "learning_rate": 0.00016837514836450302, + "loss": 1.9216, + "step": 17738 + }, + { + "epoch": 2.0696534826741337, + "grad_norm": 1.0809662342071533, + "learning_rate": 0.00016836012625509174, + "loss": 1.9606, + "step": 17739 + }, + { + "epoch": 2.0697701551744254, + "grad_norm": 1.4387686252593994, + "learning_rate": 0.00016834510397089337, + "loss": 1.9059, + "step": 17740 + }, + { + "epoch": 2.069886827674717, + "grad_norm": 1.1984827518463135, + "learning_rate": 0.0001683300815120636, + "loss": 2.0365, + "step": 17741 + }, + { + "epoch": 2.070003500175009, + "grad_norm": 1.367456078529358, + "learning_rate": 0.00016831505887875814, + "loss": 2.0001, + "step": 17742 + }, + { + "epoch": 2.0701201726753005, + "grad_norm": 1.3713631629943848, + "learning_rate": 0.00016830003607113276, + "loss": 2.1553, + "step": 17743 + }, + { + "epoch": 2.070236845175592, + "grad_norm": 1.3647284507751465, + "learning_rate": 0.00016828501308934317, + "loss": 1.9239, + "step": 17744 + }, + { + "epoch": 2.070353517675884, + "grad_norm": 1.3650625944137573, + "learning_rate": 0.00016826998993354516, + "loss": 2.2282, + "step": 17745 + }, + { + "epoch": 2.0704701901761755, + "grad_norm": 1.1504184007644653, + "learning_rate": 0.00016825496660389445, + "loss": 2.0176, + "step": 17746 + }, + { + "epoch": 2.070586862676467, + "grad_norm": 1.2529746294021606, + "learning_rate": 0.00016823994310054684, + "loss": 1.9243, + "step": 17747 + }, + { + "epoch": 2.070703535176759, + "grad_norm": 1.0948021411895752, + "learning_rate": 0.00016822491942365792, + "loss": 2.0235, + "step": 17748 + }, + { + "epoch": 2.0708202076770506, + "grad_norm": 1.157005786895752, + "learning_rate": 0.0001682098955733836, + "loss": 1.897, + "step": 17749 + }, + { + "epoch": 2.0709368801773422, + "grad_norm": 1.1439892053604126, + "learning_rate": 0.00016819487154987953, + "loss": 1.9942, + "step": 17750 + }, + { + "epoch": 2.071053552677634, + "grad_norm": 1.1859557628631592, + "learning_rate": 0.00016817984735330153, + "loss": 2.1818, + "step": 17751 + }, + { + "epoch": 2.0711702251779256, + "grad_norm": 1.2219277620315552, + "learning_rate": 0.00016816482298380532, + "loss": 1.9731, + "step": 17752 + }, + { + "epoch": 2.0712868976782173, + "grad_norm": 1.1863703727722168, + "learning_rate": 0.00016814979844154666, + "loss": 1.815, + "step": 17753 + }, + { + "epoch": 2.071403570178509, + "grad_norm": 1.2696316242218018, + "learning_rate": 0.00016813477372668134, + "loss": 2.0559, + "step": 17754 + }, + { + "epoch": 2.0715202426788006, + "grad_norm": 1.2179006338119507, + "learning_rate": 0.00016811974883936508, + "loss": 1.9526, + "step": 17755 + }, + { + "epoch": 2.0716369151790923, + "grad_norm": 1.3341506719589233, + "learning_rate": 0.0001681047237797536, + "loss": 2.0298, + "step": 17756 + }, + { + "epoch": 2.071753587679384, + "grad_norm": 1.0571938753128052, + "learning_rate": 0.00016808969854800276, + "loss": 1.9668, + "step": 17757 + }, + { + "epoch": 2.0718702601796757, + "grad_norm": 1.226033091545105, + "learning_rate": 0.0001680746731442682, + "loss": 1.9383, + "step": 17758 + }, + { + "epoch": 2.0719869326799674, + "grad_norm": 1.0194721221923828, + "learning_rate": 0.00016805964756870588, + "loss": 2.0281, + "step": 17759 + }, + { + "epoch": 2.072103605180259, + "grad_norm": 1.256238341331482, + "learning_rate": 0.0001680446218214714, + "loss": 2.0466, + "step": 17760 + }, + { + "epoch": 2.0722202776805507, + "grad_norm": 1.092232584953308, + "learning_rate": 0.00016802959590272055, + "loss": 1.9159, + "step": 17761 + }, + { + "epoch": 2.0723369501808424, + "grad_norm": 1.1267223358154297, + "learning_rate": 0.00016801456981260925, + "loss": 2.0316, + "step": 17762 + }, + { + "epoch": 2.072453622681134, + "grad_norm": 1.157528042793274, + "learning_rate": 0.00016799954355129312, + "loss": 1.9572, + "step": 17763 + }, + { + "epoch": 2.0725702951814258, + "grad_norm": 1.4627889394760132, + "learning_rate": 0.0001679845171189279, + "loss": 2.0171, + "step": 17764 + }, + { + "epoch": 2.0726869676817175, + "grad_norm": 1.259248971939087, + "learning_rate": 0.00016796949051566952, + "loss": 1.946, + "step": 17765 + }, + { + "epoch": 2.072803640182009, + "grad_norm": 1.1270445585250854, + "learning_rate": 0.00016795446374167367, + "loss": 2.0401, + "step": 17766 + }, + { + "epoch": 2.072920312682301, + "grad_norm": 1.1188571453094482, + "learning_rate": 0.00016793943679709608, + "loss": 2.0712, + "step": 17767 + }, + { + "epoch": 2.0730369851825925, + "grad_norm": 1.3919843435287476, + "learning_rate": 0.00016792440968209265, + "loss": 2.1476, + "step": 17768 + }, + { + "epoch": 2.073153657682884, + "grad_norm": 1.218027949333191, + "learning_rate": 0.00016790938239681912, + "loss": 1.8689, + "step": 17769 + }, + { + "epoch": 2.073270330183176, + "grad_norm": 1.086417555809021, + "learning_rate": 0.00016789435494143126, + "loss": 1.8684, + "step": 17770 + }, + { + "epoch": 2.0733870026834675, + "grad_norm": 1.3188552856445312, + "learning_rate": 0.00016787932731608484, + "loss": 1.9296, + "step": 17771 + }, + { + "epoch": 2.073503675183759, + "grad_norm": 1.233412504196167, + "learning_rate": 0.0001678642995209357, + "loss": 2.076, + "step": 17772 + }, + { + "epoch": 2.073620347684051, + "grad_norm": 1.2674524784088135, + "learning_rate": 0.0001678492715561396, + "loss": 1.9501, + "step": 17773 + }, + { + "epoch": 2.0737370201843426, + "grad_norm": 1.1502569913864136, + "learning_rate": 0.00016783424342185234, + "loss": 1.9878, + "step": 17774 + }, + { + "epoch": 2.0738536926846343, + "grad_norm": 1.2646147012710571, + "learning_rate": 0.00016781921511822975, + "loss": 1.9418, + "step": 17775 + }, + { + "epoch": 2.073970365184926, + "grad_norm": 1.1549321413040161, + "learning_rate": 0.00016780418664542754, + "loss": 2.0438, + "step": 17776 + }, + { + "epoch": 2.0740870376852176, + "grad_norm": 1.04852294921875, + "learning_rate": 0.00016778915800360158, + "loss": 1.9049, + "step": 17777 + }, + { + "epoch": 2.0742037101855093, + "grad_norm": 1.1355814933776855, + "learning_rate": 0.00016777412919290766, + "loss": 1.8817, + "step": 17778 + }, + { + "epoch": 2.074320382685801, + "grad_norm": 1.0927326679229736, + "learning_rate": 0.00016775910021350158, + "loss": 1.8836, + "step": 17779 + }, + { + "epoch": 2.0744370551860927, + "grad_norm": 1.1984859704971313, + "learning_rate": 0.00016774407106553914, + "loss": 1.8102, + "step": 17780 + }, + { + "epoch": 2.0745537276863844, + "grad_norm": 1.214200496673584, + "learning_rate": 0.00016772904174917612, + "loss": 2.0178, + "step": 17781 + }, + { + "epoch": 2.074670400186676, + "grad_norm": 1.0114423036575317, + "learning_rate": 0.00016771401226456836, + "loss": 1.9991, + "step": 17782 + }, + { + "epoch": 2.0747870726869677, + "grad_norm": 1.2017114162445068, + "learning_rate": 0.00016769898261187165, + "loss": 2.0405, + "step": 17783 + }, + { + "epoch": 2.0749037451872594, + "grad_norm": 1.505041480064392, + "learning_rate": 0.00016768395279124175, + "loss": 1.9179, + "step": 17784 + }, + { + "epoch": 2.075020417687551, + "grad_norm": 1.2031031847000122, + "learning_rate": 0.00016766892280283463, + "loss": 2.0532, + "step": 17785 + }, + { + "epoch": 2.0751370901878428, + "grad_norm": 1.040134072303772, + "learning_rate": 0.00016765389264680598, + "loss": 1.7066, + "step": 17786 + }, + { + "epoch": 2.0752537626881344, + "grad_norm": 1.1904867887496948, + "learning_rate": 0.00016763886232331164, + "loss": 1.9021, + "step": 17787 + }, + { + "epoch": 2.075370435188426, + "grad_norm": 1.0548614263534546, + "learning_rate": 0.00016762383183250743, + "loss": 1.9248, + "step": 17788 + }, + { + "epoch": 2.075487107688718, + "grad_norm": 1.2681044340133667, + "learning_rate": 0.0001676088011745492, + "loss": 1.8441, + "step": 17789 + }, + { + "epoch": 2.0756037801890095, + "grad_norm": 1.1158839464187622, + "learning_rate": 0.0001675937703495927, + "loss": 1.76, + "step": 17790 + }, + { + "epoch": 2.075720452689301, + "grad_norm": 1.291548252105713, + "learning_rate": 0.00016757873935779383, + "loss": 2.0108, + "step": 17791 + }, + { + "epoch": 2.075837125189593, + "grad_norm": 1.0328294038772583, + "learning_rate": 0.0001675637081993083, + "loss": 1.8506, + "step": 17792 + }, + { + "epoch": 2.0759537976898845, + "grad_norm": 1.0508099794387817, + "learning_rate": 0.00016754867687429208, + "loss": 1.7618, + "step": 17793 + }, + { + "epoch": 2.076070470190176, + "grad_norm": 1.1181172132492065, + "learning_rate": 0.00016753364538290094, + "loss": 1.82, + "step": 17794 + }, + { + "epoch": 2.076187142690468, + "grad_norm": 1.1769249439239502, + "learning_rate": 0.00016751861372529066, + "loss": 1.9831, + "step": 17795 + }, + { + "epoch": 2.0763038151907596, + "grad_norm": 1.3031166791915894, + "learning_rate": 0.00016750358190161717, + "loss": 1.914, + "step": 17796 + }, + { + "epoch": 2.0764204876910513, + "grad_norm": 1.2562916278839111, + "learning_rate": 0.00016748854991203622, + "loss": 1.9167, + "step": 17797 + }, + { + "epoch": 2.076537160191343, + "grad_norm": 1.2803971767425537, + "learning_rate": 0.00016747351775670365, + "loss": 1.933, + "step": 17798 + }, + { + "epoch": 2.0766538326916346, + "grad_norm": 1.4482237100601196, + "learning_rate": 0.00016745848543577537, + "loss": 1.992, + "step": 17799 + }, + { + "epoch": 2.0767705051919263, + "grad_norm": 1.2039438486099243, + "learning_rate": 0.00016744345294940712, + "loss": 1.8974, + "step": 17800 + }, + { + "epoch": 2.076887177692218, + "grad_norm": 1.299736499786377, + "learning_rate": 0.0001674284202977548, + "loss": 1.9187, + "step": 17801 + }, + { + "epoch": 2.0770038501925097, + "grad_norm": 1.4042069911956787, + "learning_rate": 0.00016741338748097427, + "loss": 1.8648, + "step": 17802 + }, + { + "epoch": 2.0771205226928013, + "grad_norm": 1.0443705320358276, + "learning_rate": 0.0001673983544992213, + "loss": 1.8163, + "step": 17803 + }, + { + "epoch": 2.077237195193093, + "grad_norm": 1.6204776763916016, + "learning_rate": 0.00016738332135265185, + "loss": 2.0961, + "step": 17804 + }, + { + "epoch": 2.0773538676933847, + "grad_norm": 1.23934006690979, + "learning_rate": 0.0001673682880414216, + "loss": 2.0096, + "step": 17805 + }, + { + "epoch": 2.0774705401936764, + "grad_norm": 1.3350753784179688, + "learning_rate": 0.0001673532545656866, + "loss": 2.0683, + "step": 17806 + }, + { + "epoch": 2.077587212693968, + "grad_norm": 1.2713046073913574, + "learning_rate": 0.00016733822092560255, + "loss": 1.9219, + "step": 17807 + }, + { + "epoch": 2.0777038851942597, + "grad_norm": 1.235852599143982, + "learning_rate": 0.00016732318712132534, + "loss": 1.9532, + "step": 17808 + }, + { + "epoch": 2.0778205576945514, + "grad_norm": 1.1819982528686523, + "learning_rate": 0.00016730815315301083, + "loss": 1.9435, + "step": 17809 + }, + { + "epoch": 2.077937230194843, + "grad_norm": 1.1772583723068237, + "learning_rate": 0.00016729311902081488, + "loss": 1.959, + "step": 17810 + }, + { + "epoch": 2.078053902695135, + "grad_norm": 1.206396460533142, + "learning_rate": 0.0001672780847248933, + "loss": 2.1014, + "step": 17811 + }, + { + "epoch": 2.0781705751954265, + "grad_norm": 1.4098771810531616, + "learning_rate": 0.00016726305026540204, + "loss": 2.1354, + "step": 17812 + }, + { + "epoch": 2.078287247695718, + "grad_norm": 1.224690556526184, + "learning_rate": 0.00016724801564249698, + "loss": 1.9556, + "step": 17813 + }, + { + "epoch": 2.07840392019601, + "grad_norm": 1.08162260055542, + "learning_rate": 0.00016723298085633383, + "loss": 1.8903, + "step": 17814 + }, + { + "epoch": 2.0785205926963015, + "grad_norm": 1.0280060768127441, + "learning_rate": 0.00016721794590706854, + "loss": 2.0791, + "step": 17815 + }, + { + "epoch": 2.078637265196593, + "grad_norm": 1.0549261569976807, + "learning_rate": 0.000167202910794857, + "loss": 1.9398, + "step": 17816 + }, + { + "epoch": 2.078753937696885, + "grad_norm": 1.3704181909561157, + "learning_rate": 0.00016718787551985506, + "loss": 2.077, + "step": 17817 + }, + { + "epoch": 2.0788706101971766, + "grad_norm": 1.2173717021942139, + "learning_rate": 0.00016717284008221857, + "loss": 1.9828, + "step": 17818 + }, + { + "epoch": 2.0789872826974682, + "grad_norm": 1.2190351486206055, + "learning_rate": 0.0001671578044821034, + "loss": 1.8463, + "step": 17819 + }, + { + "epoch": 2.07910395519776, + "grad_norm": 1.2996338605880737, + "learning_rate": 0.00016714276871966546, + "loss": 2.0568, + "step": 17820 + }, + { + "epoch": 2.0792206276980516, + "grad_norm": 1.0804808139801025, + "learning_rate": 0.00016712773279506062, + "loss": 1.9969, + "step": 17821 + }, + { + "epoch": 2.0793373001983433, + "grad_norm": 1.2158358097076416, + "learning_rate": 0.00016711269670844467, + "loss": 2.1309, + "step": 17822 + }, + { + "epoch": 2.079453972698635, + "grad_norm": 1.1195619106292725, + "learning_rate": 0.00016709766045997363, + "loss": 1.9298, + "step": 17823 + }, + { + "epoch": 2.0795706451989266, + "grad_norm": 1.156719446182251, + "learning_rate": 0.00016708262404980322, + "loss": 1.9372, + "step": 17824 + }, + { + "epoch": 2.0796873176992183, + "grad_norm": 1.2138131856918335, + "learning_rate": 0.00016706758747808947, + "loss": 2.1197, + "step": 17825 + }, + { + "epoch": 2.07980399019951, + "grad_norm": 1.1604169607162476, + "learning_rate": 0.00016705255074498813, + "loss": 1.8357, + "step": 17826 + }, + { + "epoch": 2.0799206626998017, + "grad_norm": 1.166662335395813, + "learning_rate": 0.0001670375138506552, + "loss": 2.0045, + "step": 17827 + }, + { + "epoch": 2.0800373352000934, + "grad_norm": 1.1176961660385132, + "learning_rate": 0.00016702247679524653, + "loss": 1.7965, + "step": 17828 + }, + { + "epoch": 2.080154007700385, + "grad_norm": 1.1459468603134155, + "learning_rate": 0.00016700743957891797, + "loss": 2.0911, + "step": 17829 + }, + { + "epoch": 2.0802706802006767, + "grad_norm": 1.045927882194519, + "learning_rate": 0.00016699240220182545, + "loss": 1.7809, + "step": 17830 + }, + { + "epoch": 2.0803873527009684, + "grad_norm": 1.0326958894729614, + "learning_rate": 0.00016697736466412484, + "loss": 1.8505, + "step": 17831 + }, + { + "epoch": 2.08050402520126, + "grad_norm": 1.0324695110321045, + "learning_rate": 0.00016696232696597206, + "loss": 1.9548, + "step": 17832 + }, + { + "epoch": 2.080620697701552, + "grad_norm": 1.131089687347412, + "learning_rate": 0.00016694728910752291, + "loss": 2.0371, + "step": 17833 + }, + { + "epoch": 2.0807373702018435, + "grad_norm": 1.2108217477798462, + "learning_rate": 0.0001669322510889334, + "loss": 1.9593, + "step": 17834 + }, + { + "epoch": 2.080854042702135, + "grad_norm": 1.0322647094726562, + "learning_rate": 0.00016691721291035942, + "loss": 1.9209, + "step": 17835 + }, + { + "epoch": 2.080970715202427, + "grad_norm": 1.3658196926116943, + "learning_rate": 0.00016690217457195677, + "loss": 2.0589, + "step": 17836 + }, + { + "epoch": 2.0810873877027185, + "grad_norm": 1.087872862815857, + "learning_rate": 0.00016688713607388136, + "loss": 1.9301, + "step": 17837 + }, + { + "epoch": 2.08120406020301, + "grad_norm": 1.1827127933502197, + "learning_rate": 0.00016687209741628924, + "loss": 2.0674, + "step": 17838 + }, + { + "epoch": 2.081320732703302, + "grad_norm": 1.2124226093292236, + "learning_rate": 0.00016685705859933623, + "loss": 2.0267, + "step": 17839 + }, + { + "epoch": 2.0814374052035935, + "grad_norm": 1.4464480876922607, + "learning_rate": 0.00016684201962317822, + "loss": 2.0465, + "step": 17840 + }, + { + "epoch": 2.0815540777038852, + "grad_norm": 1.2794826030731201, + "learning_rate": 0.00016682698048797108, + "loss": 2.1527, + "step": 17841 + }, + { + "epoch": 2.081670750204177, + "grad_norm": 1.1042747497558594, + "learning_rate": 0.0001668119411938708, + "loss": 1.7765, + "step": 17842 + }, + { + "epoch": 2.0817874227044686, + "grad_norm": 1.2419112920761108, + "learning_rate": 0.00016679690174103322, + "loss": 2.064, + "step": 17843 + }, + { + "epoch": 2.0819040952047603, + "grad_norm": 1.0586047172546387, + "learning_rate": 0.00016678186212961435, + "loss": 1.9003, + "step": 17844 + }, + { + "epoch": 2.082020767705052, + "grad_norm": 1.076475739479065, + "learning_rate": 0.00016676682235976998, + "loss": 1.9698, + "step": 17845 + }, + { + "epoch": 2.0821374402053436, + "grad_norm": 1.2218573093414307, + "learning_rate": 0.00016675178243165612, + "loss": 1.8421, + "step": 17846 + }, + { + "epoch": 2.0822541127056353, + "grad_norm": 1.243105173110962, + "learning_rate": 0.00016673674234542863, + "loss": 1.9551, + "step": 17847 + }, + { + "epoch": 2.082370785205927, + "grad_norm": 1.1429568529129028, + "learning_rate": 0.00016672170210124343, + "loss": 2.0371, + "step": 17848 + }, + { + "epoch": 2.0824874577062187, + "grad_norm": 1.2382898330688477, + "learning_rate": 0.00016670666169925659, + "loss": 1.7477, + "step": 17849 + }, + { + "epoch": 2.0826041302065104, + "grad_norm": 1.1276966333389282, + "learning_rate": 0.0001666916211396237, + "loss": 1.8747, + "step": 17850 + }, + { + "epoch": 2.082720802706802, + "grad_norm": 1.0949482917785645, + "learning_rate": 0.00016667658042250104, + "loss": 1.8564, + "step": 17851 + }, + { + "epoch": 2.0828374752070937, + "grad_norm": 1.3519726991653442, + "learning_rate": 0.00016666153954804434, + "loss": 1.8844, + "step": 17852 + }, + { + "epoch": 2.0829541477073854, + "grad_norm": 1.1732237339019775, + "learning_rate": 0.00016664649851640958, + "loss": 1.9616, + "step": 17853 + }, + { + "epoch": 2.083070820207677, + "grad_norm": 1.1778568029403687, + "learning_rate": 0.0001666314573277527, + "loss": 1.9526, + "step": 17854 + }, + { + "epoch": 2.0831874927079688, + "grad_norm": 1.3329459428787231, + "learning_rate": 0.00016661641598222956, + "loss": 2.0723, + "step": 17855 + }, + { + "epoch": 2.0833041652082605, + "grad_norm": 1.0354862213134766, + "learning_rate": 0.00016660137447999617, + "loss": 2.0363, + "step": 17856 + }, + { + "epoch": 2.083420837708552, + "grad_norm": 1.2021552324295044, + "learning_rate": 0.00016658633282120845, + "loss": 1.9235, + "step": 17857 + }, + { + "epoch": 2.083537510208844, + "grad_norm": 1.2107844352722168, + "learning_rate": 0.0001665712910060223, + "loss": 1.7847, + "step": 17858 + }, + { + "epoch": 2.0836541827091355, + "grad_norm": 1.109719157218933, + "learning_rate": 0.00016655624903459372, + "loss": 2.09, + "step": 17859 + }, + { + "epoch": 2.083770855209427, + "grad_norm": 1.1009931564331055, + "learning_rate": 0.00016654120690707855, + "loss": 1.8769, + "step": 17860 + }, + { + "epoch": 2.083887527709719, + "grad_norm": 1.1007440090179443, + "learning_rate": 0.00016652616462363283, + "loss": 1.996, + "step": 17861 + }, + { + "epoch": 2.0840042002100105, + "grad_norm": 1.1901930570602417, + "learning_rate": 0.00016651112218441245, + "loss": 2.0434, + "step": 17862 + }, + { + "epoch": 2.084120872710302, + "grad_norm": 1.2614532709121704, + "learning_rate": 0.0001664960795895733, + "loss": 2.125, + "step": 17863 + }, + { + "epoch": 2.084237545210594, + "grad_norm": 1.2515255212783813, + "learning_rate": 0.00016648103683927147, + "loss": 1.9783, + "step": 17864 + }, + { + "epoch": 2.0843542177108856, + "grad_norm": 1.1427799463272095, + "learning_rate": 0.0001664659939336628, + "loss": 1.8669, + "step": 17865 + }, + { + "epoch": 2.0844708902111773, + "grad_norm": 0.9332868456840515, + "learning_rate": 0.00016645095087290326, + "loss": 1.7312, + "step": 17866 + }, + { + "epoch": 2.084587562711469, + "grad_norm": 1.0672708749771118, + "learning_rate": 0.0001664359076571488, + "loss": 1.8661, + "step": 17867 + }, + { + "epoch": 2.0847042352117606, + "grad_norm": 1.118641972541809, + "learning_rate": 0.00016642086428655538, + "loss": 2.0245, + "step": 17868 + }, + { + "epoch": 2.0848209077120523, + "grad_norm": 1.068841576576233, + "learning_rate": 0.00016640582076127896, + "loss": 1.9484, + "step": 17869 + }, + { + "epoch": 2.084937580212344, + "grad_norm": 1.0949335098266602, + "learning_rate": 0.0001663907770814754, + "loss": 1.8333, + "step": 17870 + }, + { + "epoch": 2.0850542527126357, + "grad_norm": 0.9665529727935791, + "learning_rate": 0.00016637573324730087, + "loss": 1.8778, + "step": 17871 + }, + { + "epoch": 2.0851709252129274, + "grad_norm": 1.2329288721084595, + "learning_rate": 0.0001663606892589111, + "loss": 1.9531, + "step": 17872 + }, + { + "epoch": 2.085287597713219, + "grad_norm": 1.1721713542938232, + "learning_rate": 0.00016634564511646216, + "loss": 1.952, + "step": 17873 + }, + { + "epoch": 2.0854042702135107, + "grad_norm": 1.1792001724243164, + "learning_rate": 0.00016633060082011001, + "loss": 1.9806, + "step": 17874 + }, + { + "epoch": 2.0855209427138024, + "grad_norm": 1.2939025163650513, + "learning_rate": 0.0001663155563700106, + "loss": 2.0371, + "step": 17875 + }, + { + "epoch": 2.085637615214094, + "grad_norm": 1.1359260082244873, + "learning_rate": 0.0001663005117663199, + "loss": 2.0096, + "step": 17876 + }, + { + "epoch": 2.0857542877143858, + "grad_norm": 1.455444574356079, + "learning_rate": 0.00016628546700919383, + "loss": 2.0591, + "step": 17877 + }, + { + "epoch": 2.0858709602146774, + "grad_norm": 1.0974000692367554, + "learning_rate": 0.00016627042209878842, + "loss": 1.9421, + "step": 17878 + }, + { + "epoch": 2.085987632714969, + "grad_norm": 1.22029709815979, + "learning_rate": 0.00016625537703525965, + "loss": 2.0779, + "step": 17879 + }, + { + "epoch": 2.086104305215261, + "grad_norm": 1.0498392581939697, + "learning_rate": 0.0001662403318187634, + "loss": 1.7889, + "step": 17880 + }, + { + "epoch": 2.0862209777155525, + "grad_norm": 1.0913678407669067, + "learning_rate": 0.00016622528644945572, + "loss": 1.9395, + "step": 17881 + }, + { + "epoch": 2.086337650215844, + "grad_norm": 1.1737401485443115, + "learning_rate": 0.00016621024092749255, + "loss": 1.9661, + "step": 17882 + }, + { + "epoch": 2.086454322716136, + "grad_norm": 1.1263841390609741, + "learning_rate": 0.00016619519525302992, + "loss": 1.9114, + "step": 17883 + }, + { + "epoch": 2.0865709952164275, + "grad_norm": 1.0921026468276978, + "learning_rate": 0.00016618014942622374, + "loss": 1.9165, + "step": 17884 + }, + { + "epoch": 2.086687667716719, + "grad_norm": 1.2582380771636963, + "learning_rate": 0.00016616510344723, + "loss": 2.0066, + "step": 17885 + }, + { + "epoch": 2.086804340217011, + "grad_norm": 1.3199156522750854, + "learning_rate": 0.00016615005731620468, + "loss": 2.0353, + "step": 17886 + }, + { + "epoch": 2.0869210127173026, + "grad_norm": 1.196919322013855, + "learning_rate": 0.00016613501103330378, + "loss": 2.1041, + "step": 17887 + }, + { + "epoch": 2.0870376852175943, + "grad_norm": 1.6325339078903198, + "learning_rate": 0.0001661199645986833, + "loss": 2.116, + "step": 17888 + }, + { + "epoch": 2.087154357717886, + "grad_norm": 1.0796352624893188, + "learning_rate": 0.00016610491801249911, + "loss": 1.9188, + "step": 17889 + }, + { + "epoch": 2.0872710302181776, + "grad_norm": 1.2898609638214111, + "learning_rate": 0.00016608987127490743, + "loss": 2.0521, + "step": 17890 + }, + { + "epoch": 2.0873877027184693, + "grad_norm": 1.2784351110458374, + "learning_rate": 0.00016607482438606405, + "loss": 2.0921, + "step": 17891 + }, + { + "epoch": 2.087504375218761, + "grad_norm": 1.2615844011306763, + "learning_rate": 0.000166059777346125, + "loss": 1.9475, + "step": 17892 + }, + { + "epoch": 2.0876210477190527, + "grad_norm": 1.1375054121017456, + "learning_rate": 0.0001660447301552463, + "loss": 1.9971, + "step": 17893 + }, + { + "epoch": 2.0877377202193443, + "grad_norm": 1.3103386163711548, + "learning_rate": 0.00016602968281358396, + "loss": 2.1174, + "step": 17894 + }, + { + "epoch": 2.087854392719636, + "grad_norm": 1.1905962228775024, + "learning_rate": 0.00016601463532129386, + "loss": 1.9607, + "step": 17895 + }, + { + "epoch": 2.0879710652199277, + "grad_norm": 1.1794575452804565, + "learning_rate": 0.00016599958767853212, + "loss": 1.9931, + "step": 17896 + }, + { + "epoch": 2.0880877377202194, + "grad_norm": 1.2636719942092896, + "learning_rate": 0.00016598453988545472, + "loss": 1.9434, + "step": 17897 + }, + { + "epoch": 2.088204410220511, + "grad_norm": 1.0148284435272217, + "learning_rate": 0.00016596949194221762, + "loss": 1.8429, + "step": 17898 + }, + { + "epoch": 2.0883210827208027, + "grad_norm": 1.0551878213882446, + "learning_rate": 0.00016595444384897684, + "loss": 1.9368, + "step": 17899 + }, + { + "epoch": 2.0884377552210944, + "grad_norm": 1.1097227334976196, + "learning_rate": 0.0001659393956058884, + "loss": 1.9923, + "step": 17900 + }, + { + "epoch": 2.088554427721386, + "grad_norm": 1.1430859565734863, + "learning_rate": 0.00016592434721310825, + "loss": 2.2119, + "step": 17901 + }, + { + "epoch": 2.088671100221678, + "grad_norm": 1.3148014545440674, + "learning_rate": 0.0001659092986707924, + "loss": 1.9986, + "step": 17902 + }, + { + "epoch": 2.0887877727219695, + "grad_norm": 1.0930571556091309, + "learning_rate": 0.0001658942499790969, + "loss": 1.9675, + "step": 17903 + }, + { + "epoch": 2.088904445222261, + "grad_norm": 1.060828447341919, + "learning_rate": 0.00016587920113817777, + "loss": 1.907, + "step": 17904 + }, + { + "epoch": 2.089021117722553, + "grad_norm": 1.1057101488113403, + "learning_rate": 0.00016586415214819093, + "loss": 1.9397, + "step": 17905 + }, + { + "epoch": 2.0891377902228445, + "grad_norm": 1.2808550596237183, + "learning_rate": 0.00016584910300929253, + "loss": 1.8968, + "step": 17906 + }, + { + "epoch": 2.089254462723136, + "grad_norm": 1.0040318965911865, + "learning_rate": 0.00016583405372163848, + "loss": 1.8, + "step": 17907 + }, + { + "epoch": 2.089371135223428, + "grad_norm": 1.1430164575576782, + "learning_rate": 0.00016581900428538484, + "loss": 1.9326, + "step": 17908 + }, + { + "epoch": 2.0894878077237196, + "grad_norm": 1.089857578277588, + "learning_rate": 0.00016580395470068758, + "loss": 1.7981, + "step": 17909 + }, + { + "epoch": 2.0896044802240112, + "grad_norm": 1.262369990348816, + "learning_rate": 0.0001657889049677027, + "loss": 1.9481, + "step": 17910 + }, + { + "epoch": 2.089721152724303, + "grad_norm": 1.336862325668335, + "learning_rate": 0.00016577385508658634, + "loss": 2.0479, + "step": 17911 + }, + { + "epoch": 2.0898378252245946, + "grad_norm": 1.1977779865264893, + "learning_rate": 0.00016575880505749436, + "loss": 2.021, + "step": 17912 + }, + { + "epoch": 2.0899544977248863, + "grad_norm": 1.2634167671203613, + "learning_rate": 0.00016574375488058292, + "loss": 2.1047, + "step": 17913 + }, + { + "epoch": 2.090071170225178, + "grad_norm": 1.0966793298721313, + "learning_rate": 0.000165728704556008, + "loss": 1.731, + "step": 17914 + }, + { + "epoch": 2.0901878427254696, + "grad_norm": 1.351029396057129, + "learning_rate": 0.00016571365408392554, + "loss": 2.0954, + "step": 17915 + }, + { + "epoch": 2.0903045152257613, + "grad_norm": 1.0911943912506104, + "learning_rate": 0.00016569860346449171, + "loss": 1.9156, + "step": 17916 + }, + { + "epoch": 2.090421187726053, + "grad_norm": 1.1603788137435913, + "learning_rate": 0.00016568355269786246, + "loss": 1.7599, + "step": 17917 + }, + { + "epoch": 2.0905378602263447, + "grad_norm": 1.1594699621200562, + "learning_rate": 0.0001656685017841938, + "loss": 1.9357, + "step": 17918 + }, + { + "epoch": 2.0906545327266364, + "grad_norm": 1.3445647954940796, + "learning_rate": 0.0001656534507236418, + "loss": 2.0214, + "step": 17919 + }, + { + "epoch": 2.090771205226928, + "grad_norm": 1.3898205757141113, + "learning_rate": 0.0001656383995163625, + "loss": 1.9357, + "step": 17920 + }, + { + "epoch": 2.0908878777272197, + "grad_norm": 1.1704288721084595, + "learning_rate": 0.00016562334816251186, + "loss": 2.0209, + "step": 17921 + }, + { + "epoch": 2.0910045502275114, + "grad_norm": 1.6114881038665771, + "learning_rate": 0.00016560829666224604, + "loss": 2.188, + "step": 17922 + }, + { + "epoch": 2.091121222727803, + "grad_norm": 1.0713768005371094, + "learning_rate": 0.000165593245015721, + "loss": 1.9218, + "step": 17923 + }, + { + "epoch": 2.091237895228095, + "grad_norm": 1.30636465549469, + "learning_rate": 0.00016557819322309274, + "loss": 2.0698, + "step": 17924 + }, + { + "epoch": 2.0913545677283865, + "grad_norm": 1.06711745262146, + "learning_rate": 0.0001655631412845174, + "loss": 1.7855, + "step": 17925 + }, + { + "epoch": 2.091471240228678, + "grad_norm": 1.2338309288024902, + "learning_rate": 0.00016554808920015096, + "loss": 1.9154, + "step": 17926 + }, + { + "epoch": 2.09158791272897, + "grad_norm": 1.0443106889724731, + "learning_rate": 0.00016553303697014942, + "loss": 1.8476, + "step": 17927 + }, + { + "epoch": 2.0917045852292615, + "grad_norm": 1.1055282354354858, + "learning_rate": 0.0001655179845946689, + "loss": 2.0083, + "step": 17928 + }, + { + "epoch": 2.091821257729553, + "grad_norm": 1.1986467838287354, + "learning_rate": 0.00016550293207386545, + "loss": 1.9454, + "step": 17929 + }, + { + "epoch": 2.091937930229845, + "grad_norm": 1.1423869132995605, + "learning_rate": 0.0001654878794078951, + "loss": 2.0583, + "step": 17930 + }, + { + "epoch": 2.0920546027301365, + "grad_norm": 1.2377570867538452, + "learning_rate": 0.00016547282659691389, + "loss": 1.8245, + "step": 17931 + }, + { + "epoch": 2.0921712752304282, + "grad_norm": 1.3083542585372925, + "learning_rate": 0.00016545777364107783, + "loss": 2.0552, + "step": 17932 + }, + { + "epoch": 2.09228794773072, + "grad_norm": 1.3588098287582397, + "learning_rate": 0.00016544272054054302, + "loss": 2.0421, + "step": 17933 + }, + { + "epoch": 2.0924046202310116, + "grad_norm": 1.221946120262146, + "learning_rate": 0.00016542766729546557, + "loss": 1.9712, + "step": 17934 + }, + { + "epoch": 2.0925212927313033, + "grad_norm": 1.1567010879516602, + "learning_rate": 0.00016541261390600143, + "loss": 1.9416, + "step": 17935 + }, + { + "epoch": 2.092637965231595, + "grad_norm": 1.402095079421997, + "learning_rate": 0.00016539756037230673, + "loss": 2.1239, + "step": 17936 + }, + { + "epoch": 2.0927546377318866, + "grad_norm": 1.0919547080993652, + "learning_rate": 0.00016538250669453745, + "loss": 1.7702, + "step": 17937 + }, + { + "epoch": 2.0928713102321783, + "grad_norm": 1.2299964427947998, + "learning_rate": 0.00016536745287284974, + "loss": 1.88, + "step": 17938 + }, + { + "epoch": 2.09298798273247, + "grad_norm": 1.1334346532821655, + "learning_rate": 0.0001653523989073996, + "loss": 1.9673, + "step": 17939 + }, + { + "epoch": 2.0931046552327617, + "grad_norm": 1.3235398530960083, + "learning_rate": 0.0001653373447983431, + "loss": 1.9195, + "step": 17940 + }, + { + "epoch": 2.0932213277330534, + "grad_norm": 1.039264440536499, + "learning_rate": 0.00016532229054583633, + "loss": 1.9146, + "step": 17941 + }, + { + "epoch": 2.093338000233345, + "grad_norm": 1.2862805128097534, + "learning_rate": 0.0001653072361500354, + "loss": 2.037, + "step": 17942 + }, + { + "epoch": 2.0934546727336367, + "grad_norm": 1.2648329734802246, + "learning_rate": 0.00016529218161109627, + "loss": 1.9979, + "step": 17943 + }, + { + "epoch": 2.0935713452339284, + "grad_norm": 1.3511730432510376, + "learning_rate": 0.0001652771269291751, + "loss": 1.9956, + "step": 17944 + }, + { + "epoch": 2.09368801773422, + "grad_norm": 1.1547917127609253, + "learning_rate": 0.00016526207210442785, + "loss": 1.9644, + "step": 17945 + }, + { + "epoch": 2.0938046902345118, + "grad_norm": 1.3287173509597778, + "learning_rate": 0.00016524701713701065, + "loss": 1.8839, + "step": 17946 + }, + { + "epoch": 2.0939213627348034, + "grad_norm": 1.180652141571045, + "learning_rate": 0.00016523196202707967, + "loss": 1.8714, + "step": 17947 + }, + { + "epoch": 2.094038035235095, + "grad_norm": 1.382380485534668, + "learning_rate": 0.00016521690677479085, + "loss": 1.9338, + "step": 17948 + }, + { + "epoch": 2.094154707735387, + "grad_norm": 1.2997475862503052, + "learning_rate": 0.00016520185138030035, + "loss": 1.8739, + "step": 17949 + }, + { + "epoch": 2.0942713802356785, + "grad_norm": 1.3833481073379517, + "learning_rate": 0.00016518679584376418, + "loss": 1.9705, + "step": 17950 + }, + { + "epoch": 2.09438805273597, + "grad_norm": 1.2149325609207153, + "learning_rate": 0.00016517174016533845, + "loss": 1.9717, + "step": 17951 + }, + { + "epoch": 2.094504725236262, + "grad_norm": 1.4183403253555298, + "learning_rate": 0.00016515668434517928, + "loss": 2.0048, + "step": 17952 + }, + { + "epoch": 2.0946213977365535, + "grad_norm": 1.3107776641845703, + "learning_rate": 0.00016514162838344272, + "loss": 2.2317, + "step": 17953 + }, + { + "epoch": 2.094738070236845, + "grad_norm": 1.0442240238189697, + "learning_rate": 0.00016512657228028482, + "loss": 1.8546, + "step": 17954 + }, + { + "epoch": 2.094854742737137, + "grad_norm": 1.0380504131317139, + "learning_rate": 0.00016511151603586168, + "loss": 1.9013, + "step": 17955 + }, + { + "epoch": 2.0949714152374286, + "grad_norm": 1.1045982837677002, + "learning_rate": 0.0001650964596503294, + "loss": 2.0882, + "step": 17956 + }, + { + "epoch": 2.0950880877377203, + "grad_norm": 1.1287494897842407, + "learning_rate": 0.00016508140312384407, + "loss": 1.8759, + "step": 17957 + }, + { + "epoch": 2.095204760238012, + "grad_norm": 1.1278105974197388, + "learning_rate": 0.0001650663464565618, + "loss": 1.8561, + "step": 17958 + }, + { + "epoch": 2.0953214327383036, + "grad_norm": 1.191475510597229, + "learning_rate": 0.00016505128964863867, + "loss": 1.9162, + "step": 17959 + }, + { + "epoch": 2.0954381052385953, + "grad_norm": 1.330834150314331, + "learning_rate": 0.00016503623270023074, + "loss": 2.0982, + "step": 17960 + }, + { + "epoch": 2.095554777738887, + "grad_norm": 1.1078513860702515, + "learning_rate": 0.00016502117561149416, + "loss": 1.8931, + "step": 17961 + }, + { + "epoch": 2.0956714502391787, + "grad_norm": 1.204120397567749, + "learning_rate": 0.00016500611838258497, + "loss": 1.8735, + "step": 17962 + }, + { + "epoch": 2.0957881227394703, + "grad_norm": 1.401485562324524, + "learning_rate": 0.00016499106101365926, + "loss": 1.9728, + "step": 17963 + }, + { + "epoch": 2.095904795239762, + "grad_norm": 1.128581166267395, + "learning_rate": 0.00016497600350487318, + "loss": 1.9864, + "step": 17964 + }, + { + "epoch": 2.0960214677400537, + "grad_norm": 1.2645881175994873, + "learning_rate": 0.0001649609458563828, + "loss": 2.0491, + "step": 17965 + }, + { + "epoch": 2.0961381402403454, + "grad_norm": 1.1183916330337524, + "learning_rate": 0.00016494588806834423, + "loss": 2.0876, + "step": 17966 + }, + { + "epoch": 2.096254812740637, + "grad_norm": 1.2440778017044067, + "learning_rate": 0.00016493083014091354, + "loss": 2.0115, + "step": 17967 + }, + { + "epoch": 2.0963714852409288, + "grad_norm": 1.2789808511734009, + "learning_rate": 0.00016491577207424686, + "loss": 2.1162, + "step": 17968 + }, + { + "epoch": 2.0964881577412204, + "grad_norm": 1.2425520420074463, + "learning_rate": 0.00016490071386850033, + "loss": 2.0274, + "step": 17969 + }, + { + "epoch": 2.096604830241512, + "grad_norm": 1.0979503393173218, + "learning_rate": 0.00016488565552383002, + "loss": 2.0081, + "step": 17970 + }, + { + "epoch": 2.096721502741804, + "grad_norm": 1.3307417631149292, + "learning_rate": 0.00016487059704039207, + "loss": 1.8691, + "step": 17971 + }, + { + "epoch": 2.0968381752420955, + "grad_norm": 1.080991506576538, + "learning_rate": 0.00016485553841834254, + "loss": 1.9521, + "step": 17972 + }, + { + "epoch": 2.096954847742387, + "grad_norm": 1.2399377822875977, + "learning_rate": 0.00016484047965783754, + "loss": 1.9922, + "step": 17973 + }, + { + "epoch": 2.097071520242679, + "grad_norm": 0.9681766033172607, + "learning_rate": 0.0001648254207590332, + "loss": 2.0011, + "step": 17974 + }, + { + "epoch": 2.0971881927429705, + "grad_norm": 1.0713750123977661, + "learning_rate": 0.00016481036172208568, + "loss": 1.9726, + "step": 17975 + }, + { + "epoch": 2.097304865243262, + "grad_norm": 0.9766508340835571, + "learning_rate": 0.000164795302547151, + "loss": 1.8681, + "step": 17976 + }, + { + "epoch": 2.097421537743554, + "grad_norm": 1.1538687944412231, + "learning_rate": 0.0001647802432343854, + "loss": 1.8054, + "step": 17977 + }, + { + "epoch": 2.0975382102438456, + "grad_norm": 1.144228458404541, + "learning_rate": 0.00016476518378394492, + "loss": 1.9317, + "step": 17978 + }, + { + "epoch": 2.0976548827441373, + "grad_norm": 1.2268750667572021, + "learning_rate": 0.00016475012419598566, + "loss": 1.878, + "step": 17979 + }, + { + "epoch": 2.097771555244429, + "grad_norm": 1.189255714416504, + "learning_rate": 0.0001647350644706638, + "loss": 1.9505, + "step": 17980 + }, + { + "epoch": 2.0978882277447206, + "grad_norm": 1.3029099702835083, + "learning_rate": 0.00016472000460813536, + "loss": 2.0158, + "step": 17981 + }, + { + "epoch": 2.0980049002450123, + "grad_norm": 1.269792914390564, + "learning_rate": 0.0001647049446085566, + "loss": 2.0177, + "step": 17982 + }, + { + "epoch": 2.098121572745304, + "grad_norm": 1.273239254951477, + "learning_rate": 0.00016468988447208355, + "loss": 2.0323, + "step": 17983 + }, + { + "epoch": 2.0982382452455957, + "grad_norm": 1.1679327487945557, + "learning_rate": 0.00016467482419887242, + "loss": 1.9467, + "step": 17984 + }, + { + "epoch": 2.0983549177458873, + "grad_norm": 1.145890474319458, + "learning_rate": 0.00016465976378907932, + "loss": 1.8858, + "step": 17985 + }, + { + "epoch": 2.098471590246179, + "grad_norm": 1.1108667850494385, + "learning_rate": 0.00016464470324286027, + "loss": 1.8481, + "step": 17986 + }, + { + "epoch": 2.0985882627464707, + "grad_norm": 1.1046210527420044, + "learning_rate": 0.00016462964256037151, + "loss": 1.9397, + "step": 17987 + }, + { + "epoch": 2.0987049352467624, + "grad_norm": 1.230387806892395, + "learning_rate": 0.00016461458174176912, + "loss": 2.0283, + "step": 17988 + }, + { + "epoch": 2.098821607747054, + "grad_norm": 1.220154881477356, + "learning_rate": 0.00016459952078720926, + "loss": 1.9296, + "step": 17989 + }, + { + "epoch": 2.0989382802473457, + "grad_norm": 1.017464518547058, + "learning_rate": 0.0001645844596968481, + "loss": 1.9639, + "step": 17990 + }, + { + "epoch": 2.0990549527476374, + "grad_norm": 1.1500301361083984, + "learning_rate": 0.0001645693984708417, + "loss": 1.9831, + "step": 17991 + }, + { + "epoch": 2.099171625247929, + "grad_norm": 1.1502149105072021, + "learning_rate": 0.00016455433710934624, + "loss": 1.9594, + "step": 17992 + }, + { + "epoch": 2.099288297748221, + "grad_norm": 1.1837610006332397, + "learning_rate": 0.00016453927561251784, + "loss": 1.9339, + "step": 17993 + }, + { + "epoch": 2.0994049702485125, + "grad_norm": 1.2124143838882446, + "learning_rate": 0.0001645242139805127, + "loss": 1.8107, + "step": 17994 + }, + { + "epoch": 2.099521642748804, + "grad_norm": 1.6151351928710938, + "learning_rate": 0.0001645091522134869, + "loss": 2.0613, + "step": 17995 + }, + { + "epoch": 2.099638315249096, + "grad_norm": 1.2106646299362183, + "learning_rate": 0.00016449409031159658, + "loss": 2.0701, + "step": 17996 + }, + { + "epoch": 2.0997549877493875, + "grad_norm": 1.0900732278823853, + "learning_rate": 0.00016447902827499794, + "loss": 1.823, + "step": 17997 + }, + { + "epoch": 2.099871660249679, + "grad_norm": 1.1919097900390625, + "learning_rate": 0.0001644639661038471, + "loss": 1.8878, + "step": 17998 + }, + { + "epoch": 2.099988332749971, + "grad_norm": 1.0942904949188232, + "learning_rate": 0.00016444890379830015, + "loss": 1.9949, + "step": 17999 + }, + { + "epoch": 2.1001050052502626, + "grad_norm": 1.127912163734436, + "learning_rate": 0.0001644338413585133, + "loss": 2.0138, + "step": 18000 + }, + { + "epoch": 2.1002216777505542, + "grad_norm": 1.2949886322021484, + "learning_rate": 0.00016441877878464276, + "loss": 1.8481, + "step": 18001 + }, + { + "epoch": 2.100338350250846, + "grad_norm": 1.3091466426849365, + "learning_rate": 0.00016440371607684453, + "loss": 2.1915, + "step": 18002 + }, + { + "epoch": 2.1004550227511376, + "grad_norm": 1.4513285160064697, + "learning_rate": 0.0001643886532352749, + "loss": 2.1018, + "step": 18003 + }, + { + "epoch": 2.1005716952514293, + "grad_norm": 1.971219539642334, + "learning_rate": 0.00016437359026009004, + "loss": 1.9626, + "step": 18004 + }, + { + "epoch": 2.100688367751721, + "grad_norm": 1.0895214080810547, + "learning_rate": 0.00016435852715144592, + "loss": 1.8686, + "step": 18005 + }, + { + "epoch": 2.1008050402520126, + "grad_norm": 1.4721148014068604, + "learning_rate": 0.00016434346390949885, + "loss": 2.0552, + "step": 18006 + }, + { + "epoch": 2.1009217127523043, + "grad_norm": 1.3484920263290405, + "learning_rate": 0.00016432840053440495, + "loss": 1.9242, + "step": 18007 + }, + { + "epoch": 2.101038385252596, + "grad_norm": 1.2315043210983276, + "learning_rate": 0.00016431333702632037, + "loss": 1.9018, + "step": 18008 + }, + { + "epoch": 2.1011550577528877, + "grad_norm": 1.0398271083831787, + "learning_rate": 0.00016429827338540132, + "loss": 1.9706, + "step": 18009 + }, + { + "epoch": 2.1012717302531794, + "grad_norm": 1.1199766397476196, + "learning_rate": 0.00016428320961180392, + "loss": 1.9433, + "step": 18010 + }, + { + "epoch": 2.101388402753471, + "grad_norm": 1.1903748512268066, + "learning_rate": 0.00016426814570568438, + "loss": 2.1305, + "step": 18011 + }, + { + "epoch": 2.1015050752537627, + "grad_norm": 1.0914664268493652, + "learning_rate": 0.00016425308166719883, + "loss": 2.0337, + "step": 18012 + }, + { + "epoch": 2.1016217477540544, + "grad_norm": 1.3027769327163696, + "learning_rate": 0.0001642380174965034, + "loss": 1.936, + "step": 18013 + }, + { + "epoch": 2.101738420254346, + "grad_norm": 1.018722653388977, + "learning_rate": 0.00016422295319375432, + "loss": 1.6964, + "step": 18014 + }, + { + "epoch": 2.1018550927546378, + "grad_norm": 1.194703221321106, + "learning_rate": 0.0001642078887591077, + "loss": 2.0294, + "step": 18015 + }, + { + "epoch": 2.1019717652549295, + "grad_norm": 1.1844942569732666, + "learning_rate": 0.00016419282419271979, + "loss": 1.8058, + "step": 18016 + }, + { + "epoch": 2.102088437755221, + "grad_norm": 1.0622549057006836, + "learning_rate": 0.00016417775949474669, + "loss": 1.8072, + "step": 18017 + }, + { + "epoch": 2.102205110255513, + "grad_norm": 1.1153790950775146, + "learning_rate": 0.00016416269466534462, + "loss": 2.0998, + "step": 18018 + }, + { + "epoch": 2.1023217827558045, + "grad_norm": 1.0859211683273315, + "learning_rate": 0.00016414762970466974, + "loss": 1.9016, + "step": 18019 + }, + { + "epoch": 2.102438455256096, + "grad_norm": 1.0180890560150146, + "learning_rate": 0.00016413256461287823, + "loss": 1.8701, + "step": 18020 + }, + { + "epoch": 2.102555127756388, + "grad_norm": 1.0919173955917358, + "learning_rate": 0.00016411749939012626, + "loss": 1.7641, + "step": 18021 + }, + { + "epoch": 2.1026718002566795, + "grad_norm": 1.0450822114944458, + "learning_rate": 0.00016410243403657004, + "loss": 1.7224, + "step": 18022 + }, + { + "epoch": 2.1027884727569712, + "grad_norm": 1.1877477169036865, + "learning_rate": 0.0001640873685523657, + "loss": 2.0016, + "step": 18023 + }, + { + "epoch": 2.102905145257263, + "grad_norm": 1.4189727306365967, + "learning_rate": 0.00016407230293766947, + "loss": 2.0089, + "step": 18024 + }, + { + "epoch": 2.1030218177575546, + "grad_norm": 1.1465721130371094, + "learning_rate": 0.00016405723719263747, + "loss": 1.8086, + "step": 18025 + }, + { + "epoch": 2.1031384902578463, + "grad_norm": 1.08524751663208, + "learning_rate": 0.00016404217131742597, + "loss": 1.9916, + "step": 18026 + }, + { + "epoch": 2.103255162758138, + "grad_norm": 1.221037745475769, + "learning_rate": 0.0001640271053121911, + "loss": 1.9373, + "step": 18027 + }, + { + "epoch": 2.1033718352584296, + "grad_norm": 1.208255410194397, + "learning_rate": 0.00016401203917708906, + "loss": 1.8952, + "step": 18028 + }, + { + "epoch": 2.1034885077587213, + "grad_norm": 1.2888550758361816, + "learning_rate": 0.00016399697291227607, + "loss": 1.8361, + "step": 18029 + }, + { + "epoch": 2.103605180259013, + "grad_norm": 1.2768120765686035, + "learning_rate": 0.00016398190651790827, + "loss": 1.9299, + "step": 18030 + }, + { + "epoch": 2.1037218527593047, + "grad_norm": 1.2716975212097168, + "learning_rate": 0.00016396683999414185, + "loss": 1.9487, + "step": 18031 + }, + { + "epoch": 2.1038385252595964, + "grad_norm": 1.3744089603424072, + "learning_rate": 0.00016395177334113306, + "loss": 2.2571, + "step": 18032 + }, + { + "epoch": 2.103955197759888, + "grad_norm": 1.3644444942474365, + "learning_rate": 0.000163936706559038, + "loss": 2.0563, + "step": 18033 + }, + { + "epoch": 2.1040718702601797, + "grad_norm": 1.4424244165420532, + "learning_rate": 0.00016392163964801295, + "loss": 2.0977, + "step": 18034 + }, + { + "epoch": 2.1041885427604714, + "grad_norm": 1.1691821813583374, + "learning_rate": 0.00016390657260821414, + "loss": 1.9394, + "step": 18035 + }, + { + "epoch": 2.104305215260763, + "grad_norm": 1.2399338483810425, + "learning_rate": 0.00016389150543979762, + "loss": 1.906, + "step": 18036 + }, + { + "epoch": 2.1044218877610548, + "grad_norm": 1.0940461158752441, + "learning_rate": 0.0001638764381429198, + "loss": 1.7236, + "step": 18037 + }, + { + "epoch": 2.1045385602613464, + "grad_norm": 1.1730642318725586, + "learning_rate": 0.00016386137071773668, + "loss": 1.953, + "step": 18038 + }, + { + "epoch": 2.104655232761638, + "grad_norm": 1.131380319595337, + "learning_rate": 0.00016384630316440457, + "loss": 1.9012, + "step": 18039 + }, + { + "epoch": 2.10477190526193, + "grad_norm": 1.0504717826843262, + "learning_rate": 0.00016383123548307965, + "loss": 1.9217, + "step": 18040 + }, + { + "epoch": 2.1048885777622215, + "grad_norm": 1.0865086317062378, + "learning_rate": 0.00016381616767391808, + "loss": 1.9107, + "step": 18041 + }, + { + "epoch": 2.105005250262513, + "grad_norm": 1.2386215925216675, + "learning_rate": 0.00016380109973707614, + "loss": 2.1683, + "step": 18042 + }, + { + "epoch": 2.105121922762805, + "grad_norm": 1.13432776927948, + "learning_rate": 0.00016378603167271, + "loss": 1.9597, + "step": 18043 + }, + { + "epoch": 2.1052385952630965, + "grad_norm": 1.3211185932159424, + "learning_rate": 0.00016377096348097587, + "loss": 1.7622, + "step": 18044 + }, + { + "epoch": 2.105355267763388, + "grad_norm": 1.0537610054016113, + "learning_rate": 0.00016375589516202997, + "loss": 1.5519, + "step": 18045 + }, + { + "epoch": 2.10547194026368, + "grad_norm": 1.0161880254745483, + "learning_rate": 0.00016374082671602853, + "loss": 2.0754, + "step": 18046 + }, + { + "epoch": 2.1055886127639716, + "grad_norm": 1.1502702236175537, + "learning_rate": 0.00016372575814312774, + "loss": 1.9649, + "step": 18047 + }, + { + "epoch": 2.1057052852642633, + "grad_norm": 1.354207158088684, + "learning_rate": 0.00016371068944348377, + "loss": 2.0566, + "step": 18048 + }, + { + "epoch": 2.105821957764555, + "grad_norm": 1.175794243812561, + "learning_rate": 0.00016369562061725293, + "loss": 2.0507, + "step": 18049 + }, + { + "epoch": 2.1059386302648466, + "grad_norm": 1.2833584547042847, + "learning_rate": 0.00016368055166459135, + "loss": 2.0884, + "step": 18050 + }, + { + "epoch": 2.1060553027651383, + "grad_norm": 0.9864787459373474, + "learning_rate": 0.00016366548258565534, + "loss": 1.8613, + "step": 18051 + }, + { + "epoch": 2.10617197526543, + "grad_norm": 1.2890318632125854, + "learning_rate": 0.00016365041338060104, + "loss": 1.9873, + "step": 18052 + }, + { + "epoch": 2.1062886477657217, + "grad_norm": 1.2848987579345703, + "learning_rate": 0.0001636353440495847, + "loss": 1.9903, + "step": 18053 + }, + { + "epoch": 2.1064053202660133, + "grad_norm": 1.145047903060913, + "learning_rate": 0.0001636202745927625, + "loss": 2.0459, + "step": 18054 + }, + { + "epoch": 2.106521992766305, + "grad_norm": 1.2345941066741943, + "learning_rate": 0.00016360520501029077, + "loss": 2.0868, + "step": 18055 + }, + { + "epoch": 2.1066386652665967, + "grad_norm": 1.1409305334091187, + "learning_rate": 0.00016359013530232563, + "loss": 1.9151, + "step": 18056 + }, + { + "epoch": 2.1067553377668884, + "grad_norm": 1.3589478731155396, + "learning_rate": 0.00016357506546902333, + "loss": 1.8612, + "step": 18057 + }, + { + "epoch": 2.10687201026718, + "grad_norm": 1.1570391654968262, + "learning_rate": 0.00016355999551054013, + "loss": 2.0491, + "step": 18058 + }, + { + "epoch": 2.1069886827674718, + "grad_norm": 1.1185495853424072, + "learning_rate": 0.00016354492542703218, + "loss": 1.9554, + "step": 18059 + }, + { + "epoch": 2.1071053552677634, + "grad_norm": 1.141783595085144, + "learning_rate": 0.00016352985521865583, + "loss": 1.9544, + "step": 18060 + }, + { + "epoch": 2.107222027768055, + "grad_norm": 1.154205322265625, + "learning_rate": 0.00016351478488556726, + "loss": 1.9536, + "step": 18061 + }, + { + "epoch": 2.107338700268347, + "grad_norm": 1.4171135425567627, + "learning_rate": 0.00016349971442792272, + "loss": 1.9689, + "step": 18062 + }, + { + "epoch": 2.1074553727686385, + "grad_norm": 1.4609712362289429, + "learning_rate": 0.00016348464384587837, + "loss": 2.0296, + "step": 18063 + }, + { + "epoch": 2.10757204526893, + "grad_norm": 1.1422817707061768, + "learning_rate": 0.00016346957313959044, + "loss": 2.0607, + "step": 18064 + }, + { + "epoch": 2.107688717769222, + "grad_norm": 1.3970507383346558, + "learning_rate": 0.0001634545023092153, + "loss": 2.1513, + "step": 18065 + }, + { + "epoch": 2.1078053902695135, + "grad_norm": 0.9615989327430725, + "learning_rate": 0.00016343943135490905, + "loss": 1.8688, + "step": 18066 + }, + { + "epoch": 2.107922062769805, + "grad_norm": 1.281360149383545, + "learning_rate": 0.00016342436027682802, + "loss": 1.9547, + "step": 18067 + }, + { + "epoch": 2.108038735270097, + "grad_norm": 1.2167229652404785, + "learning_rate": 0.0001634092890751284, + "loss": 2.1182, + "step": 18068 + }, + { + "epoch": 2.1081554077703886, + "grad_norm": 1.141752004623413, + "learning_rate": 0.00016339421774996647, + "loss": 1.9179, + "step": 18069 + }, + { + "epoch": 2.1082720802706802, + "grad_norm": 1.138627052307129, + "learning_rate": 0.00016337914630149843, + "loss": 1.9655, + "step": 18070 + }, + { + "epoch": 2.108388752770972, + "grad_norm": 1.0297240018844604, + "learning_rate": 0.00016336407472988056, + "loss": 1.9281, + "step": 18071 + }, + { + "epoch": 2.1085054252712636, + "grad_norm": 1.4263020753860474, + "learning_rate": 0.00016334900303526906, + "loss": 2.0929, + "step": 18072 + }, + { + "epoch": 2.1086220977715553, + "grad_norm": 1.3307260274887085, + "learning_rate": 0.00016333393121782018, + "loss": 2.0989, + "step": 18073 + }, + { + "epoch": 2.108738770271847, + "grad_norm": 1.0878907442092896, + "learning_rate": 0.00016331885927769029, + "loss": 1.7379, + "step": 18074 + }, + { + "epoch": 2.1088554427721387, + "grad_norm": 1.1635105609893799, + "learning_rate": 0.00016330378721503546, + "loss": 2.0787, + "step": 18075 + }, + { + "epoch": 2.1089721152724303, + "grad_norm": 1.2103992700576782, + "learning_rate": 0.00016328871503001205, + "loss": 2.0193, + "step": 18076 + }, + { + "epoch": 2.109088787772722, + "grad_norm": 1.1543744802474976, + "learning_rate": 0.0001632736427227763, + "loss": 2.0139, + "step": 18077 + }, + { + "epoch": 2.1092054602730137, + "grad_norm": 1.1351450681686401, + "learning_rate": 0.00016325857029348442, + "loss": 2.165, + "step": 18078 + }, + { + "epoch": 2.1093221327733054, + "grad_norm": 1.1252052783966064, + "learning_rate": 0.0001632434977422927, + "loss": 1.8983, + "step": 18079 + }, + { + "epoch": 2.109438805273597, + "grad_norm": 1.0547817945480347, + "learning_rate": 0.00016322842506935741, + "loss": 1.8974, + "step": 18080 + }, + { + "epoch": 2.1095554777738887, + "grad_norm": 1.186948299407959, + "learning_rate": 0.0001632133522748348, + "loss": 1.8268, + "step": 18081 + }, + { + "epoch": 2.1096721502741804, + "grad_norm": 1.27362859249115, + "learning_rate": 0.00016319827935888109, + "loss": 2.007, + "step": 18082 + }, + { + "epoch": 2.109788822774472, + "grad_norm": 1.1170285940170288, + "learning_rate": 0.0001631832063216525, + "loss": 2.0152, + "step": 18083 + }, + { + "epoch": 2.109905495274764, + "grad_norm": 1.3835252523422241, + "learning_rate": 0.0001631681331633054, + "loss": 2.1818, + "step": 18084 + }, + { + "epoch": 2.1100221677750555, + "grad_norm": 1.033337116241455, + "learning_rate": 0.00016315305988399598, + "loss": 1.7471, + "step": 18085 + }, + { + "epoch": 2.110138840275347, + "grad_norm": 1.326995611190796, + "learning_rate": 0.00016313798648388058, + "loss": 2.033, + "step": 18086 + }, + { + "epoch": 2.110255512775639, + "grad_norm": 1.2786030769348145, + "learning_rate": 0.0001631229129631154, + "loss": 1.9592, + "step": 18087 + }, + { + "epoch": 2.1103721852759305, + "grad_norm": 1.1999242305755615, + "learning_rate": 0.0001631078393218567, + "loss": 2.0086, + "step": 18088 + }, + { + "epoch": 2.110488857776222, + "grad_norm": 1.2724965810775757, + "learning_rate": 0.00016309276556026076, + "loss": 1.964, + "step": 18089 + }, + { + "epoch": 2.110605530276514, + "grad_norm": 1.1772570610046387, + "learning_rate": 0.00016307769167848388, + "loss": 1.808, + "step": 18090 + }, + { + "epoch": 2.1107222027768056, + "grad_norm": 1.2960944175720215, + "learning_rate": 0.0001630626176766823, + "loss": 1.9859, + "step": 18091 + }, + { + "epoch": 2.1108388752770972, + "grad_norm": 1.113012433052063, + "learning_rate": 0.00016304754355501227, + "loss": 1.8031, + "step": 18092 + }, + { + "epoch": 2.110955547777389, + "grad_norm": 1.062821626663208, + "learning_rate": 0.00016303246931363008, + "loss": 1.9498, + "step": 18093 + }, + { + "epoch": 2.1110722202776806, + "grad_norm": 1.2286608219146729, + "learning_rate": 0.00016301739495269203, + "loss": 1.9921, + "step": 18094 + }, + { + "epoch": 2.1111888927779723, + "grad_norm": 1.1656968593597412, + "learning_rate": 0.00016300232047235436, + "loss": 2.0808, + "step": 18095 + }, + { + "epoch": 2.111305565278264, + "grad_norm": 1.1541228294372559, + "learning_rate": 0.00016298724587277333, + "loss": 2.1044, + "step": 18096 + }, + { + "epoch": 2.1114222377785556, + "grad_norm": 0.9675289988517761, + "learning_rate": 0.0001629721711541053, + "loss": 1.7188, + "step": 18097 + }, + { + "epoch": 2.1115389102788473, + "grad_norm": 1.4435871839523315, + "learning_rate": 0.0001629570963165064, + "loss": 2.0256, + "step": 18098 + }, + { + "epoch": 2.111655582779139, + "grad_norm": 1.2494562864303589, + "learning_rate": 0.0001629420213601331, + "loss": 1.8368, + "step": 18099 + }, + { + "epoch": 2.1117722552794307, + "grad_norm": 1.3678079843521118, + "learning_rate": 0.0001629269462851415, + "loss": 2.2189, + "step": 18100 + }, + { + "epoch": 2.1118889277797224, + "grad_norm": 1.2919803857803345, + "learning_rate": 0.00016291187109168805, + "loss": 1.9959, + "step": 18101 + }, + { + "epoch": 2.112005600280014, + "grad_norm": 1.1756515502929688, + "learning_rate": 0.0001628967957799289, + "loss": 2.1025, + "step": 18102 + }, + { + "epoch": 2.1121222727803057, + "grad_norm": 1.1335960626602173, + "learning_rate": 0.00016288172035002038, + "loss": 1.9228, + "step": 18103 + }, + { + "epoch": 2.1122389452805974, + "grad_norm": 1.2591114044189453, + "learning_rate": 0.00016286664480211877, + "loss": 2.2204, + "step": 18104 + }, + { + "epoch": 2.112355617780889, + "grad_norm": 1.1165283918380737, + "learning_rate": 0.00016285156913638037, + "loss": 1.9854, + "step": 18105 + }, + { + "epoch": 2.1124722902811808, + "grad_norm": 1.155369758605957, + "learning_rate": 0.00016283649335296148, + "loss": 1.9532, + "step": 18106 + }, + { + "epoch": 2.1125889627814725, + "grad_norm": 1.2240791320800781, + "learning_rate": 0.00016282141745201833, + "loss": 1.8164, + "step": 18107 + }, + { + "epoch": 2.112705635281764, + "grad_norm": 1.0926240682601929, + "learning_rate": 0.00016280634143370727, + "loss": 1.8746, + "step": 18108 + }, + { + "epoch": 2.112822307782056, + "grad_norm": 1.0158058404922485, + "learning_rate": 0.00016279126529818456, + "loss": 1.8475, + "step": 18109 + }, + { + "epoch": 2.1129389802823475, + "grad_norm": 1.34380042552948, + "learning_rate": 0.0001627761890456065, + "loss": 1.8968, + "step": 18110 + }, + { + "epoch": 2.113055652782639, + "grad_norm": 1.1917438507080078, + "learning_rate": 0.00016276111267612942, + "loss": 2.0409, + "step": 18111 + }, + { + "epoch": 2.113172325282931, + "grad_norm": 1.1680474281311035, + "learning_rate": 0.00016274603618990957, + "loss": 2.0167, + "step": 18112 + }, + { + "epoch": 2.1132889977832225, + "grad_norm": 1.0210374593734741, + "learning_rate": 0.00016273095958710323, + "loss": 1.7823, + "step": 18113 + }, + { + "epoch": 2.1134056702835142, + "grad_norm": 1.1399710178375244, + "learning_rate": 0.0001627158828678668, + "loss": 1.7132, + "step": 18114 + }, + { + "epoch": 2.113522342783806, + "grad_norm": 1.0679755210876465, + "learning_rate": 0.00016270080603235642, + "loss": 1.8286, + "step": 18115 + }, + { + "epoch": 2.1136390152840976, + "grad_norm": 1.173027753829956, + "learning_rate": 0.00016268572908072852, + "loss": 1.8584, + "step": 18116 + }, + { + "epoch": 2.1137556877843893, + "grad_norm": 1.2124825716018677, + "learning_rate": 0.00016267065201313935, + "loss": 1.9434, + "step": 18117 + }, + { + "epoch": 2.113872360284681, + "grad_norm": 1.2660434246063232, + "learning_rate": 0.00016265557482974518, + "loss": 1.8516, + "step": 18118 + }, + { + "epoch": 2.1139890327849726, + "grad_norm": 1.163861632347107, + "learning_rate": 0.0001626404975307024, + "loss": 1.8979, + "step": 18119 + }, + { + "epoch": 2.1141057052852643, + "grad_norm": 1.1057548522949219, + "learning_rate": 0.00016262542011616726, + "loss": 1.9849, + "step": 18120 + }, + { + "epoch": 2.114222377785556, + "grad_norm": 1.1188642978668213, + "learning_rate": 0.00016261034258629602, + "loss": 1.8698, + "step": 18121 + }, + { + "epoch": 2.1143390502858477, + "grad_norm": 1.3310641050338745, + "learning_rate": 0.0001625952649412451, + "loss": 2.0674, + "step": 18122 + }, + { + "epoch": 2.1144557227861394, + "grad_norm": 1.3690829277038574, + "learning_rate": 0.00016258018718117074, + "loss": 2.0544, + "step": 18123 + }, + { + "epoch": 2.114572395286431, + "grad_norm": 1.33525550365448, + "learning_rate": 0.0001625651093062293, + "loss": 1.9445, + "step": 18124 + }, + { + "epoch": 2.1146890677867227, + "grad_norm": 1.4070647954940796, + "learning_rate": 0.00016255003131657693, + "loss": 1.9637, + "step": 18125 + }, + { + "epoch": 2.1148057402870144, + "grad_norm": 1.1777701377868652, + "learning_rate": 0.00016253495321237013, + "loss": 1.9558, + "step": 18126 + }, + { + "epoch": 2.114922412787306, + "grad_norm": 1.171535849571228, + "learning_rate": 0.00016251987499376515, + "loss": 1.8165, + "step": 18127 + }, + { + "epoch": 2.1150390852875978, + "grad_norm": 1.2420746088027954, + "learning_rate": 0.0001625047966609183, + "loss": 2.0472, + "step": 18128 + }, + { + "epoch": 2.1151557577878894, + "grad_norm": 1.0034643411636353, + "learning_rate": 0.00016248971821398588, + "loss": 1.6911, + "step": 18129 + }, + { + "epoch": 2.115272430288181, + "grad_norm": 1.2516788244247437, + "learning_rate": 0.0001624746396531242, + "loss": 1.8959, + "step": 18130 + }, + { + "epoch": 2.115389102788473, + "grad_norm": 1.2052128314971924, + "learning_rate": 0.00016245956097848962, + "loss": 2.0317, + "step": 18131 + }, + { + "epoch": 2.1155057752887645, + "grad_norm": 1.186814785003662, + "learning_rate": 0.00016244448219023848, + "loss": 1.9307, + "step": 18132 + }, + { + "epoch": 2.115622447789056, + "grad_norm": 1.2904707193374634, + "learning_rate": 0.00016242940328852702, + "loss": 1.9989, + "step": 18133 + }, + { + "epoch": 2.115739120289348, + "grad_norm": 1.1898993253707886, + "learning_rate": 0.00016241432427351157, + "loss": 2.0661, + "step": 18134 + }, + { + "epoch": 2.1158557927896395, + "grad_norm": 1.0936472415924072, + "learning_rate": 0.00016239924514534852, + "loss": 1.9073, + "step": 18135 + }, + { + "epoch": 2.115972465289931, + "grad_norm": 1.1301075220108032, + "learning_rate": 0.00016238416590419412, + "loss": 1.9066, + "step": 18136 + }, + { + "epoch": 2.116089137790223, + "grad_norm": 1.0056023597717285, + "learning_rate": 0.00016236908655020476, + "loss": 1.8307, + "step": 18137 + }, + { + "epoch": 2.1162058102905146, + "grad_norm": 1.1336148977279663, + "learning_rate": 0.00016235400708353672, + "loss": 2.0565, + "step": 18138 + }, + { + "epoch": 2.1163224827908063, + "grad_norm": 1.2191014289855957, + "learning_rate": 0.00016233892750434637, + "loss": 1.8525, + "step": 18139 + }, + { + "epoch": 2.116439155291098, + "grad_norm": 1.2907285690307617, + "learning_rate": 0.00016232384781278994, + "loss": 1.9449, + "step": 18140 + }, + { + "epoch": 2.1165558277913896, + "grad_norm": 1.1674515008926392, + "learning_rate": 0.0001623087680090239, + "loss": 2.0047, + "step": 18141 + }, + { + "epoch": 2.1166725002916813, + "grad_norm": 1.6477371454238892, + "learning_rate": 0.0001622936880932045, + "loss": 2.1244, + "step": 18142 + }, + { + "epoch": 2.116789172791973, + "grad_norm": 1.2876585721969604, + "learning_rate": 0.00016227860806548804, + "loss": 1.9683, + "step": 18143 + }, + { + "epoch": 2.1169058452922647, + "grad_norm": 1.2062029838562012, + "learning_rate": 0.000162263527926031, + "loss": 1.9685, + "step": 18144 + }, + { + "epoch": 2.1170225177925563, + "grad_norm": 1.264061689376831, + "learning_rate": 0.00016224844767498945, + "loss": 1.9763, + "step": 18145 + }, + { + "epoch": 2.117139190292848, + "grad_norm": 1.3993535041809082, + "learning_rate": 0.00016223336731252, + "loss": 2.1489, + "step": 18146 + }, + { + "epoch": 2.1172558627931397, + "grad_norm": 1.06883704662323, + "learning_rate": 0.00016221828683877886, + "loss": 1.8279, + "step": 18147 + }, + { + "epoch": 2.1173725352934314, + "grad_norm": 1.0947946310043335, + "learning_rate": 0.00016220320625392237, + "loss": 1.8703, + "step": 18148 + }, + { + "epoch": 2.117489207793723, + "grad_norm": 1.833457589149475, + "learning_rate": 0.00016218812555810692, + "loss": 1.8885, + "step": 18149 + }, + { + "epoch": 2.1176058802940148, + "grad_norm": 1.104112148284912, + "learning_rate": 0.00016217304475148878, + "loss": 1.7374, + "step": 18150 + }, + { + "epoch": 2.1177225527943064, + "grad_norm": 1.6651190519332886, + "learning_rate": 0.00016215796383422425, + "loss": 1.8638, + "step": 18151 + }, + { + "epoch": 2.117839225294598, + "grad_norm": 1.3685190677642822, + "learning_rate": 0.00016214288280646982, + "loss": 2.1543, + "step": 18152 + }, + { + "epoch": 2.11795589779489, + "grad_norm": 1.0639710426330566, + "learning_rate": 0.00016212780166838174, + "loss": 1.9631, + "step": 18153 + }, + { + "epoch": 2.1180725702951815, + "grad_norm": 1.0950298309326172, + "learning_rate": 0.00016211272042011634, + "loss": 1.9908, + "step": 18154 + }, + { + "epoch": 2.118189242795473, + "grad_norm": 1.2830264568328857, + "learning_rate": 0.00016209763906183003, + "loss": 1.9091, + "step": 18155 + }, + { + "epoch": 2.118305915295765, + "grad_norm": 1.3488410711288452, + "learning_rate": 0.0001620825575936791, + "loss": 2.0787, + "step": 18156 + }, + { + "epoch": 2.1184225877960565, + "grad_norm": 1.1757274866104126, + "learning_rate": 0.00016206747601581997, + "loss": 2.0417, + "step": 18157 + }, + { + "epoch": 2.118539260296348, + "grad_norm": 1.1156288385391235, + "learning_rate": 0.0001620523943284089, + "loss": 1.8148, + "step": 18158 + }, + { + "epoch": 2.11865593279664, + "grad_norm": 1.0469250679016113, + "learning_rate": 0.00016203731253160232, + "loss": 1.8625, + "step": 18159 + }, + { + "epoch": 2.1187726052969316, + "grad_norm": 1.0672324895858765, + "learning_rate": 0.00016202223062555647, + "loss": 1.9086, + "step": 18160 + }, + { + "epoch": 2.1188892777972232, + "grad_norm": 1.1125508546829224, + "learning_rate": 0.00016200714861042782, + "loss": 1.8788, + "step": 18161 + }, + { + "epoch": 2.119005950297515, + "grad_norm": 1.0993415117263794, + "learning_rate": 0.00016199206648637266, + "loss": 1.9836, + "step": 18162 + }, + { + "epoch": 2.1191226227978066, + "grad_norm": 1.3638312816619873, + "learning_rate": 0.00016197698425354738, + "loss": 2.1343, + "step": 18163 + }, + { + "epoch": 2.1192392952980983, + "grad_norm": 1.3420974016189575, + "learning_rate": 0.00016196190191210827, + "loss": 1.8504, + "step": 18164 + }, + { + "epoch": 2.11935596779839, + "grad_norm": 1.2980681657791138, + "learning_rate": 0.0001619468194622118, + "loss": 1.925, + "step": 18165 + }, + { + "epoch": 2.1194726402986817, + "grad_norm": 1.0654690265655518, + "learning_rate": 0.00016193173690401424, + "loss": 1.9461, + "step": 18166 + }, + { + "epoch": 2.1195893127989733, + "grad_norm": 1.2416681051254272, + "learning_rate": 0.00016191665423767194, + "loss": 2.1199, + "step": 18167 + }, + { + "epoch": 2.119705985299265, + "grad_norm": 0.9382374882698059, + "learning_rate": 0.00016190157146334135, + "loss": 1.884, + "step": 18168 + }, + { + "epoch": 2.1198226577995567, + "grad_norm": 1.0622892379760742, + "learning_rate": 0.00016188648858117877, + "loss": 1.8876, + "step": 18169 + }, + { + "epoch": 2.1199393302998484, + "grad_norm": 1.1375089883804321, + "learning_rate": 0.00016187140559134052, + "loss": 2.0718, + "step": 18170 + }, + { + "epoch": 2.12005600280014, + "grad_norm": 1.0507005453109741, + "learning_rate": 0.00016185632249398303, + "loss": 1.8695, + "step": 18171 + }, + { + "epoch": 2.1201726753004317, + "grad_norm": 1.1272454261779785, + "learning_rate": 0.00016184123928926266, + "loss": 1.9588, + "step": 18172 + }, + { + "epoch": 2.1202893478007234, + "grad_norm": 1.340569257736206, + "learning_rate": 0.00016182615597733576, + "loss": 2.0996, + "step": 18173 + }, + { + "epoch": 2.120406020301015, + "grad_norm": 1.0439245700836182, + "learning_rate": 0.00016181107255835867, + "loss": 1.7995, + "step": 18174 + }, + { + "epoch": 2.120522692801307, + "grad_norm": 1.2270433902740479, + "learning_rate": 0.00016179598903248786, + "loss": 1.9569, + "step": 18175 + }, + { + "epoch": 2.1206393653015985, + "grad_norm": 1.4134677648544312, + "learning_rate": 0.0001617809053998796, + "loss": 2.2022, + "step": 18176 + }, + { + "epoch": 2.12075603780189, + "grad_norm": 1.2630587816238403, + "learning_rate": 0.00016176582166069022, + "loss": 2.0094, + "step": 18177 + }, + { + "epoch": 2.120872710302182, + "grad_norm": 1.0386524200439453, + "learning_rate": 0.00016175073781507618, + "loss": 1.8586, + "step": 18178 + }, + { + "epoch": 2.1209893828024735, + "grad_norm": 1.1000159978866577, + "learning_rate": 0.00016173565386319388, + "loss": 1.7865, + "step": 18179 + }, + { + "epoch": 2.121106055302765, + "grad_norm": 1.4218257665634155, + "learning_rate": 0.00016172056980519962, + "loss": 1.9701, + "step": 18180 + }, + { + "epoch": 2.121222727803057, + "grad_norm": 1.2502453327178955, + "learning_rate": 0.00016170548564124981, + "loss": 1.959, + "step": 18181 + }, + { + "epoch": 2.1213394003033486, + "grad_norm": 1.1585464477539062, + "learning_rate": 0.0001616904013715008, + "loss": 1.9576, + "step": 18182 + }, + { + "epoch": 2.1214560728036402, + "grad_norm": 1.2112430334091187, + "learning_rate": 0.00016167531699610904, + "loss": 1.9156, + "step": 18183 + }, + { + "epoch": 2.121572745303932, + "grad_norm": 1.1668004989624023, + "learning_rate": 0.00016166023251523078, + "loss": 2.0085, + "step": 18184 + }, + { + "epoch": 2.1216894178042236, + "grad_norm": 1.2874873876571655, + "learning_rate": 0.00016164514792902247, + "loss": 1.9431, + "step": 18185 + }, + { + "epoch": 2.1218060903045153, + "grad_norm": 1.1865297555923462, + "learning_rate": 0.00016163006323764053, + "loss": 1.9599, + "step": 18186 + }, + { + "epoch": 2.121922762804807, + "grad_norm": 1.2082486152648926, + "learning_rate": 0.00016161497844124125, + "loss": 1.9501, + "step": 18187 + }, + { + "epoch": 2.1220394353050986, + "grad_norm": 1.3582321405410767, + "learning_rate": 0.0001615998935399811, + "loss": 1.9367, + "step": 18188 + }, + { + "epoch": 2.1221561078053903, + "grad_norm": 1.0837434530258179, + "learning_rate": 0.00016158480853401645, + "loss": 2.0056, + "step": 18189 + }, + { + "epoch": 2.122272780305682, + "grad_norm": 1.107862949371338, + "learning_rate": 0.0001615697234235036, + "loss": 1.7331, + "step": 18190 + }, + { + "epoch": 2.1223894528059737, + "grad_norm": 1.271094560623169, + "learning_rate": 0.00016155463820859907, + "loss": 1.824, + "step": 18191 + }, + { + "epoch": 2.1225061253062654, + "grad_norm": 1.3311251401901245, + "learning_rate": 0.00016153955288945914, + "loss": 1.9554, + "step": 18192 + }, + { + "epoch": 2.122622797806557, + "grad_norm": 1.3005640506744385, + "learning_rate": 0.00016152446746624022, + "loss": 1.9919, + "step": 18193 + }, + { + "epoch": 2.1227394703068487, + "grad_norm": 1.4197477102279663, + "learning_rate": 0.00016150938193909873, + "loss": 2.0243, + "step": 18194 + }, + { + "epoch": 2.1228561428071404, + "grad_norm": 1.0718337297439575, + "learning_rate": 0.00016149429630819102, + "loss": 1.9548, + "step": 18195 + }, + { + "epoch": 2.122972815307432, + "grad_norm": 1.4491430521011353, + "learning_rate": 0.0001614792105736735, + "loss": 1.9358, + "step": 18196 + }, + { + "epoch": 2.1230894878077238, + "grad_norm": 1.2214475870132446, + "learning_rate": 0.0001614641247357026, + "loss": 2.0123, + "step": 18197 + }, + { + "epoch": 2.1232061603080155, + "grad_norm": 1.13065767288208, + "learning_rate": 0.00016144903879443468, + "loss": 1.971, + "step": 18198 + }, + { + "epoch": 2.123322832808307, + "grad_norm": 1.2307099103927612, + "learning_rate": 0.00016143395275002611, + "loss": 2.028, + "step": 18199 + }, + { + "epoch": 2.123439505308599, + "grad_norm": 1.0916869640350342, + "learning_rate": 0.0001614188666026333, + "loss": 1.9301, + "step": 18200 + }, + { + "epoch": 2.1235561778088905, + "grad_norm": 1.1974612474441528, + "learning_rate": 0.0001614037803524127, + "loss": 1.9965, + "step": 18201 + }, + { + "epoch": 2.123672850309182, + "grad_norm": 1.1611742973327637, + "learning_rate": 0.00016138869399952063, + "loss": 1.8894, + "step": 18202 + }, + { + "epoch": 2.123789522809474, + "grad_norm": 1.25920832157135, + "learning_rate": 0.00016137360754411352, + "loss": 2.0675, + "step": 18203 + }, + { + "epoch": 2.1239061953097655, + "grad_norm": 1.1900125741958618, + "learning_rate": 0.00016135852098634774, + "loss": 1.9344, + "step": 18204 + }, + { + "epoch": 2.1240228678100572, + "grad_norm": 1.3225198984146118, + "learning_rate": 0.00016134343432637976, + "loss": 1.9221, + "step": 18205 + }, + { + "epoch": 2.124139540310349, + "grad_norm": 1.0783993005752563, + "learning_rate": 0.0001613283475643659, + "loss": 1.9885, + "step": 18206 + }, + { + "epoch": 2.1242562128106406, + "grad_norm": 1.052259087562561, + "learning_rate": 0.0001613132607004627, + "loss": 1.8915, + "step": 18207 + }, + { + "epoch": 2.1243728853109323, + "grad_norm": 1.2523458003997803, + "learning_rate": 0.0001612981737348264, + "loss": 2.087, + "step": 18208 + }, + { + "epoch": 2.124489557811224, + "grad_norm": 1.1227277517318726, + "learning_rate": 0.00016128308666761348, + "loss": 1.9066, + "step": 18209 + }, + { + "epoch": 2.1246062303115156, + "grad_norm": 1.5310932397842407, + "learning_rate": 0.00016126799949898037, + "loss": 2.048, + "step": 18210 + }, + { + "epoch": 2.1247229028118073, + "grad_norm": 1.2453190088272095, + "learning_rate": 0.00016125291222908344, + "loss": 2.1348, + "step": 18211 + }, + { + "epoch": 2.124839575312099, + "grad_norm": 1.291938066482544, + "learning_rate": 0.00016123782485807906, + "loss": 2.0346, + "step": 18212 + }, + { + "epoch": 2.1249562478123907, + "grad_norm": 1.12264883518219, + "learning_rate": 0.00016122273738612371, + "loss": 1.8854, + "step": 18213 + }, + { + "epoch": 2.1250729203126824, + "grad_norm": 0.9410760998725891, + "learning_rate": 0.0001612076498133738, + "loss": 1.6988, + "step": 18214 + }, + { + "epoch": 2.125189592812974, + "grad_norm": 1.2062246799468994, + "learning_rate": 0.00016119256213998574, + "loss": 2.1284, + "step": 18215 + }, + { + "epoch": 2.1253062653132657, + "grad_norm": 1.1296898126602173, + "learning_rate": 0.0001611774743661158, + "loss": 2.0247, + "step": 18216 + }, + { + "epoch": 2.1254229378135574, + "grad_norm": 1.179409146308899, + "learning_rate": 0.00016116238649192065, + "loss": 1.8724, + "step": 18217 + }, + { + "epoch": 2.125539610313849, + "grad_norm": 1.0443083047866821, + "learning_rate": 0.00016114729851755648, + "loss": 1.8096, + "step": 18218 + }, + { + "epoch": 2.1256562828141408, + "grad_norm": 1.1329599618911743, + "learning_rate": 0.0001611322104431798, + "loss": 1.8543, + "step": 18219 + }, + { + "epoch": 2.1257729553144324, + "grad_norm": 1.2211706638336182, + "learning_rate": 0.00016111712226894707, + "loss": 2.1202, + "step": 18220 + }, + { + "epoch": 2.125889627814724, + "grad_norm": 1.0159226655960083, + "learning_rate": 0.0001611020339950146, + "loss": 1.9008, + "step": 18221 + }, + { + "epoch": 2.126006300315016, + "grad_norm": 1.302187204360962, + "learning_rate": 0.0001610869456215389, + "loss": 1.9509, + "step": 18222 + }, + { + "epoch": 2.1261229728153075, + "grad_norm": 1.0595351457595825, + "learning_rate": 0.00016107185714867638, + "loss": 1.9879, + "step": 18223 + }, + { + "epoch": 2.126239645315599, + "grad_norm": 0.9967703819274902, + "learning_rate": 0.00016105676857658334, + "loss": 1.622, + "step": 18224 + }, + { + "epoch": 2.126356317815891, + "grad_norm": 1.362723469734192, + "learning_rate": 0.00016104167990541637, + "loss": 1.8446, + "step": 18225 + }, + { + "epoch": 2.1264729903161825, + "grad_norm": 1.2009707689285278, + "learning_rate": 0.0001610265911353318, + "loss": 2.0451, + "step": 18226 + }, + { + "epoch": 2.126589662816474, + "grad_norm": 1.241506576538086, + "learning_rate": 0.00016101150226648613, + "loss": 2.0319, + "step": 18227 + }, + { + "epoch": 2.126706335316766, + "grad_norm": 1.11726713180542, + "learning_rate": 0.00016099641329903568, + "loss": 2.1004, + "step": 18228 + }, + { + "epoch": 2.1268230078170576, + "grad_norm": 1.1309630870819092, + "learning_rate": 0.00016098132423313685, + "loss": 2.0405, + "step": 18229 + }, + { + "epoch": 2.1269396803173493, + "grad_norm": 1.0655478239059448, + "learning_rate": 0.00016096623506894623, + "loss": 1.7905, + "step": 18230 + }, + { + "epoch": 2.127056352817641, + "grad_norm": 1.1992104053497314, + "learning_rate": 0.00016095114580662017, + "loss": 2.0268, + "step": 18231 + }, + { + "epoch": 2.1271730253179326, + "grad_norm": 1.272835373878479, + "learning_rate": 0.00016093605644631505, + "loss": 1.9163, + "step": 18232 + }, + { + "epoch": 2.1272896978182243, + "grad_norm": 1.114356517791748, + "learning_rate": 0.0001609209669881873, + "loss": 1.9708, + "step": 18233 + }, + { + "epoch": 2.127406370318516, + "grad_norm": 1.245177984237671, + "learning_rate": 0.00016090587743239346, + "loss": 2.0707, + "step": 18234 + }, + { + "epoch": 2.1275230428188077, + "grad_norm": 1.264724850654602, + "learning_rate": 0.00016089078777908986, + "loss": 2.0441, + "step": 18235 + }, + { + "epoch": 2.1276397153190993, + "grad_norm": 1.1397364139556885, + "learning_rate": 0.00016087569802843295, + "loss": 1.997, + "step": 18236 + }, + { + "epoch": 2.127756387819391, + "grad_norm": 1.1785186529159546, + "learning_rate": 0.0001608606081805792, + "loss": 1.9487, + "step": 18237 + }, + { + "epoch": 2.1278730603196827, + "grad_norm": 1.1360554695129395, + "learning_rate": 0.00016084551823568498, + "loss": 1.9446, + "step": 18238 + }, + { + "epoch": 2.1279897328199744, + "grad_norm": 1.1786562204360962, + "learning_rate": 0.00016083042819390678, + "loss": 1.9556, + "step": 18239 + }, + { + "epoch": 2.128106405320266, + "grad_norm": 1.0875341892242432, + "learning_rate": 0.00016081533805540103, + "loss": 1.8554, + "step": 18240 + }, + { + "epoch": 2.1282230778205578, + "grad_norm": 1.2177345752716064, + "learning_rate": 0.00016080024782032416, + "loss": 2.0307, + "step": 18241 + }, + { + "epoch": 2.1283397503208494, + "grad_norm": 1.070452332496643, + "learning_rate": 0.00016078515748883262, + "loss": 2.0081, + "step": 18242 + }, + { + "epoch": 2.128456422821141, + "grad_norm": 1.185183048248291, + "learning_rate": 0.0001607700670610828, + "loss": 2.0004, + "step": 18243 + }, + { + "epoch": 2.128573095321433, + "grad_norm": 1.1132744550704956, + "learning_rate": 0.00016075497653723125, + "loss": 1.9788, + "step": 18244 + }, + { + "epoch": 2.1286897678217245, + "grad_norm": 1.0678924322128296, + "learning_rate": 0.0001607398859174343, + "loss": 1.9476, + "step": 18245 + }, + { + "epoch": 2.128806440322016, + "grad_norm": 1.2728919982910156, + "learning_rate": 0.00016072479520184844, + "loss": 2.1948, + "step": 18246 + }, + { + "epoch": 2.128923112822308, + "grad_norm": 1.347761631011963, + "learning_rate": 0.00016070970439063012, + "loss": 2.0695, + "step": 18247 + }, + { + "epoch": 2.1290397853225995, + "grad_norm": 1.3720722198486328, + "learning_rate": 0.00016069461348393576, + "loss": 2.0954, + "step": 18248 + }, + { + "epoch": 2.129156457822891, + "grad_norm": 1.1400691270828247, + "learning_rate": 0.00016067952248192182, + "loss": 1.9118, + "step": 18249 + }, + { + "epoch": 2.129273130323183, + "grad_norm": 1.2068248987197876, + "learning_rate": 0.00016066443138474477, + "loss": 2.0413, + "step": 18250 + }, + { + "epoch": 2.1293898028234746, + "grad_norm": 1.191967487335205, + "learning_rate": 0.00016064934019256098, + "loss": 2.0093, + "step": 18251 + }, + { + "epoch": 2.1295064753237662, + "grad_norm": 1.2442240715026855, + "learning_rate": 0.00016063424890552704, + "loss": 1.9603, + "step": 18252 + }, + { + "epoch": 2.129623147824058, + "grad_norm": 1.0832172632217407, + "learning_rate": 0.00016061915752379924, + "loss": 2.0565, + "step": 18253 + }, + { + "epoch": 2.1297398203243496, + "grad_norm": 1.1139038801193237, + "learning_rate": 0.00016060406604753415, + "loss": 2.1631, + "step": 18254 + }, + { + "epoch": 2.1298564928246413, + "grad_norm": 1.2444417476654053, + "learning_rate": 0.00016058897447688813, + "loss": 1.8514, + "step": 18255 + }, + { + "epoch": 2.129973165324933, + "grad_norm": 1.3710124492645264, + "learning_rate": 0.00016057388281201767, + "loss": 1.9924, + "step": 18256 + }, + { + "epoch": 2.1300898378252247, + "grad_norm": 1.0592646598815918, + "learning_rate": 0.00016055879105307926, + "loss": 1.9105, + "step": 18257 + }, + { + "epoch": 2.1302065103255163, + "grad_norm": 1.206879734992981, + "learning_rate": 0.00016054369920022932, + "loss": 2.0136, + "step": 18258 + }, + { + "epoch": 2.130323182825808, + "grad_norm": 1.049626111984253, + "learning_rate": 0.00016052860725362434, + "loss": 1.988, + "step": 18259 + }, + { + "epoch": 2.1304398553260997, + "grad_norm": 1.1700518131256104, + "learning_rate": 0.00016051351521342074, + "loss": 2.0211, + "step": 18260 + }, + { + "epoch": 2.1305565278263914, + "grad_norm": 1.0650018453598022, + "learning_rate": 0.00016049842307977496, + "loss": 1.8017, + "step": 18261 + }, + { + "epoch": 2.130673200326683, + "grad_norm": 1.0927529335021973, + "learning_rate": 0.00016048333085284355, + "loss": 1.817, + "step": 18262 + }, + { + "epoch": 2.1307898728269747, + "grad_norm": 1.2042505741119385, + "learning_rate": 0.0001604682385327828, + "loss": 1.9874, + "step": 18263 + }, + { + "epoch": 2.1309065453272664, + "grad_norm": 1.4009510278701782, + "learning_rate": 0.00016045314611974935, + "loss": 2.2327, + "step": 18264 + }, + { + "epoch": 2.131023217827558, + "grad_norm": 1.24019193649292, + "learning_rate": 0.00016043805361389953, + "loss": 1.9047, + "step": 18265 + }, + { + "epoch": 2.13113989032785, + "grad_norm": 0.9630752801895142, + "learning_rate": 0.00016042296101538992, + "loss": 1.6325, + "step": 18266 + }, + { + "epoch": 2.1312565628281415, + "grad_norm": 1.360120415687561, + "learning_rate": 0.0001604078683243769, + "loss": 2.0294, + "step": 18267 + }, + { + "epoch": 2.131373235328433, + "grad_norm": 1.2428951263427734, + "learning_rate": 0.00016039277554101694, + "loss": 2.0509, + "step": 18268 + }, + { + "epoch": 2.131489907828725, + "grad_norm": 1.174134612083435, + "learning_rate": 0.00016037768266546654, + "loss": 1.8389, + "step": 18269 + }, + { + "epoch": 2.1316065803290165, + "grad_norm": 1.099528431892395, + "learning_rate": 0.00016036258969788213, + "loss": 1.8813, + "step": 18270 + }, + { + "epoch": 2.131723252829308, + "grad_norm": 1.13177490234375, + "learning_rate": 0.00016034749663842022, + "loss": 1.8838, + "step": 18271 + }, + { + "epoch": 2.1318399253296, + "grad_norm": 1.3847328424453735, + "learning_rate": 0.00016033240348723723, + "loss": 2.0674, + "step": 18272 + }, + { + "epoch": 2.1319565978298916, + "grad_norm": 1.2591207027435303, + "learning_rate": 0.00016031731024448965, + "loss": 2.053, + "step": 18273 + }, + { + "epoch": 2.1320732703301832, + "grad_norm": 1.1887617111206055, + "learning_rate": 0.00016030221691033396, + "loss": 2.0551, + "step": 18274 + }, + { + "epoch": 2.132189942830475, + "grad_norm": 1.2593110799789429, + "learning_rate": 0.0001602871234849266, + "loss": 1.8665, + "step": 18275 + }, + { + "epoch": 2.1323066153307666, + "grad_norm": 1.462572693824768, + "learning_rate": 0.00016027202996842413, + "loss": 2.1332, + "step": 18276 + }, + { + "epoch": 2.1324232878310583, + "grad_norm": 1.2016047239303589, + "learning_rate": 0.00016025693636098292, + "loss": 1.8585, + "step": 18277 + }, + { + "epoch": 2.13253996033135, + "grad_norm": 1.0487420558929443, + "learning_rate": 0.0001602418426627595, + "loss": 2.0894, + "step": 18278 + }, + { + "epoch": 2.1326566328316416, + "grad_norm": 1.1451256275177002, + "learning_rate": 0.0001602267488739103, + "loss": 1.952, + "step": 18279 + }, + { + "epoch": 2.1327733053319333, + "grad_norm": 1.2328377962112427, + "learning_rate": 0.00016021165499459188, + "loss": 2.0358, + "step": 18280 + }, + { + "epoch": 2.132889977832225, + "grad_norm": 1.025262713432312, + "learning_rate": 0.00016019656102496055, + "loss": 1.849, + "step": 18281 + }, + { + "epoch": 2.1330066503325167, + "grad_norm": 1.2342116832733154, + "learning_rate": 0.00016018146696517295, + "loss": 2.0211, + "step": 18282 + }, + { + "epoch": 2.1331233228328084, + "grad_norm": 1.2023340463638306, + "learning_rate": 0.0001601663728153855, + "loss": 1.943, + "step": 18283 + }, + { + "epoch": 2.1332399953331, + "grad_norm": 1.33805513381958, + "learning_rate": 0.0001601512785757547, + "loss": 1.9623, + "step": 18284 + }, + { + "epoch": 2.1333566678333917, + "grad_norm": 1.017783522605896, + "learning_rate": 0.00016013618424643702, + "loss": 2.0238, + "step": 18285 + }, + { + "epoch": 2.1334733403336834, + "grad_norm": 1.086001992225647, + "learning_rate": 0.00016012108982758896, + "loss": 1.8648, + "step": 18286 + }, + { + "epoch": 2.133590012833975, + "grad_norm": 1.2160651683807373, + "learning_rate": 0.00016010599531936698, + "loss": 1.9382, + "step": 18287 + }, + { + "epoch": 2.1337066853342668, + "grad_norm": 1.1614587306976318, + "learning_rate": 0.00016009090072192748, + "loss": 2.0315, + "step": 18288 + }, + { + "epoch": 2.1338233578345585, + "grad_norm": 1.2339447736740112, + "learning_rate": 0.00016007580603542713, + "loss": 1.9116, + "step": 18289 + }, + { + "epoch": 2.13394003033485, + "grad_norm": 1.2015081644058228, + "learning_rate": 0.00016006071126002222, + "loss": 1.9511, + "step": 18290 + }, + { + "epoch": 2.134056702835142, + "grad_norm": 1.3292584419250488, + "learning_rate": 0.00016004561639586937, + "loss": 2.0119, + "step": 18291 + }, + { + "epoch": 2.1341733753354335, + "grad_norm": 1.1084315776824951, + "learning_rate": 0.000160030521443125, + "loss": 1.8834, + "step": 18292 + }, + { + "epoch": 2.134290047835725, + "grad_norm": 1.3429921865463257, + "learning_rate": 0.00016001542640194563, + "loss": 2.0847, + "step": 18293 + }, + { + "epoch": 2.134406720336017, + "grad_norm": 1.278102159500122, + "learning_rate": 0.00016000033127248775, + "loss": 1.9215, + "step": 18294 + }, + { + "epoch": 2.1345233928363085, + "grad_norm": 1.2508068084716797, + "learning_rate": 0.00015998523605490785, + "loss": 1.9882, + "step": 18295 + }, + { + "epoch": 2.1346400653366002, + "grad_norm": 1.2776846885681152, + "learning_rate": 0.0001599701407493624, + "loss": 1.9015, + "step": 18296 + }, + { + "epoch": 2.134756737836892, + "grad_norm": 1.2560980319976807, + "learning_rate": 0.00015995504535600786, + "loss": 2.0316, + "step": 18297 + }, + { + "epoch": 2.1348734103371836, + "grad_norm": 1.3857735395431519, + "learning_rate": 0.00015993994987500084, + "loss": 1.9841, + "step": 18298 + }, + { + "epoch": 2.1349900828374753, + "grad_norm": 1.1491931676864624, + "learning_rate": 0.00015992485430649773, + "loss": 1.8652, + "step": 18299 + }, + { + "epoch": 2.135106755337767, + "grad_norm": 1.18559992313385, + "learning_rate": 0.00015990975865065506, + "loss": 1.9244, + "step": 18300 + }, + { + "epoch": 2.1352234278380586, + "grad_norm": 1.215963363647461, + "learning_rate": 0.00015989466290762923, + "loss": 1.8001, + "step": 18301 + }, + { + "epoch": 2.1353401003383503, + "grad_norm": 1.2476868629455566, + "learning_rate": 0.00015987956707757693, + "loss": 2.004, + "step": 18302 + }, + { + "epoch": 2.135456772838642, + "grad_norm": 1.4091966152191162, + "learning_rate": 0.0001598644711606545, + "loss": 1.9852, + "step": 18303 + }, + { + "epoch": 2.1355734453389337, + "grad_norm": 1.0141162872314453, + "learning_rate": 0.00015984937515701854, + "loss": 1.9868, + "step": 18304 + }, + { + "epoch": 2.1356901178392254, + "grad_norm": 1.1385828256607056, + "learning_rate": 0.00015983427906682545, + "loss": 1.7509, + "step": 18305 + }, + { + "epoch": 2.135806790339517, + "grad_norm": 1.1953482627868652, + "learning_rate": 0.00015981918289023175, + "loss": 1.9755, + "step": 18306 + }, + { + "epoch": 2.1359234628398087, + "grad_norm": 1.3198505640029907, + "learning_rate": 0.00015980408662739402, + "loss": 1.9394, + "step": 18307 + }, + { + "epoch": 2.1360401353401004, + "grad_norm": 1.1850612163543701, + "learning_rate": 0.0001597889902784686, + "loss": 2.0882, + "step": 18308 + }, + { + "epoch": 2.136156807840392, + "grad_norm": 1.0195986032485962, + "learning_rate": 0.00015977389384361223, + "loss": 1.7905, + "step": 18309 + }, + { + "epoch": 2.1362734803406838, + "grad_norm": 1.0988407135009766, + "learning_rate": 0.00015975879732298124, + "loss": 1.9289, + "step": 18310 + }, + { + "epoch": 2.1363901528409754, + "grad_norm": 1.1360881328582764, + "learning_rate": 0.0001597437007167322, + "loss": 2.0795, + "step": 18311 + }, + { + "epoch": 2.136506825341267, + "grad_norm": 1.0676203966140747, + "learning_rate": 0.00015972860402502153, + "loss": 1.7864, + "step": 18312 + }, + { + "epoch": 2.136623497841559, + "grad_norm": 1.3607343435287476, + "learning_rate": 0.00015971350724800588, + "loss": 1.9859, + "step": 18313 + }, + { + "epoch": 2.1367401703418505, + "grad_norm": 1.1196824312210083, + "learning_rate": 0.00015969841038584158, + "loss": 1.9204, + "step": 18314 + }, + { + "epoch": 2.136856842842142, + "grad_norm": 1.077873945236206, + "learning_rate": 0.00015968331343868525, + "loss": 1.865, + "step": 18315 + }, + { + "epoch": 2.136973515342434, + "grad_norm": 1.2460334300994873, + "learning_rate": 0.00015966821640669342, + "loss": 2.058, + "step": 18316 + }, + { + "epoch": 2.1370901878427255, + "grad_norm": 1.0448331832885742, + "learning_rate": 0.00015965311929002255, + "loss": 2.1107, + "step": 18317 + }, + { + "epoch": 2.137206860343017, + "grad_norm": 1.2645833492279053, + "learning_rate": 0.00015963802208882914, + "loss": 2.1044, + "step": 18318 + }, + { + "epoch": 2.137323532843309, + "grad_norm": 1.3564143180847168, + "learning_rate": 0.00015962292480326974, + "loss": 1.9802, + "step": 18319 + }, + { + "epoch": 2.1374402053436006, + "grad_norm": 1.1966146230697632, + "learning_rate": 0.00015960782743350082, + "loss": 1.9289, + "step": 18320 + }, + { + "epoch": 2.1375568778438923, + "grad_norm": 1.397873878479004, + "learning_rate": 0.00015959272997967896, + "loss": 2.1781, + "step": 18321 + }, + { + "epoch": 2.137673550344184, + "grad_norm": 1.2338889837265015, + "learning_rate": 0.00015957763244196058, + "loss": 2.0726, + "step": 18322 + }, + { + "epoch": 2.1377902228444756, + "grad_norm": 1.3615295886993408, + "learning_rate": 0.00015956253482050223, + "loss": 2.0043, + "step": 18323 + }, + { + "epoch": 2.1379068953447673, + "grad_norm": 1.3074085712432861, + "learning_rate": 0.00015954743711546048, + "loss": 1.9426, + "step": 18324 + }, + { + "epoch": 2.138023567845059, + "grad_norm": 1.1535824537277222, + "learning_rate": 0.00015953233932699181, + "loss": 2.0224, + "step": 18325 + }, + { + "epoch": 2.1381402403453507, + "grad_norm": 1.2627246379852295, + "learning_rate": 0.0001595172414552527, + "loss": 2.0012, + "step": 18326 + }, + { + "epoch": 2.1382569128456423, + "grad_norm": 1.2040326595306396, + "learning_rate": 0.00015950214350039972, + "loss": 2.0381, + "step": 18327 + }, + { + "epoch": 2.138373585345934, + "grad_norm": 1.0988129377365112, + "learning_rate": 0.00015948704546258935, + "loss": 2.0836, + "step": 18328 + }, + { + "epoch": 2.1384902578462257, + "grad_norm": 1.1111395359039307, + "learning_rate": 0.00015947194734197822, + "loss": 1.9034, + "step": 18329 + }, + { + "epoch": 2.1386069303465174, + "grad_norm": 1.1764997243881226, + "learning_rate": 0.00015945684913872265, + "loss": 1.9782, + "step": 18330 + }, + { + "epoch": 2.138723602846809, + "grad_norm": 1.3516438007354736, + "learning_rate": 0.00015944175085297931, + "loss": 2.247, + "step": 18331 + }, + { + "epoch": 2.1388402753471008, + "grad_norm": 1.2749571800231934, + "learning_rate": 0.00015942665248490465, + "loss": 1.7747, + "step": 18332 + }, + { + "epoch": 2.1389569478473924, + "grad_norm": 1.276794195175171, + "learning_rate": 0.00015941155403465527, + "loss": 1.9226, + "step": 18333 + }, + { + "epoch": 2.139073620347684, + "grad_norm": 1.1808245182037354, + "learning_rate": 0.00015939645550238764, + "loss": 1.9561, + "step": 18334 + }, + { + "epoch": 2.139190292847976, + "grad_norm": 1.0870274305343628, + "learning_rate": 0.00015938135688825828, + "loss": 1.9319, + "step": 18335 + }, + { + "epoch": 2.1393069653482675, + "grad_norm": 1.4208654165267944, + "learning_rate": 0.00015936625819242375, + "loss": 2.1679, + "step": 18336 + }, + { + "epoch": 2.139423637848559, + "grad_norm": 1.1345378160476685, + "learning_rate": 0.0001593511594150406, + "loss": 1.924, + "step": 18337 + }, + { + "epoch": 2.139540310348851, + "grad_norm": 1.385474443435669, + "learning_rate": 0.00015933606055626526, + "loss": 2.1861, + "step": 18338 + }, + { + "epoch": 2.1396569828491425, + "grad_norm": 1.2196959257125854, + "learning_rate": 0.00015932096161625433, + "loss": 1.9173, + "step": 18339 + }, + { + "epoch": 2.139773655349434, + "grad_norm": 1.304136037826538, + "learning_rate": 0.0001593058625951643, + "loss": 1.9783, + "step": 18340 + }, + { + "epoch": 2.139890327849726, + "grad_norm": 1.3206735849380493, + "learning_rate": 0.00015929076349315173, + "loss": 2.0466, + "step": 18341 + }, + { + "epoch": 2.1400070003500176, + "grad_norm": 1.1359076499938965, + "learning_rate": 0.00015927566431037313, + "loss": 1.9536, + "step": 18342 + }, + { + "epoch": 2.1401236728503092, + "grad_norm": 1.1656368970870972, + "learning_rate": 0.00015926056504698508, + "loss": 1.9648, + "step": 18343 + }, + { + "epoch": 2.140240345350601, + "grad_norm": 1.0992827415466309, + "learning_rate": 0.00015924546570314404, + "loss": 1.8365, + "step": 18344 + }, + { + "epoch": 2.1403570178508926, + "grad_norm": 1.0880178213119507, + "learning_rate": 0.00015923036627900661, + "loss": 2.0454, + "step": 18345 + }, + { + "epoch": 2.1404736903511843, + "grad_norm": 1.1165515184402466, + "learning_rate": 0.0001592152667747293, + "loss": 2.0737, + "step": 18346 + }, + { + "epoch": 2.140590362851476, + "grad_norm": 1.2588036060333252, + "learning_rate": 0.00015920016719046863, + "loss": 1.9951, + "step": 18347 + }, + { + "epoch": 2.1407070353517677, + "grad_norm": 1.2479935884475708, + "learning_rate": 0.00015918506752638116, + "loss": 1.9379, + "step": 18348 + }, + { + "epoch": 2.1408237078520593, + "grad_norm": 1.2114546298980713, + "learning_rate": 0.0001591699677826234, + "loss": 1.8668, + "step": 18349 + }, + { + "epoch": 2.140940380352351, + "grad_norm": 1.2477574348449707, + "learning_rate": 0.0001591548679593519, + "loss": 2.0145, + "step": 18350 + }, + { + "epoch": 2.1410570528526427, + "grad_norm": 1.1046308279037476, + "learning_rate": 0.00015913976805672316, + "loss": 1.8091, + "step": 18351 + }, + { + "epoch": 2.1411737253529344, + "grad_norm": 1.0298402309417725, + "learning_rate": 0.00015912466807489382, + "loss": 1.7378, + "step": 18352 + }, + { + "epoch": 2.141290397853226, + "grad_norm": 1.3168119192123413, + "learning_rate": 0.00015910956801402028, + "loss": 2.0817, + "step": 18353 + }, + { + "epoch": 2.1414070703535177, + "grad_norm": 1.2656687498092651, + "learning_rate": 0.00015909446787425922, + "loss": 2.082, + "step": 18354 + }, + { + "epoch": 2.1415237428538094, + "grad_norm": 1.1994011402130127, + "learning_rate": 0.00015907936765576714, + "loss": 1.9421, + "step": 18355 + }, + { + "epoch": 2.141640415354101, + "grad_norm": 1.2140014171600342, + "learning_rate": 0.00015906426735870052, + "loss": 2.0129, + "step": 18356 + }, + { + "epoch": 2.141757087854393, + "grad_norm": 1.4999699592590332, + "learning_rate": 0.00015904916698321594, + "loss": 2.1075, + "step": 18357 + }, + { + "epoch": 2.1418737603546845, + "grad_norm": 1.2461905479431152, + "learning_rate": 0.00015903406652946998, + "loss": 1.7901, + "step": 18358 + }, + { + "epoch": 2.141990432854976, + "grad_norm": 1.2998411655426025, + "learning_rate": 0.00015901896599761914, + "loss": 1.9949, + "step": 18359 + }, + { + "epoch": 2.142107105355268, + "grad_norm": 1.1359456777572632, + "learning_rate": 0.00015900386538781993, + "loss": 2.0621, + "step": 18360 + }, + { + "epoch": 2.1422237778555595, + "grad_norm": 1.1569099426269531, + "learning_rate": 0.000158988764700229, + "loss": 2.135, + "step": 18361 + }, + { + "epoch": 2.142340450355851, + "grad_norm": 1.203433871269226, + "learning_rate": 0.00015897366393500284, + "loss": 1.9605, + "step": 18362 + }, + { + "epoch": 2.142457122856143, + "grad_norm": 1.194090485572815, + "learning_rate": 0.000158958563092298, + "loss": 2.0368, + "step": 18363 + }, + { + "epoch": 2.1425737953564346, + "grad_norm": 1.1067241430282593, + "learning_rate": 0.000158943462172271, + "loss": 2.0155, + "step": 18364 + }, + { + "epoch": 2.1426904678567262, + "grad_norm": 1.1261953115463257, + "learning_rate": 0.00015892836117507845, + "loss": 1.8853, + "step": 18365 + }, + { + "epoch": 2.142807140357018, + "grad_norm": 1.169527530670166, + "learning_rate": 0.00015891326010087687, + "loss": 2.0614, + "step": 18366 + }, + { + "epoch": 2.1429238128573096, + "grad_norm": 1.1890631914138794, + "learning_rate": 0.00015889815894982277, + "loss": 2.0963, + "step": 18367 + }, + { + "epoch": 2.1430404853576013, + "grad_norm": 1.4302146434783936, + "learning_rate": 0.00015888305772207277, + "loss": 2.0542, + "step": 18368 + }, + { + "epoch": 2.143157157857893, + "grad_norm": 1.216537356376648, + "learning_rate": 0.0001588679564177834, + "loss": 1.826, + "step": 18369 + }, + { + "epoch": 2.1432738303581846, + "grad_norm": 1.11898934841156, + "learning_rate": 0.00015885285503711123, + "loss": 1.8143, + "step": 18370 + }, + { + "epoch": 2.1433905028584763, + "grad_norm": 1.16757333278656, + "learning_rate": 0.00015883775358021277, + "loss": 1.7954, + "step": 18371 + }, + { + "epoch": 2.143507175358768, + "grad_norm": 1.2720420360565186, + "learning_rate": 0.0001588226520472446, + "loss": 2.0163, + "step": 18372 + }, + { + "epoch": 2.1436238478590597, + "grad_norm": 1.060508370399475, + "learning_rate": 0.00015880755043836325, + "loss": 1.9659, + "step": 18373 + }, + { + "epoch": 2.1437405203593514, + "grad_norm": 1.1907830238342285, + "learning_rate": 0.00015879244875372534, + "loss": 1.9825, + "step": 18374 + }, + { + "epoch": 2.143857192859643, + "grad_norm": 1.087398886680603, + "learning_rate": 0.00015877734699348738, + "loss": 1.9277, + "step": 18375 + }, + { + "epoch": 2.1439738653599347, + "grad_norm": 1.257049322128296, + "learning_rate": 0.00015876224515780596, + "loss": 1.9689, + "step": 18376 + }, + { + "epoch": 2.1440905378602264, + "grad_norm": 1.132898211479187, + "learning_rate": 0.00015874714324683756, + "loss": 2.0157, + "step": 18377 + }, + { + "epoch": 2.144207210360518, + "grad_norm": 1.167380928993225, + "learning_rate": 0.00015873204126073883, + "loss": 1.8823, + "step": 18378 + }, + { + "epoch": 2.1443238828608098, + "grad_norm": 1.1207244396209717, + "learning_rate": 0.0001587169391996663, + "loss": 1.8962, + "step": 18379 + }, + { + "epoch": 2.1444405553611015, + "grad_norm": 1.0905392169952393, + "learning_rate": 0.00015870183706377653, + "loss": 2.071, + "step": 18380 + }, + { + "epoch": 2.144557227861393, + "grad_norm": 1.2240662574768066, + "learning_rate": 0.0001586867348532261, + "loss": 2.0478, + "step": 18381 + }, + { + "epoch": 2.144673900361685, + "grad_norm": 1.2799603939056396, + "learning_rate": 0.00015867163256817155, + "loss": 1.9732, + "step": 18382 + }, + { + "epoch": 2.1447905728619765, + "grad_norm": 0.9804182648658752, + "learning_rate": 0.0001586565302087694, + "loss": 1.9297, + "step": 18383 + }, + { + "epoch": 2.144907245362268, + "grad_norm": 1.290818214416504, + "learning_rate": 0.00015864142777517629, + "loss": 2.0151, + "step": 18384 + }, + { + "epoch": 2.14502391786256, + "grad_norm": 1.2145980596542358, + "learning_rate": 0.00015862632526754877, + "loss": 2.0704, + "step": 18385 + }, + { + "epoch": 2.1451405903628515, + "grad_norm": 1.1328707933425903, + "learning_rate": 0.00015861122268604331, + "loss": 2.0416, + "step": 18386 + }, + { + "epoch": 2.145257262863143, + "grad_norm": 1.2396104335784912, + "learning_rate": 0.00015859612003081666, + "loss": 2.1611, + "step": 18387 + }, + { + "epoch": 2.145373935363435, + "grad_norm": 1.083008885383606, + "learning_rate": 0.00015858101730202527, + "loss": 1.7576, + "step": 18388 + }, + { + "epoch": 2.1454906078637266, + "grad_norm": 1.1962645053863525, + "learning_rate": 0.00015856591449982574, + "loss": 2.0315, + "step": 18389 + }, + { + "epoch": 2.1456072803640183, + "grad_norm": 1.0660052299499512, + "learning_rate": 0.0001585508116243746, + "loss": 1.894, + "step": 18390 + }, + { + "epoch": 2.14572395286431, + "grad_norm": 1.2618080377578735, + "learning_rate": 0.00015853570867582843, + "loss": 2.0766, + "step": 18391 + }, + { + "epoch": 2.1458406253646016, + "grad_norm": 1.1938259601593018, + "learning_rate": 0.00015852060565434386, + "loss": 2.1264, + "step": 18392 + }, + { + "epoch": 2.1459572978648933, + "grad_norm": 1.1937808990478516, + "learning_rate": 0.0001585055025600774, + "loss": 1.9519, + "step": 18393 + }, + { + "epoch": 2.146073970365185, + "grad_norm": 1.2677595615386963, + "learning_rate": 0.0001584903993931856, + "loss": 1.9403, + "step": 18394 + }, + { + "epoch": 2.1461906428654767, + "grad_norm": 1.1542413234710693, + "learning_rate": 0.0001584752961538251, + "loss": 1.9893, + "step": 18395 + }, + { + "epoch": 2.1463073153657684, + "grad_norm": 1.1050294637680054, + "learning_rate": 0.00015846019284215245, + "loss": 2.0313, + "step": 18396 + }, + { + "epoch": 2.14642398786606, + "grad_norm": 1.2628647089004517, + "learning_rate": 0.00015844508945832424, + "loss": 1.9424, + "step": 18397 + }, + { + "epoch": 2.1465406603663517, + "grad_norm": 1.3688610792160034, + "learning_rate": 0.000158429986002497, + "loss": 2.1557, + "step": 18398 + }, + { + "epoch": 2.1466573328666434, + "grad_norm": 1.275872826576233, + "learning_rate": 0.00015841488247482732, + "loss": 1.9725, + "step": 18399 + }, + { + "epoch": 2.146774005366935, + "grad_norm": 1.1008857488632202, + "learning_rate": 0.0001583997788754718, + "loss": 1.7582, + "step": 18400 + }, + { + "epoch": 2.1468906778672268, + "grad_norm": 1.3834333419799805, + "learning_rate": 0.00015838467520458703, + "loss": 2.0371, + "step": 18401 + }, + { + "epoch": 2.1470073503675184, + "grad_norm": 1.1760728359222412, + "learning_rate": 0.00015836957146232958, + "loss": 1.9517, + "step": 18402 + }, + { + "epoch": 2.14712402286781, + "grad_norm": 1.164780616760254, + "learning_rate": 0.00015835446764885595, + "loss": 2.0082, + "step": 18403 + }, + { + "epoch": 2.147240695368102, + "grad_norm": 1.2588717937469482, + "learning_rate": 0.00015833936376432283, + "loss": 2.1911, + "step": 18404 + }, + { + "epoch": 2.1473573678683935, + "grad_norm": 1.20476233959198, + "learning_rate": 0.00015832425980888678, + "loss": 2.0431, + "step": 18405 + }, + { + "epoch": 2.147474040368685, + "grad_norm": 0.9991309642791748, + "learning_rate": 0.0001583091557827043, + "loss": 1.7448, + "step": 18406 + }, + { + "epoch": 2.147590712868977, + "grad_norm": 1.1514778137207031, + "learning_rate": 0.00015829405168593202, + "loss": 2.0583, + "step": 18407 + }, + { + "epoch": 2.1477073853692685, + "grad_norm": 1.206250548362732, + "learning_rate": 0.00015827894751872657, + "loss": 1.8576, + "step": 18408 + }, + { + "epoch": 2.14782405786956, + "grad_norm": 1.4201717376708984, + "learning_rate": 0.00015826384328124446, + "loss": 2.103, + "step": 18409 + }, + { + "epoch": 2.147940730369852, + "grad_norm": 1.39409601688385, + "learning_rate": 0.0001582487389736423, + "loss": 2.0786, + "step": 18410 + }, + { + "epoch": 2.1480574028701436, + "grad_norm": 1.1892950534820557, + "learning_rate": 0.00015823363459607672, + "loss": 1.9782, + "step": 18411 + }, + { + "epoch": 2.1481740753704353, + "grad_norm": 1.2601414918899536, + "learning_rate": 0.00015821853014870421, + "loss": 2.1348, + "step": 18412 + }, + { + "epoch": 2.148290747870727, + "grad_norm": 1.0618449449539185, + "learning_rate": 0.00015820342563168145, + "loss": 2.0348, + "step": 18413 + }, + { + "epoch": 2.1484074203710186, + "grad_norm": 1.1687171459197998, + "learning_rate": 0.00015818832104516503, + "loss": 2.2601, + "step": 18414 + }, + { + "epoch": 2.1485240928713103, + "grad_norm": 1.2514493465423584, + "learning_rate": 0.00015817321638931145, + "loss": 2.0336, + "step": 18415 + }, + { + "epoch": 2.148640765371602, + "grad_norm": 1.0331313610076904, + "learning_rate": 0.00015815811166427734, + "loss": 2.0454, + "step": 18416 + }, + { + "epoch": 2.1487574378718937, + "grad_norm": 1.2075353860855103, + "learning_rate": 0.00015814300687021926, + "loss": 2.0426, + "step": 18417 + }, + { + "epoch": 2.1488741103721853, + "grad_norm": 1.3339859247207642, + "learning_rate": 0.00015812790200729393, + "loss": 2.0202, + "step": 18418 + }, + { + "epoch": 2.148990782872477, + "grad_norm": 1.114254355430603, + "learning_rate": 0.00015811279707565775, + "loss": 2.1764, + "step": 18419 + }, + { + "epoch": 2.1491074553727687, + "grad_norm": 1.1270794868469238, + "learning_rate": 0.00015809769207546746, + "loss": 2.1474, + "step": 18420 + }, + { + "epoch": 2.1492241278730604, + "grad_norm": 1.2815344333648682, + "learning_rate": 0.00015808258700687958, + "loss": 1.984, + "step": 18421 + }, + { + "epoch": 2.149340800373352, + "grad_norm": 1.105165958404541, + "learning_rate": 0.00015806748187005074, + "loss": 1.8847, + "step": 18422 + }, + { + "epoch": 2.1494574728736437, + "grad_norm": 1.09516179561615, + "learning_rate": 0.0001580523766651375, + "loss": 1.9168, + "step": 18423 + }, + { + "epoch": 2.1495741453739354, + "grad_norm": 1.1694446802139282, + "learning_rate": 0.00015803727139229647, + "loss": 1.8331, + "step": 18424 + }, + { + "epoch": 2.149690817874227, + "grad_norm": 1.3246697187423706, + "learning_rate": 0.00015802216605168422, + "loss": 1.7204, + "step": 18425 + }, + { + "epoch": 2.149807490374519, + "grad_norm": 1.1494112014770508, + "learning_rate": 0.0001580070606434574, + "loss": 1.9518, + "step": 18426 + }, + { + "epoch": 2.1499241628748105, + "grad_norm": 1.3502333164215088, + "learning_rate": 0.00015799195516777255, + "loss": 2.1548, + "step": 18427 + }, + { + "epoch": 2.150040835375102, + "grad_norm": 1.2580498456954956, + "learning_rate": 0.00015797684962478632, + "loss": 1.9144, + "step": 18428 + }, + { + "epoch": 2.150157507875394, + "grad_norm": 1.2337536811828613, + "learning_rate": 0.00015796174401465522, + "loss": 1.944, + "step": 18429 + }, + { + "epoch": 2.1502741803756855, + "grad_norm": 1.1422631740570068, + "learning_rate": 0.00015794663833753594, + "loss": 1.9997, + "step": 18430 + }, + { + "epoch": 2.150390852875977, + "grad_norm": 1.1223036050796509, + "learning_rate": 0.00015793153259358508, + "loss": 2.0253, + "step": 18431 + }, + { + "epoch": 2.150507525376269, + "grad_norm": 1.3502253293991089, + "learning_rate": 0.00015791642678295923, + "loss": 2.0015, + "step": 18432 + }, + { + "epoch": 2.1506241978765606, + "grad_norm": 1.1710741519927979, + "learning_rate": 0.00015790132090581492, + "loss": 1.9211, + "step": 18433 + }, + { + "epoch": 2.1507408703768522, + "grad_norm": 1.2866530418395996, + "learning_rate": 0.0001578862149623088, + "loss": 2.0315, + "step": 18434 + }, + { + "epoch": 2.150857542877144, + "grad_norm": 1.2060344219207764, + "learning_rate": 0.00015787110895259743, + "loss": 1.9186, + "step": 18435 + }, + { + "epoch": 2.1509742153774356, + "grad_norm": 1.1431235074996948, + "learning_rate": 0.0001578560028768375, + "loss": 1.9681, + "step": 18436 + }, + { + "epoch": 2.1510908878777273, + "grad_norm": 1.1546403169631958, + "learning_rate": 0.00015784089673518553, + "loss": 1.8385, + "step": 18437 + }, + { + "epoch": 2.151207560378019, + "grad_norm": 1.1848735809326172, + "learning_rate": 0.00015782579052779814, + "loss": 1.9182, + "step": 18438 + }, + { + "epoch": 2.1513242328783106, + "grad_norm": 1.1902284622192383, + "learning_rate": 0.00015781068425483198, + "loss": 1.8383, + "step": 18439 + }, + { + "epoch": 2.1514409053786023, + "grad_norm": 1.050605297088623, + "learning_rate": 0.00015779557791644367, + "loss": 2.0055, + "step": 18440 + }, + { + "epoch": 2.151557577878894, + "grad_norm": 1.4096728563308716, + "learning_rate": 0.00015778047151278975, + "loss": 2.0237, + "step": 18441 + }, + { + "epoch": 2.1516742503791857, + "grad_norm": 1.3601888418197632, + "learning_rate": 0.00015776536504402683, + "loss": 2.2048, + "step": 18442 + }, + { + "epoch": 2.1517909228794774, + "grad_norm": 1.1204924583435059, + "learning_rate": 0.00015775025851031153, + "loss": 1.9327, + "step": 18443 + }, + { + "epoch": 2.151907595379769, + "grad_norm": 1.2837109565734863, + "learning_rate": 0.00015773515191180051, + "loss": 1.9682, + "step": 18444 + }, + { + "epoch": 2.1520242678800607, + "grad_norm": 1.1670838594436646, + "learning_rate": 0.0001577200452486503, + "loss": 1.9338, + "step": 18445 + }, + { + "epoch": 2.1521409403803524, + "grad_norm": 1.2454416751861572, + "learning_rate": 0.00015770493852101754, + "loss": 1.8977, + "step": 18446 + }, + { + "epoch": 2.152257612880644, + "grad_norm": 1.2689902782440186, + "learning_rate": 0.00015768983172905884, + "loss": 1.9316, + "step": 18447 + }, + { + "epoch": 2.152374285380936, + "grad_norm": 1.2117352485656738, + "learning_rate": 0.0001576747248729308, + "loss": 2.0072, + "step": 18448 + }, + { + "epoch": 2.1524909578812275, + "grad_norm": 1.0870018005371094, + "learning_rate": 0.00015765961795279008, + "loss": 1.9012, + "step": 18449 + }, + { + "epoch": 2.152607630381519, + "grad_norm": 1.2014292478561401, + "learning_rate": 0.00015764451096879323, + "loss": 2.0438, + "step": 18450 + }, + { + "epoch": 2.152724302881811, + "grad_norm": 1.178748607635498, + "learning_rate": 0.0001576294039210969, + "loss": 1.8724, + "step": 18451 + }, + { + "epoch": 2.1528409753821025, + "grad_norm": 1.1421903371810913, + "learning_rate": 0.0001576142968098577, + "loss": 2.041, + "step": 18452 + }, + { + "epoch": 2.152957647882394, + "grad_norm": 1.014037013053894, + "learning_rate": 0.00015759918963523223, + "loss": 1.8377, + "step": 18453 + }, + { + "epoch": 2.153074320382686, + "grad_norm": 1.1439579725265503, + "learning_rate": 0.0001575840823973771, + "loss": 1.8447, + "step": 18454 + }, + { + "epoch": 2.1531909928829776, + "grad_norm": 1.1649936437606812, + "learning_rate": 0.00015756897509644892, + "loss": 1.9276, + "step": 18455 + }, + { + "epoch": 2.1533076653832692, + "grad_norm": 1.0894590616226196, + "learning_rate": 0.00015755386773260433, + "loss": 1.9322, + "step": 18456 + }, + { + "epoch": 2.153424337883561, + "grad_norm": 1.215929388999939, + "learning_rate": 0.00015753876030599997, + "loss": 2.0358, + "step": 18457 + }, + { + "epoch": 2.1535410103838526, + "grad_norm": 1.2617532014846802, + "learning_rate": 0.0001575236528167924, + "loss": 1.9262, + "step": 18458 + }, + { + "epoch": 2.1536576828841443, + "grad_norm": 1.0381462574005127, + "learning_rate": 0.0001575085452651383, + "loss": 1.8546, + "step": 18459 + }, + { + "epoch": 2.153774355384436, + "grad_norm": 1.0844172239303589, + "learning_rate": 0.0001574934376511942, + "loss": 1.9878, + "step": 18460 + }, + { + "epoch": 2.1538910278847276, + "grad_norm": 1.364478349685669, + "learning_rate": 0.00015747832997511677, + "loss": 2.0486, + "step": 18461 + }, + { + "epoch": 2.1540077003850193, + "grad_norm": 0.9908340573310852, + "learning_rate": 0.00015746322223706265, + "loss": 1.8335, + "step": 18462 + }, + { + "epoch": 2.154124372885311, + "grad_norm": 1.2213568687438965, + "learning_rate": 0.0001574481144371884, + "loss": 2.0713, + "step": 18463 + }, + { + "epoch": 2.1542410453856027, + "grad_norm": 1.269963264465332, + "learning_rate": 0.00015743300657565073, + "loss": 1.9681, + "step": 18464 + }, + { + "epoch": 2.1543577178858944, + "grad_norm": 1.1520628929138184, + "learning_rate": 0.0001574178986526062, + "loss": 1.9066, + "step": 18465 + }, + { + "epoch": 2.154474390386186, + "grad_norm": 0.9596773386001587, + "learning_rate": 0.0001574027906682114, + "loss": 1.7454, + "step": 18466 + }, + { + "epoch": 2.1545910628864777, + "grad_norm": 1.2246530055999756, + "learning_rate": 0.00015738768262262305, + "loss": 2.1435, + "step": 18467 + }, + { + "epoch": 2.1547077353867694, + "grad_norm": 1.1452995538711548, + "learning_rate": 0.0001573725745159977, + "loss": 1.8593, + "step": 18468 + }, + { + "epoch": 2.154824407887061, + "grad_norm": 1.122388482093811, + "learning_rate": 0.00015735746634849197, + "loss": 1.8947, + "step": 18469 + }, + { + "epoch": 2.1549410803873528, + "grad_norm": 1.2498127222061157, + "learning_rate": 0.00015734235812026257, + "loss": 1.9965, + "step": 18470 + }, + { + "epoch": 2.1550577528876445, + "grad_norm": 1.1574220657348633, + "learning_rate": 0.000157327249831466, + "loss": 2.1162, + "step": 18471 + }, + { + "epoch": 2.155174425387936, + "grad_norm": 1.23323655128479, + "learning_rate": 0.00015731214148225901, + "loss": 2.1179, + "step": 18472 + }, + { + "epoch": 2.155291097888228, + "grad_norm": 1.2867463827133179, + "learning_rate": 0.00015729703307279817, + "loss": 2.0158, + "step": 18473 + }, + { + "epoch": 2.1554077703885195, + "grad_norm": 1.148245930671692, + "learning_rate": 0.00015728192460324003, + "loss": 1.9818, + "step": 18474 + }, + { + "epoch": 2.155524442888811, + "grad_norm": 1.1793478727340698, + "learning_rate": 0.00015726681607374133, + "loss": 1.9524, + "step": 18475 + }, + { + "epoch": 2.155641115389103, + "grad_norm": 1.1413795948028564, + "learning_rate": 0.0001572517074844587, + "loss": 1.927, + "step": 18476 + }, + { + "epoch": 2.1557577878893945, + "grad_norm": 1.107148289680481, + "learning_rate": 0.00015723659883554872, + "loss": 1.9953, + "step": 18477 + }, + { + "epoch": 2.155874460389686, + "grad_norm": 1.094104528427124, + "learning_rate": 0.00015722149012716797, + "loss": 1.8904, + "step": 18478 + }, + { + "epoch": 2.155991132889978, + "grad_norm": 1.1149524450302124, + "learning_rate": 0.00015720638135947317, + "loss": 1.8766, + "step": 18479 + }, + { + "epoch": 2.1561078053902696, + "grad_norm": 1.1764320135116577, + "learning_rate": 0.00015719127253262095, + "loss": 2.0753, + "step": 18480 + }, + { + "epoch": 2.1562244778905613, + "grad_norm": 1.219091534614563, + "learning_rate": 0.00015717616364676788, + "loss": 2.0998, + "step": 18481 + }, + { + "epoch": 2.156341150390853, + "grad_norm": 1.0778549909591675, + "learning_rate": 0.0001571610547020706, + "loss": 1.9724, + "step": 18482 + }, + { + "epoch": 2.1564578228911446, + "grad_norm": 1.1879267692565918, + "learning_rate": 0.00015714594569868587, + "loss": 1.9836, + "step": 18483 + }, + { + "epoch": 2.1565744953914363, + "grad_norm": 1.2663471698760986, + "learning_rate": 0.00015713083663677015, + "loss": 1.9401, + "step": 18484 + }, + { + "epoch": 2.156691167891728, + "grad_norm": 0.9966666102409363, + "learning_rate": 0.00015711572751648013, + "loss": 1.966, + "step": 18485 + }, + { + "epoch": 2.1568078403920197, + "grad_norm": 1.2981241941452026, + "learning_rate": 0.00015710061833797248, + "loss": 2.1308, + "step": 18486 + }, + { + "epoch": 2.1569245128923114, + "grad_norm": 1.1878981590270996, + "learning_rate": 0.00015708550910140383, + "loss": 1.852, + "step": 18487 + }, + { + "epoch": 2.157041185392603, + "grad_norm": 1.071549654006958, + "learning_rate": 0.0001570703998069308, + "loss": 1.8551, + "step": 18488 + }, + { + "epoch": 2.1571578578928947, + "grad_norm": 1.2963358163833618, + "learning_rate": 0.00015705529045471, + "loss": 1.9593, + "step": 18489 + }, + { + "epoch": 2.1572745303931864, + "grad_norm": 1.2067445516586304, + "learning_rate": 0.00015704018104489812, + "loss": 1.8817, + "step": 18490 + }, + { + "epoch": 2.157391202893478, + "grad_norm": 1.0869745016098022, + "learning_rate": 0.00015702507157765177, + "loss": 1.88, + "step": 18491 + }, + { + "epoch": 2.1575078753937698, + "grad_norm": 1.1496742963790894, + "learning_rate": 0.0001570099620531276, + "loss": 1.9549, + "step": 18492 + }, + { + "epoch": 2.1576245478940614, + "grad_norm": 1.2411853075027466, + "learning_rate": 0.00015699485247148222, + "loss": 1.8794, + "step": 18493 + }, + { + "epoch": 2.157741220394353, + "grad_norm": 1.080117106437683, + "learning_rate": 0.00015697974283287227, + "loss": 2.0004, + "step": 18494 + }, + { + "epoch": 2.157857892894645, + "grad_norm": 1.0105005502700806, + "learning_rate": 0.00015696463313745443, + "loss": 2.0298, + "step": 18495 + }, + { + "epoch": 2.1579745653949365, + "grad_norm": 1.0517895221710205, + "learning_rate": 0.0001569495233853853, + "loss": 2.0065, + "step": 18496 + }, + { + "epoch": 2.158091237895228, + "grad_norm": 1.0460482835769653, + "learning_rate": 0.0001569344135768216, + "loss": 1.918, + "step": 18497 + }, + { + "epoch": 2.15820791039552, + "grad_norm": 1.1151809692382812, + "learning_rate": 0.00015691930371191986, + "loss": 1.9168, + "step": 18498 + }, + { + "epoch": 2.1583245828958115, + "grad_norm": 1.2818986177444458, + "learning_rate": 0.00015690419379083676, + "loss": 2.1577, + "step": 18499 + }, + { + "epoch": 2.158441255396103, + "grad_norm": 1.1345551013946533, + "learning_rate": 0.00015688908381372902, + "loss": 1.7877, + "step": 18500 + }, + { + "epoch": 2.158557927896395, + "grad_norm": 1.3427549600601196, + "learning_rate": 0.0001568739737807532, + "loss": 1.9436, + "step": 18501 + }, + { + "epoch": 2.1586746003966866, + "grad_norm": 1.0681296586990356, + "learning_rate": 0.00015685886369206595, + "loss": 2.0462, + "step": 18502 + }, + { + "epoch": 2.1587912728969783, + "grad_norm": 1.287399411201477, + "learning_rate": 0.00015684375354782393, + "loss": 2.0473, + "step": 18503 + }, + { + "epoch": 2.15890794539727, + "grad_norm": 1.1801230907440186, + "learning_rate": 0.00015682864334818375, + "loss": 2.1111, + "step": 18504 + }, + { + "epoch": 2.1590246178975616, + "grad_norm": 0.994391918182373, + "learning_rate": 0.00015681353309330212, + "loss": 1.9377, + "step": 18505 + }, + { + "epoch": 2.1591412903978533, + "grad_norm": 1.0797638893127441, + "learning_rate": 0.0001567984227833357, + "loss": 1.8243, + "step": 18506 + }, + { + "epoch": 2.159257962898145, + "grad_norm": 1.332672357559204, + "learning_rate": 0.00015678331241844105, + "loss": 1.9568, + "step": 18507 + }, + { + "epoch": 2.1593746353984367, + "grad_norm": 1.0759207010269165, + "learning_rate": 0.00015676820199877483, + "loss": 1.8932, + "step": 18508 + }, + { + "epoch": 2.1594913078987283, + "grad_norm": 1.113834261894226, + "learning_rate": 0.0001567530915244938, + "loss": 1.8258, + "step": 18509 + }, + { + "epoch": 2.15960798039902, + "grad_norm": 1.1241503953933716, + "learning_rate": 0.00015673798099575452, + "loss": 2.0037, + "step": 18510 + }, + { + "epoch": 2.1597246528993117, + "grad_norm": 1.0794979333877563, + "learning_rate": 0.0001567228704127136, + "loss": 1.9696, + "step": 18511 + }, + { + "epoch": 2.1598413253996034, + "grad_norm": 1.1093101501464844, + "learning_rate": 0.00015670775977552775, + "loss": 1.7132, + "step": 18512 + }, + { + "epoch": 2.159957997899895, + "grad_norm": 1.0674057006835938, + "learning_rate": 0.00015669264908435362, + "loss": 1.9837, + "step": 18513 + }, + { + "epoch": 2.1600746704001867, + "grad_norm": 1.333493947982788, + "learning_rate": 0.00015667753833934782, + "loss": 2.0416, + "step": 18514 + }, + { + "epoch": 2.1601913429004784, + "grad_norm": 1.2031009197235107, + "learning_rate": 0.00015666242754066708, + "loss": 2.0845, + "step": 18515 + }, + { + "epoch": 2.16030801540077, + "grad_norm": 1.2096397876739502, + "learning_rate": 0.00015664731668846798, + "loss": 2.0548, + "step": 18516 + }, + { + "epoch": 2.160424687901062, + "grad_norm": 1.3315914869308472, + "learning_rate": 0.00015663220578290718, + "loss": 1.9641, + "step": 18517 + }, + { + "epoch": 2.1605413604013535, + "grad_norm": 1.0757431983947754, + "learning_rate": 0.00015661709482414139, + "loss": 1.7725, + "step": 18518 + }, + { + "epoch": 2.160658032901645, + "grad_norm": 1.1160948276519775, + "learning_rate": 0.00015660198381232718, + "loss": 1.9107, + "step": 18519 + }, + { + "epoch": 2.160774705401937, + "grad_norm": 1.2963178157806396, + "learning_rate": 0.00015658687274762125, + "loss": 2.1401, + "step": 18520 + }, + { + "epoch": 2.1608913779022285, + "grad_norm": 1.297079086303711, + "learning_rate": 0.00015657176163018027, + "loss": 2.0322, + "step": 18521 + }, + { + "epoch": 2.16100805040252, + "grad_norm": 1.1410865783691406, + "learning_rate": 0.00015655665046016088, + "loss": 2.0409, + "step": 18522 + }, + { + "epoch": 2.161124722902812, + "grad_norm": 1.2479685544967651, + "learning_rate": 0.0001565415392377197, + "loss": 1.9401, + "step": 18523 + }, + { + "epoch": 2.1612413954031036, + "grad_norm": 1.3244174718856812, + "learning_rate": 0.00015652642796301343, + "loss": 1.996, + "step": 18524 + }, + { + "epoch": 2.1613580679033952, + "grad_norm": 1.0735480785369873, + "learning_rate": 0.00015651131663619873, + "loss": 1.8799, + "step": 18525 + }, + { + "epoch": 2.161474740403687, + "grad_norm": 1.1569043397903442, + "learning_rate": 0.00015649620525743223, + "loss": 1.9406, + "step": 18526 + }, + { + "epoch": 2.1615914129039786, + "grad_norm": 1.1334075927734375, + "learning_rate": 0.00015648109382687065, + "loss": 1.8825, + "step": 18527 + }, + { + "epoch": 2.1617080854042703, + "grad_norm": 1.0521948337554932, + "learning_rate": 0.00015646598234467056, + "loss": 1.8286, + "step": 18528 + }, + { + "epoch": 2.161824757904562, + "grad_norm": 1.3283401727676392, + "learning_rate": 0.00015645087081098865, + "loss": 2.1075, + "step": 18529 + }, + { + "epoch": 2.1619414304048536, + "grad_norm": 0.9958030581474304, + "learning_rate": 0.0001564357592259816, + "loss": 1.7593, + "step": 18530 + }, + { + "epoch": 2.1620581029051453, + "grad_norm": 1.447295069694519, + "learning_rate": 0.00015642064758980602, + "loss": 2.1456, + "step": 18531 + }, + { + "epoch": 2.162174775405437, + "grad_norm": 1.1764804124832153, + "learning_rate": 0.00015640553590261867, + "loss": 2.0091, + "step": 18532 + }, + { + "epoch": 2.1622914479057287, + "grad_norm": 1.2112690210342407, + "learning_rate": 0.0001563904241645761, + "loss": 1.9486, + "step": 18533 + }, + { + "epoch": 2.1624081204060204, + "grad_norm": 1.1401264667510986, + "learning_rate": 0.00015637531237583504, + "loss": 2.0006, + "step": 18534 + }, + { + "epoch": 2.162524792906312, + "grad_norm": 1.1570836305618286, + "learning_rate": 0.00015636020053655218, + "loss": 1.9439, + "step": 18535 + }, + { + "epoch": 2.1626414654066037, + "grad_norm": 1.3954397439956665, + "learning_rate": 0.0001563450886468841, + "loss": 2.0957, + "step": 18536 + }, + { + "epoch": 2.1627581379068954, + "grad_norm": 1.1143593788146973, + "learning_rate": 0.0001563299767069875, + "loss": 1.8955, + "step": 18537 + }, + { + "epoch": 2.162874810407187, + "grad_norm": 1.2985702753067017, + "learning_rate": 0.00015631486471701905, + "loss": 1.9526, + "step": 18538 + }, + { + "epoch": 2.162991482907479, + "grad_norm": 1.3359328508377075, + "learning_rate": 0.0001562997526771354, + "loss": 1.9503, + "step": 18539 + }, + { + "epoch": 2.1631081554077705, + "grad_norm": 1.182724118232727, + "learning_rate": 0.00015628464058749326, + "loss": 2.1996, + "step": 18540 + }, + { + "epoch": 2.163224827908062, + "grad_norm": 1.2611445188522339, + "learning_rate": 0.00015626952844824922, + "loss": 2.0166, + "step": 18541 + }, + { + "epoch": 2.163341500408354, + "grad_norm": 1.1881825923919678, + "learning_rate": 0.00015625441625956, + "loss": 1.8224, + "step": 18542 + }, + { + "epoch": 2.1634581729086455, + "grad_norm": 1.2726564407348633, + "learning_rate": 0.00015623930402158223, + "loss": 1.9752, + "step": 18543 + }, + { + "epoch": 2.163574845408937, + "grad_norm": 1.2213237285614014, + "learning_rate": 0.00015622419173447265, + "loss": 1.9861, + "step": 18544 + }, + { + "epoch": 2.163691517909229, + "grad_norm": 1.2907543182373047, + "learning_rate": 0.00015620907939838786, + "loss": 2.0448, + "step": 18545 + }, + { + "epoch": 2.1638081904095205, + "grad_norm": 0.9107268452644348, + "learning_rate": 0.0001561939670134845, + "loss": 1.644, + "step": 18546 + }, + { + "epoch": 2.1639248629098122, + "grad_norm": 1.2706042528152466, + "learning_rate": 0.00015617885457991932, + "loss": 1.934, + "step": 18547 + }, + { + "epoch": 2.164041535410104, + "grad_norm": 1.3738447427749634, + "learning_rate": 0.00015616374209784895, + "loss": 2.0713, + "step": 18548 + }, + { + "epoch": 2.1641582079103956, + "grad_norm": 1.5784603357315063, + "learning_rate": 0.00015614862956743006, + "loss": 1.9905, + "step": 18549 + }, + { + "epoch": 2.1642748804106873, + "grad_norm": 1.1820210218429565, + "learning_rate": 0.00015613351698881933, + "loss": 1.9042, + "step": 18550 + }, + { + "epoch": 2.164391552910979, + "grad_norm": 1.1019996404647827, + "learning_rate": 0.00015611840436217342, + "loss": 1.8362, + "step": 18551 + }, + { + "epoch": 2.1645082254112706, + "grad_norm": 1.2097358703613281, + "learning_rate": 0.00015610329168764896, + "loss": 1.9918, + "step": 18552 + }, + { + "epoch": 2.1646248979115623, + "grad_norm": 1.2806864976882935, + "learning_rate": 0.00015608817896540277, + "loss": 1.9044, + "step": 18553 + }, + { + "epoch": 2.164741570411854, + "grad_norm": 1.1396100521087646, + "learning_rate": 0.0001560730661955913, + "loss": 1.8658, + "step": 18554 + }, + { + "epoch": 2.1648582429121457, + "grad_norm": 1.2322486639022827, + "learning_rate": 0.00015605795337837142, + "loss": 2.0011, + "step": 18555 + }, + { + "epoch": 2.1649749154124374, + "grad_norm": 1.019199252128601, + "learning_rate": 0.00015604284051389964, + "loss": 1.9167, + "step": 18556 + }, + { + "epoch": 2.165091587912729, + "grad_norm": 1.149644136428833, + "learning_rate": 0.00015602772760233276, + "loss": 2.0066, + "step": 18557 + }, + { + "epoch": 2.1652082604130207, + "grad_norm": 1.1883820295333862, + "learning_rate": 0.0001560126146438274, + "loss": 2.0056, + "step": 18558 + }, + { + "epoch": 2.1653249329133124, + "grad_norm": 1.1167725324630737, + "learning_rate": 0.00015599750163854028, + "loss": 2.2686, + "step": 18559 + }, + { + "epoch": 2.165441605413604, + "grad_norm": 1.1986626386642456, + "learning_rate": 0.00015598238858662797, + "loss": 1.8316, + "step": 18560 + }, + { + "epoch": 2.1655582779138958, + "grad_norm": 1.1360286474227905, + "learning_rate": 0.00015596727548824733, + "loss": 1.9568, + "step": 18561 + }, + { + "epoch": 2.1656749504141874, + "grad_norm": 1.272579312324524, + "learning_rate": 0.00015595216234355483, + "loss": 1.9766, + "step": 18562 + }, + { + "epoch": 2.165791622914479, + "grad_norm": 1.0698511600494385, + "learning_rate": 0.00015593704915270728, + "loss": 1.8499, + "step": 18563 + }, + { + "epoch": 2.165908295414771, + "grad_norm": 1.1369941234588623, + "learning_rate": 0.00015592193591586128, + "loss": 2.1045, + "step": 18564 + }, + { + "epoch": 2.1660249679150625, + "grad_norm": 1.0645018815994263, + "learning_rate": 0.00015590682263317353, + "loss": 2.0495, + "step": 18565 + }, + { + "epoch": 2.166141640415354, + "grad_norm": 1.1021924018859863, + "learning_rate": 0.00015589170930480073, + "loss": 1.9824, + "step": 18566 + }, + { + "epoch": 2.166258312915646, + "grad_norm": 1.2425241470336914, + "learning_rate": 0.00015587659593089957, + "loss": 1.9281, + "step": 18567 + }, + { + "epoch": 2.1663749854159375, + "grad_norm": 1.2216546535491943, + "learning_rate": 0.00015586148251162671, + "loss": 1.8537, + "step": 18568 + }, + { + "epoch": 2.166491657916229, + "grad_norm": 1.3359748125076294, + "learning_rate": 0.00015584636904713875, + "loss": 2.1374, + "step": 18569 + }, + { + "epoch": 2.166608330416521, + "grad_norm": 1.1241694688796997, + "learning_rate": 0.00015583125553759254, + "loss": 1.7488, + "step": 18570 + }, + { + "epoch": 2.1667250029168126, + "grad_norm": 1.178022027015686, + "learning_rate": 0.00015581614198314462, + "loss": 1.8382, + "step": 18571 + }, + { + "epoch": 2.1668416754171043, + "grad_norm": 1.3155126571655273, + "learning_rate": 0.00015580102838395172, + "loss": 2.064, + "step": 18572 + }, + { + "epoch": 2.166958347917396, + "grad_norm": 1.085903525352478, + "learning_rate": 0.0001557859147401705, + "loss": 2.104, + "step": 18573 + }, + { + "epoch": 2.1670750204176876, + "grad_norm": 1.122230887413025, + "learning_rate": 0.00015577080105195766, + "loss": 1.893, + "step": 18574 + }, + { + "epoch": 2.1671916929179793, + "grad_norm": 1.2476119995117188, + "learning_rate": 0.00015575568731946994, + "loss": 2.1328, + "step": 18575 + }, + { + "epoch": 2.167308365418271, + "grad_norm": 1.1129920482635498, + "learning_rate": 0.00015574057354286386, + "loss": 1.7886, + "step": 18576 + }, + { + "epoch": 2.1674250379185627, + "grad_norm": 1.1768189668655396, + "learning_rate": 0.0001557254597222963, + "loss": 1.8931, + "step": 18577 + }, + { + "epoch": 2.1675417104188544, + "grad_norm": 1.1117355823516846, + "learning_rate": 0.00015571034585792383, + "loss": 1.9644, + "step": 18578 + }, + { + "epoch": 2.167658382919146, + "grad_norm": 1.0454537868499756, + "learning_rate": 0.00015569523194990315, + "loss": 1.7745, + "step": 18579 + }, + { + "epoch": 2.1677750554194377, + "grad_norm": 1.2825963497161865, + "learning_rate": 0.0001556801179983909, + "loss": 1.8802, + "step": 18580 + }, + { + "epoch": 2.1678917279197294, + "grad_norm": 1.1564971208572388, + "learning_rate": 0.00015566500400354382, + "loss": 2.0615, + "step": 18581 + }, + { + "epoch": 2.168008400420021, + "grad_norm": 1.2649444341659546, + "learning_rate": 0.00015564988996551857, + "loss": 1.8814, + "step": 18582 + }, + { + "epoch": 2.1681250729203128, + "grad_norm": 1.0466822385787964, + "learning_rate": 0.00015563477588447189, + "loss": 2.0235, + "step": 18583 + }, + { + "epoch": 2.1682417454206044, + "grad_norm": 1.1101627349853516, + "learning_rate": 0.00015561966176056045, + "loss": 1.8678, + "step": 18584 + }, + { + "epoch": 2.168358417920896, + "grad_norm": 1.084433913230896, + "learning_rate": 0.0001556045475939409, + "loss": 1.8946, + "step": 18585 + }, + { + "epoch": 2.168475090421188, + "grad_norm": 1.0174165964126587, + "learning_rate": 0.0001555894333847699, + "loss": 1.6747, + "step": 18586 + }, + { + "epoch": 2.1685917629214795, + "grad_norm": 1.3748036623001099, + "learning_rate": 0.00015557431913320422, + "loss": 1.9571, + "step": 18587 + }, + { + "epoch": 2.168708435421771, + "grad_norm": 1.1694494485855103, + "learning_rate": 0.00015555920483940053, + "loss": 2.0425, + "step": 18588 + }, + { + "epoch": 2.168825107922063, + "grad_norm": 1.278826355934143, + "learning_rate": 0.00015554409050351546, + "loss": 2.0681, + "step": 18589 + }, + { + "epoch": 2.1689417804223545, + "grad_norm": 1.1164133548736572, + "learning_rate": 0.00015552897612570573, + "loss": 1.9117, + "step": 18590 + }, + { + "epoch": 2.169058452922646, + "grad_norm": 1.260284423828125, + "learning_rate": 0.00015551386170612803, + "loss": 2.0295, + "step": 18591 + }, + { + "epoch": 2.169175125422938, + "grad_norm": 1.053409218788147, + "learning_rate": 0.00015549874724493906, + "loss": 1.9602, + "step": 18592 + }, + { + "epoch": 2.1692917979232296, + "grad_norm": 1.1678996086120605, + "learning_rate": 0.0001554836327422955, + "loss": 1.9251, + "step": 18593 + }, + { + "epoch": 2.1694084704235213, + "grad_norm": 1.158403754234314, + "learning_rate": 0.00015546851819835404, + "loss": 1.8449, + "step": 18594 + }, + { + "epoch": 2.169525142923813, + "grad_norm": 1.1371644735336304, + "learning_rate": 0.00015545340361327142, + "loss": 2.0523, + "step": 18595 + }, + { + "epoch": 2.1696418154241046, + "grad_norm": 1.2497985363006592, + "learning_rate": 0.00015543828898720421, + "loss": 2.0961, + "step": 18596 + }, + { + "epoch": 2.1697584879243963, + "grad_norm": 1.0621521472930908, + "learning_rate": 0.00015542317432030918, + "loss": 1.9801, + "step": 18597 + }, + { + "epoch": 2.169875160424688, + "grad_norm": 1.334043025970459, + "learning_rate": 0.00015540805961274307, + "loss": 2.0191, + "step": 18598 + }, + { + "epoch": 2.1699918329249797, + "grad_norm": 1.0295624732971191, + "learning_rate": 0.00015539294486466252, + "loss": 1.9155, + "step": 18599 + }, + { + "epoch": 2.1701085054252713, + "grad_norm": 1.1944469213485718, + "learning_rate": 0.0001553778300762242, + "loss": 2.0323, + "step": 18600 + }, + { + "epoch": 2.170225177925563, + "grad_norm": 1.1764957904815674, + "learning_rate": 0.00015536271524758483, + "loss": 1.9081, + "step": 18601 + }, + { + "epoch": 2.1703418504258547, + "grad_norm": 1.3053900003433228, + "learning_rate": 0.00015534760037890115, + "loss": 2.0246, + "step": 18602 + }, + { + "epoch": 2.1704585229261464, + "grad_norm": 1.2124712467193604, + "learning_rate": 0.0001553324854703297, + "loss": 1.9684, + "step": 18603 + }, + { + "epoch": 2.170575195426438, + "grad_norm": 1.1964832544326782, + "learning_rate": 0.0001553173705220274, + "loss": 1.933, + "step": 18604 + }, + { + "epoch": 2.1706918679267297, + "grad_norm": 1.1165145635604858, + "learning_rate": 0.0001553022555341508, + "loss": 1.9366, + "step": 18605 + }, + { + "epoch": 2.1708085404270214, + "grad_norm": 1.0630534887313843, + "learning_rate": 0.00015528714050685658, + "loss": 1.903, + "step": 18606 + }, + { + "epoch": 2.170925212927313, + "grad_norm": 1.1967848539352417, + "learning_rate": 0.00015527202544030148, + "loss": 2.0839, + "step": 18607 + }, + { + "epoch": 2.171041885427605, + "grad_norm": 1.055337905883789, + "learning_rate": 0.00015525691033464216, + "loss": 1.8061, + "step": 18608 + }, + { + "epoch": 2.1711585579278965, + "grad_norm": 1.0549163818359375, + "learning_rate": 0.00015524179519003537, + "loss": 1.9857, + "step": 18609 + }, + { + "epoch": 2.171275230428188, + "grad_norm": 1.064870834350586, + "learning_rate": 0.0001552266800066378, + "loss": 2.0079, + "step": 18610 + }, + { + "epoch": 2.17139190292848, + "grad_norm": 1.1853563785552979, + "learning_rate": 0.00015521156478460617, + "loss": 2.1159, + "step": 18611 + }, + { + "epoch": 2.1715085754287715, + "grad_norm": 1.200487732887268, + "learning_rate": 0.00015519644952409717, + "loss": 2.0002, + "step": 18612 + }, + { + "epoch": 2.171625247929063, + "grad_norm": 1.2906816005706787, + "learning_rate": 0.0001551813342252674, + "loss": 1.9707, + "step": 18613 + }, + { + "epoch": 2.171741920429355, + "grad_norm": 1.1343942880630493, + "learning_rate": 0.00015516621888827367, + "loss": 2.035, + "step": 18614 + }, + { + "epoch": 2.1718585929296466, + "grad_norm": 1.2510370016098022, + "learning_rate": 0.00015515110351327257, + "loss": 2.0899, + "step": 18615 + }, + { + "epoch": 2.1719752654299382, + "grad_norm": 1.2946009635925293, + "learning_rate": 0.00015513598810042094, + "loss": 2.0835, + "step": 18616 + }, + { + "epoch": 2.17209193793023, + "grad_norm": 1.1262327432632446, + "learning_rate": 0.00015512087264987534, + "loss": 1.971, + "step": 18617 + }, + { + "epoch": 2.1722086104305216, + "grad_norm": 1.2681190967559814, + "learning_rate": 0.00015510575716179263, + "loss": 2.1489, + "step": 18618 + }, + { + "epoch": 2.1723252829308133, + "grad_norm": 1.1480096578598022, + "learning_rate": 0.00015509064163632934, + "loss": 1.9919, + "step": 18619 + }, + { + "epoch": 2.172441955431105, + "grad_norm": 1.0859229564666748, + "learning_rate": 0.00015507552607364226, + "loss": 1.9707, + "step": 18620 + }, + { + "epoch": 2.1725586279313966, + "grad_norm": 1.2851039171218872, + "learning_rate": 0.00015506041047388814, + "loss": 2.0183, + "step": 18621 + }, + { + "epoch": 2.1726753004316883, + "grad_norm": 1.064772367477417, + "learning_rate": 0.00015504529483722357, + "loss": 2.0523, + "step": 18622 + }, + { + "epoch": 2.17279197293198, + "grad_norm": 1.137850284576416, + "learning_rate": 0.00015503017916380533, + "loss": 1.9675, + "step": 18623 + }, + { + "epoch": 2.1729086454322717, + "grad_norm": 1.3669822216033936, + "learning_rate": 0.0001550150634537901, + "loss": 2.0111, + "step": 18624 + }, + { + "epoch": 2.1730253179325634, + "grad_norm": 1.0778207778930664, + "learning_rate": 0.00015499994770733455, + "loss": 2.006, + "step": 18625 + }, + { + "epoch": 2.173141990432855, + "grad_norm": 1.0994187593460083, + "learning_rate": 0.00015498483192459542, + "loss": 1.9691, + "step": 18626 + }, + { + "epoch": 2.1732586629331467, + "grad_norm": 1.3178157806396484, + "learning_rate": 0.0001549697161057294, + "loss": 2.0919, + "step": 18627 + }, + { + "epoch": 2.1733753354334384, + "grad_norm": 1.1478188037872314, + "learning_rate": 0.00015495460025089321, + "loss": 2.0755, + "step": 18628 + }, + { + "epoch": 2.17349200793373, + "grad_norm": 0.9683903455734253, + "learning_rate": 0.00015493948436024358, + "loss": 1.9731, + "step": 18629 + }, + { + "epoch": 2.173608680434022, + "grad_norm": 1.0758440494537354, + "learning_rate": 0.0001549243684339372, + "loss": 1.8734, + "step": 18630 + }, + { + "epoch": 2.1737253529343135, + "grad_norm": 1.116462230682373, + "learning_rate": 0.00015490925247213072, + "loss": 1.9604, + "step": 18631 + }, + { + "epoch": 2.173842025434605, + "grad_norm": 1.1667011976242065, + "learning_rate": 0.00015489413647498085, + "loss": 2.0287, + "step": 18632 + }, + { + "epoch": 2.173958697934897, + "grad_norm": 1.2122869491577148, + "learning_rate": 0.00015487902044264434, + "loss": 2.0824, + "step": 18633 + }, + { + "epoch": 2.1740753704351885, + "grad_norm": 1.138037919998169, + "learning_rate": 0.00015486390437527794, + "loss": 1.9023, + "step": 18634 + }, + { + "epoch": 2.17419204293548, + "grad_norm": 1.1316055059432983, + "learning_rate": 0.00015484878827303825, + "loss": 1.865, + "step": 18635 + }, + { + "epoch": 2.174308715435772, + "grad_norm": 1.1600598096847534, + "learning_rate": 0.00015483367213608203, + "loss": 1.9419, + "step": 18636 + }, + { + "epoch": 2.1744253879360635, + "grad_norm": 1.3625761270523071, + "learning_rate": 0.00015481855596456595, + "loss": 2.0275, + "step": 18637 + }, + { + "epoch": 2.1745420604363552, + "grad_norm": 1.2072219848632812, + "learning_rate": 0.00015480343975864688, + "loss": 2.0432, + "step": 18638 + }, + { + "epoch": 2.174658732936647, + "grad_norm": 1.2427481412887573, + "learning_rate": 0.00015478832351848131, + "loss": 1.896, + "step": 18639 + }, + { + "epoch": 2.1747754054369386, + "grad_norm": 1.0190215110778809, + "learning_rate": 0.00015477320724422602, + "loss": 1.9431, + "step": 18640 + }, + { + "epoch": 2.1748920779372303, + "grad_norm": 1.205655813217163, + "learning_rate": 0.0001547580909360378, + "loss": 2.1035, + "step": 18641 + }, + { + "epoch": 2.175008750437522, + "grad_norm": 1.2561159133911133, + "learning_rate": 0.00015474297459407325, + "loss": 2.0177, + "step": 18642 + }, + { + "epoch": 2.1751254229378136, + "grad_norm": 1.3135262727737427, + "learning_rate": 0.00015472785821848912, + "loss": 2.2249, + "step": 18643 + }, + { + "epoch": 2.1752420954381053, + "grad_norm": 1.2133564949035645, + "learning_rate": 0.00015471274180944215, + "loss": 2.072, + "step": 18644 + }, + { + "epoch": 2.175358767938397, + "grad_norm": 1.3257906436920166, + "learning_rate": 0.00015469762536708906, + "loss": 2.0048, + "step": 18645 + }, + { + "epoch": 2.1754754404386887, + "grad_norm": 1.2273846864700317, + "learning_rate": 0.00015468250889158648, + "loss": 2.0416, + "step": 18646 + }, + { + "epoch": 2.1755921129389804, + "grad_norm": 1.1868362426757812, + "learning_rate": 0.0001546673923830912, + "loss": 2.0022, + "step": 18647 + }, + { + "epoch": 2.175708785439272, + "grad_norm": 1.126537799835205, + "learning_rate": 0.00015465227584175986, + "loss": 2.1509, + "step": 18648 + }, + { + "epoch": 2.1758254579395637, + "grad_norm": 1.062191367149353, + "learning_rate": 0.00015463715926774925, + "loss": 2.0163, + "step": 18649 + }, + { + "epoch": 2.1759421304398554, + "grad_norm": 1.2622065544128418, + "learning_rate": 0.00015462204266121602, + "loss": 1.8775, + "step": 18650 + }, + { + "epoch": 2.176058802940147, + "grad_norm": 1.2402675151824951, + "learning_rate": 0.0001546069260223169, + "loss": 2.1048, + "step": 18651 + }, + { + "epoch": 2.1761754754404388, + "grad_norm": 1.1483633518218994, + "learning_rate": 0.00015459180935120865, + "loss": 1.9069, + "step": 18652 + }, + { + "epoch": 2.1762921479407304, + "grad_norm": 1.2179807424545288, + "learning_rate": 0.00015457669264804793, + "loss": 2.0517, + "step": 18653 + }, + { + "epoch": 2.176408820441022, + "grad_norm": 1.296028733253479, + "learning_rate": 0.00015456157591299142, + "loss": 1.9735, + "step": 18654 + }, + { + "epoch": 2.176525492941314, + "grad_norm": 1.092777132987976, + "learning_rate": 0.00015454645914619595, + "loss": 1.9866, + "step": 18655 + }, + { + "epoch": 2.1766421654416055, + "grad_norm": 1.125862717628479, + "learning_rate": 0.00015453134234781813, + "loss": 1.8555, + "step": 18656 + }, + { + "epoch": 2.176758837941897, + "grad_norm": 1.2046765089035034, + "learning_rate": 0.00015451622551801469, + "loss": 1.8835, + "step": 18657 + }, + { + "epoch": 2.176875510442189, + "grad_norm": 1.0867184400558472, + "learning_rate": 0.00015450110865694236, + "loss": 1.9243, + "step": 18658 + }, + { + "epoch": 2.1769921829424805, + "grad_norm": 1.2349138259887695, + "learning_rate": 0.00015448599176475788, + "loss": 1.9166, + "step": 18659 + }, + { + "epoch": 2.177108855442772, + "grad_norm": 1.1070706844329834, + "learning_rate": 0.0001544708748416179, + "loss": 1.888, + "step": 18660 + }, + { + "epoch": 2.177225527943064, + "grad_norm": 1.0777006149291992, + "learning_rate": 0.00015445575788767917, + "loss": 1.8379, + "step": 18661 + }, + { + "epoch": 2.1773422004433556, + "grad_norm": 1.1740950345993042, + "learning_rate": 0.00015444064090309846, + "loss": 1.8988, + "step": 18662 + }, + { + "epoch": 2.1774588729436473, + "grad_norm": 1.0850402116775513, + "learning_rate": 0.00015442552388803246, + "loss": 1.8477, + "step": 18663 + }, + { + "epoch": 2.177575545443939, + "grad_norm": 1.1368635892868042, + "learning_rate": 0.00015441040684263782, + "loss": 1.9787, + "step": 18664 + }, + { + "epoch": 2.1776922179442306, + "grad_norm": 1.1812516450881958, + "learning_rate": 0.0001543952897670713, + "loss": 2.0798, + "step": 18665 + }, + { + "epoch": 2.1778088904445223, + "grad_norm": 1.27645742893219, + "learning_rate": 0.00015438017266148965, + "loss": 2.0252, + "step": 18666 + }, + { + "epoch": 2.177925562944814, + "grad_norm": 1.2335222959518433, + "learning_rate": 0.0001543650555260495, + "loss": 1.928, + "step": 18667 + }, + { + "epoch": 2.1780422354451057, + "grad_norm": 1.134809136390686, + "learning_rate": 0.0001543499383609077, + "loss": 1.8192, + "step": 18668 + }, + { + "epoch": 2.1781589079453973, + "grad_norm": 1.2731616497039795, + "learning_rate": 0.00015433482116622083, + "loss": 1.9941, + "step": 18669 + }, + { + "epoch": 2.178275580445689, + "grad_norm": 1.2060942649841309, + "learning_rate": 0.00015431970394214563, + "loss": 2.0302, + "step": 18670 + }, + { + "epoch": 2.1783922529459807, + "grad_norm": 1.252256155014038, + "learning_rate": 0.00015430458668883894, + "loss": 1.8514, + "step": 18671 + }, + { + "epoch": 2.1785089254462724, + "grad_norm": 1.0572160482406616, + "learning_rate": 0.00015428946940645742, + "loss": 1.9367, + "step": 18672 + }, + { + "epoch": 2.178625597946564, + "grad_norm": 1.261633038520813, + "learning_rate": 0.0001542743520951577, + "loss": 1.8345, + "step": 18673 + }, + { + "epoch": 2.1787422704468558, + "grad_norm": 1.2808846235275269, + "learning_rate": 0.00015425923475509656, + "loss": 2.1221, + "step": 18674 + }, + { + "epoch": 2.1788589429471474, + "grad_norm": 1.1545790433883667, + "learning_rate": 0.00015424411738643076, + "loss": 1.9136, + "step": 18675 + }, + { + "epoch": 2.178975615447439, + "grad_norm": 1.3188426494598389, + "learning_rate": 0.00015422899998931698, + "loss": 2.0865, + "step": 18676 + }, + { + "epoch": 2.179092287947731, + "grad_norm": 0.9523217082023621, + "learning_rate": 0.000154213882563912, + "loss": 1.91, + "step": 18677 + }, + { + "epoch": 2.1792089604480225, + "grad_norm": 1.2601526975631714, + "learning_rate": 0.00015419876511037244, + "loss": 2.0294, + "step": 18678 + }, + { + "epoch": 2.179325632948314, + "grad_norm": 1.1249454021453857, + "learning_rate": 0.00015418364762885507, + "loss": 1.8185, + "step": 18679 + }, + { + "epoch": 2.179442305448606, + "grad_norm": 1.062299132347107, + "learning_rate": 0.0001541685301195166, + "loss": 1.8562, + "step": 18680 + }, + { + "epoch": 2.1795589779488975, + "grad_norm": 1.1375361680984497, + "learning_rate": 0.00015415341258251378, + "loss": 1.9422, + "step": 18681 + }, + { + "epoch": 2.179675650449189, + "grad_norm": 1.0786824226379395, + "learning_rate": 0.0001541382950180033, + "loss": 1.997, + "step": 18682 + }, + { + "epoch": 2.179792322949481, + "grad_norm": 1.187733769416809, + "learning_rate": 0.00015412317742614194, + "loss": 2.0569, + "step": 18683 + }, + { + "epoch": 2.1799089954497726, + "grad_norm": 1.2161270380020142, + "learning_rate": 0.0001541080598070863, + "loss": 2.0029, + "step": 18684 + }, + { + "epoch": 2.1800256679500642, + "grad_norm": 1.3419413566589355, + "learning_rate": 0.00015409294216099323, + "loss": 2.0245, + "step": 18685 + }, + { + "epoch": 2.180142340450356, + "grad_norm": 1.2058100700378418, + "learning_rate": 0.0001540778244880194, + "loss": 2.046, + "step": 18686 + }, + { + "epoch": 2.1802590129506476, + "grad_norm": 1.2876458168029785, + "learning_rate": 0.00015406270678832148, + "loss": 2.1016, + "step": 18687 + }, + { + "epoch": 2.1803756854509393, + "grad_norm": 1.5027867555618286, + "learning_rate": 0.00015404758906205633, + "loss": 2.0532, + "step": 18688 + }, + { + "epoch": 2.180492357951231, + "grad_norm": 1.241433024406433, + "learning_rate": 0.00015403247130938057, + "loss": 1.9956, + "step": 18689 + }, + { + "epoch": 2.1806090304515227, + "grad_norm": 1.310092568397522, + "learning_rate": 0.00015401735353045098, + "loss": 2.0355, + "step": 18690 + }, + { + "epoch": 2.1807257029518143, + "grad_norm": 1.1321290731430054, + "learning_rate": 0.0001540022357254242, + "loss": 1.8967, + "step": 18691 + }, + { + "epoch": 2.180842375452106, + "grad_norm": 1.213017463684082, + "learning_rate": 0.00015398711789445702, + "loss": 1.9455, + "step": 18692 + }, + { + "epoch": 2.1809590479523977, + "grad_norm": 1.1863784790039062, + "learning_rate": 0.00015397200003770616, + "loss": 1.8447, + "step": 18693 + }, + { + "epoch": 2.1810757204526894, + "grad_norm": 1.8204340934753418, + "learning_rate": 0.00015395688215532837, + "loss": 1.9781, + "step": 18694 + }, + { + "epoch": 2.181192392952981, + "grad_norm": 1.3750981092453003, + "learning_rate": 0.00015394176424748032, + "loss": 2.077, + "step": 18695 + }, + { + "epoch": 2.1813090654532727, + "grad_norm": 0.9133469462394714, + "learning_rate": 0.00015392664631431874, + "loss": 1.6384, + "step": 18696 + }, + { + "epoch": 2.1814257379535644, + "grad_norm": 1.0703539848327637, + "learning_rate": 0.00015391152835600037, + "loss": 1.9146, + "step": 18697 + }, + { + "epoch": 2.181542410453856, + "grad_norm": 1.1859630346298218, + "learning_rate": 0.000153896410372682, + "loss": 1.7491, + "step": 18698 + }, + { + "epoch": 2.181659082954148, + "grad_norm": 1.1892708539962769, + "learning_rate": 0.0001538812923645203, + "loss": 1.9476, + "step": 18699 + }, + { + "epoch": 2.1817757554544395, + "grad_norm": 1.142240285873413, + "learning_rate": 0.0001538661743316719, + "loss": 1.9088, + "step": 18700 + }, + { + "epoch": 2.181892427954731, + "grad_norm": 1.1437938213348389, + "learning_rate": 0.0001538510562742937, + "loss": 1.7204, + "step": 18701 + }, + { + "epoch": 2.182009100455023, + "grad_norm": 1.1685644388198853, + "learning_rate": 0.00015383593819254232, + "loss": 2.0123, + "step": 18702 + }, + { + "epoch": 2.1821257729553145, + "grad_norm": 1.1860082149505615, + "learning_rate": 0.0001538208200865745, + "loss": 1.8409, + "step": 18703 + }, + { + "epoch": 2.182242445455606, + "grad_norm": 1.103902816772461, + "learning_rate": 0.00015380570195654706, + "loss": 2.0132, + "step": 18704 + }, + { + "epoch": 2.182359117955898, + "grad_norm": 1.451709270477295, + "learning_rate": 0.00015379058380261658, + "loss": 2.0055, + "step": 18705 + }, + { + "epoch": 2.1824757904561896, + "grad_norm": 1.2842849493026733, + "learning_rate": 0.00015377546562493987, + "loss": 2.0886, + "step": 18706 + }, + { + "epoch": 2.1825924629564812, + "grad_norm": 1.2582834959030151, + "learning_rate": 0.0001537603474236737, + "loss": 1.9693, + "step": 18707 + }, + { + "epoch": 2.182709135456773, + "grad_norm": 1.1581714153289795, + "learning_rate": 0.00015374522919897474, + "loss": 1.9239, + "step": 18708 + }, + { + "epoch": 2.1828258079570646, + "grad_norm": 1.3612136840820312, + "learning_rate": 0.00015373011095099967, + "loss": 1.8415, + "step": 18709 + }, + { + "epoch": 2.1829424804573563, + "grad_norm": 1.125762939453125, + "learning_rate": 0.00015371499267990534, + "loss": 1.9208, + "step": 18710 + }, + { + "epoch": 2.183059152957648, + "grad_norm": 1.2237706184387207, + "learning_rate": 0.0001536998743858484, + "loss": 1.9456, + "step": 18711 + }, + { + "epoch": 2.1831758254579396, + "grad_norm": 1.2189782857894897, + "learning_rate": 0.00015368475606898556, + "loss": 1.8961, + "step": 18712 + }, + { + "epoch": 2.1832924979582313, + "grad_norm": 0.9694661498069763, + "learning_rate": 0.00015366963772947358, + "loss": 1.868, + "step": 18713 + }, + { + "epoch": 2.183409170458523, + "grad_norm": 1.2146389484405518, + "learning_rate": 0.0001536545193674692, + "loss": 1.9461, + "step": 18714 + }, + { + "epoch": 2.1835258429588147, + "grad_norm": 1.4997992515563965, + "learning_rate": 0.00015363940098312923, + "loss": 2.0785, + "step": 18715 + }, + { + "epoch": 2.1836425154591064, + "grad_norm": 1.1777063608169556, + "learning_rate": 0.00015362428257661027, + "loss": 1.9637, + "step": 18716 + }, + { + "epoch": 2.183759187959398, + "grad_norm": 1.0201548337936401, + "learning_rate": 0.00015360916414806905, + "loss": 2.0031, + "step": 18717 + }, + { + "epoch": 2.1838758604596897, + "grad_norm": 1.0919480323791504, + "learning_rate": 0.0001535940456976624, + "loss": 1.9223, + "step": 18718 + }, + { + "epoch": 2.1839925329599814, + "grad_norm": 1.2865476608276367, + "learning_rate": 0.000153578927225547, + "loss": 1.9105, + "step": 18719 + }, + { + "epoch": 2.184109205460273, + "grad_norm": 1.0448106527328491, + "learning_rate": 0.00015356380873187957, + "loss": 1.9362, + "step": 18720 + }, + { + "epoch": 2.1842258779605648, + "grad_norm": 1.2225639820098877, + "learning_rate": 0.00015354869021681685, + "loss": 1.9714, + "step": 18721 + }, + { + "epoch": 2.1843425504608565, + "grad_norm": 1.273737907409668, + "learning_rate": 0.00015353357168051558, + "loss": 2.1554, + "step": 18722 + }, + { + "epoch": 2.184459222961148, + "grad_norm": 1.30903959274292, + "learning_rate": 0.00015351845312313247, + "loss": 2.0988, + "step": 18723 + }, + { + "epoch": 2.18457589546144, + "grad_norm": 1.1068129539489746, + "learning_rate": 0.0001535033345448243, + "loss": 1.9141, + "step": 18724 + }, + { + "epoch": 2.1846925679617315, + "grad_norm": 1.0672765970230103, + "learning_rate": 0.0001534882159457478, + "loss": 1.8045, + "step": 18725 + }, + { + "epoch": 2.184809240462023, + "grad_norm": 1.3128046989440918, + "learning_rate": 0.0001534730973260596, + "loss": 2.1223, + "step": 18726 + }, + { + "epoch": 2.184925912962315, + "grad_norm": 1.2799875736236572, + "learning_rate": 0.00015345797868591656, + "loss": 2.072, + "step": 18727 + }, + { + "epoch": 2.1850425854626065, + "grad_norm": 1.1781351566314697, + "learning_rate": 0.00015344286002547537, + "loss": 1.8509, + "step": 18728 + }, + { + "epoch": 2.1851592579628982, + "grad_norm": 1.1616270542144775, + "learning_rate": 0.00015342774134489273, + "loss": 1.9116, + "step": 18729 + }, + { + "epoch": 2.18527593046319, + "grad_norm": 1.2490386962890625, + "learning_rate": 0.00015341262264432538, + "loss": 1.9675, + "step": 18730 + }, + { + "epoch": 2.1853926029634816, + "grad_norm": 1.0088590383529663, + "learning_rate": 0.00015339750392393014, + "loss": 1.9901, + "step": 18731 + }, + { + "epoch": 2.1855092754637733, + "grad_norm": 1.3504782915115356, + "learning_rate": 0.00015338238518386368, + "loss": 2.1061, + "step": 18732 + }, + { + "epoch": 2.185625947964065, + "grad_norm": 1.1629269123077393, + "learning_rate": 0.00015336726642428266, + "loss": 2.0348, + "step": 18733 + }, + { + "epoch": 2.1857426204643566, + "grad_norm": 1.294732689857483, + "learning_rate": 0.00015335214764534394, + "loss": 1.9877, + "step": 18734 + }, + { + "epoch": 2.1858592929646483, + "grad_norm": 1.2324930429458618, + "learning_rate": 0.0001533370288472042, + "loss": 1.9789, + "step": 18735 + }, + { + "epoch": 2.18597596546494, + "grad_norm": 1.1609193086624146, + "learning_rate": 0.00015332191003002016, + "loss": 2.0318, + "step": 18736 + }, + { + "epoch": 2.1860926379652317, + "grad_norm": 1.067777156829834, + "learning_rate": 0.00015330679119394855, + "loss": 1.9813, + "step": 18737 + }, + { + "epoch": 2.1862093104655234, + "grad_norm": 1.0480146408081055, + "learning_rate": 0.0001532916723391462, + "loss": 2.0372, + "step": 18738 + }, + { + "epoch": 2.186325982965815, + "grad_norm": 1.2189991474151611, + "learning_rate": 0.00015327655346576968, + "loss": 2.0189, + "step": 18739 + }, + { + "epoch": 2.1864426554661067, + "grad_norm": 3.283257007598877, + "learning_rate": 0.00015326143457397587, + "loss": 1.8963, + "step": 18740 + }, + { + "epoch": 2.1865593279663984, + "grad_norm": 1.0910969972610474, + "learning_rate": 0.0001532463156639215, + "loss": 1.8933, + "step": 18741 + }, + { + "epoch": 2.18667600046669, + "grad_norm": 1.2483254671096802, + "learning_rate": 0.0001532311967357632, + "loss": 1.9688, + "step": 18742 + }, + { + "epoch": 2.1867926729669818, + "grad_norm": 1.0981128215789795, + "learning_rate": 0.0001532160777896578, + "loss": 1.9935, + "step": 18743 + }, + { + "epoch": 2.1869093454672734, + "grad_norm": 1.3294366598129272, + "learning_rate": 0.000153200958825762, + "loss": 1.9595, + "step": 18744 + }, + { + "epoch": 2.187026017967565, + "grad_norm": 1.3294647932052612, + "learning_rate": 0.0001531858398442325, + "loss": 2.1125, + "step": 18745 + }, + { + "epoch": 2.187142690467857, + "grad_norm": 1.4875391721725464, + "learning_rate": 0.0001531707208452261, + "loss": 2.0161, + "step": 18746 + }, + { + "epoch": 2.1872593629681485, + "grad_norm": 1.3852442502975464, + "learning_rate": 0.00015315560182889952, + "loss": 1.9966, + "step": 18747 + }, + { + "epoch": 2.18737603546844, + "grad_norm": 1.1665149927139282, + "learning_rate": 0.0001531404827954095, + "loss": 1.8164, + "step": 18748 + }, + { + "epoch": 2.187492707968732, + "grad_norm": 1.3960891962051392, + "learning_rate": 0.00015312536374491274, + "loss": 2.0255, + "step": 18749 + }, + { + "epoch": 2.1876093804690235, + "grad_norm": 1.0452382564544678, + "learning_rate": 0.00015311024467756605, + "loss": 1.9587, + "step": 18750 + }, + { + "epoch": 2.187726052969315, + "grad_norm": 1.0991259813308716, + "learning_rate": 0.0001530951255935261, + "loss": 2.1266, + "step": 18751 + }, + { + "epoch": 2.187842725469607, + "grad_norm": 1.2493678331375122, + "learning_rate": 0.00015308000649294962, + "loss": 1.8113, + "step": 18752 + }, + { + "epoch": 2.1879593979698986, + "grad_norm": 1.2489187717437744, + "learning_rate": 0.0001530648873759934, + "loss": 1.9941, + "step": 18753 + }, + { + "epoch": 2.1880760704701903, + "grad_norm": 1.2277237176895142, + "learning_rate": 0.00015304976824281416, + "loss": 1.9997, + "step": 18754 + }, + { + "epoch": 2.188192742970482, + "grad_norm": 1.213281512260437, + "learning_rate": 0.00015303464909356872, + "loss": 1.8469, + "step": 18755 + }, + { + "epoch": 2.1883094154707736, + "grad_norm": 1.027925968170166, + "learning_rate": 0.00015301952992841363, + "loss": 1.9226, + "step": 18756 + }, + { + "epoch": 2.1884260879710653, + "grad_norm": 1.141583800315857, + "learning_rate": 0.0001530044107475058, + "loss": 2.0405, + "step": 18757 + }, + { + "epoch": 2.188542760471357, + "grad_norm": 1.2139861583709717, + "learning_rate": 0.0001529892915510019, + "loss": 1.9232, + "step": 18758 + }, + { + "epoch": 2.1886594329716487, + "grad_norm": 1.2433463335037231, + "learning_rate": 0.00015297417233905864, + "loss": 2.1594, + "step": 18759 + }, + { + "epoch": 2.1887761054719403, + "grad_norm": 1.1745342016220093, + "learning_rate": 0.00015295905311183276, + "loss": 1.9277, + "step": 18760 + }, + { + "epoch": 2.188892777972232, + "grad_norm": 1.3754523992538452, + "learning_rate": 0.0001529439338694811, + "loss": 1.9531, + "step": 18761 + }, + { + "epoch": 2.1890094504725237, + "grad_norm": 1.1762157678604126, + "learning_rate": 0.00015292881461216032, + "loss": 1.9693, + "step": 18762 + }, + { + "epoch": 2.1891261229728154, + "grad_norm": 0.9922764301300049, + "learning_rate": 0.00015291369534002714, + "loss": 1.8855, + "step": 18763 + }, + { + "epoch": 2.189242795473107, + "grad_norm": 1.3444561958312988, + "learning_rate": 0.00015289857605323834, + "loss": 2.0496, + "step": 18764 + }, + { + "epoch": 2.1893594679733988, + "grad_norm": 1.216315746307373, + "learning_rate": 0.0001528834567519506, + "loss": 2.0057, + "step": 18765 + }, + { + "epoch": 2.1894761404736904, + "grad_norm": 1.1661293506622314, + "learning_rate": 0.00015286833743632082, + "loss": 1.8675, + "step": 18766 + }, + { + "epoch": 2.189592812973982, + "grad_norm": 1.2589390277862549, + "learning_rate": 0.0001528532181065056, + "loss": 1.9473, + "step": 18767 + }, + { + "epoch": 2.189709485474274, + "grad_norm": 1.227932095527649, + "learning_rate": 0.00015283809876266168, + "loss": 2.0714, + "step": 18768 + }, + { + "epoch": 2.1898261579745655, + "grad_norm": 1.1391873359680176, + "learning_rate": 0.00015282297940494584, + "loss": 2.0888, + "step": 18769 + }, + { + "epoch": 2.189942830474857, + "grad_norm": 1.2997971773147583, + "learning_rate": 0.00015280786003351478, + "loss": 1.9036, + "step": 18770 + }, + { + "epoch": 2.190059502975149, + "grad_norm": 1.0974445343017578, + "learning_rate": 0.00015279274064852535, + "loss": 1.6911, + "step": 18771 + }, + { + "epoch": 2.1901761754754405, + "grad_norm": 1.4312574863433838, + "learning_rate": 0.00015277762125013412, + "loss": 2.1498, + "step": 18772 + }, + { + "epoch": 2.190292847975732, + "grad_norm": 1.20133638381958, + "learning_rate": 0.00015276250183849797, + "loss": 2.0358, + "step": 18773 + }, + { + "epoch": 2.190409520476024, + "grad_norm": 1.185280203819275, + "learning_rate": 0.00015274738241377364, + "loss": 1.9318, + "step": 18774 + }, + { + "epoch": 2.1905261929763156, + "grad_norm": 1.2516419887542725, + "learning_rate": 0.0001527322629761178, + "loss": 2.0473, + "step": 18775 + }, + { + "epoch": 2.1906428654766072, + "grad_norm": 1.1662395000457764, + "learning_rate": 0.00015271714352568726, + "loss": 2.1132, + "step": 18776 + }, + { + "epoch": 2.190759537976899, + "grad_norm": 1.2409838438034058, + "learning_rate": 0.00015270202406263868, + "loss": 1.9132, + "step": 18777 + }, + { + "epoch": 2.1908762104771906, + "grad_norm": 1.1655895709991455, + "learning_rate": 0.00015268690458712886, + "loss": 2.0086, + "step": 18778 + }, + { + "epoch": 2.1909928829774823, + "grad_norm": 1.0971262454986572, + "learning_rate": 0.0001526717850993145, + "loss": 2.0031, + "step": 18779 + }, + { + "epoch": 2.191109555477774, + "grad_norm": 1.2288122177124023, + "learning_rate": 0.00015265666559935236, + "loss": 1.8649, + "step": 18780 + }, + { + "epoch": 2.1912262279780657, + "grad_norm": 0.9889227747917175, + "learning_rate": 0.00015264154608739917, + "loss": 2.0853, + "step": 18781 + }, + { + "epoch": 2.1913429004783573, + "grad_norm": 1.0752298831939697, + "learning_rate": 0.00015262642656361177, + "loss": 1.8611, + "step": 18782 + }, + { + "epoch": 2.191459572978649, + "grad_norm": 1.1671855449676514, + "learning_rate": 0.00015261130702814678, + "loss": 1.9313, + "step": 18783 + }, + { + "epoch": 2.1915762454789407, + "grad_norm": 1.0578886270523071, + "learning_rate": 0.000152596187481161, + "loss": 2.0315, + "step": 18784 + }, + { + "epoch": 2.1916929179792324, + "grad_norm": 1.2363169193267822, + "learning_rate": 0.0001525810679228112, + "loss": 2.0172, + "step": 18785 + }, + { + "epoch": 2.191809590479524, + "grad_norm": 1.147084355354309, + "learning_rate": 0.00015256594835325404, + "loss": 2.04, + "step": 18786 + }, + { + "epoch": 2.1919262629798157, + "grad_norm": 1.0472302436828613, + "learning_rate": 0.00015255082877264632, + "loss": 1.8436, + "step": 18787 + }, + { + "epoch": 2.1920429354801074, + "grad_norm": 1.2392246723175049, + "learning_rate": 0.00015253570918114475, + "loss": 1.9885, + "step": 18788 + }, + { + "epoch": 2.192159607980399, + "grad_norm": 1.1447455883026123, + "learning_rate": 0.00015252058957890608, + "loss": 2.1945, + "step": 18789 + }, + { + "epoch": 2.192276280480691, + "grad_norm": 1.219814419746399, + "learning_rate": 0.0001525054699660871, + "loss": 2.2155, + "step": 18790 + }, + { + "epoch": 2.1923929529809825, + "grad_norm": 1.2856019735336304, + "learning_rate": 0.00015249035034284452, + "loss": 1.9432, + "step": 18791 + }, + { + "epoch": 2.192509625481274, + "grad_norm": 1.1268495321273804, + "learning_rate": 0.00015247523070933505, + "loss": 1.9815, + "step": 18792 + }, + { + "epoch": 2.192626297981566, + "grad_norm": 1.3166439533233643, + "learning_rate": 0.00015246011106571548, + "loss": 1.8553, + "step": 18793 + }, + { + "epoch": 2.1927429704818575, + "grad_norm": 1.3361114263534546, + "learning_rate": 0.0001524449914121426, + "loss": 1.9456, + "step": 18794 + }, + { + "epoch": 2.192859642982149, + "grad_norm": 1.2399276494979858, + "learning_rate": 0.000152429871748773, + "loss": 2.0117, + "step": 18795 + }, + { + "epoch": 2.192976315482441, + "grad_norm": 1.086223840713501, + "learning_rate": 0.00015241475207576359, + "loss": 1.9005, + "step": 18796 + }, + { + "epoch": 2.1930929879827326, + "grad_norm": 1.214608073234558, + "learning_rate": 0.000152399632393271, + "loss": 2.0933, + "step": 18797 + }, + { + "epoch": 2.1932096604830242, + "grad_norm": 1.2634291648864746, + "learning_rate": 0.00015238451270145205, + "loss": 2.1612, + "step": 18798 + }, + { + "epoch": 2.193326332983316, + "grad_norm": 1.132673740386963, + "learning_rate": 0.00015236939300046343, + "loss": 1.9725, + "step": 18799 + }, + { + "epoch": 2.1934430054836076, + "grad_norm": 1.0819064378738403, + "learning_rate": 0.00015235427329046187, + "loss": 1.9631, + "step": 18800 + }, + { + "epoch": 2.1935596779838993, + "grad_norm": 1.0378403663635254, + "learning_rate": 0.0001523391535716042, + "loss": 2.0081, + "step": 18801 + }, + { + "epoch": 2.193676350484191, + "grad_norm": 1.1780717372894287, + "learning_rate": 0.00015232403384404715, + "loss": 1.9249, + "step": 18802 + }, + { + "epoch": 2.1937930229844826, + "grad_norm": 0.9542260766029358, + "learning_rate": 0.00015230891410794736, + "loss": 1.6937, + "step": 18803 + }, + { + "epoch": 2.1939096954847743, + "grad_norm": 1.215586543083191, + "learning_rate": 0.0001522937943634617, + "loss": 2.0805, + "step": 18804 + }, + { + "epoch": 2.194026367985066, + "grad_norm": 1.1406948566436768, + "learning_rate": 0.0001522786746107468, + "loss": 1.8774, + "step": 18805 + }, + { + "epoch": 2.1941430404853577, + "grad_norm": 1.2034393548965454, + "learning_rate": 0.00015226355484995944, + "loss": 2.0414, + "step": 18806 + }, + { + "epoch": 2.1942597129856494, + "grad_norm": 1.3349719047546387, + "learning_rate": 0.00015224843508125646, + "loss": 2.087, + "step": 18807 + }, + { + "epoch": 2.194376385485941, + "grad_norm": 1.2041258811950684, + "learning_rate": 0.0001522333153047945, + "loss": 2.0415, + "step": 18808 + }, + { + "epoch": 2.1944930579862327, + "grad_norm": 1.212315320968628, + "learning_rate": 0.00015221819552073033, + "loss": 1.8802, + "step": 18809 + }, + { + "epoch": 2.1946097304865244, + "grad_norm": 1.2177180051803589, + "learning_rate": 0.00015220307572922075, + "loss": 2.1059, + "step": 18810 + }, + { + "epoch": 2.194726402986816, + "grad_norm": 1.163718819618225, + "learning_rate": 0.00015218795593042244, + "loss": 2.0774, + "step": 18811 + }, + { + "epoch": 2.1948430754871078, + "grad_norm": 1.3453069925308228, + "learning_rate": 0.00015217283612449213, + "loss": 2.1839, + "step": 18812 + }, + { + "epoch": 2.1949597479873995, + "grad_norm": 1.1387068033218384, + "learning_rate": 0.00015215771631158664, + "loss": 1.8684, + "step": 18813 + }, + { + "epoch": 2.195076420487691, + "grad_norm": 1.0786871910095215, + "learning_rate": 0.00015214259649186263, + "loss": 1.8087, + "step": 18814 + }, + { + "epoch": 2.195193092987983, + "grad_norm": 1.2185033559799194, + "learning_rate": 0.00015212747666547695, + "loss": 1.9773, + "step": 18815 + }, + { + "epoch": 2.1953097654882745, + "grad_norm": 1.3626261949539185, + "learning_rate": 0.00015211235683258625, + "loss": 1.9458, + "step": 18816 + }, + { + "epoch": 2.195426437988566, + "grad_norm": 1.2020395994186401, + "learning_rate": 0.00015209723699334733, + "loss": 2.0619, + "step": 18817 + }, + { + "epoch": 2.195543110488858, + "grad_norm": 1.2492339611053467, + "learning_rate": 0.00015208211714791688, + "loss": 2.1704, + "step": 18818 + }, + { + "epoch": 2.1956597829891495, + "grad_norm": 1.2517085075378418, + "learning_rate": 0.00015206699729645168, + "loss": 2.0507, + "step": 18819 + }, + { + "epoch": 2.1957764554894412, + "grad_norm": 1.1822407245635986, + "learning_rate": 0.0001520518774391085, + "loss": 1.9427, + "step": 18820 + }, + { + "epoch": 2.195893127989733, + "grad_norm": 1.0631053447723389, + "learning_rate": 0.00015203675757604406, + "loss": 2.0034, + "step": 18821 + }, + { + "epoch": 2.1960098004900246, + "grad_norm": 1.0974401235580444, + "learning_rate": 0.00015202163770741515, + "loss": 1.9495, + "step": 18822 + }, + { + "epoch": 2.1961264729903163, + "grad_norm": 1.128098487854004, + "learning_rate": 0.00015200651783337844, + "loss": 1.8601, + "step": 18823 + }, + { + "epoch": 2.196243145490608, + "grad_norm": 1.2586157321929932, + "learning_rate": 0.00015199139795409072, + "loss": 2.0107, + "step": 18824 + }, + { + "epoch": 2.1963598179908996, + "grad_norm": 0.9866921901702881, + "learning_rate": 0.00015197627806970873, + "loss": 1.7265, + "step": 18825 + }, + { + "epoch": 2.1964764904911913, + "grad_norm": 1.162428379058838, + "learning_rate": 0.00015196115818038924, + "loss": 2.0156, + "step": 18826 + }, + { + "epoch": 2.196593162991483, + "grad_norm": 1.056767463684082, + "learning_rate": 0.00015194603828628896, + "loss": 1.9171, + "step": 18827 + }, + { + "epoch": 2.1967098354917747, + "grad_norm": 1.0818290710449219, + "learning_rate": 0.00015193091838756464, + "loss": 1.7703, + "step": 18828 + }, + { + "epoch": 2.1968265079920664, + "grad_norm": 1.2519773244857788, + "learning_rate": 0.00015191579848437302, + "loss": 1.9127, + "step": 18829 + }, + { + "epoch": 2.196943180492358, + "grad_norm": 1.2122976779937744, + "learning_rate": 0.00015190067857687092, + "loss": 1.9385, + "step": 18830 + }, + { + "epoch": 2.1970598529926497, + "grad_norm": 1.0647640228271484, + "learning_rate": 0.0001518855586652149, + "loss": 1.9276, + "step": 18831 + }, + { + "epoch": 2.1971765254929414, + "grad_norm": 1.128707766532898, + "learning_rate": 0.00015187043874956193, + "loss": 2.0533, + "step": 18832 + }, + { + "epoch": 2.197293197993233, + "grad_norm": 1.1770635843276978, + "learning_rate": 0.00015185531883006863, + "loss": 1.991, + "step": 18833 + }, + { + "epoch": 2.1974098704935248, + "grad_norm": 1.1330294609069824, + "learning_rate": 0.0001518401989068918, + "loss": 1.8859, + "step": 18834 + }, + { + "epoch": 2.1975265429938164, + "grad_norm": 1.338620662689209, + "learning_rate": 0.0001518250789801882, + "loss": 2.0485, + "step": 18835 + }, + { + "epoch": 2.197643215494108, + "grad_norm": 1.286956548690796, + "learning_rate": 0.0001518099590501145, + "loss": 2.0143, + "step": 18836 + }, + { + "epoch": 2.1977598879944, + "grad_norm": 1.080808401107788, + "learning_rate": 0.00015179483911682747, + "loss": 1.9619, + "step": 18837 + }, + { + "epoch": 2.1978765604946915, + "grad_norm": 1.2576650381088257, + "learning_rate": 0.0001517797191804839, + "loss": 2.0993, + "step": 18838 + }, + { + "epoch": 2.197993232994983, + "grad_norm": 1.1761645078659058, + "learning_rate": 0.0001517645992412405, + "loss": 2.1234, + "step": 18839 + }, + { + "epoch": 2.198109905495275, + "grad_norm": 1.1766425371170044, + "learning_rate": 0.00015174947929925403, + "loss": 2.0482, + "step": 18840 + }, + { + "epoch": 2.1982265779955665, + "grad_norm": 1.1389187574386597, + "learning_rate": 0.00015173435935468123, + "loss": 2.0076, + "step": 18841 + }, + { + "epoch": 2.198343250495858, + "grad_norm": 1.1855854988098145, + "learning_rate": 0.00015171923940767888, + "loss": 1.9958, + "step": 18842 + }, + { + "epoch": 2.19845992299615, + "grad_norm": 1.5071165561676025, + "learning_rate": 0.00015170411945840366, + "loss": 2.0479, + "step": 18843 + }, + { + "epoch": 2.1985765954964416, + "grad_norm": 1.1211365461349487, + "learning_rate": 0.00015168899950701237, + "loss": 1.8873, + "step": 18844 + }, + { + "epoch": 2.1986932679967333, + "grad_norm": 1.3360326290130615, + "learning_rate": 0.00015167387955366177, + "loss": 2.1331, + "step": 18845 + }, + { + "epoch": 2.198809940497025, + "grad_norm": 1.3320425748825073, + "learning_rate": 0.00015165875959850851, + "loss": 1.9593, + "step": 18846 + }, + { + "epoch": 2.1989266129973166, + "grad_norm": 1.1380765438079834, + "learning_rate": 0.00015164363964170947, + "loss": 2.1324, + "step": 18847 + }, + { + "epoch": 2.1990432854976083, + "grad_norm": 1.1932294368743896, + "learning_rate": 0.0001516285196834213, + "loss": 1.9448, + "step": 18848 + }, + { + "epoch": 2.1991599579979, + "grad_norm": 1.2525233030319214, + "learning_rate": 0.00015161339972380078, + "loss": 2.1476, + "step": 18849 + }, + { + "epoch": 2.1992766304981917, + "grad_norm": 1.1309784650802612, + "learning_rate": 0.0001515982797630047, + "loss": 1.9242, + "step": 18850 + }, + { + "epoch": 2.1993933029984833, + "grad_norm": 1.0546756982803345, + "learning_rate": 0.00015158315980118973, + "loss": 1.8996, + "step": 18851 + }, + { + "epoch": 2.199509975498775, + "grad_norm": 1.1458163261413574, + "learning_rate": 0.00015156803983851263, + "loss": 2.0, + "step": 18852 + }, + { + "epoch": 2.1996266479990667, + "grad_norm": 1.287853717803955, + "learning_rate": 0.00015155291987513023, + "loss": 2.0242, + "step": 18853 + }, + { + "epoch": 2.1997433204993584, + "grad_norm": 1.2954692840576172, + "learning_rate": 0.00015153779991119916, + "loss": 2.0347, + "step": 18854 + }, + { + "epoch": 2.19985999299965, + "grad_norm": 1.1457369327545166, + "learning_rate": 0.00015152267994687625, + "loss": 1.9647, + "step": 18855 + }, + { + "epoch": 2.1999766654999418, + "grad_norm": 1.254012107849121, + "learning_rate": 0.00015150755998231822, + "loss": 2.0561, + "step": 18856 + }, + { + "epoch": 2.2000933380002334, + "grad_norm": 1.2237354516983032, + "learning_rate": 0.00015149244001768175, + "loss": 1.965, + "step": 18857 + }, + { + "epoch": 2.200210010500525, + "grad_norm": 1.1712318658828735, + "learning_rate": 0.00015147732005312375, + "loss": 1.806, + "step": 18858 + }, + { + "epoch": 2.200326683000817, + "grad_norm": 1.2968772649765015, + "learning_rate": 0.0001514622000888008, + "loss": 1.9417, + "step": 18859 + }, + { + "epoch": 2.2004433555011085, + "grad_norm": 1.2445656061172485, + "learning_rate": 0.00015144708012486977, + "loss": 1.9451, + "step": 18860 + }, + { + "epoch": 2.2005600280014, + "grad_norm": 1.0963290929794312, + "learning_rate": 0.00015143196016148737, + "loss": 1.9638, + "step": 18861 + }, + { + "epoch": 2.200676700501692, + "grad_norm": 1.1924241781234741, + "learning_rate": 0.00015141684019881027, + "loss": 1.8448, + "step": 18862 + }, + { + "epoch": 2.2007933730019835, + "grad_norm": 0.9406132698059082, + "learning_rate": 0.00015140172023699527, + "loss": 1.9411, + "step": 18863 + }, + { + "epoch": 2.200910045502275, + "grad_norm": 1.1392288208007812, + "learning_rate": 0.00015138660027619915, + "loss": 1.9835, + "step": 18864 + }, + { + "epoch": 2.201026718002567, + "grad_norm": 1.2032577991485596, + "learning_rate": 0.0001513714803165787, + "loss": 2.0662, + "step": 18865 + }, + { + "epoch": 2.2011433905028586, + "grad_norm": 1.0551855564117432, + "learning_rate": 0.0001513563603582905, + "loss": 2.1507, + "step": 18866 + }, + { + "epoch": 2.2012600630031502, + "grad_norm": 1.1926249265670776, + "learning_rate": 0.00015134124040149145, + "loss": 2.0594, + "step": 18867 + }, + { + "epoch": 2.201376735503442, + "grad_norm": 1.033733606338501, + "learning_rate": 0.00015132612044633823, + "loss": 1.9374, + "step": 18868 + }, + { + "epoch": 2.2014934080037336, + "grad_norm": 1.4428430795669556, + "learning_rate": 0.0001513110004929876, + "loss": 2.0491, + "step": 18869 + }, + { + "epoch": 2.2016100805040253, + "grad_norm": 1.0735278129577637, + "learning_rate": 0.0001512958805415963, + "loss": 1.886, + "step": 18870 + }, + { + "epoch": 2.201726753004317, + "grad_norm": 1.126867413520813, + "learning_rate": 0.00015128076059232114, + "loss": 1.9852, + "step": 18871 + }, + { + "epoch": 2.2018434255046087, + "grad_norm": 1.3356497287750244, + "learning_rate": 0.00015126564064531873, + "loss": 2.1754, + "step": 18872 + }, + { + "epoch": 2.2019600980049003, + "grad_norm": 1.1033954620361328, + "learning_rate": 0.00015125052070074597, + "loss": 2.0625, + "step": 18873 + }, + { + "epoch": 2.202076770505192, + "grad_norm": 1.0825071334838867, + "learning_rate": 0.00015123540075875945, + "loss": 2.0066, + "step": 18874 + }, + { + "epoch": 2.2021934430054837, + "grad_norm": 0.9777786135673523, + "learning_rate": 0.00015122028081951608, + "loss": 1.7059, + "step": 18875 + }, + { + "epoch": 2.2023101155057754, + "grad_norm": 1.2028896808624268, + "learning_rate": 0.00015120516088317247, + "loss": 1.9155, + "step": 18876 + }, + { + "epoch": 2.202426788006067, + "grad_norm": 1.361444354057312, + "learning_rate": 0.00015119004094988546, + "loss": 2.1107, + "step": 18877 + }, + { + "epoch": 2.2025434605063587, + "grad_norm": 1.0456032752990723, + "learning_rate": 0.00015117492101981178, + "loss": 1.9046, + "step": 18878 + }, + { + "epoch": 2.2026601330066504, + "grad_norm": 0.9560531973838806, + "learning_rate": 0.00015115980109310813, + "loss": 1.8455, + "step": 18879 + }, + { + "epoch": 2.202776805506942, + "grad_norm": 1.1639516353607178, + "learning_rate": 0.00015114468116993133, + "loss": 2.0566, + "step": 18880 + }, + { + "epoch": 2.202893478007234, + "grad_norm": 1.2055304050445557, + "learning_rate": 0.000151129561250438, + "loss": 1.9698, + "step": 18881 + }, + { + "epoch": 2.2030101505075255, + "grad_norm": 1.074829339981079, + "learning_rate": 0.00015111444133478503, + "loss": 1.9945, + "step": 18882 + }, + { + "epoch": 2.203126823007817, + "grad_norm": 1.2273311614990234, + "learning_rate": 0.00015109932142312908, + "loss": 2.1755, + "step": 18883 + }, + { + "epoch": 2.203243495508109, + "grad_norm": 1.5530195236206055, + "learning_rate": 0.00015108420151562697, + "loss": 2.0177, + "step": 18884 + }, + { + "epoch": 2.2033601680084005, + "grad_norm": 1.5173368453979492, + "learning_rate": 0.00015106908161243533, + "loss": 2.1005, + "step": 18885 + }, + { + "epoch": 2.203476840508692, + "grad_norm": 1.3756611347198486, + "learning_rate": 0.00015105396171371106, + "loss": 2.0116, + "step": 18886 + }, + { + "epoch": 2.203593513008984, + "grad_norm": 1.0279383659362793, + "learning_rate": 0.00015103884181961075, + "loss": 2.0048, + "step": 18887 + }, + { + "epoch": 2.2037101855092756, + "grad_norm": 1.3485883474349976, + "learning_rate": 0.00015102372193029123, + "loss": 2.0911, + "step": 18888 + }, + { + "epoch": 2.2038268580095672, + "grad_norm": 1.1758840084075928, + "learning_rate": 0.00015100860204590925, + "loss": 1.9019, + "step": 18889 + }, + { + "epoch": 2.203943530509859, + "grad_norm": 1.1507689952850342, + "learning_rate": 0.0001509934821666215, + "loss": 1.8981, + "step": 18890 + }, + { + "epoch": 2.2040602030101506, + "grad_norm": 1.2793782949447632, + "learning_rate": 0.00015097836229258484, + "loss": 2.0312, + "step": 18891 + }, + { + "epoch": 2.2041768755104423, + "grad_norm": 1.236634373664856, + "learning_rate": 0.00015096324242395588, + "loss": 1.8774, + "step": 18892 + }, + { + "epoch": 2.204293548010734, + "grad_norm": 1.2631231546401978, + "learning_rate": 0.00015094812256089148, + "loss": 2.2871, + "step": 18893 + }, + { + "epoch": 2.2044102205110256, + "grad_norm": 1.213610053062439, + "learning_rate": 0.00015093300270354828, + "loss": 1.7988, + "step": 18894 + }, + { + "epoch": 2.2045268930113173, + "grad_norm": 1.0703116655349731, + "learning_rate": 0.0001509178828520831, + "loss": 2.004, + "step": 18895 + }, + { + "epoch": 2.204643565511609, + "grad_norm": 1.2408511638641357, + "learning_rate": 0.00015090276300665266, + "loss": 2.1497, + "step": 18896 + }, + { + "epoch": 2.2047602380119007, + "grad_norm": 1.0705705881118774, + "learning_rate": 0.00015088764316741377, + "loss": 1.9493, + "step": 18897 + }, + { + "epoch": 2.2048769105121924, + "grad_norm": 1.4007854461669922, + "learning_rate": 0.00015087252333452304, + "loss": 1.796, + "step": 18898 + }, + { + "epoch": 2.204993583012484, + "grad_norm": 1.1489298343658447, + "learning_rate": 0.00015085740350813737, + "loss": 2.1568, + "step": 18899 + }, + { + "epoch": 2.2051102555127757, + "grad_norm": 1.2528845071792603, + "learning_rate": 0.00015084228368841333, + "loss": 1.9964, + "step": 18900 + }, + { + "epoch": 2.2052269280130674, + "grad_norm": 1.309799313545227, + "learning_rate": 0.00015082716387550786, + "loss": 1.8342, + "step": 18901 + }, + { + "epoch": 2.205343600513359, + "grad_norm": 1.2212190628051758, + "learning_rate": 0.00015081204406957756, + "loss": 1.9586, + "step": 18902 + }, + { + "epoch": 2.2054602730136508, + "grad_norm": 1.2655364274978638, + "learning_rate": 0.0001507969242707792, + "loss": 2.1037, + "step": 18903 + }, + { + "epoch": 2.2055769455139425, + "grad_norm": 1.4298280477523804, + "learning_rate": 0.00015078180447926964, + "loss": 2.0303, + "step": 18904 + }, + { + "epoch": 2.205693618014234, + "grad_norm": 1.3736696243286133, + "learning_rate": 0.00015076668469520547, + "loss": 2.056, + "step": 18905 + }, + { + "epoch": 2.205810290514526, + "grad_norm": 1.2059481143951416, + "learning_rate": 0.00015075156491874353, + "loss": 2.1656, + "step": 18906 + }, + { + "epoch": 2.2059269630148175, + "grad_norm": 1.0953598022460938, + "learning_rate": 0.00015073644515004047, + "loss": 1.9657, + "step": 18907 + }, + { + "epoch": 2.206043635515109, + "grad_norm": 1.0348974466323853, + "learning_rate": 0.00015072132538925323, + "loss": 1.7004, + "step": 18908 + }, + { + "epoch": 2.206160308015401, + "grad_norm": 1.1671085357666016, + "learning_rate": 0.00015070620563653828, + "loss": 1.8303, + "step": 18909 + }, + { + "epoch": 2.2062769805156925, + "grad_norm": 1.0807304382324219, + "learning_rate": 0.0001506910858920526, + "loss": 1.8081, + "step": 18910 + }, + { + "epoch": 2.2063936530159842, + "grad_norm": 1.1440318822860718, + "learning_rate": 0.00015067596615595287, + "loss": 1.8446, + "step": 18911 + }, + { + "epoch": 2.206510325516276, + "grad_norm": 1.2439481019973755, + "learning_rate": 0.0001506608464283958, + "loss": 2.0522, + "step": 18912 + }, + { + "epoch": 2.2066269980165676, + "grad_norm": 1.4868329763412476, + "learning_rate": 0.0001506457267095381, + "loss": 2.3255, + "step": 18913 + }, + { + "epoch": 2.2067436705168593, + "grad_norm": 1.123070478439331, + "learning_rate": 0.0001506306069995366, + "loss": 1.8513, + "step": 18914 + }, + { + "epoch": 2.206860343017151, + "grad_norm": 1.2363340854644775, + "learning_rate": 0.00015061548729854794, + "loss": 1.7305, + "step": 18915 + }, + { + "epoch": 2.2069770155174426, + "grad_norm": 1.118567943572998, + "learning_rate": 0.00015060036760672894, + "loss": 2.0039, + "step": 18916 + }, + { + "epoch": 2.2070936880177343, + "grad_norm": 1.2106106281280518, + "learning_rate": 0.0001505852479242364, + "loss": 2.0931, + "step": 18917 + }, + { + "epoch": 2.207210360518026, + "grad_norm": 1.2735486030578613, + "learning_rate": 0.00015057012825122693, + "loss": 2.0135, + "step": 18918 + }, + { + "epoch": 2.2073270330183177, + "grad_norm": 1.0436677932739258, + "learning_rate": 0.00015055500858785742, + "loss": 1.945, + "step": 18919 + }, + { + "epoch": 2.2074437055186094, + "grad_norm": 1.9218305349349976, + "learning_rate": 0.00015053988893428446, + "loss": 2.0571, + "step": 18920 + }, + { + "epoch": 2.207560378018901, + "grad_norm": 1.3819763660430908, + "learning_rate": 0.0001505247692906649, + "loss": 1.8707, + "step": 18921 + }, + { + "epoch": 2.2076770505191927, + "grad_norm": 1.1025551557540894, + "learning_rate": 0.00015050964965715547, + "loss": 1.9811, + "step": 18922 + }, + { + "epoch": 2.2077937230194844, + "grad_norm": 1.2254239320755005, + "learning_rate": 0.00015049453003391293, + "loss": 2.0692, + "step": 18923 + }, + { + "epoch": 2.207910395519776, + "grad_norm": 0.999813437461853, + "learning_rate": 0.00015047941042109386, + "loss": 1.7754, + "step": 18924 + }, + { + "epoch": 2.2080270680200678, + "grad_norm": 1.1060969829559326, + "learning_rate": 0.00015046429081885527, + "loss": 1.9421, + "step": 18925 + }, + { + "epoch": 2.2081437405203594, + "grad_norm": 1.1755503416061401, + "learning_rate": 0.0001504491712273537, + "loss": 1.9457, + "step": 18926 + }, + { + "epoch": 2.208260413020651, + "grad_norm": 1.3767495155334473, + "learning_rate": 0.00015043405164674598, + "loss": 2.0812, + "step": 18927 + }, + { + "epoch": 2.208377085520943, + "grad_norm": 1.2024857997894287, + "learning_rate": 0.0001504189320771888, + "loss": 2.1328, + "step": 18928 + }, + { + "epoch": 2.2084937580212345, + "grad_norm": 1.425891637802124, + "learning_rate": 0.00015040381251883893, + "loss": 2.0759, + "step": 18929 + }, + { + "epoch": 2.208610430521526, + "grad_norm": 1.120510458946228, + "learning_rate": 0.0001503886929718532, + "loss": 1.8908, + "step": 18930 + }, + { + "epoch": 2.208727103021818, + "grad_norm": 1.1392689943313599, + "learning_rate": 0.00015037357343638817, + "loss": 2.0382, + "step": 18931 + }, + { + "epoch": 2.2088437755221095, + "grad_norm": 1.1292237043380737, + "learning_rate": 0.00015035845391260077, + "loss": 2.0171, + "step": 18932 + }, + { + "epoch": 2.208960448022401, + "grad_norm": 1.1864628791809082, + "learning_rate": 0.0001503433344006476, + "loss": 2.0315, + "step": 18933 + }, + { + "epoch": 2.209077120522693, + "grad_norm": 1.2944457530975342, + "learning_rate": 0.00015032821490068552, + "loss": 2.0214, + "step": 18934 + }, + { + "epoch": 2.2091937930229846, + "grad_norm": 1.3484492301940918, + "learning_rate": 0.0001503130954128711, + "loss": 2.1697, + "step": 18935 + }, + { + "epoch": 2.2093104655232763, + "grad_norm": 1.3059229850769043, + "learning_rate": 0.00015029797593736134, + "loss": 2.036, + "step": 18936 + }, + { + "epoch": 2.209427138023568, + "grad_norm": 1.0850026607513428, + "learning_rate": 0.0001502828564743127, + "loss": 1.9061, + "step": 18937 + }, + { + "epoch": 2.2095438105238596, + "grad_norm": 1.2088104486465454, + "learning_rate": 0.00015026773702388215, + "loss": 1.9313, + "step": 18938 + }, + { + "epoch": 2.2096604830241513, + "grad_norm": 1.2933907508850098, + "learning_rate": 0.00015025261758622632, + "loss": 2.0683, + "step": 18939 + }, + { + "epoch": 2.209777155524443, + "grad_norm": 1.20145583152771, + "learning_rate": 0.000150237498161502, + "loss": 1.9794, + "step": 18940 + }, + { + "epoch": 2.2098938280247347, + "grad_norm": 1.26043701171875, + "learning_rate": 0.00015022237874986582, + "loss": 2.0867, + "step": 18941 + }, + { + "epoch": 2.2100105005250263, + "grad_norm": 1.1351161003112793, + "learning_rate": 0.00015020725935147462, + "loss": 2.0741, + "step": 18942 + }, + { + "epoch": 2.210127173025318, + "grad_norm": 1.3471614122390747, + "learning_rate": 0.00015019213996648518, + "loss": 2.068, + "step": 18943 + }, + { + "epoch": 2.2102438455256097, + "grad_norm": 1.1515713930130005, + "learning_rate": 0.0001501770205950541, + "loss": 1.8588, + "step": 18944 + }, + { + "epoch": 2.2103605180259014, + "grad_norm": 1.2816987037658691, + "learning_rate": 0.00015016190123733834, + "loss": 2.1386, + "step": 18945 + }, + { + "epoch": 2.210477190526193, + "grad_norm": 1.235182523727417, + "learning_rate": 0.0001501467818934944, + "loss": 2.0666, + "step": 18946 + }, + { + "epoch": 2.2105938630264848, + "grad_norm": 1.11117422580719, + "learning_rate": 0.00015013166256367918, + "loss": 1.7286, + "step": 18947 + }, + { + "epoch": 2.2107105355267764, + "grad_norm": 1.1611433029174805, + "learning_rate": 0.00015011654324804933, + "loss": 1.8861, + "step": 18948 + }, + { + "epoch": 2.210827208027068, + "grad_norm": 1.2788782119750977, + "learning_rate": 0.00015010142394676168, + "loss": 2.0226, + "step": 18949 + }, + { + "epoch": 2.21094388052736, + "grad_norm": 1.0812076330184937, + "learning_rate": 0.00015008630465997283, + "loss": 1.9877, + "step": 18950 + }, + { + "epoch": 2.2110605530276515, + "grad_norm": 1.100495457649231, + "learning_rate": 0.00015007118538783973, + "loss": 1.8712, + "step": 18951 + }, + { + "epoch": 2.211177225527943, + "grad_norm": 1.110596776008606, + "learning_rate": 0.00015005606613051887, + "loss": 1.8679, + "step": 18952 + }, + { + "epoch": 2.211293898028235, + "grad_norm": 1.1095954179763794, + "learning_rate": 0.00015004094688816723, + "loss": 1.8441, + "step": 18953 + }, + { + "epoch": 2.2114105705285265, + "grad_norm": 1.4363280534744263, + "learning_rate": 0.00015002582766094136, + "loss": 1.9965, + "step": 18954 + }, + { + "epoch": 2.211527243028818, + "grad_norm": 1.2513948678970337, + "learning_rate": 0.00015001070844899812, + "loss": 1.8308, + "step": 18955 + }, + { + "epoch": 2.21164391552911, + "grad_norm": 1.265102505683899, + "learning_rate": 0.00014999558925249423, + "loss": 1.87, + "step": 18956 + }, + { + "epoch": 2.2117605880294016, + "grad_norm": 1.2221283912658691, + "learning_rate": 0.00014998047007158634, + "loss": 1.8233, + "step": 18957 + }, + { + "epoch": 2.2118772605296932, + "grad_norm": 1.2815719842910767, + "learning_rate": 0.00014996535090643133, + "loss": 1.9023, + "step": 18958 + }, + { + "epoch": 2.211993933029985, + "grad_norm": 1.2583390474319458, + "learning_rate": 0.00014995023175718578, + "loss": 2.0855, + "step": 18959 + }, + { + "epoch": 2.2121106055302766, + "grad_norm": 1.1625628471374512, + "learning_rate": 0.00014993511262400658, + "loss": 1.8459, + "step": 18960 + }, + { + "epoch": 2.2122272780305683, + "grad_norm": 1.4168610572814941, + "learning_rate": 0.00014991999350705032, + "loss": 2.0836, + "step": 18961 + }, + { + "epoch": 2.21234395053086, + "grad_norm": 1.5217808485031128, + "learning_rate": 0.00014990487440647392, + "loss": 2.0362, + "step": 18962 + }, + { + "epoch": 2.2124606230311517, + "grad_norm": 1.1818773746490479, + "learning_rate": 0.00014988975532243394, + "loss": 1.89, + "step": 18963 + }, + { + "epoch": 2.2125772955314433, + "grad_norm": 1.058593511581421, + "learning_rate": 0.00014987463625508725, + "loss": 1.7726, + "step": 18964 + }, + { + "epoch": 2.212693968031735, + "grad_norm": 1.1504346132278442, + "learning_rate": 0.0001498595172045905, + "loss": 2.0171, + "step": 18965 + }, + { + "epoch": 2.2128106405320267, + "grad_norm": 1.1929699182510376, + "learning_rate": 0.00014984439817110053, + "loss": 1.8844, + "step": 18966 + }, + { + "epoch": 2.2129273130323184, + "grad_norm": 1.1505459547042847, + "learning_rate": 0.0001498292791547739, + "loss": 1.9785, + "step": 18967 + }, + { + "epoch": 2.21304398553261, + "grad_norm": 1.2414239645004272, + "learning_rate": 0.00014981416015576749, + "loss": 1.9783, + "step": 18968 + }, + { + "epoch": 2.2131606580329017, + "grad_norm": 1.2343621253967285, + "learning_rate": 0.00014979904117423803, + "loss": 2.0995, + "step": 18969 + }, + { + "epoch": 2.2132773305331934, + "grad_norm": 1.1510313749313354, + "learning_rate": 0.0001497839222103422, + "loss": 1.9677, + "step": 18970 + }, + { + "epoch": 2.213394003033485, + "grad_norm": 1.4073983430862427, + "learning_rate": 0.0001497688032642368, + "loss": 1.9441, + "step": 18971 + }, + { + "epoch": 2.213510675533777, + "grad_norm": 1.0924961566925049, + "learning_rate": 0.00014975368433607853, + "loss": 1.9761, + "step": 18972 + }, + { + "epoch": 2.2136273480340685, + "grad_norm": 1.172791600227356, + "learning_rate": 0.0001497385654260241, + "loss": 1.979, + "step": 18973 + }, + { + "epoch": 2.21374402053436, + "grad_norm": 1.1158647537231445, + "learning_rate": 0.0001497234465342303, + "loss": 1.8338, + "step": 18974 + }, + { + "epoch": 2.213860693034652, + "grad_norm": 1.2555997371673584, + "learning_rate": 0.00014970832766085385, + "loss": 2.0917, + "step": 18975 + }, + { + "epoch": 2.2139773655349435, + "grad_norm": 1.136080265045166, + "learning_rate": 0.00014969320880605142, + "loss": 1.9496, + "step": 18976 + }, + { + "epoch": 2.214094038035235, + "grad_norm": 1.2166169881820679, + "learning_rate": 0.00014967808996997986, + "loss": 2.0606, + "step": 18977 + }, + { + "epoch": 2.214210710535527, + "grad_norm": 1.4727874994277954, + "learning_rate": 0.0001496629711527958, + "loss": 2.2202, + "step": 18978 + }, + { + "epoch": 2.2143273830358186, + "grad_norm": 1.1152373552322388, + "learning_rate": 0.00014964785235465608, + "loss": 1.8446, + "step": 18979 + }, + { + "epoch": 2.2144440555361102, + "grad_norm": 1.1451441049575806, + "learning_rate": 0.0001496327335757173, + "loss": 1.8401, + "step": 18980 + }, + { + "epoch": 2.214560728036402, + "grad_norm": 1.0692660808563232, + "learning_rate": 0.00014961761481613632, + "loss": 1.9611, + "step": 18981 + }, + { + "epoch": 2.2146774005366936, + "grad_norm": 1.3329288959503174, + "learning_rate": 0.00014960249607606988, + "loss": 2.0388, + "step": 18982 + }, + { + "epoch": 2.2147940730369853, + "grad_norm": 1.2543498277664185, + "learning_rate": 0.00014958737735567456, + "loss": 2.0861, + "step": 18983 + }, + { + "epoch": 2.214910745537277, + "grad_norm": 1.164873719215393, + "learning_rate": 0.00014957225865510729, + "loss": 1.9343, + "step": 18984 + }, + { + "epoch": 2.2150274180375686, + "grad_norm": 1.21110999584198, + "learning_rate": 0.0001495571399745246, + "loss": 2.0226, + "step": 18985 + }, + { + "epoch": 2.2151440905378603, + "grad_norm": 1.26397705078125, + "learning_rate": 0.00014954202131408343, + "loss": 2.0365, + "step": 18986 + }, + { + "epoch": 2.215260763038152, + "grad_norm": 1.1519757509231567, + "learning_rate": 0.00014952690267394035, + "loss": 2.0251, + "step": 18987 + }, + { + "epoch": 2.2153774355384437, + "grad_norm": 1.0437252521514893, + "learning_rate": 0.00014951178405425223, + "loss": 1.9417, + "step": 18988 + }, + { + "epoch": 2.2154941080387354, + "grad_norm": 1.0553110837936401, + "learning_rate": 0.00014949666545517568, + "loss": 1.9768, + "step": 18989 + }, + { + "epoch": 2.215610780539027, + "grad_norm": 1.0632915496826172, + "learning_rate": 0.0001494815468768675, + "loss": 1.8672, + "step": 18990 + }, + { + "epoch": 2.2157274530393187, + "grad_norm": 1.2840548753738403, + "learning_rate": 0.00014946642831948444, + "loss": 2.0289, + "step": 18991 + }, + { + "epoch": 2.2158441255396104, + "grad_norm": 1.098645567893982, + "learning_rate": 0.0001494513097831832, + "loss": 1.8954, + "step": 18992 + }, + { + "epoch": 2.215960798039902, + "grad_norm": 1.0777727365493774, + "learning_rate": 0.00014943619126812043, + "loss": 1.9563, + "step": 18993 + }, + { + "epoch": 2.2160774705401938, + "grad_norm": 1.1728765964508057, + "learning_rate": 0.00014942107277445297, + "loss": 1.8493, + "step": 18994 + }, + { + "epoch": 2.2161941430404855, + "grad_norm": 1.066390037536621, + "learning_rate": 0.0001494059543023376, + "loss": 1.8567, + "step": 18995 + }, + { + "epoch": 2.216310815540777, + "grad_norm": 1.2804011106491089, + "learning_rate": 0.0001493908358519309, + "loss": 2.1659, + "step": 18996 + }, + { + "epoch": 2.216427488041069, + "grad_norm": 1.077620267868042, + "learning_rate": 0.00014937571742338972, + "loss": 1.9445, + "step": 18997 + }, + { + "epoch": 2.2165441605413605, + "grad_norm": 0.9872652888298035, + "learning_rate": 0.00014936059901687077, + "loss": 1.7522, + "step": 18998 + }, + { + "epoch": 2.216660833041652, + "grad_norm": 1.2733912467956543, + "learning_rate": 0.00014934548063253076, + "loss": 1.9387, + "step": 18999 + }, + { + "epoch": 2.216777505541944, + "grad_norm": 1.0745364427566528, + "learning_rate": 0.0001493303622705264, + "loss": 1.9575, + "step": 19000 + }, + { + "epoch": 2.2168941780422355, + "grad_norm": 1.1033244132995605, + "learning_rate": 0.00014931524393101446, + "loss": 1.9383, + "step": 19001 + }, + { + "epoch": 2.2170108505425272, + "grad_norm": 1.1760326623916626, + "learning_rate": 0.00014930012561415162, + "loss": 1.9232, + "step": 19002 + }, + { + "epoch": 2.217127523042819, + "grad_norm": 1.3442940711975098, + "learning_rate": 0.0001492850073200947, + "loss": 2.0578, + "step": 19003 + }, + { + "epoch": 2.2172441955431106, + "grad_norm": 1.0808035135269165, + "learning_rate": 0.0001492698890490003, + "loss": 1.9446, + "step": 19004 + }, + { + "epoch": 2.2173608680434023, + "grad_norm": 1.2834702730178833, + "learning_rate": 0.00014925477080102528, + "loss": 2.0312, + "step": 19005 + }, + { + "epoch": 2.217477540543694, + "grad_norm": 1.3352879285812378, + "learning_rate": 0.00014923965257632627, + "loss": 1.7892, + "step": 19006 + }, + { + "epoch": 2.2175942130439856, + "grad_norm": 1.1009011268615723, + "learning_rate": 0.0001492245343750601, + "loss": 1.9601, + "step": 19007 + }, + { + "epoch": 2.2177108855442773, + "grad_norm": 1.322211503982544, + "learning_rate": 0.00014920941619738342, + "loss": 1.966, + "step": 19008 + }, + { + "epoch": 2.217827558044569, + "grad_norm": 1.4115649461746216, + "learning_rate": 0.0001491942980434529, + "loss": 1.9565, + "step": 19009 + }, + { + "epoch": 2.2179442305448607, + "grad_norm": 1.1475712060928345, + "learning_rate": 0.00014917917991342546, + "loss": 1.9704, + "step": 19010 + }, + { + "epoch": 2.2180609030451524, + "grad_norm": 1.0720134973526, + "learning_rate": 0.00014916406180745764, + "loss": 1.9454, + "step": 19011 + }, + { + "epoch": 2.218177575545444, + "grad_norm": 1.3557629585266113, + "learning_rate": 0.0001491489437257063, + "loss": 2.0113, + "step": 19012 + }, + { + "epoch": 2.2182942480457357, + "grad_norm": 1.2113336324691772, + "learning_rate": 0.00014913382566832806, + "loss": 1.8209, + "step": 19013 + }, + { + "epoch": 2.2184109205460274, + "grad_norm": 1.2411279678344727, + "learning_rate": 0.00014911870763547976, + "loss": 2.0612, + "step": 19014 + }, + { + "epoch": 2.218527593046319, + "grad_norm": 1.1624165773391724, + "learning_rate": 0.00014910358962731798, + "loss": 1.8869, + "step": 19015 + }, + { + "epoch": 2.2186442655466108, + "grad_norm": 1.1004829406738281, + "learning_rate": 0.0001490884716439996, + "loss": 2.0452, + "step": 19016 + }, + { + "epoch": 2.2187609380469024, + "grad_norm": 1.1587579250335693, + "learning_rate": 0.00014907335368568125, + "loss": 1.8899, + "step": 19017 + }, + { + "epoch": 2.218877610547194, + "grad_norm": 1.217417597770691, + "learning_rate": 0.00014905823575251973, + "loss": 1.9291, + "step": 19018 + }, + { + "epoch": 2.218994283047486, + "grad_norm": 1.2214151620864868, + "learning_rate": 0.00014904311784467165, + "loss": 2.0183, + "step": 19019 + }, + { + "epoch": 2.2191109555477775, + "grad_norm": 1.1759183406829834, + "learning_rate": 0.00014902799996229378, + "loss": 1.8442, + "step": 19020 + }, + { + "epoch": 2.219227628048069, + "grad_norm": 1.1170947551727295, + "learning_rate": 0.00014901288210554295, + "loss": 1.8622, + "step": 19021 + }, + { + "epoch": 2.219344300548361, + "grad_norm": 1.140722393989563, + "learning_rate": 0.00014899776427457575, + "loss": 1.8815, + "step": 19022 + }, + { + "epoch": 2.2194609730486525, + "grad_norm": 1.2283415794372559, + "learning_rate": 0.00014898264646954904, + "loss": 2.0861, + "step": 19023 + }, + { + "epoch": 2.219577645548944, + "grad_norm": 1.2106462717056274, + "learning_rate": 0.0001489675286906194, + "loss": 1.9132, + "step": 19024 + }, + { + "epoch": 2.219694318049236, + "grad_norm": 1.233809471130371, + "learning_rate": 0.00014895241093794366, + "loss": 1.9072, + "step": 19025 + }, + { + "epoch": 2.2198109905495276, + "grad_norm": 1.3475741147994995, + "learning_rate": 0.00014893729321167846, + "loss": 2.0094, + "step": 19026 + }, + { + "epoch": 2.2199276630498193, + "grad_norm": 1.1950592994689941, + "learning_rate": 0.00014892217551198064, + "loss": 1.766, + "step": 19027 + }, + { + "epoch": 2.220044335550111, + "grad_norm": 1.1769980192184448, + "learning_rate": 0.00014890705783900677, + "loss": 2.1918, + "step": 19028 + }, + { + "epoch": 2.2201610080504026, + "grad_norm": 1.145809531211853, + "learning_rate": 0.0001488919401929137, + "loss": 2.0345, + "step": 19029 + }, + { + "epoch": 2.2202776805506943, + "grad_norm": 1.2057312726974487, + "learning_rate": 0.00014887682257385808, + "loss": 1.9715, + "step": 19030 + }, + { + "epoch": 2.2202776805506943, + "eval_train_loss": 1.9240070581436157, + "eval_train_mean_batch_perplexity": 7.834982338110981, + "eval_train_runtime": 11046.1955, + "eval_train_samples_per_second": 12.415, + "eval_train_steps_per_second": 0.776, + "step": 19030 + }, + { + "epoch": 2.2202776805506943, + "eval_test_loss": 2.073427438735962, + "eval_test_mean_batch_perplexity": 9.216437290678401, + "eval_test_runtime": 2384.0787, + "eval_test_samples_per_second": 12.326, + "eval_test_steps_per_second": 0.771, + "step": 19030 + }, + { + "epoch": 2.220394353050986, + "grad_norm": 1.0146902799606323, + "learning_rate": 0.0001488617049819967, + "loss": 1.8284, + "step": 19031 + }, + { + "epoch": 2.2205110255512777, + "grad_norm": 1.1038612127304077, + "learning_rate": 0.0001488465874174862, + "loss": 1.9805, + "step": 19032 + }, + { + "epoch": 2.2206276980515693, + "grad_norm": 1.1338330507278442, + "learning_rate": 0.0001488314698804834, + "loss": 1.9614, + "step": 19033 + }, + { + "epoch": 2.220744370551861, + "grad_norm": 1.3766124248504639, + "learning_rate": 0.00014881635237114495, + "loss": 1.926, + "step": 19034 + }, + { + "epoch": 2.2208610430521527, + "grad_norm": 1.3080008029937744, + "learning_rate": 0.00014880123488962755, + "loss": 2.186, + "step": 19035 + }, + { + "epoch": 2.2209777155524444, + "grad_norm": 1.116191029548645, + "learning_rate": 0.00014878611743608804, + "loss": 1.8743, + "step": 19036 + }, + { + "epoch": 2.221094388052736, + "grad_norm": 1.322593331336975, + "learning_rate": 0.00014877100001068296, + "loss": 1.9484, + "step": 19037 + }, + { + "epoch": 2.2212110605530277, + "grad_norm": 1.1458944082260132, + "learning_rate": 0.00014875588261356923, + "loss": 1.8566, + "step": 19038 + }, + { + "epoch": 2.2213277330533194, + "grad_norm": 1.1282844543457031, + "learning_rate": 0.00014874076524490338, + "loss": 1.8078, + "step": 19039 + }, + { + "epoch": 2.221444405553611, + "grad_norm": 1.1146553754806519, + "learning_rate": 0.0001487256479048423, + "loss": 2.0546, + "step": 19040 + }, + { + "epoch": 2.221561078053903, + "grad_norm": 1.2978218793869019, + "learning_rate": 0.0001487105305935426, + "loss": 1.9447, + "step": 19041 + }, + { + "epoch": 2.2216777505541945, + "grad_norm": 1.1822491884231567, + "learning_rate": 0.00014869541331116105, + "loss": 1.8659, + "step": 19042 + }, + { + "epoch": 2.221794423054486, + "grad_norm": 1.139243245124817, + "learning_rate": 0.00014868029605785433, + "loss": 1.8929, + "step": 19043 + }, + { + "epoch": 2.221911095554778, + "grad_norm": 1.0638024806976318, + "learning_rate": 0.00014866517883377922, + "loss": 1.8591, + "step": 19044 + }, + { + "epoch": 2.2220277680550695, + "grad_norm": 1.2378747463226318, + "learning_rate": 0.00014865006163909233, + "loss": 1.9072, + "step": 19045 + }, + { + "epoch": 2.222144440555361, + "grad_norm": 1.161832332611084, + "learning_rate": 0.00014863494447395052, + "loss": 1.9649, + "step": 19046 + }, + { + "epoch": 2.222261113055653, + "grad_norm": 1.242103099822998, + "learning_rate": 0.00014861982733851037, + "loss": 1.9912, + "step": 19047 + }, + { + "epoch": 2.2223777855559446, + "grad_norm": 1.207268238067627, + "learning_rate": 0.00014860471023292866, + "loss": 2.0013, + "step": 19048 + }, + { + "epoch": 2.2224944580562362, + "grad_norm": 1.2149215936660767, + "learning_rate": 0.00014858959315736217, + "loss": 2.026, + "step": 19049 + }, + { + "epoch": 2.222611130556528, + "grad_norm": 1.2251437902450562, + "learning_rate": 0.00014857447611196754, + "loss": 2.081, + "step": 19050 + }, + { + "epoch": 2.2227278030568196, + "grad_norm": 1.137826681137085, + "learning_rate": 0.00014855935909690153, + "loss": 1.8914, + "step": 19051 + }, + { + "epoch": 2.2228444755571113, + "grad_norm": 1.1611937284469604, + "learning_rate": 0.00014854424211232077, + "loss": 2.0672, + "step": 19052 + }, + { + "epoch": 2.222961148057403, + "grad_norm": 1.2003986835479736, + "learning_rate": 0.0001485291251583821, + "loss": 2.0349, + "step": 19053 + }, + { + "epoch": 2.2230778205576947, + "grad_norm": 1.2945971488952637, + "learning_rate": 0.00014851400823524214, + "loss": 1.9634, + "step": 19054 + }, + { + "epoch": 2.2231944930579863, + "grad_norm": 1.150851845741272, + "learning_rate": 0.00014849889134305766, + "loss": 2.0299, + "step": 19055 + }, + { + "epoch": 2.223311165558278, + "grad_norm": 1.02839994430542, + "learning_rate": 0.0001484837744819853, + "loss": 1.844, + "step": 19056 + }, + { + "epoch": 2.2234278380585697, + "grad_norm": 1.0929909944534302, + "learning_rate": 0.00014846865765218191, + "loss": 2.0534, + "step": 19057 + }, + { + "epoch": 2.2235445105588614, + "grad_norm": 1.3170223236083984, + "learning_rate": 0.00014845354085380404, + "loss": 1.9815, + "step": 19058 + }, + { + "epoch": 2.223661183059153, + "grad_norm": 1.3466089963912964, + "learning_rate": 0.00014843842408700854, + "loss": 2.0076, + "step": 19059 + }, + { + "epoch": 2.2237778555594447, + "grad_norm": 1.0908609628677368, + "learning_rate": 0.0001484233073519521, + "loss": 1.8769, + "step": 19060 + }, + { + "epoch": 2.2238945280597364, + "grad_norm": 1.2164586782455444, + "learning_rate": 0.00014840819064879132, + "loss": 2.0133, + "step": 19061 + }, + { + "epoch": 2.224011200560028, + "grad_norm": 1.2455281019210815, + "learning_rate": 0.00014839307397768306, + "loss": 2.1238, + "step": 19062 + }, + { + "epoch": 2.22412787306032, + "grad_norm": 1.1377861499786377, + "learning_rate": 0.00014837795733878392, + "loss": 1.922, + "step": 19063 + }, + { + "epoch": 2.2242445455606115, + "grad_norm": 1.1527279615402222, + "learning_rate": 0.00014836284073225074, + "loss": 2.1036, + "step": 19064 + }, + { + "epoch": 2.224361218060903, + "grad_norm": 1.169530987739563, + "learning_rate": 0.00014834772415824008, + "loss": 1.8767, + "step": 19065 + }, + { + "epoch": 2.224477890561195, + "grad_norm": 1.0037802457809448, + "learning_rate": 0.00014833260761690882, + "loss": 2.0025, + "step": 19066 + }, + { + "epoch": 2.2245945630614865, + "grad_norm": 1.3152605295181274, + "learning_rate": 0.00014831749110841354, + "loss": 1.8728, + "step": 19067 + }, + { + "epoch": 2.224711235561778, + "grad_norm": 1.1939146518707275, + "learning_rate": 0.00014830237463291096, + "loss": 1.7717, + "step": 19068 + }, + { + "epoch": 2.22482790806207, + "grad_norm": 1.422869324684143, + "learning_rate": 0.0001482872581905578, + "loss": 1.9813, + "step": 19069 + }, + { + "epoch": 2.2249445805623616, + "grad_norm": 1.0791114568710327, + "learning_rate": 0.00014827214178151087, + "loss": 1.8692, + "step": 19070 + }, + { + "epoch": 2.2250612530626532, + "grad_norm": 1.2090189456939697, + "learning_rate": 0.00014825702540592674, + "loss": 1.8264, + "step": 19071 + }, + { + "epoch": 2.225177925562945, + "grad_norm": 1.1450270414352417, + "learning_rate": 0.00014824190906396226, + "loss": 2.065, + "step": 19072 + }, + { + "epoch": 2.2252945980632366, + "grad_norm": 1.255273699760437, + "learning_rate": 0.00014822679275577395, + "loss": 1.864, + "step": 19073 + }, + { + "epoch": 2.2254112705635283, + "grad_norm": 1.0393736362457275, + "learning_rate": 0.0001482116764815187, + "loss": 2.2236, + "step": 19074 + }, + { + "epoch": 2.22552794306382, + "grad_norm": 1.1660863161087036, + "learning_rate": 0.00014819656024135313, + "loss": 2.0014, + "step": 19075 + }, + { + "epoch": 2.2256446155641116, + "grad_norm": 1.0210223197937012, + "learning_rate": 0.000148181444035434, + "loss": 1.858, + "step": 19076 + }, + { + "epoch": 2.2257612880644033, + "grad_norm": 1.5394835472106934, + "learning_rate": 0.00014816632786391799, + "loss": 1.889, + "step": 19077 + }, + { + "epoch": 2.225877960564695, + "grad_norm": 1.183899164199829, + "learning_rate": 0.00014815121172696174, + "loss": 1.9175, + "step": 19078 + }, + { + "epoch": 2.2259946330649867, + "grad_norm": 1.0671052932739258, + "learning_rate": 0.00014813609562472208, + "loss": 1.772, + "step": 19079 + }, + { + "epoch": 2.2261113055652784, + "grad_norm": 1.1523675918579102, + "learning_rate": 0.00014812097955735562, + "loss": 2.1905, + "step": 19080 + }, + { + "epoch": 2.22622797806557, + "grad_norm": 1.012098789215088, + "learning_rate": 0.00014810586352501914, + "loss": 1.9501, + "step": 19081 + }, + { + "epoch": 2.2263446505658617, + "grad_norm": 1.1903743743896484, + "learning_rate": 0.00014809074752786928, + "loss": 2.0101, + "step": 19082 + }, + { + "epoch": 2.2264613230661534, + "grad_norm": 1.41904878616333, + "learning_rate": 0.00014807563156606283, + "loss": 2.0702, + "step": 19083 + }, + { + "epoch": 2.226577995566445, + "grad_norm": 1.2580218315124512, + "learning_rate": 0.00014806051563975644, + "loss": 1.9479, + "step": 19084 + }, + { + "epoch": 2.2266946680667368, + "grad_norm": 1.3615790605545044, + "learning_rate": 0.00014804539974910678, + "loss": 1.9849, + "step": 19085 + }, + { + "epoch": 2.2268113405670285, + "grad_norm": 1.3804001808166504, + "learning_rate": 0.0001480302838942706, + "loss": 2.0457, + "step": 19086 + }, + { + "epoch": 2.22692801306732, + "grad_norm": 1.2204641103744507, + "learning_rate": 0.00014801516807540454, + "loss": 1.9902, + "step": 19087 + }, + { + "epoch": 2.227044685567612, + "grad_norm": 1.093369722366333, + "learning_rate": 0.00014800005229266547, + "loss": 1.9758, + "step": 19088 + }, + { + "epoch": 2.2271613580679035, + "grad_norm": 1.215855360031128, + "learning_rate": 0.0001479849365462099, + "loss": 1.9811, + "step": 19089 + }, + { + "epoch": 2.227278030568195, + "grad_norm": 1.064211368560791, + "learning_rate": 0.0001479698208361947, + "loss": 1.9235, + "step": 19090 + }, + { + "epoch": 2.227394703068487, + "grad_norm": 1.0497184991836548, + "learning_rate": 0.0001479547051627764, + "loss": 1.8946, + "step": 19091 + }, + { + "epoch": 2.2275113755687785, + "grad_norm": 1.1961584091186523, + "learning_rate": 0.00014793958952611188, + "loss": 2.007, + "step": 19092 + }, + { + "epoch": 2.22762804806907, + "grad_norm": 1.1334223747253418, + "learning_rate": 0.00014792447392635773, + "loss": 1.9219, + "step": 19093 + }, + { + "epoch": 2.227744720569362, + "grad_norm": 1.202094554901123, + "learning_rate": 0.00014790935836367068, + "loss": 2.1001, + "step": 19094 + }, + { + "epoch": 2.2278613930696536, + "grad_norm": 1.1217201948165894, + "learning_rate": 0.0001478942428382074, + "loss": 2.084, + "step": 19095 + }, + { + "epoch": 2.2279780655699453, + "grad_norm": 1.1272428035736084, + "learning_rate": 0.00014787912735012465, + "loss": 1.9493, + "step": 19096 + }, + { + "epoch": 2.228094738070237, + "grad_norm": 1.286223292350769, + "learning_rate": 0.00014786401189957908, + "loss": 1.9866, + "step": 19097 + }, + { + "epoch": 2.2282114105705286, + "grad_norm": 1.1373485326766968, + "learning_rate": 0.00014784889648672745, + "loss": 1.8942, + "step": 19098 + }, + { + "epoch": 2.2283280830708203, + "grad_norm": 1.3089370727539062, + "learning_rate": 0.00014783378111172637, + "loss": 1.7782, + "step": 19099 + }, + { + "epoch": 2.228444755571112, + "grad_norm": 1.1664940118789673, + "learning_rate": 0.0001478186657747326, + "loss": 2.0781, + "step": 19100 + }, + { + "epoch": 2.2285614280714037, + "grad_norm": 0.9903453588485718, + "learning_rate": 0.00014780355047590285, + "loss": 1.9219, + "step": 19101 + }, + { + "epoch": 2.2286781005716954, + "grad_norm": 1.2444106340408325, + "learning_rate": 0.0001477884352153938, + "loss": 2.1178, + "step": 19102 + }, + { + "epoch": 2.228794773071987, + "grad_norm": 1.1916587352752686, + "learning_rate": 0.00014777331999336218, + "loss": 2.0051, + "step": 19103 + }, + { + "epoch": 2.2289114455722787, + "grad_norm": 1.3137257099151611, + "learning_rate": 0.00014775820480996456, + "loss": 2.0795, + "step": 19104 + }, + { + "epoch": 2.2290281180725704, + "grad_norm": 1.0924044847488403, + "learning_rate": 0.00014774308966535783, + "loss": 1.8495, + "step": 19105 + }, + { + "epoch": 2.229144790572862, + "grad_norm": 1.1373937129974365, + "learning_rate": 0.00014772797455969854, + "loss": 2.0001, + "step": 19106 + }, + { + "epoch": 2.2292614630731538, + "grad_norm": 1.145623803138733, + "learning_rate": 0.00014771285949314347, + "loss": 1.9341, + "step": 19107 + }, + { + "epoch": 2.2293781355734454, + "grad_norm": 1.2328072786331177, + "learning_rate": 0.00014769774446584923, + "loss": 2.0431, + "step": 19108 + }, + { + "epoch": 2.229494808073737, + "grad_norm": 1.0523957014083862, + "learning_rate": 0.00014768262947797265, + "loss": 2.0425, + "step": 19109 + }, + { + "epoch": 2.229611480574029, + "grad_norm": 1.2719173431396484, + "learning_rate": 0.0001476675145296703, + "loss": 1.8177, + "step": 19110 + }, + { + "epoch": 2.2297281530743205, + "grad_norm": 1.3929933309555054, + "learning_rate": 0.0001476523996210989, + "loss": 2.0837, + "step": 19111 + }, + { + "epoch": 2.229844825574612, + "grad_norm": 1.0995746850967407, + "learning_rate": 0.00014763728475241517, + "loss": 1.9724, + "step": 19112 + }, + { + "epoch": 2.229961498074904, + "grad_norm": 1.0983160734176636, + "learning_rate": 0.00014762216992377574, + "loss": 2.0879, + "step": 19113 + }, + { + "epoch": 2.2300781705751955, + "grad_norm": 1.3486069440841675, + "learning_rate": 0.0001476070551353375, + "loss": 2.1446, + "step": 19114 + }, + { + "epoch": 2.230194843075487, + "grad_norm": 1.2672741413116455, + "learning_rate": 0.00014759194038725687, + "loss": 2.0499, + "step": 19115 + }, + { + "epoch": 2.230311515575779, + "grad_norm": 1.1258234977722168, + "learning_rate": 0.00014757682567969078, + "loss": 2.1374, + "step": 19116 + }, + { + "epoch": 2.2304281880760706, + "grad_norm": 1.2258387804031372, + "learning_rate": 0.00014756171101279575, + "loss": 2.0563, + "step": 19117 + }, + { + "epoch": 2.2305448605763623, + "grad_norm": 1.1206800937652588, + "learning_rate": 0.0001475465963867286, + "loss": 1.9985, + "step": 19118 + }, + { + "epoch": 2.230661533076654, + "grad_norm": 1.2922420501708984, + "learning_rate": 0.00014753148180164595, + "loss": 2.08, + "step": 19119 + }, + { + "epoch": 2.2307782055769456, + "grad_norm": 1.209654450416565, + "learning_rate": 0.00014751636725770454, + "loss": 2.1116, + "step": 19120 + }, + { + "epoch": 2.2308948780772373, + "grad_norm": 1.2789252996444702, + "learning_rate": 0.00014750125275506096, + "loss": 1.89, + "step": 19121 + }, + { + "epoch": 2.231011550577529, + "grad_norm": 1.193280577659607, + "learning_rate": 0.000147486138293872, + "loss": 1.9455, + "step": 19122 + }, + { + "epoch": 2.2311282230778207, + "grad_norm": 1.193922519683838, + "learning_rate": 0.0001474710238742943, + "loss": 1.8607, + "step": 19123 + }, + { + "epoch": 2.2312448955781123, + "grad_norm": 1.0895806550979614, + "learning_rate": 0.00014745590949648458, + "loss": 2.0094, + "step": 19124 + }, + { + "epoch": 2.231361568078404, + "grad_norm": 0.9937450885772705, + "learning_rate": 0.00014744079516059949, + "loss": 1.857, + "step": 19125 + }, + { + "epoch": 2.2314782405786957, + "grad_norm": 1.3332618474960327, + "learning_rate": 0.00014742568086679577, + "loss": 2.0967, + "step": 19126 + }, + { + "epoch": 2.2315949130789874, + "grad_norm": 1.0787354707717896, + "learning_rate": 0.00014741056661523005, + "loss": 1.9488, + "step": 19127 + }, + { + "epoch": 2.231711585579279, + "grad_norm": 1.1168814897537231, + "learning_rate": 0.0001473954524060591, + "loss": 1.9004, + "step": 19128 + }, + { + "epoch": 2.2318282580795707, + "grad_norm": 1.154340386390686, + "learning_rate": 0.00014738033823943956, + "loss": 2.1539, + "step": 19129 + }, + { + "epoch": 2.2319449305798624, + "grad_norm": 1.1215122938156128, + "learning_rate": 0.00014736522411552805, + "loss": 2.0389, + "step": 19130 + }, + { + "epoch": 2.232061603080154, + "grad_norm": 1.3042386770248413, + "learning_rate": 0.00014735011003448142, + "loss": 2.0212, + "step": 19131 + }, + { + "epoch": 2.232178275580446, + "grad_norm": 1.3823189735412598, + "learning_rate": 0.00014733499599645618, + "loss": 2.0056, + "step": 19132 + }, + { + "epoch": 2.2322949480807375, + "grad_norm": 1.4527039527893066, + "learning_rate": 0.00014731988200160912, + "loss": 1.9538, + "step": 19133 + }, + { + "epoch": 2.232411620581029, + "grad_norm": 1.070487141609192, + "learning_rate": 0.0001473047680500969, + "loss": 1.7543, + "step": 19134 + }, + { + "epoch": 2.232528293081321, + "grad_norm": 1.1588772535324097, + "learning_rate": 0.0001472896541420762, + "loss": 2.0069, + "step": 19135 + }, + { + "epoch": 2.2326449655816125, + "grad_norm": 1.4619841575622559, + "learning_rate": 0.00014727454027770373, + "loss": 1.9748, + "step": 19136 + }, + { + "epoch": 2.232761638081904, + "grad_norm": 1.0921920537948608, + "learning_rate": 0.0001472594264571361, + "loss": 1.8412, + "step": 19137 + }, + { + "epoch": 2.232878310582196, + "grad_norm": 1.207290530204773, + "learning_rate": 0.00014724431268053008, + "loss": 2.2223, + "step": 19138 + }, + { + "epoch": 2.2329949830824876, + "grad_norm": 1.2423481941223145, + "learning_rate": 0.00014722919894804228, + "loss": 1.9647, + "step": 19139 + }, + { + "epoch": 2.2331116555827792, + "grad_norm": 1.4290217161178589, + "learning_rate": 0.0001472140852598295, + "loss": 2.0104, + "step": 19140 + }, + { + "epoch": 2.233228328083071, + "grad_norm": 1.3839987516403198, + "learning_rate": 0.00014719897161604825, + "loss": 1.9764, + "step": 19141 + }, + { + "epoch": 2.2333450005833626, + "grad_norm": 1.2092134952545166, + "learning_rate": 0.00014718385801685538, + "loss": 2.1013, + "step": 19142 + }, + { + "epoch": 2.2334616730836543, + "grad_norm": 1.1445589065551758, + "learning_rate": 0.00014716874446240742, + "loss": 1.9972, + "step": 19143 + }, + { + "epoch": 2.233578345583946, + "grad_norm": 1.2488692998886108, + "learning_rate": 0.0001471536309528612, + "loss": 1.9056, + "step": 19144 + }, + { + "epoch": 2.2336950180842376, + "grad_norm": 1.2001060247421265, + "learning_rate": 0.0001471385174883733, + "loss": 1.8909, + "step": 19145 + }, + { + "epoch": 2.2338116905845293, + "grad_norm": 1.0145829916000366, + "learning_rate": 0.00014712340406910045, + "loss": 1.7185, + "step": 19146 + }, + { + "epoch": 2.233928363084821, + "grad_norm": 1.0901087522506714, + "learning_rate": 0.00014710829069519924, + "loss": 1.708, + "step": 19147 + }, + { + "epoch": 2.2340450355851127, + "grad_norm": 1.212187647819519, + "learning_rate": 0.00014709317736682652, + "loss": 1.9418, + "step": 19148 + }, + { + "epoch": 2.2341617080854044, + "grad_norm": 1.064018964767456, + "learning_rate": 0.00014707806408413872, + "loss": 1.9086, + "step": 19149 + }, + { + "epoch": 2.234278380585696, + "grad_norm": 1.158482313156128, + "learning_rate": 0.00014706295084729277, + "loss": 2.071, + "step": 19150 + }, + { + "epoch": 2.2343950530859877, + "grad_norm": 1.0744928121566772, + "learning_rate": 0.00014704783765644516, + "loss": 1.7897, + "step": 19151 + }, + { + "epoch": 2.2345117255862794, + "grad_norm": 1.1912810802459717, + "learning_rate": 0.00014703272451175266, + "loss": 1.9352, + "step": 19152 + }, + { + "epoch": 2.234628398086571, + "grad_norm": 1.1252949237823486, + "learning_rate": 0.000147017611413372, + "loss": 1.9903, + "step": 19153 + }, + { + "epoch": 2.234745070586863, + "grad_norm": 1.2483869791030884, + "learning_rate": 0.00014700249836145968, + "loss": 1.9898, + "step": 19154 + }, + { + "epoch": 2.2348617430871545, + "grad_norm": 1.0657036304473877, + "learning_rate": 0.0001469873853561726, + "loss": 1.832, + "step": 19155 + }, + { + "epoch": 2.234978415587446, + "grad_norm": 1.1077451705932617, + "learning_rate": 0.0001469722723976672, + "loss": 1.9347, + "step": 19156 + }, + { + "epoch": 2.235095088087738, + "grad_norm": 1.3542252779006958, + "learning_rate": 0.00014695715948610037, + "loss": 1.8815, + "step": 19157 + }, + { + "epoch": 2.2352117605880295, + "grad_norm": 1.2815455198287964, + "learning_rate": 0.0001469420466216286, + "loss": 1.9599, + "step": 19158 + }, + { + "epoch": 2.235328433088321, + "grad_norm": 1.13032865524292, + "learning_rate": 0.0001469269338044087, + "loss": 1.8667, + "step": 19159 + }, + { + "epoch": 2.235445105588613, + "grad_norm": 1.1488126516342163, + "learning_rate": 0.00014691182103459727, + "loss": 2.046, + "step": 19160 + }, + { + "epoch": 2.2355617780889045, + "grad_norm": 1.682401180267334, + "learning_rate": 0.00014689670831235103, + "loss": 2.0914, + "step": 19161 + }, + { + "epoch": 2.2356784505891962, + "grad_norm": 1.3688559532165527, + "learning_rate": 0.0001468815956378266, + "loss": 1.9072, + "step": 19162 + }, + { + "epoch": 2.235795123089488, + "grad_norm": 1.2755693197250366, + "learning_rate": 0.00014686648301118072, + "loss": 1.9663, + "step": 19163 + }, + { + "epoch": 2.2359117955897796, + "grad_norm": 1.122783899307251, + "learning_rate": 0.00014685137043256993, + "loss": 1.97, + "step": 19164 + }, + { + "epoch": 2.2360284680900713, + "grad_norm": 1.172562599182129, + "learning_rate": 0.000146836257902151, + "loss": 2.0549, + "step": 19165 + }, + { + "epoch": 2.236145140590363, + "grad_norm": 1.6217677593231201, + "learning_rate": 0.00014682114542008068, + "loss": 2.0825, + "step": 19166 + }, + { + "epoch": 2.2362618130906546, + "grad_norm": 1.0640963315963745, + "learning_rate": 0.00014680603298651546, + "loss": 1.9134, + "step": 19167 + }, + { + "epoch": 2.2363784855909463, + "grad_norm": 1.0059103965759277, + "learning_rate": 0.00014679092060161216, + "loss": 1.9895, + "step": 19168 + }, + { + "epoch": 2.236495158091238, + "grad_norm": 1.2563616037368774, + "learning_rate": 0.00014677580826552735, + "loss": 1.9074, + "step": 19169 + }, + { + "epoch": 2.2366118305915297, + "grad_norm": 1.2373141050338745, + "learning_rate": 0.00014676069597841773, + "loss": 2.0814, + "step": 19170 + }, + { + "epoch": 2.2367285030918214, + "grad_norm": 1.3452918529510498, + "learning_rate": 0.00014674558374044, + "loss": 2.1501, + "step": 19171 + }, + { + "epoch": 2.236845175592113, + "grad_norm": 1.1454817056655884, + "learning_rate": 0.0001467304715517508, + "loss": 1.9505, + "step": 19172 + }, + { + "epoch": 2.2369618480924047, + "grad_norm": 1.2048633098602295, + "learning_rate": 0.00014671535941250676, + "loss": 1.9808, + "step": 19173 + }, + { + "epoch": 2.2370785205926964, + "grad_norm": 1.253145694732666, + "learning_rate": 0.0001467002473228646, + "loss": 1.9822, + "step": 19174 + }, + { + "epoch": 2.237195193092988, + "grad_norm": 1.2296854257583618, + "learning_rate": 0.0001466851352829809, + "loss": 1.9491, + "step": 19175 + }, + { + "epoch": 2.2373118655932798, + "grad_norm": 1.1653095483779907, + "learning_rate": 0.00014667002329301251, + "loss": 2.0151, + "step": 19176 + }, + { + "epoch": 2.2374285380935715, + "grad_norm": 1.256150722503662, + "learning_rate": 0.00014665491135311588, + "loss": 2.0594, + "step": 19177 + }, + { + "epoch": 2.237545210593863, + "grad_norm": 1.3331118822097778, + "learning_rate": 0.0001466397994634478, + "loss": 2.1472, + "step": 19178 + }, + { + "epoch": 2.237661883094155, + "grad_norm": 1.4071955680847168, + "learning_rate": 0.00014662468762416495, + "loss": 1.9569, + "step": 19179 + }, + { + "epoch": 2.2377785555944465, + "grad_norm": 1.0928781032562256, + "learning_rate": 0.00014660957583542388, + "loss": 1.8223, + "step": 19180 + }, + { + "epoch": 2.237895228094738, + "grad_norm": 1.0078091621398926, + "learning_rate": 0.00014659446409738135, + "loss": 1.9385, + "step": 19181 + }, + { + "epoch": 2.23801190059503, + "grad_norm": 1.0816128253936768, + "learning_rate": 0.00014657935241019394, + "loss": 1.9754, + "step": 19182 + }, + { + "epoch": 2.2381285730953215, + "grad_norm": 1.1709527969360352, + "learning_rate": 0.00014656424077401844, + "loss": 1.9915, + "step": 19183 + }, + { + "epoch": 2.238245245595613, + "grad_norm": 1.3361144065856934, + "learning_rate": 0.00014654912918901135, + "loss": 1.9144, + "step": 19184 + }, + { + "epoch": 2.238361918095905, + "grad_norm": 1.192533016204834, + "learning_rate": 0.00014653401765532949, + "loss": 2.0982, + "step": 19185 + }, + { + "epoch": 2.2384785905961966, + "grad_norm": 1.0476350784301758, + "learning_rate": 0.00014651890617312937, + "loss": 1.8289, + "step": 19186 + }, + { + "epoch": 2.2385952630964883, + "grad_norm": 1.2975841760635376, + "learning_rate": 0.00014650379474256777, + "loss": 2.061, + "step": 19187 + }, + { + "epoch": 2.23871193559678, + "grad_norm": 1.227656602859497, + "learning_rate": 0.00014648868336380127, + "loss": 2.0206, + "step": 19188 + }, + { + "epoch": 2.2388286080970716, + "grad_norm": 1.1101477146148682, + "learning_rate": 0.0001464735720369866, + "loss": 1.9522, + "step": 19189 + }, + { + "epoch": 2.2389452805973633, + "grad_norm": 1.1716219186782837, + "learning_rate": 0.0001464584607622803, + "loss": 2.0677, + "step": 19190 + }, + { + "epoch": 2.239061953097655, + "grad_norm": 1.118242859840393, + "learning_rate": 0.0001464433495398391, + "loss": 1.8938, + "step": 19191 + }, + { + "epoch": 2.2391786255979467, + "grad_norm": 1.2475249767303467, + "learning_rate": 0.00014642823836981972, + "loss": 2.0431, + "step": 19192 + }, + { + "epoch": 2.2392952980982384, + "grad_norm": 1.1765056848526, + "learning_rate": 0.00014641312725237871, + "loss": 1.8256, + "step": 19193 + }, + { + "epoch": 2.23941197059853, + "grad_norm": 1.3073004484176636, + "learning_rate": 0.00014639801618767281, + "loss": 2.1036, + "step": 19194 + }, + { + "epoch": 2.2395286430988217, + "grad_norm": 1.1785838603973389, + "learning_rate": 0.00014638290517585863, + "loss": 1.9763, + "step": 19195 + }, + { + "epoch": 2.2396453155991134, + "grad_norm": 1.1131889820098877, + "learning_rate": 0.00014636779421709278, + "loss": 1.8618, + "step": 19196 + }, + { + "epoch": 2.239761988099405, + "grad_norm": 1.29546320438385, + "learning_rate": 0.00014635268331153202, + "loss": 2.0019, + "step": 19197 + }, + { + "epoch": 2.2398786605996968, + "grad_norm": 1.102908730506897, + "learning_rate": 0.00014633757245933294, + "loss": 1.9458, + "step": 19198 + }, + { + "epoch": 2.2399953330999884, + "grad_norm": 1.197865605354309, + "learning_rate": 0.00014632246166065214, + "loss": 2.0142, + "step": 19199 + }, + { + "epoch": 2.24011200560028, + "grad_norm": 1.152316927909851, + "learning_rate": 0.0001463073509156464, + "loss": 1.8012, + "step": 19200 + }, + { + "epoch": 2.240228678100572, + "grad_norm": 1.2318092584609985, + "learning_rate": 0.00014629224022447222, + "loss": 1.9751, + "step": 19201 + }, + { + "epoch": 2.2403453506008635, + "grad_norm": 1.1018503904342651, + "learning_rate": 0.0001462771295872864, + "loss": 2.0515, + "step": 19202 + }, + { + "epoch": 2.240462023101155, + "grad_norm": 1.1226862668991089, + "learning_rate": 0.0001462620190042455, + "loss": 1.9521, + "step": 19203 + }, + { + "epoch": 2.240578695601447, + "grad_norm": 1.0873160362243652, + "learning_rate": 0.00014624690847550616, + "loss": 2.105, + "step": 19204 + }, + { + "epoch": 2.2406953681017385, + "grad_norm": 1.0993369817733765, + "learning_rate": 0.00014623179800122513, + "loss": 1.9859, + "step": 19205 + }, + { + "epoch": 2.24081204060203, + "grad_norm": 1.0501468181610107, + "learning_rate": 0.00014621668758155892, + "loss": 2.0172, + "step": 19206 + }, + { + "epoch": 2.240928713102322, + "grad_norm": 1.1552814245224, + "learning_rate": 0.00014620157721666432, + "loss": 1.9987, + "step": 19207 + }, + { + "epoch": 2.2410453856026136, + "grad_norm": 1.1627141237258911, + "learning_rate": 0.00014618646690669782, + "loss": 1.9012, + "step": 19208 + }, + { + "epoch": 2.2411620581029053, + "grad_norm": 1.2070428133010864, + "learning_rate": 0.00014617135665181624, + "loss": 2.0022, + "step": 19209 + }, + { + "epoch": 2.241278730603197, + "grad_norm": 1.230656623840332, + "learning_rate": 0.00014615624645217606, + "loss": 1.8262, + "step": 19210 + }, + { + "epoch": 2.2413954031034886, + "grad_norm": 1.0761030912399292, + "learning_rate": 0.00014614113630793407, + "loss": 2.0518, + "step": 19211 + }, + { + "epoch": 2.2415120756037803, + "grad_norm": 1.2278231382369995, + "learning_rate": 0.0001461260262192468, + "loss": 1.9938, + "step": 19212 + }, + { + "epoch": 2.241628748104072, + "grad_norm": 1.2944337129592896, + "learning_rate": 0.00014611091618627097, + "loss": 1.9393, + "step": 19213 + }, + { + "epoch": 2.2417454206043637, + "grad_norm": 1.2488322257995605, + "learning_rate": 0.0001460958062091632, + "loss": 1.8926, + "step": 19214 + }, + { + "epoch": 2.2418620931046553, + "grad_norm": 1.2106951475143433, + "learning_rate": 0.00014608069628808018, + "loss": 2.1149, + "step": 19215 + }, + { + "epoch": 2.241978765604947, + "grad_norm": 1.1717188358306885, + "learning_rate": 0.00014606558642317843, + "loss": 1.8051, + "step": 19216 + }, + { + "epoch": 2.2420954381052387, + "grad_norm": 1.094208836555481, + "learning_rate": 0.00014605047661461464, + "loss": 1.9218, + "step": 19217 + }, + { + "epoch": 2.2422121106055304, + "grad_norm": 1.2465518712997437, + "learning_rate": 0.00014603536686254556, + "loss": 2.1411, + "step": 19218 + }, + { + "epoch": 2.242328783105822, + "grad_norm": 1.294506549835205, + "learning_rate": 0.0001460202571671277, + "loss": 1.8982, + "step": 19219 + }, + { + "epoch": 2.2424454556061137, + "grad_norm": 1.4128475189208984, + "learning_rate": 0.0001460051475285178, + "loss": 2.0151, + "step": 19220 + }, + { + "epoch": 2.2425621281064054, + "grad_norm": 1.3052639961242676, + "learning_rate": 0.00014599003794687243, + "loss": 2.0315, + "step": 19221 + }, + { + "epoch": 2.242678800606697, + "grad_norm": 1.1965736150741577, + "learning_rate": 0.00014597492842234822, + "loss": 1.955, + "step": 19222 + }, + { + "epoch": 2.242795473106989, + "grad_norm": 1.2766095399856567, + "learning_rate": 0.00014595981895510187, + "loss": 2.106, + "step": 19223 + }, + { + "epoch": 2.2429121456072805, + "grad_norm": 1.3217090368270874, + "learning_rate": 0.00014594470954529, + "loss": 1.8672, + "step": 19224 + }, + { + "epoch": 2.243028818107572, + "grad_norm": 1.159647822380066, + "learning_rate": 0.0001459296001930692, + "loss": 1.8743, + "step": 19225 + }, + { + "epoch": 2.243145490607864, + "grad_norm": 0.9933373332023621, + "learning_rate": 0.00014591449089859619, + "loss": 1.8119, + "step": 19226 + }, + { + "epoch": 2.2432621631081555, + "grad_norm": 1.1815195083618164, + "learning_rate": 0.0001458993816620275, + "loss": 1.9988, + "step": 19227 + }, + { + "epoch": 2.243378835608447, + "grad_norm": 1.0358408689498901, + "learning_rate": 0.00014588427248351987, + "loss": 1.7989, + "step": 19228 + }, + { + "epoch": 2.243495508108739, + "grad_norm": 1.4130990505218506, + "learning_rate": 0.00014586916336322984, + "loss": 2.1296, + "step": 19229 + }, + { + "epoch": 2.2436121806090306, + "grad_norm": 1.2012584209442139, + "learning_rate": 0.00014585405430131413, + "loss": 1.963, + "step": 19230 + }, + { + "epoch": 2.2437288531093222, + "grad_norm": 1.1873210668563843, + "learning_rate": 0.00014583894529792937, + "loss": 1.7579, + "step": 19231 + }, + { + "epoch": 2.243845525609614, + "grad_norm": 1.2153500318527222, + "learning_rate": 0.00014582383635323206, + "loss": 2.0886, + "step": 19232 + }, + { + "epoch": 2.2439621981099056, + "grad_norm": 1.2524197101593018, + "learning_rate": 0.00014580872746737907, + "loss": 1.8032, + "step": 19233 + }, + { + "epoch": 2.2440788706101973, + "grad_norm": 1.1183505058288574, + "learning_rate": 0.00014579361864052676, + "loss": 2.1169, + "step": 19234 + }, + { + "epoch": 2.244195543110489, + "grad_norm": 1.132325291633606, + "learning_rate": 0.00014577850987283202, + "loss": 1.9193, + "step": 19235 + }, + { + "epoch": 2.2443122156107806, + "grad_norm": 1.1816768646240234, + "learning_rate": 0.00014576340116445127, + "loss": 1.838, + "step": 19236 + }, + { + "epoch": 2.2444288881110723, + "grad_norm": 1.2260262966156006, + "learning_rate": 0.0001457482925155413, + "loss": 1.8771, + "step": 19237 + }, + { + "epoch": 2.244545560611364, + "grad_norm": 1.1489591598510742, + "learning_rate": 0.00014573318392625866, + "loss": 2.0472, + "step": 19238 + }, + { + "epoch": 2.2446622331116557, + "grad_norm": 1.3728554248809814, + "learning_rate": 0.00014571807539675994, + "loss": 2.0843, + "step": 19239 + }, + { + "epoch": 2.2447789056119474, + "grad_norm": 1.2053313255310059, + "learning_rate": 0.00014570296692720188, + "loss": 2.0124, + "step": 19240 + }, + { + "epoch": 2.244895578112239, + "grad_norm": 1.2390867471694946, + "learning_rate": 0.000145687858517741, + "loss": 2.1293, + "step": 19241 + }, + { + "epoch": 2.2450122506125307, + "grad_norm": 1.0980280637741089, + "learning_rate": 0.00014567275016853396, + "loss": 1.9146, + "step": 19242 + }, + { + "epoch": 2.2451289231128224, + "grad_norm": 1.0971949100494385, + "learning_rate": 0.0001456576418797374, + "loss": 2.0232, + "step": 19243 + }, + { + "epoch": 2.245245595613114, + "grad_norm": 1.2721483707427979, + "learning_rate": 0.000145642533651508, + "loss": 2.0433, + "step": 19244 + }, + { + "epoch": 2.245362268113406, + "grad_norm": 1.141135334968567, + "learning_rate": 0.00014562742548400226, + "loss": 2.0781, + "step": 19245 + }, + { + "epoch": 2.2454789406136975, + "grad_norm": 1.1939128637313843, + "learning_rate": 0.00014561231737737692, + "loss": 2.1483, + "step": 19246 + }, + { + "epoch": 2.245595613113989, + "grad_norm": 1.1446890830993652, + "learning_rate": 0.00014559720933178858, + "loss": 1.9196, + "step": 19247 + }, + { + "epoch": 2.245712285614281, + "grad_norm": 1.0313022136688232, + "learning_rate": 0.00014558210134739386, + "loss": 1.8365, + "step": 19248 + }, + { + "epoch": 2.2458289581145725, + "grad_norm": 1.223185658454895, + "learning_rate": 0.00014556699342434926, + "loss": 1.9768, + "step": 19249 + }, + { + "epoch": 2.245945630614864, + "grad_norm": 1.2783138751983643, + "learning_rate": 0.00014555188556281161, + "loss": 1.9293, + "step": 19250 + }, + { + "epoch": 2.246062303115156, + "grad_norm": 1.0604223012924194, + "learning_rate": 0.00014553677776293734, + "loss": 2.0172, + "step": 19251 + }, + { + "epoch": 2.2461789756154475, + "grad_norm": 1.26572847366333, + "learning_rate": 0.00014552167002488325, + "loss": 2.0154, + "step": 19252 + }, + { + "epoch": 2.2462956481157392, + "grad_norm": 1.0210624933242798, + "learning_rate": 0.0001455065623488058, + "loss": 2.0244, + "step": 19253 + }, + { + "epoch": 2.246412320616031, + "grad_norm": 1.0860460996627808, + "learning_rate": 0.00014549145473486175, + "loss": 1.8979, + "step": 19254 + }, + { + "epoch": 2.2465289931163226, + "grad_norm": 1.1789969205856323, + "learning_rate": 0.00014547634718320757, + "loss": 1.9458, + "step": 19255 + }, + { + "epoch": 2.2466456656166143, + "grad_norm": 1.41713547706604, + "learning_rate": 0.00014546123969400002, + "loss": 2.1507, + "step": 19256 + }, + { + "epoch": 2.246762338116906, + "grad_norm": 1.1445597410202026, + "learning_rate": 0.00014544613226739566, + "loss": 1.8943, + "step": 19257 + }, + { + "epoch": 2.2468790106171976, + "grad_norm": 1.105324387550354, + "learning_rate": 0.00014543102490355105, + "loss": 1.9476, + "step": 19258 + }, + { + "epoch": 2.2469956831174893, + "grad_norm": 1.113169550895691, + "learning_rate": 0.0001454159176026229, + "loss": 1.8164, + "step": 19259 + }, + { + "epoch": 2.247112355617781, + "grad_norm": 1.1239138841629028, + "learning_rate": 0.00014540081036476776, + "loss": 1.957, + "step": 19260 + }, + { + "epoch": 2.2472290281180727, + "grad_norm": 1.1404099464416504, + "learning_rate": 0.0001453857031901423, + "loss": 1.9532, + "step": 19261 + }, + { + "epoch": 2.2473457006183644, + "grad_norm": 1.1928929090499878, + "learning_rate": 0.00014537059607890306, + "loss": 1.9484, + "step": 19262 + }, + { + "epoch": 2.247462373118656, + "grad_norm": 1.1024690866470337, + "learning_rate": 0.00014535548903120676, + "loss": 1.9955, + "step": 19263 + }, + { + "epoch": 2.2475790456189477, + "grad_norm": 1.2074685096740723, + "learning_rate": 0.0001453403820472099, + "loss": 2.0674, + "step": 19264 + }, + { + "epoch": 2.2476957181192394, + "grad_norm": 1.1356927156448364, + "learning_rate": 0.00014532527512706918, + "loss": 1.9242, + "step": 19265 + }, + { + "epoch": 2.247812390619531, + "grad_norm": 1.1720894575119019, + "learning_rate": 0.00014531016827094116, + "loss": 2.1363, + "step": 19266 + }, + { + "epoch": 2.2479290631198228, + "grad_norm": 1.1318026781082153, + "learning_rate": 0.0001452950614789825, + "loss": 1.7806, + "step": 19267 + }, + { + "epoch": 2.2480457356201144, + "grad_norm": 1.0037767887115479, + "learning_rate": 0.0001452799547513497, + "loss": 1.921, + "step": 19268 + }, + { + "epoch": 2.248162408120406, + "grad_norm": 1.1023313999176025, + "learning_rate": 0.00014526484808819948, + "loss": 1.7347, + "step": 19269 + }, + { + "epoch": 2.248279080620698, + "grad_norm": 1.2531534433364868, + "learning_rate": 0.00014524974148968843, + "loss": 2.0559, + "step": 19270 + }, + { + "epoch": 2.2483957531209895, + "grad_norm": 1.1010010242462158, + "learning_rate": 0.00014523463495597313, + "loss": 2.0334, + "step": 19271 + }, + { + "epoch": 2.248512425621281, + "grad_norm": 1.1187632083892822, + "learning_rate": 0.00014521952848721025, + "loss": 2.0298, + "step": 19272 + }, + { + "epoch": 2.248629098121573, + "grad_norm": 1.1504520177841187, + "learning_rate": 0.00014520442208355633, + "loss": 1.9505, + "step": 19273 + }, + { + "epoch": 2.2487457706218645, + "grad_norm": 1.0034034252166748, + "learning_rate": 0.00014518931574516802, + "loss": 1.9009, + "step": 19274 + }, + { + "epoch": 2.248862443122156, + "grad_norm": 1.1576178073883057, + "learning_rate": 0.00014517420947220182, + "loss": 2.0042, + "step": 19275 + }, + { + "epoch": 2.248979115622448, + "grad_norm": 1.1177141666412354, + "learning_rate": 0.0001451591032648145, + "loss": 1.9018, + "step": 19276 + }, + { + "epoch": 2.2490957881227396, + "grad_norm": 1.0979437828063965, + "learning_rate": 0.00014514399712316252, + "loss": 2.0275, + "step": 19277 + }, + { + "epoch": 2.2492124606230313, + "grad_norm": 1.2796541452407837, + "learning_rate": 0.0001451288910474026, + "loss": 2.0203, + "step": 19278 + }, + { + "epoch": 2.249329133123323, + "grad_norm": 1.1595402956008911, + "learning_rate": 0.0001451137850376912, + "loss": 2.0946, + "step": 19279 + }, + { + "epoch": 2.2494458056236146, + "grad_norm": 1.1951558589935303, + "learning_rate": 0.00014509867909418513, + "loss": 2.1054, + "step": 19280 + }, + { + "epoch": 2.2495624781239063, + "grad_norm": 1.2852146625518799, + "learning_rate": 0.00014508357321704076, + "loss": 2.0433, + "step": 19281 + }, + { + "epoch": 2.249679150624198, + "grad_norm": 1.1137734651565552, + "learning_rate": 0.00014506846740641488, + "loss": 1.9104, + "step": 19282 + }, + { + "epoch": 2.2497958231244897, + "grad_norm": 1.1471400260925293, + "learning_rate": 0.00014505336166246402, + "loss": 1.956, + "step": 19283 + }, + { + "epoch": 2.2499124956247814, + "grad_norm": 1.2025738954544067, + "learning_rate": 0.00014503825598534471, + "loss": 1.9218, + "step": 19284 + }, + { + "epoch": 2.250029168125073, + "grad_norm": 1.1765570640563965, + "learning_rate": 0.00014502315037521368, + "loss": 2.0718, + "step": 19285 + }, + { + "epoch": 2.2501458406253647, + "grad_norm": 1.2095867395401, + "learning_rate": 0.0001450080448322274, + "loss": 1.9687, + "step": 19286 + }, + { + "epoch": 2.2502625131256564, + "grad_norm": 1.0793098211288452, + "learning_rate": 0.0001449929393565426, + "loss": 1.8547, + "step": 19287 + }, + { + "epoch": 2.250379185625948, + "grad_norm": 1.1654949188232422, + "learning_rate": 0.00014497783394831575, + "loss": 1.988, + "step": 19288 + }, + { + "epoch": 2.2504958581262398, + "grad_norm": 1.4226969480514526, + "learning_rate": 0.00014496272860770355, + "loss": 1.978, + "step": 19289 + }, + { + "epoch": 2.2506125306265314, + "grad_norm": 1.2798196077346802, + "learning_rate": 0.00014494762333486252, + "loss": 2.0282, + "step": 19290 + }, + { + "epoch": 2.250729203126823, + "grad_norm": 1.1274975538253784, + "learning_rate": 0.00014493251812994928, + "loss": 2.049, + "step": 19291 + }, + { + "epoch": 2.250845875627115, + "grad_norm": 1.2701059579849243, + "learning_rate": 0.00014491741299312038, + "loss": 2.0737, + "step": 19292 + }, + { + "epoch": 2.2509625481274065, + "grad_norm": 1.3382943868637085, + "learning_rate": 0.00014490230792453255, + "loss": 2.1278, + "step": 19293 + }, + { + "epoch": 2.251079220627698, + "grad_norm": 1.2109503746032715, + "learning_rate": 0.00014488720292434222, + "loss": 1.9744, + "step": 19294 + }, + { + "epoch": 2.25119589312799, + "grad_norm": 1.119519591331482, + "learning_rate": 0.00014487209799270604, + "loss": 2.0349, + "step": 19295 + }, + { + "epoch": 2.2513125656282815, + "grad_norm": 1.3078432083129883, + "learning_rate": 0.00014485699312978068, + "loss": 2.0646, + "step": 19296 + }, + { + "epoch": 2.251429238128573, + "grad_norm": 1.1953253746032715, + "learning_rate": 0.00014484188833572263, + "loss": 2.0217, + "step": 19297 + }, + { + "epoch": 2.251545910628865, + "grad_norm": 1.1003531217575073, + "learning_rate": 0.00014482678361068854, + "loss": 1.9434, + "step": 19298 + }, + { + "epoch": 2.2516625831291566, + "grad_norm": 1.0693421363830566, + "learning_rate": 0.000144811678954835, + "loss": 1.922, + "step": 19299 + }, + { + "epoch": 2.2517792556294483, + "grad_norm": 1.2401901483535767, + "learning_rate": 0.00014479657436831855, + "loss": 2.0646, + "step": 19300 + }, + { + "epoch": 2.25189592812974, + "grad_norm": 1.1450567245483398, + "learning_rate": 0.00014478146985129575, + "loss": 2.2268, + "step": 19301 + }, + { + "epoch": 2.2520126006300316, + "grad_norm": 1.1166672706604004, + "learning_rate": 0.0001447663654039233, + "loss": 1.969, + "step": 19302 + }, + { + "epoch": 2.2521292731303233, + "grad_norm": 1.1259795427322388, + "learning_rate": 0.00014475126102635766, + "loss": 1.8842, + "step": 19303 + }, + { + "epoch": 2.252245945630615, + "grad_norm": 1.0972245931625366, + "learning_rate": 0.00014473615671875556, + "loss": 2.1363, + "step": 19304 + }, + { + "epoch": 2.2523626181309067, + "grad_norm": 1.1770528554916382, + "learning_rate": 0.00014472105248127342, + "loss": 2.0007, + "step": 19305 + }, + { + "epoch": 2.2524792906311983, + "grad_norm": 1.1783052682876587, + "learning_rate": 0.00014470594831406797, + "loss": 1.9419, + "step": 19306 + }, + { + "epoch": 2.25259596313149, + "grad_norm": 1.162667989730835, + "learning_rate": 0.00014469084421729568, + "loss": 2.1298, + "step": 19307 + }, + { + "epoch": 2.2527126356317817, + "grad_norm": 1.1513265371322632, + "learning_rate": 0.00014467574019111322, + "loss": 2.1978, + "step": 19308 + }, + { + "epoch": 2.2528293081320734, + "grad_norm": 1.197927713394165, + "learning_rate": 0.00014466063623567717, + "loss": 1.9072, + "step": 19309 + }, + { + "epoch": 2.252945980632365, + "grad_norm": 1.2088545560836792, + "learning_rate": 0.00014464553235114398, + "loss": 1.8248, + "step": 19310 + }, + { + "epoch": 2.2530626531326567, + "grad_norm": 1.2006086111068726, + "learning_rate": 0.00014463042853767044, + "loss": 2.0448, + "step": 19311 + }, + { + "epoch": 2.2531793256329484, + "grad_norm": 1.168891191482544, + "learning_rate": 0.0001446153247954129, + "loss": 1.8797, + "step": 19312 + }, + { + "epoch": 2.25329599813324, + "grad_norm": 1.1266847848892212, + "learning_rate": 0.00014460022112452816, + "loss": 2.0849, + "step": 19313 + }, + { + "epoch": 2.253412670633532, + "grad_norm": 1.1796886920928955, + "learning_rate": 0.00014458511752517262, + "loss": 2.0962, + "step": 19314 + }, + { + "epoch": 2.2535293431338235, + "grad_norm": 1.110483169555664, + "learning_rate": 0.000144570013997503, + "loss": 2.0663, + "step": 19315 + }, + { + "epoch": 2.253646015634115, + "grad_norm": 1.1432812213897705, + "learning_rate": 0.00014455491054167575, + "loss": 1.859, + "step": 19316 + }, + { + "epoch": 2.253762688134407, + "grad_norm": 1.1076208353042603, + "learning_rate": 0.00014453980715784754, + "loss": 1.9051, + "step": 19317 + }, + { + "epoch": 2.2538793606346985, + "grad_norm": 1.186894178390503, + "learning_rate": 0.00014452470384617488, + "loss": 1.9688, + "step": 19318 + }, + { + "epoch": 2.25399603313499, + "grad_norm": 1.247733473777771, + "learning_rate": 0.0001445096006068144, + "loss": 1.9084, + "step": 19319 + }, + { + "epoch": 2.254112705635282, + "grad_norm": 1.302299976348877, + "learning_rate": 0.00014449449743992262, + "loss": 1.9759, + "step": 19320 + }, + { + "epoch": 2.2542293781355736, + "grad_norm": 1.068480372428894, + "learning_rate": 0.00014447939434565613, + "loss": 1.9044, + "step": 19321 + }, + { + "epoch": 2.2543460506358652, + "grad_norm": 1.2245575189590454, + "learning_rate": 0.00014446429132417154, + "loss": 1.9132, + "step": 19322 + }, + { + "epoch": 2.254462723136157, + "grad_norm": 1.2616283893585205, + "learning_rate": 0.0001444491883756254, + "loss": 2.179, + "step": 19323 + }, + { + "epoch": 2.2545793956364486, + "grad_norm": 1.3086405992507935, + "learning_rate": 0.00014443408550017428, + "loss": 2.0288, + "step": 19324 + }, + { + "epoch": 2.2546960681367403, + "grad_norm": 1.14808988571167, + "learning_rate": 0.00014441898269797472, + "loss": 2.0767, + "step": 19325 + }, + { + "epoch": 2.254812740637032, + "grad_norm": 1.1674236059188843, + "learning_rate": 0.00014440387996918333, + "loss": 1.8734, + "step": 19326 + }, + { + "epoch": 2.2549294131373236, + "grad_norm": 1.072252631187439, + "learning_rate": 0.00014438877731395662, + "loss": 1.9486, + "step": 19327 + }, + { + "epoch": 2.2550460856376153, + "grad_norm": 1.2777661085128784, + "learning_rate": 0.00014437367473245128, + "loss": 1.8994, + "step": 19328 + }, + { + "epoch": 2.255162758137907, + "grad_norm": 1.2329864501953125, + "learning_rate": 0.0001443585722248237, + "loss": 1.9792, + "step": 19329 + }, + { + "epoch": 2.2552794306381987, + "grad_norm": 1.3065496683120728, + "learning_rate": 0.0001443434697912306, + "loss": 1.9066, + "step": 19330 + }, + { + "epoch": 2.2553961031384904, + "grad_norm": 1.1595338582992554, + "learning_rate": 0.00014432836743182847, + "loss": 1.9684, + "step": 19331 + }, + { + "epoch": 2.255512775638782, + "grad_norm": 1.1955385208129883, + "learning_rate": 0.00014431326514677395, + "loss": 1.9263, + "step": 19332 + }, + { + "epoch": 2.2556294481390737, + "grad_norm": 1.2612202167510986, + "learning_rate": 0.0001442981629362235, + "loss": 2.0498, + "step": 19333 + }, + { + "epoch": 2.2557461206393654, + "grad_norm": 1.2150276899337769, + "learning_rate": 0.00014428306080033367, + "loss": 2.061, + "step": 19334 + }, + { + "epoch": 2.255862793139657, + "grad_norm": 1.1438921689987183, + "learning_rate": 0.00014426795873926116, + "loss": 1.8221, + "step": 19335 + }, + { + "epoch": 2.2559794656399488, + "grad_norm": 1.060387134552002, + "learning_rate": 0.0001442528567531624, + "loss": 1.9679, + "step": 19336 + }, + { + "epoch": 2.2560961381402405, + "grad_norm": 1.1677172183990479, + "learning_rate": 0.00014423775484219406, + "loss": 1.9309, + "step": 19337 + }, + { + "epoch": 2.256212810640532, + "grad_norm": 1.1498587131500244, + "learning_rate": 0.00014422265300651255, + "loss": 1.9816, + "step": 19338 + }, + { + "epoch": 2.256329483140824, + "grad_norm": 1.2942522764205933, + "learning_rate": 0.00014420755124627463, + "loss": 2.266, + "step": 19339 + }, + { + "epoch": 2.2564461556411155, + "grad_norm": 1.1755317449569702, + "learning_rate": 0.0001441924495616367, + "loss": 1.9815, + "step": 19340 + }, + { + "epoch": 2.256562828141407, + "grad_norm": 1.1885287761688232, + "learning_rate": 0.0001441773479527554, + "loss": 2.0601, + "step": 19341 + }, + { + "epoch": 2.256679500641699, + "grad_norm": 1.3723077774047852, + "learning_rate": 0.00014416224641978722, + "loss": 1.9748, + "step": 19342 + }, + { + "epoch": 2.2567961731419905, + "grad_norm": 1.1232128143310547, + "learning_rate": 0.0001441471449628888, + "loss": 1.9782, + "step": 19343 + }, + { + "epoch": 2.2569128456422822, + "grad_norm": 1.3760044574737549, + "learning_rate": 0.00014413204358221658, + "loss": 2.0736, + "step": 19344 + }, + { + "epoch": 2.257029518142574, + "grad_norm": 1.0914524793624878, + "learning_rate": 0.00014411694227792722, + "loss": 1.8806, + "step": 19345 + }, + { + "epoch": 2.2571461906428656, + "grad_norm": 1.1147462129592896, + "learning_rate": 0.0001441018410501772, + "loss": 1.8934, + "step": 19346 + }, + { + "epoch": 2.2572628631431573, + "grad_norm": 1.4271125793457031, + "learning_rate": 0.0001440867398991231, + "loss": 2.1174, + "step": 19347 + }, + { + "epoch": 2.257379535643449, + "grad_norm": 1.2044304609298706, + "learning_rate": 0.00014407163882492155, + "loss": 2.0334, + "step": 19348 + }, + { + "epoch": 2.2574962081437406, + "grad_norm": 1.1213434934616089, + "learning_rate": 0.00014405653782772899, + "loss": 1.9106, + "step": 19349 + }, + { + "epoch": 2.2576128806440323, + "grad_norm": 1.0990153551101685, + "learning_rate": 0.000144041436907702, + "loss": 1.8848, + "step": 19350 + }, + { + "epoch": 2.257729553144324, + "grad_norm": 1.0734145641326904, + "learning_rate": 0.00014402633606499715, + "loss": 1.8941, + "step": 19351 + }, + { + "epoch": 2.2578462256446157, + "grad_norm": 1.0869042873382568, + "learning_rate": 0.00014401123529977103, + "loss": 1.9991, + "step": 19352 + }, + { + "epoch": 2.2579628981449074, + "grad_norm": 1.0742806196212769, + "learning_rate": 0.00014399613461218004, + "loss": 1.8064, + "step": 19353 + }, + { + "epoch": 2.258079570645199, + "grad_norm": 1.1701202392578125, + "learning_rate": 0.00014398103400238088, + "loss": 1.957, + "step": 19354 + }, + { + "epoch": 2.2581962431454907, + "grad_norm": 1.221596121788025, + "learning_rate": 0.00014396593347053003, + "loss": 2.025, + "step": 19355 + }, + { + "epoch": 2.2583129156457824, + "grad_norm": 1.3085415363311768, + "learning_rate": 0.00014395083301678405, + "loss": 2.1175, + "step": 19356 + }, + { + "epoch": 2.258429588146074, + "grad_norm": 1.234699010848999, + "learning_rate": 0.00014393573264129947, + "loss": 1.8928, + "step": 19357 + }, + { + "epoch": 2.2585462606463658, + "grad_norm": 1.203614354133606, + "learning_rate": 0.00014392063234423288, + "loss": 1.9431, + "step": 19358 + }, + { + "epoch": 2.2586629331466574, + "grad_norm": 1.1675889492034912, + "learning_rate": 0.00014390553212574077, + "loss": 1.9428, + "step": 19359 + }, + { + "epoch": 2.258779605646949, + "grad_norm": 1.2523061037063599, + "learning_rate": 0.00014389043198597968, + "loss": 1.9944, + "step": 19360 + }, + { + "epoch": 2.258896278147241, + "grad_norm": 1.0802713632583618, + "learning_rate": 0.0001438753319251062, + "loss": 1.9593, + "step": 19361 + }, + { + "epoch": 2.2590129506475325, + "grad_norm": 1.4048670530319214, + "learning_rate": 0.0001438602319432768, + "loss": 2.0803, + "step": 19362 + }, + { + "epoch": 2.259129623147824, + "grad_norm": 1.419601321220398, + "learning_rate": 0.0001438451320406481, + "loss": 1.9132, + "step": 19363 + }, + { + "epoch": 2.259246295648116, + "grad_norm": 1.1495476961135864, + "learning_rate": 0.0001438300322173766, + "loss": 1.9625, + "step": 19364 + }, + { + "epoch": 2.2593629681484075, + "grad_norm": 1.2674037218093872, + "learning_rate": 0.00014381493247361886, + "loss": 1.9713, + "step": 19365 + }, + { + "epoch": 2.259479640648699, + "grad_norm": 1.086320161819458, + "learning_rate": 0.00014379983280953134, + "loss": 2.0418, + "step": 19366 + }, + { + "epoch": 2.259596313148991, + "grad_norm": 1.0527230501174927, + "learning_rate": 0.0001437847332252707, + "loss": 2.008, + "step": 19367 + }, + { + "epoch": 2.2597129856492826, + "grad_norm": 1.099542498588562, + "learning_rate": 0.00014376963372099338, + "loss": 1.637, + "step": 19368 + }, + { + "epoch": 2.2598296581495743, + "grad_norm": 1.178554892539978, + "learning_rate": 0.00014375453429685595, + "loss": 2.0516, + "step": 19369 + }, + { + "epoch": 2.259946330649866, + "grad_norm": 1.218493103981018, + "learning_rate": 0.00014373943495301492, + "loss": 1.9564, + "step": 19370 + }, + { + "epoch": 2.2600630031501576, + "grad_norm": 1.2082624435424805, + "learning_rate": 0.00014372433568962686, + "loss": 2.0518, + "step": 19371 + }, + { + "epoch": 2.2601796756504493, + "grad_norm": 1.1365312337875366, + "learning_rate": 0.00014370923650684826, + "loss": 2.108, + "step": 19372 + }, + { + "epoch": 2.260296348150741, + "grad_norm": 1.107491374015808, + "learning_rate": 0.00014369413740483573, + "loss": 2.0375, + "step": 19373 + }, + { + "epoch": 2.2604130206510327, + "grad_norm": 1.107556939125061, + "learning_rate": 0.00014367903838374566, + "loss": 1.8868, + "step": 19374 + }, + { + "epoch": 2.2605296931513243, + "grad_norm": 1.07191801071167, + "learning_rate": 0.00014366393944373476, + "loss": 1.9152, + "step": 19375 + }, + { + "epoch": 2.260646365651616, + "grad_norm": 1.3963806629180908, + "learning_rate": 0.0001436488405849594, + "loss": 2.1284, + "step": 19376 + }, + { + "epoch": 2.2607630381519077, + "grad_norm": 1.263852596282959, + "learning_rate": 0.00014363374180757622, + "loss": 1.9359, + "step": 19377 + }, + { + "epoch": 2.2608797106521994, + "grad_norm": 1.0834975242614746, + "learning_rate": 0.0001436186431117417, + "loss": 1.9549, + "step": 19378 + }, + { + "epoch": 2.260996383152491, + "grad_norm": 1.1701902151107788, + "learning_rate": 0.00014360354449761232, + "loss": 1.9012, + "step": 19379 + }, + { + "epoch": 2.2611130556527828, + "grad_norm": 1.2189241647720337, + "learning_rate": 0.00014358844596534473, + "loss": 2.0917, + "step": 19380 + }, + { + "epoch": 2.2612297281530744, + "grad_norm": 1.2161463499069214, + "learning_rate": 0.0001435733475150953, + "loss": 1.9921, + "step": 19381 + }, + { + "epoch": 2.261346400653366, + "grad_norm": 1.0754526853561401, + "learning_rate": 0.0001435582491470207, + "loss": 1.7479, + "step": 19382 + }, + { + "epoch": 2.261463073153658, + "grad_norm": 1.1331108808517456, + "learning_rate": 0.00014354315086127732, + "loss": 2.0103, + "step": 19383 + }, + { + "epoch": 2.2615797456539495, + "grad_norm": 1.2396670579910278, + "learning_rate": 0.00014352805265802182, + "loss": 1.9222, + "step": 19384 + }, + { + "epoch": 2.261696418154241, + "grad_norm": 1.1272636651992798, + "learning_rate": 0.0001435129545374106, + "loss": 2.1256, + "step": 19385 + }, + { + "epoch": 2.261813090654533, + "grad_norm": 1.3767441511154175, + "learning_rate": 0.00014349785649960027, + "loss": 1.9502, + "step": 19386 + }, + { + "epoch": 2.2619297631548245, + "grad_norm": 1.225185513496399, + "learning_rate": 0.00014348275854474728, + "loss": 2.0623, + "step": 19387 + }, + { + "epoch": 2.262046435655116, + "grad_norm": 1.194902777671814, + "learning_rate": 0.00014346766067300815, + "loss": 2.1206, + "step": 19388 + }, + { + "epoch": 2.262163108155408, + "grad_norm": 1.164879322052002, + "learning_rate": 0.0001434525628845395, + "loss": 2.0661, + "step": 19389 + }, + { + "epoch": 2.2622797806556996, + "grad_norm": 1.232991337776184, + "learning_rate": 0.0001434374651794977, + "loss": 1.8723, + "step": 19390 + }, + { + "epoch": 2.2623964531559912, + "grad_norm": 1.2490788698196411, + "learning_rate": 0.00014342236755803943, + "loss": 2.022, + "step": 19391 + }, + { + "epoch": 2.262513125656283, + "grad_norm": 1.1102162599563599, + "learning_rate": 0.00014340727002032103, + "loss": 1.9245, + "step": 19392 + }, + { + "epoch": 2.2626297981565746, + "grad_norm": 1.1585115194320679, + "learning_rate": 0.00014339217256649917, + "loss": 1.8718, + "step": 19393 + }, + { + "epoch": 2.2627464706568663, + "grad_norm": 1.192787528038025, + "learning_rate": 0.00014337707519673025, + "loss": 2.0131, + "step": 19394 + }, + { + "epoch": 2.262863143157158, + "grad_norm": 1.1293023824691772, + "learning_rate": 0.00014336197791117088, + "loss": 1.8518, + "step": 19395 + }, + { + "epoch": 2.2629798156574497, + "grad_norm": 1.1499395370483398, + "learning_rate": 0.00014334688070997745, + "loss": 1.9983, + "step": 19396 + }, + { + "epoch": 2.2630964881577413, + "grad_norm": 1.035032033920288, + "learning_rate": 0.0001433317835933066, + "loss": 1.896, + "step": 19397 + }, + { + "epoch": 2.263213160658033, + "grad_norm": 1.0728484392166138, + "learning_rate": 0.0001433166865613147, + "loss": 2.043, + "step": 19398 + }, + { + "epoch": 2.2633298331583247, + "grad_norm": 1.1348440647125244, + "learning_rate": 0.00014330158961415844, + "loss": 1.9823, + "step": 19399 + }, + { + "epoch": 2.2634465056586164, + "grad_norm": 1.3069263696670532, + "learning_rate": 0.00014328649275199417, + "loss": 1.9704, + "step": 19400 + }, + { + "epoch": 2.263563178158908, + "grad_norm": 0.9734171032905579, + "learning_rate": 0.00014327139597497844, + "loss": 1.7995, + "step": 19401 + }, + { + "epoch": 2.2636798506591997, + "grad_norm": 1.2486298084259033, + "learning_rate": 0.00014325629928326782, + "loss": 2.0842, + "step": 19402 + }, + { + "epoch": 2.2637965231594914, + "grad_norm": 1.1623966693878174, + "learning_rate": 0.00014324120267701873, + "loss": 1.7201, + "step": 19403 + }, + { + "epoch": 2.263913195659783, + "grad_norm": 1.1583346128463745, + "learning_rate": 0.00014322610615638776, + "loss": 1.8612, + "step": 19404 + }, + { + "epoch": 2.264029868160075, + "grad_norm": 1.2107380628585815, + "learning_rate": 0.00014321100972153133, + "loss": 1.8835, + "step": 19405 + }, + { + "epoch": 2.2641465406603665, + "grad_norm": 1.1573041677474976, + "learning_rate": 0.000143195913372606, + "loss": 1.9495, + "step": 19406 + }, + { + "epoch": 2.264263213160658, + "grad_norm": 1.2371646165847778, + "learning_rate": 0.00014318081710976822, + "loss": 1.9078, + "step": 19407 + }, + { + "epoch": 2.26437988566095, + "grad_norm": 1.06569504737854, + "learning_rate": 0.00014316572093317457, + "loss": 2.048, + "step": 19408 + }, + { + "epoch": 2.2644965581612415, + "grad_norm": 1.1383572816848755, + "learning_rate": 0.00014315062484298146, + "loss": 1.9359, + "step": 19409 + }, + { + "epoch": 2.264613230661533, + "grad_norm": 1.1618337631225586, + "learning_rate": 0.0001431355288393455, + "loss": 1.988, + "step": 19410 + }, + { + "epoch": 2.264729903161825, + "grad_norm": 1.2090659141540527, + "learning_rate": 0.00014312043292242306, + "loss": 1.9442, + "step": 19411 + }, + { + "epoch": 2.2648465756621166, + "grad_norm": 1.2200961112976074, + "learning_rate": 0.00014310533709237073, + "loss": 2.0381, + "step": 19412 + }, + { + "epoch": 2.2649632481624082, + "grad_norm": 1.2300201654434204, + "learning_rate": 0.00014309024134934496, + "loss": 1.9402, + "step": 19413 + }, + { + "epoch": 2.2650799206627, + "grad_norm": 1.089262843132019, + "learning_rate": 0.00014307514569350223, + "loss": 1.9076, + "step": 19414 + }, + { + "epoch": 2.2651965931629916, + "grad_norm": 1.1365617513656616, + "learning_rate": 0.00014306005012499915, + "loss": 1.8829, + "step": 19415 + }, + { + "epoch": 2.2653132656632833, + "grad_norm": 1.202730417251587, + "learning_rate": 0.00014304495464399205, + "loss": 2.0282, + "step": 19416 + }, + { + "epoch": 2.265429938163575, + "grad_norm": 1.1515839099884033, + "learning_rate": 0.0001430298592506376, + "loss": 2.0709, + "step": 19417 + }, + { + "epoch": 2.2655466106638666, + "grad_norm": 1.266248345375061, + "learning_rate": 0.00014301476394509215, + "loss": 2.0, + "step": 19418 + }, + { + "epoch": 2.2656632831641583, + "grad_norm": 1.2108502388000488, + "learning_rate": 0.0001429996687275122, + "loss": 1.8053, + "step": 19419 + }, + { + "epoch": 2.26577995566445, + "grad_norm": 1.2159006595611572, + "learning_rate": 0.00014298457359805433, + "loss": 1.9822, + "step": 19420 + }, + { + "epoch": 2.2658966281647417, + "grad_norm": 1.200951337814331, + "learning_rate": 0.00014296947855687501, + "loss": 1.9849, + "step": 19421 + }, + { + "epoch": 2.2660133006650334, + "grad_norm": 1.1805418729782104, + "learning_rate": 0.00014295438360413063, + "loss": 2.1793, + "step": 19422 + }, + { + "epoch": 2.266129973165325, + "grad_norm": 1.2164642810821533, + "learning_rate": 0.00014293928873997778, + "loss": 2.02, + "step": 19423 + }, + { + "epoch": 2.2662466456656167, + "grad_norm": 1.037859559059143, + "learning_rate": 0.0001429241939645729, + "loss": 1.9342, + "step": 19424 + }, + { + "epoch": 2.2663633181659084, + "grad_norm": 1.263121485710144, + "learning_rate": 0.0001429090992780725, + "loss": 2.0161, + "step": 19425 + }, + { + "epoch": 2.2664799906662, + "grad_norm": 0.9628370404243469, + "learning_rate": 0.00014289400468063304, + "loss": 1.8819, + "step": 19426 + }, + { + "epoch": 2.2665966631664918, + "grad_norm": 1.1897414922714233, + "learning_rate": 0.00014287891017241103, + "loss": 2.0142, + "step": 19427 + }, + { + "epoch": 2.2667133356667835, + "grad_norm": 0.9970412254333496, + "learning_rate": 0.00014286381575356297, + "loss": 1.7756, + "step": 19428 + }, + { + "epoch": 2.266830008167075, + "grad_norm": 1.3670806884765625, + "learning_rate": 0.00014284872142424524, + "loss": 1.9084, + "step": 19429 + }, + { + "epoch": 2.266946680667367, + "grad_norm": 1.365133285522461, + "learning_rate": 0.00014283362718461447, + "loss": 2.0486, + "step": 19430 + }, + { + "epoch": 2.2670633531676585, + "grad_norm": 1.2417786121368408, + "learning_rate": 0.000142818533034827, + "loss": 1.8478, + "step": 19431 + }, + { + "epoch": 2.26718002566795, + "grad_norm": 1.3104465007781982, + "learning_rate": 0.00014280343897503945, + "loss": 2.0271, + "step": 19432 + }, + { + "epoch": 2.267296698168242, + "grad_norm": 1.078269124031067, + "learning_rate": 0.00014278834500540814, + "loss": 1.8535, + "step": 19433 + }, + { + "epoch": 2.2674133706685335, + "grad_norm": 1.1370396614074707, + "learning_rate": 0.0001427732511260897, + "loss": 2.0257, + "step": 19434 + }, + { + "epoch": 2.2675300431688252, + "grad_norm": 1.3268955945968628, + "learning_rate": 0.00014275815733724048, + "loss": 1.9306, + "step": 19435 + }, + { + "epoch": 2.267646715669117, + "grad_norm": 1.1817705631256104, + "learning_rate": 0.00014274306363901707, + "loss": 1.9709, + "step": 19436 + }, + { + "epoch": 2.2677633881694086, + "grad_norm": 1.2422566413879395, + "learning_rate": 0.00014272797003157586, + "loss": 2.02, + "step": 19437 + }, + { + "epoch": 2.2678800606697003, + "grad_norm": 1.2972333431243896, + "learning_rate": 0.00014271287651507338, + "loss": 1.9726, + "step": 19438 + }, + { + "epoch": 2.267996733169992, + "grad_norm": 1.2498153448104858, + "learning_rate": 0.000142697783089666, + "loss": 2.1285, + "step": 19439 + }, + { + "epoch": 2.2681134056702836, + "grad_norm": 1.0647541284561157, + "learning_rate": 0.00014268268975551032, + "loss": 1.9033, + "step": 19440 + }, + { + "epoch": 2.2682300781705753, + "grad_norm": 1.2117981910705566, + "learning_rate": 0.00014266759651276276, + "loss": 2.0933, + "step": 19441 + }, + { + "epoch": 2.268346750670867, + "grad_norm": 1.080886960029602, + "learning_rate": 0.00014265250336157974, + "loss": 1.8984, + "step": 19442 + }, + { + "epoch": 2.2684634231711587, + "grad_norm": 1.1705610752105713, + "learning_rate": 0.00014263741030211787, + "loss": 1.8377, + "step": 19443 + }, + { + "epoch": 2.2685800956714504, + "grad_norm": 1.0531105995178223, + "learning_rate": 0.00014262231733453345, + "loss": 1.8705, + "step": 19444 + }, + { + "epoch": 2.268696768171742, + "grad_norm": 1.0701932907104492, + "learning_rate": 0.00014260722445898306, + "loss": 1.9469, + "step": 19445 + }, + { + "epoch": 2.2688134406720337, + "grad_norm": 1.1741150617599487, + "learning_rate": 0.0001425921316756231, + "loss": 1.9523, + "step": 19446 + }, + { + "epoch": 2.2689301131723254, + "grad_norm": 1.1206070184707642, + "learning_rate": 0.0001425770389846101, + "loss": 2.1073, + "step": 19447 + }, + { + "epoch": 2.269046785672617, + "grad_norm": 1.1299241781234741, + "learning_rate": 0.00014256194638610043, + "loss": 2.0198, + "step": 19448 + }, + { + "epoch": 2.2691634581729088, + "grad_norm": 1.2299714088439941, + "learning_rate": 0.00014254685388025067, + "loss": 1.8706, + "step": 19449 + }, + { + "epoch": 2.2692801306732004, + "grad_norm": 1.079673171043396, + "learning_rate": 0.00014253176146721718, + "loss": 1.812, + "step": 19450 + }, + { + "epoch": 2.269396803173492, + "grad_norm": 1.3815981149673462, + "learning_rate": 0.00014251666914715652, + "loss": 1.9481, + "step": 19451 + }, + { + "epoch": 2.269513475673784, + "grad_norm": 1.1797146797180176, + "learning_rate": 0.000142501576920225, + "loss": 1.8015, + "step": 19452 + }, + { + "epoch": 2.2696301481740755, + "grad_norm": 1.1978051662445068, + "learning_rate": 0.00014248648478657925, + "loss": 2.0612, + "step": 19453 + }, + { + "epoch": 2.269746820674367, + "grad_norm": 1.1850162744522095, + "learning_rate": 0.00014247139274637565, + "loss": 2.01, + "step": 19454 + }, + { + "epoch": 2.269863493174659, + "grad_norm": 1.1221294403076172, + "learning_rate": 0.00014245630079977062, + "loss": 1.9657, + "step": 19455 + }, + { + "epoch": 2.2699801656749505, + "grad_norm": 1.1756904125213623, + "learning_rate": 0.00014244120894692073, + "loss": 2.1028, + "step": 19456 + }, + { + "epoch": 2.270096838175242, + "grad_norm": 1.0693002939224243, + "learning_rate": 0.00014242611718798227, + "loss": 2.0313, + "step": 19457 + }, + { + "epoch": 2.270213510675534, + "grad_norm": 1.18515145778656, + "learning_rate": 0.0001424110255231119, + "loss": 2.1172, + "step": 19458 + }, + { + "epoch": 2.2703301831758256, + "grad_norm": 1.1921416521072388, + "learning_rate": 0.00014239593395246587, + "loss": 2.0164, + "step": 19459 + }, + { + "epoch": 2.2704468556761173, + "grad_norm": 1.0020875930786133, + "learning_rate": 0.00014238084247620076, + "loss": 1.7274, + "step": 19460 + }, + { + "epoch": 2.270563528176409, + "grad_norm": 1.106317400932312, + "learning_rate": 0.00014236575109447295, + "loss": 1.9663, + "step": 19461 + }, + { + "epoch": 2.2706802006767006, + "grad_norm": 1.3272302150726318, + "learning_rate": 0.000142350659807439, + "loss": 2.1429, + "step": 19462 + }, + { + "epoch": 2.2707968731769923, + "grad_norm": 1.0212072134017944, + "learning_rate": 0.00014233556861525525, + "loss": 1.8807, + "step": 19463 + }, + { + "epoch": 2.270913545677284, + "grad_norm": 1.110774278640747, + "learning_rate": 0.00014232047751807823, + "loss": 1.853, + "step": 19464 + }, + { + "epoch": 2.2710302181775757, + "grad_norm": 1.4127970933914185, + "learning_rate": 0.00014230538651606423, + "loss": 2.1281, + "step": 19465 + }, + { + "epoch": 2.2711468906778673, + "grad_norm": 1.107481598854065, + "learning_rate": 0.00014229029560936985, + "loss": 1.9497, + "step": 19466 + }, + { + "epoch": 2.271263563178159, + "grad_norm": 1.3059581518173218, + "learning_rate": 0.00014227520479815153, + "loss": 1.9756, + "step": 19467 + }, + { + "epoch": 2.2713802356784507, + "grad_norm": 1.00552237033844, + "learning_rate": 0.00014226011408256564, + "loss": 1.9329, + "step": 19468 + }, + { + "epoch": 2.2714969081787424, + "grad_norm": 1.1789132356643677, + "learning_rate": 0.00014224502346276875, + "loss": 1.9646, + "step": 19469 + }, + { + "epoch": 2.271613580679034, + "grad_norm": 1.4204810857772827, + "learning_rate": 0.00014222993293891716, + "loss": 1.8885, + "step": 19470 + }, + { + "epoch": 2.2717302531793258, + "grad_norm": 1.2231645584106445, + "learning_rate": 0.00014221484251116738, + "loss": 1.8775, + "step": 19471 + }, + { + "epoch": 2.2718469256796174, + "grad_norm": 1.1118916273117065, + "learning_rate": 0.00014219975217967583, + "loss": 2.0539, + "step": 19472 + }, + { + "epoch": 2.271963598179909, + "grad_norm": 1.0298848152160645, + "learning_rate": 0.00014218466194459897, + "loss": 1.6828, + "step": 19473 + }, + { + "epoch": 2.272080270680201, + "grad_norm": 1.0173152685165405, + "learning_rate": 0.0001421695718060932, + "loss": 1.9698, + "step": 19474 + }, + { + "epoch": 2.2721969431804925, + "grad_norm": 1.1658844947814941, + "learning_rate": 0.00014215448176431504, + "loss": 1.8039, + "step": 19475 + }, + { + "epoch": 2.272313615680784, + "grad_norm": 1.2646867036819458, + "learning_rate": 0.00014213939181942082, + "loss": 1.8593, + "step": 19476 + }, + { + "epoch": 2.272430288181076, + "grad_norm": 0.9917899966239929, + "learning_rate": 0.00014212430197156707, + "loss": 1.8136, + "step": 19477 + }, + { + "epoch": 2.2725469606813675, + "grad_norm": 1.128570556640625, + "learning_rate": 0.00014210921222091013, + "loss": 1.9165, + "step": 19478 + }, + { + "epoch": 2.272663633181659, + "grad_norm": 1.1307108402252197, + "learning_rate": 0.00014209412256760653, + "loss": 2.0679, + "step": 19479 + }, + { + "epoch": 2.272780305681951, + "grad_norm": 1.5249356031417847, + "learning_rate": 0.0001420790330118127, + "loss": 1.8018, + "step": 19480 + }, + { + "epoch": 2.2728969781822426, + "grad_norm": 1.1321814060211182, + "learning_rate": 0.00014206394355368494, + "loss": 1.9732, + "step": 19481 + }, + { + "epoch": 2.2730136506825342, + "grad_norm": 1.0783588886260986, + "learning_rate": 0.00014204885419337985, + "loss": 1.9041, + "step": 19482 + }, + { + "epoch": 2.273130323182826, + "grad_norm": 1.1941794157028198, + "learning_rate": 0.00014203376493105373, + "loss": 2.0122, + "step": 19483 + }, + { + "epoch": 2.2732469956831176, + "grad_norm": 1.2159684896469116, + "learning_rate": 0.00014201867576686312, + "loss": 1.9105, + "step": 19484 + }, + { + "epoch": 2.2733636681834093, + "grad_norm": 1.4182382822036743, + "learning_rate": 0.00014200358670096432, + "loss": 2.2018, + "step": 19485 + }, + { + "epoch": 2.273480340683701, + "grad_norm": 1.1870883703231812, + "learning_rate": 0.00014198849773351392, + "loss": 1.9068, + "step": 19486 + }, + { + "epoch": 2.2735970131839927, + "grad_norm": 1.115047574043274, + "learning_rate": 0.0001419734088646682, + "loss": 1.9298, + "step": 19487 + }, + { + "epoch": 2.2737136856842843, + "grad_norm": 2.113191604614258, + "learning_rate": 0.0001419583200945836, + "loss": 1.9593, + "step": 19488 + }, + { + "epoch": 2.273830358184576, + "grad_norm": 1.2049118280410767, + "learning_rate": 0.0001419432314234166, + "loss": 2.0634, + "step": 19489 + }, + { + "epoch": 2.2739470306848677, + "grad_norm": 0.9655433893203735, + "learning_rate": 0.0001419281428513237, + "loss": 1.8198, + "step": 19490 + }, + { + "epoch": 2.2740637031851594, + "grad_norm": 1.2304819822311401, + "learning_rate": 0.0001419130543784611, + "loss": 2.1059, + "step": 19491 + }, + { + "epoch": 2.274180375685451, + "grad_norm": 1.2361363172531128, + "learning_rate": 0.00014189796600498533, + "loss": 1.9136, + "step": 19492 + }, + { + "epoch": 2.2742970481857427, + "grad_norm": 1.1578232049942017, + "learning_rate": 0.00014188287773105293, + "loss": 1.9336, + "step": 19493 + }, + { + "epoch": 2.2744137206860344, + "grad_norm": 1.1524713039398193, + "learning_rate": 0.00014186778955682013, + "loss": 1.9283, + "step": 19494 + }, + { + "epoch": 2.274530393186326, + "grad_norm": 1.1202629804611206, + "learning_rate": 0.0001418527014824435, + "loss": 1.7912, + "step": 19495 + }, + { + "epoch": 2.274647065686618, + "grad_norm": 1.343759536743164, + "learning_rate": 0.00014183761350807934, + "loss": 2.1015, + "step": 19496 + }, + { + "epoch": 2.2747637381869095, + "grad_norm": 1.148833155632019, + "learning_rate": 0.00014182252563388418, + "loss": 1.7795, + "step": 19497 + }, + { + "epoch": 2.274880410687201, + "grad_norm": 1.339969277381897, + "learning_rate": 0.0001418074378600143, + "loss": 1.8268, + "step": 19498 + }, + { + "epoch": 2.274997083187493, + "grad_norm": 1.1889592409133911, + "learning_rate": 0.0001417923501866262, + "loss": 1.88, + "step": 19499 + }, + { + "epoch": 2.2751137556877845, + "grad_norm": 1.0942682027816772, + "learning_rate": 0.00014177726261387625, + "loss": 2.0112, + "step": 19500 + }, + { + "epoch": 2.275230428188076, + "grad_norm": 1.2068302631378174, + "learning_rate": 0.00014176217514192096, + "loss": 2.0431, + "step": 19501 + }, + { + "epoch": 2.275347100688368, + "grad_norm": 1.1217557191848755, + "learning_rate": 0.00014174708777091658, + "loss": 1.9005, + "step": 19502 + }, + { + "epoch": 2.2754637731886596, + "grad_norm": 1.0388610363006592, + "learning_rate": 0.00014173200050101965, + "loss": 1.9111, + "step": 19503 + }, + { + "epoch": 2.2755804456889512, + "grad_norm": 1.225115418434143, + "learning_rate": 0.00014171691333238648, + "loss": 2.074, + "step": 19504 + }, + { + "epoch": 2.275697118189243, + "grad_norm": 1.2088629007339478, + "learning_rate": 0.0001417018262651736, + "loss": 1.7255, + "step": 19505 + }, + { + "epoch": 2.2758137906895346, + "grad_norm": 1.2603237628936768, + "learning_rate": 0.0001416867392995373, + "loss": 1.8625, + "step": 19506 + }, + { + "epoch": 2.2759304631898263, + "grad_norm": 1.082534909248352, + "learning_rate": 0.00014167165243563404, + "loss": 1.8595, + "step": 19507 + }, + { + "epoch": 2.276047135690118, + "grad_norm": 1.1452052593231201, + "learning_rate": 0.00014165656567362024, + "loss": 1.9599, + "step": 19508 + }, + { + "epoch": 2.2761638081904096, + "grad_norm": 1.2549676895141602, + "learning_rate": 0.00014164147901365222, + "loss": 2.0659, + "step": 19509 + }, + { + "epoch": 2.2762804806907013, + "grad_norm": 1.3297432661056519, + "learning_rate": 0.0001416263924558865, + "loss": 1.9724, + "step": 19510 + }, + { + "epoch": 2.276397153190993, + "grad_norm": 1.146121859550476, + "learning_rate": 0.00014161130600047936, + "loss": 2.1478, + "step": 19511 + }, + { + "epoch": 2.2765138256912847, + "grad_norm": 1.1061393022537231, + "learning_rate": 0.00014159621964758733, + "loss": 1.9781, + "step": 19512 + }, + { + "epoch": 2.2766304981915764, + "grad_norm": 1.1308178901672363, + "learning_rate": 0.0001415811333973667, + "loss": 1.8997, + "step": 19513 + }, + { + "epoch": 2.276747170691868, + "grad_norm": 1.1578260660171509, + "learning_rate": 0.00014156604724997388, + "loss": 2.0883, + "step": 19514 + }, + { + "epoch": 2.2768638431921597, + "grad_norm": 1.1699950695037842, + "learning_rate": 0.00014155096120556534, + "loss": 1.7754, + "step": 19515 + }, + { + "epoch": 2.2769805156924514, + "grad_norm": 1.2759231328964233, + "learning_rate": 0.00014153587526429743, + "loss": 1.9425, + "step": 19516 + }, + { + "epoch": 2.277097188192743, + "grad_norm": 1.2272026538848877, + "learning_rate": 0.0001415207894263265, + "loss": 1.8638, + "step": 19517 + }, + { + "epoch": 2.2772138606930348, + "grad_norm": 1.1853835582733154, + "learning_rate": 0.00014150570369180892, + "loss": 2.0179, + "step": 19518 + }, + { + "epoch": 2.2773305331933265, + "grad_norm": 1.3097922801971436, + "learning_rate": 0.00014149061806090126, + "loss": 1.9011, + "step": 19519 + }, + { + "epoch": 2.277447205693618, + "grad_norm": 1.0558459758758545, + "learning_rate": 0.00014147553253375975, + "loss": 1.8476, + "step": 19520 + }, + { + "epoch": 2.27756387819391, + "grad_norm": 1.1088311672210693, + "learning_rate": 0.00014146044711054088, + "loss": 1.8835, + "step": 19521 + }, + { + "epoch": 2.2776805506942015, + "grad_norm": 1.0382909774780273, + "learning_rate": 0.00014144536179140093, + "loss": 1.7967, + "step": 19522 + }, + { + "epoch": 2.277797223194493, + "grad_norm": 1.287205696105957, + "learning_rate": 0.0001414302765764964, + "loss": 1.9325, + "step": 19523 + }, + { + "epoch": 2.277913895694785, + "grad_norm": 1.0313243865966797, + "learning_rate": 0.00014141519146598354, + "loss": 1.8845, + "step": 19524 + }, + { + "epoch": 2.2780305681950765, + "grad_norm": 1.1057146787643433, + "learning_rate": 0.0001414001064600189, + "loss": 1.8193, + "step": 19525 + }, + { + "epoch": 2.2781472406953682, + "grad_norm": 1.1795008182525635, + "learning_rate": 0.00014138502155875871, + "loss": 1.9144, + "step": 19526 + }, + { + "epoch": 2.27826391319566, + "grad_norm": 1.1328775882720947, + "learning_rate": 0.00014136993676235952, + "loss": 1.924, + "step": 19527 + }, + { + "epoch": 2.2783805856959516, + "grad_norm": 1.2244304418563843, + "learning_rate": 0.00014135485207097753, + "loss": 2.1384, + "step": 19528 + }, + { + "epoch": 2.2784972581962433, + "grad_norm": 1.2428569793701172, + "learning_rate": 0.00014133976748476927, + "loss": 1.9087, + "step": 19529 + }, + { + "epoch": 2.278613930696535, + "grad_norm": 1.1644959449768066, + "learning_rate": 0.00014132468300389098, + "loss": 1.9149, + "step": 19530 + }, + { + "epoch": 2.2787306031968266, + "grad_norm": 1.2286043167114258, + "learning_rate": 0.00014130959862849917, + "loss": 2.0647, + "step": 19531 + }, + { + "epoch": 2.2788472756971183, + "grad_norm": 1.193130612373352, + "learning_rate": 0.0001412945143587502, + "loss": 1.9652, + "step": 19532 + }, + { + "epoch": 2.27896394819741, + "grad_norm": 1.1437784433364868, + "learning_rate": 0.00014127943019480034, + "loss": 2.1304, + "step": 19533 + }, + { + "epoch": 2.2790806206977017, + "grad_norm": 1.1465297937393188, + "learning_rate": 0.00014126434613680612, + "loss": 1.9922, + "step": 19534 + }, + { + "epoch": 2.2791972931979934, + "grad_norm": 1.1472077369689941, + "learning_rate": 0.00014124926218492376, + "loss": 1.8939, + "step": 19535 + }, + { + "epoch": 2.279313965698285, + "grad_norm": 1.2047661542892456, + "learning_rate": 0.00014123417833930977, + "loss": 1.8206, + "step": 19536 + }, + { + "epoch": 2.2794306381985767, + "grad_norm": 1.1413697004318237, + "learning_rate": 0.0001412190946001204, + "loss": 2.0079, + "step": 19537 + }, + { + "epoch": 2.2795473106988684, + "grad_norm": 1.3038482666015625, + "learning_rate": 0.00014120401096751216, + "loss": 2.0816, + "step": 19538 + }, + { + "epoch": 2.27966398319916, + "grad_norm": 1.0269590616226196, + "learning_rate": 0.0001411889274416413, + "loss": 1.9699, + "step": 19539 + }, + { + "epoch": 2.2797806556994518, + "grad_norm": 1.033615231513977, + "learning_rate": 0.00014117384402266423, + "loss": 1.8639, + "step": 19540 + }, + { + "epoch": 2.2798973281997434, + "grad_norm": 1.1107263565063477, + "learning_rate": 0.00014115876071073733, + "loss": 2.0561, + "step": 19541 + }, + { + "epoch": 2.280014000700035, + "grad_norm": 1.2012189626693726, + "learning_rate": 0.000141143677506017, + "loss": 1.8237, + "step": 19542 + }, + { + "epoch": 2.280130673200327, + "grad_norm": 1.1446141004562378, + "learning_rate": 0.00014112859440865947, + "loss": 1.9563, + "step": 19543 + }, + { + "epoch": 2.2802473457006185, + "grad_norm": 1.2163091897964478, + "learning_rate": 0.00014111351141882122, + "loss": 2.168, + "step": 19544 + }, + { + "epoch": 2.28036401820091, + "grad_norm": 1.0183390378952026, + "learning_rate": 0.00014109842853665864, + "loss": 1.7991, + "step": 19545 + }, + { + "epoch": 2.280480690701202, + "grad_norm": 1.08857262134552, + "learning_rate": 0.00014108334576232802, + "loss": 2.0389, + "step": 19546 + }, + { + "epoch": 2.2805973632014935, + "grad_norm": 1.3546780347824097, + "learning_rate": 0.00014106826309598575, + "loss": 2.0067, + "step": 19547 + }, + { + "epoch": 2.280714035701785, + "grad_norm": 1.149030327796936, + "learning_rate": 0.0001410531805377882, + "loss": 1.9883, + "step": 19548 + }, + { + "epoch": 2.280830708202077, + "grad_norm": 1.084920883178711, + "learning_rate": 0.00014103809808789175, + "loss": 1.9527, + "step": 19549 + }, + { + "epoch": 2.2809473807023686, + "grad_norm": 1.1035174131393433, + "learning_rate": 0.00014102301574645264, + "loss": 2.1287, + "step": 19550 + }, + { + "epoch": 2.2810640532026603, + "grad_norm": 1.2083333730697632, + "learning_rate": 0.00014100793351362736, + "loss": 2.1213, + "step": 19551 + }, + { + "epoch": 2.281180725702952, + "grad_norm": 1.034427285194397, + "learning_rate": 0.0001409928513895722, + "loss": 1.9198, + "step": 19552 + }, + { + "epoch": 2.2812973982032436, + "grad_norm": 1.0439496040344238, + "learning_rate": 0.00014097776937444358, + "loss": 1.8495, + "step": 19553 + }, + { + "epoch": 2.2814140707035353, + "grad_norm": 1.1713480949401855, + "learning_rate": 0.00014096268746839773, + "loss": 1.8113, + "step": 19554 + }, + { + "epoch": 2.281530743203827, + "grad_norm": 1.1874773502349854, + "learning_rate": 0.00014094760567159113, + "loss": 1.9948, + "step": 19555 + }, + { + "epoch": 2.2816474157041187, + "grad_norm": 1.171126365661621, + "learning_rate": 0.00014093252398418002, + "loss": 2.0868, + "step": 19556 + }, + { + "epoch": 2.2817640882044103, + "grad_norm": 1.1201341152191162, + "learning_rate": 0.00014091744240632087, + "loss": 1.8599, + "step": 19557 + }, + { + "epoch": 2.281880760704702, + "grad_norm": 1.0331785678863525, + "learning_rate": 0.00014090236093816996, + "loss": 1.9174, + "step": 19558 + }, + { + "epoch": 2.2819974332049937, + "grad_norm": 1.3074049949645996, + "learning_rate": 0.00014088727957988363, + "loss": 2.0372, + "step": 19559 + }, + { + "epoch": 2.2821141057052854, + "grad_norm": 1.214408278465271, + "learning_rate": 0.00014087219833161828, + "loss": 1.9934, + "step": 19560 + }, + { + "epoch": 2.282230778205577, + "grad_norm": 1.1860988140106201, + "learning_rate": 0.00014085711719353017, + "loss": 2.1017, + "step": 19561 + }, + { + "epoch": 2.2823474507058688, + "grad_norm": 1.1276500225067139, + "learning_rate": 0.00014084203616577574, + "loss": 1.8925, + "step": 19562 + }, + { + "epoch": 2.2824641232061604, + "grad_norm": 1.0900341272354126, + "learning_rate": 0.00014082695524851124, + "loss": 2.0585, + "step": 19563 + }, + { + "epoch": 2.282580795706452, + "grad_norm": 1.289967656135559, + "learning_rate": 0.00014081187444189313, + "loss": 2.1164, + "step": 19564 + }, + { + "epoch": 2.282697468206744, + "grad_norm": 1.1081180572509766, + "learning_rate": 0.00014079679374607763, + "loss": 2.0364, + "step": 19565 + }, + { + "epoch": 2.2828141407070355, + "grad_norm": 1.1609753370285034, + "learning_rate": 0.00014078171316122114, + "loss": 1.7741, + "step": 19566 + }, + { + "epoch": 2.282930813207327, + "grad_norm": 1.32198965549469, + "learning_rate": 0.00014076663268747998, + "loss": 1.9071, + "step": 19567 + }, + { + "epoch": 2.283047485707619, + "grad_norm": 1.2403095960617065, + "learning_rate": 0.00014075155232501054, + "loss": 2.0339, + "step": 19568 + }, + { + "epoch": 2.2831641582079105, + "grad_norm": 0.9437574744224548, + "learning_rate": 0.00014073647207396906, + "loss": 1.9239, + "step": 19569 + }, + { + "epoch": 2.283280830708202, + "grad_norm": 1.1697574853897095, + "learning_rate": 0.0001407213919345119, + "loss": 2.0718, + "step": 19570 + }, + { + "epoch": 2.283397503208494, + "grad_norm": 1.172247290611267, + "learning_rate": 0.00014070631190679552, + "loss": 1.9555, + "step": 19571 + }, + { + "epoch": 2.2835141757087856, + "grad_norm": 1.1664575338363647, + "learning_rate": 0.0001406912319909761, + "loss": 2.0581, + "step": 19572 + }, + { + "epoch": 2.2836308482090772, + "grad_norm": 1.050085425376892, + "learning_rate": 0.00014067615218721002, + "loss": 1.7811, + "step": 19573 + }, + { + "epoch": 2.283747520709369, + "grad_norm": 1.1823784112930298, + "learning_rate": 0.00014066107249565365, + "loss": 1.8017, + "step": 19574 + }, + { + "epoch": 2.2838641932096606, + "grad_norm": 1.3808282613754272, + "learning_rate": 0.0001406459929164633, + "loss": 2.0165, + "step": 19575 + }, + { + "epoch": 2.2839808657099523, + "grad_norm": 1.1437695026397705, + "learning_rate": 0.00014063091344979523, + "loss": 2.0132, + "step": 19576 + }, + { + "epoch": 2.284097538210244, + "grad_norm": 1.1838926076889038, + "learning_rate": 0.0001406158340958059, + "loss": 1.8749, + "step": 19577 + }, + { + "epoch": 2.2842142107105357, + "grad_norm": 1.1859946250915527, + "learning_rate": 0.0001406007548546515, + "loss": 2.0647, + "step": 19578 + }, + { + "epoch": 2.2843308832108273, + "grad_norm": 1.1230151653289795, + "learning_rate": 0.00014058567572648845, + "loss": 1.9746, + "step": 19579 + }, + { + "epoch": 2.284447555711119, + "grad_norm": 1.1205170154571533, + "learning_rate": 0.00014057059671147298, + "loss": 1.875, + "step": 19580 + }, + { + "epoch": 2.2845642282114107, + "grad_norm": 1.0305112600326538, + "learning_rate": 0.00014055551780976154, + "loss": 1.8642, + "step": 19581 + }, + { + "epoch": 2.2846809007117024, + "grad_norm": 1.1418406963348389, + "learning_rate": 0.00014054043902151037, + "loss": 2.0614, + "step": 19582 + }, + { + "epoch": 2.284797573211994, + "grad_norm": 1.084873080253601, + "learning_rate": 0.00014052536034687575, + "loss": 1.9666, + "step": 19583 + }, + { + "epoch": 2.2849142457122857, + "grad_norm": 1.0619243383407593, + "learning_rate": 0.0001405102817860141, + "loss": 1.9089, + "step": 19584 + }, + { + "epoch": 2.2850309182125774, + "grad_norm": 1.1316273212432861, + "learning_rate": 0.00014049520333908169, + "loss": 1.8904, + "step": 19585 + }, + { + "epoch": 2.285147590712869, + "grad_norm": 1.136682391166687, + "learning_rate": 0.00014048012500623484, + "loss": 1.9824, + "step": 19586 + }, + { + "epoch": 2.285264263213161, + "grad_norm": 1.0186136960983276, + "learning_rate": 0.00014046504678762983, + "loss": 1.9185, + "step": 19587 + }, + { + "epoch": 2.2853809357134525, + "grad_norm": 1.2375482320785522, + "learning_rate": 0.00014044996868342306, + "loss": 1.9587, + "step": 19588 + }, + { + "epoch": 2.285497608213744, + "grad_norm": 1.2025794982910156, + "learning_rate": 0.00014043489069377074, + "loss": 1.8427, + "step": 19589 + }, + { + "epoch": 2.285614280714036, + "grad_norm": 1.291257619857788, + "learning_rate": 0.00014041981281882925, + "loss": 1.9411, + "step": 19590 + }, + { + "epoch": 2.2857309532143275, + "grad_norm": 1.1852878332138062, + "learning_rate": 0.0001404047350587549, + "loss": 1.9388, + "step": 19591 + }, + { + "epoch": 2.285847625714619, + "grad_norm": 1.2899590730667114, + "learning_rate": 0.00014038965741370397, + "loss": 2.0546, + "step": 19592 + }, + { + "epoch": 2.285964298214911, + "grad_norm": 1.2274032831192017, + "learning_rate": 0.00014037457988383276, + "loss": 2.0663, + "step": 19593 + }, + { + "epoch": 2.2860809707152026, + "grad_norm": 0.9870771169662476, + "learning_rate": 0.00014035950246929764, + "loss": 1.8871, + "step": 19594 + }, + { + "epoch": 2.2861976432154942, + "grad_norm": 1.293097734451294, + "learning_rate": 0.0001403444251702548, + "loss": 1.9937, + "step": 19595 + }, + { + "epoch": 2.286314315715786, + "grad_norm": 1.3311326503753662, + "learning_rate": 0.00014032934798686064, + "loss": 1.9708, + "step": 19596 + }, + { + "epoch": 2.2864309882160776, + "grad_norm": 1.1888079643249512, + "learning_rate": 0.00014031427091927147, + "loss": 1.8386, + "step": 19597 + }, + { + "epoch": 2.2865476607163693, + "grad_norm": 1.2450940608978271, + "learning_rate": 0.00014029919396764358, + "loss": 2.2198, + "step": 19598 + }, + { + "epoch": 2.286664333216661, + "grad_norm": 1.1590362787246704, + "learning_rate": 0.00014028411713213322, + "loss": 2.059, + "step": 19599 + }, + { + "epoch": 2.2867810057169526, + "grad_norm": 1.2254291772842407, + "learning_rate": 0.00014026904041289674, + "loss": 1.8959, + "step": 19600 + }, + { + "epoch": 2.2868976782172443, + "grad_norm": 1.4542454481124878, + "learning_rate": 0.00014025396381009045, + "loss": 2.2487, + "step": 19601 + }, + { + "epoch": 2.287014350717536, + "grad_norm": 1.2740964889526367, + "learning_rate": 0.00014023888732387057, + "loss": 2.0225, + "step": 19602 + }, + { + "epoch": 2.2871310232178277, + "grad_norm": 1.266776204109192, + "learning_rate": 0.00014022381095439348, + "loss": 1.8651, + "step": 19603 + }, + { + "epoch": 2.2872476957181194, + "grad_norm": 1.1249170303344727, + "learning_rate": 0.0001402087347018154, + "loss": 1.8764, + "step": 19604 + }, + { + "epoch": 2.287364368218411, + "grad_norm": 1.0874032974243164, + "learning_rate": 0.00014019365856629272, + "loss": 1.9236, + "step": 19605 + }, + { + "epoch": 2.2874810407187027, + "grad_norm": 1.2827867269515991, + "learning_rate": 0.00014017858254798164, + "loss": 2.0168, + "step": 19606 + }, + { + "epoch": 2.2875977132189944, + "grad_norm": 1.0689297914505005, + "learning_rate": 0.00014016350664703854, + "loss": 2.0362, + "step": 19607 + }, + { + "epoch": 2.287714385719286, + "grad_norm": 1.3712198734283447, + "learning_rate": 0.00014014843086361962, + "loss": 2.1273, + "step": 19608 + }, + { + "epoch": 2.2878310582195778, + "grad_norm": 1.4945564270019531, + "learning_rate": 0.0001401333551978812, + "loss": 1.8034, + "step": 19609 + }, + { + "epoch": 2.2879477307198695, + "grad_norm": 1.196701169013977, + "learning_rate": 0.0001401182796499796, + "loss": 1.8863, + "step": 19610 + }, + { + "epoch": 2.288064403220161, + "grad_norm": 1.2334257364273071, + "learning_rate": 0.00014010320422007104, + "loss": 2.0225, + "step": 19611 + }, + { + "epoch": 2.288181075720453, + "grad_norm": 1.3661651611328125, + "learning_rate": 0.00014008812890831194, + "loss": 2.0068, + "step": 19612 + }, + { + "epoch": 2.2882977482207445, + "grad_norm": 1.0449670553207397, + "learning_rate": 0.0001400730537148584, + "loss": 2.0382, + "step": 19613 + }, + { + "epoch": 2.288414420721036, + "grad_norm": 1.5623887777328491, + "learning_rate": 0.0001400579786398669, + "loss": 2.0936, + "step": 19614 + }, + { + "epoch": 2.288531093221328, + "grad_norm": 1.1185959577560425, + "learning_rate": 0.0001400429036834935, + "loss": 1.9927, + "step": 19615 + }, + { + "epoch": 2.2886477657216195, + "grad_norm": 1.0582880973815918, + "learning_rate": 0.00014002782884589473, + "loss": 1.9381, + "step": 19616 + }, + { + "epoch": 2.2887644382219112, + "grad_norm": 1.0910649299621582, + "learning_rate": 0.00014001275412722666, + "loss": 1.9716, + "step": 19617 + }, + { + "epoch": 2.288881110722203, + "grad_norm": 1.226749300956726, + "learning_rate": 0.00013999767952764568, + "loss": 1.8952, + "step": 19618 + }, + { + "epoch": 2.2889977832224946, + "grad_norm": 1.252343773841858, + "learning_rate": 0.00013998260504730797, + "loss": 1.8647, + "step": 19619 + }, + { + "epoch": 2.2891144557227863, + "grad_norm": 1.2286697626113892, + "learning_rate": 0.00013996753068636994, + "loss": 1.787, + "step": 19620 + }, + { + "epoch": 2.289231128223078, + "grad_norm": 1.135644793510437, + "learning_rate": 0.00013995245644498773, + "loss": 2.158, + "step": 19621 + }, + { + "epoch": 2.2893478007233696, + "grad_norm": 1.2033342123031616, + "learning_rate": 0.00013993738232331767, + "loss": 1.8099, + "step": 19622 + }, + { + "epoch": 2.2894644732236613, + "grad_norm": 1.2131668329238892, + "learning_rate": 0.0001399223083215161, + "loss": 1.9597, + "step": 19623 + }, + { + "epoch": 2.289581145723953, + "grad_norm": 1.1331053972244263, + "learning_rate": 0.0001399072344397392, + "loss": 1.9276, + "step": 19624 + }, + { + "epoch": 2.2896978182242447, + "grad_norm": 1.1495064496994019, + "learning_rate": 0.00013989216067814328, + "loss": 1.9812, + "step": 19625 + }, + { + "epoch": 2.2898144907245364, + "grad_norm": 1.2168304920196533, + "learning_rate": 0.0001398770870368846, + "loss": 1.9697, + "step": 19626 + }, + { + "epoch": 2.289931163224828, + "grad_norm": 1.0107324123382568, + "learning_rate": 0.00013986201351611941, + "loss": 1.6835, + "step": 19627 + }, + { + "epoch": 2.2900478357251197, + "grad_norm": 1.1861923933029175, + "learning_rate": 0.00013984694011600395, + "loss": 1.9607, + "step": 19628 + }, + { + "epoch": 2.2901645082254114, + "grad_norm": 1.1590882539749146, + "learning_rate": 0.00013983186683669458, + "loss": 1.9959, + "step": 19629 + }, + { + "epoch": 2.290281180725703, + "grad_norm": 1.094321608543396, + "learning_rate": 0.00013981679367834745, + "loss": 1.9575, + "step": 19630 + }, + { + "epoch": 2.2903978532259948, + "grad_norm": 1.0923153162002563, + "learning_rate": 0.00013980172064111896, + "loss": 1.6792, + "step": 19631 + }, + { + "epoch": 2.2905145257262864, + "grad_norm": 1.2406911849975586, + "learning_rate": 0.0001397866477251652, + "loss": 1.9827, + "step": 19632 + }, + { + "epoch": 2.290631198226578, + "grad_norm": 0.9809187054634094, + "learning_rate": 0.0001397715749306426, + "loss": 1.7802, + "step": 19633 + }, + { + "epoch": 2.29074787072687, + "grad_norm": 1.242872953414917, + "learning_rate": 0.0001397565022577073, + "loss": 1.9275, + "step": 19634 + }, + { + "epoch": 2.2908645432271615, + "grad_norm": 1.0464309453964233, + "learning_rate": 0.00013974142970651554, + "loss": 1.8942, + "step": 19635 + }, + { + "epoch": 2.290981215727453, + "grad_norm": 1.136374592781067, + "learning_rate": 0.0001397263572772237, + "loss": 2.0553, + "step": 19636 + }, + { + "epoch": 2.291097888227745, + "grad_norm": 1.19379723072052, + "learning_rate": 0.00013971128496998792, + "loss": 1.9042, + "step": 19637 + }, + { + "epoch": 2.2912145607280365, + "grad_norm": 1.263586163520813, + "learning_rate": 0.00013969621278496453, + "loss": 1.8572, + "step": 19638 + }, + { + "epoch": 2.291331233228328, + "grad_norm": 1.0707674026489258, + "learning_rate": 0.0001396811407223097, + "loss": 1.9721, + "step": 19639 + }, + { + "epoch": 2.29144790572862, + "grad_norm": 1.3182249069213867, + "learning_rate": 0.00013966606878217978, + "loss": 2.0185, + "step": 19640 + }, + { + "epoch": 2.2915645782289116, + "grad_norm": 1.0731664896011353, + "learning_rate": 0.0001396509969647309, + "loss": 1.7854, + "step": 19641 + }, + { + "epoch": 2.2916812507292033, + "grad_norm": 1.1063965559005737, + "learning_rate": 0.00013963592527011946, + "loss": 1.8737, + "step": 19642 + }, + { + "epoch": 2.291797923229495, + "grad_norm": 1.2297334671020508, + "learning_rate": 0.00013962085369850156, + "loss": 1.9612, + "step": 19643 + }, + { + "epoch": 2.2919145957297866, + "grad_norm": 1.1292827129364014, + "learning_rate": 0.00013960578225003358, + "loss": 1.9852, + "step": 19644 + }, + { + "epoch": 2.2920312682300783, + "grad_norm": 1.3899980783462524, + "learning_rate": 0.00013959071092487158, + "loss": 2.0524, + "step": 19645 + }, + { + "epoch": 2.29214794073037, + "grad_norm": 1.1208313703536987, + "learning_rate": 0.000139575639723172, + "loss": 1.9127, + "step": 19646 + }, + { + "epoch": 2.2922646132306617, + "grad_norm": 1.1916900873184204, + "learning_rate": 0.00013956056864509092, + "loss": 1.9715, + "step": 19647 + }, + { + "epoch": 2.2923812857309533, + "grad_norm": 1.4579538106918335, + "learning_rate": 0.00013954549769078467, + "loss": 2.1612, + "step": 19648 + }, + { + "epoch": 2.292497958231245, + "grad_norm": 1.075169324874878, + "learning_rate": 0.00013953042686040953, + "loss": 1.8182, + "step": 19649 + }, + { + "epoch": 2.2926146307315367, + "grad_norm": 1.2081736326217651, + "learning_rate": 0.00013951535615412165, + "loss": 2.0027, + "step": 19650 + }, + { + "epoch": 2.2927313032318284, + "grad_norm": 1.242908239364624, + "learning_rate": 0.0001395002855720773, + "loss": 1.8609, + "step": 19651 + }, + { + "epoch": 2.29284797573212, + "grad_norm": 1.0156095027923584, + "learning_rate": 0.0001394852151144327, + "loss": 1.9376, + "step": 19652 + }, + { + "epoch": 2.2929646482324118, + "grad_norm": 1.0690356492996216, + "learning_rate": 0.00013947014478134416, + "loss": 2.0256, + "step": 19653 + }, + { + "epoch": 2.2930813207327034, + "grad_norm": 1.0201003551483154, + "learning_rate": 0.00013945507457296775, + "loss": 1.9078, + "step": 19654 + }, + { + "epoch": 2.293197993232995, + "grad_norm": 1.1217411756515503, + "learning_rate": 0.0001394400044894599, + "loss": 1.9223, + "step": 19655 + }, + { + "epoch": 2.293314665733287, + "grad_norm": 1.2059805393218994, + "learning_rate": 0.00013942493453097666, + "loss": 1.9646, + "step": 19656 + }, + { + "epoch": 2.2934313382335785, + "grad_norm": 1.0840840339660645, + "learning_rate": 0.0001394098646976744, + "loss": 1.9855, + "step": 19657 + }, + { + "epoch": 2.29354801073387, + "grad_norm": 1.221107840538025, + "learning_rate": 0.00013939479498970922, + "loss": 1.9679, + "step": 19658 + }, + { + "epoch": 2.293664683234162, + "grad_norm": 1.1610287427902222, + "learning_rate": 0.00013937972540723747, + "loss": 2.0791, + "step": 19659 + }, + { + "epoch": 2.2937813557344535, + "grad_norm": 1.2417168617248535, + "learning_rate": 0.00013936465595041533, + "loss": 1.7524, + "step": 19660 + }, + { + "epoch": 2.293898028234745, + "grad_norm": 1.2286516427993774, + "learning_rate": 0.00013934958661939895, + "loss": 1.861, + "step": 19661 + }, + { + "epoch": 2.294014700735037, + "grad_norm": 1.1990758180618286, + "learning_rate": 0.00013933451741434465, + "loss": 1.9561, + "step": 19662 + }, + { + "epoch": 2.2941313732353286, + "grad_norm": 1.2477476596832275, + "learning_rate": 0.00013931944833540856, + "loss": 2.0138, + "step": 19663 + }, + { + "epoch": 2.2942480457356202, + "grad_norm": 1.4457358121871948, + "learning_rate": 0.00013930437938274707, + "loss": 1.9002, + "step": 19664 + }, + { + "epoch": 2.294364718235912, + "grad_norm": 1.1665501594543457, + "learning_rate": 0.00013928931055651617, + "loss": 2.177, + "step": 19665 + }, + { + "epoch": 2.2944813907362036, + "grad_norm": 1.1461647748947144, + "learning_rate": 0.00013927424185687228, + "loss": 1.8668, + "step": 19666 + }, + { + "epoch": 2.2945980632364953, + "grad_norm": 1.0497853755950928, + "learning_rate": 0.00013925917328397146, + "loss": 1.9503, + "step": 19667 + }, + { + "epoch": 2.294714735736787, + "grad_norm": 1.3209614753723145, + "learning_rate": 0.00013924410483797, + "loss": 1.9553, + "step": 19668 + }, + { + "epoch": 2.2948314082370787, + "grad_norm": 1.0818909406661987, + "learning_rate": 0.0001392290365190241, + "loss": 1.8815, + "step": 19669 + }, + { + "epoch": 2.2949480807373703, + "grad_norm": 1.1304354667663574, + "learning_rate": 0.00013921396832729003, + "loss": 1.9922, + "step": 19670 + }, + { + "epoch": 2.295064753237662, + "grad_norm": 1.1150974035263062, + "learning_rate": 0.00013919890026292385, + "loss": 1.9669, + "step": 19671 + }, + { + "epoch": 2.2951814257379537, + "grad_norm": 1.2173367738723755, + "learning_rate": 0.00013918383232608194, + "loss": 2.0798, + "step": 19672 + }, + { + "epoch": 2.2952980982382454, + "grad_norm": 1.2125927209854126, + "learning_rate": 0.0001391687645169204, + "loss": 1.9916, + "step": 19673 + }, + { + "epoch": 2.295414770738537, + "grad_norm": 1.076094388961792, + "learning_rate": 0.00013915369683559543, + "loss": 1.8334, + "step": 19674 + }, + { + "epoch": 2.2955314432388287, + "grad_norm": 1.3459302186965942, + "learning_rate": 0.0001391386292822633, + "loss": 2.0173, + "step": 19675 + }, + { + "epoch": 2.2956481157391204, + "grad_norm": 1.2030707597732544, + "learning_rate": 0.0001391235618570802, + "loss": 2.0272, + "step": 19676 + }, + { + "epoch": 2.295764788239412, + "grad_norm": 1.0962011814117432, + "learning_rate": 0.00013910849456020234, + "loss": 1.9887, + "step": 19677 + }, + { + "epoch": 2.295881460739704, + "grad_norm": 1.2270961999893188, + "learning_rate": 0.00013909342739178585, + "loss": 1.9074, + "step": 19678 + }, + { + "epoch": 2.2959981332399955, + "grad_norm": 1.1178876161575317, + "learning_rate": 0.00013907836035198702, + "loss": 1.9496, + "step": 19679 + }, + { + "epoch": 2.296114805740287, + "grad_norm": 1.246341586112976, + "learning_rate": 0.00013906329344096197, + "loss": 1.9037, + "step": 19680 + }, + { + "epoch": 2.296231478240579, + "grad_norm": 1.259406566619873, + "learning_rate": 0.00013904822665886698, + "loss": 2.1476, + "step": 19681 + }, + { + "epoch": 2.2963481507408705, + "grad_norm": 1.1550402641296387, + "learning_rate": 0.00013903316000585814, + "loss": 1.8367, + "step": 19682 + }, + { + "epoch": 2.296464823241162, + "grad_norm": 1.1874126195907593, + "learning_rate": 0.00013901809348209178, + "loss": 1.9507, + "step": 19683 + }, + { + "epoch": 2.296581495741454, + "grad_norm": 1.22492253780365, + "learning_rate": 0.00013900302708772392, + "loss": 2.0033, + "step": 19684 + }, + { + "epoch": 2.2966981682417456, + "grad_norm": 1.1350435018539429, + "learning_rate": 0.00013898796082291093, + "loss": 1.86, + "step": 19685 + }, + { + "epoch": 2.2968148407420372, + "grad_norm": 1.2540851831436157, + "learning_rate": 0.0001389728946878089, + "loss": 1.9772, + "step": 19686 + }, + { + "epoch": 2.296931513242329, + "grad_norm": 1.1629778146743774, + "learning_rate": 0.00013895782868257405, + "loss": 2.0501, + "step": 19687 + }, + { + "epoch": 2.2970481857426206, + "grad_norm": 1.1688166856765747, + "learning_rate": 0.00013894276280736252, + "loss": 1.8965, + "step": 19688 + }, + { + "epoch": 2.2971648582429123, + "grad_norm": 1.271804928779602, + "learning_rate": 0.00013892769706233053, + "loss": 1.8969, + "step": 19689 + }, + { + "epoch": 2.297281530743204, + "grad_norm": 1.2605341672897339, + "learning_rate": 0.0001389126314476343, + "loss": 2.1054, + "step": 19690 + }, + { + "epoch": 2.2973982032434956, + "grad_norm": 0.9018110036849976, + "learning_rate": 0.00013889756596342993, + "loss": 1.7119, + "step": 19691 + }, + { + "epoch": 2.2975148757437873, + "grad_norm": 1.1983944177627563, + "learning_rate": 0.00013888250060987373, + "loss": 1.9241, + "step": 19692 + }, + { + "epoch": 2.297631548244079, + "grad_norm": 1.0854053497314453, + "learning_rate": 0.00013886743538712177, + "loss": 1.9515, + "step": 19693 + }, + { + "epoch": 2.2977482207443707, + "grad_norm": 1.1997220516204834, + "learning_rate": 0.00013885237029533025, + "loss": 1.8208, + "step": 19694 + }, + { + "epoch": 2.2978648932446624, + "grad_norm": 1.20161771774292, + "learning_rate": 0.00013883730533465534, + "loss": 1.9371, + "step": 19695 + }, + { + "epoch": 2.297981565744954, + "grad_norm": 1.2473974227905273, + "learning_rate": 0.00013882224050525333, + "loss": 1.8662, + "step": 19696 + }, + { + "epoch": 2.2980982382452457, + "grad_norm": 1.207602620124817, + "learning_rate": 0.0001388071758072802, + "loss": 2.0385, + "step": 19697 + }, + { + "epoch": 2.2982149107455374, + "grad_norm": 1.155074119567871, + "learning_rate": 0.00013879211124089233, + "loss": 1.861, + "step": 19698 + }, + { + "epoch": 2.298331583245829, + "grad_norm": 1.1122206449508667, + "learning_rate": 0.0001387770468062457, + "loss": 1.9234, + "step": 19699 + }, + { + "epoch": 2.2984482557461208, + "grad_norm": 1.087188482284546, + "learning_rate": 0.00013876198250349663, + "loss": 2.1087, + "step": 19700 + }, + { + "epoch": 2.2985649282464125, + "grad_norm": 1.0871518850326538, + "learning_rate": 0.0001387469183328012, + "loss": 1.9249, + "step": 19701 + }, + { + "epoch": 2.298681600746704, + "grad_norm": 1.2167372703552246, + "learning_rate": 0.00013873185429431558, + "loss": 1.9069, + "step": 19702 + }, + { + "epoch": 2.298798273246996, + "grad_norm": 1.2612228393554688, + "learning_rate": 0.00013871679038819607, + "loss": 1.9693, + "step": 19703 + }, + { + "epoch": 2.2989149457472875, + "grad_norm": 1.1324273347854614, + "learning_rate": 0.00013870172661459865, + "loss": 2.0573, + "step": 19704 + }, + { + "epoch": 2.299031618247579, + "grad_norm": 1.2325488328933716, + "learning_rate": 0.00013868666297367962, + "loss": 1.929, + "step": 19705 + }, + { + "epoch": 2.299148290747871, + "grad_norm": 0.9493870139122009, + "learning_rate": 0.00013867159946559502, + "loss": 1.782, + "step": 19706 + }, + { + "epoch": 2.2992649632481625, + "grad_norm": 1.0803567171096802, + "learning_rate": 0.00013865653609050117, + "loss": 2.0122, + "step": 19707 + }, + { + "epoch": 2.2993816357484542, + "grad_norm": 1.1085906028747559, + "learning_rate": 0.00013864147284855407, + "loss": 2.0734, + "step": 19708 + }, + { + "epoch": 2.299498308248746, + "grad_norm": 1.057192325592041, + "learning_rate": 0.00013862640973991003, + "loss": 1.7573, + "step": 19709 + }, + { + "epoch": 2.2996149807490376, + "grad_norm": 1.2664000988006592, + "learning_rate": 0.00013861134676472505, + "loss": 1.9432, + "step": 19710 + }, + { + "epoch": 2.2997316532493293, + "grad_norm": 1.1437602043151855, + "learning_rate": 0.00013859628392315544, + "loss": 1.9498, + "step": 19711 + }, + { + "epoch": 2.299848325749621, + "grad_norm": 0.9523216485977173, + "learning_rate": 0.00013858122121535726, + "loss": 1.7542, + "step": 19712 + }, + { + "epoch": 2.2999649982499126, + "grad_norm": 1.1466530561447144, + "learning_rate": 0.0001385661586414867, + "loss": 1.8802, + "step": 19713 + }, + { + "epoch": 2.3000816707502043, + "grad_norm": 1.0366251468658447, + "learning_rate": 0.00013855109620169984, + "loss": 1.9286, + "step": 19714 + }, + { + "epoch": 2.300198343250496, + "grad_norm": 1.2024240493774414, + "learning_rate": 0.0001385360338961529, + "loss": 1.9797, + "step": 19715 + }, + { + "epoch": 2.3003150157507877, + "grad_norm": 1.128584623336792, + "learning_rate": 0.00013852097172500205, + "loss": 1.8851, + "step": 19716 + }, + { + "epoch": 2.3004316882510794, + "grad_norm": 1.1358524560928345, + "learning_rate": 0.00013850590968840336, + "loss": 1.9129, + "step": 19717 + }, + { + "epoch": 2.300548360751371, + "grad_norm": 1.0713433027267456, + "learning_rate": 0.00013849084778651312, + "loss": 1.8517, + "step": 19718 + }, + { + "epoch": 2.3006650332516627, + "grad_norm": 1.2238807678222656, + "learning_rate": 0.0001384757860194873, + "loss": 2.0603, + "step": 19719 + }, + { + "epoch": 2.3007817057519544, + "grad_norm": 1.1379016637802124, + "learning_rate": 0.00013846072438748215, + "loss": 1.9665, + "step": 19720 + }, + { + "epoch": 2.300898378252246, + "grad_norm": 1.1029248237609863, + "learning_rate": 0.00013844566289065375, + "loss": 1.8242, + "step": 19721 + }, + { + "epoch": 2.3010150507525378, + "grad_norm": 1.158908486366272, + "learning_rate": 0.00013843060152915834, + "loss": 1.9124, + "step": 19722 + }, + { + "epoch": 2.3011317232528294, + "grad_norm": 1.2438348531723022, + "learning_rate": 0.0001384155403031519, + "loss": 2.0562, + "step": 19723 + }, + { + "epoch": 2.301248395753121, + "grad_norm": 1.199130654335022, + "learning_rate": 0.00013840047921279076, + "loss": 2.105, + "step": 19724 + }, + { + "epoch": 2.301365068253413, + "grad_norm": 1.1814138889312744, + "learning_rate": 0.00013838541825823088, + "loss": 1.9159, + "step": 19725 + }, + { + "epoch": 2.3014817407537045, + "grad_norm": 1.1751766204833984, + "learning_rate": 0.0001383703574396285, + "loss": 2.0304, + "step": 19726 + }, + { + "epoch": 2.301598413253996, + "grad_norm": 1.1373567581176758, + "learning_rate": 0.00013835529675713972, + "loss": 1.9723, + "step": 19727 + }, + { + "epoch": 2.301715085754288, + "grad_norm": 1.1208319664001465, + "learning_rate": 0.0001383402362109207, + "loss": 2.0472, + "step": 19728 + }, + { + "epoch": 2.3018317582545795, + "grad_norm": 1.1554502248764038, + "learning_rate": 0.00013832517580112758, + "loss": 1.9143, + "step": 19729 + }, + { + "epoch": 2.301948430754871, + "grad_norm": 1.0300487279891968, + "learning_rate": 0.00013831011552791638, + "loss": 1.878, + "step": 19730 + }, + { + "epoch": 2.302065103255163, + "grad_norm": 1.198055386543274, + "learning_rate": 0.00013829505539144342, + "loss": 1.9753, + "step": 19731 + }, + { + "epoch": 2.3021817757554546, + "grad_norm": 1.129874348640442, + "learning_rate": 0.0001382799953918646, + "loss": 2.054, + "step": 19732 + }, + { + "epoch": 2.3022984482557463, + "grad_norm": 0.9675971269607544, + "learning_rate": 0.00013826493552933624, + "loss": 1.91, + "step": 19733 + }, + { + "epoch": 2.302415120756038, + "grad_norm": 1.2885777950286865, + "learning_rate": 0.00013824987580401434, + "loss": 1.7367, + "step": 19734 + }, + { + "epoch": 2.3025317932563296, + "grad_norm": 1.0875988006591797, + "learning_rate": 0.00013823481621605513, + "loss": 1.8572, + "step": 19735 + }, + { + "epoch": 2.3026484657566213, + "grad_norm": 1.1365516185760498, + "learning_rate": 0.00013821975676561458, + "loss": 2.0216, + "step": 19736 + }, + { + "epoch": 2.302765138256913, + "grad_norm": 1.0665165185928345, + "learning_rate": 0.00013820469745284898, + "loss": 1.9787, + "step": 19737 + }, + { + "epoch": 2.3028818107572047, + "grad_norm": 1.129375696182251, + "learning_rate": 0.00013818963827791434, + "loss": 1.8745, + "step": 19738 + }, + { + "epoch": 2.3029984832574963, + "grad_norm": 1.2645913362503052, + "learning_rate": 0.00013817457924096682, + "loss": 1.9752, + "step": 19739 + }, + { + "epoch": 2.303115155757788, + "grad_norm": 1.1022194623947144, + "learning_rate": 0.00013815952034216248, + "loss": 2.1695, + "step": 19740 + }, + { + "epoch": 2.3032318282580797, + "grad_norm": 1.272651195526123, + "learning_rate": 0.00013814446158165746, + "loss": 2.0784, + "step": 19741 + }, + { + "epoch": 2.3033485007583714, + "grad_norm": 1.209633469581604, + "learning_rate": 0.00013812940295960795, + "loss": 1.9834, + "step": 19742 + }, + { + "epoch": 2.303465173258663, + "grad_norm": 1.1507302522659302, + "learning_rate": 0.00013811434447616992, + "loss": 1.9117, + "step": 19743 + }, + { + "epoch": 2.3035818457589547, + "grad_norm": 1.100422739982605, + "learning_rate": 0.00013809928613149966, + "loss": 2.0882, + "step": 19744 + }, + { + "epoch": 2.3036985182592464, + "grad_norm": 1.233825445175171, + "learning_rate": 0.00013808422792575313, + "loss": 2.0696, + "step": 19745 + }, + { + "epoch": 2.303815190759538, + "grad_norm": 1.1373895406723022, + "learning_rate": 0.00013806916985908648, + "loss": 1.8924, + "step": 19746 + }, + { + "epoch": 2.30393186325983, + "grad_norm": 1.2104243040084839, + "learning_rate": 0.0001380541119316558, + "loss": 2.0533, + "step": 19747 + }, + { + "epoch": 2.3040485357601215, + "grad_norm": 1.1367461681365967, + "learning_rate": 0.00013803905414361723, + "loss": 1.7561, + "step": 19748 + }, + { + "epoch": 2.304165208260413, + "grad_norm": 1.2961246967315674, + "learning_rate": 0.00013802399649512682, + "loss": 1.7882, + "step": 19749 + }, + { + "epoch": 2.304281880760705, + "grad_norm": 1.0455899238586426, + "learning_rate": 0.0001380089389863408, + "loss": 1.8712, + "step": 19750 + }, + { + "epoch": 2.3043985532609965, + "grad_norm": 1.197020411491394, + "learning_rate": 0.00013799388161741505, + "loss": 1.9385, + "step": 19751 + }, + { + "epoch": 2.304515225761288, + "grad_norm": 1.1884247064590454, + "learning_rate": 0.00013797882438850586, + "loss": 1.7635, + "step": 19752 + }, + { + "epoch": 2.30463189826158, + "grad_norm": 1.1554359197616577, + "learning_rate": 0.00013796376729976922, + "loss": 2.2125, + "step": 19753 + }, + { + "epoch": 2.3047485707618716, + "grad_norm": 1.1162505149841309, + "learning_rate": 0.0001379487103513613, + "loss": 1.9717, + "step": 19754 + }, + { + "epoch": 2.3048652432621632, + "grad_norm": 1.141445517539978, + "learning_rate": 0.0001379336535434382, + "loss": 2.1102, + "step": 19755 + }, + { + "epoch": 2.304981915762455, + "grad_norm": 1.2930259704589844, + "learning_rate": 0.0001379185968761559, + "loss": 2.0452, + "step": 19756 + }, + { + "epoch": 2.3050985882627466, + "grad_norm": 1.1671165227890015, + "learning_rate": 0.0001379035403496706, + "loss": 2.0274, + "step": 19757 + }, + { + "epoch": 2.3052152607630383, + "grad_norm": 1.0370217561721802, + "learning_rate": 0.00013788848396413831, + "loss": 1.9101, + "step": 19758 + }, + { + "epoch": 2.30533193326333, + "grad_norm": 1.2428356409072876, + "learning_rate": 0.00013787342771971522, + "loss": 2.1697, + "step": 19759 + }, + { + "epoch": 2.3054486057636217, + "grad_norm": 1.322498083114624, + "learning_rate": 0.00013785837161655728, + "loss": 2.1672, + "step": 19760 + }, + { + "epoch": 2.3055652782639133, + "grad_norm": 1.1029613018035889, + "learning_rate": 0.00013784331565482074, + "loss": 1.9754, + "step": 19761 + }, + { + "epoch": 2.305681950764205, + "grad_norm": 1.2600939273834229, + "learning_rate": 0.00013782825983466151, + "loss": 1.994, + "step": 19762 + }, + { + "epoch": 2.3057986232644967, + "grad_norm": 1.0079962015151978, + "learning_rate": 0.0001378132041562358, + "loss": 1.7877, + "step": 19763 + }, + { + "epoch": 2.3059152957647884, + "grad_norm": 1.2841482162475586, + "learning_rate": 0.00013779814861969965, + "loss": 2.0965, + "step": 19764 + }, + { + "epoch": 2.30603196826508, + "grad_norm": 1.2469815015792847, + "learning_rate": 0.00013778309322520915, + "loss": 2.156, + "step": 19765 + }, + { + "epoch": 2.3061486407653717, + "grad_norm": 1.0963554382324219, + "learning_rate": 0.00013776803797292033, + "loss": 1.9596, + "step": 19766 + }, + { + "epoch": 2.3062653132656634, + "grad_norm": 1.5322967767715454, + "learning_rate": 0.00013775298286298926, + "loss": 2.0489, + "step": 19767 + }, + { + "epoch": 2.306381985765955, + "grad_norm": 1.0837925672531128, + "learning_rate": 0.00013773792789557215, + "loss": 1.9405, + "step": 19768 + }, + { + "epoch": 2.306498658266247, + "grad_norm": 1.077422022819519, + "learning_rate": 0.0001377228730708249, + "loss": 1.8165, + "step": 19769 + }, + { + "epoch": 2.3066153307665385, + "grad_norm": 1.1138466596603394, + "learning_rate": 0.00013770781838890372, + "loss": 1.9334, + "step": 19770 + }, + { + "epoch": 2.30673200326683, + "grad_norm": 1.0703301429748535, + "learning_rate": 0.0001376927638499646, + "loss": 1.9583, + "step": 19771 + }, + { + "epoch": 2.306848675767122, + "grad_norm": 1.2529441118240356, + "learning_rate": 0.00013767770945416366, + "loss": 1.9815, + "step": 19772 + }, + { + "epoch": 2.3069653482674135, + "grad_norm": 1.0777350664138794, + "learning_rate": 0.00013766265520165686, + "loss": 1.9334, + "step": 19773 + }, + { + "epoch": 2.307082020767705, + "grad_norm": 1.3839744329452515, + "learning_rate": 0.0001376476010926004, + "loss": 2.0269, + "step": 19774 + }, + { + "epoch": 2.307198693267997, + "grad_norm": 1.1042637825012207, + "learning_rate": 0.00013763254712715023, + "loss": 1.908, + "step": 19775 + }, + { + "epoch": 2.3073153657682886, + "grad_norm": 1.102842092514038, + "learning_rate": 0.00013761749330546254, + "loss": 1.779, + "step": 19776 + }, + { + "epoch": 2.3074320382685802, + "grad_norm": 1.2075772285461426, + "learning_rate": 0.00013760243962769326, + "loss": 1.7986, + "step": 19777 + }, + { + "epoch": 2.307548710768872, + "grad_norm": 1.0633554458618164, + "learning_rate": 0.00013758738609399856, + "loss": 1.9975, + "step": 19778 + }, + { + "epoch": 2.3076653832691636, + "grad_norm": 1.4096951484680176, + "learning_rate": 0.00013757233270453442, + "loss": 1.9921, + "step": 19779 + }, + { + "epoch": 2.3077820557694553, + "grad_norm": 1.0687377452850342, + "learning_rate": 0.00013755727945945692, + "loss": 1.8239, + "step": 19780 + }, + { + "epoch": 2.307898728269747, + "grad_norm": 1.2549079656600952, + "learning_rate": 0.00013754222635892216, + "loss": 2.1105, + "step": 19781 + }, + { + "epoch": 2.3080154007700386, + "grad_norm": 1.1016874313354492, + "learning_rate": 0.0001375271734030861, + "loss": 1.9475, + "step": 19782 + }, + { + "epoch": 2.3081320732703303, + "grad_norm": 1.2582356929779053, + "learning_rate": 0.0001375121205921049, + "loss": 1.9207, + "step": 19783 + }, + { + "epoch": 2.308248745770622, + "grad_norm": 1.2403796911239624, + "learning_rate": 0.0001374970679261345, + "loss": 1.9468, + "step": 19784 + }, + { + "epoch": 2.3083654182709137, + "grad_norm": 1.1675573587417603, + "learning_rate": 0.0001374820154053311, + "loss": 1.7413, + "step": 19785 + }, + { + "epoch": 2.3084820907712054, + "grad_norm": 1.0866583585739136, + "learning_rate": 0.00013746696302985051, + "loss": 1.9478, + "step": 19786 + }, + { + "epoch": 2.308598763271497, + "grad_norm": 1.3049721717834473, + "learning_rate": 0.00013745191079984906, + "loss": 1.938, + "step": 19787 + }, + { + "epoch": 2.3087154357717887, + "grad_norm": 1.3447459936141968, + "learning_rate": 0.00013743685871548262, + "loss": 2.0916, + "step": 19788 + }, + { + "epoch": 2.3088321082720804, + "grad_norm": 1.6425046920776367, + "learning_rate": 0.00013742180677690725, + "loss": 2.0889, + "step": 19789 + }, + { + "epoch": 2.308948780772372, + "grad_norm": 1.1713470220565796, + "learning_rate": 0.000137406754984279, + "loss": 1.9002, + "step": 19790 + }, + { + "epoch": 2.3090654532726638, + "grad_norm": 1.1683011054992676, + "learning_rate": 0.00013739170333775398, + "loss": 1.7821, + "step": 19791 + }, + { + "epoch": 2.3091821257729555, + "grad_norm": 1.1650766134262085, + "learning_rate": 0.0001373766518374881, + "loss": 1.9754, + "step": 19792 + }, + { + "epoch": 2.309298798273247, + "grad_norm": 1.0754565000534058, + "learning_rate": 0.0001373616004836375, + "loss": 1.8779, + "step": 19793 + }, + { + "epoch": 2.309415470773539, + "grad_norm": 1.14238703250885, + "learning_rate": 0.0001373465492763582, + "loss": 2.0326, + "step": 19794 + }, + { + "epoch": 2.3095321432738305, + "grad_norm": 1.1045883893966675, + "learning_rate": 0.00013733149821580615, + "loss": 1.9872, + "step": 19795 + }, + { + "epoch": 2.309648815774122, + "grad_norm": 1.0376079082489014, + "learning_rate": 0.00013731644730213756, + "loss": 1.9017, + "step": 19796 + }, + { + "epoch": 2.309765488274414, + "grad_norm": 1.074587345123291, + "learning_rate": 0.00013730139653550828, + "loss": 2.0778, + "step": 19797 + }, + { + "epoch": 2.3098821607747055, + "grad_norm": 1.3479682207107544, + "learning_rate": 0.00013728634591607446, + "loss": 2.1118, + "step": 19798 + }, + { + "epoch": 2.309998833274997, + "grad_norm": 1.195030689239502, + "learning_rate": 0.000137271295443992, + "loss": 1.904, + "step": 19799 + }, + { + "epoch": 2.310115505775289, + "grad_norm": 1.0180226564407349, + "learning_rate": 0.0001372562451194171, + "loss": 1.9573, + "step": 19800 + }, + { + "epoch": 2.3102321782755806, + "grad_norm": 1.1072994470596313, + "learning_rate": 0.00013724119494250563, + "loss": 1.995, + "step": 19801 + }, + { + "epoch": 2.3103488507758723, + "grad_norm": 1.1309391260147095, + "learning_rate": 0.0001372261449134137, + "loss": 1.9612, + "step": 19802 + }, + { + "epoch": 2.310465523276164, + "grad_norm": 1.159216284751892, + "learning_rate": 0.00013721109503229726, + "loss": 1.9203, + "step": 19803 + }, + { + "epoch": 2.3105821957764556, + "grad_norm": 1.2793751955032349, + "learning_rate": 0.00013719604529931247, + "loss": 1.7824, + "step": 19804 + }, + { + "epoch": 2.3106988682767473, + "grad_norm": 1.0882322788238525, + "learning_rate": 0.00013718099571461515, + "loss": 2.0822, + "step": 19805 + }, + { + "epoch": 2.310815540777039, + "grad_norm": 1.235678791999817, + "learning_rate": 0.0001371659462783615, + "loss": 1.8888, + "step": 19806 + }, + { + "epoch": 2.3109322132773307, + "grad_norm": 1.131616234779358, + "learning_rate": 0.00013715089699070749, + "loss": 1.9671, + "step": 19807 + }, + { + "epoch": 2.3110488857776224, + "grad_norm": 1.2477303743362427, + "learning_rate": 0.000137135847851809, + "loss": 2.0213, + "step": 19808 + }, + { + "epoch": 2.311165558277914, + "grad_norm": 1.1541500091552734, + "learning_rate": 0.00013712079886182225, + "loss": 1.9037, + "step": 19809 + }, + { + "epoch": 2.3112822307782057, + "grad_norm": 1.4809229373931885, + "learning_rate": 0.00013710575002090307, + "loss": 2.0213, + "step": 19810 + }, + { + "epoch": 2.3113989032784974, + "grad_norm": 1.3314156532287598, + "learning_rate": 0.0001370907013292076, + "loss": 2.056, + "step": 19811 + }, + { + "epoch": 2.311515575778789, + "grad_norm": 1.0747771263122559, + "learning_rate": 0.00013707565278689175, + "loss": 2.0493, + "step": 19812 + }, + { + "epoch": 2.3116322482790808, + "grad_norm": 1.228650689125061, + "learning_rate": 0.00013706060439411163, + "loss": 2.0047, + "step": 19813 + }, + { + "epoch": 2.3117489207793724, + "grad_norm": 1.218276858329773, + "learning_rate": 0.00013704555615102316, + "loss": 2.0403, + "step": 19814 + }, + { + "epoch": 2.311865593279664, + "grad_norm": 1.2381361722946167, + "learning_rate": 0.00013703050805778238, + "loss": 1.9862, + "step": 19815 + }, + { + "epoch": 2.311982265779956, + "grad_norm": 1.1715244054794312, + "learning_rate": 0.00013701546011454528, + "loss": 1.8897, + "step": 19816 + }, + { + "epoch": 2.3120989382802475, + "grad_norm": 1.023274540901184, + "learning_rate": 0.0001370004123214679, + "loss": 1.8437, + "step": 19817 + }, + { + "epoch": 2.312215610780539, + "grad_norm": 1.0939342975616455, + "learning_rate": 0.00013698536467870613, + "loss": 2.0971, + "step": 19818 + }, + { + "epoch": 2.312332283280831, + "grad_norm": 1.313396692276001, + "learning_rate": 0.00013697031718641603, + "loss": 2.0558, + "step": 19819 + }, + { + "epoch": 2.3124489557811225, + "grad_norm": 1.2082065343856812, + "learning_rate": 0.00013695526984475368, + "loss": 2.0441, + "step": 19820 + }, + { + "epoch": 2.312565628281414, + "grad_norm": 1.13568913936615, + "learning_rate": 0.000136940222653875, + "loss": 1.885, + "step": 19821 + }, + { + "epoch": 2.312682300781706, + "grad_norm": 1.1400502920150757, + "learning_rate": 0.00013692517561393594, + "loss": 1.9885, + "step": 19822 + }, + { + "epoch": 2.3127989732819976, + "grad_norm": 1.0592105388641357, + "learning_rate": 0.00013691012872509256, + "loss": 1.8645, + "step": 19823 + }, + { + "epoch": 2.3129156457822893, + "grad_norm": 1.309212565422058, + "learning_rate": 0.00013689508198750085, + "loss": 2.0917, + "step": 19824 + }, + { + "epoch": 2.313032318282581, + "grad_norm": 1.1877057552337646, + "learning_rate": 0.0001368800354013167, + "loss": 1.8673, + "step": 19825 + }, + { + "epoch": 2.3131489907828726, + "grad_norm": 1.1451555490493774, + "learning_rate": 0.00013686498896669624, + "loss": 1.9731, + "step": 19826 + }, + { + "epoch": 2.3132656632831643, + "grad_norm": 1.1184377670288086, + "learning_rate": 0.0001368499426837953, + "loss": 1.9028, + "step": 19827 + }, + { + "epoch": 2.313382335783456, + "grad_norm": 1.2901897430419922, + "learning_rate": 0.00013683489655277002, + "loss": 1.9933, + "step": 19828 + }, + { + "epoch": 2.3134990082837477, + "grad_norm": 1.2750324010849, + "learning_rate": 0.00013681985057377625, + "loss": 1.9671, + "step": 19829 + }, + { + "epoch": 2.3136156807840393, + "grad_norm": 1.1011576652526855, + "learning_rate": 0.00013680480474697013, + "loss": 1.8857, + "step": 19830 + }, + { + "epoch": 2.313732353284331, + "grad_norm": 1.1275049448013306, + "learning_rate": 0.00013678975907250747, + "loss": 1.9946, + "step": 19831 + }, + { + "epoch": 2.3138490257846227, + "grad_norm": 0.9375014901161194, + "learning_rate": 0.00013677471355054427, + "loss": 1.8783, + "step": 19832 + }, + { + "epoch": 2.3139656982849144, + "grad_norm": 1.1253396272659302, + "learning_rate": 0.0001367596681812366, + "loss": 1.9872, + "step": 19833 + }, + { + "epoch": 2.314082370785206, + "grad_norm": 1.1126571893692017, + "learning_rate": 0.00013674462296474034, + "loss": 2.0017, + "step": 19834 + }, + { + "epoch": 2.3141990432854977, + "grad_norm": 0.9765630960464478, + "learning_rate": 0.00013672957790121157, + "loss": 1.7328, + "step": 19835 + }, + { + "epoch": 2.3143157157857894, + "grad_norm": 1.0771195888519287, + "learning_rate": 0.00013671453299080613, + "loss": 1.7667, + "step": 19836 + }, + { + "epoch": 2.314432388286081, + "grad_norm": 1.131382703781128, + "learning_rate": 0.00013669948823368013, + "loss": 2.0125, + "step": 19837 + }, + { + "epoch": 2.314549060786373, + "grad_norm": 1.0898441076278687, + "learning_rate": 0.00013668444362998936, + "loss": 1.8487, + "step": 19838 + }, + { + "epoch": 2.3146657332866645, + "grad_norm": 1.154868721961975, + "learning_rate": 0.00013666939917988998, + "loss": 2.0783, + "step": 19839 + }, + { + "epoch": 2.314782405786956, + "grad_norm": 1.0545226335525513, + "learning_rate": 0.00013665435488353784, + "loss": 1.7798, + "step": 19840 + }, + { + "epoch": 2.314899078287248, + "grad_norm": 1.1536632776260376, + "learning_rate": 0.00013663931074108895, + "loss": 1.9534, + "step": 19841 + }, + { + "epoch": 2.3150157507875395, + "grad_norm": 1.2594828605651855, + "learning_rate": 0.00013662426675269918, + "loss": 2.0126, + "step": 19842 + }, + { + "epoch": 2.315132423287831, + "grad_norm": 1.2830651998519897, + "learning_rate": 0.0001366092229185246, + "loss": 2.1325, + "step": 19843 + }, + { + "epoch": 2.315249095788123, + "grad_norm": 1.0367817878723145, + "learning_rate": 0.00013659417923872106, + "loss": 2.0249, + "step": 19844 + }, + { + "epoch": 2.3153657682884146, + "grad_norm": 1.3188756704330444, + "learning_rate": 0.0001365791357134446, + "loss": 2.1037, + "step": 19845 + }, + { + "epoch": 2.3154824407887062, + "grad_norm": 1.1003104448318481, + "learning_rate": 0.00013656409234285118, + "loss": 2.026, + "step": 19846 + }, + { + "epoch": 2.315599113288998, + "grad_norm": 1.1857882738113403, + "learning_rate": 0.00013654904912709673, + "loss": 1.7925, + "step": 19847 + }, + { + "epoch": 2.3157157857892896, + "grad_norm": 1.165452241897583, + "learning_rate": 0.0001365340060663372, + "loss": 2.0385, + "step": 19848 + }, + { + "epoch": 2.3158324582895813, + "grad_norm": 1.269388198852539, + "learning_rate": 0.00013651896316072852, + "loss": 2.0865, + "step": 19849 + }, + { + "epoch": 2.315949130789873, + "grad_norm": 1.1944653987884521, + "learning_rate": 0.0001365039204104267, + "loss": 1.9975, + "step": 19850 + }, + { + "epoch": 2.3160658032901646, + "grad_norm": 1.1904029846191406, + "learning_rate": 0.00013648887781558756, + "loss": 2.0105, + "step": 19851 + }, + { + "epoch": 2.3161824757904563, + "grad_norm": 1.0865063667297363, + "learning_rate": 0.0001364738353763672, + "loss": 1.9597, + "step": 19852 + }, + { + "epoch": 2.316299148290748, + "grad_norm": 1.3828014135360718, + "learning_rate": 0.00013645879309292141, + "loss": 1.9737, + "step": 19853 + }, + { + "epoch": 2.3164158207910397, + "grad_norm": 1.1447978019714355, + "learning_rate": 0.00013644375096540632, + "loss": 1.8808, + "step": 19854 + }, + { + "epoch": 2.3165324932913314, + "grad_norm": 1.202544093132019, + "learning_rate": 0.0001364287089939777, + "loss": 2.0034, + "step": 19855 + }, + { + "epoch": 2.316649165791623, + "grad_norm": 1.1434322595596313, + "learning_rate": 0.00013641366717879154, + "loss": 1.8624, + "step": 19856 + }, + { + "epoch": 2.3167658382919147, + "grad_norm": 1.2999589443206787, + "learning_rate": 0.00013639862552000383, + "loss": 2.1156, + "step": 19857 + }, + { + "epoch": 2.3168825107922064, + "grad_norm": 1.0055826902389526, + "learning_rate": 0.0001363835840177704, + "loss": 1.8616, + "step": 19858 + }, + { + "epoch": 2.316999183292498, + "grad_norm": 1.0280784368515015, + "learning_rate": 0.00013636854267224733, + "loss": 1.9158, + "step": 19859 + }, + { + "epoch": 2.31711585579279, + "grad_norm": 1.177641749382019, + "learning_rate": 0.0001363535014835904, + "loss": 1.9736, + "step": 19860 + }, + { + "epoch": 2.3172325282930815, + "grad_norm": 1.184613585472107, + "learning_rate": 0.00013633846045195565, + "loss": 2.062, + "step": 19861 + }, + { + "epoch": 2.317349200793373, + "grad_norm": 1.1549400091171265, + "learning_rate": 0.00013632341957749893, + "loss": 2.1196, + "step": 19862 + }, + { + "epoch": 2.317465873293665, + "grad_norm": 1.2127492427825928, + "learning_rate": 0.00013630837886037625, + "loss": 2.0015, + "step": 19863 + }, + { + "epoch": 2.3175825457939565, + "grad_norm": 1.198667049407959, + "learning_rate": 0.00013629333830074343, + "loss": 1.8965, + "step": 19864 + }, + { + "epoch": 2.317699218294248, + "grad_norm": 1.0264594554901123, + "learning_rate": 0.00013627829789875654, + "loss": 2.0211, + "step": 19865 + }, + { + "epoch": 2.31781589079454, + "grad_norm": 1.184706687927246, + "learning_rate": 0.00013626325765457136, + "loss": 2.2156, + "step": 19866 + }, + { + "epoch": 2.3179325632948315, + "grad_norm": 1.263621211051941, + "learning_rate": 0.0001362482175683439, + "loss": 1.9721, + "step": 19867 + }, + { + "epoch": 2.3180492357951232, + "grad_norm": 1.2304195165634155, + "learning_rate": 0.00013623317764023, + "loss": 1.9277, + "step": 19868 + }, + { + "epoch": 2.318165908295415, + "grad_norm": 1.1853989362716675, + "learning_rate": 0.0001362181378703857, + "loss": 2.1265, + "step": 19869 + }, + { + "epoch": 2.3182825807957066, + "grad_norm": 1.218099594116211, + "learning_rate": 0.00013620309825896674, + "loss": 2.116, + "step": 19870 + }, + { + "epoch": 2.3183992532959983, + "grad_norm": 1.216935634613037, + "learning_rate": 0.00013618805880612916, + "loss": 1.974, + "step": 19871 + }, + { + "epoch": 2.31851592579629, + "grad_norm": 1.182765007019043, + "learning_rate": 0.0001361730195120289, + "loss": 2.2748, + "step": 19872 + }, + { + "epoch": 2.3186325982965816, + "grad_norm": 1.240443468093872, + "learning_rate": 0.00013615798037682178, + "loss": 2.1048, + "step": 19873 + }, + { + "epoch": 2.3187492707968733, + "grad_norm": 1.4212661981582642, + "learning_rate": 0.00013614294140066373, + "loss": 1.9845, + "step": 19874 + }, + { + "epoch": 2.318865943297165, + "grad_norm": 1.1267679929733276, + "learning_rate": 0.0001361279025837107, + "loss": 1.9084, + "step": 19875 + }, + { + "epoch": 2.3189826157974567, + "grad_norm": 1.0950229167938232, + "learning_rate": 0.0001361128639261186, + "loss": 2.0536, + "step": 19876 + }, + { + "epoch": 2.3190992882977484, + "grad_norm": 1.1117017269134521, + "learning_rate": 0.00013609782542804325, + "loss": 1.9585, + "step": 19877 + }, + { + "epoch": 2.31921596079804, + "grad_norm": 1.0494645833969116, + "learning_rate": 0.00013608278708964066, + "loss": 1.957, + "step": 19878 + }, + { + "epoch": 2.3193326332983317, + "grad_norm": 1.270467758178711, + "learning_rate": 0.0001360677489110666, + "loss": 2.0591, + "step": 19879 + }, + { + "epoch": 2.3194493057986234, + "grad_norm": 1.0852347612380981, + "learning_rate": 0.0001360527108924771, + "loss": 1.9899, + "step": 19880 + }, + { + "epoch": 2.319565978298915, + "grad_norm": 1.0358918905258179, + "learning_rate": 0.00013603767303402796, + "loss": 1.8489, + "step": 19881 + }, + { + "epoch": 2.3196826507992068, + "grad_norm": 1.0649361610412598, + "learning_rate": 0.00013602263533587518, + "loss": 1.7315, + "step": 19882 + }, + { + "epoch": 2.3197993232994985, + "grad_norm": 1.0704416036605835, + "learning_rate": 0.00013600759779817454, + "loss": 1.9611, + "step": 19883 + }, + { + "epoch": 2.31991599579979, + "grad_norm": 1.2429193258285522, + "learning_rate": 0.000135992560421082, + "loss": 1.9748, + "step": 19884 + }, + { + "epoch": 2.320032668300082, + "grad_norm": 1.4233933687210083, + "learning_rate": 0.00013597752320475346, + "loss": 1.877, + "step": 19885 + }, + { + "epoch": 2.3201493408003735, + "grad_norm": 1.1844773292541504, + "learning_rate": 0.00013596248614934476, + "loss": 1.972, + "step": 19886 + }, + { + "epoch": 2.320266013300665, + "grad_norm": 1.2547080516815186, + "learning_rate": 0.00013594744925501183, + "loss": 2.0954, + "step": 19887 + }, + { + "epoch": 2.320382685800957, + "grad_norm": 1.428360939025879, + "learning_rate": 0.00013593241252191052, + "loss": 2.0865, + "step": 19888 + }, + { + "epoch": 2.3204993583012485, + "grad_norm": 1.1755414009094238, + "learning_rate": 0.00013591737595019678, + "loss": 2.0869, + "step": 19889 + }, + { + "epoch": 2.32061603080154, + "grad_norm": 1.2671128511428833, + "learning_rate": 0.00013590233954002636, + "loss": 2.1586, + "step": 19890 + }, + { + "epoch": 2.320732703301832, + "grad_norm": 1.299851417541504, + "learning_rate": 0.00013588730329155532, + "loss": 1.9004, + "step": 19891 + }, + { + "epoch": 2.3208493758021236, + "grad_norm": 1.459449291229248, + "learning_rate": 0.00013587226720493942, + "loss": 1.9495, + "step": 19892 + }, + { + "epoch": 2.3209660483024153, + "grad_norm": 1.2191071510314941, + "learning_rate": 0.0001358572312803346, + "loss": 2.1398, + "step": 19893 + }, + { + "epoch": 2.321082720802707, + "grad_norm": 1.307478427886963, + "learning_rate": 0.0001358421955178966, + "loss": 2.0633, + "step": 19894 + }, + { + "epoch": 2.3211993933029986, + "grad_norm": 1.0869728326797485, + "learning_rate": 0.00013582715991778148, + "loss": 1.9178, + "step": 19895 + }, + { + "epoch": 2.3213160658032903, + "grad_norm": 1.0488054752349854, + "learning_rate": 0.00013581212448014496, + "loss": 2.1041, + "step": 19896 + }, + { + "epoch": 2.321432738303582, + "grad_norm": 1.2841042280197144, + "learning_rate": 0.00013579708920514295, + "loss": 2.1446, + "step": 19897 + }, + { + "epoch": 2.3215494108038737, + "grad_norm": 1.1995041370391846, + "learning_rate": 0.00013578205409293145, + "loss": 1.9583, + "step": 19898 + }, + { + "epoch": 2.3216660833041654, + "grad_norm": 1.1081860065460205, + "learning_rate": 0.0001357670191436662, + "loss": 1.8895, + "step": 19899 + }, + { + "epoch": 2.321782755804457, + "grad_norm": 1.2227073907852173, + "learning_rate": 0.00013575198435750307, + "loss": 2.0781, + "step": 19900 + }, + { + "epoch": 2.3218994283047487, + "grad_norm": 1.153924822807312, + "learning_rate": 0.00013573694973459792, + "loss": 1.9795, + "step": 19901 + }, + { + "epoch": 2.3220161008050404, + "grad_norm": 1.1686038970947266, + "learning_rate": 0.00013572191527510668, + "loss": 1.7967, + "step": 19902 + }, + { + "epoch": 2.322132773305332, + "grad_norm": 1.2107551097869873, + "learning_rate": 0.00013570688097918512, + "loss": 1.9503, + "step": 19903 + }, + { + "epoch": 2.3222494458056238, + "grad_norm": 1.2442841529846191, + "learning_rate": 0.0001356918468469892, + "loss": 1.933, + "step": 19904 + }, + { + "epoch": 2.3223661183059154, + "grad_norm": 1.156247854232788, + "learning_rate": 0.00013567681287867466, + "loss": 2.0367, + "step": 19905 + }, + { + "epoch": 2.322482790806207, + "grad_norm": 1.2368685007095337, + "learning_rate": 0.0001356617790743975, + "loss": 1.9384, + "step": 19906 + }, + { + "epoch": 2.322599463306499, + "grad_norm": 1.0516079664230347, + "learning_rate": 0.00013564674543431338, + "loss": 1.778, + "step": 19907 + }, + { + "epoch": 2.3227161358067905, + "grad_norm": 1.1415730714797974, + "learning_rate": 0.00013563171195857837, + "loss": 2.1579, + "step": 19908 + }, + { + "epoch": 2.322832808307082, + "grad_norm": 1.1171422004699707, + "learning_rate": 0.0001356166786473482, + "loss": 1.9718, + "step": 19909 + }, + { + "epoch": 2.322949480807374, + "grad_norm": 1.2755717039108276, + "learning_rate": 0.00013560164550077862, + "loss": 1.8921, + "step": 19910 + }, + { + "epoch": 2.3230661533076655, + "grad_norm": 1.177750825881958, + "learning_rate": 0.00013558661251902572, + "loss": 2.0753, + "step": 19911 + }, + { + "epoch": 2.323182825807957, + "grad_norm": 1.3069591522216797, + "learning_rate": 0.00013557157970224513, + "loss": 1.9001, + "step": 19912 + }, + { + "epoch": 2.323299498308249, + "grad_norm": 1.3277174234390259, + "learning_rate": 0.00013555654705059287, + "loss": 2.0169, + "step": 19913 + }, + { + "epoch": 2.3234161708085406, + "grad_norm": 1.0564193725585938, + "learning_rate": 0.0001355415145642246, + "loss": 1.9065, + "step": 19914 + }, + { + "epoch": 2.3235328433088323, + "grad_norm": 1.1642872095108032, + "learning_rate": 0.00013552648224329634, + "loss": 2.11, + "step": 19915 + }, + { + "epoch": 2.323649515809124, + "grad_norm": 1.054084300994873, + "learning_rate": 0.0001355114500879638, + "loss": 2.0179, + "step": 19916 + }, + { + "epoch": 2.3237661883094156, + "grad_norm": 1.0649440288543701, + "learning_rate": 0.00013549641809838282, + "loss": 2.0379, + "step": 19917 + }, + { + "epoch": 2.3238828608097073, + "grad_norm": 1.1038724184036255, + "learning_rate": 0.0001354813862747093, + "loss": 2.0648, + "step": 19918 + }, + { + "epoch": 2.323999533309999, + "grad_norm": 1.1489121913909912, + "learning_rate": 0.00013546635461709908, + "loss": 1.9169, + "step": 19919 + }, + { + "epoch": 2.3241162058102907, + "grad_norm": 1.254459261894226, + "learning_rate": 0.00013545132312570791, + "loss": 2.0972, + "step": 19920 + }, + { + "epoch": 2.3242328783105823, + "grad_norm": 1.1261600255966187, + "learning_rate": 0.0001354362918006917, + "loss": 2.0055, + "step": 19921 + }, + { + "epoch": 2.324349550810874, + "grad_norm": 1.074090838432312, + "learning_rate": 0.0001354212606422062, + "loss": 1.8957, + "step": 19922 + }, + { + "epoch": 2.3244662233111657, + "grad_norm": 1.3262537717819214, + "learning_rate": 0.00013540622965040725, + "loss": 2.1129, + "step": 19923 + }, + { + "epoch": 2.3245828958114574, + "grad_norm": 0.9505627155303955, + "learning_rate": 0.0001353911988254508, + "loss": 1.9175, + "step": 19924 + }, + { + "epoch": 2.324699568311749, + "grad_norm": 1.1711124181747437, + "learning_rate": 0.00013537616816749253, + "loss": 2.1007, + "step": 19925 + }, + { + "epoch": 2.3248162408120407, + "grad_norm": 1.2661073207855225, + "learning_rate": 0.00013536113767668835, + "loss": 1.9629, + "step": 19926 + }, + { + "epoch": 2.3249329133123324, + "grad_norm": 1.2435317039489746, + "learning_rate": 0.000135346107353194, + "loss": 1.879, + "step": 19927 + }, + { + "epoch": 2.325049585812624, + "grad_norm": 1.0839552879333496, + "learning_rate": 0.00013533107719716536, + "loss": 2.0139, + "step": 19928 + }, + { + "epoch": 2.325166258312916, + "grad_norm": 1.2331868410110474, + "learning_rate": 0.00013531604720875816, + "loss": 2.0807, + "step": 19929 + }, + { + "epoch": 2.3252829308132075, + "grad_norm": 1.1373401880264282, + "learning_rate": 0.0001353010173881284, + "loss": 1.8312, + "step": 19930 + }, + { + "epoch": 2.325399603313499, + "grad_norm": 1.3571654558181763, + "learning_rate": 0.00013528598773543163, + "loss": 2.2401, + "step": 19931 + }, + { + "epoch": 2.325516275813791, + "grad_norm": 1.2140352725982666, + "learning_rate": 0.0001352709582508239, + "loss": 1.9832, + "step": 19932 + }, + { + "epoch": 2.3256329483140825, + "grad_norm": 1.0029234886169434, + "learning_rate": 0.00013525592893446088, + "loss": 2.0785, + "step": 19933 + }, + { + "epoch": 2.325749620814374, + "grad_norm": 1.2526559829711914, + "learning_rate": 0.00013524089978649844, + "loss": 2.1676, + "step": 19934 + }, + { + "epoch": 2.325866293314666, + "grad_norm": 1.066035270690918, + "learning_rate": 0.00013522587080709236, + "loss": 1.9681, + "step": 19935 + }, + { + "epoch": 2.3259829658149576, + "grad_norm": 1.369924783706665, + "learning_rate": 0.0001352108419963984, + "loss": 2.156, + "step": 19936 + }, + { + "epoch": 2.3260996383152492, + "grad_norm": 1.0459864139556885, + "learning_rate": 0.00013519581335457245, + "loss": 1.8845, + "step": 19937 + }, + { + "epoch": 2.326216310815541, + "grad_norm": 1.2962356805801392, + "learning_rate": 0.00013518078488177024, + "loss": 2.0563, + "step": 19938 + }, + { + "epoch": 2.3263329833158326, + "grad_norm": 1.10863196849823, + "learning_rate": 0.00013516575657814765, + "loss": 1.9208, + "step": 19939 + }, + { + "epoch": 2.3264496558161243, + "grad_norm": 1.220320701599121, + "learning_rate": 0.00013515072844386036, + "loss": 1.9598, + "step": 19940 + }, + { + "epoch": 2.326566328316416, + "grad_norm": 1.071296215057373, + "learning_rate": 0.0001351357004790643, + "loss": 1.9815, + "step": 19941 + }, + { + "epoch": 2.3266830008167076, + "grad_norm": 1.2927849292755127, + "learning_rate": 0.00013512067268391516, + "loss": 2.0511, + "step": 19942 + }, + { + "epoch": 2.3267996733169993, + "grad_norm": 0.9876407384872437, + "learning_rate": 0.00013510564505856874, + "loss": 1.9053, + "step": 19943 + }, + { + "epoch": 2.326916345817291, + "grad_norm": 1.0898669958114624, + "learning_rate": 0.00013509061760318087, + "loss": 2.0795, + "step": 19944 + }, + { + "epoch": 2.3270330183175827, + "grad_norm": 1.0585449934005737, + "learning_rate": 0.00013507559031790737, + "loss": 1.8307, + "step": 19945 + }, + { + "epoch": 2.3271496908178744, + "grad_norm": 1.1499165296554565, + "learning_rate": 0.0001350605632029039, + "loss": 1.9135, + "step": 19946 + }, + { + "epoch": 2.327266363318166, + "grad_norm": 1.2830846309661865, + "learning_rate": 0.0001350455362583264, + "loss": 1.9172, + "step": 19947 + }, + { + "epoch": 2.3273830358184577, + "grad_norm": 1.432426929473877, + "learning_rate": 0.0001350305094843305, + "loss": 2.0896, + "step": 19948 + }, + { + "epoch": 2.3274997083187494, + "grad_norm": 1.239604115486145, + "learning_rate": 0.00013501548288107206, + "loss": 2.0642, + "step": 19949 + }, + { + "epoch": 2.327616380819041, + "grad_norm": 1.2951595783233643, + "learning_rate": 0.0001350004564487069, + "loss": 2.0762, + "step": 19950 + }, + { + "epoch": 2.327733053319333, + "grad_norm": 1.3224413394927979, + "learning_rate": 0.00013498543018739076, + "loss": 2.0529, + "step": 19951 + }, + { + "epoch": 2.3278497258196245, + "grad_norm": 1.1322786808013916, + "learning_rate": 0.0001349704040972794, + "loss": 1.9781, + "step": 19952 + }, + { + "epoch": 2.327966398319916, + "grad_norm": 1.2161608934402466, + "learning_rate": 0.00013495537817852858, + "loss": 2.1157, + "step": 19953 + }, + { + "epoch": 2.328083070820208, + "grad_norm": 1.1953504085540771, + "learning_rate": 0.00013494035243129414, + "loss": 1.9347, + "step": 19954 + }, + { + "epoch": 2.3281997433204995, + "grad_norm": 1.0544054508209229, + "learning_rate": 0.0001349253268557317, + "loss": 2.0383, + "step": 19955 + }, + { + "epoch": 2.328316415820791, + "grad_norm": 1.0714088678359985, + "learning_rate": 0.00013491030145199726, + "loss": 2.0402, + "step": 19956 + }, + { + "epoch": 2.328433088321083, + "grad_norm": 1.0671182870864868, + "learning_rate": 0.00013489527622024638, + "loss": 1.9966, + "step": 19957 + }, + { + "epoch": 2.3285497608213745, + "grad_norm": 1.1193946599960327, + "learning_rate": 0.00013488025116063497, + "loss": 1.8779, + "step": 19958 + }, + { + "epoch": 2.3286664333216662, + "grad_norm": 1.2783176898956299, + "learning_rate": 0.00013486522627331865, + "loss": 2.0434, + "step": 19959 + }, + { + "epoch": 2.328783105821958, + "grad_norm": 1.1623953580856323, + "learning_rate": 0.00013485020155845333, + "loss": 2.0074, + "step": 19960 + }, + { + "epoch": 2.3288997783222496, + "grad_norm": 1.3141018152236938, + "learning_rate": 0.00013483517701619465, + "loss": 2.1129, + "step": 19961 + }, + { + "epoch": 2.3290164508225413, + "grad_norm": 1.06935715675354, + "learning_rate": 0.00013482015264669844, + "loss": 1.9702, + "step": 19962 + }, + { + "epoch": 2.329133123322833, + "grad_norm": 1.1581611633300781, + "learning_rate": 0.00013480512845012046, + "loss": 2.0755, + "step": 19963 + }, + { + "epoch": 2.3292497958231246, + "grad_norm": 1.0793383121490479, + "learning_rate": 0.0001347901044266164, + "loss": 1.8672, + "step": 19964 + }, + { + "epoch": 2.3293664683234163, + "grad_norm": 1.3108590841293335, + "learning_rate": 0.00013477508057634208, + "loss": 2.1228, + "step": 19965 + }, + { + "epoch": 2.329483140823708, + "grad_norm": 1.0139845609664917, + "learning_rate": 0.00013476005689945316, + "loss": 1.9359, + "step": 19966 + }, + { + "epoch": 2.3295998133239997, + "grad_norm": 1.0428613424301147, + "learning_rate": 0.00013474503339610551, + "loss": 2.0791, + "step": 19967 + }, + { + "epoch": 2.3297164858242914, + "grad_norm": 1.1606353521347046, + "learning_rate": 0.0001347300100664548, + "loss": 2.0427, + "step": 19968 + }, + { + "epoch": 2.329833158324583, + "grad_norm": 1.3105463981628418, + "learning_rate": 0.0001347149869106568, + "loss": 2.2531, + "step": 19969 + }, + { + "epoch": 2.3299498308248747, + "grad_norm": 1.3563498258590698, + "learning_rate": 0.00013469996392886723, + "loss": 1.9242, + "step": 19970 + }, + { + "epoch": 2.3300665033251664, + "grad_norm": 1.1681580543518066, + "learning_rate": 0.00013468494112124188, + "loss": 1.9006, + "step": 19971 + }, + { + "epoch": 2.330183175825458, + "grad_norm": 1.1078850030899048, + "learning_rate": 0.00013466991848793642, + "loss": 1.9932, + "step": 19972 + }, + { + "epoch": 2.3302998483257498, + "grad_norm": 1.054371953010559, + "learning_rate": 0.00013465489602910665, + "loss": 1.9861, + "step": 19973 + }, + { + "epoch": 2.3304165208260414, + "grad_norm": 1.2721391916275024, + "learning_rate": 0.00013463987374490825, + "loss": 1.9103, + "step": 19974 + }, + { + "epoch": 2.330533193326333, + "grad_norm": 0.9842793941497803, + "learning_rate": 0.00013462485163549698, + "loss": 1.7423, + "step": 19975 + }, + { + "epoch": 2.330649865826625, + "grad_norm": 1.1844264268875122, + "learning_rate": 0.0001346098297010286, + "loss": 1.9259, + "step": 19976 + }, + { + "epoch": 2.3307665383269165, + "grad_norm": 1.1659138202667236, + "learning_rate": 0.00013459480794165884, + "loss": 1.9461, + "step": 19977 + }, + { + "epoch": 2.330883210827208, + "grad_norm": 1.2427250146865845, + "learning_rate": 0.00013457978635754344, + "loss": 1.9828, + "step": 19978 + }, + { + "epoch": 2.3309998833275, + "grad_norm": 1.0861629247665405, + "learning_rate": 0.000134564764948838, + "loss": 1.8173, + "step": 19979 + }, + { + "epoch": 2.3311165558277915, + "grad_norm": 0.935529887676239, + "learning_rate": 0.00013454974371569843, + "loss": 1.9591, + "step": 19980 + }, + { + "epoch": 2.331233228328083, + "grad_norm": 1.3387868404388428, + "learning_rate": 0.00013453472265828026, + "loss": 1.9382, + "step": 19981 + }, + { + "epoch": 2.331349900828375, + "grad_norm": 1.1194080114364624, + "learning_rate": 0.0001345197017767394, + "loss": 2.1479, + "step": 19982 + }, + { + "epoch": 2.3314665733286666, + "grad_norm": 1.1020151376724243, + "learning_rate": 0.00013450468107123145, + "loss": 1.891, + "step": 19983 + }, + { + "epoch": 2.3315832458289583, + "grad_norm": 1.291694164276123, + "learning_rate": 0.00013448966054191222, + "loss": 1.9604, + "step": 19984 + }, + { + "epoch": 2.33169991832925, + "grad_norm": 1.1855148077011108, + "learning_rate": 0.00013447464018893728, + "loss": 1.919, + "step": 19985 + }, + { + "epoch": 2.3318165908295416, + "grad_norm": 1.159856915473938, + "learning_rate": 0.0001344596200124625, + "loss": 2.0455, + "step": 19986 + }, + { + "epoch": 2.3319332633298333, + "grad_norm": 1.008846640586853, + "learning_rate": 0.00013444460001264352, + "loss": 2.0204, + "step": 19987 + }, + { + "epoch": 2.332049935830125, + "grad_norm": 1.3820523023605347, + "learning_rate": 0.000134429580189636, + "loss": 2.1081, + "step": 19988 + }, + { + "epoch": 2.3321666083304167, + "grad_norm": 1.127837896347046, + "learning_rate": 0.0001344145605435958, + "loss": 2.0952, + "step": 19989 + }, + { + "epoch": 2.3322832808307083, + "grad_norm": 1.1268595457077026, + "learning_rate": 0.00013439954107467847, + "loss": 1.9915, + "step": 19990 + }, + { + "epoch": 2.332399953331, + "grad_norm": 1.130100965499878, + "learning_rate": 0.00013438452178303982, + "loss": 1.9161, + "step": 19991 + }, + { + "epoch": 2.3325166258312917, + "grad_norm": 1.149368166923523, + "learning_rate": 0.00013436950266883544, + "loss": 2.0667, + "step": 19992 + }, + { + "epoch": 2.3326332983315834, + "grad_norm": 0.9595828056335449, + "learning_rate": 0.0001343544837322212, + "loss": 1.6501, + "step": 19993 + }, + { + "epoch": 2.332749970831875, + "grad_norm": 1.422331690788269, + "learning_rate": 0.00013433946497335266, + "loss": 1.9014, + "step": 19994 + }, + { + "epoch": 2.3328666433321668, + "grad_norm": 1.3580793142318726, + "learning_rate": 0.00013432444639238558, + "loss": 2.0157, + "step": 19995 + }, + { + "epoch": 2.3329833158324584, + "grad_norm": 1.2953758239746094, + "learning_rate": 0.00013430942798947558, + "loss": 1.8931, + "step": 19996 + }, + { + "epoch": 2.33309998833275, + "grad_norm": 1.2249393463134766, + "learning_rate": 0.00013429440976477848, + "loss": 1.9829, + "step": 19997 + }, + { + "epoch": 2.333216660833042, + "grad_norm": 1.250049352645874, + "learning_rate": 0.00013427939171844984, + "loss": 2.0171, + "step": 19998 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.0792618989944458, + "learning_rate": 0.00013426437385064547, + "loss": 1.8727, + "step": 19999 + }, + { + "epoch": 2.333450005833625, + "grad_norm": 1.2517582178115845, + "learning_rate": 0.00013424935616152095, + "loss": 1.9474, + "step": 20000 + }, + { + "epoch": 2.333566678333917, + "grad_norm": 1.3431271314620972, + "learning_rate": 0.00013423433865123206, + "loss": 1.9549, + "step": 20001 + }, + { + "epoch": 2.3336833508342085, + "grad_norm": 1.055233359336853, + "learning_rate": 0.00013421932131993446, + "loss": 1.7778, + "step": 20002 + }, + { + "epoch": 2.3338000233345, + "grad_norm": 1.1401166915893555, + "learning_rate": 0.00013420430416778378, + "loss": 2.0544, + "step": 20003 + }, + { + "epoch": 2.333916695834792, + "grad_norm": 1.2620525360107422, + "learning_rate": 0.00013418928719493577, + "loss": 1.9556, + "step": 20004 + }, + { + "epoch": 2.3340333683350836, + "grad_norm": 1.1581984758377075, + "learning_rate": 0.00013417427040154602, + "loss": 1.9225, + "step": 20005 + }, + { + "epoch": 2.3341500408353753, + "grad_norm": 1.2699790000915527, + "learning_rate": 0.00013415925378777035, + "loss": 1.9923, + "step": 20006 + }, + { + "epoch": 2.334266713335667, + "grad_norm": 1.227298378944397, + "learning_rate": 0.00013414423735376427, + "loss": 1.9197, + "step": 20007 + }, + { + "epoch": 2.3343833858359586, + "grad_norm": 1.2368948459625244, + "learning_rate": 0.00013412922109968362, + "loss": 1.9165, + "step": 20008 + }, + { + "epoch": 2.3345000583362503, + "grad_norm": 1.2249913215637207, + "learning_rate": 0.00013411420502568391, + "loss": 2.1446, + "step": 20009 + }, + { + "epoch": 2.334616730836542, + "grad_norm": 1.2755643129348755, + "learning_rate": 0.00013409918913192094, + "loss": 1.9912, + "step": 20010 + }, + { + "epoch": 2.3347334033368337, + "grad_norm": 1.1478074789047241, + "learning_rate": 0.0001340841734185503, + "loss": 1.9041, + "step": 20011 + }, + { + "epoch": 2.3348500758371253, + "grad_norm": 1.2367219924926758, + "learning_rate": 0.00013406915788572772, + "loss": 2.1562, + "step": 20012 + }, + { + "epoch": 2.334966748337417, + "grad_norm": 1.1326440572738647, + "learning_rate": 0.00013405414253360876, + "loss": 1.9115, + "step": 20013 + }, + { + "epoch": 2.3350834208377087, + "grad_norm": 1.1397966146469116, + "learning_rate": 0.00013403912736234923, + "loss": 1.9627, + "step": 20014 + }, + { + "epoch": 2.3352000933380004, + "grad_norm": 1.1783922910690308, + "learning_rate": 0.00013402411237210465, + "loss": 2.1242, + "step": 20015 + }, + { + "epoch": 2.335316765838292, + "grad_norm": 1.211215853691101, + "learning_rate": 0.00013400909756303072, + "loss": 1.9619, + "step": 20016 + }, + { + "epoch": 2.3354334383385837, + "grad_norm": 1.265529990196228, + "learning_rate": 0.00013399408293528317, + "loss": 2.0166, + "step": 20017 + }, + { + "epoch": 2.3355501108388754, + "grad_norm": 1.135179042816162, + "learning_rate": 0.00013397906848901754, + "loss": 1.9444, + "step": 20018 + }, + { + "epoch": 2.335666783339167, + "grad_norm": 1.3276416063308716, + "learning_rate": 0.00013396405422438963, + "loss": 1.9845, + "step": 20019 + }, + { + "epoch": 2.335783455839459, + "grad_norm": 1.310968279838562, + "learning_rate": 0.00013394904014155494, + "loss": 2.0154, + "step": 20020 + }, + { + "epoch": 2.3359001283397505, + "grad_norm": 1.0605896711349487, + "learning_rate": 0.00013393402624066925, + "loss": 1.9468, + "step": 20021 + }, + { + "epoch": 2.336016800840042, + "grad_norm": 1.2175648212432861, + "learning_rate": 0.00013391901252188804, + "loss": 1.9578, + "step": 20022 + }, + { + "epoch": 2.336133473340334, + "grad_norm": 1.1761566400527954, + "learning_rate": 0.00013390399898536716, + "loss": 1.8975, + "step": 20023 + }, + { + "epoch": 2.3362501458406255, + "grad_norm": 1.2540761232376099, + "learning_rate": 0.00013388898563126205, + "loss": 1.9505, + "step": 20024 + }, + { + "epoch": 2.336366818340917, + "grad_norm": 1.1184824705123901, + "learning_rate": 0.00013387397245972856, + "loss": 1.9541, + "step": 20025 + }, + { + "epoch": 2.336483490841209, + "grad_norm": 1.2362784147262573, + "learning_rate": 0.00013385895947092214, + "loss": 1.9346, + "step": 20026 + }, + { + "epoch": 2.3366001633415006, + "grad_norm": 1.217559576034546, + "learning_rate": 0.00013384394666499856, + "loss": 2.0028, + "step": 20027 + }, + { + "epoch": 2.3367168358417922, + "grad_norm": 1.1457325220108032, + "learning_rate": 0.00013382893404211335, + "loss": 1.9751, + "step": 20028 + }, + { + "epoch": 2.336833508342084, + "grad_norm": 1.2137807607650757, + "learning_rate": 0.00013381392160242224, + "loss": 2.1442, + "step": 20029 + }, + { + "epoch": 2.3369501808423756, + "grad_norm": 1.2427016496658325, + "learning_rate": 0.00013379890934608085, + "loss": 1.8964, + "step": 20030 + }, + { + "epoch": 2.3370668533426673, + "grad_norm": 1.115727424621582, + "learning_rate": 0.00013378389727324472, + "loss": 1.9761, + "step": 20031 + }, + { + "epoch": 2.337183525842959, + "grad_norm": 1.0806833505630493, + "learning_rate": 0.00013376888538406962, + "loss": 1.9108, + "step": 20032 + }, + { + "epoch": 2.3373001983432506, + "grad_norm": 1.1239135265350342, + "learning_rate": 0.00013375387367871102, + "loss": 2.0335, + "step": 20033 + }, + { + "epoch": 2.3374168708435423, + "grad_norm": 1.2863744497299194, + "learning_rate": 0.0001337388621573247, + "loss": 2.0566, + "step": 20034 + }, + { + "epoch": 2.337533543343834, + "grad_norm": 1.2083393335342407, + "learning_rate": 0.00013372385082006615, + "loss": 2.0205, + "step": 20035 + }, + { + "epoch": 2.3376502158441257, + "grad_norm": 1.1324381828308105, + "learning_rate": 0.0001337088396670911, + "loss": 1.843, + "step": 20036 + }, + { + "epoch": 2.3377668883444174, + "grad_norm": 1.2873914241790771, + "learning_rate": 0.00013369382869855506, + "loss": 2.0238, + "step": 20037 + }, + { + "epoch": 2.337883560844709, + "grad_norm": 1.1882646083831787, + "learning_rate": 0.00013367881791461375, + "loss": 1.9537, + "step": 20038 + }, + { + "epoch": 2.3380002333450007, + "grad_norm": 1.0834826231002808, + "learning_rate": 0.00013366380731542272, + "loss": 1.8687, + "step": 20039 + }, + { + "epoch": 2.3381169058452924, + "grad_norm": 1.1688638925552368, + "learning_rate": 0.00013364879690113763, + "loss": 1.9949, + "step": 20040 + }, + { + "epoch": 2.338233578345584, + "grad_norm": 1.272693395614624, + "learning_rate": 0.000133633786671914, + "loss": 2.1278, + "step": 20041 + }, + { + "epoch": 2.3383502508458758, + "grad_norm": 1.0097064971923828, + "learning_rate": 0.00013361877662790753, + "loss": 1.8681, + "step": 20042 + }, + { + "epoch": 2.3384669233461675, + "grad_norm": 1.1518511772155762, + "learning_rate": 0.00013360376676927385, + "loss": 1.9342, + "step": 20043 + }, + { + "epoch": 2.338583595846459, + "grad_norm": 1.1614125967025757, + "learning_rate": 0.00013358875709616841, + "loss": 1.8827, + "step": 20044 + }, + { + "epoch": 2.338700268346751, + "grad_norm": 1.0526955127716064, + "learning_rate": 0.00013357374760874705, + "loss": 2.0211, + "step": 20045 + }, + { + "epoch": 2.3388169408470425, + "grad_norm": 1.2486897706985474, + "learning_rate": 0.00013355873830716517, + "loss": 1.8969, + "step": 20046 + }, + { + "epoch": 2.338933613347334, + "grad_norm": 1.349603295326233, + "learning_rate": 0.0001335437291915785, + "loss": 1.9535, + "step": 20047 + }, + { + "epoch": 2.339050285847626, + "grad_norm": 1.1521055698394775, + "learning_rate": 0.00013352872026214246, + "loss": 2.0479, + "step": 20048 + }, + { + "epoch": 2.3391669583479175, + "grad_norm": 1.2166755199432373, + "learning_rate": 0.0001335137115190129, + "loss": 1.8878, + "step": 20049 + }, + { + "epoch": 2.3392836308482092, + "grad_norm": 1.09147047996521, + "learning_rate": 0.00013349870296234517, + "loss": 1.7751, + "step": 20050 + }, + { + "epoch": 2.339400303348501, + "grad_norm": 1.1322972774505615, + "learning_rate": 0.00013348369459229508, + "loss": 1.9919, + "step": 20051 + }, + { + "epoch": 2.3395169758487926, + "grad_norm": 1.1266629695892334, + "learning_rate": 0.000133468686409018, + "loss": 2.0214, + "step": 20052 + }, + { + "epoch": 2.3396336483490843, + "grad_norm": 1.1229320764541626, + "learning_rate": 0.00013345367841266972, + "loss": 1.8689, + "step": 20053 + }, + { + "epoch": 2.339750320849376, + "grad_norm": 1.1761196851730347, + "learning_rate": 0.00013343867060340564, + "loss": 1.8624, + "step": 20054 + }, + { + "epoch": 2.3398669933496676, + "grad_norm": 1.0035598278045654, + "learning_rate": 0.00013342366298138152, + "loss": 1.9674, + "step": 20055 + }, + { + "epoch": 2.3399836658499593, + "grad_norm": 1.0840669870376587, + "learning_rate": 0.00013340865554675288, + "loss": 2.015, + "step": 20056 + }, + { + "epoch": 2.340100338350251, + "grad_norm": 1.146730899810791, + "learning_rate": 0.00013339364829967522, + "loss": 1.91, + "step": 20057 + }, + { + "epoch": 2.3402170108505427, + "grad_norm": 1.0889736413955688, + "learning_rate": 0.00013337864124030423, + "loss": 1.9155, + "step": 20058 + }, + { + "epoch": 2.3403336833508344, + "grad_norm": 1.231439232826233, + "learning_rate": 0.0001333636343687954, + "loss": 2.179, + "step": 20059 + }, + { + "epoch": 2.340450355851126, + "grad_norm": 1.2440688610076904, + "learning_rate": 0.0001333486276853044, + "loss": 2.049, + "step": 20060 + }, + { + "epoch": 2.3405670283514177, + "grad_norm": 1.1841386556625366, + "learning_rate": 0.00013333362118998665, + "loss": 1.9277, + "step": 20061 + }, + { + "epoch": 2.3406837008517094, + "grad_norm": 1.299710988998413, + "learning_rate": 0.00013331861488299795, + "loss": 2.2036, + "step": 20062 + }, + { + "epoch": 2.340800373352001, + "grad_norm": 1.166414737701416, + "learning_rate": 0.00013330360876449366, + "loss": 1.9199, + "step": 20063 + }, + { + "epoch": 2.3409170458522928, + "grad_norm": 1.1491318941116333, + "learning_rate": 0.00013328860283462943, + "loss": 1.9814, + "step": 20064 + }, + { + "epoch": 2.3410337183525844, + "grad_norm": 1.0947182178497314, + "learning_rate": 0.0001332735970935608, + "loss": 1.8927, + "step": 20065 + }, + { + "epoch": 2.341150390852876, + "grad_norm": 1.3655776977539062, + "learning_rate": 0.0001332585915414434, + "loss": 2.1119, + "step": 20066 + }, + { + "epoch": 2.341267063353168, + "grad_norm": 1.1213077306747437, + "learning_rate": 0.00013324358617843268, + "loss": 1.876, + "step": 20067 + }, + { + "epoch": 2.3413837358534595, + "grad_norm": 1.3253214359283447, + "learning_rate": 0.00013322858100468424, + "loss": 2.0793, + "step": 20068 + }, + { + "epoch": 2.341500408353751, + "grad_norm": 1.0658366680145264, + "learning_rate": 0.00013321357602035376, + "loss": 1.9428, + "step": 20069 + }, + { + "epoch": 2.341617080854043, + "grad_norm": 0.9676355719566345, + "learning_rate": 0.00013319857122559662, + "loss": 1.8723, + "step": 20070 + }, + { + "epoch": 2.3417337533543345, + "grad_norm": 1.025099515914917, + "learning_rate": 0.00013318356662056846, + "loss": 1.9368, + "step": 20071 + }, + { + "epoch": 2.341850425854626, + "grad_norm": 1.0728914737701416, + "learning_rate": 0.00013316856220542482, + "loss": 1.9303, + "step": 20072 + }, + { + "epoch": 2.341967098354918, + "grad_norm": 1.0679608583450317, + "learning_rate": 0.00013315355798032127, + "loss": 1.9907, + "step": 20073 + }, + { + "epoch": 2.3420837708552096, + "grad_norm": 1.3955662250518799, + "learning_rate": 0.0001331385539454133, + "loss": 2.0109, + "step": 20074 + }, + { + "epoch": 2.3422004433555013, + "grad_norm": 1.2837550640106201, + "learning_rate": 0.00013312355010085651, + "loss": 1.9206, + "step": 20075 + }, + { + "epoch": 2.342317115855793, + "grad_norm": 1.1800585985183716, + "learning_rate": 0.0001331085464468064, + "loss": 2.0487, + "step": 20076 + }, + { + "epoch": 2.3424337883560846, + "grad_norm": 1.0362153053283691, + "learning_rate": 0.00013309354298341857, + "loss": 1.8856, + "step": 20077 + }, + { + "epoch": 2.3425504608563763, + "grad_norm": 1.1103304624557495, + "learning_rate": 0.00013307853971084846, + "loss": 2.0183, + "step": 20078 + }, + { + "epoch": 2.342667133356668, + "grad_norm": 1.2677545547485352, + "learning_rate": 0.00013306353662925175, + "loss": 2.029, + "step": 20079 + }, + { + "epoch": 2.3427838058569597, + "grad_norm": 1.2331211566925049, + "learning_rate": 0.00013304853373878386, + "loss": 1.8872, + "step": 20080 + }, + { + "epoch": 2.3429004783572513, + "grad_norm": 1.1302050352096558, + "learning_rate": 0.0001330335310396003, + "loss": 1.8445, + "step": 20081 + }, + { + "epoch": 2.343017150857543, + "grad_norm": 1.314368724822998, + "learning_rate": 0.00013301852853185676, + "loss": 1.9838, + "step": 20082 + }, + { + "epoch": 2.3431338233578347, + "grad_norm": 1.395270824432373, + "learning_rate": 0.00013300352621570857, + "loss": 2.0427, + "step": 20083 + }, + { + "epoch": 2.3432504958581264, + "grad_norm": 1.1101857423782349, + "learning_rate": 0.00013298852409131143, + "loss": 1.8925, + "step": 20084 + }, + { + "epoch": 2.343367168358418, + "grad_norm": 1.1708534955978394, + "learning_rate": 0.00013297352215882073, + "loss": 1.9374, + "step": 20085 + }, + { + "epoch": 2.3434838408587098, + "grad_norm": 1.3066892623901367, + "learning_rate": 0.00013295852041839213, + "loss": 2.1073, + "step": 20086 + }, + { + "epoch": 2.3436005133590014, + "grad_norm": 1.3345462083816528, + "learning_rate": 0.00013294351887018104, + "loss": 1.9594, + "step": 20087 + }, + { + "epoch": 2.343717185859293, + "grad_norm": 1.1326007843017578, + "learning_rate": 0.00013292851751434302, + "loss": 1.8546, + "step": 20088 + }, + { + "epoch": 2.343833858359585, + "grad_norm": 0.9747181534767151, + "learning_rate": 0.0001329135163510336, + "loss": 1.8186, + "step": 20089 + }, + { + "epoch": 2.3439505308598765, + "grad_norm": 1.0954738855361938, + "learning_rate": 0.00013289851538040827, + "loss": 1.753, + "step": 20090 + }, + { + "epoch": 2.344067203360168, + "grad_norm": 1.1802736520767212, + "learning_rate": 0.0001328835146026225, + "loss": 2.0696, + "step": 20091 + }, + { + "epoch": 2.34418387586046, + "grad_norm": 1.1680009365081787, + "learning_rate": 0.00013286851401783195, + "loss": 1.9822, + "step": 20092 + }, + { + "epoch": 2.3443005483607515, + "grad_norm": 1.1375765800476074, + "learning_rate": 0.00013285351362619195, + "loss": 1.8999, + "step": 20093 + }, + { + "epoch": 2.344417220861043, + "grad_norm": 1.0274219512939453, + "learning_rate": 0.00013283851342785807, + "loss": 1.799, + "step": 20094 + }, + { + "epoch": 2.344533893361335, + "grad_norm": 0.9117540121078491, + "learning_rate": 0.00013282351342298594, + "loss": 1.7923, + "step": 20095 + }, + { + "epoch": 2.3446505658616266, + "grad_norm": 1.0665128231048584, + "learning_rate": 0.0001328085136117309, + "loss": 1.8993, + "step": 20096 + }, + { + "epoch": 2.3447672383619182, + "grad_norm": 1.2359346151351929, + "learning_rate": 0.00013279351399424853, + "loss": 1.9609, + "step": 20097 + }, + { + "epoch": 2.34488391086221, + "grad_norm": 1.0493861436843872, + "learning_rate": 0.0001327785145706943, + "loss": 1.9768, + "step": 20098 + }, + { + "epoch": 2.3450005833625016, + "grad_norm": 1.2548801898956299, + "learning_rate": 0.00013276351534122376, + "loss": 2.0376, + "step": 20099 + }, + { + "epoch": 2.3451172558627933, + "grad_norm": 1.3454983234405518, + "learning_rate": 0.0001327485163059923, + "loss": 2.1205, + "step": 20100 + }, + { + "epoch": 2.345233928363085, + "grad_norm": 1.1985372304916382, + "learning_rate": 0.00013273351746515553, + "loss": 1.9879, + "step": 20101 + }, + { + "epoch": 2.3453506008633767, + "grad_norm": 1.386091947555542, + "learning_rate": 0.00013271851881886882, + "loss": 1.9741, + "step": 20102 + }, + { + "epoch": 2.3454672733636683, + "grad_norm": 1.230617880821228, + "learning_rate": 0.0001327035203672878, + "loss": 1.9826, + "step": 20103 + }, + { + "epoch": 2.34558394586396, + "grad_norm": 1.3211240768432617, + "learning_rate": 0.00013268852211056783, + "loss": 2.0195, + "step": 20104 + }, + { + "epoch": 2.3457006183642517, + "grad_norm": 0.9294710159301758, + "learning_rate": 0.0001326735240488645, + "loss": 1.7148, + "step": 20105 + }, + { + "epoch": 2.3458172908645434, + "grad_norm": 1.1280614137649536, + "learning_rate": 0.00013265852618233324, + "loss": 2.0898, + "step": 20106 + }, + { + "epoch": 2.345933963364835, + "grad_norm": 0.9892628192901611, + "learning_rate": 0.00013264352851112944, + "loss": 1.7451, + "step": 20107 + }, + { + "epoch": 2.3460506358651267, + "grad_norm": 1.272204041481018, + "learning_rate": 0.0001326285310354088, + "loss": 1.9052, + "step": 20108 + }, + { + "epoch": 2.3461673083654184, + "grad_norm": 1.4256021976470947, + "learning_rate": 0.0001326135337553266, + "loss": 1.9746, + "step": 20109 + }, + { + "epoch": 2.34628398086571, + "grad_norm": 1.0710184574127197, + "learning_rate": 0.0001325985366710384, + "loss": 1.9758, + "step": 20110 + }, + { + "epoch": 2.346400653366002, + "grad_norm": 1.2695646286010742, + "learning_rate": 0.00013258353978269966, + "loss": 2.0147, + "step": 20111 + }, + { + "epoch": 2.3465173258662935, + "grad_norm": 1.3638038635253906, + "learning_rate": 0.0001325685430904659, + "loss": 2.0006, + "step": 20112 + }, + { + "epoch": 2.346633998366585, + "grad_norm": 1.204787254333496, + "learning_rate": 0.00013255354659449248, + "loss": 2.2206, + "step": 20113 + }, + { + "epoch": 2.346750670866877, + "grad_norm": 1.3913743495941162, + "learning_rate": 0.00013253855029493494, + "loss": 2.0334, + "step": 20114 + }, + { + "epoch": 2.3468673433671685, + "grad_norm": 1.2353260517120361, + "learning_rate": 0.00013252355419194877, + "loss": 2.0885, + "step": 20115 + }, + { + "epoch": 2.34698401586746, + "grad_norm": 1.0649465322494507, + "learning_rate": 0.0001325085582856894, + "loss": 1.9621, + "step": 20116 + }, + { + "epoch": 2.347100688367752, + "grad_norm": 1.414908766746521, + "learning_rate": 0.00013249356257631222, + "loss": 1.9177, + "step": 20117 + }, + { + "epoch": 2.3472173608680436, + "grad_norm": 1.3465543985366821, + "learning_rate": 0.00013247856706397282, + "loss": 2.1287, + "step": 20118 + }, + { + "epoch": 2.3473340333683352, + "grad_norm": 1.1977838277816772, + "learning_rate": 0.00013246357174882651, + "loss": 1.9529, + "step": 20119 + }, + { + "epoch": 2.347450705868627, + "grad_norm": 1.1594648361206055, + "learning_rate": 0.00013244857663102886, + "loss": 1.8683, + "step": 20120 + }, + { + "epoch": 2.3475673783689186, + "grad_norm": 1.2754297256469727, + "learning_rate": 0.00013243358171073535, + "loss": 2.1634, + "step": 20121 + }, + { + "epoch": 2.3476840508692103, + "grad_norm": 1.1730549335479736, + "learning_rate": 0.00013241858698810133, + "loss": 1.8659, + "step": 20122 + }, + { + "epoch": 2.347800723369502, + "grad_norm": 1.1248137950897217, + "learning_rate": 0.00013240359246328228, + "loss": 2.0562, + "step": 20123 + }, + { + "epoch": 2.3479173958697936, + "grad_norm": 1.2575160264968872, + "learning_rate": 0.00013238859813643366, + "loss": 2.0191, + "step": 20124 + }, + { + "epoch": 2.3480340683700853, + "grad_norm": 1.1217669248580933, + "learning_rate": 0.00013237360400771095, + "loss": 1.9333, + "step": 20125 + }, + { + "epoch": 2.348150740870377, + "grad_norm": 1.374438762664795, + "learning_rate": 0.0001323586100772695, + "loss": 2.0186, + "step": 20126 + }, + { + "epoch": 2.3482674133706687, + "grad_norm": 1.1781550645828247, + "learning_rate": 0.00013234361634526483, + "loss": 2.3096, + "step": 20127 + }, + { + "epoch": 2.3483840858709604, + "grad_norm": 1.2632825374603271, + "learning_rate": 0.00013232862281185233, + "loss": 1.875, + "step": 20128 + }, + { + "epoch": 2.348500758371252, + "grad_norm": 1.2868329286575317, + "learning_rate": 0.0001323136294771875, + "loss": 1.8007, + "step": 20129 + }, + { + "epoch": 2.3486174308715437, + "grad_norm": 1.14353609085083, + "learning_rate": 0.00013229863634142566, + "loss": 2.0293, + "step": 20130 + }, + { + "epoch": 2.3487341033718354, + "grad_norm": 1.4018337726593018, + "learning_rate": 0.0001322836434047224, + "loss": 1.9274, + "step": 20131 + }, + { + "epoch": 2.348850775872127, + "grad_norm": 1.0916132926940918, + "learning_rate": 0.000132268650667233, + "loss": 2.0544, + "step": 20132 + }, + { + "epoch": 2.3489674483724188, + "grad_norm": 1.2775267362594604, + "learning_rate": 0.00013225365812911296, + "loss": 1.9972, + "step": 20133 + }, + { + "epoch": 2.3490841208727105, + "grad_norm": 1.1061478853225708, + "learning_rate": 0.00013223866579051776, + "loss": 1.9537, + "step": 20134 + }, + { + "epoch": 2.349200793373002, + "grad_norm": 1.1498435735702515, + "learning_rate": 0.00013222367365160268, + "loss": 2.0515, + "step": 20135 + }, + { + "epoch": 2.349317465873294, + "grad_norm": 1.1224093437194824, + "learning_rate": 0.00013220868171252332, + "loss": 1.6407, + "step": 20136 + }, + { + "epoch": 2.3494341383735855, + "grad_norm": 1.2021186351776123, + "learning_rate": 0.0001321936899734349, + "loss": 1.9657, + "step": 20137 + }, + { + "epoch": 2.349550810873877, + "grad_norm": 1.178205966949463, + "learning_rate": 0.00013217869843449304, + "loss": 2.0633, + "step": 20138 + }, + { + "epoch": 2.349667483374169, + "grad_norm": 1.2822624444961548, + "learning_rate": 0.00013216370709585298, + "loss": 2.1025, + "step": 20139 + }, + { + "epoch": 2.3497841558744605, + "grad_norm": 1.3080940246582031, + "learning_rate": 0.0001321487159576703, + "loss": 2.02, + "step": 20140 + }, + { + "epoch": 2.3499008283747522, + "grad_norm": 1.093976378440857, + "learning_rate": 0.00013213372502010026, + "loss": 1.8506, + "step": 20141 + }, + { + "epoch": 2.350017500875044, + "grad_norm": 1.1838635206222534, + "learning_rate": 0.00013211873428329836, + "loss": 2.184, + "step": 20142 + }, + { + "epoch": 2.3501341733753356, + "grad_norm": 1.06486976146698, + "learning_rate": 0.00013210374374741994, + "loss": 1.7542, + "step": 20143 + }, + { + "epoch": 2.3502508458756273, + "grad_norm": 1.4227617979049683, + "learning_rate": 0.0001320887534126205, + "loss": 2.1168, + "step": 20144 + }, + { + "epoch": 2.350367518375919, + "grad_norm": 1.1975961923599243, + "learning_rate": 0.00013207376327905532, + "loss": 1.9226, + "step": 20145 + }, + { + "epoch": 2.3504841908762106, + "grad_norm": 1.0308572053909302, + "learning_rate": 0.0001320587733468799, + "loss": 1.8171, + "step": 20146 + }, + { + "epoch": 2.3506008633765023, + "grad_norm": 1.362335205078125, + "learning_rate": 0.00013204378361624962, + "loss": 2.1384, + "step": 20147 + }, + { + "epoch": 2.350717535876794, + "grad_norm": 1.1682515144348145, + "learning_rate": 0.00013202879408731987, + "loss": 2.1424, + "step": 20148 + }, + { + "epoch": 2.3508342083770857, + "grad_norm": 1.2565348148345947, + "learning_rate": 0.00013201380476024604, + "loss": 2.2516, + "step": 20149 + }, + { + "epoch": 2.3509508808773774, + "grad_norm": 0.9632634520530701, + "learning_rate": 0.00013199881563518353, + "loss": 1.8015, + "step": 20150 + }, + { + "epoch": 2.351067553377669, + "grad_norm": 1.1826977729797363, + "learning_rate": 0.00013198382671228772, + "loss": 2.1302, + "step": 20151 + }, + { + "epoch": 2.3511842258779607, + "grad_norm": 1.2408064603805542, + "learning_rate": 0.000131968837991714, + "loss": 2.0787, + "step": 20152 + }, + { + "epoch": 2.3513008983782524, + "grad_norm": 1.3241724967956543, + "learning_rate": 0.00013195384947361775, + "loss": 2.0622, + "step": 20153 + }, + { + "epoch": 2.351417570878544, + "grad_norm": 1.2853647470474243, + "learning_rate": 0.00013193886115815437, + "loss": 2.116, + "step": 20154 + }, + { + "epoch": 2.3515342433788358, + "grad_norm": 1.142802357673645, + "learning_rate": 0.00013192387304547927, + "loss": 2.1234, + "step": 20155 + }, + { + "epoch": 2.3516509158791274, + "grad_norm": 1.1551544666290283, + "learning_rate": 0.0001319088851357477, + "loss": 1.9479, + "step": 20156 + }, + { + "epoch": 2.351767588379419, + "grad_norm": 1.209061622619629, + "learning_rate": 0.00013189389742911524, + "loss": 2.1032, + "step": 20157 + }, + { + "epoch": 2.351884260879711, + "grad_norm": 1.213623046875, + "learning_rate": 0.00013187890992573712, + "loss": 2.1672, + "step": 20158 + }, + { + "epoch": 2.3520009333800025, + "grad_norm": 1.2097398042678833, + "learning_rate": 0.00013186392262576872, + "loss": 2.0083, + "step": 20159 + }, + { + "epoch": 2.352117605880294, + "grad_norm": 1.128405213356018, + "learning_rate": 0.0001318489355293655, + "loss": 1.9488, + "step": 20160 + }, + { + "epoch": 2.352234278380586, + "grad_norm": 1.119718313217163, + "learning_rate": 0.00013183394863668273, + "loss": 1.8779, + "step": 20161 + }, + { + "epoch": 2.3523509508808775, + "grad_norm": 1.157647967338562, + "learning_rate": 0.0001318189619478759, + "loss": 2.0749, + "step": 20162 + }, + { + "epoch": 2.352467623381169, + "grad_norm": 1.1305304765701294, + "learning_rate": 0.00013180397546310019, + "loss": 2.1358, + "step": 20163 + }, + { + "epoch": 2.352584295881461, + "grad_norm": 1.189666748046875, + "learning_rate": 0.00013178898918251116, + "loss": 1.9028, + "step": 20164 + }, + { + "epoch": 2.3527009683817526, + "grad_norm": 1.2064282894134521, + "learning_rate": 0.00013177400310626405, + "loss": 1.8132, + "step": 20165 + }, + { + "epoch": 2.3528176408820443, + "grad_norm": 1.3255378007888794, + "learning_rate": 0.00013175901723451425, + "loss": 2.1051, + "step": 20166 + }, + { + "epoch": 2.352934313382336, + "grad_norm": 1.1312953233718872, + "learning_rate": 0.00013174403156741713, + "loss": 2.0141, + "step": 20167 + }, + { + "epoch": 2.3530509858826276, + "grad_norm": 0.9934409856796265, + "learning_rate": 0.00013172904610512804, + "loss": 1.8748, + "step": 20168 + }, + { + "epoch": 2.3531676583829193, + "grad_norm": 1.2117689847946167, + "learning_rate": 0.00013171406084780228, + "loss": 1.8796, + "step": 20169 + }, + { + "epoch": 2.353284330883211, + "grad_norm": 1.3916584253311157, + "learning_rate": 0.00013169907579559532, + "loss": 1.9915, + "step": 20170 + }, + { + "epoch": 2.3534010033835027, + "grad_norm": 1.1246193647384644, + "learning_rate": 0.00013168409094866236, + "loss": 1.9562, + "step": 20171 + }, + { + "epoch": 2.3535176758837943, + "grad_norm": 1.2529606819152832, + "learning_rate": 0.00013166910630715883, + "loss": 1.7929, + "step": 20172 + }, + { + "epoch": 2.353634348384086, + "grad_norm": 1.3629884719848633, + "learning_rate": 0.0001316541218712401, + "loss": 2.0876, + "step": 20173 + }, + { + "epoch": 2.3537510208843777, + "grad_norm": 1.2760270833969116, + "learning_rate": 0.0001316391376410615, + "loss": 1.9726, + "step": 20174 + }, + { + "epoch": 2.3538676933846694, + "grad_norm": 1.0225080251693726, + "learning_rate": 0.00013162415361677834, + "loss": 1.8693, + "step": 20175 + }, + { + "epoch": 2.353984365884961, + "grad_norm": 1.2561310529708862, + "learning_rate": 0.0001316091697985459, + "loss": 1.9422, + "step": 20176 + }, + { + "epoch": 2.3541010383852528, + "grad_norm": 1.0489704608917236, + "learning_rate": 0.00013159418618651967, + "loss": 1.975, + "step": 20177 + }, + { + "epoch": 2.3542177108855444, + "grad_norm": 1.1234278678894043, + "learning_rate": 0.0001315792027808548, + "loss": 2.039, + "step": 20178 + }, + { + "epoch": 2.354334383385836, + "grad_norm": 1.1737700700759888, + "learning_rate": 0.0001315642195817068, + "loss": 2.0504, + "step": 20179 + }, + { + "epoch": 2.354451055886128, + "grad_norm": 1.2623279094696045, + "learning_rate": 0.00013154923658923086, + "loss": 1.9903, + "step": 20180 + }, + { + "epoch": 2.3545677283864195, + "grad_norm": 1.1099659204483032, + "learning_rate": 0.00013153425380358242, + "loss": 1.9592, + "step": 20181 + }, + { + "epoch": 2.354684400886711, + "grad_norm": 1.3221919536590576, + "learning_rate": 0.00013151927122491668, + "loss": 1.9813, + "step": 20182 + }, + { + "epoch": 2.354801073387003, + "grad_norm": 1.2136650085449219, + "learning_rate": 0.00013150428885338908, + "loss": 1.7935, + "step": 20183 + }, + { + "epoch": 2.3549177458872945, + "grad_norm": 1.13620126247406, + "learning_rate": 0.00013148930668915488, + "loss": 1.8803, + "step": 20184 + }, + { + "epoch": 2.355034418387586, + "grad_norm": 1.0268328189849854, + "learning_rate": 0.00013147432473236935, + "loss": 1.9172, + "step": 20185 + }, + { + "epoch": 2.355151090887878, + "grad_norm": 1.084726333618164, + "learning_rate": 0.00013145934298318797, + "loss": 1.9398, + "step": 20186 + }, + { + "epoch": 2.3552677633881696, + "grad_norm": 1.124194622039795, + "learning_rate": 0.00013144436144176585, + "loss": 1.9852, + "step": 20187 + }, + { + "epoch": 2.3553844358884612, + "grad_norm": 1.1535001993179321, + "learning_rate": 0.00013142938010825847, + "loss": 2.2417, + "step": 20188 + }, + { + "epoch": 2.355501108388753, + "grad_norm": 1.1190553903579712, + "learning_rate": 0.00013141439898282104, + "loss": 1.9452, + "step": 20189 + }, + { + "epoch": 2.3556177808890446, + "grad_norm": 1.182781457901001, + "learning_rate": 0.00013139941806560894, + "loss": 2.0408, + "step": 20190 + }, + { + "epoch": 2.3557344533893363, + "grad_norm": 1.4311738014221191, + "learning_rate": 0.00013138443735677744, + "loss": 1.8135, + "step": 20191 + }, + { + "epoch": 2.355851125889628, + "grad_norm": 1.1430840492248535, + "learning_rate": 0.00013136945685648179, + "loss": 1.9729, + "step": 20192 + }, + { + "epoch": 2.3559677983899197, + "grad_norm": 1.3588387966156006, + "learning_rate": 0.00013135447656487735, + "loss": 1.9719, + "step": 20193 + }, + { + "epoch": 2.3560844708902113, + "grad_norm": 1.4992692470550537, + "learning_rate": 0.00013133949648211945, + "loss": 2.1777, + "step": 20194 + }, + { + "epoch": 2.356201143390503, + "grad_norm": 1.2599166631698608, + "learning_rate": 0.00013132451660836328, + "loss": 2.0513, + "step": 20195 + }, + { + "epoch": 2.3563178158907947, + "grad_norm": 1.275457739830017, + "learning_rate": 0.00013130953694376424, + "loss": 1.9841, + "step": 20196 + }, + { + "epoch": 2.3564344883910864, + "grad_norm": 1.1299229860305786, + "learning_rate": 0.00013129455748847756, + "loss": 2.0703, + "step": 20197 + }, + { + "epoch": 2.356551160891378, + "grad_norm": 1.0368059873580933, + "learning_rate": 0.00013127957824265852, + "loss": 1.8396, + "step": 20198 + }, + { + "epoch": 2.3566678333916697, + "grad_norm": 1.1309597492218018, + "learning_rate": 0.00013126459920646252, + "loss": 1.842, + "step": 20199 + }, + { + "epoch": 2.3567845058919614, + "grad_norm": 1.0333739519119263, + "learning_rate": 0.00013124962038004476, + "loss": 1.948, + "step": 20200 + }, + { + "epoch": 2.356901178392253, + "grad_norm": 1.1355090141296387, + "learning_rate": 0.00013123464176356054, + "loss": 1.9748, + "step": 20201 + }, + { + "epoch": 2.357017850892545, + "grad_norm": 1.196919322013855, + "learning_rate": 0.00013121966335716506, + "loss": 2.0836, + "step": 20202 + }, + { + "epoch": 2.3571345233928365, + "grad_norm": 0.9871403574943542, + "learning_rate": 0.00013120468516101378, + "loss": 1.7204, + "step": 20203 + }, + { + "epoch": 2.357251195893128, + "grad_norm": 1.0888745784759521, + "learning_rate": 0.00013118970717526174, + "loss": 1.7709, + "step": 20204 + }, + { + "epoch": 2.35736786839342, + "grad_norm": 1.166569709777832, + "learning_rate": 0.00013117472940006445, + "loss": 2.1763, + "step": 20205 + }, + { + "epoch": 2.3574845408937115, + "grad_norm": 1.305846095085144, + "learning_rate": 0.00013115975183557703, + "loss": 2.0716, + "step": 20206 + }, + { + "epoch": 2.357601213394003, + "grad_norm": 1.163745403289795, + "learning_rate": 0.00013114477448195484, + "loss": 2.0871, + "step": 20207 + }, + { + "epoch": 2.357717885894295, + "grad_norm": 1.038985013961792, + "learning_rate": 0.00013112979733935303, + "loss": 1.8672, + "step": 20208 + }, + { + "epoch": 2.3578345583945866, + "grad_norm": 1.1602017879486084, + "learning_rate": 0.00013111482040792705, + "loss": 1.9915, + "step": 20209 + }, + { + "epoch": 2.3579512308948782, + "grad_norm": 1.194814920425415, + "learning_rate": 0.000131099843687832, + "loss": 2.0191, + "step": 20210 + }, + { + "epoch": 2.35806790339517, + "grad_norm": 1.3680050373077393, + "learning_rate": 0.00013108486717922316, + "loss": 2.0845, + "step": 20211 + }, + { + "epoch": 2.3581845758954616, + "grad_norm": 1.081498622894287, + "learning_rate": 0.00013106989088225595, + "loss": 1.8666, + "step": 20212 + }, + { + "epoch": 2.3583012483957533, + "grad_norm": 1.2460777759552002, + "learning_rate": 0.00013105491479708538, + "loss": 2.1976, + "step": 20213 + }, + { + "epoch": 2.358417920896045, + "grad_norm": 1.0684617757797241, + "learning_rate": 0.00013103993892386694, + "loss": 1.9792, + "step": 20214 + }, + { + "epoch": 2.3585345933963366, + "grad_norm": 1.1281192302703857, + "learning_rate": 0.00013102496326275568, + "loss": 1.9922, + "step": 20215 + }, + { + "epoch": 2.3586512658966283, + "grad_norm": 1.1562976837158203, + "learning_rate": 0.00013100998781390703, + "loss": 1.9557, + "step": 20216 + }, + { + "epoch": 2.35876793839692, + "grad_norm": 1.030512809753418, + "learning_rate": 0.00013099501257747613, + "loss": 1.9858, + "step": 20217 + }, + { + "epoch": 2.3588846108972117, + "grad_norm": 1.2360169887542725, + "learning_rate": 0.00013098003755361822, + "loss": 2.1322, + "step": 20218 + }, + { + "epoch": 2.3590012833975034, + "grad_norm": 1.0855870246887207, + "learning_rate": 0.00013096506274248862, + "loss": 1.9939, + "step": 20219 + }, + { + "epoch": 2.359117955897795, + "grad_norm": 1.1139286756515503, + "learning_rate": 0.00013095008814424257, + "loss": 2.0037, + "step": 20220 + }, + { + "epoch": 2.3592346283980867, + "grad_norm": 1.2000502347946167, + "learning_rate": 0.00013093511375903514, + "loss": 2.0303, + "step": 20221 + }, + { + "epoch": 2.3593513008983784, + "grad_norm": 1.1087678670883179, + "learning_rate": 0.00013092013958702182, + "loss": 1.81, + "step": 20222 + }, + { + "epoch": 2.35946797339867, + "grad_norm": 1.2216840982437134, + "learning_rate": 0.00013090516562835766, + "loss": 1.9489, + "step": 20223 + }, + { + "epoch": 2.3595846458989618, + "grad_norm": 1.0539684295654297, + "learning_rate": 0.00013089019188319793, + "loss": 1.9226, + "step": 20224 + }, + { + "epoch": 2.3597013183992535, + "grad_norm": 1.1626355648040771, + "learning_rate": 0.00013087521835169797, + "loss": 2.0527, + "step": 20225 + }, + { + "epoch": 2.359817990899545, + "grad_norm": 1.226506233215332, + "learning_rate": 0.0001308602450340129, + "loss": 2.0667, + "step": 20226 + }, + { + "epoch": 2.359934663399837, + "grad_norm": 1.1274391412734985, + "learning_rate": 0.000130845271930298, + "loss": 2.0235, + "step": 20227 + }, + { + "epoch": 2.3600513359001285, + "grad_norm": 1.225359559059143, + "learning_rate": 0.00013083029904070842, + "loss": 2.2077, + "step": 20228 + }, + { + "epoch": 2.36016800840042, + "grad_norm": 1.0729475021362305, + "learning_rate": 0.00013081532636539947, + "loss": 1.9458, + "step": 20229 + }, + { + "epoch": 2.360284680900712, + "grad_norm": 1.1176749467849731, + "learning_rate": 0.0001308003539045263, + "loss": 1.9724, + "step": 20230 + }, + { + "epoch": 2.3604013534010035, + "grad_norm": 1.148353934288025, + "learning_rate": 0.00013078538165824417, + "loss": 2.0027, + "step": 20231 + }, + { + "epoch": 2.3605180259012952, + "grad_norm": 1.2290825843811035, + "learning_rate": 0.00013077040962670825, + "loss": 2.01, + "step": 20232 + }, + { + "epoch": 2.360634698401587, + "grad_norm": 1.1895440816879272, + "learning_rate": 0.0001307554378100739, + "loss": 2.0562, + "step": 20233 + }, + { + "epoch": 2.3607513709018786, + "grad_norm": 1.1596851348876953, + "learning_rate": 0.0001307404662084961, + "loss": 1.9868, + "step": 20234 + }, + { + "epoch": 2.3608680434021703, + "grad_norm": 1.1991597414016724, + "learning_rate": 0.00013072549482213026, + "loss": 2.0086, + "step": 20235 + }, + { + "epoch": 2.360984715902462, + "grad_norm": 1.3022278547286987, + "learning_rate": 0.00013071052365113146, + "loss": 2.1091, + "step": 20236 + }, + { + "epoch": 2.3611013884027536, + "grad_norm": 1.3254311084747314, + "learning_rate": 0.00013069555269565493, + "loss": 1.9128, + "step": 20237 + }, + { + "epoch": 2.3612180609030453, + "grad_norm": 1.175521731376648, + "learning_rate": 0.00013068058195585594, + "loss": 1.955, + "step": 20238 + }, + { + "epoch": 2.361334733403337, + "grad_norm": 1.1473898887634277, + "learning_rate": 0.00013066561143188963, + "loss": 1.8852, + "step": 20239 + }, + { + "epoch": 2.3614514059036287, + "grad_norm": 1.1393039226531982, + "learning_rate": 0.00013065064112391123, + "loss": 1.7751, + "step": 20240 + }, + { + "epoch": 2.3615680784039204, + "grad_norm": 1.0604907274246216, + "learning_rate": 0.00013063567103207586, + "loss": 1.9566, + "step": 20241 + }, + { + "epoch": 2.361684750904212, + "grad_norm": 1.1246877908706665, + "learning_rate": 0.00013062070115653883, + "loss": 1.9208, + "step": 20242 + }, + { + "epoch": 2.3618014234045037, + "grad_norm": 1.101876974105835, + "learning_rate": 0.00013060573149745525, + "loss": 2.0317, + "step": 20243 + }, + { + "epoch": 2.3619180959047954, + "grad_norm": 1.0875810384750366, + "learning_rate": 0.00013059076205498033, + "loss": 1.9713, + "step": 20244 + }, + { + "epoch": 2.362034768405087, + "grad_norm": 1.8127223253250122, + "learning_rate": 0.00013057579282926924, + "loss": 1.9162, + "step": 20245 + }, + { + "epoch": 2.3621514409053788, + "grad_norm": 1.0875338315963745, + "learning_rate": 0.00013056082382047725, + "loss": 2.0037, + "step": 20246 + }, + { + "epoch": 2.3622681134056704, + "grad_norm": 1.0564931631088257, + "learning_rate": 0.00013054585502875936, + "loss": 1.9042, + "step": 20247 + }, + { + "epoch": 2.362384785905962, + "grad_norm": 1.153383493423462, + "learning_rate": 0.00013053088645427095, + "loss": 2.2731, + "step": 20248 + }, + { + "epoch": 2.362501458406254, + "grad_norm": 1.2583166360855103, + "learning_rate": 0.00013051591809716705, + "loss": 2.0567, + "step": 20249 + }, + { + "epoch": 2.3626181309065455, + "grad_norm": 1.2980560064315796, + "learning_rate": 0.00013050094995760292, + "loss": 1.9457, + "step": 20250 + }, + { + "epoch": 2.362734803406837, + "grad_norm": 1.1486889123916626, + "learning_rate": 0.0001304859820357337, + "loss": 1.9552, + "step": 20251 + }, + { + "epoch": 2.362851475907129, + "grad_norm": 1.2363258600234985, + "learning_rate": 0.00013047101433171459, + "loss": 1.8705, + "step": 20252 + }, + { + "epoch": 2.3629681484074205, + "grad_norm": 1.0915067195892334, + "learning_rate": 0.00013045604684570078, + "loss": 2.038, + "step": 20253 + }, + { + "epoch": 2.363084820907712, + "grad_norm": 1.099971055984497, + "learning_rate": 0.0001304410795778473, + "loss": 1.9395, + "step": 20254 + }, + { + "epoch": 2.363201493408004, + "grad_norm": 1.2038018703460693, + "learning_rate": 0.00013042611252830948, + "loss": 1.9652, + "step": 20255 + }, + { + "epoch": 2.3633181659082956, + "grad_norm": 1.1754728555679321, + "learning_rate": 0.00013041114569724234, + "loss": 1.8531, + "step": 20256 + }, + { + "epoch": 2.3634348384085873, + "grad_norm": 1.0348258018493652, + "learning_rate": 0.0001303961790848012, + "loss": 1.9769, + "step": 20257 + }, + { + "epoch": 2.363551510908879, + "grad_norm": 1.2573269605636597, + "learning_rate": 0.00013038121269114104, + "loss": 1.8021, + "step": 20258 + }, + { + "epoch": 2.3636681834091706, + "grad_norm": 1.3560912609100342, + "learning_rate": 0.00013036624651641715, + "loss": 1.8448, + "step": 20259 + }, + { + "epoch": 2.3637848559094623, + "grad_norm": 1.3106387853622437, + "learning_rate": 0.00013035128056078466, + "loss": 1.9596, + "step": 20260 + }, + { + "epoch": 2.363901528409754, + "grad_norm": 1.4024032354354858, + "learning_rate": 0.00013033631482439864, + "loss": 2.0666, + "step": 20261 + }, + { + "epoch": 2.3640182009100457, + "grad_norm": 1.0671533346176147, + "learning_rate": 0.00013032134930741432, + "loss": 1.9677, + "step": 20262 + }, + { + "epoch": 2.3641348734103373, + "grad_norm": 1.0164343118667603, + "learning_rate": 0.00013030638400998677, + "loss": 1.9592, + "step": 20263 + }, + { + "epoch": 2.364251545910629, + "grad_norm": 1.1434952020645142, + "learning_rate": 0.00013029141893227127, + "loss": 1.8767, + "step": 20264 + }, + { + "epoch": 2.3643682184109207, + "grad_norm": 1.0951439142227173, + "learning_rate": 0.0001302764540744228, + "loss": 2.0452, + "step": 20265 + }, + { + "epoch": 2.3644848909112124, + "grad_norm": 1.3021196126937866, + "learning_rate": 0.00013026148943659662, + "loss": 2.0425, + "step": 20266 + }, + { + "epoch": 2.364601563411504, + "grad_norm": 1.1035188436508179, + "learning_rate": 0.0001302465250189478, + "loss": 2.0005, + "step": 20267 + }, + { + "epoch": 2.3647182359117958, + "grad_norm": 1.1500616073608398, + "learning_rate": 0.0001302315608216315, + "loss": 1.9277, + "step": 20268 + }, + { + "epoch": 2.3648349084120874, + "grad_norm": 1.0519636869430542, + "learning_rate": 0.00013021659684480288, + "loss": 2.0198, + "step": 20269 + }, + { + "epoch": 2.364951580912379, + "grad_norm": 1.2248649597167969, + "learning_rate": 0.00013020163308861702, + "loss": 2.0258, + "step": 20270 + }, + { + "epoch": 2.365068253412671, + "grad_norm": 1.1554083824157715, + "learning_rate": 0.00013018666955322904, + "loss": 1.9772, + "step": 20271 + }, + { + "epoch": 2.3651849259129625, + "grad_norm": 1.338852047920227, + "learning_rate": 0.00013017170623879411, + "loss": 1.8184, + "step": 20272 + }, + { + "epoch": 2.365301598413254, + "grad_norm": 1.032444953918457, + "learning_rate": 0.0001301567431454673, + "loss": 1.9304, + "step": 20273 + }, + { + "epoch": 2.365418270913546, + "grad_norm": 1.2140837907791138, + "learning_rate": 0.00013014178027340382, + "loss": 2.0101, + "step": 20274 + }, + { + "epoch": 2.3655349434138375, + "grad_norm": 1.1322352886199951, + "learning_rate": 0.00013012681762275866, + "loss": 2.0515, + "step": 20275 + }, + { + "epoch": 2.365651615914129, + "grad_norm": 1.319244384765625, + "learning_rate": 0.00013011185519368705, + "loss": 2.1976, + "step": 20276 + }, + { + "epoch": 2.365768288414421, + "grad_norm": 1.1992303133010864, + "learning_rate": 0.00013009689298634409, + "loss": 1.9814, + "step": 20277 + }, + { + "epoch": 2.3658849609147126, + "grad_norm": 1.3922245502471924, + "learning_rate": 0.0001300819310008848, + "loss": 2.1621, + "step": 20278 + }, + { + "epoch": 2.3660016334150042, + "grad_norm": 1.0708792209625244, + "learning_rate": 0.00013006696923746443, + "loss": 1.9607, + "step": 20279 + }, + { + "epoch": 2.366118305915296, + "grad_norm": 1.2540041208267212, + "learning_rate": 0.00013005200769623796, + "loss": 1.9402, + "step": 20280 + }, + { + "epoch": 2.3662349784155876, + "grad_norm": 1.1095036268234253, + "learning_rate": 0.00013003704637736056, + "loss": 1.9203, + "step": 20281 + }, + { + "epoch": 2.3663516509158793, + "grad_norm": 1.1299625635147095, + "learning_rate": 0.0001300220852809873, + "loss": 1.9621, + "step": 20282 + }, + { + "epoch": 2.366468323416171, + "grad_norm": 1.03847336769104, + "learning_rate": 0.00013000712440727332, + "loss": 2.0033, + "step": 20283 + }, + { + "epoch": 2.3665849959164627, + "grad_norm": 1.1116491556167603, + "learning_rate": 0.00012999216375637367, + "loss": 1.8329, + "step": 20284 + }, + { + "epoch": 2.3667016684167543, + "grad_norm": 1.1431021690368652, + "learning_rate": 0.00012997720332844352, + "loss": 1.983, + "step": 20285 + }, + { + "epoch": 2.366818340917046, + "grad_norm": 1.1035256385803223, + "learning_rate": 0.00012996224312363788, + "loss": 1.8996, + "step": 20286 + }, + { + "epoch": 2.3669350134173377, + "grad_norm": 1.1902121305465698, + "learning_rate": 0.00012994728314211186, + "loss": 1.9399, + "step": 20287 + }, + { + "epoch": 2.3670516859176294, + "grad_norm": 1.0763003826141357, + "learning_rate": 0.0001299323233840206, + "loss": 1.9042, + "step": 20288 + }, + { + "epoch": 2.367168358417921, + "grad_norm": 0.9693830609321594, + "learning_rate": 0.0001299173638495191, + "loss": 2.0381, + "step": 20289 + }, + { + "epoch": 2.3672850309182127, + "grad_norm": 1.1733344793319702, + "learning_rate": 0.00012990240453876256, + "loss": 2.108, + "step": 20290 + }, + { + "epoch": 2.3674017034185044, + "grad_norm": 1.0657272338867188, + "learning_rate": 0.0001298874454519059, + "loss": 1.8953, + "step": 20291 + }, + { + "epoch": 2.367518375918796, + "grad_norm": 1.2896583080291748, + "learning_rate": 0.0001298724865891044, + "loss": 2.1765, + "step": 20292 + }, + { + "epoch": 2.367635048419088, + "grad_norm": 1.2099395990371704, + "learning_rate": 0.00012985752795051297, + "loss": 2.3163, + "step": 20293 + }, + { + "epoch": 2.3677517209193795, + "grad_norm": 1.2647844552993774, + "learning_rate": 0.00012984256953628678, + "loss": 1.8913, + "step": 20294 + }, + { + "epoch": 2.367868393419671, + "grad_norm": 1.310263991355896, + "learning_rate": 0.00012982761134658086, + "loss": 1.9485, + "step": 20295 + }, + { + "epoch": 2.367985065919963, + "grad_norm": 1.183854103088379, + "learning_rate": 0.00012981265338155032, + "loss": 2.1284, + "step": 20296 + }, + { + "epoch": 2.3681017384202545, + "grad_norm": 1.0610449314117432, + "learning_rate": 0.00012979769564135012, + "loss": 1.8829, + "step": 20297 + }, + { + "epoch": 2.368218410920546, + "grad_norm": 1.1027625799179077, + "learning_rate": 0.00012978273812613546, + "loss": 1.9014, + "step": 20298 + }, + { + "epoch": 2.368335083420838, + "grad_norm": 1.2289836406707764, + "learning_rate": 0.00012976778083606126, + "loss": 2.041, + "step": 20299 + }, + { + "epoch": 2.3684517559211296, + "grad_norm": 1.0610214471817017, + "learning_rate": 0.00012975282377128278, + "loss": 1.9188, + "step": 20300 + }, + { + "epoch": 2.3685684284214212, + "grad_norm": 1.2512534856796265, + "learning_rate": 0.00012973786693195488, + "loss": 2.0688, + "step": 20301 + }, + { + "epoch": 2.368685100921713, + "grad_norm": 1.1058379411697388, + "learning_rate": 0.00012972291031823272, + "loss": 1.8987, + "step": 20302 + }, + { + "epoch": 2.3688017734220046, + "grad_norm": 1.203739881515503, + "learning_rate": 0.00012970795393027135, + "loss": 1.9294, + "step": 20303 + }, + { + "epoch": 2.3689184459222963, + "grad_norm": 1.0067909955978394, + "learning_rate": 0.0001296929977682258, + "loss": 1.7922, + "step": 20304 + }, + { + "epoch": 2.369035118422588, + "grad_norm": 1.1374826431274414, + "learning_rate": 0.00012967804183225114, + "loss": 1.9985, + "step": 20305 + }, + { + "epoch": 2.3691517909228796, + "grad_norm": 1.325648307800293, + "learning_rate": 0.00012966308612250235, + "loss": 2.0835, + "step": 20306 + }, + { + "epoch": 2.3692684634231713, + "grad_norm": 1.1073359251022339, + "learning_rate": 0.00012964813063913458, + "loss": 1.8549, + "step": 20307 + }, + { + "epoch": 2.369385135923463, + "grad_norm": 1.292136549949646, + "learning_rate": 0.00012963317538230274, + "loss": 1.8198, + "step": 20308 + }, + { + "epoch": 2.3695018084237547, + "grad_norm": 1.5112133026123047, + "learning_rate": 0.00012961822035216205, + "loss": 2.1021, + "step": 20309 + }, + { + "epoch": 2.3696184809240464, + "grad_norm": 1.2081111669540405, + "learning_rate": 0.00012960326554886735, + "loss": 1.7726, + "step": 20310 + }, + { + "epoch": 2.369735153424338, + "grad_norm": 1.0647847652435303, + "learning_rate": 0.00012958831097257382, + "loss": 1.9283, + "step": 20311 + }, + { + "epoch": 2.3698518259246297, + "grad_norm": 1.0904988050460815, + "learning_rate": 0.00012957335662343644, + "loss": 1.9547, + "step": 20312 + }, + { + "epoch": 2.3699684984249214, + "grad_norm": 1.1998121738433838, + "learning_rate": 0.00012955840250161022, + "loss": 1.9665, + "step": 20313 + }, + { + "epoch": 2.370085170925213, + "grad_norm": 1.0345475673675537, + "learning_rate": 0.0001295434486072502, + "loss": 1.9325, + "step": 20314 + }, + { + "epoch": 2.3702018434255048, + "grad_norm": 1.2018438577651978, + "learning_rate": 0.00012952849494051138, + "loss": 1.988, + "step": 20315 + }, + { + "epoch": 2.3703185159257965, + "grad_norm": 1.1208443641662598, + "learning_rate": 0.0001295135415015489, + "loss": 1.9564, + "step": 20316 + }, + { + "epoch": 2.370435188426088, + "grad_norm": 1.230711579322815, + "learning_rate": 0.00012949858829051762, + "loss": 1.8376, + "step": 20317 + }, + { + "epoch": 2.37055186092638, + "grad_norm": 1.1073243618011475, + "learning_rate": 0.00012948363530757272, + "loss": 1.9139, + "step": 20318 + }, + { + "epoch": 2.3706685334266715, + "grad_norm": 1.0932807922363281, + "learning_rate": 0.00012946868255286906, + "loss": 1.9011, + "step": 20319 + }, + { + "epoch": 2.370785205926963, + "grad_norm": 1.2817777395248413, + "learning_rate": 0.00012945373002656177, + "loss": 1.9548, + "step": 20320 + }, + { + "epoch": 2.370901878427255, + "grad_norm": 0.9725944995880127, + "learning_rate": 0.0001294387777288058, + "loss": 1.8363, + "step": 20321 + }, + { + "epoch": 2.3710185509275465, + "grad_norm": 1.007967472076416, + "learning_rate": 0.0001294238256597562, + "loss": 1.9991, + "step": 20322 + }, + { + "epoch": 2.3711352234278382, + "grad_norm": 1.1198434829711914, + "learning_rate": 0.0001294088738195679, + "loss": 1.8263, + "step": 20323 + }, + { + "epoch": 2.37125189592813, + "grad_norm": 1.2623341083526611, + "learning_rate": 0.00012939392220839602, + "loss": 1.9147, + "step": 20324 + }, + { + "epoch": 2.3713685684284216, + "grad_norm": 1.3252806663513184, + "learning_rate": 0.0001293789708263954, + "loss": 2.1203, + "step": 20325 + }, + { + "epoch": 2.3714852409287133, + "grad_norm": 1.194390058517456, + "learning_rate": 0.00012936401967372123, + "loss": 1.9186, + "step": 20326 + }, + { + "epoch": 2.371601913429005, + "grad_norm": 1.0602611303329468, + "learning_rate": 0.00012934906875052834, + "loss": 1.9732, + "step": 20327 + }, + { + "epoch": 2.3717185859292966, + "grad_norm": 1.2099525928497314, + "learning_rate": 0.00012933411805697185, + "loss": 1.9334, + "step": 20328 + }, + { + "epoch": 2.3718352584295883, + "grad_norm": 1.1816363334655762, + "learning_rate": 0.0001293191675932067, + "loss": 2.1922, + "step": 20329 + }, + { + "epoch": 2.37195193092988, + "grad_norm": 1.1070626974105835, + "learning_rate": 0.00012930421735938785, + "loss": 1.9346, + "step": 20330 + }, + { + "epoch": 2.3720686034301717, + "grad_norm": 1.0095010995864868, + "learning_rate": 0.00012928926735567036, + "loss": 1.9805, + "step": 20331 + }, + { + "epoch": 2.3721852759304634, + "grad_norm": 1.0787873268127441, + "learning_rate": 0.00012927431758220912, + "loss": 1.8603, + "step": 20332 + }, + { + "epoch": 2.372301948430755, + "grad_norm": 1.190619707107544, + "learning_rate": 0.00012925936803915924, + "loss": 2.0375, + "step": 20333 + }, + { + "epoch": 2.3724186209310467, + "grad_norm": 1.1123908758163452, + "learning_rate": 0.00012924441872667552, + "loss": 2.1196, + "step": 20334 + }, + { + "epoch": 2.3725352934313384, + "grad_norm": 1.1783145666122437, + "learning_rate": 0.00012922946964491315, + "loss": 1.8804, + "step": 20335 + }, + { + "epoch": 2.37265196593163, + "grad_norm": 1.1871980428695679, + "learning_rate": 0.00012921452079402694, + "loss": 1.7984, + "step": 20336 + }, + { + "epoch": 2.3727686384319218, + "grad_norm": 1.0161973237991333, + "learning_rate": 0.00012919957217417196, + "loss": 1.9853, + "step": 20337 + }, + { + "epoch": 2.3728853109322134, + "grad_norm": 1.2432204484939575, + "learning_rate": 0.00012918462378550314, + "loss": 2.1182, + "step": 20338 + }, + { + "epoch": 2.373001983432505, + "grad_norm": 1.3896183967590332, + "learning_rate": 0.00012916967562817545, + "loss": 2.0301, + "step": 20339 + }, + { + "epoch": 2.373118655932797, + "grad_norm": 1.1387015581130981, + "learning_rate": 0.0001291547277023438, + "loss": 2.1793, + "step": 20340 + }, + { + "epoch": 2.3732353284330885, + "grad_norm": 1.0416193008422852, + "learning_rate": 0.0001291397800081633, + "loss": 2.026, + "step": 20341 + }, + { + "epoch": 2.37335200093338, + "grad_norm": 1.2468043565750122, + "learning_rate": 0.00012912483254578873, + "loss": 1.9671, + "step": 20342 + }, + { + "epoch": 2.373468673433672, + "grad_norm": 1.0818802118301392, + "learning_rate": 0.00012910988531537515, + "loss": 2.0163, + "step": 20343 + }, + { + "epoch": 2.3735853459339635, + "grad_norm": 1.1600431203842163, + "learning_rate": 0.00012909493831707758, + "loss": 1.9552, + "step": 20344 + }, + { + "epoch": 2.373702018434255, + "grad_norm": 1.216775894165039, + "learning_rate": 0.0001290799915510509, + "loss": 2.1473, + "step": 20345 + }, + { + "epoch": 2.373818690934547, + "grad_norm": 1.1264346837997437, + "learning_rate": 0.00012906504501745, + "loss": 1.8229, + "step": 20346 + }, + { + "epoch": 2.3739353634348386, + "grad_norm": 1.1078500747680664, + "learning_rate": 0.00012905009871642993, + "loss": 2.0084, + "step": 20347 + }, + { + "epoch": 2.3740520359351303, + "grad_norm": 1.1336842775344849, + "learning_rate": 0.00012903515264814564, + "loss": 2.0256, + "step": 20348 + }, + { + "epoch": 2.374168708435422, + "grad_norm": 1.166262149810791, + "learning_rate": 0.00012902020681275194, + "loss": 1.9284, + "step": 20349 + }, + { + "epoch": 2.3742853809357136, + "grad_norm": 1.262880563735962, + "learning_rate": 0.00012900526121040396, + "loss": 2.0766, + "step": 20350 + }, + { + "epoch": 2.3744020534360053, + "grad_norm": 1.4764186143875122, + "learning_rate": 0.00012899031584125644, + "loss": 2.0304, + "step": 20351 + }, + { + "epoch": 2.374518725936297, + "grad_norm": 1.0844683647155762, + "learning_rate": 0.00012897537070546454, + "loss": 2.0565, + "step": 20352 + }, + { + "epoch": 2.3746353984365887, + "grad_norm": 1.0196400880813599, + "learning_rate": 0.00012896042580318298, + "loss": 1.7927, + "step": 20353 + }, + { + "epoch": 2.3747520709368803, + "grad_norm": 1.1687978506088257, + "learning_rate": 0.00012894548113456686, + "loss": 2.0282, + "step": 20354 + }, + { + "epoch": 2.374868743437172, + "grad_norm": 1.211622953414917, + "learning_rate": 0.000128930536699771, + "loss": 2.0098, + "step": 20355 + }, + { + "epoch": 2.3749854159374637, + "grad_norm": 1.1739859580993652, + "learning_rate": 0.00012891559249895033, + "loss": 1.9938, + "step": 20356 + }, + { + "epoch": 2.3751020884377554, + "grad_norm": 1.175785779953003, + "learning_rate": 0.00012890064853225987, + "loss": 2.0827, + "step": 20357 + }, + { + "epoch": 2.375218760938047, + "grad_norm": 1.3505553007125854, + "learning_rate": 0.00012888570479985443, + "loss": 1.9603, + "step": 20358 + }, + { + "epoch": 2.3753354334383388, + "grad_norm": 1.235298991203308, + "learning_rate": 0.00012887076130188905, + "loss": 2.0161, + "step": 20359 + }, + { + "epoch": 2.3754521059386304, + "grad_norm": 1.125864863395691, + "learning_rate": 0.00012885581803851853, + "loss": 2.0621, + "step": 20360 + }, + { + "epoch": 2.375568778438922, + "grad_norm": 1.2600700855255127, + "learning_rate": 0.00012884087500989786, + "loss": 1.9484, + "step": 20361 + }, + { + "epoch": 2.375685450939214, + "grad_norm": 1.0332200527191162, + "learning_rate": 0.00012882593221618188, + "loss": 2.0736, + "step": 20362 + }, + { + "epoch": 2.3758021234395055, + "grad_norm": 1.1784683465957642, + "learning_rate": 0.00012881098965752564, + "loss": 1.9241, + "step": 20363 + }, + { + "epoch": 2.375918795939797, + "grad_norm": 1.054884433746338, + "learning_rate": 0.00012879604733408392, + "loss": 1.8681, + "step": 20364 + }, + { + "epoch": 2.376035468440089, + "grad_norm": 1.1303019523620605, + "learning_rate": 0.00012878110524601168, + "loss": 1.8736, + "step": 20365 + }, + { + "epoch": 2.3761521409403805, + "grad_norm": 1.0460379123687744, + "learning_rate": 0.00012876616339346374, + "loss": 1.8705, + "step": 20366 + }, + { + "epoch": 2.376268813440672, + "grad_norm": 1.0965145826339722, + "learning_rate": 0.00012875122177659514, + "loss": 1.898, + "step": 20367 + }, + { + "epoch": 2.376385485940964, + "grad_norm": 1.0900342464447021, + "learning_rate": 0.00012873628039556065, + "loss": 2.0948, + "step": 20368 + }, + { + "epoch": 2.3765021584412556, + "grad_norm": 1.0709458589553833, + "learning_rate": 0.00012872133925051517, + "loss": 1.8492, + "step": 20369 + }, + { + "epoch": 2.3766188309415472, + "grad_norm": 1.2417936325073242, + "learning_rate": 0.00012870639834161376, + "loss": 2.1077, + "step": 20370 + }, + { + "epoch": 2.376735503441839, + "grad_norm": 1.0219154357910156, + "learning_rate": 0.00012869145766901115, + "loss": 1.9614, + "step": 20371 + }, + { + "epoch": 2.3768521759421306, + "grad_norm": 1.1193885803222656, + "learning_rate": 0.00012867651723286227, + "loss": 2.0698, + "step": 20372 + }, + { + "epoch": 2.3769688484424223, + "grad_norm": 1.1007463932037354, + "learning_rate": 0.00012866157703332202, + "loss": 1.792, + "step": 20373 + }, + { + "epoch": 2.377085520942714, + "grad_norm": 1.2292338609695435, + "learning_rate": 0.00012864663707054527, + "loss": 2.0036, + "step": 20374 + }, + { + "epoch": 2.3772021934430057, + "grad_norm": 1.2752279043197632, + "learning_rate": 0.0001286316973446869, + "loss": 2.0813, + "step": 20375 + }, + { + "epoch": 2.3773188659432973, + "grad_norm": 1.2898390293121338, + "learning_rate": 0.00012861675785590178, + "loss": 2.0621, + "step": 20376 + }, + { + "epoch": 2.377435538443589, + "grad_norm": 1.0570034980773926, + "learning_rate": 0.0001286018186043448, + "loss": 1.9563, + "step": 20377 + }, + { + "epoch": 2.3775522109438807, + "grad_norm": 1.1389236450195312, + "learning_rate": 0.00012858687959017083, + "loss": 1.92, + "step": 20378 + }, + { + "epoch": 2.3776688834441724, + "grad_norm": 1.0944916009902954, + "learning_rate": 0.00012857194081353473, + "loss": 2.0421, + "step": 20379 + }, + { + "epoch": 2.377785555944464, + "grad_norm": 1.252322793006897, + "learning_rate": 0.00012855700227459144, + "loss": 2.0816, + "step": 20380 + }, + { + "epoch": 2.3779022284447557, + "grad_norm": 1.1437609195709229, + "learning_rate": 0.0001285420639734957, + "loss": 1.8546, + "step": 20381 + }, + { + "epoch": 2.3780189009450474, + "grad_norm": 1.1828664541244507, + "learning_rate": 0.00012852712591040246, + "loss": 2.046, + "step": 20382 + }, + { + "epoch": 2.378135573445339, + "grad_norm": 1.0774494409561157, + "learning_rate": 0.0001285121880854666, + "loss": 1.9646, + "step": 20383 + }, + { + "epoch": 2.378252245945631, + "grad_norm": 1.267208218574524, + "learning_rate": 0.00012849725049884287, + "loss": 2.0477, + "step": 20384 + }, + { + "epoch": 2.3783689184459225, + "grad_norm": 1.272911548614502, + "learning_rate": 0.00012848231315068628, + "loss": 2.1221, + "step": 20385 + }, + { + "epoch": 2.378485590946214, + "grad_norm": 1.2706555128097534, + "learning_rate": 0.00012846737604115153, + "loss": 2.1724, + "step": 20386 + }, + { + "epoch": 2.378602263446506, + "grad_norm": 1.0845482349395752, + "learning_rate": 0.0001284524391703936, + "loss": 2.0469, + "step": 20387 + }, + { + "epoch": 2.3787189359467975, + "grad_norm": 0.9874774217605591, + "learning_rate": 0.00012843750253856723, + "loss": 2.0079, + "step": 20388 + }, + { + "epoch": 2.378835608447089, + "grad_norm": 1.1747838258743286, + "learning_rate": 0.00012842256614582737, + "loss": 1.945, + "step": 20389 + }, + { + "epoch": 2.378952280947381, + "grad_norm": 1.2821357250213623, + "learning_rate": 0.0001284076299923288, + "loss": 1.9584, + "step": 20390 + }, + { + "epoch": 2.3790689534476726, + "grad_norm": 1.0082390308380127, + "learning_rate": 0.00012839269407822638, + "loss": 1.9863, + "step": 20391 + }, + { + "epoch": 2.3791856259479642, + "grad_norm": 1.1293818950653076, + "learning_rate": 0.0001283777584036749, + "loss": 1.9176, + "step": 20392 + }, + { + "epoch": 2.379302298448256, + "grad_norm": 1.0814632177352905, + "learning_rate": 0.00012836282296882932, + "loss": 1.8702, + "step": 20393 + }, + { + "epoch": 2.3794189709485476, + "grad_norm": 1.114858865737915, + "learning_rate": 0.0001283478877738443, + "loss": 2.1761, + "step": 20394 + }, + { + "epoch": 2.3795356434488393, + "grad_norm": 1.1107691526412964, + "learning_rate": 0.00012833295281887476, + "loss": 1.9566, + "step": 20395 + }, + { + "epoch": 2.379652315949131, + "grad_norm": 1.1655765771865845, + "learning_rate": 0.0001283180181040756, + "loss": 1.8891, + "step": 20396 + }, + { + "epoch": 2.3797689884494226, + "grad_norm": 1.0877888202667236, + "learning_rate": 0.00012830308362960158, + "loss": 2.0104, + "step": 20397 + }, + { + "epoch": 2.3798856609497143, + "grad_norm": 1.1015257835388184, + "learning_rate": 0.0001282881493956075, + "loss": 1.9998, + "step": 20398 + }, + { + "epoch": 2.380002333450006, + "grad_norm": 1.0604665279388428, + "learning_rate": 0.0001282732154022482, + "loss": 1.9914, + "step": 20399 + }, + { + "epoch": 2.3801190059502977, + "grad_norm": 1.16843843460083, + "learning_rate": 0.00012825828164967856, + "loss": 1.9161, + "step": 20400 + }, + { + "epoch": 2.3802356784505894, + "grad_norm": 1.2976702451705933, + "learning_rate": 0.00012824334813805327, + "loss": 2.2507, + "step": 20401 + }, + { + "epoch": 2.380352350950881, + "grad_norm": 1.1787742376327515, + "learning_rate": 0.00012822841486752727, + "loss": 1.9111, + "step": 20402 + }, + { + "epoch": 2.3804690234511727, + "grad_norm": 1.0674103498458862, + "learning_rate": 0.00012821348183825526, + "loss": 1.9884, + "step": 20403 + }, + { + "epoch": 2.3805856959514644, + "grad_norm": 1.2703580856323242, + "learning_rate": 0.00012819854905039217, + "loss": 2.1228, + "step": 20404 + }, + { + "epoch": 2.380702368451756, + "grad_norm": 1.1662085056304932, + "learning_rate": 0.00012818361650409268, + "loss": 1.9588, + "step": 20405 + }, + { + "epoch": 2.3808190409520478, + "grad_norm": 1.0879732370376587, + "learning_rate": 0.00012816868419951173, + "loss": 1.9807, + "step": 20406 + }, + { + "epoch": 2.3809357134523395, + "grad_norm": 1.4602676630020142, + "learning_rate": 0.000128153752136804, + "loss": 2.091, + "step": 20407 + }, + { + "epoch": 2.381052385952631, + "grad_norm": 1.108110785484314, + "learning_rate": 0.00012813882031612433, + "loss": 1.9432, + "step": 20408 + }, + { + "epoch": 2.381169058452923, + "grad_norm": 1.0930227041244507, + "learning_rate": 0.00012812388873762758, + "loss": 1.9437, + "step": 20409 + }, + { + "epoch": 2.3812857309532145, + "grad_norm": 1.142573356628418, + "learning_rate": 0.00012810895740146842, + "loss": 2.0552, + "step": 20410 + }, + { + "epoch": 2.381402403453506, + "grad_norm": 1.3110278844833374, + "learning_rate": 0.0001280940263078018, + "loss": 1.9923, + "step": 20411 + }, + { + "epoch": 2.381519075953798, + "grad_norm": 1.0165497064590454, + "learning_rate": 0.00012807909545678234, + "loss": 1.8671, + "step": 20412 + }, + { + "epoch": 2.3816357484540895, + "grad_norm": 1.4493229389190674, + "learning_rate": 0.00012806416484856497, + "loss": 2.1436, + "step": 20413 + }, + { + "epoch": 2.381752420954381, + "grad_norm": 1.0056663751602173, + "learning_rate": 0.00012804923448330436, + "loss": 1.8029, + "step": 20414 + }, + { + "epoch": 2.381869093454673, + "grad_norm": 1.1275078058242798, + "learning_rate": 0.00012803430436115535, + "loss": 2.0738, + "step": 20415 + }, + { + "epoch": 2.3819857659549646, + "grad_norm": 1.0816259384155273, + "learning_rate": 0.00012801937448227274, + "loss": 1.7589, + "step": 20416 + }, + { + "epoch": 2.3821024384552563, + "grad_norm": 1.2567296028137207, + "learning_rate": 0.0001280044448468113, + "loss": 2.0556, + "step": 20417 + }, + { + "epoch": 2.382219110955548, + "grad_norm": 1.062757134437561, + "learning_rate": 0.0001279895154549257, + "loss": 1.9391, + "step": 20418 + }, + { + "epoch": 2.3823357834558396, + "grad_norm": 1.1616630554199219, + "learning_rate": 0.00012797458630677084, + "loss": 2.0701, + "step": 20419 + }, + { + "epoch": 2.3824524559561313, + "grad_norm": 1.1556090116500854, + "learning_rate": 0.0001279596574025014, + "loss": 1.9566, + "step": 20420 + }, + { + "epoch": 2.382569128456423, + "grad_norm": 1.1899311542510986, + "learning_rate": 0.0001279447287422722, + "loss": 2.0711, + "step": 20421 + }, + { + "epoch": 2.3826858009567147, + "grad_norm": 1.068611741065979, + "learning_rate": 0.00012792980032623805, + "loss": 1.8377, + "step": 20422 + }, + { + "epoch": 2.3828024734570064, + "grad_norm": 1.2731181383132935, + "learning_rate": 0.00012791487215455362, + "loss": 2.0128, + "step": 20423 + }, + { + "epoch": 2.382919145957298, + "grad_norm": 1.141066312789917, + "learning_rate": 0.00012789994422737373, + "loss": 2.0403, + "step": 20424 + }, + { + "epoch": 2.3830358184575897, + "grad_norm": 1.1625263690948486, + "learning_rate": 0.00012788501654485305, + "loss": 1.9176, + "step": 20425 + }, + { + "epoch": 2.3831524909578814, + "grad_norm": 1.0015599727630615, + "learning_rate": 0.00012787008910714645, + "loss": 1.8477, + "step": 20426 + }, + { + "epoch": 2.383269163458173, + "grad_norm": 1.0293569564819336, + "learning_rate": 0.00012785516191440857, + "loss": 1.804, + "step": 20427 + }, + { + "epoch": 2.3833858359584648, + "grad_norm": 1.2376253604888916, + "learning_rate": 0.00012784023496679425, + "loss": 1.9528, + "step": 20428 + }, + { + "epoch": 2.3835025084587564, + "grad_norm": 1.1321078538894653, + "learning_rate": 0.00012782530826445818, + "loss": 1.89, + "step": 20429 + }, + { + "epoch": 2.383619180959048, + "grad_norm": 1.047217607498169, + "learning_rate": 0.00012781038180755515, + "loss": 1.8557, + "step": 20430 + }, + { + "epoch": 2.38373585345934, + "grad_norm": 1.0725539922714233, + "learning_rate": 0.00012779545559623982, + "loss": 2.0567, + "step": 20431 + }, + { + "epoch": 2.3838525259596315, + "grad_norm": 1.1649935245513916, + "learning_rate": 0.00012778052963066704, + "loss": 1.9255, + "step": 20432 + }, + { + "epoch": 2.383969198459923, + "grad_norm": 1.219519853591919, + "learning_rate": 0.00012776560391099143, + "loss": 2.2015, + "step": 20433 + }, + { + "epoch": 2.384085870960215, + "grad_norm": 1.2239561080932617, + "learning_rate": 0.00012775067843736777, + "loss": 1.9625, + "step": 20434 + }, + { + "epoch": 2.3842025434605065, + "grad_norm": 1.1118783950805664, + "learning_rate": 0.0001277357532099509, + "loss": 2.0432, + "step": 20435 + }, + { + "epoch": 2.384319215960798, + "grad_norm": 1.032832384109497, + "learning_rate": 0.00012772082822889536, + "loss": 1.8288, + "step": 20436 + }, + { + "epoch": 2.38443588846109, + "grad_norm": 1.121557354927063, + "learning_rate": 0.000127705903494356, + "loss": 1.9463, + "step": 20437 + }, + { + "epoch": 2.3845525609613816, + "grad_norm": 1.245110034942627, + "learning_rate": 0.00012769097900648747, + "loss": 1.876, + "step": 20438 + }, + { + "epoch": 2.3846692334616733, + "grad_norm": 1.2677083015441895, + "learning_rate": 0.0001276760547654446, + "loss": 1.9942, + "step": 20439 + }, + { + "epoch": 2.384785905961965, + "grad_norm": 1.2594465017318726, + "learning_rate": 0.00012766113077138197, + "loss": 2.0104, + "step": 20440 + }, + { + "epoch": 2.3849025784622566, + "grad_norm": 1.3607887029647827, + "learning_rate": 0.00012764620702445441, + "loss": 2.0965, + "step": 20441 + }, + { + "epoch": 2.3850192509625483, + "grad_norm": 1.3286761045455933, + "learning_rate": 0.00012763128352481656, + "loss": 2.0583, + "step": 20442 + }, + { + "epoch": 2.38513592346284, + "grad_norm": 1.1575019359588623, + "learning_rate": 0.00012761636027262317, + "loss": 1.9174, + "step": 20443 + }, + { + "epoch": 2.3852525959631317, + "grad_norm": 1.3790031671524048, + "learning_rate": 0.0001276014372680289, + "loss": 2.0539, + "step": 20444 + }, + { + "epoch": 2.3853692684634233, + "grad_norm": 1.1422916650772095, + "learning_rate": 0.00012758651451118854, + "loss": 1.8745, + "step": 20445 + }, + { + "epoch": 2.385485940963715, + "grad_norm": 1.2156528234481812, + "learning_rate": 0.0001275715920022567, + "loss": 2.0589, + "step": 20446 + }, + { + "epoch": 2.3856026134640067, + "grad_norm": 1.1073923110961914, + "learning_rate": 0.00012755666974138808, + "loss": 1.7944, + "step": 20447 + }, + { + "epoch": 2.3857192859642984, + "grad_norm": 1.1872127056121826, + "learning_rate": 0.0001275417477287375, + "loss": 2.0514, + "step": 20448 + }, + { + "epoch": 2.38583595846459, + "grad_norm": 1.2461761236190796, + "learning_rate": 0.00012752682596445955, + "loss": 2.0792, + "step": 20449 + }, + { + "epoch": 2.3859526309648817, + "grad_norm": 1.2474982738494873, + "learning_rate": 0.00012751190444870898, + "loss": 1.9371, + "step": 20450 + }, + { + "epoch": 2.3860693034651734, + "grad_norm": 1.1165934801101685, + "learning_rate": 0.00012749698318164035, + "loss": 1.9083, + "step": 20451 + }, + { + "epoch": 2.386185975965465, + "grad_norm": 1.2473727464675903, + "learning_rate": 0.00012748206216340853, + "loss": 1.9914, + "step": 20452 + }, + { + "epoch": 2.386302648465757, + "grad_norm": 1.3373253345489502, + "learning_rate": 0.00012746714139416805, + "loss": 2.0892, + "step": 20453 + }, + { + "epoch": 2.3864193209660485, + "grad_norm": 1.190386176109314, + "learning_rate": 0.00012745222087407374, + "loss": 2.0369, + "step": 20454 + }, + { + "epoch": 2.38653599346634, + "grad_norm": 1.2464728355407715, + "learning_rate": 0.0001274373006032801, + "loss": 1.8486, + "step": 20455 + }, + { + "epoch": 2.386652665966632, + "grad_norm": 1.1836436986923218, + "learning_rate": 0.000127422380581942, + "loss": 1.8794, + "step": 20456 + }, + { + "epoch": 2.3867693384669235, + "grad_norm": 1.2680033445358276, + "learning_rate": 0.00012740746081021394, + "loss": 2.0021, + "step": 20457 + }, + { + "epoch": 2.386886010967215, + "grad_norm": 1.117671251296997, + "learning_rate": 0.00012739254128825076, + "loss": 1.8298, + "step": 20458 + }, + { + "epoch": 2.387002683467507, + "grad_norm": 1.2211891412734985, + "learning_rate": 0.00012737762201620696, + "loss": 1.9231, + "step": 20459 + }, + { + "epoch": 2.3871193559677986, + "grad_norm": 1.2341773509979248, + "learning_rate": 0.00012736270299423732, + "loss": 2.0054, + "step": 20460 + }, + { + "epoch": 2.3872360284680902, + "grad_norm": 1.1026027202606201, + "learning_rate": 0.0001273477842224965, + "loss": 2.0582, + "step": 20461 + }, + { + "epoch": 2.387352700968382, + "grad_norm": 1.0748989582061768, + "learning_rate": 0.00012733286570113905, + "loss": 1.9504, + "step": 20462 + }, + { + "epoch": 2.3874693734686736, + "grad_norm": 1.175215482711792, + "learning_rate": 0.0001273179474303198, + "loss": 1.9579, + "step": 20463 + }, + { + "epoch": 2.3875860459689653, + "grad_norm": 1.0567201375961304, + "learning_rate": 0.00012730302941019325, + "loss": 1.9514, + "step": 20464 + }, + { + "epoch": 2.387702718469257, + "grad_norm": 1.0607848167419434, + "learning_rate": 0.00012728811164091418, + "loss": 1.9919, + "step": 20465 + }, + { + "epoch": 2.3878193909695486, + "grad_norm": 1.0820192098617554, + "learning_rate": 0.00012727319412263717, + "loss": 2.0157, + "step": 20466 + }, + { + "epoch": 2.3879360634698403, + "grad_norm": 1.1251825094223022, + "learning_rate": 0.00012725827685551686, + "loss": 1.925, + "step": 20467 + }, + { + "epoch": 2.388052735970132, + "grad_norm": 1.182987928390503, + "learning_rate": 0.00012724335983970792, + "loss": 1.9338, + "step": 20468 + }, + { + "epoch": 2.3881694084704237, + "grad_norm": 1.095353126525879, + "learning_rate": 0.000127228443075365, + "loss": 2.0071, + "step": 20469 + }, + { + "epoch": 2.3882860809707154, + "grad_norm": 1.189852237701416, + "learning_rate": 0.00012721352656264268, + "loss": 1.8012, + "step": 20470 + }, + { + "epoch": 2.388402753471007, + "grad_norm": 1.2365797758102417, + "learning_rate": 0.00012719861030169573, + "loss": 2.0895, + "step": 20471 + }, + { + "epoch": 2.3885194259712987, + "grad_norm": 1.1865301132202148, + "learning_rate": 0.0001271836942926786, + "loss": 1.8271, + "step": 20472 + }, + { + "epoch": 2.3886360984715904, + "grad_norm": 1.3097528219223022, + "learning_rate": 0.00012716877853574608, + "loss": 1.9715, + "step": 20473 + }, + { + "epoch": 2.388752770971882, + "grad_norm": 1.0241239070892334, + "learning_rate": 0.00012715386303105278, + "loss": 1.8377, + "step": 20474 + }, + { + "epoch": 2.388869443472174, + "grad_norm": 1.091996431350708, + "learning_rate": 0.00012713894777875327, + "loss": 1.9029, + "step": 20475 + }, + { + "epoch": 2.3889861159724655, + "grad_norm": 1.297080159187317, + "learning_rate": 0.00012712403277900222, + "loss": 2.1514, + "step": 20476 + }, + { + "epoch": 2.389102788472757, + "grad_norm": 1.231829285621643, + "learning_rate": 0.00012710911803195417, + "loss": 2.0615, + "step": 20477 + }, + { + "epoch": 2.389219460973049, + "grad_norm": 1.2755236625671387, + "learning_rate": 0.00012709420353776386, + "loss": 1.8686, + "step": 20478 + }, + { + "epoch": 2.3893361334733405, + "grad_norm": 1.092706561088562, + "learning_rate": 0.00012707928929658579, + "loss": 2.0145, + "step": 20479 + }, + { + "epoch": 2.389452805973632, + "grad_norm": 1.1485942602157593, + "learning_rate": 0.00012706437530857467, + "loss": 2.002, + "step": 20480 + }, + { + "epoch": 2.389569478473924, + "grad_norm": 1.1652989387512207, + "learning_rate": 0.00012704946157388503, + "loss": 1.9512, + "step": 20481 + }, + { + "epoch": 2.3896861509742156, + "grad_norm": 0.9342384338378906, + "learning_rate": 0.00012703454809267158, + "loss": 1.8481, + "step": 20482 + }, + { + "epoch": 2.3898028234745072, + "grad_norm": 1.1060012578964233, + "learning_rate": 0.0001270196348650888, + "loss": 2.0153, + "step": 20483 + }, + { + "epoch": 2.389919495974799, + "grad_norm": 1.3407397270202637, + "learning_rate": 0.00012700472189129143, + "loss": 1.9764, + "step": 20484 + }, + { + "epoch": 2.3900361684750906, + "grad_norm": 1.0946789979934692, + "learning_rate": 0.00012698980917143397, + "loss": 1.9649, + "step": 20485 + }, + { + "epoch": 2.3901528409753823, + "grad_norm": 1.227973222732544, + "learning_rate": 0.00012697489670567103, + "loss": 1.9288, + "step": 20486 + }, + { + "epoch": 2.390269513475674, + "grad_norm": 0.8999006152153015, + "learning_rate": 0.0001269599844941573, + "loss": 1.8264, + "step": 20487 + }, + { + "epoch": 2.3903861859759656, + "grad_norm": 1.3930705785751343, + "learning_rate": 0.00012694507253704722, + "loss": 2.0288, + "step": 20488 + }, + { + "epoch": 2.3905028584762573, + "grad_norm": 1.0807690620422363, + "learning_rate": 0.00012693016083449552, + "loss": 1.9922, + "step": 20489 + }, + { + "epoch": 2.390619530976549, + "grad_norm": 1.1888976097106934, + "learning_rate": 0.00012691524938665668, + "loss": 1.9925, + "step": 20490 + }, + { + "epoch": 2.3907362034768407, + "grad_norm": 1.133825421333313, + "learning_rate": 0.0001269003381936854, + "loss": 2.0321, + "step": 20491 + }, + { + "epoch": 2.3908528759771324, + "grad_norm": 1.0092542171478271, + "learning_rate": 0.00012688542725573614, + "loss": 1.8632, + "step": 20492 + }, + { + "epoch": 2.390969548477424, + "grad_norm": 1.1453276872634888, + "learning_rate": 0.00012687051657296356, + "loss": 1.9392, + "step": 20493 + }, + { + "epoch": 2.3910862209777157, + "grad_norm": 1.123120665550232, + "learning_rate": 0.00012685560614552222, + "loss": 1.9874, + "step": 20494 + }, + { + "epoch": 2.3912028934780074, + "grad_norm": 1.391799807548523, + "learning_rate": 0.0001268406959735667, + "loss": 1.9602, + "step": 20495 + }, + { + "epoch": 2.391319565978299, + "grad_norm": 1.2633661031723022, + "learning_rate": 0.00012682578605725152, + "loss": 2.2173, + "step": 20496 + }, + { + "epoch": 2.3914362384785908, + "grad_norm": 1.2215203046798706, + "learning_rate": 0.00012681087639673135, + "loss": 1.9368, + "step": 20497 + }, + { + "epoch": 2.3915529109788825, + "grad_norm": 1.1186771392822266, + "learning_rate": 0.00012679596699216063, + "loss": 2.0708, + "step": 20498 + }, + { + "epoch": 2.391669583479174, + "grad_norm": 1.234980821609497, + "learning_rate": 0.00012678105784369402, + "loss": 2.0662, + "step": 20499 + }, + { + "epoch": 2.391786255979466, + "grad_norm": 1.1950422525405884, + "learning_rate": 0.00012676614895148607, + "loss": 2.0019, + "step": 20500 + }, + { + "epoch": 2.3919029284797575, + "grad_norm": 1.1705820560455322, + "learning_rate": 0.00012675124031569132, + "loss": 2.1865, + "step": 20501 + }, + { + "epoch": 2.392019600980049, + "grad_norm": 1.2008851766586304, + "learning_rate": 0.00012673633193646436, + "loss": 2.0312, + "step": 20502 + }, + { + "epoch": 2.392136273480341, + "grad_norm": 1.307753086090088, + "learning_rate": 0.00012672142381395964, + "loss": 2.0264, + "step": 20503 + }, + { + "epoch": 2.3922529459806325, + "grad_norm": 1.1782881021499634, + "learning_rate": 0.00012670651594833186, + "loss": 1.9928, + "step": 20504 + }, + { + "epoch": 2.392369618480924, + "grad_norm": 1.0797761678695679, + "learning_rate": 0.00012669160833973542, + "loss": 2.1577, + "step": 20505 + }, + { + "epoch": 2.392486290981216, + "grad_norm": 1.3390144109725952, + "learning_rate": 0.000126676700988325, + "loss": 1.7852, + "step": 20506 + }, + { + "epoch": 2.3926029634815076, + "grad_norm": 1.1931567192077637, + "learning_rate": 0.00012666179389425505, + "loss": 2.0874, + "step": 20507 + }, + { + "epoch": 2.3927196359817993, + "grad_norm": 1.073498249053955, + "learning_rate": 0.00012664688705768017, + "loss": 1.9909, + "step": 20508 + }, + { + "epoch": 2.392836308482091, + "grad_norm": 1.1521036624908447, + "learning_rate": 0.00012663198047875484, + "loss": 2.0362, + "step": 20509 + }, + { + "epoch": 2.3929529809823826, + "grad_norm": 1.1353039741516113, + "learning_rate": 0.00012661707415763363, + "loss": 2.0334, + "step": 20510 + }, + { + "epoch": 2.3930696534826743, + "grad_norm": 1.2937532663345337, + "learning_rate": 0.00012660216809447106, + "loss": 1.9002, + "step": 20511 + }, + { + "epoch": 2.393186325982966, + "grad_norm": 1.1887483596801758, + "learning_rate": 0.00012658726228942165, + "loss": 1.8528, + "step": 20512 + }, + { + "epoch": 2.3933029984832577, + "grad_norm": 1.2056487798690796, + "learning_rate": 0.00012657235674264, + "loss": 2.0169, + "step": 20513 + }, + { + "epoch": 2.3934196709835494, + "grad_norm": 1.057424783706665, + "learning_rate": 0.0001265574514542805, + "loss": 2.0032, + "step": 20514 + }, + { + "epoch": 2.393536343483841, + "grad_norm": 1.0587538480758667, + "learning_rate": 0.0001265425464244978, + "loss": 2.017, + "step": 20515 + }, + { + "epoch": 2.3936530159841327, + "grad_norm": 1.1744927167892456, + "learning_rate": 0.00012652764165344634, + "loss": 2.047, + "step": 20516 + }, + { + "epoch": 2.3937696884844244, + "grad_norm": 1.0890566110610962, + "learning_rate": 0.0001265127371412807, + "loss": 1.9045, + "step": 20517 + }, + { + "epoch": 2.393886360984716, + "grad_norm": 1.1890138387680054, + "learning_rate": 0.00012649783288815535, + "loss": 1.8926, + "step": 20518 + }, + { + "epoch": 2.3940030334850078, + "grad_norm": 1.1149531602859497, + "learning_rate": 0.0001264829288942248, + "loss": 1.6895, + "step": 20519 + }, + { + "epoch": 2.3941197059852994, + "grad_norm": 1.0034189224243164, + "learning_rate": 0.00012646802515964354, + "loss": 1.7871, + "step": 20520 + }, + { + "epoch": 2.394236378485591, + "grad_norm": 1.226499080657959, + "learning_rate": 0.00012645312168456615, + "loss": 1.9735, + "step": 20521 + }, + { + "epoch": 2.394353050985883, + "grad_norm": 1.1463491916656494, + "learning_rate": 0.00012643821846914703, + "loss": 2.0245, + "step": 20522 + }, + { + "epoch": 2.3944697234861745, + "grad_norm": 1.3696283102035522, + "learning_rate": 0.0001264233155135408, + "loss": 2.0074, + "step": 20523 + }, + { + "epoch": 2.394586395986466, + "grad_norm": 1.2246990203857422, + "learning_rate": 0.0001264084128179018, + "loss": 2.015, + "step": 20524 + }, + { + "epoch": 2.394703068486758, + "grad_norm": 1.109273076057434, + "learning_rate": 0.00012639351038238466, + "loss": 1.9439, + "step": 20525 + }, + { + "epoch": 2.3948197409870495, + "grad_norm": 1.1421358585357666, + "learning_rate": 0.00012637860820714384, + "loss": 2.0471, + "step": 20526 + }, + { + "epoch": 2.394936413487341, + "grad_norm": 1.0748751163482666, + "learning_rate": 0.00012636370629233383, + "loss": 1.7801, + "step": 20527 + }, + { + "epoch": 2.395053085987633, + "grad_norm": 1.4200907945632935, + "learning_rate": 0.0001263488046381091, + "loss": 2.0127, + "step": 20528 + }, + { + "epoch": 2.3951697584879246, + "grad_norm": 1.1517430543899536, + "learning_rate": 0.0001263339032446241, + "loss": 1.8821, + "step": 20529 + }, + { + "epoch": 2.3952864309882163, + "grad_norm": 1.0567007064819336, + "learning_rate": 0.0001263190021120334, + "loss": 1.7505, + "step": 20530 + }, + { + "epoch": 2.395403103488508, + "grad_norm": 1.1128287315368652, + "learning_rate": 0.00012630410124049135, + "loss": 2.117, + "step": 20531 + }, + { + "epoch": 2.3955197759887996, + "grad_norm": 1.0682597160339355, + "learning_rate": 0.0001262892006301526, + "loss": 2.141, + "step": 20532 + }, + { + "epoch": 2.3956364484890913, + "grad_norm": 1.223745346069336, + "learning_rate": 0.00012627430028117143, + "loss": 2.1005, + "step": 20533 + }, + { + "epoch": 2.395753120989383, + "grad_norm": 1.2568250894546509, + "learning_rate": 0.0001262594001937025, + "loss": 2.2168, + "step": 20534 + }, + { + "epoch": 2.3958697934896747, + "grad_norm": 1.1706956624984741, + "learning_rate": 0.00012624450036790013, + "loss": 1.9423, + "step": 20535 + }, + { + "epoch": 2.3959864659899663, + "grad_norm": 1.2667165994644165, + "learning_rate": 0.00012622960080391885, + "loss": 1.781, + "step": 20536 + }, + { + "epoch": 2.396103138490258, + "grad_norm": 1.375542163848877, + "learning_rate": 0.00012621470150191313, + "loss": 2.1999, + "step": 20537 + }, + { + "epoch": 2.3962198109905497, + "grad_norm": 1.088990330696106, + "learning_rate": 0.00012619980246203732, + "loss": 1.6565, + "step": 20538 + }, + { + "epoch": 2.3963364834908414, + "grad_norm": 1.2364025115966797, + "learning_rate": 0.00012618490368444606, + "loss": 1.9952, + "step": 20539 + }, + { + "epoch": 2.396453155991133, + "grad_norm": 1.1897072792053223, + "learning_rate": 0.00012617000516929367, + "loss": 1.9718, + "step": 20540 + }, + { + "epoch": 2.3965698284914247, + "grad_norm": 1.148808240890503, + "learning_rate": 0.0001261551069167347, + "loss": 1.9565, + "step": 20541 + }, + { + "epoch": 2.3966865009917164, + "grad_norm": 1.078036904335022, + "learning_rate": 0.00012614020892692346, + "loss": 1.9318, + "step": 20542 + }, + { + "epoch": 2.396803173492008, + "grad_norm": 0.9362889528274536, + "learning_rate": 0.00012612531120001453, + "loss": 1.9892, + "step": 20543 + }, + { + "epoch": 2.3969198459923, + "grad_norm": 1.0802189111709595, + "learning_rate": 0.0001261104137361623, + "loss": 1.9457, + "step": 20544 + }, + { + "epoch": 2.3970365184925915, + "grad_norm": 1.3490440845489502, + "learning_rate": 0.00012609551653552126, + "loss": 2.04, + "step": 20545 + }, + { + "epoch": 2.397153190992883, + "grad_norm": 1.1960511207580566, + "learning_rate": 0.0001260806195982457, + "loss": 1.9242, + "step": 20546 + }, + { + "epoch": 2.397269863493175, + "grad_norm": 1.121222734451294, + "learning_rate": 0.0001260657229244902, + "loss": 1.8503, + "step": 20547 + }, + { + "epoch": 2.3973865359934665, + "grad_norm": 1.101967453956604, + "learning_rate": 0.0001260508265144091, + "loss": 2.1835, + "step": 20548 + }, + { + "epoch": 2.397503208493758, + "grad_norm": 1.1435891389846802, + "learning_rate": 0.00012603593036815695, + "loss": 2.0081, + "step": 20549 + }, + { + "epoch": 2.39761988099405, + "grad_norm": 1.021273136138916, + "learning_rate": 0.00012602103448588803, + "loss": 1.776, + "step": 20550 + }, + { + "epoch": 2.3977365534943416, + "grad_norm": 1.1089167594909668, + "learning_rate": 0.00012600613886775688, + "loss": 1.9566, + "step": 20551 + }, + { + "epoch": 2.3978532259946332, + "grad_norm": 1.1392347812652588, + "learning_rate": 0.00012599124351391785, + "loss": 1.887, + "step": 20552 + }, + { + "epoch": 2.397969898494925, + "grad_norm": 1.11027193069458, + "learning_rate": 0.0001259763484245254, + "loss": 1.912, + "step": 20553 + }, + { + "epoch": 2.3980865709952166, + "grad_norm": 1.1377224922180176, + "learning_rate": 0.00012596145359973398, + "loss": 1.9931, + "step": 20554 + }, + { + "epoch": 2.3982032434955083, + "grad_norm": 1.3188855648040771, + "learning_rate": 0.00012594655903969785, + "loss": 1.9135, + "step": 20555 + }, + { + "epoch": 2.3983199159958, + "grad_norm": 1.287774682044983, + "learning_rate": 0.00012593166474457164, + "loss": 2.1536, + "step": 20556 + }, + { + "epoch": 2.3984365884960916, + "grad_norm": 1.2004623413085938, + "learning_rate": 0.00012591677071450955, + "loss": 1.9654, + "step": 20557 + }, + { + "epoch": 2.3985532609963833, + "grad_norm": 1.0557520389556885, + "learning_rate": 0.00012590187694966616, + "loss": 1.796, + "step": 20558 + }, + { + "epoch": 2.398669933496675, + "grad_norm": 1.0761804580688477, + "learning_rate": 0.0001258869834501957, + "loss": 1.8819, + "step": 20559 + }, + { + "epoch": 2.3987866059969667, + "grad_norm": 1.1993945837020874, + "learning_rate": 0.00012587209021625273, + "loss": 1.9666, + "step": 20560 + }, + { + "epoch": 2.3989032784972584, + "grad_norm": 1.2429254055023193, + "learning_rate": 0.00012585719724799154, + "loss": 2.0559, + "step": 20561 + }, + { + "epoch": 2.39901995099755, + "grad_norm": 1.1259979009628296, + "learning_rate": 0.00012584230454556656, + "loss": 1.9586, + "step": 20562 + }, + { + "epoch": 2.3991366234978417, + "grad_norm": 1.4290965795516968, + "learning_rate": 0.00012582741210913218, + "loss": 2.0299, + "step": 20563 + }, + { + "epoch": 2.3992532959981334, + "grad_norm": 1.0800988674163818, + "learning_rate": 0.00012581251993884276, + "loss": 1.9789, + "step": 20564 + }, + { + "epoch": 2.399369968498425, + "grad_norm": 1.0891664028167725, + "learning_rate": 0.0001257976280348528, + "loss": 1.8636, + "step": 20565 + }, + { + "epoch": 2.399486640998717, + "grad_norm": 1.111728549003601, + "learning_rate": 0.0001257827363973165, + "loss": 1.9339, + "step": 20566 + }, + { + "epoch": 2.3996033134990085, + "grad_norm": 1.2431919574737549, + "learning_rate": 0.00012576784502638843, + "loss": 1.8851, + "step": 20567 + }, + { + "epoch": 2.3997199859993, + "grad_norm": 1.025785207748413, + "learning_rate": 0.00012575295392222279, + "loss": 1.7455, + "step": 20568 + }, + { + "epoch": 2.399836658499592, + "grad_norm": 1.21553635597229, + "learning_rate": 0.00012573806308497412, + "loss": 2.0206, + "step": 20569 + }, + { + "epoch": 2.3999533309998835, + "grad_norm": 1.1836813688278198, + "learning_rate": 0.00012572317251479667, + "loss": 2.1069, + "step": 20570 + }, + { + "epoch": 2.400070003500175, + "grad_norm": 1.2091176509857178, + "learning_rate": 0.00012570828221184486, + "loss": 1.9393, + "step": 20571 + }, + { + "epoch": 2.400186676000467, + "grad_norm": 1.1792086362838745, + "learning_rate": 0.000125693392176273, + "loss": 2.0103, + "step": 20572 + }, + { + "epoch": 2.4003033485007585, + "grad_norm": 1.3215123414993286, + "learning_rate": 0.00012567850240823556, + "loss": 1.9644, + "step": 20573 + }, + { + "epoch": 2.4004200210010502, + "grad_norm": 1.3836172819137573, + "learning_rate": 0.00012566361290788678, + "loss": 2.1335, + "step": 20574 + }, + { + "epoch": 2.400536693501342, + "grad_norm": 1.2690380811691284, + "learning_rate": 0.00012564872367538114, + "loss": 1.9279, + "step": 20575 + }, + { + "epoch": 2.4006533660016336, + "grad_norm": 1.1374212503433228, + "learning_rate": 0.00012563383471087287, + "loss": 2.1426, + "step": 20576 + }, + { + "epoch": 2.4007700385019253, + "grad_norm": 1.1421927213668823, + "learning_rate": 0.0001256189460145164, + "loss": 2.0075, + "step": 20577 + }, + { + "epoch": 2.400886711002217, + "grad_norm": 1.2041466236114502, + "learning_rate": 0.00012560405758646608, + "loss": 1.8675, + "step": 20578 + }, + { + "epoch": 2.4010033835025086, + "grad_norm": 1.2788968086242676, + "learning_rate": 0.00012558916942687624, + "loss": 2.0755, + "step": 20579 + }, + { + "epoch": 2.4011200560028003, + "grad_norm": 1.2493747472763062, + "learning_rate": 0.00012557428153590127, + "loss": 2.076, + "step": 20580 + }, + { + "epoch": 2.401236728503092, + "grad_norm": 1.266627311706543, + "learning_rate": 0.00012555939391369536, + "loss": 2.0134, + "step": 20581 + }, + { + "epoch": 2.4013534010033837, + "grad_norm": 1.2832037210464478, + "learning_rate": 0.00012554450656041306, + "loss": 1.9758, + "step": 20582 + }, + { + "epoch": 2.4014700735036754, + "grad_norm": 1.120251178741455, + "learning_rate": 0.0001255296194762085, + "loss": 1.9494, + "step": 20583 + }, + { + "epoch": 2.401586746003967, + "grad_norm": 1.3019558191299438, + "learning_rate": 0.00012551473266123622, + "loss": 1.9307, + "step": 20584 + }, + { + "epoch": 2.4017034185042587, + "grad_norm": 1.2902107238769531, + "learning_rate": 0.00012549984611565036, + "loss": 2.0153, + "step": 20585 + }, + { + "epoch": 2.4018200910045504, + "grad_norm": 1.0477510690689087, + "learning_rate": 0.0001254849598396054, + "loss": 1.9056, + "step": 20586 + }, + { + "epoch": 2.401936763504842, + "grad_norm": 1.1092219352722168, + "learning_rate": 0.00012547007383325554, + "loss": 1.9745, + "step": 20587 + }, + { + "epoch": 2.4020534360051338, + "grad_norm": 1.1513829231262207, + "learning_rate": 0.00012545518809675517, + "loss": 2.05, + "step": 20588 + }, + { + "epoch": 2.4021701085054254, + "grad_norm": 1.0843937397003174, + "learning_rate": 0.00012544030263025863, + "loss": 2.1232, + "step": 20589 + }, + { + "epoch": 2.402286781005717, + "grad_norm": 1.1919898986816406, + "learning_rate": 0.0001254254174339201, + "loss": 1.9743, + "step": 20590 + }, + { + "epoch": 2.402403453506009, + "grad_norm": 1.3063970804214478, + "learning_rate": 0.00012541053250789412, + "loss": 1.8428, + "step": 20591 + }, + { + "epoch": 2.4025201260063005, + "grad_norm": 1.0396794080734253, + "learning_rate": 0.0001253956478523348, + "loss": 1.8336, + "step": 20592 + }, + { + "epoch": 2.402636798506592, + "grad_norm": 1.229239821434021, + "learning_rate": 0.00012538076346739658, + "loss": 2.1686, + "step": 20593 + }, + { + "epoch": 2.402753471006884, + "grad_norm": 1.150572657585144, + "learning_rate": 0.00012536587935323365, + "loss": 2.0915, + "step": 20594 + }, + { + "epoch": 2.4028701435071755, + "grad_norm": 1.2279361486434937, + "learning_rate": 0.00012535099551000037, + "loss": 2.135, + "step": 20595 + }, + { + "epoch": 2.402986816007467, + "grad_norm": 1.1873120069503784, + "learning_rate": 0.00012533611193785106, + "loss": 1.861, + "step": 20596 + }, + { + "epoch": 2.403103488507759, + "grad_norm": 1.1527372598648071, + "learning_rate": 0.00012532122863694002, + "loss": 2.0886, + "step": 20597 + }, + { + "epoch": 2.4032201610080506, + "grad_norm": 1.1490743160247803, + "learning_rate": 0.00012530634560742148, + "loss": 1.9742, + "step": 20598 + }, + { + "epoch": 2.4033368335083423, + "grad_norm": 1.0409084558486938, + "learning_rate": 0.00012529146284944982, + "loss": 1.9619, + "step": 20599 + }, + { + "epoch": 2.403453506008634, + "grad_norm": 1.0000104904174805, + "learning_rate": 0.0001252765803631792, + "loss": 2.0559, + "step": 20600 + }, + { + "epoch": 2.4035701785089256, + "grad_norm": 1.325563669204712, + "learning_rate": 0.00012526169814876405, + "loss": 1.9866, + "step": 20601 + }, + { + "epoch": 2.4036868510092173, + "grad_norm": 1.149349331855774, + "learning_rate": 0.00012524681620635852, + "loss": 2.0779, + "step": 20602 + }, + { + "epoch": 2.403803523509509, + "grad_norm": 0.9956232309341431, + "learning_rate": 0.000125231934536117, + "loss": 1.7484, + "step": 20603 + }, + { + "epoch": 2.4039201960098007, + "grad_norm": 1.3458983898162842, + "learning_rate": 0.00012521705313819375, + "loss": 2.0244, + "step": 20604 + }, + { + "epoch": 2.4040368685100924, + "grad_norm": 1.0446953773498535, + "learning_rate": 0.00012520217201274293, + "loss": 1.9282, + "step": 20605 + }, + { + "epoch": 2.404153541010384, + "grad_norm": 1.1759308576583862, + "learning_rate": 0.00012518729115991898, + "loss": 1.8493, + "step": 20606 + }, + { + "epoch": 2.4042702135106757, + "grad_norm": 1.3427536487579346, + "learning_rate": 0.000125172410579876, + "loss": 1.9113, + "step": 20607 + }, + { + "epoch": 2.4043868860109674, + "grad_norm": 1.1254702806472778, + "learning_rate": 0.00012515753027276844, + "loss": 2.0208, + "step": 20608 + }, + { + "epoch": 2.404503558511259, + "grad_norm": 1.1797010898590088, + "learning_rate": 0.00012514265023875036, + "loss": 1.9767, + "step": 20609 + }, + { + "epoch": 2.4046202310115508, + "grad_norm": 1.0478013753890991, + "learning_rate": 0.0001251277704779762, + "loss": 1.9532, + "step": 20610 + }, + { + "epoch": 2.4047369035118424, + "grad_norm": 1.1875455379486084, + "learning_rate": 0.0001251128909906001, + "loss": 1.939, + "step": 20611 + }, + { + "epoch": 2.404853576012134, + "grad_norm": 1.1455928087234497, + "learning_rate": 0.00012509801177677639, + "loss": 1.9421, + "step": 20612 + }, + { + "epoch": 2.404970248512426, + "grad_norm": 1.334681510925293, + "learning_rate": 0.00012508313283665924, + "loss": 2.1155, + "step": 20613 + }, + { + "epoch": 2.4050869210127175, + "grad_norm": 1.2515586614608765, + "learning_rate": 0.000125068254170403, + "loss": 2.107, + "step": 20614 + }, + { + "epoch": 2.405203593513009, + "grad_norm": 1.0001753568649292, + "learning_rate": 0.00012505337577816179, + "loss": 1.7901, + "step": 20615 + }, + { + "epoch": 2.405320266013301, + "grad_norm": 1.1067087650299072, + "learning_rate": 0.00012503849766008993, + "loss": 1.9464, + "step": 20616 + }, + { + "epoch": 2.4054369385135925, + "grad_norm": 0.9866013526916504, + "learning_rate": 0.00012502361981634168, + "loss": 1.8349, + "step": 20617 + }, + { + "epoch": 2.405553611013884, + "grad_norm": 1.2923353910446167, + "learning_rate": 0.0001250087422470712, + "loss": 2.0171, + "step": 20618 + }, + { + "epoch": 2.405670283514176, + "grad_norm": 1.1173912286758423, + "learning_rate": 0.00012499386495243284, + "loss": 1.9929, + "step": 20619 + }, + { + "epoch": 2.4057869560144676, + "grad_norm": 1.2203006744384766, + "learning_rate": 0.00012497898793258075, + "loss": 1.9754, + "step": 20620 + }, + { + "epoch": 2.4059036285147593, + "grad_norm": 1.2937933206558228, + "learning_rate": 0.00012496411118766916, + "loss": 2.1348, + "step": 20621 + }, + { + "epoch": 2.406020301015051, + "grad_norm": 1.13163423538208, + "learning_rate": 0.0001249492347178523, + "loss": 1.9652, + "step": 20622 + }, + { + "epoch": 2.4061369735153426, + "grad_norm": 1.0793675184249878, + "learning_rate": 0.00012493435852328443, + "loss": 1.9672, + "step": 20623 + }, + { + "epoch": 2.4062536460156343, + "grad_norm": 1.371256709098816, + "learning_rate": 0.00012491948260411967, + "loss": 2.0836, + "step": 20624 + }, + { + "epoch": 2.406370318515926, + "grad_norm": 1.1937791109085083, + "learning_rate": 0.00012490460696051238, + "loss": 1.8662, + "step": 20625 + }, + { + "epoch": 2.4064869910162177, + "grad_norm": 1.1987775564193726, + "learning_rate": 0.00012488973159261664, + "loss": 1.884, + "step": 20626 + }, + { + "epoch": 2.4066036635165093, + "grad_norm": 1.005579948425293, + "learning_rate": 0.0001248748565005868, + "loss": 1.9491, + "step": 20627 + }, + { + "epoch": 2.406720336016801, + "grad_norm": 1.2912781238555908, + "learning_rate": 0.0001248599816845769, + "loss": 1.9727, + "step": 20628 + }, + { + "epoch": 2.4068370085170927, + "grad_norm": 1.1184687614440918, + "learning_rate": 0.00012484510714474127, + "loss": 1.9747, + "step": 20629 + }, + { + "epoch": 2.4069536810173844, + "grad_norm": 1.276435375213623, + "learning_rate": 0.00012483023288123414, + "loss": 2.027, + "step": 20630 + }, + { + "epoch": 2.407070353517676, + "grad_norm": 1.1789510250091553, + "learning_rate": 0.00012481535889420956, + "loss": 1.8472, + "step": 20631 + }, + { + "epoch": 2.4071870260179677, + "grad_norm": 1.257003903388977, + "learning_rate": 0.00012480048518382188, + "loss": 2.0577, + "step": 20632 + }, + { + "epoch": 2.4073036985182594, + "grad_norm": 1.2572309970855713, + "learning_rate": 0.0001247856117502252, + "loss": 1.9767, + "step": 20633 + }, + { + "epoch": 2.407420371018551, + "grad_norm": 1.2088377475738525, + "learning_rate": 0.00012477073859357377, + "loss": 2.0201, + "step": 20634 + }, + { + "epoch": 2.407537043518843, + "grad_norm": 1.232889175415039, + "learning_rate": 0.0001247558657140217, + "loss": 2.0921, + "step": 20635 + }, + { + "epoch": 2.4076537160191345, + "grad_norm": 1.1974161863327026, + "learning_rate": 0.00012474099311172328, + "loss": 1.8882, + "step": 20636 + }, + { + "epoch": 2.407770388519426, + "grad_norm": 1.1813764572143555, + "learning_rate": 0.00012472612078683257, + "loss": 2.035, + "step": 20637 + }, + { + "epoch": 2.407887061019718, + "grad_norm": 1.0145469903945923, + "learning_rate": 0.0001247112487395039, + "loss": 1.9365, + "step": 20638 + }, + { + "epoch": 2.4080037335200095, + "grad_norm": 1.1643191576004028, + "learning_rate": 0.00012469637696989132, + "loss": 2.0554, + "step": 20639 + }, + { + "epoch": 2.408120406020301, + "grad_norm": 1.0966309309005737, + "learning_rate": 0.00012468150547814908, + "loss": 2.0411, + "step": 20640 + }, + { + "epoch": 2.408237078520593, + "grad_norm": 1.1945762634277344, + "learning_rate": 0.0001246666342644313, + "loss": 1.9436, + "step": 20641 + }, + { + "epoch": 2.4083537510208846, + "grad_norm": 1.0279157161712646, + "learning_rate": 0.00012465176332889213, + "loss": 2.0683, + "step": 20642 + }, + { + "epoch": 2.4084704235211762, + "grad_norm": 1.1063693761825562, + "learning_rate": 0.00012463689267168586, + "loss": 2.0772, + "step": 20643 + }, + { + "epoch": 2.408587096021468, + "grad_norm": 0.91367506980896, + "learning_rate": 0.00012462202229296648, + "loss": 1.6411, + "step": 20644 + }, + { + "epoch": 2.4087037685217596, + "grad_norm": 1.127648949623108, + "learning_rate": 0.0001246071521928883, + "loss": 1.7309, + "step": 20645 + }, + { + "epoch": 2.4088204410220513, + "grad_norm": 1.0944074392318726, + "learning_rate": 0.00012459228237160538, + "loss": 1.9074, + "step": 20646 + }, + { + "epoch": 2.408937113522343, + "grad_norm": 1.071966528892517, + "learning_rate": 0.00012457741282927192, + "loss": 2.0767, + "step": 20647 + }, + { + "epoch": 2.4090537860226346, + "grad_norm": 1.1724541187286377, + "learning_rate": 0.00012456254356604205, + "loss": 1.9327, + "step": 20648 + }, + { + "epoch": 2.4091704585229263, + "grad_norm": 1.2137322425842285, + "learning_rate": 0.00012454767458206997, + "loss": 1.8579, + "step": 20649 + }, + { + "epoch": 2.409287131023218, + "grad_norm": 1.2813079357147217, + "learning_rate": 0.0001245328058775097, + "loss": 1.9565, + "step": 20650 + }, + { + "epoch": 2.4094038035235097, + "grad_norm": 1.1086148023605347, + "learning_rate": 0.00012451793745251556, + "loss": 1.8173, + "step": 20651 + }, + { + "epoch": 2.4095204760238014, + "grad_norm": 1.1114453077316284, + "learning_rate": 0.0001245030693072415, + "loss": 1.9156, + "step": 20652 + }, + { + "epoch": 2.409637148524093, + "grad_norm": 1.2784943580627441, + "learning_rate": 0.00012448820144184182, + "loss": 2.0741, + "step": 20653 + }, + { + "epoch": 2.4097538210243847, + "grad_norm": 1.2752313613891602, + "learning_rate": 0.00012447333385647052, + "loss": 1.9601, + "step": 20654 + }, + { + "epoch": 2.4098704935246764, + "grad_norm": 1.553205132484436, + "learning_rate": 0.00012445846655128183, + "loss": 2.0978, + "step": 20655 + }, + { + "epoch": 2.409987166024968, + "grad_norm": 1.15110182762146, + "learning_rate": 0.00012444359952642987, + "loss": 1.9344, + "step": 20656 + }, + { + "epoch": 2.41010383852526, + "grad_norm": 1.0281401872634888, + "learning_rate": 0.00012442873278206866, + "loss": 1.85, + "step": 20657 + }, + { + "epoch": 2.4102205110255515, + "grad_norm": 1.3337163925170898, + "learning_rate": 0.00012441386631835248, + "loss": 1.957, + "step": 20658 + }, + { + "epoch": 2.410337183525843, + "grad_norm": 1.1488643884658813, + "learning_rate": 0.00012439900013543528, + "loss": 1.9508, + "step": 20659 + }, + { + "epoch": 2.410453856026135, + "grad_norm": 1.037665605545044, + "learning_rate": 0.00012438413423347135, + "loss": 1.8804, + "step": 20660 + }, + { + "epoch": 2.4105705285264265, + "grad_norm": 1.0909614562988281, + "learning_rate": 0.00012436926861261464, + "loss": 1.934, + "step": 20661 + }, + { + "epoch": 2.410687201026718, + "grad_norm": 1.2322317361831665, + "learning_rate": 0.0001243544032730194, + "loss": 2.0127, + "step": 20662 + }, + { + "epoch": 2.41080387352701, + "grad_norm": 1.1228816509246826, + "learning_rate": 0.00012433953821483967, + "loss": 1.7199, + "step": 20663 + }, + { + "epoch": 2.4109205460273015, + "grad_norm": 1.0240919589996338, + "learning_rate": 0.00012432467343822954, + "loss": 1.802, + "step": 20664 + }, + { + "epoch": 2.4110372185275932, + "grad_norm": 1.1266570091247559, + "learning_rate": 0.0001243098089433431, + "loss": 1.8892, + "step": 20665 + }, + { + "epoch": 2.411153891027885, + "grad_norm": 1.1832022666931152, + "learning_rate": 0.00012429494473033457, + "loss": 2.0293, + "step": 20666 + }, + { + "epoch": 2.4112705635281766, + "grad_norm": 1.26087486743927, + "learning_rate": 0.00012428008079935787, + "loss": 2.0491, + "step": 20667 + }, + { + "epoch": 2.4113872360284683, + "grad_norm": 1.275475263595581, + "learning_rate": 0.00012426521715056723, + "loss": 2.0999, + "step": 20668 + }, + { + "epoch": 2.41150390852876, + "grad_norm": 1.467361330986023, + "learning_rate": 0.00012425035378411662, + "loss": 2.2091, + "step": 20669 + }, + { + "epoch": 2.4116205810290516, + "grad_norm": 1.355093240737915, + "learning_rate": 0.00012423549070016018, + "loss": 2.2623, + "step": 20670 + }, + { + "epoch": 2.4117372535293433, + "grad_norm": 1.3308417797088623, + "learning_rate": 0.0001242206278988521, + "loss": 2.3164, + "step": 20671 + }, + { + "epoch": 2.411853926029635, + "grad_norm": 1.1750319004058838, + "learning_rate": 0.00012420576538034632, + "loss": 1.8488, + "step": 20672 + }, + { + "epoch": 2.4119705985299267, + "grad_norm": 1.2410073280334473, + "learning_rate": 0.00012419090314479703, + "loss": 2.0543, + "step": 20673 + }, + { + "epoch": 2.4120872710302184, + "grad_norm": 1.1875299215316772, + "learning_rate": 0.00012417604119235816, + "loss": 1.7228, + "step": 20674 + }, + { + "epoch": 2.41220394353051, + "grad_norm": 1.2735483646392822, + "learning_rate": 0.00012416117952318394, + "loss": 1.9569, + "step": 20675 + }, + { + "epoch": 2.4123206160308017, + "grad_norm": 1.0799977779388428, + "learning_rate": 0.00012414631813742827, + "loss": 1.9826, + "step": 20676 + }, + { + "epoch": 2.4124372885310934, + "grad_norm": 1.4597057104110718, + "learning_rate": 0.0001241314570352454, + "loss": 2.1723, + "step": 20677 + }, + { + "epoch": 2.412553961031385, + "grad_norm": 1.1125861406326294, + "learning_rate": 0.00012411659621678923, + "loss": 2.1162, + "step": 20678 + }, + { + "epoch": 2.4126706335316768, + "grad_norm": 1.216668725013733, + "learning_rate": 0.00012410173568221396, + "loss": 2.0676, + "step": 20679 + }, + { + "epoch": 2.4127873060319684, + "grad_norm": 1.2972638607025146, + "learning_rate": 0.00012408687543167353, + "loss": 2.084, + "step": 20680 + }, + { + "epoch": 2.41290397853226, + "grad_norm": 1.1796847581863403, + "learning_rate": 0.0001240720154653221, + "loss": 2.0099, + "step": 20681 + }, + { + "epoch": 2.413020651032552, + "grad_norm": 1.1012275218963623, + "learning_rate": 0.00012405715578331364, + "loss": 1.9231, + "step": 20682 + }, + { + "epoch": 2.4131373235328435, + "grad_norm": 1.1195814609527588, + "learning_rate": 0.0001240422963858022, + "loss": 1.9417, + "step": 20683 + }, + { + "epoch": 2.413253996033135, + "grad_norm": 1.1310253143310547, + "learning_rate": 0.00012402743727294192, + "loss": 1.9802, + "step": 20684 + }, + { + "epoch": 2.413370668533427, + "grad_norm": 1.2696943283081055, + "learning_rate": 0.0001240125784448867, + "loss": 2.0212, + "step": 20685 + }, + { + "epoch": 2.4134873410337185, + "grad_norm": 1.1750565767288208, + "learning_rate": 0.00012399771990179073, + "loss": 2.1357, + "step": 20686 + }, + { + "epoch": 2.41360401353401, + "grad_norm": 1.1354213953018188, + "learning_rate": 0.00012398286164380792, + "loss": 2.139, + "step": 20687 + }, + { + "epoch": 2.413720686034302, + "grad_norm": 1.0713911056518555, + "learning_rate": 0.0001239680036710924, + "loss": 1.8331, + "step": 20688 + }, + { + "epoch": 2.4138373585345936, + "grad_norm": 1.0729058980941772, + "learning_rate": 0.00012395314598379813, + "loss": 2.0719, + "step": 20689 + }, + { + "epoch": 2.4139540310348853, + "grad_norm": 1.1534188985824585, + "learning_rate": 0.00012393828858207916, + "loss": 2.1173, + "step": 20690 + }, + { + "epoch": 2.414070703535177, + "grad_norm": 0.9190272688865662, + "learning_rate": 0.00012392343146608954, + "loss": 1.8425, + "step": 20691 + }, + { + "epoch": 2.4141873760354686, + "grad_norm": 1.089258074760437, + "learning_rate": 0.0001239085746359833, + "loss": 1.9613, + "step": 20692 + }, + { + "epoch": 2.4143040485357603, + "grad_norm": 1.306663155555725, + "learning_rate": 0.00012389371809191436, + "loss": 2.0008, + "step": 20693 + }, + { + "epoch": 2.414420721036052, + "grad_norm": 1.1757252216339111, + "learning_rate": 0.00012387886183403687, + "loss": 1.9714, + "step": 20694 + }, + { + "epoch": 2.4145373935363437, + "grad_norm": 1.2367929220199585, + "learning_rate": 0.00012386400586250471, + "loss": 2.0325, + "step": 20695 + }, + { + "epoch": 2.4146540660366353, + "grad_norm": 1.4055644273757935, + "learning_rate": 0.00012384915017747198, + "loss": 1.9832, + "step": 20696 + }, + { + "epoch": 2.414770738536927, + "grad_norm": 1.1606301069259644, + "learning_rate": 0.00012383429477909273, + "loss": 1.8126, + "step": 20697 + }, + { + "epoch": 2.4148874110372187, + "grad_norm": 1.1592615842819214, + "learning_rate": 0.0001238194396675209, + "loss": 1.9433, + "step": 20698 + }, + { + "epoch": 2.4150040835375104, + "grad_norm": 1.1320171356201172, + "learning_rate": 0.0001238045848429105, + "loss": 1.9906, + "step": 20699 + }, + { + "epoch": 2.415120756037802, + "grad_norm": 1.106603980064392, + "learning_rate": 0.00012378973030541544, + "loss": 1.7729, + "step": 20700 + }, + { + "epoch": 2.4152374285380938, + "grad_norm": 1.060214638710022, + "learning_rate": 0.00012377487605518987, + "loss": 1.879, + "step": 20701 + }, + { + "epoch": 2.4153541010383854, + "grad_norm": 1.0416321754455566, + "learning_rate": 0.00012376002209238767, + "loss": 1.8745, + "step": 20702 + }, + { + "epoch": 2.415470773538677, + "grad_norm": 1.1192210912704468, + "learning_rate": 0.00012374516841716292, + "loss": 1.8534, + "step": 20703 + }, + { + "epoch": 2.415587446038969, + "grad_norm": 0.9578922986984253, + "learning_rate": 0.00012373031502966952, + "loss": 1.8981, + "step": 20704 + }, + { + "epoch": 2.4157041185392605, + "grad_norm": 1.154555320739746, + "learning_rate": 0.0001237154619300615, + "loss": 1.9212, + "step": 20705 + }, + { + "epoch": 2.415820791039552, + "grad_norm": 1.4677926301956177, + "learning_rate": 0.0001237006091184928, + "loss": 1.8368, + "step": 20706 + }, + { + "epoch": 2.415937463539844, + "grad_norm": 1.1089340448379517, + "learning_rate": 0.0001236857565951175, + "loss": 1.8778, + "step": 20707 + }, + { + "epoch": 2.4160541360401355, + "grad_norm": 1.1947952508926392, + "learning_rate": 0.00012367090436008946, + "loss": 2.0177, + "step": 20708 + }, + { + "epoch": 2.416170808540427, + "grad_norm": 0.9949174523353577, + "learning_rate": 0.00012365605241356268, + "loss": 1.8554, + "step": 20709 + }, + { + "epoch": 2.416287481040719, + "grad_norm": 1.2205716371536255, + "learning_rate": 0.0001236412007556912, + "loss": 1.9158, + "step": 20710 + }, + { + "epoch": 2.4164041535410106, + "grad_norm": 1.2215073108673096, + "learning_rate": 0.00012362634938662884, + "loss": 1.8468, + "step": 20711 + }, + { + "epoch": 2.4165208260413023, + "grad_norm": 1.1734230518341064, + "learning_rate": 0.00012361149830652975, + "loss": 2.0189, + "step": 20712 + }, + { + "epoch": 2.416637498541594, + "grad_norm": 1.2423044443130493, + "learning_rate": 0.00012359664751554773, + "loss": 1.8916, + "step": 20713 + }, + { + "epoch": 2.4167541710418856, + "grad_norm": 1.0782967805862427, + "learning_rate": 0.0001235817970138368, + "loss": 1.8422, + "step": 20714 + }, + { + "epoch": 2.4168708435421773, + "grad_norm": 1.090207815170288, + "learning_rate": 0.00012356694680155095, + "loss": 1.9055, + "step": 20715 + }, + { + "epoch": 2.416987516042469, + "grad_norm": 1.022756814956665, + "learning_rate": 0.00012355209687884406, + "loss": 1.8138, + "step": 20716 + }, + { + "epoch": 2.4171041885427607, + "grad_norm": 1.0050506591796875, + "learning_rate": 0.00012353724724587013, + "loss": 1.935, + "step": 20717 + }, + { + "epoch": 2.4172208610430523, + "grad_norm": 1.1456354856491089, + "learning_rate": 0.00012352239790278308, + "loss": 1.7884, + "step": 20718 + }, + { + "epoch": 2.417337533543344, + "grad_norm": 1.105097770690918, + "learning_rate": 0.00012350754884973684, + "loss": 1.7476, + "step": 20719 + }, + { + "epoch": 2.4174542060436357, + "grad_norm": 1.3587682247161865, + "learning_rate": 0.00012349270008688538, + "loss": 1.9645, + "step": 20720 + }, + { + "epoch": 2.4175708785439274, + "grad_norm": 1.3215211629867554, + "learning_rate": 0.0001234778516143826, + "loss": 2.101, + "step": 20721 + }, + { + "epoch": 2.417687551044219, + "grad_norm": 1.4085692167282104, + "learning_rate": 0.00012346300343238242, + "loss": 2.0483, + "step": 20722 + }, + { + "epoch": 2.4178042235445107, + "grad_norm": 1.136364459991455, + "learning_rate": 0.00012344815554103889, + "loss": 1.9524, + "step": 20723 + }, + { + "epoch": 2.4179208960448024, + "grad_norm": 1.2303932905197144, + "learning_rate": 0.0001234333079405058, + "loss": 1.8449, + "step": 20724 + }, + { + "epoch": 2.418037568545094, + "grad_norm": 1.1312144994735718, + "learning_rate": 0.00012341846063093715, + "loss": 2.0986, + "step": 20725 + }, + { + "epoch": 2.418154241045386, + "grad_norm": 1.0719891786575317, + "learning_rate": 0.0001234036136124868, + "loss": 1.9384, + "step": 20726 + }, + { + "epoch": 2.4182709135456775, + "grad_norm": 1.1421061754226685, + "learning_rate": 0.00012338876688530872, + "loss": 2.0444, + "step": 20727 + }, + { + "epoch": 2.418387586045969, + "grad_norm": 1.1708643436431885, + "learning_rate": 0.00012337392044955674, + "loss": 2.0552, + "step": 20728 + }, + { + "epoch": 2.418504258546261, + "grad_norm": 1.2670079469680786, + "learning_rate": 0.00012335907430538492, + "loss": 1.9802, + "step": 20729 + }, + { + "epoch": 2.4186209310465525, + "grad_norm": 1.2785454988479614, + "learning_rate": 0.00012334422845294702, + "loss": 2.0238, + "step": 20730 + }, + { + "epoch": 2.418737603546844, + "grad_norm": 1.0698622465133667, + "learning_rate": 0.00012332938289239708, + "loss": 1.753, + "step": 20731 + }, + { + "epoch": 2.418854276047136, + "grad_norm": 1.2374098300933838, + "learning_rate": 0.00012331453762388884, + "loss": 2.0308, + "step": 20732 + }, + { + "epoch": 2.4189709485474276, + "grad_norm": 1.2166780233383179, + "learning_rate": 0.0001232996926475764, + "loss": 2.0655, + "step": 20733 + }, + { + "epoch": 2.4190876210477192, + "grad_norm": 1.2319549322128296, + "learning_rate": 0.00012328484796361348, + "loss": 1.9023, + "step": 20734 + }, + { + "epoch": 2.419204293548011, + "grad_norm": 1.198350191116333, + "learning_rate": 0.00012327000357215402, + "loss": 1.9784, + "step": 20735 + }, + { + "epoch": 2.4193209660483026, + "grad_norm": 1.2535256147384644, + "learning_rate": 0.000123255159473352, + "loss": 1.9777, + "step": 20736 + }, + { + "epoch": 2.4194376385485943, + "grad_norm": 1.2713638544082642, + "learning_rate": 0.00012324031566736117, + "loss": 2.1409, + "step": 20737 + }, + { + "epoch": 2.419554311048886, + "grad_norm": 1.3101584911346436, + "learning_rate": 0.0001232254721543355, + "loss": 2.064, + "step": 20738 + }, + { + "epoch": 2.4196709835491776, + "grad_norm": 1.3131829500198364, + "learning_rate": 0.00012321062893442885, + "loss": 2.0652, + "step": 20739 + }, + { + "epoch": 2.4197876560494693, + "grad_norm": 1.2593556642532349, + "learning_rate": 0.00012319578600779516, + "loss": 1.8519, + "step": 20740 + }, + { + "epoch": 2.419904328549761, + "grad_norm": 1.1472704410552979, + "learning_rate": 0.00012318094337458821, + "loss": 2.0597, + "step": 20741 + }, + { + "epoch": 2.4200210010500527, + "grad_norm": 1.1306859254837036, + "learning_rate": 0.0001231661010349619, + "loss": 1.8264, + "step": 20742 + }, + { + "epoch": 2.4201376735503444, + "grad_norm": 1.1318392753601074, + "learning_rate": 0.0001231512589890701, + "loss": 2.0282, + "step": 20743 + }, + { + "epoch": 2.420254346050636, + "grad_norm": 1.0752475261688232, + "learning_rate": 0.0001231364172370667, + "loss": 1.8687, + "step": 20744 + }, + { + "epoch": 2.4203710185509277, + "grad_norm": 1.160605549812317, + "learning_rate": 0.0001231215757791055, + "loss": 2.0891, + "step": 20745 + }, + { + "epoch": 2.4204876910512194, + "grad_norm": 1.0664697885513306, + "learning_rate": 0.0001231067346153405, + "loss": 1.9716, + "step": 20746 + }, + { + "epoch": 2.420604363551511, + "grad_norm": 1.4002227783203125, + "learning_rate": 0.0001230918937459254, + "loss": 1.803, + "step": 20747 + }, + { + "epoch": 2.4207210360518028, + "grad_norm": 1.1612426042556763, + "learning_rate": 0.0001230770531710141, + "loss": 1.9418, + "step": 20748 + }, + { + "epoch": 2.4208377085520945, + "grad_norm": 1.1370211839675903, + "learning_rate": 0.0001230622128907605, + "loss": 2.0294, + "step": 20749 + }, + { + "epoch": 2.420954381052386, + "grad_norm": 1.18899405002594, + "learning_rate": 0.0001230473729053184, + "loss": 1.9747, + "step": 20750 + }, + { + "epoch": 2.421071053552678, + "grad_norm": 1.0675891637802124, + "learning_rate": 0.00012303253321484172, + "loss": 2.0371, + "step": 20751 + }, + { + "epoch": 2.4211877260529695, + "grad_norm": 1.2043172121047974, + "learning_rate": 0.00012301769381948413, + "loss": 2.0315, + "step": 20752 + }, + { + "epoch": 2.421304398553261, + "grad_norm": 1.020006537437439, + "learning_rate": 0.00012300285471939968, + "loss": 1.951, + "step": 20753 + }, + { + "epoch": 2.421421071053553, + "grad_norm": 1.4137085676193237, + "learning_rate": 0.000122988015914742, + "loss": 2.0794, + "step": 20754 + }, + { + "epoch": 2.4215377435538445, + "grad_norm": 1.10053551197052, + "learning_rate": 0.00012297317740566513, + "loss": 1.9869, + "step": 20755 + }, + { + "epoch": 2.4216544160541362, + "grad_norm": 1.1098874807357788, + "learning_rate": 0.00012295833919232272, + "loss": 1.9024, + "step": 20756 + }, + { + "epoch": 2.421771088554428, + "grad_norm": 1.0813871622085571, + "learning_rate": 0.00012294350127486873, + "loss": 1.9949, + "step": 20757 + }, + { + "epoch": 2.4218877610547196, + "grad_norm": 1.235803484916687, + "learning_rate": 0.0001229286636534569, + "loss": 1.9343, + "step": 20758 + }, + { + "epoch": 2.4220044335550113, + "grad_norm": 1.164706826210022, + "learning_rate": 0.0001229138263282411, + "loss": 1.9922, + "step": 20759 + }, + { + "epoch": 2.422121106055303, + "grad_norm": 1.061077356338501, + "learning_rate": 0.0001228989892993751, + "loss": 2.0676, + "step": 20760 + }, + { + "epoch": 2.4222377785555946, + "grad_norm": 1.1238198280334473, + "learning_rate": 0.0001228841525670127, + "loss": 2.0119, + "step": 20761 + }, + { + "epoch": 2.4223544510558863, + "grad_norm": 1.0967175960540771, + "learning_rate": 0.0001228693161313078, + "loss": 2.0801, + "step": 20762 + }, + { + "epoch": 2.422471123556178, + "grad_norm": 1.1118175983428955, + "learning_rate": 0.0001228544799924141, + "loss": 1.9281, + "step": 20763 + }, + { + "epoch": 2.4225877960564697, + "grad_norm": 1.2026575803756714, + "learning_rate": 0.00012283964415048552, + "loss": 2.0413, + "step": 20764 + }, + { + "epoch": 2.4227044685567614, + "grad_norm": 1.0977963209152222, + "learning_rate": 0.00012282480860567575, + "loss": 1.8821, + "step": 20765 + }, + { + "epoch": 2.422821141057053, + "grad_norm": 1.1043795347213745, + "learning_rate": 0.0001228099733581387, + "loss": 2.0757, + "step": 20766 + }, + { + "epoch": 2.4229378135573447, + "grad_norm": 1.1849640607833862, + "learning_rate": 0.00012279513840802807, + "loss": 1.9495, + "step": 20767 + }, + { + "epoch": 2.4230544860576364, + "grad_norm": 1.2026196718215942, + "learning_rate": 0.0001227803037554977, + "loss": 2.0487, + "step": 20768 + }, + { + "epoch": 2.423171158557928, + "grad_norm": 1.1724014282226562, + "learning_rate": 0.00012276546940070135, + "loss": 1.8234, + "step": 20769 + }, + { + "epoch": 2.4232878310582198, + "grad_norm": 1.2369920015335083, + "learning_rate": 0.00012275063534379288, + "loss": 1.9191, + "step": 20770 + }, + { + "epoch": 2.4234045035585114, + "grad_norm": 1.3643279075622559, + "learning_rate": 0.00012273580158492591, + "loss": 1.9159, + "step": 20771 + }, + { + "epoch": 2.423521176058803, + "grad_norm": 1.1588853597640991, + "learning_rate": 0.00012272096812425442, + "loss": 1.8225, + "step": 20772 + }, + { + "epoch": 2.423637848559095, + "grad_norm": 1.2113981246948242, + "learning_rate": 0.00012270613496193203, + "loss": 2.0104, + "step": 20773 + }, + { + "epoch": 2.4237545210593865, + "grad_norm": 1.0189030170440674, + "learning_rate": 0.0001226913020981126, + "loss": 1.7012, + "step": 20774 + }, + { + "epoch": 2.423871193559678, + "grad_norm": 1.0997827053070068, + "learning_rate": 0.00012267646953294985, + "loss": 2.0063, + "step": 20775 + }, + { + "epoch": 2.42398786605997, + "grad_norm": 1.1767823696136475, + "learning_rate": 0.00012266163726659763, + "loss": 1.872, + "step": 20776 + }, + { + "epoch": 2.4241045385602615, + "grad_norm": 1.0726350545883179, + "learning_rate": 0.00012264680529920963, + "loss": 2.0127, + "step": 20777 + }, + { + "epoch": 2.424221211060553, + "grad_norm": 1.0361944437026978, + "learning_rate": 0.0001226319736309396, + "loss": 1.8028, + "step": 20778 + }, + { + "epoch": 2.424337883560845, + "grad_norm": 1.0007601976394653, + "learning_rate": 0.0001226171422619414, + "loss": 1.9803, + "step": 20779 + }, + { + "epoch": 2.4244545560611366, + "grad_norm": 1.2912037372589111, + "learning_rate": 0.00012260231119236862, + "loss": 1.895, + "step": 20780 + }, + { + "epoch": 2.4245712285614283, + "grad_norm": 1.2619189023971558, + "learning_rate": 0.00012258748042237518, + "loss": 2.0319, + "step": 20781 + }, + { + "epoch": 2.42468790106172, + "grad_norm": 1.097300410270691, + "learning_rate": 0.00012257264995211472, + "loss": 1.9549, + "step": 20782 + }, + { + "epoch": 2.4248045735620116, + "grad_norm": 1.106803059577942, + "learning_rate": 0.00012255781978174108, + "loss": 1.9025, + "step": 20783 + }, + { + "epoch": 2.4249212460623033, + "grad_norm": 1.2055444717407227, + "learning_rate": 0.00012254298991140787, + "loss": 1.7272, + "step": 20784 + }, + { + "epoch": 2.425037918562595, + "grad_norm": 1.1505460739135742, + "learning_rate": 0.00012252816034126894, + "loss": 2.0227, + "step": 20785 + }, + { + "epoch": 2.4251545910628867, + "grad_norm": 1.2184284925460815, + "learning_rate": 0.000122513331071478, + "loss": 2.1006, + "step": 20786 + }, + { + "epoch": 2.4252712635631783, + "grad_norm": 1.1059768199920654, + "learning_rate": 0.0001224985021021887, + "loss": 1.9114, + "step": 20787 + }, + { + "epoch": 2.42538793606347, + "grad_norm": 1.010094404220581, + "learning_rate": 0.00012248367343355493, + "loss": 1.8772, + "step": 20788 + }, + { + "epoch": 2.4255046085637617, + "grad_norm": 1.1953303813934326, + "learning_rate": 0.0001224688450657303, + "loss": 2.0164, + "step": 20789 + }, + { + "epoch": 2.4256212810640534, + "grad_norm": 1.1516993045806885, + "learning_rate": 0.00012245401699886857, + "loss": 2.1322, + "step": 20790 + }, + { + "epoch": 2.425737953564345, + "grad_norm": 1.141793131828308, + "learning_rate": 0.00012243918923312343, + "loss": 1.9976, + "step": 20791 + }, + { + "epoch": 2.4258546260646368, + "grad_norm": 1.1530472040176392, + "learning_rate": 0.00012242436176864867, + "loss": 1.9513, + "step": 20792 + }, + { + "epoch": 2.4259712985649284, + "grad_norm": 1.0939422845840454, + "learning_rate": 0.00012240953460559794, + "loss": 2.0439, + "step": 20793 + }, + { + "epoch": 2.42608797106522, + "grad_norm": 1.2034714221954346, + "learning_rate": 0.000122394707744125, + "loss": 2.0062, + "step": 20794 + }, + { + "epoch": 2.426204643565512, + "grad_norm": 1.177139163017273, + "learning_rate": 0.00012237988118438348, + "loss": 1.9892, + "step": 20795 + }, + { + "epoch": 2.4263213160658035, + "grad_norm": 1.0621910095214844, + "learning_rate": 0.0001223650549265272, + "loss": 2.0584, + "step": 20796 + }, + { + "epoch": 2.426437988566095, + "grad_norm": 1.349338173866272, + "learning_rate": 0.0001223502289707097, + "loss": 2.0171, + "step": 20797 + }, + { + "epoch": 2.426554661066387, + "grad_norm": 1.1747148036956787, + "learning_rate": 0.00012233540331708486, + "loss": 1.8049, + "step": 20798 + }, + { + "epoch": 2.4266713335666785, + "grad_norm": 1.2057865858078003, + "learning_rate": 0.00012232057796580624, + "loss": 1.8323, + "step": 20799 + }, + { + "epoch": 2.42678800606697, + "grad_norm": 1.3030359745025635, + "learning_rate": 0.0001223057529170276, + "loss": 2.105, + "step": 20800 + }, + { + "epoch": 2.426904678567262, + "grad_norm": 1.1094969511032104, + "learning_rate": 0.00012229092817090263, + "loss": 1.9506, + "step": 20801 + }, + { + "epoch": 2.4270213510675536, + "grad_norm": 1.2812623977661133, + "learning_rate": 0.000122276103727585, + "loss": 2.113, + "step": 20802 + }, + { + "epoch": 2.4271380235678452, + "grad_norm": 1.3703550100326538, + "learning_rate": 0.00012226127958722842, + "loss": 1.9331, + "step": 20803 + }, + { + "epoch": 2.427254696068137, + "grad_norm": 1.0082155466079712, + "learning_rate": 0.00012224645574998648, + "loss": 1.939, + "step": 20804 + }, + { + "epoch": 2.4273713685684286, + "grad_norm": 1.0994070768356323, + "learning_rate": 0.00012223163221601298, + "loss": 2.0882, + "step": 20805 + }, + { + "epoch": 2.4274880410687203, + "grad_norm": 1.0733100175857544, + "learning_rate": 0.0001222168089854615, + "loss": 1.9004, + "step": 20806 + }, + { + "epoch": 2.427604713569012, + "grad_norm": 1.0835518836975098, + "learning_rate": 0.00012220198605848576, + "loss": 1.9472, + "step": 20807 + }, + { + "epoch": 2.4277213860693037, + "grad_norm": 1.07133948802948, + "learning_rate": 0.00012218716343523938, + "loss": 2.0192, + "step": 20808 + }, + { + "epoch": 2.4278380585695953, + "grad_norm": 1.3508379459381104, + "learning_rate": 0.00012217234111587614, + "loss": 2.0601, + "step": 20809 + }, + { + "epoch": 2.427954731069887, + "grad_norm": 1.0353667736053467, + "learning_rate": 0.00012215751910054956, + "loss": 2.1294, + "step": 20810 + }, + { + "epoch": 2.4280714035701787, + "grad_norm": 1.152990698814392, + "learning_rate": 0.00012214269738941339, + "loss": 1.9206, + "step": 20811 + }, + { + "epoch": 2.4281880760704704, + "grad_norm": 1.157265543937683, + "learning_rate": 0.00012212787598262124, + "loss": 2.0228, + "step": 20812 + }, + { + "epoch": 2.428304748570762, + "grad_norm": 1.148241639137268, + "learning_rate": 0.0001221130548803267, + "loss": 1.9425, + "step": 20813 + }, + { + "epoch": 2.4284214210710537, + "grad_norm": 1.2693428993225098, + "learning_rate": 0.00012209823408268364, + "loss": 2.014, + "step": 20814 + }, + { + "epoch": 2.4285380935713454, + "grad_norm": 1.1373804807662964, + "learning_rate": 0.00012208341358984544, + "loss": 1.8071, + "step": 20815 + }, + { + "epoch": 2.428654766071637, + "grad_norm": 1.1592844724655151, + "learning_rate": 0.00012206859340196594, + "loss": 2.0095, + "step": 20816 + }, + { + "epoch": 2.428771438571929, + "grad_norm": 1.0959056615829468, + "learning_rate": 0.00012205377351919865, + "loss": 1.7307, + "step": 20817 + }, + { + "epoch": 2.4288881110722205, + "grad_norm": 1.228074073791504, + "learning_rate": 0.00012203895394169731, + "loss": 1.98, + "step": 20818 + }, + { + "epoch": 2.429004783572512, + "grad_norm": 1.2005352973937988, + "learning_rate": 0.00012202413466961548, + "loss": 1.9527, + "step": 20819 + }, + { + "epoch": 2.429121456072804, + "grad_norm": 1.133933424949646, + "learning_rate": 0.00012200931570310682, + "loss": 1.9055, + "step": 20820 + }, + { + "epoch": 2.4292381285730955, + "grad_norm": 1.111720323562622, + "learning_rate": 0.0001219944970423249, + "loss": 1.9488, + "step": 20821 + }, + { + "epoch": 2.429354801073387, + "grad_norm": 1.1947743892669678, + "learning_rate": 0.00012197967868742346, + "loss": 2.1671, + "step": 20822 + }, + { + "epoch": 2.429471473573679, + "grad_norm": 1.1755822896957397, + "learning_rate": 0.000121964860638556, + "loss": 1.9337, + "step": 20823 + }, + { + "epoch": 2.4295881460739706, + "grad_norm": 1.0967963933944702, + "learning_rate": 0.00012195004289587622, + "loss": 2.0058, + "step": 20824 + }, + { + "epoch": 2.4297048185742622, + "grad_norm": 1.2092846632003784, + "learning_rate": 0.00012193522545953765, + "loss": 2.0841, + "step": 20825 + }, + { + "epoch": 2.429821491074554, + "grad_norm": 1.1654866933822632, + "learning_rate": 0.00012192040832969399, + "loss": 1.9885, + "step": 20826 + }, + { + "epoch": 2.4299381635748456, + "grad_norm": 1.2171968221664429, + "learning_rate": 0.00012190559150649883, + "loss": 1.9719, + "step": 20827 + }, + { + "epoch": 2.4300548360751373, + "grad_norm": 1.1000784635543823, + "learning_rate": 0.00012189077499010573, + "loss": 1.9067, + "step": 20828 + }, + { + "epoch": 2.430171508575429, + "grad_norm": 1.2873731851577759, + "learning_rate": 0.00012187595878066836, + "loss": 2.0055, + "step": 20829 + }, + { + "epoch": 2.4302881810757206, + "grad_norm": 1.1513022184371948, + "learning_rate": 0.00012186114287834021, + "loss": 1.9355, + "step": 20830 + }, + { + "epoch": 2.4304048535760123, + "grad_norm": 1.1551778316497803, + "learning_rate": 0.000121846327283275, + "loss": 1.8955, + "step": 20831 + }, + { + "epoch": 2.430521526076304, + "grad_norm": 1.2247951030731201, + "learning_rate": 0.0001218315119956262, + "loss": 2.0986, + "step": 20832 + }, + { + "epoch": 2.4306381985765957, + "grad_norm": 1.3524450063705444, + "learning_rate": 0.00012181669701554752, + "loss": 1.9198, + "step": 20833 + }, + { + "epoch": 2.4307548710768874, + "grad_norm": 1.320990800857544, + "learning_rate": 0.00012180188234319241, + "loss": 2.1504, + "step": 20834 + }, + { + "epoch": 2.430871543577179, + "grad_norm": 1.2657558917999268, + "learning_rate": 0.00012178706797871462, + "loss": 2.0397, + "step": 20835 + }, + { + "epoch": 2.4309882160774707, + "grad_norm": 1.1179587841033936, + "learning_rate": 0.0001217722539222676, + "loss": 2.0021, + "step": 20836 + }, + { + "epoch": 2.4311048885777624, + "grad_norm": 1.4522637128829956, + "learning_rate": 0.00012175744017400495, + "loss": 2.0757, + "step": 20837 + }, + { + "epoch": 2.431221561078054, + "grad_norm": 1.050502896308899, + "learning_rate": 0.00012174262673408026, + "loss": 1.8984, + "step": 20838 + }, + { + "epoch": 2.4313382335783458, + "grad_norm": 1.1768606901168823, + "learning_rate": 0.00012172781360264705, + "loss": 1.9839, + "step": 20839 + }, + { + "epoch": 2.4314549060786375, + "grad_norm": 1.278135061264038, + "learning_rate": 0.00012171300077985898, + "loss": 2.0677, + "step": 20840 + }, + { + "epoch": 2.431571578578929, + "grad_norm": 1.299573540687561, + "learning_rate": 0.00012169818826586954, + "loss": 2.0385, + "step": 20841 + }, + { + "epoch": 2.431688251079221, + "grad_norm": 1.2028461694717407, + "learning_rate": 0.00012168337606083236, + "loss": 1.9901, + "step": 20842 + }, + { + "epoch": 2.4318049235795125, + "grad_norm": 1.067111849784851, + "learning_rate": 0.00012166856416490093, + "loss": 1.8831, + "step": 20843 + }, + { + "epoch": 2.431921596079804, + "grad_norm": 1.1383261680603027, + "learning_rate": 0.0001216537525782288, + "loss": 1.8617, + "step": 20844 + }, + { + "epoch": 2.432038268580096, + "grad_norm": 1.288753628730774, + "learning_rate": 0.00012163894130096956, + "loss": 2.0382, + "step": 20845 + }, + { + "epoch": 2.4321549410803875, + "grad_norm": 1.123520851135254, + "learning_rate": 0.00012162413033327675, + "loss": 1.7916, + "step": 20846 + }, + { + "epoch": 2.4322716135806792, + "grad_norm": 1.1548452377319336, + "learning_rate": 0.00012160931967530385, + "loss": 2.0616, + "step": 20847 + }, + { + "epoch": 2.432388286080971, + "grad_norm": 1.1474204063415527, + "learning_rate": 0.00012159450932720455, + "loss": 1.947, + "step": 20848 + }, + { + "epoch": 2.4325049585812626, + "grad_norm": 1.0352674722671509, + "learning_rate": 0.00012157969928913218, + "loss": 1.8053, + "step": 20849 + }, + { + "epoch": 2.4326216310815543, + "grad_norm": 1.065780758857727, + "learning_rate": 0.00012156488956124048, + "loss": 1.896, + "step": 20850 + }, + { + "epoch": 2.432738303581846, + "grad_norm": 1.1400846242904663, + "learning_rate": 0.0001215500801436828, + "loss": 1.8477, + "step": 20851 + }, + { + "epoch": 2.4328549760821376, + "grad_norm": 1.1197912693023682, + "learning_rate": 0.0001215352710366128, + "loss": 1.9546, + "step": 20852 + }, + { + "epoch": 2.4329716485824293, + "grad_norm": 1.1214303970336914, + "learning_rate": 0.000121520462240184, + "loss": 2.0821, + "step": 20853 + }, + { + "epoch": 2.433088321082721, + "grad_norm": 1.0130969285964966, + "learning_rate": 0.00012150565375454979, + "loss": 1.8917, + "step": 20854 + }, + { + "epoch": 2.4332049935830127, + "grad_norm": 1.2317110300064087, + "learning_rate": 0.00012149084557986388, + "loss": 2.0166, + "step": 20855 + }, + { + "epoch": 2.4333216660833044, + "grad_norm": 1.3531783819198608, + "learning_rate": 0.00012147603771627958, + "loss": 2.1685, + "step": 20856 + }, + { + "epoch": 2.433438338583596, + "grad_norm": 1.1781408786773682, + "learning_rate": 0.00012146123016395057, + "loss": 1.8756, + "step": 20857 + }, + { + "epoch": 2.4335550110838877, + "grad_norm": 1.2435015439987183, + "learning_rate": 0.00012144642292303026, + "loss": 2.0269, + "step": 20858 + }, + { + "epoch": 2.4336716835841794, + "grad_norm": 1.1901015043258667, + "learning_rate": 0.00012143161599367223, + "loss": 2.0349, + "step": 20859 + }, + { + "epoch": 2.433788356084471, + "grad_norm": 1.1718149185180664, + "learning_rate": 0.00012141680937602989, + "loss": 2.0099, + "step": 20860 + }, + { + "epoch": 2.4339050285847628, + "grad_norm": 1.0920486450195312, + "learning_rate": 0.00012140200307025683, + "loss": 2.0608, + "step": 20861 + }, + { + "epoch": 2.4340217010850544, + "grad_norm": 1.1110446453094482, + "learning_rate": 0.00012138719707650649, + "loss": 1.9017, + "step": 20862 + }, + { + "epoch": 2.434138373585346, + "grad_norm": 1.186893343925476, + "learning_rate": 0.00012137239139493239, + "loss": 2.0112, + "step": 20863 + }, + { + "epoch": 2.434255046085638, + "grad_norm": 1.0811935663223267, + "learning_rate": 0.00012135758602568795, + "loss": 1.9488, + "step": 20864 + }, + { + "epoch": 2.4343717185859295, + "grad_norm": 1.0579373836517334, + "learning_rate": 0.00012134278096892671, + "loss": 2.0269, + "step": 20865 + }, + { + "epoch": 2.434488391086221, + "grad_norm": 1.1452183723449707, + "learning_rate": 0.00012132797622480222, + "loss": 1.9123, + "step": 20866 + }, + { + "epoch": 2.434605063586513, + "grad_norm": 1.0493626594543457, + "learning_rate": 0.00012131317179346783, + "loss": 2.0358, + "step": 20867 + }, + { + "epoch": 2.4347217360868045, + "grad_norm": 1.2107486724853516, + "learning_rate": 0.00012129836767507713, + "loss": 1.9736, + "step": 20868 + }, + { + "epoch": 2.434838408587096, + "grad_norm": 1.1888792514801025, + "learning_rate": 0.00012128356386978352, + "loss": 1.9424, + "step": 20869 + }, + { + "epoch": 2.434955081087388, + "grad_norm": 1.4989739656448364, + "learning_rate": 0.00012126876037774049, + "loss": 2.0936, + "step": 20870 + }, + { + "epoch": 2.4350717535876796, + "grad_norm": 1.0604021549224854, + "learning_rate": 0.00012125395719910152, + "loss": 1.9167, + "step": 20871 + }, + { + "epoch": 2.4351884260879713, + "grad_norm": 1.074968695640564, + "learning_rate": 0.00012123915433402005, + "loss": 1.8849, + "step": 20872 + }, + { + "epoch": 2.435305098588263, + "grad_norm": 1.3096204996109009, + "learning_rate": 0.00012122435178264951, + "loss": 2.1152, + "step": 20873 + }, + { + "epoch": 2.4354217710885546, + "grad_norm": 1.1171913146972656, + "learning_rate": 0.00012120954954514345, + "loss": 1.9759, + "step": 20874 + }, + { + "epoch": 2.4355384435888463, + "grad_norm": 1.159767985343933, + "learning_rate": 0.00012119474762165522, + "loss": 1.972, + "step": 20875 + }, + { + "epoch": 2.435655116089138, + "grad_norm": 1.1582257747650146, + "learning_rate": 0.00012117994601233837, + "loss": 2.2005, + "step": 20876 + }, + { + "epoch": 2.4357717885894297, + "grad_norm": 1.1843825578689575, + "learning_rate": 0.00012116514471734623, + "loss": 2.1238, + "step": 20877 + }, + { + "epoch": 2.4358884610897213, + "grad_norm": 1.1473603248596191, + "learning_rate": 0.00012115034373683235, + "loss": 2.0102, + "step": 20878 + }, + { + "epoch": 2.436005133590013, + "grad_norm": 1.374295711517334, + "learning_rate": 0.00012113554307095015, + "loss": 2.0317, + "step": 20879 + }, + { + "epoch": 2.4361218060903047, + "grad_norm": 1.102724313735962, + "learning_rate": 0.00012112074271985299, + "loss": 1.9431, + "step": 20880 + }, + { + "epoch": 2.4362384785905964, + "grad_norm": 1.2927476167678833, + "learning_rate": 0.00012110594268369442, + "loss": 1.9522, + "step": 20881 + }, + { + "epoch": 2.436355151090888, + "grad_norm": 0.9984577894210815, + "learning_rate": 0.00012109114296262775, + "loss": 1.8824, + "step": 20882 + }, + { + "epoch": 2.4364718235911798, + "grad_norm": 1.254465103149414, + "learning_rate": 0.00012107634355680654, + "loss": 1.8886, + "step": 20883 + }, + { + "epoch": 2.4365884960914714, + "grad_norm": 1.1611384153366089, + "learning_rate": 0.00012106154446638406, + "loss": 2.0514, + "step": 20884 + }, + { + "epoch": 2.436705168591763, + "grad_norm": 1.0857799053192139, + "learning_rate": 0.00012104674569151389, + "loss": 2.0436, + "step": 20885 + }, + { + "epoch": 2.436821841092055, + "grad_norm": 0.981340229511261, + "learning_rate": 0.00012103194723234928, + "loss": 1.8406, + "step": 20886 + }, + { + "epoch": 2.4369385135923465, + "grad_norm": 1.1064889430999756, + "learning_rate": 0.00012101714908904383, + "loss": 1.778, + "step": 20887 + }, + { + "epoch": 2.437055186092638, + "grad_norm": 1.1382237672805786, + "learning_rate": 0.0001210023512617508, + "loss": 2.0036, + "step": 20888 + }, + { + "epoch": 2.43717185859293, + "grad_norm": 1.1823599338531494, + "learning_rate": 0.00012098755375062368, + "loss": 1.9673, + "step": 20889 + }, + { + "epoch": 2.4372885310932215, + "grad_norm": 1.0171234607696533, + "learning_rate": 0.00012097275655581582, + "loss": 2.0808, + "step": 20890 + }, + { + "epoch": 2.437405203593513, + "grad_norm": 1.2467507123947144, + "learning_rate": 0.0001209579596774806, + "loss": 2.1844, + "step": 20891 + }, + { + "epoch": 2.437521876093805, + "grad_norm": 1.2188317775726318, + "learning_rate": 0.00012094316311577155, + "loss": 2.0241, + "step": 20892 + }, + { + "epoch": 2.437638548594096, + "grad_norm": 0.9603626132011414, + "learning_rate": 0.00012092836687084188, + "loss": 1.9054, + "step": 20893 + }, + { + "epoch": 2.4377552210943882, + "grad_norm": 1.0471208095550537, + "learning_rate": 0.0001209135709428452, + "loss": 1.655, + "step": 20894 + }, + { + "epoch": 2.4378718935946795, + "grad_norm": 1.172646164894104, + "learning_rate": 0.00012089877533193472, + "loss": 2.0264, + "step": 20895 + }, + { + "epoch": 2.4379885660949716, + "grad_norm": 1.250509262084961, + "learning_rate": 0.00012088398003826388, + "loss": 2.0951, + "step": 20896 + }, + { + "epoch": 2.438105238595263, + "grad_norm": 1.2485136985778809, + "learning_rate": 0.00012086918506198606, + "loss": 1.9867, + "step": 20897 + }, + { + "epoch": 2.438221911095555, + "grad_norm": 1.1419421434402466, + "learning_rate": 0.00012085439040325467, + "loss": 2.0354, + "step": 20898 + }, + { + "epoch": 2.438338583595846, + "grad_norm": 1.142090082168579, + "learning_rate": 0.000120839596062223, + "loss": 1.9616, + "step": 20899 + }, + { + "epoch": 2.4384552560961383, + "grad_norm": 1.255602240562439, + "learning_rate": 0.00012082480203904453, + "loss": 2.0448, + "step": 20900 + }, + { + "epoch": 2.4385719285964296, + "grad_norm": 1.122961163520813, + "learning_rate": 0.00012081000833387254, + "loss": 1.9869, + "step": 20901 + }, + { + "epoch": 2.4386886010967217, + "grad_norm": 1.208411455154419, + "learning_rate": 0.00012079521494686047, + "loss": 1.8505, + "step": 20902 + }, + { + "epoch": 2.438805273597013, + "grad_norm": 1.154518961906433, + "learning_rate": 0.00012078042187816159, + "loss": 2.0194, + "step": 20903 + }, + { + "epoch": 2.438921946097305, + "grad_norm": 1.1485207080841064, + "learning_rate": 0.00012076562912792932, + "loss": 2.0953, + "step": 20904 + }, + { + "epoch": 2.4390386185975963, + "grad_norm": 1.016505241394043, + "learning_rate": 0.00012075083669631704, + "loss": 1.8687, + "step": 20905 + }, + { + "epoch": 2.4391552910978884, + "grad_norm": 1.2150156497955322, + "learning_rate": 0.00012073604458347801, + "loss": 2.1839, + "step": 20906 + }, + { + "epoch": 2.4392719635981797, + "grad_norm": 1.1331645250320435, + "learning_rate": 0.00012072125278956571, + "loss": 2.0065, + "step": 20907 + }, + { + "epoch": 2.439388636098472, + "grad_norm": 1.0206632614135742, + "learning_rate": 0.00012070646131473334, + "loss": 2.0114, + "step": 20908 + }, + { + "epoch": 2.439505308598763, + "grad_norm": 1.069709300994873, + "learning_rate": 0.00012069167015913436, + "loss": 1.8911, + "step": 20909 + }, + { + "epoch": 2.439621981099055, + "grad_norm": 1.2164387702941895, + "learning_rate": 0.000120676879322922, + "loss": 2.0287, + "step": 20910 + }, + { + "epoch": 2.4397386535993464, + "grad_norm": 1.2544851303100586, + "learning_rate": 0.00012066208880624974, + "loss": 2.0039, + "step": 20911 + }, + { + "epoch": 2.4398553260996385, + "grad_norm": 1.0938410758972168, + "learning_rate": 0.00012064729860927073, + "loss": 1.7913, + "step": 20912 + }, + { + "epoch": 2.4399719985999297, + "grad_norm": 1.334446907043457, + "learning_rate": 0.00012063250873213847, + "loss": 2.0397, + "step": 20913 + }, + { + "epoch": 2.440088671100222, + "grad_norm": 1.3389312028884888, + "learning_rate": 0.00012061771917500618, + "loss": 2.1201, + "step": 20914 + }, + { + "epoch": 2.440205343600513, + "grad_norm": 1.034993052482605, + "learning_rate": 0.00012060292993802724, + "loss": 1.855, + "step": 20915 + }, + { + "epoch": 2.4403220161008052, + "grad_norm": 1.1901979446411133, + "learning_rate": 0.00012058814102135488, + "loss": 1.8902, + "step": 20916 + }, + { + "epoch": 2.4404386886010965, + "grad_norm": 1.3317627906799316, + "learning_rate": 0.00012057335242514247, + "loss": 2.0707, + "step": 20917 + }, + { + "epoch": 2.4405553611013886, + "grad_norm": 1.1567559242248535, + "learning_rate": 0.0001205585641495434, + "loss": 1.9633, + "step": 20918 + }, + { + "epoch": 2.44067203360168, + "grad_norm": 1.2719014883041382, + "learning_rate": 0.00012054377619471083, + "loss": 1.9513, + "step": 20919 + }, + { + "epoch": 2.440788706101972, + "grad_norm": 1.4509941339492798, + "learning_rate": 0.00012052898856079821, + "loss": 2.0066, + "step": 20920 + }, + { + "epoch": 2.440905378602263, + "grad_norm": 1.1850334405899048, + "learning_rate": 0.00012051420124795875, + "loss": 2.0856, + "step": 20921 + }, + { + "epoch": 2.4410220511025553, + "grad_norm": 1.1135176420211792, + "learning_rate": 0.00012049941425634578, + "loss": 1.8472, + "step": 20922 + }, + { + "epoch": 2.4411387236028466, + "grad_norm": 1.1896954774856567, + "learning_rate": 0.00012048462758611253, + "loss": 1.9651, + "step": 20923 + }, + { + "epoch": 2.4412553961031387, + "grad_norm": 1.3844571113586426, + "learning_rate": 0.00012046984123741241, + "loss": 2.0408, + "step": 20924 + }, + { + "epoch": 2.44137206860343, + "grad_norm": 1.0000189542770386, + "learning_rate": 0.0001204550552103986, + "loss": 1.7983, + "step": 20925 + }, + { + "epoch": 2.441488741103722, + "grad_norm": 1.3563697338104248, + "learning_rate": 0.00012044026950522449, + "loss": 2.0418, + "step": 20926 + }, + { + "epoch": 2.4416054136040133, + "grad_norm": 1.181438684463501, + "learning_rate": 0.00012042548412204322, + "loss": 1.9567, + "step": 20927 + }, + { + "epoch": 2.4417220861043054, + "grad_norm": 1.0970934629440308, + "learning_rate": 0.00012041069906100823, + "loss": 2.0176, + "step": 20928 + }, + { + "epoch": 2.4418387586045966, + "grad_norm": 1.2060277462005615, + "learning_rate": 0.00012039591432227267, + "loss": 2.1348, + "step": 20929 + }, + { + "epoch": 2.4419554311048888, + "grad_norm": 1.1897006034851074, + "learning_rate": 0.00012038112990598984, + "loss": 1.992, + "step": 20930 + }, + { + "epoch": 2.44207210360518, + "grad_norm": 1.0334234237670898, + "learning_rate": 0.00012036634581231308, + "loss": 1.9042, + "step": 20931 + }, + { + "epoch": 2.442188776105472, + "grad_norm": 1.1711103916168213, + "learning_rate": 0.00012035156204139558, + "loss": 1.9436, + "step": 20932 + }, + { + "epoch": 2.4423054486057634, + "grad_norm": 1.1459112167358398, + "learning_rate": 0.00012033677859339064, + "loss": 2.1062, + "step": 20933 + }, + { + "epoch": 2.4424221211060555, + "grad_norm": 1.1454675197601318, + "learning_rate": 0.00012032199546845146, + "loss": 1.7904, + "step": 20934 + }, + { + "epoch": 2.4425387936063467, + "grad_norm": 1.2073235511779785, + "learning_rate": 0.00012030721266673136, + "loss": 1.8421, + "step": 20935 + }, + { + "epoch": 2.442655466106639, + "grad_norm": 1.1352895498275757, + "learning_rate": 0.00012029243018838357, + "loss": 2.055, + "step": 20936 + }, + { + "epoch": 2.44277213860693, + "grad_norm": 1.1469905376434326, + "learning_rate": 0.00012027764803356136, + "loss": 2.0805, + "step": 20937 + }, + { + "epoch": 2.4428888111072222, + "grad_norm": 1.1071442365646362, + "learning_rate": 0.00012026286620241792, + "loss": 1.9737, + "step": 20938 + }, + { + "epoch": 2.4430054836075135, + "grad_norm": 1.1831159591674805, + "learning_rate": 0.00012024808469510656, + "loss": 1.9246, + "step": 20939 + }, + { + "epoch": 2.4431221561078056, + "grad_norm": 1.110175371170044, + "learning_rate": 0.00012023330351178045, + "loss": 2.1489, + "step": 20940 + }, + { + "epoch": 2.443238828608097, + "grad_norm": 1.1402561664581299, + "learning_rate": 0.0001202185226525929, + "loss": 2.0221, + "step": 20941 + }, + { + "epoch": 2.443355501108389, + "grad_norm": 1.1401710510253906, + "learning_rate": 0.00012020374211769702, + "loss": 1.8649, + "step": 20942 + }, + { + "epoch": 2.44347217360868, + "grad_norm": 1.0488498210906982, + "learning_rate": 0.00012018896190724614, + "loss": 1.9948, + "step": 20943 + }, + { + "epoch": 2.4435888461089723, + "grad_norm": 1.1498645544052124, + "learning_rate": 0.00012017418202139351, + "loss": 2.0835, + "step": 20944 + }, + { + "epoch": 2.4437055186092635, + "grad_norm": 1.1433758735656738, + "learning_rate": 0.00012015940246029225, + "loss": 2.0708, + "step": 20945 + }, + { + "epoch": 2.4438221911095557, + "grad_norm": 1.2067866325378418, + "learning_rate": 0.00012014462322409567, + "loss": 1.9869, + "step": 20946 + }, + { + "epoch": 2.443938863609847, + "grad_norm": 1.1037153005599976, + "learning_rate": 0.00012012984431295694, + "loss": 1.8733, + "step": 20947 + }, + { + "epoch": 2.444055536110139, + "grad_norm": 1.1739059686660767, + "learning_rate": 0.0001201150657270293, + "loss": 1.9304, + "step": 20948 + }, + { + "epoch": 2.4441722086104303, + "grad_norm": 1.177714228630066, + "learning_rate": 0.0001201002874664659, + "loss": 2.0865, + "step": 20949 + }, + { + "epoch": 2.4442888811107224, + "grad_norm": 1.25453519821167, + "learning_rate": 0.00012008550953142003, + "loss": 2.0755, + "step": 20950 + }, + { + "epoch": 2.4444055536110136, + "grad_norm": 1.0370934009552002, + "learning_rate": 0.00012007073192204476, + "loss": 1.9384, + "step": 20951 + }, + { + "epoch": 2.4445222261113058, + "grad_norm": 1.1456077098846436, + "learning_rate": 0.00012005595463849345, + "loss": 1.9989, + "step": 20952 + }, + { + "epoch": 2.444638898611597, + "grad_norm": 1.0287986993789673, + "learning_rate": 0.00012004117768091914, + "loss": 1.7355, + "step": 20953 + }, + { + "epoch": 2.444755571111889, + "grad_norm": 1.0877504348754883, + "learning_rate": 0.00012002640104947516, + "loss": 1.9196, + "step": 20954 + }, + { + "epoch": 2.4448722436121804, + "grad_norm": 1.188607096672058, + "learning_rate": 0.00012001162474431458, + "loss": 1.8531, + "step": 20955 + }, + { + "epoch": 2.4449889161124725, + "grad_norm": 1.057214617729187, + "learning_rate": 0.00011999684876559064, + "loss": 1.889, + "step": 20956 + }, + { + "epoch": 2.4451055886127637, + "grad_norm": 1.123939871788025, + "learning_rate": 0.00011998207311345658, + "loss": 1.9601, + "step": 20957 + }, + { + "epoch": 2.445222261113056, + "grad_norm": 1.1974267959594727, + "learning_rate": 0.00011996729778806547, + "loss": 1.9434, + "step": 20958 + }, + { + "epoch": 2.445338933613347, + "grad_norm": 1.1154041290283203, + "learning_rate": 0.00011995252278957055, + "loss": 1.9717, + "step": 20959 + }, + { + "epoch": 2.445455606113639, + "grad_norm": 1.1024001836776733, + "learning_rate": 0.00011993774811812492, + "loss": 1.9956, + "step": 20960 + }, + { + "epoch": 2.4455722786139305, + "grad_norm": 1.2463892698287964, + "learning_rate": 0.00011992297377388188, + "loss": 1.8529, + "step": 20961 + }, + { + "epoch": 2.4456889511142226, + "grad_norm": 1.1576615571975708, + "learning_rate": 0.00011990819975699444, + "loss": 1.8535, + "step": 20962 + }, + { + "epoch": 2.445805623614514, + "grad_norm": 1.1975843906402588, + "learning_rate": 0.0001198934260676159, + "loss": 1.9514, + "step": 20963 + }, + { + "epoch": 2.445922296114806, + "grad_norm": 1.1955506801605225, + "learning_rate": 0.00011987865270589933, + "loss": 1.9685, + "step": 20964 + }, + { + "epoch": 2.446038968615097, + "grad_norm": 1.1373910903930664, + "learning_rate": 0.00011986387967199787, + "loss": 1.8653, + "step": 20965 + }, + { + "epoch": 2.4461556411153893, + "grad_norm": 1.3073252439498901, + "learning_rate": 0.00011984910696606473, + "loss": 1.9508, + "step": 20966 + }, + { + "epoch": 2.4462723136156805, + "grad_norm": 1.026758074760437, + "learning_rate": 0.00011983433458825306, + "loss": 1.7992, + "step": 20967 + }, + { + "epoch": 2.4463889861159727, + "grad_norm": 1.08527410030365, + "learning_rate": 0.00011981956253871591, + "loss": 1.805, + "step": 20968 + }, + { + "epoch": 2.446505658616264, + "grad_norm": 1.036699652671814, + "learning_rate": 0.00011980479081760648, + "loss": 1.8122, + "step": 20969 + }, + { + "epoch": 2.446622331116556, + "grad_norm": 1.1801234483718872, + "learning_rate": 0.00011979001942507798, + "loss": 2.0567, + "step": 20970 + }, + { + "epoch": 2.4467390036168473, + "grad_norm": 1.3336737155914307, + "learning_rate": 0.00011977524836128341, + "loss": 2.1592, + "step": 20971 + }, + { + "epoch": 2.4468556761171394, + "grad_norm": 1.2121937274932861, + "learning_rate": 0.00011976047762637604, + "loss": 2.064, + "step": 20972 + }, + { + "epoch": 2.4469723486174306, + "grad_norm": 1.3055033683776855, + "learning_rate": 0.00011974570722050891, + "loss": 2.1203, + "step": 20973 + }, + { + "epoch": 2.4470890211177228, + "grad_norm": 1.1123408079147339, + "learning_rate": 0.00011973093714383516, + "loss": 1.8962, + "step": 20974 + }, + { + "epoch": 2.447205693618014, + "grad_norm": 1.2658791542053223, + "learning_rate": 0.00011971616739650785, + "loss": 2.044, + "step": 20975 + }, + { + "epoch": 2.447322366118306, + "grad_norm": 1.1402015686035156, + "learning_rate": 0.0001197013979786802, + "loss": 1.9418, + "step": 20976 + }, + { + "epoch": 2.4474390386185974, + "grad_norm": 1.0964140892028809, + "learning_rate": 0.00011968662889050524, + "loss": 1.8255, + "step": 20977 + }, + { + "epoch": 2.4475557111188895, + "grad_norm": 1.2127858400344849, + "learning_rate": 0.00011967186013213616, + "loss": 1.9764, + "step": 20978 + }, + { + "epoch": 2.4476723836191807, + "grad_norm": 1.0223604440689087, + "learning_rate": 0.00011965709170372596, + "loss": 1.9544, + "step": 20979 + }, + { + "epoch": 2.447789056119473, + "grad_norm": 1.060196042060852, + "learning_rate": 0.00011964232360542787, + "loss": 1.9336, + "step": 20980 + }, + { + "epoch": 2.447905728619764, + "grad_norm": 1.1331884860992432, + "learning_rate": 0.00011962755583739487, + "loss": 1.9263, + "step": 20981 + }, + { + "epoch": 2.448022401120056, + "grad_norm": 1.0632001161575317, + "learning_rate": 0.00011961278839978013, + "loss": 2.051, + "step": 20982 + }, + { + "epoch": 2.4481390736203474, + "grad_norm": 1.2472296953201294, + "learning_rate": 0.00011959802129273675, + "loss": 2.0355, + "step": 20983 + }, + { + "epoch": 2.4482557461206396, + "grad_norm": 1.179711937904358, + "learning_rate": 0.00011958325451641773, + "loss": 1.9102, + "step": 20984 + }, + { + "epoch": 2.448372418620931, + "grad_norm": 1.0407627820968628, + "learning_rate": 0.00011956848807097628, + "loss": 2.0452, + "step": 20985 + }, + { + "epoch": 2.448489091121223, + "grad_norm": 1.1730942726135254, + "learning_rate": 0.00011955372195656534, + "loss": 2.0922, + "step": 20986 + }, + { + "epoch": 2.448605763621514, + "grad_norm": 1.0326036214828491, + "learning_rate": 0.00011953895617333817, + "loss": 2.0949, + "step": 20987 + }, + { + "epoch": 2.4487224361218063, + "grad_norm": 1.2140040397644043, + "learning_rate": 0.00011952419072144766, + "loss": 1.8782, + "step": 20988 + }, + { + "epoch": 2.4488391086220975, + "grad_norm": 1.151049256324768, + "learning_rate": 0.00011950942560104704, + "loss": 1.9485, + "step": 20989 + }, + { + "epoch": 2.4489557811223897, + "grad_norm": 1.1845608949661255, + "learning_rate": 0.00011949466081228924, + "loss": 1.974, + "step": 20990 + }, + { + "epoch": 2.449072453622681, + "grad_norm": 1.154753565788269, + "learning_rate": 0.00011947989635532739, + "loss": 1.9134, + "step": 20991 + }, + { + "epoch": 2.449189126122973, + "grad_norm": 1.2062060832977295, + "learning_rate": 0.00011946513223031457, + "loss": 1.9194, + "step": 20992 + }, + { + "epoch": 2.4493057986232643, + "grad_norm": 1.1021316051483154, + "learning_rate": 0.00011945036843740384, + "loss": 1.7659, + "step": 20993 + }, + { + "epoch": 2.4494224711235564, + "grad_norm": 1.32808256149292, + "learning_rate": 0.00011943560497674815, + "loss": 1.9681, + "step": 20994 + }, + { + "epoch": 2.4495391436238476, + "grad_norm": 1.3536031246185303, + "learning_rate": 0.00011942084184850073, + "loss": 1.9592, + "step": 20995 + }, + { + "epoch": 2.4496558161241397, + "grad_norm": 1.1116677522659302, + "learning_rate": 0.00011940607905281447, + "loss": 1.9625, + "step": 20996 + }, + { + "epoch": 2.449772488624431, + "grad_norm": 1.1977595090866089, + "learning_rate": 0.00011939131658984247, + "loss": 1.9673, + "step": 20997 + }, + { + "epoch": 2.449889161124723, + "grad_norm": 1.0217887163162231, + "learning_rate": 0.00011937655445973781, + "loss": 2.0876, + "step": 20998 + }, + { + "epoch": 2.4500058336250143, + "grad_norm": 1.170008897781372, + "learning_rate": 0.00011936179266265349, + "loss": 1.9703, + "step": 20999 + }, + { + "epoch": 2.4501225061253065, + "grad_norm": 1.1184138059616089, + "learning_rate": 0.00011934703119874256, + "loss": 1.9976, + "step": 21000 + }, + { + "epoch": 2.4502391786255977, + "grad_norm": 1.2406394481658936, + "learning_rate": 0.00011933227006815798, + "loss": 2.1691, + "step": 21001 + }, + { + "epoch": 2.45035585112589, + "grad_norm": 1.192674160003662, + "learning_rate": 0.00011931750927105291, + "loss": 1.9966, + "step": 21002 + }, + { + "epoch": 2.450472523626181, + "grad_norm": 1.134774923324585, + "learning_rate": 0.00011930274880758022, + "loss": 2.1238, + "step": 21003 + }, + { + "epoch": 2.450589196126473, + "grad_norm": 1.2811721563339233, + "learning_rate": 0.00011928798867789309, + "loss": 1.6215, + "step": 21004 + }, + { + "epoch": 2.4507058686267644, + "grad_norm": 1.0179623365402222, + "learning_rate": 0.00011927322888214438, + "loss": 1.9424, + "step": 21005 + }, + { + "epoch": 2.4508225411270566, + "grad_norm": 1.2080508470535278, + "learning_rate": 0.00011925846942048727, + "loss": 1.8899, + "step": 21006 + }, + { + "epoch": 2.450939213627348, + "grad_norm": 1.0808664560317993, + "learning_rate": 0.00011924371029307461, + "loss": 2.0256, + "step": 21007 + }, + { + "epoch": 2.45105588612764, + "grad_norm": 1.1316123008728027, + "learning_rate": 0.00011922895150005949, + "loss": 1.9216, + "step": 21008 + }, + { + "epoch": 2.451172558627931, + "grad_norm": 1.3156020641326904, + "learning_rate": 0.0001192141930415949, + "loss": 2.084, + "step": 21009 + }, + { + "epoch": 2.4512892311282233, + "grad_norm": 1.2360947132110596, + "learning_rate": 0.0001191994349178338, + "loss": 2.0618, + "step": 21010 + }, + { + "epoch": 2.4514059036285145, + "grad_norm": 1.089657187461853, + "learning_rate": 0.00011918467712892929, + "loss": 2.0067, + "step": 21011 + }, + { + "epoch": 2.4515225761288066, + "grad_norm": 1.2350832223892212, + "learning_rate": 0.00011916991967503424, + "loss": 2.0105, + "step": 21012 + }, + { + "epoch": 2.451639248629098, + "grad_norm": 1.1655383110046387, + "learning_rate": 0.00011915516255630175, + "loss": 1.8464, + "step": 21013 + }, + { + "epoch": 2.45175592112939, + "grad_norm": 1.20113205909729, + "learning_rate": 0.00011914040577288468, + "loss": 1.9547, + "step": 21014 + }, + { + "epoch": 2.4518725936296812, + "grad_norm": 0.9812204241752625, + "learning_rate": 0.00011912564932493614, + "loss": 1.9597, + "step": 21015 + }, + { + "epoch": 2.4519892661299734, + "grad_norm": 1.0963553190231323, + "learning_rate": 0.00011911089321260904, + "loss": 1.8942, + "step": 21016 + }, + { + "epoch": 2.4521059386302646, + "grad_norm": 1.0971816778182983, + "learning_rate": 0.00011909613743605639, + "loss": 2.0199, + "step": 21017 + }, + { + "epoch": 2.4522226111305567, + "grad_norm": 1.156996488571167, + "learning_rate": 0.00011908138199543108, + "loss": 2.1409, + "step": 21018 + }, + { + "epoch": 2.452339283630848, + "grad_norm": 1.385076642036438, + "learning_rate": 0.0001190666268908862, + "loss": 2.1939, + "step": 21019 + }, + { + "epoch": 2.45245595613114, + "grad_norm": 1.2281264066696167, + "learning_rate": 0.00011905187212257457, + "loss": 1.8183, + "step": 21020 + }, + { + "epoch": 2.4525726286314313, + "grad_norm": 1.0416302680969238, + "learning_rate": 0.00011903711769064929, + "loss": 1.8137, + "step": 21021 + }, + { + "epoch": 2.4526893011317235, + "grad_norm": 1.1770685911178589, + "learning_rate": 0.00011902236359526322, + "loss": 1.9894, + "step": 21022 + }, + { + "epoch": 2.4528059736320147, + "grad_norm": 1.0488195419311523, + "learning_rate": 0.00011900760983656937, + "loss": 1.9916, + "step": 21023 + }, + { + "epoch": 2.452922646132307, + "grad_norm": 1.0996580123901367, + "learning_rate": 0.00011899285641472068, + "loss": 1.9164, + "step": 21024 + }, + { + "epoch": 2.453039318632598, + "grad_norm": 1.1036407947540283, + "learning_rate": 0.00011897810332987008, + "loss": 1.9427, + "step": 21025 + }, + { + "epoch": 2.45315599113289, + "grad_norm": 1.2002406120300293, + "learning_rate": 0.00011896335058217056, + "loss": 1.9038, + "step": 21026 + }, + { + "epoch": 2.4532726636331814, + "grad_norm": 1.2137739658355713, + "learning_rate": 0.00011894859817177496, + "loss": 1.9691, + "step": 21027 + }, + { + "epoch": 2.4533893361334735, + "grad_norm": 1.2701183557510376, + "learning_rate": 0.00011893384609883635, + "loss": 1.9301, + "step": 21028 + }, + { + "epoch": 2.453506008633765, + "grad_norm": 1.2514758110046387, + "learning_rate": 0.00011891909436350752, + "loss": 2.048, + "step": 21029 + }, + { + "epoch": 2.453622681134057, + "grad_norm": 1.137855052947998, + "learning_rate": 0.00011890434296594156, + "loss": 2.0021, + "step": 21030 + }, + { + "epoch": 2.453739353634348, + "grad_norm": 1.089505672454834, + "learning_rate": 0.00011888959190629123, + "loss": 1.9404, + "step": 21031 + }, + { + "epoch": 2.4538560261346403, + "grad_norm": 1.2168149948120117, + "learning_rate": 0.0001188748411847096, + "loss": 1.9315, + "step": 21032 + }, + { + "epoch": 2.4539726986349315, + "grad_norm": 1.1592934131622314, + "learning_rate": 0.00011886009080134948, + "loss": 1.9171, + "step": 21033 + }, + { + "epoch": 2.4540893711352236, + "grad_norm": 1.085235357284546, + "learning_rate": 0.00011884534075636383, + "loss": 1.8657, + "step": 21034 + }, + { + "epoch": 2.454206043635515, + "grad_norm": 1.3061914443969727, + "learning_rate": 0.00011883059104990556, + "loss": 1.9577, + "step": 21035 + }, + { + "epoch": 2.454322716135807, + "grad_norm": 1.0966322422027588, + "learning_rate": 0.00011881584168212755, + "loss": 1.999, + "step": 21036 + }, + { + "epoch": 2.4544393886360982, + "grad_norm": 1.2922192811965942, + "learning_rate": 0.00011880109265318279, + "loss": 2.0836, + "step": 21037 + }, + { + "epoch": 2.4545560611363904, + "grad_norm": 1.1166753768920898, + "learning_rate": 0.00011878634396322408, + "loss": 1.7948, + "step": 21038 + }, + { + "epoch": 2.4546727336366816, + "grad_norm": 1.145400881767273, + "learning_rate": 0.00011877159561240444, + "loss": 2.0394, + "step": 21039 + }, + { + "epoch": 2.4547894061369737, + "grad_norm": 1.241930365562439, + "learning_rate": 0.00011875684760087657, + "loss": 2.0521, + "step": 21040 + }, + { + "epoch": 2.454906078637265, + "grad_norm": 1.1526412963867188, + "learning_rate": 0.00011874209992879359, + "loss": 1.8279, + "step": 21041 + }, + { + "epoch": 2.455022751137557, + "grad_norm": 1.2311540842056274, + "learning_rate": 0.00011872735259630823, + "loss": 1.9244, + "step": 21042 + }, + { + "epoch": 2.4551394236378483, + "grad_norm": 1.1731730699539185, + "learning_rate": 0.00011871260560357344, + "loss": 1.905, + "step": 21043 + }, + { + "epoch": 2.4552560961381404, + "grad_norm": 1.1930345296859741, + "learning_rate": 0.00011869785895074204, + "loss": 1.9893, + "step": 21044 + }, + { + "epoch": 2.4553727686384317, + "grad_norm": 1.1960877180099487, + "learning_rate": 0.000118683112637967, + "loss": 1.9948, + "step": 21045 + }, + { + "epoch": 2.455489441138724, + "grad_norm": 1.2127771377563477, + "learning_rate": 0.00011866836666540107, + "loss": 2.0832, + "step": 21046 + }, + { + "epoch": 2.455606113639015, + "grad_norm": 1.1636816263198853, + "learning_rate": 0.00011865362103319727, + "loss": 2.1776, + "step": 21047 + }, + { + "epoch": 2.455722786139307, + "grad_norm": 1.1923234462738037, + "learning_rate": 0.00011863887574150831, + "loss": 2.1415, + "step": 21048 + }, + { + "epoch": 2.4558394586395984, + "grad_norm": 1.0100866556167603, + "learning_rate": 0.00011862413079048716, + "loss": 1.9559, + "step": 21049 + }, + { + "epoch": 2.4559561311398905, + "grad_norm": 1.3377578258514404, + "learning_rate": 0.00011860938618028669, + "loss": 1.9527, + "step": 21050 + }, + { + "epoch": 2.4560728036401818, + "grad_norm": 1.0379811525344849, + "learning_rate": 0.00011859464191105966, + "loss": 1.7282, + "step": 21051 + }, + { + "epoch": 2.456189476140474, + "grad_norm": 1.4359917640686035, + "learning_rate": 0.00011857989798295903, + "loss": 1.9463, + "step": 21052 + }, + { + "epoch": 2.456306148640765, + "grad_norm": 1.038433313369751, + "learning_rate": 0.00011856515439613756, + "loss": 1.8409, + "step": 21053 + }, + { + "epoch": 2.4564228211410573, + "grad_norm": 1.118160367012024, + "learning_rate": 0.00011855041115074817, + "loss": 1.86, + "step": 21054 + }, + { + "epoch": 2.4565394936413485, + "grad_norm": 1.0978248119354248, + "learning_rate": 0.00011853566824694361, + "loss": 1.942, + "step": 21055 + }, + { + "epoch": 2.4566561661416406, + "grad_norm": 1.3228585720062256, + "learning_rate": 0.00011852092568487681, + "loss": 2.0972, + "step": 21056 + }, + { + "epoch": 2.456772838641932, + "grad_norm": 1.2174158096313477, + "learning_rate": 0.00011850618346470054, + "loss": 1.9269, + "step": 21057 + }, + { + "epoch": 2.456889511142224, + "grad_norm": 1.1235988140106201, + "learning_rate": 0.0001184914415865677, + "loss": 1.954, + "step": 21058 + }, + { + "epoch": 2.457006183642515, + "grad_norm": 1.112189769744873, + "learning_rate": 0.00011847670005063104, + "loss": 1.8888, + "step": 21059 + }, + { + "epoch": 2.4571228561428073, + "grad_norm": 1.1792011260986328, + "learning_rate": 0.00011846195885704342, + "loss": 1.9091, + "step": 21060 + }, + { + "epoch": 2.4572395286430986, + "grad_norm": 1.1734025478363037, + "learning_rate": 0.00011844721800595765, + "loss": 1.838, + "step": 21061 + }, + { + "epoch": 2.4573562011433907, + "grad_norm": 1.0641437768936157, + "learning_rate": 0.00011843247749752655, + "loss": 1.8854, + "step": 21062 + }, + { + "epoch": 2.457472873643682, + "grad_norm": 1.1631203889846802, + "learning_rate": 0.00011841773733190297, + "loss": 1.8327, + "step": 21063 + }, + { + "epoch": 2.457589546143974, + "grad_norm": 1.16288161277771, + "learning_rate": 0.00011840299750923966, + "loss": 1.9383, + "step": 21064 + }, + { + "epoch": 2.4577062186442653, + "grad_norm": 1.0477263927459717, + "learning_rate": 0.00011838825802968951, + "loss": 1.8779, + "step": 21065 + }, + { + "epoch": 2.4578228911445574, + "grad_norm": 1.2175871133804321, + "learning_rate": 0.00011837351889340519, + "loss": 2.0645, + "step": 21066 + }, + { + "epoch": 2.4579395636448487, + "grad_norm": 1.327744722366333, + "learning_rate": 0.00011835878010053965, + "loss": 2.0963, + "step": 21067 + }, + { + "epoch": 2.458056236145141, + "grad_norm": 1.108938217163086, + "learning_rate": 0.00011834404165124558, + "loss": 1.9438, + "step": 21068 + }, + { + "epoch": 2.458172908645432, + "grad_norm": 1.1111513376235962, + "learning_rate": 0.00011832930354567583, + "loss": 2.0128, + "step": 21069 + }, + { + "epoch": 2.458289581145724, + "grad_norm": 1.2478322982788086, + "learning_rate": 0.0001183145657839831, + "loss": 2.0487, + "step": 21070 + }, + { + "epoch": 2.4584062536460154, + "grad_norm": 1.1106637716293335, + "learning_rate": 0.00011829982836632032, + "loss": 1.9532, + "step": 21071 + }, + { + "epoch": 2.4585229261463075, + "grad_norm": 1.0746335983276367, + "learning_rate": 0.0001182850912928401, + "loss": 1.9068, + "step": 21072 + }, + { + "epoch": 2.4586395986465988, + "grad_norm": 1.18219792842865, + "learning_rate": 0.00011827035456369538, + "loss": 2.0705, + "step": 21073 + }, + { + "epoch": 2.458756271146891, + "grad_norm": 1.0397354364395142, + "learning_rate": 0.0001182556181790388, + "loss": 1.8665, + "step": 21074 + }, + { + "epoch": 2.458872943647182, + "grad_norm": 1.0665684938430786, + "learning_rate": 0.00011824088213902323, + "loss": 1.9943, + "step": 21075 + }, + { + "epoch": 2.4589896161474742, + "grad_norm": 1.0936392545700073, + "learning_rate": 0.00011822614644380138, + "loss": 2.0329, + "step": 21076 + }, + { + "epoch": 2.4591062886477655, + "grad_norm": 1.273567795753479, + "learning_rate": 0.00011821141109352604, + "loss": 2.0011, + "step": 21077 + }, + { + "epoch": 2.4592229611480576, + "grad_norm": 1.135254144668579, + "learning_rate": 0.00011819667608835, + "loss": 1.9376, + "step": 21078 + }, + { + "epoch": 2.459339633648349, + "grad_norm": 1.2216397523880005, + "learning_rate": 0.00011818194142842588, + "loss": 1.9886, + "step": 21079 + }, + { + "epoch": 2.459456306148641, + "grad_norm": 1.1553922891616821, + "learning_rate": 0.00011816720711390661, + "loss": 2.0511, + "step": 21080 + }, + { + "epoch": 2.459572978648932, + "grad_norm": 1.0604884624481201, + "learning_rate": 0.00011815247314494482, + "loss": 2.0149, + "step": 21081 + }, + { + "epoch": 2.4596896511492243, + "grad_norm": 1.0971375703811646, + "learning_rate": 0.00011813773952169335, + "loss": 1.8723, + "step": 21082 + }, + { + "epoch": 2.4598063236495156, + "grad_norm": 1.4216634035110474, + "learning_rate": 0.00011812300624430482, + "loss": 2.1414, + "step": 21083 + }, + { + "epoch": 2.4599229961498077, + "grad_norm": 1.184592843055725, + "learning_rate": 0.00011810827331293209, + "loss": 1.9903, + "step": 21084 + }, + { + "epoch": 2.460039668650099, + "grad_norm": 1.2589725255966187, + "learning_rate": 0.00011809354072772781, + "loss": 2.0926, + "step": 21085 + }, + { + "epoch": 2.460156341150391, + "grad_norm": 1.2685290575027466, + "learning_rate": 0.00011807880848884474, + "loss": 1.975, + "step": 21086 + }, + { + "epoch": 2.4602730136506823, + "grad_norm": 1.1002880334854126, + "learning_rate": 0.00011806407659643561, + "loss": 1.9132, + "step": 21087 + }, + { + "epoch": 2.4603896861509744, + "grad_norm": 1.159924864768982, + "learning_rate": 0.00011804934505065311, + "loss": 2.0477, + "step": 21088 + }, + { + "epoch": 2.4605063586512657, + "grad_norm": 1.0206890106201172, + "learning_rate": 0.00011803461385165005, + "loss": 1.7445, + "step": 21089 + }, + { + "epoch": 2.460623031151558, + "grad_norm": 1.0935040712356567, + "learning_rate": 0.00011801988299957905, + "loss": 2.1072, + "step": 21090 + }, + { + "epoch": 2.460739703651849, + "grad_norm": 1.206936240196228, + "learning_rate": 0.0001180051524945929, + "loss": 2.037, + "step": 21091 + }, + { + "epoch": 2.460856376152141, + "grad_norm": 1.202391505241394, + "learning_rate": 0.00011799042233684425, + "loss": 1.8981, + "step": 21092 + }, + { + "epoch": 2.4609730486524324, + "grad_norm": 1.241195797920227, + "learning_rate": 0.00011797569252648583, + "loss": 1.9989, + "step": 21093 + }, + { + "epoch": 2.4610897211527245, + "grad_norm": 1.0844006538391113, + "learning_rate": 0.00011796096306367033, + "loss": 2.0432, + "step": 21094 + }, + { + "epoch": 2.4612063936530157, + "grad_norm": 1.1155245304107666, + "learning_rate": 0.0001179462339485505, + "loss": 1.9537, + "step": 21095 + }, + { + "epoch": 2.461323066153308, + "grad_norm": 1.0523616075515747, + "learning_rate": 0.00011793150518127892, + "loss": 1.9171, + "step": 21096 + }, + { + "epoch": 2.461439738653599, + "grad_norm": 1.1373041868209839, + "learning_rate": 0.00011791677676200843, + "loss": 2.0892, + "step": 21097 + }, + { + "epoch": 2.4615564111538912, + "grad_norm": 1.278228998184204, + "learning_rate": 0.00011790204869089158, + "loss": 1.9447, + "step": 21098 + }, + { + "epoch": 2.4616730836541825, + "grad_norm": 1.1658594608306885, + "learning_rate": 0.0001178873209680812, + "loss": 2.0601, + "step": 21099 + }, + { + "epoch": 2.4617897561544746, + "grad_norm": 1.118485927581787, + "learning_rate": 0.00011787259359372979, + "loss": 2.0024, + "step": 21100 + }, + { + "epoch": 2.461906428654766, + "grad_norm": 1.0701961517333984, + "learning_rate": 0.00011785786656799018, + "loss": 2.0267, + "step": 21101 + }, + { + "epoch": 2.462023101155058, + "grad_norm": 1.1185967922210693, + "learning_rate": 0.00011784313989101502, + "loss": 1.9643, + "step": 21102 + }, + { + "epoch": 2.462139773655349, + "grad_norm": 1.1210949420928955, + "learning_rate": 0.00011782841356295688, + "loss": 1.9795, + "step": 21103 + }, + { + "epoch": 2.4622564461556413, + "grad_norm": 1.0719352960586548, + "learning_rate": 0.00011781368758396857, + "loss": 1.8908, + "step": 21104 + }, + { + "epoch": 2.4623731186559326, + "grad_norm": 1.0870532989501953, + "learning_rate": 0.0001177989619542026, + "loss": 1.8035, + "step": 21105 + }, + { + "epoch": 2.4624897911562247, + "grad_norm": 1.2210569381713867, + "learning_rate": 0.00011778423667381178, + "loss": 1.9075, + "step": 21106 + }, + { + "epoch": 2.462606463656516, + "grad_norm": 1.3962961435317993, + "learning_rate": 0.00011776951174294863, + "loss": 1.8375, + "step": 21107 + }, + { + "epoch": 2.462723136156808, + "grad_norm": 1.088129997253418, + "learning_rate": 0.00011775478716176593, + "loss": 1.9329, + "step": 21108 + }, + { + "epoch": 2.4628398086570993, + "grad_norm": 1.1914972066879272, + "learning_rate": 0.00011774006293041618, + "loss": 1.8156, + "step": 21109 + }, + { + "epoch": 2.4629564811573914, + "grad_norm": 1.2149629592895508, + "learning_rate": 0.00011772533904905218, + "loss": 1.9208, + "step": 21110 + }, + { + "epoch": 2.4630731536576826, + "grad_norm": 1.1011537313461304, + "learning_rate": 0.00011771061551782647, + "loss": 1.8551, + "step": 21111 + }, + { + "epoch": 2.4631898261579748, + "grad_norm": 1.2323333024978638, + "learning_rate": 0.00011769589233689176, + "loss": 1.9712, + "step": 21112 + }, + { + "epoch": 2.463306498658266, + "grad_norm": 1.1867015361785889, + "learning_rate": 0.00011768116950640056, + "loss": 2.059, + "step": 21113 + }, + { + "epoch": 2.463423171158558, + "grad_norm": 1.037305474281311, + "learning_rate": 0.00011766644702650557, + "loss": 1.8592, + "step": 21114 + }, + { + "epoch": 2.4635398436588494, + "grad_norm": 1.0958188772201538, + "learning_rate": 0.00011765172489735952, + "loss": 2.0638, + "step": 21115 + }, + { + "epoch": 2.4636565161591415, + "grad_norm": 1.3096539974212646, + "learning_rate": 0.00011763700311911484, + "loss": 2.0766, + "step": 21116 + }, + { + "epoch": 2.4637731886594327, + "grad_norm": 1.0578871965408325, + "learning_rate": 0.00011762228169192436, + "loss": 1.9897, + "step": 21117 + }, + { + "epoch": 2.463889861159725, + "grad_norm": 1.1058236360549927, + "learning_rate": 0.00011760756061594051, + "loss": 1.9173, + "step": 21118 + }, + { + "epoch": 2.464006533660016, + "grad_norm": 1.1292496919631958, + "learning_rate": 0.00011759283989131598, + "loss": 2.0737, + "step": 21119 + }, + { + "epoch": 2.464123206160308, + "grad_norm": 1.1993584632873535, + "learning_rate": 0.00011757811951820336, + "loss": 2.0435, + "step": 21120 + }, + { + "epoch": 2.4642398786605995, + "grad_norm": 1.1977052688598633, + "learning_rate": 0.00011756339949675535, + "loss": 1.9205, + "step": 21121 + }, + { + "epoch": 2.4643565511608916, + "grad_norm": 1.140566349029541, + "learning_rate": 0.00011754867982712434, + "loss": 2.0786, + "step": 21122 + }, + { + "epoch": 2.464473223661183, + "grad_norm": 1.412730097770691, + "learning_rate": 0.00011753396050946317, + "loss": 2.1286, + "step": 21123 + }, + { + "epoch": 2.464589896161475, + "grad_norm": 1.0711426734924316, + "learning_rate": 0.00011751924154392424, + "loss": 1.8598, + "step": 21124 + }, + { + "epoch": 2.464706568661766, + "grad_norm": 1.1273400783538818, + "learning_rate": 0.00011750452293066027, + "loss": 1.902, + "step": 21125 + }, + { + "epoch": 2.4648232411620583, + "grad_norm": 1.1297425031661987, + "learning_rate": 0.00011748980466982376, + "loss": 2.0828, + "step": 21126 + }, + { + "epoch": 2.4649399136623495, + "grad_norm": 1.17697012424469, + "learning_rate": 0.00011747508676156734, + "loss": 2.0099, + "step": 21127 + }, + { + "epoch": 2.4650565861626417, + "grad_norm": 1.0765535831451416, + "learning_rate": 0.0001174603692060436, + "loss": 1.8894, + "step": 21128 + }, + { + "epoch": 2.465173258662933, + "grad_norm": 1.0418773889541626, + "learning_rate": 0.00011744565200340503, + "loss": 2.0142, + "step": 21129 + }, + { + "epoch": 2.465289931163225, + "grad_norm": 1.1341286897659302, + "learning_rate": 0.00011743093515380433, + "loss": 2.0704, + "step": 21130 + }, + { + "epoch": 2.4654066036635163, + "grad_norm": 1.163978934288025, + "learning_rate": 0.00011741621865739396, + "loss": 1.9536, + "step": 21131 + }, + { + "epoch": 2.4655232761638084, + "grad_norm": 1.0896278619766235, + "learning_rate": 0.00011740150251432659, + "loss": 1.9574, + "step": 21132 + }, + { + "epoch": 2.4656399486640996, + "grad_norm": 1.1076774597167969, + "learning_rate": 0.00011738678672475461, + "loss": 2.0992, + "step": 21133 + }, + { + "epoch": 2.4657566211643918, + "grad_norm": 1.1279107332229614, + "learning_rate": 0.00011737207128883081, + "loss": 1.947, + "step": 21134 + }, + { + "epoch": 2.465873293664683, + "grad_norm": 1.2220537662506104, + "learning_rate": 0.00011735735620670752, + "loss": 1.9845, + "step": 21135 + }, + { + "epoch": 2.465989966164975, + "grad_norm": 1.1798900365829468, + "learning_rate": 0.00011734264147853742, + "loss": 2.0558, + "step": 21136 + }, + { + "epoch": 2.4661066386652664, + "grad_norm": 1.1959986686706543, + "learning_rate": 0.00011732792710447305, + "loss": 1.8953, + "step": 21137 + }, + { + "epoch": 2.4662233111655585, + "grad_norm": 0.9963653087615967, + "learning_rate": 0.00011731321308466693, + "loss": 1.9653, + "step": 21138 + }, + { + "epoch": 2.4663399836658497, + "grad_norm": 1.392751693725586, + "learning_rate": 0.00011729849941927154, + "loss": 1.8122, + "step": 21139 + }, + { + "epoch": 2.466456656166142, + "grad_norm": 1.3060013055801392, + "learning_rate": 0.00011728378610843945, + "loss": 1.9489, + "step": 21140 + }, + { + "epoch": 2.466573328666433, + "grad_norm": 0.9452179074287415, + "learning_rate": 0.00011726907315232329, + "loss": 1.7825, + "step": 21141 + }, + { + "epoch": 2.466690001166725, + "grad_norm": 1.1141074895858765, + "learning_rate": 0.00011725436055107543, + "loss": 1.9679, + "step": 21142 + }, + { + "epoch": 2.4668066736670164, + "grad_norm": 1.150901436805725, + "learning_rate": 0.00011723964830484856, + "loss": 1.7668, + "step": 21143 + }, + { + "epoch": 2.4669233461673086, + "grad_norm": 1.0560880899429321, + "learning_rate": 0.00011722493641379507, + "loss": 2.0186, + "step": 21144 + }, + { + "epoch": 2.4670400186676, + "grad_norm": 1.2754570245742798, + "learning_rate": 0.0001172102248780675, + "loss": 2.1806, + "step": 21145 + }, + { + "epoch": 2.467156691167892, + "grad_norm": 1.1848868131637573, + "learning_rate": 0.0001171955136978184, + "loss": 1.9626, + "step": 21146 + }, + { + "epoch": 2.467273363668183, + "grad_norm": 0.9962129592895508, + "learning_rate": 0.0001171808028732003, + "loss": 1.7435, + "step": 21147 + }, + { + "epoch": 2.4673900361684753, + "grad_norm": 1.1005860567092896, + "learning_rate": 0.0001171660924043656, + "loss": 1.8573, + "step": 21148 + }, + { + "epoch": 2.4675067086687665, + "grad_norm": 1.1567463874816895, + "learning_rate": 0.00011715138229146693, + "loss": 1.9238, + "step": 21149 + }, + { + "epoch": 2.4676233811690587, + "grad_norm": 1.2652822732925415, + "learning_rate": 0.00011713667253465665, + "loss": 1.9434, + "step": 21150 + }, + { + "epoch": 2.46774005366935, + "grad_norm": 1.0828441381454468, + "learning_rate": 0.00011712196313408741, + "loss": 1.9044, + "step": 21151 + }, + { + "epoch": 2.467856726169642, + "grad_norm": 1.1161004304885864, + "learning_rate": 0.00011710725408991156, + "loss": 1.8604, + "step": 21152 + }, + { + "epoch": 2.4679733986699333, + "grad_norm": 1.2430405616760254, + "learning_rate": 0.00011709254540228167, + "loss": 1.8598, + "step": 21153 + }, + { + "epoch": 2.4680900711702254, + "grad_norm": 1.046890377998352, + "learning_rate": 0.00011707783707135023, + "loss": 1.9225, + "step": 21154 + }, + { + "epoch": 2.4682067436705166, + "grad_norm": 1.3931342363357544, + "learning_rate": 0.00011706312909726965, + "loss": 2.0953, + "step": 21155 + }, + { + "epoch": 2.4683234161708087, + "grad_norm": 1.3068687915802002, + "learning_rate": 0.00011704842148019251, + "loss": 1.9989, + "step": 21156 + }, + { + "epoch": 2.4684400886711, + "grad_norm": 1.0870370864868164, + "learning_rate": 0.00011703371422027113, + "loss": 1.9773, + "step": 21157 + }, + { + "epoch": 2.468556761171392, + "grad_norm": 1.192507266998291, + "learning_rate": 0.00011701900731765817, + "loss": 1.8775, + "step": 21158 + }, + { + "epoch": 2.4686734336716833, + "grad_norm": 1.0677309036254883, + "learning_rate": 0.00011700430077250592, + "loss": 1.9012, + "step": 21159 + }, + { + "epoch": 2.4687901061719755, + "grad_norm": 1.0775644779205322, + "learning_rate": 0.00011698959458496697, + "loss": 1.9671, + "step": 21160 + }, + { + "epoch": 2.4689067786722667, + "grad_norm": 1.1999400854110718, + "learning_rate": 0.00011697488875519365, + "loss": 2.0257, + "step": 21161 + }, + { + "epoch": 2.469023451172559, + "grad_norm": 1.1055655479431152, + "learning_rate": 0.00011696018328333856, + "loss": 1.9149, + "step": 21162 + }, + { + "epoch": 2.46914012367285, + "grad_norm": 1.1822658777236938, + "learning_rate": 0.00011694547816955405, + "loss": 2.1956, + "step": 21163 + }, + { + "epoch": 2.469256796173142, + "grad_norm": 1.2888232469558716, + "learning_rate": 0.00011693077341399262, + "loss": 2.1162, + "step": 21164 + }, + { + "epoch": 2.4693734686734334, + "grad_norm": 1.2109824419021606, + "learning_rate": 0.00011691606901680662, + "loss": 1.8201, + "step": 21165 + }, + { + "epoch": 2.4694901411737256, + "grad_norm": 1.064976692199707, + "learning_rate": 0.00011690136497814855, + "loss": 1.7857, + "step": 21166 + }, + { + "epoch": 2.469606813674017, + "grad_norm": 1.187071681022644, + "learning_rate": 0.00011688666129817093, + "loss": 1.9042, + "step": 21167 + }, + { + "epoch": 2.469723486174309, + "grad_norm": 1.1970314979553223, + "learning_rate": 0.00011687195797702603, + "loss": 1.9882, + "step": 21168 + }, + { + "epoch": 2.4698401586746, + "grad_norm": 1.0945583581924438, + "learning_rate": 0.00011685725501486645, + "loss": 1.913, + "step": 21169 + }, + { + "epoch": 2.4699568311748923, + "grad_norm": 1.1281315088272095, + "learning_rate": 0.00011684255241184448, + "loss": 1.9822, + "step": 21170 + }, + { + "epoch": 2.4700735036751835, + "grad_norm": 0.9800664782524109, + "learning_rate": 0.00011682785016811256, + "loss": 1.8625, + "step": 21171 + }, + { + "epoch": 2.4701901761754756, + "grad_norm": 1.081915259361267, + "learning_rate": 0.00011681314828382315, + "loss": 1.9001, + "step": 21172 + }, + { + "epoch": 2.470306848675767, + "grad_norm": 1.2770943641662598, + "learning_rate": 0.00011679844675912868, + "loss": 2.0398, + "step": 21173 + }, + { + "epoch": 2.470423521176059, + "grad_norm": 1.0258766412734985, + "learning_rate": 0.00011678374559418143, + "loss": 1.7679, + "step": 21174 + }, + { + "epoch": 2.4705401936763502, + "grad_norm": 1.267394781112671, + "learning_rate": 0.00011676904478913398, + "loss": 1.9179, + "step": 21175 + }, + { + "epoch": 2.4706568661766424, + "grad_norm": 1.1799168586730957, + "learning_rate": 0.0001167543443441386, + "loss": 1.9193, + "step": 21176 + }, + { + "epoch": 2.4707735386769336, + "grad_norm": 1.1823281049728394, + "learning_rate": 0.00011673964425934778, + "loss": 1.7238, + "step": 21177 + }, + { + "epoch": 2.4708902111772257, + "grad_norm": 1.1721305847167969, + "learning_rate": 0.00011672494453491381, + "loss": 2.14, + "step": 21178 + }, + { + "epoch": 2.471006883677517, + "grad_norm": 1.031893014907837, + "learning_rate": 0.00011671024517098918, + "loss": 1.8971, + "step": 21179 + }, + { + "epoch": 2.471123556177809, + "grad_norm": 1.2216482162475586, + "learning_rate": 0.00011669554616772629, + "loss": 2.0426, + "step": 21180 + }, + { + "epoch": 2.4712402286781003, + "grad_norm": 1.3884214162826538, + "learning_rate": 0.00011668084752527738, + "loss": 2.0524, + "step": 21181 + }, + { + "epoch": 2.4713569011783925, + "grad_norm": 1.309131145477295, + "learning_rate": 0.00011666614924379499, + "loss": 1.9883, + "step": 21182 + }, + { + "epoch": 2.4714735736786837, + "grad_norm": 1.3612185716629028, + "learning_rate": 0.00011665145132343136, + "loss": 2.1203, + "step": 21183 + }, + { + "epoch": 2.471590246178976, + "grad_norm": 1.3035004138946533, + "learning_rate": 0.000116636753764339, + "loss": 2.0947, + "step": 21184 + }, + { + "epoch": 2.471706918679267, + "grad_norm": 1.322586178779602, + "learning_rate": 0.00011662205656667015, + "loss": 2.0646, + "step": 21185 + }, + { + "epoch": 2.471823591179559, + "grad_norm": 1.0711220502853394, + "learning_rate": 0.00011660735973057727, + "loss": 1.9671, + "step": 21186 + }, + { + "epoch": 2.4719402636798504, + "grad_norm": 1.2022521495819092, + "learning_rate": 0.00011659266325621266, + "loss": 2.0819, + "step": 21187 + }, + { + "epoch": 2.4720569361801426, + "grad_norm": 1.088853120803833, + "learning_rate": 0.0001165779671437287, + "loss": 1.9894, + "step": 21188 + }, + { + "epoch": 2.472173608680434, + "grad_norm": 1.15982186794281, + "learning_rate": 0.00011656327139327774, + "loss": 2.104, + "step": 21189 + }, + { + "epoch": 2.472290281180726, + "grad_norm": 1.1090315580368042, + "learning_rate": 0.00011654857600501217, + "loss": 1.9877, + "step": 21190 + }, + { + "epoch": 2.472406953681017, + "grad_norm": 1.1723233461380005, + "learning_rate": 0.00011653388097908422, + "loss": 2.0307, + "step": 21191 + }, + { + "epoch": 2.4725236261813093, + "grad_norm": 1.2652850151062012, + "learning_rate": 0.00011651918631564632, + "loss": 2.0096, + "step": 21192 + }, + { + "epoch": 2.4726402986816005, + "grad_norm": 1.1445212364196777, + "learning_rate": 0.00011650449201485084, + "loss": 1.8504, + "step": 21193 + }, + { + "epoch": 2.4727569711818926, + "grad_norm": 1.439172625541687, + "learning_rate": 0.00011648979807685001, + "loss": 2.0144, + "step": 21194 + }, + { + "epoch": 2.472873643682184, + "grad_norm": 1.2063804864883423, + "learning_rate": 0.00011647510450179631, + "loss": 1.9834, + "step": 21195 + }, + { + "epoch": 2.472990316182476, + "grad_norm": 1.097812294960022, + "learning_rate": 0.00011646041128984193, + "loss": 1.9264, + "step": 21196 + }, + { + "epoch": 2.4731069886827672, + "grad_norm": 1.0834426879882812, + "learning_rate": 0.00011644571844113926, + "loss": 1.884, + "step": 21197 + }, + { + "epoch": 2.4732236611830594, + "grad_norm": 1.085557460784912, + "learning_rate": 0.00011643102595584054, + "loss": 1.838, + "step": 21198 + }, + { + "epoch": 2.4733403336833506, + "grad_norm": 1.071306586265564, + "learning_rate": 0.00011641633383409821, + "loss": 1.8826, + "step": 21199 + }, + { + "epoch": 2.4734570061836427, + "grad_norm": 1.113125205039978, + "learning_rate": 0.00011640164207606446, + "loss": 1.8798, + "step": 21200 + }, + { + "epoch": 2.473573678683934, + "grad_norm": 1.1059893369674683, + "learning_rate": 0.00011638695068189171, + "loss": 2.1657, + "step": 21201 + }, + { + "epoch": 2.473690351184226, + "grad_norm": 1.2313441038131714, + "learning_rate": 0.00011637225965173214, + "loss": 2.0317, + "step": 21202 + }, + { + "epoch": 2.4738070236845173, + "grad_norm": 1.1672815084457397, + "learning_rate": 0.0001163575689857382, + "loss": 1.9824, + "step": 21203 + }, + { + "epoch": 2.4739236961848095, + "grad_norm": 1.02555251121521, + "learning_rate": 0.00011634287868406203, + "loss": 1.8118, + "step": 21204 + }, + { + "epoch": 2.4740403686851007, + "grad_norm": 1.2100763320922852, + "learning_rate": 0.00011632818874685604, + "loss": 2.045, + "step": 21205 + }, + { + "epoch": 2.474157041185393, + "grad_norm": 1.1223039627075195, + "learning_rate": 0.0001163134991742725, + "loss": 1.9616, + "step": 21206 + }, + { + "epoch": 2.474273713685684, + "grad_norm": 1.2049574851989746, + "learning_rate": 0.00011629880996646361, + "loss": 1.9844, + "step": 21207 + }, + { + "epoch": 2.474390386185976, + "grad_norm": 0.9598748683929443, + "learning_rate": 0.00011628412112358176, + "loss": 1.8475, + "step": 21208 + }, + { + "epoch": 2.4745070586862674, + "grad_norm": 1.1114286184310913, + "learning_rate": 0.00011626943264577911, + "loss": 1.9674, + "step": 21209 + }, + { + "epoch": 2.4746237311865595, + "grad_norm": 1.0325340032577515, + "learning_rate": 0.0001162547445332081, + "loss": 1.8624, + "step": 21210 + }, + { + "epoch": 2.4747404036868508, + "grad_norm": 1.0084413290023804, + "learning_rate": 0.00011624005678602084, + "loss": 1.9274, + "step": 21211 + }, + { + "epoch": 2.474857076187143, + "grad_norm": 1.164093017578125, + "learning_rate": 0.0001162253694043697, + "loss": 2.0521, + "step": 21212 + }, + { + "epoch": 2.474973748687434, + "grad_norm": 1.0612902641296387, + "learning_rate": 0.00011621068238840689, + "loss": 1.8688, + "step": 21213 + }, + { + "epoch": 2.4750904211877263, + "grad_norm": 1.2243151664733887, + "learning_rate": 0.0001161959957382847, + "loss": 2.0286, + "step": 21214 + }, + { + "epoch": 2.4752070936880175, + "grad_norm": 1.3492094278335571, + "learning_rate": 0.00011618130945415534, + "loss": 2.0337, + "step": 21215 + }, + { + "epoch": 2.4753237661883096, + "grad_norm": 1.0121257305145264, + "learning_rate": 0.00011616662353617114, + "loss": 1.9922, + "step": 21216 + }, + { + "epoch": 2.475440438688601, + "grad_norm": 1.1968295574188232, + "learning_rate": 0.00011615193798448423, + "loss": 1.9727, + "step": 21217 + }, + { + "epoch": 2.475557111188893, + "grad_norm": 1.2945969104766846, + "learning_rate": 0.00011613725279924691, + "loss": 1.9631, + "step": 21218 + }, + { + "epoch": 2.4756737836891842, + "grad_norm": 1.2168048620224, + "learning_rate": 0.0001161225679806115, + "loss": 1.9396, + "step": 21219 + }, + { + "epoch": 2.4757904561894764, + "grad_norm": 1.075719952583313, + "learning_rate": 0.0001161078835287301, + "loss": 1.7861, + "step": 21220 + }, + { + "epoch": 2.4759071286897676, + "grad_norm": 1.0747506618499756, + "learning_rate": 0.00011609319944375508, + "loss": 1.8238, + "step": 21221 + }, + { + "epoch": 2.4760238011900597, + "grad_norm": 1.1770732402801514, + "learning_rate": 0.00011607851572583855, + "loss": 1.8661, + "step": 21222 + }, + { + "epoch": 2.476140473690351, + "grad_norm": 1.2161307334899902, + "learning_rate": 0.0001160638323751328, + "loss": 2.0179, + "step": 21223 + }, + { + "epoch": 2.476257146190643, + "grad_norm": 1.2253820896148682, + "learning_rate": 0.00011604914939179, + "loss": 2.1497, + "step": 21224 + }, + { + "epoch": 2.4763738186909343, + "grad_norm": 1.0951144695281982, + "learning_rate": 0.00011603446677596244, + "loss": 1.9026, + "step": 21225 + }, + { + "epoch": 2.4764904911912264, + "grad_norm": 1.083194375038147, + "learning_rate": 0.00011601978452780223, + "loss": 1.9233, + "step": 21226 + }, + { + "epoch": 2.4766071636915177, + "grad_norm": 1.0544100999832153, + "learning_rate": 0.00011600510264746172, + "loss": 1.8554, + "step": 21227 + }, + { + "epoch": 2.47672383619181, + "grad_norm": 1.084769606590271, + "learning_rate": 0.00011599042113509295, + "loss": 1.8476, + "step": 21228 + }, + { + "epoch": 2.476840508692101, + "grad_norm": 1.174884557723999, + "learning_rate": 0.00011597573999084829, + "loss": 1.9402, + "step": 21229 + }, + { + "epoch": 2.476957181192393, + "grad_norm": 1.075057029724121, + "learning_rate": 0.00011596105921487978, + "loss": 2.0068, + "step": 21230 + }, + { + "epoch": 2.4770738536926844, + "grad_norm": 1.228261947631836, + "learning_rate": 0.00011594637880733971, + "loss": 1.9521, + "step": 21231 + }, + { + "epoch": 2.4771905261929765, + "grad_norm": 1.090717077255249, + "learning_rate": 0.00011593169876838032, + "loss": 1.898, + "step": 21232 + }, + { + "epoch": 2.4773071986932678, + "grad_norm": 1.1714218854904175, + "learning_rate": 0.00011591701909815364, + "loss": 2.0203, + "step": 21233 + }, + { + "epoch": 2.47742387119356, + "grad_norm": 1.1650776863098145, + "learning_rate": 0.000115902339796812, + "loss": 1.8095, + "step": 21234 + }, + { + "epoch": 2.477540543693851, + "grad_norm": 1.0830448865890503, + "learning_rate": 0.00011588766086450747, + "loss": 1.9743, + "step": 21235 + }, + { + "epoch": 2.4776572161941433, + "grad_norm": 1.220228910446167, + "learning_rate": 0.0001158729823013923, + "loss": 2.1146, + "step": 21236 + }, + { + "epoch": 2.4777738886944345, + "grad_norm": 1.1784470081329346, + "learning_rate": 0.0001158583041076186, + "loss": 1.9735, + "step": 21237 + }, + { + "epoch": 2.4778905611947266, + "grad_norm": 1.0561774969100952, + "learning_rate": 0.00011584362628333863, + "loss": 1.9218, + "step": 21238 + }, + { + "epoch": 2.478007233695018, + "grad_norm": 1.2104475498199463, + "learning_rate": 0.00011582894882870447, + "loss": 2.1761, + "step": 21239 + }, + { + "epoch": 2.47812390619531, + "grad_norm": 1.1132723093032837, + "learning_rate": 0.00011581427174386828, + "loss": 2.0601, + "step": 21240 + }, + { + "epoch": 2.478240578695601, + "grad_norm": 1.2854355573654175, + "learning_rate": 0.00011579959502898227, + "loss": 2.2122, + "step": 21241 + }, + { + "epoch": 2.4783572511958933, + "grad_norm": 1.1272075176239014, + "learning_rate": 0.00011578491868419857, + "loss": 2.0349, + "step": 21242 + }, + { + "epoch": 2.4784739236961846, + "grad_norm": 1.1710909605026245, + "learning_rate": 0.00011577024270966925, + "loss": 1.9618, + "step": 21243 + }, + { + "epoch": 2.4785905961964767, + "grad_norm": 1.2561392784118652, + "learning_rate": 0.00011575556710554656, + "loss": 2.0948, + "step": 21244 + }, + { + "epoch": 2.478707268696768, + "grad_norm": 1.2852851152420044, + "learning_rate": 0.00011574089187198264, + "loss": 2.0756, + "step": 21245 + }, + { + "epoch": 2.47882394119706, + "grad_norm": 1.1501458883285522, + "learning_rate": 0.00011572621700912956, + "loss": 1.9484, + "step": 21246 + }, + { + "epoch": 2.4789406136973513, + "grad_norm": 1.3342499732971191, + "learning_rate": 0.00011571154251713949, + "loss": 2.0276, + "step": 21247 + }, + { + "epoch": 2.4790572861976434, + "grad_norm": 1.283773422241211, + "learning_rate": 0.00011569686839616457, + "loss": 1.9536, + "step": 21248 + }, + { + "epoch": 2.4791739586979347, + "grad_norm": 1.1119329929351807, + "learning_rate": 0.0001156821946463569, + "loss": 1.9688, + "step": 21249 + }, + { + "epoch": 2.479290631198227, + "grad_norm": 1.1837143898010254, + "learning_rate": 0.0001156675212678686, + "loss": 1.9734, + "step": 21250 + }, + { + "epoch": 2.479407303698518, + "grad_norm": 1.2638903856277466, + "learning_rate": 0.00011565284826085182, + "loss": 2.1236, + "step": 21251 + }, + { + "epoch": 2.47952397619881, + "grad_norm": 1.1152966022491455, + "learning_rate": 0.00011563817562545857, + "loss": 1.9426, + "step": 21252 + }, + { + "epoch": 2.4796406486991014, + "grad_norm": 1.0935132503509521, + "learning_rate": 0.00011562350336184114, + "loss": 1.9155, + "step": 21253 + }, + { + "epoch": 2.4797573211993935, + "grad_norm": 0.9589179158210754, + "learning_rate": 0.00011560883147015146, + "loss": 1.7351, + "step": 21254 + }, + { + "epoch": 2.4798739936996848, + "grad_norm": 1.1713743209838867, + "learning_rate": 0.00011559415995054178, + "loss": 1.8922, + "step": 21255 + }, + { + "epoch": 2.479990666199977, + "grad_norm": 1.0889431238174438, + "learning_rate": 0.00011557948880316407, + "loss": 1.9397, + "step": 21256 + }, + { + "epoch": 2.480107338700268, + "grad_norm": 1.2429121732711792, + "learning_rate": 0.00011556481802817046, + "loss": 2.0125, + "step": 21257 + }, + { + "epoch": 2.4802240112005602, + "grad_norm": 1.2225306034088135, + "learning_rate": 0.00011555014762571314, + "loss": 2.1147, + "step": 21258 + }, + { + "epoch": 2.4803406837008515, + "grad_norm": 1.0552806854248047, + "learning_rate": 0.00011553547759594403, + "loss": 2.0387, + "step": 21259 + }, + { + "epoch": 2.4804573562011436, + "grad_norm": 1.1800585985183716, + "learning_rate": 0.00011552080793901538, + "loss": 1.7766, + "step": 21260 + }, + { + "epoch": 2.480574028701435, + "grad_norm": 1.1591131687164307, + "learning_rate": 0.00011550613865507912, + "loss": 1.9403, + "step": 21261 + }, + { + "epoch": 2.480690701201727, + "grad_norm": 1.1134878396987915, + "learning_rate": 0.00011549146974428744, + "loss": 2.0891, + "step": 21262 + }, + { + "epoch": 2.480807373702018, + "grad_norm": 1.2303035259246826, + "learning_rate": 0.00011547680120679231, + "loss": 2.1328, + "step": 21263 + }, + { + "epoch": 2.4809240462023103, + "grad_norm": 1.26754891872406, + "learning_rate": 0.00011546213304274591, + "loss": 2.0312, + "step": 21264 + }, + { + "epoch": 2.4810407187026016, + "grad_norm": 1.0712215900421143, + "learning_rate": 0.00011544746525230021, + "loss": 1.8207, + "step": 21265 + }, + { + "epoch": 2.4811573912028937, + "grad_norm": 1.373096227645874, + "learning_rate": 0.00011543279783560736, + "loss": 1.9749, + "step": 21266 + }, + { + "epoch": 2.481274063703185, + "grad_norm": 1.044329285621643, + "learning_rate": 0.00011541813079281928, + "loss": 1.7825, + "step": 21267 + }, + { + "epoch": 2.481390736203477, + "grad_norm": 1.082488775253296, + "learning_rate": 0.00011540346412408817, + "loss": 2.0033, + "step": 21268 + }, + { + "epoch": 2.4815074087037683, + "grad_norm": 1.1369839906692505, + "learning_rate": 0.00011538879782956592, + "loss": 2.0027, + "step": 21269 + }, + { + "epoch": 2.4816240812040604, + "grad_norm": 1.2969614267349243, + "learning_rate": 0.0001153741319094047, + "loss": 2.034, + "step": 21270 + }, + { + "epoch": 2.4817407537043517, + "grad_norm": 1.0077900886535645, + "learning_rate": 0.00011535946636375655, + "loss": 1.8445, + "step": 21271 + }, + { + "epoch": 2.481857426204644, + "grad_norm": 1.241405963897705, + "learning_rate": 0.00011534480119277345, + "loss": 1.7914, + "step": 21272 + }, + { + "epoch": 2.481974098704935, + "grad_norm": 1.026513695716858, + "learning_rate": 0.00011533013639660745, + "loss": 1.9082, + "step": 21273 + }, + { + "epoch": 2.482090771205227, + "grad_norm": 1.1559433937072754, + "learning_rate": 0.00011531547197541057, + "loss": 2.0809, + "step": 21274 + }, + { + "epoch": 2.4822074437055184, + "grad_norm": 0.9930005073547363, + "learning_rate": 0.0001153008079293349, + "loss": 2.0283, + "step": 21275 + }, + { + "epoch": 2.4823241162058105, + "grad_norm": 1.3482080698013306, + "learning_rate": 0.0001152861442585323, + "loss": 2.1243, + "step": 21276 + }, + { + "epoch": 2.4824407887061017, + "grad_norm": 1.1168677806854248, + "learning_rate": 0.00011527148096315499, + "loss": 2.1397, + "step": 21277 + }, + { + "epoch": 2.482557461206394, + "grad_norm": 1.1311380863189697, + "learning_rate": 0.0001152568180433548, + "loss": 2.0016, + "step": 21278 + }, + { + "epoch": 2.482674133706685, + "grad_norm": 1.251351237297058, + "learning_rate": 0.0001152421554992839, + "loss": 1.8589, + "step": 21279 + }, + { + "epoch": 2.4827908062069772, + "grad_norm": 1.2407963275909424, + "learning_rate": 0.00011522749333109414, + "loss": 1.863, + "step": 21280 + }, + { + "epoch": 2.4829074787072685, + "grad_norm": 1.1785708665847778, + "learning_rate": 0.00011521283153893768, + "loss": 1.974, + "step": 21281 + }, + { + "epoch": 2.4830241512075606, + "grad_norm": 1.3097925186157227, + "learning_rate": 0.0001151981701229664, + "loss": 1.861, + "step": 21282 + }, + { + "epoch": 2.483140823707852, + "grad_norm": 1.1935698986053467, + "learning_rate": 0.0001151835090833323, + "loss": 2.0115, + "step": 21283 + }, + { + "epoch": 2.483257496208144, + "grad_norm": 1.1900893449783325, + "learning_rate": 0.00011516884842018745, + "loss": 1.8516, + "step": 21284 + }, + { + "epoch": 2.483374168708435, + "grad_norm": 1.2807972431182861, + "learning_rate": 0.00011515418813368374, + "loss": 1.9331, + "step": 21285 + }, + { + "epoch": 2.4834908412087273, + "grad_norm": 1.2429941892623901, + "learning_rate": 0.00011513952822397324, + "loss": 1.7395, + "step": 21286 + }, + { + "epoch": 2.4836075137090186, + "grad_norm": 1.0313196182250977, + "learning_rate": 0.00011512486869120783, + "loss": 1.865, + "step": 21287 + }, + { + "epoch": 2.4837241862093107, + "grad_norm": 1.2607274055480957, + "learning_rate": 0.00011511020953553962, + "loss": 2.0264, + "step": 21288 + }, + { + "epoch": 2.483840858709602, + "grad_norm": 1.2429053783416748, + "learning_rate": 0.00011509555075712039, + "loss": 2.0304, + "step": 21289 + }, + { + "epoch": 2.483957531209894, + "grad_norm": 1.0563884973526, + "learning_rate": 0.00011508089235610232, + "loss": 2.0533, + "step": 21290 + }, + { + "epoch": 2.4840742037101853, + "grad_norm": 1.1660100221633911, + "learning_rate": 0.00011506623433263721, + "loss": 2.0143, + "step": 21291 + }, + { + "epoch": 2.4841908762104774, + "grad_norm": 1.0873215198516846, + "learning_rate": 0.00011505157668687712, + "loss": 2.104, + "step": 21292 + }, + { + "epoch": 2.4843075487107686, + "grad_norm": 1.2731010913848877, + "learning_rate": 0.00011503691941897387, + "loss": 1.9352, + "step": 21293 + }, + { + "epoch": 2.4844242212110608, + "grad_norm": 1.184970736503601, + "learning_rate": 0.0001150222625290796, + "loss": 1.9916, + "step": 21294 + }, + { + "epoch": 2.484540893711352, + "grad_norm": 1.094702959060669, + "learning_rate": 0.00011500760601734607, + "loss": 1.9737, + "step": 21295 + }, + { + "epoch": 2.484657566211644, + "grad_norm": 1.0951303243637085, + "learning_rate": 0.00011499294988392531, + "loss": 1.974, + "step": 21296 + }, + { + "epoch": 2.4847742387119354, + "grad_norm": 1.158484697341919, + "learning_rate": 0.00011497829412896932, + "loss": 1.8997, + "step": 21297 + }, + { + "epoch": 2.4848909112122275, + "grad_norm": 1.133813500404358, + "learning_rate": 0.00011496363875262995, + "loss": 2.0714, + "step": 21298 + }, + { + "epoch": 2.4850075837125187, + "grad_norm": 1.1477307081222534, + "learning_rate": 0.00011494898375505911, + "loss": 1.9311, + "step": 21299 + }, + { + "epoch": 2.485124256212811, + "grad_norm": 1.2669034004211426, + "learning_rate": 0.00011493432913640881, + "loss": 1.8968, + "step": 21300 + }, + { + "epoch": 2.485240928713102, + "grad_norm": 1.2306870222091675, + "learning_rate": 0.00011491967489683097, + "loss": 2.0712, + "step": 21301 + }, + { + "epoch": 2.485357601213394, + "grad_norm": 1.2623769044876099, + "learning_rate": 0.00011490502103647737, + "loss": 2.0197, + "step": 21302 + }, + { + "epoch": 2.4854742737136855, + "grad_norm": 1.037174940109253, + "learning_rate": 0.00011489036755550011, + "loss": 1.9415, + "step": 21303 + }, + { + "epoch": 2.4855909462139776, + "grad_norm": 1.053453803062439, + "learning_rate": 0.00011487571445405095, + "loss": 2.1009, + "step": 21304 + }, + { + "epoch": 2.485707618714269, + "grad_norm": 1.2360310554504395, + "learning_rate": 0.00011486106173228192, + "loss": 2.1069, + "step": 21305 + }, + { + "epoch": 2.485824291214561, + "grad_norm": 1.3249213695526123, + "learning_rate": 0.00011484640939034482, + "loss": 1.9963, + "step": 21306 + }, + { + "epoch": 2.485940963714852, + "grad_norm": 1.1358994245529175, + "learning_rate": 0.00011483175742839165, + "loss": 2.0333, + "step": 21307 + }, + { + "epoch": 2.4860576362151443, + "grad_norm": 1.001793622970581, + "learning_rate": 0.00011481710584657422, + "loss": 1.9529, + "step": 21308 + }, + { + "epoch": 2.4861743087154355, + "grad_norm": 1.0769118070602417, + "learning_rate": 0.00011480245464504443, + "loss": 1.9879, + "step": 21309 + }, + { + "epoch": 2.4862909812157277, + "grad_norm": 1.1244114637374878, + "learning_rate": 0.00011478780382395423, + "loss": 2.0853, + "step": 21310 + }, + { + "epoch": 2.486407653716019, + "grad_norm": 1.0050913095474243, + "learning_rate": 0.00011477315338345541, + "loss": 1.9798, + "step": 21311 + }, + { + "epoch": 2.486524326216311, + "grad_norm": 1.1308530569076538, + "learning_rate": 0.00011475850332369998, + "loss": 1.9296, + "step": 21312 + }, + { + "epoch": 2.4866409987166023, + "grad_norm": 1.2917091846466064, + "learning_rate": 0.00011474385364483967, + "loss": 2.0146, + "step": 21313 + }, + { + "epoch": 2.4867576712168944, + "grad_norm": 1.1576716899871826, + "learning_rate": 0.00011472920434702645, + "loss": 1.7838, + "step": 21314 + }, + { + "epoch": 2.4868743437171856, + "grad_norm": 1.1998634338378906, + "learning_rate": 0.00011471455543041215, + "loss": 2.1647, + "step": 21315 + }, + { + "epoch": 2.4869910162174778, + "grad_norm": 1.351821780204773, + "learning_rate": 0.00011469990689514867, + "loss": 1.9589, + "step": 21316 + }, + { + "epoch": 2.487107688717769, + "grad_norm": 1.2290881872177124, + "learning_rate": 0.00011468525874138782, + "loss": 1.9642, + "step": 21317 + }, + { + "epoch": 2.487224361218061, + "grad_norm": 1.0921604633331299, + "learning_rate": 0.0001146706109692815, + "loss": 1.9401, + "step": 21318 + }, + { + "epoch": 2.4873410337183524, + "grad_norm": 1.0248075723648071, + "learning_rate": 0.0001146559635789815, + "loss": 1.8013, + "step": 21319 + }, + { + "epoch": 2.4874577062186445, + "grad_norm": 1.0460466146469116, + "learning_rate": 0.00011464131657063974, + "loss": 1.7759, + "step": 21320 + }, + { + "epoch": 2.4875743787189357, + "grad_norm": 1.1466740369796753, + "learning_rate": 0.00011462666994440798, + "loss": 1.9339, + "step": 21321 + }, + { + "epoch": 2.487691051219228, + "grad_norm": 1.1704485416412354, + "learning_rate": 0.00011461202370043818, + "loss": 2.0401, + "step": 21322 + }, + { + "epoch": 2.487807723719519, + "grad_norm": 1.297194480895996, + "learning_rate": 0.00011459737783888205, + "loss": 2.0172, + "step": 21323 + }, + { + "epoch": 2.487924396219811, + "grad_norm": 1.2428898811340332, + "learning_rate": 0.00011458273235989152, + "loss": 1.9589, + "step": 21324 + }, + { + "epoch": 2.4880410687201024, + "grad_norm": 1.1122808456420898, + "learning_rate": 0.00011456808726361835, + "loss": 1.9903, + "step": 21325 + }, + { + "epoch": 2.4881577412203946, + "grad_norm": 1.1966488361358643, + "learning_rate": 0.00011455344255021441, + "loss": 2.0889, + "step": 21326 + }, + { + "epoch": 2.488274413720686, + "grad_norm": 1.139525055885315, + "learning_rate": 0.00011453879821983152, + "loss": 1.9386, + "step": 21327 + }, + { + "epoch": 2.488391086220978, + "grad_norm": 1.1546173095703125, + "learning_rate": 0.00011452415427262143, + "loss": 2.0979, + "step": 21328 + }, + { + "epoch": 2.488507758721269, + "grad_norm": 1.0955991744995117, + "learning_rate": 0.00011450951070873604, + "loss": 1.9699, + "step": 21329 + }, + { + "epoch": 2.4886244312215613, + "grad_norm": 1.1257383823394775, + "learning_rate": 0.00011449486752832707, + "loss": 1.9542, + "step": 21330 + }, + { + "epoch": 2.4887411037218525, + "grad_norm": 1.2930145263671875, + "learning_rate": 0.00011448022473154644, + "loss": 1.9431, + "step": 21331 + }, + { + "epoch": 2.4888577762221447, + "grad_norm": 1.4211431741714478, + "learning_rate": 0.00011446558231854582, + "loss": 2.0045, + "step": 21332 + }, + { + "epoch": 2.488974448722436, + "grad_norm": 1.1041102409362793, + "learning_rate": 0.00011445094028947712, + "loss": 2.0345, + "step": 21333 + }, + { + "epoch": 2.489091121222728, + "grad_norm": 1.0748732089996338, + "learning_rate": 0.00011443629864449207, + "loss": 2.1259, + "step": 21334 + }, + { + "epoch": 2.4892077937230193, + "grad_norm": 1.2923187017440796, + "learning_rate": 0.00011442165738374246, + "loss": 2.0667, + "step": 21335 + }, + { + "epoch": 2.4893244662233114, + "grad_norm": 1.1525216102600098, + "learning_rate": 0.00011440701650738008, + "loss": 1.9935, + "step": 21336 + }, + { + "epoch": 2.4894411387236026, + "grad_norm": 1.0579591989517212, + "learning_rate": 0.00011439237601555669, + "loss": 1.9176, + "step": 21337 + }, + { + "epoch": 2.4895578112238947, + "grad_norm": 1.0509614944458008, + "learning_rate": 0.00011437773590842416, + "loss": 1.9832, + "step": 21338 + }, + { + "epoch": 2.489674483724186, + "grad_norm": 1.2480343580245972, + "learning_rate": 0.00011436309618613413, + "loss": 2.1537, + "step": 21339 + }, + { + "epoch": 2.489791156224478, + "grad_norm": 1.1068841218948364, + "learning_rate": 0.0001143484568488385, + "loss": 2.0379, + "step": 21340 + }, + { + "epoch": 2.4899078287247693, + "grad_norm": 1.1742968559265137, + "learning_rate": 0.00011433381789668891, + "loss": 2.1013, + "step": 21341 + }, + { + "epoch": 2.4900245012250615, + "grad_norm": 1.2605223655700684, + "learning_rate": 0.0001143191793298372, + "loss": 1.9979, + "step": 21342 + }, + { + "epoch": 2.4901411737253527, + "grad_norm": 1.0181844234466553, + "learning_rate": 0.00011430454114843511, + "loss": 1.7755, + "step": 21343 + }, + { + "epoch": 2.490257846225645, + "grad_norm": 0.9922463893890381, + "learning_rate": 0.0001142899033526344, + "loss": 1.9249, + "step": 21344 + }, + { + "epoch": 2.490374518725936, + "grad_norm": 1.0557703971862793, + "learning_rate": 0.00011427526594258675, + "loss": 1.957, + "step": 21345 + }, + { + "epoch": 2.490491191226228, + "grad_norm": 1.0353895425796509, + "learning_rate": 0.00011426062891844402, + "loss": 1.6911, + "step": 21346 + }, + { + "epoch": 2.4906078637265194, + "grad_norm": 0.9100457429885864, + "learning_rate": 0.00011424599228035782, + "loss": 1.7775, + "step": 21347 + }, + { + "epoch": 2.4907245362268116, + "grad_norm": 1.1143405437469482, + "learning_rate": 0.00011423135602848004, + "loss": 1.8958, + "step": 21348 + }, + { + "epoch": 2.490841208727103, + "grad_norm": 1.2361100912094116, + "learning_rate": 0.00011421672016296224, + "loss": 1.8962, + "step": 21349 + }, + { + "epoch": 2.490957881227395, + "grad_norm": 1.2131465673446655, + "learning_rate": 0.00011420208468395626, + "loss": 2.0024, + "step": 21350 + }, + { + "epoch": 2.491074553727686, + "grad_norm": 1.2777981758117676, + "learning_rate": 0.00011418744959161387, + "loss": 2.0854, + "step": 21351 + }, + { + "epoch": 2.4911912262279783, + "grad_norm": 1.07626211643219, + "learning_rate": 0.00011417281488608665, + "loss": 1.9945, + "step": 21352 + }, + { + "epoch": 2.4913078987282695, + "grad_norm": 1.2547359466552734, + "learning_rate": 0.00011415818056752644, + "loss": 2.0049, + "step": 21353 + }, + { + "epoch": 2.4914245712285616, + "grad_norm": 1.2120599746704102, + "learning_rate": 0.00011414354663608485, + "loss": 1.7497, + "step": 21354 + }, + { + "epoch": 2.491541243728853, + "grad_norm": 1.0875507593154907, + "learning_rate": 0.00011412891309191367, + "loss": 2.0738, + "step": 21355 + }, + { + "epoch": 2.491657916229145, + "grad_norm": 1.0376728773117065, + "learning_rate": 0.00011411427993516454, + "loss": 1.9905, + "step": 21356 + }, + { + "epoch": 2.4917745887294362, + "grad_norm": 1.2966418266296387, + "learning_rate": 0.00011409964716598926, + "loss": 1.9844, + "step": 21357 + }, + { + "epoch": 2.4918912612297284, + "grad_norm": 1.2803497314453125, + "learning_rate": 0.00011408501478453937, + "loss": 2.0414, + "step": 21358 + }, + { + "epoch": 2.4920079337300196, + "grad_norm": 1.293264389038086, + "learning_rate": 0.00011407038279096674, + "loss": 1.9833, + "step": 21359 + }, + { + "epoch": 2.4921246062303117, + "grad_norm": 1.0886272192001343, + "learning_rate": 0.00011405575118542294, + "loss": 1.935, + "step": 21360 + }, + { + "epoch": 2.492241278730603, + "grad_norm": 1.2488508224487305, + "learning_rate": 0.00011404111996805971, + "loss": 1.929, + "step": 21361 + }, + { + "epoch": 2.492357951230895, + "grad_norm": 1.1548426151275635, + "learning_rate": 0.00011402648913902865, + "loss": 2.0263, + "step": 21362 + }, + { + "epoch": 2.4924746237311863, + "grad_norm": 1.1102054119110107, + "learning_rate": 0.00011401185869848148, + "loss": 1.8895, + "step": 21363 + }, + { + "epoch": 2.4925912962314785, + "grad_norm": 1.1580036878585815, + "learning_rate": 0.00011399722864656995, + "loss": 1.8944, + "step": 21364 + }, + { + "epoch": 2.4927079687317697, + "grad_norm": 1.197605013847351, + "learning_rate": 0.00011398259898344561, + "loss": 2.0958, + "step": 21365 + }, + { + "epoch": 2.492824641232062, + "grad_norm": 1.2173668146133423, + "learning_rate": 0.00011396796970926023, + "loss": 2.0343, + "step": 21366 + }, + { + "epoch": 2.492941313732353, + "grad_norm": 1.1891438961029053, + "learning_rate": 0.00011395334082416538, + "loss": 1.9953, + "step": 21367 + }, + { + "epoch": 2.493057986232645, + "grad_norm": 1.0986560583114624, + "learning_rate": 0.00011393871232831277, + "loss": 1.8911, + "step": 21368 + }, + { + "epoch": 2.4931746587329364, + "grad_norm": 1.145971417427063, + "learning_rate": 0.00011392408422185403, + "loss": 2.0179, + "step": 21369 + }, + { + "epoch": 2.4932913312332285, + "grad_norm": 1.0423321723937988, + "learning_rate": 0.00011390945650494085, + "loss": 2.0321, + "step": 21370 + }, + { + "epoch": 2.49340800373352, + "grad_norm": 1.1988822221755981, + "learning_rate": 0.00011389482917772476, + "loss": 2.1725, + "step": 21371 + }, + { + "epoch": 2.493524676233812, + "grad_norm": 1.1136159896850586, + "learning_rate": 0.00011388020224035755, + "loss": 1.9583, + "step": 21372 + }, + { + "epoch": 2.493641348734103, + "grad_norm": 1.3302788734436035, + "learning_rate": 0.0001138655756929907, + "loss": 2.148, + "step": 21373 + }, + { + "epoch": 2.4937580212343953, + "grad_norm": 1.3266853094100952, + "learning_rate": 0.00011385094953577602, + "loss": 2.091, + "step": 21374 + }, + { + "epoch": 2.4938746937346865, + "grad_norm": 1.0469863414764404, + "learning_rate": 0.00011383632376886495, + "loss": 1.8837, + "step": 21375 + }, + { + "epoch": 2.4939913662349786, + "grad_norm": 1.268001675605774, + "learning_rate": 0.00011382169839240923, + "loss": 2.138, + "step": 21376 + }, + { + "epoch": 2.49410803873527, + "grad_norm": 1.1920344829559326, + "learning_rate": 0.00011380707340656052, + "loss": 1.8399, + "step": 21377 + }, + { + "epoch": 2.494224711235562, + "grad_norm": 1.0638395547866821, + "learning_rate": 0.00011379244881147029, + "loss": 1.8513, + "step": 21378 + }, + { + "epoch": 2.4943413837358532, + "grad_norm": 1.1244570016860962, + "learning_rate": 0.00011377782460729029, + "loss": 1.8119, + "step": 21379 + }, + { + "epoch": 2.4944580562361454, + "grad_norm": 1.1334584951400757, + "learning_rate": 0.00011376320079417202, + "loss": 1.8795, + "step": 21380 + }, + { + "epoch": 2.4945747287364366, + "grad_norm": 1.2202560901641846, + "learning_rate": 0.00011374857737226722, + "loss": 1.8093, + "step": 21381 + }, + { + "epoch": 2.4946914012367287, + "grad_norm": 1.1977638006210327, + "learning_rate": 0.0001137339543417273, + "loss": 1.9263, + "step": 21382 + }, + { + "epoch": 2.49480807373702, + "grad_norm": 1.2940369844436646, + "learning_rate": 0.00011371933170270405, + "loss": 2.0171, + "step": 21383 + }, + { + "epoch": 2.494924746237312, + "grad_norm": 1.3740123510360718, + "learning_rate": 0.00011370470945534889, + "loss": 1.9212, + "step": 21384 + }, + { + "epoch": 2.4950414187376033, + "grad_norm": 1.2532060146331787, + "learning_rate": 0.00011369008759981356, + "loss": 1.9304, + "step": 21385 + }, + { + "epoch": 2.4951580912378954, + "grad_norm": 1.112714409828186, + "learning_rate": 0.00011367546613624953, + "loss": 2.0562, + "step": 21386 + }, + { + "epoch": 2.4952747637381867, + "grad_norm": 1.0868165493011475, + "learning_rate": 0.00011366084506480846, + "loss": 1.7379, + "step": 21387 + }, + { + "epoch": 2.495391436238479, + "grad_norm": 1.3210324048995972, + "learning_rate": 0.00011364622438564182, + "loss": 1.9844, + "step": 21388 + }, + { + "epoch": 2.49550810873877, + "grad_norm": 1.0424425601959229, + "learning_rate": 0.00011363160409890124, + "loss": 1.9107, + "step": 21389 + }, + { + "epoch": 2.495624781239062, + "grad_norm": 1.1634925603866577, + "learning_rate": 0.00011361698420473834, + "loss": 1.9711, + "step": 21390 + }, + { + "epoch": 2.4957414537393534, + "grad_norm": 1.2085024118423462, + "learning_rate": 0.00011360236470330463, + "loss": 1.9472, + "step": 21391 + }, + { + "epoch": 2.4958581262396455, + "grad_norm": 1.2064541578292847, + "learning_rate": 0.00011358774559475169, + "loss": 1.909, + "step": 21392 + }, + { + "epoch": 2.4959747987399368, + "grad_norm": 1.1167298555374146, + "learning_rate": 0.00011357312687923103, + "loss": 1.9429, + "step": 21393 + }, + { + "epoch": 2.496091471240229, + "grad_norm": 1.3167738914489746, + "learning_rate": 0.00011355850855689424, + "loss": 2.147, + "step": 21394 + }, + { + "epoch": 2.49620814374052, + "grad_norm": 1.1667225360870361, + "learning_rate": 0.00011354389062789287, + "loss": 2.0103, + "step": 21395 + }, + { + "epoch": 2.4963248162408123, + "grad_norm": 1.1831119060516357, + "learning_rate": 0.00011352927309237848, + "loss": 2.0377, + "step": 21396 + }, + { + "epoch": 2.4964414887411035, + "grad_norm": 1.2040637731552124, + "learning_rate": 0.00011351465595050249, + "loss": 1.7555, + "step": 21397 + }, + { + "epoch": 2.4965581612413956, + "grad_norm": 1.1420279741287231, + "learning_rate": 0.0001135000392024166, + "loss": 2.0267, + "step": 21398 + }, + { + "epoch": 2.496674833741687, + "grad_norm": 1.0502139329910278, + "learning_rate": 0.0001134854228482722, + "loss": 1.8844, + "step": 21399 + }, + { + "epoch": 2.496791506241979, + "grad_norm": 1.1127454042434692, + "learning_rate": 0.00011347080688822096, + "loss": 1.8678, + "step": 21400 + }, + { + "epoch": 2.4969081787422702, + "grad_norm": 0.9478994011878967, + "learning_rate": 0.00011345619132241422, + "loss": 1.9069, + "step": 21401 + }, + { + "epoch": 2.4970248512425623, + "grad_norm": 1.0949532985687256, + "learning_rate": 0.00011344157615100364, + "loss": 1.9355, + "step": 21402 + }, + { + "epoch": 2.4971415237428536, + "grad_norm": 1.1152942180633545, + "learning_rate": 0.00011342696137414074, + "loss": 1.9893, + "step": 21403 + }, + { + "epoch": 2.4972581962431457, + "grad_norm": 1.177109956741333, + "learning_rate": 0.00011341234699197692, + "loss": 2.0501, + "step": 21404 + }, + { + "epoch": 2.497374868743437, + "grad_norm": 1.1655818223953247, + "learning_rate": 0.0001133977330046638, + "loss": 1.9937, + "step": 21405 + }, + { + "epoch": 2.497491541243729, + "grad_norm": 1.0407969951629639, + "learning_rate": 0.00011338311941235276, + "loss": 1.9526, + "step": 21406 + }, + { + "epoch": 2.4976082137440203, + "grad_norm": 1.1478244066238403, + "learning_rate": 0.00011336850621519544, + "loss": 1.9408, + "step": 21407 + }, + { + "epoch": 2.4977248862443124, + "grad_norm": 1.1009798049926758, + "learning_rate": 0.00011335389341334321, + "loss": 2.0621, + "step": 21408 + }, + { + "epoch": 2.4978415587446037, + "grad_norm": 1.0076878070831299, + "learning_rate": 0.00011333928100694766, + "loss": 1.8096, + "step": 21409 + }, + { + "epoch": 2.497958231244896, + "grad_norm": 1.0876076221466064, + "learning_rate": 0.00011332466899616016, + "loss": 1.9295, + "step": 21410 + }, + { + "epoch": 2.498074903745187, + "grad_norm": 1.1293203830718994, + "learning_rate": 0.00011331005738113233, + "loss": 1.8392, + "step": 21411 + }, + { + "epoch": 2.498191576245479, + "grad_norm": 1.2170463800430298, + "learning_rate": 0.00011329544616201554, + "loss": 1.8576, + "step": 21412 + }, + { + "epoch": 2.4983082487457704, + "grad_norm": 1.130742073059082, + "learning_rate": 0.0001132808353389613, + "loss": 2.0463, + "step": 21413 + }, + { + "epoch": 2.4984249212460625, + "grad_norm": 1.153310775756836, + "learning_rate": 0.00011326622491212105, + "loss": 2.0432, + "step": 21414 + }, + { + "epoch": 2.4985415937463538, + "grad_norm": 1.1552363634109497, + "learning_rate": 0.00011325161488164628, + "loss": 2.0333, + "step": 21415 + }, + { + "epoch": 2.498658266246646, + "grad_norm": 1.0705088376998901, + "learning_rate": 0.00011323700524768849, + "loss": 1.9714, + "step": 21416 + }, + { + "epoch": 2.498774938746937, + "grad_norm": 1.1514581441879272, + "learning_rate": 0.00011322239601039907, + "loss": 1.9568, + "step": 21417 + }, + { + "epoch": 2.4988916112472292, + "grad_norm": 1.2017948627471924, + "learning_rate": 0.00011320778716992952, + "loss": 1.9863, + "step": 21418 + }, + { + "epoch": 2.4990082837475205, + "grad_norm": 1.0850528478622437, + "learning_rate": 0.00011319317872643125, + "loss": 2.0075, + "step": 21419 + }, + { + "epoch": 2.4991249562478126, + "grad_norm": 1.1405969858169556, + "learning_rate": 0.00011317857068005575, + "loss": 1.8907, + "step": 21420 + }, + { + "epoch": 2.499241628748104, + "grad_norm": 1.0208752155303955, + "learning_rate": 0.00011316396303095441, + "loss": 1.7974, + "step": 21421 + }, + { + "epoch": 2.499358301248396, + "grad_norm": 1.2495198249816895, + "learning_rate": 0.00011314935577927872, + "loss": 1.9042, + "step": 21422 + }, + { + "epoch": 2.499474973748687, + "grad_norm": 1.139682412147522, + "learning_rate": 0.00011313474892518001, + "loss": 1.7919, + "step": 21423 + }, + { + "epoch": 2.4995916462489793, + "grad_norm": 1.0087484121322632, + "learning_rate": 0.00011312014246880985, + "loss": 1.8212, + "step": 21424 + }, + { + "epoch": 2.4997083187492706, + "grad_norm": 1.2426849603652954, + "learning_rate": 0.00011310553641031953, + "loss": 1.9347, + "step": 21425 + }, + { + "epoch": 2.4998249912495627, + "grad_norm": 1.2780693769454956, + "learning_rate": 0.0001130909307498606, + "loss": 1.915, + "step": 21426 + }, + { + "epoch": 2.499941663749854, + "grad_norm": 1.219252109527588, + "learning_rate": 0.00011307632548758431, + "loss": 2.0948, + "step": 21427 + }, + { + "epoch": 2.500058336250146, + "grad_norm": 1.1861201524734497, + "learning_rate": 0.00011306172062364224, + "loss": 1.943, + "step": 21428 + }, + { + "epoch": 2.5001750087504373, + "grad_norm": 1.2255017757415771, + "learning_rate": 0.00011304711615818575, + "loss": 1.8995, + "step": 21429 + }, + { + "epoch": 2.5002916812507294, + "grad_norm": 1.0967129468917847, + "learning_rate": 0.00011303251209136614, + "loss": 2.0296, + "step": 21430 + }, + { + "epoch": 2.5004083537510207, + "grad_norm": 1.101889967918396, + "learning_rate": 0.00011301790842333495, + "loss": 1.9327, + "step": 21431 + }, + { + "epoch": 2.500525026251313, + "grad_norm": 1.2149691581726074, + "learning_rate": 0.00011300330515424346, + "loss": 1.9563, + "step": 21432 + }, + { + "epoch": 2.500641698751604, + "grad_norm": 1.183841586112976, + "learning_rate": 0.00011298870228424317, + "loss": 2.0882, + "step": 21433 + }, + { + "epoch": 2.500758371251896, + "grad_norm": 1.174075722694397, + "learning_rate": 0.00011297409981348534, + "loss": 1.9116, + "step": 21434 + }, + { + "epoch": 2.5008750437521874, + "grad_norm": 1.091601848602295, + "learning_rate": 0.00011295949774212147, + "loss": 2.0923, + "step": 21435 + }, + { + "epoch": 2.5009917162524795, + "grad_norm": 1.01925790309906, + "learning_rate": 0.00011294489607030287, + "loss": 1.9473, + "step": 21436 + }, + { + "epoch": 2.5011083887527708, + "grad_norm": 1.055532693862915, + "learning_rate": 0.00011293029479818095, + "loss": 1.9652, + "step": 21437 + }, + { + "epoch": 2.501225061253063, + "grad_norm": 1.187613606452942, + "learning_rate": 0.00011291569392590705, + "loss": 2.1408, + "step": 21438 + }, + { + "epoch": 2.501341733753354, + "grad_norm": 1.1902130842208862, + "learning_rate": 0.00011290109345363256, + "loss": 1.9237, + "step": 21439 + }, + { + "epoch": 2.5014584062536462, + "grad_norm": 1.118202805519104, + "learning_rate": 0.00011288649338150879, + "loss": 2.0168, + "step": 21440 + }, + { + "epoch": 2.5015750787539375, + "grad_norm": 1.1552941799163818, + "learning_rate": 0.00011287189370968713, + "loss": 2.0985, + "step": 21441 + }, + { + "epoch": 2.5016917512542296, + "grad_norm": 1.1437137126922607, + "learning_rate": 0.00011285729443831899, + "loss": 1.9207, + "step": 21442 + }, + { + "epoch": 2.501808423754521, + "grad_norm": 1.2096658945083618, + "learning_rate": 0.00011284269556755562, + "loss": 1.9484, + "step": 21443 + }, + { + "epoch": 2.501925096254813, + "grad_norm": 1.0648237466812134, + "learning_rate": 0.00011282809709754848, + "loss": 1.9379, + "step": 21444 + }, + { + "epoch": 2.502041768755104, + "grad_norm": 1.4124751091003418, + "learning_rate": 0.0001128134990284488, + "loss": 1.8798, + "step": 21445 + }, + { + "epoch": 2.5021584412553963, + "grad_norm": 1.0923069715499878, + "learning_rate": 0.00011279890136040801, + "loss": 2.0352, + "step": 21446 + }, + { + "epoch": 2.5022751137556876, + "grad_norm": 1.1509697437286377, + "learning_rate": 0.00011278430409357731, + "loss": 2.0282, + "step": 21447 + }, + { + "epoch": 2.5023917862559797, + "grad_norm": 1.1106278896331787, + "learning_rate": 0.00011276970722810818, + "loss": 2.076, + "step": 21448 + }, + { + "epoch": 2.502508458756271, + "grad_norm": 1.2728208303451538, + "learning_rate": 0.00011275511076415185, + "loss": 1.7723, + "step": 21449 + }, + { + "epoch": 2.502625131256563, + "grad_norm": 1.0773862600326538, + "learning_rate": 0.00011274051470185969, + "loss": 1.7997, + "step": 21450 + }, + { + "epoch": 2.5027418037568543, + "grad_norm": 1.1758257150650024, + "learning_rate": 0.00011272591904138294, + "loss": 2.0171, + "step": 21451 + }, + { + "epoch": 2.5028584762571464, + "grad_norm": 1.2039556503295898, + "learning_rate": 0.00011271132378287302, + "loss": 2.0209, + "step": 21452 + }, + { + "epoch": 2.5029751487574377, + "grad_norm": 1.2296229600906372, + "learning_rate": 0.00011269672892648114, + "loss": 1.995, + "step": 21453 + }, + { + "epoch": 2.5030918212577298, + "grad_norm": 1.1897400617599487, + "learning_rate": 0.00011268213447235863, + "loss": 2.1591, + "step": 21454 + }, + { + "epoch": 2.503208493758021, + "grad_norm": 1.0575748682022095, + "learning_rate": 0.0001126675404206569, + "loss": 1.816, + "step": 21455 + }, + { + "epoch": 2.503325166258313, + "grad_norm": 0.9343520998954773, + "learning_rate": 0.00011265294677152703, + "loss": 1.9736, + "step": 21456 + }, + { + "epoch": 2.5034418387586044, + "grad_norm": 1.1622202396392822, + "learning_rate": 0.00011263835352512053, + "loss": 1.7545, + "step": 21457 + }, + { + "epoch": 2.5035585112588965, + "grad_norm": 1.1569904088974, + "learning_rate": 0.0001126237606815885, + "loss": 1.9738, + "step": 21458 + }, + { + "epoch": 2.5036751837591877, + "grad_norm": 1.2280787229537964, + "learning_rate": 0.00011260916824108237, + "loss": 2.1675, + "step": 21459 + }, + { + "epoch": 2.50379185625948, + "grad_norm": 1.1098363399505615, + "learning_rate": 0.00011259457620375332, + "loss": 1.9522, + "step": 21460 + }, + { + "epoch": 2.503908528759771, + "grad_norm": 1.1947007179260254, + "learning_rate": 0.0001125799845697527, + "loss": 1.9198, + "step": 21461 + }, + { + "epoch": 2.5040252012600632, + "grad_norm": 1.174928069114685, + "learning_rate": 0.0001125653933392317, + "loss": 2.0145, + "step": 21462 + }, + { + "epoch": 2.5041418737603545, + "grad_norm": 1.1866384744644165, + "learning_rate": 0.00011255080251234167, + "loss": 1.9105, + "step": 21463 + }, + { + "epoch": 2.5042585462606466, + "grad_norm": 1.3519840240478516, + "learning_rate": 0.0001125362120892338, + "loss": 1.9381, + "step": 21464 + }, + { + "epoch": 2.504375218760938, + "grad_norm": 1.3058291673660278, + "learning_rate": 0.0001125216220700594, + "loss": 2.0745, + "step": 21465 + }, + { + "epoch": 2.50449189126123, + "grad_norm": 1.0168616771697998, + "learning_rate": 0.00011250703245496965, + "loss": 2.0633, + "step": 21466 + }, + { + "epoch": 2.504608563761521, + "grad_norm": 1.223578929901123, + "learning_rate": 0.00011249244324411583, + "loss": 2.0507, + "step": 21467 + }, + { + "epoch": 2.5047252362618133, + "grad_norm": 1.0696903467178345, + "learning_rate": 0.00011247785443764928, + "loss": 1.9599, + "step": 21468 + }, + { + "epoch": 2.5048419087621046, + "grad_norm": 1.135331392288208, + "learning_rate": 0.0001124632660357211, + "loss": 2.0379, + "step": 21469 + }, + { + "epoch": 2.5049585812623967, + "grad_norm": 1.0002665519714355, + "learning_rate": 0.00011244867803848266, + "loss": 1.8456, + "step": 21470 + }, + { + "epoch": 2.505075253762688, + "grad_norm": 1.135439157485962, + "learning_rate": 0.00011243409044608506, + "loss": 1.9407, + "step": 21471 + }, + { + "epoch": 2.50519192626298, + "grad_norm": 1.1531705856323242, + "learning_rate": 0.00011241950325867967, + "loss": 1.9858, + "step": 21472 + }, + { + "epoch": 2.5053085987632713, + "grad_norm": 1.1455270051956177, + "learning_rate": 0.00011240491647641753, + "loss": 2.0201, + "step": 21473 + }, + { + "epoch": 2.5054252712635634, + "grad_norm": 1.0792732238769531, + "learning_rate": 0.00011239033009945004, + "loss": 1.9203, + "step": 21474 + }, + { + "epoch": 2.5055419437638546, + "grad_norm": 0.9986802339553833, + "learning_rate": 0.0001123757441279283, + "loss": 1.9086, + "step": 21475 + }, + { + "epoch": 2.5056586162641468, + "grad_norm": 1.2789411544799805, + "learning_rate": 0.00011236115856200358, + "loss": 2.0112, + "step": 21476 + }, + { + "epoch": 2.505775288764438, + "grad_norm": 1.212470293045044, + "learning_rate": 0.00011234657340182702, + "loss": 1.9223, + "step": 21477 + }, + { + "epoch": 2.50589196126473, + "grad_norm": 1.3010108470916748, + "learning_rate": 0.00011233198864754994, + "loss": 1.9341, + "step": 21478 + }, + { + "epoch": 2.5060086337650214, + "grad_norm": 1.2068341970443726, + "learning_rate": 0.0001123174042993234, + "loss": 2.0414, + "step": 21479 + }, + { + "epoch": 2.5061253062653135, + "grad_norm": 1.1751885414123535, + "learning_rate": 0.0001123028203572987, + "loss": 1.9938, + "step": 21480 + }, + { + "epoch": 2.5062419787656047, + "grad_norm": 1.1288787126541138, + "learning_rate": 0.00011228823682162703, + "loss": 2.0285, + "step": 21481 + }, + { + "epoch": 2.506358651265897, + "grad_norm": 1.1572282314300537, + "learning_rate": 0.00011227365369245944, + "loss": 1.7997, + "step": 21482 + }, + { + "epoch": 2.506475323766188, + "grad_norm": 1.0633951425552368, + "learning_rate": 0.00011225907096994731, + "loss": 1.8779, + "step": 21483 + }, + { + "epoch": 2.50659199626648, + "grad_norm": 1.265939474105835, + "learning_rate": 0.00011224448865424165, + "loss": 1.9948, + "step": 21484 + }, + { + "epoch": 2.5067086687667715, + "grad_norm": 1.1771961450576782, + "learning_rate": 0.00011222990674549375, + "loss": 1.9523, + "step": 21485 + }, + { + "epoch": 2.5068253412670636, + "grad_norm": 1.1132179498672485, + "learning_rate": 0.00011221532524385468, + "loss": 1.8183, + "step": 21486 + }, + { + "epoch": 2.506942013767355, + "grad_norm": 1.2203218936920166, + "learning_rate": 0.00011220074414947571, + "loss": 2.0778, + "step": 21487 + }, + { + "epoch": 2.507058686267647, + "grad_norm": 1.1274564266204834, + "learning_rate": 0.00011218616346250793, + "loss": 1.8855, + "step": 21488 + }, + { + "epoch": 2.507175358767938, + "grad_norm": 1.1913937330245972, + "learning_rate": 0.0001121715831831025, + "loss": 2.0491, + "step": 21489 + }, + { + "epoch": 2.5072920312682303, + "grad_norm": 1.2580602169036865, + "learning_rate": 0.00011215700331141057, + "loss": 2.0958, + "step": 21490 + }, + { + "epoch": 2.5074087037685215, + "grad_norm": 1.1371742486953735, + "learning_rate": 0.00011214242384758337, + "loss": 1.8706, + "step": 21491 + }, + { + "epoch": 2.5075253762688137, + "grad_norm": 1.132633924484253, + "learning_rate": 0.00011212784479177188, + "loss": 2.0351, + "step": 21492 + }, + { + "epoch": 2.507642048769105, + "grad_norm": 1.0794583559036255, + "learning_rate": 0.00011211326614412737, + "loss": 1.907, + "step": 21493 + }, + { + "epoch": 2.507758721269397, + "grad_norm": 1.1448544263839722, + "learning_rate": 0.00011209868790480098, + "loss": 1.9064, + "step": 21494 + }, + { + "epoch": 2.5078753937696883, + "grad_norm": 1.28543221950531, + "learning_rate": 0.00011208411007394375, + "loss": 2.1711, + "step": 21495 + }, + { + "epoch": 2.5079920662699804, + "grad_norm": 1.1525123119354248, + "learning_rate": 0.00011206953265170692, + "loss": 1.9623, + "step": 21496 + }, + { + "epoch": 2.5081087387702716, + "grad_norm": 1.1545242071151733, + "learning_rate": 0.00011205495563824151, + "loss": 1.8316, + "step": 21497 + }, + { + "epoch": 2.5082254112705638, + "grad_norm": 1.1517629623413086, + "learning_rate": 0.0001120403790336987, + "loss": 2.0645, + "step": 21498 + }, + { + "epoch": 2.508342083770855, + "grad_norm": 1.0256515741348267, + "learning_rate": 0.00011202580283822956, + "loss": 1.8909, + "step": 21499 + }, + { + "epoch": 2.508458756271147, + "grad_norm": 1.3879278898239136, + "learning_rate": 0.00011201122705198528, + "loss": 1.9627, + "step": 21500 + }, + { + "epoch": 2.5085754287714384, + "grad_norm": 1.2349977493286133, + "learning_rate": 0.00011199665167511683, + "loss": 2.005, + "step": 21501 + }, + { + "epoch": 2.5086921012717305, + "grad_norm": 1.1792176961898804, + "learning_rate": 0.00011198207670777544, + "loss": 1.9457, + "step": 21502 + }, + { + "epoch": 2.5088087737720217, + "grad_norm": 1.3118515014648438, + "learning_rate": 0.00011196750215011212, + "loss": 1.9346, + "step": 21503 + }, + { + "epoch": 2.508925446272314, + "grad_norm": 1.183637022972107, + "learning_rate": 0.00011195292800227805, + "loss": 1.9154, + "step": 21504 + }, + { + "epoch": 2.509042118772605, + "grad_norm": 1.3280913829803467, + "learning_rate": 0.00011193835426442426, + "loss": 2.0567, + "step": 21505 + }, + { + "epoch": 2.509158791272897, + "grad_norm": 1.1954056024551392, + "learning_rate": 0.0001119237809367018, + "loss": 1.9827, + "step": 21506 + }, + { + "epoch": 2.5092754637731884, + "grad_norm": 1.1505268812179565, + "learning_rate": 0.00011190920801926183, + "loss": 1.998, + "step": 21507 + }, + { + "epoch": 2.5093921362734806, + "grad_norm": 1.1655734777450562, + "learning_rate": 0.00011189463551225537, + "loss": 2.0956, + "step": 21508 + }, + { + "epoch": 2.509508808773772, + "grad_norm": 1.2752578258514404, + "learning_rate": 0.00011188006341583356, + "loss": 2.1164, + "step": 21509 + }, + { + "epoch": 2.509625481274064, + "grad_norm": 1.14800226688385, + "learning_rate": 0.00011186549173014735, + "loss": 1.8983, + "step": 21510 + }, + { + "epoch": 2.509742153774355, + "grad_norm": 1.1246060132980347, + "learning_rate": 0.00011185092045534792, + "loss": 2.1413, + "step": 21511 + }, + { + "epoch": 2.5098588262746473, + "grad_norm": 1.0869964361190796, + "learning_rate": 0.00011183634959158625, + "loss": 1.9314, + "step": 21512 + }, + { + "epoch": 2.5099754987749385, + "grad_norm": 1.2121268510818481, + "learning_rate": 0.00011182177913901348, + "loss": 1.9666, + "step": 21513 + }, + { + "epoch": 2.5100921712752307, + "grad_norm": 1.1160086393356323, + "learning_rate": 0.00011180720909778058, + "loss": 1.9282, + "step": 21514 + }, + { + "epoch": 2.510208843775522, + "grad_norm": 1.264534592628479, + "learning_rate": 0.00011179263946803862, + "loss": 1.9804, + "step": 21515 + }, + { + "epoch": 2.510325516275814, + "grad_norm": 1.1087582111358643, + "learning_rate": 0.00011177807024993861, + "loss": 1.7344, + "step": 21516 + }, + { + "epoch": 2.5104421887761053, + "grad_norm": 1.0277882814407349, + "learning_rate": 0.00011176350144363171, + "loss": 2.1584, + "step": 21517 + }, + { + "epoch": 2.5105588612763974, + "grad_norm": 1.1392346620559692, + "learning_rate": 0.00011174893304926876, + "loss": 1.9225, + "step": 21518 + }, + { + "epoch": 2.5106755337766886, + "grad_norm": 1.0440119504928589, + "learning_rate": 0.00011173436506700089, + "loss": 1.9779, + "step": 21519 + }, + { + "epoch": 2.5107922062769807, + "grad_norm": 1.2877368927001953, + "learning_rate": 0.00011171979749697921, + "loss": 2.0755, + "step": 21520 + }, + { + "epoch": 2.510908878777272, + "grad_norm": 1.1000025272369385, + "learning_rate": 0.00011170523033935463, + "loss": 1.9278, + "step": 21521 + }, + { + "epoch": 2.511025551277564, + "grad_norm": 1.2569754123687744, + "learning_rate": 0.00011169066359427817, + "loss": 1.9726, + "step": 21522 + }, + { + "epoch": 2.5111422237778553, + "grad_norm": 1.022151231765747, + "learning_rate": 0.00011167609726190089, + "loss": 1.7618, + "step": 21523 + }, + { + "epoch": 2.5112588962781475, + "grad_norm": 1.242850422859192, + "learning_rate": 0.00011166153134237383, + "loss": 1.8372, + "step": 21524 + }, + { + "epoch": 2.5113755687784387, + "grad_norm": 1.114936113357544, + "learning_rate": 0.00011164696583584784, + "loss": 2.0296, + "step": 21525 + }, + { + "epoch": 2.511492241278731, + "grad_norm": 1.090485692024231, + "learning_rate": 0.00011163240074247408, + "loss": 2.0596, + "step": 21526 + }, + { + "epoch": 2.511608913779022, + "grad_norm": 1.1643702983856201, + "learning_rate": 0.00011161783606240343, + "loss": 1.9618, + "step": 21527 + }, + { + "epoch": 2.511725586279314, + "grad_norm": 1.0959599018096924, + "learning_rate": 0.00011160327179578699, + "loss": 1.9833, + "step": 21528 + }, + { + "epoch": 2.5118422587796054, + "grad_norm": 1.1842495203018188, + "learning_rate": 0.00011158870794277563, + "loss": 2.0451, + "step": 21529 + }, + { + "epoch": 2.5119589312798976, + "grad_norm": 1.0373057126998901, + "learning_rate": 0.00011157414450352044, + "loss": 1.9991, + "step": 21530 + }, + { + "epoch": 2.512075603780189, + "grad_norm": 1.1335903406143188, + "learning_rate": 0.00011155958147817233, + "loss": 1.9231, + "step": 21531 + }, + { + "epoch": 2.512192276280481, + "grad_norm": 1.3343621492385864, + "learning_rate": 0.00011154501886688225, + "loss": 1.9896, + "step": 21532 + }, + { + "epoch": 2.512308948780772, + "grad_norm": 1.137956142425537, + "learning_rate": 0.00011153045666980127, + "loss": 2.0228, + "step": 21533 + }, + { + "epoch": 2.5124256212810643, + "grad_norm": 1.1317572593688965, + "learning_rate": 0.00011151589488708026, + "loss": 2.1445, + "step": 21534 + }, + { + "epoch": 2.5125422937813555, + "grad_norm": 1.3361115455627441, + "learning_rate": 0.00011150133351887027, + "loss": 2.2241, + "step": 21535 + }, + { + "epoch": 2.5126589662816476, + "grad_norm": 1.1764131784439087, + "learning_rate": 0.00011148677256532212, + "loss": 1.7845, + "step": 21536 + }, + { + "epoch": 2.512775638781939, + "grad_norm": 1.2522069215774536, + "learning_rate": 0.00011147221202658693, + "loss": 2.0709, + "step": 21537 + }, + { + "epoch": 2.512892311282231, + "grad_norm": 1.067413330078125, + "learning_rate": 0.00011145765190281546, + "loss": 1.912, + "step": 21538 + }, + { + "epoch": 2.5130089837825222, + "grad_norm": 1.1357595920562744, + "learning_rate": 0.00011144309219415885, + "loss": 1.8745, + "step": 21539 + }, + { + "epoch": 2.5131256562828144, + "grad_norm": 1.4114283323287964, + "learning_rate": 0.00011142853290076791, + "loss": 2.1098, + "step": 21540 + }, + { + "epoch": 2.5132423287831056, + "grad_norm": 1.0408127307891846, + "learning_rate": 0.00011141397402279364, + "loss": 1.9107, + "step": 21541 + }, + { + "epoch": 2.5133590012833977, + "grad_norm": 1.06097412109375, + "learning_rate": 0.00011139941556038685, + "loss": 1.8666, + "step": 21542 + }, + { + "epoch": 2.513475673783689, + "grad_norm": 1.02198326587677, + "learning_rate": 0.00011138485751369864, + "loss": 1.9338, + "step": 21543 + }, + { + "epoch": 2.513592346283981, + "grad_norm": 1.1027911901474, + "learning_rate": 0.0001113702998828798, + "loss": 2.0119, + "step": 21544 + }, + { + "epoch": 2.5137090187842723, + "grad_norm": 1.216683268547058, + "learning_rate": 0.00011135574266808126, + "loss": 1.947, + "step": 21545 + }, + { + "epoch": 2.5138256912845645, + "grad_norm": 1.1781286001205444, + "learning_rate": 0.00011134118586945404, + "loss": 1.8447, + "step": 21546 + }, + { + "epoch": 2.5139423637848557, + "grad_norm": 1.3774256706237793, + "learning_rate": 0.00011132662948714893, + "loss": 2.038, + "step": 21547 + }, + { + "epoch": 2.514059036285148, + "grad_norm": 1.246333122253418, + "learning_rate": 0.0001113120735213169, + "loss": 2.0227, + "step": 21548 + }, + { + "epoch": 2.514175708785439, + "grad_norm": 1.15507173538208, + "learning_rate": 0.00011129751797210882, + "loss": 1.9759, + "step": 21549 + }, + { + "epoch": 2.514292381285731, + "grad_norm": 1.0479789972305298, + "learning_rate": 0.00011128296283967565, + "loss": 1.8983, + "step": 21550 + }, + { + "epoch": 2.5144090537860224, + "grad_norm": 1.237913966178894, + "learning_rate": 0.00011126840812416812, + "loss": 2.1657, + "step": 21551 + }, + { + "epoch": 2.5145257262863145, + "grad_norm": 1.094404935836792, + "learning_rate": 0.00011125385382573733, + "loss": 1.9499, + "step": 21552 + }, + { + "epoch": 2.514642398786606, + "grad_norm": 1.3280912637710571, + "learning_rate": 0.000111239299944534, + "loss": 2.0808, + "step": 21553 + }, + { + "epoch": 2.514759071286898, + "grad_norm": 1.0829658508300781, + "learning_rate": 0.00011122474648070909, + "loss": 1.6614, + "step": 21554 + }, + { + "epoch": 2.514875743787189, + "grad_norm": 1.0978479385375977, + "learning_rate": 0.0001112101934344134, + "loss": 1.8466, + "step": 21555 + }, + { + "epoch": 2.5149924162874813, + "grad_norm": 1.1714414358139038, + "learning_rate": 0.00011119564080579792, + "loss": 1.9036, + "step": 21556 + }, + { + "epoch": 2.5151090887877725, + "grad_norm": 1.1892906427383423, + "learning_rate": 0.00011118108859501342, + "loss": 1.9815, + "step": 21557 + }, + { + "epoch": 2.5152257612880646, + "grad_norm": 1.1963447332382202, + "learning_rate": 0.00011116653680221074, + "loss": 2.1127, + "step": 21558 + }, + { + "epoch": 2.515342433788356, + "grad_norm": 1.0466618537902832, + "learning_rate": 0.00011115198542754088, + "loss": 1.882, + "step": 21559 + }, + { + "epoch": 2.515459106288648, + "grad_norm": 1.1253011226654053, + "learning_rate": 0.00011113743447115449, + "loss": 2.0768, + "step": 21560 + }, + { + "epoch": 2.5155757787889392, + "grad_norm": 1.1416047811508179, + "learning_rate": 0.0001111228839332026, + "loss": 1.9695, + "step": 21561 + }, + { + "epoch": 2.5156924512892314, + "grad_norm": 1.2235620021820068, + "learning_rate": 0.00011110833381383592, + "loss": 2.0837, + "step": 21562 + }, + { + "epoch": 2.5158091237895226, + "grad_norm": 1.2077099084854126, + "learning_rate": 0.00011109378411320542, + "loss": 1.9878, + "step": 21563 + }, + { + "epoch": 2.5159257962898147, + "grad_norm": 1.345336675643921, + "learning_rate": 0.00011107923483146178, + "loss": 2.1403, + "step": 21564 + }, + { + "epoch": 2.516042468790106, + "grad_norm": 1.1932461261749268, + "learning_rate": 0.000111064685968756, + "loss": 1.983, + "step": 21565 + }, + { + "epoch": 2.516159141290398, + "grad_norm": 1.1489039659500122, + "learning_rate": 0.00011105013752523879, + "loss": 2.0686, + "step": 21566 + }, + { + "epoch": 2.5162758137906893, + "grad_norm": 1.1243795156478882, + "learning_rate": 0.00011103558950106101, + "loss": 1.9295, + "step": 21567 + }, + { + "epoch": 2.5163924862909814, + "grad_norm": 1.200266718864441, + "learning_rate": 0.00011102104189637343, + "loss": 1.9959, + "step": 21568 + }, + { + "epoch": 2.5165091587912727, + "grad_norm": 1.207709789276123, + "learning_rate": 0.00011100649471132693, + "loss": 1.8129, + "step": 21569 + }, + { + "epoch": 2.516625831291565, + "grad_norm": 1.0584579706192017, + "learning_rate": 0.00011099194794607227, + "loss": 1.9811, + "step": 21570 + }, + { + "epoch": 2.516742503791856, + "grad_norm": 1.2016589641571045, + "learning_rate": 0.00011097740160076027, + "loss": 1.9976, + "step": 21571 + }, + { + "epoch": 2.516859176292148, + "grad_norm": 1.1147164106369019, + "learning_rate": 0.00011096285567554181, + "loss": 1.8377, + "step": 21572 + }, + { + "epoch": 2.5169758487924394, + "grad_norm": 1.1170254945755005, + "learning_rate": 0.00011094831017056756, + "loss": 1.8758, + "step": 21573 + }, + { + "epoch": 2.5170925212927315, + "grad_norm": 1.153448462486267, + "learning_rate": 0.00011093376508598837, + "loss": 1.8932, + "step": 21574 + }, + { + "epoch": 2.5172091937930228, + "grad_norm": 1.1104236841201782, + "learning_rate": 0.00011091922042195503, + "loss": 1.8028, + "step": 21575 + }, + { + "epoch": 2.517325866293315, + "grad_norm": 1.1745661497116089, + "learning_rate": 0.00011090467617861834, + "loss": 2.0229, + "step": 21576 + }, + { + "epoch": 2.517442538793606, + "grad_norm": 1.1758606433868408, + "learning_rate": 0.00011089013235612899, + "loss": 1.9528, + "step": 21577 + }, + { + "epoch": 2.5175592112938983, + "grad_norm": 1.1405390501022339, + "learning_rate": 0.00011087558895463789, + "loss": 2.0864, + "step": 21578 + }, + { + "epoch": 2.5176758837941895, + "grad_norm": 1.1530957221984863, + "learning_rate": 0.0001108610459742957, + "loss": 2.0333, + "step": 21579 + }, + { + "epoch": 2.5177925562944816, + "grad_norm": 0.9579619765281677, + "learning_rate": 0.00011084650341525325, + "loss": 1.7893, + "step": 21580 + }, + { + "epoch": 2.517909228794773, + "grad_norm": 1.4143038988113403, + "learning_rate": 0.00011083196127766121, + "loss": 2.0721, + "step": 21581 + }, + { + "epoch": 2.518025901295065, + "grad_norm": 1.3817741870880127, + "learning_rate": 0.0001108174195616705, + "loss": 2.2317, + "step": 21582 + }, + { + "epoch": 2.518142573795356, + "grad_norm": 1.0769039392471313, + "learning_rate": 0.00011080287826743172, + "loss": 2.0829, + "step": 21583 + }, + { + "epoch": 2.5182592462956483, + "grad_norm": 1.1380524635314941, + "learning_rate": 0.00011078833739509565, + "loss": 1.8743, + "step": 21584 + }, + { + "epoch": 2.5183759187959396, + "grad_norm": 1.144254207611084, + "learning_rate": 0.00011077379694481311, + "loss": 2.0076, + "step": 21585 + }, + { + "epoch": 2.5184925912962317, + "grad_norm": 1.1728078126907349, + "learning_rate": 0.00011075925691673473, + "loss": 2.0434, + "step": 21586 + }, + { + "epoch": 2.518609263796523, + "grad_norm": 1.1366695165634155, + "learning_rate": 0.00011074471731101137, + "loss": 1.9981, + "step": 21587 + }, + { + "epoch": 2.518725936296815, + "grad_norm": 1.1983304023742676, + "learning_rate": 0.00011073017812779362, + "loss": 2.1629, + "step": 21588 + }, + { + "epoch": 2.5188426087971063, + "grad_norm": 1.18110191822052, + "learning_rate": 0.00011071563936723235, + "loss": 1.971, + "step": 21589 + }, + { + "epoch": 2.5189592812973984, + "grad_norm": 1.0816094875335693, + "learning_rate": 0.00011070110102947817, + "loss": 1.91, + "step": 21590 + }, + { + "epoch": 2.5190759537976897, + "grad_norm": 1.1437621116638184, + "learning_rate": 0.00011068656311468183, + "loss": 2.0828, + "step": 21591 + }, + { + "epoch": 2.519192626297982, + "grad_norm": 1.000442385673523, + "learning_rate": 0.00011067202562299404, + "loss": 1.7475, + "step": 21592 + }, + { + "epoch": 2.519309298798273, + "grad_norm": 1.239443063735962, + "learning_rate": 0.0001106574885545656, + "loss": 1.95, + "step": 21593 + }, + { + "epoch": 2.519425971298565, + "grad_norm": 1.2183198928833008, + "learning_rate": 0.00011064295190954702, + "loss": 1.9872, + "step": 21594 + }, + { + "epoch": 2.5195426437988564, + "grad_norm": 1.0822843313217163, + "learning_rate": 0.0001106284156880892, + "loss": 2.0011, + "step": 21595 + }, + { + "epoch": 2.5196593162991485, + "grad_norm": 1.170507550239563, + "learning_rate": 0.00011061387989034267, + "loss": 1.9839, + "step": 21596 + }, + { + "epoch": 2.5197759887994398, + "grad_norm": 1.221948504447937, + "learning_rate": 0.00011059934451645821, + "loss": 1.9747, + "step": 21597 + }, + { + "epoch": 2.519892661299732, + "grad_norm": 1.1302558183670044, + "learning_rate": 0.00011058480956658655, + "loss": 1.7077, + "step": 21598 + }, + { + "epoch": 2.520009333800023, + "grad_norm": 1.1928836107254028, + "learning_rate": 0.00011057027504087829, + "loss": 2.0141, + "step": 21599 + }, + { + "epoch": 2.5201260063003152, + "grad_norm": 1.3090153932571411, + "learning_rate": 0.00011055574093948417, + "loss": 2.0126, + "step": 21600 + }, + { + "epoch": 2.5202426788006065, + "grad_norm": 1.3753273487091064, + "learning_rate": 0.00011054120726255479, + "loss": 1.9751, + "step": 21601 + }, + { + "epoch": 2.5203593513008986, + "grad_norm": 1.272690773010254, + "learning_rate": 0.00011052667401024088, + "loss": 1.9915, + "step": 21602 + }, + { + "epoch": 2.52047602380119, + "grad_norm": 1.051530122756958, + "learning_rate": 0.00011051214118269305, + "loss": 1.8372, + "step": 21603 + }, + { + "epoch": 2.520592696301482, + "grad_norm": 1.4167730808258057, + "learning_rate": 0.00011049760878006204, + "loss": 2.0777, + "step": 21604 + }, + { + "epoch": 2.520709368801773, + "grad_norm": 1.3370552062988281, + "learning_rate": 0.0001104830768024984, + "loss": 1.9475, + "step": 21605 + }, + { + "epoch": 2.5208260413020653, + "grad_norm": 1.1794720888137817, + "learning_rate": 0.00011046854525015292, + "loss": 1.9048, + "step": 21606 + }, + { + "epoch": 2.5209427138023566, + "grad_norm": 1.1973475217819214, + "learning_rate": 0.00011045401412317607, + "loss": 1.8754, + "step": 21607 + }, + { + "epoch": 2.5210593863026487, + "grad_norm": 1.0023748874664307, + "learning_rate": 0.00011043948342171866, + "loss": 1.9389, + "step": 21608 + }, + { + "epoch": 2.52117605880294, + "grad_norm": 1.205970048904419, + "learning_rate": 0.00011042495314593124, + "loss": 1.8969, + "step": 21609 + }, + { + "epoch": 2.521292731303232, + "grad_norm": 1.0704922676086426, + "learning_rate": 0.00011041042329596445, + "loss": 1.8713, + "step": 21610 + }, + { + "epoch": 2.5214094038035233, + "grad_norm": 1.1551296710968018, + "learning_rate": 0.00011039589387196899, + "loss": 1.9444, + "step": 21611 + }, + { + "epoch": 2.5215260763038154, + "grad_norm": 1.2373321056365967, + "learning_rate": 0.00011038136487409533, + "loss": 1.897, + "step": 21612 + }, + { + "epoch": 2.5216427488041067, + "grad_norm": 1.3230996131896973, + "learning_rate": 0.00011036683630249429, + "loss": 2.0713, + "step": 21613 + }, + { + "epoch": 2.521759421304399, + "grad_norm": 1.1502156257629395, + "learning_rate": 0.00011035230815731629, + "loss": 1.9629, + "step": 21614 + }, + { + "epoch": 2.52187609380469, + "grad_norm": 1.3105746507644653, + "learning_rate": 0.00011033778043871212, + "loss": 1.9129, + "step": 21615 + }, + { + "epoch": 2.521992766304982, + "grad_norm": 1.2879847288131714, + "learning_rate": 0.00011032325314683225, + "loss": 2.1686, + "step": 21616 + }, + { + "epoch": 2.5221094388052734, + "grad_norm": 1.1294574737548828, + "learning_rate": 0.00011030872628182736, + "loss": 1.988, + "step": 21617 + }, + { + "epoch": 2.5222261113055655, + "grad_norm": 1.1935667991638184, + "learning_rate": 0.00011029419984384801, + "loss": 1.9903, + "step": 21618 + }, + { + "epoch": 2.5223427838058567, + "grad_norm": 1.2819485664367676, + "learning_rate": 0.00011027967383304484, + "loss": 1.9428, + "step": 21619 + }, + { + "epoch": 2.522459456306149, + "grad_norm": 1.2405110597610474, + "learning_rate": 0.00011026514824956834, + "loss": 2.1028, + "step": 21620 + }, + { + "epoch": 2.52257612880644, + "grad_norm": 1.2091031074523926, + "learning_rate": 0.00011025062309356922, + "loss": 1.9914, + "step": 21621 + }, + { + "epoch": 2.5226928013067322, + "grad_norm": 1.0860676765441895, + "learning_rate": 0.00011023609836519796, + "loss": 1.9511, + "step": 21622 + }, + { + "epoch": 2.5228094738070235, + "grad_norm": 1.100504755973816, + "learning_rate": 0.00011022157406460517, + "loss": 1.7448, + "step": 21623 + }, + { + "epoch": 2.5229261463073156, + "grad_norm": 1.0304373502731323, + "learning_rate": 0.00011020705019194152, + "loss": 1.962, + "step": 21624 + }, + { + "epoch": 2.523042818807607, + "grad_norm": 1.1886398792266846, + "learning_rate": 0.00011019252674735742, + "loss": 1.9103, + "step": 21625 + }, + { + "epoch": 2.523159491307899, + "grad_norm": 1.1708660125732422, + "learning_rate": 0.00011017800373100356, + "loss": 2.0987, + "step": 21626 + }, + { + "epoch": 2.52327616380819, + "grad_norm": 1.0779800415039062, + "learning_rate": 0.00011016348114303037, + "loss": 2.0293, + "step": 21627 + }, + { + "epoch": 2.5233928363084823, + "grad_norm": 1.0633563995361328, + "learning_rate": 0.00011014895898358852, + "loss": 1.8738, + "step": 21628 + }, + { + "epoch": 2.5235095088087736, + "grad_norm": 1.241040587425232, + "learning_rate": 0.0001101344372528285, + "loss": 1.9778, + "step": 21629 + }, + { + "epoch": 2.5236261813090657, + "grad_norm": 1.1825969219207764, + "learning_rate": 0.0001101199159509009, + "loss": 2.0462, + "step": 21630 + }, + { + "epoch": 2.523742853809357, + "grad_norm": 1.1114693880081177, + "learning_rate": 0.00011010539507795615, + "loss": 1.9105, + "step": 21631 + }, + { + "epoch": 2.523859526309649, + "grad_norm": 1.0768622159957886, + "learning_rate": 0.00011009087463414495, + "loss": 2.0188, + "step": 21632 + }, + { + "epoch": 2.5239761988099403, + "grad_norm": 1.06615149974823, + "learning_rate": 0.00011007635461961768, + "loss": 1.7825, + "step": 21633 + }, + { + "epoch": 2.5240928713102324, + "grad_norm": 1.0405539274215698, + "learning_rate": 0.000110061835034525, + "loss": 1.9142, + "step": 21634 + }, + { + "epoch": 2.5242095438105236, + "grad_norm": 1.0470250844955444, + "learning_rate": 0.00011004731587901735, + "loss": 1.9463, + "step": 21635 + }, + { + "epoch": 2.5243262163108158, + "grad_norm": 0.9735709428787231, + "learning_rate": 0.00011003279715324522, + "loss": 1.9506, + "step": 21636 + }, + { + "epoch": 2.524442888811107, + "grad_norm": 1.1110376119613647, + "learning_rate": 0.00011001827885735926, + "loss": 2.0093, + "step": 21637 + }, + { + "epoch": 2.524559561311399, + "grad_norm": 1.0342471599578857, + "learning_rate": 0.0001100037609915098, + "loss": 1.9777, + "step": 21638 + }, + { + "epoch": 2.5246762338116904, + "grad_norm": 1.126419186592102, + "learning_rate": 0.00010998924355584752, + "loss": 1.9593, + "step": 21639 + }, + { + "epoch": 2.5247929063119825, + "grad_norm": 1.414394736289978, + "learning_rate": 0.00010997472655052276, + "loss": 1.9546, + "step": 21640 + }, + { + "epoch": 2.5249095788122737, + "grad_norm": 1.1055713891983032, + "learning_rate": 0.00010996020997568616, + "loss": 1.9805, + "step": 21641 + }, + { + "epoch": 2.525026251312566, + "grad_norm": 1.2489197254180908, + "learning_rate": 0.00010994569383148812, + "loss": 2.1237, + "step": 21642 + }, + { + "epoch": 2.525142923812857, + "grad_norm": 1.1000198125839233, + "learning_rate": 0.00010993117811807915, + "loss": 1.9851, + "step": 21643 + }, + { + "epoch": 2.5252595963131492, + "grad_norm": 1.2281712293624878, + "learning_rate": 0.00010991666283560973, + "loss": 2.0905, + "step": 21644 + }, + { + "epoch": 2.5253762688134405, + "grad_norm": 1.053905725479126, + "learning_rate": 0.0001099021479842304, + "loss": 1.9748, + "step": 21645 + }, + { + "epoch": 2.5254929413137326, + "grad_norm": 0.9892138242721558, + "learning_rate": 0.0001098876335640915, + "loss": 1.9985, + "step": 21646 + }, + { + "epoch": 2.525609613814024, + "grad_norm": 1.1784225702285767, + "learning_rate": 0.00010987311957534364, + "loss": 2.0981, + "step": 21647 + }, + { + "epoch": 2.525726286314316, + "grad_norm": 1.0290532112121582, + "learning_rate": 0.00010985860601813715, + "loss": 1.8536, + "step": 21648 + }, + { + "epoch": 2.525842958814607, + "grad_norm": 1.2304084300994873, + "learning_rate": 0.00010984409289262263, + "loss": 2.0489, + "step": 21649 + }, + { + "epoch": 2.5259596313148993, + "grad_norm": 1.119299292564392, + "learning_rate": 0.00010982958019895042, + "loss": 2.0326, + "step": 21650 + }, + { + "epoch": 2.5260763038151905, + "grad_norm": 1.3178377151489258, + "learning_rate": 0.00010981506793727103, + "loss": 2.1236, + "step": 21651 + }, + { + "epoch": 2.5261929763154827, + "grad_norm": 1.1872042417526245, + "learning_rate": 0.00010980055610773493, + "loss": 1.9602, + "step": 21652 + }, + { + "epoch": 2.526309648815774, + "grad_norm": 1.1598615646362305, + "learning_rate": 0.00010978604471049249, + "loss": 2.0141, + "step": 21653 + }, + { + "epoch": 2.526426321316066, + "grad_norm": 1.239237666130066, + "learning_rate": 0.00010977153374569424, + "loss": 1.9977, + "step": 21654 + }, + { + "epoch": 2.5265429938163573, + "grad_norm": 1.059852957725525, + "learning_rate": 0.00010975702321349048, + "loss": 1.9238, + "step": 21655 + }, + { + "epoch": 2.5266596663166494, + "grad_norm": 1.1602476835250854, + "learning_rate": 0.00010974251311403182, + "loss": 2.1245, + "step": 21656 + }, + { + "epoch": 2.5267763388169406, + "grad_norm": 1.149972677230835, + "learning_rate": 0.00010972800344746852, + "loss": 1.9882, + "step": 21657 + }, + { + "epoch": 2.5268930113172328, + "grad_norm": 1.0574678182601929, + "learning_rate": 0.00010971349421395109, + "loss": 1.9595, + "step": 21658 + }, + { + "epoch": 2.527009683817524, + "grad_norm": 1.0467844009399414, + "learning_rate": 0.00010969898541362989, + "loss": 2.0715, + "step": 21659 + }, + { + "epoch": 2.527126356317816, + "grad_norm": 1.170177698135376, + "learning_rate": 0.00010968447704665542, + "loss": 2.184, + "step": 21660 + }, + { + "epoch": 2.5272430288181074, + "grad_norm": 1.1818534135818481, + "learning_rate": 0.00010966996911317802, + "loss": 2.1157, + "step": 21661 + }, + { + "epoch": 2.5273597013183995, + "grad_norm": 1.0760875940322876, + "learning_rate": 0.0001096554616133481, + "loss": 1.9199, + "step": 21662 + }, + { + "epoch": 2.5274763738186907, + "grad_norm": 1.0814234018325806, + "learning_rate": 0.000109640954547316, + "loss": 1.9204, + "step": 21663 + }, + { + "epoch": 2.527593046318983, + "grad_norm": 1.0582334995269775, + "learning_rate": 0.00010962644791523219, + "loss": 1.8685, + "step": 21664 + }, + { + "epoch": 2.527709718819274, + "grad_norm": 1.2395058870315552, + "learning_rate": 0.0001096119417172471, + "loss": 2.047, + "step": 21665 + }, + { + "epoch": 2.527826391319566, + "grad_norm": 1.0806794166564941, + "learning_rate": 0.00010959743595351098, + "loss": 1.9166, + "step": 21666 + }, + { + "epoch": 2.5279430638198574, + "grad_norm": 1.237879991531372, + "learning_rate": 0.00010958293062417434, + "loss": 1.9431, + "step": 21667 + }, + { + "epoch": 2.5280597363201496, + "grad_norm": 1.078906774520874, + "learning_rate": 0.0001095684257293875, + "loss": 1.8985, + "step": 21668 + }, + { + "epoch": 2.528176408820441, + "grad_norm": 1.2424603700637817, + "learning_rate": 0.00010955392126930081, + "loss": 2.1573, + "step": 21669 + }, + { + "epoch": 2.528293081320733, + "grad_norm": 1.0192828178405762, + "learning_rate": 0.00010953941724406468, + "loss": 2.0219, + "step": 21670 + }, + { + "epoch": 2.528409753821024, + "grad_norm": 1.1673870086669922, + "learning_rate": 0.00010952491365382945, + "loss": 2.1042, + "step": 21671 + }, + { + "epoch": 2.5285264263213163, + "grad_norm": 1.1618448495864868, + "learning_rate": 0.00010951041049874543, + "loss": 1.7324, + "step": 21672 + }, + { + "epoch": 2.5286430988216075, + "grad_norm": 1.1820197105407715, + "learning_rate": 0.00010949590777896308, + "loss": 1.8973, + "step": 21673 + }, + { + "epoch": 2.5287597713218997, + "grad_norm": 1.1530051231384277, + "learning_rate": 0.00010948140549463261, + "loss": 2.0444, + "step": 21674 + }, + { + "epoch": 2.528876443822191, + "grad_norm": 1.2011743783950806, + "learning_rate": 0.00010946690364590454, + "loss": 1.9619, + "step": 21675 + }, + { + "epoch": 2.528993116322483, + "grad_norm": 1.0793957710266113, + "learning_rate": 0.00010945240223292902, + "loss": 1.893, + "step": 21676 + }, + { + "epoch": 2.5291097888227743, + "grad_norm": 1.165686011314392, + "learning_rate": 0.0001094379012558565, + "loss": 1.9539, + "step": 21677 + }, + { + "epoch": 2.5292264613230664, + "grad_norm": 1.19617760181427, + "learning_rate": 0.00010942340071483734, + "loss": 2.0663, + "step": 21678 + }, + { + "epoch": 2.5293431338233576, + "grad_norm": 1.1446303129196167, + "learning_rate": 0.00010940890061002173, + "loss": 1.9675, + "step": 21679 + }, + { + "epoch": 2.5294598063236498, + "grad_norm": 1.0575164556503296, + "learning_rate": 0.00010939440094156013, + "loss": 2.01, + "step": 21680 + }, + { + "epoch": 2.529576478823941, + "grad_norm": 1.1539971828460693, + "learning_rate": 0.00010937990170960275, + "loss": 1.932, + "step": 21681 + }, + { + "epoch": 2.529693151324233, + "grad_norm": 1.1152997016906738, + "learning_rate": 0.00010936540291429998, + "loss": 1.9108, + "step": 21682 + }, + { + "epoch": 2.5298098238245244, + "grad_norm": 1.1704562902450562, + "learning_rate": 0.00010935090455580208, + "loss": 2.1826, + "step": 21683 + }, + { + "epoch": 2.5299264963248165, + "grad_norm": 1.1123987436294556, + "learning_rate": 0.00010933640663425941, + "loss": 2.0893, + "step": 21684 + }, + { + "epoch": 2.5300431688251077, + "grad_norm": 0.9874357581138611, + "learning_rate": 0.0001093219091498222, + "loss": 1.9583, + "step": 21685 + }, + { + "epoch": 2.5301598413254, + "grad_norm": 1.0215816497802734, + "learning_rate": 0.00010930741210264076, + "loss": 1.812, + "step": 21686 + }, + { + "epoch": 2.530276513825691, + "grad_norm": 1.1557230949401855, + "learning_rate": 0.0001092929154928654, + "loss": 1.949, + "step": 21687 + }, + { + "epoch": 2.530393186325983, + "grad_norm": 1.119339108467102, + "learning_rate": 0.00010927841932064645, + "loss": 1.9814, + "step": 21688 + }, + { + "epoch": 2.5305098588262744, + "grad_norm": 1.0439205169677734, + "learning_rate": 0.00010926392358613405, + "loss": 1.9598, + "step": 21689 + }, + { + "epoch": 2.5306265313265666, + "grad_norm": 1.3726489543914795, + "learning_rate": 0.00010924942828947857, + "loss": 1.9672, + "step": 21690 + }, + { + "epoch": 2.530743203826858, + "grad_norm": 1.0147069692611694, + "learning_rate": 0.00010923493343083033, + "loss": 1.8842, + "step": 21691 + }, + { + "epoch": 2.53085987632715, + "grad_norm": 1.0076406002044678, + "learning_rate": 0.00010922043901033949, + "loss": 1.9286, + "step": 21692 + }, + { + "epoch": 2.530976548827441, + "grad_norm": 1.2589656114578247, + "learning_rate": 0.00010920594502815642, + "loss": 1.905, + "step": 21693 + }, + { + "epoch": 2.5310932213277333, + "grad_norm": 1.0018121004104614, + "learning_rate": 0.00010919145148443127, + "loss": 1.9107, + "step": 21694 + }, + { + "epoch": 2.5312098938280245, + "grad_norm": 0.9308429956436157, + "learning_rate": 0.00010917695837931441, + "loss": 1.9356, + "step": 21695 + }, + { + "epoch": 2.5313265663283167, + "grad_norm": 1.1832636594772339, + "learning_rate": 0.00010916246571295592, + "loss": 2.011, + "step": 21696 + }, + { + "epoch": 2.531443238828608, + "grad_norm": 1.2992115020751953, + "learning_rate": 0.00010914797348550623, + "loss": 2.1387, + "step": 21697 + }, + { + "epoch": 2.5315599113289, + "grad_norm": 1.1117969751358032, + "learning_rate": 0.00010913348169711545, + "loss": 1.9668, + "step": 21698 + }, + { + "epoch": 2.5316765838291913, + "grad_norm": 1.5837124586105347, + "learning_rate": 0.00010911899034793389, + "loss": 1.8884, + "step": 21699 + }, + { + "epoch": 2.5317932563294834, + "grad_norm": 1.1001553535461426, + "learning_rate": 0.00010910449943811171, + "loss": 1.9694, + "step": 21700 + }, + { + "epoch": 2.5319099288297746, + "grad_norm": 1.174268126487732, + "learning_rate": 0.00010909000896779921, + "loss": 1.9451, + "step": 21701 + }, + { + "epoch": 2.5320266013300667, + "grad_norm": 1.0477278232574463, + "learning_rate": 0.00010907551893714655, + "loss": 1.952, + "step": 21702 + }, + { + "epoch": 2.532143273830358, + "grad_norm": 1.202803611755371, + "learning_rate": 0.00010906102934630398, + "loss": 1.9591, + "step": 21703 + }, + { + "epoch": 2.53225994633065, + "grad_norm": 1.1036754846572876, + "learning_rate": 0.00010904654019542176, + "loss": 1.912, + "step": 21704 + }, + { + "epoch": 2.5323766188309413, + "grad_norm": 1.2015990018844604, + "learning_rate": 0.00010903205148464994, + "loss": 1.8262, + "step": 21705 + }, + { + "epoch": 2.5324932913312335, + "grad_norm": 1.1072176694869995, + "learning_rate": 0.00010901756321413892, + "loss": 1.946, + "step": 21706 + }, + { + "epoch": 2.5326099638315247, + "grad_norm": 1.1200166940689087, + "learning_rate": 0.00010900307538403875, + "loss": 1.9124, + "step": 21707 + }, + { + "epoch": 2.532726636331817, + "grad_norm": 1.2601202726364136, + "learning_rate": 0.00010898858799449972, + "loss": 1.923, + "step": 21708 + }, + { + "epoch": 2.532843308832108, + "grad_norm": 1.246023178100586, + "learning_rate": 0.00010897410104567192, + "loss": 2.0243, + "step": 21709 + }, + { + "epoch": 2.5329599813324, + "grad_norm": 1.1798765659332275, + "learning_rate": 0.00010895961453770565, + "loss": 2.1006, + "step": 21710 + }, + { + "epoch": 2.5330766538326914, + "grad_norm": 1.1589045524597168, + "learning_rate": 0.00010894512847075101, + "loss": 1.903, + "step": 21711 + }, + { + "epoch": 2.5331933263329836, + "grad_norm": 1.1053946018218994, + "learning_rate": 0.0001089306428449582, + "loss": 1.9791, + "step": 21712 + }, + { + "epoch": 2.533309998833275, + "grad_norm": 1.1832014322280884, + "learning_rate": 0.00010891615766047738, + "loss": 2.0606, + "step": 21713 + }, + { + "epoch": 2.533426671333567, + "grad_norm": 1.1902828216552734, + "learning_rate": 0.00010890167291745877, + "loss": 2.0046, + "step": 21714 + }, + { + "epoch": 2.533543343833858, + "grad_norm": 1.2895781993865967, + "learning_rate": 0.00010888718861605241, + "loss": 2.1036, + "step": 21715 + }, + { + "epoch": 2.5336600163341503, + "grad_norm": 1.1520981788635254, + "learning_rate": 0.00010887270475640852, + "loss": 1.7674, + "step": 21716 + }, + { + "epoch": 2.5337766888344415, + "grad_norm": 1.148572325706482, + "learning_rate": 0.00010885822133867735, + "loss": 2.0241, + "step": 21717 + }, + { + "epoch": 2.5338933613347336, + "grad_norm": 1.308530569076538, + "learning_rate": 0.00010884373836300889, + "loss": 2.0557, + "step": 21718 + }, + { + "epoch": 2.534010033835025, + "grad_norm": 1.0866807699203491, + "learning_rate": 0.00010882925582955345, + "loss": 1.9977, + "step": 21719 + }, + { + "epoch": 2.534126706335317, + "grad_norm": 1.071172833442688, + "learning_rate": 0.000108814773738461, + "loss": 2.0286, + "step": 21720 + }, + { + "epoch": 2.5342433788356082, + "grad_norm": 1.2805863618850708, + "learning_rate": 0.00010880029208988181, + "loss": 2.1353, + "step": 21721 + }, + { + "epoch": 2.5343600513359004, + "grad_norm": 1.4270472526550293, + "learning_rate": 0.00010878581088396591, + "loss": 1.925, + "step": 21722 + }, + { + "epoch": 2.5344767238361916, + "grad_norm": 1.2182296514511108, + "learning_rate": 0.00010877133012086349, + "loss": 1.9537, + "step": 21723 + }, + { + "epoch": 2.5345933963364837, + "grad_norm": 1.1579142808914185, + "learning_rate": 0.0001087568498007246, + "loss": 1.8354, + "step": 21724 + }, + { + "epoch": 2.534710068836775, + "grad_norm": 1.2049676179885864, + "learning_rate": 0.00010874236992369947, + "loss": 2.0057, + "step": 21725 + }, + { + "epoch": 2.534826741337067, + "grad_norm": 1.2752469778060913, + "learning_rate": 0.00010872789048993807, + "loss": 1.9486, + "step": 21726 + }, + { + "epoch": 2.5349434138373583, + "grad_norm": 1.1582163572311401, + "learning_rate": 0.00010871341149959064, + "loss": 1.9458, + "step": 21727 + }, + { + "epoch": 2.5350600863376505, + "grad_norm": 1.1467580795288086, + "learning_rate": 0.0001086989329528072, + "loss": 1.9896, + "step": 21728 + }, + { + "epoch": 2.5351767588379417, + "grad_norm": 1.172411561012268, + "learning_rate": 0.00010868445484973786, + "loss": 2.0896, + "step": 21729 + }, + { + "epoch": 2.535293431338234, + "grad_norm": 0.9734925627708435, + "learning_rate": 0.00010866997719053277, + "loss": 1.7608, + "step": 21730 + }, + { + "epoch": 2.535410103838525, + "grad_norm": 1.1568989753723145, + "learning_rate": 0.0001086554999753419, + "loss": 1.97, + "step": 21731 + }, + { + "epoch": 2.535526776338817, + "grad_norm": 1.2492038011550903, + "learning_rate": 0.00010864102320431545, + "loss": 1.9301, + "step": 21732 + }, + { + "epoch": 2.5356434488391084, + "grad_norm": 1.2476868629455566, + "learning_rate": 0.00010862654687760342, + "loss": 2.1389, + "step": 21733 + }, + { + "epoch": 2.5357601213394005, + "grad_norm": 1.115809440612793, + "learning_rate": 0.00010861207099535598, + "loss": 2.0755, + "step": 21734 + }, + { + "epoch": 2.535876793839692, + "grad_norm": 1.03436279296875, + "learning_rate": 0.00010859759555772308, + "loss": 1.7499, + "step": 21735 + }, + { + "epoch": 2.535993466339984, + "grad_norm": 1.2452877759933472, + "learning_rate": 0.0001085831205648549, + "loss": 2.1367, + "step": 21736 + }, + { + "epoch": 2.536110138840275, + "grad_norm": 1.5457853078842163, + "learning_rate": 0.00010856864601690141, + "loss": 2.0428, + "step": 21737 + }, + { + "epoch": 2.5362268113405673, + "grad_norm": 1.207884430885315, + "learning_rate": 0.0001085541719140127, + "loss": 2.0752, + "step": 21738 + }, + { + "epoch": 2.5363434838408585, + "grad_norm": 1.0763792991638184, + "learning_rate": 0.00010853969825633884, + "loss": 2.0692, + "step": 21739 + }, + { + "epoch": 2.5364601563411506, + "grad_norm": 1.165533185005188, + "learning_rate": 0.00010852522504402987, + "loss": 1.9825, + "step": 21740 + }, + { + "epoch": 2.536576828841442, + "grad_norm": 1.0342923402786255, + "learning_rate": 0.00010851075227723577, + "loss": 1.9373, + "step": 21741 + }, + { + "epoch": 2.536693501341734, + "grad_norm": 1.1520237922668457, + "learning_rate": 0.00010849627995610662, + "loss": 1.7845, + "step": 21742 + }, + { + "epoch": 2.5368101738420252, + "grad_norm": 1.0767372846603394, + "learning_rate": 0.00010848180808079254, + "loss": 1.9631, + "step": 21743 + }, + { + "epoch": 2.5369268463423174, + "grad_norm": 1.1964375972747803, + "learning_rate": 0.00010846733665144342, + "loss": 2.0285, + "step": 21744 + }, + { + "epoch": 2.5370435188426086, + "grad_norm": 1.215423822402954, + "learning_rate": 0.00010845286566820938, + "loss": 2.0294, + "step": 21745 + }, + { + "epoch": 2.5371601913429007, + "grad_norm": 1.2440133094787598, + "learning_rate": 0.0001084383951312404, + "loss": 2.0893, + "step": 21746 + }, + { + "epoch": 2.537276863843192, + "grad_norm": 1.091883659362793, + "learning_rate": 0.00010842392504068651, + "loss": 1.9284, + "step": 21747 + }, + { + "epoch": 2.537393536343484, + "grad_norm": 1.410422682762146, + "learning_rate": 0.00010840945539669766, + "loss": 2.1444, + "step": 21748 + }, + { + "epoch": 2.5375102088437753, + "grad_norm": 1.0815093517303467, + "learning_rate": 0.00010839498619942396, + "loss": 1.9362, + "step": 21749 + }, + { + "epoch": 2.5376268813440674, + "grad_norm": 1.0424391031265259, + "learning_rate": 0.00010838051744901528, + "loss": 2.0123, + "step": 21750 + }, + { + "epoch": 2.5377435538443587, + "grad_norm": 1.0466145277023315, + "learning_rate": 0.00010836604914562178, + "loss": 1.8499, + "step": 21751 + }, + { + "epoch": 2.537860226344651, + "grad_norm": 1.4289166927337646, + "learning_rate": 0.0001083515812893933, + "loss": 2.2546, + "step": 21752 + }, + { + "epoch": 2.537976898844942, + "grad_norm": 1.0257078409194946, + "learning_rate": 0.0001083371138804799, + "loss": 2.0178, + "step": 21753 + }, + { + "epoch": 2.538093571345234, + "grad_norm": 1.0876905918121338, + "learning_rate": 0.00010832264691903154, + "loss": 1.9272, + "step": 21754 + }, + { + "epoch": 2.5382102438455254, + "grad_norm": 1.1765217781066895, + "learning_rate": 0.00010830818040519823, + "loss": 1.849, + "step": 21755 + }, + { + "epoch": 2.5383269163458175, + "grad_norm": 1.1665507555007935, + "learning_rate": 0.00010829371433912995, + "loss": 2.0665, + "step": 21756 + }, + { + "epoch": 2.5384435888461088, + "grad_norm": 1.1228759288787842, + "learning_rate": 0.00010827924872097657, + "loss": 1.9955, + "step": 21757 + }, + { + "epoch": 2.538560261346401, + "grad_norm": 1.263601303100586, + "learning_rate": 0.00010826478355088817, + "loss": 2.1377, + "step": 21758 + }, + { + "epoch": 2.538676933846692, + "grad_norm": 1.2408857345581055, + "learning_rate": 0.00010825031882901461, + "loss": 1.9309, + "step": 21759 + }, + { + "epoch": 2.5387936063469843, + "grad_norm": 1.2642790079116821, + "learning_rate": 0.00010823585455550597, + "loss": 2.048, + "step": 21760 + }, + { + "epoch": 2.5389102788472755, + "grad_norm": 1.298946499824524, + "learning_rate": 0.00010822139073051202, + "loss": 1.998, + "step": 21761 + }, + { + "epoch": 2.5390269513475676, + "grad_norm": 1.2598601579666138, + "learning_rate": 0.00010820692735418293, + "loss": 1.9215, + "step": 21762 + }, + { + "epoch": 2.539143623847859, + "grad_norm": 1.1116118431091309, + "learning_rate": 0.00010819246442666847, + "loss": 2.243, + "step": 21763 + }, + { + "epoch": 2.539260296348151, + "grad_norm": 1.0442447662353516, + "learning_rate": 0.00010817800194811862, + "loss": 1.924, + "step": 21764 + }, + { + "epoch": 2.539376968848442, + "grad_norm": 1.0289937257766724, + "learning_rate": 0.00010816353991868333, + "loss": 1.8837, + "step": 21765 + }, + { + "epoch": 2.5394936413487343, + "grad_norm": 0.999177098274231, + "learning_rate": 0.00010814907833851253, + "loss": 1.9142, + "step": 21766 + }, + { + "epoch": 2.5396103138490256, + "grad_norm": 1.1330980062484741, + "learning_rate": 0.00010813461720775605, + "loss": 1.9903, + "step": 21767 + }, + { + "epoch": 2.5397269863493177, + "grad_norm": 1.1739493608474731, + "learning_rate": 0.00010812015652656389, + "loss": 1.9565, + "step": 21768 + }, + { + "epoch": 2.539843658849609, + "grad_norm": 1.2115490436553955, + "learning_rate": 0.00010810569629508601, + "loss": 1.8187, + "step": 21769 + }, + { + "epoch": 2.539960331349901, + "grad_norm": 1.1002534627914429, + "learning_rate": 0.00010809123651347225, + "loss": 1.9501, + "step": 21770 + }, + { + "epoch": 2.5400770038501923, + "grad_norm": 1.1694130897521973, + "learning_rate": 0.00010807677718187253, + "loss": 1.8833, + "step": 21771 + }, + { + "epoch": 2.5401936763504844, + "grad_norm": 1.102903962135315, + "learning_rate": 0.00010806231830043673, + "loss": 1.8486, + "step": 21772 + }, + { + "epoch": 2.5403103488507757, + "grad_norm": 1.1762986183166504, + "learning_rate": 0.00010804785986931478, + "loss": 1.9267, + "step": 21773 + }, + { + "epoch": 2.540427021351068, + "grad_norm": 1.2918530702590942, + "learning_rate": 0.0001080334018886565, + "loss": 1.9601, + "step": 21774 + }, + { + "epoch": 2.540543693851359, + "grad_norm": 1.136040449142456, + "learning_rate": 0.00010801894435861187, + "loss": 2.0345, + "step": 21775 + }, + { + "epoch": 2.540660366351651, + "grad_norm": 1.1859931945800781, + "learning_rate": 0.00010800448727933065, + "loss": 2.1887, + "step": 21776 + }, + { + "epoch": 2.5407770388519424, + "grad_norm": 1.1401121616363525, + "learning_rate": 0.00010799003065096286, + "loss": 2.0612, + "step": 21777 + }, + { + "epoch": 2.5408937113522345, + "grad_norm": 1.0703794956207275, + "learning_rate": 0.00010797557447365825, + "loss": 2.1004, + "step": 21778 + }, + { + "epoch": 2.5410103838525258, + "grad_norm": 1.1554069519042969, + "learning_rate": 0.00010796111874756676, + "loss": 2.0723, + "step": 21779 + }, + { + "epoch": 2.541127056352818, + "grad_norm": 1.2521129846572876, + "learning_rate": 0.0001079466634728382, + "loss": 2.005, + "step": 21780 + }, + { + "epoch": 2.541243728853109, + "grad_norm": 1.1854798793792725, + "learning_rate": 0.00010793220864962242, + "loss": 2.0012, + "step": 21781 + }, + { + "epoch": 2.5413604013534012, + "grad_norm": 1.1110209226608276, + "learning_rate": 0.0001079177542780694, + "loss": 1.8075, + "step": 21782 + }, + { + "epoch": 2.5414770738536925, + "grad_norm": 1.1917158365249634, + "learning_rate": 0.00010790330035832876, + "loss": 2.0796, + "step": 21783 + }, + { + "epoch": 2.5415937463539846, + "grad_norm": 1.229708194732666, + "learning_rate": 0.00010788884689055057, + "loss": 1.8582, + "step": 21784 + }, + { + "epoch": 2.541710418854276, + "grad_norm": 1.2138745784759521, + "learning_rate": 0.00010787439387488451, + "loss": 2.0067, + "step": 21785 + }, + { + "epoch": 2.541827091354568, + "grad_norm": 1.409611463546753, + "learning_rate": 0.00010785994131148049, + "loss": 2.1995, + "step": 21786 + }, + { + "epoch": 2.541943763854859, + "grad_norm": 1.1112439632415771, + "learning_rate": 0.00010784548920048827, + "loss": 1.8977, + "step": 21787 + }, + { + "epoch": 2.5420604363551513, + "grad_norm": 1.0153495073318481, + "learning_rate": 0.0001078310375420578, + "loss": 1.8928, + "step": 21788 + }, + { + "epoch": 2.5421771088554426, + "grad_norm": 1.0955713987350464, + "learning_rate": 0.0001078165863363388, + "loss": 2.1374, + "step": 21789 + }, + { + "epoch": 2.5422937813557347, + "grad_norm": 1.0276867151260376, + "learning_rate": 0.00010780213558348111, + "loss": 1.9182, + "step": 21790 + }, + { + "epoch": 2.542410453856026, + "grad_norm": 1.113146424293518, + "learning_rate": 0.00010778768528363447, + "loss": 1.774, + "step": 21791 + }, + { + "epoch": 2.542527126356318, + "grad_norm": 1.0504510402679443, + "learning_rate": 0.00010777323543694884, + "loss": 1.8807, + "step": 21792 + }, + { + "epoch": 2.5426437988566093, + "grad_norm": 1.0209648609161377, + "learning_rate": 0.00010775878604357383, + "loss": 1.9744, + "step": 21793 + }, + { + "epoch": 2.5427604713569014, + "grad_norm": 0.9565929174423218, + "learning_rate": 0.00010774433710365936, + "loss": 1.8863, + "step": 21794 + }, + { + "epoch": 2.5428771438571927, + "grad_norm": 1.0926949977874756, + "learning_rate": 0.00010772988861735525, + "loss": 1.9429, + "step": 21795 + }, + { + "epoch": 2.542993816357485, + "grad_norm": 1.2105876207351685, + "learning_rate": 0.00010771544058481117, + "loss": 2.0036, + "step": 21796 + }, + { + "epoch": 2.543110488857776, + "grad_norm": 1.2093195915222168, + "learning_rate": 0.000107700993006177, + "loss": 1.8848, + "step": 21797 + }, + { + "epoch": 2.543227161358068, + "grad_norm": 1.2983087301254272, + "learning_rate": 0.00010768654588160247, + "loss": 2.0392, + "step": 21798 + }, + { + "epoch": 2.5433438338583594, + "grad_norm": 1.038393259048462, + "learning_rate": 0.00010767209921123738, + "loss": 1.9798, + "step": 21799 + }, + { + "epoch": 2.5434605063586515, + "grad_norm": 1.2834917306900024, + "learning_rate": 0.0001076576529952314, + "loss": 2.017, + "step": 21800 + }, + { + "epoch": 2.5435771788589427, + "grad_norm": 1.1206010580062866, + "learning_rate": 0.00010764320723373446, + "loss": 2.093, + "step": 21801 + }, + { + "epoch": 2.543693851359235, + "grad_norm": 1.129928708076477, + "learning_rate": 0.00010762876192689616, + "loss": 1.9674, + "step": 21802 + }, + { + "epoch": 2.543810523859526, + "grad_norm": 1.0415711402893066, + "learning_rate": 0.00010761431707486637, + "loss": 1.9163, + "step": 21803 + }, + { + "epoch": 2.5439271963598182, + "grad_norm": 1.1202176809310913, + "learning_rate": 0.00010759987267779473, + "loss": 2.0282, + "step": 21804 + }, + { + "epoch": 2.5440438688601095, + "grad_norm": 1.331869125366211, + "learning_rate": 0.00010758542873583112, + "loss": 1.9583, + "step": 21805 + }, + { + "epoch": 2.5441605413604016, + "grad_norm": 1.188004493713379, + "learning_rate": 0.00010757098524912514, + "loss": 1.9798, + "step": 21806 + }, + { + "epoch": 2.544277213860693, + "grad_norm": 1.109030842781067, + "learning_rate": 0.00010755654221782656, + "loss": 1.8058, + "step": 21807 + }, + { + "epoch": 2.544393886360985, + "grad_norm": 1.0139775276184082, + "learning_rate": 0.0001075420996420852, + "loss": 1.988, + "step": 21808 + }, + { + "epoch": 2.544510558861276, + "grad_norm": 1.2748074531555176, + "learning_rate": 0.00010752765752205066, + "loss": 1.9567, + "step": 21809 + }, + { + "epoch": 2.5446272313615683, + "grad_norm": 1.1138404607772827, + "learning_rate": 0.00010751321585787278, + "loss": 1.8845, + "step": 21810 + }, + { + "epoch": 2.5447439038618596, + "grad_norm": 1.1592422723770142, + "learning_rate": 0.00010749877464970116, + "loss": 1.9999, + "step": 21811 + }, + { + "epoch": 2.5448605763621517, + "grad_norm": 1.2851661443710327, + "learning_rate": 0.0001074843338976856, + "loss": 2.0819, + "step": 21812 + }, + { + "epoch": 2.544977248862443, + "grad_norm": 1.267672061920166, + "learning_rate": 0.00010746989360197574, + "loss": 1.9491, + "step": 21813 + }, + { + "epoch": 2.545093921362735, + "grad_norm": 1.0884027481079102, + "learning_rate": 0.00010745545376272135, + "loss": 1.8601, + "step": 21814 + }, + { + "epoch": 2.5452105938630263, + "grad_norm": 1.013960838317871, + "learning_rate": 0.00010744101438007207, + "loss": 1.9355, + "step": 21815 + }, + { + "epoch": 2.5453272663633184, + "grad_norm": 0.9426640272140503, + "learning_rate": 0.00010742657545417763, + "loss": 1.8752, + "step": 21816 + }, + { + "epoch": 2.5454439388636096, + "grad_norm": 1.1722311973571777, + "learning_rate": 0.00010741213698518765, + "loss": 1.939, + "step": 21817 + }, + { + "epoch": 2.5455606113639018, + "grad_norm": 1.140066385269165, + "learning_rate": 0.00010739769897325189, + "loss": 2.0969, + "step": 21818 + }, + { + "epoch": 2.545677283864193, + "grad_norm": 1.2343250513076782, + "learning_rate": 0.00010738326141851994, + "loss": 2.0137, + "step": 21819 + }, + { + "epoch": 2.545793956364485, + "grad_norm": 1.4082902669906616, + "learning_rate": 0.00010736882432114154, + "loss": 1.884, + "step": 21820 + }, + { + "epoch": 2.5459106288647764, + "grad_norm": 1.0957118272781372, + "learning_rate": 0.00010735438768126639, + "loss": 1.8739, + "step": 21821 + }, + { + "epoch": 2.5460273013650685, + "grad_norm": 1.2556616067886353, + "learning_rate": 0.0001073399514990441, + "loss": 1.8211, + "step": 21822 + }, + { + "epoch": 2.5461439738653597, + "grad_norm": 1.1619927883148193, + "learning_rate": 0.00010732551577462434, + "loss": 2.1819, + "step": 21823 + }, + { + "epoch": 2.546260646365652, + "grad_norm": 1.2430005073547363, + "learning_rate": 0.00010731108050815673, + "loss": 2.0713, + "step": 21824 + }, + { + "epoch": 2.546377318865943, + "grad_norm": 1.1364129781723022, + "learning_rate": 0.00010729664569979101, + "loss": 1.9967, + "step": 21825 + }, + { + "epoch": 2.546493991366235, + "grad_norm": 1.256536602973938, + "learning_rate": 0.00010728221134967667, + "loss": 1.9449, + "step": 21826 + }, + { + "epoch": 2.5466106638665265, + "grad_norm": 1.0996887683868408, + "learning_rate": 0.00010726777745796353, + "loss": 1.9203, + "step": 21827 + }, + { + "epoch": 2.5467273363668186, + "grad_norm": 1.171169638633728, + "learning_rate": 0.00010725334402480106, + "loss": 2.1383, + "step": 21828 + }, + { + "epoch": 2.54684400886711, + "grad_norm": 1.1224256753921509, + "learning_rate": 0.00010723891105033904, + "loss": 2.0104, + "step": 21829 + }, + { + "epoch": 2.546960681367402, + "grad_norm": 1.096348524093628, + "learning_rate": 0.00010722447853472696, + "loss": 1.8966, + "step": 21830 + }, + { + "epoch": 2.547077353867693, + "grad_norm": 1.2916786670684814, + "learning_rate": 0.00010721004647811457, + "loss": 2.0009, + "step": 21831 + }, + { + "epoch": 2.5471940263679853, + "grad_norm": 1.1362565755844116, + "learning_rate": 0.00010719561488065137, + "loss": 2.0706, + "step": 21832 + }, + { + "epoch": 2.5473106988682765, + "grad_norm": 1.0625919103622437, + "learning_rate": 0.000107181183742487, + "loss": 1.9194, + "step": 21833 + }, + { + "epoch": 2.5474273713685687, + "grad_norm": 1.2556955814361572, + "learning_rate": 0.00010716675306377114, + "loss": 1.9929, + "step": 21834 + }, + { + "epoch": 2.54754404386886, + "grad_norm": 1.2125595808029175, + "learning_rate": 0.00010715232284465327, + "loss": 2.0111, + "step": 21835 + }, + { + "epoch": 2.547660716369152, + "grad_norm": 1.1057114601135254, + "learning_rate": 0.00010713789308528314, + "loss": 1.838, + "step": 21836 + }, + { + "epoch": 2.5477773888694433, + "grad_norm": 1.0926051139831543, + "learning_rate": 0.00010712346378581016, + "loss": 1.8656, + "step": 21837 + }, + { + "epoch": 2.5478940613697354, + "grad_norm": 1.1215851306915283, + "learning_rate": 0.00010710903494638408, + "loss": 1.9092, + "step": 21838 + }, + { + "epoch": 2.5480107338700266, + "grad_norm": 1.1089528799057007, + "learning_rate": 0.0001070946065671544, + "loss": 2.0652, + "step": 21839 + }, + { + "epoch": 2.5481274063703188, + "grad_norm": 1.2399309873580933, + "learning_rate": 0.00010708017864827071, + "loss": 1.994, + "step": 21840 + }, + { + "epoch": 2.54824407887061, + "grad_norm": 1.223984718322754, + "learning_rate": 0.00010706575118988257, + "loss": 1.9877, + "step": 21841 + }, + { + "epoch": 2.548360751370902, + "grad_norm": 1.0894815921783447, + "learning_rate": 0.0001070513241921396, + "loss": 2.0266, + "step": 21842 + }, + { + "epoch": 2.5484774238711934, + "grad_norm": 1.2494139671325684, + "learning_rate": 0.00010703689765519126, + "loss": 2.0851, + "step": 21843 + }, + { + "epoch": 2.5485940963714855, + "grad_norm": 1.1266072988510132, + "learning_rate": 0.00010702247157918725, + "loss": 1.9715, + "step": 21844 + }, + { + "epoch": 2.5487107688717767, + "grad_norm": 1.1496034860610962, + "learning_rate": 0.00010700804596427697, + "loss": 1.9111, + "step": 21845 + }, + { + "epoch": 2.548827441372069, + "grad_norm": 1.1733150482177734, + "learning_rate": 0.00010699362081061006, + "loss": 2.114, + "step": 21846 + }, + { + "epoch": 2.54894411387236, + "grad_norm": 1.1038628816604614, + "learning_rate": 0.0001069791961183361, + "loss": 1.9524, + "step": 21847 + }, + { + "epoch": 2.549060786372652, + "grad_norm": 1.1430845260620117, + "learning_rate": 0.00010696477188760455, + "loss": 2.02, + "step": 21848 + }, + { + "epoch": 2.5491774588729434, + "grad_norm": 1.116550087928772, + "learning_rate": 0.00010695034811856499, + "loss": 1.9671, + "step": 21849 + }, + { + "epoch": 2.5492941313732356, + "grad_norm": 1.374362587928772, + "learning_rate": 0.0001069359248113669, + "loss": 2.075, + "step": 21850 + }, + { + "epoch": 2.549410803873527, + "grad_norm": 1.0422539710998535, + "learning_rate": 0.00010692150196615988, + "loss": 2.1179, + "step": 21851 + }, + { + "epoch": 2.549527476373819, + "grad_norm": 1.283109426498413, + "learning_rate": 0.00010690707958309336, + "loss": 1.9755, + "step": 21852 + }, + { + "epoch": 2.54964414887411, + "grad_norm": 0.9905092120170593, + "learning_rate": 0.00010689265766231696, + "loss": 1.6957, + "step": 21853 + }, + { + "epoch": 2.5497608213744023, + "grad_norm": 1.16947603225708, + "learning_rate": 0.00010687823620398007, + "loss": 1.8704, + "step": 21854 + }, + { + "epoch": 2.5498774938746935, + "grad_norm": 1.2337228059768677, + "learning_rate": 0.00010686381520823232, + "loss": 2.0248, + "step": 21855 + }, + { + "epoch": 2.5499941663749857, + "grad_norm": 1.276914358139038, + "learning_rate": 0.00010684939467522309, + "loss": 2.0379, + "step": 21856 + }, + { + "epoch": 2.550110838875277, + "grad_norm": 1.065548300743103, + "learning_rate": 0.00010683497460510201, + "loss": 1.9274, + "step": 21857 + }, + { + "epoch": 2.550227511375569, + "grad_norm": 1.165634274482727, + "learning_rate": 0.00010682055499801846, + "loss": 1.9622, + "step": 21858 + }, + { + "epoch": 2.5503441838758603, + "grad_norm": 1.2759088277816772, + "learning_rate": 0.00010680613585412193, + "loss": 1.9842, + "step": 21859 + }, + { + "epoch": 2.5504608563761524, + "grad_norm": 1.318353295326233, + "learning_rate": 0.000106791717173562, + "loss": 1.9354, + "step": 21860 + }, + { + "epoch": 2.5505775288764436, + "grad_norm": 1.1785932779312134, + "learning_rate": 0.00010677729895648802, + "loss": 1.8679, + "step": 21861 + }, + { + "epoch": 2.5506942013767357, + "grad_norm": 1.0909337997436523, + "learning_rate": 0.0001067628812030496, + "loss": 2.0739, + "step": 21862 + }, + { + "epoch": 2.550810873877027, + "grad_norm": 1.1210947036743164, + "learning_rate": 0.00010674846391339606, + "loss": 2.0459, + "step": 21863 + }, + { + "epoch": 2.550927546377319, + "grad_norm": 1.2115843296051025, + "learning_rate": 0.00010673404708767699, + "loss": 1.9481, + "step": 21864 + }, + { + "epoch": 2.5510442188776103, + "grad_norm": 1.4544389247894287, + "learning_rate": 0.00010671963072604178, + "loss": 1.9096, + "step": 21865 + }, + { + "epoch": 2.5511608913779025, + "grad_norm": 0.9831720590591431, + "learning_rate": 0.00010670521482863989, + "loss": 2.035, + "step": 21866 + }, + { + "epoch": 2.5512775638781937, + "grad_norm": 1.1138883829116821, + "learning_rate": 0.00010669079939562077, + "loss": 1.9752, + "step": 21867 + }, + { + "epoch": 2.551394236378486, + "grad_norm": 1.1787869930267334, + "learning_rate": 0.0001066763844271339, + "loss": 1.8583, + "step": 21868 + }, + { + "epoch": 2.551510908878777, + "grad_norm": 1.3080081939697266, + "learning_rate": 0.00010666196992332861, + "loss": 2.031, + "step": 21869 + }, + { + "epoch": 2.551627581379069, + "grad_norm": 1.261263370513916, + "learning_rate": 0.0001066475558843545, + "loss": 1.975, + "step": 21870 + }, + { + "epoch": 2.5517442538793604, + "grad_norm": 1.045255184173584, + "learning_rate": 0.00010663314231036084, + "loss": 2.0709, + "step": 21871 + }, + { + "epoch": 2.5518609263796526, + "grad_norm": 1.2377911806106567, + "learning_rate": 0.00010661872920149709, + "loss": 1.9836, + "step": 21872 + }, + { + "epoch": 2.551977598879944, + "grad_norm": 1.0453840494155884, + "learning_rate": 0.0001066043165579128, + "loss": 1.9139, + "step": 21873 + }, + { + "epoch": 2.552094271380236, + "grad_norm": 1.115315556526184, + "learning_rate": 0.00010658990437975723, + "loss": 1.7594, + "step": 21874 + }, + { + "epoch": 2.552210943880527, + "grad_norm": 1.173876404762268, + "learning_rate": 0.0001065754926671799, + "loss": 1.9772, + "step": 21875 + }, + { + "epoch": 2.5523276163808193, + "grad_norm": 1.0985403060913086, + "learning_rate": 0.0001065610814203301, + "loss": 1.7572, + "step": 21876 + }, + { + "epoch": 2.5524442888811105, + "grad_norm": 1.0325353145599365, + "learning_rate": 0.00010654667063935734, + "loss": 1.9566, + "step": 21877 + }, + { + "epoch": 2.5525609613814026, + "grad_norm": 1.2540619373321533, + "learning_rate": 0.00010653226032441089, + "loss": 2.0887, + "step": 21878 + }, + { + "epoch": 2.552677633881694, + "grad_norm": 1.1187297105789185, + "learning_rate": 0.00010651785047564031, + "loss": 1.8706, + "step": 21879 + }, + { + "epoch": 2.552794306381986, + "grad_norm": 1.0437639951705933, + "learning_rate": 0.00010650344109319479, + "loss": 1.9418, + "step": 21880 + }, + { + "epoch": 2.5529109788822772, + "grad_norm": 1.2798445224761963, + "learning_rate": 0.0001064890321772239, + "loss": 1.924, + "step": 21881 + }, + { + "epoch": 2.5530276513825694, + "grad_norm": 1.1350183486938477, + "learning_rate": 0.00010647462372787684, + "loss": 1.967, + "step": 21882 + }, + { + "epoch": 2.5531443238828606, + "grad_norm": 1.0905150175094604, + "learning_rate": 0.00010646021574530314, + "loss": 1.9473, + "step": 21883 + }, + { + "epoch": 2.5532609963831527, + "grad_norm": 1.3332056999206543, + "learning_rate": 0.00010644580822965208, + "loss": 1.8934, + "step": 21884 + }, + { + "epoch": 2.553377668883444, + "grad_norm": 1.1329543590545654, + "learning_rate": 0.00010643140118107299, + "loss": 2.027, + "step": 21885 + }, + { + "epoch": 2.553494341383736, + "grad_norm": 1.083871841430664, + "learning_rate": 0.00010641699459971534, + "loss": 1.9332, + "step": 21886 + }, + { + "epoch": 2.5536110138840273, + "grad_norm": 1.2902151346206665, + "learning_rate": 0.00010640258848572833, + "loss": 2.1211, + "step": 21887 + }, + { + "epoch": 2.5537276863843195, + "grad_norm": 1.1143494844436646, + "learning_rate": 0.00010638818283926147, + "loss": 1.9876, + "step": 21888 + }, + { + "epoch": 2.5538443588846107, + "grad_norm": 1.2099319696426392, + "learning_rate": 0.00010637377766046396, + "loss": 2.0624, + "step": 21889 + }, + { + "epoch": 2.553961031384903, + "grad_norm": 1.2794737815856934, + "learning_rate": 0.00010635937294948523, + "loss": 2.0254, + "step": 21890 + }, + { + "epoch": 2.554077703885194, + "grad_norm": 1.150239109992981, + "learning_rate": 0.00010634496870647461, + "loss": 1.9536, + "step": 21891 + }, + { + "epoch": 2.554194376385486, + "grad_norm": 1.3057223558425903, + "learning_rate": 0.00010633056493158135, + "loss": 1.954, + "step": 21892 + }, + { + "epoch": 2.5543110488857774, + "grad_norm": 1.060693383216858, + "learning_rate": 0.00010631616162495481, + "loss": 2.0572, + "step": 21893 + }, + { + "epoch": 2.5544277213860695, + "grad_norm": 1.1815801858901978, + "learning_rate": 0.00010630175878674439, + "loss": 1.8562, + "step": 21894 + }, + { + "epoch": 2.554544393886361, + "grad_norm": 1.3347007036209106, + "learning_rate": 0.00010628735641709925, + "loss": 2.1217, + "step": 21895 + }, + { + "epoch": 2.554661066386653, + "grad_norm": 1.1520434617996216, + "learning_rate": 0.00010627295451616885, + "loss": 1.944, + "step": 21896 + }, + { + "epoch": 2.554777738886944, + "grad_norm": 1.2912921905517578, + "learning_rate": 0.00010625855308410233, + "loss": 2.1138, + "step": 21897 + }, + { + "epoch": 2.5548944113872363, + "grad_norm": 1.0881773233413696, + "learning_rate": 0.0001062441521210491, + "loss": 1.7167, + "step": 21898 + }, + { + "epoch": 2.5550110838875275, + "grad_norm": 1.1006721258163452, + "learning_rate": 0.00010622975162715849, + "loss": 1.8647, + "step": 21899 + }, + { + "epoch": 2.5551277563878196, + "grad_norm": 1.0812774896621704, + "learning_rate": 0.0001062153516025797, + "loss": 2.2102, + "step": 21900 + }, + { + "epoch": 2.555244428888111, + "grad_norm": 1.127621054649353, + "learning_rate": 0.00010620095204746209, + "loss": 1.8766, + "step": 21901 + }, + { + "epoch": 2.555361101388403, + "grad_norm": 1.1582984924316406, + "learning_rate": 0.0001061865529619548, + "loss": 1.9658, + "step": 21902 + }, + { + "epoch": 2.5554777738886942, + "grad_norm": 1.2080104351043701, + "learning_rate": 0.00010617215434620728, + "loss": 2.1331, + "step": 21903 + }, + { + "epoch": 2.5555944463889864, + "grad_norm": 1.1160109043121338, + "learning_rate": 0.00010615775620036865, + "loss": 1.925, + "step": 21904 + }, + { + "epoch": 2.5557111188892776, + "grad_norm": 1.1747990846633911, + "learning_rate": 0.00010614335852458829, + "loss": 2.0111, + "step": 21905 + }, + { + "epoch": 2.5558277913895697, + "grad_norm": 1.2041230201721191, + "learning_rate": 0.00010612896131901537, + "loss": 2.1556, + "step": 21906 + }, + { + "epoch": 2.555944463889861, + "grad_norm": 1.1815235614776611, + "learning_rate": 0.00010611456458379922, + "loss": 1.9245, + "step": 21907 + }, + { + "epoch": 2.556061136390153, + "grad_norm": 1.101986289024353, + "learning_rate": 0.00010610016831908902, + "loss": 2.0138, + "step": 21908 + }, + { + "epoch": 2.5561778088904443, + "grad_norm": 1.0636906623840332, + "learning_rate": 0.00010608577252503408, + "loss": 1.9142, + "step": 21909 + }, + { + "epoch": 2.5562944813907365, + "grad_norm": 1.0274039506912231, + "learning_rate": 0.00010607137720178358, + "loss": 1.9349, + "step": 21910 + }, + { + "epoch": 2.5564111538910277, + "grad_norm": 1.2792420387268066, + "learning_rate": 0.00010605698234948676, + "loss": 1.9643, + "step": 21911 + }, + { + "epoch": 2.55652782639132, + "grad_norm": 1.0127699375152588, + "learning_rate": 0.00010604258796829293, + "loss": 1.8262, + "step": 21912 + }, + { + "epoch": 2.556644498891611, + "grad_norm": 1.1480382680892944, + "learning_rate": 0.0001060281940583512, + "loss": 1.9904, + "step": 21913 + }, + { + "epoch": 2.556761171391903, + "grad_norm": 1.0866526365280151, + "learning_rate": 0.00010601380061981089, + "loss": 1.9745, + "step": 21914 + }, + { + "epoch": 2.5568778438921944, + "grad_norm": 1.08357572555542, + "learning_rate": 0.00010599940765282114, + "loss": 2.1145, + "step": 21915 + }, + { + "epoch": 2.5569945163924865, + "grad_norm": 1.144492745399475, + "learning_rate": 0.00010598501515753122, + "loss": 1.9337, + "step": 21916 + }, + { + "epoch": 2.5571111888927778, + "grad_norm": 0.9340798854827881, + "learning_rate": 0.00010597062313409031, + "loss": 1.9431, + "step": 21917 + }, + { + "epoch": 2.55722786139307, + "grad_norm": 1.1162199974060059, + "learning_rate": 0.00010595623158264758, + "loss": 1.9007, + "step": 21918 + }, + { + "epoch": 2.557344533893361, + "grad_norm": 1.1730241775512695, + "learning_rate": 0.00010594184050335227, + "loss": 1.8285, + "step": 21919 + }, + { + "epoch": 2.5574612063936533, + "grad_norm": 1.1499042510986328, + "learning_rate": 0.00010592744989635356, + "loss": 2.0259, + "step": 21920 + }, + { + "epoch": 2.5575778788939445, + "grad_norm": 1.3605480194091797, + "learning_rate": 0.0001059130597618006, + "loss": 2.1361, + "step": 21921 + }, + { + "epoch": 2.5576945513942366, + "grad_norm": 1.1551620960235596, + "learning_rate": 0.00010589867009984265, + "loss": 2.0106, + "step": 21922 + }, + { + "epoch": 2.557811223894528, + "grad_norm": 1.101485252380371, + "learning_rate": 0.00010588428091062879, + "loss": 1.938, + "step": 21923 + }, + { + "epoch": 2.55792789639482, + "grad_norm": 1.1512806415557861, + "learning_rate": 0.00010586989219430825, + "loss": 1.9938, + "step": 21924 + }, + { + "epoch": 2.5580445688951112, + "grad_norm": 1.1751859188079834, + "learning_rate": 0.00010585550395103018, + "loss": 2.161, + "step": 21925 + }, + { + "epoch": 2.5581612413954034, + "grad_norm": 1.1886621713638306, + "learning_rate": 0.00010584111618094376, + "loss": 2.1327, + "step": 21926 + }, + { + "epoch": 2.5582779138956946, + "grad_norm": 1.3036401271820068, + "learning_rate": 0.00010582672888419818, + "loss": 1.87, + "step": 21927 + }, + { + "epoch": 2.5583945863959867, + "grad_norm": 1.005839228630066, + "learning_rate": 0.00010581234206094247, + "loss": 1.7425, + "step": 21928 + }, + { + "epoch": 2.558511258896278, + "grad_norm": 1.3058580160140991, + "learning_rate": 0.0001057979557113259, + "loss": 2.1389, + "step": 21929 + }, + { + "epoch": 2.55862793139657, + "grad_norm": 1.1214299201965332, + "learning_rate": 0.0001057835698354975, + "loss": 2.1349, + "step": 21930 + }, + { + "epoch": 2.5587446038968613, + "grad_norm": 1.2310642004013062, + "learning_rate": 0.00010576918443360657, + "loss": 1.8497, + "step": 21931 + }, + { + "epoch": 2.5588612763971534, + "grad_norm": 1.235647201538086, + "learning_rate": 0.00010575479950580203, + "loss": 2.0296, + "step": 21932 + }, + { + "epoch": 2.5589779488974447, + "grad_norm": 1.1119180917739868, + "learning_rate": 0.00010574041505223322, + "loss": 1.9317, + "step": 21933 + }, + { + "epoch": 2.559094621397737, + "grad_norm": 1.240168571472168, + "learning_rate": 0.0001057260310730491, + "loss": 1.8692, + "step": 21934 + }, + { + "epoch": 2.559211293898028, + "grad_norm": 1.1061815023422241, + "learning_rate": 0.00010571164756839887, + "loss": 2.0036, + "step": 21935 + }, + { + "epoch": 2.55932796639832, + "grad_norm": 1.006564736366272, + "learning_rate": 0.00010569726453843161, + "loss": 2.1175, + "step": 21936 + }, + { + "epoch": 2.5594446388986114, + "grad_norm": 1.1331419944763184, + "learning_rate": 0.00010568288198329642, + "loss": 1.9944, + "step": 21937 + }, + { + "epoch": 2.5595613113989035, + "grad_norm": 1.2107475996017456, + "learning_rate": 0.00010566849990314245, + "loss": 1.9521, + "step": 21938 + }, + { + "epoch": 2.5596779838991948, + "grad_norm": 1.1863867044448853, + "learning_rate": 0.00010565411829811874, + "loss": 1.9601, + "step": 21939 + }, + { + "epoch": 2.559794656399487, + "grad_norm": 0.9893634915351868, + "learning_rate": 0.00010563973716837446, + "loss": 1.9007, + "step": 21940 + }, + { + "epoch": 2.559911328899778, + "grad_norm": 1.1202739477157593, + "learning_rate": 0.0001056253565140586, + "loss": 1.8913, + "step": 21941 + }, + { + "epoch": 2.5600280014000703, + "grad_norm": 0.9688585996627808, + "learning_rate": 0.0001056109763353203, + "loss": 1.8414, + "step": 21942 + }, + { + "epoch": 2.5601446739003615, + "grad_norm": 1.0749882459640503, + "learning_rate": 0.00010559659663230863, + "loss": 1.7223, + "step": 21943 + }, + { + "epoch": 2.5602613464006536, + "grad_norm": 0.9805943965911865, + "learning_rate": 0.00010558221740517271, + "loss": 2.0613, + "step": 21944 + }, + { + "epoch": 2.560378018900945, + "grad_norm": 1.1908378601074219, + "learning_rate": 0.0001055678386540615, + "loss": 2.0912, + "step": 21945 + }, + { + "epoch": 2.560494691401237, + "grad_norm": 0.9734665155410767, + "learning_rate": 0.00010555346037912414, + "loss": 1.847, + "step": 21946 + }, + { + "epoch": 2.560611363901528, + "grad_norm": 1.0183664560317993, + "learning_rate": 0.00010553908258050964, + "loss": 1.7622, + "step": 21947 + }, + { + "epoch": 2.5607280364018203, + "grad_norm": 1.1270005702972412, + "learning_rate": 0.00010552470525836713, + "loss": 1.8794, + "step": 21948 + }, + { + "epoch": 2.5608447089021116, + "grad_norm": 1.0313338041305542, + "learning_rate": 0.00010551032841284557, + "loss": 1.9077, + "step": 21949 + }, + { + "epoch": 2.5609613814024037, + "grad_norm": 1.195623517036438, + "learning_rate": 0.00010549595204409405, + "loss": 1.9395, + "step": 21950 + }, + { + "epoch": 2.561078053902695, + "grad_norm": 1.2129650115966797, + "learning_rate": 0.00010548157615226164, + "loss": 1.8763, + "step": 21951 + }, + { + "epoch": 2.561194726402987, + "grad_norm": 1.1683310270309448, + "learning_rate": 0.0001054672007374973, + "loss": 1.7333, + "step": 21952 + }, + { + "epoch": 2.5613113989032783, + "grad_norm": 1.1972012519836426, + "learning_rate": 0.00010545282579995016, + "loss": 2.0695, + "step": 21953 + }, + { + "epoch": 2.5614280714035704, + "grad_norm": 1.1828747987747192, + "learning_rate": 0.00010543845133976908, + "loss": 1.8721, + "step": 21954 + }, + { + "epoch": 2.5615447439038617, + "grad_norm": 1.0509394407272339, + "learning_rate": 0.00010542407735710325, + "loss": 1.96, + "step": 21955 + }, + { + "epoch": 2.561661416404154, + "grad_norm": 1.0937219858169556, + "learning_rate": 0.00010540970385210157, + "loss": 2.1, + "step": 21956 + }, + { + "epoch": 2.561778088904445, + "grad_norm": 1.2630434036254883, + "learning_rate": 0.00010539533082491312, + "loss": 2.0214, + "step": 21957 + }, + { + "epoch": 2.561894761404737, + "grad_norm": 1.29259192943573, + "learning_rate": 0.00010538095827568683, + "loss": 1.999, + "step": 21958 + }, + { + "epoch": 2.5620114339050284, + "grad_norm": 1.211540699005127, + "learning_rate": 0.0001053665862045718, + "loss": 2.0633, + "step": 21959 + }, + { + "epoch": 2.5621281064053205, + "grad_norm": 1.1771942377090454, + "learning_rate": 0.00010535221461171691, + "loss": 1.9309, + "step": 21960 + }, + { + "epoch": 2.5622447789056118, + "grad_norm": 1.2132362127304077, + "learning_rate": 0.00010533784349727123, + "loss": 1.994, + "step": 21961 + }, + { + "epoch": 2.562361451405904, + "grad_norm": 1.1654802560806274, + "learning_rate": 0.0001053234728613837, + "loss": 1.8779, + "step": 21962 + }, + { + "epoch": 2.562478123906195, + "grad_norm": 1.3313101530075073, + "learning_rate": 0.0001053091027042033, + "loss": 1.9757, + "step": 21963 + }, + { + "epoch": 2.5625947964064872, + "grad_norm": 1.0718042850494385, + "learning_rate": 0.00010529473302587908, + "loss": 1.8747, + "step": 21964 + }, + { + "epoch": 2.5627114689067785, + "grad_norm": 1.1106194257736206, + "learning_rate": 0.0001052803638265599, + "loss": 1.858, + "step": 21965 + }, + { + "epoch": 2.5628281414070706, + "grad_norm": 1.1398074626922607, + "learning_rate": 0.0001052659951063948, + "loss": 1.9864, + "step": 21966 + }, + { + "epoch": 2.562944813907362, + "grad_norm": 1.0845545530319214, + "learning_rate": 0.00010525162686553268, + "loss": 1.8178, + "step": 21967 + }, + { + "epoch": 2.563061486407654, + "grad_norm": 1.0028510093688965, + "learning_rate": 0.00010523725910412257, + "loss": 1.8052, + "step": 21968 + }, + { + "epoch": 2.563178158907945, + "grad_norm": 1.190106749534607, + "learning_rate": 0.00010522289182231335, + "loss": 2.1293, + "step": 21969 + }, + { + "epoch": 2.5632948314082373, + "grad_norm": 1.0492477416992188, + "learning_rate": 0.00010520852502025404, + "loss": 1.7978, + "step": 21970 + }, + { + "epoch": 2.5634115039085286, + "grad_norm": 1.0056108236312866, + "learning_rate": 0.00010519415869809344, + "loss": 1.8549, + "step": 21971 + }, + { + "epoch": 2.5635281764088207, + "grad_norm": 1.2870169878005981, + "learning_rate": 0.00010517979285598065, + "loss": 2.0842, + "step": 21972 + }, + { + "epoch": 2.563644848909112, + "grad_norm": 1.2562170028686523, + "learning_rate": 0.00010516542749406445, + "loss": 2.2161, + "step": 21973 + }, + { + "epoch": 2.563761521409404, + "grad_norm": 1.1197279691696167, + "learning_rate": 0.0001051510626124939, + "loss": 2.0048, + "step": 21974 + }, + { + "epoch": 2.5638781939096953, + "grad_norm": 1.021051287651062, + "learning_rate": 0.00010513669821141779, + "loss": 2.0329, + "step": 21975 + }, + { + "epoch": 2.5639948664099874, + "grad_norm": 1.1658772230148315, + "learning_rate": 0.00010512233429098518, + "loss": 2.06, + "step": 21976 + }, + { + "epoch": 2.5641115389102787, + "grad_norm": 1.1813892126083374, + "learning_rate": 0.0001051079708513448, + "loss": 1.9689, + "step": 21977 + }, + { + "epoch": 2.564228211410571, + "grad_norm": 1.1740120649337769, + "learning_rate": 0.0001050936078926457, + "loss": 1.9181, + "step": 21978 + }, + { + "epoch": 2.564344883910862, + "grad_norm": 1.0958837270736694, + "learning_rate": 0.00010507924541503678, + "loss": 1.9998, + "step": 21979 + }, + { + "epoch": 2.564461556411154, + "grad_norm": 1.1097757816314697, + "learning_rate": 0.0001050648834186668, + "loss": 1.9208, + "step": 21980 + }, + { + "epoch": 2.5645782289114454, + "grad_norm": 1.0365341901779175, + "learning_rate": 0.00010505052190368481, + "loss": 1.8099, + "step": 21981 + }, + { + "epoch": 2.5646949014117375, + "grad_norm": 1.231230616569519, + "learning_rate": 0.00010503616087023956, + "loss": 1.8979, + "step": 21982 + }, + { + "epoch": 2.5648115739120287, + "grad_norm": 1.0998477935791016, + "learning_rate": 0.00010502180031848006, + "loss": 1.9512, + "step": 21983 + }, + { + "epoch": 2.564928246412321, + "grad_norm": 1.2702938318252563, + "learning_rate": 0.00010500744024855502, + "loss": 1.8539, + "step": 21984 + }, + { + "epoch": 2.565044918912612, + "grad_norm": 1.0454336404800415, + "learning_rate": 0.0001049930806606135, + "loss": 1.8436, + "step": 21985 + }, + { + "epoch": 2.5651615914129042, + "grad_norm": 0.9910571575164795, + "learning_rate": 0.00010497872155480421, + "loss": 1.9915, + "step": 21986 + }, + { + "epoch": 2.5652782639131955, + "grad_norm": 1.0762122869491577, + "learning_rate": 0.00010496436293127608, + "loss": 1.8853, + "step": 21987 + }, + { + "epoch": 2.5653949364134876, + "grad_norm": 1.018215537071228, + "learning_rate": 0.00010495000479017794, + "loss": 1.9153, + "step": 21988 + }, + { + "epoch": 2.565511608913779, + "grad_norm": 1.1750022172927856, + "learning_rate": 0.00010493564713165868, + "loss": 1.9944, + "step": 21989 + }, + { + "epoch": 2.565628281414071, + "grad_norm": 1.3286455869674683, + "learning_rate": 0.00010492128995586706, + "loss": 1.8765, + "step": 21990 + }, + { + "epoch": 2.565744953914362, + "grad_norm": 1.0335276126861572, + "learning_rate": 0.00010490693326295199, + "loss": 1.8691, + "step": 21991 + }, + { + "epoch": 2.5658616264146543, + "grad_norm": 0.9746255874633789, + "learning_rate": 0.00010489257705306233, + "loss": 1.8114, + "step": 21992 + }, + { + "epoch": 2.5659782989149456, + "grad_norm": 1.1083732843399048, + "learning_rate": 0.0001048782213263468, + "loss": 1.8301, + "step": 21993 + }, + { + "epoch": 2.5660949714152377, + "grad_norm": 1.0887290239334106, + "learning_rate": 0.00010486386608295438, + "loss": 1.9207, + "step": 21994 + }, + { + "epoch": 2.566211643915529, + "grad_norm": 1.3240686655044556, + "learning_rate": 0.00010484951132303374, + "loss": 2.1432, + "step": 21995 + }, + { + "epoch": 2.566328316415821, + "grad_norm": 1.082449197769165, + "learning_rate": 0.00010483515704673381, + "loss": 2.0379, + "step": 21996 + }, + { + "epoch": 2.5664449889161123, + "grad_norm": 1.2798675298690796, + "learning_rate": 0.00010482080325420325, + "loss": 2.0195, + "step": 21997 + }, + { + "epoch": 2.5665616614164044, + "grad_norm": 1.0712429285049438, + "learning_rate": 0.00010480644994559107, + "loss": 1.9143, + "step": 21998 + }, + { + "epoch": 2.5666783339166956, + "grad_norm": 1.2085503339767456, + "learning_rate": 0.00010479209712104585, + "loss": 1.9593, + "step": 21999 + }, + { + "epoch": 2.5667950064169878, + "grad_norm": 1.2083399295806885, + "learning_rate": 0.00010477774478071657, + "loss": 1.8842, + "step": 22000 + }, + { + "epoch": 2.566911678917279, + "grad_norm": 1.0405018329620361, + "learning_rate": 0.00010476339292475188, + "loss": 1.9836, + "step": 22001 + }, + { + "epoch": 2.567028351417571, + "grad_norm": 1.1185532808303833, + "learning_rate": 0.00010474904155330069, + "loss": 1.9869, + "step": 22002 + }, + { + "epoch": 2.5671450239178624, + "grad_norm": 1.0477818250656128, + "learning_rate": 0.00010473469066651164, + "loss": 1.9364, + "step": 22003 + }, + { + "epoch": 2.5672616964181545, + "grad_norm": 1.1428598165512085, + "learning_rate": 0.00010472034026453363, + "loss": 1.9423, + "step": 22004 + }, + { + "epoch": 2.5673783689184457, + "grad_norm": 1.1291383504867554, + "learning_rate": 0.00010470599034751537, + "loss": 1.8577, + "step": 22005 + }, + { + "epoch": 2.567495041418738, + "grad_norm": 1.0459152460098267, + "learning_rate": 0.00010469164091560561, + "loss": 1.8673, + "step": 22006 + }, + { + "epoch": 2.567611713919029, + "grad_norm": 1.2420294284820557, + "learning_rate": 0.00010467729196895318, + "loss": 2.1636, + "step": 22007 + }, + { + "epoch": 2.567728386419321, + "grad_norm": 1.0724513530731201, + "learning_rate": 0.00010466294350770673, + "loss": 1.9065, + "step": 22008 + }, + { + "epoch": 2.5678450589196125, + "grad_norm": 1.2243989706039429, + "learning_rate": 0.0001046485955320151, + "loss": 1.9752, + "step": 22009 + }, + { + "epoch": 2.5679617314199046, + "grad_norm": 1.0061839818954468, + "learning_rate": 0.00010463424804202697, + "loss": 1.8521, + "step": 22010 + }, + { + "epoch": 2.568078403920196, + "grad_norm": 1.081851601600647, + "learning_rate": 0.00010461990103789115, + "loss": 1.9792, + "step": 22011 + }, + { + "epoch": 2.568195076420488, + "grad_norm": 1.0736522674560547, + "learning_rate": 0.00010460555451975632, + "loss": 1.9682, + "step": 22012 + }, + { + "epoch": 2.568311748920779, + "grad_norm": 1.0827971696853638, + "learning_rate": 0.00010459120848777118, + "loss": 1.9017, + "step": 22013 + }, + { + "epoch": 2.5684284214210713, + "grad_norm": 1.0365151166915894, + "learning_rate": 0.00010457686294208453, + "loss": 2.0519, + "step": 22014 + }, + { + "epoch": 2.5685450939213625, + "grad_norm": 1.2390987873077393, + "learning_rate": 0.00010456251788284505, + "loss": 1.9031, + "step": 22015 + }, + { + "epoch": 2.5686617664216547, + "grad_norm": 1.1218074560165405, + "learning_rate": 0.00010454817331020142, + "loss": 1.8464, + "step": 22016 + }, + { + "epoch": 2.568778438921946, + "grad_norm": 1.0188454389572144, + "learning_rate": 0.00010453382922430239, + "loss": 1.9317, + "step": 22017 + }, + { + "epoch": 2.568895111422238, + "grad_norm": 1.0592467784881592, + "learning_rate": 0.0001045194856252967, + "loss": 2.0065, + "step": 22018 + }, + { + "epoch": 2.5690117839225293, + "grad_norm": 1.1573736667633057, + "learning_rate": 0.00010450514251333299, + "loss": 1.8494, + "step": 22019 + }, + { + "epoch": 2.5691284564228214, + "grad_norm": 1.2443437576293945, + "learning_rate": 0.00010449079988855997, + "loss": 1.9133, + "step": 22020 + }, + { + "epoch": 2.5692451289231126, + "grad_norm": 1.168931245803833, + "learning_rate": 0.0001044764577511263, + "loss": 1.8755, + "step": 22021 + }, + { + "epoch": 2.5693618014234048, + "grad_norm": 1.1070393323898315, + "learning_rate": 0.00010446211610118074, + "loss": 1.9746, + "step": 22022 + }, + { + "epoch": 2.569478473923696, + "grad_norm": 1.3414829969406128, + "learning_rate": 0.00010444777493887186, + "loss": 2.0639, + "step": 22023 + }, + { + "epoch": 2.569595146423988, + "grad_norm": 0.9999483823776245, + "learning_rate": 0.00010443343426434846, + "loss": 2.0103, + "step": 22024 + }, + { + "epoch": 2.5697118189242794, + "grad_norm": 1.0217058658599854, + "learning_rate": 0.00010441909407775907, + "loss": 1.8484, + "step": 22025 + }, + { + "epoch": 2.5698284914245715, + "grad_norm": 1.097883939743042, + "learning_rate": 0.0001044047543792525, + "loss": 2.1267, + "step": 22026 + }, + { + "epoch": 2.5699451639248627, + "grad_norm": 1.189744472503662, + "learning_rate": 0.00010439041516897726, + "loss": 2.0684, + "step": 22027 + }, + { + "epoch": 2.570061836425155, + "grad_norm": 1.016915202140808, + "learning_rate": 0.00010437607644708214, + "loss": 1.7936, + "step": 22028 + }, + { + "epoch": 2.570178508925446, + "grad_norm": 1.2601234912872314, + "learning_rate": 0.0001043617382137157, + "loss": 2.1899, + "step": 22029 + }, + { + "epoch": 2.570295181425738, + "grad_norm": 1.0980188846588135, + "learning_rate": 0.00010434740046902657, + "loss": 1.8816, + "step": 22030 + }, + { + "epoch": 2.5704118539260294, + "grad_norm": 1.0667359828948975, + "learning_rate": 0.00010433306321316348, + "loss": 1.9375, + "step": 22031 + }, + { + "epoch": 2.5705285264263216, + "grad_norm": 1.1624259948730469, + "learning_rate": 0.00010431872644627496, + "loss": 1.9502, + "step": 22032 + }, + { + "epoch": 2.570645198926613, + "grad_norm": 1.0018339157104492, + "learning_rate": 0.00010430439016850973, + "loss": 1.7663, + "step": 22033 + }, + { + "epoch": 2.570761871426905, + "grad_norm": 1.1596511602401733, + "learning_rate": 0.00010429005438001634, + "loss": 1.9052, + "step": 22034 + }, + { + "epoch": 2.570878543927196, + "grad_norm": 1.0422359704971313, + "learning_rate": 0.00010427571908094346, + "loss": 2.036, + "step": 22035 + }, + { + "epoch": 2.5709952164274883, + "grad_norm": 1.0812277793884277, + "learning_rate": 0.00010426138427143963, + "loss": 1.891, + "step": 22036 + }, + { + "epoch": 2.5711118889277795, + "grad_norm": 0.9203967452049255, + "learning_rate": 0.00010424704995165358, + "loss": 1.8729, + "step": 22037 + }, + { + "epoch": 2.5712285614280717, + "grad_norm": 1.265570878982544, + "learning_rate": 0.0001042327161217338, + "loss": 1.9933, + "step": 22038 + }, + { + "epoch": 2.571345233928363, + "grad_norm": 1.2630985975265503, + "learning_rate": 0.00010421838278182892, + "loss": 2.1169, + "step": 22039 + }, + { + "epoch": 2.571461906428655, + "grad_norm": 1.3853110074996948, + "learning_rate": 0.00010420404993208752, + "loss": 2.0966, + "step": 22040 + }, + { + "epoch": 2.5715785789289463, + "grad_norm": 1.0742403268814087, + "learning_rate": 0.00010418971757265826, + "loss": 1.8882, + "step": 22041 + }, + { + "epoch": 2.5716952514292384, + "grad_norm": 1.0818367004394531, + "learning_rate": 0.0001041753857036896, + "loss": 1.9033, + "step": 22042 + }, + { + "epoch": 2.5718119239295296, + "grad_norm": 1.2286499738693237, + "learning_rate": 0.00010416105432533019, + "loss": 1.9919, + "step": 22043 + }, + { + "epoch": 2.5719285964298217, + "grad_norm": 1.2616633176803589, + "learning_rate": 0.00010414672343772864, + "loss": 2.0681, + "step": 22044 + }, + { + "epoch": 2.572045268930113, + "grad_norm": 1.2873737812042236, + "learning_rate": 0.00010413239304103345, + "loss": 2.1075, + "step": 22045 + }, + { + "epoch": 2.572161941430405, + "grad_norm": 1.0809412002563477, + "learning_rate": 0.0001041180631353932, + "loss": 1.9846, + "step": 22046 + }, + { + "epoch": 2.5722786139306963, + "grad_norm": 1.0915428400039673, + "learning_rate": 0.00010410373372095647, + "loss": 2.1165, + "step": 22047 + }, + { + "epoch": 2.5723952864309885, + "grad_norm": 1.2099497318267822, + "learning_rate": 0.0001040894047978718, + "loss": 2.0457, + "step": 22048 + }, + { + "epoch": 2.5725119589312797, + "grad_norm": 1.2074671983718872, + "learning_rate": 0.00010407507636628768, + "loss": 1.7148, + "step": 22049 + }, + { + "epoch": 2.572628631431572, + "grad_norm": 1.2523787021636963, + "learning_rate": 0.00010406074842635277, + "loss": 1.9953, + "step": 22050 + }, + { + "epoch": 2.572745303931863, + "grad_norm": 1.2477710247039795, + "learning_rate": 0.00010404642097821545, + "loss": 2.1313, + "step": 22051 + }, + { + "epoch": 2.572861976432155, + "grad_norm": 1.1170427799224854, + "learning_rate": 0.00010403209402202442, + "loss": 1.9432, + "step": 22052 + }, + { + "epoch": 2.5729786489324464, + "grad_norm": 1.1955811977386475, + "learning_rate": 0.00010401776755792804, + "loss": 2.1333, + "step": 22053 + }, + { + "epoch": 2.5730953214327386, + "grad_norm": 1.378069281578064, + "learning_rate": 0.000104003441586075, + "loss": 2.0152, + "step": 22054 + }, + { + "epoch": 2.57321199393303, + "grad_norm": 1.0720595121383667, + "learning_rate": 0.00010398911610661367, + "loss": 1.8683, + "step": 22055 + }, + { + "epoch": 2.573328666433322, + "grad_norm": 1.260565161705017, + "learning_rate": 0.00010397479111969261, + "loss": 2.0112, + "step": 22056 + }, + { + "epoch": 2.573445338933613, + "grad_norm": 0.9810974597930908, + "learning_rate": 0.0001039604666254604, + "loss": 1.8404, + "step": 22057 + }, + { + "epoch": 2.5735620114339053, + "grad_norm": 1.0602893829345703, + "learning_rate": 0.00010394614262406541, + "loss": 1.9618, + "step": 22058 + }, + { + "epoch": 2.5736786839341965, + "grad_norm": 1.2488926649093628, + "learning_rate": 0.00010393181911565628, + "loss": 2.1092, + "step": 22059 + }, + { + "epoch": 2.5737953564344886, + "grad_norm": 1.2380706071853638, + "learning_rate": 0.00010391749610038133, + "loss": 1.9566, + "step": 22060 + }, + { + "epoch": 2.57391202893478, + "grad_norm": 0.9811068773269653, + "learning_rate": 0.00010390317357838924, + "loss": 1.8726, + "step": 22061 + }, + { + "epoch": 2.574028701435072, + "grad_norm": 1.033198356628418, + "learning_rate": 0.00010388885154982828, + "loss": 1.9456, + "step": 22062 + }, + { + "epoch": 2.5741453739353632, + "grad_norm": 1.0627809762954712, + "learning_rate": 0.00010387453001484713, + "loss": 1.9211, + "step": 22063 + }, + { + "epoch": 2.5742620464356554, + "grad_norm": 1.1104427576065063, + "learning_rate": 0.00010386020897359411, + "loss": 2.0853, + "step": 22064 + }, + { + "epoch": 2.5743787189359466, + "grad_norm": 1.1925469636917114, + "learning_rate": 0.00010384588842621777, + "loss": 2.0789, + "step": 22065 + }, + { + "epoch": 2.5744953914362387, + "grad_norm": 1.114080548286438, + "learning_rate": 0.00010383156837286649, + "loss": 2.0662, + "step": 22066 + }, + { + "epoch": 2.57461206393653, + "grad_norm": 1.097293734550476, + "learning_rate": 0.00010381724881368881, + "loss": 1.877, + "step": 22067 + }, + { + "epoch": 2.574728736436822, + "grad_norm": 1.0967007875442505, + "learning_rate": 0.0001038029297488331, + "loss": 1.8156, + "step": 22068 + }, + { + "epoch": 2.5748454089371133, + "grad_norm": 1.105331301689148, + "learning_rate": 0.00010378861117844783, + "loss": 2.0689, + "step": 22069 + }, + { + "epoch": 2.5749620814374055, + "grad_norm": 1.3789987564086914, + "learning_rate": 0.00010377429310268154, + "loss": 1.9626, + "step": 22070 + }, + { + "epoch": 2.5750787539376967, + "grad_norm": 1.156373143196106, + "learning_rate": 0.00010375997552168251, + "loss": 1.9745, + "step": 22071 + }, + { + "epoch": 2.575195426437989, + "grad_norm": 1.103050947189331, + "learning_rate": 0.00010374565843559929, + "loss": 2.053, + "step": 22072 + }, + { + "epoch": 2.57531209893828, + "grad_norm": 1.1276946067810059, + "learning_rate": 0.0001037313418445802, + "loss": 1.9332, + "step": 22073 + }, + { + "epoch": 2.575428771438572, + "grad_norm": 1.1043641567230225, + "learning_rate": 0.00010371702574877376, + "loss": 1.903, + "step": 22074 + }, + { + "epoch": 2.5755454439388634, + "grad_norm": 1.0141186714172363, + "learning_rate": 0.00010370271014832826, + "loss": 1.9863, + "step": 22075 + }, + { + "epoch": 2.5756621164391555, + "grad_norm": 1.049548625946045, + "learning_rate": 0.00010368839504339225, + "loss": 1.8045, + "step": 22076 + }, + { + "epoch": 2.575778788939447, + "grad_norm": 1.175697684288025, + "learning_rate": 0.00010367408043411401, + "loss": 1.99, + "step": 22077 + }, + { + "epoch": 2.575895461439739, + "grad_norm": 1.0768206119537354, + "learning_rate": 0.00010365976632064205, + "loss": 1.9389, + "step": 22078 + }, + { + "epoch": 2.57601213394003, + "grad_norm": 1.203576683998108, + "learning_rate": 0.00010364545270312465, + "loss": 1.9181, + "step": 22079 + }, + { + "epoch": 2.5761288064403223, + "grad_norm": 1.3274493217468262, + "learning_rate": 0.0001036311395817103, + "loss": 2.1403, + "step": 22080 + }, + { + "epoch": 2.5762454789406135, + "grad_norm": 1.2708077430725098, + "learning_rate": 0.00010361682695654731, + "loss": 1.9233, + "step": 22081 + }, + { + "epoch": 2.5763621514409056, + "grad_norm": 1.0587677955627441, + "learning_rate": 0.00010360251482778406, + "loss": 1.9853, + "step": 22082 + }, + { + "epoch": 2.576478823941197, + "grad_norm": 1.1927107572555542, + "learning_rate": 0.000103588203195569, + "loss": 2.2064, + "step": 22083 + }, + { + "epoch": 2.576595496441489, + "grad_norm": 1.041742205619812, + "learning_rate": 0.00010357389206005039, + "loss": 1.8891, + "step": 22084 + }, + { + "epoch": 2.5767121689417802, + "grad_norm": 1.3117151260375977, + "learning_rate": 0.00010355958142137668, + "loss": 2.0821, + "step": 22085 + }, + { + "epoch": 2.5768288414420724, + "grad_norm": 1.0872013568878174, + "learning_rate": 0.00010354527127969615, + "loss": 1.939, + "step": 22086 + }, + { + "epoch": 2.5769455139423636, + "grad_norm": 0.9964025020599365, + "learning_rate": 0.00010353096163515725, + "loss": 1.9271, + "step": 22087 + }, + { + "epoch": 2.5770621864426557, + "grad_norm": 1.2611125707626343, + "learning_rate": 0.00010351665248790823, + "loss": 1.9697, + "step": 22088 + }, + { + "epoch": 2.577178858942947, + "grad_norm": 1.0650362968444824, + "learning_rate": 0.00010350234383809748, + "loss": 1.9167, + "step": 22089 + }, + { + "epoch": 2.577295531443239, + "grad_norm": 1.0405948162078857, + "learning_rate": 0.00010348803568587333, + "loss": 1.8559, + "step": 22090 + }, + { + "epoch": 2.5774122039435303, + "grad_norm": 1.276466965675354, + "learning_rate": 0.00010347372803138414, + "loss": 1.9865, + "step": 22091 + }, + { + "epoch": 2.5775288764438224, + "grad_norm": 1.1151447296142578, + "learning_rate": 0.00010345942087477813, + "loss": 2.1143, + "step": 22092 + }, + { + "epoch": 2.5776455489441137, + "grad_norm": 1.115298867225647, + "learning_rate": 0.00010344511421620377, + "loss": 1.9205, + "step": 22093 + }, + { + "epoch": 2.577762221444406, + "grad_norm": 1.2438008785247803, + "learning_rate": 0.00010343080805580923, + "loss": 1.9994, + "step": 22094 + }, + { + "epoch": 2.577878893944697, + "grad_norm": 1.0338634252548218, + "learning_rate": 0.00010341650239374289, + "loss": 1.8803, + "step": 22095 + }, + { + "epoch": 2.577995566444989, + "grad_norm": 1.0652226209640503, + "learning_rate": 0.00010340219723015312, + "loss": 1.7245, + "step": 22096 + }, + { + "epoch": 2.5781122389452804, + "grad_norm": 1.3042391538619995, + "learning_rate": 0.00010338789256518813, + "loss": 2.0197, + "step": 22097 + }, + { + "epoch": 2.5782289114455725, + "grad_norm": 1.240767478942871, + "learning_rate": 0.00010337358839899628, + "loss": 1.9536, + "step": 22098 + }, + { + "epoch": 2.5783455839458638, + "grad_norm": 1.1366392374038696, + "learning_rate": 0.00010335928473172574, + "loss": 2.0863, + "step": 22099 + }, + { + "epoch": 2.578462256446156, + "grad_norm": 1.3262437582015991, + "learning_rate": 0.00010334498156352495, + "loss": 2.1634, + "step": 22100 + }, + { + "epoch": 2.578578928946447, + "grad_norm": 1.4312928915023804, + "learning_rate": 0.00010333067889454205, + "loss": 2.1192, + "step": 22101 + }, + { + "epoch": 2.5786956014467393, + "grad_norm": 1.2412492036819458, + "learning_rate": 0.00010331637672492545, + "loss": 2.0237, + "step": 22102 + }, + { + "epoch": 2.5788122739470305, + "grad_norm": 1.1099315881729126, + "learning_rate": 0.00010330207505482327, + "loss": 1.9572, + "step": 22103 + }, + { + "epoch": 2.5789289464473226, + "grad_norm": 1.0947911739349365, + "learning_rate": 0.00010328777388438393, + "loss": 1.9169, + "step": 22104 + }, + { + "epoch": 2.579045618947614, + "grad_norm": 1.1063132286071777, + "learning_rate": 0.00010327347321375552, + "loss": 1.9423, + "step": 22105 + }, + { + "epoch": 2.579162291447906, + "grad_norm": 1.0442944765090942, + "learning_rate": 0.0001032591730430865, + "loss": 1.9657, + "step": 22106 + }, + { + "epoch": 2.5792789639481972, + "grad_norm": 1.1038960218429565, + "learning_rate": 0.00010324487337252493, + "loss": 1.9839, + "step": 22107 + }, + { + "epoch": 2.5793956364484893, + "grad_norm": 1.1375348567962646, + "learning_rate": 0.0001032305742022191, + "loss": 2.0971, + "step": 22108 + }, + { + "epoch": 2.5795123089487806, + "grad_norm": 1.1448036432266235, + "learning_rate": 0.00010321627553231734, + "loss": 1.9329, + "step": 22109 + }, + { + "epoch": 2.5796289814490727, + "grad_norm": 1.0657908916473389, + "learning_rate": 0.00010320197736296777, + "loss": 1.8105, + "step": 22110 + }, + { + "epoch": 2.579745653949364, + "grad_norm": 1.1147792339324951, + "learning_rate": 0.00010318767969431869, + "loss": 2.0392, + "step": 22111 + }, + { + "epoch": 2.579862326449656, + "grad_norm": 1.3543624877929688, + "learning_rate": 0.00010317338252651827, + "loss": 2.1318, + "step": 22112 + }, + { + "epoch": 2.5799789989499473, + "grad_norm": 1.2046233415603638, + "learning_rate": 0.0001031590858597148, + "loss": 2.0052, + "step": 22113 + }, + { + "epoch": 2.5800956714502394, + "grad_norm": 0.9886380434036255, + "learning_rate": 0.00010314478969405642, + "loss": 1.8109, + "step": 22114 + }, + { + "epoch": 2.5802123439505307, + "grad_norm": 1.0550857782363892, + "learning_rate": 0.00010313049402969135, + "loss": 2.0021, + "step": 22115 + }, + { + "epoch": 2.580329016450823, + "grad_norm": 1.1883920431137085, + "learning_rate": 0.00010311619886676781, + "loss": 2.0533, + "step": 22116 + }, + { + "epoch": 2.580445688951114, + "grad_norm": 1.224244475364685, + "learning_rate": 0.00010310190420543404, + "loss": 1.9708, + "step": 22117 + }, + { + "epoch": 2.580562361451406, + "grad_norm": 1.3653080463409424, + "learning_rate": 0.0001030876100458381, + "loss": 2.0506, + "step": 22118 + }, + { + "epoch": 2.5806790339516974, + "grad_norm": 1.3497380018234253, + "learning_rate": 0.00010307331638812832, + "loss": 2.1014, + "step": 22119 + }, + { + "epoch": 2.5807957064519895, + "grad_norm": 1.1156094074249268, + "learning_rate": 0.00010305902323245278, + "loss": 1.8899, + "step": 22120 + }, + { + "epoch": 2.5809123789522808, + "grad_norm": 1.4089540243148804, + "learning_rate": 0.00010304473057895967, + "loss": 1.9768, + "step": 22121 + }, + { + "epoch": 2.581029051452573, + "grad_norm": 1.0631521940231323, + "learning_rate": 0.00010303043842779725, + "loss": 1.9567, + "step": 22122 + }, + { + "epoch": 2.581145723952864, + "grad_norm": 1.093322515487671, + "learning_rate": 0.00010301614677911361, + "loss": 1.8019, + "step": 22123 + }, + { + "epoch": 2.5812623964531562, + "grad_norm": 1.2500267028808594, + "learning_rate": 0.00010300185563305693, + "loss": 1.9999, + "step": 22124 + }, + { + "epoch": 2.5813790689534475, + "grad_norm": 1.2420878410339355, + "learning_rate": 0.0001029875649897753, + "loss": 1.9559, + "step": 22125 + }, + { + "epoch": 2.5814957414537396, + "grad_norm": 1.1253564357757568, + "learning_rate": 0.00010297327484941701, + "loss": 2.0833, + "step": 22126 + }, + { + "epoch": 2.581612413954031, + "grad_norm": 1.2805356979370117, + "learning_rate": 0.00010295898521213003, + "loss": 1.9156, + "step": 22127 + }, + { + "epoch": 2.581729086454323, + "grad_norm": 1.075031042098999, + "learning_rate": 0.00010294469607806264, + "loss": 1.7715, + "step": 22128 + }, + { + "epoch": 2.581845758954614, + "grad_norm": 1.1710227727890015, + "learning_rate": 0.00010293040744736288, + "loss": 2.0828, + "step": 22129 + }, + { + "epoch": 2.5819624314549063, + "grad_norm": 1.107580304145813, + "learning_rate": 0.00010291611932017897, + "loss": 1.9492, + "step": 22130 + }, + { + "epoch": 2.5820791039551976, + "grad_norm": 1.1924530267715454, + "learning_rate": 0.00010290183169665893, + "loss": 2.0631, + "step": 22131 + }, + { + "epoch": 2.5821957764554897, + "grad_norm": 1.332416296005249, + "learning_rate": 0.00010288754457695098, + "loss": 2.0795, + "step": 22132 + }, + { + "epoch": 2.582312448955781, + "grad_norm": 1.118451476097107, + "learning_rate": 0.00010287325796120318, + "loss": 1.8924, + "step": 22133 + }, + { + "epoch": 2.582429121456073, + "grad_norm": 1.2077525854110718, + "learning_rate": 0.0001028589718495636, + "loss": 2.0797, + "step": 22134 + }, + { + "epoch": 2.5825457939563643, + "grad_norm": 1.0860432386398315, + "learning_rate": 0.00010284468624218047, + "loss": 1.995, + "step": 22135 + }, + { + "epoch": 2.5826624664566564, + "grad_norm": 1.0015641450881958, + "learning_rate": 0.0001028304011392017, + "loss": 1.7703, + "step": 22136 + }, + { + "epoch": 2.5827791389569477, + "grad_norm": 1.2158187627792358, + "learning_rate": 0.00010281611654077558, + "loss": 2.1386, + "step": 22137 + }, + { + "epoch": 2.58289581145724, + "grad_norm": 1.172221064567566, + "learning_rate": 0.00010280183244705005, + "loss": 2.0308, + "step": 22138 + }, + { + "epoch": 2.583012483957531, + "grad_norm": 1.2479794025421143, + "learning_rate": 0.00010278754885817327, + "loss": 1.9328, + "step": 22139 + }, + { + "epoch": 2.583129156457823, + "grad_norm": 1.1735169887542725, + "learning_rate": 0.00010277326577429328, + "loss": 1.9137, + "step": 22140 + }, + { + "epoch": 2.5832458289581144, + "grad_norm": 1.019706130027771, + "learning_rate": 0.00010275898319555814, + "loss": 1.9136, + "step": 22141 + }, + { + "epoch": 2.5833625014584065, + "grad_norm": 1.2997126579284668, + "learning_rate": 0.00010274470112211596, + "loss": 2.203, + "step": 22142 + }, + { + "epoch": 2.5834791739586977, + "grad_norm": 1.2198320627212524, + "learning_rate": 0.0001027304195541148, + "loss": 2.09, + "step": 22143 + }, + { + "epoch": 2.58359584645899, + "grad_norm": 1.176699161529541, + "learning_rate": 0.00010271613849170265, + "loss": 1.9962, + "step": 22144 + }, + { + "epoch": 2.583712518959281, + "grad_norm": 1.098904013633728, + "learning_rate": 0.00010270185793502765, + "loss": 1.9548, + "step": 22145 + }, + { + "epoch": 2.5838291914595732, + "grad_norm": 1.1880966424942017, + "learning_rate": 0.00010268757788423775, + "loss": 2.1413, + "step": 22146 + }, + { + "epoch": 2.5839458639598645, + "grad_norm": 1.226907730102539, + "learning_rate": 0.00010267329833948102, + "loss": 2.0888, + "step": 22147 + }, + { + "epoch": 2.5840625364601566, + "grad_norm": 1.2139679193496704, + "learning_rate": 0.0001026590193009056, + "loss": 2.0706, + "step": 22148 + }, + { + "epoch": 2.584179208960448, + "grad_norm": 1.2931840419769287, + "learning_rate": 0.00010264474076865939, + "loss": 1.9079, + "step": 22149 + }, + { + "epoch": 2.58429588146074, + "grad_norm": 1.0791393518447876, + "learning_rate": 0.00010263046274289047, + "loss": 2.1231, + "step": 22150 + }, + { + "epoch": 2.584412553961031, + "grad_norm": 1.150927186012268, + "learning_rate": 0.00010261618522374678, + "loss": 2.0567, + "step": 22151 + }, + { + "epoch": 2.5845292264613233, + "grad_norm": 1.074655532836914, + "learning_rate": 0.00010260190821137651, + "loss": 2.0999, + "step": 22152 + }, + { + "epoch": 2.5846458989616146, + "grad_norm": 1.0354779958724976, + "learning_rate": 0.00010258763170592745, + "loss": 1.9955, + "step": 22153 + }, + { + "epoch": 2.5847625714619067, + "grad_norm": 0.9772351384162903, + "learning_rate": 0.00010257335570754779, + "loss": 1.8899, + "step": 22154 + }, + { + "epoch": 2.584879243962198, + "grad_norm": 1.2938311100006104, + "learning_rate": 0.00010255908021638538, + "loss": 2.2019, + "step": 22155 + }, + { + "epoch": 2.58499591646249, + "grad_norm": 1.1775333881378174, + "learning_rate": 0.00010254480523258836, + "loss": 1.9969, + "step": 22156 + }, + { + "epoch": 2.5851125889627813, + "grad_norm": 0.9769450426101685, + "learning_rate": 0.00010253053075630454, + "loss": 1.9755, + "step": 22157 + }, + { + "epoch": 2.5852292614630734, + "grad_norm": 1.2012848854064941, + "learning_rate": 0.0001025162567876821, + "loss": 2.1151, + "step": 22158 + }, + { + "epoch": 2.5853459339633647, + "grad_norm": 0.9640457034111023, + "learning_rate": 0.00010250198332686886, + "loss": 1.9246, + "step": 22159 + }, + { + "epoch": 2.5854626064636568, + "grad_norm": 1.2101738452911377, + "learning_rate": 0.00010248771037401284, + "loss": 1.9915, + "step": 22160 + }, + { + "epoch": 2.585579278963948, + "grad_norm": 1.262723445892334, + "learning_rate": 0.00010247343792926207, + "loss": 2.1019, + "step": 22161 + }, + { + "epoch": 2.58569595146424, + "grad_norm": 0.9956448674201965, + "learning_rate": 0.00010245916599276438, + "loss": 1.7375, + "step": 22162 + }, + { + "epoch": 2.5858126239645314, + "grad_norm": 0.9558485746383667, + "learning_rate": 0.00010244489456466785, + "loss": 1.97, + "step": 22163 + }, + { + "epoch": 2.5859292964648235, + "grad_norm": 1.1319893598556519, + "learning_rate": 0.00010243062364512036, + "loss": 2.1739, + "step": 22164 + }, + { + "epoch": 2.5860459689651147, + "grad_norm": 1.1050463914871216, + "learning_rate": 0.00010241635323426992, + "loss": 1.9322, + "step": 22165 + }, + { + "epoch": 2.586162641465407, + "grad_norm": 1.1384249925613403, + "learning_rate": 0.00010240208333226439, + "loss": 1.9958, + "step": 22166 + }, + { + "epoch": 2.586279313965698, + "grad_norm": 1.0156930685043335, + "learning_rate": 0.00010238781393925175, + "loss": 1.9614, + "step": 22167 + }, + { + "epoch": 2.5863959864659902, + "grad_norm": 1.1071105003356934, + "learning_rate": 0.0001023735450553799, + "loss": 1.9641, + "step": 22168 + }, + { + "epoch": 2.5865126589662815, + "grad_norm": 1.0340592861175537, + "learning_rate": 0.00010235927668079684, + "loss": 1.9543, + "step": 22169 + }, + { + "epoch": 2.5866293314665736, + "grad_norm": 1.1100746393203735, + "learning_rate": 0.00010234500881565035, + "loss": 1.7389, + "step": 22170 + }, + { + "epoch": 2.586746003966865, + "grad_norm": 1.254228115081787, + "learning_rate": 0.0001023307414600885, + "loss": 1.8989, + "step": 22171 + }, + { + "epoch": 2.586862676467157, + "grad_norm": 1.0254238843917847, + "learning_rate": 0.00010231647461425907, + "loss": 1.8061, + "step": 22172 + }, + { + "epoch": 2.586979348967448, + "grad_norm": 1.070016622543335, + "learning_rate": 0.00010230220827831004, + "loss": 1.7908, + "step": 22173 + }, + { + "epoch": 2.5870960214677403, + "grad_norm": 1.3387913703918457, + "learning_rate": 0.00010228794245238927, + "loss": 2.0967, + "step": 22174 + }, + { + "epoch": 2.5872126939680316, + "grad_norm": 1.146721363067627, + "learning_rate": 0.00010227367713664468, + "loss": 1.8696, + "step": 22175 + }, + { + "epoch": 2.5873293664683237, + "grad_norm": 0.9496138095855713, + "learning_rate": 0.00010225941233122419, + "loss": 1.974, + "step": 22176 + }, + { + "epoch": 2.587446038968615, + "grad_norm": 0.952260434627533, + "learning_rate": 0.00010224514803627556, + "loss": 1.9182, + "step": 22177 + }, + { + "epoch": 2.587562711468907, + "grad_norm": 1.2344982624053955, + "learning_rate": 0.0001022308842519468, + "loss": 1.8838, + "step": 22178 + }, + { + "epoch": 2.5876793839691983, + "grad_norm": 1.2061125040054321, + "learning_rate": 0.00010221662097838568, + "loss": 1.9764, + "step": 22179 + }, + { + "epoch": 2.5877960564694904, + "grad_norm": 1.0804468393325806, + "learning_rate": 0.00010220235821574015, + "loss": 1.9816, + "step": 22180 + }, + { + "epoch": 2.5879127289697816, + "grad_norm": 1.1807172298431396, + "learning_rate": 0.00010218809596415798, + "loss": 2.1131, + "step": 22181 + }, + { + "epoch": 2.5880294014700738, + "grad_norm": 1.14559006690979, + "learning_rate": 0.00010217383422378712, + "loss": 2.0117, + "step": 22182 + }, + { + "epoch": 2.588146073970365, + "grad_norm": 1.0752208232879639, + "learning_rate": 0.00010215957299477535, + "loss": 1.8708, + "step": 22183 + }, + { + "epoch": 2.588262746470657, + "grad_norm": 1.1807454824447632, + "learning_rate": 0.00010214531227727056, + "loss": 1.9772, + "step": 22184 + }, + { + "epoch": 2.5883794189709484, + "grad_norm": 1.1396293640136719, + "learning_rate": 0.00010213105207142053, + "loss": 2.0542, + "step": 22185 + }, + { + "epoch": 2.5884960914712405, + "grad_norm": 1.1560763120651245, + "learning_rate": 0.00010211679237737314, + "loss": 2.1248, + "step": 22186 + }, + { + "epoch": 2.5886127639715317, + "grad_norm": 1.101867914199829, + "learning_rate": 0.00010210253319527627, + "loss": 1.948, + "step": 22187 + }, + { + "epoch": 2.588729436471824, + "grad_norm": 1.0958774089813232, + "learning_rate": 0.00010208827452527761, + "loss": 1.773, + "step": 22188 + }, + { + "epoch": 2.588846108972115, + "grad_norm": 1.1683510541915894, + "learning_rate": 0.00010207401636752513, + "loss": 1.9055, + "step": 22189 + }, + { + "epoch": 2.588962781472407, + "grad_norm": 1.2226719856262207, + "learning_rate": 0.00010205975872216648, + "loss": 2.076, + "step": 22190 + }, + { + "epoch": 2.5890794539726985, + "grad_norm": 1.1443331241607666, + "learning_rate": 0.00010204550158934963, + "loss": 1.928, + "step": 22191 + }, + { + "epoch": 2.5891961264729906, + "grad_norm": 1.1221541166305542, + "learning_rate": 0.00010203124496922226, + "loss": 1.935, + "step": 22192 + }, + { + "epoch": 2.589312798973282, + "grad_norm": 1.175707221031189, + "learning_rate": 0.00010201698886193227, + "loss": 1.9131, + "step": 22193 + }, + { + "epoch": 2.589429471473574, + "grad_norm": 1.1015173196792603, + "learning_rate": 0.00010200273326762733, + "loss": 2.0319, + "step": 22194 + }, + { + "epoch": 2.589546143973865, + "grad_norm": 1.2097402811050415, + "learning_rate": 0.00010198847818645533, + "loss": 1.6679, + "step": 22195 + }, + { + "epoch": 2.5896628164741573, + "grad_norm": 1.1102701425552368, + "learning_rate": 0.00010197422361856396, + "loss": 2.1255, + "step": 22196 + }, + { + "epoch": 2.5897794889744485, + "grad_norm": 1.0585060119628906, + "learning_rate": 0.00010195996956410112, + "loss": 2.0346, + "step": 22197 + }, + { + "epoch": 2.5898961614747407, + "grad_norm": 1.169904351234436, + "learning_rate": 0.00010194571602321442, + "loss": 1.9488, + "step": 22198 + }, + { + "epoch": 2.590012833975032, + "grad_norm": 1.0111145973205566, + "learning_rate": 0.00010193146299605177, + "loss": 1.9684, + "step": 22199 + }, + { + "epoch": 2.590129506475324, + "grad_norm": 1.2114894390106201, + "learning_rate": 0.00010191721048276085, + "loss": 1.9506, + "step": 22200 + }, + { + "epoch": 2.5902461789756153, + "grad_norm": 1.1460983753204346, + "learning_rate": 0.00010190295848348946, + "loss": 1.9461, + "step": 22201 + }, + { + "epoch": 2.5903628514759074, + "grad_norm": 1.1579859256744385, + "learning_rate": 0.00010188870699838533, + "loss": 1.9955, + "step": 22202 + }, + { + "epoch": 2.5904795239761986, + "grad_norm": 1.203201413154602, + "learning_rate": 0.00010187445602759616, + "loss": 1.9303, + "step": 22203 + }, + { + "epoch": 2.5905961964764908, + "grad_norm": 1.1574273109436035, + "learning_rate": 0.00010186020557126978, + "loss": 1.988, + "step": 22204 + }, + { + "epoch": 2.590712868976782, + "grad_norm": 1.1885186433792114, + "learning_rate": 0.0001018459556295538, + "loss": 2.1363, + "step": 22205 + }, + { + "epoch": 2.590829541477074, + "grad_norm": 1.3644131422042847, + "learning_rate": 0.0001018317062025961, + "loss": 2.0587, + "step": 22206 + }, + { + "epoch": 2.5909462139773654, + "grad_norm": 1.1605043411254883, + "learning_rate": 0.00010181745729054424, + "loss": 1.9964, + "step": 22207 + }, + { + "epoch": 2.5910628864776575, + "grad_norm": 1.230273723602295, + "learning_rate": 0.00010180320889354607, + "loss": 2.057, + "step": 22208 + }, + { + "epoch": 2.5911795589779487, + "grad_norm": 1.105108380317688, + "learning_rate": 0.00010178896101174923, + "loss": 1.9004, + "step": 22209 + }, + { + "epoch": 2.591296231478241, + "grad_norm": 1.1778013706207275, + "learning_rate": 0.00010177471364530144, + "loss": 1.9239, + "step": 22210 + }, + { + "epoch": 2.591412903978532, + "grad_norm": 1.1756198406219482, + "learning_rate": 0.00010176046679435042, + "loss": 2.0897, + "step": 22211 + }, + { + "epoch": 2.591529576478824, + "grad_norm": 1.0548028945922852, + "learning_rate": 0.0001017462204590438, + "loss": 2.0155, + "step": 22212 + }, + { + "epoch": 2.5916462489791154, + "grad_norm": 1.104288935661316, + "learning_rate": 0.00010173197463952941, + "loss": 1.9982, + "step": 22213 + }, + { + "epoch": 2.5917629214794076, + "grad_norm": 1.3193167448043823, + "learning_rate": 0.00010171772933595475, + "loss": 2.0632, + "step": 22214 + }, + { + "epoch": 2.591879593979699, + "grad_norm": 1.3526862859725952, + "learning_rate": 0.00010170348454846769, + "loss": 2.1078, + "step": 22215 + }, + { + "epoch": 2.591996266479991, + "grad_norm": 1.1875301599502563, + "learning_rate": 0.00010168924027721572, + "loss": 2.0175, + "step": 22216 + }, + { + "epoch": 2.592112938980282, + "grad_norm": 1.0748317241668701, + "learning_rate": 0.00010167499652234667, + "loss": 1.8906, + "step": 22217 + }, + { + "epoch": 2.5922296114805743, + "grad_norm": 1.2077598571777344, + "learning_rate": 0.0001016607532840081, + "loss": 2.019, + "step": 22218 + }, + { + "epoch": 2.5923462839808655, + "grad_norm": 1.0343352556228638, + "learning_rate": 0.00010164651056234774, + "loss": 1.9736, + "step": 22219 + }, + { + "epoch": 2.5924629564811577, + "grad_norm": 1.1793193817138672, + "learning_rate": 0.00010163226835751312, + "loss": 2.0253, + "step": 22220 + }, + { + "epoch": 2.592579628981449, + "grad_norm": 1.135941982269287, + "learning_rate": 0.00010161802666965208, + "loss": 1.8333, + "step": 22221 + }, + { + "epoch": 2.592696301481741, + "grad_norm": 1.1329174041748047, + "learning_rate": 0.00010160378549891206, + "loss": 1.9358, + "step": 22222 + }, + { + "epoch": 2.5928129739820323, + "grad_norm": 1.2620337009429932, + "learning_rate": 0.00010158954484544086, + "loss": 1.9599, + "step": 22223 + }, + { + "epoch": 2.5929296464823244, + "grad_norm": 1.105287790298462, + "learning_rate": 0.00010157530470938595, + "loss": 1.9138, + "step": 22224 + }, + { + "epoch": 2.5930463189826156, + "grad_norm": 1.3292900323867798, + "learning_rate": 0.00010156106509089512, + "loss": 2.11, + "step": 22225 + }, + { + "epoch": 2.5931629914829077, + "grad_norm": 1.0043299198150635, + "learning_rate": 0.00010154682599011591, + "loss": 1.8245, + "step": 22226 + }, + { + "epoch": 2.593279663983199, + "grad_norm": 1.0306119918823242, + "learning_rate": 0.0001015325874071959, + "loss": 1.9049, + "step": 22227 + }, + { + "epoch": 2.593396336483491, + "grad_norm": 0.932502806186676, + "learning_rate": 0.00010151834934228282, + "loss": 1.951, + "step": 22228 + }, + { + "epoch": 2.5935130089837823, + "grad_norm": 1.1061948537826538, + "learning_rate": 0.00010150411179552411, + "loss": 2.2322, + "step": 22229 + }, + { + "epoch": 2.5936296814840745, + "grad_norm": 1.0975720882415771, + "learning_rate": 0.00010148987476706753, + "loss": 2.0031, + "step": 22230 + }, + { + "epoch": 2.5937463539843657, + "grad_norm": 1.0294649600982666, + "learning_rate": 0.00010147563825706054, + "loss": 2.056, + "step": 22231 + }, + { + "epoch": 2.593863026484658, + "grad_norm": 1.0606648921966553, + "learning_rate": 0.00010146140226565085, + "loss": 1.8207, + "step": 22232 + }, + { + "epoch": 2.593979698984949, + "grad_norm": 1.1835592985153198, + "learning_rate": 0.00010144716679298591, + "loss": 1.8764, + "step": 22233 + }, + { + "epoch": 2.594096371485241, + "grad_norm": 1.0717166662216187, + "learning_rate": 0.00010143293183921343, + "loss": 1.9751, + "step": 22234 + }, + { + "epoch": 2.5942130439855324, + "grad_norm": 1.0205038785934448, + "learning_rate": 0.00010141869740448091, + "loss": 1.9301, + "step": 22235 + }, + { + "epoch": 2.5943297164858246, + "grad_norm": 1.0994659662246704, + "learning_rate": 0.00010140446348893592, + "loss": 2.0014, + "step": 22236 + }, + { + "epoch": 2.594446388986116, + "grad_norm": 0.9964592456817627, + "learning_rate": 0.000101390230092726, + "loss": 1.8247, + "step": 22237 + }, + { + "epoch": 2.594563061486408, + "grad_norm": 1.1445412635803223, + "learning_rate": 0.00010137599721599874, + "loss": 1.9973, + "step": 22238 + }, + { + "epoch": 2.594679733986699, + "grad_norm": 1.117629885673523, + "learning_rate": 0.00010136176485890174, + "loss": 1.8972, + "step": 22239 + }, + { + "epoch": 2.5947964064869913, + "grad_norm": 1.0206291675567627, + "learning_rate": 0.00010134753302158243, + "loss": 1.9986, + "step": 22240 + }, + { + "epoch": 2.5949130789872825, + "grad_norm": 1.1863007545471191, + "learning_rate": 0.00010133330170418847, + "loss": 1.9747, + "step": 22241 + }, + { + "epoch": 2.5950297514875746, + "grad_norm": 1.3726294040679932, + "learning_rate": 0.00010131907090686727, + "loss": 1.988, + "step": 22242 + }, + { + "epoch": 2.595146423987866, + "grad_norm": 1.3051964044570923, + "learning_rate": 0.0001013048406297665, + "loss": 1.8739, + "step": 22243 + }, + { + "epoch": 2.595263096488158, + "grad_norm": 1.0316725969314575, + "learning_rate": 0.00010129061087303358, + "loss": 1.9942, + "step": 22244 + }, + { + "epoch": 2.5953797689884492, + "grad_norm": 1.1758025884628296, + "learning_rate": 0.00010127638163681607, + "loss": 2.0569, + "step": 22245 + }, + { + "epoch": 2.5954964414887414, + "grad_norm": 1.1268956661224365, + "learning_rate": 0.00010126215292126141, + "loss": 2.0734, + "step": 22246 + }, + { + "epoch": 2.5956131139890326, + "grad_norm": 1.3048832416534424, + "learning_rate": 0.00010124792472651726, + "loss": 1.8432, + "step": 22247 + }, + { + "epoch": 2.5957297864893247, + "grad_norm": 1.2164868116378784, + "learning_rate": 0.00010123369705273094, + "loss": 1.9241, + "step": 22248 + }, + { + "epoch": 2.595846458989616, + "grad_norm": 1.0246635675430298, + "learning_rate": 0.0001012194699000501, + "loss": 2.0405, + "step": 22249 + }, + { + "epoch": 2.595963131489908, + "grad_norm": 1.1718074083328247, + "learning_rate": 0.0001012052432686221, + "loss": 1.9148, + "step": 22250 + }, + { + "epoch": 2.5960798039901993, + "grad_norm": 1.0546995401382446, + "learning_rate": 0.00010119101715859456, + "loss": 1.7949, + "step": 22251 + }, + { + "epoch": 2.5961964764904915, + "grad_norm": 1.0986937284469604, + "learning_rate": 0.00010117679157011486, + "loss": 1.8489, + "step": 22252 + }, + { + "epoch": 2.5963131489907827, + "grad_norm": 1.2302058935165405, + "learning_rate": 0.00010116256650333051, + "loss": 2.0904, + "step": 22253 + }, + { + "epoch": 2.596429821491075, + "grad_norm": 1.207536220550537, + "learning_rate": 0.00010114834195838903, + "loss": 2.0609, + "step": 22254 + }, + { + "epoch": 2.596546493991366, + "grad_norm": 1.2297977209091187, + "learning_rate": 0.00010113411793543777, + "loss": 2.15, + "step": 22255 + }, + { + "epoch": 2.596663166491658, + "grad_norm": 1.0361828804016113, + "learning_rate": 0.0001011198944346243, + "loss": 1.8214, + "step": 22256 + }, + { + "epoch": 2.5967798389919494, + "grad_norm": 1.079610824584961, + "learning_rate": 0.000101105671456096, + "loss": 1.8764, + "step": 22257 + }, + { + "epoch": 2.5968965114922415, + "grad_norm": 1.0629656314849854, + "learning_rate": 0.00010109144900000035, + "loss": 2.0609, + "step": 22258 + }, + { + "epoch": 2.597013183992533, + "grad_norm": 1.1677311658859253, + "learning_rate": 0.00010107722706648478, + "loss": 2.0508, + "step": 22259 + }, + { + "epoch": 2.597129856492825, + "grad_norm": 1.0588243007659912, + "learning_rate": 0.00010106300565569677, + "loss": 2.0022, + "step": 22260 + }, + { + "epoch": 2.597246528993116, + "grad_norm": 1.2944388389587402, + "learning_rate": 0.0001010487847677837, + "loss": 1.8387, + "step": 22261 + }, + { + "epoch": 2.5973632014934083, + "grad_norm": 1.1177451610565186, + "learning_rate": 0.000101034564402893, + "loss": 2.0537, + "step": 22262 + }, + { + "epoch": 2.5974798739936995, + "grad_norm": 1.0051769018173218, + "learning_rate": 0.00010102034456117211, + "loss": 1.8222, + "step": 22263 + }, + { + "epoch": 2.5975965464939916, + "grad_norm": 1.1907610893249512, + "learning_rate": 0.0001010061252427684, + "loss": 2.0872, + "step": 22264 + }, + { + "epoch": 2.597713218994283, + "grad_norm": 1.0999443531036377, + "learning_rate": 0.00010099190644782938, + "loss": 2.021, + "step": 22265 + }, + { + "epoch": 2.597829891494575, + "grad_norm": 1.193418264389038, + "learning_rate": 0.00010097768817650235, + "loss": 1.9837, + "step": 22266 + }, + { + "epoch": 2.5979465639948662, + "grad_norm": 1.124986171722412, + "learning_rate": 0.00010096347042893482, + "loss": 1.982, + "step": 22267 + }, + { + "epoch": 2.5980632364951584, + "grad_norm": 1.19712495803833, + "learning_rate": 0.00010094925320527407, + "loss": 2.0094, + "step": 22268 + }, + { + "epoch": 2.5981799089954496, + "grad_norm": 1.0040491819381714, + "learning_rate": 0.00010093503650566752, + "loss": 1.6481, + "step": 22269 + }, + { + "epoch": 2.5982965814957417, + "grad_norm": 1.1163345575332642, + "learning_rate": 0.0001009208203302626, + "loss": 1.9967, + "step": 22270 + }, + { + "epoch": 2.598413253996033, + "grad_norm": 1.073167085647583, + "learning_rate": 0.00010090660467920668, + "loss": 2.015, + "step": 22271 + }, + { + "epoch": 2.598529926496325, + "grad_norm": 1.1340051889419556, + "learning_rate": 0.00010089238955264704, + "loss": 2.0953, + "step": 22272 + }, + { + "epoch": 2.5986465989966163, + "grad_norm": 1.1063863039016724, + "learning_rate": 0.00010087817495073118, + "loss": 2.0454, + "step": 22273 + }, + { + "epoch": 2.5987632714969084, + "grad_norm": 1.2896231412887573, + "learning_rate": 0.00010086396087360635, + "loss": 1.8352, + "step": 22274 + }, + { + "epoch": 2.5988799439971997, + "grad_norm": 1.1443533897399902, + "learning_rate": 0.00010084974732141998, + "loss": 1.9264, + "step": 22275 + }, + { + "epoch": 2.598996616497492, + "grad_norm": 0.9280379414558411, + "learning_rate": 0.00010083553429431936, + "loss": 1.8553, + "step": 22276 + }, + { + "epoch": 2.599113288997783, + "grad_norm": 1.1348721981048584, + "learning_rate": 0.0001008213217924519, + "loss": 1.9263, + "step": 22277 + }, + { + "epoch": 2.599229961498075, + "grad_norm": 1.1725654602050781, + "learning_rate": 0.00010080710981596493, + "loss": 1.9455, + "step": 22278 + }, + { + "epoch": 2.5993466339983664, + "grad_norm": 1.1540569067001343, + "learning_rate": 0.00010079289836500568, + "loss": 1.9046, + "step": 22279 + }, + { + "epoch": 2.5994633064986585, + "grad_norm": 1.0715876817703247, + "learning_rate": 0.00010077868743972165, + "loss": 2.0848, + "step": 22280 + }, + { + "epoch": 2.5995799789989498, + "grad_norm": 1.1012687683105469, + "learning_rate": 0.00010076447704026001, + "loss": 2.141, + "step": 22281 + }, + { + "epoch": 2.599696651499242, + "grad_norm": 1.088653564453125, + "learning_rate": 0.00010075026716676818, + "loss": 1.9301, + "step": 22282 + }, + { + "epoch": 2.599813323999533, + "grad_norm": 1.0576368570327759, + "learning_rate": 0.0001007360578193934, + "loss": 1.9902, + "step": 22283 + }, + { + "epoch": 2.5999299964998253, + "grad_norm": 0.9730786085128784, + "learning_rate": 0.00010072184899828304, + "loss": 1.7653, + "step": 22284 + }, + { + "epoch": 2.6000466690001165, + "grad_norm": 1.1130229234695435, + "learning_rate": 0.00010070764070358435, + "loss": 1.8022, + "step": 22285 + }, + { + "epoch": 2.6001633415004086, + "grad_norm": 1.1756037473678589, + "learning_rate": 0.00010069343293544469, + "loss": 1.9607, + "step": 22286 + }, + { + "epoch": 2.6002800140007, + "grad_norm": 1.2158435583114624, + "learning_rate": 0.00010067922569401127, + "loss": 2.0816, + "step": 22287 + }, + { + "epoch": 2.600396686500992, + "grad_norm": 1.1122448444366455, + "learning_rate": 0.00010066501897943146, + "loss": 2.0503, + "step": 22288 + }, + { + "epoch": 2.600513359001283, + "grad_norm": 1.0910981893539429, + "learning_rate": 0.0001006508127918524, + "loss": 1.9882, + "step": 22289 + }, + { + "epoch": 2.6006300315015753, + "grad_norm": 1.0726008415222168, + "learning_rate": 0.00010063660713142148, + "loss": 2.0488, + "step": 22290 + }, + { + "epoch": 2.6007467040018666, + "grad_norm": 1.020005464553833, + "learning_rate": 0.000100622401998286, + "loss": 1.8546, + "step": 22291 + }, + { + "epoch": 2.6008633765021587, + "grad_norm": 1.1987708806991577, + "learning_rate": 0.00010060819739259313, + "loss": 1.9097, + "step": 22292 + }, + { + "epoch": 2.60098004900245, + "grad_norm": 1.208176851272583, + "learning_rate": 0.00010059399331449018, + "loss": 2.0264, + "step": 22293 + }, + { + "epoch": 2.601096721502742, + "grad_norm": 1.190165638923645, + "learning_rate": 0.00010057978976412438, + "loss": 2.017, + "step": 22294 + }, + { + "epoch": 2.6012133940030333, + "grad_norm": 1.2236994504928589, + "learning_rate": 0.00010056558674164298, + "loss": 2.0275, + "step": 22295 + }, + { + "epoch": 2.6013300665033254, + "grad_norm": 1.1971408128738403, + "learning_rate": 0.00010055138424719325, + "loss": 2.0167, + "step": 22296 + }, + { + "epoch": 2.6014467390036167, + "grad_norm": 1.2293801307678223, + "learning_rate": 0.00010053718228092241, + "loss": 1.9552, + "step": 22297 + }, + { + "epoch": 2.601563411503909, + "grad_norm": 1.2187796831130981, + "learning_rate": 0.00010052298084297764, + "loss": 1.8362, + "step": 22298 + }, + { + "epoch": 2.6016800840042, + "grad_norm": 1.2513080835342407, + "learning_rate": 0.00010050877993350624, + "loss": 1.9596, + "step": 22299 + }, + { + "epoch": 2.601796756504492, + "grad_norm": 1.1505564451217651, + "learning_rate": 0.00010049457955265534, + "loss": 1.8613, + "step": 22300 + }, + { + "epoch": 2.6019134290047834, + "grad_norm": 1.0961973667144775, + "learning_rate": 0.00010048037970057228, + "loss": 1.9399, + "step": 22301 + }, + { + "epoch": 2.6020301015050755, + "grad_norm": 1.1121751070022583, + "learning_rate": 0.00010046618037740413, + "loss": 2.1121, + "step": 22302 + }, + { + "epoch": 2.6021467740053668, + "grad_norm": 1.1753547191619873, + "learning_rate": 0.00010045198158329821, + "loss": 1.8738, + "step": 22303 + }, + { + "epoch": 2.602263446505659, + "grad_norm": 1.272449016571045, + "learning_rate": 0.00010043778331840168, + "loss": 2.044, + "step": 22304 + }, + { + "epoch": 2.60238011900595, + "grad_norm": 1.1451548337936401, + "learning_rate": 0.00010042358558286166, + "loss": 1.8881, + "step": 22305 + }, + { + "epoch": 2.6024967915062422, + "grad_norm": 1.2487468719482422, + "learning_rate": 0.00010040938837682547, + "loss": 1.8777, + "step": 22306 + }, + { + "epoch": 2.6026134640065335, + "grad_norm": 1.0503771305084229, + "learning_rate": 0.00010039519170044015, + "loss": 2.1089, + "step": 22307 + }, + { + "epoch": 2.6027301365068256, + "grad_norm": 1.1282902956008911, + "learning_rate": 0.00010038099555385298, + "loss": 1.9666, + "step": 22308 + }, + { + "epoch": 2.602846809007117, + "grad_norm": 1.1191010475158691, + "learning_rate": 0.00010036679993721107, + "loss": 2.0409, + "step": 22309 + }, + { + "epoch": 2.602963481507409, + "grad_norm": 1.1913437843322754, + "learning_rate": 0.00010035260485066165, + "loss": 2.0542, + "step": 22310 + }, + { + "epoch": 2.6030801540077, + "grad_norm": 1.1574242115020752, + "learning_rate": 0.00010033841029435178, + "loss": 1.9503, + "step": 22311 + }, + { + "epoch": 2.6031968265079923, + "grad_norm": 1.3418517112731934, + "learning_rate": 0.00010032421626842873, + "loss": 1.9249, + "step": 22312 + }, + { + "epoch": 2.6033134990082836, + "grad_norm": 1.2639504671096802, + "learning_rate": 0.00010031002277303954, + "loss": 1.9856, + "step": 22313 + }, + { + "epoch": 2.6034301715085757, + "grad_norm": 1.0815764665603638, + "learning_rate": 0.00010029582980833145, + "loss": 1.9254, + "step": 22314 + }, + { + "epoch": 2.603546844008867, + "grad_norm": 1.2291194200515747, + "learning_rate": 0.00010028163737445147, + "loss": 2.0682, + "step": 22315 + }, + { + "epoch": 2.603663516509159, + "grad_norm": 1.1371369361877441, + "learning_rate": 0.00010026744547154689, + "loss": 1.976, + "step": 22316 + }, + { + "epoch": 2.6037801890094503, + "grad_norm": 1.1635907888412476, + "learning_rate": 0.00010025325409976467, + "loss": 1.794, + "step": 22317 + }, + { + "epoch": 2.6038968615097424, + "grad_norm": 1.3681590557098389, + "learning_rate": 0.00010023906325925204, + "loss": 2.105, + "step": 22318 + }, + { + "epoch": 2.6040135340100337, + "grad_norm": 1.2182399034500122, + "learning_rate": 0.00010022487295015615, + "loss": 2.0425, + "step": 22319 + }, + { + "epoch": 2.604130206510326, + "grad_norm": 1.1311312913894653, + "learning_rate": 0.00010021068317262401, + "loss": 1.9416, + "step": 22320 + }, + { + "epoch": 2.604246879010617, + "grad_norm": 1.2637176513671875, + "learning_rate": 0.00010019649392680277, + "loss": 2.1231, + "step": 22321 + }, + { + "epoch": 2.604363551510909, + "grad_norm": 1.1700536012649536, + "learning_rate": 0.0001001823052128395, + "loss": 2.0356, + "step": 22322 + }, + { + "epoch": 2.6044802240112004, + "grad_norm": 1.2189371585845947, + "learning_rate": 0.00010016811703088136, + "loss": 2.0847, + "step": 22323 + }, + { + "epoch": 2.6045968965114925, + "grad_norm": 1.2053362131118774, + "learning_rate": 0.00010015392938107531, + "loss": 1.8647, + "step": 22324 + }, + { + "epoch": 2.6047135690117837, + "grad_norm": 1.2051862478256226, + "learning_rate": 0.0001001397422635686, + "loss": 1.8911, + "step": 22325 + }, + { + "epoch": 2.604830241512076, + "grad_norm": 1.2395808696746826, + "learning_rate": 0.00010012555567850817, + "loss": 2.1086, + "step": 22326 + }, + { + "epoch": 2.604946914012367, + "grad_norm": 0.9574363827705383, + "learning_rate": 0.00010011136962604118, + "loss": 1.8142, + "step": 22327 + }, + { + "epoch": 2.6050635865126592, + "grad_norm": 1.07261323928833, + "learning_rate": 0.00010009718410631461, + "loss": 2.0436, + "step": 22328 + }, + { + "epoch": 2.6051802590129505, + "grad_norm": 1.0544800758361816, + "learning_rate": 0.00010008299911947561, + "loss": 1.8655, + "step": 22329 + }, + { + "epoch": 2.6052969315132426, + "grad_norm": 1.2437992095947266, + "learning_rate": 0.00010006881466567115, + "loss": 2.1388, + "step": 22330 + }, + { + "epoch": 2.605413604013534, + "grad_norm": 1.1501684188842773, + "learning_rate": 0.00010005463074504833, + "loss": 2.0091, + "step": 22331 + }, + { + "epoch": 2.605530276513826, + "grad_norm": 1.1193910837173462, + "learning_rate": 0.0001000404473577542, + "loss": 2.0754, + "step": 22332 + }, + { + "epoch": 2.605646949014117, + "grad_norm": 1.024642825126648, + "learning_rate": 0.00010002626450393574, + "loss": 1.9914, + "step": 22333 + }, + { + "epoch": 2.6057636215144093, + "grad_norm": 1.0630823373794556, + "learning_rate": 0.00010001208218374008, + "loss": 1.8619, + "step": 22334 + }, + { + "epoch": 2.6058802940147006, + "grad_norm": 1.0841144323349, + "learning_rate": 9.999790039731415e-05, + "loss": 1.833, + "step": 22335 + }, + { + "epoch": 2.6059969665149927, + "grad_norm": 1.1183569431304932, + "learning_rate": 9.998371914480503e-05, + "loss": 1.993, + "step": 22336 + }, + { + "epoch": 2.606113639015284, + "grad_norm": 1.1256258487701416, + "learning_rate": 9.996953842635968e-05, + "loss": 2.0089, + "step": 22337 + }, + { + "epoch": 2.606230311515576, + "grad_norm": 1.1162177324295044, + "learning_rate": 9.99553582421252e-05, + "loss": 1.8093, + "step": 22338 + }, + { + "epoch": 2.6063469840158673, + "grad_norm": 1.0006636381149292, + "learning_rate": 9.99411785922485e-05, + "loss": 1.9459, + "step": 22339 + }, + { + "epoch": 2.6064636565161594, + "grad_norm": 1.1088480949401855, + "learning_rate": 9.992699947687666e-05, + "loss": 1.8962, + "step": 22340 + }, + { + "epoch": 2.6065803290164506, + "grad_norm": 1.187496542930603, + "learning_rate": 9.991282089615655e-05, + "loss": 2.0123, + "step": 22341 + }, + { + "epoch": 2.6066970015167428, + "grad_norm": 1.1837810277938843, + "learning_rate": 9.989864285023532e-05, + "loss": 1.9065, + "step": 22342 + }, + { + "epoch": 2.606813674017034, + "grad_norm": 1.083717942237854, + "learning_rate": 9.988446533925982e-05, + "loss": 1.8591, + "step": 22343 + }, + { + "epoch": 2.606930346517326, + "grad_norm": 1.151214361190796, + "learning_rate": 9.987028836337706e-05, + "loss": 2.0809, + "step": 22344 + }, + { + "epoch": 2.6070470190176174, + "grad_norm": 1.1640454530715942, + "learning_rate": 9.98561119227341e-05, + "loss": 2.0345, + "step": 22345 + }, + { + "epoch": 2.6071636915179095, + "grad_norm": 1.1706862449645996, + "learning_rate": 9.984193601747782e-05, + "loss": 2.0168, + "step": 22346 + }, + { + "epoch": 2.6072803640182007, + "grad_norm": 1.185386300086975, + "learning_rate": 9.982776064775518e-05, + "loss": 1.9476, + "step": 22347 + }, + { + "epoch": 2.607397036518493, + "grad_norm": 1.2532882690429688, + "learning_rate": 9.981358581371315e-05, + "loss": 1.8948, + "step": 22348 + }, + { + "epoch": 2.607513709018784, + "grad_norm": 1.1529532670974731, + "learning_rate": 9.979941151549869e-05, + "loss": 2.0078, + "step": 22349 + }, + { + "epoch": 2.6076303815190762, + "grad_norm": 1.1234912872314453, + "learning_rate": 9.978523775325873e-05, + "loss": 1.9683, + "step": 22350 + }, + { + "epoch": 2.6077470540193675, + "grad_norm": 1.0950908660888672, + "learning_rate": 9.977106452714021e-05, + "loss": 1.9173, + "step": 22351 + }, + { + "epoch": 2.6078637265196596, + "grad_norm": 1.1783608198165894, + "learning_rate": 9.975689183729004e-05, + "loss": 2.0557, + "step": 22352 + }, + { + "epoch": 2.607980399019951, + "grad_norm": 0.9972140789031982, + "learning_rate": 9.974271968385519e-05, + "loss": 1.742, + "step": 22353 + }, + { + "epoch": 2.608097071520243, + "grad_norm": 1.145187497138977, + "learning_rate": 9.972854806698254e-05, + "loss": 2.0015, + "step": 22354 + }, + { + "epoch": 2.608213744020534, + "grad_norm": 1.150643229484558, + "learning_rate": 9.971437698681907e-05, + "loss": 1.8328, + "step": 22355 + }, + { + "epoch": 2.6083304165208263, + "grad_norm": 1.0199464559555054, + "learning_rate": 9.970020644351162e-05, + "loss": 1.884, + "step": 22356 + }, + { + "epoch": 2.6084470890211175, + "grad_norm": 1.0186750888824463, + "learning_rate": 9.96860364372071e-05, + "loss": 1.8309, + "step": 22357 + }, + { + "epoch": 2.6085637615214097, + "grad_norm": 1.0307646989822388, + "learning_rate": 9.967186696805247e-05, + "loss": 2.0892, + "step": 22358 + }, + { + "epoch": 2.608680434021701, + "grad_norm": 1.1713972091674805, + "learning_rate": 9.965769803619453e-05, + "loss": 1.8379, + "step": 22359 + }, + { + "epoch": 2.608797106521993, + "grad_norm": 1.3265137672424316, + "learning_rate": 9.96435296417803e-05, + "loss": 2.1634, + "step": 22360 + }, + { + "epoch": 2.6089137790222843, + "grad_norm": 1.076257586479187, + "learning_rate": 9.962936178495652e-05, + "loss": 1.9568, + "step": 22361 + }, + { + "epoch": 2.6090304515225764, + "grad_norm": 1.188939094543457, + "learning_rate": 9.961519446587017e-05, + "loss": 2.052, + "step": 22362 + }, + { + "epoch": 2.6091471240228676, + "grad_norm": 1.0310481786727905, + "learning_rate": 9.960102768466806e-05, + "loss": 1.9367, + "step": 22363 + }, + { + "epoch": 2.6092637965231598, + "grad_norm": 1.0577350854873657, + "learning_rate": 9.958686144149708e-05, + "loss": 1.9892, + "step": 22364 + }, + { + "epoch": 2.609380469023451, + "grad_norm": 1.072775959968567, + "learning_rate": 9.957269573650407e-05, + "loss": 2.0157, + "step": 22365 + }, + { + "epoch": 2.609497141523743, + "grad_norm": 1.0959402322769165, + "learning_rate": 9.955853056983597e-05, + "loss": 1.9827, + "step": 22366 + }, + { + "epoch": 2.6096138140240344, + "grad_norm": 1.106845736503601, + "learning_rate": 9.954436594163945e-05, + "loss": 1.9524, + "step": 22367 + }, + { + "epoch": 2.6097304865243265, + "grad_norm": 1.1212705373764038, + "learning_rate": 9.953020185206156e-05, + "loss": 1.8112, + "step": 22368 + }, + { + "epoch": 2.6098471590246177, + "grad_norm": 1.4368188381195068, + "learning_rate": 9.951603830124897e-05, + "loss": 2.1376, + "step": 22369 + }, + { + "epoch": 2.60996383152491, + "grad_norm": 1.0639054775238037, + "learning_rate": 9.950187528934856e-05, + "loss": 2.0279, + "step": 22370 + }, + { + "epoch": 2.610080504025201, + "grad_norm": 1.1504812240600586, + "learning_rate": 9.94877128165073e-05, + "loss": 2.0619, + "step": 22371 + }, + { + "epoch": 2.610197176525493, + "grad_norm": 1.3829822540283203, + "learning_rate": 9.94735508828718e-05, + "loss": 2.0096, + "step": 22372 + }, + { + "epoch": 2.6103138490257844, + "grad_norm": 1.112600564956665, + "learning_rate": 9.945938948858904e-05, + "loss": 1.9429, + "step": 22373 + }, + { + "epoch": 2.6104305215260766, + "grad_norm": 1.0772408246994019, + "learning_rate": 9.944522863380565e-05, + "loss": 1.9205, + "step": 22374 + }, + { + "epoch": 2.610547194026368, + "grad_norm": 1.156744360923767, + "learning_rate": 9.943106831866862e-05, + "loss": 2.0113, + "step": 22375 + }, + { + "epoch": 2.61066386652666, + "grad_norm": 1.0659730434417725, + "learning_rate": 9.94169085433246e-05, + "loss": 1.9231, + "step": 22376 + }, + { + "epoch": 2.610780539026951, + "grad_norm": 1.144572377204895, + "learning_rate": 9.940274930792053e-05, + "loss": 1.9686, + "step": 22377 + }, + { + "epoch": 2.6108972115272433, + "grad_norm": 1.160454273223877, + "learning_rate": 9.938859061260305e-05, + "loss": 1.9898, + "step": 22378 + }, + { + "epoch": 2.6110138840275345, + "grad_norm": 1.1423481702804565, + "learning_rate": 9.937443245751906e-05, + "loss": 1.9677, + "step": 22379 + }, + { + "epoch": 2.6111305565278267, + "grad_norm": 1.1030253171920776, + "learning_rate": 9.936027484281521e-05, + "loss": 1.7017, + "step": 22380 + }, + { + "epoch": 2.611247229028118, + "grad_norm": 1.179678201675415, + "learning_rate": 9.934611776863844e-05, + "loss": 1.8724, + "step": 22381 + }, + { + "epoch": 2.61136390152841, + "grad_norm": 1.0611729621887207, + "learning_rate": 9.93319612351354e-05, + "loss": 1.9018, + "step": 22382 + }, + { + "epoch": 2.6114805740287013, + "grad_norm": 1.1434566974639893, + "learning_rate": 9.931780524245279e-05, + "loss": 2.0739, + "step": 22383 + }, + { + "epoch": 2.6115972465289934, + "grad_norm": 1.0596354007720947, + "learning_rate": 9.930364979073756e-05, + "loss": 2.0225, + "step": 22384 + }, + { + "epoch": 2.6117139190292846, + "grad_norm": 1.1033598184585571, + "learning_rate": 9.928949488013625e-05, + "loss": 2.0042, + "step": 22385 + }, + { + "epoch": 2.6118305915295768, + "grad_norm": 1.2209186553955078, + "learning_rate": 9.927534051079575e-05, + "loss": 2.0646, + "step": 22386 + }, + { + "epoch": 2.611947264029868, + "grad_norm": 1.0417975187301636, + "learning_rate": 9.926118668286267e-05, + "loss": 1.9173, + "step": 22387 + }, + { + "epoch": 2.61206393653016, + "grad_norm": 1.1655194759368896, + "learning_rate": 9.924703339648391e-05, + "loss": 2.0335, + "step": 22388 + }, + { + "epoch": 2.6121806090304514, + "grad_norm": 1.209490418434143, + "learning_rate": 9.923288065180602e-05, + "loss": 2.0415, + "step": 22389 + }, + { + "epoch": 2.6122972815307435, + "grad_norm": 1.2838491201400757, + "learning_rate": 9.921872844897583e-05, + "loss": 1.9905, + "step": 22390 + }, + { + "epoch": 2.6124139540310347, + "grad_norm": 1.0669666528701782, + "learning_rate": 9.920457678814002e-05, + "loss": 1.974, + "step": 22391 + }, + { + "epoch": 2.612530626531327, + "grad_norm": 1.0897725820541382, + "learning_rate": 9.919042566944535e-05, + "loss": 1.8769, + "step": 22392 + }, + { + "epoch": 2.612647299031618, + "grad_norm": 0.9711970090866089, + "learning_rate": 9.91762750930384e-05, + "loss": 1.9059, + "step": 22393 + }, + { + "epoch": 2.61276397153191, + "grad_norm": 1.1851043701171875, + "learning_rate": 9.916212505906598e-05, + "loss": 2.1706, + "step": 22394 + }, + { + "epoch": 2.6128806440322014, + "grad_norm": 1.0503730773925781, + "learning_rate": 9.91479755676747e-05, + "loss": 1.7841, + "step": 22395 + }, + { + "epoch": 2.6129973165324936, + "grad_norm": 1.2829357385635376, + "learning_rate": 9.913382661901129e-05, + "loss": 2.0475, + "step": 22396 + }, + { + "epoch": 2.613113989032785, + "grad_norm": 0.9998330473899841, + "learning_rate": 9.911967821322248e-05, + "loss": 1.8296, + "step": 22397 + }, + { + "epoch": 2.613230661533077, + "grad_norm": 1.0358481407165527, + "learning_rate": 9.910553035045487e-05, + "loss": 1.9039, + "step": 22398 + }, + { + "epoch": 2.613347334033368, + "grad_norm": 1.1702498197555542, + "learning_rate": 9.90913830308552e-05, + "loss": 1.998, + "step": 22399 + }, + { + "epoch": 2.6134640065336603, + "grad_norm": 1.113500952720642, + "learning_rate": 9.907723625457003e-05, + "loss": 2.013, + "step": 22400 + }, + { + "epoch": 2.6135806790339515, + "grad_norm": 1.0226913690567017, + "learning_rate": 9.906309002174612e-05, + "loss": 1.948, + "step": 22401 + }, + { + "epoch": 2.6136973515342437, + "grad_norm": 1.0269509553909302, + "learning_rate": 9.904894433253004e-05, + "loss": 1.9792, + "step": 22402 + }, + { + "epoch": 2.613814024034535, + "grad_norm": 0.9832384586334229, + "learning_rate": 9.90347991870685e-05, + "loss": 1.7794, + "step": 22403 + }, + { + "epoch": 2.613930696534827, + "grad_norm": 1.1819219589233398, + "learning_rate": 9.90206545855081e-05, + "loss": 1.6946, + "step": 22404 + }, + { + "epoch": 2.6140473690351183, + "grad_norm": 1.0833107233047485, + "learning_rate": 9.900651052799551e-05, + "loss": 1.8481, + "step": 22405 + }, + { + "epoch": 2.6141640415354104, + "grad_norm": 1.097419261932373, + "learning_rate": 9.89923670146773e-05, + "loss": 1.9719, + "step": 22406 + }, + { + "epoch": 2.6142807140357016, + "grad_norm": 1.1849088668823242, + "learning_rate": 9.89782240457002e-05, + "loss": 1.9019, + "step": 22407 + }, + { + "epoch": 2.6143973865359937, + "grad_norm": 1.3185099363327026, + "learning_rate": 9.896408162121075e-05, + "loss": 2.0223, + "step": 22408 + }, + { + "epoch": 2.614514059036285, + "grad_norm": 1.1122528314590454, + "learning_rate": 9.894993974135554e-05, + "loss": 2.0099, + "step": 22409 + }, + { + "epoch": 2.614630731536577, + "grad_norm": 1.120535969734192, + "learning_rate": 9.893579840628127e-05, + "loss": 2.0017, + "step": 22410 + }, + { + "epoch": 2.6147474040368683, + "grad_norm": 1.1488122940063477, + "learning_rate": 9.892165761613444e-05, + "loss": 1.9902, + "step": 22411 + }, + { + "epoch": 2.6148640765371605, + "grad_norm": 1.0988337993621826, + "learning_rate": 9.890751737106174e-05, + "loss": 1.9155, + "step": 22412 + }, + { + "epoch": 2.6149807490374517, + "grad_norm": 1.1354140043258667, + "learning_rate": 9.889337767120964e-05, + "loss": 1.9948, + "step": 22413 + }, + { + "epoch": 2.615097421537744, + "grad_norm": 1.2489049434661865, + "learning_rate": 9.887923851672488e-05, + "loss": 2.1031, + "step": 22414 + }, + { + "epoch": 2.615214094038035, + "grad_norm": 1.3220852613449097, + "learning_rate": 9.886509990775394e-05, + "loss": 2.0201, + "step": 22415 + }, + { + "epoch": 2.615330766538327, + "grad_norm": 1.1059750318527222, + "learning_rate": 9.885096184444341e-05, + "loss": 1.9008, + "step": 22416 + }, + { + "epoch": 2.6154474390386184, + "grad_norm": 1.424281358718872, + "learning_rate": 9.883682432693985e-05, + "loss": 2.0708, + "step": 22417 + }, + { + "epoch": 2.6155641115389106, + "grad_norm": 1.0911924839019775, + "learning_rate": 9.882268735538985e-05, + "loss": 1.8935, + "step": 22418 + }, + { + "epoch": 2.615680784039202, + "grad_norm": 1.328527569770813, + "learning_rate": 9.880855092993989e-05, + "loss": 2.0465, + "step": 22419 + }, + { + "epoch": 2.615797456539494, + "grad_norm": 1.035265564918518, + "learning_rate": 9.879441505073668e-05, + "loss": 2.0594, + "step": 22420 + }, + { + "epoch": 2.615914129039785, + "grad_norm": 1.0903888940811157, + "learning_rate": 9.878027971792656e-05, + "loss": 1.9227, + "step": 22421 + }, + { + "epoch": 2.6160308015400773, + "grad_norm": 1.0112380981445312, + "learning_rate": 9.87661449316562e-05, + "loss": 1.9904, + "step": 22422 + }, + { + "epoch": 2.6161474740403685, + "grad_norm": 1.0181983709335327, + "learning_rate": 9.875201069207213e-05, + "loss": 1.9399, + "step": 22423 + }, + { + "epoch": 2.6162641465406606, + "grad_norm": 1.1493724584579468, + "learning_rate": 9.873787699932083e-05, + "loss": 1.8072, + "step": 22424 + }, + { + "epoch": 2.616380819040952, + "grad_norm": 1.2664783000946045, + "learning_rate": 9.872374385354892e-05, + "loss": 1.8595, + "step": 22425 + }, + { + "epoch": 2.616497491541244, + "grad_norm": 1.1790039539337158, + "learning_rate": 9.870961125490277e-05, + "loss": 1.9742, + "step": 22426 + }, + { + "epoch": 2.6166141640415352, + "grad_norm": 1.0669574737548828, + "learning_rate": 9.8695479203529e-05, + "loss": 1.9327, + "step": 22427 + }, + { + "epoch": 2.6167308365418274, + "grad_norm": 1.1268373727798462, + "learning_rate": 9.868134769957405e-05, + "loss": 2.1234, + "step": 22428 + }, + { + "epoch": 2.6168475090421186, + "grad_norm": 1.133703351020813, + "learning_rate": 9.86672167431845e-05, + "loss": 1.863, + "step": 22429 + }, + { + "epoch": 2.6169641815424107, + "grad_norm": 1.2534526586532593, + "learning_rate": 9.865308633450673e-05, + "loss": 2.0958, + "step": 22430 + }, + { + "epoch": 2.617080854042702, + "grad_norm": 1.1771008968353271, + "learning_rate": 9.863895647368738e-05, + "loss": 1.9081, + "step": 22431 + }, + { + "epoch": 2.617197526542994, + "grad_norm": 1.2662835121154785, + "learning_rate": 9.862482716087278e-05, + "loss": 1.9326, + "step": 22432 + }, + { + "epoch": 2.6173141990432853, + "grad_norm": 1.1020963191986084, + "learning_rate": 9.861069839620951e-05, + "loss": 1.7721, + "step": 22433 + }, + { + "epoch": 2.6174308715435775, + "grad_norm": 1.1612952947616577, + "learning_rate": 9.859657017984398e-05, + "loss": 2.0608, + "step": 22434 + }, + { + "epoch": 2.6175475440438687, + "grad_norm": 1.2428828477859497, + "learning_rate": 9.858244251192267e-05, + "loss": 1.9144, + "step": 22435 + }, + { + "epoch": 2.617664216544161, + "grad_norm": 1.0529990196228027, + "learning_rate": 9.856831539259207e-05, + "loss": 1.965, + "step": 22436 + }, + { + "epoch": 2.617780889044452, + "grad_norm": 1.106724500656128, + "learning_rate": 9.85541888219986e-05, + "loss": 1.9381, + "step": 22437 + }, + { + "epoch": 2.617897561544744, + "grad_norm": 1.035521388053894, + "learning_rate": 9.854006280028875e-05, + "loss": 1.9064, + "step": 22438 + }, + { + "epoch": 2.6180142340450354, + "grad_norm": 1.284877896308899, + "learning_rate": 9.852593732760892e-05, + "loss": 2.0141, + "step": 22439 + }, + { + "epoch": 2.6181309065453275, + "grad_norm": 0.9756803512573242, + "learning_rate": 9.851181240410558e-05, + "loss": 1.7844, + "step": 22440 + }, + { + "epoch": 2.618247579045619, + "grad_norm": 1.1991831064224243, + "learning_rate": 9.849768802992515e-05, + "loss": 1.9706, + "step": 22441 + }, + { + "epoch": 2.618364251545911, + "grad_norm": 1.2457456588745117, + "learning_rate": 9.848356420521406e-05, + "loss": 2.0644, + "step": 22442 + }, + { + "epoch": 2.618480924046202, + "grad_norm": 1.0761338472366333, + "learning_rate": 9.846944093011865e-05, + "loss": 2.026, + "step": 22443 + }, + { + "epoch": 2.6185975965464943, + "grad_norm": 1.0237587690353394, + "learning_rate": 9.84553182047855e-05, + "loss": 2.206, + "step": 22444 + }, + { + "epoch": 2.6187142690467855, + "grad_norm": 1.1680535078048706, + "learning_rate": 9.844119602936084e-05, + "loss": 2.0185, + "step": 22445 + }, + { + "epoch": 2.6188309415470776, + "grad_norm": 1.2762547731399536, + "learning_rate": 9.842707440399123e-05, + "loss": 2.1183, + "step": 22446 + }, + { + "epoch": 2.618947614047369, + "grad_norm": 1.0921677350997925, + "learning_rate": 9.841295332882293e-05, + "loss": 1.9954, + "step": 22447 + }, + { + "epoch": 2.619064286547661, + "grad_norm": 1.1094398498535156, + "learning_rate": 9.839883280400242e-05, + "loss": 2.0125, + "step": 22448 + }, + { + "epoch": 2.6191809590479522, + "grad_norm": 1.1301144361495972, + "learning_rate": 9.838471282967609e-05, + "loss": 2.0359, + "step": 22449 + }, + { + "epoch": 2.6192976315482444, + "grad_norm": 1.3676722049713135, + "learning_rate": 9.837059340599027e-05, + "loss": 1.8706, + "step": 22450 + }, + { + "epoch": 2.6194143040485356, + "grad_norm": 1.2620298862457275, + "learning_rate": 9.835647453309142e-05, + "loss": 2.0152, + "step": 22451 + }, + { + "epoch": 2.6195309765488277, + "grad_norm": 1.1790390014648438, + "learning_rate": 9.834235621112576e-05, + "loss": 2.0839, + "step": 22452 + }, + { + "epoch": 2.619647649049119, + "grad_norm": 1.3474087715148926, + "learning_rate": 9.832823844023979e-05, + "loss": 2.0663, + "step": 22453 + }, + { + "epoch": 2.619764321549411, + "grad_norm": 1.0623444318771362, + "learning_rate": 9.831412122057978e-05, + "loss": 2.0532, + "step": 22454 + }, + { + "epoch": 2.6198809940497023, + "grad_norm": 1.0832723379135132, + "learning_rate": 9.830000455229217e-05, + "loss": 1.9242, + "step": 22455 + }, + { + "epoch": 2.6199976665499944, + "grad_norm": 1.1865179538726807, + "learning_rate": 9.828588843552319e-05, + "loss": 1.9501, + "step": 22456 + }, + { + "epoch": 2.6201143390502857, + "grad_norm": 1.0966476202011108, + "learning_rate": 9.827177287041931e-05, + "loss": 1.9127, + "step": 22457 + }, + { + "epoch": 2.620231011550578, + "grad_norm": 1.0974876880645752, + "learning_rate": 9.825765785712677e-05, + "loss": 1.8081, + "step": 22458 + }, + { + "epoch": 2.620347684050869, + "grad_norm": 1.0989435911178589, + "learning_rate": 9.82435433957919e-05, + "loss": 1.7666, + "step": 22459 + }, + { + "epoch": 2.620464356551161, + "grad_norm": 1.1911052465438843, + "learning_rate": 9.82294294865611e-05, + "loss": 2.0089, + "step": 22460 + }, + { + "epoch": 2.6205810290514524, + "grad_norm": 1.1160163879394531, + "learning_rate": 9.82153161295806e-05, + "loss": 2.0433, + "step": 22461 + }, + { + "epoch": 2.6206977015517445, + "grad_norm": 1.2953076362609863, + "learning_rate": 9.820120332499679e-05, + "loss": 1.8645, + "step": 22462 + }, + { + "epoch": 2.6208143740520358, + "grad_norm": 1.03708815574646, + "learning_rate": 9.81870910729559e-05, + "loss": 1.8487, + "step": 22463 + }, + { + "epoch": 2.620931046552328, + "grad_norm": 0.9982561469078064, + "learning_rate": 9.817297937360432e-05, + "loss": 1.8837, + "step": 22464 + }, + { + "epoch": 2.621047719052619, + "grad_norm": 1.306571364402771, + "learning_rate": 9.815886822708824e-05, + "loss": 2.0028, + "step": 22465 + }, + { + "epoch": 2.6211643915529113, + "grad_norm": 1.1463911533355713, + "learning_rate": 9.814475763355403e-05, + "loss": 1.8572, + "step": 22466 + }, + { + "epoch": 2.6212810640532025, + "grad_norm": 1.1439955234527588, + "learning_rate": 9.813064759314793e-05, + "loss": 2.0379, + "step": 22467 + }, + { + "epoch": 2.6213977365534946, + "grad_norm": 1.102344036102295, + "learning_rate": 9.811653810601628e-05, + "loss": 2.0453, + "step": 22468 + }, + { + "epoch": 2.621514409053786, + "grad_norm": 1.113369107246399, + "learning_rate": 9.810242917230521e-05, + "loss": 1.8732, + "step": 22469 + }, + { + "epoch": 2.621631081554078, + "grad_norm": 1.3866685628890991, + "learning_rate": 9.808832079216115e-05, + "loss": 2.1097, + "step": 22470 + }, + { + "epoch": 2.621747754054369, + "grad_norm": 1.4210484027862549, + "learning_rate": 9.807421296573023e-05, + "loss": 1.8694, + "step": 22471 + }, + { + "epoch": 2.6218644265546613, + "grad_norm": 1.1253539323806763, + "learning_rate": 9.806010569315883e-05, + "loss": 1.975, + "step": 22472 + }, + { + "epoch": 2.6219810990549526, + "grad_norm": 1.050061583518982, + "learning_rate": 9.804599897459307e-05, + "loss": 2.0854, + "step": 22473 + }, + { + "epoch": 2.6220977715552447, + "grad_norm": 1.1104063987731934, + "learning_rate": 9.803189281017929e-05, + "loss": 1.9211, + "step": 22474 + }, + { + "epoch": 2.622214444055536, + "grad_norm": 1.1952219009399414, + "learning_rate": 9.801778720006369e-05, + "loss": 1.9821, + "step": 22475 + }, + { + "epoch": 2.622331116555828, + "grad_norm": 1.2382510900497437, + "learning_rate": 9.800368214439248e-05, + "loss": 1.8625, + "step": 22476 + }, + { + "epoch": 2.6224477890561193, + "grad_norm": 1.0040005445480347, + "learning_rate": 9.798957764331195e-05, + "loss": 2.0306, + "step": 22477 + }, + { + "epoch": 2.6225644615564114, + "grad_norm": 1.1749653816223145, + "learning_rate": 9.797547369696822e-05, + "loss": 1.872, + "step": 22478 + }, + { + "epoch": 2.6226811340567027, + "grad_norm": 1.1850521564483643, + "learning_rate": 9.796137030550763e-05, + "loss": 1.9582, + "step": 22479 + }, + { + "epoch": 2.622797806556995, + "grad_norm": 1.0544941425323486, + "learning_rate": 9.794726746907624e-05, + "loss": 1.8621, + "step": 22480 + }, + { + "epoch": 2.622914479057286, + "grad_norm": 1.1590267419815063, + "learning_rate": 9.793316518782043e-05, + "loss": 1.9534, + "step": 22481 + }, + { + "epoch": 2.623031151557578, + "grad_norm": 1.1425042152404785, + "learning_rate": 9.79190634618862e-05, + "loss": 2.0415, + "step": 22482 + }, + { + "epoch": 2.6231478240578694, + "grad_norm": 1.113862156867981, + "learning_rate": 9.79049622914199e-05, + "loss": 1.9803, + "step": 22483 + }, + { + "epoch": 2.6232644965581615, + "grad_norm": 1.2883766889572144, + "learning_rate": 9.789086167656765e-05, + "loss": 2.1116, + "step": 22484 + }, + { + "epoch": 2.6233811690584528, + "grad_norm": 1.0455504655838013, + "learning_rate": 9.787676161747565e-05, + "loss": 2.0285, + "step": 22485 + }, + { + "epoch": 2.623497841558745, + "grad_norm": 1.291536569595337, + "learning_rate": 9.786266211429001e-05, + "loss": 1.8452, + "step": 22486 + }, + { + "epoch": 2.623614514059036, + "grad_norm": 1.227568507194519, + "learning_rate": 9.784856316715697e-05, + "loss": 1.9295, + "step": 22487 + }, + { + "epoch": 2.6237311865593282, + "grad_norm": 1.059620976448059, + "learning_rate": 9.783446477622272e-05, + "loss": 1.9539, + "step": 22488 + }, + { + "epoch": 2.6238478590596195, + "grad_norm": 1.1522555351257324, + "learning_rate": 9.782036694163327e-05, + "loss": 2.0214, + "step": 22489 + }, + { + "epoch": 2.6239645315599116, + "grad_norm": 1.0158381462097168, + "learning_rate": 9.780626966353496e-05, + "loss": 1.9881, + "step": 22490 + }, + { + "epoch": 2.624081204060203, + "grad_norm": 1.2476527690887451, + "learning_rate": 9.779217294207378e-05, + "loss": 1.934, + "step": 22491 + }, + { + "epoch": 2.624197876560495, + "grad_norm": 1.1030951738357544, + "learning_rate": 9.777807677739596e-05, + "loss": 1.9124, + "step": 22492 + }, + { + "epoch": 2.624314549060786, + "grad_norm": 1.2495896816253662, + "learning_rate": 9.776398116964761e-05, + "loss": 2.1722, + "step": 22493 + }, + { + "epoch": 2.6244312215610783, + "grad_norm": 1.0661615133285522, + "learning_rate": 9.774988611897487e-05, + "loss": 1.9362, + "step": 22494 + }, + { + "epoch": 2.6245478940613696, + "grad_norm": 1.3084760904312134, + "learning_rate": 9.773579162552378e-05, + "loss": 1.9618, + "step": 22495 + }, + { + "epoch": 2.6246645665616617, + "grad_norm": 1.0670431852340698, + "learning_rate": 9.772169768944059e-05, + "loss": 2.0538, + "step": 22496 + }, + { + "epoch": 2.624781239061953, + "grad_norm": 1.0175766944885254, + "learning_rate": 9.770760431087127e-05, + "loss": 1.9042, + "step": 22497 + }, + { + "epoch": 2.624897911562245, + "grad_norm": 1.0706783533096313, + "learning_rate": 9.769351148996208e-05, + "loss": 2.1619, + "step": 22498 + }, + { + "epoch": 2.6250145840625363, + "grad_norm": 1.037813425064087, + "learning_rate": 9.767941922685897e-05, + "loss": 1.8316, + "step": 22499 + }, + { + "epoch": 2.625131256562828, + "grad_norm": 1.0201866626739502, + "learning_rate": 9.766532752170811e-05, + "loss": 1.9172, + "step": 22500 + }, + { + "epoch": 2.6252479290631197, + "grad_norm": 1.1945313215255737, + "learning_rate": 9.765123637465554e-05, + "loss": 1.9471, + "step": 22501 + }, + { + "epoch": 2.6253646015634113, + "grad_norm": 1.1129306554794312, + "learning_rate": 9.763714578584745e-05, + "loss": 2.0757, + "step": 22502 + }, + { + "epoch": 2.625481274063703, + "grad_norm": 1.012479543685913, + "learning_rate": 9.762305575542982e-05, + "loss": 1.7291, + "step": 22503 + }, + { + "epoch": 2.6255979465639947, + "grad_norm": 1.4123691320419312, + "learning_rate": 9.760896628354872e-05, + "loss": 2.1176, + "step": 22504 + }, + { + "epoch": 2.6257146190642864, + "grad_norm": 1.2743197679519653, + "learning_rate": 9.759487737035027e-05, + "loss": 2.1602, + "step": 22505 + }, + { + "epoch": 2.625831291564578, + "grad_norm": 1.303818941116333, + "learning_rate": 9.758078901598042e-05, + "loss": 1.8856, + "step": 22506 + }, + { + "epoch": 2.6259479640648697, + "grad_norm": 1.244618535041809, + "learning_rate": 9.756670122058539e-05, + "loss": 1.9597, + "step": 22507 + }, + { + "epoch": 2.6260646365651614, + "grad_norm": 1.1269886493682861, + "learning_rate": 9.755261398431106e-05, + "loss": 1.8378, + "step": 22508 + }, + { + "epoch": 2.626181309065453, + "grad_norm": 1.352455735206604, + "learning_rate": 9.753852730730358e-05, + "loss": 1.9623, + "step": 22509 + }, + { + "epoch": 2.626297981565745, + "grad_norm": 1.1993680000305176, + "learning_rate": 9.752444118970895e-05, + "loss": 1.7263, + "step": 22510 + }, + { + "epoch": 2.6264146540660365, + "grad_norm": 1.1604444980621338, + "learning_rate": 9.75103556316732e-05, + "loss": 1.9387, + "step": 22511 + }, + { + "epoch": 2.626531326566328, + "grad_norm": 1.1987205743789673, + "learning_rate": 9.749627063334233e-05, + "loss": 1.9724, + "step": 22512 + }, + { + "epoch": 2.62664799906662, + "grad_norm": 1.129008412361145, + "learning_rate": 9.748218619486236e-05, + "loss": 1.9063, + "step": 22513 + }, + { + "epoch": 2.6267646715669115, + "grad_norm": 1.2301900386810303, + "learning_rate": 9.746810231637939e-05, + "loss": 1.9956, + "step": 22514 + }, + { + "epoch": 2.626881344067203, + "grad_norm": 1.2390035390853882, + "learning_rate": 9.745401899803929e-05, + "loss": 2.0148, + "step": 22515 + }, + { + "epoch": 2.626998016567495, + "grad_norm": 1.196189284324646, + "learning_rate": 9.743993623998818e-05, + "loss": 2.0855, + "step": 22516 + }, + { + "epoch": 2.6271146890677866, + "grad_norm": 1.0740820169448853, + "learning_rate": 9.742585404237198e-05, + "loss": 1.8607, + "step": 22517 + }, + { + "epoch": 2.6272313615680782, + "grad_norm": 1.086957573890686, + "learning_rate": 9.74117724053367e-05, + "loss": 2.0621, + "step": 22518 + }, + { + "epoch": 2.62734803406837, + "grad_norm": 0.9467563033103943, + "learning_rate": 9.739769132902833e-05, + "loss": 1.8554, + "step": 22519 + }, + { + "epoch": 2.6274647065686616, + "grad_norm": 1.1446094512939453, + "learning_rate": 9.738361081359285e-05, + "loss": 2.0767, + "step": 22520 + }, + { + "epoch": 2.6275813790689533, + "grad_norm": 1.2823981046676636, + "learning_rate": 9.736953085917615e-05, + "loss": 1.9051, + "step": 22521 + }, + { + "epoch": 2.627698051569245, + "grad_norm": 1.1467928886413574, + "learning_rate": 9.735545146592437e-05, + "loss": 2.1077, + "step": 22522 + }, + { + "epoch": 2.6278147240695366, + "grad_norm": 1.2350575923919678, + "learning_rate": 9.734137263398328e-05, + "loss": 2.0866, + "step": 22523 + }, + { + "epoch": 2.6279313965698283, + "grad_norm": 1.054468035697937, + "learning_rate": 9.7327294363499e-05, + "loss": 2.1893, + "step": 22524 + }, + { + "epoch": 2.62804806907012, + "grad_norm": 1.1959649324417114, + "learning_rate": 9.731321665461734e-05, + "loss": 2.0486, + "step": 22525 + }, + { + "epoch": 2.6281647415704117, + "grad_norm": 1.174959421157837, + "learning_rate": 9.72991395074843e-05, + "loss": 2.0914, + "step": 22526 + }, + { + "epoch": 2.6282814140707034, + "grad_norm": 1.179660677909851, + "learning_rate": 9.728506292224585e-05, + "loss": 1.9235, + "step": 22527 + }, + { + "epoch": 2.628398086570995, + "grad_norm": 1.153544306755066, + "learning_rate": 9.727098689904786e-05, + "loss": 1.834, + "step": 22528 + }, + { + "epoch": 2.6285147590712867, + "grad_norm": 1.123165488243103, + "learning_rate": 9.725691143803631e-05, + "loss": 1.9968, + "step": 22529 + }, + { + "epoch": 2.6286314315715784, + "grad_norm": 1.167038917541504, + "learning_rate": 9.724283653935703e-05, + "loss": 2.0019, + "step": 22530 + }, + { + "epoch": 2.62874810407187, + "grad_norm": 1.0744003057479858, + "learning_rate": 9.722876220315608e-05, + "loss": 1.925, + "step": 22531 + }, + { + "epoch": 2.6288647765721618, + "grad_norm": 1.2645095586776733, + "learning_rate": 9.72146884295792e-05, + "loss": 2.062, + "step": 22532 + }, + { + "epoch": 2.6289814490724535, + "grad_norm": 1.1340221166610718, + "learning_rate": 9.720061521877246e-05, + "loss": 1.8053, + "step": 22533 + }, + { + "epoch": 2.629098121572745, + "grad_norm": 1.1066534519195557, + "learning_rate": 9.718654257088158e-05, + "loss": 2.0982, + "step": 22534 + }, + { + "epoch": 2.629214794073037, + "grad_norm": 1.1147390604019165, + "learning_rate": 9.71724704860526e-05, + "loss": 1.9779, + "step": 22535 + }, + { + "epoch": 2.6293314665733285, + "grad_norm": 1.1787174940109253, + "learning_rate": 9.715839896443131e-05, + "loss": 1.9108, + "step": 22536 + }, + { + "epoch": 2.62944813907362, + "grad_norm": 1.1068851947784424, + "learning_rate": 9.714432800616366e-05, + "loss": 1.9331, + "step": 22537 + }, + { + "epoch": 2.629564811573912, + "grad_norm": 1.1907604932785034, + "learning_rate": 9.713025761139542e-05, + "loss": 2.001, + "step": 22538 + }, + { + "epoch": 2.6296814840742035, + "grad_norm": 1.19448983669281, + "learning_rate": 9.71161877802725e-05, + "loss": 2.0074, + "step": 22539 + }, + { + "epoch": 2.6297981565744952, + "grad_norm": 1.0431804656982422, + "learning_rate": 9.710211851294084e-05, + "loss": 2.0547, + "step": 22540 + }, + { + "epoch": 2.629914829074787, + "grad_norm": 1.084470272064209, + "learning_rate": 9.708804980954622e-05, + "loss": 1.9738, + "step": 22541 + }, + { + "epoch": 2.6300315015750786, + "grad_norm": 1.1029914617538452, + "learning_rate": 9.707398167023452e-05, + "loss": 1.8714, + "step": 22542 + }, + { + "epoch": 2.6301481740753703, + "grad_norm": 1.152147889137268, + "learning_rate": 9.705991409515152e-05, + "loss": 1.9899, + "step": 22543 + }, + { + "epoch": 2.630264846575662, + "grad_norm": 1.177935242652893, + "learning_rate": 9.704584708444312e-05, + "loss": 2.0142, + "step": 22544 + }, + { + "epoch": 2.6303815190759536, + "grad_norm": 1.2462223768234253, + "learning_rate": 9.703178063825514e-05, + "loss": 2.1554, + "step": 22545 + }, + { + "epoch": 2.6304981915762453, + "grad_norm": 1.1496602296829224, + "learning_rate": 9.701771475673342e-05, + "loss": 1.8076, + "step": 22546 + }, + { + "epoch": 2.630614864076537, + "grad_norm": 1.2307515144348145, + "learning_rate": 9.70036494400237e-05, + "loss": 2.0075, + "step": 22547 + }, + { + "epoch": 2.6307315365768287, + "grad_norm": 1.2311325073242188, + "learning_rate": 9.698958468827191e-05, + "loss": 2.0979, + "step": 22548 + }, + { + "epoch": 2.6308482090771204, + "grad_norm": 1.0863169431686401, + "learning_rate": 9.697552050162372e-05, + "loss": 2.071, + "step": 22549 + }, + { + "epoch": 2.630964881577412, + "grad_norm": 1.2873330116271973, + "learning_rate": 9.69614568802251e-05, + "loss": 1.9819, + "step": 22550 + }, + { + "epoch": 2.6310815540777037, + "grad_norm": 1.0804164409637451, + "learning_rate": 9.694739382422167e-05, + "loss": 2.0109, + "step": 22551 + }, + { + "epoch": 2.6311982265779954, + "grad_norm": 1.2698962688446045, + "learning_rate": 9.693333133375937e-05, + "loss": 1.9519, + "step": 22552 + }, + { + "epoch": 2.631314899078287, + "grad_norm": 1.117431402206421, + "learning_rate": 9.691926940898393e-05, + "loss": 1.9104, + "step": 22553 + }, + { + "epoch": 2.6314315715785788, + "grad_norm": 1.1779354810714722, + "learning_rate": 9.690520805004106e-05, + "loss": 1.9361, + "step": 22554 + }, + { + "epoch": 2.6315482440788704, + "grad_norm": 1.135215401649475, + "learning_rate": 9.689114725707665e-05, + "loss": 1.8879, + "step": 22555 + }, + { + "epoch": 2.631664916579162, + "grad_norm": 1.0592241287231445, + "learning_rate": 9.687708703023634e-05, + "loss": 1.9305, + "step": 22556 + }, + { + "epoch": 2.631781589079454, + "grad_norm": 1.0646061897277832, + "learning_rate": 9.686302736966603e-05, + "loss": 1.8475, + "step": 22557 + }, + { + "epoch": 2.6318982615797455, + "grad_norm": 1.1427727937698364, + "learning_rate": 9.684896827551136e-05, + "loss": 2.0091, + "step": 22558 + }, + { + "epoch": 2.632014934080037, + "grad_norm": 1.1673787832260132, + "learning_rate": 9.683490974791815e-05, + "loss": 2.0201, + "step": 22559 + }, + { + "epoch": 2.632131606580329, + "grad_norm": 1.0562142133712769, + "learning_rate": 9.682085178703211e-05, + "loss": 1.9412, + "step": 22560 + }, + { + "epoch": 2.6322482790806205, + "grad_norm": 1.1553356647491455, + "learning_rate": 9.680679439299901e-05, + "loss": 2.086, + "step": 22561 + }, + { + "epoch": 2.632364951580912, + "grad_norm": 1.2488473653793335, + "learning_rate": 9.679273756596451e-05, + "loss": 1.9989, + "step": 22562 + }, + { + "epoch": 2.632481624081204, + "grad_norm": 1.112321376800537, + "learning_rate": 9.677868130607445e-05, + "loss": 2.0242, + "step": 22563 + }, + { + "epoch": 2.6325982965814956, + "grad_norm": 1.1521345376968384, + "learning_rate": 9.67646256134744e-05, + "loss": 2.0048, + "step": 22564 + }, + { + "epoch": 2.6327149690817873, + "grad_norm": 1.1942397356033325, + "learning_rate": 9.675057048831015e-05, + "loss": 1.9757, + "step": 22565 + }, + { + "epoch": 2.632831641582079, + "grad_norm": 1.0382484197616577, + "learning_rate": 9.673651593072749e-05, + "loss": 2.0408, + "step": 22566 + }, + { + "epoch": 2.6329483140823706, + "grad_norm": 1.2093181610107422, + "learning_rate": 9.672246194087199e-05, + "loss": 2.0506, + "step": 22567 + }, + { + "epoch": 2.6330649865826623, + "grad_norm": 1.0881503820419312, + "learning_rate": 9.670840851888943e-05, + "loss": 1.8559, + "step": 22568 + }, + { + "epoch": 2.633181659082954, + "grad_norm": 0.9913577437400818, + "learning_rate": 9.669435566492547e-05, + "loss": 1.8722, + "step": 22569 + }, + { + "epoch": 2.6332983315832457, + "grad_norm": 0.9906238913536072, + "learning_rate": 9.668030337912578e-05, + "loss": 1.8652, + "step": 22570 + }, + { + "epoch": 2.6334150040835373, + "grad_norm": 1.2162162065505981, + "learning_rate": 9.666625166163606e-05, + "loss": 1.993, + "step": 22571 + }, + { + "epoch": 2.633531676583829, + "grad_norm": 1.2586055994033813, + "learning_rate": 9.6652200512602e-05, + "loss": 1.9681, + "step": 22572 + }, + { + "epoch": 2.6336483490841207, + "grad_norm": 1.2469704151153564, + "learning_rate": 9.663814993216921e-05, + "loss": 2.0747, + "step": 22573 + }, + { + "epoch": 2.6337650215844124, + "grad_norm": 0.992246687412262, + "learning_rate": 9.662409992048344e-05, + "loss": 1.8975, + "step": 22574 + }, + { + "epoch": 2.633881694084704, + "grad_norm": 1.243873953819275, + "learning_rate": 9.661005047769021e-05, + "loss": 2.0013, + "step": 22575 + }, + { + "epoch": 2.6339983665849958, + "grad_norm": 1.1346819400787354, + "learning_rate": 9.659600160393532e-05, + "loss": 1.9055, + "step": 22576 + }, + { + "epoch": 2.6341150390852874, + "grad_norm": 1.2554144859313965, + "learning_rate": 9.658195329936427e-05, + "loss": 1.9639, + "step": 22577 + }, + { + "epoch": 2.634231711585579, + "grad_norm": 1.1655635833740234, + "learning_rate": 9.656790556412282e-05, + "loss": 2.0516, + "step": 22578 + }, + { + "epoch": 2.634348384085871, + "grad_norm": 1.2652446031570435, + "learning_rate": 9.655385839835657e-05, + "loss": 2.0639, + "step": 22579 + }, + { + "epoch": 2.6344650565861625, + "grad_norm": 1.0810145139694214, + "learning_rate": 9.653981180221108e-05, + "loss": 1.9795, + "step": 22580 + }, + { + "epoch": 2.634581729086454, + "grad_norm": 1.1456782817840576, + "learning_rate": 9.652576577583207e-05, + "loss": 2.1003, + "step": 22581 + }, + { + "epoch": 2.634698401586746, + "grad_norm": 1.0933068990707397, + "learning_rate": 9.651172031936503e-05, + "loss": 1.81, + "step": 22582 + }, + { + "epoch": 2.6348150740870375, + "grad_norm": 1.1001670360565186, + "learning_rate": 9.64976754329557e-05, + "loss": 2.002, + "step": 22583 + }, + { + "epoch": 2.634931746587329, + "grad_norm": 1.1348841190338135, + "learning_rate": 9.648363111674958e-05, + "loss": 2.0247, + "step": 22584 + }, + { + "epoch": 2.635048419087621, + "grad_norm": 1.1417795419692993, + "learning_rate": 9.646958737089232e-05, + "loss": 2.2104, + "step": 22585 + }, + { + "epoch": 2.6351650915879126, + "grad_norm": 1.0234653949737549, + "learning_rate": 9.645554419552948e-05, + "loss": 1.7189, + "step": 22586 + }, + { + "epoch": 2.6352817640882042, + "grad_norm": 1.092324137687683, + "learning_rate": 9.644150159080671e-05, + "loss": 1.9419, + "step": 22587 + }, + { + "epoch": 2.635398436588496, + "grad_norm": 1.0357866287231445, + "learning_rate": 9.64274595568695e-05, + "loss": 1.9947, + "step": 22588 + }, + { + "epoch": 2.6355151090887876, + "grad_norm": 1.0008108615875244, + "learning_rate": 9.64134180938635e-05, + "loss": 1.8064, + "step": 22589 + }, + { + "epoch": 2.6356317815890793, + "grad_norm": 1.11330246925354, + "learning_rate": 9.639937720193415e-05, + "loss": 1.9263, + "step": 22590 + }, + { + "epoch": 2.635748454089371, + "grad_norm": 1.4131137132644653, + "learning_rate": 9.638533688122713e-05, + "loss": 2.0228, + "step": 22591 + }, + { + "epoch": 2.6358651265896627, + "grad_norm": 1.0919865369796753, + "learning_rate": 9.637129713188803e-05, + "loss": 2.0593, + "step": 22592 + }, + { + "epoch": 2.6359817990899543, + "grad_norm": 1.2499237060546875, + "learning_rate": 9.635725795406223e-05, + "loss": 1.7679, + "step": 22593 + }, + { + "epoch": 2.636098471590246, + "grad_norm": 1.119377613067627, + "learning_rate": 9.634321934789543e-05, + "loss": 1.878, + "step": 22594 + }, + { + "epoch": 2.6362151440905377, + "grad_norm": 1.2645776271820068, + "learning_rate": 9.632918131353311e-05, + "loss": 2.01, + "step": 22595 + }, + { + "epoch": 2.6363318165908294, + "grad_norm": 1.1176681518554688, + "learning_rate": 9.63151438511208e-05, + "loss": 1.9773, + "step": 22596 + }, + { + "epoch": 2.636448489091121, + "grad_norm": 1.2483755350112915, + "learning_rate": 9.630110696080401e-05, + "loss": 2.0848, + "step": 22597 + }, + { + "epoch": 2.6365651615914127, + "grad_norm": 1.2937848567962646, + "learning_rate": 9.628707064272832e-05, + "loss": 2.037, + "step": 22598 + }, + { + "epoch": 2.6366818340917044, + "grad_norm": 1.1848820447921753, + "learning_rate": 9.62730348970391e-05, + "loss": 1.9496, + "step": 22599 + }, + { + "epoch": 2.636798506591996, + "grad_norm": 1.3857378959655762, + "learning_rate": 9.625899972388205e-05, + "loss": 2.0293, + "step": 22600 + }, + { + "epoch": 2.636915179092288, + "grad_norm": 1.1567094326019287, + "learning_rate": 9.624496512340251e-05, + "loss": 1.9371, + "step": 22601 + }, + { + "epoch": 2.6370318515925795, + "grad_norm": 1.0815989971160889, + "learning_rate": 9.623093109574611e-05, + "loss": 1.9457, + "step": 22602 + }, + { + "epoch": 2.637148524092871, + "grad_norm": 1.2051194906234741, + "learning_rate": 9.621689764105818e-05, + "loss": 2.0209, + "step": 22603 + }, + { + "epoch": 2.637265196593163, + "grad_norm": 1.2190313339233398, + "learning_rate": 9.620286475948434e-05, + "loss": 1.9868, + "step": 22604 + }, + { + "epoch": 2.6373818690934545, + "grad_norm": 1.22461998462677, + "learning_rate": 9.618883245117004e-05, + "loss": 1.8928, + "step": 22605 + }, + { + "epoch": 2.637498541593746, + "grad_norm": 1.1535611152648926, + "learning_rate": 9.61748007162607e-05, + "loss": 1.8072, + "step": 22606 + }, + { + "epoch": 2.637615214094038, + "grad_norm": 1.1351380348205566, + "learning_rate": 9.616076955490185e-05, + "loss": 2.1668, + "step": 22607 + }, + { + "epoch": 2.6377318865943296, + "grad_norm": 1.0525532960891724, + "learning_rate": 9.614673896723887e-05, + "loss": 1.7995, + "step": 22608 + }, + { + "epoch": 2.6378485590946212, + "grad_norm": 1.1075377464294434, + "learning_rate": 9.61327089534173e-05, + "loss": 2.04, + "step": 22609 + }, + { + "epoch": 2.637965231594913, + "grad_norm": 1.1582574844360352, + "learning_rate": 9.61186795135825e-05, + "loss": 2.0109, + "step": 22610 + }, + { + "epoch": 2.6380819040952046, + "grad_norm": 1.3880109786987305, + "learning_rate": 9.610465064788003e-05, + "loss": 2.174, + "step": 22611 + }, + { + "epoch": 2.6381985765954963, + "grad_norm": 1.1017341613769531, + "learning_rate": 9.609062235645523e-05, + "loss": 1.8731, + "step": 22612 + }, + { + "epoch": 2.638315249095788, + "grad_norm": 1.139886736869812, + "learning_rate": 9.607659463945354e-05, + "loss": 1.9983, + "step": 22613 + }, + { + "epoch": 2.6384319215960796, + "grad_norm": 1.1298420429229736, + "learning_rate": 9.60625674970204e-05, + "loss": 1.8958, + "step": 22614 + }, + { + "epoch": 2.6385485940963713, + "grad_norm": 1.0592877864837646, + "learning_rate": 9.604854092930126e-05, + "loss": 1.6378, + "step": 22615 + }, + { + "epoch": 2.638665266596663, + "grad_norm": 1.072257399559021, + "learning_rate": 9.603451493644144e-05, + "loss": 2.0329, + "step": 22616 + }, + { + "epoch": 2.6387819390969547, + "grad_norm": 1.0809557437896729, + "learning_rate": 9.602048951858641e-05, + "loss": 1.8886, + "step": 22617 + }, + { + "epoch": 2.6388986115972464, + "grad_norm": 1.2583204507827759, + "learning_rate": 9.600646467588164e-05, + "loss": 2.0552, + "step": 22618 + }, + { + "epoch": 2.639015284097538, + "grad_norm": 0.9837411642074585, + "learning_rate": 9.599244040847234e-05, + "loss": 1.8438, + "step": 22619 + }, + { + "epoch": 2.6391319565978297, + "grad_norm": 1.0315293073654175, + "learning_rate": 9.597841671650413e-05, + "loss": 1.8743, + "step": 22620 + }, + { + "epoch": 2.6392486290981214, + "grad_norm": 1.044276237487793, + "learning_rate": 9.596439360012219e-05, + "loss": 1.9594, + "step": 22621 + }, + { + "epoch": 2.639365301598413, + "grad_norm": 1.2123435735702515, + "learning_rate": 9.595037105947204e-05, + "loss": 1.9404, + "step": 22622 + }, + { + "epoch": 2.6394819740987048, + "grad_norm": 1.229518175125122, + "learning_rate": 9.593634909469889e-05, + "loss": 1.9647, + "step": 22623 + }, + { + "epoch": 2.6395986465989965, + "grad_norm": 1.0994007587432861, + "learning_rate": 9.592232770594828e-05, + "loss": 2.0267, + "step": 22624 + }, + { + "epoch": 2.639715319099288, + "grad_norm": 1.0389118194580078, + "learning_rate": 9.590830689336543e-05, + "loss": 1.8356, + "step": 22625 + }, + { + "epoch": 2.63983199159958, + "grad_norm": 1.0391253232955933, + "learning_rate": 9.589428665709579e-05, + "loss": 1.8436, + "step": 22626 + }, + { + "epoch": 2.6399486640998715, + "grad_norm": 1.1042499542236328, + "learning_rate": 9.588026699728464e-05, + "loss": 1.8249, + "step": 22627 + }, + { + "epoch": 2.640065336600163, + "grad_norm": 0.9849135279655457, + "learning_rate": 9.586624791407738e-05, + "loss": 1.8711, + "step": 22628 + }, + { + "epoch": 2.640182009100455, + "grad_norm": 1.275649070739746, + "learning_rate": 9.585222940761926e-05, + "loss": 1.8175, + "step": 22629 + }, + { + "epoch": 2.6402986816007465, + "grad_norm": 1.1045588254928589, + "learning_rate": 9.58382114780557e-05, + "loss": 2.0216, + "step": 22630 + }, + { + "epoch": 2.6404153541010382, + "grad_norm": 1.3082454204559326, + "learning_rate": 9.582419412553201e-05, + "loss": 2.1024, + "step": 22631 + }, + { + "epoch": 2.64053202660133, + "grad_norm": 1.263488531112671, + "learning_rate": 9.581017735019342e-05, + "loss": 2.1185, + "step": 22632 + }, + { + "epoch": 2.6406486991016216, + "grad_norm": 1.2445424795150757, + "learning_rate": 9.579616115218535e-05, + "loss": 2.0299, + "step": 22633 + }, + { + "epoch": 2.6407653716019133, + "grad_norm": 1.175246000289917, + "learning_rate": 9.5782145531653e-05, + "loss": 1.8729, + "step": 22634 + }, + { + "epoch": 2.640882044102205, + "grad_norm": 1.0712825059890747, + "learning_rate": 9.576813048874177e-05, + "loss": 2.0057, + "step": 22635 + }, + { + "epoch": 2.6409987166024966, + "grad_norm": 1.1983811855316162, + "learning_rate": 9.575411602359686e-05, + "loss": 2.0492, + "step": 22636 + }, + { + "epoch": 2.6411153891027883, + "grad_norm": 1.452849268913269, + "learning_rate": 9.574010213636366e-05, + "loss": 2.0063, + "step": 22637 + }, + { + "epoch": 2.64123206160308, + "grad_norm": 1.2126047611236572, + "learning_rate": 9.572608882718736e-05, + "loss": 1.8614, + "step": 22638 + }, + { + "epoch": 2.6413487341033717, + "grad_norm": 1.280461072921753, + "learning_rate": 9.571207609621326e-05, + "loss": 1.907, + "step": 22639 + }, + { + "epoch": 2.6414654066036634, + "grad_norm": 1.0957067012786865, + "learning_rate": 9.569806394358662e-05, + "loss": 2.0421, + "step": 22640 + }, + { + "epoch": 2.641582079103955, + "grad_norm": 1.2418208122253418, + "learning_rate": 9.568405236945278e-05, + "loss": 2.0031, + "step": 22641 + }, + { + "epoch": 2.6416987516042467, + "grad_norm": 0.9423143267631531, + "learning_rate": 9.567004137395686e-05, + "loss": 1.8448, + "step": 22642 + }, + { + "epoch": 2.6418154241045384, + "grad_norm": 1.0610742568969727, + "learning_rate": 9.565603095724422e-05, + "loss": 2.0612, + "step": 22643 + }, + { + "epoch": 2.64193209660483, + "grad_norm": 1.079058051109314, + "learning_rate": 9.564202111946003e-05, + "loss": 1.9845, + "step": 22644 + }, + { + "epoch": 2.6420487691051218, + "grad_norm": 1.0576528310775757, + "learning_rate": 9.562801186074955e-05, + "loss": 2.0911, + "step": 22645 + }, + { + "epoch": 2.6421654416054134, + "grad_norm": 1.1957907676696777, + "learning_rate": 9.561400318125812e-05, + "loss": 1.9508, + "step": 22646 + }, + { + "epoch": 2.642282114105705, + "grad_norm": 1.0957517623901367, + "learning_rate": 9.559999508113082e-05, + "loss": 2.0439, + "step": 22647 + }, + { + "epoch": 2.642398786605997, + "grad_norm": 1.1471003293991089, + "learning_rate": 9.5585987560513e-05, + "loss": 2.1329, + "step": 22648 + }, + { + "epoch": 2.6425154591062885, + "grad_norm": 1.2639830112457275, + "learning_rate": 9.557198061954967e-05, + "loss": 2.0238, + "step": 22649 + }, + { + "epoch": 2.64263213160658, + "grad_norm": 1.2920689582824707, + "learning_rate": 9.555797425838627e-05, + "loss": 2.0816, + "step": 22650 + }, + { + "epoch": 2.642748804106872, + "grad_norm": 1.0697591304779053, + "learning_rate": 9.554396847716784e-05, + "loss": 1.8703, + "step": 22651 + }, + { + "epoch": 2.6428654766071635, + "grad_norm": 1.1068155765533447, + "learning_rate": 9.55299632760397e-05, + "loss": 2.0875, + "step": 22652 + }, + { + "epoch": 2.642982149107455, + "grad_norm": 1.124191403388977, + "learning_rate": 9.551595865514693e-05, + "loss": 1.8715, + "step": 22653 + }, + { + "epoch": 2.643098821607747, + "grad_norm": 1.1498056650161743, + "learning_rate": 9.55019546146348e-05, + "loss": 2.0225, + "step": 22654 + }, + { + "epoch": 2.6432154941080386, + "grad_norm": 1.0479683876037598, + "learning_rate": 9.548795115464841e-05, + "loss": 1.7381, + "step": 22655 + }, + { + "epoch": 2.6433321666083303, + "grad_norm": 1.0602163076400757, + "learning_rate": 9.547394827533301e-05, + "loss": 1.7791, + "step": 22656 + }, + { + "epoch": 2.643448839108622, + "grad_norm": 1.0477089881896973, + "learning_rate": 9.545994597683371e-05, + "loss": 1.9772, + "step": 22657 + }, + { + "epoch": 2.6435655116089136, + "grad_norm": 1.1394782066345215, + "learning_rate": 9.544594425929568e-05, + "loss": 1.9636, + "step": 22658 + }, + { + "epoch": 2.6436821841092053, + "grad_norm": 1.1573795080184937, + "learning_rate": 9.543194312286412e-05, + "loss": 2.0257, + "step": 22659 + }, + { + "epoch": 2.643798856609497, + "grad_norm": 0.9995343089103699, + "learning_rate": 9.541794256768408e-05, + "loss": 1.9229, + "step": 22660 + }, + { + "epoch": 2.6439155291097887, + "grad_norm": 1.2907557487487793, + "learning_rate": 9.540394259390085e-05, + "loss": 2.0458, + "step": 22661 + }, + { + "epoch": 2.6440322016100803, + "grad_norm": 0.9813132882118225, + "learning_rate": 9.53899432016594e-05, + "loss": 1.7864, + "step": 22662 + }, + { + "epoch": 2.644148874110372, + "grad_norm": 1.1228210926055908, + "learning_rate": 9.537594439110502e-05, + "loss": 1.738, + "step": 22663 + }, + { + "epoch": 2.6442655466106637, + "grad_norm": 1.0288560390472412, + "learning_rate": 9.536194616238269e-05, + "loss": 1.9552, + "step": 22664 + }, + { + "epoch": 2.6443822191109554, + "grad_norm": 1.226019024848938, + "learning_rate": 9.534794851563765e-05, + "loss": 2.2378, + "step": 22665 + }, + { + "epoch": 2.644498891611247, + "grad_norm": 1.1550886631011963, + "learning_rate": 9.533395145101493e-05, + "loss": 1.927, + "step": 22666 + }, + { + "epoch": 2.6446155641115388, + "grad_norm": 1.1408292055130005, + "learning_rate": 9.531995496865966e-05, + "loss": 1.9461, + "step": 22667 + }, + { + "epoch": 2.6447322366118304, + "grad_norm": 1.2354438304901123, + "learning_rate": 9.530595906871694e-05, + "loss": 2.1547, + "step": 22668 + }, + { + "epoch": 2.644848909112122, + "grad_norm": 1.2039061784744263, + "learning_rate": 9.529196375133188e-05, + "loss": 2.0238, + "step": 22669 + }, + { + "epoch": 2.644965581612414, + "grad_norm": 1.1821738481521606, + "learning_rate": 9.527796901664953e-05, + "loss": 2.1066, + "step": 22670 + }, + { + "epoch": 2.6450822541127055, + "grad_norm": 1.0453433990478516, + "learning_rate": 9.526397486481497e-05, + "loss": 1.9675, + "step": 22671 + }, + { + "epoch": 2.645198926612997, + "grad_norm": 1.2610139846801758, + "learning_rate": 9.524998129597336e-05, + "loss": 1.9895, + "step": 22672 + }, + { + "epoch": 2.645315599113289, + "grad_norm": 1.0075349807739258, + "learning_rate": 9.523598831026969e-05, + "loss": 1.8176, + "step": 22673 + }, + { + "epoch": 2.6454322716135805, + "grad_norm": 1.0595546960830688, + "learning_rate": 9.52219959078491e-05, + "loss": 1.9386, + "step": 22674 + }, + { + "epoch": 2.645548944113872, + "grad_norm": 1.1805531978607178, + "learning_rate": 9.520800408885649e-05, + "loss": 1.926, + "step": 22675 + }, + { + "epoch": 2.645665616614164, + "grad_norm": 1.1511808633804321, + "learning_rate": 9.51940128534371e-05, + "loss": 1.762, + "step": 22676 + }, + { + "epoch": 2.6457822891144556, + "grad_norm": 1.305970311164856, + "learning_rate": 9.518002220173583e-05, + "loss": 2.0138, + "step": 22677 + }, + { + "epoch": 2.6458989616147472, + "grad_norm": 1.1738003492355347, + "learning_rate": 9.516603213389784e-05, + "loss": 1.9314, + "step": 22678 + }, + { + "epoch": 2.646015634115039, + "grad_norm": 1.355458378791809, + "learning_rate": 9.515204265006806e-05, + "loss": 2.1927, + "step": 22679 + }, + { + "epoch": 2.6461323066153306, + "grad_norm": 1.2106448411941528, + "learning_rate": 9.51380537503916e-05, + "loss": 1.9703, + "step": 22680 + }, + { + "epoch": 2.6462489791156223, + "grad_norm": 1.2033313512802124, + "learning_rate": 9.512406543501341e-05, + "loss": 2.0867, + "step": 22681 + }, + { + "epoch": 2.646365651615914, + "grad_norm": 1.2752224206924438, + "learning_rate": 9.511007770407855e-05, + "loss": 1.976, + "step": 22682 + }, + { + "epoch": 2.6464823241162057, + "grad_norm": 1.1723862886428833, + "learning_rate": 9.5096090557732e-05, + "loss": 2.0335, + "step": 22683 + }, + { + "epoch": 2.6465989966164973, + "grad_norm": 1.1207600831985474, + "learning_rate": 9.508210399611877e-05, + "loss": 1.9164, + "step": 22684 + }, + { + "epoch": 2.646715669116789, + "grad_norm": 1.0726925134658813, + "learning_rate": 9.506811801938393e-05, + "loss": 1.8647, + "step": 22685 + }, + { + "epoch": 2.6468323416170807, + "grad_norm": 1.112419605255127, + "learning_rate": 9.505413262767231e-05, + "loss": 2.1215, + "step": 22686 + }, + { + "epoch": 2.6469490141173724, + "grad_norm": 1.0538830757141113, + "learning_rate": 9.504014782112911e-05, + "loss": 2.0764, + "step": 22687 + }, + { + "epoch": 2.647065686617664, + "grad_norm": 1.2756363153457642, + "learning_rate": 9.50261635998991e-05, + "loss": 2.0326, + "step": 22688 + }, + { + "epoch": 2.6471823591179557, + "grad_norm": 0.8900141716003418, + "learning_rate": 9.501217996412741e-05, + "loss": 1.9651, + "step": 22689 + }, + { + "epoch": 2.6472990316182474, + "grad_norm": 1.1684901714324951, + "learning_rate": 9.499819691395894e-05, + "loss": 2.1121, + "step": 22690 + }, + { + "epoch": 2.647415704118539, + "grad_norm": 1.204116940498352, + "learning_rate": 9.498421444953866e-05, + "loss": 2.0211, + "step": 22691 + }, + { + "epoch": 2.647532376618831, + "grad_norm": 1.0915522575378418, + "learning_rate": 9.497023257101147e-05, + "loss": 1.9137, + "step": 22692 + }, + { + "epoch": 2.6476490491191225, + "grad_norm": 1.254892110824585, + "learning_rate": 9.495625127852241e-05, + "loss": 2.1636, + "step": 22693 + }, + { + "epoch": 2.647765721619414, + "grad_norm": 0.9305719137191772, + "learning_rate": 9.494227057221632e-05, + "loss": 1.9283, + "step": 22694 + }, + { + "epoch": 2.647882394119706, + "grad_norm": 1.2029738426208496, + "learning_rate": 9.492829045223828e-05, + "loss": 1.9837, + "step": 22695 + }, + { + "epoch": 2.6479990666199975, + "grad_norm": 1.211456537246704, + "learning_rate": 9.491431091873303e-05, + "loss": 2.1981, + "step": 22696 + }, + { + "epoch": 2.648115739120289, + "grad_norm": 1.1040376424789429, + "learning_rate": 9.490033197184567e-05, + "loss": 2.0957, + "step": 22697 + }, + { + "epoch": 2.648232411620581, + "grad_norm": 1.1102885007858276, + "learning_rate": 9.488635361172107e-05, + "loss": 1.966, + "step": 22698 + }, + { + "epoch": 2.6483490841208726, + "grad_norm": 1.1168198585510254, + "learning_rate": 9.487237583850409e-05, + "loss": 1.8762, + "step": 22699 + }, + { + "epoch": 2.6484657566211642, + "grad_norm": 1.0901377201080322, + "learning_rate": 9.485839865233967e-05, + "loss": 1.9168, + "step": 22700 + }, + { + "epoch": 2.648582429121456, + "grad_norm": 1.068448543548584, + "learning_rate": 9.48444220533727e-05, + "loss": 1.7801, + "step": 22701 + }, + { + "epoch": 2.6486991016217476, + "grad_norm": 1.1471210718154907, + "learning_rate": 9.48304460417481e-05, + "loss": 2.0795, + "step": 22702 + }, + { + "epoch": 2.6488157741220393, + "grad_norm": 1.0876634120941162, + "learning_rate": 9.481647061761071e-05, + "loss": 1.8827, + "step": 22703 + }, + { + "epoch": 2.648932446622331, + "grad_norm": 1.0149891376495361, + "learning_rate": 9.480249578110549e-05, + "loss": 1.9184, + "step": 22704 + }, + { + "epoch": 2.6490491191226226, + "grad_norm": 1.1672762632369995, + "learning_rate": 9.478852153237721e-05, + "loss": 1.8902, + "step": 22705 + }, + { + "epoch": 2.6491657916229143, + "grad_norm": 1.054376482963562, + "learning_rate": 9.477454787157086e-05, + "loss": 1.8359, + "step": 22706 + }, + { + "epoch": 2.649282464123206, + "grad_norm": 1.0381046533584595, + "learning_rate": 9.476057479883121e-05, + "loss": 1.9959, + "step": 22707 + }, + { + "epoch": 2.6493991366234977, + "grad_norm": 1.2508560419082642, + "learning_rate": 9.474660231430312e-05, + "loss": 2.0334, + "step": 22708 + }, + { + "epoch": 2.6495158091237894, + "grad_norm": 1.0108259916305542, + "learning_rate": 9.473263041813152e-05, + "loss": 1.8671, + "step": 22709 + }, + { + "epoch": 2.649632481624081, + "grad_norm": 1.2833566665649414, + "learning_rate": 9.471865911046111e-05, + "loss": 1.9825, + "step": 22710 + }, + { + "epoch": 2.6497491541243727, + "grad_norm": 1.275924801826477, + "learning_rate": 9.470468839143692e-05, + "loss": 1.9618, + "step": 22711 + }, + { + "epoch": 2.6498658266246644, + "grad_norm": 1.140608549118042, + "learning_rate": 9.469071826120362e-05, + "loss": 2.0651, + "step": 22712 + }, + { + "epoch": 2.649982499124956, + "grad_norm": 1.1006507873535156, + "learning_rate": 9.467674871990616e-05, + "loss": 1.8565, + "step": 22713 + }, + { + "epoch": 2.6500991716252478, + "grad_norm": 1.158168911933899, + "learning_rate": 9.466277976768926e-05, + "loss": 1.9836, + "step": 22714 + }, + { + "epoch": 2.6502158441255395, + "grad_norm": 1.1014182567596436, + "learning_rate": 9.464881140469783e-05, + "loss": 1.8388, + "step": 22715 + }, + { + "epoch": 2.650332516625831, + "grad_norm": 1.067488670349121, + "learning_rate": 9.463484363107658e-05, + "loss": 1.8868, + "step": 22716 + }, + { + "epoch": 2.650449189126123, + "grad_norm": 0.9967918395996094, + "learning_rate": 9.462087644697039e-05, + "loss": 1.8597, + "step": 22717 + }, + { + "epoch": 2.6505658616264145, + "grad_norm": 1.0423685312271118, + "learning_rate": 9.460690985252399e-05, + "loss": 1.8133, + "step": 22718 + }, + { + "epoch": 2.650682534126706, + "grad_norm": 1.292429804801941, + "learning_rate": 9.459294384788223e-05, + "loss": 1.9861, + "step": 22719 + }, + { + "epoch": 2.650799206626998, + "grad_norm": 1.0053359270095825, + "learning_rate": 9.457897843318984e-05, + "loss": 1.8857, + "step": 22720 + }, + { + "epoch": 2.6509158791272895, + "grad_norm": 1.2719497680664062, + "learning_rate": 9.456501360859167e-05, + "loss": 2.1866, + "step": 22721 + }, + { + "epoch": 2.6510325516275812, + "grad_norm": 1.2393163442611694, + "learning_rate": 9.45510493742324e-05, + "loss": 1.9558, + "step": 22722 + }, + { + "epoch": 2.651149224127873, + "grad_norm": 1.0880160331726074, + "learning_rate": 9.453708573025688e-05, + "loss": 1.9037, + "step": 22723 + }, + { + "epoch": 2.6512658966281646, + "grad_norm": 1.408520221710205, + "learning_rate": 9.452312267680982e-05, + "loss": 2.1249, + "step": 22724 + }, + { + "epoch": 2.6513825691284563, + "grad_norm": 1.2628209590911865, + "learning_rate": 9.450916021403597e-05, + "loss": 2.0901, + "step": 22725 + }, + { + "epoch": 2.651499241628748, + "grad_norm": 1.3341567516326904, + "learning_rate": 9.449519834208016e-05, + "loss": 2.0572, + "step": 22726 + }, + { + "epoch": 2.6516159141290396, + "grad_norm": 1.0943444967269897, + "learning_rate": 9.4481237061087e-05, + "loss": 2.0176, + "step": 22727 + }, + { + "epoch": 2.6517325866293313, + "grad_norm": 1.1342127323150635, + "learning_rate": 9.446727637120137e-05, + "loss": 2.0319, + "step": 22728 + }, + { + "epoch": 2.651849259129623, + "grad_norm": 1.2273041009902954, + "learning_rate": 9.445331627256783e-05, + "loss": 2.0132, + "step": 22729 + }, + { + "epoch": 2.6519659316299147, + "grad_norm": 1.1241567134857178, + "learning_rate": 9.443935676533125e-05, + "loss": 1.7663, + "step": 22730 + }, + { + "epoch": 2.6520826041302064, + "grad_norm": 1.2431257963180542, + "learning_rate": 9.442539784963624e-05, + "loss": 2.0757, + "step": 22731 + }, + { + "epoch": 2.652199276630498, + "grad_norm": 1.1201589107513428, + "learning_rate": 9.441143952562762e-05, + "loss": 2.1004, + "step": 22732 + }, + { + "epoch": 2.6523159491307897, + "grad_norm": 1.3114961385726929, + "learning_rate": 9.439748179345002e-05, + "loss": 2.0229, + "step": 22733 + }, + { + "epoch": 2.6524326216310814, + "grad_norm": 0.9626580476760864, + "learning_rate": 9.438352465324813e-05, + "loss": 1.7876, + "step": 22734 + }, + { + "epoch": 2.652549294131373, + "grad_norm": 1.1461870670318604, + "learning_rate": 9.436956810516667e-05, + "loss": 1.9613, + "step": 22735 + }, + { + "epoch": 2.6526659666316648, + "grad_norm": 1.1140588521957397, + "learning_rate": 9.435561214935031e-05, + "loss": 1.9263, + "step": 22736 + }, + { + "epoch": 2.6527826391319564, + "grad_norm": 1.1879189014434814, + "learning_rate": 9.434165678594378e-05, + "loss": 1.9646, + "step": 22737 + }, + { + "epoch": 2.652899311632248, + "grad_norm": 1.07096529006958, + "learning_rate": 9.432770201509167e-05, + "loss": 1.9761, + "step": 22738 + }, + { + "epoch": 2.65301598413254, + "grad_norm": 1.1208645105361938, + "learning_rate": 9.431374783693875e-05, + "loss": 2.0048, + "step": 22739 + }, + { + "epoch": 2.6531326566328315, + "grad_norm": 1.1794687509536743, + "learning_rate": 9.429979425162955e-05, + "loss": 1.9014, + "step": 22740 + }, + { + "epoch": 2.653249329133123, + "grad_norm": 1.0504251718521118, + "learning_rate": 9.428584125930887e-05, + "loss": 1.8251, + "step": 22741 + }, + { + "epoch": 2.653366001633415, + "grad_norm": 1.1395351886749268, + "learning_rate": 9.427188886012127e-05, + "loss": 1.9634, + "step": 22742 + }, + { + "epoch": 2.6534826741337065, + "grad_norm": 1.0795139074325562, + "learning_rate": 9.425793705421146e-05, + "loss": 2.0985, + "step": 22743 + }, + { + "epoch": 2.653599346633998, + "grad_norm": 1.0629644393920898, + "learning_rate": 9.424398584172393e-05, + "loss": 2.0423, + "step": 22744 + }, + { + "epoch": 2.65371601913429, + "grad_norm": 1.2048193216323853, + "learning_rate": 9.423003522280349e-05, + "loss": 1.9832, + "step": 22745 + }, + { + "epoch": 2.6538326916345816, + "grad_norm": 1.0979423522949219, + "learning_rate": 9.42160851975946e-05, + "loss": 2.0874, + "step": 22746 + }, + { + "epoch": 2.6539493641348733, + "grad_norm": 0.9950255751609802, + "learning_rate": 9.420213576624207e-05, + "loss": 1.9219, + "step": 22747 + }, + { + "epoch": 2.654066036635165, + "grad_norm": 1.1314549446105957, + "learning_rate": 9.418818692889031e-05, + "loss": 1.8315, + "step": 22748 + }, + { + "epoch": 2.6541827091354566, + "grad_norm": 0.8650663495063782, + "learning_rate": 9.417423868568403e-05, + "loss": 1.5727, + "step": 22749 + }, + { + "epoch": 2.6542993816357483, + "grad_norm": 1.2324202060699463, + "learning_rate": 9.416029103676788e-05, + "loss": 2.0386, + "step": 22750 + }, + { + "epoch": 2.65441605413604, + "grad_norm": 1.1266279220581055, + "learning_rate": 9.414634398228636e-05, + "loss": 2.1746, + "step": 22751 + }, + { + "epoch": 2.6545327266363317, + "grad_norm": 1.2028090953826904, + "learning_rate": 9.413239752238414e-05, + "loss": 1.9769, + "step": 22752 + }, + { + "epoch": 2.6546493991366233, + "grad_norm": 1.1246334314346313, + "learning_rate": 9.411845165720568e-05, + "loss": 1.9452, + "step": 22753 + }, + { + "epoch": 2.654766071636915, + "grad_norm": 1.1039388179779053, + "learning_rate": 9.410450638689568e-05, + "loss": 1.8889, + "step": 22754 + }, + { + "epoch": 2.6548827441372067, + "grad_norm": 1.4451361894607544, + "learning_rate": 9.409056171159862e-05, + "loss": 2.1894, + "step": 22755 + }, + { + "epoch": 2.6549994166374984, + "grad_norm": 1.1419728994369507, + "learning_rate": 9.407661763145915e-05, + "loss": 2.0479, + "step": 22756 + }, + { + "epoch": 2.65511608913779, + "grad_norm": 1.1532087326049805, + "learning_rate": 9.406267414662171e-05, + "loss": 1.8884, + "step": 22757 + }, + { + "epoch": 2.6552327616380818, + "grad_norm": 1.1624269485473633, + "learning_rate": 9.4048731257231e-05, + "loss": 1.8197, + "step": 22758 + }, + { + "epoch": 2.6553494341383734, + "grad_norm": 1.118029236793518, + "learning_rate": 9.403478896343143e-05, + "loss": 1.9133, + "step": 22759 + }, + { + "epoch": 2.655466106638665, + "grad_norm": 1.2400281429290771, + "learning_rate": 9.402084726536759e-05, + "loss": 1.9125, + "step": 22760 + }, + { + "epoch": 2.655582779138957, + "grad_norm": 1.12067711353302, + "learning_rate": 9.400690616318406e-05, + "loss": 1.9247, + "step": 22761 + }, + { + "epoch": 2.6556994516392485, + "grad_norm": 1.212530255317688, + "learning_rate": 9.399296565702524e-05, + "loss": 1.9809, + "step": 22762 + }, + { + "epoch": 2.65581612413954, + "grad_norm": 1.1300773620605469, + "learning_rate": 9.39790257470358e-05, + "loss": 1.911, + "step": 22763 + }, + { + "epoch": 2.655932796639832, + "grad_norm": 1.1967815160751343, + "learning_rate": 9.396508643336014e-05, + "loss": 2.0065, + "step": 22764 + }, + { + "epoch": 2.6560494691401235, + "grad_norm": 1.165776014328003, + "learning_rate": 9.395114771614286e-05, + "loss": 2.0181, + "step": 22765 + }, + { + "epoch": 2.656166141640415, + "grad_norm": 1.316367506980896, + "learning_rate": 9.393720959552838e-05, + "loss": 1.9968, + "step": 22766 + }, + { + "epoch": 2.656282814140707, + "grad_norm": 1.1182610988616943, + "learning_rate": 9.392327207166125e-05, + "loss": 1.902, + "step": 22767 + }, + { + "epoch": 2.6563994866409986, + "grad_norm": 1.0937069654464722, + "learning_rate": 9.390933514468587e-05, + "loss": 1.8635, + "step": 22768 + }, + { + "epoch": 2.6565161591412902, + "grad_norm": 1.3048380613327026, + "learning_rate": 9.389539881474687e-05, + "loss": 2.0796, + "step": 22769 + }, + { + "epoch": 2.656632831641582, + "grad_norm": 1.1660712957382202, + "learning_rate": 9.388146308198859e-05, + "loss": 1.993, + "step": 22770 + }, + { + "epoch": 2.6567495041418736, + "grad_norm": 1.2674174308776855, + "learning_rate": 9.386752794655559e-05, + "loss": 1.848, + "step": 22771 + }, + { + "epoch": 2.6568661766421653, + "grad_norm": 1.1701606512069702, + "learning_rate": 9.385359340859225e-05, + "loss": 1.9757, + "step": 22772 + }, + { + "epoch": 2.656982849142457, + "grad_norm": 1.3882673978805542, + "learning_rate": 9.383965946824314e-05, + "loss": 2.0128, + "step": 22773 + }, + { + "epoch": 2.6570995216427487, + "grad_norm": 0.9386249780654907, + "learning_rate": 9.382572612565258e-05, + "loss": 1.8959, + "step": 22774 + }, + { + "epoch": 2.6572161941430403, + "grad_norm": 1.0942450761795044, + "learning_rate": 9.381179338096512e-05, + "loss": 1.8408, + "step": 22775 + }, + { + "epoch": 2.657332866643332, + "grad_norm": 1.221885323524475, + "learning_rate": 9.37978612343252e-05, + "loss": 1.9868, + "step": 22776 + }, + { + "epoch": 2.6574495391436237, + "grad_norm": 1.1314873695373535, + "learning_rate": 9.378392968587714e-05, + "loss": 1.9996, + "step": 22777 + }, + { + "epoch": 2.6575662116439154, + "grad_norm": 1.0565739870071411, + "learning_rate": 9.376999873576552e-05, + "loss": 1.9664, + "step": 22778 + }, + { + "epoch": 2.657682884144207, + "grad_norm": 1.0309218168258667, + "learning_rate": 9.375606838413462e-05, + "loss": 1.8844, + "step": 22779 + }, + { + "epoch": 2.6577995566444987, + "grad_norm": 1.084460973739624, + "learning_rate": 9.374213863112897e-05, + "loss": 1.6964, + "step": 22780 + }, + { + "epoch": 2.6579162291447904, + "grad_norm": 1.1541614532470703, + "learning_rate": 9.372820947689288e-05, + "loss": 1.7892, + "step": 22781 + }, + { + "epoch": 2.658032901645082, + "grad_norm": 0.9709449410438538, + "learning_rate": 9.371428092157084e-05, + "loss": 2.0936, + "step": 22782 + }, + { + "epoch": 2.658149574145374, + "grad_norm": 1.1086978912353516, + "learning_rate": 9.370035296530716e-05, + "loss": 1.9724, + "step": 22783 + }, + { + "epoch": 2.6582662466456655, + "grad_norm": 1.0839446783065796, + "learning_rate": 9.368642560824632e-05, + "loss": 1.8955, + "step": 22784 + }, + { + "epoch": 2.658382919145957, + "grad_norm": 1.0197827816009521, + "learning_rate": 9.367249885053264e-05, + "loss": 2.0557, + "step": 22785 + }, + { + "epoch": 2.658499591646249, + "grad_norm": 1.2113829851150513, + "learning_rate": 9.365857269231054e-05, + "loss": 1.9336, + "step": 22786 + }, + { + "epoch": 2.6586162641465405, + "grad_norm": 1.2064476013183594, + "learning_rate": 9.36446471337243e-05, + "loss": 1.9642, + "step": 22787 + }, + { + "epoch": 2.658732936646832, + "grad_norm": 1.0978713035583496, + "learning_rate": 9.363072217491836e-05, + "loss": 2.0319, + "step": 22788 + }, + { + "epoch": 2.658849609147124, + "grad_norm": 1.1112184524536133, + "learning_rate": 9.361679781603712e-05, + "loss": 1.863, + "step": 22789 + }, + { + "epoch": 2.6589662816474156, + "grad_norm": 1.085357427597046, + "learning_rate": 9.360287405722483e-05, + "loss": 1.9741, + "step": 22790 + }, + { + "epoch": 2.6590829541477072, + "grad_norm": 1.1855138540267944, + "learning_rate": 9.358895089862594e-05, + "loss": 2.0688, + "step": 22791 + }, + { + "epoch": 2.659199626647999, + "grad_norm": 1.2183700799942017, + "learning_rate": 9.35750283403847e-05, + "loss": 2.0055, + "step": 22792 + }, + { + "epoch": 2.6593162991482906, + "grad_norm": 1.2150331735610962, + "learning_rate": 9.356110638264551e-05, + "loss": 2.1612, + "step": 22793 + }, + { + "epoch": 2.6594329716485823, + "grad_norm": 1.1823194026947021, + "learning_rate": 9.354718502555266e-05, + "loss": 2.0889, + "step": 22794 + }, + { + "epoch": 2.659549644148874, + "grad_norm": 1.1416410207748413, + "learning_rate": 9.353326426925049e-05, + "loss": 2.0525, + "step": 22795 + }, + { + "epoch": 2.6596663166491656, + "grad_norm": 1.0403616428375244, + "learning_rate": 9.351934411388325e-05, + "loss": 2.1209, + "step": 22796 + }, + { + "epoch": 2.6597829891494573, + "grad_norm": 1.05634605884552, + "learning_rate": 9.350542455959539e-05, + "loss": 1.9493, + "step": 22797 + }, + { + "epoch": 2.659899661649749, + "grad_norm": 1.1469110250473022, + "learning_rate": 9.349150560653104e-05, + "loss": 1.7696, + "step": 22798 + }, + { + "epoch": 2.6600163341500407, + "grad_norm": 1.215779185295105, + "learning_rate": 9.347758725483464e-05, + "loss": 2.0728, + "step": 22799 + }, + { + "epoch": 2.6601330066503324, + "grad_norm": 0.9885889887809753, + "learning_rate": 9.346366950465038e-05, + "loss": 1.8235, + "step": 22800 + }, + { + "epoch": 2.660249679150624, + "grad_norm": 1.0887409448623657, + "learning_rate": 9.344975235612258e-05, + "loss": 2.0834, + "step": 22801 + }, + { + "epoch": 2.6603663516509157, + "grad_norm": 1.3192335367202759, + "learning_rate": 9.343583580939559e-05, + "loss": 2.1928, + "step": 22802 + }, + { + "epoch": 2.6604830241512074, + "grad_norm": 0.9953429102897644, + "learning_rate": 9.342191986461357e-05, + "loss": 1.9761, + "step": 22803 + }, + { + "epoch": 2.660599696651499, + "grad_norm": 1.1554651260375977, + "learning_rate": 9.340800452192084e-05, + "loss": 1.9032, + "step": 22804 + }, + { + "epoch": 2.6607163691517908, + "grad_norm": 1.246393084526062, + "learning_rate": 9.339408978146162e-05, + "loss": 2.1344, + "step": 22805 + }, + { + "epoch": 2.6608330416520825, + "grad_norm": 1.0463783740997314, + "learning_rate": 9.338017564338026e-05, + "loss": 1.8624, + "step": 22806 + }, + { + "epoch": 2.660949714152374, + "grad_norm": 1.189658522605896, + "learning_rate": 9.336626210782085e-05, + "loss": 1.9351, + "step": 22807 + }, + { + "epoch": 2.661066386652666, + "grad_norm": 1.1852835416793823, + "learning_rate": 9.335234917492782e-05, + "loss": 1.9218, + "step": 22808 + }, + { + "epoch": 2.6611830591529575, + "grad_norm": 0.9463725686073303, + "learning_rate": 9.33384368448452e-05, + "loss": 1.8446, + "step": 22809 + }, + { + "epoch": 2.661299731653249, + "grad_norm": 1.2500998973846436, + "learning_rate": 9.33245251177174e-05, + "loss": 2.0107, + "step": 22810 + }, + { + "epoch": 2.661416404153541, + "grad_norm": 1.0072169303894043, + "learning_rate": 9.331061399368855e-05, + "loss": 1.9895, + "step": 22811 + }, + { + "epoch": 2.6615330766538325, + "grad_norm": 1.0373464822769165, + "learning_rate": 9.329670347290288e-05, + "loss": 1.8285, + "step": 22812 + }, + { + "epoch": 2.6616497491541242, + "grad_norm": 1.1453007459640503, + "learning_rate": 9.328279355550453e-05, + "loss": 2.013, + "step": 22813 + }, + { + "epoch": 2.661766421654416, + "grad_norm": 1.0951800346374512, + "learning_rate": 9.326888424163777e-05, + "loss": 1.9002, + "step": 22814 + }, + { + "epoch": 2.6618830941547076, + "grad_norm": 1.0162444114685059, + "learning_rate": 9.325497553144688e-05, + "loss": 1.9317, + "step": 22815 + }, + { + "epoch": 2.6619997666549993, + "grad_norm": 1.1591527462005615, + "learning_rate": 9.324106742507588e-05, + "loss": 2.0385, + "step": 22816 + }, + { + "epoch": 2.662116439155291, + "grad_norm": 1.344846248626709, + "learning_rate": 9.322715992266912e-05, + "loss": 1.9377, + "step": 22817 + }, + { + "epoch": 2.6622331116555826, + "grad_norm": 1.030730128288269, + "learning_rate": 9.321325302437062e-05, + "loss": 1.9084, + "step": 22818 + }, + { + "epoch": 2.6623497841558743, + "grad_norm": 1.1277556419372559, + "learning_rate": 9.31993467303247e-05, + "loss": 1.998, + "step": 22819 + }, + { + "epoch": 2.662466456656166, + "grad_norm": 1.1519635915756226, + "learning_rate": 9.318544104067539e-05, + "loss": 2.0333, + "step": 22820 + }, + { + "epoch": 2.6625831291564577, + "grad_norm": 1.1748424768447876, + "learning_rate": 9.317153595556697e-05, + "loss": 1.9804, + "step": 22821 + }, + { + "epoch": 2.6626998016567494, + "grad_norm": 1.2365453243255615, + "learning_rate": 9.315763147514347e-05, + "loss": 2.1293, + "step": 22822 + }, + { + "epoch": 2.662816474157041, + "grad_norm": 1.1090556383132935, + "learning_rate": 9.314372759954916e-05, + "loss": 1.9116, + "step": 22823 + }, + { + "epoch": 2.6629331466573327, + "grad_norm": 1.156952977180481, + "learning_rate": 9.312982432892806e-05, + "loss": 1.9269, + "step": 22824 + }, + { + "epoch": 2.6630498191576244, + "grad_norm": 1.133316159248352, + "learning_rate": 9.31159216634244e-05, + "loss": 2.0161, + "step": 22825 + }, + { + "epoch": 2.663166491657916, + "grad_norm": 1.0779919624328613, + "learning_rate": 9.310201960318225e-05, + "loss": 1.8146, + "step": 22826 + }, + { + "epoch": 2.6632831641582078, + "grad_norm": 1.1194853782653809, + "learning_rate": 9.308811814834576e-05, + "loss": 1.9751, + "step": 22827 + }, + { + "epoch": 2.6633998366584994, + "grad_norm": 1.0934016704559326, + "learning_rate": 9.307421729905907e-05, + "loss": 1.9868, + "step": 22828 + }, + { + "epoch": 2.663516509158791, + "grad_norm": 1.1549979448318481, + "learning_rate": 9.30603170554662e-05, + "loss": 2.12, + "step": 22829 + }, + { + "epoch": 2.663633181659083, + "grad_norm": 1.1130701303482056, + "learning_rate": 9.304641741771137e-05, + "loss": 1.9621, + "step": 22830 + }, + { + "epoch": 2.6637498541593745, + "grad_norm": 1.1733900308609009, + "learning_rate": 9.303251838593856e-05, + "loss": 2.1761, + "step": 22831 + }, + { + "epoch": 2.663866526659666, + "grad_norm": 1.2691704034805298, + "learning_rate": 9.301861996029196e-05, + "loss": 1.8502, + "step": 22832 + }, + { + "epoch": 2.663983199159958, + "grad_norm": 1.0282700061798096, + "learning_rate": 9.300472214091555e-05, + "loss": 2.076, + "step": 22833 + }, + { + "epoch": 2.6640998716602495, + "grad_norm": 1.0720570087432861, + "learning_rate": 9.299082492795355e-05, + "loss": 2.0501, + "step": 22834 + }, + { + "epoch": 2.664216544160541, + "grad_norm": 1.1144251823425293, + "learning_rate": 9.297692832154985e-05, + "loss": 1.8946, + "step": 22835 + }, + { + "epoch": 2.664333216660833, + "grad_norm": 1.034591555595398, + "learning_rate": 9.296303232184869e-05, + "loss": 2.1115, + "step": 22836 + }, + { + "epoch": 2.664333216660833, + "eval_train_loss": 1.8739230632781982, + "eval_train_mean_batch_perplexity": 7.4251804022732815, + "eval_train_runtime": 11051.7357, + "eval_train_samples_per_second": 12.409, + "eval_train_steps_per_second": 0.776, + "step": 22836 + }, + { + "epoch": 2.664333216660833, + "eval_test_loss": 2.0527446269989014, + "eval_test_mean_batch_perplexity": 9.042580250223313, + "eval_test_runtime": 2385.0361, + "eval_test_samples_per_second": 12.321, + "eval_test_steps_per_second": 0.77, + "step": 22836 + }, + { + "epoch": 2.664333216660833, + "eval_train_loss": 1.8739230632781982, + "eval_train_mean_batch_perplexity": 7.4251804022732815, + "eval_train_runtime": 11121.9859, + "eval_train_samples_per_second": 12.33, + "eval_train_steps_per_second": 0.771, + "step": 22836 + }, + { + "epoch": 2.664333216660833, + "eval_test_loss": 2.0527446269989014, + "eval_test_mean_batch_perplexity": 9.042580250223313, + "eval_test_runtime": 2398.8096, + "eval_test_samples_per_second": 12.251, + "eval_test_steps_per_second": 0.766, + "step": 22836 + }, + { + "epoch": 2.6644498891611246, + "grad_norm": 1.0341079235076904, + "learning_rate": 9.294913692899402e-05, + "loss": 1.8461, + "step": 22837 + }, + { + "epoch": 2.6645665616614163, + "grad_norm": 1.230709433555603, + "learning_rate": 9.293524214312993e-05, + "loss": 2.0847, + "step": 22838 + }, + { + "epoch": 2.664683234161708, + "grad_norm": 1.0225778818130493, + "learning_rate": 9.292134796440042e-05, + "loss": 2.0087, + "step": 22839 + }, + { + "epoch": 2.6647999066619996, + "grad_norm": 1.2495943307876587, + "learning_rate": 9.290745439294953e-05, + "loss": 2.0854, + "step": 22840 + }, + { + "epoch": 2.6649165791622913, + "grad_norm": 1.0112756490707397, + "learning_rate": 9.289356142892144e-05, + "loss": 1.8319, + "step": 22841 + }, + { + "epoch": 2.665033251662583, + "grad_norm": 1.2519408464431763, + "learning_rate": 9.287966907245998e-05, + "loss": 1.9856, + "step": 22842 + }, + { + "epoch": 2.6651499241628747, + "grad_norm": 1.015994668006897, + "learning_rate": 9.28657773237093e-05, + "loss": 1.8865, + "step": 22843 + }, + { + "epoch": 2.6652665966631663, + "grad_norm": 0.9624674320220947, + "learning_rate": 9.285188618281338e-05, + "loss": 1.7399, + "step": 22844 + }, + { + "epoch": 2.665383269163458, + "grad_norm": 1.1212252378463745, + "learning_rate": 9.28379956499162e-05, + "loss": 1.8891, + "step": 22845 + }, + { + "epoch": 2.6654999416637497, + "grad_norm": 1.0440994501113892, + "learning_rate": 9.28241057251618e-05, + "loss": 1.9393, + "step": 22846 + }, + { + "epoch": 2.6656166141640414, + "grad_norm": 1.2559810876846313, + "learning_rate": 9.281021640869416e-05, + "loss": 1.9934, + "step": 22847 + }, + { + "epoch": 2.665733286664333, + "grad_norm": 1.0950661897659302, + "learning_rate": 9.27963277006572e-05, + "loss": 1.9882, + "step": 22848 + }, + { + "epoch": 2.6658499591646247, + "grad_norm": 1.4490928649902344, + "learning_rate": 9.278243960119503e-05, + "loss": 2.0826, + "step": 22849 + }, + { + "epoch": 2.6659666316649164, + "grad_norm": 1.0373704433441162, + "learning_rate": 9.276855211045153e-05, + "loss": 1.8949, + "step": 22850 + }, + { + "epoch": 2.666083304165208, + "grad_norm": 1.1980493068695068, + "learning_rate": 9.275466522857073e-05, + "loss": 1.9962, + "step": 22851 + }, + { + "epoch": 2.6661999766655, + "grad_norm": 1.0266801118850708, + "learning_rate": 9.274077895569653e-05, + "loss": 1.9805, + "step": 22852 + }, + { + "epoch": 2.6663166491657915, + "grad_norm": 1.0529098510742188, + "learning_rate": 9.272689329197295e-05, + "loss": 1.7128, + "step": 22853 + }, + { + "epoch": 2.666433321666083, + "grad_norm": 1.1749534606933594, + "learning_rate": 9.271300823754395e-05, + "loss": 1.968, + "step": 22854 + }, + { + "epoch": 2.666549994166375, + "grad_norm": 1.1319599151611328, + "learning_rate": 9.269912379255337e-05, + "loss": 1.9305, + "step": 22855 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.0720030069351196, + "learning_rate": 9.268523995714531e-05, + "loss": 1.8881, + "step": 22856 + }, + { + "epoch": 2.666783339166958, + "grad_norm": 1.0373871326446533, + "learning_rate": 9.267135673146354e-05, + "loss": 1.9197, + "step": 22857 + }, + { + "epoch": 2.66690001166725, + "grad_norm": 1.1636219024658203, + "learning_rate": 9.26574741156521e-05, + "loss": 1.9454, + "step": 22858 + }, + { + "epoch": 2.6670166841675416, + "grad_norm": 1.1083369255065918, + "learning_rate": 9.264359210985483e-05, + "loss": 2.0629, + "step": 22859 + }, + { + "epoch": 2.6671333566678332, + "grad_norm": 1.125588297843933, + "learning_rate": 9.262971071421572e-05, + "loss": 1.8842, + "step": 22860 + }, + { + "epoch": 2.667250029168125, + "grad_norm": 1.1145384311676025, + "learning_rate": 9.261582992887863e-05, + "loss": 1.9305, + "step": 22861 + }, + { + "epoch": 2.6673667016684166, + "grad_norm": 1.1861354112625122, + "learning_rate": 9.260194975398747e-05, + "loss": 1.9577, + "step": 22862 + }, + { + "epoch": 2.6674833741687083, + "grad_norm": 1.2439793348312378, + "learning_rate": 9.258807018968612e-05, + "loss": 2.1069, + "step": 22863 + }, + { + "epoch": 2.667600046669, + "grad_norm": 1.079526424407959, + "learning_rate": 9.25741912361185e-05, + "loss": 1.8537, + "step": 22864 + }, + { + "epoch": 2.6677167191692917, + "grad_norm": 1.4413366317749023, + "learning_rate": 9.256031289342843e-05, + "loss": 2.0083, + "step": 22865 + }, + { + "epoch": 2.6678333916695833, + "grad_norm": 1.116979956626892, + "learning_rate": 9.254643516175982e-05, + "loss": 2.0077, + "step": 22866 + }, + { + "epoch": 2.667950064169875, + "grad_norm": 1.0525461435317993, + "learning_rate": 9.253255804125662e-05, + "loss": 1.7846, + "step": 22867 + }, + { + "epoch": 2.6680667366701667, + "grad_norm": 1.2532159090042114, + "learning_rate": 9.251868153206255e-05, + "loss": 2.0537, + "step": 22868 + }, + { + "epoch": 2.6681834091704584, + "grad_norm": 1.1629968881607056, + "learning_rate": 9.250480563432159e-05, + "loss": 1.819, + "step": 22869 + }, + { + "epoch": 2.66830008167075, + "grad_norm": 1.238107681274414, + "learning_rate": 9.249093034817753e-05, + "loss": 1.9762, + "step": 22870 + }, + { + "epoch": 2.6684167541710417, + "grad_norm": 1.0685969591140747, + "learning_rate": 9.247705567377424e-05, + "loss": 2.0052, + "step": 22871 + }, + { + "epoch": 2.6685334266713334, + "grad_norm": 1.051665186882019, + "learning_rate": 9.246318161125547e-05, + "loss": 1.8236, + "step": 22872 + }, + { + "epoch": 2.668650099171625, + "grad_norm": 1.0901213884353638, + "learning_rate": 9.244930816076518e-05, + "loss": 1.88, + "step": 22873 + }, + { + "epoch": 2.668766771671917, + "grad_norm": 1.4080132246017456, + "learning_rate": 9.24354353224471e-05, + "loss": 2.2473, + "step": 22874 + }, + { + "epoch": 2.6688834441722085, + "grad_norm": 1.111063838005066, + "learning_rate": 9.24215630964451e-05, + "loss": 1.6761, + "step": 22875 + }, + { + "epoch": 2.6690001166725, + "grad_norm": 1.2549152374267578, + "learning_rate": 9.240769148290292e-05, + "loss": 2.0415, + "step": 22876 + }, + { + "epoch": 2.669116789172792, + "grad_norm": 1.2409898042678833, + "learning_rate": 9.23938204819645e-05, + "loss": 1.9917, + "step": 22877 + }, + { + "epoch": 2.6692334616730835, + "grad_norm": 1.2608201503753662, + "learning_rate": 9.237995009377348e-05, + "loss": 1.9684, + "step": 22878 + }, + { + "epoch": 2.669350134173375, + "grad_norm": 1.026747226715088, + "learning_rate": 9.236608031847374e-05, + "loss": 1.9235, + "step": 22879 + }, + { + "epoch": 2.669466806673667, + "grad_norm": 1.0809615850448608, + "learning_rate": 9.235221115620912e-05, + "loss": 2.0408, + "step": 22880 + }, + { + "epoch": 2.6695834791739586, + "grad_norm": 1.0184686183929443, + "learning_rate": 9.233834260712325e-05, + "loss": 2.0284, + "step": 22881 + }, + { + "epoch": 2.6697001516742502, + "grad_norm": 0.9707874059677124, + "learning_rate": 9.232447467136007e-05, + "loss": 1.8354, + "step": 22882 + }, + { + "epoch": 2.669816824174542, + "grad_norm": 1.1542221307754517, + "learning_rate": 9.231060734906322e-05, + "loss": 2.0248, + "step": 22883 + }, + { + "epoch": 2.6699334966748336, + "grad_norm": 1.1511973142623901, + "learning_rate": 9.229674064037656e-05, + "loss": 2.0932, + "step": 22884 + }, + { + "epoch": 2.6700501691751253, + "grad_norm": 1.3648788928985596, + "learning_rate": 9.228287454544371e-05, + "loss": 2.1089, + "step": 22885 + }, + { + "epoch": 2.670166841675417, + "grad_norm": 1.220021367073059, + "learning_rate": 9.226900906440859e-05, + "loss": 1.9174, + "step": 22886 + }, + { + "epoch": 2.6702835141757086, + "grad_norm": 0.995465874671936, + "learning_rate": 9.225514419741481e-05, + "loss": 1.9879, + "step": 22887 + }, + { + "epoch": 2.6704001866760003, + "grad_norm": 1.0934412479400635, + "learning_rate": 9.224127994460617e-05, + "loss": 1.8345, + "step": 22888 + }, + { + "epoch": 2.670516859176292, + "grad_norm": 1.1406190395355225, + "learning_rate": 9.222741630612637e-05, + "loss": 1.8467, + "step": 22889 + }, + { + "epoch": 2.6706335316765837, + "grad_norm": 1.0028992891311646, + "learning_rate": 9.221355328211916e-05, + "loss": 1.8802, + "step": 22890 + }, + { + "epoch": 2.6707502041768754, + "grad_norm": 1.1319832801818848, + "learning_rate": 9.219969087272817e-05, + "loss": 1.9578, + "step": 22891 + }, + { + "epoch": 2.670866876677167, + "grad_norm": 1.0113823413848877, + "learning_rate": 9.218582907809719e-05, + "loss": 1.9986, + "step": 22892 + }, + { + "epoch": 2.6709835491774587, + "grad_norm": 0.9563696384429932, + "learning_rate": 9.217196789836995e-05, + "loss": 1.7254, + "step": 22893 + }, + { + "epoch": 2.6711002216777504, + "grad_norm": 1.17685866355896, + "learning_rate": 9.215810733369011e-05, + "loss": 2.0266, + "step": 22894 + }, + { + "epoch": 2.671216894178042, + "grad_norm": 1.2406468391418457, + "learning_rate": 9.214424738420138e-05, + "loss": 2.0508, + "step": 22895 + }, + { + "epoch": 2.6713335666783338, + "grad_norm": 1.285057544708252, + "learning_rate": 9.213038805004738e-05, + "loss": 2.0457, + "step": 22896 + }, + { + "epoch": 2.6714502391786255, + "grad_norm": 1.0280351638793945, + "learning_rate": 9.211652933137189e-05, + "loss": 1.9254, + "step": 22897 + }, + { + "epoch": 2.671566911678917, + "grad_norm": 1.141470193862915, + "learning_rate": 9.210267122831844e-05, + "loss": 1.9876, + "step": 22898 + }, + { + "epoch": 2.671683584179209, + "grad_norm": 1.1301385164260864, + "learning_rate": 9.208881374103089e-05, + "loss": 1.9077, + "step": 22899 + }, + { + "epoch": 2.6718002566795005, + "grad_norm": 1.146004557609558, + "learning_rate": 9.207495686965269e-05, + "loss": 2.0707, + "step": 22900 + }, + { + "epoch": 2.671916929179792, + "grad_norm": 1.2693405151367188, + "learning_rate": 9.206110061432766e-05, + "loss": 1.9918, + "step": 22901 + }, + { + "epoch": 2.672033601680084, + "grad_norm": 1.1507753133773804, + "learning_rate": 9.204724497519931e-05, + "loss": 1.9413, + "step": 22902 + }, + { + "epoch": 2.6721502741803755, + "grad_norm": 1.1020572185516357, + "learning_rate": 9.203338995241142e-05, + "loss": 1.9108, + "step": 22903 + }, + { + "epoch": 2.672266946680667, + "grad_norm": 1.0888932943344116, + "learning_rate": 9.201953554610749e-05, + "loss": 1.9052, + "step": 22904 + }, + { + "epoch": 2.672383619180959, + "grad_norm": 1.0477046966552734, + "learning_rate": 9.200568175643125e-05, + "loss": 1.8271, + "step": 22905 + }, + { + "epoch": 2.6725002916812506, + "grad_norm": 1.2796704769134521, + "learning_rate": 9.19918285835263e-05, + "loss": 2.0664, + "step": 22906 + }, + { + "epoch": 2.6726169641815423, + "grad_norm": 1.2184326648712158, + "learning_rate": 9.197797602753618e-05, + "loss": 2.1353, + "step": 22907 + }, + { + "epoch": 2.672733636681834, + "grad_norm": 1.1728835105895996, + "learning_rate": 9.19641240886046e-05, + "loss": 1.9907, + "step": 22908 + }, + { + "epoch": 2.6728503091821256, + "grad_norm": 1.1332896947860718, + "learning_rate": 9.195027276687506e-05, + "loss": 1.8314, + "step": 22909 + }, + { + "epoch": 2.6729669816824173, + "grad_norm": 1.1853492259979248, + "learning_rate": 9.193642206249128e-05, + "loss": 2.2041, + "step": 22910 + }, + { + "epoch": 2.673083654182709, + "grad_norm": 1.2108575105667114, + "learning_rate": 9.192257197559669e-05, + "loss": 2.0098, + "step": 22911 + }, + { + "epoch": 2.6732003266830007, + "grad_norm": 1.1090043783187866, + "learning_rate": 9.190872250633506e-05, + "loss": 1.9274, + "step": 22912 + }, + { + "epoch": 2.6733169991832924, + "grad_norm": 1.047053337097168, + "learning_rate": 9.189487365484982e-05, + "loss": 1.6535, + "step": 22913 + }, + { + "epoch": 2.673433671683584, + "grad_norm": 1.1789323091506958, + "learning_rate": 9.188102542128457e-05, + "loss": 2.0758, + "step": 22914 + }, + { + "epoch": 2.6735503441838757, + "grad_norm": 1.0601305961608887, + "learning_rate": 9.18671778057829e-05, + "loss": 1.8841, + "step": 22915 + }, + { + "epoch": 2.6736670166841674, + "grad_norm": 1.4042290449142456, + "learning_rate": 9.18533308084884e-05, + "loss": 2.0313, + "step": 22916 + }, + { + "epoch": 2.673783689184459, + "grad_norm": 1.0343730449676514, + "learning_rate": 9.18394844295445e-05, + "loss": 1.9673, + "step": 22917 + }, + { + "epoch": 2.6739003616847508, + "grad_norm": 1.1506084203720093, + "learning_rate": 9.182563866909483e-05, + "loss": 1.8387, + "step": 22918 + }, + { + "epoch": 2.6740170341850424, + "grad_norm": 1.233339548110962, + "learning_rate": 9.181179352728298e-05, + "loss": 1.868, + "step": 22919 + }, + { + "epoch": 2.674133706685334, + "grad_norm": 1.189246416091919, + "learning_rate": 9.179794900425235e-05, + "loss": 2.0023, + "step": 22920 + }, + { + "epoch": 2.674250379185626, + "grad_norm": 1.1330959796905518, + "learning_rate": 9.17841051001466e-05, + "loss": 1.9744, + "step": 22921 + }, + { + "epoch": 2.6743670516859175, + "grad_norm": 1.0328727960586548, + "learning_rate": 9.177026181510911e-05, + "loss": 1.8195, + "step": 22922 + }, + { + "epoch": 2.674483724186209, + "grad_norm": 1.039618968963623, + "learning_rate": 9.175641914928357e-05, + "loss": 1.941, + "step": 22923 + }, + { + "epoch": 2.674600396686501, + "grad_norm": 1.097070574760437, + "learning_rate": 9.174257710281328e-05, + "loss": 1.9778, + "step": 22924 + }, + { + "epoch": 2.6747170691867925, + "grad_norm": 1.0584503412246704, + "learning_rate": 9.17287356758419e-05, + "loss": 1.9185, + "step": 22925 + }, + { + "epoch": 2.674833741687084, + "grad_norm": 1.2356899976730347, + "learning_rate": 9.171489486851278e-05, + "loss": 1.9663, + "step": 22926 + }, + { + "epoch": 2.674950414187376, + "grad_norm": 1.2496342658996582, + "learning_rate": 9.17010546809696e-05, + "loss": 1.994, + "step": 22927 + }, + { + "epoch": 2.6750670866876676, + "grad_norm": 1.1674779653549194, + "learning_rate": 9.168721511335561e-05, + "loss": 2.0232, + "step": 22928 + }, + { + "epoch": 2.6751837591879593, + "grad_norm": 1.1677614450454712, + "learning_rate": 9.167337616581451e-05, + "loss": 2.076, + "step": 22929 + }, + { + "epoch": 2.675300431688251, + "grad_norm": 1.2501444816589355, + "learning_rate": 9.165953783848952e-05, + "loss": 1.9383, + "step": 22930 + }, + { + "epoch": 2.6754171041885426, + "grad_norm": 1.0281176567077637, + "learning_rate": 9.164570013152435e-05, + "loss": 1.9167, + "step": 22931 + }, + { + "epoch": 2.6755337766888343, + "grad_norm": 1.1626319885253906, + "learning_rate": 9.163186304506233e-05, + "loss": 1.9256, + "step": 22932 + }, + { + "epoch": 2.675650449189126, + "grad_norm": 1.0062040090560913, + "learning_rate": 9.161802657924686e-05, + "loss": 1.8712, + "step": 22933 + }, + { + "epoch": 2.6757671216894177, + "grad_norm": 1.2163041830062866, + "learning_rate": 9.160419073422152e-05, + "loss": 1.8164, + "step": 22934 + }, + { + "epoch": 2.6758837941897093, + "grad_norm": 1.0986045598983765, + "learning_rate": 9.15903555101296e-05, + "loss": 2.0053, + "step": 22935 + }, + { + "epoch": 2.676000466690001, + "grad_norm": 1.0725183486938477, + "learning_rate": 9.157652090711465e-05, + "loss": 1.8532, + "step": 22936 + }, + { + "epoch": 2.6761171391902927, + "grad_norm": 1.0717862844467163, + "learning_rate": 9.156268692531997e-05, + "loss": 1.9096, + "step": 22937 + }, + { + "epoch": 2.6762338116905844, + "grad_norm": 1.1324604749679565, + "learning_rate": 9.154885356488911e-05, + "loss": 1.9862, + "step": 22938 + }, + { + "epoch": 2.676350484190876, + "grad_norm": 1.1096733808517456, + "learning_rate": 9.153502082596537e-05, + "loss": 2.0498, + "step": 22939 + }, + { + "epoch": 2.6764671566911677, + "grad_norm": 1.0981146097183228, + "learning_rate": 9.152118870869218e-05, + "loss": 2.046, + "step": 22940 + }, + { + "epoch": 2.6765838291914594, + "grad_norm": 1.0227431058883667, + "learning_rate": 9.150735721321298e-05, + "loss": 2.0339, + "step": 22941 + }, + { + "epoch": 2.676700501691751, + "grad_norm": 1.1984119415283203, + "learning_rate": 9.149352633967115e-05, + "loss": 2.0714, + "step": 22942 + }, + { + "epoch": 2.676817174192043, + "grad_norm": 1.3471810817718506, + "learning_rate": 9.147969608820998e-05, + "loss": 2.156, + "step": 22943 + }, + { + "epoch": 2.6769338466923345, + "grad_norm": 1.309921145439148, + "learning_rate": 9.146586645897291e-05, + "loss": 2.0613, + "step": 22944 + }, + { + "epoch": 2.677050519192626, + "grad_norm": 1.044560432434082, + "learning_rate": 9.14520374521034e-05, + "loss": 1.9496, + "step": 22945 + }, + { + "epoch": 2.677167191692918, + "grad_norm": 1.0690417289733887, + "learning_rate": 9.143820906774468e-05, + "loss": 2.0315, + "step": 22946 + }, + { + "epoch": 2.6772838641932095, + "grad_norm": 1.140200138092041, + "learning_rate": 9.142438130604017e-05, + "loss": 1.9768, + "step": 22947 + }, + { + "epoch": 2.677400536693501, + "grad_norm": 1.266600489616394, + "learning_rate": 9.14105541671332e-05, + "loss": 1.8719, + "step": 22948 + }, + { + "epoch": 2.677517209193793, + "grad_norm": 1.426034688949585, + "learning_rate": 9.139672765116718e-05, + "loss": 2.0795, + "step": 22949 + }, + { + "epoch": 2.6776338816940846, + "grad_norm": 1.2447969913482666, + "learning_rate": 9.138290175828531e-05, + "loss": 2.0302, + "step": 22950 + }, + { + "epoch": 2.6777505541943762, + "grad_norm": 0.9983821511268616, + "learning_rate": 9.136907648863106e-05, + "loss": 1.9793, + "step": 22951 + }, + { + "epoch": 2.677867226694668, + "grad_norm": 1.045118808746338, + "learning_rate": 9.135525184234764e-05, + "loss": 1.9323, + "step": 22952 + }, + { + "epoch": 2.6779838991949596, + "grad_norm": 0.9868763089179993, + "learning_rate": 9.134142781957853e-05, + "loss": 2.0006, + "step": 22953 + }, + { + "epoch": 2.6781005716952513, + "grad_norm": 0.9985110759735107, + "learning_rate": 9.132760442046684e-05, + "loss": 2.0175, + "step": 22954 + }, + { + "epoch": 2.678217244195543, + "grad_norm": 1.0564212799072266, + "learning_rate": 9.131378164515604e-05, + "loss": 1.9797, + "step": 22955 + }, + { + "epoch": 2.6783339166958346, + "grad_norm": 1.1944026947021484, + "learning_rate": 9.129995949378936e-05, + "loss": 1.8978, + "step": 22956 + }, + { + "epoch": 2.6784505891961263, + "grad_norm": 1.2163602113723755, + "learning_rate": 9.128613796651004e-05, + "loss": 1.9482, + "step": 22957 + }, + { + "epoch": 2.678567261696418, + "grad_norm": 0.8871884346008301, + "learning_rate": 9.12723170634615e-05, + "loss": 1.8242, + "step": 22958 + }, + { + "epoch": 2.6786839341967097, + "grad_norm": 1.0306984186172485, + "learning_rate": 9.125849678478686e-05, + "loss": 1.96, + "step": 22959 + }, + { + "epoch": 2.6788006066970014, + "grad_norm": 1.0609114170074463, + "learning_rate": 9.124467713062956e-05, + "loss": 1.9274, + "step": 22960 + }, + { + "epoch": 2.678917279197293, + "grad_norm": 1.152355432510376, + "learning_rate": 9.123085810113272e-05, + "loss": 2.0159, + "step": 22961 + }, + { + "epoch": 2.6790339516975847, + "grad_norm": 1.1336365938186646, + "learning_rate": 9.121703969643972e-05, + "loss": 2.0655, + "step": 22962 + }, + { + "epoch": 2.6791506241978764, + "grad_norm": 1.1897306442260742, + "learning_rate": 9.120322191669371e-05, + "loss": 1.8775, + "step": 22963 + }, + { + "epoch": 2.679267296698168, + "grad_norm": 1.236939549446106, + "learning_rate": 9.118940476203805e-05, + "loss": 1.9877, + "step": 22964 + }, + { + "epoch": 2.67938396919846, + "grad_norm": 1.245805025100708, + "learning_rate": 9.117558823261588e-05, + "loss": 1.9697, + "step": 22965 + }, + { + "epoch": 2.6795006416987515, + "grad_norm": 1.0910320281982422, + "learning_rate": 9.11617723285705e-05, + "loss": 1.8657, + "step": 22966 + }, + { + "epoch": 2.679617314199043, + "grad_norm": 1.1376008987426758, + "learning_rate": 9.114795705004502e-05, + "loss": 1.9199, + "step": 22967 + }, + { + "epoch": 2.679733986699335, + "grad_norm": 1.198392629623413, + "learning_rate": 9.113414239718286e-05, + "loss": 2.0163, + "step": 22968 + }, + { + "epoch": 2.6798506591996265, + "grad_norm": 1.182547688484192, + "learning_rate": 9.112032837012702e-05, + "loss": 1.9992, + "step": 22969 + }, + { + "epoch": 2.679967331699918, + "grad_norm": 0.9830119013786316, + "learning_rate": 9.11065149690209e-05, + "loss": 1.7863, + "step": 22970 + }, + { + "epoch": 2.68008400420021, + "grad_norm": 1.1158957481384277, + "learning_rate": 9.109270219400757e-05, + "loss": 1.8712, + "step": 22971 + }, + { + "epoch": 2.6802006767005015, + "grad_norm": 1.1414895057678223, + "learning_rate": 9.107889004523028e-05, + "loss": 1.8374, + "step": 22972 + }, + { + "epoch": 2.6803173492007932, + "grad_norm": 1.307469129562378, + "learning_rate": 9.106507852283221e-05, + "loss": 2.1777, + "step": 22973 + }, + { + "epoch": 2.680434021701085, + "grad_norm": 1.2269916534423828, + "learning_rate": 9.105126762695657e-05, + "loss": 1.9595, + "step": 22974 + }, + { + "epoch": 2.6805506942013766, + "grad_norm": 1.1312087774276733, + "learning_rate": 9.103745735774653e-05, + "loss": 2.0143, + "step": 22975 + }, + { + "epoch": 2.6806673667016683, + "grad_norm": 1.2243733406066895, + "learning_rate": 9.102364771534516e-05, + "loss": 2.0998, + "step": 22976 + }, + { + "epoch": 2.68078403920196, + "grad_norm": 1.3657892942428589, + "learning_rate": 9.100983869989577e-05, + "loss": 2.0065, + "step": 22977 + }, + { + "epoch": 2.6809007117022516, + "grad_norm": 1.1768454313278198, + "learning_rate": 9.09960303115414e-05, + "loss": 1.9078, + "step": 22978 + }, + { + "epoch": 2.6810173842025433, + "grad_norm": 1.0714621543884277, + "learning_rate": 9.09822225504253e-05, + "loss": 1.8676, + "step": 22979 + }, + { + "epoch": 2.681134056702835, + "grad_norm": 1.1540751457214355, + "learning_rate": 9.096841541669052e-05, + "loss": 2.012, + "step": 22980 + }, + { + "epoch": 2.6812507292031267, + "grad_norm": 1.1490379571914673, + "learning_rate": 9.095460891048028e-05, + "loss": 2.2253, + "step": 22981 + }, + { + "epoch": 2.6813674017034184, + "grad_norm": 1.0385189056396484, + "learning_rate": 9.094080303193765e-05, + "loss": 1.7988, + "step": 22982 + }, + { + "epoch": 2.68148407420371, + "grad_norm": 1.2289491891860962, + "learning_rate": 9.092699778120578e-05, + "loss": 2.0276, + "step": 22983 + }, + { + "epoch": 2.6816007467040017, + "grad_norm": 1.1140484809875488, + "learning_rate": 9.091319315842777e-05, + "loss": 1.948, + "step": 22984 + }, + { + "epoch": 2.6817174192042934, + "grad_norm": 1.245511770248413, + "learning_rate": 9.089938916374673e-05, + "loss": 1.9278, + "step": 22985 + }, + { + "epoch": 2.681834091704585, + "grad_norm": 1.1367709636688232, + "learning_rate": 9.088558579730583e-05, + "loss": 2.0617, + "step": 22986 + }, + { + "epoch": 2.6819507642048768, + "grad_norm": 1.284506916999817, + "learning_rate": 9.087178305924806e-05, + "loss": 2.045, + "step": 22987 + }, + { + "epoch": 2.6820674367051685, + "grad_norm": 1.1437408924102783, + "learning_rate": 9.085798094971661e-05, + "loss": 1.9009, + "step": 22988 + }, + { + "epoch": 2.68218410920546, + "grad_norm": 1.1230906248092651, + "learning_rate": 9.084417946885446e-05, + "loss": 2.0195, + "step": 22989 + }, + { + "epoch": 2.682300781705752, + "grad_norm": 1.005812168121338, + "learning_rate": 9.083037861680484e-05, + "loss": 1.7636, + "step": 22990 + }, + { + "epoch": 2.6824174542060435, + "grad_norm": 1.0563805103302002, + "learning_rate": 9.08165783937107e-05, + "loss": 2.0622, + "step": 22991 + }, + { + "epoch": 2.682534126706335, + "grad_norm": 1.0003961324691772, + "learning_rate": 9.080277879971514e-05, + "loss": 1.8216, + "step": 22992 + }, + { + "epoch": 2.682650799206627, + "grad_norm": 1.2248568534851074, + "learning_rate": 9.078897983496116e-05, + "loss": 2.0932, + "step": 22993 + }, + { + "epoch": 2.6827674717069185, + "grad_norm": 1.1251940727233887, + "learning_rate": 9.077518149959196e-05, + "loss": 1.8862, + "step": 22994 + }, + { + "epoch": 2.68288414420721, + "grad_norm": 1.0939631462097168, + "learning_rate": 9.076138379375041e-05, + "loss": 1.9738, + "step": 22995 + }, + { + "epoch": 2.683000816707502, + "grad_norm": 0.9953552484512329, + "learning_rate": 9.07475867175797e-05, + "loss": 1.924, + "step": 22996 + }, + { + "epoch": 2.6831174892077936, + "grad_norm": 1.1773473024368286, + "learning_rate": 9.073379027122274e-05, + "loss": 2.1142, + "step": 22997 + }, + { + "epoch": 2.6832341617080853, + "grad_norm": 1.1132936477661133, + "learning_rate": 9.071999445482264e-05, + "loss": 1.9245, + "step": 22998 + }, + { + "epoch": 2.683350834208377, + "grad_norm": 1.1666828393936157, + "learning_rate": 9.070619926852241e-05, + "loss": 2.0607, + "step": 22999 + }, + { + "epoch": 2.6834675067086686, + "grad_norm": 1.0398505926132202, + "learning_rate": 9.069240471246503e-05, + "loss": 1.9431, + "step": 23000 + }, + { + "epoch": 2.6835841792089603, + "grad_norm": 1.2669936418533325, + "learning_rate": 9.067861078679358e-05, + "loss": 2.0165, + "step": 23001 + }, + { + "epoch": 2.683700851709252, + "grad_norm": 1.095780611038208, + "learning_rate": 9.066481749165091e-05, + "loss": 2.0185, + "step": 23002 + }, + { + "epoch": 2.6838175242095437, + "grad_norm": 1.013159990310669, + "learning_rate": 9.065102482718018e-05, + "loss": 1.9647, + "step": 23003 + }, + { + "epoch": 2.6839341967098354, + "grad_norm": 1.2051827907562256, + "learning_rate": 9.063723279352424e-05, + "loss": 2.0305, + "step": 23004 + }, + { + "epoch": 2.684050869210127, + "grad_norm": 1.1285277605056763, + "learning_rate": 9.062344139082621e-05, + "loss": 2.0073, + "step": 23005 + }, + { + "epoch": 2.6841675417104187, + "grad_norm": 0.923644483089447, + "learning_rate": 9.060965061922893e-05, + "loss": 1.7077, + "step": 23006 + }, + { + "epoch": 2.6842842142107104, + "grad_norm": 1.0061635971069336, + "learning_rate": 9.059586047887547e-05, + "loss": 1.8356, + "step": 23007 + }, + { + "epoch": 2.684400886711002, + "grad_norm": 1.1090734004974365, + "learning_rate": 9.058207096990872e-05, + "loss": 1.8978, + "step": 23008 + }, + { + "epoch": 2.6845175592112938, + "grad_norm": 1.1288813352584839, + "learning_rate": 9.056828209247167e-05, + "loss": 1.8866, + "step": 23009 + }, + { + "epoch": 2.6846342317115854, + "grad_norm": 1.095420002937317, + "learning_rate": 9.055449384670725e-05, + "loss": 2.1126, + "step": 23010 + }, + { + "epoch": 2.684750904211877, + "grad_norm": 1.1113169193267822, + "learning_rate": 9.05407062327584e-05, + "loss": 1.8106, + "step": 23011 + }, + { + "epoch": 2.684867576712169, + "grad_norm": 1.1574015617370605, + "learning_rate": 9.052691925076811e-05, + "loss": 2.0455, + "step": 23012 + }, + { + "epoch": 2.6849842492124605, + "grad_norm": 1.1249158382415771, + "learning_rate": 9.05131329008792e-05, + "loss": 1.9065, + "step": 23013 + }, + { + "epoch": 2.685100921712752, + "grad_norm": 1.0738177299499512, + "learning_rate": 9.049934718323472e-05, + "loss": 1.8554, + "step": 23014 + }, + { + "epoch": 2.685217594213044, + "grad_norm": 1.064475178718567, + "learning_rate": 9.048556209797751e-05, + "loss": 1.8279, + "step": 23015 + }, + { + "epoch": 2.6853342667133355, + "grad_norm": 1.0731558799743652, + "learning_rate": 9.047177764525048e-05, + "loss": 1.8458, + "step": 23016 + }, + { + "epoch": 2.685450939213627, + "grad_norm": 1.0491998195648193, + "learning_rate": 9.045799382519653e-05, + "loss": 1.8222, + "step": 23017 + }, + { + "epoch": 2.685567611713919, + "grad_norm": 1.220523476600647, + "learning_rate": 9.044421063795859e-05, + "loss": 1.9661, + "step": 23018 + }, + { + "epoch": 2.6856842842142106, + "grad_norm": 1.0421360731124878, + "learning_rate": 9.043042808367947e-05, + "loss": 1.8903, + "step": 23019 + }, + { + "epoch": 2.6858009567145023, + "grad_norm": 1.2393407821655273, + "learning_rate": 9.041664616250219e-05, + "loss": 2.1882, + "step": 23020 + }, + { + "epoch": 2.685917629214794, + "grad_norm": 1.0782995223999023, + "learning_rate": 9.040286487456946e-05, + "loss": 2.0102, + "step": 23021 + }, + { + "epoch": 2.6860343017150856, + "grad_norm": 1.2474154233932495, + "learning_rate": 9.03890842200243e-05, + "loss": 1.9447, + "step": 23022 + }, + { + "epoch": 2.6861509742153773, + "grad_norm": 0.9813899397850037, + "learning_rate": 9.037530419900943e-05, + "loss": 1.9476, + "step": 23023 + }, + { + "epoch": 2.686267646715669, + "grad_norm": 1.271155834197998, + "learning_rate": 9.036152481166784e-05, + "loss": 2.012, + "step": 23024 + }, + { + "epoch": 2.6863843192159607, + "grad_norm": 1.1471517086029053, + "learning_rate": 9.034774605814233e-05, + "loss": 1.8434, + "step": 23025 + }, + { + "epoch": 2.6865009917162523, + "grad_norm": 1.1662651300430298, + "learning_rate": 9.033396793857568e-05, + "loss": 1.9609, + "step": 23026 + }, + { + "epoch": 2.686617664216544, + "grad_norm": 1.070615530014038, + "learning_rate": 9.032019045311083e-05, + "loss": 1.9436, + "step": 23027 + }, + { + "epoch": 2.6867343367168357, + "grad_norm": 1.0007473230361938, + "learning_rate": 9.030641360189051e-05, + "loss": 1.8469, + "step": 23028 + }, + { + "epoch": 2.6868510092171274, + "grad_norm": 1.085256814956665, + "learning_rate": 9.029263738505767e-05, + "loss": 1.9983, + "step": 23029 + }, + { + "epoch": 2.686967681717419, + "grad_norm": 1.359573483467102, + "learning_rate": 9.027886180275496e-05, + "loss": 1.9804, + "step": 23030 + }, + { + "epoch": 2.6870843542177107, + "grad_norm": 1.182452917098999, + "learning_rate": 9.026508685512535e-05, + "loss": 2.0834, + "step": 23031 + }, + { + "epoch": 2.6872010267180024, + "grad_norm": 1.1778953075408936, + "learning_rate": 9.02513125423115e-05, + "loss": 1.9741, + "step": 23032 + }, + { + "epoch": 2.687317699218294, + "grad_norm": 1.18744957447052, + "learning_rate": 9.023753886445635e-05, + "loss": 1.9747, + "step": 23033 + }, + { + "epoch": 2.687434371718586, + "grad_norm": 1.162103533744812, + "learning_rate": 9.022376582170261e-05, + "loss": 1.9919, + "step": 23034 + }, + { + "epoch": 2.6875510442188775, + "grad_norm": 1.4097663164138794, + "learning_rate": 9.020999341419309e-05, + "loss": 2.0714, + "step": 23035 + }, + { + "epoch": 2.687667716719169, + "grad_norm": 1.255872368812561, + "learning_rate": 9.019622164207049e-05, + "loss": 2.0036, + "step": 23036 + }, + { + "epoch": 2.687784389219461, + "grad_norm": 1.3757619857788086, + "learning_rate": 9.018245050547762e-05, + "loss": 2.079, + "step": 23037 + }, + { + "epoch": 2.6879010617197525, + "grad_norm": 1.1256883144378662, + "learning_rate": 9.016868000455734e-05, + "loss": 2.0205, + "step": 23038 + }, + { + "epoch": 2.688017734220044, + "grad_norm": 1.0944974422454834, + "learning_rate": 9.015491013945227e-05, + "loss": 1.9609, + "step": 23039 + }, + { + "epoch": 2.688134406720336, + "grad_norm": 1.110669493675232, + "learning_rate": 9.014114091030526e-05, + "loss": 1.7903, + "step": 23040 + }, + { + "epoch": 2.6882510792206276, + "grad_norm": 1.1910902261734009, + "learning_rate": 9.012737231725903e-05, + "loss": 1.8902, + "step": 23041 + }, + { + "epoch": 2.6883677517209192, + "grad_norm": 1.0713948011398315, + "learning_rate": 9.011360436045627e-05, + "loss": 1.8904, + "step": 23042 + }, + { + "epoch": 2.688484424221211, + "grad_norm": 1.041332483291626, + "learning_rate": 9.009983704003977e-05, + "loss": 2.0151, + "step": 23043 + }, + { + "epoch": 2.6886010967215026, + "grad_norm": 1.089186191558838, + "learning_rate": 9.008607035615228e-05, + "loss": 1.922, + "step": 23044 + }, + { + "epoch": 2.6887177692217943, + "grad_norm": 1.1317294836044312, + "learning_rate": 9.007230430893638e-05, + "loss": 2.0076, + "step": 23045 + }, + { + "epoch": 2.688834441722086, + "grad_norm": 1.1824595928192139, + "learning_rate": 9.005853889853491e-05, + "loss": 1.7627, + "step": 23046 + }, + { + "epoch": 2.6889511142223776, + "grad_norm": 1.042677640914917, + "learning_rate": 9.004477412509048e-05, + "loss": 1.8602, + "step": 23047 + }, + { + "epoch": 2.6890677867226693, + "grad_norm": 1.2422059774398804, + "learning_rate": 9.003100998874592e-05, + "loss": 2.2472, + "step": 23048 + }, + { + "epoch": 2.689184459222961, + "grad_norm": 1.2321809530258179, + "learning_rate": 9.00172464896438e-05, + "loss": 2.0453, + "step": 23049 + }, + { + "epoch": 2.6893011317232527, + "grad_norm": 1.2264925241470337, + "learning_rate": 9.000348362792684e-05, + "loss": 1.8773, + "step": 23050 + }, + { + "epoch": 2.6894178042235444, + "grad_norm": 1.3430280685424805, + "learning_rate": 8.998972140373777e-05, + "loss": 2.1524, + "step": 23051 + }, + { + "epoch": 2.689534476723836, + "grad_norm": 1.1959471702575684, + "learning_rate": 8.997595981721914e-05, + "loss": 2.055, + "step": 23052 + }, + { + "epoch": 2.6896511492241277, + "grad_norm": 1.0079154968261719, + "learning_rate": 8.996219886851378e-05, + "loss": 1.8396, + "step": 23053 + }, + { + "epoch": 2.6897678217244194, + "grad_norm": 1.1279597282409668, + "learning_rate": 8.994843855776419e-05, + "loss": 2.0344, + "step": 23054 + }, + { + "epoch": 2.689884494224711, + "grad_norm": 1.0879830121994019, + "learning_rate": 8.993467888511313e-05, + "loss": 1.7693, + "step": 23055 + }, + { + "epoch": 2.690001166725003, + "grad_norm": 1.063037633895874, + "learning_rate": 8.992091985070317e-05, + "loss": 1.9177, + "step": 23056 + }, + { + "epoch": 2.6901178392252945, + "grad_norm": 1.163673758506775, + "learning_rate": 8.990716145467703e-05, + "loss": 1.8929, + "step": 23057 + }, + { + "epoch": 2.690234511725586, + "grad_norm": 1.077682614326477, + "learning_rate": 8.989340369717725e-05, + "loss": 1.972, + "step": 23058 + }, + { + "epoch": 2.690351184225878, + "grad_norm": 1.2164344787597656, + "learning_rate": 8.987964657834658e-05, + "loss": 2.094, + "step": 23059 + }, + { + "epoch": 2.6904678567261695, + "grad_norm": 0.9936056137084961, + "learning_rate": 8.98658900983275e-05, + "loss": 1.8446, + "step": 23060 + }, + { + "epoch": 2.690584529226461, + "grad_norm": 1.738202452659607, + "learning_rate": 8.985213425726273e-05, + "loss": 1.9693, + "step": 23061 + }, + { + "epoch": 2.690701201726753, + "grad_norm": 1.2630894184112549, + "learning_rate": 8.983837905529476e-05, + "loss": 1.9877, + "step": 23062 + }, + { + "epoch": 2.6908178742270445, + "grad_norm": 1.28529691696167, + "learning_rate": 8.982462449256625e-05, + "loss": 1.9965, + "step": 23063 + }, + { + "epoch": 2.6909345467273362, + "grad_norm": 1.0489726066589355, + "learning_rate": 8.981087056921987e-05, + "loss": 2.0037, + "step": 23064 + }, + { + "epoch": 2.691051219227628, + "grad_norm": 1.0582581758499146, + "learning_rate": 8.979711728539809e-05, + "loss": 2.0079, + "step": 23065 + }, + { + "epoch": 2.6911678917279196, + "grad_norm": 1.0429754257202148, + "learning_rate": 8.978336464124357e-05, + "loss": 1.8731, + "step": 23066 + }, + { + "epoch": 2.6912845642282113, + "grad_norm": 1.2882509231567383, + "learning_rate": 8.976961263689881e-05, + "loss": 1.9432, + "step": 23067 + }, + { + "epoch": 2.691401236728503, + "grad_norm": 1.0942100286483765, + "learning_rate": 8.975586127250644e-05, + "loss": 1.9495, + "step": 23068 + }, + { + "epoch": 2.6915179092287946, + "grad_norm": 1.1279911994934082, + "learning_rate": 8.974211054820897e-05, + "loss": 2.066, + "step": 23069 + }, + { + "epoch": 2.6916345817290863, + "grad_norm": 1.1491249799728394, + "learning_rate": 8.972836046414899e-05, + "loss": 1.9165, + "step": 23070 + }, + { + "epoch": 2.691751254229378, + "grad_norm": 1.1606336832046509, + "learning_rate": 8.9714611020469e-05, + "loss": 2.009, + "step": 23071 + }, + { + "epoch": 2.6918679267296697, + "grad_norm": 1.0161997079849243, + "learning_rate": 8.970086221731158e-05, + "loss": 1.8502, + "step": 23072 + }, + { + "epoch": 2.6919845992299614, + "grad_norm": 1.1058152914047241, + "learning_rate": 8.968711405481923e-05, + "loss": 1.9463, + "step": 23073 + }, + { + "epoch": 2.692101271730253, + "grad_norm": 1.091048002243042, + "learning_rate": 8.967336653313455e-05, + "loss": 1.9559, + "step": 23074 + }, + { + "epoch": 2.6922179442305447, + "grad_norm": 1.1026105880737305, + "learning_rate": 8.965961965239992e-05, + "loss": 1.9452, + "step": 23075 + }, + { + "epoch": 2.6923346167308364, + "grad_norm": 1.1742175817489624, + "learning_rate": 8.964587341275798e-05, + "loss": 1.9476, + "step": 23076 + }, + { + "epoch": 2.692451289231128, + "grad_norm": 1.1435744762420654, + "learning_rate": 8.963212781435121e-05, + "loss": 2.042, + "step": 23077 + }, + { + "epoch": 2.6925679617314198, + "grad_norm": 1.1081463098526, + "learning_rate": 8.961838285732205e-05, + "loss": 1.8895, + "step": 23078 + }, + { + "epoch": 2.6926846342317114, + "grad_norm": 1.3159675598144531, + "learning_rate": 8.960463854181308e-05, + "loss": 2.0688, + "step": 23079 + }, + { + "epoch": 2.692801306732003, + "grad_norm": 1.2032254934310913, + "learning_rate": 8.959089486796678e-05, + "loss": 1.9255, + "step": 23080 + }, + { + "epoch": 2.692917979232295, + "grad_norm": 1.2095201015472412, + "learning_rate": 8.957715183592552e-05, + "loss": 1.7693, + "step": 23081 + }, + { + "epoch": 2.6930346517325865, + "grad_norm": 1.321412205696106, + "learning_rate": 8.956340944583186e-05, + "loss": 2.1398, + "step": 23082 + }, + { + "epoch": 2.693151324232878, + "grad_norm": 1.1387643814086914, + "learning_rate": 8.954966769782828e-05, + "loss": 2.0375, + "step": 23083 + }, + { + "epoch": 2.69326799673317, + "grad_norm": 1.0216830968856812, + "learning_rate": 8.953592659205714e-05, + "loss": 2.0453, + "step": 23084 + }, + { + "epoch": 2.6933846692334615, + "grad_norm": 1.2128174304962158, + "learning_rate": 8.9522186128661e-05, + "loss": 2.0722, + "step": 23085 + }, + { + "epoch": 2.693501341733753, + "grad_norm": 1.1583236455917358, + "learning_rate": 8.950844630778223e-05, + "loss": 1.9954, + "step": 23086 + }, + { + "epoch": 2.693618014234045, + "grad_norm": 1.1565898656845093, + "learning_rate": 8.949470712956335e-05, + "loss": 1.8666, + "step": 23087 + }, + { + "epoch": 2.6937346867343366, + "grad_norm": 1.153465986251831, + "learning_rate": 8.948096859414672e-05, + "loss": 2.0658, + "step": 23088 + }, + { + "epoch": 2.6938513592346283, + "grad_norm": 0.9707480072975159, + "learning_rate": 8.946723070167474e-05, + "loss": 1.9426, + "step": 23089 + }, + { + "epoch": 2.69396803173492, + "grad_norm": 1.2749366760253906, + "learning_rate": 8.945349345228999e-05, + "loss": 2.0127, + "step": 23090 + }, + { + "epoch": 2.6940847042352116, + "grad_norm": 1.0809540748596191, + "learning_rate": 8.94397568461347e-05, + "loss": 2.1531, + "step": 23091 + }, + { + "epoch": 2.6942013767355033, + "grad_norm": 1.0297245979309082, + "learning_rate": 8.942602088335141e-05, + "loss": 1.7269, + "step": 23092 + }, + { + "epoch": 2.694318049235795, + "grad_norm": 1.09200119972229, + "learning_rate": 8.941228556408238e-05, + "loss": 1.7712, + "step": 23093 + }, + { + "epoch": 2.6944347217360867, + "grad_norm": 1.1196740865707397, + "learning_rate": 8.939855088847016e-05, + "loss": 2.0814, + "step": 23094 + }, + { + "epoch": 2.6945513942363783, + "grad_norm": 1.2379230260849, + "learning_rate": 8.938481685665702e-05, + "loss": 2.0011, + "step": 23095 + }, + { + "epoch": 2.69466806673667, + "grad_norm": 1.288480281829834, + "learning_rate": 8.93710834687854e-05, + "loss": 2.1917, + "step": 23096 + }, + { + "epoch": 2.6947847392369617, + "grad_norm": 1.1330554485321045, + "learning_rate": 8.935735072499762e-05, + "loss": 1.9927, + "step": 23097 + }, + { + "epoch": 2.6949014117372534, + "grad_norm": 1.1857560873031616, + "learning_rate": 8.93436186254361e-05, + "loss": 2.0286, + "step": 23098 + }, + { + "epoch": 2.695018084237545, + "grad_norm": 1.1146172285079956, + "learning_rate": 8.932988717024322e-05, + "loss": 2.1766, + "step": 23099 + }, + { + "epoch": 2.6951347567378368, + "grad_norm": 1.1253894567489624, + "learning_rate": 8.931615635956125e-05, + "loss": 2.0812, + "step": 23100 + }, + { + "epoch": 2.6952514292381284, + "grad_norm": 1.2149847745895386, + "learning_rate": 8.930242619353254e-05, + "loss": 2.1425, + "step": 23101 + }, + { + "epoch": 2.69536810173842, + "grad_norm": 1.0874056816101074, + "learning_rate": 8.928869667229946e-05, + "loss": 1.7343, + "step": 23102 + }, + { + "epoch": 2.695484774238712, + "grad_norm": 1.0092905759811401, + "learning_rate": 8.927496779600441e-05, + "loss": 1.8779, + "step": 23103 + }, + { + "epoch": 2.6956014467390035, + "grad_norm": 1.1810483932495117, + "learning_rate": 8.926123956478958e-05, + "loss": 1.8469, + "step": 23104 + }, + { + "epoch": 2.695718119239295, + "grad_norm": 1.2506840229034424, + "learning_rate": 8.92475119787974e-05, + "loss": 2.0646, + "step": 23105 + }, + { + "epoch": 2.695834791739587, + "grad_norm": 1.1117281913757324, + "learning_rate": 8.923378503817014e-05, + "loss": 1.9171, + "step": 23106 + }, + { + "epoch": 2.6959514642398785, + "grad_norm": 1.0464495420455933, + "learning_rate": 8.922005874305011e-05, + "loss": 2.0573, + "step": 23107 + }, + { + "epoch": 2.69606813674017, + "grad_norm": 0.9857576489448547, + "learning_rate": 8.920633309357957e-05, + "loss": 2.0139, + "step": 23108 + }, + { + "epoch": 2.696184809240462, + "grad_norm": 1.2752912044525146, + "learning_rate": 8.919260808990093e-05, + "loss": 2.0536, + "step": 23109 + }, + { + "epoch": 2.6963014817407536, + "grad_norm": 1.1519962549209595, + "learning_rate": 8.917888373215632e-05, + "loss": 2.0117, + "step": 23110 + }, + { + "epoch": 2.6964181542410453, + "grad_norm": 1.079940915107727, + "learning_rate": 8.916516002048816e-05, + "loss": 2.0982, + "step": 23111 + }, + { + "epoch": 2.696534826741337, + "grad_norm": 1.0610432624816895, + "learning_rate": 8.915143695503857e-05, + "loss": 2.0367, + "step": 23112 + }, + { + "epoch": 2.6966514992416286, + "grad_norm": 1.0955586433410645, + "learning_rate": 8.913771453594999e-05, + "loss": 2.0261, + "step": 23113 + }, + { + "epoch": 2.6967681717419203, + "grad_norm": 1.0860927104949951, + "learning_rate": 8.912399276336452e-05, + "loss": 1.921, + "step": 23114 + }, + { + "epoch": 2.696884844242212, + "grad_norm": 1.0574291944503784, + "learning_rate": 8.911027163742455e-05, + "loss": 2.0533, + "step": 23115 + }, + { + "epoch": 2.6970015167425037, + "grad_norm": 1.0568766593933105, + "learning_rate": 8.909655115827225e-05, + "loss": 2.0217, + "step": 23116 + }, + { + "epoch": 2.6971181892427953, + "grad_norm": 1.1738200187683105, + "learning_rate": 8.908283132604981e-05, + "loss": 2.041, + "step": 23117 + }, + { + "epoch": 2.697234861743087, + "grad_norm": 1.0898061990737915, + "learning_rate": 8.90691121408996e-05, + "loss": 2.0677, + "step": 23118 + }, + { + "epoch": 2.6973515342433787, + "grad_norm": 1.0810356140136719, + "learning_rate": 8.905539360296368e-05, + "loss": 2.121, + "step": 23119 + }, + { + "epoch": 2.6974682067436704, + "grad_norm": 1.0569632053375244, + "learning_rate": 8.904167571238445e-05, + "loss": 2.0029, + "step": 23120 + }, + { + "epoch": 2.697584879243962, + "grad_norm": 1.0759609937667847, + "learning_rate": 8.902795846930394e-05, + "loss": 2.0785, + "step": 23121 + }, + { + "epoch": 2.6977015517442537, + "grad_norm": 1.2016620635986328, + "learning_rate": 8.90142418738645e-05, + "loss": 2.1759, + "step": 23122 + }, + { + "epoch": 2.6978182242445454, + "grad_norm": 1.0855199098587036, + "learning_rate": 8.90005259262082e-05, + "loss": 1.8589, + "step": 23123 + }, + { + "epoch": 2.697934896744837, + "grad_norm": 1.0489267110824585, + "learning_rate": 8.898681062647737e-05, + "loss": 1.969, + "step": 23124 + }, + { + "epoch": 2.698051569245129, + "grad_norm": 1.4255445003509521, + "learning_rate": 8.897309597481406e-05, + "loss": 1.9887, + "step": 23125 + }, + { + "epoch": 2.6981682417454205, + "grad_norm": 1.0408109426498413, + "learning_rate": 8.895938197136059e-05, + "loss": 1.9968, + "step": 23126 + }, + { + "epoch": 2.698284914245712, + "grad_norm": 1.2078015804290771, + "learning_rate": 8.8945668616259e-05, + "loss": 2.1063, + "step": 23127 + }, + { + "epoch": 2.698401586746004, + "grad_norm": 1.0174437761306763, + "learning_rate": 8.89319559096515e-05, + "loss": 1.9046, + "step": 23128 + }, + { + "epoch": 2.6985182592462955, + "grad_norm": 1.0903633832931519, + "learning_rate": 8.89182438516803e-05, + "loss": 2.0467, + "step": 23129 + }, + { + "epoch": 2.698634931746587, + "grad_norm": 1.230753779411316, + "learning_rate": 8.890453244248748e-05, + "loss": 1.9555, + "step": 23130 + }, + { + "epoch": 2.698751604246879, + "grad_norm": 1.1182847023010254, + "learning_rate": 8.889082168221528e-05, + "loss": 1.9388, + "step": 23131 + }, + { + "epoch": 2.6988682767471706, + "grad_norm": 1.0938061475753784, + "learning_rate": 8.887711157100574e-05, + "loss": 2.0688, + "step": 23132 + }, + { + "epoch": 2.6989849492474622, + "grad_norm": 1.185957670211792, + "learning_rate": 8.886340210900105e-05, + "loss": 1.9539, + "step": 23133 + }, + { + "epoch": 2.699101621747754, + "grad_norm": 1.0635212659835815, + "learning_rate": 8.884969329634323e-05, + "loss": 1.8196, + "step": 23134 + }, + { + "epoch": 2.6992182942480456, + "grad_norm": 1.0736730098724365, + "learning_rate": 8.883598513317457e-05, + "loss": 1.8783, + "step": 23135 + }, + { + "epoch": 2.6993349667483373, + "grad_norm": 1.1167027950286865, + "learning_rate": 8.8822277619637e-05, + "loss": 1.8045, + "step": 23136 + }, + { + "epoch": 2.699451639248629, + "grad_norm": 1.1140809059143066, + "learning_rate": 8.880857075587279e-05, + "loss": 1.9258, + "step": 23137 + }, + { + "epoch": 2.6995683117489206, + "grad_norm": 0.9814231991767883, + "learning_rate": 8.87948645420239e-05, + "loss": 2.0562, + "step": 23138 + }, + { + "epoch": 2.6996849842492123, + "grad_norm": 1.2482632398605347, + "learning_rate": 8.87811589782325e-05, + "loss": 1.9019, + "step": 23139 + }, + { + "epoch": 2.699801656749504, + "grad_norm": 1.1468933820724487, + "learning_rate": 8.876745406464064e-05, + "loss": 2.0, + "step": 23140 + }, + { + "epoch": 2.6999183292497957, + "grad_norm": 1.103111743927002, + "learning_rate": 8.87537498013904e-05, + "loss": 1.8549, + "step": 23141 + }, + { + "epoch": 2.7000350017500874, + "grad_norm": 1.312228798866272, + "learning_rate": 8.874004618862392e-05, + "loss": 2.0695, + "step": 23142 + }, + { + "epoch": 2.700151674250379, + "grad_norm": 1.087306261062622, + "learning_rate": 8.872634322648314e-05, + "loss": 1.7908, + "step": 23143 + }, + { + "epoch": 2.7002683467506707, + "grad_norm": 1.3799288272857666, + "learning_rate": 8.871264091511028e-05, + "loss": 1.8977, + "step": 23144 + }, + { + "epoch": 2.7003850192509624, + "grad_norm": 0.9868550896644592, + "learning_rate": 8.86989392546472e-05, + "loss": 1.8151, + "step": 23145 + }, + { + "epoch": 2.700501691751254, + "grad_norm": 1.2259225845336914, + "learning_rate": 8.86852382452361e-05, + "loss": 2.0095, + "step": 23146 + }, + { + "epoch": 2.7006183642515458, + "grad_norm": 1.0073421001434326, + "learning_rate": 8.867153788701892e-05, + "loss": 1.7237, + "step": 23147 + }, + { + "epoch": 2.7007350367518375, + "grad_norm": 1.1147589683532715, + "learning_rate": 8.865783818013777e-05, + "loss": 1.9359, + "step": 23148 + }, + { + "epoch": 2.700851709252129, + "grad_norm": 1.1913299560546875, + "learning_rate": 8.864413912473461e-05, + "loss": 1.8089, + "step": 23149 + }, + { + "epoch": 2.700968381752421, + "grad_norm": 1.0593297481536865, + "learning_rate": 8.863044072095143e-05, + "loss": 1.9656, + "step": 23150 + }, + { + "epoch": 2.7010850542527125, + "grad_norm": 1.171419620513916, + "learning_rate": 8.861674296893035e-05, + "loss": 1.8563, + "step": 23151 + }, + { + "epoch": 2.701201726753004, + "grad_norm": 1.2163877487182617, + "learning_rate": 8.86030458688133e-05, + "loss": 2.0619, + "step": 23152 + }, + { + "epoch": 2.701318399253296, + "grad_norm": 1.1598719358444214, + "learning_rate": 8.85893494207422e-05, + "loss": 1.9015, + "step": 23153 + }, + { + "epoch": 2.7014350717535875, + "grad_norm": 1.1366246938705444, + "learning_rate": 8.857565362485918e-05, + "loss": 2.0779, + "step": 23154 + }, + { + "epoch": 2.7015517442538792, + "grad_norm": 1.0910091400146484, + "learning_rate": 8.85619584813062e-05, + "loss": 1.9253, + "step": 23155 + }, + { + "epoch": 2.701668416754171, + "grad_norm": 0.9804285168647766, + "learning_rate": 8.854826399022515e-05, + "loss": 1.8352, + "step": 23156 + }, + { + "epoch": 2.7017850892544626, + "grad_norm": 1.0605542659759521, + "learning_rate": 8.85345701517581e-05, + "loss": 1.938, + "step": 23157 + }, + { + "epoch": 2.7019017617547543, + "grad_norm": 1.0341057777404785, + "learning_rate": 8.852087696604691e-05, + "loss": 2.0024, + "step": 23158 + }, + { + "epoch": 2.702018434255046, + "grad_norm": 1.3112281560897827, + "learning_rate": 8.850718443323366e-05, + "loss": 1.8601, + "step": 23159 + }, + { + "epoch": 2.7021351067553376, + "grad_norm": 1.1210185289382935, + "learning_rate": 8.849349255346016e-05, + "loss": 2.0631, + "step": 23160 + }, + { + "epoch": 2.7022517792556293, + "grad_norm": 1.1413732767105103, + "learning_rate": 8.847980132686847e-05, + "loss": 1.9171, + "step": 23161 + }, + { + "epoch": 2.702368451755921, + "grad_norm": 1.0556347370147705, + "learning_rate": 8.846611075360044e-05, + "loss": 1.7316, + "step": 23162 + }, + { + "epoch": 2.7024851242562127, + "grad_norm": 1.1889604330062866, + "learning_rate": 8.845242083379809e-05, + "loss": 1.9519, + "step": 23163 + }, + { + "epoch": 2.7026017967565044, + "grad_norm": 1.2046034336090088, + "learning_rate": 8.84387315676032e-05, + "loss": 2.0823, + "step": 23164 + }, + { + "epoch": 2.702718469256796, + "grad_norm": 1.3277555704116821, + "learning_rate": 8.842504295515787e-05, + "loss": 2.2628, + "step": 23165 + }, + { + "epoch": 2.7028351417570877, + "grad_norm": 1.1874170303344727, + "learning_rate": 8.841135499660387e-05, + "loss": 2.0312, + "step": 23166 + }, + { + "epoch": 2.7029518142573794, + "grad_norm": 1.0534104108810425, + "learning_rate": 8.839766769208317e-05, + "loss": 1.8972, + "step": 23167 + }, + { + "epoch": 2.703068486757671, + "grad_norm": 1.0192489624023438, + "learning_rate": 8.838398104173763e-05, + "loss": 1.9671, + "step": 23168 + }, + { + "epoch": 2.7031851592579628, + "grad_norm": 1.1673316955566406, + "learning_rate": 8.837029504570912e-05, + "loss": 1.9527, + "step": 23169 + }, + { + "epoch": 2.7033018317582544, + "grad_norm": 1.2518419027328491, + "learning_rate": 8.835660970413959e-05, + "loss": 1.9761, + "step": 23170 + }, + { + "epoch": 2.703418504258546, + "grad_norm": 1.0913331508636475, + "learning_rate": 8.83429250171708e-05, + "loss": 1.9037, + "step": 23171 + }, + { + "epoch": 2.703535176758838, + "grad_norm": 1.3564753532409668, + "learning_rate": 8.832924098494478e-05, + "loss": 2.0108, + "step": 23172 + }, + { + "epoch": 2.7036518492591295, + "grad_norm": 1.0978727340698242, + "learning_rate": 8.831555760760321e-05, + "loss": 2.0623, + "step": 23173 + }, + { + "epoch": 2.703768521759421, + "grad_norm": 1.0808982849121094, + "learning_rate": 8.830187488528811e-05, + "loss": 2.0033, + "step": 23174 + }, + { + "epoch": 2.703885194259713, + "grad_norm": 1.045589804649353, + "learning_rate": 8.828819281814118e-05, + "loss": 1.975, + "step": 23175 + }, + { + "epoch": 2.7040018667600045, + "grad_norm": 1.1372581720352173, + "learning_rate": 8.827451140630439e-05, + "loss": 1.9289, + "step": 23176 + }, + { + "epoch": 2.704118539260296, + "grad_norm": 1.0574398040771484, + "learning_rate": 8.826083064991948e-05, + "loss": 1.7282, + "step": 23177 + }, + { + "epoch": 2.704235211760588, + "grad_norm": 1.1167337894439697, + "learning_rate": 8.824715054912838e-05, + "loss": 2.071, + "step": 23178 + }, + { + "epoch": 2.7043518842608796, + "grad_norm": 1.00379478931427, + "learning_rate": 8.823347110407275e-05, + "loss": 2.0113, + "step": 23179 + }, + { + "epoch": 2.7044685567611713, + "grad_norm": 1.4389535188674927, + "learning_rate": 8.821979231489452e-05, + "loss": 2.0924, + "step": 23180 + }, + { + "epoch": 2.704585229261463, + "grad_norm": 1.4890578985214233, + "learning_rate": 8.820611418173552e-05, + "loss": 1.8708, + "step": 23181 + }, + { + "epoch": 2.7047019017617546, + "grad_norm": 1.279205083847046, + "learning_rate": 8.81924367047375e-05, + "loss": 2.1063, + "step": 23182 + }, + { + "epoch": 2.7048185742620463, + "grad_norm": 0.9699485898017883, + "learning_rate": 8.817875988404221e-05, + "loss": 1.7742, + "step": 23183 + }, + { + "epoch": 2.704935246762338, + "grad_norm": 1.1378397941589355, + "learning_rate": 8.816508371979154e-05, + "loss": 1.9513, + "step": 23184 + }, + { + "epoch": 2.7050519192626297, + "grad_norm": 1.1809648275375366, + "learning_rate": 8.815140821212722e-05, + "loss": 2.0012, + "step": 23185 + }, + { + "epoch": 2.7051685917629213, + "grad_norm": 1.1434777975082397, + "learning_rate": 8.813773336119093e-05, + "loss": 1.807, + "step": 23186 + }, + { + "epoch": 2.705285264263213, + "grad_norm": 0.9592083692550659, + "learning_rate": 8.812405916712461e-05, + "loss": 1.9138, + "step": 23187 + }, + { + "epoch": 2.7054019367635047, + "grad_norm": 1.251198410987854, + "learning_rate": 8.811038563006984e-05, + "loss": 2.0574, + "step": 23188 + }, + { + "epoch": 2.7055186092637964, + "grad_norm": 1.2660924196243286, + "learning_rate": 8.809671275016854e-05, + "loss": 2.0411, + "step": 23189 + }, + { + "epoch": 2.705635281764088, + "grad_norm": 1.2763992547988892, + "learning_rate": 8.808304052756231e-05, + "loss": 2.1134, + "step": 23190 + }, + { + "epoch": 2.7057519542643798, + "grad_norm": 1.218002438545227, + "learning_rate": 8.806936896239301e-05, + "loss": 1.8592, + "step": 23191 + }, + { + "epoch": 2.7058686267646714, + "grad_norm": 1.0926368236541748, + "learning_rate": 8.805569805480226e-05, + "loss": 2.0228, + "step": 23192 + }, + { + "epoch": 2.705985299264963, + "grad_norm": 1.230711817741394, + "learning_rate": 8.804202780493185e-05, + "loss": 1.9948, + "step": 23193 + }, + { + "epoch": 2.706101971765255, + "grad_norm": 1.1290066242218018, + "learning_rate": 8.802835821292353e-05, + "loss": 1.8853, + "step": 23194 + }, + { + "epoch": 2.7062186442655465, + "grad_norm": 1.0549848079681396, + "learning_rate": 8.801468927891894e-05, + "loss": 1.8184, + "step": 23195 + }, + { + "epoch": 2.706335316765838, + "grad_norm": 1.0224496126174927, + "learning_rate": 8.800102100305986e-05, + "loss": 1.9359, + "step": 23196 + }, + { + "epoch": 2.70645198926613, + "grad_norm": 1.129311442375183, + "learning_rate": 8.798735338548787e-05, + "loss": 1.9174, + "step": 23197 + }, + { + "epoch": 2.7065686617664215, + "grad_norm": 1.0964261293411255, + "learning_rate": 8.79736864263448e-05, + "loss": 2.1712, + "step": 23198 + }, + { + "epoch": 2.706685334266713, + "grad_norm": 1.2542955875396729, + "learning_rate": 8.796002012577222e-05, + "loss": 2.0307, + "step": 23199 + }, + { + "epoch": 2.706802006767005, + "grad_norm": 1.2388442754745483, + "learning_rate": 8.79463544839119e-05, + "loss": 1.9417, + "step": 23200 + }, + { + "epoch": 2.7069186792672966, + "grad_norm": 1.2061734199523926, + "learning_rate": 8.793268950090548e-05, + "loss": 2.0805, + "step": 23201 + }, + { + "epoch": 2.7070353517675882, + "grad_norm": 1.1558167934417725, + "learning_rate": 8.79190251768946e-05, + "loss": 2.0668, + "step": 23202 + }, + { + "epoch": 2.70715202426788, + "grad_norm": 1.2231296300888062, + "learning_rate": 8.790536151202086e-05, + "loss": 1.8617, + "step": 23203 + }, + { + "epoch": 2.7072686967681716, + "grad_norm": 1.097631812095642, + "learning_rate": 8.789169850642603e-05, + "loss": 1.9103, + "step": 23204 + }, + { + "epoch": 2.7073853692684633, + "grad_norm": 1.1154708862304688, + "learning_rate": 8.787803616025163e-05, + "loss": 1.9873, + "step": 23205 + }, + { + "epoch": 2.707502041768755, + "grad_norm": 1.2361479997634888, + "learning_rate": 8.786437447363938e-05, + "loss": 2.027, + "step": 23206 + }, + { + "epoch": 2.7076187142690467, + "grad_norm": 1.1302952766418457, + "learning_rate": 8.785071344673095e-05, + "loss": 1.8788, + "step": 23207 + }, + { + "epoch": 2.7077353867693383, + "grad_norm": 1.0494930744171143, + "learning_rate": 8.783705307966785e-05, + "loss": 1.8994, + "step": 23208 + }, + { + "epoch": 2.70785205926963, + "grad_norm": 1.1378363370895386, + "learning_rate": 8.782339337259181e-05, + "loss": 1.8692, + "step": 23209 + }, + { + "epoch": 2.7079687317699217, + "grad_norm": 0.9965198040008545, + "learning_rate": 8.78097343256443e-05, + "loss": 1.8804, + "step": 23210 + }, + { + "epoch": 2.7080854042702134, + "grad_norm": 0.9243549108505249, + "learning_rate": 8.779607593896707e-05, + "loss": 1.7945, + "step": 23211 + }, + { + "epoch": 2.708202076770505, + "grad_norm": 1.255990982055664, + "learning_rate": 8.77824182127016e-05, + "loss": 1.8633, + "step": 23212 + }, + { + "epoch": 2.7083187492707967, + "grad_norm": 1.01899254322052, + "learning_rate": 8.776876114698956e-05, + "loss": 1.9733, + "step": 23213 + }, + { + "epoch": 2.7084354217710884, + "grad_norm": 1.107280969619751, + "learning_rate": 8.775510474197246e-05, + "loss": 1.7603, + "step": 23214 + }, + { + "epoch": 2.70855209427138, + "grad_norm": 1.1307708024978638, + "learning_rate": 8.774144899779196e-05, + "loss": 1.8186, + "step": 23215 + }, + { + "epoch": 2.708668766771672, + "grad_norm": 1.0021755695343018, + "learning_rate": 8.772779391458953e-05, + "loss": 1.9423, + "step": 23216 + }, + { + "epoch": 2.7087854392719635, + "grad_norm": 1.1296296119689941, + "learning_rate": 8.771413949250683e-05, + "loss": 1.9613, + "step": 23217 + }, + { + "epoch": 2.708902111772255, + "grad_norm": 1.1465978622436523, + "learning_rate": 8.770048573168535e-05, + "loss": 1.9405, + "step": 23218 + }, + { + "epoch": 2.709018784272547, + "grad_norm": 1.187833547592163, + "learning_rate": 8.76868326322666e-05, + "loss": 1.9111, + "step": 23219 + }, + { + "epoch": 2.7091354567728385, + "grad_norm": 1.1401088237762451, + "learning_rate": 8.767318019439222e-05, + "loss": 2.0514, + "step": 23220 + }, + { + "epoch": 2.70925212927313, + "grad_norm": 1.0772467851638794, + "learning_rate": 8.765952841820365e-05, + "loss": 1.871, + "step": 23221 + }, + { + "epoch": 2.709368801773422, + "grad_norm": 1.0041013956069946, + "learning_rate": 8.76458773038425e-05, + "loss": 1.7133, + "step": 23222 + }, + { + "epoch": 2.7094854742737136, + "grad_norm": 1.2115334272384644, + "learning_rate": 8.76322268514502e-05, + "loss": 1.9486, + "step": 23223 + }, + { + "epoch": 2.7096021467740052, + "grad_norm": 1.1754194498062134, + "learning_rate": 8.761857706116838e-05, + "loss": 1.9214, + "step": 23224 + }, + { + "epoch": 2.709718819274297, + "grad_norm": 1.0925688743591309, + "learning_rate": 8.760492793313842e-05, + "loss": 1.9794, + "step": 23225 + }, + { + "epoch": 2.7098354917745886, + "grad_norm": 1.0756622552871704, + "learning_rate": 8.759127946750194e-05, + "loss": 1.9898, + "step": 23226 + }, + { + "epoch": 2.7099521642748803, + "grad_norm": 1.1398652791976929, + "learning_rate": 8.75776316644003e-05, + "loss": 1.9246, + "step": 23227 + }, + { + "epoch": 2.710068836775172, + "grad_norm": 1.0999901294708252, + "learning_rate": 8.756398452397511e-05, + "loss": 2.0107, + "step": 23228 + }, + { + "epoch": 2.7101855092754636, + "grad_norm": 1.0084911584854126, + "learning_rate": 8.755033804636774e-05, + "loss": 2.0698, + "step": 23229 + }, + { + "epoch": 2.7103021817757553, + "grad_norm": 1.0044790506362915, + "learning_rate": 8.753669223171976e-05, + "loss": 1.986, + "step": 23230 + }, + { + "epoch": 2.710418854276047, + "grad_norm": 1.1259424686431885, + "learning_rate": 8.752304708017256e-05, + "loss": 1.9233, + "step": 23231 + }, + { + "epoch": 2.7105355267763387, + "grad_norm": 1.0077065229415894, + "learning_rate": 8.750940259186758e-05, + "loss": 1.8407, + "step": 23232 + }, + { + "epoch": 2.7106521992766304, + "grad_norm": 0.9539916515350342, + "learning_rate": 8.749575876694642e-05, + "loss": 1.8431, + "step": 23233 + }, + { + "epoch": 2.710768871776922, + "grad_norm": 1.0512365102767944, + "learning_rate": 8.748211560555043e-05, + "loss": 1.9351, + "step": 23234 + }, + { + "epoch": 2.7108855442772137, + "grad_norm": 1.6963084936141968, + "learning_rate": 8.746847310782094e-05, + "loss": 2.1708, + "step": 23235 + }, + { + "epoch": 2.7110022167775054, + "grad_norm": 1.3947899341583252, + "learning_rate": 8.745483127389955e-05, + "loss": 1.9531, + "step": 23236 + }, + { + "epoch": 2.711118889277797, + "grad_norm": 1.1172820329666138, + "learning_rate": 8.744119010392763e-05, + "loss": 2.0438, + "step": 23237 + }, + { + "epoch": 2.7112355617780888, + "grad_norm": 1.0984584093093872, + "learning_rate": 8.742754959804649e-05, + "loss": 2.1332, + "step": 23238 + }, + { + "epoch": 2.7113522342783805, + "grad_norm": 1.1929155588150024, + "learning_rate": 8.741390975639772e-05, + "loss": 1.9742, + "step": 23239 + }, + { + "epoch": 2.711468906778672, + "grad_norm": 1.2691746950149536, + "learning_rate": 8.740027057912258e-05, + "loss": 1.9345, + "step": 23240 + }, + { + "epoch": 2.711585579278964, + "grad_norm": 1.2381094694137573, + "learning_rate": 8.738663206636255e-05, + "loss": 1.9793, + "step": 23241 + }, + { + "epoch": 2.7117022517792555, + "grad_norm": 0.9913583993911743, + "learning_rate": 8.737299421825895e-05, + "loss": 1.8272, + "step": 23242 + }, + { + "epoch": 2.711818924279547, + "grad_norm": 1.1255664825439453, + "learning_rate": 8.735935703495324e-05, + "loss": 1.904, + "step": 23243 + }, + { + "epoch": 2.711935596779839, + "grad_norm": 1.110365867614746, + "learning_rate": 8.734572051658671e-05, + "loss": 1.837, + "step": 23244 + }, + { + "epoch": 2.7120522692801305, + "grad_norm": 1.0181941986083984, + "learning_rate": 8.733208466330077e-05, + "loss": 2.0214, + "step": 23245 + }, + { + "epoch": 2.7121689417804222, + "grad_norm": 1.169459581375122, + "learning_rate": 8.731844947523684e-05, + "loss": 2.07, + "step": 23246 + }, + { + "epoch": 2.712285614280714, + "grad_norm": 1.1442584991455078, + "learning_rate": 8.730481495253614e-05, + "loss": 1.9975, + "step": 23247 + }, + { + "epoch": 2.7124022867810056, + "grad_norm": 1.10006582736969, + "learning_rate": 8.729118109534018e-05, + "loss": 1.8848, + "step": 23248 + }, + { + "epoch": 2.7125189592812973, + "grad_norm": 1.1572169065475464, + "learning_rate": 8.727754790379018e-05, + "loss": 1.971, + "step": 23249 + }, + { + "epoch": 2.712635631781589, + "grad_norm": 0.9953882098197937, + "learning_rate": 8.726391537802754e-05, + "loss": 1.9333, + "step": 23250 + }, + { + "epoch": 2.7127523042818806, + "grad_norm": 1.233991026878357, + "learning_rate": 8.725028351819348e-05, + "loss": 2.1005, + "step": 23251 + }, + { + "epoch": 2.7128689767821723, + "grad_norm": 1.0551371574401855, + "learning_rate": 8.72366523244295e-05, + "loss": 1.7469, + "step": 23252 + }, + { + "epoch": 2.712985649282464, + "grad_norm": 1.1254419088363647, + "learning_rate": 8.722302179687679e-05, + "loss": 1.9712, + "step": 23253 + }, + { + "epoch": 2.7131023217827557, + "grad_norm": 1.1547086238861084, + "learning_rate": 8.720939193567667e-05, + "loss": 1.9793, + "step": 23254 + }, + { + "epoch": 2.7132189942830474, + "grad_norm": 1.2831192016601562, + "learning_rate": 8.71957627409704e-05, + "loss": 2.0165, + "step": 23255 + }, + { + "epoch": 2.713335666783339, + "grad_norm": 1.1080752611160278, + "learning_rate": 8.718213421289937e-05, + "loss": 1.9059, + "step": 23256 + }, + { + "epoch": 2.7134523392836307, + "grad_norm": 1.143534779548645, + "learning_rate": 8.716850635160477e-05, + "loss": 2.0501, + "step": 23257 + }, + { + "epoch": 2.7135690117839224, + "grad_norm": 1.0712432861328125, + "learning_rate": 8.715487915722792e-05, + "loss": 1.9803, + "step": 23258 + }, + { + "epoch": 2.713685684284214, + "grad_norm": 1.1039730310440063, + "learning_rate": 8.714125262991014e-05, + "loss": 1.9047, + "step": 23259 + }, + { + "epoch": 2.7138023567845058, + "grad_norm": 1.0153706073760986, + "learning_rate": 8.712762676979258e-05, + "loss": 2.2562, + "step": 23260 + }, + { + "epoch": 2.7139190292847974, + "grad_norm": 1.1037275791168213, + "learning_rate": 8.711400157701665e-05, + "loss": 1.8969, + "step": 23261 + }, + { + "epoch": 2.714035701785089, + "grad_norm": 1.1636996269226074, + "learning_rate": 8.710037705172345e-05, + "loss": 2.0556, + "step": 23262 + }, + { + "epoch": 2.714152374285381, + "grad_norm": 1.1744550466537476, + "learning_rate": 8.708675319405435e-05, + "loss": 1.9694, + "step": 23263 + }, + { + "epoch": 2.7142690467856725, + "grad_norm": 1.0381468534469604, + "learning_rate": 8.70731300041505e-05, + "loss": 2.1099, + "step": 23264 + }, + { + "epoch": 2.714385719285964, + "grad_norm": 1.2902079820632935, + "learning_rate": 8.705950748215318e-05, + "loss": 2.0829, + "step": 23265 + }, + { + "epoch": 2.714502391786256, + "grad_norm": 1.165200114250183, + "learning_rate": 8.704588562820355e-05, + "loss": 1.9496, + "step": 23266 + }, + { + "epoch": 2.7146190642865475, + "grad_norm": 1.1150175333023071, + "learning_rate": 8.703226444244292e-05, + "loss": 1.9549, + "step": 23267 + }, + { + "epoch": 2.714735736786839, + "grad_norm": 1.086632251739502, + "learning_rate": 8.701864392501242e-05, + "loss": 1.973, + "step": 23268 + }, + { + "epoch": 2.714852409287131, + "grad_norm": 1.129471778869629, + "learning_rate": 8.700502407605331e-05, + "loss": 1.9872, + "step": 23269 + }, + { + "epoch": 2.7149690817874226, + "grad_norm": 1.1559617519378662, + "learning_rate": 8.699140489570676e-05, + "loss": 1.8726, + "step": 23270 + }, + { + "epoch": 2.7150857542877143, + "grad_norm": 1.0757246017456055, + "learning_rate": 8.697778638411392e-05, + "loss": 1.9638, + "step": 23271 + }, + { + "epoch": 2.715202426788006, + "grad_norm": 1.0503923892974854, + "learning_rate": 8.696416854141603e-05, + "loss": 1.997, + "step": 23272 + }, + { + "epoch": 2.7153190992882976, + "grad_norm": 1.1830884218215942, + "learning_rate": 8.695055136775421e-05, + "loss": 2.0383, + "step": 23273 + }, + { + "epoch": 2.7154357717885893, + "grad_norm": 1.1572026014328003, + "learning_rate": 8.693693486326969e-05, + "loss": 2.0381, + "step": 23274 + }, + { + "epoch": 2.715552444288881, + "grad_norm": 1.0656023025512695, + "learning_rate": 8.692331902810357e-05, + "loss": 1.7027, + "step": 23275 + }, + { + "epoch": 2.7156691167891727, + "grad_norm": 1.090303897857666, + "learning_rate": 8.69097038623971e-05, + "loss": 1.8122, + "step": 23276 + }, + { + "epoch": 2.7157857892894643, + "grad_norm": 1.181834101676941, + "learning_rate": 8.689608936629125e-05, + "loss": 1.9204, + "step": 23277 + }, + { + "epoch": 2.715902461789756, + "grad_norm": 1.1908442974090576, + "learning_rate": 8.688247553992738e-05, + "loss": 1.9171, + "step": 23278 + }, + { + "epoch": 2.7160191342900477, + "grad_norm": 1.126741647720337, + "learning_rate": 8.686886238344646e-05, + "loss": 1.9292, + "step": 23279 + }, + { + "epoch": 2.7161358067903394, + "grad_norm": 1.0765821933746338, + "learning_rate": 8.685524989698969e-05, + "loss": 1.9722, + "step": 23280 + }, + { + "epoch": 2.716252479290631, + "grad_norm": 1.122902274131775, + "learning_rate": 8.684163808069812e-05, + "loss": 2.0112, + "step": 23281 + }, + { + "epoch": 2.7163691517909228, + "grad_norm": 1.1528984308242798, + "learning_rate": 8.682802693471301e-05, + "loss": 1.9917, + "step": 23282 + }, + { + "epoch": 2.7164858242912144, + "grad_norm": 1.0191774368286133, + "learning_rate": 8.681441645917526e-05, + "loss": 1.9745, + "step": 23283 + }, + { + "epoch": 2.716602496791506, + "grad_norm": 1.1930381059646606, + "learning_rate": 8.680080665422609e-05, + "loss": 1.8132, + "step": 23284 + }, + { + "epoch": 2.716719169291798, + "grad_norm": 1.053288221359253, + "learning_rate": 8.678719752000665e-05, + "loss": 1.8049, + "step": 23285 + }, + { + "epoch": 2.7168358417920895, + "grad_norm": 1.190943956375122, + "learning_rate": 8.677358905665795e-05, + "loss": 1.9636, + "step": 23286 + }, + { + "epoch": 2.716952514292381, + "grad_norm": 1.2235441207885742, + "learning_rate": 8.675998126432107e-05, + "loss": 2.1587, + "step": 23287 + }, + { + "epoch": 2.717069186792673, + "grad_norm": 1.195744514465332, + "learning_rate": 8.674637414313699e-05, + "loss": 1.943, + "step": 23288 + }, + { + "epoch": 2.7171858592929645, + "grad_norm": 1.1834975481033325, + "learning_rate": 8.673276769324695e-05, + "loss": 1.9019, + "step": 23289 + }, + { + "epoch": 2.717302531793256, + "grad_norm": 1.053670883178711, + "learning_rate": 8.671916191479183e-05, + "loss": 1.9122, + "step": 23290 + }, + { + "epoch": 2.717419204293548, + "grad_norm": 1.2419278621673584, + "learning_rate": 8.670555680791288e-05, + "loss": 2.1007, + "step": 23291 + }, + { + "epoch": 2.7175358767938396, + "grad_norm": 1.1948386430740356, + "learning_rate": 8.669195237275093e-05, + "loss": 1.9511, + "step": 23292 + }, + { + "epoch": 2.7176525492941312, + "grad_norm": 1.1189836263656616, + "learning_rate": 8.66783486094472e-05, + "loss": 2.0749, + "step": 23293 + }, + { + "epoch": 2.717769221794423, + "grad_norm": 1.2291698455810547, + "learning_rate": 8.666474551814259e-05, + "loss": 2.1876, + "step": 23294 + }, + { + "epoch": 2.7178858942947146, + "grad_norm": 1.1626991033554077, + "learning_rate": 8.665114309897818e-05, + "loss": 2.0719, + "step": 23295 + }, + { + "epoch": 2.7180025667950063, + "grad_norm": 1.1581064462661743, + "learning_rate": 8.663754135209496e-05, + "loss": 2.0478, + "step": 23296 + }, + { + "epoch": 2.718119239295298, + "grad_norm": 1.19234299659729, + "learning_rate": 8.662394027763398e-05, + "loss": 1.9702, + "step": 23297 + }, + { + "epoch": 2.7182359117955897, + "grad_norm": 1.1873482465744019, + "learning_rate": 8.661033987573618e-05, + "loss": 2.0198, + "step": 23298 + }, + { + "epoch": 2.7183525842958813, + "grad_norm": 1.2202436923980713, + "learning_rate": 8.659674014654257e-05, + "loss": 2.2075, + "step": 23299 + }, + { + "epoch": 2.718469256796173, + "grad_norm": 1.214615821838379, + "learning_rate": 8.65831410901942e-05, + "loss": 2.081, + "step": 23300 + }, + { + "epoch": 2.7185859292964647, + "grad_norm": 1.1859303712844849, + "learning_rate": 8.656954270683195e-05, + "loss": 1.9963, + "step": 23301 + }, + { + "epoch": 2.7187026017967564, + "grad_norm": 1.1672229766845703, + "learning_rate": 8.655594499659691e-05, + "loss": 2.0841, + "step": 23302 + }, + { + "epoch": 2.718819274297048, + "grad_norm": 1.0393987894058228, + "learning_rate": 8.654234795963e-05, + "loss": 2.1029, + "step": 23303 + }, + { + "epoch": 2.7189359467973397, + "grad_norm": 1.2045000791549683, + "learning_rate": 8.652875159607208e-05, + "loss": 2.0008, + "step": 23304 + }, + { + "epoch": 2.7190526192976314, + "grad_norm": 1.1008542776107788, + "learning_rate": 8.651515590606424e-05, + "loss": 2.0063, + "step": 23305 + }, + { + "epoch": 2.719169291797923, + "grad_norm": 1.1555469036102295, + "learning_rate": 8.650156088974738e-05, + "loss": 2.1326, + "step": 23306 + }, + { + "epoch": 2.719285964298215, + "grad_norm": 1.1101223230361938, + "learning_rate": 8.648796654726237e-05, + "loss": 1.9446, + "step": 23307 + }, + { + "epoch": 2.7194026367985065, + "grad_norm": 1.1129313707351685, + "learning_rate": 8.647437287875023e-05, + "loss": 1.8557, + "step": 23308 + }, + { + "epoch": 2.719519309298798, + "grad_norm": 1.2229667901992798, + "learning_rate": 8.646077988435181e-05, + "loss": 2.013, + "step": 23309 + }, + { + "epoch": 2.71963598179909, + "grad_norm": 1.0686935186386108, + "learning_rate": 8.644718756420812e-05, + "loss": 2.0279, + "step": 23310 + }, + { + "epoch": 2.7197526542993815, + "grad_norm": 1.160593867301941, + "learning_rate": 8.643359591845998e-05, + "loss": 1.9026, + "step": 23311 + }, + { + "epoch": 2.719869326799673, + "grad_norm": 1.252360463142395, + "learning_rate": 8.642000494724832e-05, + "loss": 1.9457, + "step": 23312 + }, + { + "epoch": 2.719985999299965, + "grad_norm": 1.3045799732208252, + "learning_rate": 8.64064146507141e-05, + "loss": 2.0768, + "step": 23313 + }, + { + "epoch": 2.7201026718002566, + "grad_norm": 1.061330795288086, + "learning_rate": 8.63928250289981e-05, + "loss": 1.9054, + "step": 23314 + }, + { + "epoch": 2.7202193443005482, + "grad_norm": 1.266409993171692, + "learning_rate": 8.637923608224132e-05, + "loss": 2.1842, + "step": 23315 + }, + { + "epoch": 2.72033601680084, + "grad_norm": 1.1451938152313232, + "learning_rate": 8.636564781058452e-05, + "loss": 2.0127, + "step": 23316 + }, + { + "epoch": 2.7204526893011316, + "grad_norm": 1.03715181350708, + "learning_rate": 8.635206021416867e-05, + "loss": 2.05, + "step": 23317 + }, + { + "epoch": 2.7205693618014233, + "grad_norm": 1.189008355140686, + "learning_rate": 8.633847329313454e-05, + "loss": 1.9763, + "step": 23318 + }, + { + "epoch": 2.720686034301715, + "grad_norm": 1.1156083345413208, + "learning_rate": 8.632488704762308e-05, + "loss": 1.9207, + "step": 23319 + }, + { + "epoch": 2.7208027068020066, + "grad_norm": 1.2684481143951416, + "learning_rate": 8.631130147777502e-05, + "loss": 2.052, + "step": 23320 + }, + { + "epoch": 2.7209193793022983, + "grad_norm": 1.2083375453948975, + "learning_rate": 8.629771658373134e-05, + "loss": 1.9594, + "step": 23321 + }, + { + "epoch": 2.72103605180259, + "grad_norm": 1.2902865409851074, + "learning_rate": 8.62841323656328e-05, + "loss": 1.9873, + "step": 23322 + }, + { + "epoch": 2.7211527243028817, + "grad_norm": 1.1079400777816772, + "learning_rate": 8.627054882362019e-05, + "loss": 1.8426, + "step": 23323 + }, + { + "epoch": 2.7212693968031734, + "grad_norm": 1.1504597663879395, + "learning_rate": 8.625696595783433e-05, + "loss": 2.0466, + "step": 23324 + }, + { + "epoch": 2.721386069303465, + "grad_norm": 1.0334017276763916, + "learning_rate": 8.624338376841605e-05, + "loss": 1.8751, + "step": 23325 + }, + { + "epoch": 2.7215027418037567, + "grad_norm": 1.0759198665618896, + "learning_rate": 8.622980225550625e-05, + "loss": 1.9781, + "step": 23326 + }, + { + "epoch": 2.7216194143040484, + "grad_norm": 1.0418299436569214, + "learning_rate": 8.621622141924557e-05, + "loss": 1.9665, + "step": 23327 + }, + { + "epoch": 2.72173608680434, + "grad_norm": 1.2243837118148804, + "learning_rate": 8.620264125977496e-05, + "loss": 1.7069, + "step": 23328 + }, + { + "epoch": 2.7218527593046318, + "grad_norm": 1.0691412687301636, + "learning_rate": 8.618906177723504e-05, + "loss": 1.9272, + "step": 23329 + }, + { + "epoch": 2.7219694318049235, + "grad_norm": 1.075510859489441, + "learning_rate": 8.617548297176675e-05, + "loss": 1.9809, + "step": 23330 + }, + { + "epoch": 2.722086104305215, + "grad_norm": 1.1129662990570068, + "learning_rate": 8.616190484351074e-05, + "loss": 1.9152, + "step": 23331 + }, + { + "epoch": 2.722202776805507, + "grad_norm": 1.0782512426376343, + "learning_rate": 8.614832739260785e-05, + "loss": 1.97, + "step": 23332 + }, + { + "epoch": 2.7223194493057985, + "grad_norm": 1.0673750638961792, + "learning_rate": 8.613475061919873e-05, + "loss": 1.9165, + "step": 23333 + }, + { + "epoch": 2.72243612180609, + "grad_norm": 1.1385067701339722, + "learning_rate": 8.612117452342427e-05, + "loss": 2.0661, + "step": 23334 + }, + { + "epoch": 2.722552794306382, + "grad_norm": 1.1713634729385376, + "learning_rate": 8.610759910542507e-05, + "loss": 2.1613, + "step": 23335 + }, + { + "epoch": 2.7226694668066735, + "grad_norm": 1.1234403848648071, + "learning_rate": 8.609402436534203e-05, + "loss": 1.8375, + "step": 23336 + }, + { + "epoch": 2.7227861393069652, + "grad_norm": 1.2020601034164429, + "learning_rate": 8.608045030331567e-05, + "loss": 2.1913, + "step": 23337 + }, + { + "epoch": 2.722902811807257, + "grad_norm": 1.124170184135437, + "learning_rate": 8.606687691948692e-05, + "loss": 1.834, + "step": 23338 + }, + { + "epoch": 2.7230194843075486, + "grad_norm": 1.3544692993164062, + "learning_rate": 8.60533042139964e-05, + "loss": 1.9567, + "step": 23339 + }, + { + "epoch": 2.7231361568078403, + "grad_norm": 1.2191569805145264, + "learning_rate": 8.603973218698476e-05, + "loss": 1.9574, + "step": 23340 + }, + { + "epoch": 2.723252829308132, + "grad_norm": 1.2000658512115479, + "learning_rate": 8.602616083859282e-05, + "loss": 2.2108, + "step": 23341 + }, + { + "epoch": 2.7233695018084236, + "grad_norm": 0.9324716329574585, + "learning_rate": 8.601259016896111e-05, + "loss": 1.8401, + "step": 23342 + }, + { + "epoch": 2.7234861743087153, + "grad_norm": 1.1398733854293823, + "learning_rate": 8.59990201782305e-05, + "loss": 2.0104, + "step": 23343 + }, + { + "epoch": 2.723602846809007, + "grad_norm": 1.0599238872528076, + "learning_rate": 8.598545086654153e-05, + "loss": 1.9891, + "step": 23344 + }, + { + "epoch": 2.7237195193092987, + "grad_norm": 1.0530482530593872, + "learning_rate": 8.597188223403497e-05, + "loss": 2.1083, + "step": 23345 + }, + { + "epoch": 2.7238361918095904, + "grad_norm": 0.9526976346969604, + "learning_rate": 8.595831428085136e-05, + "loss": 1.9132, + "step": 23346 + }, + { + "epoch": 2.723952864309882, + "grad_norm": 1.2764618396759033, + "learning_rate": 8.594474700713152e-05, + "loss": 2.0716, + "step": 23347 + }, + { + "epoch": 2.7240695368101737, + "grad_norm": 1.0836764574050903, + "learning_rate": 8.593118041301598e-05, + "loss": 1.8315, + "step": 23348 + }, + { + "epoch": 2.7241862093104654, + "grad_norm": 1.1462122201919556, + "learning_rate": 8.591761449864542e-05, + "loss": 1.796, + "step": 23349 + }, + { + "epoch": 2.724302881810757, + "grad_norm": 1.1508761644363403, + "learning_rate": 8.590404926416046e-05, + "loss": 1.8889, + "step": 23350 + }, + { + "epoch": 2.7244195543110488, + "grad_norm": 1.1739401817321777, + "learning_rate": 8.589048470970172e-05, + "loss": 1.956, + "step": 23351 + }, + { + "epoch": 2.7245362268113404, + "grad_norm": 1.2650492191314697, + "learning_rate": 8.587692083540992e-05, + "loss": 2.059, + "step": 23352 + }, + { + "epoch": 2.724652899311632, + "grad_norm": 1.3728792667388916, + "learning_rate": 8.586335764142556e-05, + "loss": 1.8977, + "step": 23353 + }, + { + "epoch": 2.724769571811924, + "grad_norm": 1.1412445306777954, + "learning_rate": 8.584979512788934e-05, + "loss": 2.1019, + "step": 23354 + }, + { + "epoch": 2.7248862443122155, + "grad_norm": 0.9504721760749817, + "learning_rate": 8.58362332949418e-05, + "loss": 1.7912, + "step": 23355 + }, + { + "epoch": 2.725002916812507, + "grad_norm": 1.1166397333145142, + "learning_rate": 8.58226721427235e-05, + "loss": 1.9382, + "step": 23356 + }, + { + "epoch": 2.725119589312799, + "grad_norm": 1.225756287574768, + "learning_rate": 8.580911167137513e-05, + "loss": 1.9799, + "step": 23357 + }, + { + "epoch": 2.7252362618130905, + "grad_norm": 1.2441003322601318, + "learning_rate": 8.579555188103721e-05, + "loss": 1.892, + "step": 23358 + }, + { + "epoch": 2.725352934313382, + "grad_norm": 1.1733072996139526, + "learning_rate": 8.578199277185025e-05, + "loss": 2.0717, + "step": 23359 + }, + { + "epoch": 2.725469606813674, + "grad_norm": 1.282413125038147, + "learning_rate": 8.576843434395496e-05, + "loss": 1.9425, + "step": 23360 + }, + { + "epoch": 2.7255862793139656, + "grad_norm": 1.1662087440490723, + "learning_rate": 8.575487659749173e-05, + "loss": 1.8577, + "step": 23361 + }, + { + "epoch": 2.7257029518142573, + "grad_norm": 1.5263123512268066, + "learning_rate": 8.57413195326013e-05, + "loss": 1.8586, + "step": 23362 + }, + { + "epoch": 2.725819624314549, + "grad_norm": 1.0438398122787476, + "learning_rate": 8.572776314942404e-05, + "loss": 1.838, + "step": 23363 + }, + { + "epoch": 2.7259362968148406, + "grad_norm": 1.2397246360778809, + "learning_rate": 8.571420744810057e-05, + "loss": 2.1064, + "step": 23364 + }, + { + "epoch": 2.7260529693151323, + "grad_norm": 1.194000005722046, + "learning_rate": 8.570065242877147e-05, + "loss": 2.0548, + "step": 23365 + }, + { + "epoch": 2.726169641815424, + "grad_norm": 1.2334556579589844, + "learning_rate": 8.568709809157713e-05, + "loss": 1.8926, + "step": 23366 + }, + { + "epoch": 2.7262863143157157, + "grad_norm": 1.1693212985992432, + "learning_rate": 8.567354443665824e-05, + "loss": 1.8074, + "step": 23367 + }, + { + "epoch": 2.7264029868160073, + "grad_norm": 1.078666090965271, + "learning_rate": 8.565999146415513e-05, + "loss": 2.1392, + "step": 23368 + }, + { + "epoch": 2.726519659316299, + "grad_norm": 1.0291109085083008, + "learning_rate": 8.564643917420846e-05, + "loss": 1.9399, + "step": 23369 + }, + { + "epoch": 2.7266363318165907, + "grad_norm": 1.109961748123169, + "learning_rate": 8.563288756695859e-05, + "loss": 2.1797, + "step": 23370 + }, + { + "epoch": 2.7267530043168824, + "grad_norm": 1.5651720762252808, + "learning_rate": 8.561933664254612e-05, + "loss": 2.0827, + "step": 23371 + }, + { + "epoch": 2.726869676817174, + "grad_norm": 1.103503704071045, + "learning_rate": 8.560578640111148e-05, + "loss": 2.0652, + "step": 23372 + }, + { + "epoch": 2.7269863493174658, + "grad_norm": 1.1669715642929077, + "learning_rate": 8.559223684279511e-05, + "loss": 1.9728, + "step": 23373 + }, + { + "epoch": 2.7271030218177574, + "grad_norm": 1.1563494205474854, + "learning_rate": 8.557868796773756e-05, + "loss": 1.8591, + "step": 23374 + }, + { + "epoch": 2.727219694318049, + "grad_norm": 1.2255414724349976, + "learning_rate": 8.556513977607925e-05, + "loss": 2.09, + "step": 23375 + }, + { + "epoch": 2.727336366818341, + "grad_norm": 0.9767301082611084, + "learning_rate": 8.555159226796054e-05, + "loss": 1.9001, + "step": 23376 + }, + { + "epoch": 2.7274530393186325, + "grad_norm": 1.0913454294204712, + "learning_rate": 8.553804544352199e-05, + "loss": 2.0698, + "step": 23377 + }, + { + "epoch": 2.727569711818924, + "grad_norm": 1.2166050672531128, + "learning_rate": 8.552449930290401e-05, + "loss": 1.9615, + "step": 23378 + }, + { + "epoch": 2.727686384319216, + "grad_norm": 1.1635417938232422, + "learning_rate": 8.551095384624704e-05, + "loss": 2.0601, + "step": 23379 + }, + { + "epoch": 2.7278030568195075, + "grad_norm": 1.2837276458740234, + "learning_rate": 8.54974090736915e-05, + "loss": 2.0426, + "step": 23380 + }, + { + "epoch": 2.727919729319799, + "grad_norm": 1.1817498207092285, + "learning_rate": 8.548386498537777e-05, + "loss": 1.8239, + "step": 23381 + }, + { + "epoch": 2.728036401820091, + "grad_norm": 1.0572749376296997, + "learning_rate": 8.547032158144635e-05, + "loss": 1.8713, + "step": 23382 + }, + { + "epoch": 2.7281530743203826, + "grad_norm": 1.0876824855804443, + "learning_rate": 8.545677886203752e-05, + "loss": 2.014, + "step": 23383 + }, + { + "epoch": 2.7282697468206742, + "grad_norm": 1.2915587425231934, + "learning_rate": 8.544323682729179e-05, + "loss": 2.0103, + "step": 23384 + }, + { + "epoch": 2.728386419320966, + "grad_norm": 1.1250834465026855, + "learning_rate": 8.542969547734944e-05, + "loss": 2.0522, + "step": 23385 + }, + { + "epoch": 2.7285030918212576, + "grad_norm": 1.320841908454895, + "learning_rate": 8.541615481235099e-05, + "loss": 2.0773, + "step": 23386 + }, + { + "epoch": 2.7286197643215493, + "grad_norm": 1.1253657341003418, + "learning_rate": 8.540261483243665e-05, + "loss": 2.0321, + "step": 23387 + }, + { + "epoch": 2.728736436821841, + "grad_norm": 1.0319136381149292, + "learning_rate": 8.538907553774695e-05, + "loss": 2.0154, + "step": 23388 + }, + { + "epoch": 2.7288531093221327, + "grad_norm": 1.2784373760223389, + "learning_rate": 8.537553692842212e-05, + "loss": 2.0336, + "step": 23389 + }, + { + "epoch": 2.7289697818224243, + "grad_norm": 1.1328299045562744, + "learning_rate": 8.536199900460262e-05, + "loss": 1.8061, + "step": 23390 + }, + { + "epoch": 2.729086454322716, + "grad_norm": 1.107505440711975, + "learning_rate": 8.534846176642873e-05, + "loss": 1.9669, + "step": 23391 + }, + { + "epoch": 2.7292031268230077, + "grad_norm": 1.146488070487976, + "learning_rate": 8.533492521404076e-05, + "loss": 1.9428, + "step": 23392 + }, + { + "epoch": 2.7293197993232994, + "grad_norm": 1.045682668685913, + "learning_rate": 8.532138934757913e-05, + "loss": 1.8173, + "step": 23393 + }, + { + "epoch": 2.729436471823591, + "grad_norm": 1.098190188407898, + "learning_rate": 8.530785416718407e-05, + "loss": 1.9588, + "step": 23394 + }, + { + "epoch": 2.7295531443238827, + "grad_norm": 1.0381497144699097, + "learning_rate": 8.529431967299599e-05, + "loss": 1.9523, + "step": 23395 + }, + { + "epoch": 2.7296698168241744, + "grad_norm": 1.1826013326644897, + "learning_rate": 8.528078586515513e-05, + "loss": 1.9496, + "step": 23396 + }, + { + "epoch": 2.729786489324466, + "grad_norm": 1.027308702468872, + "learning_rate": 8.526725274380185e-05, + "loss": 1.9796, + "step": 23397 + }, + { + "epoch": 2.729903161824758, + "grad_norm": 1.2095109224319458, + "learning_rate": 8.525372030907639e-05, + "loss": 2.1507, + "step": 23398 + }, + { + "epoch": 2.7300198343250495, + "grad_norm": 1.108818769454956, + "learning_rate": 8.524018856111906e-05, + "loss": 1.9776, + "step": 23399 + }, + { + "epoch": 2.730136506825341, + "grad_norm": 1.247910976409912, + "learning_rate": 8.522665750007015e-05, + "loss": 1.9273, + "step": 23400 + }, + { + "epoch": 2.730253179325633, + "grad_norm": 1.104251742362976, + "learning_rate": 8.521312712606998e-05, + "loss": 2.0046, + "step": 23401 + }, + { + "epoch": 2.7303698518259245, + "grad_norm": 1.1352449655532837, + "learning_rate": 8.51995974392587e-05, + "loss": 1.9713, + "step": 23402 + }, + { + "epoch": 2.730486524326216, + "grad_norm": 1.1539368629455566, + "learning_rate": 8.518606843977667e-05, + "loss": 1.9245, + "step": 23403 + }, + { + "epoch": 2.730603196826508, + "grad_norm": 0.9632248878479004, + "learning_rate": 8.517254012776415e-05, + "loss": 1.8728, + "step": 23404 + }, + { + "epoch": 2.7307198693267996, + "grad_norm": 1.2101683616638184, + "learning_rate": 8.515901250336129e-05, + "loss": 2.0256, + "step": 23405 + }, + { + "epoch": 2.7308365418270912, + "grad_norm": 1.1707836389541626, + "learning_rate": 8.514548556670846e-05, + "loss": 1.8182, + "step": 23406 + }, + { + "epoch": 2.730953214327383, + "grad_norm": 1.06849205493927, + "learning_rate": 8.513195931794584e-05, + "loss": 2.0367, + "step": 23407 + }, + { + "epoch": 2.7310698868276746, + "grad_norm": 1.1987261772155762, + "learning_rate": 8.511843375721363e-05, + "loss": 2.1363, + "step": 23408 + }, + { + "epoch": 2.7311865593279663, + "grad_norm": 0.9986512064933777, + "learning_rate": 8.510490888465198e-05, + "loss": 1.812, + "step": 23409 + }, + { + "epoch": 2.731303231828258, + "grad_norm": 1.2114243507385254, + "learning_rate": 8.509138470040126e-05, + "loss": 2.0257, + "step": 23410 + }, + { + "epoch": 2.7314199043285496, + "grad_norm": 0.9321691393852234, + "learning_rate": 8.507786120460153e-05, + "loss": 1.9063, + "step": 23411 + }, + { + "epoch": 2.7315365768288413, + "grad_norm": 1.0479681491851807, + "learning_rate": 8.50643383973931e-05, + "loss": 1.9473, + "step": 23412 + }, + { + "epoch": 2.731653249329133, + "grad_norm": 1.1351186037063599, + "learning_rate": 8.505081627891605e-05, + "loss": 1.7918, + "step": 23413 + }, + { + "epoch": 2.7317699218294247, + "grad_norm": 1.23249089717865, + "learning_rate": 8.50372948493107e-05, + "loss": 2.0541, + "step": 23414 + }, + { + "epoch": 2.7318865943297164, + "grad_norm": 1.0359399318695068, + "learning_rate": 8.502377410871707e-05, + "loss": 1.8735, + "step": 23415 + }, + { + "epoch": 2.732003266830008, + "grad_norm": 1.0714106559753418, + "learning_rate": 8.50102540572754e-05, + "loss": 1.7401, + "step": 23416 + }, + { + "epoch": 2.7321199393302997, + "grad_norm": 1.0885344743728638, + "learning_rate": 8.499673469512591e-05, + "loss": 2.0692, + "step": 23417 + }, + { + "epoch": 2.7322366118305914, + "grad_norm": 1.1158082485198975, + "learning_rate": 8.498321602240866e-05, + "loss": 2.0164, + "step": 23418 + }, + { + "epoch": 2.732353284330883, + "grad_norm": 1.2387518882751465, + "learning_rate": 8.496969803926388e-05, + "loss": 1.9778, + "step": 23419 + }, + { + "epoch": 2.7324699568311748, + "grad_norm": 1.2481151819229126, + "learning_rate": 8.495618074583162e-05, + "loss": 1.9611, + "step": 23420 + }, + { + "epoch": 2.7325866293314665, + "grad_norm": 1.2129483222961426, + "learning_rate": 8.49426641422521e-05, + "loss": 2.2067, + "step": 23421 + }, + { + "epoch": 2.732703301831758, + "grad_norm": 1.0439763069152832, + "learning_rate": 8.492914822866534e-05, + "loss": 1.8284, + "step": 23422 + }, + { + "epoch": 2.73281997433205, + "grad_norm": 1.274695634841919, + "learning_rate": 8.491563300521161e-05, + "loss": 1.9949, + "step": 23423 + }, + { + "epoch": 2.7329366468323415, + "grad_norm": 1.230622410774231, + "learning_rate": 8.490211847203091e-05, + "loss": 2.0541, + "step": 23424 + }, + { + "epoch": 2.733053319332633, + "grad_norm": 1.0556492805480957, + "learning_rate": 8.488860462926333e-05, + "loss": 1.8485, + "step": 23425 + }, + { + "epoch": 2.733169991832925, + "grad_norm": 0.9701511859893799, + "learning_rate": 8.487509147704903e-05, + "loss": 1.8582, + "step": 23426 + }, + { + "epoch": 2.7332866643332165, + "grad_norm": 1.154280662536621, + "learning_rate": 8.486157901552807e-05, + "loss": 1.9161, + "step": 23427 + }, + { + "epoch": 2.7334033368335082, + "grad_norm": 1.1559383869171143, + "learning_rate": 8.484806724484049e-05, + "loss": 1.8899, + "step": 23428 + }, + { + "epoch": 2.7335200093338, + "grad_norm": 1.0808210372924805, + "learning_rate": 8.483455616512643e-05, + "loss": 1.9624, + "step": 23429 + }, + { + "epoch": 2.7336366818340916, + "grad_norm": 0.9646337032318115, + "learning_rate": 8.482104577652597e-05, + "loss": 2.0551, + "step": 23430 + }, + { + "epoch": 2.7337533543343833, + "grad_norm": 1.248225450515747, + "learning_rate": 8.480753607917907e-05, + "loss": 1.9656, + "step": 23431 + }, + { + "epoch": 2.733870026834675, + "grad_norm": 1.2339428663253784, + "learning_rate": 8.479402707322592e-05, + "loss": 1.8478, + "step": 23432 + }, + { + "epoch": 2.7339866993349666, + "grad_norm": 1.2236382961273193, + "learning_rate": 8.478051875880644e-05, + "loss": 2.0416, + "step": 23433 + }, + { + "epoch": 2.7341033718352583, + "grad_norm": 1.141071081161499, + "learning_rate": 8.476701113606078e-05, + "loss": 2.0296, + "step": 23434 + }, + { + "epoch": 2.73422004433555, + "grad_norm": 1.154538869857788, + "learning_rate": 8.475350420512887e-05, + "loss": 1.9552, + "step": 23435 + }, + { + "epoch": 2.7343367168358417, + "grad_norm": 1.1542447805404663, + "learning_rate": 8.473999796615083e-05, + "loss": 1.9846, + "step": 23436 + }, + { + "epoch": 2.7344533893361334, + "grad_norm": 1.253594160079956, + "learning_rate": 8.472649241926658e-05, + "loss": 2.0421, + "step": 23437 + }, + { + "epoch": 2.734570061836425, + "grad_norm": 1.0653162002563477, + "learning_rate": 8.471298756461625e-05, + "loss": 2.0647, + "step": 23438 + }, + { + "epoch": 2.7346867343367167, + "grad_norm": 1.1677716970443726, + "learning_rate": 8.46994834023397e-05, + "loss": 1.7289, + "step": 23439 + }, + { + "epoch": 2.7348034068370084, + "grad_norm": 1.134783148765564, + "learning_rate": 8.468597993257708e-05, + "loss": 2.0055, + "step": 23440 + }, + { + "epoch": 2.7349200793373, + "grad_norm": 1.0842669010162354, + "learning_rate": 8.467247715546827e-05, + "loss": 1.8152, + "step": 23441 + }, + { + "epoch": 2.7350367518375918, + "grad_norm": 1.1406956911087036, + "learning_rate": 8.465897507115327e-05, + "loss": 2.0295, + "step": 23442 + }, + { + "epoch": 2.7351534243378834, + "grad_norm": 1.3427186012268066, + "learning_rate": 8.464547367977207e-05, + "loss": 1.8835, + "step": 23443 + }, + { + "epoch": 2.735270096838175, + "grad_norm": 1.003381371498108, + "learning_rate": 8.46319729814646e-05, + "loss": 1.8387, + "step": 23444 + }, + { + "epoch": 2.735386769338467, + "grad_norm": 1.11920964717865, + "learning_rate": 8.461847297637094e-05, + "loss": 2.0021, + "step": 23445 + }, + { + "epoch": 2.7355034418387585, + "grad_norm": 1.0777314901351929, + "learning_rate": 8.460497366463086e-05, + "loss": 1.9525, + "step": 23446 + }, + { + "epoch": 2.73562011433905, + "grad_norm": 1.0186705589294434, + "learning_rate": 8.459147504638449e-05, + "loss": 1.8515, + "step": 23447 + }, + { + "epoch": 2.735736786839342, + "grad_norm": 1.2144560813903809, + "learning_rate": 8.457797712177161e-05, + "loss": 1.9642, + "step": 23448 + }, + { + "epoch": 2.7358534593396335, + "grad_norm": 1.2342712879180908, + "learning_rate": 8.45644798909323e-05, + "loss": 1.8556, + "step": 23449 + }, + { + "epoch": 2.735970131839925, + "grad_norm": 1.2350332736968994, + "learning_rate": 8.455098335400633e-05, + "loss": 1.9887, + "step": 23450 + }, + { + "epoch": 2.736086804340217, + "grad_norm": 1.1280947923660278, + "learning_rate": 8.453748751113375e-05, + "loss": 1.9558, + "step": 23451 + }, + { + "epoch": 2.7362034768405086, + "grad_norm": 1.1119478940963745, + "learning_rate": 8.45239923624544e-05, + "loss": 2.0738, + "step": 23452 + }, + { + "epoch": 2.7363201493408003, + "grad_norm": 1.0901625156402588, + "learning_rate": 8.45104979081082e-05, + "loss": 2.079, + "step": 23453 + }, + { + "epoch": 2.736436821841092, + "grad_norm": 1.1130300760269165, + "learning_rate": 8.449700414823503e-05, + "loss": 2.0635, + "step": 23454 + }, + { + "epoch": 2.7365534943413836, + "grad_norm": 0.9071168899536133, + "learning_rate": 8.448351108297478e-05, + "loss": 1.7907, + "step": 23455 + }, + { + "epoch": 2.7366701668416753, + "grad_norm": 1.1346644163131714, + "learning_rate": 8.447001871246738e-05, + "loss": 2.0058, + "step": 23456 + }, + { + "epoch": 2.736786839341967, + "grad_norm": 1.2078478336334229, + "learning_rate": 8.445652703685268e-05, + "loss": 2.035, + "step": 23457 + }, + { + "epoch": 2.7369035118422587, + "grad_norm": 1.184733510017395, + "learning_rate": 8.44430360562705e-05, + "loss": 2.1788, + "step": 23458 + }, + { + "epoch": 2.7370201843425503, + "grad_norm": 1.2321971654891968, + "learning_rate": 8.442954577086078e-05, + "loss": 2.0023, + "step": 23459 + }, + { + "epoch": 2.737136856842842, + "grad_norm": 1.0927091836929321, + "learning_rate": 8.44160561807633e-05, + "loss": 1.8298, + "step": 23460 + }, + { + "epoch": 2.7372535293431337, + "grad_norm": 0.9754819273948669, + "learning_rate": 8.44025672861179e-05, + "loss": 1.864, + "step": 23461 + }, + { + "epoch": 2.7373702018434254, + "grad_norm": 1.1049001216888428, + "learning_rate": 8.43890790870645e-05, + "loss": 1.8212, + "step": 23462 + }, + { + "epoch": 2.737486874343717, + "grad_norm": 1.0609148740768433, + "learning_rate": 8.437559158374281e-05, + "loss": 1.775, + "step": 23463 + }, + { + "epoch": 2.7376035468440088, + "grad_norm": 1.237830638885498, + "learning_rate": 8.436210477629282e-05, + "loss": 1.9006, + "step": 23464 + }, + { + "epoch": 2.7377202193443004, + "grad_norm": 1.1289764642715454, + "learning_rate": 8.434861866485417e-05, + "loss": 1.8919, + "step": 23465 + }, + { + "epoch": 2.737836891844592, + "grad_norm": 1.4666730165481567, + "learning_rate": 8.433513324956677e-05, + "loss": 1.9492, + "step": 23466 + }, + { + "epoch": 2.737953564344884, + "grad_norm": 1.185449242591858, + "learning_rate": 8.43216485305704e-05, + "loss": 2.2297, + "step": 23467 + }, + { + "epoch": 2.7380702368451755, + "grad_norm": 1.165372610092163, + "learning_rate": 8.430816450800483e-05, + "loss": 2.0557, + "step": 23468 + }, + { + "epoch": 2.738186909345467, + "grad_norm": 1.029500961303711, + "learning_rate": 8.429468118200993e-05, + "loss": 1.8345, + "step": 23469 + }, + { + "epoch": 2.738303581845759, + "grad_norm": 1.2193236351013184, + "learning_rate": 8.428119855272536e-05, + "loss": 2.2663, + "step": 23470 + }, + { + "epoch": 2.7384202543460505, + "grad_norm": 1.0848095417022705, + "learning_rate": 8.426771662029102e-05, + "loss": 1.9064, + "step": 23471 + }, + { + "epoch": 2.738536926846342, + "grad_norm": 1.1917755603790283, + "learning_rate": 8.425423538484657e-05, + "loss": 1.9795, + "step": 23472 + }, + { + "epoch": 2.738653599346634, + "grad_norm": 1.1650593280792236, + "learning_rate": 8.424075484653186e-05, + "loss": 1.934, + "step": 23473 + }, + { + "epoch": 2.7387702718469256, + "grad_norm": 1.2069905996322632, + "learning_rate": 8.422727500548654e-05, + "loss": 2.0894, + "step": 23474 + }, + { + "epoch": 2.7388869443472172, + "grad_norm": 1.127487301826477, + "learning_rate": 8.421379586185044e-05, + "loss": 1.9807, + "step": 23475 + }, + { + "epoch": 2.739003616847509, + "grad_norm": 1.2379828691482544, + "learning_rate": 8.420031741576327e-05, + "loss": 1.9264, + "step": 23476 + }, + { + "epoch": 2.7391202893478006, + "grad_norm": 1.0125510692596436, + "learning_rate": 8.418683966736476e-05, + "loss": 1.9806, + "step": 23477 + }, + { + "epoch": 2.7392369618480923, + "grad_norm": 1.1620105504989624, + "learning_rate": 8.417336261679458e-05, + "loss": 1.9786, + "step": 23478 + }, + { + "epoch": 2.739353634348384, + "grad_norm": 1.1450170278549194, + "learning_rate": 8.415988626419253e-05, + "loss": 1.9467, + "step": 23479 + }, + { + "epoch": 2.7394703068486757, + "grad_norm": 1.1191421747207642, + "learning_rate": 8.41464106096982e-05, + "loss": 2.0357, + "step": 23480 + }, + { + "epoch": 2.7395869793489673, + "grad_norm": 1.1742439270019531, + "learning_rate": 8.41329356534514e-05, + "loss": 2.0095, + "step": 23481 + }, + { + "epoch": 2.739703651849259, + "grad_norm": 1.284332513809204, + "learning_rate": 8.411946139559184e-05, + "loss": 2.165, + "step": 23482 + }, + { + "epoch": 2.7398203243495507, + "grad_norm": 1.1182793378829956, + "learning_rate": 8.41059878362591e-05, + "loss": 1.9146, + "step": 23483 + }, + { + "epoch": 2.7399369968498424, + "grad_norm": 1.125752329826355, + "learning_rate": 8.409251497559299e-05, + "loss": 1.8977, + "step": 23484 + }, + { + "epoch": 2.740053669350134, + "grad_norm": 1.245977759361267, + "learning_rate": 8.407904281373303e-05, + "loss": 2.0059, + "step": 23485 + }, + { + "epoch": 2.7401703418504257, + "grad_norm": 1.0655322074890137, + "learning_rate": 8.406557135081902e-05, + "loss": 1.9464, + "step": 23486 + }, + { + "epoch": 2.7402870143507174, + "grad_norm": 1.1383023262023926, + "learning_rate": 8.40521005869905e-05, + "loss": 1.9353, + "step": 23487 + }, + { + "epoch": 2.740403686851009, + "grad_norm": 1.0755459070205688, + "learning_rate": 8.403863052238727e-05, + "loss": 1.9654, + "step": 23488 + }, + { + "epoch": 2.740520359351301, + "grad_norm": 1.2238818407058716, + "learning_rate": 8.402516115714879e-05, + "loss": 2.011, + "step": 23489 + }, + { + "epoch": 2.7406370318515925, + "grad_norm": 1.2908248901367188, + "learning_rate": 8.401169249141487e-05, + "loss": 1.9758, + "step": 23490 + }, + { + "epoch": 2.740753704351884, + "grad_norm": 1.0616248846054077, + "learning_rate": 8.399822452532499e-05, + "loss": 1.9445, + "step": 23491 + }, + { + "epoch": 2.740870376852176, + "grad_norm": 1.2519346475601196, + "learning_rate": 8.39847572590189e-05, + "loss": 1.9951, + "step": 23492 + }, + { + "epoch": 2.7409870493524675, + "grad_norm": 1.1480525732040405, + "learning_rate": 8.397129069263614e-05, + "loss": 2.1373, + "step": 23493 + }, + { + "epoch": 2.741103721852759, + "grad_norm": 1.1804778575897217, + "learning_rate": 8.395782482631628e-05, + "loss": 1.8411, + "step": 23494 + }, + { + "epoch": 2.741220394353051, + "grad_norm": 1.1408005952835083, + "learning_rate": 8.394435966019904e-05, + "loss": 1.8069, + "step": 23495 + }, + { + "epoch": 2.7413370668533426, + "grad_norm": 1.158699870109558, + "learning_rate": 8.393089519442384e-05, + "loss": 2.0245, + "step": 23496 + }, + { + "epoch": 2.7414537393536342, + "grad_norm": 1.3262701034545898, + "learning_rate": 8.391743142913046e-05, + "loss": 1.9311, + "step": 23497 + }, + { + "epoch": 2.741570411853926, + "grad_norm": 1.1054766178131104, + "learning_rate": 8.390396836445831e-05, + "loss": 1.9709, + "step": 23498 + }, + { + "epoch": 2.7416870843542176, + "grad_norm": 1.1879949569702148, + "learning_rate": 8.389050600054707e-05, + "loss": 2.0287, + "step": 23499 + }, + { + "epoch": 2.7418037568545093, + "grad_norm": 1.0604815483093262, + "learning_rate": 8.387704433753626e-05, + "loss": 1.8541, + "step": 23500 + }, + { + "epoch": 2.741920429354801, + "grad_norm": 1.2027950286865234, + "learning_rate": 8.386358337556545e-05, + "loss": 2.1316, + "step": 23501 + }, + { + "epoch": 2.7420371018550926, + "grad_norm": 1.2453244924545288, + "learning_rate": 8.385012311477413e-05, + "loss": 2.0097, + "step": 23502 + }, + { + "epoch": 2.7421537743553843, + "grad_norm": 1.197390079498291, + "learning_rate": 8.383666355530195e-05, + "loss": 1.9205, + "step": 23503 + }, + { + "epoch": 2.742270446855676, + "grad_norm": 0.95986407995224, + "learning_rate": 8.382320469728831e-05, + "loss": 1.885, + "step": 23504 + }, + { + "epoch": 2.7423871193559677, + "grad_norm": 1.2160894870758057, + "learning_rate": 8.38097465408729e-05, + "loss": 1.8956, + "step": 23505 + }, + { + "epoch": 2.7425037918562594, + "grad_norm": 1.0775537490844727, + "learning_rate": 8.379628908619508e-05, + "loss": 1.8619, + "step": 23506 + }, + { + "epoch": 2.742620464356551, + "grad_norm": 1.1996057033538818, + "learning_rate": 8.37828323333944e-05, + "loss": 2.0213, + "step": 23507 + }, + { + "epoch": 2.7427371368568427, + "grad_norm": 1.1167497634887695, + "learning_rate": 8.376937628261049e-05, + "loss": 2.1322, + "step": 23508 + }, + { + "epoch": 2.7428538093571344, + "grad_norm": 0.9813701510429382, + "learning_rate": 8.375592093398273e-05, + "loss": 1.9593, + "step": 23509 + }, + { + "epoch": 2.742970481857426, + "grad_norm": 1.06111741065979, + "learning_rate": 8.374246628765061e-05, + "loss": 1.7339, + "step": 23510 + }, + { + "epoch": 2.7430871543577178, + "grad_norm": 0.9064726829528809, + "learning_rate": 8.372901234375368e-05, + "loss": 1.8211, + "step": 23511 + }, + { + "epoch": 2.7432038268580095, + "grad_norm": 1.0781444311141968, + "learning_rate": 8.371555910243135e-05, + "loss": 1.8791, + "step": 23512 + }, + { + "epoch": 2.743320499358301, + "grad_norm": 1.2511358261108398, + "learning_rate": 8.370210656382307e-05, + "loss": 1.8599, + "step": 23513 + }, + { + "epoch": 2.743437171858593, + "grad_norm": 0.9689947366714478, + "learning_rate": 8.36886547280684e-05, + "loss": 1.8237, + "step": 23514 + }, + { + "epoch": 2.7435538443588845, + "grad_norm": 1.2358222007751465, + "learning_rate": 8.367520359530668e-05, + "loss": 1.89, + "step": 23515 + }, + { + "epoch": 2.743670516859176, + "grad_norm": 1.161811113357544, + "learning_rate": 8.366175316567746e-05, + "loss": 1.9735, + "step": 23516 + }, + { + "epoch": 2.743787189359468, + "grad_norm": 1.0997343063354492, + "learning_rate": 8.364830343932008e-05, + "loss": 1.922, + "step": 23517 + }, + { + "epoch": 2.7439038618597595, + "grad_norm": 1.1074395179748535, + "learning_rate": 8.363485441637408e-05, + "loss": 1.9943, + "step": 23518 + }, + { + "epoch": 2.7440205343600512, + "grad_norm": 1.1659109592437744, + "learning_rate": 8.362140609697875e-05, + "loss": 2.066, + "step": 23519 + }, + { + "epoch": 2.744137206860343, + "grad_norm": 1.1245105266571045, + "learning_rate": 8.360795848127361e-05, + "loss": 2.1833, + "step": 23520 + }, + { + "epoch": 2.7442538793606346, + "grad_norm": 1.069420576095581, + "learning_rate": 8.359451156939808e-05, + "loss": 1.891, + "step": 23521 + }, + { + "epoch": 2.7443705518609263, + "grad_norm": 1.173539161682129, + "learning_rate": 8.35810653614915e-05, + "loss": 2.061, + "step": 23522 + }, + { + "epoch": 2.744487224361218, + "grad_norm": 1.2900922298431396, + "learning_rate": 8.356761985769334e-05, + "loss": 1.9624, + "step": 23523 + }, + { + "epoch": 2.7446038968615096, + "grad_norm": 1.099198579788208, + "learning_rate": 8.355417505814286e-05, + "loss": 1.9155, + "step": 23524 + }, + { + "epoch": 2.7447205693618013, + "grad_norm": 1.1247695684432983, + "learning_rate": 8.354073096297961e-05, + "loss": 1.7729, + "step": 23525 + }, + { + "epoch": 2.744837241862093, + "grad_norm": 1.1745213270187378, + "learning_rate": 8.352728757234284e-05, + "loss": 2.0374, + "step": 23526 + }, + { + "epoch": 2.7449539143623847, + "grad_norm": 1.1608549356460571, + "learning_rate": 8.351384488637191e-05, + "loss": 2.0372, + "step": 23527 + }, + { + "epoch": 2.7450705868626764, + "grad_norm": 1.339309811592102, + "learning_rate": 8.350040290520629e-05, + "loss": 1.978, + "step": 23528 + }, + { + "epoch": 2.745187259362968, + "grad_norm": 1.035519003868103, + "learning_rate": 8.348696162898524e-05, + "loss": 1.6469, + "step": 23529 + }, + { + "epoch": 2.7453039318632597, + "grad_norm": 1.1926214694976807, + "learning_rate": 8.347352105784809e-05, + "loss": 2.1593, + "step": 23530 + }, + { + "epoch": 2.7454206043635514, + "grad_norm": 0.9856740236282349, + "learning_rate": 8.346008119193426e-05, + "loss": 2.0011, + "step": 23531 + }, + { + "epoch": 2.745537276863843, + "grad_norm": 1.105182409286499, + "learning_rate": 8.344664203138299e-05, + "loss": 1.9622, + "step": 23532 + }, + { + "epoch": 2.7456539493641348, + "grad_norm": 1.0042308568954468, + "learning_rate": 8.343320357633362e-05, + "loss": 1.8519, + "step": 23533 + }, + { + "epoch": 2.7457706218644264, + "grad_norm": 1.2722324132919312, + "learning_rate": 8.341976582692557e-05, + "loss": 2.035, + "step": 23534 + }, + { + "epoch": 2.745887294364718, + "grad_norm": 1.082195520401001, + "learning_rate": 8.340632878329797e-05, + "loss": 1.903, + "step": 23535 + }, + { + "epoch": 2.74600396686501, + "grad_norm": 1.1163805723190308, + "learning_rate": 8.33928924455903e-05, + "loss": 1.9212, + "step": 23536 + }, + { + "epoch": 2.7461206393653015, + "grad_norm": 1.0792335271835327, + "learning_rate": 8.337945681394174e-05, + "loss": 2.0136, + "step": 23537 + }, + { + "epoch": 2.746237311865593, + "grad_norm": 1.1361854076385498, + "learning_rate": 8.336602188849164e-05, + "loss": 1.9984, + "step": 23538 + }, + { + "epoch": 2.746353984365885, + "grad_norm": 1.27851402759552, + "learning_rate": 8.33525876693792e-05, + "loss": 2.0035, + "step": 23539 + }, + { + "epoch": 2.7464706568661765, + "grad_norm": 1.1212966442108154, + "learning_rate": 8.333915415674379e-05, + "loss": 1.9591, + "step": 23540 + }, + { + "epoch": 2.746587329366468, + "grad_norm": 1.0394316911697388, + "learning_rate": 8.332572135072455e-05, + "loss": 1.8335, + "step": 23541 + }, + { + "epoch": 2.74670400186676, + "grad_norm": 1.085447072982788, + "learning_rate": 8.331228925146085e-05, + "loss": 1.9847, + "step": 23542 + }, + { + "epoch": 2.7468206743670516, + "grad_norm": 1.1056572198867798, + "learning_rate": 8.329885785909188e-05, + "loss": 1.8423, + "step": 23543 + }, + { + "epoch": 2.7469373468673433, + "grad_norm": 1.1032607555389404, + "learning_rate": 8.328542717375692e-05, + "loss": 1.9725, + "step": 23544 + }, + { + "epoch": 2.747054019367635, + "grad_norm": 1.3047139644622803, + "learning_rate": 8.327199719559522e-05, + "loss": 1.9011, + "step": 23545 + }, + { + "epoch": 2.7471706918679266, + "grad_norm": 1.0544524192810059, + "learning_rate": 8.325856792474588e-05, + "loss": 1.884, + "step": 23546 + }, + { + "epoch": 2.7472873643682183, + "grad_norm": 1.1646974086761475, + "learning_rate": 8.324513936134826e-05, + "loss": 1.9953, + "step": 23547 + }, + { + "epoch": 2.74740403686851, + "grad_norm": 1.042782187461853, + "learning_rate": 8.32317115055415e-05, + "loss": 2.0175, + "step": 23548 + }, + { + "epoch": 2.7475207093688017, + "grad_norm": 1.0693199634552002, + "learning_rate": 8.321828435746486e-05, + "loss": 1.963, + "step": 23549 + }, + { + "epoch": 2.7476373818690933, + "grad_norm": 1.195588231086731, + "learning_rate": 8.320485791725746e-05, + "loss": 2.0366, + "step": 23550 + }, + { + "epoch": 2.747754054369385, + "grad_norm": 1.0403573513031006, + "learning_rate": 8.319143218505857e-05, + "loss": 1.9406, + "step": 23551 + }, + { + "epoch": 2.7478707268696767, + "grad_norm": 1.1765857934951782, + "learning_rate": 8.31780071610073e-05, + "loss": 1.9756, + "step": 23552 + }, + { + "epoch": 2.7479873993699684, + "grad_norm": 1.019716501235962, + "learning_rate": 8.31645828452429e-05, + "loss": 1.9739, + "step": 23553 + }, + { + "epoch": 2.74810407187026, + "grad_norm": 1.2642221450805664, + "learning_rate": 8.315115923790447e-05, + "loss": 2.0481, + "step": 23554 + }, + { + "epoch": 2.7482207443705517, + "grad_norm": 1.1065752506256104, + "learning_rate": 8.313773633913126e-05, + "loss": 1.9016, + "step": 23555 + }, + { + "epoch": 2.7483374168708434, + "grad_norm": 1.013372778892517, + "learning_rate": 8.312431414906229e-05, + "loss": 1.813, + "step": 23556 + }, + { + "epoch": 2.748454089371135, + "grad_norm": 1.209331750869751, + "learning_rate": 8.311089266783686e-05, + "loss": 2.0356, + "step": 23557 + }, + { + "epoch": 2.748570761871427, + "grad_norm": 1.1547727584838867, + "learning_rate": 8.309747189559398e-05, + "loss": 1.9349, + "step": 23558 + }, + { + "epoch": 2.7486874343717185, + "grad_norm": 1.0062475204467773, + "learning_rate": 8.308405183247282e-05, + "loss": 1.805, + "step": 23559 + }, + { + "epoch": 2.74880410687201, + "grad_norm": 1.13388192653656, + "learning_rate": 8.307063247861259e-05, + "loss": 2.0191, + "step": 23560 + }, + { + "epoch": 2.748920779372302, + "grad_norm": 1.21530020236969, + "learning_rate": 8.30572138341523e-05, + "loss": 2.0217, + "step": 23561 + }, + { + "epoch": 2.7490374518725935, + "grad_norm": 1.0019099712371826, + "learning_rate": 8.304379589923113e-05, + "loss": 1.9826, + "step": 23562 + }, + { + "epoch": 2.749154124372885, + "grad_norm": 1.224548578262329, + "learning_rate": 8.303037867398809e-05, + "loss": 2.0085, + "step": 23563 + }, + { + "epoch": 2.749270796873177, + "grad_norm": 1.0248208045959473, + "learning_rate": 8.301696215856238e-05, + "loss": 1.8932, + "step": 23564 + }, + { + "epoch": 2.7493874693734686, + "grad_norm": 1.1854087114334106, + "learning_rate": 8.300354635309298e-05, + "loss": 2.0724, + "step": 23565 + }, + { + "epoch": 2.7495041418737602, + "grad_norm": 1.0682743787765503, + "learning_rate": 8.29901312577191e-05, + "loss": 1.812, + "step": 23566 + }, + { + "epoch": 2.749620814374052, + "grad_norm": 1.3529698848724365, + "learning_rate": 8.297671687257969e-05, + "loss": 1.9293, + "step": 23567 + }, + { + "epoch": 2.7497374868743436, + "grad_norm": 1.1032845973968506, + "learning_rate": 8.296330319781393e-05, + "loss": 1.9763, + "step": 23568 + }, + { + "epoch": 2.7498541593746353, + "grad_norm": 1.1506232023239136, + "learning_rate": 8.294989023356072e-05, + "loss": 2.033, + "step": 23569 + }, + { + "epoch": 2.749970831874927, + "grad_norm": 1.310757040977478, + "learning_rate": 8.293647797995929e-05, + "loss": 2.0945, + "step": 23570 + }, + { + "epoch": 2.7500875043752186, + "grad_norm": 1.289191484451294, + "learning_rate": 8.292306643714854e-05, + "loss": 2.1572, + "step": 23571 + }, + { + "epoch": 2.7502041768755103, + "grad_norm": 1.0647733211517334, + "learning_rate": 8.290965560526759e-05, + "loss": 1.9396, + "step": 23572 + }, + { + "epoch": 2.750320849375802, + "grad_norm": 1.3456988334655762, + "learning_rate": 8.289624548445548e-05, + "loss": 2.1046, + "step": 23573 + }, + { + "epoch": 2.7504375218760937, + "grad_norm": 1.2350586652755737, + "learning_rate": 8.288283607485115e-05, + "loss": 1.88, + "step": 23574 + }, + { + "epoch": 2.7505541943763854, + "grad_norm": 1.155357003211975, + "learning_rate": 8.286942737659372e-05, + "loss": 2.0332, + "step": 23575 + }, + { + "epoch": 2.750670866876677, + "grad_norm": 1.0449053049087524, + "learning_rate": 8.285601938982206e-05, + "loss": 1.8802, + "step": 23576 + }, + { + "epoch": 2.7507875393769687, + "grad_norm": 1.27349853515625, + "learning_rate": 8.284261211467532e-05, + "loss": 2.1022, + "step": 23577 + }, + { + "epoch": 2.7509042118772604, + "grad_norm": 1.1357629299163818, + "learning_rate": 8.282920555129242e-05, + "loss": 1.9324, + "step": 23578 + }, + { + "epoch": 2.751020884377552, + "grad_norm": 1.1369067430496216, + "learning_rate": 8.281579969981228e-05, + "loss": 2.1017, + "step": 23579 + }, + { + "epoch": 2.751137556877844, + "grad_norm": 1.144891619682312, + "learning_rate": 8.280239456037404e-05, + "loss": 1.891, + "step": 23580 + }, + { + "epoch": 2.7512542293781355, + "grad_norm": 1.1110306978225708, + "learning_rate": 8.278899013311652e-05, + "loss": 2.0099, + "step": 23581 + }, + { + "epoch": 2.751370901878427, + "grad_norm": 1.0520097017288208, + "learning_rate": 8.277558641817868e-05, + "loss": 2.0169, + "step": 23582 + }, + { + "epoch": 2.751487574378719, + "grad_norm": 1.1651760339736938, + "learning_rate": 8.27621834156996e-05, + "loss": 2.107, + "step": 23583 + }, + { + "epoch": 2.7516042468790105, + "grad_norm": 1.1323148012161255, + "learning_rate": 8.274878112581807e-05, + "loss": 1.9551, + "step": 23584 + }, + { + "epoch": 2.751720919379302, + "grad_norm": 1.186126470565796, + "learning_rate": 8.273537954867314e-05, + "loss": 2.0046, + "step": 23585 + }, + { + "epoch": 2.751837591879594, + "grad_norm": 1.1295323371887207, + "learning_rate": 8.272197868440376e-05, + "loss": 1.9407, + "step": 23586 + }, + { + "epoch": 2.7519542643798856, + "grad_norm": 1.0973026752471924, + "learning_rate": 8.270857853314876e-05, + "loss": 1.9729, + "step": 23587 + }, + { + "epoch": 2.7520709368801772, + "grad_norm": 1.1079723834991455, + "learning_rate": 8.269517909504719e-05, + "loss": 2.0371, + "step": 23588 + }, + { + "epoch": 2.752187609380469, + "grad_norm": 1.2286114692687988, + "learning_rate": 8.26817803702378e-05, + "loss": 2.1259, + "step": 23589 + }, + { + "epoch": 2.7523042818807606, + "grad_norm": 1.1703131198883057, + "learning_rate": 8.266838235885964e-05, + "loss": 2.0277, + "step": 23590 + }, + { + "epoch": 2.7524209543810523, + "grad_norm": 1.2230418920516968, + "learning_rate": 8.265498506105148e-05, + "loss": 2.0006, + "step": 23591 + }, + { + "epoch": 2.752537626881344, + "grad_norm": 1.1455626487731934, + "learning_rate": 8.264158847695235e-05, + "loss": 2.014, + "step": 23592 + }, + { + "epoch": 2.7526542993816356, + "grad_norm": 1.179229736328125, + "learning_rate": 8.262819260670094e-05, + "loss": 1.7711, + "step": 23593 + }, + { + "epoch": 2.7527709718819273, + "grad_norm": 1.0502452850341797, + "learning_rate": 8.261479745043634e-05, + "loss": 1.8495, + "step": 23594 + }, + { + "epoch": 2.752887644382219, + "grad_norm": 1.156753420829773, + "learning_rate": 8.260140300829723e-05, + "loss": 1.9498, + "step": 23595 + }, + { + "epoch": 2.7530043168825107, + "grad_norm": 0.8979334235191345, + "learning_rate": 8.258800928042258e-05, + "loss": 1.7561, + "step": 23596 + }, + { + "epoch": 2.7531209893828024, + "grad_norm": 1.0456535816192627, + "learning_rate": 8.257461626695126e-05, + "loss": 1.9234, + "step": 23597 + }, + { + "epoch": 2.753237661883094, + "grad_norm": 1.0326335430145264, + "learning_rate": 8.256122396802196e-05, + "loss": 1.9756, + "step": 23598 + }, + { + "epoch": 2.7533543343833857, + "grad_norm": 1.2926654815673828, + "learning_rate": 8.25478323837737e-05, + "loss": 1.9498, + "step": 23599 + }, + { + "epoch": 2.7534710068836774, + "grad_norm": 1.1266037225723267, + "learning_rate": 8.253444151434519e-05, + "loss": 1.9338, + "step": 23600 + }, + { + "epoch": 2.753587679383969, + "grad_norm": 0.9702746868133545, + "learning_rate": 8.252105135987533e-05, + "loss": 1.8979, + "step": 23601 + }, + { + "epoch": 2.7537043518842608, + "grad_norm": 1.0736716985702515, + "learning_rate": 8.250766192050284e-05, + "loss": 1.9373, + "step": 23602 + }, + { + "epoch": 2.7538210243845525, + "grad_norm": 1.0534404516220093, + "learning_rate": 8.249427319636663e-05, + "loss": 1.8334, + "step": 23603 + }, + { + "epoch": 2.753937696884844, + "grad_norm": 1.217302918434143, + "learning_rate": 8.24808851876054e-05, + "loss": 2.0194, + "step": 23604 + }, + { + "epoch": 2.754054369385136, + "grad_norm": 1.0618489980697632, + "learning_rate": 8.246749789435806e-05, + "loss": 1.8377, + "step": 23605 + }, + { + "epoch": 2.7541710418854275, + "grad_norm": 1.1457322835922241, + "learning_rate": 8.245411131676324e-05, + "loss": 2.1871, + "step": 23606 + }, + { + "epoch": 2.754287714385719, + "grad_norm": 1.0220518112182617, + "learning_rate": 8.244072545495987e-05, + "loss": 1.8368, + "step": 23607 + }, + { + "epoch": 2.754404386886011, + "grad_norm": 1.1663188934326172, + "learning_rate": 8.242734030908663e-05, + "loss": 1.9674, + "step": 23608 + }, + { + "epoch": 2.7545210593863025, + "grad_norm": 1.1337409019470215, + "learning_rate": 8.241395587928236e-05, + "loss": 1.9868, + "step": 23609 + }, + { + "epoch": 2.754637731886594, + "grad_norm": 1.0691853761672974, + "learning_rate": 8.240057216568569e-05, + "loss": 1.8882, + "step": 23610 + }, + { + "epoch": 2.754754404386886, + "grad_norm": 1.185097098350525, + "learning_rate": 8.23871891684355e-05, + "loss": 1.8524, + "step": 23611 + }, + { + "epoch": 2.7548710768871776, + "grad_norm": 1.0301491022109985, + "learning_rate": 8.23738068876704e-05, + "loss": 2.0127, + "step": 23612 + }, + { + "epoch": 2.7549877493874693, + "grad_norm": 1.0882747173309326, + "learning_rate": 8.236042532352926e-05, + "loss": 2.041, + "step": 23613 + }, + { + "epoch": 2.755104421887761, + "grad_norm": 1.0761927366256714, + "learning_rate": 8.234704447615074e-05, + "loss": 1.8402, + "step": 23614 + }, + { + "epoch": 2.7552210943880526, + "grad_norm": 1.131523609161377, + "learning_rate": 8.233366434567351e-05, + "loss": 1.8422, + "step": 23615 + }, + { + "epoch": 2.7553377668883443, + "grad_norm": 1.2353489398956299, + "learning_rate": 8.232028493223637e-05, + "loss": 2.0714, + "step": 23616 + }, + { + "epoch": 2.755454439388636, + "grad_norm": 1.19621741771698, + "learning_rate": 8.230690623597792e-05, + "loss": 2.0889, + "step": 23617 + }, + { + "epoch": 2.7555711118889277, + "grad_norm": 1.2696623802185059, + "learning_rate": 8.229352825703697e-05, + "loss": 1.997, + "step": 23618 + }, + { + "epoch": 2.7556877843892194, + "grad_norm": 1.087583065032959, + "learning_rate": 8.228015099555211e-05, + "loss": 2.1483, + "step": 23619 + }, + { + "epoch": 2.755804456889511, + "grad_norm": 1.074615478515625, + "learning_rate": 8.22667744516621e-05, + "loss": 1.8369, + "step": 23620 + }, + { + "epoch": 2.7559211293898027, + "grad_norm": 1.1745712757110596, + "learning_rate": 8.225339862550553e-05, + "loss": 1.9043, + "step": 23621 + }, + { + "epoch": 2.7560378018900944, + "grad_norm": 0.9576388597488403, + "learning_rate": 8.224002351722118e-05, + "loss": 1.9139, + "step": 23622 + }, + { + "epoch": 2.756154474390386, + "grad_norm": 1.1055359840393066, + "learning_rate": 8.222664912694757e-05, + "loss": 1.9007, + "step": 23623 + }, + { + "epoch": 2.7562711468906778, + "grad_norm": 1.211425542831421, + "learning_rate": 8.221327545482348e-05, + "loss": 2.0952, + "step": 23624 + }, + { + "epoch": 2.7563878193909694, + "grad_norm": 1.2827481031417847, + "learning_rate": 8.219990250098745e-05, + "loss": 2.0046, + "step": 23625 + }, + { + "epoch": 2.756504491891261, + "grad_norm": 1.0007083415985107, + "learning_rate": 8.218653026557815e-05, + "loss": 1.9876, + "step": 23626 + }, + { + "epoch": 2.756621164391553, + "grad_norm": 1.056304693222046, + "learning_rate": 8.217315874873428e-05, + "loss": 1.9157, + "step": 23627 + }, + { + "epoch": 2.7567378368918445, + "grad_norm": 1.1446806192398071, + "learning_rate": 8.215978795059435e-05, + "loss": 2.1103, + "step": 23628 + }, + { + "epoch": 2.756854509392136, + "grad_norm": 1.1028143167495728, + "learning_rate": 8.214641787129706e-05, + "loss": 1.9496, + "step": 23629 + }, + { + "epoch": 2.756971181892428, + "grad_norm": 1.1355291604995728, + "learning_rate": 8.2133048510981e-05, + "loss": 1.9232, + "step": 23630 + }, + { + "epoch": 2.7570878543927195, + "grad_norm": 1.0005300045013428, + "learning_rate": 8.211967986978474e-05, + "loss": 2.0624, + "step": 23631 + }, + { + "epoch": 2.757204526893011, + "grad_norm": 1.1162071228027344, + "learning_rate": 8.210631194784684e-05, + "loss": 1.8229, + "step": 23632 + }, + { + "epoch": 2.757321199393303, + "grad_norm": 1.3414250612258911, + "learning_rate": 8.209294474530598e-05, + "loss": 1.8789, + "step": 23633 + }, + { + "epoch": 2.7574378718935946, + "grad_norm": 1.285873293876648, + "learning_rate": 8.20795782623006e-05, + "loss": 1.9528, + "step": 23634 + }, + { + "epoch": 2.7575545443938863, + "grad_norm": 1.2047282457351685, + "learning_rate": 8.206621249896943e-05, + "loss": 1.9716, + "step": 23635 + }, + { + "epoch": 2.757671216894178, + "grad_norm": 1.30771803855896, + "learning_rate": 8.205284745545087e-05, + "loss": 1.9494, + "step": 23636 + }, + { + "epoch": 2.7577878893944696, + "grad_norm": 1.205231785774231, + "learning_rate": 8.203948313188363e-05, + "loss": 1.9709, + "step": 23637 + }, + { + "epoch": 2.7579045618947613, + "grad_norm": 1.4016667604446411, + "learning_rate": 8.202611952840612e-05, + "loss": 2.0909, + "step": 23638 + }, + { + "epoch": 2.758021234395053, + "grad_norm": 1.1141828298568726, + "learning_rate": 8.201275664515694e-05, + "loss": 1.9586, + "step": 23639 + }, + { + "epoch": 2.7581379068953447, + "grad_norm": 1.5193185806274414, + "learning_rate": 8.199939448227467e-05, + "loss": 1.9516, + "step": 23640 + }, + { + "epoch": 2.7582545793956363, + "grad_norm": 1.0484174489974976, + "learning_rate": 8.198603303989773e-05, + "loss": 1.9413, + "step": 23641 + }, + { + "epoch": 2.758371251895928, + "grad_norm": 1.0663317441940308, + "learning_rate": 8.197267231816478e-05, + "loss": 2.0222, + "step": 23642 + }, + { + "epoch": 2.7584879243962197, + "grad_norm": 1.0085593461990356, + "learning_rate": 8.195931231721414e-05, + "loss": 2.0287, + "step": 23643 + }, + { + "epoch": 2.7586045968965114, + "grad_norm": 1.1576687097549438, + "learning_rate": 8.194595303718452e-05, + "loss": 1.9749, + "step": 23644 + }, + { + "epoch": 2.758721269396803, + "grad_norm": 1.1543323993682861, + "learning_rate": 8.193259447821423e-05, + "loss": 1.9946, + "step": 23645 + }, + { + "epoch": 2.7588379418970947, + "grad_norm": 1.3544842004776, + "learning_rate": 8.191923664044189e-05, + "loss": 1.8757, + "step": 23646 + }, + { + "epoch": 2.7589546143973864, + "grad_norm": 1.1912691593170166, + "learning_rate": 8.190587952400591e-05, + "loss": 2.2048, + "step": 23647 + }, + { + "epoch": 2.759071286897678, + "grad_norm": 1.2738852500915527, + "learning_rate": 8.189252312904473e-05, + "loss": 1.9644, + "step": 23648 + }, + { + "epoch": 2.75918795939797, + "grad_norm": 1.1410908699035645, + "learning_rate": 8.187916745569693e-05, + "loss": 2.0208, + "step": 23649 + }, + { + "epoch": 2.7593046318982615, + "grad_norm": 1.2212096452713013, + "learning_rate": 8.186581250410088e-05, + "loss": 1.9485, + "step": 23650 + }, + { + "epoch": 2.759421304398553, + "grad_norm": 1.1310107707977295, + "learning_rate": 8.185245827439501e-05, + "loss": 1.9147, + "step": 23651 + }, + { + "epoch": 2.759537976898845, + "grad_norm": 1.169202208518982, + "learning_rate": 8.183910476671776e-05, + "loss": 1.8455, + "step": 23652 + }, + { + "epoch": 2.7596546493991365, + "grad_norm": 1.301750898361206, + "learning_rate": 8.182575198120771e-05, + "loss": 1.8034, + "step": 23653 + }, + { + "epoch": 2.759771321899428, + "grad_norm": 1.1818864345550537, + "learning_rate": 8.181239991800309e-05, + "loss": 1.8332, + "step": 23654 + }, + { + "epoch": 2.75988799439972, + "grad_norm": 1.1680508852005005, + "learning_rate": 8.179904857724247e-05, + "loss": 2.0595, + "step": 23655 + }, + { + "epoch": 2.7600046669000116, + "grad_norm": 1.2331174612045288, + "learning_rate": 8.178569795906414e-05, + "loss": 2.0952, + "step": 23656 + }, + { + "epoch": 2.7601213394003032, + "grad_norm": 1.3273063898086548, + "learning_rate": 8.177234806360663e-05, + "loss": 1.9931, + "step": 23657 + }, + { + "epoch": 2.760238011900595, + "grad_norm": 1.1302261352539062, + "learning_rate": 8.175899889100822e-05, + "loss": 1.877, + "step": 23658 + }, + { + "epoch": 2.7603546844008866, + "grad_norm": 1.1692241430282593, + "learning_rate": 8.17456504414074e-05, + "loss": 1.9672, + "step": 23659 + }, + { + "epoch": 2.7604713569011783, + "grad_norm": 1.2636809349060059, + "learning_rate": 8.173230271494244e-05, + "loss": 1.9989, + "step": 23660 + }, + { + "epoch": 2.76058802940147, + "grad_norm": 1.08236825466156, + "learning_rate": 8.171895571175186e-05, + "loss": 2.0461, + "step": 23661 + }, + { + "epoch": 2.7607047019017616, + "grad_norm": 1.0128964185714722, + "learning_rate": 8.170560943197387e-05, + "loss": 2.0615, + "step": 23662 + }, + { + "epoch": 2.7608213744020533, + "grad_norm": 1.0854965448379517, + "learning_rate": 8.169226387574697e-05, + "loss": 1.9213, + "step": 23663 + }, + { + "epoch": 2.760938046902345, + "grad_norm": 1.2183266878128052, + "learning_rate": 8.167891904320937e-05, + "loss": 1.9594, + "step": 23664 + }, + { + "epoch": 2.7610547194026367, + "grad_norm": 1.2672916650772095, + "learning_rate": 8.166557493449958e-05, + "loss": 1.9693, + "step": 23665 + }, + { + "epoch": 2.7611713919029284, + "grad_norm": 1.1413207054138184, + "learning_rate": 8.165223154975582e-05, + "loss": 1.8595, + "step": 23666 + }, + { + "epoch": 2.76128806440322, + "grad_norm": 1.2233779430389404, + "learning_rate": 8.163888888911641e-05, + "loss": 1.9821, + "step": 23667 + }, + { + "epoch": 2.7614047369035117, + "grad_norm": 1.2789437770843506, + "learning_rate": 8.162554695271976e-05, + "loss": 1.935, + "step": 23668 + }, + { + "epoch": 2.7615214094038034, + "grad_norm": 1.079905390739441, + "learning_rate": 8.161220574070409e-05, + "loss": 2.0665, + "step": 23669 + }, + { + "epoch": 2.761638081904095, + "grad_norm": 1.1355271339416504, + "learning_rate": 8.159886525320777e-05, + "loss": 1.9079, + "step": 23670 + }, + { + "epoch": 2.761754754404387, + "grad_norm": 1.115761637687683, + "learning_rate": 8.158552549036906e-05, + "loss": 2.0533, + "step": 23671 + }, + { + "epoch": 2.7618714269046785, + "grad_norm": 0.9908506870269775, + "learning_rate": 8.157218645232631e-05, + "loss": 1.8857, + "step": 23672 + }, + { + "epoch": 2.76198809940497, + "grad_norm": 1.0030839443206787, + "learning_rate": 8.155884813921773e-05, + "loss": 1.9466, + "step": 23673 + }, + { + "epoch": 2.762104771905262, + "grad_norm": 1.0412324666976929, + "learning_rate": 8.154551055118165e-05, + "loss": 1.91, + "step": 23674 + }, + { + "epoch": 2.7622214444055535, + "grad_norm": 1.1793708801269531, + "learning_rate": 8.153217368835632e-05, + "loss": 1.9072, + "step": 23675 + }, + { + "epoch": 2.762338116905845, + "grad_norm": 1.148818850517273, + "learning_rate": 8.151883755088002e-05, + "loss": 2.0837, + "step": 23676 + }, + { + "epoch": 2.762454789406137, + "grad_norm": 1.112510323524475, + "learning_rate": 8.150550213889095e-05, + "loss": 1.9299, + "step": 23677 + }, + { + "epoch": 2.7625714619064285, + "grad_norm": 1.268910527229309, + "learning_rate": 8.149216745252739e-05, + "loss": 2.1054, + "step": 23678 + }, + { + "epoch": 2.7626881344067202, + "grad_norm": 1.0697003602981567, + "learning_rate": 8.147883349192764e-05, + "loss": 1.9818, + "step": 23679 + }, + { + "epoch": 2.762804806907012, + "grad_norm": 1.2176834344863892, + "learning_rate": 8.146550025722983e-05, + "loss": 1.9387, + "step": 23680 + }, + { + "epoch": 2.7629214794073036, + "grad_norm": 1.2359522581100464, + "learning_rate": 8.145216774857228e-05, + "loss": 1.791, + "step": 23681 + }, + { + "epoch": 2.7630381519075953, + "grad_norm": 1.011549711227417, + "learning_rate": 8.143883596609317e-05, + "loss": 1.7594, + "step": 23682 + }, + { + "epoch": 2.763154824407887, + "grad_norm": 1.0830066204071045, + "learning_rate": 8.142550490993067e-05, + "loss": 2.088, + "step": 23683 + }, + { + "epoch": 2.7632714969081786, + "grad_norm": 1.1208295822143555, + "learning_rate": 8.141217458022297e-05, + "loss": 1.8192, + "step": 23684 + }, + { + "epoch": 2.7633881694084703, + "grad_norm": 1.2504862546920776, + "learning_rate": 8.139884497710835e-05, + "loss": 2.1574, + "step": 23685 + }, + { + "epoch": 2.763504841908762, + "grad_norm": 0.9996962547302246, + "learning_rate": 8.138551610072491e-05, + "loss": 1.7422, + "step": 23686 + }, + { + "epoch": 2.7636215144090537, + "grad_norm": 1.0946303606033325, + "learning_rate": 8.137218795121092e-05, + "loss": 1.8998, + "step": 23687 + }, + { + "epoch": 2.7637381869093454, + "grad_norm": 1.0355714559555054, + "learning_rate": 8.135886052870444e-05, + "loss": 1.8384, + "step": 23688 + }, + { + "epoch": 2.763854859409637, + "grad_norm": 1.263752818107605, + "learning_rate": 8.134553383334374e-05, + "loss": 2.0812, + "step": 23689 + }, + { + "epoch": 2.7639715319099287, + "grad_norm": 1.0136393308639526, + "learning_rate": 8.133220786526687e-05, + "loss": 2.0489, + "step": 23690 + }, + { + "epoch": 2.7640882044102204, + "grad_norm": 1.0261801481246948, + "learning_rate": 8.131888262461208e-05, + "loss": 1.9568, + "step": 23691 + }, + { + "epoch": 2.764204876910512, + "grad_norm": 1.163041353225708, + "learning_rate": 8.130555811151749e-05, + "loss": 2.0221, + "step": 23692 + }, + { + "epoch": 2.7643215494108038, + "grad_norm": 1.1553162336349487, + "learning_rate": 8.129223432612115e-05, + "loss": 1.944, + "step": 23693 + }, + { + "epoch": 2.7644382219110955, + "grad_norm": 1.014323115348816, + "learning_rate": 8.127891126856133e-05, + "loss": 1.8153, + "step": 23694 + }, + { + "epoch": 2.764554894411387, + "grad_norm": 1.0263935327529907, + "learning_rate": 8.126558893897601e-05, + "loss": 1.9562, + "step": 23695 + }, + { + "epoch": 2.764671566911679, + "grad_norm": 0.9797463417053223, + "learning_rate": 8.125226733750341e-05, + "loss": 1.9526, + "step": 23696 + }, + { + "epoch": 2.7647882394119705, + "grad_norm": 1.0130980014801025, + "learning_rate": 8.123894646428152e-05, + "loss": 1.9069, + "step": 23697 + }, + { + "epoch": 2.764904911912262, + "grad_norm": 1.385854959487915, + "learning_rate": 8.122562631944858e-05, + "loss": 2.0865, + "step": 23698 + }, + { + "epoch": 2.765021584412554, + "grad_norm": 1.0816031694412231, + "learning_rate": 8.121230690314257e-05, + "loss": 1.7843, + "step": 23699 + }, + { + "epoch": 2.7651382569128455, + "grad_norm": 1.1294236183166504, + "learning_rate": 8.119898821550155e-05, + "loss": 1.8774, + "step": 23700 + }, + { + "epoch": 2.765254929413137, + "grad_norm": 1.0969537496566772, + "learning_rate": 8.118567025666373e-05, + "loss": 1.965, + "step": 23701 + }, + { + "epoch": 2.765371601913429, + "grad_norm": 1.372414469718933, + "learning_rate": 8.117235302676707e-05, + "loss": 1.9871, + "step": 23702 + }, + { + "epoch": 2.7654882744137206, + "grad_norm": 1.166113257408142, + "learning_rate": 8.115903652594957e-05, + "loss": 1.9356, + "step": 23703 + }, + { + "epoch": 2.7656049469140123, + "grad_norm": 1.1387414932250977, + "learning_rate": 8.114572075434939e-05, + "loss": 2.0936, + "step": 23704 + }, + { + "epoch": 2.765721619414304, + "grad_norm": 1.1509895324707031, + "learning_rate": 8.113240571210457e-05, + "loss": 1.8442, + "step": 23705 + }, + { + "epoch": 2.7658382919145956, + "grad_norm": 1.1025093793869019, + "learning_rate": 8.111909139935309e-05, + "loss": 2.1441, + "step": 23706 + }, + { + "epoch": 2.7659549644148873, + "grad_norm": 1.130273699760437, + "learning_rate": 8.110577781623303e-05, + "loss": 2.0163, + "step": 23707 + }, + { + "epoch": 2.766071636915179, + "grad_norm": 1.0723199844360352, + "learning_rate": 8.109246496288235e-05, + "loss": 1.9388, + "step": 23708 + }, + { + "epoch": 2.7661883094154707, + "grad_norm": 1.176416039466858, + "learning_rate": 8.107915283943914e-05, + "loss": 2.0304, + "step": 23709 + }, + { + "epoch": 2.7663049819157624, + "grad_norm": 1.0341428518295288, + "learning_rate": 8.106584144604131e-05, + "loss": 1.9176, + "step": 23710 + }, + { + "epoch": 2.766421654416054, + "grad_norm": 1.1191720962524414, + "learning_rate": 8.105253078282697e-05, + "loss": 2.0086, + "step": 23711 + }, + { + "epoch": 2.7665383269163457, + "grad_norm": 1.1944730281829834, + "learning_rate": 8.103922084993401e-05, + "loss": 1.7955, + "step": 23712 + }, + { + "epoch": 2.7666549994166374, + "grad_norm": 1.0126533508300781, + "learning_rate": 8.10259116475005e-05, + "loss": 1.827, + "step": 23713 + }, + { + "epoch": 2.766771671916929, + "grad_norm": 1.0659037828445435, + "learning_rate": 8.101260317566431e-05, + "loss": 2.0735, + "step": 23714 + }, + { + "epoch": 2.7668883444172208, + "grad_norm": 0.9799333810806274, + "learning_rate": 8.099929543456353e-05, + "loss": 1.8667, + "step": 23715 + }, + { + "epoch": 2.7670050169175124, + "grad_norm": 1.1552326679229736, + "learning_rate": 8.098598842433604e-05, + "loss": 1.9501, + "step": 23716 + }, + { + "epoch": 2.767121689417804, + "grad_norm": 1.093816876411438, + "learning_rate": 8.097268214511977e-05, + "loss": 2.1065, + "step": 23717 + }, + { + "epoch": 2.767238361918096, + "grad_norm": 1.015518069267273, + "learning_rate": 8.095937659705276e-05, + "loss": 1.9914, + "step": 23718 + }, + { + "epoch": 2.7673550344183875, + "grad_norm": 1.2044274806976318, + "learning_rate": 8.094607178027281e-05, + "loss": 2.0668, + "step": 23719 + }, + { + "epoch": 2.767471706918679, + "grad_norm": 1.0674866437911987, + "learning_rate": 8.093276769491802e-05, + "loss": 1.9861, + "step": 23720 + }, + { + "epoch": 2.767588379418971, + "grad_norm": 1.2212263345718384, + "learning_rate": 8.091946434112615e-05, + "loss": 2.0122, + "step": 23721 + }, + { + "epoch": 2.7677050519192625, + "grad_norm": 1.129715085029602, + "learning_rate": 8.090616171903525e-05, + "loss": 1.8979, + "step": 23722 + }, + { + "epoch": 2.767821724419554, + "grad_norm": 1.2127153873443604, + "learning_rate": 8.08928598287831e-05, + "loss": 2.1268, + "step": 23723 + }, + { + "epoch": 2.767938396919846, + "grad_norm": 1.0901652574539185, + "learning_rate": 8.087955867050775e-05, + "loss": 1.8872, + "step": 23724 + }, + { + "epoch": 2.7680550694201376, + "grad_norm": 1.1705104112625122, + "learning_rate": 8.086625824434691e-05, + "loss": 2.0122, + "step": 23725 + }, + { + "epoch": 2.7681717419204293, + "grad_norm": 1.364644169807434, + "learning_rate": 8.085295855043861e-05, + "loss": 2.3254, + "step": 23726 + }, + { + "epoch": 2.768288414420721, + "grad_norm": 1.1910429000854492, + "learning_rate": 8.083965958892066e-05, + "loss": 1.999, + "step": 23727 + }, + { + "epoch": 2.7684050869210126, + "grad_norm": 1.146255373954773, + "learning_rate": 8.082636135993097e-05, + "loss": 1.9081, + "step": 23728 + }, + { + "epoch": 2.7685217594213043, + "grad_norm": 1.2106804847717285, + "learning_rate": 8.081306386360734e-05, + "loss": 2.0239, + "step": 23729 + }, + { + "epoch": 2.768638431921596, + "grad_norm": 1.1591891050338745, + "learning_rate": 8.079976710008764e-05, + "loss": 1.982, + "step": 23730 + }, + { + "epoch": 2.7687551044218877, + "grad_norm": 1.1787112951278687, + "learning_rate": 8.078647106950981e-05, + "loss": 1.9506, + "step": 23731 + }, + { + "epoch": 2.7688717769221793, + "grad_norm": 1.1144527196884155, + "learning_rate": 8.077317577201159e-05, + "loss": 2.0963, + "step": 23732 + }, + { + "epoch": 2.768988449422471, + "grad_norm": 1.1699706315994263, + "learning_rate": 8.07598812077308e-05, + "loss": 1.9885, + "step": 23733 + }, + { + "epoch": 2.7691051219227627, + "grad_norm": 1.3919464349746704, + "learning_rate": 8.074658737680536e-05, + "loss": 2.2452, + "step": 23734 + }, + { + "epoch": 2.7692217944230544, + "grad_norm": 1.1094510555267334, + "learning_rate": 8.0733294279373e-05, + "loss": 1.9841, + "step": 23735 + }, + { + "epoch": 2.769338466923346, + "grad_norm": 1.2087981700897217, + "learning_rate": 8.072000191557153e-05, + "loss": 2.0585, + "step": 23736 + }, + { + "epoch": 2.7694551394236377, + "grad_norm": 1.053261160850525, + "learning_rate": 8.070671028553881e-05, + "loss": 1.9409, + "step": 23737 + }, + { + "epoch": 2.7695718119239294, + "grad_norm": 1.2423104047775269, + "learning_rate": 8.069341938941252e-05, + "loss": 1.8927, + "step": 23738 + }, + { + "epoch": 2.769688484424221, + "grad_norm": 1.1566964387893677, + "learning_rate": 8.06801292273306e-05, + "loss": 2.0941, + "step": 23739 + }, + { + "epoch": 2.769805156924513, + "grad_norm": 1.0606259107589722, + "learning_rate": 8.066683979943068e-05, + "loss": 1.8287, + "step": 23740 + }, + { + "epoch": 2.7699218294248045, + "grad_norm": 1.1129990816116333, + "learning_rate": 8.065355110585068e-05, + "loss": 1.9397, + "step": 23741 + }, + { + "epoch": 2.770038501925096, + "grad_norm": 1.2157410383224487, + "learning_rate": 8.064026314672819e-05, + "loss": 2.2041, + "step": 23742 + }, + { + "epoch": 2.770155174425388, + "grad_norm": 1.1313540935516357, + "learning_rate": 8.06269759222011e-05, + "loss": 1.8908, + "step": 23743 + }, + { + "epoch": 2.7702718469256795, + "grad_norm": 1.2350411415100098, + "learning_rate": 8.061368943240714e-05, + "loss": 2.0329, + "step": 23744 + }, + { + "epoch": 2.770388519425971, + "grad_norm": 0.9623071551322937, + "learning_rate": 8.060040367748397e-05, + "loss": 1.7959, + "step": 23745 + }, + { + "epoch": 2.770505191926263, + "grad_norm": 1.1439393758773804, + "learning_rate": 8.058711865756944e-05, + "loss": 1.9987, + "step": 23746 + }, + { + "epoch": 2.7706218644265546, + "grad_norm": 1.0283088684082031, + "learning_rate": 8.057383437280115e-05, + "loss": 1.9027, + "step": 23747 + }, + { + "epoch": 2.7707385369268462, + "grad_norm": 1.1021924018859863, + "learning_rate": 8.056055082331692e-05, + "loss": 1.9564, + "step": 23748 + }, + { + "epoch": 2.770855209427138, + "grad_norm": 1.4286620616912842, + "learning_rate": 8.054726800925438e-05, + "loss": 2.1006, + "step": 23749 + }, + { + "epoch": 2.7709718819274296, + "grad_norm": 1.082078218460083, + "learning_rate": 8.053398593075132e-05, + "loss": 1.8732, + "step": 23750 + }, + { + "epoch": 2.7710885544277213, + "grad_norm": 0.969716489315033, + "learning_rate": 8.052070458794537e-05, + "loss": 1.9824, + "step": 23751 + }, + { + "epoch": 2.771205226928013, + "grad_norm": 1.077446460723877, + "learning_rate": 8.050742398097422e-05, + "loss": 1.9206, + "step": 23752 + }, + { + "epoch": 2.7713218994283046, + "grad_norm": 1.2381917238235474, + "learning_rate": 8.049414410997552e-05, + "loss": 2.1699, + "step": 23753 + }, + { + "epoch": 2.7714385719285963, + "grad_norm": 1.1162956953048706, + "learning_rate": 8.048086497508703e-05, + "loss": 2.0409, + "step": 23754 + }, + { + "epoch": 2.771555244428888, + "grad_norm": 1.072223424911499, + "learning_rate": 8.046758657644628e-05, + "loss": 1.9158, + "step": 23755 + }, + { + "epoch": 2.7716719169291797, + "grad_norm": 1.1687296628952026, + "learning_rate": 8.045430891419105e-05, + "loss": 2.0141, + "step": 23756 + }, + { + "epoch": 2.7717885894294714, + "grad_norm": 1.166771411895752, + "learning_rate": 8.044103198845897e-05, + "loss": 1.9154, + "step": 23757 + }, + { + "epoch": 2.771905261929763, + "grad_norm": 1.149235725402832, + "learning_rate": 8.04277557993876e-05, + "loss": 1.9204, + "step": 23758 + }, + { + "epoch": 2.7720219344300547, + "grad_norm": 1.0948240756988525, + "learning_rate": 8.041448034711467e-05, + "loss": 2.0291, + "step": 23759 + }, + { + "epoch": 2.7721386069303464, + "grad_norm": 1.036160945892334, + "learning_rate": 8.040120563177774e-05, + "loss": 1.9866, + "step": 23760 + }, + { + "epoch": 2.772255279430638, + "grad_norm": 1.2735148668289185, + "learning_rate": 8.03879316535145e-05, + "loss": 2.0619, + "step": 23761 + }, + { + "epoch": 2.77237195193093, + "grad_norm": 1.1120771169662476, + "learning_rate": 8.037465841246242e-05, + "loss": 1.8759, + "step": 23762 + }, + { + "epoch": 2.7724886244312215, + "grad_norm": 1.062382459640503, + "learning_rate": 8.036138590875929e-05, + "loss": 1.8977, + "step": 23763 + }, + { + "epoch": 2.772605296931513, + "grad_norm": 1.1388462781906128, + "learning_rate": 8.034811414254253e-05, + "loss": 1.9321, + "step": 23764 + }, + { + "epoch": 2.772721969431805, + "grad_norm": 1.0988069772720337, + "learning_rate": 8.033484311394985e-05, + "loss": 1.9316, + "step": 23765 + }, + { + "epoch": 2.7728386419320965, + "grad_norm": 1.0058602094650269, + "learning_rate": 8.032157282311875e-05, + "loss": 1.8927, + "step": 23766 + }, + { + "epoch": 2.772955314432388, + "grad_norm": 1.076699137687683, + "learning_rate": 8.030830327018688e-05, + "loss": 1.9448, + "step": 23767 + }, + { + "epoch": 2.77307198693268, + "grad_norm": 1.1360652446746826, + "learning_rate": 8.029503445529179e-05, + "loss": 2.0511, + "step": 23768 + }, + { + "epoch": 2.7731886594329715, + "grad_norm": 1.1061594486236572, + "learning_rate": 8.02817663785709e-05, + "loss": 1.8567, + "step": 23769 + }, + { + "epoch": 2.7733053319332632, + "grad_norm": 1.1646209955215454, + "learning_rate": 8.026849904016195e-05, + "loss": 1.9572, + "step": 23770 + }, + { + "epoch": 2.773422004433555, + "grad_norm": 1.115897297859192, + "learning_rate": 8.025523244020234e-05, + "loss": 1.8411, + "step": 23771 + }, + { + "epoch": 2.7735386769338466, + "grad_norm": 1.0456498861312866, + "learning_rate": 8.024196657882968e-05, + "loss": 1.8152, + "step": 23772 + }, + { + "epoch": 2.7736553494341383, + "grad_norm": 1.2288262844085693, + "learning_rate": 8.022870145618146e-05, + "loss": 1.9857, + "step": 23773 + }, + { + "epoch": 2.77377202193443, + "grad_norm": 1.0511577129364014, + "learning_rate": 8.021543707239525e-05, + "loss": 1.9304, + "step": 23774 + }, + { + "epoch": 2.7738886944347216, + "grad_norm": 1.0944492816925049, + "learning_rate": 8.020217342760846e-05, + "loss": 1.7623, + "step": 23775 + }, + { + "epoch": 2.7740053669350133, + "grad_norm": 1.0934650897979736, + "learning_rate": 8.018891052195872e-05, + "loss": 2.0577, + "step": 23776 + }, + { + "epoch": 2.774122039435305, + "grad_norm": 0.9411082863807678, + "learning_rate": 8.01756483555834e-05, + "loss": 1.857, + "step": 23777 + }, + { + "epoch": 2.7742387119355967, + "grad_norm": 1.2774252891540527, + "learning_rate": 8.016238692862012e-05, + "loss": 1.9826, + "step": 23778 + }, + { + "epoch": 2.7743553844358884, + "grad_norm": 1.1538735628128052, + "learning_rate": 8.014912624120622e-05, + "loss": 1.9396, + "step": 23779 + }, + { + "epoch": 2.77447205693618, + "grad_norm": 0.9923681020736694, + "learning_rate": 8.01358662934793e-05, + "loss": 1.9105, + "step": 23780 + }, + { + "epoch": 2.7745887294364717, + "grad_norm": 1.1151915788650513, + "learning_rate": 8.01226070855767e-05, + "loss": 1.9486, + "step": 23781 + }, + { + "epoch": 2.7747054019367634, + "grad_norm": 1.2401220798492432, + "learning_rate": 8.010934861763597e-05, + "loss": 1.9313, + "step": 23782 + }, + { + "epoch": 2.774822074437055, + "grad_norm": 1.1170965433120728, + "learning_rate": 8.009609088979457e-05, + "loss": 1.9435, + "step": 23783 + }, + { + "epoch": 2.7749387469373468, + "grad_norm": 1.1470072269439697, + "learning_rate": 8.008283390218988e-05, + "loss": 2.0015, + "step": 23784 + }, + { + "epoch": 2.7750554194376384, + "grad_norm": 1.272939682006836, + "learning_rate": 8.006957765495938e-05, + "loss": 1.8375, + "step": 23785 + }, + { + "epoch": 2.77517209193793, + "grad_norm": 0.9861600995063782, + "learning_rate": 8.005632214824041e-05, + "loss": 1.9276, + "step": 23786 + }, + { + "epoch": 2.775288764438222, + "grad_norm": 1.1198524236679077, + "learning_rate": 8.004306738217051e-05, + "loss": 1.9512, + "step": 23787 + }, + { + "epoch": 2.7754054369385135, + "grad_norm": 1.1561087369918823, + "learning_rate": 8.002981335688697e-05, + "loss": 2.1365, + "step": 23788 + }, + { + "epoch": 2.775522109438805, + "grad_norm": 1.1208447217941284, + "learning_rate": 8.001656007252732e-05, + "loss": 1.8864, + "step": 23789 + }, + { + "epoch": 2.775638781939097, + "grad_norm": 1.1114885807037354, + "learning_rate": 8.000330752922882e-05, + "loss": 1.8759, + "step": 23790 + }, + { + "epoch": 2.7757554544393885, + "grad_norm": 1.144021987915039, + "learning_rate": 7.999005572712902e-05, + "loss": 1.8679, + "step": 23791 + }, + { + "epoch": 2.77587212693968, + "grad_norm": 1.2751398086547852, + "learning_rate": 7.997680466636511e-05, + "loss": 1.9426, + "step": 23792 + }, + { + "epoch": 2.775988799439972, + "grad_norm": 1.3358168601989746, + "learning_rate": 7.996355434707463e-05, + "loss": 1.9442, + "step": 23793 + }, + { + "epoch": 2.7761054719402636, + "grad_norm": 1.11429762840271, + "learning_rate": 7.99503047693948e-05, + "loss": 1.9809, + "step": 23794 + }, + { + "epoch": 2.7762221444405553, + "grad_norm": 1.0516016483306885, + "learning_rate": 7.993705593346309e-05, + "loss": 2.0616, + "step": 23795 + }, + { + "epoch": 2.776338816940847, + "grad_norm": 1.188110589981079, + "learning_rate": 7.992380783941683e-05, + "loss": 2.0305, + "step": 23796 + }, + { + "epoch": 2.7764554894411386, + "grad_norm": 1.1375116109848022, + "learning_rate": 7.991056048739331e-05, + "loss": 1.8407, + "step": 23797 + }, + { + "epoch": 2.7765721619414303, + "grad_norm": 1.119917392730713, + "learning_rate": 7.989731387752995e-05, + "loss": 1.9931, + "step": 23798 + }, + { + "epoch": 2.776688834441722, + "grad_norm": 1.0481315851211548, + "learning_rate": 7.988406800996396e-05, + "loss": 1.9055, + "step": 23799 + }, + { + "epoch": 2.7768055069420137, + "grad_norm": 1.355847954750061, + "learning_rate": 7.987082288483278e-05, + "loss": 2.1573, + "step": 23800 + }, + { + "epoch": 2.7769221794423053, + "grad_norm": 1.1935663223266602, + "learning_rate": 7.985757850227365e-05, + "loss": 1.9991, + "step": 23801 + }, + { + "epoch": 2.777038851942597, + "grad_norm": 1.3447703123092651, + "learning_rate": 7.984433486242384e-05, + "loss": 1.8873, + "step": 23802 + }, + { + "epoch": 2.7771555244428887, + "grad_norm": 1.0753753185272217, + "learning_rate": 7.983109196542073e-05, + "loss": 1.8963, + "step": 23803 + }, + { + "epoch": 2.7772721969431804, + "grad_norm": 1.0307345390319824, + "learning_rate": 7.981784981140161e-05, + "loss": 1.9745, + "step": 23804 + }, + { + "epoch": 2.777388869443472, + "grad_norm": 1.154898762702942, + "learning_rate": 7.980460840050361e-05, + "loss": 2.1057, + "step": 23805 + }, + { + "epoch": 2.7775055419437638, + "grad_norm": 1.1861789226531982, + "learning_rate": 7.97913677328642e-05, + "loss": 1.8923, + "step": 23806 + }, + { + "epoch": 2.7776222144440554, + "grad_norm": 1.0291918516159058, + "learning_rate": 7.977812780862049e-05, + "loss": 1.8656, + "step": 23807 + }, + { + "epoch": 2.777738886944347, + "grad_norm": 1.0515183210372925, + "learning_rate": 7.976488862790981e-05, + "loss": 1.9433, + "step": 23808 + }, + { + "epoch": 2.777855559444639, + "grad_norm": 1.1556476354599, + "learning_rate": 7.975165019086945e-05, + "loss": 1.9211, + "step": 23809 + }, + { + "epoch": 2.7779722319449305, + "grad_norm": 1.2082886695861816, + "learning_rate": 7.973841249763654e-05, + "loss": 2.186, + "step": 23810 + }, + { + "epoch": 2.778088904445222, + "grad_norm": 1.0132519006729126, + "learning_rate": 7.972517554834844e-05, + "loss": 1.7991, + "step": 23811 + }, + { + "epoch": 2.778205576945514, + "grad_norm": 1.055907964706421, + "learning_rate": 7.971193934314225e-05, + "loss": 2.1039, + "step": 23812 + }, + { + "epoch": 2.7783222494458055, + "grad_norm": 1.129183292388916, + "learning_rate": 7.96987038821553e-05, + "loss": 2.0412, + "step": 23813 + }, + { + "epoch": 2.778438921946097, + "grad_norm": 1.2749676704406738, + "learning_rate": 7.96854691655247e-05, + "loss": 2.1658, + "step": 23814 + }, + { + "epoch": 2.778555594446389, + "grad_norm": 0.9983744025230408, + "learning_rate": 7.967223519338777e-05, + "loss": 1.8013, + "step": 23815 + }, + { + "epoch": 2.7786722669466806, + "grad_norm": 1.1223808526992798, + "learning_rate": 7.965900196588159e-05, + "loss": 1.9281, + "step": 23816 + }, + { + "epoch": 2.7787889394469723, + "grad_norm": 1.0038509368896484, + "learning_rate": 7.964576948314344e-05, + "loss": 1.9807, + "step": 23817 + }, + { + "epoch": 2.778905611947264, + "grad_norm": 1.1128321886062622, + "learning_rate": 7.963253774531041e-05, + "loss": 1.9861, + "step": 23818 + }, + { + "epoch": 2.7790222844475556, + "grad_norm": 1.2670682668685913, + "learning_rate": 7.961930675251976e-05, + "loss": 2.013, + "step": 23819 + }, + { + "epoch": 2.7791389569478473, + "grad_norm": 0.9822789430618286, + "learning_rate": 7.960607650490862e-05, + "loss": 1.7903, + "step": 23820 + }, + { + "epoch": 2.779255629448139, + "grad_norm": 1.0638388395309448, + "learning_rate": 7.959284700261408e-05, + "loss": 1.6741, + "step": 23821 + }, + { + "epoch": 2.7793723019484307, + "grad_norm": 1.2036473751068115, + "learning_rate": 7.95796182457734e-05, + "loss": 2.062, + "step": 23822 + }, + { + "epoch": 2.7794889744487223, + "grad_norm": 1.187334656715393, + "learning_rate": 7.956639023452363e-05, + "loss": 1.6419, + "step": 23823 + }, + { + "epoch": 2.779605646949014, + "grad_norm": 0.9533673524856567, + "learning_rate": 7.955316296900198e-05, + "loss": 1.8593, + "step": 23824 + }, + { + "epoch": 2.7797223194493057, + "grad_norm": 1.0319128036499023, + "learning_rate": 7.95399364493455e-05, + "loss": 1.9815, + "step": 23825 + }, + { + "epoch": 2.7798389919495974, + "grad_norm": 1.1327909231185913, + "learning_rate": 7.95267106756914e-05, + "loss": 1.8741, + "step": 23826 + }, + { + "epoch": 2.779955664449889, + "grad_norm": 1.1912708282470703, + "learning_rate": 7.951348564817666e-05, + "loss": 2.0417, + "step": 23827 + }, + { + "epoch": 2.7800723369501807, + "grad_norm": 1.2710840702056885, + "learning_rate": 7.950026136693852e-05, + "loss": 1.9148, + "step": 23828 + }, + { + "epoch": 2.7801890094504724, + "grad_norm": 1.119805932044983, + "learning_rate": 7.948703783211398e-05, + "loss": 1.8943, + "step": 23829 + }, + { + "epoch": 2.780305681950764, + "grad_norm": 1.1531784534454346, + "learning_rate": 7.94738150438402e-05, + "loss": 1.9966, + "step": 23830 + }, + { + "epoch": 2.780422354451056, + "grad_norm": 1.102860450744629, + "learning_rate": 7.946059300225417e-05, + "loss": 1.8887, + "step": 23831 + }, + { + "epoch": 2.7805390269513475, + "grad_norm": 1.0484635829925537, + "learning_rate": 7.944737170749305e-05, + "loss": 1.7449, + "step": 23832 + }, + { + "epoch": 2.780655699451639, + "grad_norm": 1.0783811807632446, + "learning_rate": 7.943415115969382e-05, + "loss": 1.7468, + "step": 23833 + }, + { + "epoch": 2.780772371951931, + "grad_norm": 1.164473533630371, + "learning_rate": 7.942093135899359e-05, + "loss": 1.9221, + "step": 23834 + }, + { + "epoch": 2.7808890444522225, + "grad_norm": 1.1128125190734863, + "learning_rate": 7.940771230552944e-05, + "loss": 2.0215, + "step": 23835 + }, + { + "epoch": 2.781005716952514, + "grad_norm": 1.157278299331665, + "learning_rate": 7.93944939994384e-05, + "loss": 1.8302, + "step": 23836 + }, + { + "epoch": 2.781122389452806, + "grad_norm": 1.14433753490448, + "learning_rate": 7.938127644085743e-05, + "loss": 1.9337, + "step": 23837 + }, + { + "epoch": 2.7812390619530976, + "grad_norm": 1.2126884460449219, + "learning_rate": 7.936805962992357e-05, + "loss": 1.9192, + "step": 23838 + }, + { + "epoch": 2.7813557344533892, + "grad_norm": 1.095374345779419, + "learning_rate": 7.93548435667739e-05, + "loss": 2.2005, + "step": 23839 + }, + { + "epoch": 2.781472406953681, + "grad_norm": 1.0658600330352783, + "learning_rate": 7.934162825154536e-05, + "loss": 1.8361, + "step": 23840 + }, + { + "epoch": 2.7815890794539726, + "grad_norm": 0.9639149904251099, + "learning_rate": 7.932841368437504e-05, + "loss": 1.8802, + "step": 23841 + }, + { + "epoch": 2.7817057519542643, + "grad_norm": 1.1573898792266846, + "learning_rate": 7.931519986539983e-05, + "loss": 1.9851, + "step": 23842 + }, + { + "epoch": 2.781822424454556, + "grad_norm": 1.032021403312683, + "learning_rate": 7.930198679475682e-05, + "loss": 1.8536, + "step": 23843 + }, + { + "epoch": 2.7819390969548476, + "grad_norm": 1.2606055736541748, + "learning_rate": 7.928877447258286e-05, + "loss": 2.1045, + "step": 23844 + }, + { + "epoch": 2.7820557694551393, + "grad_norm": 1.1939032077789307, + "learning_rate": 7.927556289901505e-05, + "loss": 2.0533, + "step": 23845 + }, + { + "epoch": 2.782172441955431, + "grad_norm": 1.1928519010543823, + "learning_rate": 7.926235207419025e-05, + "loss": 1.9075, + "step": 23846 + }, + { + "epoch": 2.7822891144557227, + "grad_norm": 1.1167724132537842, + "learning_rate": 7.924914199824549e-05, + "loss": 2.0765, + "step": 23847 + }, + { + "epoch": 2.7824057869560144, + "grad_norm": 1.1661477088928223, + "learning_rate": 7.923593267131774e-05, + "loss": 2.0188, + "step": 23848 + }, + { + "epoch": 2.782522459456306, + "grad_norm": 1.0566116571426392, + "learning_rate": 7.92227240935438e-05, + "loss": 1.9275, + "step": 23849 + }, + { + "epoch": 2.7826391319565977, + "grad_norm": 1.2536181211471558, + "learning_rate": 7.920951626506075e-05, + "loss": 1.9982, + "step": 23850 + }, + { + "epoch": 2.7827558044568894, + "grad_norm": 1.2499808073043823, + "learning_rate": 7.919630918600543e-05, + "loss": 1.9606, + "step": 23851 + }, + { + "epoch": 2.782872476957181, + "grad_norm": 1.1189022064208984, + "learning_rate": 7.918310285651484e-05, + "loss": 1.8966, + "step": 23852 + }, + { + "epoch": 2.7829891494574728, + "grad_norm": 1.0144566297531128, + "learning_rate": 7.916989727672581e-05, + "loss": 1.9833, + "step": 23853 + }, + { + "epoch": 2.7831058219577645, + "grad_norm": 1.0269719362258911, + "learning_rate": 7.915669244677522e-05, + "loss": 1.8794, + "step": 23854 + }, + { + "epoch": 2.783222494458056, + "grad_norm": 0.9689869284629822, + "learning_rate": 7.91434883668001e-05, + "loss": 2.0703, + "step": 23855 + }, + { + "epoch": 2.783339166958348, + "grad_norm": 1.040270209312439, + "learning_rate": 7.913028503693719e-05, + "loss": 1.9014, + "step": 23856 + }, + { + "epoch": 2.7834558394586395, + "grad_norm": 0.9474818706512451, + "learning_rate": 7.911708245732337e-05, + "loss": 1.8439, + "step": 23857 + }, + { + "epoch": 2.783572511958931, + "grad_norm": 0.943109393119812, + "learning_rate": 7.910388062809562e-05, + "loss": 1.9123, + "step": 23858 + }, + { + "epoch": 2.783689184459223, + "grad_norm": 1.1893786191940308, + "learning_rate": 7.909067954939068e-05, + "loss": 2.0396, + "step": 23859 + }, + { + "epoch": 2.7838058569595145, + "grad_norm": 1.1253243684768677, + "learning_rate": 7.907747922134548e-05, + "loss": 1.8058, + "step": 23860 + }, + { + "epoch": 2.7839225294598062, + "grad_norm": 1.1308135986328125, + "learning_rate": 7.90642796440969e-05, + "loss": 1.9347, + "step": 23861 + }, + { + "epoch": 2.784039201960098, + "grad_norm": 1.013464093208313, + "learning_rate": 7.905108081778167e-05, + "loss": 1.6964, + "step": 23862 + }, + { + "epoch": 2.7841558744603896, + "grad_norm": 1.2006853818893433, + "learning_rate": 7.903788274253672e-05, + "loss": 1.9535, + "step": 23863 + }, + { + "epoch": 2.7842725469606813, + "grad_norm": 1.0681883096694946, + "learning_rate": 7.90246854184988e-05, + "loss": 2.1887, + "step": 23864 + }, + { + "epoch": 2.784389219460973, + "grad_norm": 1.1928815841674805, + "learning_rate": 7.901148884580481e-05, + "loss": 2.0603, + "step": 23865 + }, + { + "epoch": 2.7845058919612646, + "grad_norm": 1.1804819107055664, + "learning_rate": 7.899829302459147e-05, + "loss": 1.9302, + "step": 23866 + }, + { + "epoch": 2.7846225644615563, + "grad_norm": 1.2193841934204102, + "learning_rate": 7.898509795499565e-05, + "loss": 2.0757, + "step": 23867 + }, + { + "epoch": 2.784739236961848, + "grad_norm": 1.2945096492767334, + "learning_rate": 7.897190363715405e-05, + "loss": 1.8774, + "step": 23868 + }, + { + "epoch": 2.7848559094621397, + "grad_norm": 1.0529396533966064, + "learning_rate": 7.895871007120356e-05, + "loss": 1.892, + "step": 23869 + }, + { + "epoch": 2.7849725819624314, + "grad_norm": 1.1745758056640625, + "learning_rate": 7.894551725728095e-05, + "loss": 1.9112, + "step": 23870 + }, + { + "epoch": 2.785089254462723, + "grad_norm": 1.1112191677093506, + "learning_rate": 7.893232519552285e-05, + "loss": 1.6976, + "step": 23871 + }, + { + "epoch": 2.7852059269630147, + "grad_norm": 1.0716092586517334, + "learning_rate": 7.891913388606621e-05, + "loss": 1.8973, + "step": 23872 + }, + { + "epoch": 2.7853225994633064, + "grad_norm": 1.1470310688018799, + "learning_rate": 7.890594332904758e-05, + "loss": 1.9148, + "step": 23873 + }, + { + "epoch": 2.785439271963598, + "grad_norm": 1.1708486080169678, + "learning_rate": 7.889275352460391e-05, + "loss": 2.1585, + "step": 23874 + }, + { + "epoch": 2.7855559444638898, + "grad_norm": 1.0278874635696411, + "learning_rate": 7.88795644728718e-05, + "loss": 1.8854, + "step": 23875 + }, + { + "epoch": 2.7856726169641814, + "grad_norm": 1.030173420906067, + "learning_rate": 7.886637617398804e-05, + "loss": 2.047, + "step": 23876 + }, + { + "epoch": 2.785789289464473, + "grad_norm": 0.9869953393936157, + "learning_rate": 7.885318862808929e-05, + "loss": 1.9063, + "step": 23877 + }, + { + "epoch": 2.785905961964765, + "grad_norm": 1.138113021850586, + "learning_rate": 7.884000183531235e-05, + "loss": 1.8678, + "step": 23878 + }, + { + "epoch": 2.7860226344650565, + "grad_norm": 1.0196382999420166, + "learning_rate": 7.882681579579387e-05, + "loss": 1.6011, + "step": 23879 + }, + { + "epoch": 2.786139306965348, + "grad_norm": 1.1751987934112549, + "learning_rate": 7.88136305096706e-05, + "loss": 1.9877, + "step": 23880 + }, + { + "epoch": 2.78625597946564, + "grad_norm": 1.0051474571228027, + "learning_rate": 7.880044597707912e-05, + "loss": 1.6991, + "step": 23881 + }, + { + "epoch": 2.7863726519659315, + "grad_norm": 1.101763129234314, + "learning_rate": 7.878726219815624e-05, + "loss": 1.9081, + "step": 23882 + }, + { + "epoch": 2.786489324466223, + "grad_norm": 1.0364420413970947, + "learning_rate": 7.877407917303854e-05, + "loss": 1.9327, + "step": 23883 + }, + { + "epoch": 2.786605996966515, + "grad_norm": 1.0583326816558838, + "learning_rate": 7.87608969018628e-05, + "loss": 1.9674, + "step": 23884 + }, + { + "epoch": 2.7867226694668066, + "grad_norm": 1.0635806322097778, + "learning_rate": 7.874771538476552e-05, + "loss": 1.9131, + "step": 23885 + }, + { + "epoch": 2.7868393419670983, + "grad_norm": 1.3038928508758545, + "learning_rate": 7.87345346218835e-05, + "loss": 2.0231, + "step": 23886 + }, + { + "epoch": 2.78695601446739, + "grad_norm": 1.1065365076065063, + "learning_rate": 7.872135461335327e-05, + "loss": 1.8539, + "step": 23887 + }, + { + "epoch": 2.7870726869676816, + "grad_norm": 1.0721060037612915, + "learning_rate": 7.870817535931158e-05, + "loss": 1.9079, + "step": 23888 + }, + { + "epoch": 2.7871893594679733, + "grad_norm": 1.0505542755126953, + "learning_rate": 7.8694996859895e-05, + "loss": 1.9853, + "step": 23889 + }, + { + "epoch": 2.787306031968265, + "grad_norm": 1.0951558351516724, + "learning_rate": 7.868181911524006e-05, + "loss": 1.8682, + "step": 23890 + }, + { + "epoch": 2.7874227044685567, + "grad_norm": 1.1872919797897339, + "learning_rate": 7.866864212548352e-05, + "loss": 2.0531, + "step": 23891 + }, + { + "epoch": 2.7875393769688483, + "grad_norm": 1.1072404384613037, + "learning_rate": 7.865546589076188e-05, + "loss": 1.888, + "step": 23892 + }, + { + "epoch": 2.78765604946914, + "grad_norm": 1.5258407592773438, + "learning_rate": 7.864229041121181e-05, + "loss": 2.1536, + "step": 23893 + }, + { + "epoch": 2.7877727219694317, + "grad_norm": 1.1520053148269653, + "learning_rate": 7.862911568696986e-05, + "loss": 1.8837, + "step": 23894 + }, + { + "epoch": 2.7878893944697234, + "grad_norm": 1.048467755317688, + "learning_rate": 7.861594171817263e-05, + "loss": 1.8297, + "step": 23895 + }, + { + "epoch": 2.788006066970015, + "grad_norm": 1.178010106086731, + "learning_rate": 7.860276850495664e-05, + "loss": 2.0713, + "step": 23896 + }, + { + "epoch": 2.7881227394703068, + "grad_norm": 1.2221466302871704, + "learning_rate": 7.858959604745856e-05, + "loss": 2.0037, + "step": 23897 + }, + { + "epoch": 2.7882394119705984, + "grad_norm": 1.0603387355804443, + "learning_rate": 7.857642434581482e-05, + "loss": 1.9342, + "step": 23898 + }, + { + "epoch": 2.78835608447089, + "grad_norm": 1.19815993309021, + "learning_rate": 7.856325340016205e-05, + "loss": 2.0019, + "step": 23899 + }, + { + "epoch": 2.788472756971182, + "grad_norm": 1.229620337486267, + "learning_rate": 7.855008321063681e-05, + "loss": 2.1058, + "step": 23900 + }, + { + "epoch": 2.7885894294714735, + "grad_norm": 1.0913786888122559, + "learning_rate": 7.853691377737557e-05, + "loss": 1.9047, + "step": 23901 + }, + { + "epoch": 2.788706101971765, + "grad_norm": 1.072268009185791, + "learning_rate": 7.852374510051492e-05, + "loss": 1.8641, + "step": 23902 + }, + { + "epoch": 2.788822774472057, + "grad_norm": 1.117405891418457, + "learning_rate": 7.851057718019126e-05, + "loss": 1.8981, + "step": 23903 + }, + { + "epoch": 2.7889394469723485, + "grad_norm": 1.052243709564209, + "learning_rate": 7.849741001654129e-05, + "loss": 1.7815, + "step": 23904 + }, + { + "epoch": 2.78905611947264, + "grad_norm": 1.0426521301269531, + "learning_rate": 7.848424360970138e-05, + "loss": 1.9206, + "step": 23905 + }, + { + "epoch": 2.789172791972932, + "grad_norm": 1.1402182579040527, + "learning_rate": 7.847107795980807e-05, + "loss": 2.0317, + "step": 23906 + }, + { + "epoch": 2.7892894644732236, + "grad_norm": 1.1354701519012451, + "learning_rate": 7.845791306699777e-05, + "loss": 1.8481, + "step": 23907 + }, + { + "epoch": 2.7894061369735152, + "grad_norm": 1.222512125968933, + "learning_rate": 7.844474893140705e-05, + "loss": 1.9159, + "step": 23908 + }, + { + "epoch": 2.789522809473807, + "grad_norm": 1.0263458490371704, + "learning_rate": 7.843158555317232e-05, + "loss": 1.8843, + "step": 23909 + }, + { + "epoch": 2.7896394819740986, + "grad_norm": 1.2007981538772583, + "learning_rate": 7.841842293243009e-05, + "loss": 1.9566, + "step": 23910 + }, + { + "epoch": 2.7897561544743903, + "grad_norm": 1.3258512020111084, + "learning_rate": 7.840526106931678e-05, + "loss": 2.0085, + "step": 23911 + }, + { + "epoch": 2.789872826974682, + "grad_norm": 1.2894628047943115, + "learning_rate": 7.839209996396883e-05, + "loss": 1.9413, + "step": 23912 + }, + { + "epoch": 2.7899894994749737, + "grad_norm": 0.9614120125770569, + "learning_rate": 7.83789396165228e-05, + "loss": 1.8441, + "step": 23913 + }, + { + "epoch": 2.7901061719752653, + "grad_norm": 1.1230570077896118, + "learning_rate": 7.836578002711492e-05, + "loss": 1.9002, + "step": 23914 + }, + { + "epoch": 2.790222844475557, + "grad_norm": 1.0352160930633545, + "learning_rate": 7.835262119588183e-05, + "loss": 1.8749, + "step": 23915 + }, + { + "epoch": 2.7903395169758487, + "grad_norm": 1.1406424045562744, + "learning_rate": 7.833946312295974e-05, + "loss": 2.0101, + "step": 23916 + }, + { + "epoch": 2.7904561894761404, + "grad_norm": 1.2934430837631226, + "learning_rate": 7.832630580848523e-05, + "loss": 2.0339, + "step": 23917 + }, + { + "epoch": 2.790572861976432, + "grad_norm": 1.067082166671753, + "learning_rate": 7.831314925259454e-05, + "loss": 1.9564, + "step": 23918 + }, + { + "epoch": 2.7906895344767237, + "grad_norm": 1.0611909627914429, + "learning_rate": 7.829999345542425e-05, + "loss": 1.9996, + "step": 23919 + }, + { + "epoch": 2.7908062069770154, + "grad_norm": 0.9862976670265198, + "learning_rate": 7.828683841711056e-05, + "loss": 1.9606, + "step": 23920 + }, + { + "epoch": 2.790922879477307, + "grad_norm": 1.1210079193115234, + "learning_rate": 7.827368413779e-05, + "loss": 1.8784, + "step": 23921 + }, + { + "epoch": 2.791039551977599, + "grad_norm": 1.13047194480896, + "learning_rate": 7.826053061759886e-05, + "loss": 1.9057, + "step": 23922 + }, + { + "epoch": 2.7911562244778905, + "grad_norm": 0.9488397240638733, + "learning_rate": 7.824737785667347e-05, + "loss": 1.8886, + "step": 23923 + }, + { + "epoch": 2.791272896978182, + "grad_norm": 0.9701293706893921, + "learning_rate": 7.823422585515027e-05, + "loss": 1.8357, + "step": 23924 + }, + { + "epoch": 2.791389569478474, + "grad_norm": 1.048075795173645, + "learning_rate": 7.822107461316547e-05, + "loss": 1.894, + "step": 23925 + }, + { + "epoch": 2.7915062419787655, + "grad_norm": 1.1397279500961304, + "learning_rate": 7.82079241308556e-05, + "loss": 1.9859, + "step": 23926 + }, + { + "epoch": 2.791622914479057, + "grad_norm": 0.9823477864265442, + "learning_rate": 7.819477440835681e-05, + "loss": 1.7995, + "step": 23927 + }, + { + "epoch": 2.791739586979349, + "grad_norm": 1.1680094003677368, + "learning_rate": 7.818162544580554e-05, + "loss": 2.1301, + "step": 23928 + }, + { + "epoch": 2.7918562594796406, + "grad_norm": 1.1899933815002441, + "learning_rate": 7.816847724333803e-05, + "loss": 2.0144, + "step": 23929 + }, + { + "epoch": 2.7919729319799322, + "grad_norm": 1.1783193349838257, + "learning_rate": 7.815532980109068e-05, + "loss": 2.0528, + "step": 23930 + }, + { + "epoch": 2.792089604480224, + "grad_norm": 1.0127898454666138, + "learning_rate": 7.814218311919965e-05, + "loss": 1.7846, + "step": 23931 + }, + { + "epoch": 2.7922062769805156, + "grad_norm": 1.07941472530365, + "learning_rate": 7.812903719780137e-05, + "loss": 2.0243, + "step": 23932 + }, + { + "epoch": 2.7923229494808073, + "grad_norm": 1.0774614810943604, + "learning_rate": 7.811589203703203e-05, + "loss": 1.933, + "step": 23933 + }, + { + "epoch": 2.792439621981099, + "grad_norm": 1.3610172271728516, + "learning_rate": 7.810274763702794e-05, + "loss": 1.9181, + "step": 23934 + }, + { + "epoch": 2.7925562944813906, + "grad_norm": 1.2291573286056519, + "learning_rate": 7.808960399792531e-05, + "loss": 1.9339, + "step": 23935 + }, + { + "epoch": 2.7926729669816823, + "grad_norm": 1.0506091117858887, + "learning_rate": 7.807646111986055e-05, + "loss": 1.7277, + "step": 23936 + }, + { + "epoch": 2.792789639481974, + "grad_norm": 1.024863600730896, + "learning_rate": 7.80633190029697e-05, + "loss": 2.1074, + "step": 23937 + }, + { + "epoch": 2.7929063119822657, + "grad_norm": 1.2216037511825562, + "learning_rate": 7.80501776473892e-05, + "loss": 1.9259, + "step": 23938 + }, + { + "epoch": 2.7930229844825574, + "grad_norm": 1.1445215940475464, + "learning_rate": 7.803703705325513e-05, + "loss": 1.8501, + "step": 23939 + }, + { + "epoch": 2.793139656982849, + "grad_norm": 1.1542941331863403, + "learning_rate": 7.802389722070384e-05, + "loss": 1.9878, + "step": 23940 + }, + { + "epoch": 2.7932563294831407, + "grad_norm": 0.984419047832489, + "learning_rate": 7.801075814987149e-05, + "loss": 1.9488, + "step": 23941 + }, + { + "epoch": 2.7933730019834324, + "grad_norm": 1.008609414100647, + "learning_rate": 7.799761984089426e-05, + "loss": 1.7183, + "step": 23942 + }, + { + "epoch": 2.793489674483724, + "grad_norm": 1.1185460090637207, + "learning_rate": 7.798448229390841e-05, + "loss": 1.8353, + "step": 23943 + }, + { + "epoch": 2.7936063469840158, + "grad_norm": 1.1369491815567017, + "learning_rate": 7.79713455090501e-05, + "loss": 2.0324, + "step": 23944 + }, + { + "epoch": 2.7937230194843075, + "grad_norm": 1.2290923595428467, + "learning_rate": 7.795820948645555e-05, + "loss": 1.9977, + "step": 23945 + }, + { + "epoch": 2.793839691984599, + "grad_norm": 1.1820437908172607, + "learning_rate": 7.79450742262609e-05, + "loss": 1.895, + "step": 23946 + }, + { + "epoch": 2.793956364484891, + "grad_norm": 1.0108602046966553, + "learning_rate": 7.793193972860237e-05, + "loss": 1.9195, + "step": 23947 + }, + { + "epoch": 2.7940730369851825, + "grad_norm": 1.0261238813400269, + "learning_rate": 7.791880599361605e-05, + "loss": 1.864, + "step": 23948 + }, + { + "epoch": 2.794189709485474, + "grad_norm": 1.2684171199798584, + "learning_rate": 7.79056730214382e-05, + "loss": 2.1995, + "step": 23949 + }, + { + "epoch": 2.794306381985766, + "grad_norm": 1.1047543287277222, + "learning_rate": 7.789254081220485e-05, + "loss": 1.952, + "step": 23950 + }, + { + "epoch": 2.7944230544860575, + "grad_norm": 1.1199289560317993, + "learning_rate": 7.787940936605228e-05, + "loss": 1.8246, + "step": 23951 + }, + { + "epoch": 2.7945397269863492, + "grad_norm": 1.048520565032959, + "learning_rate": 7.786627868311646e-05, + "loss": 2.0097, + "step": 23952 + }, + { + "epoch": 2.794656399486641, + "grad_norm": 1.52412748336792, + "learning_rate": 7.785314876353362e-05, + "loss": 2.1158, + "step": 23953 + }, + { + "epoch": 2.7947730719869326, + "grad_norm": 1.0922008752822876, + "learning_rate": 7.78400196074399e-05, + "loss": 2.0727, + "step": 23954 + }, + { + "epoch": 2.7948897444872243, + "grad_norm": 1.135825753211975, + "learning_rate": 7.782689121497137e-05, + "loss": 2.1313, + "step": 23955 + }, + { + "epoch": 2.795006416987516, + "grad_norm": 1.1965943574905396, + "learning_rate": 7.781376358626405e-05, + "loss": 1.9721, + "step": 23956 + }, + { + "epoch": 2.7951230894878076, + "grad_norm": 1.0805597305297852, + "learning_rate": 7.780063672145417e-05, + "loss": 2.0099, + "step": 23957 + }, + { + "epoch": 2.7952397619880993, + "grad_norm": 1.0768250226974487, + "learning_rate": 7.778751062067777e-05, + "loss": 1.836, + "step": 23958 + }, + { + "epoch": 2.795356434488391, + "grad_norm": 1.154886245727539, + "learning_rate": 7.777438528407085e-05, + "loss": 1.821, + "step": 23959 + }, + { + "epoch": 2.7954731069886827, + "grad_norm": 1.0873219966888428, + "learning_rate": 7.776126071176957e-05, + "loss": 1.8088, + "step": 23960 + }, + { + "epoch": 2.7955897794889744, + "grad_norm": 0.8722853660583496, + "learning_rate": 7.774813690390992e-05, + "loss": 1.7517, + "step": 23961 + }, + { + "epoch": 2.795706451989266, + "grad_norm": 1.2518028020858765, + "learning_rate": 7.773501386062803e-05, + "loss": 1.97, + "step": 23962 + }, + { + "epoch": 2.7958231244895577, + "grad_norm": 1.068901777267456, + "learning_rate": 7.772189158205989e-05, + "loss": 2.1285, + "step": 23963 + }, + { + "epoch": 2.7959397969898494, + "grad_norm": 0.9215750694274902, + "learning_rate": 7.770877006834158e-05, + "loss": 1.8381, + "step": 23964 + }, + { + "epoch": 2.796056469490141, + "grad_norm": 0.925142228603363, + "learning_rate": 7.769564931960905e-05, + "loss": 1.7293, + "step": 23965 + }, + { + "epoch": 2.7961731419904328, + "grad_norm": 1.0484628677368164, + "learning_rate": 7.768252933599837e-05, + "loss": 1.9426, + "step": 23966 + }, + { + "epoch": 2.7962898144907244, + "grad_norm": 1.2685691118240356, + "learning_rate": 7.766941011764562e-05, + "loss": 1.7831, + "step": 23967 + }, + { + "epoch": 2.796406486991016, + "grad_norm": 1.1443783044815063, + "learning_rate": 7.765629166468668e-05, + "loss": 2.0162, + "step": 23968 + }, + { + "epoch": 2.796523159491308, + "grad_norm": 1.2253998517990112, + "learning_rate": 7.764317397725768e-05, + "loss": 2.0526, + "step": 23969 + }, + { + "epoch": 2.7966398319915995, + "grad_norm": 1.1339373588562012, + "learning_rate": 7.763005705549448e-05, + "loss": 1.8144, + "step": 23970 + }, + { + "epoch": 2.796756504491891, + "grad_norm": 1.0871104001998901, + "learning_rate": 7.761694089953318e-05, + "loss": 2.066, + "step": 23971 + }, + { + "epoch": 2.796873176992183, + "grad_norm": 1.0255595445632935, + "learning_rate": 7.760382550950965e-05, + "loss": 1.7679, + "step": 23972 + }, + { + "epoch": 2.7969898494924745, + "grad_norm": 1.325268268585205, + "learning_rate": 7.759071088555994e-05, + "loss": 2.185, + "step": 23973 + }, + { + "epoch": 2.797106521992766, + "grad_norm": 1.1452879905700684, + "learning_rate": 7.757759702781998e-05, + "loss": 2.0629, + "step": 23974 + }, + { + "epoch": 2.797223194493058, + "grad_norm": 1.102278470993042, + "learning_rate": 7.756448393642571e-05, + "loss": 1.9876, + "step": 23975 + }, + { + "epoch": 2.7973398669933496, + "grad_norm": 1.105897068977356, + "learning_rate": 7.755137161151301e-05, + "loss": 1.9729, + "step": 23976 + }, + { + "epoch": 2.7974565394936413, + "grad_norm": 1.1512722969055176, + "learning_rate": 7.753826005321795e-05, + "loss": 2.0279, + "step": 23977 + }, + { + "epoch": 2.797573211993933, + "grad_norm": 1.0844428539276123, + "learning_rate": 7.75251492616763e-05, + "loss": 1.921, + "step": 23978 + }, + { + "epoch": 2.7976898844942246, + "grad_norm": 1.2124453783035278, + "learning_rate": 7.75120392370241e-05, + "loss": 1.7516, + "step": 23979 + }, + { + "epoch": 2.7978065569945163, + "grad_norm": 1.2809096574783325, + "learning_rate": 7.749892997939722e-05, + "loss": 1.93, + "step": 23980 + }, + { + "epoch": 2.797923229494808, + "grad_norm": 1.0641348361968994, + "learning_rate": 7.748582148893156e-05, + "loss": 1.8779, + "step": 23981 + }, + { + "epoch": 2.7980399019950997, + "grad_norm": 1.1366809606552124, + "learning_rate": 7.747271376576307e-05, + "loss": 1.8657, + "step": 23982 + }, + { + "epoch": 2.7981565744953913, + "grad_norm": 1.3439019918441772, + "learning_rate": 7.74596068100275e-05, + "loss": 2.0291, + "step": 23983 + }, + { + "epoch": 2.798273246995683, + "grad_norm": 0.9205772280693054, + "learning_rate": 7.744650062186086e-05, + "loss": 1.9479, + "step": 23984 + }, + { + "epoch": 2.7983899194959747, + "grad_norm": 1.124215006828308, + "learning_rate": 7.743339520139895e-05, + "loss": 2.1115, + "step": 23985 + }, + { + "epoch": 2.7985065919962664, + "grad_norm": 1.2746814489364624, + "learning_rate": 7.742029054877769e-05, + "loss": 2.0551, + "step": 23986 + }, + { + "epoch": 2.798623264496558, + "grad_norm": 1.153973937034607, + "learning_rate": 7.740718666413285e-05, + "loss": 1.9859, + "step": 23987 + }, + { + "epoch": 2.7987399369968498, + "grad_norm": 1.1088330745697021, + "learning_rate": 7.73940835476004e-05, + "loss": 2.0686, + "step": 23988 + }, + { + "epoch": 2.7988566094971414, + "grad_norm": 1.1098239421844482, + "learning_rate": 7.738098119931603e-05, + "loss": 1.7829, + "step": 23989 + }, + { + "epoch": 2.798973281997433, + "grad_norm": 1.293030858039856, + "learning_rate": 7.736787961941568e-05, + "loss": 1.9424, + "step": 23990 + }, + { + "epoch": 2.799089954497725, + "grad_norm": 1.2378029823303223, + "learning_rate": 7.735477880803518e-05, + "loss": 2.0075, + "step": 23991 + }, + { + "epoch": 2.7992066269980165, + "grad_norm": 1.1687257289886475, + "learning_rate": 7.734167876531024e-05, + "loss": 1.8219, + "step": 23992 + }, + { + "epoch": 2.799323299498308, + "grad_norm": 1.0458660125732422, + "learning_rate": 7.732857949137676e-05, + "loss": 1.7456, + "step": 23993 + }, + { + "epoch": 2.7994399719986, + "grad_norm": 1.2541357278823853, + "learning_rate": 7.731548098637048e-05, + "loss": 1.9572, + "step": 23994 + }, + { + "epoch": 2.7995566444988915, + "grad_norm": 1.0463134050369263, + "learning_rate": 7.730238325042727e-05, + "loss": 1.9913, + "step": 23995 + }, + { + "epoch": 2.799673316999183, + "grad_norm": 1.0291328430175781, + "learning_rate": 7.72892862836828e-05, + "loss": 1.8321, + "step": 23996 + }, + { + "epoch": 2.799789989499475, + "grad_norm": 0.9814932942390442, + "learning_rate": 7.727619008627296e-05, + "loss": 1.8852, + "step": 23997 + }, + { + "epoch": 2.7999066619997666, + "grad_norm": 1.0728387832641602, + "learning_rate": 7.726309465833341e-05, + "loss": 1.8502, + "step": 23998 + }, + { + "epoch": 2.8000233345000582, + "grad_norm": 1.0880916118621826, + "learning_rate": 7.725000000000003e-05, + "loss": 1.9171, + "step": 23999 + }, + { + "epoch": 2.80014000700035, + "grad_norm": 1.0664563179016113, + "learning_rate": 7.723690611140843e-05, + "loss": 1.8219, + "step": 24000 + }, + { + "epoch": 2.8002566795006416, + "grad_norm": 1.085408091545105, + "learning_rate": 7.72238129926945e-05, + "loss": 2.0662, + "step": 24001 + }, + { + "epoch": 2.8003733520009333, + "grad_norm": 1.027113676071167, + "learning_rate": 7.721072064399383e-05, + "loss": 2.0018, + "step": 24002 + }, + { + "epoch": 2.800490024501225, + "grad_norm": 1.111801028251648, + "learning_rate": 7.719762906544229e-05, + "loss": 2.0571, + "step": 24003 + }, + { + "epoch": 2.8006066970015167, + "grad_norm": 1.328308343887329, + "learning_rate": 7.718453825717544e-05, + "loss": 2.0452, + "step": 24004 + }, + { + "epoch": 2.8007233695018083, + "grad_norm": 1.1303551197052002, + "learning_rate": 7.717144821932911e-05, + "loss": 2.1728, + "step": 24005 + }, + { + "epoch": 2.8008400420021, + "grad_norm": 1.146216630935669, + "learning_rate": 7.715835895203904e-05, + "loss": 2.0547, + "step": 24006 + }, + { + "epoch": 2.8009567145023917, + "grad_norm": 1.0637223720550537, + "learning_rate": 7.714527045544083e-05, + "loss": 1.7969, + "step": 24007 + }, + { + "epoch": 2.8010733870026834, + "grad_norm": 1.0052849054336548, + "learning_rate": 7.713218272967015e-05, + "loss": 2.0956, + "step": 24008 + }, + { + "epoch": 2.801190059502975, + "grad_norm": 1.1430656909942627, + "learning_rate": 7.711909577486276e-05, + "loss": 2.0069, + "step": 24009 + }, + { + "epoch": 2.8013067320032667, + "grad_norm": 1.131178379058838, + "learning_rate": 7.710600959115432e-05, + "loss": 1.8795, + "step": 24010 + }, + { + "epoch": 2.8014234045035584, + "grad_norm": 1.1074771881103516, + "learning_rate": 7.709292417868039e-05, + "loss": 1.9979, + "step": 24011 + }, + { + "epoch": 2.80154007700385, + "grad_norm": 1.470991611480713, + "learning_rate": 7.707983953757677e-05, + "loss": 2.0595, + "step": 24012 + }, + { + "epoch": 2.801656749504142, + "grad_norm": 1.038270115852356, + "learning_rate": 7.706675566797898e-05, + "loss": 2.0842, + "step": 24013 + }, + { + "epoch": 2.8017734220044335, + "grad_norm": 1.1266132593154907, + "learning_rate": 7.705367257002278e-05, + "loss": 2.1745, + "step": 24014 + }, + { + "epoch": 2.801890094504725, + "grad_norm": 1.1408543586730957, + "learning_rate": 7.704059024384366e-05, + "loss": 1.9588, + "step": 24015 + }, + { + "epoch": 2.802006767005017, + "grad_norm": 0.9505099654197693, + "learning_rate": 7.702750868957743e-05, + "loss": 1.8075, + "step": 24016 + }, + { + "epoch": 2.8021234395053085, + "grad_norm": 1.1217987537384033, + "learning_rate": 7.70144279073595e-05, + "loss": 2.1257, + "step": 24017 + }, + { + "epoch": 2.8022401120056, + "grad_norm": 1.1128634214401245, + "learning_rate": 7.700134789732558e-05, + "loss": 1.9411, + "step": 24018 + }, + { + "epoch": 2.802356784505892, + "grad_norm": 1.159819483757019, + "learning_rate": 7.698826865961132e-05, + "loss": 1.9193, + "step": 24019 + }, + { + "epoch": 2.8024734570061836, + "grad_norm": 1.0866807699203491, + "learning_rate": 7.697519019435221e-05, + "loss": 1.8726, + "step": 24020 + }, + { + "epoch": 2.8025901295064752, + "grad_norm": 1.1103285551071167, + "learning_rate": 7.696211250168393e-05, + "loss": 2.0058, + "step": 24021 + }, + { + "epoch": 2.802706802006767, + "grad_norm": 1.21074640750885, + "learning_rate": 7.694903558174194e-05, + "loss": 2.1061, + "step": 24022 + }, + { + "epoch": 2.8028234745070586, + "grad_norm": 0.9905675053596497, + "learning_rate": 7.693595943466193e-05, + "loss": 1.8102, + "step": 24023 + }, + { + "epoch": 2.8029401470073503, + "grad_norm": 1.1328951120376587, + "learning_rate": 7.69228840605794e-05, + "loss": 2.1441, + "step": 24024 + }, + { + "epoch": 2.803056819507642, + "grad_norm": 0.9711671471595764, + "learning_rate": 7.690980945962985e-05, + "loss": 1.8834, + "step": 24025 + }, + { + "epoch": 2.8031734920079336, + "grad_norm": 1.2523612976074219, + "learning_rate": 7.689673563194891e-05, + "loss": 1.9445, + "step": 24026 + }, + { + "epoch": 2.8032901645082253, + "grad_norm": 1.1368001699447632, + "learning_rate": 7.68836625776721e-05, + "loss": 2.0283, + "step": 24027 + }, + { + "epoch": 2.803406837008517, + "grad_norm": 1.2209763526916504, + "learning_rate": 7.687059029693486e-05, + "loss": 2.0305, + "step": 24028 + }, + { + "epoch": 2.8035235095088087, + "grad_norm": 1.1641137599945068, + "learning_rate": 7.685751878987284e-05, + "loss": 2.1418, + "step": 24029 + }, + { + "epoch": 2.8036401820091004, + "grad_norm": 1.1007359027862549, + "learning_rate": 7.68444480566214e-05, + "loss": 1.9285, + "step": 24030 + }, + { + "epoch": 2.803756854509392, + "grad_norm": 1.2123669385910034, + "learning_rate": 7.683137809731615e-05, + "loss": 1.7361, + "step": 24031 + }, + { + "epoch": 2.8038735270096837, + "grad_norm": 1.060633659362793, + "learning_rate": 7.681830891209262e-05, + "loss": 1.7816, + "step": 24032 + }, + { + "epoch": 2.8039901995099754, + "grad_norm": 0.9673359394073486, + "learning_rate": 7.680524050108616e-05, + "loss": 1.9175, + "step": 24033 + }, + { + "epoch": 2.804106872010267, + "grad_norm": 1.1953078508377075, + "learning_rate": 7.679217286443241e-05, + "loss": 1.8856, + "step": 24034 + }, + { + "epoch": 2.8042235445105588, + "grad_norm": 1.1135526895523071, + "learning_rate": 7.677910600226669e-05, + "loss": 1.898, + "step": 24035 + }, + { + "epoch": 2.8043402170108505, + "grad_norm": 1.0474785566329956, + "learning_rate": 7.676603991472459e-05, + "loss": 1.8878, + "step": 24036 + }, + { + "epoch": 2.804456889511142, + "grad_norm": 1.1429764032363892, + "learning_rate": 7.675297460194147e-05, + "loss": 2.1634, + "step": 24037 + }, + { + "epoch": 2.804573562011434, + "grad_norm": 1.2223453521728516, + "learning_rate": 7.673991006405284e-05, + "loss": 1.9159, + "step": 24038 + }, + { + "epoch": 2.8046902345117255, + "grad_norm": 0.9494529366493225, + "learning_rate": 7.672684630119407e-05, + "loss": 1.7864, + "step": 24039 + }, + { + "epoch": 2.804806907012017, + "grad_norm": 1.1642721891403198, + "learning_rate": 7.671378331350069e-05, + "loss": 2.0068, + "step": 24040 + }, + { + "epoch": 2.804923579512309, + "grad_norm": 1.1185411214828491, + "learning_rate": 7.670072110110799e-05, + "loss": 2.0629, + "step": 24041 + }, + { + "epoch": 2.8050402520126005, + "grad_norm": 1.0198204517364502, + "learning_rate": 7.668765966415156e-05, + "loss": 1.6979, + "step": 24042 + }, + { + "epoch": 2.8051569245128922, + "grad_norm": 1.2785193920135498, + "learning_rate": 7.667459900276668e-05, + "loss": 1.9492, + "step": 24043 + }, + { + "epoch": 2.805273597013184, + "grad_norm": 1.0466303825378418, + "learning_rate": 7.66615391170887e-05, + "loss": 1.8664, + "step": 24044 + }, + { + "epoch": 2.8053902695134756, + "grad_norm": 1.3235493898391724, + "learning_rate": 7.664848000725317e-05, + "loss": 2.0221, + "step": 24045 + }, + { + "epoch": 2.8055069420137673, + "grad_norm": 1.0223454236984253, + "learning_rate": 7.663542167339531e-05, + "loss": 1.9438, + "step": 24046 + }, + { + "epoch": 2.805623614514059, + "grad_norm": 1.0003396272659302, + "learning_rate": 7.662236411565065e-05, + "loss": 1.869, + "step": 24047 + }, + { + "epoch": 2.8057402870143506, + "grad_norm": 1.0755469799041748, + "learning_rate": 7.660930733415441e-05, + "loss": 1.9014, + "step": 24048 + }, + { + "epoch": 2.8058569595146423, + "grad_norm": 1.0914629697799683, + "learning_rate": 7.659625132904209e-05, + "loss": 2.181, + "step": 24049 + }, + { + "epoch": 2.805973632014934, + "grad_norm": 1.0015144348144531, + "learning_rate": 7.658319610044889e-05, + "loss": 1.9038, + "step": 24050 + }, + { + "epoch": 2.8060903045152257, + "grad_norm": 1.189847469329834, + "learning_rate": 7.657014164851027e-05, + "loss": 1.9917, + "step": 24051 + }, + { + "epoch": 2.8062069770155174, + "grad_norm": 1.4138869047164917, + "learning_rate": 7.65570879733615e-05, + "loss": 1.869, + "step": 24052 + }, + { + "epoch": 2.806323649515809, + "grad_norm": 1.2938871383666992, + "learning_rate": 7.654403507513798e-05, + "loss": 1.8097, + "step": 24053 + }, + { + "epoch": 2.8064403220161007, + "grad_norm": 1.0761537551879883, + "learning_rate": 7.65309829539749e-05, + "loss": 2.0032, + "step": 24054 + }, + { + "epoch": 2.8065569945163924, + "grad_norm": 1.1020355224609375, + "learning_rate": 7.651793161000772e-05, + "loss": 1.9655, + "step": 24055 + }, + { + "epoch": 2.806673667016684, + "grad_norm": 1.1606765985488892, + "learning_rate": 7.650488104337161e-05, + "loss": 2.1056, + "step": 24056 + }, + { + "epoch": 2.8067903395169758, + "grad_norm": 1.1932214498519897, + "learning_rate": 7.649183125420193e-05, + "loss": 1.9442, + "step": 24057 + }, + { + "epoch": 2.8069070120172674, + "grad_norm": 1.10541570186615, + "learning_rate": 7.6478782242634e-05, + "loss": 2.0095, + "step": 24058 + }, + { + "epoch": 2.807023684517559, + "grad_norm": 0.8979233503341675, + "learning_rate": 7.646573400880307e-05, + "loss": 1.8202, + "step": 24059 + }, + { + "epoch": 2.807140357017851, + "grad_norm": 1.1278023719787598, + "learning_rate": 7.645268655284437e-05, + "loss": 1.9857, + "step": 24060 + }, + { + "epoch": 2.8072570295181425, + "grad_norm": 1.0917731523513794, + "learning_rate": 7.643963987489316e-05, + "loss": 2.0548, + "step": 24061 + }, + { + "epoch": 2.807373702018434, + "grad_norm": 1.1091790199279785, + "learning_rate": 7.642659397508476e-05, + "loss": 1.9218, + "step": 24062 + }, + { + "epoch": 2.807490374518726, + "grad_norm": 1.269929051399231, + "learning_rate": 7.641354885355435e-05, + "loss": 2.0127, + "step": 24063 + }, + { + "epoch": 2.8076070470190175, + "grad_norm": 0.9813669323921204, + "learning_rate": 7.640050451043721e-05, + "loss": 1.6928, + "step": 24064 + }, + { + "epoch": 2.807723719519309, + "grad_norm": 1.3013055324554443, + "learning_rate": 7.638746094586852e-05, + "loss": 2.0991, + "step": 24065 + }, + { + "epoch": 2.807840392019601, + "grad_norm": 1.1829631328582764, + "learning_rate": 7.637441815998358e-05, + "loss": 1.9587, + "step": 24066 + }, + { + "epoch": 2.8079570645198926, + "grad_norm": 1.098610281944275, + "learning_rate": 7.636137615291751e-05, + "loss": 1.9143, + "step": 24067 + }, + { + "epoch": 2.8080737370201843, + "grad_norm": 1.0936604738235474, + "learning_rate": 7.634833492480564e-05, + "loss": 1.9437, + "step": 24068 + }, + { + "epoch": 2.808190409520476, + "grad_norm": 1.1362850666046143, + "learning_rate": 7.6335294475783e-05, + "loss": 2.0197, + "step": 24069 + }, + { + "epoch": 2.8083070820207676, + "grad_norm": 0.8823561668395996, + "learning_rate": 7.632225480598487e-05, + "loss": 1.8635, + "step": 24070 + }, + { + "epoch": 2.8084237545210593, + "grad_norm": 1.0965825319290161, + "learning_rate": 7.63092159155465e-05, + "loss": 1.8765, + "step": 24071 + }, + { + "epoch": 2.808540427021351, + "grad_norm": 1.1763098239898682, + "learning_rate": 7.629617780460293e-05, + "loss": 1.8908, + "step": 24072 + }, + { + "epoch": 2.8086570995216427, + "grad_norm": 1.1390353441238403, + "learning_rate": 7.628314047328944e-05, + "loss": 1.9299, + "step": 24073 + }, + { + "epoch": 2.8087737720219343, + "grad_norm": 1.0586899518966675, + "learning_rate": 7.62701039217411e-05, + "loss": 2.0644, + "step": 24074 + }, + { + "epoch": 2.808890444522226, + "grad_norm": 1.081599235534668, + "learning_rate": 7.625706815009312e-05, + "loss": 2.0802, + "step": 24075 + }, + { + "epoch": 2.8090071170225177, + "grad_norm": 1.2772541046142578, + "learning_rate": 7.624403315848062e-05, + "loss": 2.1453, + "step": 24076 + }, + { + "epoch": 2.8091237895228094, + "grad_norm": 0.9874762296676636, + "learning_rate": 7.623099894703868e-05, + "loss": 1.8089, + "step": 24077 + }, + { + "epoch": 2.809240462023101, + "grad_norm": 1.00009286403656, + "learning_rate": 7.621796551590252e-05, + "loss": 1.9403, + "step": 24078 + }, + { + "epoch": 2.8093571345233928, + "grad_norm": 1.039926528930664, + "learning_rate": 7.62049328652072e-05, + "loss": 1.9249, + "step": 24079 + }, + { + "epoch": 2.8094738070236844, + "grad_norm": 1.1763916015625, + "learning_rate": 7.619190099508779e-05, + "loss": 2.0071, + "step": 24080 + }, + { + "epoch": 2.809590479523976, + "grad_norm": 1.1930272579193115, + "learning_rate": 7.617886990567947e-05, + "loss": 2.0775, + "step": 24081 + }, + { + "epoch": 2.809707152024268, + "grad_norm": 1.3357001543045044, + "learning_rate": 7.616583959711727e-05, + "loss": 1.978, + "step": 24082 + }, + { + "epoch": 2.8098238245245595, + "grad_norm": 1.026517629623413, + "learning_rate": 7.615281006953627e-05, + "loss": 1.7934, + "step": 24083 + }, + { + "epoch": 2.809940497024851, + "grad_norm": 1.2027158737182617, + "learning_rate": 7.613978132307164e-05, + "loss": 1.9806, + "step": 24084 + }, + { + "epoch": 2.810057169525143, + "grad_norm": 1.1442087888717651, + "learning_rate": 7.612675335785832e-05, + "loss": 1.9066, + "step": 24085 + }, + { + "epoch": 2.8101738420254345, + "grad_norm": 0.9906253218650818, + "learning_rate": 7.611372617403149e-05, + "loss": 1.7999, + "step": 24086 + }, + { + "epoch": 2.810290514525726, + "grad_norm": 1.0745035409927368, + "learning_rate": 7.610069977172613e-05, + "loss": 1.8687, + "step": 24087 + }, + { + "epoch": 2.810407187026018, + "grad_norm": 1.3793940544128418, + "learning_rate": 7.60876741510773e-05, + "loss": 1.9212, + "step": 24088 + }, + { + "epoch": 2.8105238595263096, + "grad_norm": 1.1241450309753418, + "learning_rate": 7.607464931221999e-05, + "loss": 1.8929, + "step": 24089 + }, + { + "epoch": 2.8106405320266012, + "grad_norm": 1.1157517433166504, + "learning_rate": 7.606162525528933e-05, + "loss": 1.9015, + "step": 24090 + }, + { + "epoch": 2.810757204526893, + "grad_norm": 1.0017465353012085, + "learning_rate": 7.604860198042022e-05, + "loss": 1.9708, + "step": 24091 + }, + { + "epoch": 2.8108738770271846, + "grad_norm": 1.1310089826583862, + "learning_rate": 7.603557948774779e-05, + "loss": 2.0978, + "step": 24092 + }, + { + "epoch": 2.8109905495274763, + "grad_norm": 1.3078604936599731, + "learning_rate": 7.602255777740694e-05, + "loss": 2.0384, + "step": 24093 + }, + { + "epoch": 2.811107222027768, + "grad_norm": 1.1446444988250732, + "learning_rate": 7.600953684953273e-05, + "loss": 1.9498, + "step": 24094 + }, + { + "epoch": 2.8112238945280597, + "grad_norm": 1.2201106548309326, + "learning_rate": 7.599651670426013e-05, + "loss": 2.0051, + "step": 24095 + }, + { + "epoch": 2.8113405670283513, + "grad_norm": 1.1223416328430176, + "learning_rate": 7.598349734172407e-05, + "loss": 1.9868, + "step": 24096 + }, + { + "epoch": 2.811457239528643, + "grad_norm": 1.0502432584762573, + "learning_rate": 7.597047876205964e-05, + "loss": 2.0589, + "step": 24097 + }, + { + "epoch": 2.8115739120289347, + "grad_norm": 1.188437581062317, + "learning_rate": 7.595746096540164e-05, + "loss": 1.949, + "step": 24098 + }, + { + "epoch": 2.8116905845292264, + "grad_norm": 1.1618660688400269, + "learning_rate": 7.594444395188517e-05, + "loss": 2.0616, + "step": 24099 + }, + { + "epoch": 2.811807257029518, + "grad_norm": 1.1143810749053955, + "learning_rate": 7.593142772164505e-05, + "loss": 2.089, + "step": 24100 + }, + { + "epoch": 2.8119239295298097, + "grad_norm": 1.333685040473938, + "learning_rate": 7.591841227481635e-05, + "loss": 1.9519, + "step": 24101 + }, + { + "epoch": 2.8120406020301014, + "grad_norm": 1.1245956420898438, + "learning_rate": 7.590539761153385e-05, + "loss": 2.0485, + "step": 24102 + }, + { + "epoch": 2.812157274530393, + "grad_norm": 1.0959360599517822, + "learning_rate": 7.589238373193263e-05, + "loss": 2.0081, + "step": 24103 + }, + { + "epoch": 2.812273947030685, + "grad_norm": 1.2230308055877686, + "learning_rate": 7.58793706361475e-05, + "loss": 1.9111, + "step": 24104 + }, + { + "epoch": 2.8123906195309765, + "grad_norm": 1.0389742851257324, + "learning_rate": 7.586635832431341e-05, + "loss": 1.9432, + "step": 24105 + }, + { + "epoch": 2.812507292031268, + "grad_norm": 1.3440678119659424, + "learning_rate": 7.585334679656519e-05, + "loss": 2.0601, + "step": 24106 + }, + { + "epoch": 2.81262396453156, + "grad_norm": 1.1144626140594482, + "learning_rate": 7.584033605303785e-05, + "loss": 2.1842, + "step": 24107 + }, + { + "epoch": 2.8127406370318515, + "grad_norm": 1.2125157117843628, + "learning_rate": 7.582732609386612e-05, + "loss": 2.0621, + "step": 24108 + }, + { + "epoch": 2.812857309532143, + "grad_norm": 1.1142081022262573, + "learning_rate": 7.581431691918503e-05, + "loss": 1.7642, + "step": 24109 + }, + { + "epoch": 2.812973982032435, + "grad_norm": 1.0169248580932617, + "learning_rate": 7.58013085291293e-05, + "loss": 1.8816, + "step": 24110 + }, + { + "epoch": 2.8130906545327266, + "grad_norm": 1.196913480758667, + "learning_rate": 7.578830092383393e-05, + "loss": 2.0539, + "step": 24111 + }, + { + "epoch": 2.8132073270330182, + "grad_norm": 1.0647783279418945, + "learning_rate": 7.577529410343366e-05, + "loss": 2.0068, + "step": 24112 + }, + { + "epoch": 2.81332399953331, + "grad_norm": 1.0687955617904663, + "learning_rate": 7.576228806806334e-05, + "loss": 1.8767, + "step": 24113 + }, + { + "epoch": 2.8134406720336016, + "grad_norm": 1.1545114517211914, + "learning_rate": 7.574928281785787e-05, + "loss": 1.9394, + "step": 24114 + }, + { + "epoch": 2.8135573445338933, + "grad_norm": 1.1991791725158691, + "learning_rate": 7.573627835295196e-05, + "loss": 1.865, + "step": 24115 + }, + { + "epoch": 2.813674017034185, + "grad_norm": 0.9967076182365417, + "learning_rate": 7.572327467348058e-05, + "loss": 1.9445, + "step": 24116 + }, + { + "epoch": 2.8137906895344766, + "grad_norm": 1.2507702112197876, + "learning_rate": 7.571027177957835e-05, + "loss": 2.1225, + "step": 24117 + }, + { + "epoch": 2.8139073620347683, + "grad_norm": 1.1141937971115112, + "learning_rate": 7.569726967138027e-05, + "loss": 1.9734, + "step": 24118 + }, + { + "epoch": 2.81402403453506, + "grad_norm": 1.0067639350891113, + "learning_rate": 7.568426834902097e-05, + "loss": 1.8732, + "step": 24119 + }, + { + "epoch": 2.8141407070353517, + "grad_norm": 1.0814107656478882, + "learning_rate": 7.567126781263532e-05, + "loss": 1.9339, + "step": 24120 + }, + { + "epoch": 2.8142573795356434, + "grad_norm": 0.946601390838623, + "learning_rate": 7.565826806235806e-05, + "loss": 1.8564, + "step": 24121 + }, + { + "epoch": 2.814374052035935, + "grad_norm": 1.1321886777877808, + "learning_rate": 7.564526909832394e-05, + "loss": 1.7736, + "step": 24122 + }, + { + "epoch": 2.8144907245362267, + "grad_norm": 1.168405294418335, + "learning_rate": 7.563227092066784e-05, + "loss": 1.9616, + "step": 24123 + }, + { + "epoch": 2.8146073970365184, + "grad_norm": 1.1023215055465698, + "learning_rate": 7.561927352952433e-05, + "loss": 1.9818, + "step": 24124 + }, + { + "epoch": 2.81472406953681, + "grad_norm": 1.0407209396362305, + "learning_rate": 7.560627692502833e-05, + "loss": 1.7827, + "step": 24125 + }, + { + "epoch": 2.8148407420371018, + "grad_norm": 1.1960339546203613, + "learning_rate": 7.559328110731441e-05, + "loss": 2.1139, + "step": 24126 + }, + { + "epoch": 2.8149574145373935, + "grad_norm": 1.0068436861038208, + "learning_rate": 7.55802860765174e-05, + "loss": 2.0758, + "step": 24127 + }, + { + "epoch": 2.815074087037685, + "grad_norm": 1.2094858884811401, + "learning_rate": 7.556729183277202e-05, + "loss": 2.169, + "step": 24128 + }, + { + "epoch": 2.815190759537977, + "grad_norm": 1.0755996704101562, + "learning_rate": 7.555429837621297e-05, + "loss": 1.9575, + "step": 24129 + }, + { + "epoch": 2.8153074320382685, + "grad_norm": 1.064942479133606, + "learning_rate": 7.554130570697486e-05, + "loss": 2.0485, + "step": 24130 + }, + { + "epoch": 2.81542410453856, + "grad_norm": 1.1683865785598755, + "learning_rate": 7.55283138251925e-05, + "loss": 1.8771, + "step": 24131 + }, + { + "epoch": 2.815540777038852, + "grad_norm": 1.2074592113494873, + "learning_rate": 7.551532273100049e-05, + "loss": 2.128, + "step": 24132 + }, + { + "epoch": 2.8156574495391435, + "grad_norm": 1.2205678224563599, + "learning_rate": 7.550233242453359e-05, + "loss": 1.9058, + "step": 24133 + }, + { + "epoch": 2.8157741220394352, + "grad_norm": 1.142980694770813, + "learning_rate": 7.548934290592637e-05, + "loss": 2.0307, + "step": 24134 + }, + { + "epoch": 2.815890794539727, + "grad_norm": 1.1961544752120972, + "learning_rate": 7.547635417531357e-05, + "loss": 1.9354, + "step": 24135 + }, + { + "epoch": 2.8160074670400186, + "grad_norm": 1.1350429058074951, + "learning_rate": 7.546336623282988e-05, + "loss": 1.8756, + "step": 24136 + }, + { + "epoch": 2.8161241395403103, + "grad_norm": 1.082119345664978, + "learning_rate": 7.54503790786098e-05, + "loss": 1.9815, + "step": 24137 + }, + { + "epoch": 2.816240812040602, + "grad_norm": 1.0544136762619019, + "learning_rate": 7.543739271278813e-05, + "loss": 1.9443, + "step": 24138 + }, + { + "epoch": 2.8163574845408936, + "grad_norm": 1.1170779466629028, + "learning_rate": 7.542440713549936e-05, + "loss": 1.9875, + "step": 24139 + }, + { + "epoch": 2.8164741570411853, + "grad_norm": 1.0690505504608154, + "learning_rate": 7.541142234687823e-05, + "loss": 2.0704, + "step": 24140 + }, + { + "epoch": 2.816590829541477, + "grad_norm": 1.1075878143310547, + "learning_rate": 7.539843834705924e-05, + "loss": 1.8879, + "step": 24141 + }, + { + "epoch": 2.8167075020417687, + "grad_norm": 1.1072782278060913, + "learning_rate": 7.538545513617708e-05, + "loss": 1.9987, + "step": 24142 + }, + { + "epoch": 2.8168241745420604, + "grad_norm": 1.2834924459457397, + "learning_rate": 7.537247271436628e-05, + "loss": 2.0922, + "step": 24143 + }, + { + "epoch": 2.816940847042352, + "grad_norm": 1.1036491394042969, + "learning_rate": 7.53594910817615e-05, + "loss": 1.9681, + "step": 24144 + }, + { + "epoch": 2.8170575195426437, + "grad_norm": 1.1444464921951294, + "learning_rate": 7.53465102384973e-05, + "loss": 1.9334, + "step": 24145 + }, + { + "epoch": 2.8171741920429354, + "grad_norm": 1.0738434791564941, + "learning_rate": 7.533353018470814e-05, + "loss": 2.0152, + "step": 24146 + }, + { + "epoch": 2.817290864543227, + "grad_norm": 1.1617168188095093, + "learning_rate": 7.532055092052875e-05, + "loss": 2.0465, + "step": 24147 + }, + { + "epoch": 2.8174075370435188, + "grad_norm": 1.0566837787628174, + "learning_rate": 7.530757244609355e-05, + "loss": 1.9642, + "step": 24148 + }, + { + "epoch": 2.8175242095438104, + "grad_norm": 1.1402606964111328, + "learning_rate": 7.529459476153721e-05, + "loss": 2.111, + "step": 24149 + }, + { + "epoch": 2.817640882044102, + "grad_norm": 1.070831298828125, + "learning_rate": 7.528161786699414e-05, + "loss": 2.0331, + "step": 24150 + }, + { + "epoch": 2.817757554544394, + "grad_norm": 1.0846045017242432, + "learning_rate": 7.526864176259898e-05, + "loss": 2.0904, + "step": 24151 + }, + { + "epoch": 2.8178742270446855, + "grad_norm": 0.9813400506973267, + "learning_rate": 7.525566644848616e-05, + "loss": 1.7886, + "step": 24152 + }, + { + "epoch": 2.817990899544977, + "grad_norm": 1.0217550992965698, + "learning_rate": 7.524269192479027e-05, + "loss": 1.9618, + "step": 24153 + }, + { + "epoch": 2.818107572045269, + "grad_norm": 0.9342487454414368, + "learning_rate": 7.522971819164574e-05, + "loss": 1.98, + "step": 24154 + }, + { + "epoch": 2.8182242445455605, + "grad_norm": 1.0250048637390137, + "learning_rate": 7.521674524918718e-05, + "loss": 1.9453, + "step": 24155 + }, + { + "epoch": 2.818340917045852, + "grad_norm": 1.1460484266281128, + "learning_rate": 7.520377309754896e-05, + "loss": 1.9648, + "step": 24156 + }, + { + "epoch": 2.818457589546144, + "grad_norm": 1.0590070486068726, + "learning_rate": 7.519080173686563e-05, + "loss": 2.0843, + "step": 24157 + }, + { + "epoch": 2.8185742620464356, + "grad_norm": 1.2106163501739502, + "learning_rate": 7.517783116727161e-05, + "loss": 2.0419, + "step": 24158 + }, + { + "epoch": 2.8186909345467273, + "grad_norm": 1.304437279701233, + "learning_rate": 7.516486138890147e-05, + "loss": 2.0128, + "step": 24159 + }, + { + "epoch": 2.818807607047019, + "grad_norm": 1.0062605142593384, + "learning_rate": 7.515189240188951e-05, + "loss": 1.9397, + "step": 24160 + }, + { + "epoch": 2.8189242795473106, + "grad_norm": 1.209843635559082, + "learning_rate": 7.513892420637033e-05, + "loss": 2.0877, + "step": 24161 + }, + { + "epoch": 2.8190409520476023, + "grad_norm": 1.0189557075500488, + "learning_rate": 7.512595680247824e-05, + "loss": 1.8903, + "step": 24162 + }, + { + "epoch": 2.819157624547894, + "grad_norm": 1.0554500818252563, + "learning_rate": 7.511299019034779e-05, + "loss": 1.8669, + "step": 24163 + }, + { + "epoch": 2.8192742970481857, + "grad_norm": 1.1452398300170898, + "learning_rate": 7.510002437011336e-05, + "loss": 1.9936, + "step": 24164 + }, + { + "epoch": 2.8193909695484773, + "grad_norm": 1.1616746187210083, + "learning_rate": 7.508705934190928e-05, + "loss": 1.9081, + "step": 24165 + }, + { + "epoch": 2.819507642048769, + "grad_norm": 1.0540701150894165, + "learning_rate": 7.507409510587011e-05, + "loss": 1.8216, + "step": 24166 + }, + { + "epoch": 2.8196243145490607, + "grad_norm": 1.258405327796936, + "learning_rate": 7.506113166213007e-05, + "loss": 2.041, + "step": 24167 + }, + { + "epoch": 2.8197409870493524, + "grad_norm": 1.294071912765503, + "learning_rate": 7.504816901082375e-05, + "loss": 1.9219, + "step": 24168 + }, + { + "epoch": 2.819857659549644, + "grad_norm": 1.0886456966400146, + "learning_rate": 7.503520715208536e-05, + "loss": 1.7307, + "step": 24169 + }, + { + "epoch": 2.8199743320499358, + "grad_norm": 1.0998224020004272, + "learning_rate": 7.50222460860494e-05, + "loss": 1.9667, + "step": 24170 + }, + { + "epoch": 2.8200910045502274, + "grad_norm": 0.9545168876647949, + "learning_rate": 7.500928581285013e-05, + "loss": 1.9685, + "step": 24171 + }, + { + "epoch": 2.820207677050519, + "grad_norm": 1.1204866170883179, + "learning_rate": 7.4996326332622e-05, + "loss": 1.7676, + "step": 24172 + }, + { + "epoch": 2.820324349550811, + "grad_norm": 1.0842610597610474, + "learning_rate": 7.498336764549929e-05, + "loss": 1.8621, + "step": 24173 + }, + { + "epoch": 2.8204410220511025, + "grad_norm": 1.2301602363586426, + "learning_rate": 7.497040975161634e-05, + "loss": 1.8503, + "step": 24174 + }, + { + "epoch": 2.820557694551394, + "grad_norm": 1.1002686023712158, + "learning_rate": 7.495745265110758e-05, + "loss": 1.9622, + "step": 24175 + }, + { + "epoch": 2.820674367051686, + "grad_norm": 1.2244905233383179, + "learning_rate": 7.49444963441072e-05, + "loss": 1.9745, + "step": 24176 + }, + { + "epoch": 2.8207910395519775, + "grad_norm": 0.8993510007858276, + "learning_rate": 7.493154083074968e-05, + "loss": 1.922, + "step": 24177 + }, + { + "epoch": 2.820907712052269, + "grad_norm": 1.1031244993209839, + "learning_rate": 7.491858611116915e-05, + "loss": 1.9011, + "step": 24178 + }, + { + "epoch": 2.821024384552561, + "grad_norm": 1.060878872871399, + "learning_rate": 7.490563218550005e-05, + "loss": 1.896, + "step": 24179 + }, + { + "epoch": 2.8211410570528526, + "grad_norm": 1.1544041633605957, + "learning_rate": 7.489267905387663e-05, + "loss": 2.0567, + "step": 24180 + }, + { + "epoch": 2.8212577295531442, + "grad_norm": 1.0399672985076904, + "learning_rate": 7.487972671643315e-05, + "loss": 1.9193, + "step": 24181 + }, + { + "epoch": 2.821374402053436, + "grad_norm": 1.3713241815567017, + "learning_rate": 7.486677517330384e-05, + "loss": 2.0265, + "step": 24182 + }, + { + "epoch": 2.8214910745537276, + "grad_norm": 1.114595890045166, + "learning_rate": 7.485382442462306e-05, + "loss": 1.8583, + "step": 24183 + }, + { + "epoch": 2.8216077470540193, + "grad_norm": 1.1226338148117065, + "learning_rate": 7.4840874470525e-05, + "loss": 1.8662, + "step": 24184 + }, + { + "epoch": 2.821724419554311, + "grad_norm": 1.1916735172271729, + "learning_rate": 7.482792531114398e-05, + "loss": 1.9954, + "step": 24185 + }, + { + "epoch": 2.8218410920546027, + "grad_norm": 1.1060312986373901, + "learning_rate": 7.481497694661416e-05, + "loss": 1.9883, + "step": 24186 + }, + { + "epoch": 2.8219577645548943, + "grad_norm": 0.9726078510284424, + "learning_rate": 7.480202937706982e-05, + "loss": 2.1098, + "step": 24187 + }, + { + "epoch": 2.822074437055186, + "grad_norm": 1.2068992853164673, + "learning_rate": 7.478908260264524e-05, + "loss": 1.9487, + "step": 24188 + }, + { + "epoch": 2.8221911095554777, + "grad_norm": 1.1315808296203613, + "learning_rate": 7.477613662347453e-05, + "loss": 2.032, + "step": 24189 + }, + { + "epoch": 2.8223077820557694, + "grad_norm": 1.2249984741210938, + "learning_rate": 7.476319143969201e-05, + "loss": 2.1014, + "step": 24190 + }, + { + "epoch": 2.822424454556061, + "grad_norm": 1.2527871131896973, + "learning_rate": 7.475024705143174e-05, + "loss": 2.0715, + "step": 24191 + }, + { + "epoch": 2.8225411270563527, + "grad_norm": 1.159120798110962, + "learning_rate": 7.473730345882808e-05, + "loss": 1.876, + "step": 24192 + }, + { + "epoch": 2.8226577995566444, + "grad_norm": 1.1238164901733398, + "learning_rate": 7.472436066201508e-05, + "loss": 1.9532, + "step": 24193 + }, + { + "epoch": 2.822774472056936, + "grad_norm": 1.0516101121902466, + "learning_rate": 7.4711418661127e-05, + "loss": 1.8491, + "step": 24194 + }, + { + "epoch": 2.822891144557228, + "grad_norm": 0.9779733419418335, + "learning_rate": 7.469847745629794e-05, + "loss": 1.8706, + "step": 24195 + }, + { + "epoch": 2.8230078170575195, + "grad_norm": 1.0057413578033447, + "learning_rate": 7.468553704766215e-05, + "loss": 2.0571, + "step": 24196 + }, + { + "epoch": 2.823124489557811, + "grad_norm": 1.0204660892486572, + "learning_rate": 7.467259743535373e-05, + "loss": 1.8134, + "step": 24197 + }, + { + "epoch": 2.823241162058103, + "grad_norm": 1.0541861057281494, + "learning_rate": 7.465965861950676e-05, + "loss": 1.9636, + "step": 24198 + }, + { + "epoch": 2.8233578345583945, + "grad_norm": 1.2155194282531738, + "learning_rate": 7.46467206002555e-05, + "loss": 1.9306, + "step": 24199 + }, + { + "epoch": 2.823474507058686, + "grad_norm": 1.1320008039474487, + "learning_rate": 7.463378337773395e-05, + "loss": 1.9504, + "step": 24200 + }, + { + "epoch": 2.823591179558978, + "grad_norm": 0.9656549692153931, + "learning_rate": 7.462084695207637e-05, + "loss": 1.9959, + "step": 24201 + }, + { + "epoch": 2.8237078520592696, + "grad_norm": 0.9440733790397644, + "learning_rate": 7.46079113234167e-05, + "loss": 1.8391, + "step": 24202 + }, + { + "epoch": 2.8238245245595612, + "grad_norm": 1.0317918062210083, + "learning_rate": 7.459497649188922e-05, + "loss": 1.8012, + "step": 24203 + }, + { + "epoch": 2.823941197059853, + "grad_norm": 1.0535697937011719, + "learning_rate": 7.458204245762788e-05, + "loss": 1.9574, + "step": 24204 + }, + { + "epoch": 2.8240578695601446, + "grad_norm": 1.1120076179504395, + "learning_rate": 7.456910922076688e-05, + "loss": 2.0118, + "step": 24205 + }, + { + "epoch": 2.8241745420604363, + "grad_norm": 1.078528642654419, + "learning_rate": 7.455617678144019e-05, + "loss": 2.1154, + "step": 24206 + }, + { + "epoch": 2.824291214560728, + "grad_norm": 1.05685293674469, + "learning_rate": 7.454324513978199e-05, + "loss": 2.1399, + "step": 24207 + }, + { + "epoch": 2.8244078870610196, + "grad_norm": 1.2329576015472412, + "learning_rate": 7.453031429592622e-05, + "loss": 2.2191, + "step": 24208 + }, + { + "epoch": 2.8245245595613113, + "grad_norm": 1.1216944456100464, + "learning_rate": 7.451738425000706e-05, + "loss": 2.0627, + "step": 24209 + }, + { + "epoch": 2.824641232061603, + "grad_norm": 1.1035584211349487, + "learning_rate": 7.450445500215843e-05, + "loss": 2.0231, + "step": 24210 + }, + { + "epoch": 2.8247579045618947, + "grad_norm": 0.9773833155632019, + "learning_rate": 7.449152655251448e-05, + "loss": 2.1052, + "step": 24211 + }, + { + "epoch": 2.8248745770621864, + "grad_norm": 1.0151066780090332, + "learning_rate": 7.447859890120913e-05, + "loss": 1.9451, + "step": 24212 + }, + { + "epoch": 2.824991249562478, + "grad_norm": 1.0936882495880127, + "learning_rate": 7.446567204837653e-05, + "loss": 2.0887, + "step": 24213 + }, + { + "epoch": 2.8251079220627697, + "grad_norm": 1.0245574712753296, + "learning_rate": 7.445274599415059e-05, + "loss": 1.9914, + "step": 24214 + }, + { + "epoch": 2.8252245945630614, + "grad_norm": 1.1110259294509888, + "learning_rate": 7.443982073866529e-05, + "loss": 1.8719, + "step": 24215 + }, + { + "epoch": 2.825341267063353, + "grad_norm": 1.2091046571731567, + "learning_rate": 7.442689628205473e-05, + "loss": 2.1541, + "step": 24216 + }, + { + "epoch": 2.8254579395636448, + "grad_norm": 0.8914376497268677, + "learning_rate": 7.44139726244528e-05, + "loss": 1.758, + "step": 24217 + }, + { + "epoch": 2.8255746120639365, + "grad_norm": 1.041609764099121, + "learning_rate": 7.440104976599354e-05, + "loss": 2.1644, + "step": 24218 + }, + { + "epoch": 2.825691284564228, + "grad_norm": 1.0194462537765503, + "learning_rate": 7.438812770681087e-05, + "loss": 2.0169, + "step": 24219 + }, + { + "epoch": 2.82580795706452, + "grad_norm": 1.3641612529754639, + "learning_rate": 7.437520644703883e-05, + "loss": 1.9638, + "step": 24220 + }, + { + "epoch": 2.8259246295648115, + "grad_norm": 1.0842536687850952, + "learning_rate": 7.436228598681125e-05, + "loss": 2.0105, + "step": 24221 + }, + { + "epoch": 2.826041302065103, + "grad_norm": 1.392494559288025, + "learning_rate": 7.434936632626223e-05, + "loss": 2.1081, + "step": 24222 + }, + { + "epoch": 2.826157974565395, + "grad_norm": 1.0235189199447632, + "learning_rate": 7.433644746552556e-05, + "loss": 2.0583, + "step": 24223 + }, + { + "epoch": 2.8262746470656865, + "grad_norm": 0.9694953560829163, + "learning_rate": 7.432352940473527e-05, + "loss": 1.8483, + "step": 24224 + }, + { + "epoch": 2.826391319565978, + "grad_norm": 1.1140865087509155, + "learning_rate": 7.43106121440252e-05, + "loss": 2.0427, + "step": 24225 + }, + { + "epoch": 2.82650799206627, + "grad_norm": 1.15293550491333, + "learning_rate": 7.429769568352932e-05, + "loss": 2.0893, + "step": 24226 + }, + { + "epoch": 2.8266246645665616, + "grad_norm": 1.0523045063018799, + "learning_rate": 7.428478002338156e-05, + "loss": 1.9445, + "step": 24227 + }, + { + "epoch": 2.8267413370668533, + "grad_norm": 1.0677748918533325, + "learning_rate": 7.427186516371573e-05, + "loss": 2.0187, + "step": 24228 + }, + { + "epoch": 2.826858009567145, + "grad_norm": 1.0943940877914429, + "learning_rate": 7.42589511046658e-05, + "loss": 1.9622, + "step": 24229 + }, + { + "epoch": 2.8269746820674366, + "grad_norm": 1.2291914224624634, + "learning_rate": 7.424603784636563e-05, + "loss": 2.0594, + "step": 24230 + }, + { + "epoch": 2.8270913545677283, + "grad_norm": 1.1192526817321777, + "learning_rate": 7.4233125388949e-05, + "loss": 1.7843, + "step": 24231 + }, + { + "epoch": 2.82720802706802, + "grad_norm": 1.0064201354980469, + "learning_rate": 7.42202137325499e-05, + "loss": 1.9199, + "step": 24232 + }, + { + "epoch": 2.8273246995683117, + "grad_norm": 1.0024864673614502, + "learning_rate": 7.420730287730214e-05, + "loss": 1.745, + "step": 24233 + }, + { + "epoch": 2.8274413720686034, + "grad_norm": 1.2564868927001953, + "learning_rate": 7.419439282333946e-05, + "loss": 1.8708, + "step": 24234 + }, + { + "epoch": 2.827558044568895, + "grad_norm": 1.0156290531158447, + "learning_rate": 7.418148357079586e-05, + "loss": 1.7864, + "step": 24235 + }, + { + "epoch": 2.8276747170691867, + "grad_norm": 0.9442567825317383, + "learning_rate": 7.416857511980505e-05, + "loss": 1.7304, + "step": 24236 + }, + { + "epoch": 2.8277913895694784, + "grad_norm": 1.2830936908721924, + "learning_rate": 7.415566747050096e-05, + "loss": 1.9708, + "step": 24237 + }, + { + "epoch": 2.82790806206977, + "grad_norm": 1.1574311256408691, + "learning_rate": 7.414276062301728e-05, + "loss": 1.8939, + "step": 24238 + }, + { + "epoch": 2.8280247345700618, + "grad_norm": 1.1104975938796997, + "learning_rate": 7.412985457748787e-05, + "loss": 1.8586, + "step": 24239 + }, + { + "epoch": 2.8281414070703534, + "grad_norm": 1.236574649810791, + "learning_rate": 7.411694933404657e-05, + "loss": 1.8393, + "step": 24240 + }, + { + "epoch": 2.828258079570645, + "grad_norm": 1.2895679473876953, + "learning_rate": 7.41040448928271e-05, + "loss": 2.0884, + "step": 24241 + }, + { + "epoch": 2.828374752070937, + "grad_norm": 1.128158688545227, + "learning_rate": 7.409114125396331e-05, + "loss": 1.959, + "step": 24242 + }, + { + "epoch": 2.8284914245712285, + "grad_norm": 1.0876127481460571, + "learning_rate": 7.407823841758886e-05, + "loss": 2.1086, + "step": 24243 + }, + { + "epoch": 2.82860809707152, + "grad_norm": 1.3720818758010864, + "learning_rate": 7.406533638383764e-05, + "loss": 2.1067, + "step": 24244 + }, + { + "epoch": 2.828724769571812, + "grad_norm": 1.1630836725234985, + "learning_rate": 7.40524351528433e-05, + "loss": 1.9588, + "step": 24245 + }, + { + "epoch": 2.8288414420721035, + "grad_norm": 1.1309750080108643, + "learning_rate": 7.403953472473967e-05, + "loss": 1.926, + "step": 24246 + }, + { + "epoch": 2.828958114572395, + "grad_norm": 1.1783370971679688, + "learning_rate": 7.402663509966039e-05, + "loss": 1.8632, + "step": 24247 + }, + { + "epoch": 2.829074787072687, + "grad_norm": 1.1799806356430054, + "learning_rate": 7.401373627773932e-05, + "loss": 1.8124, + "step": 24248 + }, + { + "epoch": 2.8291914595729786, + "grad_norm": 1.0773780345916748, + "learning_rate": 7.400083825911008e-05, + "loss": 1.9601, + "step": 24249 + }, + { + "epoch": 2.8293081320732703, + "grad_norm": 1.0269144773483276, + "learning_rate": 7.398794104390644e-05, + "loss": 1.746, + "step": 24250 + }, + { + "epoch": 2.829424804573562, + "grad_norm": 1.2024481296539307, + "learning_rate": 7.397504463226198e-05, + "loss": 2.0421, + "step": 24251 + }, + { + "epoch": 2.8295414770738536, + "grad_norm": 1.0284087657928467, + "learning_rate": 7.396214902431052e-05, + "loss": 1.803, + "step": 24252 + }, + { + "epoch": 2.8296581495741453, + "grad_norm": 1.333073377609253, + "learning_rate": 7.394925422018573e-05, + "loss": 2.1821, + "step": 24253 + }, + { + "epoch": 2.829774822074437, + "grad_norm": 1.0289913415908813, + "learning_rate": 7.393636022002126e-05, + "loss": 1.8969, + "step": 24254 + }, + { + "epoch": 2.8298914945747287, + "grad_norm": 1.2363678216934204, + "learning_rate": 7.392346702395083e-05, + "loss": 2.0857, + "step": 24255 + }, + { + "epoch": 2.8300081670750203, + "grad_norm": 1.0233060121536255, + "learning_rate": 7.391057463210801e-05, + "loss": 1.7598, + "step": 24256 + }, + { + "epoch": 2.830124839575312, + "grad_norm": 1.1428502798080444, + "learning_rate": 7.389768304462658e-05, + "loss": 2.0076, + "step": 24257 + }, + { + "epoch": 2.8302415120756037, + "grad_norm": 1.184105634689331, + "learning_rate": 7.388479226164005e-05, + "loss": 1.8987, + "step": 24258 + }, + { + "epoch": 2.8303581845758954, + "grad_norm": 1.116845726966858, + "learning_rate": 7.38719022832822e-05, + "loss": 1.8961, + "step": 24259 + }, + { + "epoch": 2.830474857076187, + "grad_norm": 1.2467721700668335, + "learning_rate": 7.385901310968651e-05, + "loss": 2.0699, + "step": 24260 + }, + { + "epoch": 2.8305915295764787, + "grad_norm": 1.0557777881622314, + "learning_rate": 7.384612474098673e-05, + "loss": 1.9376, + "step": 24261 + }, + { + "epoch": 2.8307082020767704, + "grad_norm": 1.0371558666229248, + "learning_rate": 7.383323717731636e-05, + "loss": 2.0379, + "step": 24262 + }, + { + "epoch": 2.830824874577062, + "grad_norm": 1.211887240409851, + "learning_rate": 7.382035041880913e-05, + "loss": 1.9489, + "step": 24263 + }, + { + "epoch": 2.830941547077354, + "grad_norm": 1.1005276441574097, + "learning_rate": 7.38074644655985e-05, + "loss": 2.0299, + "step": 24264 + }, + { + "epoch": 2.8310582195776455, + "grad_norm": 1.006805419921875, + "learning_rate": 7.379457931781818e-05, + "loss": 1.9698, + "step": 24265 + }, + { + "epoch": 2.831174892077937, + "grad_norm": 1.2911436557769775, + "learning_rate": 7.378169497560169e-05, + "loss": 1.9178, + "step": 24266 + }, + { + "epoch": 2.831291564578229, + "grad_norm": 1.2406286001205444, + "learning_rate": 7.376881143908255e-05, + "loss": 2.0279, + "step": 24267 + }, + { + "epoch": 2.8314082370785205, + "grad_norm": 1.1425420045852661, + "learning_rate": 7.375592870839445e-05, + "loss": 1.921, + "step": 24268 + }, + { + "epoch": 2.831524909578812, + "grad_norm": 0.9502052068710327, + "learning_rate": 7.374304678367078e-05, + "loss": 2.0023, + "step": 24269 + }, + { + "epoch": 2.831641582079104, + "grad_norm": 1.1624730825424194, + "learning_rate": 7.373016566504525e-05, + "loss": 2.0017, + "step": 24270 + }, + { + "epoch": 2.8317582545793956, + "grad_norm": 1.148964524269104, + "learning_rate": 7.371728535265127e-05, + "loss": 1.772, + "step": 24271 + }, + { + "epoch": 2.8318749270796872, + "grad_norm": 1.1260290145874023, + "learning_rate": 7.370440584662246e-05, + "loss": 1.9159, + "step": 24272 + }, + { + "epoch": 2.831991599579979, + "grad_norm": 1.169307827949524, + "learning_rate": 7.369152714709225e-05, + "loss": 2.0774, + "step": 24273 + }, + { + "epoch": 2.8321082720802706, + "grad_norm": 1.2122101783752441, + "learning_rate": 7.367864925419428e-05, + "loss": 1.8883, + "step": 24274 + }, + { + "epoch": 2.8322249445805623, + "grad_norm": 1.062070608139038, + "learning_rate": 7.366577216806189e-05, + "loss": 1.9016, + "step": 24275 + }, + { + "epoch": 2.832341617080854, + "grad_norm": 1.1832866668701172, + "learning_rate": 7.365289588882874e-05, + "loss": 2.0632, + "step": 24276 + }, + { + "epoch": 2.8324582895811456, + "grad_norm": 1.1000607013702393, + "learning_rate": 7.364002041662819e-05, + "loss": 1.9153, + "step": 24277 + }, + { + "epoch": 2.8325749620814373, + "grad_norm": 1.1013466119766235, + "learning_rate": 7.362714575159381e-05, + "loss": 1.9822, + "step": 24278 + }, + { + "epoch": 2.832691634581729, + "grad_norm": 1.048244833946228, + "learning_rate": 7.361427189385897e-05, + "loss": 1.8512, + "step": 24279 + }, + { + "epoch": 2.8328083070820207, + "grad_norm": 1.1262274980545044, + "learning_rate": 7.36013988435572e-05, + "loss": 1.8956, + "step": 24280 + }, + { + "epoch": 2.8329249795823124, + "grad_norm": 1.1369657516479492, + "learning_rate": 7.358852660082199e-05, + "loss": 1.9522, + "step": 24281 + }, + { + "epoch": 2.833041652082604, + "grad_norm": 1.257383942604065, + "learning_rate": 7.357565516578675e-05, + "loss": 1.976, + "step": 24282 + }, + { + "epoch": 2.8331583245828957, + "grad_norm": 1.0894227027893066, + "learning_rate": 7.356278453858485e-05, + "loss": 1.8909, + "step": 24283 + }, + { + "epoch": 2.8332749970831874, + "grad_norm": 1.2114423513412476, + "learning_rate": 7.354991471934982e-05, + "loss": 1.9982, + "step": 24284 + }, + { + "epoch": 2.833391669583479, + "grad_norm": 1.078319787979126, + "learning_rate": 7.353704570821504e-05, + "loss": 1.9942, + "step": 24285 + }, + { + "epoch": 2.833508342083771, + "grad_norm": 1.0400336980819702, + "learning_rate": 7.352417750531387e-05, + "loss": 1.9297, + "step": 24286 + }, + { + "epoch": 2.8336250145840625, + "grad_norm": 1.039182186126709, + "learning_rate": 7.351131011077981e-05, + "loss": 2.0623, + "step": 24287 + }, + { + "epoch": 2.833741687084354, + "grad_norm": 1.0704940557479858, + "learning_rate": 7.349844352474614e-05, + "loss": 1.9188, + "step": 24288 + }, + { + "epoch": 2.833858359584646, + "grad_norm": 1.1667015552520752, + "learning_rate": 7.348557774734636e-05, + "loss": 1.9931, + "step": 24289 + }, + { + "epoch": 2.8339750320849375, + "grad_norm": 1.161584734916687, + "learning_rate": 7.347271277871377e-05, + "loss": 2.1186, + "step": 24290 + }, + { + "epoch": 2.834091704585229, + "grad_norm": 1.0294344425201416, + "learning_rate": 7.345984861898181e-05, + "loss": 1.8686, + "step": 24291 + }, + { + "epoch": 2.834208377085521, + "grad_norm": 1.1365655660629272, + "learning_rate": 7.344698526828374e-05, + "loss": 2.1827, + "step": 24292 + }, + { + "epoch": 2.8343250495858126, + "grad_norm": 1.0870238542556763, + "learning_rate": 7.343412272675298e-05, + "loss": 2.0111, + "step": 24293 + }, + { + "epoch": 2.8344417220861042, + "grad_norm": 1.0961401462554932, + "learning_rate": 7.342126099452291e-05, + "loss": 2.0272, + "step": 24294 + }, + { + "epoch": 2.834558394586396, + "grad_norm": 1.2131887674331665, + "learning_rate": 7.340840007172678e-05, + "loss": 1.9493, + "step": 24295 + }, + { + "epoch": 2.8346750670866876, + "grad_norm": 1.1953351497650146, + "learning_rate": 7.339553995849802e-05, + "loss": 2.026, + "step": 24296 + }, + { + "epoch": 2.8347917395869793, + "grad_norm": 1.2625951766967773, + "learning_rate": 7.338268065496983e-05, + "loss": 1.89, + "step": 24297 + }, + { + "epoch": 2.834908412087271, + "grad_norm": 1.2082535028457642, + "learning_rate": 7.336982216127565e-05, + "loss": 1.9751, + "step": 24298 + }, + { + "epoch": 2.8350250845875626, + "grad_norm": 1.4756768941879272, + "learning_rate": 7.335696447754871e-05, + "loss": 2.1259, + "step": 24299 + }, + { + "epoch": 2.8351417570878543, + "grad_norm": 1.1518455743789673, + "learning_rate": 7.334410760392223e-05, + "loss": 2.0725, + "step": 24300 + }, + { + "epoch": 2.835258429588146, + "grad_norm": 1.354429841041565, + "learning_rate": 7.333125154052965e-05, + "loss": 1.975, + "step": 24301 + }, + { + "epoch": 2.8353751020884377, + "grad_norm": 1.1581525802612305, + "learning_rate": 7.331839628750417e-05, + "loss": 1.8965, + "step": 24302 + }, + { + "epoch": 2.8354917745887294, + "grad_norm": 1.1175106763839722, + "learning_rate": 7.330554184497902e-05, + "loss": 1.9582, + "step": 24303 + }, + { + "epoch": 2.835608447089021, + "grad_norm": 1.030395746231079, + "learning_rate": 7.329268821308753e-05, + "loss": 1.7687, + "step": 24304 + }, + { + "epoch": 2.8357251195893127, + "grad_norm": 0.9823726415634155, + "learning_rate": 7.32798353919629e-05, + "loss": 1.8486, + "step": 24305 + }, + { + "epoch": 2.8358417920896044, + "grad_norm": 1.1735846996307373, + "learning_rate": 7.326698338173838e-05, + "loss": 1.9798, + "step": 24306 + }, + { + "epoch": 2.835958464589896, + "grad_norm": 1.4011486768722534, + "learning_rate": 7.325413218254728e-05, + "loss": 1.7633, + "step": 24307 + }, + { + "epoch": 2.8360751370901878, + "grad_norm": 1.0152274370193481, + "learning_rate": 7.32412817945227e-05, + "loss": 1.8803, + "step": 24308 + }, + { + "epoch": 2.8361918095904795, + "grad_norm": 1.1476056575775146, + "learning_rate": 7.322843221779802e-05, + "loss": 1.9533, + "step": 24309 + }, + { + "epoch": 2.836308482090771, + "grad_norm": 1.2649385929107666, + "learning_rate": 7.321558345250629e-05, + "loss": 1.9508, + "step": 24310 + }, + { + "epoch": 2.836425154591063, + "grad_norm": 1.1017320156097412, + "learning_rate": 7.320273549878082e-05, + "loss": 1.972, + "step": 24311 + }, + { + "epoch": 2.8365418270913545, + "grad_norm": 0.9826095104217529, + "learning_rate": 7.318988835675471e-05, + "loss": 1.68, + "step": 24312 + }, + { + "epoch": 2.836658499591646, + "grad_norm": 1.0679599046707153, + "learning_rate": 7.317704202656128e-05, + "loss": 2.0341, + "step": 24313 + }, + { + "epoch": 2.836775172091938, + "grad_norm": 1.0804297924041748, + "learning_rate": 7.316419650833355e-05, + "loss": 1.9462, + "step": 24314 + }, + { + "epoch": 2.8368918445922295, + "grad_norm": 1.023881435394287, + "learning_rate": 7.315135180220483e-05, + "loss": 2.0331, + "step": 24315 + }, + { + "epoch": 2.837008517092521, + "grad_norm": 1.104872226715088, + "learning_rate": 7.313850790830815e-05, + "loss": 1.9233, + "step": 24316 + }, + { + "epoch": 2.837125189592813, + "grad_norm": 1.2440611124038696, + "learning_rate": 7.312566482677679e-05, + "loss": 1.8532, + "step": 24317 + }, + { + "epoch": 2.8372418620931046, + "grad_norm": 1.0148414373397827, + "learning_rate": 7.311282255774382e-05, + "loss": 1.9208, + "step": 24318 + }, + { + "epoch": 2.8373585345933963, + "grad_norm": 1.0969562530517578, + "learning_rate": 7.309998110134232e-05, + "loss": 1.9377, + "step": 24319 + }, + { + "epoch": 2.837475207093688, + "grad_norm": 1.0017850399017334, + "learning_rate": 7.308714045770557e-05, + "loss": 1.9616, + "step": 24320 + }, + { + "epoch": 2.8375918795939796, + "grad_norm": 1.1031982898712158, + "learning_rate": 7.30743006269665e-05, + "loss": 2.0062, + "step": 24321 + }, + { + "epoch": 2.8377085520942713, + "grad_norm": 0.918362557888031, + "learning_rate": 7.306146160925838e-05, + "loss": 1.8373, + "step": 24322 + }, + { + "epoch": 2.837825224594563, + "grad_norm": 0.9917141795158386, + "learning_rate": 7.30486234047142e-05, + "loss": 1.8706, + "step": 24323 + }, + { + "epoch": 2.8379418970948547, + "grad_norm": 1.2925317287445068, + "learning_rate": 7.303578601346714e-05, + "loss": 1.8106, + "step": 24324 + }, + { + "epoch": 2.8380585695951464, + "grad_norm": 1.158141016960144, + "learning_rate": 7.302294943565021e-05, + "loss": 1.9565, + "step": 24325 + }, + { + "epoch": 2.838175242095438, + "grad_norm": 1.2439813613891602, + "learning_rate": 7.301011367139656e-05, + "loss": 2.1351, + "step": 24326 + }, + { + "epoch": 2.8382919145957297, + "grad_norm": 1.2053965330123901, + "learning_rate": 7.299727872083914e-05, + "loss": 1.9147, + "step": 24327 + }, + { + "epoch": 2.8384085870960214, + "grad_norm": 1.068791389465332, + "learning_rate": 7.298444458411116e-05, + "loss": 1.9315, + "step": 24328 + }, + { + "epoch": 2.838525259596313, + "grad_norm": 1.0686925649642944, + "learning_rate": 7.297161126134555e-05, + "loss": 2.0042, + "step": 24329 + }, + { + "epoch": 2.8386419320966048, + "grad_norm": 1.259939432144165, + "learning_rate": 7.295877875267543e-05, + "loss": 1.9495, + "step": 24330 + }, + { + "epoch": 2.8387586045968964, + "grad_norm": 1.1881022453308105, + "learning_rate": 7.294594705823374e-05, + "loss": 1.9136, + "step": 24331 + }, + { + "epoch": 2.838875277097188, + "grad_norm": 1.0285192728042603, + "learning_rate": 7.293311617815358e-05, + "loss": 1.7229, + "step": 24332 + }, + { + "epoch": 2.83899194959748, + "grad_norm": 1.2153812646865845, + "learning_rate": 7.292028611256799e-05, + "loss": 1.8997, + "step": 24333 + }, + { + "epoch": 2.8391086220977715, + "grad_norm": 1.001731276512146, + "learning_rate": 7.290745686160994e-05, + "loss": 1.7237, + "step": 24334 + }, + { + "epoch": 2.839225294598063, + "grad_norm": 1.2857859134674072, + "learning_rate": 7.289462842541244e-05, + "loss": 2.0419, + "step": 24335 + }, + { + "epoch": 2.839341967098355, + "grad_norm": 0.9823650121688843, + "learning_rate": 7.28818008041084e-05, + "loss": 1.8796, + "step": 24336 + }, + { + "epoch": 2.8394586395986465, + "grad_norm": 1.0901724100112915, + "learning_rate": 7.286897399783092e-05, + "loss": 1.7944, + "step": 24337 + }, + { + "epoch": 2.839575312098938, + "grad_norm": 1.175248146057129, + "learning_rate": 7.285614800671289e-05, + "loss": 1.8905, + "step": 24338 + }, + { + "epoch": 2.83969198459923, + "grad_norm": 1.0228378772735596, + "learning_rate": 7.284332283088734e-05, + "loss": 1.9957, + "step": 24339 + }, + { + "epoch": 2.8398086570995216, + "grad_norm": 1.0766805410385132, + "learning_rate": 7.283049847048716e-05, + "loss": 1.8683, + "step": 24340 + }, + { + "epoch": 2.8399253295998133, + "grad_norm": 1.2261099815368652, + "learning_rate": 7.281767492564539e-05, + "loss": 1.8647, + "step": 24341 + }, + { + "epoch": 2.840042002100105, + "grad_norm": 1.0600591897964478, + "learning_rate": 7.280485219649485e-05, + "loss": 1.9357, + "step": 24342 + }, + { + "epoch": 2.8401586746003966, + "grad_norm": 1.1234513521194458, + "learning_rate": 7.27920302831686e-05, + "loss": 1.9264, + "step": 24343 + }, + { + "epoch": 2.8402753471006883, + "grad_norm": 1.0025252103805542, + "learning_rate": 7.277920918579945e-05, + "loss": 1.8385, + "step": 24344 + }, + { + "epoch": 2.84039201960098, + "grad_norm": 1.1912262439727783, + "learning_rate": 7.276638890452036e-05, + "loss": 1.9054, + "step": 24345 + }, + { + "epoch": 2.8405086921012717, + "grad_norm": 1.1176995038986206, + "learning_rate": 7.275356943946432e-05, + "loss": 2.013, + "step": 24346 + }, + { + "epoch": 2.8406253646015633, + "grad_norm": 1.1309925317764282, + "learning_rate": 7.274075079076407e-05, + "loss": 1.8474, + "step": 24347 + }, + { + "epoch": 2.840742037101855, + "grad_norm": 0.9529240727424622, + "learning_rate": 7.272793295855265e-05, + "loss": 1.7959, + "step": 24348 + }, + { + "epoch": 2.8408587096021467, + "grad_norm": 1.1288704872131348, + "learning_rate": 7.271511594296282e-05, + "loss": 2.0382, + "step": 24349 + }, + { + "epoch": 2.8409753821024384, + "grad_norm": 1.091220736503601, + "learning_rate": 7.270229974412753e-05, + "loss": 1.9101, + "step": 24350 + }, + { + "epoch": 2.84109205460273, + "grad_norm": 1.1306582689285278, + "learning_rate": 7.268948436217966e-05, + "loss": 1.9244, + "step": 24351 + }, + { + "epoch": 2.8412087271030217, + "grad_norm": 1.0810470581054688, + "learning_rate": 7.267666979725196e-05, + "loss": 1.8935, + "step": 24352 + }, + { + "epoch": 2.8413253996033134, + "grad_norm": 1.1265645027160645, + "learning_rate": 7.266385604947739e-05, + "loss": 1.9489, + "step": 24353 + }, + { + "epoch": 2.841442072103605, + "grad_norm": 1.2256308794021606, + "learning_rate": 7.265104311898872e-05, + "loss": 2.1321, + "step": 24354 + }, + { + "epoch": 2.841558744603897, + "grad_norm": 0.9278960824012756, + "learning_rate": 7.263823100591876e-05, + "loss": 1.9505, + "step": 24355 + }, + { + "epoch": 2.8416754171041885, + "grad_norm": 1.0979957580566406, + "learning_rate": 7.262541971040044e-05, + "loss": 2.1344, + "step": 24356 + }, + { + "epoch": 2.84179208960448, + "grad_norm": 1.0098646879196167, + "learning_rate": 7.261260923256644e-05, + "loss": 1.8401, + "step": 24357 + }, + { + "epoch": 2.841908762104772, + "grad_norm": 0.966763973236084, + "learning_rate": 7.259979957254964e-05, + "loss": 1.9253, + "step": 24358 + }, + { + "epoch": 2.8420254346050635, + "grad_norm": 1.2436269521713257, + "learning_rate": 7.258699073048287e-05, + "loss": 2.1538, + "step": 24359 + }, + { + "epoch": 2.842142107105355, + "grad_norm": 1.1831315755844116, + "learning_rate": 7.257418270649883e-05, + "loss": 1.948, + "step": 24360 + }, + { + "epoch": 2.842258779605647, + "grad_norm": 1.1174004077911377, + "learning_rate": 7.256137550073038e-05, + "loss": 2.0009, + "step": 24361 + }, + { + "epoch": 2.8423754521059386, + "grad_norm": 1.0325852632522583, + "learning_rate": 7.254856911331023e-05, + "loss": 1.9111, + "step": 24362 + }, + { + "epoch": 2.8424921246062302, + "grad_norm": 1.1385120153427124, + "learning_rate": 7.25357635443712e-05, + "loss": 1.9706, + "step": 24363 + }, + { + "epoch": 2.842608797106522, + "grad_norm": 1.1150277853012085, + "learning_rate": 7.252295879404597e-05, + "loss": 2.0874, + "step": 24364 + }, + { + "epoch": 2.8427254696068136, + "grad_norm": 1.0396515130996704, + "learning_rate": 7.251015486246738e-05, + "loss": 1.8619, + "step": 24365 + }, + { + "epoch": 2.8428421421071053, + "grad_norm": 1.1376018524169922, + "learning_rate": 7.249735174976805e-05, + "loss": 2.0232, + "step": 24366 + }, + { + "epoch": 2.842958814607397, + "grad_norm": 1.1602610349655151, + "learning_rate": 7.248454945608085e-05, + "loss": 2.0631, + "step": 24367 + }, + { + "epoch": 2.8430754871076886, + "grad_norm": 0.9666716456413269, + "learning_rate": 7.247174798153839e-05, + "loss": 1.7844, + "step": 24368 + }, + { + "epoch": 2.8431921596079803, + "grad_norm": 1.1162166595458984, + "learning_rate": 7.245894732627339e-05, + "loss": 2.0395, + "step": 24369 + }, + { + "epoch": 2.843308832108272, + "grad_norm": 1.2295277118682861, + "learning_rate": 7.24461474904186e-05, + "loss": 1.8811, + "step": 24370 + }, + { + "epoch": 2.8434255046085637, + "grad_norm": 1.0914992094039917, + "learning_rate": 7.243334847410666e-05, + "loss": 2.0309, + "step": 24371 + }, + { + "epoch": 2.8435421771088554, + "grad_norm": 1.0826783180236816, + "learning_rate": 7.242055027747035e-05, + "loss": 1.8682, + "step": 24372 + }, + { + "epoch": 2.843658849609147, + "grad_norm": 1.1467841863632202, + "learning_rate": 7.240775290064222e-05, + "loss": 2.099, + "step": 24373 + }, + { + "epoch": 2.8437755221094387, + "grad_norm": 1.144135594367981, + "learning_rate": 7.239495634375507e-05, + "loss": 1.9845, + "step": 24374 + }, + { + "epoch": 2.8438921946097304, + "grad_norm": 1.0850415229797363, + "learning_rate": 7.238216060694141e-05, + "loss": 1.9698, + "step": 24375 + }, + { + "epoch": 2.844008867110022, + "grad_norm": 0.9735828638076782, + "learning_rate": 7.236936569033406e-05, + "loss": 1.9619, + "step": 24376 + }, + { + "epoch": 2.844125539610314, + "grad_norm": 1.16916823387146, + "learning_rate": 7.235657159406552e-05, + "loss": 2.0275, + "step": 24377 + }, + { + "epoch": 2.8442422121106055, + "grad_norm": 1.034474492073059, + "learning_rate": 7.234377831826854e-05, + "loss": 2.0385, + "step": 24378 + }, + { + "epoch": 2.844358884610897, + "grad_norm": 1.086735725402832, + "learning_rate": 7.233098586307564e-05, + "loss": 2.0522, + "step": 24379 + }, + { + "epoch": 2.844475557111189, + "grad_norm": 1.3648332357406616, + "learning_rate": 7.231819422861954e-05, + "loss": 2.143, + "step": 24380 + }, + { + "epoch": 2.8445922296114805, + "grad_norm": 1.1987478733062744, + "learning_rate": 7.230540341503275e-05, + "loss": 2.2109, + "step": 24381 + }, + { + "epoch": 2.844708902111772, + "grad_norm": 1.1534322500228882, + "learning_rate": 7.229261342244797e-05, + "loss": 1.7765, + "step": 24382 + }, + { + "epoch": 2.844825574612064, + "grad_norm": 1.0220921039581299, + "learning_rate": 7.227982425099767e-05, + "loss": 1.934, + "step": 24383 + }, + { + "epoch": 2.8449422471123555, + "grad_norm": 1.2076501846313477, + "learning_rate": 7.226703590081459e-05, + "loss": 1.8054, + "step": 24384 + }, + { + "epoch": 2.8450589196126472, + "grad_norm": 1.0748056173324585, + "learning_rate": 7.225424837203117e-05, + "loss": 1.9157, + "step": 24385 + }, + { + "epoch": 2.845175592112939, + "grad_norm": 1.0123597383499146, + "learning_rate": 7.224146166478007e-05, + "loss": 1.8902, + "step": 24386 + }, + { + "epoch": 2.8452922646132306, + "grad_norm": 1.0487815141677856, + "learning_rate": 7.222867577919382e-05, + "loss": 1.8384, + "step": 24387 + }, + { + "epoch": 2.8454089371135223, + "grad_norm": 1.1744787693023682, + "learning_rate": 7.221589071540489e-05, + "loss": 2.0305, + "step": 24388 + }, + { + "epoch": 2.845525609613814, + "grad_norm": 1.0848314762115479, + "learning_rate": 7.220310647354595e-05, + "loss": 1.8358, + "step": 24389 + }, + { + "epoch": 2.8456422821141056, + "grad_norm": 1.039636254310608, + "learning_rate": 7.219032305374943e-05, + "loss": 1.8785, + "step": 24390 + }, + { + "epoch": 2.8457589546143973, + "grad_norm": 1.1353905200958252, + "learning_rate": 7.217754045614793e-05, + "loss": 1.8953, + "step": 24391 + }, + { + "epoch": 2.845875627114689, + "grad_norm": 1.114249587059021, + "learning_rate": 7.216475868087388e-05, + "loss": 1.9943, + "step": 24392 + }, + { + "epoch": 2.8459922996149807, + "grad_norm": 0.9781426787376404, + "learning_rate": 7.215197772805993e-05, + "loss": 2.0385, + "step": 24393 + }, + { + "epoch": 2.8461089721152724, + "grad_norm": 1.2175761461257935, + "learning_rate": 7.213919759783838e-05, + "loss": 1.9349, + "step": 24394 + }, + { + "epoch": 2.846225644615564, + "grad_norm": 1.0210646390914917, + "learning_rate": 7.212641829034192e-05, + "loss": 1.7181, + "step": 24395 + }, + { + "epoch": 2.8463423171158557, + "grad_norm": 0.9680715203285217, + "learning_rate": 7.211363980570289e-05, + "loss": 1.9317, + "step": 24396 + }, + { + "epoch": 2.8464589896161474, + "grad_norm": 1.0896140336990356, + "learning_rate": 7.210086214405378e-05, + "loss": 2.0351, + "step": 24397 + }, + { + "epoch": 2.846575662116439, + "grad_norm": 1.060739278793335, + "learning_rate": 7.208808530552716e-05, + "loss": 2.0344, + "step": 24398 + }, + { + "epoch": 2.8466923346167308, + "grad_norm": 0.9605180621147156, + "learning_rate": 7.207530929025537e-05, + "loss": 1.76, + "step": 24399 + }, + { + "epoch": 2.8468090071170224, + "grad_norm": 1.112412929534912, + "learning_rate": 7.206253409837095e-05, + "loss": 1.8663, + "step": 24400 + }, + { + "epoch": 2.846925679617314, + "grad_norm": 1.0082154273986816, + "learning_rate": 7.204975973000623e-05, + "loss": 1.9668, + "step": 24401 + }, + { + "epoch": 2.847042352117606, + "grad_norm": 1.0150035619735718, + "learning_rate": 7.203698618529377e-05, + "loss": 1.7664, + "step": 24402 + }, + { + "epoch": 2.8471590246178975, + "grad_norm": 1.0233936309814453, + "learning_rate": 7.202421346436592e-05, + "loss": 1.9876, + "step": 24403 + }, + { + "epoch": 2.847275697118189, + "grad_norm": 1.033396601676941, + "learning_rate": 7.201144156735511e-05, + "loss": 1.932, + "step": 24404 + }, + { + "epoch": 2.847392369618481, + "grad_norm": 1.265470266342163, + "learning_rate": 7.199867049439365e-05, + "loss": 2.0126, + "step": 24405 + }, + { + "epoch": 2.8475090421187725, + "grad_norm": 1.2004224061965942, + "learning_rate": 7.198590024561407e-05, + "loss": 1.9382, + "step": 24406 + }, + { + "epoch": 2.847625714619064, + "grad_norm": 0.9601656198501587, + "learning_rate": 7.197313082114867e-05, + "loss": 2.021, + "step": 24407 + }, + { + "epoch": 2.847742387119356, + "grad_norm": 0.9946311712265015, + "learning_rate": 7.196036222112992e-05, + "loss": 1.8159, + "step": 24408 + }, + { + "epoch": 2.8478590596196476, + "grad_norm": 1.1546603441238403, + "learning_rate": 7.194759444569008e-05, + "loss": 1.9275, + "step": 24409 + }, + { + "epoch": 2.8479757321199393, + "grad_norm": 1.0184165239334106, + "learning_rate": 7.193482749496156e-05, + "loss": 1.9713, + "step": 24410 + }, + { + "epoch": 2.848092404620231, + "grad_norm": 1.0614274740219116, + "learning_rate": 7.192206136907678e-05, + "loss": 1.9226, + "step": 24411 + }, + { + "epoch": 2.8482090771205226, + "grad_norm": 1.1571002006530762, + "learning_rate": 7.190929606816798e-05, + "loss": 1.9108, + "step": 24412 + }, + { + "epoch": 2.8483257496208143, + "grad_norm": 1.1620301008224487, + "learning_rate": 7.189653159236758e-05, + "loss": 1.9715, + "step": 24413 + }, + { + "epoch": 2.848442422121106, + "grad_norm": 1.1140730381011963, + "learning_rate": 7.188376794180783e-05, + "loss": 2.0238, + "step": 24414 + }, + { + "epoch": 2.8485590946213977, + "grad_norm": 1.1023603677749634, + "learning_rate": 7.187100511662116e-05, + "loss": 1.8232, + "step": 24415 + }, + { + "epoch": 2.8486757671216894, + "grad_norm": 1.1048446893692017, + "learning_rate": 7.185824311693974e-05, + "loss": 1.8661, + "step": 24416 + }, + { + "epoch": 2.848792439621981, + "grad_norm": 1.1463240385055542, + "learning_rate": 7.1845481942896e-05, + "loss": 1.9078, + "step": 24417 + }, + { + "epoch": 2.8489091121222727, + "grad_norm": 1.2606266736984253, + "learning_rate": 7.183272159462212e-05, + "loss": 2.0461, + "step": 24418 + }, + { + "epoch": 2.8490257846225644, + "grad_norm": 1.058530330657959, + "learning_rate": 7.18199620722505e-05, + "loss": 1.9518, + "step": 24419 + }, + { + "epoch": 2.849142457122856, + "grad_norm": 1.1215124130249023, + "learning_rate": 7.180720337591337e-05, + "loss": 2.094, + "step": 24420 + }, + { + "epoch": 2.8492591296231478, + "grad_norm": 1.0367861986160278, + "learning_rate": 7.179444550574293e-05, + "loss": 1.9104, + "step": 24421 + }, + { + "epoch": 2.8493758021234394, + "grad_norm": 1.0840158462524414, + "learning_rate": 7.178168846187155e-05, + "loss": 2.0444, + "step": 24422 + }, + { + "epoch": 2.849492474623731, + "grad_norm": 1.0871145725250244, + "learning_rate": 7.176893224443137e-05, + "loss": 1.9905, + "step": 24423 + }, + { + "epoch": 2.849609147124023, + "grad_norm": 0.972414493560791, + "learning_rate": 7.175617685355475e-05, + "loss": 1.7758, + "step": 24424 + }, + { + "epoch": 2.8497258196243145, + "grad_norm": 1.0453070402145386, + "learning_rate": 7.17434222893738e-05, + "loss": 1.9585, + "step": 24425 + }, + { + "epoch": 2.849842492124606, + "grad_norm": 0.9818877577781677, + "learning_rate": 7.173066855202088e-05, + "loss": 1.8158, + "step": 24426 + }, + { + "epoch": 2.849959164624898, + "grad_norm": 0.9807240962982178, + "learning_rate": 7.171791564162806e-05, + "loss": 1.8618, + "step": 24427 + }, + { + "epoch": 2.8500758371251895, + "grad_norm": 1.1496691703796387, + "learning_rate": 7.170516355832768e-05, + "loss": 1.8516, + "step": 24428 + }, + { + "epoch": 2.850192509625481, + "grad_norm": 1.0901193618774414, + "learning_rate": 7.169241230225185e-05, + "loss": 2.0385, + "step": 24429 + }, + { + "epoch": 2.850309182125773, + "grad_norm": 1.1117639541625977, + "learning_rate": 7.167966187353283e-05, + "loss": 1.9644, + "step": 24430 + }, + { + "epoch": 2.8504258546260646, + "grad_norm": 1.1039620637893677, + "learning_rate": 7.16669122723027e-05, + "loss": 1.86, + "step": 24431 + }, + { + "epoch": 2.8505425271263563, + "grad_norm": 1.0633870363235474, + "learning_rate": 7.165416349869379e-05, + "loss": 1.872, + "step": 24432 + }, + { + "epoch": 2.850659199626648, + "grad_norm": 1.1519356966018677, + "learning_rate": 7.164141555283811e-05, + "loss": 2.0559, + "step": 24433 + }, + { + "epoch": 2.8507758721269396, + "grad_norm": 0.9162716865539551, + "learning_rate": 7.162866843486795e-05, + "loss": 1.8646, + "step": 24434 + }, + { + "epoch": 2.8508925446272313, + "grad_norm": 1.1950151920318604, + "learning_rate": 7.161592214491529e-05, + "loss": 1.9745, + "step": 24435 + }, + { + "epoch": 2.851009217127523, + "grad_norm": 1.102800726890564, + "learning_rate": 7.160317668311246e-05, + "loss": 2.0019, + "step": 24436 + }, + { + "epoch": 2.8511258896278147, + "grad_norm": 1.164207100868225, + "learning_rate": 7.159043204959145e-05, + "loss": 2.0967, + "step": 24437 + }, + { + "epoch": 2.8512425621281063, + "grad_norm": 1.1535449028015137, + "learning_rate": 7.157768824448447e-05, + "loss": 2.0785, + "step": 24438 + }, + { + "epoch": 2.851359234628398, + "grad_norm": 1.026382327079773, + "learning_rate": 7.15649452679236e-05, + "loss": 1.9688, + "step": 24439 + }, + { + "epoch": 2.8514759071286897, + "grad_norm": 1.0422242879867554, + "learning_rate": 7.15522031200409e-05, + "loss": 2.1012, + "step": 24440 + }, + { + "epoch": 2.8515925796289814, + "grad_norm": 0.9987968802452087, + "learning_rate": 7.153946180096855e-05, + "loss": 1.9263, + "step": 24441 + }, + { + "epoch": 2.851709252129273, + "grad_norm": 1.0824416875839233, + "learning_rate": 7.152672131083855e-05, + "loss": 1.9487, + "step": 24442 + }, + { + "epoch": 2.8518259246295647, + "grad_norm": 1.1604431867599487, + "learning_rate": 7.151398164978307e-05, + "loss": 2.082, + "step": 24443 + }, + { + "epoch": 2.8519425971298564, + "grad_norm": 1.0469970703125, + "learning_rate": 7.150124281793409e-05, + "loss": 1.7764, + "step": 24444 + }, + { + "epoch": 2.852059269630148, + "grad_norm": 1.1124097108840942, + "learning_rate": 7.148850481542376e-05, + "loss": 1.9626, + "step": 24445 + }, + { + "epoch": 2.85217594213044, + "grad_norm": 0.8918253779411316, + "learning_rate": 7.147576764238406e-05, + "loss": 1.8123, + "step": 24446 + }, + { + "epoch": 2.8522926146307315, + "grad_norm": 1.1830165386199951, + "learning_rate": 7.146303129894711e-05, + "loss": 1.9573, + "step": 24447 + }, + { + "epoch": 2.852409287131023, + "grad_norm": 1.1404705047607422, + "learning_rate": 7.145029578524484e-05, + "loss": 1.9068, + "step": 24448 + }, + { + "epoch": 2.852525959631315, + "grad_norm": 0.9882129430770874, + "learning_rate": 7.143756110140934e-05, + "loss": 1.8621, + "step": 24449 + }, + { + "epoch": 2.8526426321316065, + "grad_norm": 1.1853097677230835, + "learning_rate": 7.14248272475727e-05, + "loss": 2.021, + "step": 24450 + }, + { + "epoch": 2.852759304631898, + "grad_norm": 1.0663044452667236, + "learning_rate": 7.14120942238668e-05, + "loss": 1.8565, + "step": 24451 + }, + { + "epoch": 2.85287597713219, + "grad_norm": 1.0797841548919678, + "learning_rate": 7.139936203042375e-05, + "loss": 2.0029, + "step": 24452 + }, + { + "epoch": 2.8529926496324816, + "grad_norm": 1.03446364402771, + "learning_rate": 7.138663066737551e-05, + "loss": 1.8263, + "step": 24453 + }, + { + "epoch": 2.8531093221327732, + "grad_norm": 1.0883493423461914, + "learning_rate": 7.137390013485398e-05, + "loss": 1.893, + "step": 24454 + }, + { + "epoch": 2.853225994633065, + "grad_norm": 1.131172776222229, + "learning_rate": 7.136117043299126e-05, + "loss": 1.815, + "step": 24455 + }, + { + "epoch": 2.8533426671333566, + "grad_norm": 1.080195426940918, + "learning_rate": 7.134844156191928e-05, + "loss": 2.0839, + "step": 24456 + }, + { + "epoch": 2.8534593396336483, + "grad_norm": 1.1877466440200806, + "learning_rate": 7.133571352176991e-05, + "loss": 1.93, + "step": 24457 + }, + { + "epoch": 2.85357601213394, + "grad_norm": 1.030122995376587, + "learning_rate": 7.132298631267524e-05, + "loss": 1.9365, + "step": 24458 + }, + { + "epoch": 2.8536926846342316, + "grad_norm": 0.9582026600837708, + "learning_rate": 7.131025993476708e-05, + "loss": 1.7535, + "step": 24459 + }, + { + "epoch": 2.8538093571345233, + "grad_norm": 0.9012330174446106, + "learning_rate": 7.129753438817749e-05, + "loss": 1.6658, + "step": 24460 + }, + { + "epoch": 2.853926029634815, + "grad_norm": 1.113684058189392, + "learning_rate": 7.128480967303827e-05, + "loss": 1.947, + "step": 24461 + }, + { + "epoch": 2.8540427021351067, + "grad_norm": 1.1180495023727417, + "learning_rate": 7.127208578948141e-05, + "loss": 2.1349, + "step": 24462 + }, + { + "epoch": 2.8541593746353984, + "grad_norm": 1.1269900798797607, + "learning_rate": 7.125936273763885e-05, + "loss": 2.0172, + "step": 24463 + }, + { + "epoch": 2.85427604713569, + "grad_norm": 1.1584718227386475, + "learning_rate": 7.124664051764239e-05, + "loss": 2.05, + "step": 24464 + }, + { + "epoch": 2.8543927196359817, + "grad_norm": 1.0945504903793335, + "learning_rate": 7.123391912962403e-05, + "loss": 1.9159, + "step": 24465 + }, + { + "epoch": 2.8545093921362734, + "grad_norm": 1.2145618200302124, + "learning_rate": 7.122119857371554e-05, + "loss": 2.0086, + "step": 24466 + }, + { + "epoch": 2.854626064636565, + "grad_norm": 1.0603044033050537, + "learning_rate": 7.12084788500489e-05, + "loss": 1.7335, + "step": 24467 + }, + { + "epoch": 2.854742737136857, + "grad_norm": 1.0471134185791016, + "learning_rate": 7.119575995875588e-05, + "loss": 1.8877, + "step": 24468 + }, + { + "epoch": 2.8548594096371485, + "grad_norm": 1.0596134662628174, + "learning_rate": 7.118304189996841e-05, + "loss": 1.9575, + "step": 24469 + }, + { + "epoch": 2.85497608213744, + "grad_norm": 1.110478162765503, + "learning_rate": 7.117032467381825e-05, + "loss": 1.9094, + "step": 24470 + }, + { + "epoch": 2.855092754637732, + "grad_norm": 1.0365983247756958, + "learning_rate": 7.115760828043734e-05, + "loss": 1.9128, + "step": 24471 + }, + { + "epoch": 2.8552094271380235, + "grad_norm": 1.0118627548217773, + "learning_rate": 7.114489271995748e-05, + "loss": 1.7866, + "step": 24472 + }, + { + "epoch": 2.855326099638315, + "grad_norm": 1.2434773445129395, + "learning_rate": 7.113217799251047e-05, + "loss": 1.9596, + "step": 24473 + }, + { + "epoch": 2.855442772138607, + "grad_norm": 1.1099128723144531, + "learning_rate": 7.111946409822805e-05, + "loss": 1.9576, + "step": 24474 + }, + { + "epoch": 2.8555594446388985, + "grad_norm": 1.19852876663208, + "learning_rate": 7.110675103724213e-05, + "loss": 1.9933, + "step": 24475 + }, + { + "epoch": 2.8556761171391902, + "grad_norm": 1.0953075885772705, + "learning_rate": 7.10940388096845e-05, + "loss": 1.9553, + "step": 24476 + }, + { + "epoch": 2.855792789639482, + "grad_norm": 1.1470626592636108, + "learning_rate": 7.108132741568688e-05, + "loss": 1.8391, + "step": 24477 + }, + { + "epoch": 2.8559094621397736, + "grad_norm": 1.1731657981872559, + "learning_rate": 7.106861685538114e-05, + "loss": 1.8405, + "step": 24478 + }, + { + "epoch": 2.8560261346400653, + "grad_norm": 1.0905295610427856, + "learning_rate": 7.105590712889894e-05, + "loss": 1.8671, + "step": 24479 + }, + { + "epoch": 2.856142807140357, + "grad_norm": 1.286058783531189, + "learning_rate": 7.104319823637213e-05, + "loss": 2.13, + "step": 24480 + }, + { + "epoch": 2.8562594796406486, + "grad_norm": 0.9903110861778259, + "learning_rate": 7.103049017793239e-05, + "loss": 1.8159, + "step": 24481 + }, + { + "epoch": 2.8563761521409403, + "grad_norm": 1.1658693552017212, + "learning_rate": 7.101778295371156e-05, + "loss": 2.0091, + "step": 24482 + }, + { + "epoch": 2.856492824641232, + "grad_norm": 0.9659681916236877, + "learning_rate": 7.100507656384125e-05, + "loss": 1.9705, + "step": 24483 + }, + { + "epoch": 2.8566094971415237, + "grad_norm": 1.1038001775741577, + "learning_rate": 7.099237100845332e-05, + "loss": 1.8851, + "step": 24484 + }, + { + "epoch": 2.8567261696418154, + "grad_norm": 1.3328580856323242, + "learning_rate": 7.097966628767935e-05, + "loss": 2.1085, + "step": 24485 + }, + { + "epoch": 2.856842842142107, + "grad_norm": 0.9259812831878662, + "learning_rate": 7.096696240165118e-05, + "loss": 1.8039, + "step": 24486 + }, + { + "epoch": 2.8569595146423987, + "grad_norm": 1.1148276329040527, + "learning_rate": 7.09542593505004e-05, + "loss": 1.958, + "step": 24487 + }, + { + "epoch": 2.8570761871426904, + "grad_norm": 1.1671470403671265, + "learning_rate": 7.094155713435877e-05, + "loss": 1.9638, + "step": 24488 + }, + { + "epoch": 2.857192859642982, + "grad_norm": 1.195444941520691, + "learning_rate": 7.092885575335797e-05, + "loss": 2.1297, + "step": 24489 + }, + { + "epoch": 2.8573095321432738, + "grad_norm": 1.167907953262329, + "learning_rate": 7.091615520762961e-05, + "loss": 2.0399, + "step": 24490 + }, + { + "epoch": 2.8574262046435654, + "grad_norm": 1.1085271835327148, + "learning_rate": 7.090345549730545e-05, + "loss": 1.8683, + "step": 24491 + }, + { + "epoch": 2.857542877143857, + "grad_norm": 1.0413265228271484, + "learning_rate": 7.089075662251703e-05, + "loss": 2.0155, + "step": 24492 + }, + { + "epoch": 2.857659549644149, + "grad_norm": 1.2089179754257202, + "learning_rate": 7.087805858339612e-05, + "loss": 2.1553, + "step": 24493 + }, + { + "epoch": 2.8577762221444405, + "grad_norm": 1.1420326232910156, + "learning_rate": 7.086536138007424e-05, + "loss": 2.0138, + "step": 24494 + }, + { + "epoch": 2.857892894644732, + "grad_norm": 0.9823436737060547, + "learning_rate": 7.085266501268318e-05, + "loss": 1.8337, + "step": 24495 + }, + { + "epoch": 2.858009567145024, + "grad_norm": 1.1876767873764038, + "learning_rate": 7.083996948135437e-05, + "loss": 2.06, + "step": 24496 + }, + { + "epoch": 2.8581262396453155, + "grad_norm": 1.0507711172103882, + "learning_rate": 7.082727478621958e-05, + "loss": 1.8879, + "step": 24497 + }, + { + "epoch": 2.858242912145607, + "grad_norm": 1.206639289855957, + "learning_rate": 7.08145809274103e-05, + "loss": 1.9083, + "step": 24498 + }, + { + "epoch": 2.858359584645899, + "grad_norm": 1.0874377489089966, + "learning_rate": 7.080188790505824e-05, + "loss": 1.8149, + "step": 24499 + }, + { + "epoch": 2.8584762571461906, + "grad_norm": 1.3177520036697388, + "learning_rate": 7.078919571929487e-05, + "loss": 2.1241, + "step": 24500 + }, + { + "epoch": 2.8585929296464823, + "grad_norm": 1.421706199645996, + "learning_rate": 7.077650437025181e-05, + "loss": 2.0825, + "step": 24501 + }, + { + "epoch": 2.858709602146774, + "grad_norm": 1.0604463815689087, + "learning_rate": 7.076381385806071e-05, + "loss": 1.6304, + "step": 24502 + }, + { + "epoch": 2.8588262746470656, + "grad_norm": 1.195105791091919, + "learning_rate": 7.075112418285303e-05, + "loss": 1.967, + "step": 24503 + }, + { + "epoch": 2.8589429471473573, + "grad_norm": 1.095814824104309, + "learning_rate": 7.07384353447604e-05, + "loss": 1.9275, + "step": 24504 + }, + { + "epoch": 2.859059619647649, + "grad_norm": 1.1125677824020386, + "learning_rate": 7.07257473439143e-05, + "loss": 1.9682, + "step": 24505 + }, + { + "epoch": 2.8591762921479407, + "grad_norm": 1.1529937982559204, + "learning_rate": 7.071306018044626e-05, + "loss": 1.9133, + "step": 24506 + }, + { + "epoch": 2.8592929646482323, + "grad_norm": 1.3634520769119263, + "learning_rate": 7.070037385448789e-05, + "loss": 1.968, + "step": 24507 + }, + { + "epoch": 2.859409637148524, + "grad_norm": 0.9331654906272888, + "learning_rate": 7.068768836617063e-05, + "loss": 1.9508, + "step": 24508 + }, + { + "epoch": 2.8595263096488157, + "grad_norm": 1.064005732536316, + "learning_rate": 7.067500371562598e-05, + "loss": 2.0452, + "step": 24509 + }, + { + "epoch": 2.8596429821491074, + "grad_norm": 1.1592702865600586, + "learning_rate": 7.06623199029855e-05, + "loss": 1.8287, + "step": 24510 + }, + { + "epoch": 2.859759654649399, + "grad_norm": 1.0790672302246094, + "learning_rate": 7.064963692838062e-05, + "loss": 1.9992, + "step": 24511 + }, + { + "epoch": 2.8598763271496908, + "grad_norm": 1.0561357736587524, + "learning_rate": 7.06369547919429e-05, + "loss": 1.936, + "step": 24512 + }, + { + "epoch": 2.8599929996499824, + "grad_norm": 1.0718512535095215, + "learning_rate": 7.062427349380372e-05, + "loss": 1.9076, + "step": 24513 + }, + { + "epoch": 2.860109672150274, + "grad_norm": 1.2629846334457397, + "learning_rate": 7.061159303409459e-05, + "loss": 1.9629, + "step": 24514 + }, + { + "epoch": 2.860226344650566, + "grad_norm": 1.0993003845214844, + "learning_rate": 7.059891341294702e-05, + "loss": 2.0851, + "step": 24515 + }, + { + "epoch": 2.8603430171508575, + "grad_norm": 1.1149439811706543, + "learning_rate": 7.058623463049237e-05, + "loss": 1.9067, + "step": 24516 + }, + { + "epoch": 2.860459689651149, + "grad_norm": 1.150511384010315, + "learning_rate": 7.057355668686218e-05, + "loss": 1.905, + "step": 24517 + }, + { + "epoch": 2.860576362151441, + "grad_norm": 1.0242844820022583, + "learning_rate": 7.056087958218775e-05, + "loss": 2.0198, + "step": 24518 + }, + { + "epoch": 2.8606930346517325, + "grad_norm": 1.1585211753845215, + "learning_rate": 7.054820331660065e-05, + "loss": 1.9426, + "step": 24519 + }, + { + "epoch": 2.860809707152024, + "grad_norm": 1.209755301475525, + "learning_rate": 7.053552789023215e-05, + "loss": 2.0566, + "step": 24520 + }, + { + "epoch": 2.860926379652316, + "grad_norm": 1.1047372817993164, + "learning_rate": 7.052285330321377e-05, + "loss": 2.0385, + "step": 24521 + }, + { + "epoch": 2.8610430521526076, + "grad_norm": 1.1657525300979614, + "learning_rate": 7.051017955567682e-05, + "loss": 1.7805, + "step": 24522 + }, + { + "epoch": 2.8611597246528992, + "grad_norm": 1.0323405265808105, + "learning_rate": 7.049750664775276e-05, + "loss": 1.874, + "step": 24523 + }, + { + "epoch": 2.861276397153191, + "grad_norm": 1.1432939767837524, + "learning_rate": 7.048483457957297e-05, + "loss": 1.9688, + "step": 24524 + }, + { + "epoch": 2.8613930696534826, + "grad_norm": 1.1058669090270996, + "learning_rate": 7.047216335126876e-05, + "loss": 1.8638, + "step": 24525 + }, + { + "epoch": 2.8615097421537743, + "grad_norm": 0.9963906407356262, + "learning_rate": 7.045949296297145e-05, + "loss": 2.0593, + "step": 24526 + }, + { + "epoch": 2.861626414654066, + "grad_norm": 1.15596604347229, + "learning_rate": 7.044682341481249e-05, + "loss": 2.0886, + "step": 24527 + }, + { + "epoch": 2.8617430871543577, + "grad_norm": 1.0547672510147095, + "learning_rate": 7.043415470692321e-05, + "loss": 2.0604, + "step": 24528 + }, + { + "epoch": 2.8618597596546493, + "grad_norm": 1.2516714334487915, + "learning_rate": 7.042148683943489e-05, + "loss": 2.1872, + "step": 24529 + }, + { + "epoch": 2.861976432154941, + "grad_norm": 1.159676194190979, + "learning_rate": 7.040881981247895e-05, + "loss": 1.8936, + "step": 24530 + }, + { + "epoch": 2.8620931046552327, + "grad_norm": 0.9939154982566833, + "learning_rate": 7.039615362618658e-05, + "loss": 1.8761, + "step": 24531 + }, + { + "epoch": 2.8622097771555244, + "grad_norm": 1.0188530683517456, + "learning_rate": 7.038348828068924e-05, + "loss": 1.9344, + "step": 24532 + }, + { + "epoch": 2.862326449655816, + "grad_norm": 1.1617095470428467, + "learning_rate": 7.037082377611809e-05, + "loss": 1.8974, + "step": 24533 + }, + { + "epoch": 2.8624431221561077, + "grad_norm": 1.2031865119934082, + "learning_rate": 7.035816011260454e-05, + "loss": 1.8889, + "step": 24534 + }, + { + "epoch": 2.8625597946563994, + "grad_norm": 1.1390963792800903, + "learning_rate": 7.034549729027973e-05, + "loss": 1.9811, + "step": 24535 + }, + { + "epoch": 2.862676467156691, + "grad_norm": 1.0245863199234009, + "learning_rate": 7.033283530927511e-05, + "loss": 1.9214, + "step": 24536 + }, + { + "epoch": 2.862793139656983, + "grad_norm": 1.0439234972000122, + "learning_rate": 7.03201741697218e-05, + "loss": 1.8694, + "step": 24537 + }, + { + "epoch": 2.8629098121572745, + "grad_norm": 1.148889422416687, + "learning_rate": 7.030751387175114e-05, + "loss": 1.9734, + "step": 24538 + }, + { + "epoch": 2.863026484657566, + "grad_norm": 1.0664170980453491, + "learning_rate": 7.029485441549432e-05, + "loss": 1.9013, + "step": 24539 + }, + { + "epoch": 2.863143157157858, + "grad_norm": 1.157822608947754, + "learning_rate": 7.028219580108265e-05, + "loss": 1.8988, + "step": 24540 + }, + { + "epoch": 2.8632598296581495, + "grad_norm": 1.4849321842193604, + "learning_rate": 7.026953802864732e-05, + "loss": 2.056, + "step": 24541 + }, + { + "epoch": 2.863376502158441, + "grad_norm": 1.1292400360107422, + "learning_rate": 7.02568810983195e-05, + "loss": 1.9365, + "step": 24542 + }, + { + "epoch": 2.863493174658733, + "grad_norm": 1.0848296880722046, + "learning_rate": 7.024422501023049e-05, + "loss": 1.8381, + "step": 24543 + }, + { + "epoch": 2.8636098471590246, + "grad_norm": 1.0724910497665405, + "learning_rate": 7.023156976451142e-05, + "loss": 1.9452, + "step": 24544 + }, + { + "epoch": 2.8637265196593162, + "grad_norm": 1.0735106468200684, + "learning_rate": 7.021891536129357e-05, + "loss": 1.7187, + "step": 24545 + }, + { + "epoch": 2.863843192159608, + "grad_norm": 1.1514254808425903, + "learning_rate": 7.020626180070804e-05, + "loss": 1.9372, + "step": 24546 + }, + { + "epoch": 2.8639598646598996, + "grad_norm": 1.3254640102386475, + "learning_rate": 7.01936090828861e-05, + "loss": 2.1238, + "step": 24547 + }, + { + "epoch": 2.8640765371601913, + "grad_norm": 1.154624104499817, + "learning_rate": 7.01809572079588e-05, + "loss": 1.9537, + "step": 24548 + }, + { + "epoch": 2.864193209660483, + "grad_norm": 0.989569902420044, + "learning_rate": 7.016830617605743e-05, + "loss": 1.8976, + "step": 24549 + }, + { + "epoch": 2.8643098821607746, + "grad_norm": 1.1159803867340088, + "learning_rate": 7.015565598731305e-05, + "loss": 2.0202, + "step": 24550 + }, + { + "epoch": 2.8644265546610663, + "grad_norm": 1.209540605545044, + "learning_rate": 7.014300664185686e-05, + "loss": 2.0517, + "step": 24551 + }, + { + "epoch": 2.864543227161358, + "grad_norm": 1.1374177932739258, + "learning_rate": 7.013035813981992e-05, + "loss": 1.8412, + "step": 24552 + }, + { + "epoch": 2.8646598996616497, + "grad_norm": 1.1518871784210205, + "learning_rate": 7.01177104813334e-05, + "loss": 1.9754, + "step": 24553 + }, + { + "epoch": 2.8647765721619414, + "grad_norm": 1.1742901802062988, + "learning_rate": 7.010506366652848e-05, + "loss": 1.9742, + "step": 24554 + }, + { + "epoch": 2.864893244662233, + "grad_norm": 1.1721400022506714, + "learning_rate": 7.009241769553615e-05, + "loss": 2.0892, + "step": 24555 + }, + { + "epoch": 2.8650099171625247, + "grad_norm": 1.1837694644927979, + "learning_rate": 7.007977256848764e-05, + "loss": 1.9671, + "step": 24556 + }, + { + "epoch": 2.8651265896628164, + "grad_norm": 1.1977916955947876, + "learning_rate": 7.006712828551396e-05, + "loss": 1.9683, + "step": 24557 + }, + { + "epoch": 2.865243262163108, + "grad_norm": 1.168705940246582, + "learning_rate": 7.00544848467462e-05, + "loss": 2.1778, + "step": 24558 + }, + { + "epoch": 2.8653599346633998, + "grad_norm": 1.087767481803894, + "learning_rate": 7.004184225231537e-05, + "loss": 1.9445, + "step": 24559 + }, + { + "epoch": 2.8654766071636915, + "grad_norm": 1.0611945390701294, + "learning_rate": 7.002920050235265e-05, + "loss": 1.6466, + "step": 24560 + }, + { + "epoch": 2.865593279663983, + "grad_norm": 1.1996853351593018, + "learning_rate": 7.001655959698902e-05, + "loss": 1.8693, + "step": 24561 + }, + { + "epoch": 2.865709952164275, + "grad_norm": 1.0541102886199951, + "learning_rate": 7.000391953635557e-05, + "loss": 1.9892, + "step": 24562 + }, + { + "epoch": 2.8658266246645665, + "grad_norm": 1.1010271310806274, + "learning_rate": 6.999128032058329e-05, + "loss": 1.9186, + "step": 24563 + }, + { + "epoch": 2.865943297164858, + "grad_norm": 1.2587357759475708, + "learning_rate": 6.997864194980328e-05, + "loss": 2.1642, + "step": 24564 + }, + { + "epoch": 2.86605996966515, + "grad_norm": 1.10434889793396, + "learning_rate": 6.996600442414647e-05, + "loss": 2.1022, + "step": 24565 + }, + { + "epoch": 2.8661766421654415, + "grad_norm": 1.1628056764602661, + "learning_rate": 6.995336774374392e-05, + "loss": 1.9835, + "step": 24566 + }, + { + "epoch": 2.8662933146657332, + "grad_norm": 0.9827075600624084, + "learning_rate": 6.994073190872668e-05, + "loss": 1.9805, + "step": 24567 + }, + { + "epoch": 2.866409987166025, + "grad_norm": 1.0278756618499756, + "learning_rate": 6.992809691922565e-05, + "loss": 1.8782, + "step": 24568 + }, + { + "epoch": 2.8665266596663166, + "grad_norm": 0.9855588674545288, + "learning_rate": 6.991546277537195e-05, + "loss": 1.7892, + "step": 24569 + }, + { + "epoch": 2.8666433321666083, + "grad_norm": 1.1858456134796143, + "learning_rate": 6.990282947729637e-05, + "loss": 1.839, + "step": 24570 + }, + { + "epoch": 2.8667600046669, + "grad_norm": 1.098821759223938, + "learning_rate": 6.989019702513007e-05, + "loss": 1.8908, + "step": 24571 + }, + { + "epoch": 2.8668766771671916, + "grad_norm": 1.2309699058532715, + "learning_rate": 6.987756541900385e-05, + "loss": 2.1277, + "step": 24572 + }, + { + "epoch": 2.8669933496674833, + "grad_norm": 1.1030299663543701, + "learning_rate": 6.986493465904877e-05, + "loss": 1.8498, + "step": 24573 + }, + { + "epoch": 2.867110022167775, + "grad_norm": 1.1748381853103638, + "learning_rate": 6.985230474539576e-05, + "loss": 1.896, + "step": 24574 + }, + { + "epoch": 2.8672266946680667, + "grad_norm": 1.0335756540298462, + "learning_rate": 6.983967567817563e-05, + "loss": 2.0578, + "step": 24575 + }, + { + "epoch": 2.8673433671683584, + "grad_norm": 1.1296602487564087, + "learning_rate": 6.982704745751949e-05, + "loss": 1.9269, + "step": 24576 + }, + { + "epoch": 2.86746003966865, + "grad_norm": 1.0401891469955444, + "learning_rate": 6.981442008355814e-05, + "loss": 2.0921, + "step": 24577 + }, + { + "epoch": 2.8675767121689417, + "grad_norm": 0.9418867230415344, + "learning_rate": 6.980179355642247e-05, + "loss": 1.7253, + "step": 24578 + }, + { + "epoch": 2.8676933846692334, + "grad_norm": 1.1633813381195068, + "learning_rate": 6.978916787624342e-05, + "loss": 2.1825, + "step": 24579 + }, + { + "epoch": 2.867810057169525, + "grad_norm": 1.5650385618209839, + "learning_rate": 6.977654304315193e-05, + "loss": 1.9072, + "step": 24580 + }, + { + "epoch": 2.8679267296698168, + "grad_norm": 1.0909314155578613, + "learning_rate": 6.976391905727876e-05, + "loss": 2.0105, + "step": 24581 + }, + { + "epoch": 2.8680434021701084, + "grad_norm": 1.0579251050949097, + "learning_rate": 6.97512959187549e-05, + "loss": 1.9801, + "step": 24582 + }, + { + "epoch": 2.8681600746704, + "grad_norm": 1.0950926542282104, + "learning_rate": 6.973867362771114e-05, + "loss": 1.937, + "step": 24583 + }, + { + "epoch": 2.868276747170692, + "grad_norm": 1.2421412467956543, + "learning_rate": 6.972605218427836e-05, + "loss": 2.1166, + "step": 24584 + }, + { + "epoch": 2.8683934196709835, + "grad_norm": 1.1951279640197754, + "learning_rate": 6.97134315885874e-05, + "loss": 2.0846, + "step": 24585 + }, + { + "epoch": 2.868510092171275, + "grad_norm": 1.2224465608596802, + "learning_rate": 6.97008118407691e-05, + "loss": 2.1215, + "step": 24586 + }, + { + "epoch": 2.868626764671567, + "grad_norm": 1.2761517763137817, + "learning_rate": 6.968819294095426e-05, + "loss": 1.966, + "step": 24587 + }, + { + "epoch": 2.8687434371718585, + "grad_norm": 1.2059930562973022, + "learning_rate": 6.967557488927377e-05, + "loss": 2.0627, + "step": 24588 + }, + { + "epoch": 2.86886010967215, + "grad_norm": 1.1060048341751099, + "learning_rate": 6.966295768585835e-05, + "loss": 1.8753, + "step": 24589 + }, + { + "epoch": 2.868976782172442, + "grad_norm": 1.071471095085144, + "learning_rate": 6.965034133083889e-05, + "loss": 1.7647, + "step": 24590 + }, + { + "epoch": 2.8690934546727336, + "grad_norm": 1.0135343074798584, + "learning_rate": 6.963772582434609e-05, + "loss": 1.7408, + "step": 24591 + }, + { + "epoch": 2.8692101271730253, + "grad_norm": 0.9831835627555847, + "learning_rate": 6.962511116651082e-05, + "loss": 2.0341, + "step": 24592 + }, + { + "epoch": 2.869326799673317, + "grad_norm": 1.1506543159484863, + "learning_rate": 6.961249735746383e-05, + "loss": 1.9826, + "step": 24593 + }, + { + "epoch": 2.8694434721736086, + "grad_norm": 1.1935728788375854, + "learning_rate": 6.959988439733581e-05, + "loss": 2.0023, + "step": 24594 + }, + { + "epoch": 2.8695601446739003, + "grad_norm": 1.1565020084381104, + "learning_rate": 6.958727228625765e-05, + "loss": 1.892, + "step": 24595 + }, + { + "epoch": 2.869676817174192, + "grad_norm": 1.0364760160446167, + "learning_rate": 6.957466102435995e-05, + "loss": 1.8498, + "step": 24596 + }, + { + "epoch": 2.8697934896744837, + "grad_norm": 1.1114122867584229, + "learning_rate": 6.956205061177358e-05, + "loss": 1.9186, + "step": 24597 + }, + { + "epoch": 2.8699101621747753, + "grad_norm": 1.2751086950302124, + "learning_rate": 6.954944104862919e-05, + "loss": 1.9741, + "step": 24598 + }, + { + "epoch": 2.870026834675067, + "grad_norm": 1.0439444780349731, + "learning_rate": 6.953683233505756e-05, + "loss": 1.8994, + "step": 24599 + }, + { + "epoch": 2.8701435071753587, + "grad_norm": 1.1998004913330078, + "learning_rate": 6.952422447118932e-05, + "loss": 1.8523, + "step": 24600 + }, + { + "epoch": 2.8702601796756504, + "grad_norm": 1.0089277029037476, + "learning_rate": 6.951161745715529e-05, + "loss": 1.7927, + "step": 24601 + }, + { + "epoch": 2.870376852175942, + "grad_norm": 0.9119684100151062, + "learning_rate": 6.949901129308605e-05, + "loss": 1.9229, + "step": 24602 + }, + { + "epoch": 2.8704935246762338, + "grad_norm": 1.0564686059951782, + "learning_rate": 6.948640597911238e-05, + "loss": 1.9908, + "step": 24603 + }, + { + "epoch": 2.8706101971765254, + "grad_norm": 0.9616572856903076, + "learning_rate": 6.947380151536488e-05, + "loss": 1.8758, + "step": 24604 + }, + { + "epoch": 2.870726869676817, + "grad_norm": 0.969740092754364, + "learning_rate": 6.946119790197431e-05, + "loss": 2.0707, + "step": 24605 + }, + { + "epoch": 2.870843542177109, + "grad_norm": 0.9397979378700256, + "learning_rate": 6.94485951390712e-05, + "loss": 1.8497, + "step": 24606 + }, + { + "epoch": 2.8709602146774005, + "grad_norm": 0.9976321458816528, + "learning_rate": 6.943599322678635e-05, + "loss": 1.899, + "step": 24607 + }, + { + "epoch": 2.871076887177692, + "grad_norm": 1.0269798040390015, + "learning_rate": 6.942339216525028e-05, + "loss": 1.8393, + "step": 24608 + }, + { + "epoch": 2.871193559677984, + "grad_norm": 0.9921414852142334, + "learning_rate": 6.941079195459372e-05, + "loss": 1.8217, + "step": 24609 + }, + { + "epoch": 2.8713102321782755, + "grad_norm": 0.9782779812812805, + "learning_rate": 6.939819259494728e-05, + "loss": 1.9259, + "step": 24610 + }, + { + "epoch": 2.871426904678567, + "grad_norm": 1.0840489864349365, + "learning_rate": 6.938559408644146e-05, + "loss": 1.868, + "step": 24611 + }, + { + "epoch": 2.871543577178859, + "grad_norm": 1.0109163522720337, + "learning_rate": 6.937299642920701e-05, + "loss": 1.9008, + "step": 24612 + }, + { + "epoch": 2.8716602496791506, + "grad_norm": 1.1986563205718994, + "learning_rate": 6.936039962337442e-05, + "loss": 2.0038, + "step": 24613 + }, + { + "epoch": 2.8717769221794422, + "grad_norm": 1.1640132665634155, + "learning_rate": 6.934780366907438e-05, + "loss": 2.0142, + "step": 24614 + }, + { + "epoch": 2.871893594679734, + "grad_norm": 1.188593864440918, + "learning_rate": 6.933520856643738e-05, + "loss": 2.181, + "step": 24615 + }, + { + "epoch": 2.8720102671800256, + "grad_norm": 1.1618720293045044, + "learning_rate": 6.932261431559409e-05, + "loss": 1.8959, + "step": 24616 + }, + { + "epoch": 2.8721269396803173, + "grad_norm": 1.0653678178787231, + "learning_rate": 6.931002091667495e-05, + "loss": 1.987, + "step": 24617 + }, + { + "epoch": 2.872243612180609, + "grad_norm": 1.1552910804748535, + "learning_rate": 6.929742836981066e-05, + "loss": 2.006, + "step": 24618 + }, + { + "epoch": 2.8723602846809007, + "grad_norm": 0.9897218942642212, + "learning_rate": 6.928483667513163e-05, + "loss": 2.0047, + "step": 24619 + }, + { + "epoch": 2.8724769571811923, + "grad_norm": 1.3269602060317993, + "learning_rate": 6.927224583276845e-05, + "loss": 2.0353, + "step": 24620 + }, + { + "epoch": 2.872593629681484, + "grad_norm": 1.0188159942626953, + "learning_rate": 6.925965584285169e-05, + "loss": 2.0213, + "step": 24621 + }, + { + "epoch": 2.8727103021817757, + "grad_norm": 1.0685234069824219, + "learning_rate": 6.924706670551181e-05, + "loss": 1.8515, + "step": 24622 + }, + { + "epoch": 2.8728269746820674, + "grad_norm": 1.0998183488845825, + "learning_rate": 6.923447842087936e-05, + "loss": 2.0523, + "step": 24623 + }, + { + "epoch": 2.872943647182359, + "grad_norm": 0.9687432050704956, + "learning_rate": 6.922189098908478e-05, + "loss": 1.8241, + "step": 24624 + }, + { + "epoch": 2.8730603196826507, + "grad_norm": 1.0661516189575195, + "learning_rate": 6.920930441025869e-05, + "loss": 1.9721, + "step": 24625 + }, + { + "epoch": 2.8731769921829424, + "grad_norm": 0.9233605265617371, + "learning_rate": 6.919671868453146e-05, + "loss": 1.8059, + "step": 24626 + }, + { + "epoch": 2.873293664683234, + "grad_norm": 1.1213027238845825, + "learning_rate": 6.91841338120336e-05, + "loss": 2.1664, + "step": 24627 + }, + { + "epoch": 2.873410337183526, + "grad_norm": 1.2221323251724243, + "learning_rate": 6.917154979289553e-05, + "loss": 2.0998, + "step": 24628 + }, + { + "epoch": 2.8735270096838175, + "grad_norm": 0.9708958268165588, + "learning_rate": 6.915896662724777e-05, + "loss": 1.9165, + "step": 24629 + }, + { + "epoch": 2.873643682184109, + "grad_norm": 0.9621061086654663, + "learning_rate": 6.914638431522072e-05, + "loss": 1.8953, + "step": 24630 + }, + { + "epoch": 2.873760354684401, + "grad_norm": 1.0735846757888794, + "learning_rate": 6.91338028569449e-05, + "loss": 1.7395, + "step": 24631 + }, + { + "epoch": 2.8738770271846925, + "grad_norm": 1.104992389678955, + "learning_rate": 6.912122225255064e-05, + "loss": 1.9522, + "step": 24632 + }, + { + "epoch": 2.873993699684984, + "grad_norm": 1.0836378335952759, + "learning_rate": 6.910864250216839e-05, + "loss": 2.1778, + "step": 24633 + }, + { + "epoch": 2.874110372185276, + "grad_norm": 1.1411336660385132, + "learning_rate": 6.909606360592865e-05, + "loss": 1.9062, + "step": 24634 + }, + { + "epoch": 2.8742270446855676, + "grad_norm": 1.0627269744873047, + "learning_rate": 6.90834855639617e-05, + "loss": 2.0629, + "step": 24635 + }, + { + "epoch": 2.8743437171858592, + "grad_norm": 1.0931349992752075, + "learning_rate": 6.907090837639803e-05, + "loss": 2.1144, + "step": 24636 + }, + { + "epoch": 2.874460389686151, + "grad_norm": 1.4033982753753662, + "learning_rate": 6.905833204336793e-05, + "loss": 1.942, + "step": 24637 + }, + { + "epoch": 2.8745770621864426, + "grad_norm": 1.3220943212509155, + "learning_rate": 6.904575656500189e-05, + "loss": 1.9581, + "step": 24638 + }, + { + "epoch": 2.8746937346867343, + "grad_norm": 1.0192041397094727, + "learning_rate": 6.903318194143017e-05, + "loss": 1.9476, + "step": 24639 + }, + { + "epoch": 2.874810407187026, + "grad_norm": 1.0545052289962769, + "learning_rate": 6.902060817278327e-05, + "loss": 1.8789, + "step": 24640 + }, + { + "epoch": 2.8749270796873176, + "grad_norm": 0.9633917808532715, + "learning_rate": 6.900803525919136e-05, + "loss": 1.7779, + "step": 24641 + }, + { + "epoch": 2.8750437521876093, + "grad_norm": 1.0862303972244263, + "learning_rate": 6.899546320078494e-05, + "loss": 1.9122, + "step": 24642 + }, + { + "epoch": 2.875160424687901, + "grad_norm": 1.0977283716201782, + "learning_rate": 6.898289199769429e-05, + "loss": 1.8505, + "step": 24643 + }, + { + "epoch": 2.8752770971881927, + "grad_norm": 0.9740467071533203, + "learning_rate": 6.897032165004967e-05, + "loss": 1.6738, + "step": 24644 + }, + { + "epoch": 2.8753937696884844, + "grad_norm": 1.141897201538086, + "learning_rate": 6.895775215798148e-05, + "loss": 1.8734, + "step": 24645 + }, + { + "epoch": 2.875510442188776, + "grad_norm": 1.149958610534668, + "learning_rate": 6.894518352161996e-05, + "loss": 1.9431, + "step": 24646 + }, + { + "epoch": 2.8756271146890677, + "grad_norm": 1.020745038986206, + "learning_rate": 6.893261574109549e-05, + "loss": 1.8244, + "step": 24647 + }, + { + "epoch": 2.8757437871893594, + "grad_norm": 1.1851065158843994, + "learning_rate": 6.892004881653827e-05, + "loss": 2.0322, + "step": 24648 + }, + { + "epoch": 2.875860459689651, + "grad_norm": 1.4375022649765015, + "learning_rate": 6.890748274807866e-05, + "loss": 2.151, + "step": 24649 + }, + { + "epoch": 2.8759771321899428, + "grad_norm": 1.1153948307037354, + "learning_rate": 6.889491753584684e-05, + "loss": 1.8654, + "step": 24650 + }, + { + "epoch": 2.8760938046902345, + "grad_norm": 1.0836865901947021, + "learning_rate": 6.88823531799732e-05, + "loss": 1.8854, + "step": 24651 + }, + { + "epoch": 2.876210477190526, + "grad_norm": 1.1431256532669067, + "learning_rate": 6.886978968058786e-05, + "loss": 1.9045, + "step": 24652 + }, + { + "epoch": 2.876327149690818, + "grad_norm": 1.0414564609527588, + "learning_rate": 6.885722703782116e-05, + "loss": 1.9278, + "step": 24653 + }, + { + "epoch": 2.8764438221911095, + "grad_norm": 1.0571494102478027, + "learning_rate": 6.884466525180325e-05, + "loss": 1.903, + "step": 24654 + }, + { + "epoch": 2.876560494691401, + "grad_norm": 1.0598978996276855, + "learning_rate": 6.883210432266446e-05, + "loss": 1.971, + "step": 24655 + }, + { + "epoch": 2.876677167191693, + "grad_norm": 1.2082624435424805, + "learning_rate": 6.88195442505349e-05, + "loss": 2.1442, + "step": 24656 + }, + { + "epoch": 2.8767938396919845, + "grad_norm": 1.135241150856018, + "learning_rate": 6.880698503554488e-05, + "loss": 1.8243, + "step": 24657 + }, + { + "epoch": 2.8769105121922762, + "grad_norm": 0.9530935287475586, + "learning_rate": 6.879442667782451e-05, + "loss": 1.9149, + "step": 24658 + }, + { + "epoch": 2.877027184692568, + "grad_norm": 1.232161283493042, + "learning_rate": 6.878186917750408e-05, + "loss": 1.7582, + "step": 24659 + }, + { + "epoch": 2.8771438571928596, + "grad_norm": 1.0541512966156006, + "learning_rate": 6.876931253471365e-05, + "loss": 2.0647, + "step": 24660 + }, + { + "epoch": 2.8772605296931513, + "grad_norm": 0.9326372742652893, + "learning_rate": 6.875675674958352e-05, + "loss": 1.983, + "step": 24661 + }, + { + "epoch": 2.877377202193443, + "grad_norm": 1.1528847217559814, + "learning_rate": 6.874420182224379e-05, + "loss": 1.9946, + "step": 24662 + }, + { + "epoch": 2.8774938746937346, + "grad_norm": 1.087585687637329, + "learning_rate": 6.873164775282455e-05, + "loss": 1.8973, + "step": 24663 + }, + { + "epoch": 2.8776105471940263, + "grad_norm": 1.1723685264587402, + "learning_rate": 6.871909454145609e-05, + "loss": 1.9647, + "step": 24664 + }, + { + "epoch": 2.877727219694318, + "grad_norm": 1.2582789659500122, + "learning_rate": 6.870654218826842e-05, + "loss": 1.989, + "step": 24665 + }, + { + "epoch": 2.8778438921946097, + "grad_norm": 0.8995254039764404, + "learning_rate": 6.869399069339177e-05, + "loss": 1.602, + "step": 24666 + }, + { + "epoch": 2.8779605646949014, + "grad_norm": 1.0263687372207642, + "learning_rate": 6.868144005695617e-05, + "loss": 1.8441, + "step": 24667 + }, + { + "epoch": 2.878077237195193, + "grad_norm": 1.2652430534362793, + "learning_rate": 6.866889027909181e-05, + "loss": 1.9386, + "step": 24668 + }, + { + "epoch": 2.8781939096954847, + "grad_norm": 1.2135227918624878, + "learning_rate": 6.86563413599287e-05, + "loss": 1.9396, + "step": 24669 + }, + { + "epoch": 2.8783105821957764, + "grad_norm": 1.1332210302352905, + "learning_rate": 6.864379329959705e-05, + "loss": 1.8785, + "step": 24670 + }, + { + "epoch": 2.878427254696068, + "grad_norm": 1.0956155061721802, + "learning_rate": 6.863124609822684e-05, + "loss": 2.1647, + "step": 24671 + }, + { + "epoch": 2.8785439271963598, + "grad_norm": 0.9735612273216248, + "learning_rate": 6.861869975594819e-05, + "loss": 1.837, + "step": 24672 + }, + { + "epoch": 2.8786605996966514, + "grad_norm": 1.0024114847183228, + "learning_rate": 6.86061542728912e-05, + "loss": 1.8578, + "step": 24673 + }, + { + "epoch": 2.878777272196943, + "grad_norm": 1.0069704055786133, + "learning_rate": 6.859360964918584e-05, + "loss": 1.7656, + "step": 24674 + }, + { + "epoch": 2.878893944697235, + "grad_norm": 1.050265908241272, + "learning_rate": 6.85810658849623e-05, + "loss": 1.9822, + "step": 24675 + }, + { + "epoch": 2.8790106171975265, + "grad_norm": 1.2415891885757446, + "learning_rate": 6.856852298035044e-05, + "loss": 1.9195, + "step": 24676 + }, + { + "epoch": 2.879127289697818, + "grad_norm": 0.9967816472053528, + "learning_rate": 6.855598093548045e-05, + "loss": 1.828, + "step": 24677 + }, + { + "epoch": 2.87924396219811, + "grad_norm": 1.0857353210449219, + "learning_rate": 6.85434397504823e-05, + "loss": 1.8131, + "step": 24678 + }, + { + "epoch": 2.8793606346984015, + "grad_norm": 1.0250242948532104, + "learning_rate": 6.853089942548596e-05, + "loss": 1.9145, + "step": 24679 + }, + { + "epoch": 2.879477307198693, + "grad_norm": 1.1090269088745117, + "learning_rate": 6.851835996062142e-05, + "loss": 2.0926, + "step": 24680 + }, + { + "epoch": 2.879593979698985, + "grad_norm": 1.133575439453125, + "learning_rate": 6.850582135601877e-05, + "loss": 1.9357, + "step": 24681 + }, + { + "epoch": 2.8797106521992766, + "grad_norm": 1.137115716934204, + "learning_rate": 6.84932836118079e-05, + "loss": 1.8823, + "step": 24682 + }, + { + "epoch": 2.8798273246995683, + "grad_norm": 1.147433876991272, + "learning_rate": 6.848074672811888e-05, + "loss": 1.9056, + "step": 24683 + }, + { + "epoch": 2.87994399719986, + "grad_norm": 1.254689335823059, + "learning_rate": 6.846821070508158e-05, + "loss": 2.0951, + "step": 24684 + }, + { + "epoch": 2.8800606697001516, + "grad_norm": 1.2045538425445557, + "learning_rate": 6.8455675542826e-05, + "loss": 2.0289, + "step": 24685 + }, + { + "epoch": 2.8801773422004433, + "grad_norm": 1.1008273363113403, + "learning_rate": 6.844314124148216e-05, + "loss": 2.0015, + "step": 24686 + }, + { + "epoch": 2.880294014700735, + "grad_norm": 1.0964347124099731, + "learning_rate": 6.843060780117989e-05, + "loss": 1.7921, + "step": 24687 + }, + { + "epoch": 2.8804106872010267, + "grad_norm": 1.1128062009811401, + "learning_rate": 6.841807522204922e-05, + "loss": 1.7834, + "step": 24688 + }, + { + "epoch": 2.8805273597013183, + "grad_norm": 1.0933656692504883, + "learning_rate": 6.840554350421999e-05, + "loss": 1.9181, + "step": 24689 + }, + { + "epoch": 2.88064403220161, + "grad_norm": 1.2559415102005005, + "learning_rate": 6.83930126478222e-05, + "loss": 1.8552, + "step": 24690 + }, + { + "epoch": 2.8807607047019017, + "grad_norm": 1.0543456077575684, + "learning_rate": 6.838048265298565e-05, + "loss": 1.8993, + "step": 24691 + }, + { + "epoch": 2.8808773772021934, + "grad_norm": 1.0815293788909912, + "learning_rate": 6.836795351984036e-05, + "loss": 1.947, + "step": 24692 + }, + { + "epoch": 2.880994049702485, + "grad_norm": 1.056463360786438, + "learning_rate": 6.83554252485161e-05, + "loss": 1.9561, + "step": 24693 + }, + { + "epoch": 2.8811107222027768, + "grad_norm": 0.9980660080909729, + "learning_rate": 6.834289783914284e-05, + "loss": 1.8679, + "step": 24694 + }, + { + "epoch": 2.8812273947030684, + "grad_norm": 1.2283285856246948, + "learning_rate": 6.833037129185043e-05, + "loss": 1.9793, + "step": 24695 + }, + { + "epoch": 2.88134406720336, + "grad_norm": 1.0186411142349243, + "learning_rate": 6.831784560676868e-05, + "loss": 1.868, + "step": 24696 + }, + { + "epoch": 2.881460739703652, + "grad_norm": 1.155120611190796, + "learning_rate": 6.830532078402749e-05, + "loss": 1.9583, + "step": 24697 + }, + { + "epoch": 2.8815774122039435, + "grad_norm": 1.1451443433761597, + "learning_rate": 6.829279682375668e-05, + "loss": 1.7599, + "step": 24698 + }, + { + "epoch": 2.881694084704235, + "grad_norm": 0.9754831790924072, + "learning_rate": 6.828027372608613e-05, + "loss": 1.8281, + "step": 24699 + }, + { + "epoch": 2.881810757204527, + "grad_norm": 1.0400933027267456, + "learning_rate": 6.826775149114557e-05, + "loss": 1.89, + "step": 24700 + }, + { + "epoch": 2.8819274297048185, + "grad_norm": 1.0764285326004028, + "learning_rate": 6.825523011906496e-05, + "loss": 1.9765, + "step": 24701 + }, + { + "epoch": 2.88204410220511, + "grad_norm": 1.0792324542999268, + "learning_rate": 6.824270960997396e-05, + "loss": 1.9782, + "step": 24702 + }, + { + "epoch": 2.882160774705402, + "grad_norm": 1.081531286239624, + "learning_rate": 6.823018996400247e-05, + "loss": 1.8495, + "step": 24703 + }, + { + "epoch": 2.8822774472056936, + "grad_norm": 1.1646192073822021, + "learning_rate": 6.821767118128022e-05, + "loss": 2.0134, + "step": 24704 + }, + { + "epoch": 2.8823941197059852, + "grad_norm": 1.228289246559143, + "learning_rate": 6.820515326193707e-05, + "loss": 1.9082, + "step": 24705 + }, + { + "epoch": 2.882510792206277, + "grad_norm": 1.0847774744033813, + "learning_rate": 6.819263620610265e-05, + "loss": 1.9772, + "step": 24706 + }, + { + "epoch": 2.8826274647065686, + "grad_norm": 1.0399304628372192, + "learning_rate": 6.81801200139069e-05, + "loss": 2.1218, + "step": 24707 + }, + { + "epoch": 2.8827441372068603, + "grad_norm": 1.3884023427963257, + "learning_rate": 6.816760468547941e-05, + "loss": 2.1031, + "step": 24708 + }, + { + "epoch": 2.882860809707152, + "grad_norm": 1.1294656991958618, + "learning_rate": 6.815509022095007e-05, + "loss": 2.0338, + "step": 24709 + }, + { + "epoch": 2.8829774822074437, + "grad_norm": 1.064534068107605, + "learning_rate": 6.814257662044848e-05, + "loss": 1.9422, + "step": 24710 + }, + { + "epoch": 2.8830941547077353, + "grad_norm": 1.3450862169265747, + "learning_rate": 6.813006388410451e-05, + "loss": 2.0255, + "step": 24711 + }, + { + "epoch": 2.883210827208027, + "grad_norm": 0.9979971051216125, + "learning_rate": 6.811755201204776e-05, + "loss": 1.9428, + "step": 24712 + }, + { + "epoch": 2.8833274997083187, + "grad_norm": 1.1579967737197876, + "learning_rate": 6.810504100440797e-05, + "loss": 2.0071, + "step": 24713 + }, + { + "epoch": 2.8834441722086104, + "grad_norm": 1.0024714469909668, + "learning_rate": 6.809253086131489e-05, + "loss": 2.0093, + "step": 24714 + }, + { + "epoch": 2.883560844708902, + "grad_norm": 1.1478216648101807, + "learning_rate": 6.808002158289812e-05, + "loss": 2.0869, + "step": 24715 + }, + { + "epoch": 2.8836775172091937, + "grad_norm": 0.9894187450408936, + "learning_rate": 6.806751316928743e-05, + "loss": 1.7641, + "step": 24716 + }, + { + "epoch": 2.8837941897094854, + "grad_norm": 1.2326774597167969, + "learning_rate": 6.805500562061243e-05, + "loss": 1.923, + "step": 24717 + }, + { + "epoch": 2.883910862209777, + "grad_norm": 1.062198519706726, + "learning_rate": 6.804249893700285e-05, + "loss": 1.9194, + "step": 24718 + }, + { + "epoch": 2.884027534710069, + "grad_norm": 1.0739301443099976, + "learning_rate": 6.802999311858827e-05, + "loss": 1.8776, + "step": 24719 + }, + { + "epoch": 2.8841442072103605, + "grad_norm": 1.1740425825119019, + "learning_rate": 6.801748816549842e-05, + "loss": 1.9429, + "step": 24720 + }, + { + "epoch": 2.884260879710652, + "grad_norm": 1.3188265562057495, + "learning_rate": 6.800498407786283e-05, + "loss": 1.9559, + "step": 24721 + }, + { + "epoch": 2.884377552210944, + "grad_norm": 1.1296122074127197, + "learning_rate": 6.799248085581126e-05, + "loss": 1.9297, + "step": 24722 + }, + { + "epoch": 2.8844942247112355, + "grad_norm": 1.1066477298736572, + "learning_rate": 6.797997849947319e-05, + "loss": 1.9692, + "step": 24723 + }, + { + "epoch": 2.884610897211527, + "grad_norm": 1.1517813205718994, + "learning_rate": 6.796747700897832e-05, + "loss": 2.097, + "step": 24724 + }, + { + "epoch": 2.884727569711819, + "grad_norm": 0.9935147166252136, + "learning_rate": 6.795497638445626e-05, + "loss": 1.7664, + "step": 24725 + }, + { + "epoch": 2.8848442422121106, + "grad_norm": 1.091901183128357, + "learning_rate": 6.794247662603654e-05, + "loss": 2.0179, + "step": 24726 + }, + { + "epoch": 2.8849609147124022, + "grad_norm": 1.0800923109054565, + "learning_rate": 6.792997773384882e-05, + "loss": 2.1883, + "step": 24727 + }, + { + "epoch": 2.885077587212694, + "grad_norm": 1.041405439376831, + "learning_rate": 6.791747970802266e-05, + "loss": 1.808, + "step": 24728 + }, + { + "epoch": 2.8851942597129856, + "grad_norm": 1.1133196353912354, + "learning_rate": 6.790498254868752e-05, + "loss": 2.0408, + "step": 24729 + }, + { + "epoch": 2.8853109322132773, + "grad_norm": 0.9434691071510315, + "learning_rate": 6.789248625597312e-05, + "loss": 1.9041, + "step": 24730 + }, + { + "epoch": 2.885427604713569, + "grad_norm": 1.0514189004898071, + "learning_rate": 6.787999083000889e-05, + "loss": 2.0516, + "step": 24731 + }, + { + "epoch": 2.8855442772138606, + "grad_norm": 1.0522658824920654, + "learning_rate": 6.786749627092434e-05, + "loss": 1.8694, + "step": 24732 + }, + { + "epoch": 2.8856609497141523, + "grad_norm": 1.0424251556396484, + "learning_rate": 6.785500257884917e-05, + "loss": 1.8705, + "step": 24733 + }, + { + "epoch": 2.885777622214444, + "grad_norm": 1.0678280591964722, + "learning_rate": 6.784250975391268e-05, + "loss": 1.8752, + "step": 24734 + }, + { + "epoch": 2.8858942947147357, + "grad_norm": 1.1683863401412964, + "learning_rate": 6.783001779624459e-05, + "loss": 1.8643, + "step": 24735 + }, + { + "epoch": 2.8860109672150274, + "grad_norm": 1.0169907808303833, + "learning_rate": 6.781752670597424e-05, + "loss": 2.0889, + "step": 24736 + }, + { + "epoch": 2.886127639715319, + "grad_norm": 0.9821038842201233, + "learning_rate": 6.78050364832312e-05, + "loss": 1.9392, + "step": 24737 + }, + { + "epoch": 2.8862443122156107, + "grad_norm": 1.1288002729415894, + "learning_rate": 6.779254712814499e-05, + "loss": 1.8712, + "step": 24738 + }, + { + "epoch": 2.8863609847159024, + "grad_norm": 0.9536816477775574, + "learning_rate": 6.778005864084499e-05, + "loss": 1.934, + "step": 24739 + }, + { + "epoch": 2.886477657216194, + "grad_norm": 0.9678584933280945, + "learning_rate": 6.776757102146076e-05, + "loss": 1.8718, + "step": 24740 + }, + { + "epoch": 2.8865943297164858, + "grad_norm": 1.0893820524215698, + "learning_rate": 6.775508427012167e-05, + "loss": 1.9374, + "step": 24741 + }, + { + "epoch": 2.8867110022167775, + "grad_norm": 1.1043072938919067, + "learning_rate": 6.774259838695728e-05, + "loss": 1.9684, + "step": 24742 + }, + { + "epoch": 2.886827674717069, + "grad_norm": 1.1368329524993896, + "learning_rate": 6.77301133720969e-05, + "loss": 2.0868, + "step": 24743 + }, + { + "epoch": 2.886944347217361, + "grad_norm": 1.159812092781067, + "learning_rate": 6.771762922567009e-05, + "loss": 1.9855, + "step": 24744 + }, + { + "epoch": 2.8870610197176525, + "grad_norm": 1.1124640703201294, + "learning_rate": 6.770514594780616e-05, + "loss": 1.9059, + "step": 24745 + }, + { + "epoch": 2.887177692217944, + "grad_norm": 1.0949547290802002, + "learning_rate": 6.76926635386346e-05, + "loss": 1.9723, + "step": 24746 + }, + { + "epoch": 2.887294364718236, + "grad_norm": 1.0889041423797607, + "learning_rate": 6.76801819982848e-05, + "loss": 2.0604, + "step": 24747 + }, + { + "epoch": 2.8874110372185275, + "grad_norm": 1.2892783880233765, + "learning_rate": 6.766770132688615e-05, + "loss": 1.8731, + "step": 24748 + }, + { + "epoch": 2.8875277097188192, + "grad_norm": 1.19571852684021, + "learning_rate": 6.765522152456795e-05, + "loss": 1.9552, + "step": 24749 + }, + { + "epoch": 2.887644382219111, + "grad_norm": 1.073311686515808, + "learning_rate": 6.764274259145966e-05, + "loss": 1.9263, + "step": 24750 + }, + { + "epoch": 2.8877610547194026, + "grad_norm": 1.1149858236312866, + "learning_rate": 6.763026452769071e-05, + "loss": 1.9329, + "step": 24751 + }, + { + "epoch": 2.8878777272196943, + "grad_norm": 1.035418152809143, + "learning_rate": 6.761778733339031e-05, + "loss": 1.9413, + "step": 24752 + }, + { + "epoch": 2.887994399719986, + "grad_norm": 1.1504791975021362, + "learning_rate": 6.760531100868796e-05, + "loss": 1.9541, + "step": 24753 + }, + { + "epoch": 2.8881110722202776, + "grad_norm": 1.144432783126831, + "learning_rate": 6.759283555371286e-05, + "loss": 2.0584, + "step": 24754 + }, + { + "epoch": 2.8882277447205693, + "grad_norm": 1.0000026226043701, + "learning_rate": 6.758036096859448e-05, + "loss": 2.0598, + "step": 24755 + }, + { + "epoch": 2.888344417220861, + "grad_norm": 1.092452883720398, + "learning_rate": 6.7567887253462e-05, + "loss": 1.9827, + "step": 24756 + }, + { + "epoch": 2.8884610897211527, + "grad_norm": 1.129052758216858, + "learning_rate": 6.755541440844487e-05, + "loss": 2.0299, + "step": 24757 + }, + { + "epoch": 2.8885777622214444, + "grad_norm": 1.136927843093872, + "learning_rate": 6.754294243367228e-05, + "loss": 2.1193, + "step": 24758 + }, + { + "epoch": 2.888694434721736, + "grad_norm": 1.1647776365280151, + "learning_rate": 6.753047132927362e-05, + "loss": 2.0143, + "step": 24759 + }, + { + "epoch": 2.8888111072220277, + "grad_norm": 0.9171489477157593, + "learning_rate": 6.75180010953781e-05, + "loss": 1.7298, + "step": 24760 + }, + { + "epoch": 2.8889277797223194, + "grad_norm": 1.1428111791610718, + "learning_rate": 6.750553173211507e-05, + "loss": 1.9183, + "step": 24761 + }, + { + "epoch": 2.889044452222611, + "grad_norm": 1.1053320169448853, + "learning_rate": 6.749306323961373e-05, + "loss": 1.9167, + "step": 24762 + }, + { + "epoch": 2.8891611247229028, + "grad_norm": 1.2358707189559937, + "learning_rate": 6.748059561800339e-05, + "loss": 2.0696, + "step": 24763 + }, + { + "epoch": 2.8892777972231944, + "grad_norm": 1.0778214931488037, + "learning_rate": 6.746812886741332e-05, + "loss": 2.1014, + "step": 24764 + }, + { + "epoch": 2.889394469723486, + "grad_norm": 1.0899139642715454, + "learning_rate": 6.745566298797262e-05, + "loss": 2.0901, + "step": 24765 + }, + { + "epoch": 2.889511142223778, + "grad_norm": 1.0638926029205322, + "learning_rate": 6.744319797981073e-05, + "loss": 1.9194, + "step": 24766 + }, + { + "epoch": 2.8896278147240695, + "grad_norm": 1.0683072805404663, + "learning_rate": 6.743073384305667e-05, + "loss": 2.0235, + "step": 24767 + }, + { + "epoch": 2.889744487224361, + "grad_norm": 0.9701563715934753, + "learning_rate": 6.741827057783983e-05, + "loss": 1.9746, + "step": 24768 + }, + { + "epoch": 2.889861159724653, + "grad_norm": 1.0645575523376465, + "learning_rate": 6.740580818428928e-05, + "loss": 2.0036, + "step": 24769 + }, + { + "epoch": 2.8899778322249445, + "grad_norm": 1.1342664957046509, + "learning_rate": 6.739334666253431e-05, + "loss": 1.9123, + "step": 24770 + }, + { + "epoch": 2.890094504725236, + "grad_norm": 1.1561558246612549, + "learning_rate": 6.738088601270404e-05, + "loss": 1.954, + "step": 24771 + }, + { + "epoch": 2.890211177225528, + "grad_norm": 1.183068871498108, + "learning_rate": 6.736842623492772e-05, + "loss": 2.0329, + "step": 24772 + }, + { + "epoch": 2.8903278497258196, + "grad_norm": 1.0155436992645264, + "learning_rate": 6.735596732933443e-05, + "loss": 1.9331, + "step": 24773 + }, + { + "epoch": 2.8904445222261113, + "grad_norm": 1.1469790935516357, + "learning_rate": 6.734350929605343e-05, + "loss": 1.9483, + "step": 24774 + }, + { + "epoch": 2.890561194726403, + "grad_norm": 0.9572188854217529, + "learning_rate": 6.733105213521376e-05, + "loss": 1.9893, + "step": 24775 + }, + { + "epoch": 2.8906778672266946, + "grad_norm": 1.1973804235458374, + "learning_rate": 6.731859584694463e-05, + "loss": 1.928, + "step": 24776 + }, + { + "epoch": 2.8907945397269863, + "grad_norm": 1.075204849243164, + "learning_rate": 6.730614043137521e-05, + "loss": 1.8297, + "step": 24777 + }, + { + "epoch": 2.890911212227278, + "grad_norm": 1.0785231590270996, + "learning_rate": 6.729368588863454e-05, + "loss": 1.8507, + "step": 24778 + }, + { + "epoch": 2.8910278847275697, + "grad_norm": 1.2164041996002197, + "learning_rate": 6.728123221885182e-05, + "loss": 1.95, + "step": 24779 + }, + { + "epoch": 2.8911445572278613, + "grad_norm": 1.1027320623397827, + "learning_rate": 6.726877942215611e-05, + "loss": 2.0559, + "step": 24780 + }, + { + "epoch": 2.891261229728153, + "grad_norm": 1.0942835807800293, + "learning_rate": 6.725632749867645e-05, + "loss": 1.9992, + "step": 24781 + }, + { + "epoch": 2.8913779022284447, + "grad_norm": 1.325661063194275, + "learning_rate": 6.724387644854203e-05, + "loss": 1.8685, + "step": 24782 + }, + { + "epoch": 2.8914945747287364, + "grad_norm": 1.0867383480072021, + "learning_rate": 6.72314262718819e-05, + "loss": 2.0318, + "step": 24783 + }, + { + "epoch": 2.891611247229028, + "grad_norm": 1.10561203956604, + "learning_rate": 6.721897696882503e-05, + "loss": 1.916, + "step": 24784 + }, + { + "epoch": 2.8917279197293198, + "grad_norm": 1.1094874143600464, + "learning_rate": 6.720652853950064e-05, + "loss": 1.8746, + "step": 24785 + }, + { + "epoch": 2.8918445922296114, + "grad_norm": 1.0937327146530151, + "learning_rate": 6.719408098403764e-05, + "loss": 2.0683, + "step": 24786 + }, + { + "epoch": 2.891961264729903, + "grad_norm": 1.1816864013671875, + "learning_rate": 6.718163430256519e-05, + "loss": 1.9372, + "step": 24787 + }, + { + "epoch": 2.892077937230195, + "grad_norm": 1.2675907611846924, + "learning_rate": 6.716918849521222e-05, + "loss": 1.9608, + "step": 24788 + }, + { + "epoch": 2.8921946097304865, + "grad_norm": 1.1328437328338623, + "learning_rate": 6.715674356210782e-05, + "loss": 1.9239, + "step": 24789 + }, + { + "epoch": 2.892311282230778, + "grad_norm": 1.15468168258667, + "learning_rate": 6.714429950338102e-05, + "loss": 1.9733, + "step": 24790 + }, + { + "epoch": 2.89242795473107, + "grad_norm": 1.0175248384475708, + "learning_rate": 6.713185631916077e-05, + "loss": 1.9045, + "step": 24791 + }, + { + "epoch": 2.8925446272313615, + "grad_norm": 1.0359848737716675, + "learning_rate": 6.711941400957613e-05, + "loss": 1.9943, + "step": 24792 + }, + { + "epoch": 2.892661299731653, + "grad_norm": 1.281246542930603, + "learning_rate": 6.7106972574756e-05, + "loss": 2.1539, + "step": 24793 + }, + { + "epoch": 2.892777972231945, + "grad_norm": 1.0528897047042847, + "learning_rate": 6.709453201482945e-05, + "loss": 2.0136, + "step": 24794 + }, + { + "epoch": 2.8928946447322366, + "grad_norm": 1.1465877294540405, + "learning_rate": 6.708209232992537e-05, + "loss": 2.1042, + "step": 24795 + }, + { + "epoch": 2.8930113172325282, + "grad_norm": 1.0695219039916992, + "learning_rate": 6.706965352017282e-05, + "loss": 1.9582, + "step": 24796 + }, + { + "epoch": 2.89312798973282, + "grad_norm": 1.144063949584961, + "learning_rate": 6.705721558570069e-05, + "loss": 1.9233, + "step": 24797 + }, + { + "epoch": 2.8932446622331116, + "grad_norm": 1.0905187129974365, + "learning_rate": 6.704477852663787e-05, + "loss": 1.9244, + "step": 24798 + }, + { + "epoch": 2.8933613347334033, + "grad_norm": 0.9968244433403015, + "learning_rate": 6.703234234311342e-05, + "loss": 1.992, + "step": 24799 + }, + { + "epoch": 2.893478007233695, + "grad_norm": 1.0906107425689697, + "learning_rate": 6.701990703525617e-05, + "loss": 1.8666, + "step": 24800 + }, + { + "epoch": 2.8935946797339867, + "grad_norm": 1.1643781661987305, + "learning_rate": 6.7007472603195e-05, + "loss": 2.0096, + "step": 24801 + }, + { + "epoch": 2.8937113522342783, + "grad_norm": 1.1363455057144165, + "learning_rate": 6.69950390470589e-05, + "loss": 1.9991, + "step": 24802 + }, + { + "epoch": 2.89382802473457, + "grad_norm": 1.148705005645752, + "learning_rate": 6.698260636697679e-05, + "loss": 2.0233, + "step": 24803 + }, + { + "epoch": 2.8939446972348617, + "grad_norm": 1.0860756635665894, + "learning_rate": 6.697017456307746e-05, + "loss": 1.9344, + "step": 24804 + }, + { + "epoch": 2.8940613697351534, + "grad_norm": 1.2087045907974243, + "learning_rate": 6.695774363548989e-05, + "loss": 1.9328, + "step": 24805 + }, + { + "epoch": 2.894178042235445, + "grad_norm": 1.1389365196228027, + "learning_rate": 6.694531358434284e-05, + "loss": 2.0216, + "step": 24806 + }, + { + "epoch": 2.8942947147357367, + "grad_norm": 1.0228246450424194, + "learning_rate": 6.69328844097653e-05, + "loss": 1.8874, + "step": 24807 + }, + { + "epoch": 2.8944113872360284, + "grad_norm": 1.119452953338623, + "learning_rate": 6.692045611188597e-05, + "loss": 1.8434, + "step": 24808 + }, + { + "epoch": 2.89452805973632, + "grad_norm": 1.1996787786483765, + "learning_rate": 6.690802869083387e-05, + "loss": 1.8107, + "step": 24809 + }, + { + "epoch": 2.894644732236612, + "grad_norm": 1.0721664428710938, + "learning_rate": 6.689560214673768e-05, + "loss": 1.8483, + "step": 24810 + }, + { + "epoch": 2.8947614047369035, + "grad_norm": 1.2163217067718506, + "learning_rate": 6.688317647972632e-05, + "loss": 2.1189, + "step": 24811 + }, + { + "epoch": 2.894878077237195, + "grad_norm": 1.2454038858413696, + "learning_rate": 6.687075168992851e-05, + "loss": 1.8658, + "step": 24812 + }, + { + "epoch": 2.894994749737487, + "grad_norm": 1.1171510219573975, + "learning_rate": 6.68583277774732e-05, + "loss": 1.8633, + "step": 24813 + }, + { + "epoch": 2.8951114222377785, + "grad_norm": 1.227751612663269, + "learning_rate": 6.684590474248904e-05, + "loss": 2.0244, + "step": 24814 + }, + { + "epoch": 2.89522809473807, + "grad_norm": 1.138763666152954, + "learning_rate": 6.683348258510494e-05, + "loss": 1.9373, + "step": 24815 + }, + { + "epoch": 2.895344767238362, + "grad_norm": 1.230470895767212, + "learning_rate": 6.682106130544962e-05, + "loss": 1.9891, + "step": 24816 + }, + { + "epoch": 2.8954614397386536, + "grad_norm": 1.2436330318450928, + "learning_rate": 6.68086409036518e-05, + "loss": 1.9812, + "step": 24817 + }, + { + "epoch": 2.8955781122389452, + "grad_norm": 1.2273523807525635, + "learning_rate": 6.679622137984034e-05, + "loss": 2.1454, + "step": 24818 + }, + { + "epoch": 2.895694784739237, + "grad_norm": 1.22922945022583, + "learning_rate": 6.678380273414389e-05, + "loss": 1.8948, + "step": 24819 + }, + { + "epoch": 2.8958114572395286, + "grad_norm": 1.113181233406067, + "learning_rate": 6.677138496669131e-05, + "loss": 1.9026, + "step": 24820 + }, + { + "epoch": 2.8959281297398203, + "grad_norm": 1.0537320375442505, + "learning_rate": 6.675896807761122e-05, + "loss": 1.8864, + "step": 24821 + }, + { + "epoch": 2.896044802240112, + "grad_norm": 1.2088652849197388, + "learning_rate": 6.674655206703244e-05, + "loss": 2.0567, + "step": 24822 + }, + { + "epoch": 2.8961614747404036, + "grad_norm": 1.0497297048568726, + "learning_rate": 6.67341369350836e-05, + "loss": 1.9588, + "step": 24823 + }, + { + "epoch": 2.8962781472406953, + "grad_norm": 1.1367868185043335, + "learning_rate": 6.672172268189348e-05, + "loss": 1.9467, + "step": 24824 + }, + { + "epoch": 2.896394819740987, + "grad_norm": 1.0355849266052246, + "learning_rate": 6.670930930759072e-05, + "loss": 1.9886, + "step": 24825 + }, + { + "epoch": 2.8965114922412787, + "grad_norm": 1.1449788808822632, + "learning_rate": 6.669689681230408e-05, + "loss": 2.0534, + "step": 24826 + }, + { + "epoch": 2.8966281647415704, + "grad_norm": 1.0033071041107178, + "learning_rate": 6.668448519616212e-05, + "loss": 1.9916, + "step": 24827 + }, + { + "epoch": 2.896744837241862, + "grad_norm": 1.157428503036499, + "learning_rate": 6.667207445929359e-05, + "loss": 2.1749, + "step": 24828 + }, + { + "epoch": 2.8968615097421537, + "grad_norm": 1.1351763010025024, + "learning_rate": 6.66596646018272e-05, + "loss": 1.8308, + "step": 24829 + }, + { + "epoch": 2.8969781822424454, + "grad_norm": 1.0787620544433594, + "learning_rate": 6.66472556238915e-05, + "loss": 2.0273, + "step": 24830 + }, + { + "epoch": 2.897094854742737, + "grad_norm": 1.013668179512024, + "learning_rate": 6.663484752561521e-05, + "loss": 1.8555, + "step": 24831 + }, + { + "epoch": 2.8972115272430288, + "grad_norm": 1.0978130102157593, + "learning_rate": 6.662244030712694e-05, + "loss": 1.9505, + "step": 24832 + }, + { + "epoch": 2.8973281997433205, + "grad_norm": 1.1857659816741943, + "learning_rate": 6.661003396855532e-05, + "loss": 2.0426, + "step": 24833 + }, + { + "epoch": 2.897444872243612, + "grad_norm": 1.375027060508728, + "learning_rate": 6.659762851002889e-05, + "loss": 2.0742, + "step": 24834 + }, + { + "epoch": 2.897561544743904, + "grad_norm": 1.2347594499588013, + "learning_rate": 6.658522393167634e-05, + "loss": 2.005, + "step": 24835 + }, + { + "epoch": 2.8976782172441955, + "grad_norm": 1.1573703289031982, + "learning_rate": 6.657282023362624e-05, + "loss": 1.9602, + "step": 24836 + }, + { + "epoch": 2.897794889744487, + "grad_norm": 1.0414689779281616, + "learning_rate": 6.65604174160072e-05, + "loss": 2.0139, + "step": 24837 + }, + { + "epoch": 2.897911562244779, + "grad_norm": 1.0584888458251953, + "learning_rate": 6.654801547894774e-05, + "loss": 1.9071, + "step": 24838 + }, + { + "epoch": 2.8980282347450705, + "grad_norm": 1.0623682737350464, + "learning_rate": 6.653561442257652e-05, + "loss": 1.887, + "step": 24839 + }, + { + "epoch": 2.8981449072453622, + "grad_norm": 1.241868495941162, + "learning_rate": 6.652321424702199e-05, + "loss": 2.0791, + "step": 24840 + }, + { + "epoch": 2.898261579745654, + "grad_norm": 1.1842269897460938, + "learning_rate": 6.651081495241276e-05, + "loss": 1.921, + "step": 24841 + }, + { + "epoch": 2.8983782522459456, + "grad_norm": 1.2037047147750854, + "learning_rate": 6.649841653887745e-05, + "loss": 2.0816, + "step": 24842 + }, + { + "epoch": 2.8984949247462373, + "grad_norm": 1.1969965696334839, + "learning_rate": 6.648601900654443e-05, + "loss": 1.9639, + "step": 24843 + }, + { + "epoch": 2.898611597246529, + "grad_norm": 1.1266555786132812, + "learning_rate": 6.647362235554238e-05, + "loss": 1.9819, + "step": 24844 + }, + { + "epoch": 2.8987282697468206, + "grad_norm": 0.9902128577232361, + "learning_rate": 6.646122658599969e-05, + "loss": 1.9626, + "step": 24845 + }, + { + "epoch": 2.8988449422471123, + "grad_norm": 0.971239447593689, + "learning_rate": 6.644883169804497e-05, + "loss": 1.8708, + "step": 24846 + }, + { + "epoch": 2.898961614747404, + "grad_norm": 1.2486451864242554, + "learning_rate": 6.643643769180661e-05, + "loss": 2.1487, + "step": 24847 + }, + { + "epoch": 2.8990782872476957, + "grad_norm": 1.0451315641403198, + "learning_rate": 6.642404456741322e-05, + "loss": 1.9018, + "step": 24848 + }, + { + "epoch": 2.8991949597479874, + "grad_norm": 1.0305590629577637, + "learning_rate": 6.641165232499321e-05, + "loss": 1.9047, + "step": 24849 + }, + { + "epoch": 2.899311632248279, + "grad_norm": 1.0229229927062988, + "learning_rate": 6.639926096467501e-05, + "loss": 1.8731, + "step": 24850 + }, + { + "epoch": 2.8994283047485707, + "grad_norm": 1.166791558265686, + "learning_rate": 6.638687048658715e-05, + "loss": 2.0514, + "step": 24851 + }, + { + "epoch": 2.8995449772488624, + "grad_norm": 1.276718020439148, + "learning_rate": 6.637448089085807e-05, + "loss": 2.1374, + "step": 24852 + }, + { + "epoch": 2.899661649749154, + "grad_norm": 1.172504186630249, + "learning_rate": 6.636209217761613e-05, + "loss": 2.0639, + "step": 24853 + }, + { + "epoch": 2.8997783222494458, + "grad_norm": 1.1855911016464233, + "learning_rate": 6.634970434698984e-05, + "loss": 2.0002, + "step": 24854 + }, + { + "epoch": 2.8998949947497374, + "grad_norm": 0.9233330488204956, + "learning_rate": 6.633731739910765e-05, + "loss": 1.9163, + "step": 24855 + }, + { + "epoch": 2.900011667250029, + "grad_norm": 1.0471020936965942, + "learning_rate": 6.632493133409788e-05, + "loss": 1.9557, + "step": 24856 + }, + { + "epoch": 2.900128339750321, + "grad_norm": 1.166164755821228, + "learning_rate": 6.631254615208905e-05, + "loss": 1.852, + "step": 24857 + }, + { + "epoch": 2.9002450122506125, + "grad_norm": 1.0141379833221436, + "learning_rate": 6.630016185320944e-05, + "loss": 1.8121, + "step": 24858 + }, + { + "epoch": 2.900361684750904, + "grad_norm": 1.3830077648162842, + "learning_rate": 6.628777843758753e-05, + "loss": 2.1105, + "step": 24859 + }, + { + "epoch": 2.900478357251196, + "grad_norm": 1.0559054613113403, + "learning_rate": 6.627539590535162e-05, + "loss": 2.1702, + "step": 24860 + }, + { + "epoch": 2.9005950297514875, + "grad_norm": 1.1427280902862549, + "learning_rate": 6.626301425663016e-05, + "loss": 2.0696, + "step": 24861 + }, + { + "epoch": 2.900711702251779, + "grad_norm": 1.0839964151382446, + "learning_rate": 6.625063349155144e-05, + "loss": 1.9833, + "step": 24862 + }, + { + "epoch": 2.900828374752071, + "grad_norm": 1.3255724906921387, + "learning_rate": 6.623825361024387e-05, + "loss": 2.1079, + "step": 24863 + }, + { + "epoch": 2.9009450472523626, + "grad_norm": 1.1127469539642334, + "learning_rate": 6.62258746128357e-05, + "loss": 1.9997, + "step": 24864 + }, + { + "epoch": 2.9010617197526543, + "grad_norm": 1.0337508916854858, + "learning_rate": 6.621349649945538e-05, + "loss": 2.0817, + "step": 24865 + }, + { + "epoch": 2.901178392252946, + "grad_norm": 0.9657566547393799, + "learning_rate": 6.620111927023114e-05, + "loss": 1.7515, + "step": 24866 + }, + { + "epoch": 2.9012950647532376, + "grad_norm": 1.0740257501602173, + "learning_rate": 6.618874292529135e-05, + "loss": 1.9836, + "step": 24867 + }, + { + "epoch": 2.9014117372535293, + "grad_norm": 1.070096731185913, + "learning_rate": 6.617636746476429e-05, + "loss": 1.7967, + "step": 24868 + }, + { + "epoch": 2.901528409753821, + "grad_norm": 1.0001022815704346, + "learning_rate": 6.616399288877822e-05, + "loss": 1.8691, + "step": 24869 + }, + { + "epoch": 2.9016450822541127, + "grad_norm": 1.0146664381027222, + "learning_rate": 6.61516191974615e-05, + "loss": 1.8101, + "step": 24870 + }, + { + "epoch": 2.9017617547544043, + "grad_norm": 1.0608888864517212, + "learning_rate": 6.61392463909423e-05, + "loss": 2.0093, + "step": 24871 + }, + { + "epoch": 2.901878427254696, + "grad_norm": 1.1093988418579102, + "learning_rate": 6.612687446934903e-05, + "loss": 2.0664, + "step": 24872 + }, + { + "epoch": 2.9019950997549877, + "grad_norm": 1.098340392112732, + "learning_rate": 6.611450343280983e-05, + "loss": 1.9142, + "step": 24873 + }, + { + "epoch": 2.9021117722552794, + "grad_norm": 1.1737765073776245, + "learning_rate": 6.610213328145302e-05, + "loss": 1.999, + "step": 24874 + }, + { + "epoch": 2.902228444755571, + "grad_norm": 1.085856556892395, + "learning_rate": 6.608976401540676e-05, + "loss": 2.0821, + "step": 24875 + }, + { + "epoch": 2.9023451172558627, + "grad_norm": 1.1719893217086792, + "learning_rate": 6.607739563479941e-05, + "loss": 1.9737, + "step": 24876 + }, + { + "epoch": 2.9024617897561544, + "grad_norm": 1.4245167970657349, + "learning_rate": 6.606502813975905e-05, + "loss": 1.9835, + "step": 24877 + }, + { + "epoch": 2.902578462256446, + "grad_norm": 1.105873703956604, + "learning_rate": 6.605266153041401e-05, + "loss": 2.0527, + "step": 24878 + }, + { + "epoch": 2.902695134756738, + "grad_norm": 1.088432788848877, + "learning_rate": 6.604029580689239e-05, + "loss": 1.9298, + "step": 24879 + }, + { + "epoch": 2.9028118072570295, + "grad_norm": 1.1913721561431885, + "learning_rate": 6.602793096932242e-05, + "loss": 2.052, + "step": 24880 + }, + { + "epoch": 2.902928479757321, + "grad_norm": 1.128813624382019, + "learning_rate": 6.601556701783238e-05, + "loss": 2.0447, + "step": 24881 + }, + { + "epoch": 2.903045152257613, + "grad_norm": 1.0117404460906982, + "learning_rate": 6.600320395255034e-05, + "loss": 1.9519, + "step": 24882 + }, + { + "epoch": 2.9031618247579045, + "grad_norm": 1.0886019468307495, + "learning_rate": 6.599084177360444e-05, + "loss": 1.9091, + "step": 24883 + }, + { + "epoch": 2.903278497258196, + "grad_norm": 1.0364580154418945, + "learning_rate": 6.597848048112296e-05, + "loss": 1.8555, + "step": 24884 + }, + { + "epoch": 2.903395169758488, + "grad_norm": 1.1887089014053345, + "learning_rate": 6.596612007523399e-05, + "loss": 1.9431, + "step": 24885 + }, + { + "epoch": 2.9035118422587796, + "grad_norm": 1.0834709405899048, + "learning_rate": 6.595376055606559e-05, + "loss": 1.911, + "step": 24886 + }, + { + "epoch": 2.9036285147590712, + "grad_norm": 1.0295261144638062, + "learning_rate": 6.594140192374599e-05, + "loss": 2.0074, + "step": 24887 + }, + { + "epoch": 2.903745187259363, + "grad_norm": 1.2233182191848755, + "learning_rate": 6.592904417840324e-05, + "loss": 2.0155, + "step": 24888 + }, + { + "epoch": 2.9038618597596546, + "grad_norm": 1.0595847368240356, + "learning_rate": 6.591668732016554e-05, + "loss": 1.9507, + "step": 24889 + }, + { + "epoch": 2.9039785322599463, + "grad_norm": 1.312964916229248, + "learning_rate": 6.59043313491609e-05, + "loss": 1.9471, + "step": 24890 + }, + { + "epoch": 2.904095204760238, + "grad_norm": 1.1105777025222778, + "learning_rate": 6.58919762655175e-05, + "loss": 1.8838, + "step": 24891 + }, + { + "epoch": 2.9042118772605297, + "grad_norm": 1.0532329082489014, + "learning_rate": 6.587962206936332e-05, + "loss": 1.7883, + "step": 24892 + }, + { + "epoch": 2.9043285497608213, + "grad_norm": 0.9808430075645447, + "learning_rate": 6.586726876082651e-05, + "loss": 1.8301, + "step": 24893 + }, + { + "epoch": 2.904445222261113, + "grad_norm": 1.1056435108184814, + "learning_rate": 6.585491634003514e-05, + "loss": 1.7494, + "step": 24894 + }, + { + "epoch": 2.9045618947614047, + "grad_norm": 1.0056777000427246, + "learning_rate": 6.584256480711722e-05, + "loss": 1.8517, + "step": 24895 + }, + { + "epoch": 2.9046785672616964, + "grad_norm": 1.1844193935394287, + "learning_rate": 6.583021416220086e-05, + "loss": 1.9196, + "step": 24896 + }, + { + "epoch": 2.904795239761988, + "grad_norm": 1.3132456541061401, + "learning_rate": 6.581786440541401e-05, + "loss": 2.0222, + "step": 24897 + }, + { + "epoch": 2.9049119122622797, + "grad_norm": 1.1275644302368164, + "learning_rate": 6.580551553688479e-05, + "loss": 1.9226, + "step": 24898 + }, + { + "epoch": 2.9050285847625714, + "grad_norm": 1.1708828210830688, + "learning_rate": 6.579316755674113e-05, + "loss": 1.967, + "step": 24899 + }, + { + "epoch": 2.905145257262863, + "grad_norm": 1.12368643283844, + "learning_rate": 6.578082046511116e-05, + "loss": 1.8515, + "step": 24900 + }, + { + "epoch": 2.905261929763155, + "grad_norm": 1.0095000267028809, + "learning_rate": 6.57684742621228e-05, + "loss": 1.9322, + "step": 24901 + }, + { + "epoch": 2.9053786022634465, + "grad_norm": 1.0049844980239868, + "learning_rate": 6.575612894790403e-05, + "loss": 1.9033, + "step": 24902 + }, + { + "epoch": 2.905495274763738, + "grad_norm": 1.313662052154541, + "learning_rate": 6.574378452258283e-05, + "loss": 1.936, + "step": 24903 + }, + { + "epoch": 2.90561194726403, + "grad_norm": 1.0538952350616455, + "learning_rate": 6.573144098628723e-05, + "loss": 1.8281, + "step": 24904 + }, + { + "epoch": 2.9057286197643215, + "grad_norm": 1.0063244104385376, + "learning_rate": 6.57190983391451e-05, + "loss": 1.9278, + "step": 24905 + }, + { + "epoch": 2.905845292264613, + "grad_norm": 1.2101161479949951, + "learning_rate": 6.570675658128446e-05, + "loss": 1.8354, + "step": 24906 + }, + { + "epoch": 2.905961964764905, + "grad_norm": 1.1942211389541626, + "learning_rate": 6.569441571283331e-05, + "loss": 2.0145, + "step": 24907 + }, + { + "epoch": 2.9060786372651966, + "grad_norm": 1.0850447416305542, + "learning_rate": 6.568207573391949e-05, + "loss": 1.6998, + "step": 24908 + }, + { + "epoch": 2.9061953097654882, + "grad_norm": 1.2169920206069946, + "learning_rate": 6.566973664467099e-05, + "loss": 1.9366, + "step": 24909 + }, + { + "epoch": 2.90631198226578, + "grad_norm": 1.1639950275421143, + "learning_rate": 6.565739844521566e-05, + "loss": 1.9896, + "step": 24910 + }, + { + "epoch": 2.9064286547660716, + "grad_norm": 0.9480323195457458, + "learning_rate": 6.564506113568149e-05, + "loss": 1.913, + "step": 24911 + }, + { + "epoch": 2.9065453272663633, + "grad_norm": 1.0355806350708008, + "learning_rate": 6.563272471619631e-05, + "loss": 1.8723, + "step": 24912 + }, + { + "epoch": 2.906661999766655, + "grad_norm": 1.0914522409439087, + "learning_rate": 6.562038918688809e-05, + "loss": 2.0214, + "step": 24913 + }, + { + "epoch": 2.9067786722669466, + "grad_norm": 1.193298578262329, + "learning_rate": 6.56080545478846e-05, + "loss": 1.9847, + "step": 24914 + }, + { + "epoch": 2.9068953447672383, + "grad_norm": 1.2068901062011719, + "learning_rate": 6.559572079931384e-05, + "loss": 1.9678, + "step": 24915 + }, + { + "epoch": 2.90701201726753, + "grad_norm": 0.98606938123703, + "learning_rate": 6.558338794130356e-05, + "loss": 1.8786, + "step": 24916 + }, + { + "epoch": 2.9071286897678217, + "grad_norm": 1.1803418397903442, + "learning_rate": 6.557105597398169e-05, + "loss": 1.998, + "step": 24917 + }, + { + "epoch": 2.9072453622681134, + "grad_norm": 1.0550150871276855, + "learning_rate": 6.555872489747606e-05, + "loss": 1.9924, + "step": 24918 + }, + { + "epoch": 2.907362034768405, + "grad_norm": 1.0146435499191284, + "learning_rate": 6.554639471191442e-05, + "loss": 1.8649, + "step": 24919 + }, + { + "epoch": 2.9074787072686967, + "grad_norm": 1.2404999732971191, + "learning_rate": 6.553406541742473e-05, + "loss": 2.1603, + "step": 24920 + }, + { + "epoch": 2.9075953797689884, + "grad_norm": 1.060628890991211, + "learning_rate": 6.55217370141347e-05, + "loss": 2.0516, + "step": 24921 + }, + { + "epoch": 2.90771205226928, + "grad_norm": 1.2619174718856812, + "learning_rate": 6.550940950217223e-05, + "loss": 2.2174, + "step": 24922 + }, + { + "epoch": 2.9078287247695718, + "grad_norm": 1.0917205810546875, + "learning_rate": 6.549708288166499e-05, + "loss": 1.9984, + "step": 24923 + }, + { + "epoch": 2.9079453972698635, + "grad_norm": 1.0143307447433472, + "learning_rate": 6.548475715274092e-05, + "loss": 1.8282, + "step": 24924 + }, + { + "epoch": 2.908062069770155, + "grad_norm": 1.0752149820327759, + "learning_rate": 6.547243231552765e-05, + "loss": 1.9856, + "step": 24925 + }, + { + "epoch": 2.908178742270447, + "grad_norm": 1.1696617603302002, + "learning_rate": 6.546010837015312e-05, + "loss": 2.0194, + "step": 24926 + }, + { + "epoch": 2.9082954147707385, + "grad_norm": 1.117074728012085, + "learning_rate": 6.54477853167449e-05, + "loss": 1.9439, + "step": 24927 + }, + { + "epoch": 2.90841208727103, + "grad_norm": 1.0885924100875854, + "learning_rate": 6.543546315543093e-05, + "loss": 1.9508, + "step": 24928 + }, + { + "epoch": 2.908528759771322, + "grad_norm": 1.1415646076202393, + "learning_rate": 6.542314188633878e-05, + "loss": 1.9601, + "step": 24929 + }, + { + "epoch": 2.9086454322716135, + "grad_norm": 1.149431586265564, + "learning_rate": 6.541082150959632e-05, + "loss": 2.0092, + "step": 24930 + }, + { + "epoch": 2.908762104771905, + "grad_norm": 0.9592660069465637, + "learning_rate": 6.539850202533116e-05, + "loss": 1.925, + "step": 24931 + }, + { + "epoch": 2.908878777272197, + "grad_norm": 1.0385419130325317, + "learning_rate": 6.538618343367115e-05, + "loss": 1.9001, + "step": 24932 + }, + { + "epoch": 2.9089954497724886, + "grad_norm": 1.1239588260650635, + "learning_rate": 6.537386573474386e-05, + "loss": 1.9684, + "step": 24933 + }, + { + "epoch": 2.9091121222727803, + "grad_norm": 1.113085389137268, + "learning_rate": 6.536154892867709e-05, + "loss": 1.9292, + "step": 24934 + }, + { + "epoch": 2.909228794773072, + "grad_norm": 1.1911824941635132, + "learning_rate": 6.534923301559842e-05, + "loss": 1.9506, + "step": 24935 + }, + { + "epoch": 2.9093454672733636, + "grad_norm": 1.1618484258651733, + "learning_rate": 6.533691799563566e-05, + "loss": 2.2433, + "step": 24936 + }, + { + "epoch": 2.9094621397736553, + "grad_norm": 1.1397793292999268, + "learning_rate": 6.53246038689164e-05, + "loss": 2.1378, + "step": 24937 + }, + { + "epoch": 2.909578812273947, + "grad_norm": 1.1187258958816528, + "learning_rate": 6.531229063556825e-05, + "loss": 1.7707, + "step": 24938 + }, + { + "epoch": 2.9096954847742387, + "grad_norm": 0.8661563396453857, + "learning_rate": 6.529997829571897e-05, + "loss": 1.781, + "step": 24939 + }, + { + "epoch": 2.9098121572745304, + "grad_norm": 1.2757465839385986, + "learning_rate": 6.528766684949611e-05, + "loss": 1.9684, + "step": 24940 + }, + { + "epoch": 2.909928829774822, + "grad_norm": 1.2119663953781128, + "learning_rate": 6.527535629702738e-05, + "loss": 1.9137, + "step": 24941 + }, + { + "epoch": 2.9100455022751137, + "grad_norm": 1.0484366416931152, + "learning_rate": 6.52630466384403e-05, + "loss": 2.0639, + "step": 24942 + }, + { + "epoch": 2.9101621747754054, + "grad_norm": 1.134986162185669, + "learning_rate": 6.525073787386261e-05, + "loss": 2.0209, + "step": 24943 + }, + { + "epoch": 2.910278847275697, + "grad_norm": 1.204527735710144, + "learning_rate": 6.523843000342179e-05, + "loss": 1.8611, + "step": 24944 + }, + { + "epoch": 2.9103955197759888, + "grad_norm": 1.3430317640304565, + "learning_rate": 6.522612302724555e-05, + "loss": 2.0155, + "step": 24945 + }, + { + "epoch": 2.9105121922762804, + "grad_norm": 1.113490104675293, + "learning_rate": 6.521381694546137e-05, + "loss": 1.9527, + "step": 24946 + }, + { + "epoch": 2.910628864776572, + "grad_norm": 1.160298466682434, + "learning_rate": 6.520151175819687e-05, + "loss": 1.8891, + "step": 24947 + }, + { + "epoch": 2.910745537276864, + "grad_norm": 0.939276933670044, + "learning_rate": 6.518920746557967e-05, + "loss": 1.8753, + "step": 24948 + }, + { + "epoch": 2.9108622097771555, + "grad_norm": 1.1168333292007446, + "learning_rate": 6.517690406773723e-05, + "loss": 1.8425, + "step": 24949 + }, + { + "epoch": 2.910978882277447, + "grad_norm": 1.0258535146713257, + "learning_rate": 6.516460156479719e-05, + "loss": 2.0193, + "step": 24950 + }, + { + "epoch": 2.911095554777739, + "grad_norm": 1.0718967914581299, + "learning_rate": 6.515229995688705e-05, + "loss": 2.022, + "step": 24951 + }, + { + "epoch": 2.9112122272780305, + "grad_norm": 1.2858357429504395, + "learning_rate": 6.513999924413428e-05, + "loss": 2.1742, + "step": 24952 + }, + { + "epoch": 2.911328899778322, + "grad_norm": 1.2960553169250488, + "learning_rate": 6.51276994266665e-05, + "loss": 2.0727, + "step": 24953 + }, + { + "epoch": 2.911445572278614, + "grad_norm": 1.007680892944336, + "learning_rate": 6.511540050461118e-05, + "loss": 2.0669, + "step": 24954 + }, + { + "epoch": 2.9115622447789056, + "grad_norm": 1.0616532564163208, + "learning_rate": 6.510310247809577e-05, + "loss": 1.7951, + "step": 24955 + }, + { + "epoch": 2.9116789172791973, + "grad_norm": 1.0290230512619019, + "learning_rate": 6.509080534724785e-05, + "loss": 1.7868, + "step": 24956 + }, + { + "epoch": 2.911795589779489, + "grad_norm": 1.1437686681747437, + "learning_rate": 6.507850911219481e-05, + "loss": 1.8123, + "step": 24957 + }, + { + "epoch": 2.9119122622797806, + "grad_norm": 1.1169273853302002, + "learning_rate": 6.506621377306424e-05, + "loss": 2.1161, + "step": 24958 + }, + { + "epoch": 2.9120289347800723, + "grad_norm": 1.2053468227386475, + "learning_rate": 6.505391932998348e-05, + "loss": 1.8183, + "step": 24959 + }, + { + "epoch": 2.912145607280364, + "grad_norm": 1.0138071775436401, + "learning_rate": 6.504162578308005e-05, + "loss": 2.0292, + "step": 24960 + }, + { + "epoch": 2.9122622797806557, + "grad_norm": 1.3372743129730225, + "learning_rate": 6.502933313248144e-05, + "loss": 2.0839, + "step": 24961 + }, + { + "epoch": 2.9123789522809473, + "grad_norm": 1.0951429605484009, + "learning_rate": 6.501704137831498e-05, + "loss": 1.8523, + "step": 24962 + }, + { + "epoch": 2.912495624781239, + "grad_norm": 1.0310348272323608, + "learning_rate": 6.500475052070822e-05, + "loss": 1.8737, + "step": 24963 + }, + { + "epoch": 2.9126122972815307, + "grad_norm": 0.9895809292793274, + "learning_rate": 6.499246055978844e-05, + "loss": 1.8667, + "step": 24964 + }, + { + "epoch": 2.9127289697818224, + "grad_norm": 1.1497687101364136, + "learning_rate": 6.498017149568319e-05, + "loss": 1.9468, + "step": 24965 + }, + { + "epoch": 2.912845642282114, + "grad_norm": 0.9758260250091553, + "learning_rate": 6.496788332851975e-05, + "loss": 1.8657, + "step": 24966 + }, + { + "epoch": 2.9129623147824057, + "grad_norm": 1.337698221206665, + "learning_rate": 6.495559605842562e-05, + "loss": 1.9669, + "step": 24967 + }, + { + "epoch": 2.9130789872826974, + "grad_norm": 1.2085533142089844, + "learning_rate": 6.494330968552805e-05, + "loss": 2.0199, + "step": 24968 + }, + { + "epoch": 2.913195659782989, + "grad_norm": 1.1469011306762695, + "learning_rate": 6.493102420995454e-05, + "loss": 1.9356, + "step": 24969 + }, + { + "epoch": 2.913312332283281, + "grad_norm": 1.1515953540802002, + "learning_rate": 6.49187396318324e-05, + "loss": 1.7437, + "step": 24970 + }, + { + "epoch": 2.9134290047835725, + "grad_norm": 1.0350191593170166, + "learning_rate": 6.490645595128898e-05, + "loss": 1.6831, + "step": 24971 + }, + { + "epoch": 2.913545677283864, + "grad_norm": 1.0635801553726196, + "learning_rate": 6.489417316845158e-05, + "loss": 2.12, + "step": 24972 + }, + { + "epoch": 2.913662349784156, + "grad_norm": 1.1542836427688599, + "learning_rate": 6.488189128344755e-05, + "loss": 1.7634, + "step": 24973 + }, + { + "epoch": 2.9137790222844475, + "grad_norm": 1.1502856016159058, + "learning_rate": 6.48696102964043e-05, + "loss": 1.9878, + "step": 24974 + }, + { + "epoch": 2.913895694784739, + "grad_norm": 1.135174036026001, + "learning_rate": 6.485733020744906e-05, + "loss": 2.0295, + "step": 24975 + }, + { + "epoch": 2.914012367285031, + "grad_norm": 0.9458388686180115, + "learning_rate": 6.48450510167092e-05, + "loss": 1.842, + "step": 24976 + }, + { + "epoch": 2.9141290397853226, + "grad_norm": 1.0881781578063965, + "learning_rate": 6.483277272431194e-05, + "loss": 1.9615, + "step": 24977 + }, + { + "epoch": 2.9142457122856142, + "grad_norm": 1.2106108665466309, + "learning_rate": 6.482049533038464e-05, + "loss": 1.9508, + "step": 24978 + }, + { + "epoch": 2.914362384785906, + "grad_norm": 1.0708138942718506, + "learning_rate": 6.480821883505451e-05, + "loss": 1.9122, + "step": 24979 + }, + { + "epoch": 2.9144790572861976, + "grad_norm": 1.121045708656311, + "learning_rate": 6.47959432384489e-05, + "loss": 1.9184, + "step": 24980 + }, + { + "epoch": 2.9145957297864893, + "grad_norm": 1.3097426891326904, + "learning_rate": 6.478366854069499e-05, + "loss": 2.0412, + "step": 24981 + }, + { + "epoch": 2.914712402286781, + "grad_norm": 1.0724111795425415, + "learning_rate": 6.47713947419201e-05, + "loss": 1.9008, + "step": 24982 + }, + { + "epoch": 2.9148290747870726, + "grad_norm": 1.2370545864105225, + "learning_rate": 6.475912184225138e-05, + "loss": 2.0222, + "step": 24983 + }, + { + "epoch": 2.9149457472873643, + "grad_norm": 1.148140549659729, + "learning_rate": 6.474684984181619e-05, + "loss": 1.9752, + "step": 24984 + }, + { + "epoch": 2.915062419787656, + "grad_norm": 1.2959100008010864, + "learning_rate": 6.473457874074163e-05, + "loss": 1.9114, + "step": 24985 + }, + { + "epoch": 2.9151790922879477, + "grad_norm": 0.9692922234535217, + "learning_rate": 6.472230853915501e-05, + "loss": 1.8946, + "step": 24986 + }, + { + "epoch": 2.9152957647882394, + "grad_norm": 1.0676565170288086, + "learning_rate": 6.471003923718348e-05, + "loss": 1.8656, + "step": 24987 + }, + { + "epoch": 2.915412437288531, + "grad_norm": 1.0761469602584839, + "learning_rate": 6.469777083495421e-05, + "loss": 2.0417, + "step": 24988 + }, + { + "epoch": 2.9155291097888227, + "grad_norm": 0.9463155269622803, + "learning_rate": 6.468550333259446e-05, + "loss": 1.9382, + "step": 24989 + }, + { + "epoch": 2.9156457822891144, + "grad_norm": 1.1380749940872192, + "learning_rate": 6.46732367302313e-05, + "loss": 1.8596, + "step": 24990 + }, + { + "epoch": 2.915762454789406, + "grad_norm": 1.159389853477478, + "learning_rate": 6.466097102799203e-05, + "loss": 2.0111, + "step": 24991 + }, + { + "epoch": 2.915879127289698, + "grad_norm": 1.1271928548812866, + "learning_rate": 6.46487062260037e-05, + "loss": 1.9665, + "step": 24992 + }, + { + "epoch": 2.9159957997899895, + "grad_norm": 1.3215614557266235, + "learning_rate": 6.463644232439351e-05, + "loss": 2.0688, + "step": 24993 + }, + { + "epoch": 2.916112472290281, + "grad_norm": 1.085996389389038, + "learning_rate": 6.462417932328857e-05, + "loss": 1.9288, + "step": 24994 + }, + { + "epoch": 2.916229144790573, + "grad_norm": 1.251147747039795, + "learning_rate": 6.461191722281606e-05, + "loss": 1.7521, + "step": 24995 + }, + { + "epoch": 2.9163458172908645, + "grad_norm": 1.0344963073730469, + "learning_rate": 6.459965602310302e-05, + "loss": 2.1111, + "step": 24996 + }, + { + "epoch": 2.916462489791156, + "grad_norm": 1.1900969743728638, + "learning_rate": 6.458739572427664e-05, + "loss": 1.9698, + "step": 24997 + }, + { + "epoch": 2.916579162291448, + "grad_norm": 1.2498685121536255, + "learning_rate": 6.457513632646395e-05, + "loss": 1.9808, + "step": 24998 + }, + { + "epoch": 2.9166958347917395, + "grad_norm": 1.2563822269439697, + "learning_rate": 6.456287782979206e-05, + "loss": 2.1036, + "step": 24999 + }, + { + "epoch": 2.9168125072920312, + "grad_norm": 1.1176469326019287, + "learning_rate": 6.455062023438815e-05, + "loss": 1.977, + "step": 25000 + }, + { + "epoch": 2.916929179792323, + "grad_norm": 1.0848718881607056, + "learning_rate": 6.453836354037914e-05, + "loss": 1.8903, + "step": 25001 + }, + { + "epoch": 2.9170458522926146, + "grad_norm": 1.0177756547927856, + "learning_rate": 6.452610774789222e-05, + "loss": 1.9088, + "step": 25002 + }, + { + "epoch": 2.9171625247929063, + "grad_norm": 1.0897561311721802, + "learning_rate": 6.45138528570544e-05, + "loss": 1.9924, + "step": 25003 + }, + { + "epoch": 2.917279197293198, + "grad_norm": 1.0452523231506348, + "learning_rate": 6.450159886799266e-05, + "loss": 1.8223, + "step": 25004 + }, + { + "epoch": 2.9173958697934896, + "grad_norm": 1.1954164505004883, + "learning_rate": 6.448934578083414e-05, + "loss": 2.0192, + "step": 25005 + }, + { + "epoch": 2.9175125422937813, + "grad_norm": 1.0524264574050903, + "learning_rate": 6.447709359570582e-05, + "loss": 2.1284, + "step": 25006 + }, + { + "epoch": 2.917629214794073, + "grad_norm": 1.1579467058181763, + "learning_rate": 6.446484231273468e-05, + "loss": 2.0854, + "step": 25007 + }, + { + "epoch": 2.9177458872943647, + "grad_norm": 1.0201916694641113, + "learning_rate": 6.445259193204781e-05, + "loss": 1.8464, + "step": 25008 + }, + { + "epoch": 2.9178625597946564, + "grad_norm": 1.1688703298568726, + "learning_rate": 6.444034245377209e-05, + "loss": 2.0132, + "step": 25009 + }, + { + "epoch": 2.917979232294948, + "grad_norm": 1.159482479095459, + "learning_rate": 6.442809387803466e-05, + "loss": 1.9596, + "step": 25010 + }, + { + "epoch": 2.9180959047952397, + "grad_norm": 1.0795457363128662, + "learning_rate": 6.441584620496234e-05, + "loss": 2.0448, + "step": 25011 + }, + { + "epoch": 2.9182125772955314, + "grad_norm": 1.1567295789718628, + "learning_rate": 6.440359943468223e-05, + "loss": 1.9083, + "step": 25012 + }, + { + "epoch": 2.918329249795823, + "grad_norm": 1.1486910581588745, + "learning_rate": 6.439135356732124e-05, + "loss": 1.9544, + "step": 25013 + }, + { + "epoch": 2.9184459222961148, + "grad_norm": 1.245975136756897, + "learning_rate": 6.437910860300631e-05, + "loss": 1.8608, + "step": 25014 + }, + { + "epoch": 2.9185625947964065, + "grad_norm": 1.1207947731018066, + "learning_rate": 6.436686454186442e-05, + "loss": 2.0059, + "step": 25015 + }, + { + "epoch": 2.918679267296698, + "grad_norm": 1.0000966787338257, + "learning_rate": 6.435462138402244e-05, + "loss": 1.9067, + "step": 25016 + }, + { + "epoch": 2.91879593979699, + "grad_norm": 1.3351564407348633, + "learning_rate": 6.43423791296074e-05, + "loss": 2.0867, + "step": 25017 + }, + { + "epoch": 2.9189126122972815, + "grad_norm": 0.9358735680580139, + "learning_rate": 6.433013777874607e-05, + "loss": 1.9676, + "step": 25018 + }, + { + "epoch": 2.919029284797573, + "grad_norm": 1.2554311752319336, + "learning_rate": 6.431789733156551e-05, + "loss": 2.0273, + "step": 25019 + }, + { + "epoch": 2.919145957297865, + "grad_norm": 1.2169605493545532, + "learning_rate": 6.430565778819246e-05, + "loss": 2.0639, + "step": 25020 + }, + { + "epoch": 2.9192626297981565, + "grad_norm": 0.8619248270988464, + "learning_rate": 6.429341914875394e-05, + "loss": 1.665, + "step": 25021 + }, + { + "epoch": 2.919379302298448, + "grad_norm": 1.2028300762176514, + "learning_rate": 6.428118141337678e-05, + "loss": 1.9634, + "step": 25022 + }, + { + "epoch": 2.91949597479874, + "grad_norm": 1.1667524576187134, + "learning_rate": 6.426894458218783e-05, + "loss": 2.0582, + "step": 25023 + }, + { + "epoch": 2.9196126472990316, + "grad_norm": 1.0146993398666382, + "learning_rate": 6.42567086553139e-05, + "loss": 1.823, + "step": 25024 + }, + { + "epoch": 2.9197293197993233, + "grad_norm": 1.2824375629425049, + "learning_rate": 6.42444736328819e-05, + "loss": 1.9943, + "step": 25025 + }, + { + "epoch": 2.919845992299615, + "grad_norm": 1.2183501720428467, + "learning_rate": 6.423223951501872e-05, + "loss": 2.158, + "step": 25026 + }, + { + "epoch": 2.9199626647999066, + "grad_norm": 1.1086045503616333, + "learning_rate": 6.422000630185107e-05, + "loss": 2.0397, + "step": 25027 + }, + { + "epoch": 2.9200793373001983, + "grad_norm": 1.09561288356781, + "learning_rate": 6.420777399350589e-05, + "loss": 1.9161, + "step": 25028 + }, + { + "epoch": 2.92019600980049, + "grad_norm": 0.9974002838134766, + "learning_rate": 6.419554259010988e-05, + "loss": 1.882, + "step": 25029 + }, + { + "epoch": 2.9203126823007817, + "grad_norm": 0.9952442049980164, + "learning_rate": 6.418331209178996e-05, + "loss": 1.8114, + "step": 25030 + }, + { + "epoch": 2.9204293548010734, + "grad_norm": 1.130807876586914, + "learning_rate": 6.417108249867278e-05, + "loss": 1.9412, + "step": 25031 + }, + { + "epoch": 2.920546027301365, + "grad_norm": 0.9796034693717957, + "learning_rate": 6.415885381088525e-05, + "loss": 1.9405, + "step": 25032 + }, + { + "epoch": 2.9206626998016567, + "grad_norm": 1.1539499759674072, + "learning_rate": 6.414662602855407e-05, + "loss": 2.0008, + "step": 25033 + }, + { + "epoch": 2.9207793723019484, + "grad_norm": 1.1129486560821533, + "learning_rate": 6.413439915180607e-05, + "loss": 1.868, + "step": 25034 + }, + { + "epoch": 2.92089604480224, + "grad_norm": 1.0789124965667725, + "learning_rate": 6.41221731807679e-05, + "loss": 1.8315, + "step": 25035 + }, + { + "epoch": 2.9210127173025318, + "grad_norm": 1.0206375122070312, + "learning_rate": 6.410994811556644e-05, + "loss": 2.0036, + "step": 25036 + }, + { + "epoch": 2.9211293898028234, + "grad_norm": 1.162124514579773, + "learning_rate": 6.40977239563283e-05, + "loss": 2.1305, + "step": 25037 + }, + { + "epoch": 2.921246062303115, + "grad_norm": 1.1142048835754395, + "learning_rate": 6.40855007031803e-05, + "loss": 1.8558, + "step": 25038 + }, + { + "epoch": 2.921362734803407, + "grad_norm": 1.1214839220046997, + "learning_rate": 6.407327835624911e-05, + "loss": 2.0802, + "step": 25039 + }, + { + "epoch": 2.9214794073036985, + "grad_norm": 1.2442110776901245, + "learning_rate": 6.406105691566142e-05, + "loss": 2.0361, + "step": 25040 + }, + { + "epoch": 2.92159607980399, + "grad_norm": 1.1187493801116943, + "learning_rate": 6.404883638154399e-05, + "loss": 1.7656, + "step": 25041 + }, + { + "epoch": 2.921712752304282, + "grad_norm": 1.1794413328170776, + "learning_rate": 6.403661675402342e-05, + "loss": 1.941, + "step": 25042 + }, + { + "epoch": 2.9218294248045735, + "grad_norm": 1.0062222480773926, + "learning_rate": 6.40243980332265e-05, + "loss": 1.9698, + "step": 25043 + }, + { + "epoch": 2.921946097304865, + "grad_norm": 1.3281461000442505, + "learning_rate": 6.401218021927979e-05, + "loss": 2.0328, + "step": 25044 + }, + { + "epoch": 2.922062769805157, + "grad_norm": 1.1186692714691162, + "learning_rate": 6.399996331231006e-05, + "loss": 2.0927, + "step": 25045 + }, + { + "epoch": 2.9221794423054486, + "grad_norm": 0.9563767313957214, + "learning_rate": 6.398774731244384e-05, + "loss": 1.7983, + "step": 25046 + }, + { + "epoch": 2.9222961148057403, + "grad_norm": 1.1019951105117798, + "learning_rate": 6.397553221980791e-05, + "loss": 1.9998, + "step": 25047 + }, + { + "epoch": 2.922412787306032, + "grad_norm": 1.0167341232299805, + "learning_rate": 6.396331803452876e-05, + "loss": 1.8107, + "step": 25048 + }, + { + "epoch": 2.9225294598063236, + "grad_norm": 1.207653522491455, + "learning_rate": 6.395110475673314e-05, + "loss": 1.8854, + "step": 25049 + }, + { + "epoch": 2.9226461323066153, + "grad_norm": 1.2184326648712158, + "learning_rate": 6.393889238654755e-05, + "loss": 1.968, + "step": 25050 + }, + { + "epoch": 2.922762804806907, + "grad_norm": 1.1998547315597534, + "learning_rate": 6.392668092409867e-05, + "loss": 1.9997, + "step": 25051 + }, + { + "epoch": 2.9228794773071987, + "grad_norm": 1.1154472827911377, + "learning_rate": 6.391447036951311e-05, + "loss": 1.8938, + "step": 25052 + }, + { + "epoch": 2.9229961498074903, + "grad_norm": 1.113154411315918, + "learning_rate": 6.390226072291738e-05, + "loss": 2.0475, + "step": 25053 + }, + { + "epoch": 2.923112822307782, + "grad_norm": 1.256758451461792, + "learning_rate": 6.389005198443814e-05, + "loss": 2.0504, + "step": 25054 + }, + { + "epoch": 2.9232294948080737, + "grad_norm": 1.0777695178985596, + "learning_rate": 6.38778441542019e-05, + "loss": 1.759, + "step": 25055 + }, + { + "epoch": 2.9233461673083654, + "grad_norm": 1.0287964344024658, + "learning_rate": 6.386563723233526e-05, + "loss": 1.8695, + "step": 25056 + }, + { + "epoch": 2.923462839808657, + "grad_norm": 1.0716077089309692, + "learning_rate": 6.385343121896468e-05, + "loss": 1.9504, + "step": 25057 + }, + { + "epoch": 2.9235795123089487, + "grad_norm": 1.119188904762268, + "learning_rate": 6.38412261142168e-05, + "loss": 1.8583, + "step": 25058 + }, + { + "epoch": 2.9236961848092404, + "grad_norm": 1.1668939590454102, + "learning_rate": 6.382902191821807e-05, + "loss": 1.939, + "step": 25059 + }, + { + "epoch": 2.923812857309532, + "grad_norm": 1.102744698524475, + "learning_rate": 6.381681863109508e-05, + "loss": 2.0352, + "step": 25060 + }, + { + "epoch": 2.923929529809824, + "grad_norm": 1.2120634317398071, + "learning_rate": 6.380461625297427e-05, + "loss": 1.9195, + "step": 25061 + }, + { + "epoch": 2.9240462023101155, + "grad_norm": 1.2197585105895996, + "learning_rate": 6.379241478398221e-05, + "loss": 2.1382, + "step": 25062 + }, + { + "epoch": 2.924162874810407, + "grad_norm": 1.0437637567520142, + "learning_rate": 6.378021422424535e-05, + "loss": 1.9669, + "step": 25063 + }, + { + "epoch": 2.924279547310699, + "grad_norm": 1.320505976676941, + "learning_rate": 6.376801457389013e-05, + "loss": 2.0382, + "step": 25064 + }, + { + "epoch": 2.9243962198109905, + "grad_norm": 1.1472091674804688, + "learning_rate": 6.375581583304315e-05, + "loss": 2.1374, + "step": 25065 + }, + { + "epoch": 2.924512892311282, + "grad_norm": 1.1587762832641602, + "learning_rate": 6.374361800183074e-05, + "loss": 1.8572, + "step": 25066 + }, + { + "epoch": 2.924629564811574, + "grad_norm": 0.9845432043075562, + "learning_rate": 6.373142108037947e-05, + "loss": 2.0592, + "step": 25067 + }, + { + "epoch": 2.9247462373118656, + "grad_norm": 1.0698314905166626, + "learning_rate": 6.371922506881567e-05, + "loss": 1.9739, + "step": 25068 + }, + { + "epoch": 2.9248629098121572, + "grad_norm": 1.0981029272079468, + "learning_rate": 6.370702996726586e-05, + "loss": 2.0625, + "step": 25069 + }, + { + "epoch": 2.924979582312449, + "grad_norm": 1.1646232604980469, + "learning_rate": 6.36948357758564e-05, + "loss": 2.03, + "step": 25070 + }, + { + "epoch": 2.9250962548127406, + "grad_norm": 1.0666662454605103, + "learning_rate": 6.368264249471378e-05, + "loss": 2.0764, + "step": 25071 + }, + { + "epoch": 2.9252129273130323, + "grad_norm": 1.0978938341140747, + "learning_rate": 6.367045012396438e-05, + "loss": 1.7839, + "step": 25072 + }, + { + "epoch": 2.925329599813324, + "grad_norm": 1.1487594842910767, + "learning_rate": 6.365825866373453e-05, + "loss": 1.9799, + "step": 25073 + }, + { + "epoch": 2.9254462723136156, + "grad_norm": 1.3676031827926636, + "learning_rate": 6.364606811415072e-05, + "loss": 2.0272, + "step": 25074 + }, + { + "epoch": 2.9255629448139073, + "grad_norm": 1.0264697074890137, + "learning_rate": 6.363387847533928e-05, + "loss": 2.0264, + "step": 25075 + }, + { + "epoch": 2.925679617314199, + "grad_norm": 1.0661842823028564, + "learning_rate": 6.362168974742651e-05, + "loss": 1.8039, + "step": 25076 + }, + { + "epoch": 2.9257962898144907, + "grad_norm": 1.162453293800354, + "learning_rate": 6.360950193053885e-05, + "loss": 1.8756, + "step": 25077 + }, + { + "epoch": 2.9259129623147824, + "grad_norm": 1.1397730112075806, + "learning_rate": 6.359731502480269e-05, + "loss": 1.8794, + "step": 25078 + }, + { + "epoch": 2.926029634815074, + "grad_norm": 1.2707533836364746, + "learning_rate": 6.358512903034427e-05, + "loss": 2.0755, + "step": 25079 + }, + { + "epoch": 2.9261463073153657, + "grad_norm": 1.1149332523345947, + "learning_rate": 6.357294394729002e-05, + "loss": 2.0113, + "step": 25080 + }, + { + "epoch": 2.9262629798156574, + "grad_norm": 1.0634909868240356, + "learning_rate": 6.356075977576615e-05, + "loss": 1.9726, + "step": 25081 + }, + { + "epoch": 2.926379652315949, + "grad_norm": 1.1668850183486938, + "learning_rate": 6.354857651589908e-05, + "loss": 2.058, + "step": 25082 + }, + { + "epoch": 2.926496324816241, + "grad_norm": 1.2271510362625122, + "learning_rate": 6.353639416781503e-05, + "loss": 2.1222, + "step": 25083 + }, + { + "epoch": 2.9266129973165325, + "grad_norm": 1.150885820388794, + "learning_rate": 6.352421273164035e-05, + "loss": 2.0168, + "step": 25084 + }, + { + "epoch": 2.926729669816824, + "grad_norm": 1.0750116109848022, + "learning_rate": 6.351203220750127e-05, + "loss": 1.9756, + "step": 25085 + }, + { + "epoch": 2.926846342317116, + "grad_norm": 1.1062490940093994, + "learning_rate": 6.349985259552415e-05, + "loss": 2.0292, + "step": 25086 + }, + { + "epoch": 2.9269630148174075, + "grad_norm": 1.0457615852355957, + "learning_rate": 6.348767389583513e-05, + "loss": 1.6751, + "step": 25087 + }, + { + "epoch": 2.927079687317699, + "grad_norm": 1.0211135149002075, + "learning_rate": 6.347549610856062e-05, + "loss": 1.7777, + "step": 25088 + }, + { + "epoch": 2.927196359817991, + "grad_norm": 1.2381285429000854, + "learning_rate": 6.34633192338267e-05, + "loss": 1.9588, + "step": 25089 + }, + { + "epoch": 2.9273130323182825, + "grad_norm": 1.1421704292297363, + "learning_rate": 6.345114327175974e-05, + "loss": 1.998, + "step": 25090 + }, + { + "epoch": 2.9274297048185742, + "grad_norm": 1.1400953531265259, + "learning_rate": 6.343896822248592e-05, + "loss": 1.9972, + "step": 25091 + }, + { + "epoch": 2.927546377318866, + "grad_norm": 1.0399099588394165, + "learning_rate": 6.34267940861314e-05, + "loss": 1.8931, + "step": 25092 + }, + { + "epoch": 2.9276630498191576, + "grad_norm": 1.109047293663025, + "learning_rate": 6.34146208628225e-05, + "loss": 1.7615, + "step": 25093 + }, + { + "epoch": 2.9277797223194493, + "grad_norm": 1.1163250207901, + "learning_rate": 6.340244855268528e-05, + "loss": 1.945, + "step": 25094 + }, + { + "epoch": 2.927896394819741, + "grad_norm": 1.1380181312561035, + "learning_rate": 6.339027715584609e-05, + "loss": 2.045, + "step": 25095 + }, + { + "epoch": 2.9280130673200326, + "grad_norm": 1.0082647800445557, + "learning_rate": 6.337810667243095e-05, + "loss": 1.948, + "step": 25096 + }, + { + "epoch": 2.9281297398203243, + "grad_norm": 1.1851743459701538, + "learning_rate": 6.336593710256617e-05, + "loss": 1.975, + "step": 25097 + }, + { + "epoch": 2.928246412320616, + "grad_norm": 1.1305713653564453, + "learning_rate": 6.335376844637781e-05, + "loss": 2.1304, + "step": 25098 + }, + { + "epoch": 2.9283630848209077, + "grad_norm": 1.2260535955429077, + "learning_rate": 6.334160070399208e-05, + "loss": 1.9176, + "step": 25099 + }, + { + "epoch": 2.9284797573211994, + "grad_norm": 1.1849558353424072, + "learning_rate": 6.332943387553507e-05, + "loss": 1.8698, + "step": 25100 + }, + { + "epoch": 2.928596429821491, + "grad_norm": 1.133385419845581, + "learning_rate": 6.331726796113298e-05, + "loss": 1.901, + "step": 25101 + }, + { + "epoch": 2.9287131023217827, + "grad_norm": 1.0300794839859009, + "learning_rate": 6.330510296091185e-05, + "loss": 2.0193, + "step": 25102 + }, + { + "epoch": 2.9288297748220744, + "grad_norm": 1.043565034866333, + "learning_rate": 6.329293887499785e-05, + "loss": 1.9121, + "step": 25103 + }, + { + "epoch": 2.928946447322366, + "grad_norm": 1.0091899633407593, + "learning_rate": 6.32807757035171e-05, + "loss": 1.974, + "step": 25104 + }, + { + "epoch": 2.9290631198226578, + "grad_norm": 1.064287781715393, + "learning_rate": 6.326861344659564e-05, + "loss": 1.9806, + "step": 25105 + }, + { + "epoch": 2.9291797923229494, + "grad_norm": 1.0661379098892212, + "learning_rate": 6.325645210435962e-05, + "loss": 1.9946, + "step": 25106 + }, + { + "epoch": 2.929296464823241, + "grad_norm": 0.9768481850624084, + "learning_rate": 6.324429167693509e-05, + "loss": 1.8112, + "step": 25107 + }, + { + "epoch": 2.929413137323533, + "grad_norm": 1.0633212327957153, + "learning_rate": 6.32321321644481e-05, + "loss": 1.8916, + "step": 25108 + }, + { + "epoch": 2.9295298098238245, + "grad_norm": 1.165356159210205, + "learning_rate": 6.321997356702464e-05, + "loss": 1.8645, + "step": 25109 + }, + { + "epoch": 2.929646482324116, + "grad_norm": 1.2943592071533203, + "learning_rate": 6.32078158847909e-05, + "loss": 1.9219, + "step": 25110 + }, + { + "epoch": 2.929763154824408, + "grad_norm": 1.0273970365524292, + "learning_rate": 6.319565911787277e-05, + "loss": 2.0156, + "step": 25111 + }, + { + "epoch": 2.9298798273246995, + "grad_norm": 0.9564052820205688, + "learning_rate": 6.318350326639639e-05, + "loss": 1.8834, + "step": 25112 + }, + { + "epoch": 2.929996499824991, + "grad_norm": 0.9610393047332764, + "learning_rate": 6.317134833048773e-05, + "loss": 1.6746, + "step": 25113 + }, + { + "epoch": 2.930113172325283, + "grad_norm": 1.1328364610671997, + "learning_rate": 6.315919431027284e-05, + "loss": 1.9799, + "step": 25114 + }, + { + "epoch": 2.9302298448255746, + "grad_norm": 1.0212010145187378, + "learning_rate": 6.314704120587763e-05, + "loss": 2.021, + "step": 25115 + }, + { + "epoch": 2.9303465173258663, + "grad_norm": 1.2018471956253052, + "learning_rate": 6.313488901742816e-05, + "loss": 1.7901, + "step": 25116 + }, + { + "epoch": 2.930463189826158, + "grad_norm": 1.2108845710754395, + "learning_rate": 6.312273774505044e-05, + "loss": 2.0254, + "step": 25117 + }, + { + "epoch": 2.9305798623264496, + "grad_norm": 1.1538190841674805, + "learning_rate": 6.311058738887034e-05, + "loss": 2.0327, + "step": 25118 + }, + { + "epoch": 2.9306965348267413, + "grad_norm": 1.1699141263961792, + "learning_rate": 6.309843794901395e-05, + "loss": 2.141, + "step": 25119 + }, + { + "epoch": 2.930813207327033, + "grad_norm": 1.051895022392273, + "learning_rate": 6.308628942560711e-05, + "loss": 1.9597, + "step": 25120 + }, + { + "epoch": 2.9309298798273247, + "grad_norm": 1.2152371406555176, + "learning_rate": 6.307414181877584e-05, + "loss": 1.9988, + "step": 25121 + }, + { + "epoch": 2.9310465523276164, + "grad_norm": 0.991256833076477, + "learning_rate": 6.306199512864597e-05, + "loss": 1.9686, + "step": 25122 + }, + { + "epoch": 2.931163224827908, + "grad_norm": 1.0329939126968384, + "learning_rate": 6.304984935534357e-05, + "loss": 1.97, + "step": 25123 + }, + { + "epoch": 2.9312798973281997, + "grad_norm": 1.10145902633667, + "learning_rate": 6.303770449899447e-05, + "loss": 2.0084, + "step": 25124 + }, + { + "epoch": 2.9313965698284914, + "grad_norm": 1.1903367042541504, + "learning_rate": 6.302556055972453e-05, + "loss": 2.0673, + "step": 25125 + }, + { + "epoch": 2.931513242328783, + "grad_norm": 1.1302787065505981, + "learning_rate": 6.301341753765974e-05, + "loss": 1.9608, + "step": 25126 + }, + { + "epoch": 2.9316299148290748, + "grad_norm": 1.1985853910446167, + "learning_rate": 6.300127543292595e-05, + "loss": 1.9502, + "step": 25127 + }, + { + "epoch": 2.9317465873293664, + "grad_norm": 1.0865626335144043, + "learning_rate": 6.298913424564896e-05, + "loss": 1.8586, + "step": 25128 + }, + { + "epoch": 2.931863259829658, + "grad_norm": 1.0773378610610962, + "learning_rate": 6.297699397595471e-05, + "loss": 1.9418, + "step": 25129 + }, + { + "epoch": 2.93197993232995, + "grad_norm": 1.0998196601867676, + "learning_rate": 6.296485462396909e-05, + "loss": 1.9818, + "step": 25130 + }, + { + "epoch": 2.9320966048302415, + "grad_norm": 1.3592578172683716, + "learning_rate": 6.295271618981788e-05, + "loss": 2.1959, + "step": 25131 + }, + { + "epoch": 2.932213277330533, + "grad_norm": 1.0455552339553833, + "learning_rate": 6.294057867362696e-05, + "loss": 1.9447, + "step": 25132 + }, + { + "epoch": 2.932329949830825, + "grad_norm": 0.9476732611656189, + "learning_rate": 6.292844207552214e-05, + "loss": 1.7781, + "step": 25133 + }, + { + "epoch": 2.9324466223311165, + "grad_norm": 1.0038632154464722, + "learning_rate": 6.291630639562926e-05, + "loss": 1.8593, + "step": 25134 + }, + { + "epoch": 2.932563294831408, + "grad_norm": 1.2677271366119385, + "learning_rate": 6.290417163407405e-05, + "loss": 2.0712, + "step": 25135 + }, + { + "epoch": 2.9326799673317, + "grad_norm": 0.8929181098937988, + "learning_rate": 6.289203779098245e-05, + "loss": 1.8242, + "step": 25136 + }, + { + "epoch": 2.9327966398319916, + "grad_norm": 1.0408169031143188, + "learning_rate": 6.28799048664801e-05, + "loss": 1.8579, + "step": 25137 + }, + { + "epoch": 2.9329133123322833, + "grad_norm": 0.950933575630188, + "learning_rate": 6.286777286069293e-05, + "loss": 1.8984, + "step": 25138 + }, + { + "epoch": 2.933029984832575, + "grad_norm": 1.1004410982131958, + "learning_rate": 6.285564177374655e-05, + "loss": 1.8777, + "step": 25139 + }, + { + "epoch": 2.9331466573328666, + "grad_norm": 1.1192351579666138, + "learning_rate": 6.284351160576687e-05, + "loss": 2.023, + "step": 25140 + }, + { + "epoch": 2.9332633298331583, + "grad_norm": 1.1758079528808594, + "learning_rate": 6.283138235687958e-05, + "loss": 1.934, + "step": 25141 + }, + { + "epoch": 2.93338000233345, + "grad_norm": 1.1113100051879883, + "learning_rate": 6.281925402721037e-05, + "loss": 1.8937, + "step": 25142 + }, + { + "epoch": 2.9334966748337417, + "grad_norm": 1.2462506294250488, + "learning_rate": 6.280712661688508e-05, + "loss": 2.1661, + "step": 25143 + }, + { + "epoch": 2.9336133473340333, + "grad_norm": 1.068766713142395, + "learning_rate": 6.27950001260293e-05, + "loss": 2.1352, + "step": 25144 + }, + { + "epoch": 2.933730019834325, + "grad_norm": 1.0820649862289429, + "learning_rate": 6.278287455476888e-05, + "loss": 1.8269, + "step": 25145 + }, + { + "epoch": 2.9338466923346167, + "grad_norm": 1.240290880203247, + "learning_rate": 6.277074990322944e-05, + "loss": 1.9178, + "step": 25146 + }, + { + "epoch": 2.9339633648349084, + "grad_norm": 1.1740734577178955, + "learning_rate": 6.275862617153673e-05, + "loss": 1.9591, + "step": 25147 + }, + { + "epoch": 2.9340800373352, + "grad_norm": 1.10285484790802, + "learning_rate": 6.274650335981635e-05, + "loss": 1.9103, + "step": 25148 + }, + { + "epoch": 2.9341967098354917, + "grad_norm": 1.1871103048324585, + "learning_rate": 6.27343814681941e-05, + "loss": 2.138, + "step": 25149 + }, + { + "epoch": 2.9343133823357834, + "grad_norm": 1.1234285831451416, + "learning_rate": 6.27222604967955e-05, + "loss": 1.9442, + "step": 25150 + }, + { + "epoch": 2.934430054836075, + "grad_norm": 1.4424636363983154, + "learning_rate": 6.271014044574636e-05, + "loss": 2.0127, + "step": 25151 + }, + { + "epoch": 2.934546727336367, + "grad_norm": 1.0045350790023804, + "learning_rate": 6.269802131517218e-05, + "loss": 1.8795, + "step": 25152 + }, + { + "epoch": 2.9346633998366585, + "grad_norm": 1.0974339246749878, + "learning_rate": 6.268590310519874e-05, + "loss": 2.0113, + "step": 25153 + }, + { + "epoch": 2.93478007233695, + "grad_norm": 1.172676682472229, + "learning_rate": 6.267378581595152e-05, + "loss": 1.9207, + "step": 25154 + }, + { + "epoch": 2.934896744837242, + "grad_norm": 0.9485155344009399, + "learning_rate": 6.266166944755624e-05, + "loss": 1.9013, + "step": 25155 + }, + { + "epoch": 2.9350134173375335, + "grad_norm": 1.337951898574829, + "learning_rate": 6.264955400013853e-05, + "loss": 2.1085, + "step": 25156 + }, + { + "epoch": 2.935130089837825, + "grad_norm": 1.098944067955017, + "learning_rate": 6.263743947382394e-05, + "loss": 1.8925, + "step": 25157 + }, + { + "epoch": 2.935246762338117, + "grad_norm": 1.3457030057907104, + "learning_rate": 6.262532586873803e-05, + "loss": 1.9848, + "step": 25158 + }, + { + "epoch": 2.9353634348384086, + "grad_norm": 1.0764638185501099, + "learning_rate": 6.261321318500645e-05, + "loss": 1.721, + "step": 25159 + }, + { + "epoch": 2.9354801073387002, + "grad_norm": 0.9333838224411011, + "learning_rate": 6.260110142275475e-05, + "loss": 1.8833, + "step": 25160 + }, + { + "epoch": 2.935596779838992, + "grad_norm": 1.1503510475158691, + "learning_rate": 6.258899058210841e-05, + "loss": 1.8212, + "step": 25161 + }, + { + "epoch": 2.9357134523392836, + "grad_norm": 1.0754505395889282, + "learning_rate": 6.25768806631931e-05, + "loss": 1.9381, + "step": 25162 + }, + { + "epoch": 2.9358301248395753, + "grad_norm": 1.2499061822891235, + "learning_rate": 6.256477166613428e-05, + "loss": 1.8581, + "step": 25163 + }, + { + "epoch": 2.935946797339867, + "grad_norm": 0.9937861561775208, + "learning_rate": 6.255266359105754e-05, + "loss": 1.8176, + "step": 25164 + }, + { + "epoch": 2.9360634698401586, + "grad_norm": 0.9675425291061401, + "learning_rate": 6.254055643808835e-05, + "loss": 1.9604, + "step": 25165 + }, + { + "epoch": 2.9361801423404503, + "grad_norm": 1.1034588813781738, + "learning_rate": 6.25284502073523e-05, + "loss": 2.1548, + "step": 25166 + }, + { + "epoch": 2.936296814840742, + "grad_norm": 1.2224886417388916, + "learning_rate": 6.251634489897477e-05, + "loss": 2.0281, + "step": 25167 + }, + { + "epoch": 2.9364134873410337, + "grad_norm": 1.106436014175415, + "learning_rate": 6.250424051308136e-05, + "loss": 1.8376, + "step": 25168 + }, + { + "epoch": 2.9365301598413254, + "grad_norm": 0.9399104714393616, + "learning_rate": 6.249213704979756e-05, + "loss": 1.88, + "step": 25169 + }, + { + "epoch": 2.936646832341617, + "grad_norm": 1.338207483291626, + "learning_rate": 6.248003450924876e-05, + "loss": 2.1878, + "step": 25170 + }, + { + "epoch": 2.9367635048419087, + "grad_norm": 1.0723638534545898, + "learning_rate": 6.246793289156052e-05, + "loss": 2.0033, + "step": 25171 + }, + { + "epoch": 2.9368801773422004, + "grad_norm": 0.942631185054779, + "learning_rate": 6.245583219685822e-05, + "loss": 1.7269, + "step": 25172 + }, + { + "epoch": 2.936996849842492, + "grad_norm": 1.1143075227737427, + "learning_rate": 6.244373242526737e-05, + "loss": 1.978, + "step": 25173 + }, + { + "epoch": 2.9371135223427838, + "grad_norm": 1.195264220237732, + "learning_rate": 6.243163357691333e-05, + "loss": 1.9866, + "step": 25174 + }, + { + "epoch": 2.9372301948430755, + "grad_norm": 1.076511025428772, + "learning_rate": 6.241953565192162e-05, + "loss": 1.9111, + "step": 25175 + }, + { + "epoch": 2.937346867343367, + "grad_norm": 1.2207403182983398, + "learning_rate": 6.240743865041761e-05, + "loss": 1.9384, + "step": 25176 + }, + { + "epoch": 2.937463539843659, + "grad_norm": 1.093578577041626, + "learning_rate": 6.239534257252673e-05, + "loss": 2.0104, + "step": 25177 + }, + { + "epoch": 2.9375802123439505, + "grad_norm": 1.127555251121521, + "learning_rate": 6.238324741837428e-05, + "loss": 2.1009, + "step": 25178 + }, + { + "epoch": 2.937696884844242, + "grad_norm": 1.100947618484497, + "learning_rate": 6.23711531880858e-05, + "loss": 2.0208, + "step": 25179 + }, + { + "epoch": 2.937813557344534, + "grad_norm": 1.2492430210113525, + "learning_rate": 6.235905988178653e-05, + "loss": 1.8677, + "step": 25180 + }, + { + "epoch": 2.9379302298448255, + "grad_norm": 0.8724475502967834, + "learning_rate": 6.23469674996019e-05, + "loss": 1.7299, + "step": 25181 + }, + { + "epoch": 2.9380469023451172, + "grad_norm": 0.9514425992965698, + "learning_rate": 6.233487604165734e-05, + "loss": 1.7538, + "step": 25182 + }, + { + "epoch": 2.938163574845409, + "grad_norm": 1.098815679550171, + "learning_rate": 6.232278550807809e-05, + "loss": 2.105, + "step": 25183 + }, + { + "epoch": 2.9382802473457006, + "grad_norm": 1.002455472946167, + "learning_rate": 6.231069589898959e-05, + "loss": 1.9079, + "step": 25184 + }, + { + "epoch": 2.9383969198459923, + "grad_norm": 1.3717081546783447, + "learning_rate": 6.229860721451706e-05, + "loss": 1.9753, + "step": 25185 + }, + { + "epoch": 2.938513592346284, + "grad_norm": 1.026258945465088, + "learning_rate": 6.228651945478594e-05, + "loss": 1.8985, + "step": 25186 + }, + { + "epoch": 2.9386302648465756, + "grad_norm": 0.9974349737167358, + "learning_rate": 6.227443261992142e-05, + "loss": 1.9014, + "step": 25187 + }, + { + "epoch": 2.9387469373468673, + "grad_norm": 1.085444450378418, + "learning_rate": 6.226234671004894e-05, + "loss": 1.8795, + "step": 25188 + }, + { + "epoch": 2.938863609847159, + "grad_norm": 1.0401966571807861, + "learning_rate": 6.225026172529367e-05, + "loss": 1.8228, + "step": 25189 + }, + { + "epoch": 2.9389802823474507, + "grad_norm": 1.0829147100448608, + "learning_rate": 6.2238177665781e-05, + "loss": 1.8915, + "step": 25190 + }, + { + "epoch": 2.9390969548477424, + "grad_norm": 1.2577656507492065, + "learning_rate": 6.222609453163608e-05, + "loss": 1.9049, + "step": 25191 + }, + { + "epoch": 2.939213627348034, + "grad_norm": 1.063856840133667, + "learning_rate": 6.22140123229843e-05, + "loss": 1.9082, + "step": 25192 + }, + { + "epoch": 2.9393302998483257, + "grad_norm": 1.0659300088882446, + "learning_rate": 6.220193103995086e-05, + "loss": 2.045, + "step": 25193 + }, + { + "epoch": 2.9394469723486174, + "grad_norm": 1.1148332357406616, + "learning_rate": 6.218985068266097e-05, + "loss": 1.9711, + "step": 25194 + }, + { + "epoch": 2.939563644848909, + "grad_norm": 1.1079661846160889, + "learning_rate": 6.217777125123993e-05, + "loss": 2.0341, + "step": 25195 + }, + { + "epoch": 2.9396803173492008, + "grad_norm": 1.1182079315185547, + "learning_rate": 6.21656927458129e-05, + "loss": 1.8859, + "step": 25196 + }, + { + "epoch": 2.9397969898494924, + "grad_norm": 1.1100574731826782, + "learning_rate": 6.215361516650516e-05, + "loss": 1.9648, + "step": 25197 + }, + { + "epoch": 2.939913662349784, + "grad_norm": 1.0969234704971313, + "learning_rate": 6.214153851344187e-05, + "loss": 2.0874, + "step": 25198 + }, + { + "epoch": 2.940030334850076, + "grad_norm": 0.9703131318092346, + "learning_rate": 6.212946278674828e-05, + "loss": 1.9053, + "step": 25199 + }, + { + "epoch": 2.9401470073503675, + "grad_norm": 1.0737367868423462, + "learning_rate": 6.21173879865495e-05, + "loss": 1.8635, + "step": 25200 + }, + { + "epoch": 2.940263679850659, + "grad_norm": 1.1816939115524292, + "learning_rate": 6.21053141129708e-05, + "loss": 2.0077, + "step": 25201 + }, + { + "epoch": 2.940380352350951, + "grad_norm": 1.0595226287841797, + "learning_rate": 6.209324116613726e-05, + "loss": 1.9077, + "step": 25202 + }, + { + "epoch": 2.9404970248512425, + "grad_norm": 0.9815572500228882, + "learning_rate": 6.208116914617412e-05, + "loss": 1.8488, + "step": 25203 + }, + { + "epoch": 2.940613697351534, + "grad_norm": 1.0575720071792603, + "learning_rate": 6.206909805320646e-05, + "loss": 1.8686, + "step": 25204 + }, + { + "epoch": 2.940730369851826, + "grad_norm": 1.0206468105316162, + "learning_rate": 6.205702788735949e-05, + "loss": 1.9878, + "step": 25205 + }, + { + "epoch": 2.9408470423521176, + "grad_norm": 1.1832536458969116, + "learning_rate": 6.204495864875824e-05, + "loss": 1.9654, + "step": 25206 + }, + { + "epoch": 2.9409637148524093, + "grad_norm": 1.156765341758728, + "learning_rate": 6.203289033752788e-05, + "loss": 1.8644, + "step": 25207 + }, + { + "epoch": 2.941080387352701, + "grad_norm": 1.0698704719543457, + "learning_rate": 6.20208229537936e-05, + "loss": 2.0658, + "step": 25208 + }, + { + "epoch": 2.9411970598529926, + "grad_norm": 1.2326308488845825, + "learning_rate": 6.200875649768044e-05, + "loss": 2.1057, + "step": 25209 + }, + { + "epoch": 2.9413137323532843, + "grad_norm": 1.209999918937683, + "learning_rate": 6.199669096931347e-05, + "loss": 1.8494, + "step": 25210 + }, + { + "epoch": 2.941430404853576, + "grad_norm": 1.015129566192627, + "learning_rate": 6.198462636881776e-05, + "loss": 2.0248, + "step": 25211 + }, + { + "epoch": 2.9415470773538677, + "grad_norm": 1.0390138626098633, + "learning_rate": 6.197256269631844e-05, + "loss": 1.8334, + "step": 25212 + }, + { + "epoch": 2.9416637498541593, + "grad_norm": 1.2443873882293701, + "learning_rate": 6.196049995194049e-05, + "loss": 1.8546, + "step": 25213 + }, + { + "epoch": 2.941780422354451, + "grad_norm": 1.127098560333252, + "learning_rate": 6.194843813580906e-05, + "loss": 2.0558, + "step": 25214 + }, + { + "epoch": 2.9418970948547427, + "grad_norm": 1.2019140720367432, + "learning_rate": 6.19363772480491e-05, + "loss": 2.1613, + "step": 25215 + }, + { + "epoch": 2.9420137673550344, + "grad_norm": 1.1073224544525146, + "learning_rate": 6.192431728878575e-05, + "loss": 2.0815, + "step": 25216 + }, + { + "epoch": 2.942130439855326, + "grad_norm": 1.1573008298873901, + "learning_rate": 6.191225825814391e-05, + "loss": 2.0267, + "step": 25217 + }, + { + "epoch": 2.9422471123556178, + "grad_norm": 1.2213400602340698, + "learning_rate": 6.190020015624875e-05, + "loss": 2.0016, + "step": 25218 + }, + { + "epoch": 2.9423637848559094, + "grad_norm": 1.1097280979156494, + "learning_rate": 6.18881429832251e-05, + "loss": 1.6346, + "step": 25219 + }, + { + "epoch": 2.942480457356201, + "grad_norm": 0.9809451699256897, + "learning_rate": 6.187608673919806e-05, + "loss": 1.7816, + "step": 25220 + }, + { + "epoch": 2.942597129856493, + "grad_norm": 1.0325387716293335, + "learning_rate": 6.186403142429264e-05, + "loss": 1.8678, + "step": 25221 + }, + { + "epoch": 2.9427138023567845, + "grad_norm": 1.2039858102798462, + "learning_rate": 6.185197703863374e-05, + "loss": 2.1632, + "step": 25222 + }, + { + "epoch": 2.942830474857076, + "grad_norm": 1.1828619241714478, + "learning_rate": 6.183992358234639e-05, + "loss": 2.0005, + "step": 25223 + }, + { + "epoch": 2.942947147357368, + "grad_norm": 1.2142918109893799, + "learning_rate": 6.182787105555547e-05, + "loss": 1.9158, + "step": 25224 + }, + { + "epoch": 2.9430638198576595, + "grad_norm": 1.081817865371704, + "learning_rate": 6.181581945838604e-05, + "loss": 2.0084, + "step": 25225 + }, + { + "epoch": 2.943180492357951, + "grad_norm": 1.0987476110458374, + "learning_rate": 6.180376879096294e-05, + "loss": 2.1306, + "step": 25226 + }, + { + "epoch": 2.943297164858243, + "grad_norm": 1.2233095169067383, + "learning_rate": 6.179171905341112e-05, + "loss": 1.9524, + "step": 25227 + }, + { + "epoch": 2.9434138373585346, + "grad_norm": 0.9995876550674438, + "learning_rate": 6.177967024585554e-05, + "loss": 1.8726, + "step": 25228 + }, + { + "epoch": 2.9435305098588262, + "grad_norm": 1.0795753002166748, + "learning_rate": 6.176762236842106e-05, + "loss": 1.7619, + "step": 25229 + }, + { + "epoch": 2.943647182359118, + "grad_norm": 1.0783804655075073, + "learning_rate": 6.175557542123259e-05, + "loss": 2.0128, + "step": 25230 + }, + { + "epoch": 2.9437638548594096, + "grad_norm": 1.1354385614395142, + "learning_rate": 6.174352940441504e-05, + "loss": 1.9482, + "step": 25231 + }, + { + "epoch": 2.9438805273597013, + "grad_norm": 1.116315245628357, + "learning_rate": 6.173148431809322e-05, + "loss": 1.998, + "step": 25232 + }, + { + "epoch": 2.943997199859993, + "grad_norm": 1.0268278121948242, + "learning_rate": 6.171944016239209e-05, + "loss": 1.9258, + "step": 25233 + }, + { + "epoch": 2.9441138723602847, + "grad_norm": 1.3011527061462402, + "learning_rate": 6.17073969374365e-05, + "loss": 2.006, + "step": 25234 + }, + { + "epoch": 2.9442305448605763, + "grad_norm": 0.9538992047309875, + "learning_rate": 6.169535464335122e-05, + "loss": 1.9294, + "step": 25235 + }, + { + "epoch": 2.944347217360868, + "grad_norm": 1.266460657119751, + "learning_rate": 6.168331328026121e-05, + "loss": 2.0988, + "step": 25236 + }, + { + "epoch": 2.9444638898611597, + "grad_norm": 1.2245877981185913, + "learning_rate": 6.16712728482912e-05, + "loss": 2.1409, + "step": 25237 + }, + { + "epoch": 2.9445805623614514, + "grad_norm": 1.2980598211288452, + "learning_rate": 6.165923334756608e-05, + "loss": 1.9361, + "step": 25238 + }, + { + "epoch": 2.944697234861743, + "grad_norm": 1.130829930305481, + "learning_rate": 6.164719477821061e-05, + "loss": 1.9716, + "step": 25239 + }, + { + "epoch": 2.9448139073620347, + "grad_norm": 0.9217222929000854, + "learning_rate": 6.163515714034964e-05, + "loss": 1.7865, + "step": 25240 + }, + { + "epoch": 2.9449305798623264, + "grad_norm": 1.1031843423843384, + "learning_rate": 6.162312043410792e-05, + "loss": 1.9542, + "step": 25241 + }, + { + "epoch": 2.945047252362618, + "grad_norm": 1.054753065109253, + "learning_rate": 6.161108465961029e-05, + "loss": 2.0274, + "step": 25242 + }, + { + "epoch": 2.94516392486291, + "grad_norm": 1.0658576488494873, + "learning_rate": 6.159904981698142e-05, + "loss": 1.9326, + "step": 25243 + }, + { + "epoch": 2.9452805973632015, + "grad_norm": 1.15060555934906, + "learning_rate": 6.15870159063462e-05, + "loss": 1.9369, + "step": 25244 + }, + { + "epoch": 2.945397269863493, + "grad_norm": 1.142728328704834, + "learning_rate": 6.157498292782934e-05, + "loss": 1.9371, + "step": 25245 + }, + { + "epoch": 2.945513942363785, + "grad_norm": 1.137795090675354, + "learning_rate": 6.15629508815555e-05, + "loss": 2.0038, + "step": 25246 + }, + { + "epoch": 2.9456306148640765, + "grad_norm": 1.1912271976470947, + "learning_rate": 6.155091976764954e-05, + "loss": 2.0856, + "step": 25247 + }, + { + "epoch": 2.945747287364368, + "grad_norm": 1.2012625932693481, + "learning_rate": 6.153888958623607e-05, + "loss": 2.0112, + "step": 25248 + }, + { + "epoch": 2.94586395986466, + "grad_norm": 1.0106201171875, + "learning_rate": 6.152686033743993e-05, + "loss": 2.0298, + "step": 25249 + }, + { + "epoch": 2.9459806323649516, + "grad_norm": 1.159196138381958, + "learning_rate": 6.15148320213857e-05, + "loss": 2.0344, + "step": 25250 + }, + { + "epoch": 2.9460973048652432, + "grad_norm": 1.151766300201416, + "learning_rate": 6.150280463819819e-05, + "loss": 1.8872, + "step": 25251 + }, + { + "epoch": 2.946213977365535, + "grad_norm": 1.2136714458465576, + "learning_rate": 6.149077818800199e-05, + "loss": 2.0252, + "step": 25252 + }, + { + "epoch": 2.9463306498658266, + "grad_norm": 1.2824416160583496, + "learning_rate": 6.147875267092186e-05, + "loss": 2.1785, + "step": 25253 + }, + { + "epoch": 2.9464473223661183, + "grad_norm": 1.2965198755264282, + "learning_rate": 6.146672808708236e-05, + "loss": 2.0143, + "step": 25254 + }, + { + "epoch": 2.94656399486641, + "grad_norm": 1.1082102060317993, + "learning_rate": 6.145470443660831e-05, + "loss": 2.1271, + "step": 25255 + }, + { + "epoch": 2.9466806673667016, + "grad_norm": 1.055772066116333, + "learning_rate": 6.144268171962418e-05, + "loss": 1.9958, + "step": 25256 + }, + { + "epoch": 2.9467973398669933, + "grad_norm": 1.12046480178833, + "learning_rate": 6.143065993625476e-05, + "loss": 1.9133, + "step": 25257 + }, + { + "epoch": 2.946914012367285, + "grad_norm": 1.1371784210205078, + "learning_rate": 6.141863908662455e-05, + "loss": 2.27, + "step": 25258 + }, + { + "epoch": 2.9470306848675767, + "grad_norm": 1.111033320426941, + "learning_rate": 6.140661917085829e-05, + "loss": 1.8333, + "step": 25259 + }, + { + "epoch": 2.9471473573678684, + "grad_norm": 0.9980785250663757, + "learning_rate": 6.139460018908047e-05, + "loss": 1.7783, + "step": 25260 + }, + { + "epoch": 2.94726402986816, + "grad_norm": 1.0360931158065796, + "learning_rate": 6.138258214141578e-05, + "loss": 1.8558, + "step": 25261 + }, + { + "epoch": 2.9473807023684517, + "grad_norm": 1.006590723991394, + "learning_rate": 6.13705650279888e-05, + "loss": 1.9641, + "step": 25262 + }, + { + "epoch": 2.9474973748687434, + "grad_norm": 1.0173972845077515, + "learning_rate": 6.135854884892402e-05, + "loss": 1.962, + "step": 25263 + }, + { + "epoch": 2.947614047369035, + "grad_norm": 1.1497441530227661, + "learning_rate": 6.134653360434615e-05, + "loss": 2.1716, + "step": 25264 + }, + { + "epoch": 2.9477307198693268, + "grad_norm": 1.1428290605545044, + "learning_rate": 6.133451929437961e-05, + "loss": 1.9679, + "step": 25265 + }, + { + "epoch": 2.9478473923696185, + "grad_norm": 1.1987216472625732, + "learning_rate": 6.132250591914908e-05, + "loss": 2.153, + "step": 25266 + }, + { + "epoch": 2.94796406486991, + "grad_norm": 1.030868649482727, + "learning_rate": 6.131049347877898e-05, + "loss": 1.9988, + "step": 25267 + }, + { + "epoch": 2.948080737370202, + "grad_norm": 1.1314458847045898, + "learning_rate": 6.129848197339397e-05, + "loss": 1.9768, + "step": 25268 + }, + { + "epoch": 2.9481974098704935, + "grad_norm": 0.9186486005783081, + "learning_rate": 6.128647140311844e-05, + "loss": 1.8748, + "step": 25269 + }, + { + "epoch": 2.948314082370785, + "grad_norm": 1.1814098358154297, + "learning_rate": 6.127446176807705e-05, + "loss": 2.1484, + "step": 25270 + }, + { + "epoch": 2.948430754871077, + "grad_norm": 1.1030341386795044, + "learning_rate": 6.126245306839412e-05, + "loss": 1.7982, + "step": 25271 + }, + { + "epoch": 2.9485474273713685, + "grad_norm": 1.0712313652038574, + "learning_rate": 6.125044530419433e-05, + "loss": 1.873, + "step": 25272 + }, + { + "epoch": 2.9486640998716602, + "grad_norm": 1.114302158355713, + "learning_rate": 6.1238438475602e-05, + "loss": 1.8514, + "step": 25273 + }, + { + "epoch": 2.948780772371952, + "grad_norm": 1.1401771306991577, + "learning_rate": 6.122643258274171e-05, + "loss": 2.0622, + "step": 25274 + }, + { + "epoch": 2.9488974448722436, + "grad_norm": 1.0271246433258057, + "learning_rate": 6.121442762573792e-05, + "loss": 1.829, + "step": 25275 + }, + { + "epoch": 2.9490141173725353, + "grad_norm": 1.1704003810882568, + "learning_rate": 6.120242360471501e-05, + "loss": 1.9429, + "step": 25276 + }, + { + "epoch": 2.949130789872827, + "grad_norm": 1.2693064212799072, + "learning_rate": 6.119042051979752e-05, + "loss": 2.0761, + "step": 25277 + }, + { + "epoch": 2.9492474623731186, + "grad_norm": 1.0329344272613525, + "learning_rate": 6.117841837110985e-05, + "loss": 1.9021, + "step": 25278 + }, + { + "epoch": 2.9493641348734103, + "grad_norm": 1.0319517850875854, + "learning_rate": 6.116641715877634e-05, + "loss": 2.0553, + "step": 25279 + }, + { + "epoch": 2.949480807373702, + "grad_norm": 1.0076757669448853, + "learning_rate": 6.115441688292155e-05, + "loss": 1.8378, + "step": 25280 + }, + { + "epoch": 2.9495974798739937, + "grad_norm": 1.133506417274475, + "learning_rate": 6.11424175436698e-05, + "loss": 2.0354, + "step": 25281 + }, + { + "epoch": 2.9497141523742854, + "grad_norm": 1.097952127456665, + "learning_rate": 6.113041914114545e-05, + "loss": 1.9616, + "step": 25282 + }, + { + "epoch": 2.949830824874577, + "grad_norm": 1.3097021579742432, + "learning_rate": 6.111842167547299e-05, + "loss": 2.0093, + "step": 25283 + }, + { + "epoch": 2.9499474973748687, + "grad_norm": 1.0581172704696655, + "learning_rate": 6.110642514677668e-05, + "loss": 1.9922, + "step": 25284 + }, + { + "epoch": 2.9500641698751604, + "grad_norm": 1.1015114784240723, + "learning_rate": 6.109442955518101e-05, + "loss": 1.8452, + "step": 25285 + }, + { + "epoch": 2.950180842375452, + "grad_norm": 0.8912085294723511, + "learning_rate": 6.108243490081021e-05, + "loss": 2.0602, + "step": 25286 + }, + { + "epoch": 2.9502975148757438, + "grad_norm": 1.0428740978240967, + "learning_rate": 6.10704411837887e-05, + "loss": 1.9518, + "step": 25287 + }, + { + "epoch": 2.9504141873760354, + "grad_norm": 0.9141280055046082, + "learning_rate": 6.105844840424086e-05, + "loss": 1.7149, + "step": 25288 + }, + { + "epoch": 2.950530859876327, + "grad_norm": 1.0762956142425537, + "learning_rate": 6.10464565622909e-05, + "loss": 2.0218, + "step": 25289 + }, + { + "epoch": 2.950647532376619, + "grad_norm": 1.2565287351608276, + "learning_rate": 6.103446565806329e-05, + "loss": 2.0026, + "step": 25290 + }, + { + "epoch": 2.9507642048769105, + "grad_norm": 1.1103825569152832, + "learning_rate": 6.102247569168218e-05, + "loss": 1.9211, + "step": 25291 + }, + { + "epoch": 2.950880877377202, + "grad_norm": 0.9801827669143677, + "learning_rate": 6.1010486663272e-05, + "loss": 2.0032, + "step": 25292 + }, + { + "epoch": 2.950997549877494, + "grad_norm": 1.1127749681472778, + "learning_rate": 6.0998498572956935e-05, + "loss": 1.8995, + "step": 25293 + }, + { + "epoch": 2.9511142223777855, + "grad_norm": 0.9669615626335144, + "learning_rate": 6.098651142086135e-05, + "loss": 1.9299, + "step": 25294 + }, + { + "epoch": 2.951230894878077, + "grad_norm": 1.2593748569488525, + "learning_rate": 6.0974525207109495e-05, + "loss": 1.9779, + "step": 25295 + }, + { + "epoch": 2.951347567378369, + "grad_norm": 1.1145753860473633, + "learning_rate": 6.096253993182553e-05, + "loss": 2.0904, + "step": 25296 + }, + { + "epoch": 2.9514642398786606, + "grad_norm": 1.1493964195251465, + "learning_rate": 6.095055559513387e-05, + "loss": 1.9944, + "step": 25297 + }, + { + "epoch": 2.9515809123789523, + "grad_norm": 1.199779987335205, + "learning_rate": 6.093857219715865e-05, + "loss": 1.9867, + "step": 25298 + }, + { + "epoch": 2.951697584879244, + "grad_norm": 1.042738914489746, + "learning_rate": 6.0926589738024066e-05, + "loss": 2.1398, + "step": 25299 + }, + { + "epoch": 2.9518142573795356, + "grad_norm": 1.2274013757705688, + "learning_rate": 6.0914608217854404e-05, + "loss": 1.9156, + "step": 25300 + }, + { + "epoch": 2.9519309298798273, + "grad_norm": 1.1832987070083618, + "learning_rate": 6.0902627636773914e-05, + "loss": 2.0788, + "step": 25301 + }, + { + "epoch": 2.952047602380119, + "grad_norm": 1.1347495317459106, + "learning_rate": 6.089064799490669e-05, + "loss": 2.0532, + "step": 25302 + }, + { + "epoch": 2.9521642748804107, + "grad_norm": 1.1186734437942505, + "learning_rate": 6.0878669292377056e-05, + "loss": 1.9684, + "step": 25303 + }, + { + "epoch": 2.9522809473807023, + "grad_norm": 1.099888801574707, + "learning_rate": 6.086669152930905e-05, + "loss": 1.9805, + "step": 25304 + }, + { + "epoch": 2.952397619880994, + "grad_norm": 1.1010816097259521, + "learning_rate": 6.085471470582697e-05, + "loss": 1.9192, + "step": 25305 + }, + { + "epoch": 2.9525142923812857, + "grad_norm": 1.3636404275894165, + "learning_rate": 6.084273882205488e-05, + "loss": 1.7581, + "step": 25306 + }, + { + "epoch": 2.9526309648815774, + "grad_norm": 1.19615638256073, + "learning_rate": 6.083076387811701e-05, + "loss": 2.0247, + "step": 25307 + }, + { + "epoch": 2.952747637381869, + "grad_norm": 1.1882585287094116, + "learning_rate": 6.081878987413742e-05, + "loss": 1.9861, + "step": 25308 + }, + { + "epoch": 2.9528643098821608, + "grad_norm": 1.0249545574188232, + "learning_rate": 6.080681681024036e-05, + "loss": 1.8684, + "step": 25309 + }, + { + "epoch": 2.9529809823824524, + "grad_norm": 1.0445880889892578, + "learning_rate": 6.079484468654982e-05, + "loss": 1.9677, + "step": 25310 + }, + { + "epoch": 2.953097654882744, + "grad_norm": 1.033809781074524, + "learning_rate": 6.078287350319003e-05, + "loss": 2.0257, + "step": 25311 + }, + { + "epoch": 2.953214327383036, + "grad_norm": 1.0441439151763916, + "learning_rate": 6.0770903260285004e-05, + "loss": 1.9432, + "step": 25312 + }, + { + "epoch": 2.9533309998833275, + "grad_norm": 1.0749033689498901, + "learning_rate": 6.0758933957958914e-05, + "loss": 1.9697, + "step": 25313 + }, + { + "epoch": 2.953447672383619, + "grad_norm": 0.995341956615448, + "learning_rate": 6.074696559633581e-05, + "loss": 1.896, + "step": 25314 + }, + { + "epoch": 2.953564344883911, + "grad_norm": 1.0459213256835938, + "learning_rate": 6.073499817553972e-05, + "loss": 1.8929, + "step": 25315 + }, + { + "epoch": 2.9536810173842025, + "grad_norm": 1.4436471462249756, + "learning_rate": 6.072303169569479e-05, + "loss": 2.1991, + "step": 25316 + }, + { + "epoch": 2.953797689884494, + "grad_norm": 1.1371809244155884, + "learning_rate": 6.071106615692499e-05, + "loss": 2.0872, + "step": 25317 + }, + { + "epoch": 2.953914362384786, + "grad_norm": 1.0326485633850098, + "learning_rate": 6.0699101559354455e-05, + "loss": 1.9247, + "step": 25318 + }, + { + "epoch": 2.9540310348850776, + "grad_norm": 1.2834668159484863, + "learning_rate": 6.0687137903107136e-05, + "loss": 2.158, + "step": 25319 + }, + { + "epoch": 2.9541477073853692, + "grad_norm": 1.0342416763305664, + "learning_rate": 6.067517518830715e-05, + "loss": 2.1671, + "step": 25320 + }, + { + "epoch": 2.954264379885661, + "grad_norm": 1.0607904195785522, + "learning_rate": 6.0663213415078414e-05, + "loss": 1.8167, + "step": 25321 + }, + { + "epoch": 2.9543810523859526, + "grad_norm": 1.1420258283615112, + "learning_rate": 6.0651252583545035e-05, + "loss": 2.0703, + "step": 25322 + }, + { + "epoch": 2.9544977248862443, + "grad_norm": 1.047405481338501, + "learning_rate": 6.063929269383092e-05, + "loss": 2.0901, + "step": 25323 + }, + { + "epoch": 2.954614397386536, + "grad_norm": 1.101117730140686, + "learning_rate": 6.062733374606012e-05, + "loss": 1.9303, + "step": 25324 + }, + { + "epoch": 2.9547310698868277, + "grad_norm": 1.1012598276138306, + "learning_rate": 6.061537574035657e-05, + "loss": 2.1087, + "step": 25325 + }, + { + "epoch": 2.9548477423871193, + "grad_norm": 0.9670316576957703, + "learning_rate": 6.060341867684422e-05, + "loss": 1.8415, + "step": 25326 + }, + { + "epoch": 2.954964414887411, + "grad_norm": 1.031083106994629, + "learning_rate": 6.0591462555647136e-05, + "loss": 1.7899, + "step": 25327 + }, + { + "epoch": 2.9550810873877027, + "grad_norm": 1.125357747077942, + "learning_rate": 6.057950737688914e-05, + "loss": 2.0237, + "step": 25328 + }, + { + "epoch": 2.9551977598879944, + "grad_norm": 1.0855793952941895, + "learning_rate": 6.0567553140694264e-05, + "loss": 1.7971, + "step": 25329 + }, + { + "epoch": 2.955314432388286, + "grad_norm": 1.1355693340301514, + "learning_rate": 6.05555998471864e-05, + "loss": 1.9042, + "step": 25330 + }, + { + "epoch": 2.9554311048885777, + "grad_norm": 1.0805069208145142, + "learning_rate": 6.054364749648945e-05, + "loss": 1.9446, + "step": 25331 + }, + { + "epoch": 2.9555477773888694, + "grad_norm": 0.8985477089881897, + "learning_rate": 6.0531696088727295e-05, + "loss": 1.674, + "step": 25332 + }, + { + "epoch": 2.955664449889161, + "grad_norm": 1.2163729667663574, + "learning_rate": 6.0519745624023916e-05, + "loss": 1.8871, + "step": 25333 + }, + { + "epoch": 2.955781122389453, + "grad_norm": 1.1382496356964111, + "learning_rate": 6.050779610250311e-05, + "loss": 1.8707, + "step": 25334 + }, + { + "epoch": 2.9558977948897445, + "grad_norm": 1.116172432899475, + "learning_rate": 6.049584752428885e-05, + "loss": 1.8739, + "step": 25335 + }, + { + "epoch": 2.956014467390036, + "grad_norm": 1.0834503173828125, + "learning_rate": 6.048389988950493e-05, + "loss": 2.0174, + "step": 25336 + }, + { + "epoch": 2.956131139890328, + "grad_norm": 1.1481913328170776, + "learning_rate": 6.047195319827527e-05, + "loss": 1.8543, + "step": 25337 + }, + { + "epoch": 2.9562478123906195, + "grad_norm": 1.069867730140686, + "learning_rate": 6.046000745072366e-05, + "loss": 1.9829, + "step": 25338 + }, + { + "epoch": 2.956364484890911, + "grad_norm": 0.9138146042823792, + "learning_rate": 6.044806264697396e-05, + "loss": 1.8804, + "step": 25339 + }, + { + "epoch": 2.956481157391203, + "grad_norm": 1.2481077909469604, + "learning_rate": 6.043611878715007e-05, + "loss": 1.8302, + "step": 25340 + }, + { + "epoch": 2.9565978298914946, + "grad_norm": 0.9907140135765076, + "learning_rate": 6.042417587137569e-05, + "loss": 1.8614, + "step": 25341 + }, + { + "epoch": 2.9567145023917862, + "grad_norm": 1.1796740293502808, + "learning_rate": 6.041223389977474e-05, + "loss": 1.9317, + "step": 25342 + }, + { + "epoch": 2.956831174892078, + "grad_norm": 1.1103508472442627, + "learning_rate": 6.040029287247093e-05, + "loss": 1.9381, + "step": 25343 + }, + { + "epoch": 2.9569478473923696, + "grad_norm": 1.0929940938949585, + "learning_rate": 6.038835278958815e-05, + "loss": 1.8271, + "step": 25344 + }, + { + "epoch": 2.9570645198926613, + "grad_norm": 1.2459537982940674, + "learning_rate": 6.0376413651250066e-05, + "loss": 1.7595, + "step": 25345 + }, + { + "epoch": 2.957181192392953, + "grad_norm": 0.9798278212547302, + "learning_rate": 6.0364475457580594e-05, + "loss": 1.6931, + "step": 25346 + }, + { + "epoch": 2.9572978648932446, + "grad_norm": 1.1387349367141724, + "learning_rate": 6.0352538208703374e-05, + "loss": 1.9674, + "step": 25347 + }, + { + "epoch": 2.9574145373935363, + "grad_norm": 1.2290126085281372, + "learning_rate": 6.034060190474217e-05, + "loss": 1.9057, + "step": 25348 + }, + { + "epoch": 2.957531209893828, + "grad_norm": 0.937784731388092, + "learning_rate": 6.0328666545820805e-05, + "loss": 1.7151, + "step": 25349 + }, + { + "epoch": 2.9576478823941197, + "grad_norm": 1.0790880918502808, + "learning_rate": 6.031673213206294e-05, + "loss": 1.9185, + "step": 25350 + }, + { + "epoch": 2.9577645548944114, + "grad_norm": 1.0731028318405151, + "learning_rate": 6.0304798663592276e-05, + "loss": 1.8258, + "step": 25351 + }, + { + "epoch": 2.957881227394703, + "grad_norm": 1.1755021810531616, + "learning_rate": 6.029286614053255e-05, + "loss": 2.0036, + "step": 25352 + }, + { + "epoch": 2.9579978998949947, + "grad_norm": 1.1699155569076538, + "learning_rate": 6.028093456300755e-05, + "loss": 2.011, + "step": 25353 + }, + { + "epoch": 2.9581145723952864, + "grad_norm": 0.9976935982704163, + "learning_rate": 6.0269003931140856e-05, + "loss": 1.9855, + "step": 25354 + }, + { + "epoch": 2.958231244895578, + "grad_norm": 0.996485710144043, + "learning_rate": 6.025707424505624e-05, + "loss": 2.0083, + "step": 25355 + }, + { + "epoch": 2.9583479173958698, + "grad_norm": 1.1318308115005493, + "learning_rate": 6.0245145504877275e-05, + "loss": 2.1008, + "step": 25356 + }, + { + "epoch": 2.9584645898961615, + "grad_norm": 1.2603625059127808, + "learning_rate": 6.023321771072774e-05, + "loss": 2.213, + "step": 25357 + }, + { + "epoch": 2.958581262396453, + "grad_norm": 1.2068321704864502, + "learning_rate": 6.022129086273118e-05, + "loss": 2.11, + "step": 25358 + }, + { + "epoch": 2.958697934896745, + "grad_norm": 1.1145488023757935, + "learning_rate": 6.020936496101133e-05, + "loss": 1.911, + "step": 25359 + }, + { + "epoch": 2.9588146073970365, + "grad_norm": 1.0230271816253662, + "learning_rate": 6.019744000569174e-05, + "loss": 2.0297, + "step": 25360 + }, + { + "epoch": 2.958931279897328, + "grad_norm": 1.049753189086914, + "learning_rate": 6.018551599689613e-05, + "loss": 1.9008, + "step": 25361 + }, + { + "epoch": 2.95904795239762, + "grad_norm": 1.1176910400390625, + "learning_rate": 6.017359293474803e-05, + "loss": 2.0401, + "step": 25362 + }, + { + "epoch": 2.9591646248979115, + "grad_norm": 1.3751842975616455, + "learning_rate": 6.0161670819371116e-05, + "loss": 2.1202, + "step": 25363 + }, + { + "epoch": 2.9592812973982032, + "grad_norm": 1.0450319051742554, + "learning_rate": 6.014974965088889e-05, + "loss": 1.9938, + "step": 25364 + }, + { + "epoch": 2.959397969898495, + "grad_norm": 1.0823755264282227, + "learning_rate": 6.0137829429425055e-05, + "loss": 1.978, + "step": 25365 + }, + { + "epoch": 2.9595146423987866, + "grad_norm": 1.0664342641830444, + "learning_rate": 6.012591015510312e-05, + "loss": 2.004, + "step": 25366 + }, + { + "epoch": 2.9596313148990783, + "grad_norm": 1.1911033391952515, + "learning_rate": 6.011399182804661e-05, + "loss": 2.089, + "step": 25367 + }, + { + "epoch": 2.95974798739937, + "grad_norm": 1.0806970596313477, + "learning_rate": 6.010207444837919e-05, + "loss": 1.8443, + "step": 25368 + }, + { + "epoch": 2.9598646598996616, + "grad_norm": 1.1490143537521362, + "learning_rate": 6.0090158016224274e-05, + "loss": 1.8491, + "step": 25369 + }, + { + "epoch": 2.9599813323999533, + "grad_norm": 1.1664642095565796, + "learning_rate": 6.007824253170554e-05, + "loss": 1.884, + "step": 25370 + }, + { + "epoch": 2.960098004900245, + "grad_norm": 1.0299127101898193, + "learning_rate": 6.0066327994946386e-05, + "loss": 1.8217, + "step": 25371 + }, + { + "epoch": 2.9602146774005367, + "grad_norm": 1.0537863969802856, + "learning_rate": 6.005441440607043e-05, + "loss": 1.9882, + "step": 25372 + }, + { + "epoch": 2.9603313499008284, + "grad_norm": 1.1702113151550293, + "learning_rate": 6.004250176520109e-05, + "loss": 1.9312, + "step": 25373 + }, + { + "epoch": 2.96044802240112, + "grad_norm": 1.1999951601028442, + "learning_rate": 6.003059007246196e-05, + "loss": 1.9935, + "step": 25374 + }, + { + "epoch": 2.9605646949014117, + "grad_norm": 1.1091355085372925, + "learning_rate": 6.001867932797641e-05, + "loss": 1.8876, + "step": 25375 + }, + { + "epoch": 2.9606813674017034, + "grad_norm": 1.1384047269821167, + "learning_rate": 6.0006769531868066e-05, + "loss": 1.9849, + "step": 25376 + }, + { + "epoch": 2.960798039901995, + "grad_norm": 1.2140803337097168, + "learning_rate": 5.9994860684260226e-05, + "loss": 1.8482, + "step": 25377 + }, + { + "epoch": 2.9609147124022868, + "grad_norm": 1.0602160692214966, + "learning_rate": 5.998295278527645e-05, + "loss": 2.0002, + "step": 25378 + }, + { + "epoch": 2.9610313849025784, + "grad_norm": 1.2122005224227905, + "learning_rate": 5.997104583504023e-05, + "loss": 2.1054, + "step": 25379 + }, + { + "epoch": 2.96114805740287, + "grad_norm": 1.0350794792175293, + "learning_rate": 5.995913983367493e-05, + "loss": 1.9021, + "step": 25380 + }, + { + "epoch": 2.961264729903162, + "grad_norm": 1.1273136138916016, + "learning_rate": 5.994723478130394e-05, + "loss": 1.9168, + "step": 25381 + }, + { + "epoch": 2.9613814024034535, + "grad_norm": 1.2833917140960693, + "learning_rate": 5.9935330678050786e-05, + "loss": 1.9732, + "step": 25382 + }, + { + "epoch": 2.961498074903745, + "grad_norm": 1.3741222620010376, + "learning_rate": 5.992342752403883e-05, + "loss": 2.1873, + "step": 25383 + }, + { + "epoch": 2.961614747404037, + "grad_norm": 1.1759439706802368, + "learning_rate": 5.991152531939141e-05, + "loss": 2.0186, + "step": 25384 + }, + { + "epoch": 2.9617314199043285, + "grad_norm": 1.048547387123108, + "learning_rate": 5.9899624064232e-05, + "loss": 1.9836, + "step": 25385 + }, + { + "epoch": 2.96184809240462, + "grad_norm": 1.2702308893203735, + "learning_rate": 5.988772375868391e-05, + "loss": 1.9824, + "step": 25386 + }, + { + "epoch": 2.961964764904912, + "grad_norm": 1.198149561882019, + "learning_rate": 5.987582440287059e-05, + "loss": 1.9983, + "step": 25387 + }, + { + "epoch": 2.9620814374052036, + "grad_norm": 1.1691302061080933, + "learning_rate": 5.9863925996915315e-05, + "loss": 1.9557, + "step": 25388 + }, + { + "epoch": 2.9621981099054953, + "grad_norm": 0.9845282435417175, + "learning_rate": 5.985202854094151e-05, + "loss": 1.857, + "step": 25389 + }, + { + "epoch": 2.962314782405787, + "grad_norm": 1.0696098804473877, + "learning_rate": 5.984013203507245e-05, + "loss": 2.0222, + "step": 25390 + }, + { + "epoch": 2.9624314549060786, + "grad_norm": 1.1485594511032104, + "learning_rate": 5.9828236479431466e-05, + "loss": 2.0531, + "step": 25391 + }, + { + "epoch": 2.9625481274063703, + "grad_norm": 0.9644242525100708, + "learning_rate": 5.9816341874141977e-05, + "loss": 1.9878, + "step": 25392 + }, + { + "epoch": 2.962664799906662, + "grad_norm": 1.151315689086914, + "learning_rate": 5.980444821932716e-05, + "loss": 1.9865, + "step": 25393 + }, + { + "epoch": 2.9627814724069537, + "grad_norm": 1.0048415660858154, + "learning_rate": 5.979255551511044e-05, + "loss": 1.8502, + "step": 25394 + }, + { + "epoch": 2.9628981449072453, + "grad_norm": 0.9807999730110168, + "learning_rate": 5.978066376161498e-05, + "loss": 1.7396, + "step": 25395 + }, + { + "epoch": 2.963014817407537, + "grad_norm": 1.1172431707382202, + "learning_rate": 5.9768772958964177e-05, + "loss": 1.961, + "step": 25396 + }, + { + "epoch": 2.9631314899078287, + "grad_norm": 1.2930911779403687, + "learning_rate": 5.97568831072812e-05, + "loss": 2.1912, + "step": 25397 + }, + { + "epoch": 2.9632481624081204, + "grad_norm": 1.2145192623138428, + "learning_rate": 5.974499420668943e-05, + "loss": 1.9566, + "step": 25398 + }, + { + "epoch": 2.963364834908412, + "grad_norm": 1.0959839820861816, + "learning_rate": 5.9733106257312016e-05, + "loss": 2.0216, + "step": 25399 + }, + { + "epoch": 2.9634815074087038, + "grad_norm": 1.1398221254348755, + "learning_rate": 5.972121925927225e-05, + "loss": 1.9461, + "step": 25400 + }, + { + "epoch": 2.9635981799089954, + "grad_norm": 1.0529710054397583, + "learning_rate": 5.970933321269328e-05, + "loss": 2.0151, + "step": 25401 + }, + { + "epoch": 2.963714852409287, + "grad_norm": 1.0699406862258911, + "learning_rate": 5.9697448117698454e-05, + "loss": 2.0041, + "step": 25402 + }, + { + "epoch": 2.963831524909579, + "grad_norm": 1.0444724559783936, + "learning_rate": 5.968556397441086e-05, + "loss": 2.0308, + "step": 25403 + }, + { + "epoch": 2.9639481974098705, + "grad_norm": 1.1858654022216797, + "learning_rate": 5.967368078295378e-05, + "loss": 1.8906, + "step": 25404 + }, + { + "epoch": 2.964064869910162, + "grad_norm": 1.0297893285751343, + "learning_rate": 5.966179854345041e-05, + "loss": 1.9248, + "step": 25405 + }, + { + "epoch": 2.964181542410454, + "grad_norm": 1.123887300491333, + "learning_rate": 5.964991725602387e-05, + "loss": 1.9034, + "step": 25406 + }, + { + "epoch": 2.9642982149107455, + "grad_norm": 1.149145483970642, + "learning_rate": 5.963803692079742e-05, + "loss": 1.7991, + "step": 25407 + }, + { + "epoch": 2.964414887411037, + "grad_norm": 1.0668299198150635, + "learning_rate": 5.962615753789413e-05, + "loss": 1.8363, + "step": 25408 + }, + { + "epoch": 2.964531559911329, + "grad_norm": 1.1296088695526123, + "learning_rate": 5.961427910743724e-05, + "loss": 2.0087, + "step": 25409 + }, + { + "epoch": 2.9646482324116206, + "grad_norm": 1.1396479606628418, + "learning_rate": 5.9602401629549784e-05, + "loss": 1.7408, + "step": 25410 + }, + { + "epoch": 2.9647649049119122, + "grad_norm": 1.268107533454895, + "learning_rate": 5.9590525104355015e-05, + "loss": 2.0515, + "step": 25411 + }, + { + "epoch": 2.964881577412204, + "grad_norm": 1.1910959482192993, + "learning_rate": 5.957864953197595e-05, + "loss": 1.9743, + "step": 25412 + }, + { + "epoch": 2.9649982499124956, + "grad_norm": 1.1113394498825073, + "learning_rate": 5.956677491253581e-05, + "loss": 1.8629, + "step": 25413 + }, + { + "epoch": 2.9651149224127873, + "grad_norm": 1.0492603778839111, + "learning_rate": 5.955490124615757e-05, + "loss": 1.9108, + "step": 25414 + }, + { + "epoch": 2.965231594913079, + "grad_norm": 0.9860180616378784, + "learning_rate": 5.9543028532964446e-05, + "loss": 1.9322, + "step": 25415 + }, + { + "epoch": 2.9653482674133707, + "grad_norm": 1.3999323844909668, + "learning_rate": 5.953115677307947e-05, + "loss": 1.9948, + "step": 25416 + }, + { + "epoch": 2.9654649399136623, + "grad_norm": 1.0494970083236694, + "learning_rate": 5.951928596662567e-05, + "loss": 1.8802, + "step": 25417 + }, + { + "epoch": 2.965581612413954, + "grad_norm": 1.0092673301696777, + "learning_rate": 5.9507416113726177e-05, + "loss": 1.83, + "step": 25418 + }, + { + "epoch": 2.9656982849142457, + "grad_norm": 1.153199315071106, + "learning_rate": 5.949554721450398e-05, + "loss": 2.1516, + "step": 25419 + }, + { + "epoch": 2.9658149574145374, + "grad_norm": 1.0468950271606445, + "learning_rate": 5.9483679269082225e-05, + "loss": 1.7237, + "step": 25420 + }, + { + "epoch": 2.965931629914829, + "grad_norm": 1.437819242477417, + "learning_rate": 5.9471812277583825e-05, + "loss": 2.0538, + "step": 25421 + }, + { + "epoch": 2.9660483024151207, + "grad_norm": 1.2118533849716187, + "learning_rate": 5.945994624013191e-05, + "loss": 1.8931, + "step": 25422 + }, + { + "epoch": 2.9661649749154124, + "grad_norm": 1.1001940965652466, + "learning_rate": 5.944808115684938e-05, + "loss": 2.1442, + "step": 25423 + }, + { + "epoch": 2.966281647415704, + "grad_norm": 1.3210104703903198, + "learning_rate": 5.943621702785936e-05, + "loss": 2.011, + "step": 25424 + }, + { + "epoch": 2.966398319915996, + "grad_norm": 1.0977915525436401, + "learning_rate": 5.942435385328474e-05, + "loss": 2.0613, + "step": 25425 + }, + { + "epoch": 2.9665149924162875, + "grad_norm": 1.178668737411499, + "learning_rate": 5.9412491633248604e-05, + "loss": 2.0179, + "step": 25426 + }, + { + "epoch": 2.966631664916579, + "grad_norm": 0.9932199120521545, + "learning_rate": 5.940063036787381e-05, + "loss": 1.6844, + "step": 25427 + }, + { + "epoch": 2.966748337416871, + "grad_norm": 0.9828704595565796, + "learning_rate": 5.938877005728346e-05, + "loss": 2.0017, + "step": 25428 + }, + { + "epoch": 2.9668650099171625, + "grad_norm": 1.1224067211151123, + "learning_rate": 5.937691070160037e-05, + "loss": 1.9765, + "step": 25429 + }, + { + "epoch": 2.966981682417454, + "grad_norm": 1.2650874853134155, + "learning_rate": 5.936505230094756e-05, + "loss": 2.0557, + "step": 25430 + }, + { + "epoch": 2.967098354917746, + "grad_norm": 1.0326666831970215, + "learning_rate": 5.935319485544799e-05, + "loss": 2.0219, + "step": 25431 + }, + { + "epoch": 2.9672150274180376, + "grad_norm": 1.0066983699798584, + "learning_rate": 5.9341338365224565e-05, + "loss": 1.817, + "step": 25432 + }, + { + "epoch": 2.9673316999183292, + "grad_norm": 1.3597615957260132, + "learning_rate": 5.9329482830400126e-05, + "loss": 1.8754, + "step": 25433 + }, + { + "epoch": 2.967448372418621, + "grad_norm": 1.2150236368179321, + "learning_rate": 5.931762825109769e-05, + "loss": 2.0265, + "step": 25434 + }, + { + "epoch": 2.9675650449189126, + "grad_norm": 1.1996365785598755, + "learning_rate": 5.930577462744011e-05, + "loss": 2.0862, + "step": 25435 + }, + { + "epoch": 2.9676817174192043, + "grad_norm": 1.0500061511993408, + "learning_rate": 5.929392195955019e-05, + "loss": 2.0452, + "step": 25436 + }, + { + "epoch": 2.967798389919496, + "grad_norm": 1.0010960102081299, + "learning_rate": 5.928207024755092e-05, + "loss": 2.0175, + "step": 25437 + }, + { + "epoch": 2.9679150624197876, + "grad_norm": 1.0928419828414917, + "learning_rate": 5.9270219491565094e-05, + "loss": 2.0058, + "step": 25438 + }, + { + "epoch": 2.9680317349200793, + "grad_norm": 1.2257369756698608, + "learning_rate": 5.925836969171564e-05, + "loss": 2.1508, + "step": 25439 + }, + { + "epoch": 2.968148407420371, + "grad_norm": 1.4227627515792847, + "learning_rate": 5.924652084812529e-05, + "loss": 1.8604, + "step": 25440 + }, + { + "epoch": 2.9682650799206627, + "grad_norm": 1.063901662826538, + "learning_rate": 5.9234672960917016e-05, + "loss": 1.9408, + "step": 25441 + }, + { + "epoch": 2.9683817524209544, + "grad_norm": 1.0771541595458984, + "learning_rate": 5.922282603021351e-05, + "loss": 2.015, + "step": 25442 + }, + { + "epoch": 2.968498424921246, + "grad_norm": 1.0592477321624756, + "learning_rate": 5.921098005613766e-05, + "loss": 1.9453, + "step": 25443 + }, + { + "epoch": 2.9686150974215377, + "grad_norm": 0.9822883605957031, + "learning_rate": 5.9199135038812326e-05, + "loss": 1.9809, + "step": 25444 + }, + { + "epoch": 2.9687317699218294, + "grad_norm": 1.116964340209961, + "learning_rate": 5.918729097836018e-05, + "loss": 1.9284, + "step": 25445 + }, + { + "epoch": 2.968848442422121, + "grad_norm": 1.120286464691162, + "learning_rate": 5.917544787490414e-05, + "loss": 1.867, + "step": 25446 + }, + { + "epoch": 2.9689651149224128, + "grad_norm": 1.1185153722763062, + "learning_rate": 5.916360572856684e-05, + "loss": 1.9063, + "step": 25447 + }, + { + "epoch": 2.9690817874227045, + "grad_norm": 1.0446265935897827, + "learning_rate": 5.915176453947118e-05, + "loss": 1.99, + "step": 25448 + }, + { + "epoch": 2.969198459922996, + "grad_norm": 0.9715766310691833, + "learning_rate": 5.91399243077398e-05, + "loss": 1.7906, + "step": 25449 + }, + { + "epoch": 2.969315132423288, + "grad_norm": 1.1842272281646729, + "learning_rate": 5.912808503349558e-05, + "loss": 2.1264, + "step": 25450 + }, + { + "epoch": 2.9694318049235795, + "grad_norm": 1.1267980337142944, + "learning_rate": 5.911624671686114e-05, + "loss": 1.9699, + "step": 25451 + }, + { + "epoch": 2.969548477423871, + "grad_norm": 1.0618674755096436, + "learning_rate": 5.910440935795928e-05, + "loss": 1.9642, + "step": 25452 + }, + { + "epoch": 2.969665149924163, + "grad_norm": 1.1520847082138062, + "learning_rate": 5.90925729569126e-05, + "loss": 2.0899, + "step": 25453 + }, + { + "epoch": 2.9697818224244545, + "grad_norm": 0.9295305013656616, + "learning_rate": 5.908073751384397e-05, + "loss": 1.8457, + "step": 25454 + }, + { + "epoch": 2.9698984949247462, + "grad_norm": 1.0576432943344116, + "learning_rate": 5.906890302887594e-05, + "loss": 1.9343, + "step": 25455 + }, + { + "epoch": 2.970015167425038, + "grad_norm": 1.2012362480163574, + "learning_rate": 5.905706950213125e-05, + "loss": 2.1323, + "step": 25456 + }, + { + "epoch": 2.9701318399253296, + "grad_norm": 1.485556960105896, + "learning_rate": 5.904523693373267e-05, + "loss": 1.9473, + "step": 25457 + }, + { + "epoch": 2.9702485124256213, + "grad_norm": 1.2613433599472046, + "learning_rate": 5.9033405323802715e-05, + "loss": 2.0364, + "step": 25458 + }, + { + "epoch": 2.970365184925913, + "grad_norm": 1.3042395114898682, + "learning_rate": 5.902157467246416e-05, + "loss": 1.9207, + "step": 25459 + }, + { + "epoch": 2.9704818574262046, + "grad_norm": 1.093355655670166, + "learning_rate": 5.9009744979839574e-05, + "loss": 1.8488, + "step": 25460 + }, + { + "epoch": 2.9705985299264963, + "grad_norm": 1.0353537797927856, + "learning_rate": 5.899791624605167e-05, + "loss": 1.9095, + "step": 25461 + }, + { + "epoch": 2.970715202426788, + "grad_norm": 1.114911675453186, + "learning_rate": 5.898608847122296e-05, + "loss": 2.0905, + "step": 25462 + }, + { + "epoch": 2.9708318749270797, + "grad_norm": 1.3392338752746582, + "learning_rate": 5.897426165547622e-05, + "loss": 2.0056, + "step": 25463 + }, + { + "epoch": 2.9709485474273714, + "grad_norm": 1.1941906213760376, + "learning_rate": 5.89624357989339e-05, + "loss": 2.0296, + "step": 25464 + }, + { + "epoch": 2.971065219927663, + "grad_norm": 1.0558289289474487, + "learning_rate": 5.895061090171871e-05, + "loss": 1.8464, + "step": 25465 + }, + { + "epoch": 2.9711818924279547, + "grad_norm": 0.9687297344207764, + "learning_rate": 5.8938786963953154e-05, + "loss": 1.8136, + "step": 25466 + }, + { + "epoch": 2.9712985649282464, + "grad_norm": 1.360217571258545, + "learning_rate": 5.892696398575989e-05, + "loss": 1.9624, + "step": 25467 + }, + { + "epoch": 2.971415237428538, + "grad_norm": 1.2282770872116089, + "learning_rate": 5.8915141967261454e-05, + "loss": 2.0579, + "step": 25468 + }, + { + "epoch": 2.9715319099288298, + "grad_norm": 1.1015149354934692, + "learning_rate": 5.890332090858035e-05, + "loss": 1.8728, + "step": 25469 + }, + { + "epoch": 2.9716485824291214, + "grad_norm": 1.3312867879867554, + "learning_rate": 5.889150080983922e-05, + "loss": 2.1534, + "step": 25470 + }, + { + "epoch": 2.971765254929413, + "grad_norm": 1.1494052410125732, + "learning_rate": 5.8879681671160494e-05, + "loss": 1.9845, + "step": 25471 + }, + { + "epoch": 2.971881927429705, + "grad_norm": 1.0720024108886719, + "learning_rate": 5.8867863492666825e-05, + "loss": 1.979, + "step": 25472 + }, + { + "epoch": 2.9719985999299965, + "grad_norm": 1.0727351903915405, + "learning_rate": 5.885604627448059e-05, + "loss": 1.8519, + "step": 25473 + }, + { + "epoch": 2.972115272430288, + "grad_norm": 1.10289466381073, + "learning_rate": 5.884423001672444e-05, + "loss": 2.1919, + "step": 25474 + }, + { + "epoch": 2.97223194493058, + "grad_norm": 1.1890894174575806, + "learning_rate": 5.883241471952075e-05, + "loss": 1.9607, + "step": 25475 + }, + { + "epoch": 2.9723486174308715, + "grad_norm": 1.053431510925293, + "learning_rate": 5.88206003829921e-05, + "loss": 2.0606, + "step": 25476 + }, + { + "epoch": 2.972465289931163, + "grad_norm": 1.1062939167022705, + "learning_rate": 5.8808787007260885e-05, + "loss": 1.9756, + "step": 25477 + }, + { + "epoch": 2.972581962431455, + "grad_norm": 0.9796121120452881, + "learning_rate": 5.8796974592449675e-05, + "loss": 1.7383, + "step": 25478 + }, + { + "epoch": 2.9726986349317466, + "grad_norm": 1.0754185914993286, + "learning_rate": 5.8785163138680815e-05, + "loss": 2.0071, + "step": 25479 + }, + { + "epoch": 2.9728153074320383, + "grad_norm": 1.186850666999817, + "learning_rate": 5.877335264607686e-05, + "loss": 1.9707, + "step": 25480 + }, + { + "epoch": 2.97293197993233, + "grad_norm": 1.0911157131195068, + "learning_rate": 5.876154311476016e-05, + "loss": 1.8348, + "step": 25481 + }, + { + "epoch": 2.9730486524326216, + "grad_norm": 1.0219212770462036, + "learning_rate": 5.8749734544853155e-05, + "loss": 1.8335, + "step": 25482 + }, + { + "epoch": 2.9731653249329133, + "grad_norm": 1.1440290212631226, + "learning_rate": 5.873792693647837e-05, + "loss": 1.9857, + "step": 25483 + }, + { + "epoch": 2.973281997433205, + "grad_norm": 1.1710293292999268, + "learning_rate": 5.872612028975811e-05, + "loss": 2.0483, + "step": 25484 + }, + { + "epoch": 2.9733986699334967, + "grad_norm": 0.9495840072631836, + "learning_rate": 5.87143146048148e-05, + "loss": 1.9224, + "step": 25485 + }, + { + "epoch": 2.9735153424337883, + "grad_norm": 1.1423492431640625, + "learning_rate": 5.870250988177076e-05, + "loss": 2.0364, + "step": 25486 + }, + { + "epoch": 2.97363201493408, + "grad_norm": 1.0731017589569092, + "learning_rate": 5.8690706120748504e-05, + "loss": 1.8278, + "step": 25487 + }, + { + "epoch": 2.9737486874343717, + "grad_norm": 1.0729737281799316, + "learning_rate": 5.867890332187026e-05, + "loss": 1.9899, + "step": 25488 + }, + { + "epoch": 2.9738653599346634, + "grad_norm": 0.9961547255516052, + "learning_rate": 5.866710148525852e-05, + "loss": 1.7913, + "step": 25489 + }, + { + "epoch": 2.973982032434955, + "grad_norm": 1.043949007987976, + "learning_rate": 5.865530061103551e-05, + "loss": 2.1021, + "step": 25490 + }, + { + "epoch": 2.9740987049352468, + "grad_norm": 1.017932653427124, + "learning_rate": 5.864350069932368e-05, + "loss": 1.9769, + "step": 25491 + }, + { + "epoch": 2.9742153774355384, + "grad_norm": 1.124234676361084, + "learning_rate": 5.863170175024524e-05, + "loss": 1.9623, + "step": 25492 + }, + { + "epoch": 2.97433204993583, + "grad_norm": 1.212998628616333, + "learning_rate": 5.861990376392263e-05, + "loss": 2.135, + "step": 25493 + }, + { + "epoch": 2.974448722436122, + "grad_norm": 1.1242963075637817, + "learning_rate": 5.8608106740478054e-05, + "loss": 2.0945, + "step": 25494 + }, + { + "epoch": 2.9745653949364135, + "grad_norm": 1.1268702745437622, + "learning_rate": 5.859631068003385e-05, + "loss": 1.9715, + "step": 25495 + }, + { + "epoch": 2.974682067436705, + "grad_norm": 0.9516973495483398, + "learning_rate": 5.858451558271236e-05, + "loss": 1.9241, + "step": 25496 + }, + { + "epoch": 2.974798739936997, + "grad_norm": 1.0083953142166138, + "learning_rate": 5.857272144863578e-05, + "loss": 1.7835, + "step": 25497 + }, + { + "epoch": 2.9749154124372885, + "grad_norm": 1.220950961112976, + "learning_rate": 5.856092827792645e-05, + "loss": 1.9393, + "step": 25498 + }, + { + "epoch": 2.97503208493758, + "grad_norm": 1.0655336380004883, + "learning_rate": 5.8549136070706546e-05, + "loss": 1.9101, + "step": 25499 + }, + { + "epoch": 2.975148757437872, + "grad_norm": 1.1172369718551636, + "learning_rate": 5.853734482709842e-05, + "loss": 1.915, + "step": 25500 + }, + { + "epoch": 2.9752654299381636, + "grad_norm": 1.0457441806793213, + "learning_rate": 5.852555454722426e-05, + "loss": 2.0541, + "step": 25501 + }, + { + "epoch": 2.9753821024384552, + "grad_norm": 1.1565393209457397, + "learning_rate": 5.851376523120621e-05, + "loss": 2.1066, + "step": 25502 + }, + { + "epoch": 2.975498774938747, + "grad_norm": 1.0906091928482056, + "learning_rate": 5.8501976879166644e-05, + "loss": 1.8827, + "step": 25503 + }, + { + "epoch": 2.9756154474390386, + "grad_norm": 1.0230902433395386, + "learning_rate": 5.849018949122769e-05, + "loss": 1.9343, + "step": 25504 + }, + { + "epoch": 2.9757321199393303, + "grad_norm": 1.1723624467849731, + "learning_rate": 5.84784030675115e-05, + "loss": 1.9468, + "step": 25505 + }, + { + "epoch": 2.975848792439622, + "grad_norm": 1.1890684366226196, + "learning_rate": 5.846661760814034e-05, + "loss": 1.8406, + "step": 25506 + }, + { + "epoch": 2.9759654649399137, + "grad_norm": 1.1652852296829224, + "learning_rate": 5.845483311323635e-05, + "loss": 1.9863, + "step": 25507 + }, + { + "epoch": 2.9760821374402053, + "grad_norm": 1.2075568437576294, + "learning_rate": 5.8443049582921694e-05, + "loss": 2.0482, + "step": 25508 + }, + { + "epoch": 2.976198809940497, + "grad_norm": 1.1677944660186768, + "learning_rate": 5.8431267017318585e-05, + "loss": 1.9311, + "step": 25509 + }, + { + "epoch": 2.9763154824407887, + "grad_norm": 1.3820313215255737, + "learning_rate": 5.841948541654909e-05, + "loss": 1.9526, + "step": 25510 + }, + { + "epoch": 2.9764321549410804, + "grad_norm": 0.9537005424499512, + "learning_rate": 5.8407704780735445e-05, + "loss": 1.9142, + "step": 25511 + }, + { + "epoch": 2.976548827441372, + "grad_norm": 1.2562263011932373, + "learning_rate": 5.839592510999967e-05, + "loss": 1.9112, + "step": 25512 + }, + { + "epoch": 2.9766654999416637, + "grad_norm": 1.1516231298446655, + "learning_rate": 5.8384146404463994e-05, + "loss": 2.0985, + "step": 25513 + }, + { + "epoch": 2.9767821724419554, + "grad_norm": 1.1262868642807007, + "learning_rate": 5.8372368664250416e-05, + "loss": 1.94, + "step": 25514 + }, + { + "epoch": 2.976898844942247, + "grad_norm": 0.9965476989746094, + "learning_rate": 5.836059188948115e-05, + "loss": 1.8334, + "step": 25515 + }, + { + "epoch": 2.977015517442539, + "grad_norm": 1.0504367351531982, + "learning_rate": 5.8348816080278154e-05, + "loss": 1.8494, + "step": 25516 + }, + { + "epoch": 2.9771321899428305, + "grad_norm": 1.25812828540802, + "learning_rate": 5.833704123676365e-05, + "loss": 2.1168, + "step": 25517 + }, + { + "epoch": 2.977248862443122, + "grad_norm": 1.0451548099517822, + "learning_rate": 5.832526735905955e-05, + "loss": 1.952, + "step": 25518 + }, + { + "epoch": 2.977365534943414, + "grad_norm": 1.0170786380767822, + "learning_rate": 5.831349444728806e-05, + "loss": 1.9915, + "step": 25519 + }, + { + "epoch": 2.9774822074437055, + "grad_norm": 1.106330156326294, + "learning_rate": 5.8301722501571174e-05, + "loss": 2.0218, + "step": 25520 + }, + { + "epoch": 2.977598879943997, + "grad_norm": 1.1687703132629395, + "learning_rate": 5.8289951522030856e-05, + "loss": 2.018, + "step": 25521 + }, + { + "epoch": 2.977715552444289, + "grad_norm": 1.1149778366088867, + "learning_rate": 5.827818150878925e-05, + "loss": 2.018, + "step": 25522 + }, + { + "epoch": 2.9778322249445806, + "grad_norm": 1.178065538406372, + "learning_rate": 5.826641246196829e-05, + "loss": 2.1039, + "step": 25523 + }, + { + "epoch": 2.9779488974448722, + "grad_norm": 1.1976630687713623, + "learning_rate": 5.825464438169003e-05, + "loss": 1.9801, + "step": 25524 + }, + { + "epoch": 2.978065569945164, + "grad_norm": 1.0095394849777222, + "learning_rate": 5.824287726807645e-05, + "loss": 1.8758, + "step": 25525 + }, + { + "epoch": 2.9781822424454556, + "grad_norm": 1.059532642364502, + "learning_rate": 5.823111112124955e-05, + "loss": 1.9871, + "step": 25526 + }, + { + "epoch": 2.9782989149457473, + "grad_norm": 1.1983393430709839, + "learning_rate": 5.8219345941331285e-05, + "loss": 1.9584, + "step": 25527 + }, + { + "epoch": 2.978415587446039, + "grad_norm": 0.9788383841514587, + "learning_rate": 5.8207581728443675e-05, + "loss": 2.1266, + "step": 25528 + }, + { + "epoch": 2.9785322599463306, + "grad_norm": 1.1014161109924316, + "learning_rate": 5.8195818482708605e-05, + "loss": 1.8198, + "step": 25529 + }, + { + "epoch": 2.9786489324466223, + "grad_norm": 1.2909973859786987, + "learning_rate": 5.8184056204248094e-05, + "loss": 1.9781, + "step": 25530 + }, + { + "epoch": 2.978765604946914, + "grad_norm": 1.0988365411758423, + "learning_rate": 5.817229489318402e-05, + "loss": 1.846, + "step": 25531 + }, + { + "epoch": 2.9788822774472057, + "grad_norm": 1.143738031387329, + "learning_rate": 5.816053454963838e-05, + "loss": 1.9474, + "step": 25532 + }, + { + "epoch": 2.9789989499474974, + "grad_norm": 1.1848876476287842, + "learning_rate": 5.814877517373299e-05, + "loss": 1.8139, + "step": 25533 + }, + { + "epoch": 2.979115622447789, + "grad_norm": 1.017322301864624, + "learning_rate": 5.813701676558989e-05, + "loss": 1.8725, + "step": 25534 + }, + { + "epoch": 2.9792322949480807, + "grad_norm": 1.0764238834381104, + "learning_rate": 5.8125259325330846e-05, + "loss": 2.0745, + "step": 25535 + }, + { + "epoch": 2.9793489674483724, + "grad_norm": 1.0840297937393188, + "learning_rate": 5.811350285307788e-05, + "loss": 2.0323, + "step": 25536 + }, + { + "epoch": 2.979465639948664, + "grad_norm": 1.098725438117981, + "learning_rate": 5.810174734895278e-05, + "loss": 1.9067, + "step": 25537 + }, + { + "epoch": 2.9795823124489558, + "grad_norm": 1.111599326133728, + "learning_rate": 5.808999281307739e-05, + "loss": 2.019, + "step": 25538 + }, + { + "epoch": 2.9796989849492475, + "grad_norm": 1.2033498287200928, + "learning_rate": 5.807823924557366e-05, + "loss": 2.0709, + "step": 25539 + }, + { + "epoch": 2.979815657449539, + "grad_norm": 1.0493375062942505, + "learning_rate": 5.806648664656332e-05, + "loss": 1.7881, + "step": 25540 + }, + { + "epoch": 2.979932329949831, + "grad_norm": 1.038886308670044, + "learning_rate": 5.8054735016168355e-05, + "loss": 2.0489, + "step": 25541 + }, + { + "epoch": 2.9800490024501225, + "grad_norm": 1.0687744617462158, + "learning_rate": 5.8042984354510444e-05, + "loss": 1.8719, + "step": 25542 + }, + { + "epoch": 2.980165674950414, + "grad_norm": 0.9863707423210144, + "learning_rate": 5.8031234661711524e-05, + "loss": 1.8415, + "step": 25543 + }, + { + "epoch": 2.980282347450706, + "grad_norm": 1.0219732522964478, + "learning_rate": 5.801948593789331e-05, + "loss": 1.9856, + "step": 25544 + }, + { + "epoch": 2.9803990199509975, + "grad_norm": 1.0494530200958252, + "learning_rate": 5.800773818317771e-05, + "loss": 1.8588, + "step": 25545 + }, + { + "epoch": 2.9805156924512892, + "grad_norm": 1.1474162340164185, + "learning_rate": 5.799599139768638e-05, + "loss": 1.9768, + "step": 25546 + }, + { + "epoch": 2.980632364951581, + "grad_norm": 1.1888949871063232, + "learning_rate": 5.7984245581541166e-05, + "loss": 1.9479, + "step": 25547 + }, + { + "epoch": 2.9807490374518726, + "grad_norm": 1.1956931352615356, + "learning_rate": 5.797250073486389e-05, + "loss": 2.0531, + "step": 25548 + }, + { + "epoch": 2.9808657099521643, + "grad_norm": 1.1459954977035522, + "learning_rate": 5.7960756857776193e-05, + "loss": 1.9908, + "step": 25549 + }, + { + "epoch": 2.980982382452456, + "grad_norm": 0.9880764484405518, + "learning_rate": 5.7949013950399935e-05, + "loss": 1.9224, + "step": 25550 + }, + { + "epoch": 2.9810990549527476, + "grad_norm": 1.1614022254943848, + "learning_rate": 5.793727201285674e-05, + "loss": 1.9774, + "step": 25551 + }, + { + "epoch": 2.9812157274530393, + "grad_norm": 0.979316771030426, + "learning_rate": 5.792553104526848e-05, + "loss": 1.7838, + "step": 25552 + }, + { + "epoch": 2.981332399953331, + "grad_norm": 1.2666516304016113, + "learning_rate": 5.791379104775676e-05, + "loss": 1.9315, + "step": 25553 + }, + { + "epoch": 2.9814490724536227, + "grad_norm": 1.152076005935669, + "learning_rate": 5.790205202044333e-05, + "loss": 1.9243, + "step": 25554 + }, + { + "epoch": 2.9815657449539144, + "grad_norm": 1.1990739107131958, + "learning_rate": 5.789031396344981e-05, + "loss": 2.1031, + "step": 25555 + }, + { + "epoch": 2.981682417454206, + "grad_norm": 1.2588900327682495, + "learning_rate": 5.7878576876898e-05, + "loss": 1.9847, + "step": 25556 + }, + { + "epoch": 2.9817990899544977, + "grad_norm": 1.0876121520996094, + "learning_rate": 5.7866840760909486e-05, + "loss": 1.9745, + "step": 25557 + }, + { + "epoch": 2.9819157624547894, + "grad_norm": 1.182890772819519, + "learning_rate": 5.785510561560601e-05, + "loss": 1.8572, + "step": 25558 + }, + { + "epoch": 2.982032434955081, + "grad_norm": 1.0134034156799316, + "learning_rate": 5.784337144110916e-05, + "loss": 1.9467, + "step": 25559 + }, + { + "epoch": 2.9821491074553728, + "grad_norm": 1.0250171422958374, + "learning_rate": 5.7831638237540626e-05, + "loss": 1.8728, + "step": 25560 + }, + { + "epoch": 2.9822657799556644, + "grad_norm": 1.0763758420944214, + "learning_rate": 5.7819906005022073e-05, + "loss": 1.9765, + "step": 25561 + }, + { + "epoch": 2.982382452455956, + "grad_norm": 1.091866135597229, + "learning_rate": 5.780817474367505e-05, + "loss": 2.0732, + "step": 25562 + }, + { + "epoch": 2.982499124956248, + "grad_norm": 1.2996344566345215, + "learning_rate": 5.7796444453621266e-05, + "loss": 2.0565, + "step": 25563 + }, + { + "epoch": 2.9826157974565395, + "grad_norm": 1.238960862159729, + "learning_rate": 5.778471513498222e-05, + "loss": 1.8974, + "step": 25564 + }, + { + "epoch": 2.982732469956831, + "grad_norm": 1.1975035667419434, + "learning_rate": 5.7772986787879635e-05, + "loss": 2.0003, + "step": 25565 + }, + { + "epoch": 2.982849142457123, + "grad_norm": 1.0179423093795776, + "learning_rate": 5.776125941243495e-05, + "loss": 1.9713, + "step": 25566 + }, + { + "epoch": 2.9829658149574145, + "grad_norm": 1.1334813833236694, + "learning_rate": 5.774953300876989e-05, + "loss": 1.9489, + "step": 25567 + }, + { + "epoch": 2.983082487457706, + "grad_norm": 0.9529379606246948, + "learning_rate": 5.7737807577005885e-05, + "loss": 1.884, + "step": 25568 + }, + { + "epoch": 2.983199159957998, + "grad_norm": 1.0599396228790283, + "learning_rate": 5.7726083117264627e-05, + "loss": 1.9604, + "step": 25569 + }, + { + "epoch": 2.9833158324582896, + "grad_norm": 1.1806578636169434, + "learning_rate": 5.771435962966759e-05, + "loss": 1.9239, + "step": 25570 + }, + { + "epoch": 2.9834325049585813, + "grad_norm": 1.1705870628356934, + "learning_rate": 5.7702637114336254e-05, + "loss": 1.964, + "step": 25571 + }, + { + "epoch": 2.983549177458873, + "grad_norm": 0.9087419509887695, + "learning_rate": 5.7690915571392266e-05, + "loss": 1.7228, + "step": 25572 + }, + { + "epoch": 2.9836658499591646, + "grad_norm": 1.0961990356445312, + "learning_rate": 5.7679195000957006e-05, + "loss": 1.9104, + "step": 25573 + }, + { + "epoch": 2.9837825224594563, + "grad_norm": 1.3271712064743042, + "learning_rate": 5.766747540315213e-05, + "loss": 1.851, + "step": 25574 + }, + { + "epoch": 2.983899194959748, + "grad_norm": 1.1052426099777222, + "learning_rate": 5.7655756778099005e-05, + "loss": 1.9489, + "step": 25575 + }, + { + "epoch": 2.9840158674600397, + "grad_norm": 1.0937303304672241, + "learning_rate": 5.7644039125919205e-05, + "loss": 2.0509, + "step": 25576 + }, + { + "epoch": 2.9841325399603313, + "grad_norm": 1.1222902536392212, + "learning_rate": 5.763232244673412e-05, + "loss": 1.8016, + "step": 25577 + }, + { + "epoch": 2.984249212460623, + "grad_norm": 0.9721499085426331, + "learning_rate": 5.762060674066534e-05, + "loss": 1.8799, + "step": 25578 + }, + { + "epoch": 2.9843658849609147, + "grad_norm": 1.1622499227523804, + "learning_rate": 5.760889200783416e-05, + "loss": 1.9265, + "step": 25579 + }, + { + "epoch": 2.9844825574612064, + "grad_norm": 1.2727779150009155, + "learning_rate": 5.759717824836219e-05, + "loss": 1.9252, + "step": 25580 + }, + { + "epoch": 2.984599229961498, + "grad_norm": 1.1905262470245361, + "learning_rate": 5.7585465462370726e-05, + "loss": 1.9404, + "step": 25581 + }, + { + "epoch": 2.9847159024617897, + "grad_norm": 1.1647471189498901, + "learning_rate": 5.7573753649981316e-05, + "loss": 1.9858, + "step": 25582 + }, + { + "epoch": 2.9848325749620814, + "grad_norm": 1.487775444984436, + "learning_rate": 5.7562042811315245e-05, + "loss": 1.9239, + "step": 25583 + }, + { + "epoch": 2.984949247462373, + "grad_norm": 1.013530969619751, + "learning_rate": 5.7550332946494054e-05, + "loss": 1.8575, + "step": 25584 + }, + { + "epoch": 2.985065919962665, + "grad_norm": 1.1136541366577148, + "learning_rate": 5.7538624055639e-05, + "loss": 2.033, + "step": 25585 + }, + { + "epoch": 2.9851825924629565, + "grad_norm": 1.270903468132019, + "learning_rate": 5.7526916138871604e-05, + "loss": 2.1241, + "step": 25586 + }, + { + "epoch": 2.985299264963248, + "grad_norm": 1.0121123790740967, + "learning_rate": 5.751520919631312e-05, + "loss": 1.9238, + "step": 25587 + }, + { + "epoch": 2.98541593746354, + "grad_norm": 0.9491339325904846, + "learning_rate": 5.750350322808503e-05, + "loss": 2.0068, + "step": 25588 + }, + { + "epoch": 2.9855326099638315, + "grad_norm": 1.0457607507705688, + "learning_rate": 5.74917982343086e-05, + "loss": 1.988, + "step": 25589 + }, + { + "epoch": 2.985649282464123, + "grad_norm": 1.3061268329620361, + "learning_rate": 5.748009421510517e-05, + "loss": 1.916, + "step": 25590 + }, + { + "epoch": 2.985765954964415, + "grad_norm": 0.9996154308319092, + "learning_rate": 5.746839117059615e-05, + "loss": 1.9687, + "step": 25591 + }, + { + "epoch": 2.9858826274647066, + "grad_norm": 1.239219307899475, + "learning_rate": 5.7456689100902764e-05, + "loss": 1.8676, + "step": 25592 + }, + { + "epoch": 2.9859992999649982, + "grad_norm": 1.1448390483856201, + "learning_rate": 5.7444988006146434e-05, + "loss": 2.0927, + "step": 25593 + }, + { + "epoch": 2.98611597246529, + "grad_norm": 1.0253994464874268, + "learning_rate": 5.743328788644837e-05, + "loss": 1.8507, + "step": 25594 + }, + { + "epoch": 2.9862326449655816, + "grad_norm": 1.1045310497283936, + "learning_rate": 5.742158874192995e-05, + "loss": 1.9366, + "step": 25595 + }, + { + "epoch": 2.9863493174658733, + "grad_norm": 1.0815207958221436, + "learning_rate": 5.7409890572712356e-05, + "loss": 2.047, + "step": 25596 + }, + { + "epoch": 2.986465989966165, + "grad_norm": 1.0663087368011475, + "learning_rate": 5.739819337891697e-05, + "loss": 2.0149, + "step": 25597 + }, + { + "epoch": 2.9865826624664567, + "grad_norm": 1.142771601676941, + "learning_rate": 5.738649716066498e-05, + "loss": 2.1, + "step": 25598 + }, + { + "epoch": 2.9866993349667483, + "grad_norm": 1.071779727935791, + "learning_rate": 5.737480191807771e-05, + "loss": 1.9395, + "step": 25599 + }, + { + "epoch": 2.98681600746704, + "grad_norm": 1.0114688873291016, + "learning_rate": 5.736310765127628e-05, + "loss": 1.9015, + "step": 25600 + }, + { + "epoch": 2.9869326799673317, + "grad_norm": 1.1099148988723755, + "learning_rate": 5.7351414360382025e-05, + "loss": 1.8915, + "step": 25601 + }, + { + "epoch": 2.9870493524676234, + "grad_norm": 1.1530849933624268, + "learning_rate": 5.7339722045516186e-05, + "loss": 1.9417, + "step": 25602 + }, + { + "epoch": 2.987166024967915, + "grad_norm": 1.1023391485214233, + "learning_rate": 5.73280307067999e-05, + "loss": 1.925, + "step": 25603 + }, + { + "epoch": 2.9872826974682067, + "grad_norm": 1.067866563796997, + "learning_rate": 5.731634034435444e-05, + "loss": 1.9281, + "step": 25604 + }, + { + "epoch": 2.9873993699684984, + "grad_norm": 1.1021673679351807, + "learning_rate": 5.730465095830097e-05, + "loss": 1.9033, + "step": 25605 + }, + { + "epoch": 2.98751604246879, + "grad_norm": 1.1422919034957886, + "learning_rate": 5.7292962548760674e-05, + "loss": 2.0024, + "step": 25606 + }, + { + "epoch": 2.987632714969082, + "grad_norm": 1.0742249488830566, + "learning_rate": 5.7281275115854654e-05, + "loss": 1.8942, + "step": 25607 + }, + { + "epoch": 2.9877493874693735, + "grad_norm": 1.0570878982543945, + "learning_rate": 5.726958865970419e-05, + "loss": 1.9279, + "step": 25608 + }, + { + "epoch": 2.987866059969665, + "grad_norm": 1.141356110572815, + "learning_rate": 5.725790318043032e-05, + "loss": 1.9878, + "step": 25609 + }, + { + "epoch": 2.987982732469957, + "grad_norm": 1.0260404348373413, + "learning_rate": 5.724621867815431e-05, + "loss": 1.9071, + "step": 25610 + }, + { + "epoch": 2.9880994049702485, + "grad_norm": 1.2499408721923828, + "learning_rate": 5.723453515299716e-05, + "loss": 2.1983, + "step": 25611 + }, + { + "epoch": 2.98821607747054, + "grad_norm": 1.2522363662719727, + "learning_rate": 5.722285260508013e-05, + "loss": 1.9781, + "step": 25612 + }, + { + "epoch": 2.988332749970832, + "grad_norm": 1.1254730224609375, + "learning_rate": 5.7211171034524195e-05, + "loss": 1.8356, + "step": 25613 + }, + { + "epoch": 2.9884494224711236, + "grad_norm": 1.1210662126541138, + "learning_rate": 5.7199490441450525e-05, + "loss": 2.1032, + "step": 25614 + }, + { + "epoch": 2.9885660949714152, + "grad_norm": 1.1425937414169312, + "learning_rate": 5.718781082598028e-05, + "loss": 1.9725, + "step": 25615 + }, + { + "epoch": 2.988682767471707, + "grad_norm": 1.1806392669677734, + "learning_rate": 5.717613218823441e-05, + "loss": 2.0898, + "step": 25616 + }, + { + "epoch": 2.9887994399719986, + "grad_norm": 1.0985788106918335, + "learning_rate": 5.7164454528334076e-05, + "loss": 1.9482, + "step": 25617 + }, + { + "epoch": 2.9889161124722903, + "grad_norm": 1.011552095413208, + "learning_rate": 5.7152777846400285e-05, + "loss": 1.9473, + "step": 25618 + }, + { + "epoch": 2.989032784972582, + "grad_norm": 1.0160664319992065, + "learning_rate": 5.714110214255414e-05, + "loss": 1.8472, + "step": 25619 + }, + { + "epoch": 2.9891494574728736, + "grad_norm": 1.068846344947815, + "learning_rate": 5.7129427416916626e-05, + "loss": 1.7943, + "step": 25620 + }, + { + "epoch": 2.9892661299731653, + "grad_norm": 1.1563427448272705, + "learning_rate": 5.711775366960884e-05, + "loss": 2.002, + "step": 25621 + }, + { + "epoch": 2.989382802473457, + "grad_norm": 1.0487691164016724, + "learning_rate": 5.7106080900751774e-05, + "loss": 1.8547, + "step": 25622 + }, + { + "epoch": 2.9894994749737487, + "grad_norm": 1.1355186700820923, + "learning_rate": 5.709440911046637e-05, + "loss": 1.9021, + "step": 25623 + }, + { + "epoch": 2.9896161474740404, + "grad_norm": 1.094379186630249, + "learning_rate": 5.708273829887375e-05, + "loss": 2.0253, + "step": 25624 + }, + { + "epoch": 2.989732819974332, + "grad_norm": 1.0699574947357178, + "learning_rate": 5.707106846609483e-05, + "loss": 1.9633, + "step": 25625 + }, + { + "epoch": 2.9898494924746237, + "grad_norm": 0.9778090715408325, + "learning_rate": 5.7059399612250554e-05, + "loss": 1.8923, + "step": 25626 + }, + { + "epoch": 2.9899661649749154, + "grad_norm": 1.1568926572799683, + "learning_rate": 5.7047731737461935e-05, + "loss": 1.8748, + "step": 25627 + }, + { + "epoch": 2.990082837475207, + "grad_norm": 1.1135072708129883, + "learning_rate": 5.703606484184999e-05, + "loss": 1.9696, + "step": 25628 + }, + { + "epoch": 2.9901995099754988, + "grad_norm": 1.1138169765472412, + "learning_rate": 5.7024398925535565e-05, + "loss": 2.1047, + "step": 25629 + }, + { + "epoch": 2.9903161824757905, + "grad_norm": 1.285209059715271, + "learning_rate": 5.70127339886397e-05, + "loss": 2.0197, + "step": 25630 + }, + { + "epoch": 2.990432854976082, + "grad_norm": 1.1320233345031738, + "learning_rate": 5.700107003128323e-05, + "loss": 1.9754, + "step": 25631 + }, + { + "epoch": 2.990549527476374, + "grad_norm": 0.9797199964523315, + "learning_rate": 5.698940705358715e-05, + "loss": 1.7662, + "step": 25632 + }, + { + "epoch": 2.9906661999766655, + "grad_norm": 1.0952249765396118, + "learning_rate": 5.6977745055672296e-05, + "loss": 1.9548, + "step": 25633 + }, + { + "epoch": 2.990782872476957, + "grad_norm": 1.125484824180603, + "learning_rate": 5.6966084037659654e-05, + "loss": 2.0024, + "step": 25634 + }, + { + "epoch": 2.990899544977249, + "grad_norm": 1.2195199728012085, + "learning_rate": 5.695442399967003e-05, + "loss": 2.0996, + "step": 25635 + }, + { + "epoch": 2.9910162174775405, + "grad_norm": 1.1827309131622314, + "learning_rate": 5.6942764941824365e-05, + "loss": 2.0319, + "step": 25636 + }, + { + "epoch": 2.991132889977832, + "grad_norm": 1.1157245635986328, + "learning_rate": 5.693110686424346e-05, + "loss": 1.9536, + "step": 25637 + }, + { + "epoch": 2.991249562478124, + "grad_norm": 1.1446791887283325, + "learning_rate": 5.6919449767048266e-05, + "loss": 2.0107, + "step": 25638 + }, + { + "epoch": 2.9913662349784156, + "grad_norm": 1.041357398033142, + "learning_rate": 5.690779365035957e-05, + "loss": 2.0428, + "step": 25639 + }, + { + "epoch": 2.9914829074787073, + "grad_norm": 1.0183727741241455, + "learning_rate": 5.689613851429819e-05, + "loss": 1.8495, + "step": 25640 + }, + { + "epoch": 2.991599579978999, + "grad_norm": 1.261600375175476, + "learning_rate": 5.6884484358985015e-05, + "loss": 1.924, + "step": 25641 + }, + { + "epoch": 2.9917162524792906, + "grad_norm": 1.1980278491973877, + "learning_rate": 5.6872831184540785e-05, + "loss": 2.002, + "step": 25642 + }, + { + "epoch": 2.9918329249795823, + "grad_norm": 0.9584138989448547, + "learning_rate": 5.6861178991086394e-05, + "loss": 1.9793, + "step": 25643 + }, + { + "epoch": 2.991949597479874, + "grad_norm": 1.1175774335861206, + "learning_rate": 5.684952777874254e-05, + "loss": 1.9362, + "step": 25644 + }, + { + "epoch": 2.9920662699801657, + "grad_norm": 1.3570444583892822, + "learning_rate": 5.6837877547630115e-05, + "loss": 2.1081, + "step": 25645 + }, + { + "epoch": 2.9921829424804574, + "grad_norm": 1.0298632383346558, + "learning_rate": 5.682622829786981e-05, + "loss": 1.7986, + "step": 25646 + }, + { + "epoch": 2.992299614980749, + "grad_norm": 1.0779762268066406, + "learning_rate": 5.6814580029582485e-05, + "loss": 1.9042, + "step": 25647 + }, + { + "epoch": 2.9924162874810407, + "grad_norm": 1.095400333404541, + "learning_rate": 5.6802932742888775e-05, + "loss": 1.8864, + "step": 25648 + }, + { + "epoch": 2.9925329599813324, + "grad_norm": 0.8225147724151611, + "learning_rate": 5.679128643790954e-05, + "loss": 1.6894, + "step": 25649 + }, + { + "epoch": 2.992649632481624, + "grad_norm": 0.9840691089630127, + "learning_rate": 5.677964111476542e-05, + "loss": 1.8021, + "step": 25650 + }, + { + "epoch": 2.9927663049819158, + "grad_norm": 1.2843300104141235, + "learning_rate": 5.676799677357724e-05, + "loss": 1.8659, + "step": 25651 + }, + { + "epoch": 2.9928829774822074, + "grad_norm": 1.2431199550628662, + "learning_rate": 5.6756353414465604e-05, + "loss": 2.1342, + "step": 25652 + }, + { + "epoch": 2.992999649982499, + "grad_norm": 1.3104904890060425, + "learning_rate": 5.674471103755129e-05, + "loss": 1.9366, + "step": 25653 + }, + { + "epoch": 2.993116322482791, + "grad_norm": 1.1958613395690918, + "learning_rate": 5.673306964295503e-05, + "loss": 1.8523, + "step": 25654 + }, + { + "epoch": 2.9932329949830825, + "grad_norm": 1.0393128395080566, + "learning_rate": 5.672142923079745e-05, + "loss": 1.9091, + "step": 25655 + }, + { + "epoch": 2.993349667483374, + "grad_norm": 1.1386079788208008, + "learning_rate": 5.670978980119918e-05, + "loss": 2.038, + "step": 25656 + }, + { + "epoch": 2.993466339983666, + "grad_norm": 1.0961486101150513, + "learning_rate": 5.669815135428101e-05, + "loss": 2.0155, + "step": 25657 + }, + { + "epoch": 2.9935830124839575, + "grad_norm": 0.9885286092758179, + "learning_rate": 5.668651389016351e-05, + "loss": 1.5711, + "step": 25658 + }, + { + "epoch": 2.993699684984249, + "grad_norm": 1.06369149684906, + "learning_rate": 5.667487740896728e-05, + "loss": 1.7951, + "step": 25659 + }, + { + "epoch": 2.993816357484541, + "grad_norm": 1.049058437347412, + "learning_rate": 5.666324191081307e-05, + "loss": 1.97, + "step": 25660 + }, + { + "epoch": 2.9939330299848326, + "grad_norm": 1.027007818222046, + "learning_rate": 5.665160739582139e-05, + "loss": 1.766, + "step": 25661 + }, + { + "epoch": 2.9940497024851243, + "grad_norm": 1.106170892715454, + "learning_rate": 5.6639973864112966e-05, + "loss": 2.14, + "step": 25662 + }, + { + "epoch": 2.994166374985416, + "grad_norm": 0.959161102771759, + "learning_rate": 5.662834131580829e-05, + "loss": 1.9712, + "step": 25663 + }, + { + "epoch": 2.9942830474857076, + "grad_norm": 0.9469273090362549, + "learning_rate": 5.661670975102806e-05, + "loss": 1.8898, + "step": 25664 + }, + { + "epoch": 2.9943997199859993, + "grad_norm": 1.2648929357528687, + "learning_rate": 5.660507916989277e-05, + "loss": 2.1154, + "step": 25665 + }, + { + "epoch": 2.994516392486291, + "grad_norm": 1.1217617988586426, + "learning_rate": 5.659344957252302e-05, + "loss": 1.9718, + "step": 25666 + }, + { + "epoch": 2.9946330649865827, + "grad_norm": 1.1281402111053467, + "learning_rate": 5.658182095903946e-05, + "loss": 1.8354, + "step": 25667 + }, + { + "epoch": 2.9947497374868743, + "grad_norm": 1.0063737630844116, + "learning_rate": 5.6570193329562485e-05, + "loss": 1.8425, + "step": 25668 + }, + { + "epoch": 2.994866409987166, + "grad_norm": 1.087354302406311, + "learning_rate": 5.655856668421279e-05, + "loss": 1.8501, + "step": 25669 + }, + { + "epoch": 2.9949830824874577, + "grad_norm": 1.0499557256698608, + "learning_rate": 5.654694102311077e-05, + "loss": 1.9361, + "step": 25670 + }, + { + "epoch": 2.9950997549877494, + "grad_norm": 1.1167160272598267, + "learning_rate": 5.653531634637707e-05, + "loss": 1.9392, + "step": 25671 + }, + { + "epoch": 2.995216427488041, + "grad_norm": 1.0886069536209106, + "learning_rate": 5.652369265413212e-05, + "loss": 1.7922, + "step": 25672 + }, + { + "epoch": 2.9953330999883327, + "grad_norm": 1.116902470588684, + "learning_rate": 5.651206994649647e-05, + "loss": 1.7946, + "step": 25673 + }, + { + "epoch": 2.9954497724886244, + "grad_norm": 1.0565261840820312, + "learning_rate": 5.650044822359061e-05, + "loss": 2.1207, + "step": 25674 + }, + { + "epoch": 2.995566444988916, + "grad_norm": 1.1489028930664062, + "learning_rate": 5.6488827485535e-05, + "loss": 2.0526, + "step": 25675 + }, + { + "epoch": 2.995683117489208, + "grad_norm": 1.0998469591140747, + "learning_rate": 5.647720773245005e-05, + "loss": 1.8956, + "step": 25676 + }, + { + "epoch": 2.9957997899894995, + "grad_norm": 1.0497658252716064, + "learning_rate": 5.6465588964456336e-05, + "loss": 1.9646, + "step": 25677 + }, + { + "epoch": 2.995916462489791, + "grad_norm": 1.2984826564788818, + "learning_rate": 5.64539711816742e-05, + "loss": 2.0581, + "step": 25678 + }, + { + "epoch": 2.996033134990083, + "grad_norm": 1.1118053197860718, + "learning_rate": 5.644235438422415e-05, + "loss": 1.875, + "step": 25679 + }, + { + "epoch": 2.9961498074903745, + "grad_norm": 1.0291171073913574, + "learning_rate": 5.6430738572226625e-05, + "loss": 1.9054, + "step": 25680 + }, + { + "epoch": 2.996266479990666, + "grad_norm": 0.9980053305625916, + "learning_rate": 5.6419123745802e-05, + "loss": 1.9494, + "step": 25681 + }, + { + "epoch": 2.996383152490958, + "grad_norm": 0.8587753772735596, + "learning_rate": 5.640750990507074e-05, + "loss": 1.7574, + "step": 25682 + }, + { + "epoch": 2.9964998249912496, + "grad_norm": 1.1954172849655151, + "learning_rate": 5.6395897050153185e-05, + "loss": 1.7864, + "step": 25683 + }, + { + "epoch": 2.9966164974915412, + "grad_norm": 1.064708948135376, + "learning_rate": 5.6384285181169784e-05, + "loss": 1.8796, + "step": 25684 + }, + { + "epoch": 2.996733169991833, + "grad_norm": 1.1986056566238403, + "learning_rate": 5.637267429824084e-05, + "loss": 2.0566, + "step": 25685 + }, + { + "epoch": 2.9968498424921246, + "grad_norm": 1.2993873357772827, + "learning_rate": 5.63610644014868e-05, + "loss": 2.0167, + "step": 25686 + }, + { + "epoch": 2.9969665149924163, + "grad_norm": 1.1420224905014038, + "learning_rate": 5.6349455491027964e-05, + "loss": 1.8357, + "step": 25687 + }, + { + "epoch": 2.997083187492708, + "grad_norm": 1.198864459991455, + "learning_rate": 5.633784756698473e-05, + "loss": 1.9376, + "step": 25688 + }, + { + "epoch": 2.9971998599929996, + "grad_norm": 1.0465956926345825, + "learning_rate": 5.6326240629477385e-05, + "loss": 1.958, + "step": 25689 + }, + { + "epoch": 2.9973165324932913, + "grad_norm": 1.253801703453064, + "learning_rate": 5.631463467862632e-05, + "loss": 2.0292, + "step": 25690 + }, + { + "epoch": 2.997433204993583, + "grad_norm": 0.9830984473228455, + "learning_rate": 5.630302971455181e-05, + "loss": 1.9701, + "step": 25691 + }, + { + "epoch": 2.9975498774938747, + "grad_norm": 0.9202513098716736, + "learning_rate": 5.629142573737411e-05, + "loss": 1.8207, + "step": 25692 + }, + { + "epoch": 2.9976665499941664, + "grad_norm": 1.0402953624725342, + "learning_rate": 5.627982274721364e-05, + "loss": 1.9881, + "step": 25693 + }, + { + "epoch": 2.997783222494458, + "grad_norm": 1.141993761062622, + "learning_rate": 5.626822074419057e-05, + "loss": 2.0567, + "step": 25694 + }, + { + "epoch": 2.9978998949947497, + "grad_norm": 1.0957040786743164, + "learning_rate": 5.625661972842527e-05, + "loss": 2.0533, + "step": 25695 + }, + { + "epoch": 2.9980165674950414, + "grad_norm": 1.1000878810882568, + "learning_rate": 5.624501970003792e-05, + "loss": 1.9155, + "step": 25696 + }, + { + "epoch": 2.998133239995333, + "grad_norm": 1.0642818212509155, + "learning_rate": 5.6233420659148874e-05, + "loss": 2.0496, + "step": 25697 + }, + { + "epoch": 2.998249912495625, + "grad_norm": 1.0214756727218628, + "learning_rate": 5.622182260587828e-05, + "loss": 1.8255, + "step": 25698 + }, + { + "epoch": 2.9983665849959165, + "grad_norm": 1.0055407285690308, + "learning_rate": 5.621022554034644e-05, + "loss": 1.7403, + "step": 25699 + }, + { + "epoch": 2.998483257496208, + "grad_norm": 1.1187334060668945, + "learning_rate": 5.619862946267354e-05, + "loss": 1.7977, + "step": 25700 + }, + { + "epoch": 2.9985999299965, + "grad_norm": 1.2278317213058472, + "learning_rate": 5.6187034372979854e-05, + "loss": 2.1035, + "step": 25701 + }, + { + "epoch": 2.9987166024967915, + "grad_norm": 0.9827549457550049, + "learning_rate": 5.6175440271385494e-05, + "loss": 1.912, + "step": 25702 + }, + { + "epoch": 2.998833274997083, + "grad_norm": 1.1147273778915405, + "learning_rate": 5.616384715801075e-05, + "loss": 2.0478, + "step": 25703 + }, + { + "epoch": 2.998949947497375, + "grad_norm": 1.0609878301620483, + "learning_rate": 5.615225503297572e-05, + "loss": 1.8487, + "step": 25704 + }, + { + "epoch": 2.9990666199976665, + "grad_norm": 1.0822064876556396, + "learning_rate": 5.614066389640062e-05, + "loss": 2.1925, + "step": 25705 + }, + { + "epoch": 2.9991832924979582, + "grad_norm": 1.0870463848114014, + "learning_rate": 5.612907374840568e-05, + "loss": 1.9338, + "step": 25706 + }, + { + "epoch": 2.99929996499825, + "grad_norm": 1.013463020324707, + "learning_rate": 5.611748458911098e-05, + "loss": 1.8794, + "step": 25707 + }, + { + "epoch": 2.9994166374985416, + "grad_norm": 1.0087389945983887, + "learning_rate": 5.6105896418636625e-05, + "loss": 1.9031, + "step": 25708 + }, + { + "epoch": 2.9995333099988333, + "grad_norm": 1.0661334991455078, + "learning_rate": 5.609430923710284e-05, + "loss": 1.9215, + "step": 25709 + }, + { + "epoch": 2.999649982499125, + "grad_norm": 1.1778652667999268, + "learning_rate": 5.608272304462971e-05, + "loss": 2.0423, + "step": 25710 + }, + { + "epoch": 2.9997666549994166, + "grad_norm": 1.043257236480713, + "learning_rate": 5.607113784133731e-05, + "loss": 1.898, + "step": 25711 + }, + { + "epoch": 2.9998833274997083, + "grad_norm": 1.1265394687652588, + "learning_rate": 5.6059553627345804e-05, + "loss": 2.0543, + "step": 25712 + }, + { + "epoch": 3.0, + "grad_norm": 1.3341724872589111, + "learning_rate": 5.6047970402775206e-05, + "loss": 1.9947, + "step": 25713 + }, + { + "epoch": 3.0001166725002917, + "grad_norm": 1.0815556049346924, + "learning_rate": 5.603638816774571e-05, + "loss": 1.8271, + "step": 25714 + }, + { + "epoch": 3.0002333450005834, + "grad_norm": 0.911355197429657, + "learning_rate": 5.602480692237727e-05, + "loss": 1.6642, + "step": 25715 + }, + { + "epoch": 3.000350017500875, + "grad_norm": 1.0640026330947876, + "learning_rate": 5.6013226666790054e-05, + "loss": 1.7728, + "step": 25716 + }, + { + "epoch": 3.0004666900011667, + "grad_norm": 1.0728137493133545, + "learning_rate": 5.600164740110401e-05, + "loss": 1.9023, + "step": 25717 + }, + { + "epoch": 3.0005833625014584, + "grad_norm": 0.9731166958808899, + "learning_rate": 5.599006912543923e-05, + "loss": 1.8885, + "step": 25718 + }, + { + "epoch": 3.00070003500175, + "grad_norm": 1.1075003147125244, + "learning_rate": 5.5978491839915805e-05, + "loss": 1.6243, + "step": 25719 + }, + { + "epoch": 3.0008167075020418, + "grad_norm": 1.1637756824493408, + "learning_rate": 5.596691554465364e-05, + "loss": 1.8614, + "step": 25720 + }, + { + "epoch": 3.0009333800023335, + "grad_norm": 1.1141879558563232, + "learning_rate": 5.595534023977286e-05, + "loss": 1.7834, + "step": 25721 + }, + { + "epoch": 3.001050052502625, + "grad_norm": 1.0302120447158813, + "learning_rate": 5.594376592539334e-05, + "loss": 1.868, + "step": 25722 + }, + { + "epoch": 3.001166725002917, + "grad_norm": 1.0744282007217407, + "learning_rate": 5.593219260163521e-05, + "loss": 1.8387, + "step": 25723 + }, + { + "epoch": 3.0012833975032085, + "grad_norm": 1.2348875999450684, + "learning_rate": 5.5920620268618355e-05, + "loss": 1.8913, + "step": 25724 + }, + { + "epoch": 3.0014000700035, + "grad_norm": 1.106575846672058, + "learning_rate": 5.590904892646273e-05, + "loss": 1.6589, + "step": 25725 + }, + { + "epoch": 3.001516742503792, + "grad_norm": 1.0549136400222778, + "learning_rate": 5.589747857528838e-05, + "loss": 1.8003, + "step": 25726 + }, + { + "epoch": 3.0016334150040835, + "grad_norm": 1.0813567638397217, + "learning_rate": 5.5885909215215186e-05, + "loss": 1.7063, + "step": 25727 + }, + { + "epoch": 3.001750087504375, + "grad_norm": 1.0342103242874146, + "learning_rate": 5.5874340846363064e-05, + "loss": 1.8693, + "step": 25728 + }, + { + "epoch": 3.001866760004667, + "grad_norm": 1.1489616632461548, + "learning_rate": 5.586277346885204e-05, + "loss": 1.885, + "step": 25729 + }, + { + "epoch": 3.0019834325049586, + "grad_norm": 1.1485087871551514, + "learning_rate": 5.585120708280191e-05, + "loss": 1.9132, + "step": 25730 + }, + { + "epoch": 3.0021001050052503, + "grad_norm": 1.1593955755233765, + "learning_rate": 5.583964168833264e-05, + "loss": 1.8109, + "step": 25731 + }, + { + "epoch": 3.002216777505542, + "grad_norm": 1.3569473028182983, + "learning_rate": 5.5828077285564194e-05, + "loss": 1.9488, + "step": 25732 + }, + { + "epoch": 3.0023334500058336, + "grad_norm": 1.0843533277511597, + "learning_rate": 5.5816513874616344e-05, + "loss": 1.8545, + "step": 25733 + }, + { + "epoch": 3.0024501225061253, + "grad_norm": 1.251214861869812, + "learning_rate": 5.580495145560907e-05, + "loss": 1.926, + "step": 25734 + }, + { + "epoch": 3.002566795006417, + "grad_norm": 1.0312089920043945, + "learning_rate": 5.5793390028662124e-05, + "loss": 1.8488, + "step": 25735 + }, + { + "epoch": 3.0026834675067087, + "grad_norm": 1.101485252380371, + "learning_rate": 5.57818295938955e-05, + "loss": 1.799, + "step": 25736 + }, + { + "epoch": 3.0028001400070004, + "grad_norm": 1.2358099222183228, + "learning_rate": 5.5770270151428906e-05, + "loss": 1.8674, + "step": 25737 + }, + { + "epoch": 3.002916812507292, + "grad_norm": 1.07048499584198, + "learning_rate": 5.575871170138228e-05, + "loss": 1.6754, + "step": 25738 + }, + { + "epoch": 3.0030334850075837, + "grad_norm": 1.1601167917251587, + "learning_rate": 5.574715424387537e-05, + "loss": 1.8718, + "step": 25739 + }, + { + "epoch": 3.0031501575078754, + "grad_norm": 1.2938700914382935, + "learning_rate": 5.573559777902808e-05, + "loss": 1.8641, + "step": 25740 + }, + { + "epoch": 3.003266830008167, + "grad_norm": 1.2443759441375732, + "learning_rate": 5.572404230696012e-05, + "loss": 1.9408, + "step": 25741 + }, + { + "epoch": 3.0033835025084588, + "grad_norm": 1.307594656944275, + "learning_rate": 5.571248782779137e-05, + "loss": 1.8075, + "step": 25742 + }, + { + "epoch": 3.0035001750087504, + "grad_norm": 1.2607098817825317, + "learning_rate": 5.5700934341641564e-05, + "loss": 1.8141, + "step": 25743 + }, + { + "epoch": 3.003616847509042, + "grad_norm": 1.247778296470642, + "learning_rate": 5.568938184863044e-05, + "loss": 1.8063, + "step": 25744 + }, + { + "epoch": 3.003733520009334, + "grad_norm": 1.0091005563735962, + "learning_rate": 5.567783034887786e-05, + "loss": 1.7549, + "step": 25745 + }, + { + "epoch": 3.0038501925096255, + "grad_norm": 1.2854918241500854, + "learning_rate": 5.566627984250348e-05, + "loss": 1.8583, + "step": 25746 + }, + { + "epoch": 3.003966865009917, + "grad_norm": 1.1135034561157227, + "learning_rate": 5.565473032962712e-05, + "loss": 1.8934, + "step": 25747 + }, + { + "epoch": 3.004083537510209, + "grad_norm": 1.1841380596160889, + "learning_rate": 5.564318181036844e-05, + "loss": 1.7701, + "step": 25748 + }, + { + "epoch": 3.0042002100105005, + "grad_norm": 1.1817076206207275, + "learning_rate": 5.563163428484724e-05, + "loss": 1.7712, + "step": 25749 + }, + { + "epoch": 3.004316882510792, + "grad_norm": 1.1556916236877441, + "learning_rate": 5.5620087753183144e-05, + "loss": 1.9083, + "step": 25750 + }, + { + "epoch": 3.004433555011084, + "grad_norm": 1.46597421169281, + "learning_rate": 5.5608542215495945e-05, + "loss": 1.8661, + "step": 25751 + }, + { + "epoch": 3.0045502275113756, + "grad_norm": 1.0612952709197998, + "learning_rate": 5.559699767190526e-05, + "loss": 1.8127, + "step": 25752 + }, + { + "epoch": 3.0046669000116673, + "grad_norm": 1.1641827821731567, + "learning_rate": 5.558545412253085e-05, + "loss": 1.7343, + "step": 25753 + }, + { + "epoch": 3.004783572511959, + "grad_norm": 1.1423224210739136, + "learning_rate": 5.5573911567492264e-05, + "loss": 1.9127, + "step": 25754 + }, + { + "epoch": 3.0049002450122506, + "grad_norm": 1.0826994180679321, + "learning_rate": 5.5562370006909315e-05, + "loss": 1.791, + "step": 25755 + }, + { + "epoch": 3.0050169175125423, + "grad_norm": 1.276734709739685, + "learning_rate": 5.5550829440901496e-05, + "loss": 1.9508, + "step": 25756 + }, + { + "epoch": 3.005133590012834, + "grad_norm": 1.2178105115890503, + "learning_rate": 5.553928986958855e-05, + "loss": 2.0004, + "step": 25757 + }, + { + "epoch": 3.0052502625131257, + "grad_norm": 1.281654715538025, + "learning_rate": 5.552775129309011e-05, + "loss": 1.9004, + "step": 25758 + }, + { + "epoch": 3.0053669350134173, + "grad_norm": 1.3088171482086182, + "learning_rate": 5.551621371152577e-05, + "loss": 1.8162, + "step": 25759 + }, + { + "epoch": 3.005483607513709, + "grad_norm": 1.2461888790130615, + "learning_rate": 5.5504677125015146e-05, + "loss": 1.8404, + "step": 25760 + }, + { + "epoch": 3.0056002800140007, + "grad_norm": 1.0663331747055054, + "learning_rate": 5.549314153367777e-05, + "loss": 1.7617, + "step": 25761 + }, + { + "epoch": 3.0057169525142924, + "grad_norm": 1.1097673177719116, + "learning_rate": 5.5481606937633344e-05, + "loss": 1.6868, + "step": 25762 + }, + { + "epoch": 3.005833625014584, + "grad_norm": 1.2523175477981567, + "learning_rate": 5.547007333700133e-05, + "loss": 1.814, + "step": 25763 + }, + { + "epoch": 3.0059502975148757, + "grad_norm": 1.3443080186843872, + "learning_rate": 5.545854073190139e-05, + "loss": 1.8282, + "step": 25764 + }, + { + "epoch": 3.0060669700151674, + "grad_norm": 1.3132884502410889, + "learning_rate": 5.544700912245301e-05, + "loss": 1.9168, + "step": 25765 + }, + { + "epoch": 3.006183642515459, + "grad_norm": 1.2599525451660156, + "learning_rate": 5.543547850877582e-05, + "loss": 1.8269, + "step": 25766 + }, + { + "epoch": 3.006300315015751, + "grad_norm": 1.3858305215835571, + "learning_rate": 5.542394889098926e-05, + "loss": 1.7996, + "step": 25767 + }, + { + "epoch": 3.0064169875160425, + "grad_norm": 1.173218846321106, + "learning_rate": 5.541242026921294e-05, + "loss": 1.8286, + "step": 25768 + }, + { + "epoch": 3.006533660016334, + "grad_norm": 1.0829306840896606, + "learning_rate": 5.540089264356632e-05, + "loss": 1.8186, + "step": 25769 + }, + { + "epoch": 3.006650332516626, + "grad_norm": 1.1809152364730835, + "learning_rate": 5.53893660141689e-05, + "loss": 1.924, + "step": 25770 + }, + { + "epoch": 3.0067670050169175, + "grad_norm": 1.2349308729171753, + "learning_rate": 5.5377840381140255e-05, + "loss": 1.9129, + "step": 25771 + }, + { + "epoch": 3.006883677517209, + "grad_norm": 1.259524941444397, + "learning_rate": 5.536631574459978e-05, + "loss": 1.8668, + "step": 25772 + }, + { + "epoch": 3.007000350017501, + "grad_norm": 1.1660863161087036, + "learning_rate": 5.535479210466702e-05, + "loss": 1.8419, + "step": 25773 + }, + { + "epoch": 3.0071170225177926, + "grad_norm": 1.3681280612945557, + "learning_rate": 5.534326946146138e-05, + "loss": 1.8464, + "step": 25774 + }, + { + "epoch": 3.0072336950180842, + "grad_norm": 1.1717368364334106, + "learning_rate": 5.533174781510238e-05, + "loss": 1.6825, + "step": 25775 + }, + { + "epoch": 3.007350367518376, + "grad_norm": 1.3264458179473877, + "learning_rate": 5.53202271657094e-05, + "loss": 1.7751, + "step": 25776 + }, + { + "epoch": 3.0074670400186676, + "grad_norm": 1.1171427965164185, + "learning_rate": 5.5308707513401876e-05, + "loss": 1.8049, + "step": 25777 + }, + { + "epoch": 3.0075837125189593, + "grad_norm": 1.2238285541534424, + "learning_rate": 5.529718885829928e-05, + "loss": 1.9533, + "step": 25778 + }, + { + "epoch": 3.007700385019251, + "grad_norm": 0.9912027716636658, + "learning_rate": 5.528567120052101e-05, + "loss": 1.7109, + "step": 25779 + }, + { + "epoch": 3.0078170575195426, + "grad_norm": 1.336846947669983, + "learning_rate": 5.527415454018639e-05, + "loss": 1.8322, + "step": 25780 + }, + { + "epoch": 3.0079337300198343, + "grad_norm": 1.3788903951644897, + "learning_rate": 5.526263887741495e-05, + "loss": 1.9396, + "step": 25781 + }, + { + "epoch": 3.008050402520126, + "grad_norm": 1.0998485088348389, + "learning_rate": 5.5251124212325934e-05, + "loss": 1.7997, + "step": 25782 + }, + { + "epoch": 3.0081670750204177, + "grad_norm": 1.2846715450286865, + "learning_rate": 5.523961054503878e-05, + "loss": 1.9291, + "step": 25783 + }, + { + "epoch": 3.0082837475207094, + "grad_norm": 1.202794075012207, + "learning_rate": 5.522809787567289e-05, + "loss": 1.8712, + "step": 25784 + }, + { + "epoch": 3.008400420021001, + "grad_norm": 1.4139528274536133, + "learning_rate": 5.521658620434754e-05, + "loss": 1.8853, + "step": 25785 + }, + { + "epoch": 3.0085170925212927, + "grad_norm": 1.1788500547409058, + "learning_rate": 5.520507553118214e-05, + "loss": 1.7996, + "step": 25786 + }, + { + "epoch": 3.0086337650215844, + "grad_norm": 1.1100692749023438, + "learning_rate": 5.519356585629593e-05, + "loss": 1.9983, + "step": 25787 + }, + { + "epoch": 3.008750437521876, + "grad_norm": 1.355668067932129, + "learning_rate": 5.5182057179808334e-05, + "loss": 1.9866, + "step": 25788 + }, + { + "epoch": 3.008867110022168, + "grad_norm": 1.323110580444336, + "learning_rate": 5.517054950183855e-05, + "loss": 1.7866, + "step": 25789 + }, + { + "epoch": 3.0089837825224595, + "grad_norm": 1.1435991525650024, + "learning_rate": 5.5159042822506e-05, + "loss": 1.8798, + "step": 25790 + }, + { + "epoch": 3.009100455022751, + "grad_norm": 1.1456642150878906, + "learning_rate": 5.514753714192984e-05, + "loss": 1.8041, + "step": 25791 + }, + { + "epoch": 3.009217127523043, + "grad_norm": 1.2052302360534668, + "learning_rate": 5.513603246022947e-05, + "loss": 1.8471, + "step": 25792 + }, + { + "epoch": 3.0093338000233345, + "grad_norm": 1.115834355354309, + "learning_rate": 5.512452877752412e-05, + "loss": 1.6721, + "step": 25793 + }, + { + "epoch": 3.009450472523626, + "grad_norm": 1.144765019416809, + "learning_rate": 5.511302609393296e-05, + "loss": 1.8247, + "step": 25794 + }, + { + "epoch": 3.009567145023918, + "grad_norm": 1.1269383430480957, + "learning_rate": 5.5101524409575375e-05, + "loss": 1.7566, + "step": 25795 + }, + { + "epoch": 3.0096838175242095, + "grad_norm": 1.0970244407653809, + "learning_rate": 5.509002372457048e-05, + "loss": 1.7648, + "step": 25796 + }, + { + "epoch": 3.0098004900245012, + "grad_norm": 1.1778531074523926, + "learning_rate": 5.50785240390376e-05, + "loss": 1.7882, + "step": 25797 + }, + { + "epoch": 3.009917162524793, + "grad_norm": 1.2961208820343018, + "learning_rate": 5.506702535309587e-05, + "loss": 1.9343, + "step": 25798 + }, + { + "epoch": 3.0100338350250846, + "grad_norm": 1.2971198558807373, + "learning_rate": 5.505552766686458e-05, + "loss": 1.8462, + "step": 25799 + }, + { + "epoch": 3.0101505075253763, + "grad_norm": 1.1784909963607788, + "learning_rate": 5.504403098046283e-05, + "loss": 1.7963, + "step": 25800 + }, + { + "epoch": 3.010267180025668, + "grad_norm": 1.3338085412979126, + "learning_rate": 5.503253529400989e-05, + "loss": 1.8777, + "step": 25801 + }, + { + "epoch": 3.0103838525259596, + "grad_norm": 1.440506935119629, + "learning_rate": 5.502104060762484e-05, + "loss": 1.952, + "step": 25802 + }, + { + "epoch": 3.0105005250262513, + "grad_norm": 1.1449289321899414, + "learning_rate": 5.500954692142697e-05, + "loss": 1.6774, + "step": 25803 + }, + { + "epoch": 3.010617197526543, + "grad_norm": 1.1716516017913818, + "learning_rate": 5.499805423553531e-05, + "loss": 1.9293, + "step": 25804 + }, + { + "epoch": 3.0107338700268347, + "grad_norm": 1.0898336172103882, + "learning_rate": 5.498656255006911e-05, + "loss": 1.6776, + "step": 25805 + }, + { + "epoch": 3.0108505425271264, + "grad_norm": 1.244641900062561, + "learning_rate": 5.497507186514741e-05, + "loss": 1.8579, + "step": 25806 + }, + { + "epoch": 3.010967215027418, + "grad_norm": 1.2344807386398315, + "learning_rate": 5.496358218088942e-05, + "loss": 1.6808, + "step": 25807 + }, + { + "epoch": 3.0110838875277097, + "grad_norm": 1.0625725984573364, + "learning_rate": 5.495209349741416e-05, + "loss": 1.7726, + "step": 25808 + }, + { + "epoch": 3.0112005600280014, + "grad_norm": 1.0856568813323975, + "learning_rate": 5.4940605814840824e-05, + "loss": 1.8746, + "step": 25809 + }, + { + "epoch": 3.011317232528293, + "grad_norm": 1.1585959196090698, + "learning_rate": 5.492911913328841e-05, + "loss": 1.7541, + "step": 25810 + }, + { + "epoch": 3.0114339050285848, + "grad_norm": 1.0908575057983398, + "learning_rate": 5.49176334528761e-05, + "loss": 1.7962, + "step": 25811 + }, + { + "epoch": 3.0115505775288764, + "grad_norm": 1.0814746618270874, + "learning_rate": 5.49061487737229e-05, + "loss": 1.8674, + "step": 25812 + }, + { + "epoch": 3.011667250029168, + "grad_norm": 1.0735070705413818, + "learning_rate": 5.489466509594785e-05, + "loss": 1.8383, + "step": 25813 + }, + { + "epoch": 3.01178392252946, + "grad_norm": 1.1482908725738525, + "learning_rate": 5.4883182419670084e-05, + "loss": 1.9501, + "step": 25814 + }, + { + "epoch": 3.0119005950297515, + "grad_norm": 1.2918280363082886, + "learning_rate": 5.487170074500853e-05, + "loss": 2.0212, + "step": 25815 + }, + { + "epoch": 3.012017267530043, + "grad_norm": 1.1236755847930908, + "learning_rate": 5.486022007208233e-05, + "loss": 1.7182, + "step": 25816 + }, + { + "epoch": 3.012133940030335, + "grad_norm": 1.1831759214401245, + "learning_rate": 5.484874040101041e-05, + "loss": 1.8178, + "step": 25817 + }, + { + "epoch": 3.0122506125306265, + "grad_norm": 1.2027509212493896, + "learning_rate": 5.483726173191188e-05, + "loss": 1.7102, + "step": 25818 + }, + { + "epoch": 3.012367285030918, + "grad_norm": 1.2216333150863647, + "learning_rate": 5.482578406490559e-05, + "loss": 1.9314, + "step": 25819 + }, + { + "epoch": 3.01248395753121, + "grad_norm": 1.1715583801269531, + "learning_rate": 5.48143074001107e-05, + "loss": 1.9038, + "step": 25820 + }, + { + "epoch": 3.0126006300315016, + "grad_norm": 1.1408065557479858, + "learning_rate": 5.480283173764604e-05, + "loss": 1.8679, + "step": 25821 + }, + { + "epoch": 3.0127173025317933, + "grad_norm": 1.2498416900634766, + "learning_rate": 5.4791357077630645e-05, + "loss": 1.8965, + "step": 25822 + }, + { + "epoch": 3.012833975032085, + "grad_norm": 1.187508225440979, + "learning_rate": 5.4779883420183515e-05, + "loss": 1.938, + "step": 25823 + }, + { + "epoch": 3.0129506475323766, + "grad_norm": 1.1440277099609375, + "learning_rate": 5.476841076542349e-05, + "loss": 1.6918, + "step": 25824 + }, + { + "epoch": 3.0130673200326683, + "grad_norm": 1.3217564821243286, + "learning_rate": 5.475693911346962e-05, + "loss": 1.9779, + "step": 25825 + }, + { + "epoch": 3.01318399253296, + "grad_norm": 1.1286442279815674, + "learning_rate": 5.474546846444073e-05, + "loss": 1.856, + "step": 25826 + }, + { + "epoch": 3.0133006650332517, + "grad_norm": 1.2799092531204224, + "learning_rate": 5.473399881845582e-05, + "loss": 1.7173, + "step": 25827 + }, + { + "epoch": 3.0134173375335433, + "grad_norm": 1.073533296585083, + "learning_rate": 5.4722530175633764e-05, + "loss": 1.8074, + "step": 25828 + }, + { + "epoch": 3.013534010033835, + "grad_norm": 1.1054058074951172, + "learning_rate": 5.4711062536093435e-05, + "loss": 1.7726, + "step": 25829 + }, + { + "epoch": 3.0136506825341267, + "grad_norm": 1.1218544244766235, + "learning_rate": 5.469959589995368e-05, + "loss": 1.882, + "step": 25830 + }, + { + "epoch": 3.0137673550344184, + "grad_norm": 1.327463984489441, + "learning_rate": 5.468813026733348e-05, + "loss": 1.9949, + "step": 25831 + }, + { + "epoch": 3.01388402753471, + "grad_norm": 1.2997548580169678, + "learning_rate": 5.467666563835158e-05, + "loss": 1.7555, + "step": 25832 + }, + { + "epoch": 3.0140007000350018, + "grad_norm": 1.379711389541626, + "learning_rate": 5.466520201312693e-05, + "loss": 1.8222, + "step": 25833 + }, + { + "epoch": 3.0141173725352934, + "grad_norm": 1.2479640245437622, + "learning_rate": 5.465373939177829e-05, + "loss": 1.6166, + "step": 25834 + }, + { + "epoch": 3.014234045035585, + "grad_norm": 1.0886913537979126, + "learning_rate": 5.464227777442454e-05, + "loss": 1.7048, + "step": 25835 + }, + { + "epoch": 3.014350717535877, + "grad_norm": 1.5029878616333008, + "learning_rate": 5.463081716118455e-05, + "loss": 1.9999, + "step": 25836 + }, + { + "epoch": 3.0144673900361685, + "grad_norm": 1.1337087154388428, + "learning_rate": 5.461935755217703e-05, + "loss": 1.8244, + "step": 25837 + }, + { + "epoch": 3.01458406253646, + "grad_norm": 1.2478961944580078, + "learning_rate": 5.460789894752087e-05, + "loss": 1.6047, + "step": 25838 + }, + { + "epoch": 3.014700735036752, + "grad_norm": 1.3416194915771484, + "learning_rate": 5.459644134733475e-05, + "loss": 1.8608, + "step": 25839 + }, + { + "epoch": 3.0148174075370435, + "grad_norm": 1.2299576997756958, + "learning_rate": 5.45849847517376e-05, + "loss": 1.8776, + "step": 25840 + }, + { + "epoch": 3.014934080037335, + "grad_norm": 1.223634123802185, + "learning_rate": 5.4573529160848034e-05, + "loss": 1.8144, + "step": 25841 + }, + { + "epoch": 3.015050752537627, + "grad_norm": 1.2513748407363892, + "learning_rate": 5.456207457478494e-05, + "loss": 1.8926, + "step": 25842 + }, + { + "epoch": 3.0151674250379186, + "grad_norm": 1.1642972230911255, + "learning_rate": 5.455062099366696e-05, + "loss": 1.8932, + "step": 25843 + }, + { + "epoch": 3.0152840975382103, + "grad_norm": 1.2216004133224487, + "learning_rate": 5.453916841761293e-05, + "loss": 1.8991, + "step": 25844 + }, + { + "epoch": 3.015400770038502, + "grad_norm": 1.3359090089797974, + "learning_rate": 5.4527716846741534e-05, + "loss": 1.8766, + "step": 25845 + }, + { + "epoch": 3.0155174425387936, + "grad_norm": 1.2202683687210083, + "learning_rate": 5.451626628117144e-05, + "loss": 1.7752, + "step": 25846 + }, + { + "epoch": 3.0156341150390853, + "grad_norm": 1.0808019638061523, + "learning_rate": 5.450481672102144e-05, + "loss": 1.7708, + "step": 25847 + }, + { + "epoch": 3.015750787539377, + "grad_norm": 1.2574462890625, + "learning_rate": 5.4493368166410134e-05, + "loss": 1.7252, + "step": 25848 + }, + { + "epoch": 3.0158674600396687, + "grad_norm": 1.193302869796753, + "learning_rate": 5.4481920617456335e-05, + "loss": 1.8527, + "step": 25849 + }, + { + "epoch": 3.0159841325399603, + "grad_norm": 1.2803016901016235, + "learning_rate": 5.447047407427858e-05, + "loss": 1.789, + "step": 25850 + }, + { + "epoch": 3.016100805040252, + "grad_norm": 1.218181848526001, + "learning_rate": 5.4459028536995644e-05, + "loss": 1.7631, + "step": 25851 + }, + { + "epoch": 3.0162174775405437, + "grad_norm": 1.1048551797866821, + "learning_rate": 5.4447584005726105e-05, + "loss": 1.9872, + "step": 25852 + }, + { + "epoch": 3.0163341500408354, + "grad_norm": 1.3557755947113037, + "learning_rate": 5.4436140480588695e-05, + "loss": 1.9536, + "step": 25853 + }, + { + "epoch": 3.016450822541127, + "grad_norm": 1.220057487487793, + "learning_rate": 5.442469796170194e-05, + "loss": 1.8132, + "step": 25854 + }, + { + "epoch": 3.0165674950414187, + "grad_norm": 1.2343083620071411, + "learning_rate": 5.441325644918456e-05, + "loss": 1.9756, + "step": 25855 + }, + { + "epoch": 3.0166841675417104, + "grad_norm": 1.2163203954696655, + "learning_rate": 5.440181594315508e-05, + "loss": 1.7362, + "step": 25856 + }, + { + "epoch": 3.016800840042002, + "grad_norm": 1.404833436012268, + "learning_rate": 5.4390376443732204e-05, + "loss": 1.7375, + "step": 25857 + }, + { + "epoch": 3.016917512542294, + "grad_norm": 1.174107551574707, + "learning_rate": 5.4378937951034414e-05, + "loss": 1.69, + "step": 25858 + }, + { + "epoch": 3.0170341850425855, + "grad_norm": 1.2474955320358276, + "learning_rate": 5.4367500465180395e-05, + "loss": 1.9231, + "step": 25859 + }, + { + "epoch": 3.017150857542877, + "grad_norm": 1.1331146955490112, + "learning_rate": 5.435606398628863e-05, + "loss": 1.8144, + "step": 25860 + }, + { + "epoch": 3.017267530043169, + "grad_norm": 1.2314337491989136, + "learning_rate": 5.4344628514477754e-05, + "loss": 1.8527, + "step": 25861 + }, + { + "epoch": 3.0173842025434605, + "grad_norm": 1.217645525932312, + "learning_rate": 5.433319404986623e-05, + "loss": 1.836, + "step": 25862 + }, + { + "epoch": 3.017500875043752, + "grad_norm": 1.1891157627105713, + "learning_rate": 5.4321760592572706e-05, + "loss": 1.8886, + "step": 25863 + }, + { + "epoch": 3.017617547544044, + "grad_norm": 1.6867793798446655, + "learning_rate": 5.431032814271567e-05, + "loss": 1.7339, + "step": 25864 + }, + { + "epoch": 3.0177342200443356, + "grad_norm": 1.2443004846572876, + "learning_rate": 5.4298896700413555e-05, + "loss": 1.8309, + "step": 25865 + }, + { + "epoch": 3.0178508925446272, + "grad_norm": 1.2973904609680176, + "learning_rate": 5.4287466265785005e-05, + "loss": 1.8935, + "step": 25866 + }, + { + "epoch": 3.017967565044919, + "grad_norm": 1.1193734407424927, + "learning_rate": 5.4276036838948396e-05, + "loss": 1.713, + "step": 25867 + }, + { + "epoch": 3.0180842375452106, + "grad_norm": 1.4514672756195068, + "learning_rate": 5.426460842002234e-05, + "loss": 1.9506, + "step": 25868 + }, + { + "epoch": 3.0182009100455023, + "grad_norm": 1.2378987073898315, + "learning_rate": 5.425318100912517e-05, + "loss": 1.8128, + "step": 25869 + }, + { + "epoch": 3.018317582545794, + "grad_norm": 1.2035120725631714, + "learning_rate": 5.42417546063755e-05, + "loss": 1.7432, + "step": 25870 + }, + { + "epoch": 3.0184342550460856, + "grad_norm": 1.241473913192749, + "learning_rate": 5.4230329211891656e-05, + "loss": 1.7933, + "step": 25871 + }, + { + "epoch": 3.0185509275463773, + "grad_norm": 1.1163376569747925, + "learning_rate": 5.4218904825792215e-05, + "loss": 1.6883, + "step": 25872 + }, + { + "epoch": 3.018667600046669, + "grad_norm": 1.0959796905517578, + "learning_rate": 5.420748144819547e-05, + "loss": 1.7984, + "step": 25873 + }, + { + "epoch": 3.0187842725469607, + "grad_norm": 1.130645513534546, + "learning_rate": 5.419605907921994e-05, + "loss": 1.7034, + "step": 25874 + }, + { + "epoch": 3.0189009450472524, + "grad_norm": 1.2985599040985107, + "learning_rate": 5.418463771898403e-05, + "loss": 1.906, + "step": 25875 + }, + { + "epoch": 3.019017617547544, + "grad_norm": 1.2516236305236816, + "learning_rate": 5.417321736760612e-05, + "loss": 1.8557, + "step": 25876 + }, + { + "epoch": 3.0191342900478357, + "grad_norm": 1.1039026975631714, + "learning_rate": 5.416179802520465e-05, + "loss": 1.7637, + "step": 25877 + }, + { + "epoch": 3.0192509625481274, + "grad_norm": 1.146612286567688, + "learning_rate": 5.4150379691897974e-05, + "loss": 1.747, + "step": 25878 + }, + { + "epoch": 3.019367635048419, + "grad_norm": 1.1520878076553345, + "learning_rate": 5.41389623678044e-05, + "loss": 2.0227, + "step": 25879 + }, + { + "epoch": 3.0194843075487108, + "grad_norm": 1.3356266021728516, + "learning_rate": 5.412754605304241e-05, + "loss": 1.8907, + "step": 25880 + }, + { + "epoch": 3.0196009800490025, + "grad_norm": 1.2581400871276855, + "learning_rate": 5.4116130747730285e-05, + "loss": 1.6817, + "step": 25881 + }, + { + "epoch": 3.019717652549294, + "grad_norm": 1.2268025875091553, + "learning_rate": 5.410471645198633e-05, + "loss": 1.707, + "step": 25882 + }, + { + "epoch": 3.019834325049586, + "grad_norm": 1.3372565507888794, + "learning_rate": 5.409330316592897e-05, + "loss": 1.8852, + "step": 25883 + }, + { + "epoch": 3.0199509975498775, + "grad_norm": 1.2777043581008911, + "learning_rate": 5.4081890889676435e-05, + "loss": 1.9347, + "step": 25884 + }, + { + "epoch": 3.020067670050169, + "grad_norm": 1.358889102935791, + "learning_rate": 5.4070479623347124e-05, + "loss": 1.792, + "step": 25885 + }, + { + "epoch": 3.020184342550461, + "grad_norm": 1.0791326761245728, + "learning_rate": 5.405906936705924e-05, + "loss": 1.7495, + "step": 25886 + }, + { + "epoch": 3.0203010150507525, + "grad_norm": 1.157664179801941, + "learning_rate": 5.4047660120931116e-05, + "loss": 1.8613, + "step": 25887 + }, + { + "epoch": 3.0204176875510442, + "grad_norm": 1.065595030784607, + "learning_rate": 5.403625188508109e-05, + "loss": 1.7638, + "step": 25888 + }, + { + "epoch": 3.020534360051336, + "grad_norm": 1.213238000869751, + "learning_rate": 5.402484465962733e-05, + "loss": 1.7688, + "step": 25889 + }, + { + "epoch": 3.0206510325516276, + "grad_norm": 1.3634942770004272, + "learning_rate": 5.401343844468818e-05, + "loss": 2.0633, + "step": 25890 + }, + { + "epoch": 3.0207677050519193, + "grad_norm": 1.285382628440857, + "learning_rate": 5.40020332403818e-05, + "loss": 1.8085, + "step": 25891 + }, + { + "epoch": 3.020884377552211, + "grad_norm": 1.188560128211975, + "learning_rate": 5.399062904682652e-05, + "loss": 1.7704, + "step": 25892 + }, + { + "epoch": 3.0210010500525026, + "grad_norm": 1.1832151412963867, + "learning_rate": 5.397922586414049e-05, + "loss": 1.8488, + "step": 25893 + }, + { + "epoch": 3.0211177225527943, + "grad_norm": 1.160385012626648, + "learning_rate": 5.396782369244198e-05, + "loss": 1.9307, + "step": 25894 + }, + { + "epoch": 3.021234395053086, + "grad_norm": 1.238685131072998, + "learning_rate": 5.3956422531849125e-05, + "loss": 1.8284, + "step": 25895 + }, + { + "epoch": 3.0213510675533777, + "grad_norm": 1.2242484092712402, + "learning_rate": 5.394502238248023e-05, + "loss": 1.7534, + "step": 25896 + }, + { + "epoch": 3.0214677400536694, + "grad_norm": 1.1654974222183228, + "learning_rate": 5.3933623244453395e-05, + "loss": 1.9587, + "step": 25897 + }, + { + "epoch": 3.021584412553961, + "grad_norm": 1.1009262800216675, + "learning_rate": 5.3922225117886805e-05, + "loss": 1.9494, + "step": 25898 + }, + { + "epoch": 3.0217010850542527, + "grad_norm": 1.220295786857605, + "learning_rate": 5.391082800289861e-05, + "loss": 1.8625, + "step": 25899 + }, + { + "epoch": 3.0218177575545444, + "grad_norm": 1.1284767389297485, + "learning_rate": 5.389943189960696e-05, + "loss": 1.828, + "step": 25900 + }, + { + "epoch": 3.021934430054836, + "grad_norm": 1.0571727752685547, + "learning_rate": 5.388803680813008e-05, + "loss": 1.853, + "step": 25901 + }, + { + "epoch": 3.0220511025551278, + "grad_norm": 1.1290087699890137, + "learning_rate": 5.3876642728585974e-05, + "loss": 1.7322, + "step": 25902 + }, + { + "epoch": 3.0221677750554194, + "grad_norm": 1.1628766059875488, + "learning_rate": 5.3865249661092906e-05, + "loss": 1.7251, + "step": 25903 + }, + { + "epoch": 3.022284447555711, + "grad_norm": 1.2011311054229736, + "learning_rate": 5.3853857605768845e-05, + "loss": 1.8145, + "step": 25904 + }, + { + "epoch": 3.022401120056003, + "grad_norm": 1.198826551437378, + "learning_rate": 5.384246656273202e-05, + "loss": 1.7644, + "step": 25905 + }, + { + "epoch": 3.0225177925562945, + "grad_norm": 1.3972374200820923, + "learning_rate": 5.383107653210039e-05, + "loss": 1.9621, + "step": 25906 + }, + { + "epoch": 3.022634465056586, + "grad_norm": 1.4415496587753296, + "learning_rate": 5.3819687513992176e-05, + "loss": 1.8286, + "step": 25907 + }, + { + "epoch": 3.022751137556878, + "grad_norm": 1.4246549606323242, + "learning_rate": 5.3808299508525305e-05, + "loss": 1.8059, + "step": 25908 + }, + { + "epoch": 3.0228678100571695, + "grad_norm": 1.2919481992721558, + "learning_rate": 5.379691251581797e-05, + "loss": 1.8193, + "step": 25909 + }, + { + "epoch": 3.022984482557461, + "grad_norm": 1.295478105545044, + "learning_rate": 5.3785526535988103e-05, + "loss": 1.8635, + "step": 25910 + }, + { + "epoch": 3.023101155057753, + "grad_norm": 1.177741527557373, + "learning_rate": 5.377414156915383e-05, + "loss": 1.7933, + "step": 25911 + }, + { + "epoch": 3.0232178275580446, + "grad_norm": 1.0091558694839478, + "learning_rate": 5.3762757615433094e-05, + "loss": 1.721, + "step": 25912 + }, + { + "epoch": 3.0233345000583363, + "grad_norm": 1.2700859308242798, + "learning_rate": 5.3751374674944005e-05, + "loss": 1.7309, + "step": 25913 + }, + { + "epoch": 3.023451172558628, + "grad_norm": 1.2104196548461914, + "learning_rate": 5.373999274780452e-05, + "loss": 1.753, + "step": 25914 + }, + { + "epoch": 3.0235678450589196, + "grad_norm": 1.1011292934417725, + "learning_rate": 5.372861183413258e-05, + "loss": 1.6315, + "step": 25915 + }, + { + "epoch": 3.0236845175592113, + "grad_norm": 1.3300986289978027, + "learning_rate": 5.3717231934046276e-05, + "loss": 1.8247, + "step": 25916 + }, + { + "epoch": 3.023801190059503, + "grad_norm": 1.2148820161819458, + "learning_rate": 5.3705853047663494e-05, + "loss": 1.9156, + "step": 25917 + }, + { + "epoch": 3.0239178625597947, + "grad_norm": 1.2524943351745605, + "learning_rate": 5.3694475175102246e-05, + "loss": 1.9189, + "step": 25918 + }, + { + "epoch": 3.0240345350600863, + "grad_norm": 1.2250827550888062, + "learning_rate": 5.368309831648045e-05, + "loss": 1.7201, + "step": 25919 + }, + { + "epoch": 3.024151207560378, + "grad_norm": 1.2699447870254517, + "learning_rate": 5.367172247191613e-05, + "loss": 1.8822, + "step": 25920 + }, + { + "epoch": 3.0242678800606697, + "grad_norm": 1.3214203119277954, + "learning_rate": 5.3660347641527095e-05, + "loss": 1.8746, + "step": 25921 + }, + { + "epoch": 3.0243845525609614, + "grad_norm": 1.0998185873031616, + "learning_rate": 5.364897382543137e-05, + "loss": 1.7799, + "step": 25922 + }, + { + "epoch": 3.024501225061253, + "grad_norm": 1.1904546022415161, + "learning_rate": 5.36376010237468e-05, + "loss": 1.9111, + "step": 25923 + }, + { + "epoch": 3.0246178975615448, + "grad_norm": 1.223078966140747, + "learning_rate": 5.362622923659136e-05, + "loss": 1.9783, + "step": 25924 + }, + { + "epoch": 3.0247345700618364, + "grad_norm": 1.2921197414398193, + "learning_rate": 5.361485846408283e-05, + "loss": 1.8104, + "step": 25925 + }, + { + "epoch": 3.024851242562128, + "grad_norm": 1.156257152557373, + "learning_rate": 5.360348870633922e-05, + "loss": 1.9224, + "step": 25926 + }, + { + "epoch": 3.02496791506242, + "grad_norm": 1.2496228218078613, + "learning_rate": 5.359211996347826e-05, + "loss": 1.8844, + "step": 25927 + }, + { + "epoch": 3.0250845875627115, + "grad_norm": 1.2975982427597046, + "learning_rate": 5.3580752235617904e-05, + "loss": 1.7906, + "step": 25928 + }, + { + "epoch": 3.025201260063003, + "grad_norm": 1.110027551651001, + "learning_rate": 5.3569385522876015e-05, + "loss": 1.7033, + "step": 25929 + }, + { + "epoch": 3.025317932563295, + "grad_norm": 1.2177588939666748, + "learning_rate": 5.355801982537039e-05, + "loss": 1.7456, + "step": 25930 + }, + { + "epoch": 3.0254346050635865, + "grad_norm": 1.4973033666610718, + "learning_rate": 5.354665514321881e-05, + "loss": 1.9097, + "step": 25931 + }, + { + "epoch": 3.025551277563878, + "grad_norm": 1.2622501850128174, + "learning_rate": 5.353529147653918e-05, + "loss": 1.8627, + "step": 25932 + }, + { + "epoch": 3.02566795006417, + "grad_norm": 1.3003268241882324, + "learning_rate": 5.352392882544927e-05, + "loss": 1.8312, + "step": 25933 + }, + { + "epoch": 3.0257846225644616, + "grad_norm": 1.284133791923523, + "learning_rate": 5.351256719006683e-05, + "loss": 1.8741, + "step": 25934 + }, + { + "epoch": 3.0259012950647532, + "grad_norm": 1.2287126779556274, + "learning_rate": 5.350120657050973e-05, + "loss": 1.7235, + "step": 25935 + }, + { + "epoch": 3.026017967565045, + "grad_norm": 1.2884316444396973, + "learning_rate": 5.348984696689566e-05, + "loss": 1.901, + "step": 25936 + }, + { + "epoch": 3.0261346400653366, + "grad_norm": 1.1245440244674683, + "learning_rate": 5.347848837934245e-05, + "loss": 1.5975, + "step": 25937 + }, + { + "epoch": 3.0262513125656283, + "grad_norm": 1.1825700998306274, + "learning_rate": 5.346713080796779e-05, + "loss": 1.9768, + "step": 25938 + }, + { + "epoch": 3.02636798506592, + "grad_norm": 1.2652459144592285, + "learning_rate": 5.3455774252889506e-05, + "loss": 1.7791, + "step": 25939 + }, + { + "epoch": 3.0264846575662117, + "grad_norm": 1.2165427207946777, + "learning_rate": 5.344441871422524e-05, + "loss": 1.847, + "step": 25940 + }, + { + "epoch": 3.0266013300665033, + "grad_norm": 1.2779510021209717, + "learning_rate": 5.343306419209275e-05, + "loss": 1.7168, + "step": 25941 + }, + { + "epoch": 3.026718002566795, + "grad_norm": 1.1701792478561401, + "learning_rate": 5.342171068660982e-05, + "loss": 1.8033, + "step": 25942 + }, + { + "epoch": 3.0268346750670867, + "grad_norm": 1.161865472793579, + "learning_rate": 5.341035819789402e-05, + "loss": 1.7992, + "step": 25943 + }, + { + "epoch": 3.0269513475673784, + "grad_norm": 1.2253360748291016, + "learning_rate": 5.339900672606317e-05, + "loss": 1.8038, + "step": 25944 + }, + { + "epoch": 3.02706802006767, + "grad_norm": 1.169884443283081, + "learning_rate": 5.3387656271234814e-05, + "loss": 1.8293, + "step": 25945 + }, + { + "epoch": 3.0271846925679617, + "grad_norm": 1.1105624437332153, + "learning_rate": 5.337630683352676e-05, + "loss": 1.7312, + "step": 25946 + }, + { + "epoch": 3.0273013650682534, + "grad_norm": 1.3824981451034546, + "learning_rate": 5.336495841305654e-05, + "loss": 1.8306, + "step": 25947 + }, + { + "epoch": 3.027418037568545, + "grad_norm": 1.173349142074585, + "learning_rate": 5.3353611009941904e-05, + "loss": 1.7091, + "step": 25948 + }, + { + "epoch": 3.027534710068837, + "grad_norm": 1.0702159404754639, + "learning_rate": 5.334226462430046e-05, + "loss": 1.9294, + "step": 25949 + }, + { + "epoch": 3.0276513825691285, + "grad_norm": 1.1516467332839966, + "learning_rate": 5.3330919256249815e-05, + "loss": 1.8179, + "step": 25950 + }, + { + "epoch": 3.02776805506942, + "grad_norm": 1.2168457508087158, + "learning_rate": 5.331957490590752e-05, + "loss": 1.9669, + "step": 25951 + }, + { + "epoch": 3.027884727569712, + "grad_norm": 1.2587600946426392, + "learning_rate": 5.3308231573391334e-05, + "loss": 2.0062, + "step": 25952 + }, + { + "epoch": 3.0280014000700035, + "grad_norm": 1.5644971132278442, + "learning_rate": 5.32968892588187e-05, + "loss": 1.8743, + "step": 25953 + }, + { + "epoch": 3.028118072570295, + "grad_norm": 1.0915690660476685, + "learning_rate": 5.3285547962307266e-05, + "loss": 1.6062, + "step": 25954 + }, + { + "epoch": 3.028234745070587, + "grad_norm": 1.2771159410476685, + "learning_rate": 5.3274207683974685e-05, + "loss": 1.8367, + "step": 25955 + }, + { + "epoch": 3.0283514175708786, + "grad_norm": 1.1736526489257812, + "learning_rate": 5.326286842393837e-05, + "loss": 1.8534, + "step": 25956 + }, + { + "epoch": 3.0284680900711702, + "grad_norm": 1.2509911060333252, + "learning_rate": 5.325153018231602e-05, + "loss": 1.6995, + "step": 25957 + }, + { + "epoch": 3.028584762571462, + "grad_norm": 1.3556936979293823, + "learning_rate": 5.3240192959225053e-05, + "loss": 1.9711, + "step": 25958 + }, + { + "epoch": 3.0287014350717536, + "grad_norm": 1.1365420818328857, + "learning_rate": 5.322885675478311e-05, + "loss": 1.842, + "step": 25959 + }, + { + "epoch": 3.0288181075720453, + "grad_norm": 1.3255581855773926, + "learning_rate": 5.321752156910761e-05, + "loss": 1.9587, + "step": 25960 + }, + { + "epoch": 3.028934780072337, + "grad_norm": 1.0247657299041748, + "learning_rate": 5.320618740231614e-05, + "loss": 1.6353, + "step": 25961 + }, + { + "epoch": 3.0290514525726286, + "grad_norm": 1.1215811967849731, + "learning_rate": 5.3194854254526145e-05, + "loss": 1.7894, + "step": 25962 + }, + { + "epoch": 3.0291681250729203, + "grad_norm": 1.3163554668426514, + "learning_rate": 5.318352212585519e-05, + "loss": 1.8397, + "step": 25963 + }, + { + "epoch": 3.029284797573212, + "grad_norm": 1.2199128866195679, + "learning_rate": 5.317219101642064e-05, + "loss": 1.7115, + "step": 25964 + }, + { + "epoch": 3.0294014700735037, + "grad_norm": 1.4639458656311035, + "learning_rate": 5.3160860926340085e-05, + "loss": 1.721, + "step": 25965 + }, + { + "epoch": 3.0295181425737954, + "grad_norm": 1.315922737121582, + "learning_rate": 5.3149531855730934e-05, + "loss": 2.0199, + "step": 25966 + }, + { + "epoch": 3.029634815074087, + "grad_norm": 1.2526130676269531, + "learning_rate": 5.3138203804710574e-05, + "loss": 1.7033, + "step": 25967 + }, + { + "epoch": 3.0297514875743787, + "grad_norm": 1.2738940715789795, + "learning_rate": 5.312687677339655e-05, + "loss": 1.866, + "step": 25968 + }, + { + "epoch": 3.0298681600746704, + "grad_norm": 1.3251917362213135, + "learning_rate": 5.3115550761906184e-05, + "loss": 1.7928, + "step": 25969 + }, + { + "epoch": 3.029984832574962, + "grad_norm": 1.279341459274292, + "learning_rate": 5.310422577035698e-05, + "loss": 1.749, + "step": 25970 + }, + { + "epoch": 3.0301015050752538, + "grad_norm": 1.3680781126022339, + "learning_rate": 5.3092901798866274e-05, + "loss": 1.9479, + "step": 25971 + }, + { + "epoch": 3.0302181775755455, + "grad_norm": 1.121633529663086, + "learning_rate": 5.308157884755152e-05, + "loss": 1.7345, + "step": 25972 + }, + { + "epoch": 3.030334850075837, + "grad_norm": 1.1949836015701294, + "learning_rate": 5.3070256916530014e-05, + "loss": 1.7495, + "step": 25973 + }, + { + "epoch": 3.030451522576129, + "grad_norm": 1.3155492544174194, + "learning_rate": 5.3058936005919256e-05, + "loss": 1.7336, + "step": 25974 + }, + { + "epoch": 3.0305681950764205, + "grad_norm": 1.1131787300109863, + "learning_rate": 5.304761611583648e-05, + "loss": 1.6851, + "step": 25975 + }, + { + "epoch": 3.030684867576712, + "grad_norm": 1.1897386312484741, + "learning_rate": 5.3036297246399144e-05, + "loss": 1.7865, + "step": 25976 + }, + { + "epoch": 3.030801540077004, + "grad_norm": 1.306559681892395, + "learning_rate": 5.302497939772451e-05, + "loss": 1.944, + "step": 25977 + }, + { + "epoch": 3.0309182125772955, + "grad_norm": 1.1208277940750122, + "learning_rate": 5.3013662569929974e-05, + "loss": 1.7235, + "step": 25978 + }, + { + "epoch": 3.0310348850775872, + "grad_norm": 1.193263292312622, + "learning_rate": 5.300234676313278e-05, + "loss": 1.9057, + "step": 25979 + }, + { + "epoch": 3.031151557577879, + "grad_norm": 1.1844428777694702, + "learning_rate": 5.299103197745029e-05, + "loss": 1.8812, + "step": 25980 + }, + { + "epoch": 3.0312682300781706, + "grad_norm": 1.1717617511749268, + "learning_rate": 5.297971821299984e-05, + "loss": 1.8984, + "step": 25981 + }, + { + "epoch": 3.0313849025784623, + "grad_norm": 1.1126465797424316, + "learning_rate": 5.2968405469898676e-05, + "loss": 1.6646, + "step": 25982 + }, + { + "epoch": 3.031501575078754, + "grad_norm": 1.5366700887680054, + "learning_rate": 5.295709374826408e-05, + "loss": 1.8724, + "step": 25983 + }, + { + "epoch": 3.0316182475790456, + "grad_norm": 1.3820222616195679, + "learning_rate": 5.2945783048213255e-05, + "loss": 1.7356, + "step": 25984 + }, + { + "epoch": 3.0317349200793373, + "grad_norm": 1.3769570589065552, + "learning_rate": 5.293447336986357e-05, + "loss": 1.8966, + "step": 25985 + }, + { + "epoch": 3.031851592579629, + "grad_norm": 1.3179407119750977, + "learning_rate": 5.2923164713332174e-05, + "loss": 1.7736, + "step": 25986 + }, + { + "epoch": 3.0319682650799207, + "grad_norm": 1.3571282625198364, + "learning_rate": 5.291185707873637e-05, + "loss": 1.92, + "step": 25987 + }, + { + "epoch": 3.0320849375802124, + "grad_norm": 1.0121408700942993, + "learning_rate": 5.2900550466193344e-05, + "loss": 1.7501, + "step": 25988 + }, + { + "epoch": 3.032201610080504, + "grad_norm": 1.4024500846862793, + "learning_rate": 5.2889244875820353e-05, + "loss": 1.8664, + "step": 25989 + }, + { + "epoch": 3.0323182825807957, + "grad_norm": 1.3518162965774536, + "learning_rate": 5.2877940307734524e-05, + "loss": 1.9719, + "step": 25990 + }, + { + "epoch": 3.0324349550810874, + "grad_norm": 1.371474027633667, + "learning_rate": 5.286663676205316e-05, + "loss": 1.9473, + "step": 25991 + }, + { + "epoch": 3.032551627581379, + "grad_norm": 1.2977774143218994, + "learning_rate": 5.285533423889334e-05, + "loss": 1.8063, + "step": 25992 + }, + { + "epoch": 3.0326683000816708, + "grad_norm": 1.2495965957641602, + "learning_rate": 5.284403273837225e-05, + "loss": 1.7995, + "step": 25993 + }, + { + "epoch": 3.0327849725819624, + "grad_norm": 1.0962039232254028, + "learning_rate": 5.283273226060715e-05, + "loss": 1.6825, + "step": 25994 + }, + { + "epoch": 3.032901645082254, + "grad_norm": 1.2324566841125488, + "learning_rate": 5.282143280571508e-05, + "loss": 2.0112, + "step": 25995 + }, + { + "epoch": 3.033018317582546, + "grad_norm": 1.2590469121932983, + "learning_rate": 5.2810134373813276e-05, + "loss": 1.8336, + "step": 25996 + }, + { + "epoch": 3.0331349900828375, + "grad_norm": 1.2790796756744385, + "learning_rate": 5.2798836965018754e-05, + "loss": 1.8435, + "step": 25997 + }, + { + "epoch": 3.033251662583129, + "grad_norm": 1.1376174688339233, + "learning_rate": 5.2787540579448726e-05, + "loss": 1.7992, + "step": 25998 + }, + { + "epoch": 3.033368335083421, + "grad_norm": 1.2154033184051514, + "learning_rate": 5.277624521722029e-05, + "loss": 2.0022, + "step": 25999 + }, + { + "epoch": 3.0334850075837125, + "grad_norm": 1.30832839012146, + "learning_rate": 5.276495087845046e-05, + "loss": 1.8175, + "step": 26000 + }, + { + "epoch": 3.033601680084004, + "grad_norm": 1.325175404548645, + "learning_rate": 5.275365756325644e-05, + "loss": 1.8845, + "step": 26001 + }, + { + "epoch": 3.033718352584296, + "grad_norm": 1.1662853956222534, + "learning_rate": 5.2742365271755237e-05, + "loss": 1.712, + "step": 26002 + }, + { + "epoch": 3.0338350250845876, + "grad_norm": 1.2996056079864502, + "learning_rate": 5.273107400406389e-05, + "loss": 1.8619, + "step": 26003 + }, + { + "epoch": 3.0339516975848793, + "grad_norm": 1.2088634967803955, + "learning_rate": 5.2719783760299555e-05, + "loss": 1.7299, + "step": 26004 + }, + { + "epoch": 3.034068370085171, + "grad_norm": 1.287696123123169, + "learning_rate": 5.270849454057917e-05, + "loss": 1.7949, + "step": 26005 + }, + { + "epoch": 3.0341850425854626, + "grad_norm": 1.3106712102890015, + "learning_rate": 5.269720634501979e-05, + "loss": 1.8243, + "step": 26006 + }, + { + "epoch": 3.0343017150857543, + "grad_norm": 1.2476071119308472, + "learning_rate": 5.268591917373853e-05, + "loss": 1.752, + "step": 26007 + }, + { + "epoch": 3.034418387586046, + "grad_norm": 1.3247170448303223, + "learning_rate": 5.267463302685229e-05, + "loss": 1.8412, + "step": 26008 + }, + { + "epoch": 3.0345350600863377, + "grad_norm": 1.2958625555038452, + "learning_rate": 5.266334790447816e-05, + "loss": 1.7868, + "step": 26009 + }, + { + "epoch": 3.0346517325866293, + "grad_norm": 1.2799581289291382, + "learning_rate": 5.265206380673306e-05, + "loss": 1.6869, + "step": 26010 + }, + { + "epoch": 3.034768405086921, + "grad_norm": 1.277795672416687, + "learning_rate": 5.264078073373404e-05, + "loss": 1.7441, + "step": 26011 + }, + { + "epoch": 3.0348850775872127, + "grad_norm": 1.3841708898544312, + "learning_rate": 5.262949868559798e-05, + "loss": 1.7994, + "step": 26012 + }, + { + "epoch": 3.0350017500875044, + "grad_norm": 1.3880949020385742, + "learning_rate": 5.261821766244196e-05, + "loss": 1.9524, + "step": 26013 + }, + { + "epoch": 3.035118422587796, + "grad_norm": 1.2193676233291626, + "learning_rate": 5.26069376643828e-05, + "loss": 1.8604, + "step": 26014 + }, + { + "epoch": 3.0352350950880878, + "grad_norm": 1.2370198965072632, + "learning_rate": 5.259565869153754e-05, + "loss": 1.6873, + "step": 26015 + }, + { + "epoch": 3.0353517675883794, + "grad_norm": 1.1580379009246826, + "learning_rate": 5.258438074402304e-05, + "loss": 1.7551, + "step": 26016 + }, + { + "epoch": 3.035468440088671, + "grad_norm": 1.2859002351760864, + "learning_rate": 5.257310382195627e-05, + "loss": 1.9666, + "step": 26017 + }, + { + "epoch": 3.035585112588963, + "grad_norm": 1.0753339529037476, + "learning_rate": 5.2561827925454127e-05, + "loss": 1.9003, + "step": 26018 + }, + { + "epoch": 3.0357017850892545, + "grad_norm": 1.1973739862442017, + "learning_rate": 5.255055305463345e-05, + "loss": 1.9312, + "step": 26019 + }, + { + "epoch": 3.035818457589546, + "grad_norm": 1.0831986665725708, + "learning_rate": 5.2539279209611194e-05, + "loss": 1.6069, + "step": 26020 + }, + { + "epoch": 3.035935130089838, + "grad_norm": 1.2675989866256714, + "learning_rate": 5.2528006390504177e-05, + "loss": 1.9559, + "step": 26021 + }, + { + "epoch": 3.0360518025901295, + "grad_norm": 1.2324243783950806, + "learning_rate": 5.251673459742933e-05, + "loss": 1.9373, + "step": 26022 + }, + { + "epoch": 3.036168475090421, + "grad_norm": 1.394049882888794, + "learning_rate": 5.250546383050344e-05, + "loss": 1.8593, + "step": 26023 + }, + { + "epoch": 3.036285147590713, + "grad_norm": 1.2761242389678955, + "learning_rate": 5.249419408984341e-05, + "loss": 1.8944, + "step": 26024 + }, + { + "epoch": 3.0364018200910046, + "grad_norm": 1.2299532890319824, + "learning_rate": 5.248292537556599e-05, + "loss": 1.7195, + "step": 26025 + }, + { + "epoch": 3.0365184925912962, + "grad_norm": 1.3655320405960083, + "learning_rate": 5.247165768778811e-05, + "loss": 1.7706, + "step": 26026 + }, + { + "epoch": 3.036635165091588, + "grad_norm": 1.3811712265014648, + "learning_rate": 5.246039102662647e-05, + "loss": 1.9472, + "step": 26027 + }, + { + "epoch": 3.0367518375918796, + "grad_norm": 1.0579041242599487, + "learning_rate": 5.244912539219797e-05, + "loss": 1.7617, + "step": 26028 + }, + { + "epoch": 3.0368685100921713, + "grad_norm": 1.1322475671768188, + "learning_rate": 5.243786078461932e-05, + "loss": 1.8185, + "step": 26029 + }, + { + "epoch": 3.036985182592463, + "grad_norm": 1.160805344581604, + "learning_rate": 5.242659720400737e-05, + "loss": 1.7899, + "step": 26030 + }, + { + "epoch": 3.0371018550927547, + "grad_norm": 1.166193962097168, + "learning_rate": 5.24153346504788e-05, + "loss": 1.7674, + "step": 26031 + }, + { + "epoch": 3.0372185275930463, + "grad_norm": 1.217920184135437, + "learning_rate": 5.2404073124150434e-05, + "loss": 1.7681, + "step": 26032 + }, + { + "epoch": 3.037335200093338, + "grad_norm": 1.1173360347747803, + "learning_rate": 5.239281262513903e-05, + "loss": 1.9043, + "step": 26033 + }, + { + "epoch": 3.0374518725936297, + "grad_norm": 1.233989953994751, + "learning_rate": 5.23815531535613e-05, + "loss": 1.8364, + "step": 26034 + }, + { + "epoch": 3.0375685450939214, + "grad_norm": 1.1328483819961548, + "learning_rate": 5.237029470953398e-05, + "loss": 1.6792, + "step": 26035 + }, + { + "epoch": 3.037685217594213, + "grad_norm": 1.2780954837799072, + "learning_rate": 5.2359037293173715e-05, + "loss": 1.868, + "step": 26036 + }, + { + "epoch": 3.0378018900945047, + "grad_norm": 1.2691789865493774, + "learning_rate": 5.2347780904597326e-05, + "loss": 1.8355, + "step": 26037 + }, + { + "epoch": 3.0379185625947964, + "grad_norm": 1.1782951354980469, + "learning_rate": 5.233652554392139e-05, + "loss": 1.7505, + "step": 26038 + }, + { + "epoch": 3.038035235095088, + "grad_norm": 1.2077356576919556, + "learning_rate": 5.23252712112627e-05, + "loss": 2.0, + "step": 26039 + }, + { + "epoch": 3.03815190759538, + "grad_norm": 1.1066968441009521, + "learning_rate": 5.2314017906737834e-05, + "loss": 1.7092, + "step": 26040 + }, + { + "epoch": 3.0382685800956715, + "grad_norm": 1.0459156036376953, + "learning_rate": 5.230276563046354e-05, + "loss": 1.7219, + "step": 26041 + }, + { + "epoch": 3.038385252595963, + "grad_norm": 1.257426142692566, + "learning_rate": 5.229151438255637e-05, + "loss": 1.8545, + "step": 26042 + }, + { + "epoch": 3.038501925096255, + "grad_norm": 1.2488479614257812, + "learning_rate": 5.2280264163133074e-05, + "loss": 1.8895, + "step": 26043 + }, + { + "epoch": 3.0386185975965465, + "grad_norm": 1.2571133375167847, + "learning_rate": 5.2269014972310186e-05, + "loss": 1.839, + "step": 26044 + }, + { + "epoch": 3.038735270096838, + "grad_norm": 1.1767743825912476, + "learning_rate": 5.2257766810204354e-05, + "loss": 1.9364, + "step": 26045 + }, + { + "epoch": 3.03885194259713, + "grad_norm": 1.2642748355865479, + "learning_rate": 5.2246519676932266e-05, + "loss": 1.8553, + "step": 26046 + }, + { + "epoch": 3.0389686150974216, + "grad_norm": 1.0952328443527222, + "learning_rate": 5.22352735726104e-05, + "loss": 1.7107, + "step": 26047 + }, + { + "epoch": 3.0390852875977132, + "grad_norm": 1.5044554471969604, + "learning_rate": 5.222402849735542e-05, + "loss": 2.0241, + "step": 26048 + }, + { + "epoch": 3.039201960098005, + "grad_norm": 1.4107266664505005, + "learning_rate": 5.221278445128385e-05, + "loss": 1.7595, + "step": 26049 + }, + { + "epoch": 3.0393186325982966, + "grad_norm": 1.1328357458114624, + "learning_rate": 5.220154143451234e-05, + "loss": 1.7638, + "step": 26050 + }, + { + "epoch": 3.0394353050985883, + "grad_norm": 1.192029356956482, + "learning_rate": 5.21902994471574e-05, + "loss": 1.7551, + "step": 26051 + }, + { + "epoch": 3.03955197759888, + "grad_norm": 1.2776129245758057, + "learning_rate": 5.217905848933549e-05, + "loss": 1.8434, + "step": 26052 + }, + { + "epoch": 3.0396686500991716, + "grad_norm": 1.2380623817443848, + "learning_rate": 5.2167818561163266e-05, + "loss": 1.8634, + "step": 26053 + }, + { + "epoch": 3.0397853225994633, + "grad_norm": 1.1900001764297485, + "learning_rate": 5.2156579662757234e-05, + "loss": 2.0062, + "step": 26054 + }, + { + "epoch": 3.039901995099755, + "grad_norm": 1.2488081455230713, + "learning_rate": 5.214534179423379e-05, + "loss": 1.9878, + "step": 26055 + }, + { + "epoch": 3.0400186676000467, + "grad_norm": 1.0439400672912598, + "learning_rate": 5.21341049557096e-05, + "loss": 1.8091, + "step": 26056 + }, + { + "epoch": 3.0401353401003384, + "grad_norm": 1.3000134229660034, + "learning_rate": 5.212286914730102e-05, + "loss": 1.8231, + "step": 26057 + }, + { + "epoch": 3.04025201260063, + "grad_norm": 1.1180648803710938, + "learning_rate": 5.211163436912458e-05, + "loss": 1.6573, + "step": 26058 + }, + { + "epoch": 3.0403686851009217, + "grad_norm": 1.314426302909851, + "learning_rate": 5.2100400621296803e-05, + "loss": 1.8061, + "step": 26059 + }, + { + "epoch": 3.0404853576012134, + "grad_norm": 1.2783344984054565, + "learning_rate": 5.208916790393407e-05, + "loss": 1.8153, + "step": 26060 + }, + { + "epoch": 3.040602030101505, + "grad_norm": 1.3864259719848633, + "learning_rate": 5.2077936217152885e-05, + "loss": 1.9711, + "step": 26061 + }, + { + "epoch": 3.0407187026017968, + "grad_norm": 1.218706488609314, + "learning_rate": 5.206670556106964e-05, + "loss": 1.8317, + "step": 26062 + }, + { + "epoch": 3.0408353751020885, + "grad_norm": 1.10364830493927, + "learning_rate": 5.205547593580081e-05, + "loss": 1.7919, + "step": 26063 + }, + { + "epoch": 3.04095204760238, + "grad_norm": 1.1272170543670654, + "learning_rate": 5.2044247341462746e-05, + "loss": 1.7833, + "step": 26064 + }, + { + "epoch": 3.041068720102672, + "grad_norm": 1.2715046405792236, + "learning_rate": 5.2033019778171954e-05, + "loss": 1.8991, + "step": 26065 + }, + { + "epoch": 3.0411853926029635, + "grad_norm": 1.2182095050811768, + "learning_rate": 5.20217932460447e-05, + "loss": 1.8606, + "step": 26066 + }, + { + "epoch": 3.041302065103255, + "grad_norm": 1.348634123802185, + "learning_rate": 5.201056774519751e-05, + "loss": 1.7462, + "step": 26067 + }, + { + "epoch": 3.041418737603547, + "grad_norm": 1.1799490451812744, + "learning_rate": 5.199934327574666e-05, + "loss": 1.7289, + "step": 26068 + }, + { + "epoch": 3.0415354101038385, + "grad_norm": 1.3757203817367554, + "learning_rate": 5.19881198378085e-05, + "loss": 1.834, + "step": 26069 + }, + { + "epoch": 3.0416520826041302, + "grad_norm": 1.415149211883545, + "learning_rate": 5.197689743149946e-05, + "loss": 1.77, + "step": 26070 + }, + { + "epoch": 3.041768755104422, + "grad_norm": 1.413608193397522, + "learning_rate": 5.196567605693579e-05, + "loss": 1.8627, + "step": 26071 + }, + { + "epoch": 3.0418854276047136, + "grad_norm": 1.3572862148284912, + "learning_rate": 5.195445571423394e-05, + "loss": 1.8205, + "step": 26072 + }, + { + "epoch": 3.0420021001050053, + "grad_norm": 1.3244552612304688, + "learning_rate": 5.19432364035101e-05, + "loss": 1.8746, + "step": 26073 + }, + { + "epoch": 3.042118772605297, + "grad_norm": 1.2417407035827637, + "learning_rate": 5.193201812488068e-05, + "loss": 1.6424, + "step": 26074 + }, + { + "epoch": 3.0422354451055886, + "grad_norm": 1.2442820072174072, + "learning_rate": 5.192080087846191e-05, + "loss": 1.7169, + "step": 26075 + }, + { + "epoch": 3.0423521176058803, + "grad_norm": 1.2910562753677368, + "learning_rate": 5.190958466437014e-05, + "loss": 1.8265, + "step": 26076 + }, + { + "epoch": 3.042468790106172, + "grad_norm": 1.1229780912399292, + "learning_rate": 5.189836948272157e-05, + "loss": 1.9377, + "step": 26077 + }, + { + "epoch": 3.0425854626064637, + "grad_norm": 0.9692203998565674, + "learning_rate": 5.188715533363256e-05, + "loss": 1.6285, + "step": 26078 + }, + { + "epoch": 3.0427021351067554, + "grad_norm": 1.1208868026733398, + "learning_rate": 5.187594221721927e-05, + "loss": 1.874, + "step": 26079 + }, + { + "epoch": 3.042818807607047, + "grad_norm": 1.216658592224121, + "learning_rate": 5.1864730133598055e-05, + "loss": 1.8072, + "step": 26080 + }, + { + "epoch": 3.0429354801073387, + "grad_norm": 1.3162676095962524, + "learning_rate": 5.1853519082885026e-05, + "loss": 1.8464, + "step": 26081 + }, + { + "epoch": 3.0430521526076304, + "grad_norm": 1.3262568712234497, + "learning_rate": 5.184230906519651e-05, + "loss": 1.8874, + "step": 26082 + }, + { + "epoch": 3.043168825107922, + "grad_norm": 1.3498343229293823, + "learning_rate": 5.183110008064864e-05, + "loss": 1.8163, + "step": 26083 + }, + { + "epoch": 3.0432854976082138, + "grad_norm": 1.1900728940963745, + "learning_rate": 5.181989212935769e-05, + "loss": 1.843, + "step": 26084 + }, + { + "epoch": 3.0434021701085054, + "grad_norm": 1.1128429174423218, + "learning_rate": 5.180868521143978e-05, + "loss": 1.8685, + "step": 26085 + }, + { + "epoch": 3.043518842608797, + "grad_norm": 1.3293850421905518, + "learning_rate": 5.179747932701117e-05, + "loss": 1.9409, + "step": 26086 + }, + { + "epoch": 3.043635515109089, + "grad_norm": 1.0631300210952759, + "learning_rate": 5.1786274476188e-05, + "loss": 1.8134, + "step": 26087 + }, + { + "epoch": 3.0437521876093805, + "grad_norm": 1.084287166595459, + "learning_rate": 5.177507065908638e-05, + "loss": 1.7701, + "step": 26088 + }, + { + "epoch": 3.043868860109672, + "grad_norm": 1.2373813390731812, + "learning_rate": 5.176386787582252e-05, + "loss": 1.9112, + "step": 26089 + }, + { + "epoch": 3.043985532609964, + "grad_norm": 1.342844009399414, + "learning_rate": 5.1752666126512506e-05, + "loss": 1.8363, + "step": 26090 + }, + { + "epoch": 3.0441022051102555, + "grad_norm": 1.1914410591125488, + "learning_rate": 5.174146541127252e-05, + "loss": 1.7877, + "step": 26091 + }, + { + "epoch": 3.044218877610547, + "grad_norm": 1.1921679973602295, + "learning_rate": 5.173026573021863e-05, + "loss": 1.8216, + "step": 26092 + }, + { + "epoch": 3.044335550110839, + "grad_norm": 1.1464277505874634, + "learning_rate": 5.1719067083467e-05, + "loss": 1.7341, + "step": 26093 + }, + { + "epoch": 3.0444522226111306, + "grad_norm": 1.1681004762649536, + "learning_rate": 5.170786947113364e-05, + "loss": 1.8143, + "step": 26094 + }, + { + "epoch": 3.0445688951114223, + "grad_norm": 1.365182876586914, + "learning_rate": 5.169667289333474e-05, + "loss": 1.9148, + "step": 26095 + }, + { + "epoch": 3.044685567611714, + "grad_norm": 1.2361353635787964, + "learning_rate": 5.168547735018628e-05, + "loss": 1.9619, + "step": 26096 + }, + { + "epoch": 3.0448022401120056, + "grad_norm": 1.1415605545043945, + "learning_rate": 5.167428284180434e-05, + "loss": 1.8368, + "step": 26097 + }, + { + "epoch": 3.0449189126122973, + "grad_norm": 1.0277705192565918, + "learning_rate": 5.1663089368305064e-05, + "loss": 1.7032, + "step": 26098 + }, + { + "epoch": 3.045035585112589, + "grad_norm": 1.2332266569137573, + "learning_rate": 5.165189692980435e-05, + "loss": 1.7485, + "step": 26099 + }, + { + "epoch": 3.0451522576128807, + "grad_norm": 1.1894556283950806, + "learning_rate": 5.164070552641838e-05, + "loss": 1.6786, + "step": 26100 + }, + { + "epoch": 3.0452689301131723, + "grad_norm": 1.1831512451171875, + "learning_rate": 5.162951515826303e-05, + "loss": 1.7218, + "step": 26101 + }, + { + "epoch": 3.045385602613464, + "grad_norm": 1.0957833528518677, + "learning_rate": 5.1618325825454426e-05, + "loss": 1.856, + "step": 26102 + }, + { + "epoch": 3.0455022751137557, + "grad_norm": 1.6008906364440918, + "learning_rate": 5.160713752810851e-05, + "loss": 1.8247, + "step": 26103 + }, + { + "epoch": 3.0456189476140474, + "grad_norm": 1.2954529523849487, + "learning_rate": 5.159595026634129e-05, + "loss": 1.8648, + "step": 26104 + }, + { + "epoch": 3.045735620114339, + "grad_norm": 1.2689237594604492, + "learning_rate": 5.158476404026867e-05, + "loss": 1.8418, + "step": 26105 + }, + { + "epoch": 3.0458522926146308, + "grad_norm": 1.4867074489593506, + "learning_rate": 5.1573578850006714e-05, + "loss": 1.8969, + "step": 26106 + }, + { + "epoch": 3.0459689651149224, + "grad_norm": 1.1721045970916748, + "learning_rate": 5.1562394695671314e-05, + "loss": 1.7914, + "step": 26107 + }, + { + "epoch": 3.046085637615214, + "grad_norm": 1.1710022687911987, + "learning_rate": 5.1551211577378476e-05, + "loss": 1.7376, + "step": 26108 + }, + { + "epoch": 3.046202310115506, + "grad_norm": 1.2196664810180664, + "learning_rate": 5.154002949524405e-05, + "loss": 1.6936, + "step": 26109 + }, + { + "epoch": 3.0463189826157975, + "grad_norm": 1.0822811126708984, + "learning_rate": 5.152884844938401e-05, + "loss": 1.7773, + "step": 26110 + }, + { + "epoch": 3.046435655116089, + "grad_norm": 1.2727134227752686, + "learning_rate": 5.1517668439914315e-05, + "loss": 1.7907, + "step": 26111 + }, + { + "epoch": 3.046552327616381, + "grad_norm": 1.2594506740570068, + "learning_rate": 5.150648946695079e-05, + "loss": 1.8361, + "step": 26112 + }, + { + "epoch": 3.0466690001166725, + "grad_norm": 1.3407424688339233, + "learning_rate": 5.149531153060937e-05, + "loss": 1.6561, + "step": 26113 + }, + { + "epoch": 3.046785672616964, + "grad_norm": 1.198189616203308, + "learning_rate": 5.148413463100589e-05, + "loss": 1.7668, + "step": 26114 + }, + { + "epoch": 3.046902345117256, + "grad_norm": 1.2654905319213867, + "learning_rate": 5.14729587682563e-05, + "loss": 1.7376, + "step": 26115 + }, + { + "epoch": 3.0470190176175476, + "grad_norm": 1.2774875164031982, + "learning_rate": 5.1461783942476345e-05, + "loss": 1.8466, + "step": 26116 + }, + { + "epoch": 3.0471356901178392, + "grad_norm": 1.3060245513916016, + "learning_rate": 5.145061015378201e-05, + "loss": 1.7514, + "step": 26117 + }, + { + "epoch": 3.047252362618131, + "grad_norm": 1.2460540533065796, + "learning_rate": 5.143943740228899e-05, + "loss": 1.9227, + "step": 26118 + }, + { + "epoch": 3.0473690351184226, + "grad_norm": 1.4008783102035522, + "learning_rate": 5.1428265688113243e-05, + "loss": 1.8567, + "step": 26119 + }, + { + "epoch": 3.0474857076187143, + "grad_norm": 1.1384739875793457, + "learning_rate": 5.1417095011370515e-05, + "loss": 1.72, + "step": 26120 + }, + { + "epoch": 3.047602380119006, + "grad_norm": 1.390806794166565, + "learning_rate": 5.140592537217657e-05, + "loss": 2.0162, + "step": 26121 + }, + { + "epoch": 3.0477190526192977, + "grad_norm": 1.2759507894515991, + "learning_rate": 5.1394756770647304e-05, + "loss": 1.9283, + "step": 26122 + }, + { + "epoch": 3.0478357251195893, + "grad_norm": 1.2043994665145874, + "learning_rate": 5.1383589206898406e-05, + "loss": 1.9482, + "step": 26123 + }, + { + "epoch": 3.047952397619881, + "grad_norm": 1.2059446573257446, + "learning_rate": 5.137242268104574e-05, + "loss": 1.8396, + "step": 26124 + }, + { + "epoch": 3.0480690701201727, + "grad_norm": 1.0207394361495972, + "learning_rate": 5.136125719320498e-05, + "loss": 1.664, + "step": 26125 + }, + { + "epoch": 3.0481857426204644, + "grad_norm": 1.2682433128356934, + "learning_rate": 5.1350092743491944e-05, + "loss": 1.7994, + "step": 26126 + }, + { + "epoch": 3.048302415120756, + "grad_norm": 1.2804462909698486, + "learning_rate": 5.1338929332022316e-05, + "loss": 1.7909, + "step": 26127 + }, + { + "epoch": 3.0484190876210477, + "grad_norm": 1.3060716390609741, + "learning_rate": 5.132776695891188e-05, + "loss": 1.8996, + "step": 26128 + }, + { + "epoch": 3.0485357601213394, + "grad_norm": 1.206502079963684, + "learning_rate": 5.131660562427631e-05, + "loss": 1.6892, + "step": 26129 + }, + { + "epoch": 3.048652432621631, + "grad_norm": 1.14938223361969, + "learning_rate": 5.1305445328231374e-05, + "loss": 1.8608, + "step": 26130 + }, + { + "epoch": 3.048769105121923, + "grad_norm": 1.042902946472168, + "learning_rate": 5.1294286070892676e-05, + "loss": 1.7027, + "step": 26131 + }, + { + "epoch": 3.0488857776222145, + "grad_norm": 1.3913075923919678, + "learning_rate": 5.128312785237602e-05, + "loss": 1.797, + "step": 26132 + }, + { + "epoch": 3.049002450122506, + "grad_norm": 1.0626581907272339, + "learning_rate": 5.127197067279695e-05, + "loss": 1.7184, + "step": 26133 + }, + { + "epoch": 3.049119122622798, + "grad_norm": 1.134833574295044, + "learning_rate": 5.1260814532271276e-05, + "loss": 1.8932, + "step": 26134 + }, + { + "epoch": 3.0492357951230895, + "grad_norm": 1.0895357131958008, + "learning_rate": 5.124965943091451e-05, + "loss": 1.7722, + "step": 26135 + }, + { + "epoch": 3.049352467623381, + "grad_norm": 1.1999191045761108, + "learning_rate": 5.1238505368842416e-05, + "loss": 1.8472, + "step": 26136 + }, + { + "epoch": 3.049469140123673, + "grad_norm": 1.254757285118103, + "learning_rate": 5.122735234617059e-05, + "loss": 1.7825, + "step": 26137 + }, + { + "epoch": 3.0495858126239646, + "grad_norm": 1.2074137926101685, + "learning_rate": 5.121620036301457e-05, + "loss": 1.8638, + "step": 26138 + }, + { + "epoch": 3.0497024851242562, + "grad_norm": 1.2022563219070435, + "learning_rate": 5.120504941949009e-05, + "loss": 1.8185, + "step": 26139 + }, + { + "epoch": 3.049819157624548, + "grad_norm": 1.2516390085220337, + "learning_rate": 5.119389951571264e-05, + "loss": 1.7288, + "step": 26140 + }, + { + "epoch": 3.0499358301248396, + "grad_norm": 1.3263754844665527, + "learning_rate": 5.118275065179793e-05, + "loss": 1.8705, + "step": 26141 + }, + { + "epoch": 3.0500525026251313, + "grad_norm": 1.2101140022277832, + "learning_rate": 5.1171602827861416e-05, + "loss": 1.8666, + "step": 26142 + }, + { + "epoch": 3.050169175125423, + "grad_norm": 1.072031021118164, + "learning_rate": 5.116045604401879e-05, + "loss": 1.8704, + "step": 26143 + }, + { + "epoch": 3.0502858476257146, + "grad_norm": 1.108901023864746, + "learning_rate": 5.1149310300385485e-05, + "loss": 1.7094, + "step": 26144 + }, + { + "epoch": 3.0504025201260063, + "grad_norm": 1.216025710105896, + "learning_rate": 5.113816559707717e-05, + "loss": 1.8196, + "step": 26145 + }, + { + "epoch": 3.050519192626298, + "grad_norm": 1.2225590944290161, + "learning_rate": 5.112702193420925e-05, + "loss": 1.9136, + "step": 26146 + }, + { + "epoch": 3.0506358651265897, + "grad_norm": 1.6105202436447144, + "learning_rate": 5.1115879311897384e-05, + "loss": 1.8772, + "step": 26147 + }, + { + "epoch": 3.0507525376268814, + "grad_norm": 1.139509916305542, + "learning_rate": 5.110473773025697e-05, + "loss": 1.8293, + "step": 26148 + }, + { + "epoch": 3.050869210127173, + "grad_norm": 1.1859321594238281, + "learning_rate": 5.109359718940355e-05, + "loss": 1.7524, + "step": 26149 + }, + { + "epoch": 3.0509858826274647, + "grad_norm": 1.1693288087844849, + "learning_rate": 5.1082457689452706e-05, + "loss": 1.7872, + "step": 26150 + }, + { + "epoch": 3.0511025551277564, + "grad_norm": 1.263797402381897, + "learning_rate": 5.107131923051978e-05, + "loss": 1.7748, + "step": 26151 + }, + { + "epoch": 3.051219227628048, + "grad_norm": 1.2171365022659302, + "learning_rate": 5.106018181272035e-05, + "loss": 1.6448, + "step": 26152 + }, + { + "epoch": 3.0513359001283398, + "grad_norm": 1.2700694799423218, + "learning_rate": 5.1049045436169847e-05, + "loss": 1.7499, + "step": 26153 + }, + { + "epoch": 3.0514525726286315, + "grad_norm": 1.0385169982910156, + "learning_rate": 5.1037910100983636e-05, + "loss": 1.6785, + "step": 26154 + }, + { + "epoch": 3.051569245128923, + "grad_norm": 1.1274381875991821, + "learning_rate": 5.1026775807277296e-05, + "loss": 1.669, + "step": 26155 + }, + { + "epoch": 3.051685917629215, + "grad_norm": 1.423659324645996, + "learning_rate": 5.101564255516619e-05, + "loss": 1.9489, + "step": 26156 + }, + { + "epoch": 3.0518025901295065, + "grad_norm": 1.3078117370605469, + "learning_rate": 5.1004510344765676e-05, + "loss": 1.7558, + "step": 26157 + }, + { + "epoch": 3.051919262629798, + "grad_norm": 1.4474694728851318, + "learning_rate": 5.0993379176191264e-05, + "loss": 1.9223, + "step": 26158 + }, + { + "epoch": 3.05203593513009, + "grad_norm": 1.2216119766235352, + "learning_rate": 5.098224904955824e-05, + "loss": 1.7651, + "step": 26159 + }, + { + "epoch": 3.0521526076303815, + "grad_norm": 1.3807299137115479, + "learning_rate": 5.097111996498212e-05, + "loss": 1.7069, + "step": 26160 + }, + { + "epoch": 3.0522692801306732, + "grad_norm": 1.2360212802886963, + "learning_rate": 5.095999192257816e-05, + "loss": 1.8723, + "step": 26161 + }, + { + "epoch": 3.052385952630965, + "grad_norm": 1.259460210800171, + "learning_rate": 5.094886492246176e-05, + "loss": 1.814, + "step": 26162 + }, + { + "epoch": 3.0525026251312566, + "grad_norm": 1.3173636198043823, + "learning_rate": 5.093773896474833e-05, + "loss": 1.9451, + "step": 26163 + }, + { + "epoch": 3.0526192976315483, + "grad_norm": 1.2749534845352173, + "learning_rate": 5.0926614049553135e-05, + "loss": 1.8187, + "step": 26164 + }, + { + "epoch": 3.05273597013184, + "grad_norm": 1.3490943908691406, + "learning_rate": 5.091549017699156e-05, + "loss": 1.9106, + "step": 26165 + }, + { + "epoch": 3.0528526426321316, + "grad_norm": 1.395774006843567, + "learning_rate": 5.090436734717888e-05, + "loss": 1.835, + "step": 26166 + }, + { + "epoch": 3.0529693151324233, + "grad_norm": 1.2261266708374023, + "learning_rate": 5.0893245560230463e-05, + "loss": 1.8265, + "step": 26167 + }, + { + "epoch": 3.053085987632715, + "grad_norm": 1.3156113624572754, + "learning_rate": 5.0882124816261525e-05, + "loss": 1.8813, + "step": 26168 + }, + { + "epoch": 3.0532026601330067, + "grad_norm": 1.1414823532104492, + "learning_rate": 5.0871005115387444e-05, + "loss": 1.7538, + "step": 26169 + }, + { + "epoch": 3.0533193326332984, + "grad_norm": 1.2041538953781128, + "learning_rate": 5.085988645772338e-05, + "loss": 1.7712, + "step": 26170 + }, + { + "epoch": 3.05343600513359, + "grad_norm": 1.2764161825180054, + "learning_rate": 5.084876884338475e-05, + "loss": 1.8024, + "step": 26171 + }, + { + "epoch": 3.0535526776338817, + "grad_norm": 1.4913432598114014, + "learning_rate": 5.083765227248672e-05, + "loss": 1.9154, + "step": 26172 + }, + { + "epoch": 3.0536693501341734, + "grad_norm": 1.3976976871490479, + "learning_rate": 5.0826536745144544e-05, + "loss": 2.0439, + "step": 26173 + }, + { + "epoch": 3.053786022634465, + "grad_norm": 1.2374833822250366, + "learning_rate": 5.081542226147341e-05, + "loss": 1.7566, + "step": 26174 + }, + { + "epoch": 3.0539026951347568, + "grad_norm": 1.1547714471817017, + "learning_rate": 5.0804308821588585e-05, + "loss": 1.9694, + "step": 26175 + }, + { + "epoch": 3.0540193676350484, + "grad_norm": 1.299154281616211, + "learning_rate": 5.079319642560534e-05, + "loss": 1.8822, + "step": 26176 + }, + { + "epoch": 3.05413604013534, + "grad_norm": 1.5708116292953491, + "learning_rate": 5.078208507363874e-05, + "loss": 1.9516, + "step": 26177 + }, + { + "epoch": 3.054252712635632, + "grad_norm": 1.234053134918213, + "learning_rate": 5.077097476580412e-05, + "loss": 1.7963, + "step": 26178 + }, + { + "epoch": 3.0543693851359235, + "grad_norm": 1.1419600248336792, + "learning_rate": 5.0759865502216554e-05, + "loss": 1.7476, + "step": 26179 + }, + { + "epoch": 3.054486057636215, + "grad_norm": 1.3305308818817139, + "learning_rate": 5.0748757282991295e-05, + "loss": 1.8693, + "step": 26180 + }, + { + "epoch": 3.054602730136507, + "grad_norm": 1.375942349433899, + "learning_rate": 5.0737650108243394e-05, + "loss": 1.6897, + "step": 26181 + }, + { + "epoch": 3.0547194026367985, + "grad_norm": 1.1477046012878418, + "learning_rate": 5.072654397808811e-05, + "loss": 1.7984, + "step": 26182 + }, + { + "epoch": 3.05483607513709, + "grad_norm": 1.1359686851501465, + "learning_rate": 5.071543889264048e-05, + "loss": 1.6067, + "step": 26183 + }, + { + "epoch": 3.054952747637382, + "grad_norm": 1.1907027959823608, + "learning_rate": 5.070433485201574e-05, + "loss": 2.0509, + "step": 26184 + }, + { + "epoch": 3.0550694201376736, + "grad_norm": 1.1692538261413574, + "learning_rate": 5.069323185632888e-05, + "loss": 1.8734, + "step": 26185 + }, + { + "epoch": 3.0551860926379653, + "grad_norm": 1.1811171770095825, + "learning_rate": 5.068212990569511e-05, + "loss": 1.7742, + "step": 26186 + }, + { + "epoch": 3.055302765138257, + "grad_norm": 1.1176447868347168, + "learning_rate": 5.0671029000229427e-05, + "loss": 1.7951, + "step": 26187 + }, + { + "epoch": 3.0554194376385486, + "grad_norm": 1.2249559164047241, + "learning_rate": 5.065992914004703e-05, + "loss": 1.8255, + "step": 26188 + }, + { + "epoch": 3.0555361101388403, + "grad_norm": 1.2856030464172363, + "learning_rate": 5.0648830325262906e-05, + "loss": 1.7499, + "step": 26189 + }, + { + "epoch": 3.055652782639132, + "grad_norm": 1.2308686971664429, + "learning_rate": 5.06377325559921e-05, + "loss": 1.9235, + "step": 26190 + }, + { + "epoch": 3.0557694551394237, + "grad_norm": 1.3978526592254639, + "learning_rate": 5.062663583234973e-05, + "loss": 1.8196, + "step": 26191 + }, + { + "epoch": 3.0558861276397153, + "grad_norm": 1.0842729806900024, + "learning_rate": 5.061554015445075e-05, + "loss": 1.7717, + "step": 26192 + }, + { + "epoch": 3.056002800140007, + "grad_norm": 1.2023530006408691, + "learning_rate": 5.060444552241029e-05, + "loss": 1.77, + "step": 26193 + }, + { + "epoch": 3.0561194726402987, + "grad_norm": 1.2328953742980957, + "learning_rate": 5.0593351936343276e-05, + "loss": 1.8777, + "step": 26194 + }, + { + "epoch": 3.0562361451405904, + "grad_norm": 1.3459508419036865, + "learning_rate": 5.058225939636478e-05, + "loss": 1.6893, + "step": 26195 + }, + { + "epoch": 3.056352817640882, + "grad_norm": 1.154862403869629, + "learning_rate": 5.0571167902589734e-05, + "loss": 1.6586, + "step": 26196 + }, + { + "epoch": 3.0564694901411738, + "grad_norm": 1.179547667503357, + "learning_rate": 5.0560077455133206e-05, + "loss": 1.7658, + "step": 26197 + }, + { + "epoch": 3.0565861626414654, + "grad_norm": 1.6392954587936401, + "learning_rate": 5.054898805411006e-05, + "loss": 2.0011, + "step": 26198 + }, + { + "epoch": 3.056702835141757, + "grad_norm": 1.215429663658142, + "learning_rate": 5.053789969963537e-05, + "loss": 1.7184, + "step": 26199 + }, + { + "epoch": 3.056819507642049, + "grad_norm": 1.3189525604248047, + "learning_rate": 5.0526812391824e-05, + "loss": 1.827, + "step": 26200 + }, + { + "epoch": 3.0569361801423405, + "grad_norm": 1.2367099523544312, + "learning_rate": 5.051572613079091e-05, + "loss": 1.8579, + "step": 26201 + }, + { + "epoch": 3.057052852642632, + "grad_norm": 1.2492287158966064, + "learning_rate": 5.050464091665111e-05, + "loss": 1.875, + "step": 26202 + }, + { + "epoch": 3.057169525142924, + "grad_norm": 1.2644156217575073, + "learning_rate": 5.0493556749519404e-05, + "loss": 1.8197, + "step": 26203 + }, + { + "epoch": 3.0572861976432155, + "grad_norm": 1.147269368171692, + "learning_rate": 5.048247362951082e-05, + "loss": 1.7335, + "step": 26204 + }, + { + "epoch": 3.057402870143507, + "grad_norm": 1.2854676246643066, + "learning_rate": 5.0471391556740155e-05, + "loss": 1.7533, + "step": 26205 + }, + { + "epoch": 3.057519542643799, + "grad_norm": 1.2251713275909424, + "learning_rate": 5.04603105313223e-05, + "loss": 1.7299, + "step": 26206 + }, + { + "epoch": 3.0576362151440906, + "grad_norm": 1.3031822443008423, + "learning_rate": 5.044923055337222e-05, + "loss": 1.7325, + "step": 26207 + }, + { + "epoch": 3.0577528876443822, + "grad_norm": 1.1181541681289673, + "learning_rate": 5.04381516230047e-05, + "loss": 1.8809, + "step": 26208 + }, + { + "epoch": 3.057869560144674, + "grad_norm": 1.2922319173812866, + "learning_rate": 5.0427073740334575e-05, + "loss": 1.9222, + "step": 26209 + }, + { + "epoch": 3.0579862326449656, + "grad_norm": 1.0025843381881714, + "learning_rate": 5.041599690547679e-05, + "loss": 1.7504, + "step": 26210 + }, + { + "epoch": 3.0581029051452573, + "grad_norm": 1.3856041431427002, + "learning_rate": 5.040492111854605e-05, + "loss": 1.8443, + "step": 26211 + }, + { + "epoch": 3.058219577645549, + "grad_norm": 1.327996850013733, + "learning_rate": 5.039384637965729e-05, + "loss": 1.8663, + "step": 26212 + }, + { + "epoch": 3.0583362501458407, + "grad_norm": 1.1970059871673584, + "learning_rate": 5.0382772688925233e-05, + "loss": 1.7273, + "step": 26213 + }, + { + "epoch": 3.0584529226461323, + "grad_norm": 1.18326997756958, + "learning_rate": 5.037170004646472e-05, + "loss": 1.7996, + "step": 26214 + }, + { + "epoch": 3.058569595146424, + "grad_norm": 1.3803939819335938, + "learning_rate": 5.036062845239057e-05, + "loss": 1.8339, + "step": 26215 + }, + { + "epoch": 3.0586862676467157, + "grad_norm": 1.3273838758468628, + "learning_rate": 5.03495579068175e-05, + "loss": 1.8718, + "step": 26216 + }, + { + "epoch": 3.0588029401470074, + "grad_norm": 1.3163350820541382, + "learning_rate": 5.033848840986034e-05, + "loss": 1.772, + "step": 26217 + }, + { + "epoch": 3.058919612647299, + "grad_norm": 1.2510052919387817, + "learning_rate": 5.032741996163376e-05, + "loss": 1.7919, + "step": 26218 + }, + { + "epoch": 3.0590362851475907, + "grad_norm": 1.1817537546157837, + "learning_rate": 5.031635256225263e-05, + "loss": 1.8775, + "step": 26219 + }, + { + "epoch": 3.0591529576478824, + "grad_norm": 1.1998368501663208, + "learning_rate": 5.030528621183155e-05, + "loss": 1.8553, + "step": 26220 + }, + { + "epoch": 3.059269630148174, + "grad_norm": 1.1810486316680908, + "learning_rate": 5.0294220910485356e-05, + "loss": 1.7827, + "step": 26221 + }, + { + "epoch": 3.059386302648466, + "grad_norm": 1.1503031253814697, + "learning_rate": 5.028315665832871e-05, + "loss": 1.7203, + "step": 26222 + }, + { + "epoch": 3.0595029751487575, + "grad_norm": 1.4906293153762817, + "learning_rate": 5.027209345547627e-05, + "loss": 1.9509, + "step": 26223 + }, + { + "epoch": 3.059619647649049, + "grad_norm": 1.1600899696350098, + "learning_rate": 5.026103130204281e-05, + "loss": 1.876, + "step": 26224 + }, + { + "epoch": 3.059736320149341, + "grad_norm": 1.6436570882797241, + "learning_rate": 5.024997019814298e-05, + "loss": 1.8421, + "step": 26225 + }, + { + "epoch": 3.0598529926496325, + "grad_norm": 1.267439842224121, + "learning_rate": 5.023891014389139e-05, + "loss": 1.8877, + "step": 26226 + }, + { + "epoch": 3.059969665149924, + "grad_norm": 1.25113844871521, + "learning_rate": 5.0227851139402765e-05, + "loss": 1.9654, + "step": 26227 + }, + { + "epoch": 3.060086337650216, + "grad_norm": 1.2814134359359741, + "learning_rate": 5.021679318479177e-05, + "loss": 1.8374, + "step": 26228 + }, + { + "epoch": 3.0602030101505076, + "grad_norm": 1.065202236175537, + "learning_rate": 5.020573628017296e-05, + "loss": 1.6025, + "step": 26229 + }, + { + "epoch": 3.0603196826507992, + "grad_norm": 1.300919771194458, + "learning_rate": 5.019468042566106e-05, + "loss": 1.9877, + "step": 26230 + }, + { + "epoch": 3.060436355151091, + "grad_norm": 1.194186806678772, + "learning_rate": 5.018362562137059e-05, + "loss": 1.8882, + "step": 26231 + }, + { + "epoch": 3.0605530276513826, + "grad_norm": 1.1964863538742065, + "learning_rate": 5.017257186741624e-05, + "loss": 1.9026, + "step": 26232 + }, + { + "epoch": 3.0606697001516743, + "grad_norm": 1.8079735040664673, + "learning_rate": 5.0161519163912526e-05, + "loss": 1.8548, + "step": 26233 + }, + { + "epoch": 3.060786372651966, + "grad_norm": 1.0622062683105469, + "learning_rate": 5.015046751097412e-05, + "loss": 1.7883, + "step": 26234 + }, + { + "epoch": 3.0609030451522576, + "grad_norm": 1.2012823820114136, + "learning_rate": 5.013941690871547e-05, + "loss": 1.7714, + "step": 26235 + }, + { + "epoch": 3.0610197176525493, + "grad_norm": 1.1084635257720947, + "learning_rate": 5.0128367357251266e-05, + "loss": 1.611, + "step": 26236 + }, + { + "epoch": 3.061136390152841, + "grad_norm": 1.1960097551345825, + "learning_rate": 5.011731885669595e-05, + "loss": 1.7975, + "step": 26237 + }, + { + "epoch": 3.0612530626531327, + "grad_norm": 1.0800046920776367, + "learning_rate": 5.010627140716415e-05, + "loss": 1.6334, + "step": 26238 + }, + { + "epoch": 3.0613697351534244, + "grad_norm": 1.227168321609497, + "learning_rate": 5.0095225008770304e-05, + "loss": 1.853, + "step": 26239 + }, + { + "epoch": 3.061486407653716, + "grad_norm": 1.0493284463882446, + "learning_rate": 5.008417966162903e-05, + "loss": 1.7464, + "step": 26240 + }, + { + "epoch": 3.0616030801540077, + "grad_norm": 1.154240369796753, + "learning_rate": 5.007313536585478e-05, + "loss": 1.7803, + "step": 26241 + }, + { + "epoch": 3.0617197526542994, + "grad_norm": 1.2398264408111572, + "learning_rate": 5.0062092121561996e-05, + "loss": 1.7933, + "step": 26242 + }, + { + "epoch": 3.061836425154591, + "grad_norm": 1.2166216373443604, + "learning_rate": 5.0051049928865254e-05, + "loss": 1.73, + "step": 26243 + }, + { + "epoch": 3.0619530976548828, + "grad_norm": 1.1216181516647339, + "learning_rate": 5.004000878787897e-05, + "loss": 1.8649, + "step": 26244 + }, + { + "epoch": 3.0620697701551745, + "grad_norm": 1.326841115951538, + "learning_rate": 5.002896869871765e-05, + "loss": 1.8808, + "step": 26245 + }, + { + "epoch": 3.062186442655466, + "grad_norm": 1.2541346549987793, + "learning_rate": 5.001792966149568e-05, + "loss": 1.8797, + "step": 26246 + }, + { + "epoch": 3.062303115155758, + "grad_norm": 1.5210436582565308, + "learning_rate": 5.0006891676327585e-05, + "loss": 1.8541, + "step": 26247 + }, + { + "epoch": 3.0624197876560495, + "grad_norm": 1.156085729598999, + "learning_rate": 4.999585474332771e-05, + "loss": 1.6567, + "step": 26248 + }, + { + "epoch": 3.062536460156341, + "grad_norm": 1.2605153322219849, + "learning_rate": 4.998481886261056e-05, + "loss": 1.7248, + "step": 26249 + }, + { + "epoch": 3.062653132656633, + "grad_norm": 1.2810022830963135, + "learning_rate": 4.997378403429045e-05, + "loss": 1.8256, + "step": 26250 + }, + { + "epoch": 3.0627698051569245, + "grad_norm": 1.461142659187317, + "learning_rate": 4.996275025848186e-05, + "loss": 1.881, + "step": 26251 + }, + { + "epoch": 3.0628864776572162, + "grad_norm": 1.2397187948226929, + "learning_rate": 4.99517175352991e-05, + "loss": 1.9631, + "step": 26252 + }, + { + "epoch": 3.063003150157508, + "grad_norm": 1.354343295097351, + "learning_rate": 4.994068586485663e-05, + "loss": 1.8752, + "step": 26253 + }, + { + "epoch": 3.0631198226577996, + "grad_norm": 1.0978569984436035, + "learning_rate": 4.992965524726873e-05, + "loss": 1.6562, + "step": 26254 + }, + { + "epoch": 3.0632364951580913, + "grad_norm": 1.2949061393737793, + "learning_rate": 4.991862568264978e-05, + "loss": 1.8737, + "step": 26255 + }, + { + "epoch": 3.063353167658383, + "grad_norm": 1.260159969329834, + "learning_rate": 4.990759717111419e-05, + "loss": 1.7143, + "step": 26256 + }, + { + "epoch": 3.0634698401586746, + "grad_norm": 1.2337108850479126, + "learning_rate": 4.989656971277624e-05, + "loss": 1.6435, + "step": 26257 + }, + { + "epoch": 3.0635865126589663, + "grad_norm": 1.130759596824646, + "learning_rate": 4.988554330775024e-05, + "loss": 1.9496, + "step": 26258 + }, + { + "epoch": 3.063703185159258, + "grad_norm": 1.2827540636062622, + "learning_rate": 4.987451795615047e-05, + "loss": 1.9331, + "step": 26259 + }, + { + "epoch": 3.0638198576595497, + "grad_norm": 1.4615695476531982, + "learning_rate": 4.98634936580913e-05, + "loss": 1.8548, + "step": 26260 + }, + { + "epoch": 3.0639365301598414, + "grad_norm": 0.9930498600006104, + "learning_rate": 4.9852470413686936e-05, + "loss": 1.7449, + "step": 26261 + }, + { + "epoch": 3.064053202660133, + "grad_norm": 1.2037663459777832, + "learning_rate": 4.9841448223051766e-05, + "loss": 1.7648, + "step": 26262 + }, + { + "epoch": 3.0641698751604247, + "grad_norm": 1.1059619188308716, + "learning_rate": 4.983042708629992e-05, + "loss": 1.9407, + "step": 26263 + }, + { + "epoch": 3.0642865476607164, + "grad_norm": 1.4130523204803467, + "learning_rate": 4.981940700354579e-05, + "loss": 1.8112, + "step": 26264 + }, + { + "epoch": 3.064403220161008, + "grad_norm": 1.192773461341858, + "learning_rate": 4.9808387974903494e-05, + "loss": 1.7616, + "step": 26265 + }, + { + "epoch": 3.0645198926612998, + "grad_norm": 1.2268637418746948, + "learning_rate": 4.979737000048738e-05, + "loss": 1.9006, + "step": 26266 + }, + { + "epoch": 3.0646365651615914, + "grad_norm": 1.1882423162460327, + "learning_rate": 4.9786353080411575e-05, + "loss": 1.7172, + "step": 26267 + }, + { + "epoch": 3.064753237661883, + "grad_norm": 1.2353858947753906, + "learning_rate": 4.977533721479033e-05, + "loss": 1.8449, + "step": 26268 + }, + { + "epoch": 3.064869910162175, + "grad_norm": 1.1923913955688477, + "learning_rate": 4.9764322403737876e-05, + "loss": 1.8674, + "step": 26269 + }, + { + "epoch": 3.0649865826624665, + "grad_norm": 1.3191773891448975, + "learning_rate": 4.975330864736835e-05, + "loss": 1.8015, + "step": 26270 + }, + { + "epoch": 3.065103255162758, + "grad_norm": 1.3415039777755737, + "learning_rate": 4.9742295945795965e-05, + "loss": 2.031, + "step": 26271 + }, + { + "epoch": 3.06521992766305, + "grad_norm": 1.212060570716858, + "learning_rate": 4.9731284299134855e-05, + "loss": 1.7777, + "step": 26272 + }, + { + "epoch": 3.0653366001633415, + "grad_norm": 1.207112193107605, + "learning_rate": 4.9720273707499236e-05, + "loss": 1.7883, + "step": 26273 + }, + { + "epoch": 3.065453272663633, + "grad_norm": 1.4010294675827026, + "learning_rate": 4.970926417100322e-05, + "loss": 1.9914, + "step": 26274 + }, + { + "epoch": 3.065569945163925, + "grad_norm": 1.2563567161560059, + "learning_rate": 4.96982556897609e-05, + "loss": 1.7012, + "step": 26275 + }, + { + "epoch": 3.0656866176642166, + "grad_norm": 1.1387494802474976, + "learning_rate": 4.968724826388648e-05, + "loss": 1.6495, + "step": 26276 + }, + { + "epoch": 3.0658032901645083, + "grad_norm": 1.327236533164978, + "learning_rate": 4.9676241893494015e-05, + "loss": 1.9006, + "step": 26277 + }, + { + "epoch": 3.0659199626648, + "grad_norm": 1.1098054647445679, + "learning_rate": 4.96652365786976e-05, + "loss": 1.6925, + "step": 26278 + }, + { + "epoch": 3.0660366351650916, + "grad_norm": 1.2833893299102783, + "learning_rate": 4.965423231961137e-05, + "loss": 1.8976, + "step": 26279 + }, + { + "epoch": 3.0661533076653833, + "grad_norm": 1.3150523900985718, + "learning_rate": 4.9643229116349346e-05, + "loss": 1.832, + "step": 26280 + }, + { + "epoch": 3.066269980165675, + "grad_norm": 1.256859540939331, + "learning_rate": 4.963222696902563e-05, + "loss": 1.8442, + "step": 26281 + }, + { + "epoch": 3.0663866526659667, + "grad_norm": 1.490511178970337, + "learning_rate": 4.9621225877754345e-05, + "loss": 1.8048, + "step": 26282 + }, + { + "epoch": 3.0665033251662583, + "grad_norm": 1.1498302221298218, + "learning_rate": 4.9610225842649406e-05, + "loss": 1.8182, + "step": 26283 + }, + { + "epoch": 3.06661999766655, + "grad_norm": 1.1121031045913696, + "learning_rate": 4.959922686382499e-05, + "loss": 1.8003, + "step": 26284 + }, + { + "epoch": 3.0667366701668417, + "grad_norm": 1.8829010725021362, + "learning_rate": 4.958822894139499e-05, + "loss": 1.7107, + "step": 26285 + }, + { + "epoch": 3.0668533426671334, + "grad_norm": 1.0392554998397827, + "learning_rate": 4.9577232075473534e-05, + "loss": 1.638, + "step": 26286 + }, + { + "epoch": 3.066970015167425, + "grad_norm": 1.2796285152435303, + "learning_rate": 4.9566236266174525e-05, + "loss": 1.9773, + "step": 26287 + }, + { + "epoch": 3.0670866876677167, + "grad_norm": 1.3554869890213013, + "learning_rate": 4.955524151361207e-05, + "loss": 1.8519, + "step": 26288 + }, + { + "epoch": 3.0672033601680084, + "grad_norm": 1.3205422163009644, + "learning_rate": 4.954424781790002e-05, + "loss": 1.9667, + "step": 26289 + }, + { + "epoch": 3.0673200326683, + "grad_norm": 1.343489170074463, + "learning_rate": 4.9533255179152455e-05, + "loss": 2.0154, + "step": 26290 + }, + { + "epoch": 3.067436705168592, + "grad_norm": 1.322637915611267, + "learning_rate": 4.952226359748325e-05, + "loss": 1.8711, + "step": 26291 + }, + { + "epoch": 3.0675533776688835, + "grad_norm": 1.1015636920928955, + "learning_rate": 4.951127307300643e-05, + "loss": 1.8007, + "step": 26292 + }, + { + "epoch": 3.067670050169175, + "grad_norm": 1.2419326305389404, + "learning_rate": 4.95002836058359e-05, + "loss": 1.7132, + "step": 26293 + }, + { + "epoch": 3.067786722669467, + "grad_norm": 1.6186363697052002, + "learning_rate": 4.948929519608555e-05, + "loss": 1.8856, + "step": 26294 + }, + { + "epoch": 3.0679033951697585, + "grad_norm": 1.195281982421875, + "learning_rate": 4.947830784386936e-05, + "loss": 1.7678, + "step": 26295 + }, + { + "epoch": 3.06802006767005, + "grad_norm": 1.2660174369812012, + "learning_rate": 4.946732154930117e-05, + "loss": 2.0167, + "step": 26296 + }, + { + "epoch": 3.068136740170342, + "grad_norm": 1.4075192213058472, + "learning_rate": 4.945633631249494e-05, + "loss": 1.9881, + "step": 26297 + }, + { + "epoch": 3.0682534126706336, + "grad_norm": 1.2351418733596802, + "learning_rate": 4.9445352133564486e-05, + "loss": 1.8483, + "step": 26298 + }, + { + "epoch": 3.0683700851709252, + "grad_norm": 1.1764564514160156, + "learning_rate": 4.9434369012623756e-05, + "loss": 1.7209, + "step": 26299 + }, + { + "epoch": 3.068486757671217, + "grad_norm": 1.2145034074783325, + "learning_rate": 4.942338694978651e-05, + "loss": 1.8738, + "step": 26300 + }, + { + "epoch": 3.0686034301715086, + "grad_norm": 1.1800127029418945, + "learning_rate": 4.941240594516671e-05, + "loss": 1.8138, + "step": 26301 + }, + { + "epoch": 3.0687201026718003, + "grad_norm": 1.2264900207519531, + "learning_rate": 4.9401425998878104e-05, + "loss": 1.7904, + "step": 26302 + }, + { + "epoch": 3.068836775172092, + "grad_norm": 1.1257046461105347, + "learning_rate": 4.93904471110346e-05, + "loss": 1.7924, + "step": 26303 + }, + { + "epoch": 3.0689534476723836, + "grad_norm": 1.2693766355514526, + "learning_rate": 4.937946928174992e-05, + "loss": 1.8981, + "step": 26304 + }, + { + "epoch": 3.0690701201726753, + "grad_norm": 1.3762027025222778, + "learning_rate": 4.9368492511137964e-05, + "loss": 1.9013, + "step": 26305 + }, + { + "epoch": 3.069186792672967, + "grad_norm": 1.2840895652770996, + "learning_rate": 4.9357516799312446e-05, + "loss": 1.8716, + "step": 26306 + }, + { + "epoch": 3.0693034651732587, + "grad_norm": 1.2183537483215332, + "learning_rate": 4.934654214638724e-05, + "loss": 1.942, + "step": 26307 + }, + { + "epoch": 3.0694201376735504, + "grad_norm": 1.2581117153167725, + "learning_rate": 4.9335568552476e-05, + "loss": 1.9151, + "step": 26308 + }, + { + "epoch": 3.069536810173842, + "grad_norm": 1.3490474224090576, + "learning_rate": 4.932459601769263e-05, + "loss": 1.781, + "step": 26309 + }, + { + "epoch": 3.0696534826741337, + "grad_norm": 1.399034023284912, + "learning_rate": 4.9313624542150784e-05, + "loss": 1.7254, + "step": 26310 + }, + { + "epoch": 3.0697701551744254, + "grad_norm": 1.3055202960968018, + "learning_rate": 4.930265412596419e-05, + "loss": 1.8332, + "step": 26311 + }, + { + "epoch": 3.069886827674717, + "grad_norm": 1.351904034614563, + "learning_rate": 4.9291684769246656e-05, + "loss": 2.0973, + "step": 26312 + }, + { + "epoch": 3.070003500175009, + "grad_norm": 1.3462351560592651, + "learning_rate": 4.928071647211182e-05, + "loss": 1.8143, + "step": 26313 + }, + { + "epoch": 3.0701201726753005, + "grad_norm": 1.2814439535140991, + "learning_rate": 4.926974923467348e-05, + "loss": 1.8292, + "step": 26314 + }, + { + "epoch": 3.070236845175592, + "grad_norm": 1.127272129058838, + "learning_rate": 4.92587830570452e-05, + "loss": 1.8434, + "step": 26315 + }, + { + "epoch": 3.070353517675884, + "grad_norm": 1.0754797458648682, + "learning_rate": 4.9247817939340815e-05, + "loss": 1.8588, + "step": 26316 + }, + { + "epoch": 3.0704701901761755, + "grad_norm": 1.3996351957321167, + "learning_rate": 4.923685388167387e-05, + "loss": 1.8951, + "step": 26317 + }, + { + "epoch": 3.070586862676467, + "grad_norm": 1.343936800956726, + "learning_rate": 4.922589088415812e-05, + "loss": 1.8784, + "step": 26318 + }, + { + "epoch": 3.070703535176759, + "grad_norm": 1.3630090951919556, + "learning_rate": 4.921492894690716e-05, + "loss": 2.0456, + "step": 26319 + }, + { + "epoch": 3.0708202076770506, + "grad_norm": 1.2002570629119873, + "learning_rate": 4.920396807003465e-05, + "loss": 1.7696, + "step": 26320 + }, + { + "epoch": 3.0709368801773422, + "grad_norm": 1.3429944515228271, + "learning_rate": 4.919300825365427e-05, + "loss": 1.763, + "step": 26321 + }, + { + "epoch": 3.071053552677634, + "grad_norm": 1.1261399984359741, + "learning_rate": 4.918204949787953e-05, + "loss": 1.6475, + "step": 26322 + }, + { + "epoch": 3.0711702251779256, + "grad_norm": 1.2253773212432861, + "learning_rate": 4.917109180282417e-05, + "loss": 1.8095, + "step": 26323 + }, + { + "epoch": 3.0712868976782173, + "grad_norm": 1.1721761226654053, + "learning_rate": 4.916013516860166e-05, + "loss": 1.9568, + "step": 26324 + }, + { + "epoch": 3.071403570178509, + "grad_norm": 1.1794800758361816, + "learning_rate": 4.914917959532569e-05, + "loss": 1.7748, + "step": 26325 + }, + { + "epoch": 3.0715202426788006, + "grad_norm": 1.18965744972229, + "learning_rate": 4.91382250831098e-05, + "loss": 1.6873, + "step": 26326 + }, + { + "epoch": 3.0716369151790923, + "grad_norm": 1.322331190109253, + "learning_rate": 4.912727163206754e-05, + "loss": 1.897, + "step": 26327 + }, + { + "epoch": 3.071753587679384, + "grad_norm": 1.3681551218032837, + "learning_rate": 4.911631924231243e-05, + "loss": 1.978, + "step": 26328 + }, + { + "epoch": 3.0718702601796757, + "grad_norm": 1.0591071844100952, + "learning_rate": 4.910536791395808e-05, + "loss": 1.7376, + "step": 26329 + }, + { + "epoch": 3.0719869326799674, + "grad_norm": 1.1813387870788574, + "learning_rate": 4.909441764711796e-05, + "loss": 1.8167, + "step": 26330 + }, + { + "epoch": 3.072103605180259, + "grad_norm": 1.3856260776519775, + "learning_rate": 4.9083468441905665e-05, + "loss": 1.9364, + "step": 26331 + }, + { + "epoch": 3.0722202776805507, + "grad_norm": 1.2476348876953125, + "learning_rate": 4.907252029843462e-05, + "loss": 1.7437, + "step": 26332 + }, + { + "epoch": 3.0723369501808424, + "grad_norm": 1.221793293952942, + "learning_rate": 4.906157321681835e-05, + "loss": 1.8633, + "step": 26333 + }, + { + "epoch": 3.072453622681134, + "grad_norm": 1.3166332244873047, + "learning_rate": 4.9050627197170406e-05, + "loss": 1.8316, + "step": 26334 + }, + { + "epoch": 3.0725702951814258, + "grad_norm": 1.3126612901687622, + "learning_rate": 4.9039682239604175e-05, + "loss": 1.8687, + "step": 26335 + }, + { + "epoch": 3.0726869676817175, + "grad_norm": 1.2408745288848877, + "learning_rate": 4.902873834423321e-05, + "loss": 1.8577, + "step": 26336 + }, + { + "epoch": 3.072803640182009, + "grad_norm": 1.3302321434020996, + "learning_rate": 4.901779551117087e-05, + "loss": 1.7852, + "step": 26337 + }, + { + "epoch": 3.072920312682301, + "grad_norm": 1.194350242614746, + "learning_rate": 4.900685374053069e-05, + "loss": 1.8161, + "step": 26338 + }, + { + "epoch": 3.0730369851825925, + "grad_norm": 1.1453065872192383, + "learning_rate": 4.899591303242602e-05, + "loss": 1.8164, + "step": 26339 + }, + { + "epoch": 3.073153657682884, + "grad_norm": 1.2666257619857788, + "learning_rate": 4.898497338697034e-05, + "loss": 1.8693, + "step": 26340 + }, + { + "epoch": 3.073270330183176, + "grad_norm": 1.3194411993026733, + "learning_rate": 4.897403480427701e-05, + "loss": 1.9121, + "step": 26341 + }, + { + "epoch": 3.0733870026834675, + "grad_norm": 1.238150954246521, + "learning_rate": 4.8963097284459506e-05, + "loss": 1.8269, + "step": 26342 + }, + { + "epoch": 3.073503675183759, + "grad_norm": 1.4775621891021729, + "learning_rate": 4.895216082763116e-05, + "loss": 1.8452, + "step": 26343 + }, + { + "epoch": 3.073620347684051, + "grad_norm": 1.3258121013641357, + "learning_rate": 4.8941225433905326e-05, + "loss": 1.717, + "step": 26344 + }, + { + "epoch": 3.0737370201843426, + "grad_norm": 1.233337163925171, + "learning_rate": 4.893029110339544e-05, + "loss": 1.8123, + "step": 26345 + }, + { + "epoch": 3.0738536926846343, + "grad_norm": 1.173447608947754, + "learning_rate": 4.891935783621477e-05, + "loss": 1.7785, + "step": 26346 + }, + { + "epoch": 3.073970365184926, + "grad_norm": 1.5523619651794434, + "learning_rate": 4.890842563247675e-05, + "loss": 1.9525, + "step": 26347 + }, + { + "epoch": 3.0740870376852176, + "grad_norm": 1.3454779386520386, + "learning_rate": 4.889749449229462e-05, + "loss": 1.8199, + "step": 26348 + }, + { + "epoch": 3.0742037101855093, + "grad_norm": 1.4642654657363892, + "learning_rate": 4.8886564415781825e-05, + "loss": 1.9666, + "step": 26349 + }, + { + "epoch": 3.074320382685801, + "grad_norm": 1.2507582902908325, + "learning_rate": 4.887563540305154e-05, + "loss": 1.7392, + "step": 26350 + }, + { + "epoch": 3.0744370551860927, + "grad_norm": 1.158685326576233, + "learning_rate": 4.8864707454217175e-05, + "loss": 1.753, + "step": 26351 + }, + { + "epoch": 3.0745537276863844, + "grad_norm": 1.3758807182312012, + "learning_rate": 4.8853780569391934e-05, + "loss": 2.0572, + "step": 26352 + }, + { + "epoch": 3.074670400186676, + "grad_norm": 1.2000913619995117, + "learning_rate": 4.884285474868917e-05, + "loss": 1.6975, + "step": 26353 + }, + { + "epoch": 3.0747870726869677, + "grad_norm": 1.4499032497406006, + "learning_rate": 4.883192999222209e-05, + "loss": 1.9463, + "step": 26354 + }, + { + "epoch": 3.0749037451872594, + "grad_norm": 1.3000175952911377, + "learning_rate": 4.8821006300103985e-05, + "loss": 1.8507, + "step": 26355 + }, + { + "epoch": 3.075020417687551, + "grad_norm": 1.3038326501846313, + "learning_rate": 4.881008367244807e-05, + "loss": 1.8915, + "step": 26356 + }, + { + "epoch": 3.0751370901878428, + "grad_norm": 1.2951298952102661, + "learning_rate": 4.879916210936762e-05, + "loss": 1.8587, + "step": 26357 + }, + { + "epoch": 3.0752537626881344, + "grad_norm": 1.2840136289596558, + "learning_rate": 4.878824161097581e-05, + "loss": 1.9118, + "step": 26358 + }, + { + "epoch": 3.075370435188426, + "grad_norm": 1.2660528421401978, + "learning_rate": 4.8777322177385916e-05, + "loss": 1.7275, + "step": 26359 + }, + { + "epoch": 3.075487107688718, + "grad_norm": 1.4873849153518677, + "learning_rate": 4.876640380871104e-05, + "loss": 1.9118, + "step": 26360 + }, + { + "epoch": 3.0756037801890095, + "grad_norm": 1.1464468240737915, + "learning_rate": 4.8755486505064495e-05, + "loss": 1.712, + "step": 26361 + }, + { + "epoch": 3.075720452689301, + "grad_norm": 1.3173967599868774, + "learning_rate": 4.8744570266559383e-05, + "loss": 1.8589, + "step": 26362 + }, + { + "epoch": 3.075837125189593, + "grad_norm": 1.1895864009857178, + "learning_rate": 4.873365509330885e-05, + "loss": 1.8125, + "step": 26363 + }, + { + "epoch": 3.0759537976898845, + "grad_norm": 1.3291141986846924, + "learning_rate": 4.872274098542613e-05, + "loss": 1.8595, + "step": 26364 + }, + { + "epoch": 3.076070470190176, + "grad_norm": 1.2258684635162354, + "learning_rate": 4.8711827943024275e-05, + "loss": 1.8916, + "step": 26365 + }, + { + "epoch": 3.076187142690468, + "grad_norm": 1.1779769659042358, + "learning_rate": 4.87009159662165e-05, + "loss": 1.783, + "step": 26366 + }, + { + "epoch": 3.0763038151907596, + "grad_norm": 1.1763169765472412, + "learning_rate": 4.869000505511586e-05, + "loss": 1.9373, + "step": 26367 + }, + { + "epoch": 3.0764204876910513, + "grad_norm": 1.2232372760772705, + "learning_rate": 4.8679095209835566e-05, + "loss": 1.9141, + "step": 26368 + }, + { + "epoch": 3.076537160191343, + "grad_norm": 1.1514629125595093, + "learning_rate": 4.8668186430488596e-05, + "loss": 1.7661, + "step": 26369 + }, + { + "epoch": 3.0766538326916346, + "grad_norm": 1.3079639673233032, + "learning_rate": 4.865727871718815e-05, + "loss": 1.9663, + "step": 26370 + }, + { + "epoch": 3.0767705051919263, + "grad_norm": 1.3031498193740845, + "learning_rate": 4.864637207004722e-05, + "loss": 1.9964, + "step": 26371 + }, + { + "epoch": 3.076887177692218, + "grad_norm": 1.3226557970046997, + "learning_rate": 4.86354664891789e-05, + "loss": 1.9528, + "step": 26372 + }, + { + "epoch": 3.0770038501925097, + "grad_norm": 1.3121545314788818, + "learning_rate": 4.862456197469632e-05, + "loss": 2.0009, + "step": 26373 + }, + { + "epoch": 3.0771205226928013, + "grad_norm": 1.3020875453948975, + "learning_rate": 4.861365852671241e-05, + "loss": 1.864, + "step": 26374 + }, + { + "epoch": 3.077237195193093, + "grad_norm": 1.1663494110107422, + "learning_rate": 4.8602756145340304e-05, + "loss": 1.9301, + "step": 26375 + }, + { + "epoch": 3.0773538676933847, + "grad_norm": 1.2610177993774414, + "learning_rate": 4.859185483069299e-05, + "loss": 1.9126, + "step": 26376 + }, + { + "epoch": 3.0774705401936764, + "grad_norm": 1.1384499073028564, + "learning_rate": 4.858095458288342e-05, + "loss": 1.829, + "step": 26377 + }, + { + "epoch": 3.077587212693968, + "grad_norm": 1.1461600065231323, + "learning_rate": 4.85700554020247e-05, + "loss": 1.8159, + "step": 26378 + }, + { + "epoch": 3.0777038851942597, + "grad_norm": 1.2830466032028198, + "learning_rate": 4.855915728822978e-05, + "loss": 1.7907, + "step": 26379 + }, + { + "epoch": 3.0778205576945514, + "grad_norm": 1.1415965557098389, + "learning_rate": 4.854826024161157e-05, + "loss": 1.6779, + "step": 26380 + }, + { + "epoch": 3.077937230194843, + "grad_norm": 1.036262035369873, + "learning_rate": 4.853736426228313e-05, + "loss": 1.639, + "step": 26381 + }, + { + "epoch": 3.078053902695135, + "grad_norm": 1.529105305671692, + "learning_rate": 4.8526469350357356e-05, + "loss": 1.9196, + "step": 26382 + }, + { + "epoch": 3.0781705751954265, + "grad_norm": 1.1922413110733032, + "learning_rate": 4.851557550594727e-05, + "loss": 1.8013, + "step": 26383 + }, + { + "epoch": 3.078287247695718, + "grad_norm": 1.1898564100265503, + "learning_rate": 4.850468272916571e-05, + "loss": 1.9414, + "step": 26384 + }, + { + "epoch": 3.07840392019601, + "grad_norm": 1.348413109779358, + "learning_rate": 4.8493791020125674e-05, + "loss": 1.83, + "step": 26385 + }, + { + "epoch": 3.0785205926963015, + "grad_norm": 1.1481857299804688, + "learning_rate": 4.848290037894006e-05, + "loss": 1.8256, + "step": 26386 + }, + { + "epoch": 3.078637265196593, + "grad_norm": 1.317211627960205, + "learning_rate": 4.847201080572174e-05, + "loss": 1.7693, + "step": 26387 + }, + { + "epoch": 3.078753937696885, + "grad_norm": 1.0444942712783813, + "learning_rate": 4.846112230058368e-05, + "loss": 1.6844, + "step": 26388 + }, + { + "epoch": 3.0788706101971766, + "grad_norm": 1.2239584922790527, + "learning_rate": 4.8450234863638634e-05, + "loss": 1.9203, + "step": 26389 + }, + { + "epoch": 3.0789872826974682, + "grad_norm": 1.1817227602005005, + "learning_rate": 4.8439348494999624e-05, + "loss": 1.862, + "step": 26390 + }, + { + "epoch": 3.07910395519776, + "grad_norm": 1.2286843061447144, + "learning_rate": 4.842846319477935e-05, + "loss": 1.8477, + "step": 26391 + }, + { + "epoch": 3.0792206276980516, + "grad_norm": 1.3383187055587769, + "learning_rate": 4.8417578963090795e-05, + "loss": 1.6858, + "step": 26392 + }, + { + "epoch": 3.0793373001983433, + "grad_norm": 1.3037593364715576, + "learning_rate": 4.8406695800046706e-05, + "loss": 1.8912, + "step": 26393 + }, + { + "epoch": 3.079453972698635, + "grad_norm": 1.309247612953186, + "learning_rate": 4.8395813705759965e-05, + "loss": 2.1745, + "step": 26394 + }, + { + "epoch": 3.0795706451989266, + "grad_norm": 1.354748249053955, + "learning_rate": 4.838493268034336e-05, + "loss": 1.8895, + "step": 26395 + }, + { + "epoch": 3.0796873176992183, + "grad_norm": 1.17351233959198, + "learning_rate": 4.8374052723909705e-05, + "loss": 1.9239, + "step": 26396 + }, + { + "epoch": 3.07980399019951, + "grad_norm": 1.1732170581817627, + "learning_rate": 4.836317383657172e-05, + "loss": 1.6309, + "step": 26397 + }, + { + "epoch": 3.0799206626998017, + "grad_norm": 1.4541423320770264, + "learning_rate": 4.8352296018442255e-05, + "loss": 1.9553, + "step": 26398 + }, + { + "epoch": 3.0800373352000934, + "grad_norm": 1.401413917541504, + "learning_rate": 4.8341419269634104e-05, + "loss": 1.8587, + "step": 26399 + }, + { + "epoch": 3.080154007700385, + "grad_norm": 1.3851289749145508, + "learning_rate": 4.833054359025995e-05, + "loss": 1.8843, + "step": 26400 + }, + { + "epoch": 3.0802706802006767, + "grad_norm": 1.1886485815048218, + "learning_rate": 4.8319668980432624e-05, + "loss": 2.0505, + "step": 26401 + }, + { + "epoch": 3.0803873527009684, + "grad_norm": 1.1962616443634033, + "learning_rate": 4.8308795440264775e-05, + "loss": 1.9508, + "step": 26402 + }, + { + "epoch": 3.08050402520126, + "grad_norm": 1.0775578022003174, + "learning_rate": 4.82979229698692e-05, + "loss": 1.6657, + "step": 26403 + }, + { + "epoch": 3.080620697701552, + "grad_norm": 1.3467655181884766, + "learning_rate": 4.8287051569358545e-05, + "loss": 1.8911, + "step": 26404 + }, + { + "epoch": 3.0807373702018435, + "grad_norm": 1.287684440612793, + "learning_rate": 4.82761812388456e-05, + "loss": 1.9628, + "step": 26405 + }, + { + "epoch": 3.080854042702135, + "grad_norm": 1.3464816808700562, + "learning_rate": 4.826531197844296e-05, + "loss": 1.9354, + "step": 26406 + }, + { + "epoch": 3.080970715202427, + "grad_norm": 1.0829628705978394, + "learning_rate": 4.825444378826339e-05, + "loss": 1.7999, + "step": 26407 + }, + { + "epoch": 3.0810873877027185, + "grad_norm": 1.1255748271942139, + "learning_rate": 4.8243576668419483e-05, + "loss": 1.81, + "step": 26408 + }, + { + "epoch": 3.08120406020301, + "grad_norm": 1.2728091478347778, + "learning_rate": 4.8232710619023974e-05, + "loss": 1.9514, + "step": 26409 + }, + { + "epoch": 3.081320732703302, + "grad_norm": 0.9614567756652832, + "learning_rate": 4.822184564018943e-05, + "loss": 1.7936, + "step": 26410 + }, + { + "epoch": 3.0814374052035935, + "grad_norm": 1.3683801889419556, + "learning_rate": 4.821098173202858e-05, + "loss": 1.7573, + "step": 26411 + }, + { + "epoch": 3.0815540777038852, + "grad_norm": 1.1311391592025757, + "learning_rate": 4.8200118894654e-05, + "loss": 1.6256, + "step": 26412 + }, + { + "epoch": 3.081670750204177, + "grad_norm": 1.2649420499801636, + "learning_rate": 4.818925712817825e-05, + "loss": 1.9634, + "step": 26413 + }, + { + "epoch": 3.0817874227044686, + "grad_norm": 1.3531486988067627, + "learning_rate": 4.8178396432714034e-05, + "loss": 1.7503, + "step": 26414 + }, + { + "epoch": 3.0819040952047603, + "grad_norm": 1.278813123703003, + "learning_rate": 4.816753680837385e-05, + "loss": 1.8413, + "step": 26415 + }, + { + "epoch": 3.082020767705052, + "grad_norm": 1.3487050533294678, + "learning_rate": 4.815667825527037e-05, + "loss": 1.7937, + "step": 26416 + }, + { + "epoch": 3.0821374402053436, + "grad_norm": 1.424076795578003, + "learning_rate": 4.814582077351606e-05, + "loss": 1.9931, + "step": 26417 + }, + { + "epoch": 3.0822541127056353, + "grad_norm": 1.3128862380981445, + "learning_rate": 4.8134964363223585e-05, + "loss": 1.9521, + "step": 26418 + }, + { + "epoch": 3.082370785205927, + "grad_norm": 1.0940759181976318, + "learning_rate": 4.812410902450541e-05, + "loss": 1.7309, + "step": 26419 + }, + { + "epoch": 3.0824874577062187, + "grad_norm": 1.3608204126358032, + "learning_rate": 4.8113254757474134e-05, + "loss": 1.8752, + "step": 26420 + }, + { + "epoch": 3.0826041302065104, + "grad_norm": 1.2284623384475708, + "learning_rate": 4.81024015622422e-05, + "loss": 1.7623, + "step": 26421 + }, + { + "epoch": 3.082720802706802, + "grad_norm": 1.2062456607818604, + "learning_rate": 4.809154943892223e-05, + "loss": 1.6648, + "step": 26422 + }, + { + "epoch": 3.0828374752070937, + "grad_norm": 1.1892424821853638, + "learning_rate": 4.808069838762663e-05, + "loss": 1.855, + "step": 26423 + }, + { + "epoch": 3.0829541477073854, + "grad_norm": 1.2857383489608765, + "learning_rate": 4.806984840846792e-05, + "loss": 1.8904, + "step": 26424 + }, + { + "epoch": 3.083070820207677, + "grad_norm": 1.2197563648223877, + "learning_rate": 4.805899950155864e-05, + "loss": 1.752, + "step": 26425 + }, + { + "epoch": 3.0831874927079688, + "grad_norm": 1.1572264432907104, + "learning_rate": 4.8048151667011176e-05, + "loss": 1.873, + "step": 26426 + }, + { + "epoch": 3.0833041652082605, + "grad_norm": 1.3128025531768799, + "learning_rate": 4.8037304904938064e-05, + "loss": 1.8333, + "step": 26427 + }, + { + "epoch": 3.083420837708552, + "grad_norm": 1.17830491065979, + "learning_rate": 4.802645921545169e-05, + "loss": 1.7363, + "step": 26428 + }, + { + "epoch": 3.083537510208844, + "grad_norm": 1.19819176197052, + "learning_rate": 4.801561459866449e-05, + "loss": 1.8267, + "step": 26429 + }, + { + "epoch": 3.0836541827091355, + "grad_norm": 1.3534302711486816, + "learning_rate": 4.800477105468894e-05, + "loss": 1.8958, + "step": 26430 + }, + { + "epoch": 3.083770855209427, + "grad_norm": 1.1898996829986572, + "learning_rate": 4.7993928583637425e-05, + "loss": 1.8899, + "step": 26431 + }, + { + "epoch": 3.083887527709719, + "grad_norm": 1.3465207815170288, + "learning_rate": 4.798308718562229e-05, + "loss": 1.7313, + "step": 26432 + }, + { + "epoch": 3.0840042002100105, + "grad_norm": 1.1877377033233643, + "learning_rate": 4.797224686075604e-05, + "loss": 1.6762, + "step": 26433 + }, + { + "epoch": 3.084120872710302, + "grad_norm": 1.0801125764846802, + "learning_rate": 4.7961407609150965e-05, + "loss": 1.9178, + "step": 26434 + }, + { + "epoch": 3.084237545210594, + "grad_norm": 1.177191972732544, + "learning_rate": 4.79505694309195e-05, + "loss": 1.8031, + "step": 26435 + }, + { + "epoch": 3.0843542177108856, + "grad_norm": 1.0804331302642822, + "learning_rate": 4.793973232617391e-05, + "loss": 1.7372, + "step": 26436 + }, + { + "epoch": 3.0844708902111773, + "grad_norm": 1.4047950506210327, + "learning_rate": 4.792889629502661e-05, + "loss": 1.9348, + "step": 26437 + }, + { + "epoch": 3.084587562711469, + "grad_norm": 1.243972659111023, + "learning_rate": 4.791806133758998e-05, + "loss": 1.7939, + "step": 26438 + }, + { + "epoch": 3.0847042352117606, + "grad_norm": 1.4229658842086792, + "learning_rate": 4.790722745397624e-05, + "loss": 1.7687, + "step": 26439 + }, + { + "epoch": 3.0848209077120523, + "grad_norm": 1.29078209400177, + "learning_rate": 4.789639464429781e-05, + "loss": 1.8241, + "step": 26440 + }, + { + "epoch": 3.084937580212344, + "grad_norm": 1.2266006469726562, + "learning_rate": 4.788556290866687e-05, + "loss": 1.8375, + "step": 26441 + }, + { + "epoch": 3.0850542527126357, + "grad_norm": 1.137696623802185, + "learning_rate": 4.787473224719586e-05, + "loss": 1.7502, + "step": 26442 + }, + { + "epoch": 3.0851709252129274, + "grad_norm": 1.1655418872833252, + "learning_rate": 4.786390265999691e-05, + "loss": 1.8297, + "step": 26443 + }, + { + "epoch": 3.085287597713219, + "grad_norm": 1.294267177581787, + "learning_rate": 4.785307414718243e-05, + "loss": 1.68, + "step": 26444 + }, + { + "epoch": 3.0854042702135107, + "grad_norm": 1.157738447189331, + "learning_rate": 4.784224670886454e-05, + "loss": 1.6848, + "step": 26445 + }, + { + "epoch": 3.0855209427138024, + "grad_norm": 1.3351880311965942, + "learning_rate": 4.7831420345155605e-05, + "loss": 2.0035, + "step": 26446 + }, + { + "epoch": 3.085637615214094, + "grad_norm": 1.191954493522644, + "learning_rate": 4.782059505616781e-05, + "loss": 1.8691, + "step": 26447 + }, + { + "epoch": 3.0857542877143858, + "grad_norm": 1.4048430919647217, + "learning_rate": 4.780977084201337e-05, + "loss": 1.9316, + "step": 26448 + }, + { + "epoch": 3.0858709602146774, + "grad_norm": 1.1184383630752563, + "learning_rate": 4.7798947702804496e-05, + "loss": 1.7425, + "step": 26449 + }, + { + "epoch": 3.085987632714969, + "grad_norm": 1.433029055595398, + "learning_rate": 4.778812563865337e-05, + "loss": 1.9579, + "step": 26450 + }, + { + "epoch": 3.086104305215261, + "grad_norm": 1.101996660232544, + "learning_rate": 4.777730464967228e-05, + "loss": 1.6574, + "step": 26451 + }, + { + "epoch": 3.0862209777155525, + "grad_norm": 1.1659657955169678, + "learning_rate": 4.776648473597329e-05, + "loss": 1.7271, + "step": 26452 + }, + { + "epoch": 3.086337650215844, + "grad_norm": 1.257369041442871, + "learning_rate": 4.775566589766868e-05, + "loss": 1.7467, + "step": 26453 + }, + { + "epoch": 3.086454322716136, + "grad_norm": 1.2251458168029785, + "learning_rate": 4.774484813487048e-05, + "loss": 1.8784, + "step": 26454 + }, + { + "epoch": 3.0865709952164275, + "grad_norm": 1.0841634273529053, + "learning_rate": 4.773403144769097e-05, + "loss": 1.8126, + "step": 26455 + }, + { + "epoch": 3.086687667716719, + "grad_norm": 1.5624617338180542, + "learning_rate": 4.7723215836242166e-05, + "loss": 1.8323, + "step": 26456 + }, + { + "epoch": 3.086804340217011, + "grad_norm": 1.0423272848129272, + "learning_rate": 4.77124013006363e-05, + "loss": 1.7134, + "step": 26457 + }, + { + "epoch": 3.0869210127173026, + "grad_norm": 1.1725419759750366, + "learning_rate": 4.770158784098538e-05, + "loss": 1.6028, + "step": 26458 + }, + { + "epoch": 3.0870376852175943, + "grad_norm": 1.275762915611267, + "learning_rate": 4.7690775457401606e-05, + "loss": 1.6747, + "step": 26459 + }, + { + "epoch": 3.087154357717886, + "grad_norm": 1.1257237195968628, + "learning_rate": 4.767996414999699e-05, + "loss": 1.7476, + "step": 26460 + }, + { + "epoch": 3.0872710302181776, + "grad_norm": 1.5236397981643677, + "learning_rate": 4.766915391888368e-05, + "loss": 1.9732, + "step": 26461 + }, + { + "epoch": 3.0873877027184693, + "grad_norm": 1.0756824016571045, + "learning_rate": 4.7658344764173655e-05, + "loss": 1.7949, + "step": 26462 + }, + { + "epoch": 3.087504375218761, + "grad_norm": 1.2029246091842651, + "learning_rate": 4.7647536685979084e-05, + "loss": 1.8952, + "step": 26463 + }, + { + "epoch": 3.0876210477190527, + "grad_norm": 1.3862472772598267, + "learning_rate": 4.763672968441197e-05, + "loss": 1.8546, + "step": 26464 + }, + { + "epoch": 3.0877377202193443, + "grad_norm": 1.2353757619857788, + "learning_rate": 4.762592375958427e-05, + "loss": 1.8201, + "step": 26465 + }, + { + "epoch": 3.087854392719636, + "grad_norm": 1.1809524297714233, + "learning_rate": 4.761511891160811e-05, + "loss": 1.7106, + "step": 26466 + }, + { + "epoch": 3.0879710652199277, + "grad_norm": 1.1129072904586792, + "learning_rate": 4.760431514059542e-05, + "loss": 1.922, + "step": 26467 + }, + { + "epoch": 3.0880877377202194, + "grad_norm": 1.1611597537994385, + "learning_rate": 4.759351244665831e-05, + "loss": 1.906, + "step": 26468 + }, + { + "epoch": 3.088204410220511, + "grad_norm": 1.2196033000946045, + "learning_rate": 4.7582710829908646e-05, + "loss": 1.958, + "step": 26469 + }, + { + "epoch": 3.0883210827208027, + "grad_norm": 1.158146858215332, + "learning_rate": 4.757191029045852e-05, + "loss": 1.793, + "step": 26470 + }, + { + "epoch": 3.0884377552210944, + "grad_norm": 1.1208621263504028, + "learning_rate": 4.756111082841981e-05, + "loss": 1.6898, + "step": 26471 + }, + { + "epoch": 3.088554427721386, + "grad_norm": 1.126263976097107, + "learning_rate": 4.755031244390455e-05, + "loss": 1.7999, + "step": 26472 + }, + { + "epoch": 3.088671100221678, + "grad_norm": 1.1391562223434448, + "learning_rate": 4.753951513702458e-05, + "loss": 1.767, + "step": 26473 + }, + { + "epoch": 3.0887877727219695, + "grad_norm": 1.3784914016723633, + "learning_rate": 4.752871890789198e-05, + "loss": 1.8151, + "step": 26474 + }, + { + "epoch": 3.088904445222261, + "grad_norm": 1.300102949142456, + "learning_rate": 4.7517923756618534e-05, + "loss": 1.8844, + "step": 26475 + }, + { + "epoch": 3.089021117722553, + "grad_norm": 1.4084241390228271, + "learning_rate": 4.750712968331621e-05, + "loss": 1.9366, + "step": 26476 + }, + { + "epoch": 3.0891377902228445, + "grad_norm": 1.50592041015625, + "learning_rate": 4.749633668809696e-05, + "loss": 1.863, + "step": 26477 + }, + { + "epoch": 3.089254462723136, + "grad_norm": 1.2491979598999023, + "learning_rate": 4.7485544771072596e-05, + "loss": 1.9182, + "step": 26478 + }, + { + "epoch": 3.089371135223428, + "grad_norm": 1.4339866638183594, + "learning_rate": 4.747475393235505e-05, + "loss": 1.9902, + "step": 26479 + }, + { + "epoch": 3.0894878077237196, + "grad_norm": 1.2893155813217163, + "learning_rate": 4.746396417205619e-05, + "loss": 1.9594, + "step": 26480 + }, + { + "epoch": 3.0896044802240112, + "grad_norm": 1.1580199003219604, + "learning_rate": 4.7453175490287844e-05, + "loss": 1.7769, + "step": 26481 + }, + { + "epoch": 3.089721152724303, + "grad_norm": 1.2882237434387207, + "learning_rate": 4.7442387887161814e-05, + "loss": 1.7735, + "step": 26482 + }, + { + "epoch": 3.0898378252245946, + "grad_norm": 1.1180115938186646, + "learning_rate": 4.743160136279002e-05, + "loss": 1.7721, + "step": 26483 + }, + { + "epoch": 3.0899544977248863, + "grad_norm": 1.2870032787322998, + "learning_rate": 4.742081591728421e-05, + "loss": 1.9511, + "step": 26484 + }, + { + "epoch": 3.090071170225178, + "grad_norm": 1.0971894264221191, + "learning_rate": 4.741003155075628e-05, + "loss": 1.6909, + "step": 26485 + }, + { + "epoch": 3.0901878427254696, + "grad_norm": 1.2324416637420654, + "learning_rate": 4.739924826331793e-05, + "loss": 1.8682, + "step": 26486 + }, + { + "epoch": 3.0903045152257613, + "grad_norm": 1.3895597457885742, + "learning_rate": 4.7388466055081064e-05, + "loss": 1.9959, + "step": 26487 + }, + { + "epoch": 3.090421187726053, + "grad_norm": 1.35720956325531, + "learning_rate": 4.7377684926157326e-05, + "loss": 1.8863, + "step": 26488 + }, + { + "epoch": 3.0905378602263447, + "grad_norm": 1.286993145942688, + "learning_rate": 4.7366904876658595e-05, + "loss": 1.8767, + "step": 26489 + }, + { + "epoch": 3.0906545327266364, + "grad_norm": 1.3052269220352173, + "learning_rate": 4.735612590669659e-05, + "loss": 1.7689, + "step": 26490 + }, + { + "epoch": 3.090771205226928, + "grad_norm": 1.1408727169036865, + "learning_rate": 4.734534801638304e-05, + "loss": 1.6867, + "step": 26491 + }, + { + "epoch": 3.0908878777272197, + "grad_norm": 1.2399630546569824, + "learning_rate": 4.733457120582972e-05, + "loss": 1.8015, + "step": 26492 + }, + { + "epoch": 3.0910045502275114, + "grad_norm": 1.2098407745361328, + "learning_rate": 4.7323795475148284e-05, + "loss": 1.8918, + "step": 26493 + }, + { + "epoch": 3.091121222727803, + "grad_norm": 1.206540822982788, + "learning_rate": 4.731302082445052e-05, + "loss": 1.8269, + "step": 26494 + }, + { + "epoch": 3.091237895228095, + "grad_norm": 1.1057873964309692, + "learning_rate": 4.730224725384806e-05, + "loss": 1.6929, + "step": 26495 + }, + { + "epoch": 3.0913545677283865, + "grad_norm": 1.2770425081253052, + "learning_rate": 4.7291474763452644e-05, + "loss": 1.7809, + "step": 26496 + }, + { + "epoch": 3.091471240228678, + "grad_norm": 1.1106332540512085, + "learning_rate": 4.728070335337595e-05, + "loss": 1.7581, + "step": 26497 + }, + { + "epoch": 3.09158791272897, + "grad_norm": 1.214296579360962, + "learning_rate": 4.726993302372955e-05, + "loss": 1.6741, + "step": 26498 + }, + { + "epoch": 3.0917045852292615, + "grad_norm": 1.3358957767486572, + "learning_rate": 4.725916377462523e-05, + "loss": 1.78, + "step": 26499 + }, + { + "epoch": 3.091821257729553, + "grad_norm": 1.190848469734192, + "learning_rate": 4.724839560617459e-05, + "loss": 1.7572, + "step": 26500 + }, + { + "epoch": 3.091937930229845, + "grad_norm": 1.5865862369537354, + "learning_rate": 4.723762851848918e-05, + "loss": 1.8895, + "step": 26501 + }, + { + "epoch": 3.0920546027301365, + "grad_norm": 1.1448601484298706, + "learning_rate": 4.722686251168068e-05, + "loss": 1.8557, + "step": 26502 + }, + { + "epoch": 3.0921712752304282, + "grad_norm": 1.4429556131362915, + "learning_rate": 4.721609758586077e-05, + "loss": 1.8516, + "step": 26503 + }, + { + "epoch": 3.09228794773072, + "grad_norm": 1.1521077156066895, + "learning_rate": 4.720533374114093e-05, + "loss": 1.7472, + "step": 26504 + }, + { + "epoch": 3.0924046202310116, + "grad_norm": 1.291182041168213, + "learning_rate": 4.719457097763286e-05, + "loss": 1.8351, + "step": 26505 + }, + { + "epoch": 3.0925212927313033, + "grad_norm": 1.1086862087249756, + "learning_rate": 4.7183809295448025e-05, + "loss": 1.8706, + "step": 26506 + }, + { + "epoch": 3.092637965231595, + "grad_norm": 1.2605646848678589, + "learning_rate": 4.717304869469809e-05, + "loss": 1.8863, + "step": 26507 + }, + { + "epoch": 3.0927546377318866, + "grad_norm": 1.2360901832580566, + "learning_rate": 4.716228917549454e-05, + "loss": 1.8264, + "step": 26508 + }, + { + "epoch": 3.0928713102321783, + "grad_norm": 1.101944923400879, + "learning_rate": 4.7151530737948954e-05, + "loss": 1.598, + "step": 26509 + }, + { + "epoch": 3.09298798273247, + "grad_norm": 1.2538936138153076, + "learning_rate": 4.714077338217283e-05, + "loss": 1.8653, + "step": 26510 + }, + { + "epoch": 3.0931046552327617, + "grad_norm": 1.1385835409164429, + "learning_rate": 4.7130017108277764e-05, + "loss": 1.7184, + "step": 26511 + }, + { + "epoch": 3.0932213277330534, + "grad_norm": 1.4013599157333374, + "learning_rate": 4.7119261916375145e-05, + "loss": 1.8089, + "step": 26512 + }, + { + "epoch": 3.093338000233345, + "grad_norm": 1.256486415863037, + "learning_rate": 4.7108507806576584e-05, + "loss": 1.8623, + "step": 26513 + }, + { + "epoch": 3.0934546727336367, + "grad_norm": 1.353289008140564, + "learning_rate": 4.709775477899349e-05, + "loss": 1.7526, + "step": 26514 + }, + { + "epoch": 3.0935713452339284, + "grad_norm": 1.203951120376587, + "learning_rate": 4.70870028337374e-05, + "loss": 1.9535, + "step": 26515 + }, + { + "epoch": 3.09368801773422, + "grad_norm": 1.3108110427856445, + "learning_rate": 4.707625197091976e-05, + "loss": 1.8833, + "step": 26516 + }, + { + "epoch": 3.0938046902345118, + "grad_norm": 1.2126139402389526, + "learning_rate": 4.706550219065196e-05, + "loss": 1.8258, + "step": 26517 + }, + { + "epoch": 3.0939213627348034, + "grad_norm": 1.2763350009918213, + "learning_rate": 4.705475349304553e-05, + "loss": 1.848, + "step": 26518 + }, + { + "epoch": 3.094038035235095, + "grad_norm": 1.281922459602356, + "learning_rate": 4.704400587821183e-05, + "loss": 1.7884, + "step": 26519 + }, + { + "epoch": 3.094154707735387, + "grad_norm": 1.3550580739974976, + "learning_rate": 4.7033259346262356e-05, + "loss": 1.7661, + "step": 26520 + }, + { + "epoch": 3.0942713802356785, + "grad_norm": 1.2014800310134888, + "learning_rate": 4.702251389730842e-05, + "loss": 1.7498, + "step": 26521 + }, + { + "epoch": 3.09438805273597, + "grad_norm": 1.1473270654678345, + "learning_rate": 4.701176953146153e-05, + "loss": 1.7859, + "step": 26522 + }, + { + "epoch": 3.094504725236262, + "grad_norm": 1.2704172134399414, + "learning_rate": 4.7001026248832954e-05, + "loss": 1.9882, + "step": 26523 + }, + { + "epoch": 3.0946213977365535, + "grad_norm": 1.3353431224822998, + "learning_rate": 4.6990284049534195e-05, + "loss": 1.8633, + "step": 26524 + }, + { + "epoch": 3.094738070236845, + "grad_norm": 1.1203287839889526, + "learning_rate": 4.697954293367648e-05, + "loss": 1.7206, + "step": 26525 + }, + { + "epoch": 3.094854742737137, + "grad_norm": 1.2416932582855225, + "learning_rate": 4.696880290137127e-05, + "loss": 1.8456, + "step": 26526 + }, + { + "epoch": 3.0949714152374286, + "grad_norm": 1.104122519493103, + "learning_rate": 4.6958063952729824e-05, + "loss": 1.6171, + "step": 26527 + }, + { + "epoch": 3.0950880877377203, + "grad_norm": 1.276610255241394, + "learning_rate": 4.694732608786353e-05, + "loss": 1.983, + "step": 26528 + }, + { + "epoch": 3.095204760238012, + "grad_norm": 1.2727159261703491, + "learning_rate": 4.693658930688372e-05, + "loss": 1.6987, + "step": 26529 + }, + { + "epoch": 3.0953214327383036, + "grad_norm": 1.2763136625289917, + "learning_rate": 4.692585360990162e-05, + "loss": 1.8952, + "step": 26530 + }, + { + "epoch": 3.0954381052385953, + "grad_norm": 1.1762182712554932, + "learning_rate": 4.6915118997028615e-05, + "loss": 1.7879, + "step": 26531 + }, + { + "epoch": 3.095554777738887, + "grad_norm": 1.1360808610916138, + "learning_rate": 4.6904385468375965e-05, + "loss": 1.7713, + "step": 26532 + }, + { + "epoch": 3.0956714502391787, + "grad_norm": 1.1786686182022095, + "learning_rate": 4.689365302405491e-05, + "loss": 1.9662, + "step": 26533 + }, + { + "epoch": 3.0957881227394703, + "grad_norm": 1.2124260663986206, + "learning_rate": 4.6882921664176696e-05, + "loss": 1.7334, + "step": 26534 + }, + { + "epoch": 3.095904795239762, + "grad_norm": 1.2703797817230225, + "learning_rate": 4.687219138885264e-05, + "loss": 1.8253, + "step": 26535 + }, + { + "epoch": 3.0960214677400537, + "grad_norm": 1.2487714290618896, + "learning_rate": 4.6861462198193913e-05, + "loss": 1.8054, + "step": 26536 + }, + { + "epoch": 3.0961381402403454, + "grad_norm": 1.3801143169403076, + "learning_rate": 4.685073409231183e-05, + "loss": 1.948, + "step": 26537 + }, + { + "epoch": 3.096254812740637, + "grad_norm": 1.3696783781051636, + "learning_rate": 4.6840007071317495e-05, + "loss": 2.1146, + "step": 26538 + }, + { + "epoch": 3.0963714852409288, + "grad_norm": 1.2785334587097168, + "learning_rate": 4.682928113532224e-05, + "loss": 1.8981, + "step": 26539 + }, + { + "epoch": 3.0964881577412204, + "grad_norm": 1.2609624862670898, + "learning_rate": 4.681855628443716e-05, + "loss": 1.9976, + "step": 26540 + }, + { + "epoch": 3.096604830241512, + "grad_norm": 1.4214142560958862, + "learning_rate": 4.680783251877345e-05, + "loss": 1.9189, + "step": 26541 + }, + { + "epoch": 3.096721502741804, + "grad_norm": 1.2672765254974365, + "learning_rate": 4.679710983844237e-05, + "loss": 1.7722, + "step": 26542 + }, + { + "epoch": 3.0968381752420955, + "grad_norm": 1.3499462604522705, + "learning_rate": 4.6786388243554956e-05, + "loss": 1.7758, + "step": 26543 + }, + { + "epoch": 3.096954847742387, + "grad_norm": 1.229763388633728, + "learning_rate": 4.677566773422249e-05, + "loss": 1.7406, + "step": 26544 + }, + { + "epoch": 3.097071520242679, + "grad_norm": 1.2276791334152222, + "learning_rate": 4.6764948310555965e-05, + "loss": 1.8885, + "step": 26545 + }, + { + "epoch": 3.0971881927429705, + "grad_norm": 1.1271443367004395, + "learning_rate": 4.675422997266665e-05, + "loss": 1.9019, + "step": 26546 + }, + { + "epoch": 3.097304865243262, + "grad_norm": 1.2693768739700317, + "learning_rate": 4.6743512720665535e-05, + "loss": 2.0598, + "step": 26547 + }, + { + "epoch": 3.097421537743554, + "grad_norm": 1.272855281829834, + "learning_rate": 4.673279655466383e-05, + "loss": 1.9008, + "step": 26548 + }, + { + "epoch": 3.0975382102438456, + "grad_norm": 1.2917693853378296, + "learning_rate": 4.672208147477259e-05, + "loss": 2.0572, + "step": 26549 + }, + { + "epoch": 3.0976548827441373, + "grad_norm": 1.2950843572616577, + "learning_rate": 4.6711367481102826e-05, + "loss": 1.9621, + "step": 26550 + }, + { + "epoch": 3.097771555244429, + "grad_norm": 1.3095545768737793, + "learning_rate": 4.6700654573765745e-05, + "loss": 1.895, + "step": 26551 + }, + { + "epoch": 3.0978882277447206, + "grad_norm": 1.2161155939102173, + "learning_rate": 4.668994275287232e-05, + "loss": 1.7213, + "step": 26552 + }, + { + "epoch": 3.0980049002450123, + "grad_norm": 1.2351255416870117, + "learning_rate": 4.667923201853356e-05, + "loss": 1.7587, + "step": 26553 + }, + { + "epoch": 3.098121572745304, + "grad_norm": 1.1735038757324219, + "learning_rate": 4.6668522370860564e-05, + "loss": 1.8939, + "step": 26554 + }, + { + "epoch": 3.0982382452455957, + "grad_norm": 1.3512519598007202, + "learning_rate": 4.6657813809964395e-05, + "loss": 2.0245, + "step": 26555 + }, + { + "epoch": 3.0983549177458873, + "grad_norm": 1.2570379972457886, + "learning_rate": 4.664710633595595e-05, + "loss": 1.7731, + "step": 26556 + }, + { + "epoch": 3.098471590246179, + "grad_norm": 1.248772382736206, + "learning_rate": 4.663639994894638e-05, + "loss": 1.601, + "step": 26557 + }, + { + "epoch": 3.0985882627464707, + "grad_norm": 1.274443507194519, + "learning_rate": 4.662569464904654e-05, + "loss": 1.7686, + "step": 26558 + }, + { + "epoch": 3.0987049352467624, + "grad_norm": 1.0306460857391357, + "learning_rate": 4.6614990436367515e-05, + "loss": 1.7341, + "step": 26559 + }, + { + "epoch": 3.098821607747054, + "grad_norm": 1.3642840385437012, + "learning_rate": 4.6604287311020186e-05, + "loss": 1.8463, + "step": 26560 + }, + { + "epoch": 3.0989382802473457, + "grad_norm": 1.167082667350769, + "learning_rate": 4.659358527311561e-05, + "loss": 1.8433, + "step": 26561 + }, + { + "epoch": 3.0990549527476374, + "grad_norm": 1.1686275005340576, + "learning_rate": 4.6582884322764636e-05, + "loss": 1.8406, + "step": 26562 + }, + { + "epoch": 3.099171625247929, + "grad_norm": 1.1977438926696777, + "learning_rate": 4.657218446007827e-05, + "loss": 1.8065, + "step": 26563 + }, + { + "epoch": 3.099288297748221, + "grad_norm": 1.218024492263794, + "learning_rate": 4.656148568516736e-05, + "loss": 1.8328, + "step": 26564 + }, + { + "epoch": 3.0994049702485125, + "grad_norm": 1.3784133195877075, + "learning_rate": 4.6550787998142926e-05, + "loss": 1.782, + "step": 26565 + }, + { + "epoch": 3.099521642748804, + "grad_norm": 1.2542688846588135, + "learning_rate": 4.654009139911582e-05, + "loss": 1.8564, + "step": 26566 + }, + { + "epoch": 3.099638315249096, + "grad_norm": 1.2365859746932983, + "learning_rate": 4.6529395888196866e-05, + "loss": 1.8772, + "step": 26567 + }, + { + "epoch": 3.0997549877493875, + "grad_norm": 1.2034060955047607, + "learning_rate": 4.6518701465497055e-05, + "loss": 1.8645, + "step": 26568 + }, + { + "epoch": 3.099871660249679, + "grad_norm": 1.406821846961975, + "learning_rate": 4.650800813112716e-05, + "loss": 1.7946, + "step": 26569 + }, + { + "epoch": 3.099988332749971, + "grad_norm": 1.3849554061889648, + "learning_rate": 4.6497315885198125e-05, + "loss": 1.9548, + "step": 26570 + }, + { + "epoch": 3.1001050052502626, + "grad_norm": 1.2809598445892334, + "learning_rate": 4.6486624727820685e-05, + "loss": 1.8689, + "step": 26571 + }, + { + "epoch": 3.1002216777505542, + "grad_norm": 1.1263175010681152, + "learning_rate": 4.647593465910581e-05, + "loss": 1.8474, + "step": 26572 + }, + { + "epoch": 3.100338350250846, + "grad_norm": 1.4557232856750488, + "learning_rate": 4.646524567916419e-05, + "loss": 1.9659, + "step": 26573 + }, + { + "epoch": 3.1004550227511376, + "grad_norm": 1.2344520092010498, + "learning_rate": 4.6454557788106745e-05, + "loss": 1.917, + "step": 26574 + }, + { + "epoch": 3.1005716952514293, + "grad_norm": 1.2096914052963257, + "learning_rate": 4.644387098604418e-05, + "loss": 1.8006, + "step": 26575 + }, + { + "epoch": 3.100688367751721, + "grad_norm": 1.361991047859192, + "learning_rate": 4.6433185273087387e-05, + "loss": 1.8456, + "step": 26576 + }, + { + "epoch": 3.1008050402520126, + "grad_norm": 1.2090867757797241, + "learning_rate": 4.6422500649347036e-05, + "loss": 1.885, + "step": 26577 + }, + { + "epoch": 3.1009217127523043, + "grad_norm": 1.1451106071472168, + "learning_rate": 4.6411817114934e-05, + "loss": 1.7767, + "step": 26578 + }, + { + "epoch": 3.101038385252596, + "grad_norm": 1.3987321853637695, + "learning_rate": 4.640113466995893e-05, + "loss": 2.0521, + "step": 26579 + }, + { + "epoch": 3.1011550577528877, + "grad_norm": 1.249330759048462, + "learning_rate": 4.6390453314532655e-05, + "loss": 1.9473, + "step": 26580 + }, + { + "epoch": 3.1012717302531794, + "grad_norm": 1.4650996923446655, + "learning_rate": 4.637977304876585e-05, + "loss": 1.8767, + "step": 26581 + }, + { + "epoch": 3.101388402753471, + "grad_norm": 1.1759542226791382, + "learning_rate": 4.636909387276929e-05, + "loss": 1.7572, + "step": 26582 + }, + { + "epoch": 3.1015050752537627, + "grad_norm": 1.4029241800308228, + "learning_rate": 4.6358415786653614e-05, + "loss": 1.9153, + "step": 26583 + }, + { + "epoch": 3.1016217477540544, + "grad_norm": 1.1936548948287964, + "learning_rate": 4.634773879052959e-05, + "loss": 1.846, + "step": 26584 + }, + { + "epoch": 3.101738420254346, + "grad_norm": 1.2683594226837158, + "learning_rate": 4.6337062884507884e-05, + "loss": 1.821, + "step": 26585 + }, + { + "epoch": 3.1018550927546378, + "grad_norm": 1.2616589069366455, + "learning_rate": 4.632638806869912e-05, + "loss": 1.7608, + "step": 26586 + }, + { + "epoch": 3.1019717652549295, + "grad_norm": 1.2843914031982422, + "learning_rate": 4.631571434321406e-05, + "loss": 1.7682, + "step": 26587 + }, + { + "epoch": 3.102088437755221, + "grad_norm": 1.2632639408111572, + "learning_rate": 4.630504170816325e-05, + "loss": 1.7891, + "step": 26588 + }, + { + "epoch": 3.102205110255513, + "grad_norm": 1.2970943450927734, + "learning_rate": 4.629437016365744e-05, + "loss": 1.7515, + "step": 26589 + }, + { + "epoch": 3.1023217827558045, + "grad_norm": 1.1790335178375244, + "learning_rate": 4.628369970980716e-05, + "loss": 1.9199, + "step": 26590 + }, + { + "epoch": 3.102438455256096, + "grad_norm": 1.5443600416183472, + "learning_rate": 4.62730303467231e-05, + "loss": 2.0006, + "step": 26591 + }, + { + "epoch": 3.102555127756388, + "grad_norm": 1.4547597169876099, + "learning_rate": 4.626236207451582e-05, + "loss": 1.9173, + "step": 26592 + }, + { + "epoch": 3.1026718002566795, + "grad_norm": 1.2485820055007935, + "learning_rate": 4.6251694893295976e-05, + "loss": 1.806, + "step": 26593 + }, + { + "epoch": 3.1027884727569712, + "grad_norm": 1.3233002424240112, + "learning_rate": 4.6241028803174086e-05, + "loss": 1.8385, + "step": 26594 + }, + { + "epoch": 3.102905145257263, + "grad_norm": 1.2988011837005615, + "learning_rate": 4.623036380426074e-05, + "loss": 1.9261, + "step": 26595 + }, + { + "epoch": 3.1030218177575546, + "grad_norm": 1.3348731994628906, + "learning_rate": 4.621969989666657e-05, + "loss": 1.8212, + "step": 26596 + }, + { + "epoch": 3.1031384902578463, + "grad_norm": 1.0966936349868774, + "learning_rate": 4.620903708050203e-05, + "loss": 1.8777, + "step": 26597 + }, + { + "epoch": 3.103255162758138, + "grad_norm": 1.098814845085144, + "learning_rate": 4.619837535587775e-05, + "loss": 1.6576, + "step": 26598 + }, + { + "epoch": 3.1033718352584296, + "grad_norm": 1.1918386220932007, + "learning_rate": 4.618771472290417e-05, + "loss": 1.7512, + "step": 26599 + }, + { + "epoch": 3.1034885077587213, + "grad_norm": 1.1989054679870605, + "learning_rate": 4.61770551816919e-05, + "loss": 1.8504, + "step": 26600 + }, + { + "epoch": 3.103605180259013, + "grad_norm": 1.3156766891479492, + "learning_rate": 4.6166396732351396e-05, + "loss": 1.9416, + "step": 26601 + }, + { + "epoch": 3.1037218527593047, + "grad_norm": 1.2431038618087769, + "learning_rate": 4.615573937499315e-05, + "loss": 1.8235, + "step": 26602 + }, + { + "epoch": 3.1038385252595964, + "grad_norm": 1.2910946607589722, + "learning_rate": 4.614508310972761e-05, + "loss": 1.858, + "step": 26603 + }, + { + "epoch": 3.103955197759888, + "grad_norm": 1.0844281911849976, + "learning_rate": 4.6134427936665334e-05, + "loss": 1.7733, + "step": 26604 + }, + { + "epoch": 3.1040718702601797, + "grad_norm": 1.2745107412338257, + "learning_rate": 4.61237738559167e-05, + "loss": 1.8951, + "step": 26605 + }, + { + "epoch": 3.1041885427604714, + "grad_norm": 1.2292174100875854, + "learning_rate": 4.611312086759224e-05, + "loss": 1.6573, + "step": 26606 + }, + { + "epoch": 3.104305215260763, + "grad_norm": 1.1648775339126587, + "learning_rate": 4.6102468971802297e-05, + "loss": 1.8269, + "step": 26607 + }, + { + "epoch": 3.1044218877610548, + "grad_norm": 1.5405240058898926, + "learning_rate": 4.609181816865735e-05, + "loss": 1.8153, + "step": 26608 + }, + { + "epoch": 3.1045385602613464, + "grad_norm": 1.1031136512756348, + "learning_rate": 4.608116845826786e-05, + "loss": 1.6958, + "step": 26609 + }, + { + "epoch": 3.104655232761638, + "grad_norm": 1.2686684131622314, + "learning_rate": 4.607051984074415e-05, + "loss": 1.7968, + "step": 26610 + }, + { + "epoch": 3.10477190526193, + "grad_norm": 1.2347302436828613, + "learning_rate": 4.605987231619669e-05, + "loss": 1.6958, + "step": 26611 + }, + { + "epoch": 3.1048885777622215, + "grad_norm": 1.0903292894363403, + "learning_rate": 4.6049225884735774e-05, + "loss": 1.6215, + "step": 26612 + }, + { + "epoch": 3.105005250262513, + "grad_norm": 1.375043511390686, + "learning_rate": 4.603858054647186e-05, + "loss": 1.7693, + "step": 26613 + }, + { + "epoch": 3.105121922762805, + "grad_norm": 1.2757914066314697, + "learning_rate": 4.602793630151521e-05, + "loss": 1.8735, + "step": 26614 + }, + { + "epoch": 3.1052385952630965, + "grad_norm": 1.1886478662490845, + "learning_rate": 4.601729314997629e-05, + "loss": 1.8412, + "step": 26615 + }, + { + "epoch": 3.105355267763388, + "grad_norm": 1.2698676586151123, + "learning_rate": 4.600665109196532e-05, + "loss": 1.7101, + "step": 26616 + }, + { + "epoch": 3.10547194026368, + "grad_norm": 1.3448978662490845, + "learning_rate": 4.5996010127592746e-05, + "loss": 1.8566, + "step": 26617 + }, + { + "epoch": 3.1055886127639716, + "grad_norm": 1.2988673448562622, + "learning_rate": 4.5985370256968796e-05, + "loss": 1.962, + "step": 26618 + }, + { + "epoch": 3.1057052852642633, + "grad_norm": 1.257623553276062, + "learning_rate": 4.5974731480203744e-05, + "loss": 1.792, + "step": 26619 + }, + { + "epoch": 3.105821957764555, + "grad_norm": 1.1010053157806396, + "learning_rate": 4.596409379740799e-05, + "loss": 1.7757, + "step": 26620 + }, + { + "epoch": 3.1059386302648466, + "grad_norm": 1.2380560636520386, + "learning_rate": 4.595345720869168e-05, + "loss": 1.708, + "step": 26621 + }, + { + "epoch": 3.1060553027651383, + "grad_norm": 1.423209547996521, + "learning_rate": 4.594282171416523e-05, + "loss": 1.9468, + "step": 26622 + }, + { + "epoch": 3.10617197526543, + "grad_norm": 1.3038791418075562, + "learning_rate": 4.5932187313938754e-05, + "loss": 1.7564, + "step": 26623 + }, + { + "epoch": 3.1062886477657217, + "grad_norm": 1.3135813474655151, + "learning_rate": 4.592155400812264e-05, + "loss": 1.8711, + "step": 26624 + }, + { + "epoch": 3.1064053202660133, + "grad_norm": 1.3372647762298584, + "learning_rate": 4.5910921796826965e-05, + "loss": 1.8035, + "step": 26625 + }, + { + "epoch": 3.106521992766305, + "grad_norm": 1.2202202081680298, + "learning_rate": 4.590029068016211e-05, + "loss": 1.8691, + "step": 26626 + }, + { + "epoch": 3.1066386652665967, + "grad_norm": 1.2168883085250854, + "learning_rate": 4.5889660658238146e-05, + "loss": 1.7076, + "step": 26627 + }, + { + "epoch": 3.1067553377668884, + "grad_norm": 1.2808314561843872, + "learning_rate": 4.58790317311654e-05, + "loss": 1.8538, + "step": 26628 + }, + { + "epoch": 3.10687201026718, + "grad_norm": 1.3031929731369019, + "learning_rate": 4.586840389905395e-05, + "loss": 1.8194, + "step": 26629 + }, + { + "epoch": 3.1069886827674718, + "grad_norm": 1.2725868225097656, + "learning_rate": 4.585777716201406e-05, + "loss": 1.865, + "step": 26630 + }, + { + "epoch": 3.1071053552677634, + "grad_norm": 1.1705193519592285, + "learning_rate": 4.584715152015582e-05, + "loss": 1.6371, + "step": 26631 + }, + { + "epoch": 3.107222027768055, + "grad_norm": 1.3271880149841309, + "learning_rate": 4.583652697358947e-05, + "loss": 1.8653, + "step": 26632 + }, + { + "epoch": 3.107338700268347, + "grad_norm": 1.3496159315109253, + "learning_rate": 4.582590352242507e-05, + "loss": 1.7603, + "step": 26633 + }, + { + "epoch": 3.1074553727686385, + "grad_norm": 1.2536723613739014, + "learning_rate": 4.581528116677283e-05, + "loss": 1.8318, + "step": 26634 + }, + { + "epoch": 3.10757204526893, + "grad_norm": 1.2405037879943848, + "learning_rate": 4.580465990674277e-05, + "loss": 1.741, + "step": 26635 + }, + { + "epoch": 3.107688717769222, + "grad_norm": 1.2682729959487915, + "learning_rate": 4.579403974244513e-05, + "loss": 2.0374, + "step": 26636 + }, + { + "epoch": 3.1078053902695135, + "grad_norm": 1.3599119186401367, + "learning_rate": 4.578342067398993e-05, + "loss": 1.6557, + "step": 26637 + }, + { + "epoch": 3.107922062769805, + "grad_norm": 1.4345526695251465, + "learning_rate": 4.577280270148722e-05, + "loss": 1.8593, + "step": 26638 + }, + { + "epoch": 3.108038735270097, + "grad_norm": 1.144094705581665, + "learning_rate": 4.5762185825047156e-05, + "loss": 1.6925, + "step": 26639 + }, + { + "epoch": 3.1081554077703886, + "grad_norm": 1.2482432126998901, + "learning_rate": 4.5751570044779734e-05, + "loss": 1.9695, + "step": 26640 + }, + { + "epoch": 3.1082720802706802, + "grad_norm": 1.2856029272079468, + "learning_rate": 4.574095536079508e-05, + "loss": 1.8249, + "step": 26641 + }, + { + "epoch": 3.108388752770972, + "grad_norm": 1.4031413793563843, + "learning_rate": 4.573034177320316e-05, + "loss": 1.7708, + "step": 26642 + }, + { + "epoch": 3.108388752770972, + "eval_train_loss": 1.8153020143508911, + "eval_train_mean_batch_perplexity": 6.982585733529324, + "eval_train_runtime": 11117.2306, + "eval_train_samples_per_second": 12.335, + "eval_train_steps_per_second": 0.771, + "step": 26642 + }, + { + "epoch": 3.108388752770972, + "eval_test_loss": 2.0593981742858887, + "eval_test_mean_batch_perplexity": 9.14354256202856, + "eval_test_runtime": 2399.2171, + "eval_test_samples_per_second": 12.249, + "eval_test_steps_per_second": 0.766, + "step": 26642 } ], "logging_steps": 1, @@ -53364,7 +186682,7 @@ "attributes": {} } }, - "total_flos": 1.9899230550270935e+18, + "total_flos": 6.96140725683644e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null