diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.15009081402388685, + "epoch": 0.20012108536518244, "eval_steps": 500, - "global_step": 2727, + "global_step": 3636, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -19096,6 +19096,6369 @@ "learning_rate": 9.870891892373397e-06, "loss": 0.8069, "step": 2727 + }, + { + "epoch": 0.1501458528262425, + "grad_norm": 0.8530564308166504, + "learning_rate": 9.870794006093188e-06, + "loss": 0.9229, + "step": 2728 + }, + { + "epoch": 0.15020089162859815, + "grad_norm": 0.7640067934989929, + "learning_rate": 9.870696083205394e-06, + "loss": 0.761, + "step": 2729 + }, + { + "epoch": 0.1502559304309538, + "grad_norm": 0.8953121900558472, + "learning_rate": 9.87059812371075e-06, + "loss": 0.8537, + "step": 2730 + }, + { + "epoch": 0.15031096923330947, + "grad_norm": 0.7779926657676697, + "learning_rate": 9.870500127609996e-06, + "loss": 0.8184, + "step": 2731 + }, + { + "epoch": 0.15036600803566513, + "grad_norm": 0.9181544184684753, + "learning_rate": 9.870402094903865e-06, + "loss": 0.8583, + "step": 2732 + }, + { + "epoch": 0.1504210468380208, + "grad_norm": 0.7629374861717224, + "learning_rate": 9.870304025593097e-06, + "loss": 0.6741, + "step": 2733 + }, + { + "epoch": 0.15047608564037646, + "grad_norm": 1.1455601453781128, + "learning_rate": 9.87020591967843e-06, + "loss": 0.8602, + "step": 2734 + }, + { + "epoch": 0.15053112444273212, + "grad_norm": 0.83924800157547, + "learning_rate": 9.870107777160596e-06, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.15058616324508778, + "grad_norm": 0.9293402433395386, + "learning_rate": 9.870009598040336e-06, + "loss": 0.9008, + "step": 2736 + }, + { + "epoch": 0.15064120204744344, + "grad_norm": 0.8198057413101196, + "learning_rate": 9.869911382318389e-06, + "loss": 0.8004, + "step": 2737 + }, + { + "epoch": 0.1506962408497991, + "grad_norm": 0.8139753341674805, + "learning_rate": 9.86981312999549e-06, + "loss": 0.7316, + "step": 2738 + }, + { + "epoch": 0.15075127965215476, + "grad_norm": 0.854184091091156, + "learning_rate": 9.86971484107238e-06, + "loss": 0.9424, + "step": 2739 + }, + { + "epoch": 0.15080631845451042, + "grad_norm": 0.8626797199249268, + "learning_rate": 9.869616515549797e-06, + "loss": 0.8882, + "step": 2740 + }, + { + "epoch": 0.15086135725686609, + "grad_norm": 0.8447514176368713, + "learning_rate": 9.869518153428479e-06, + "loss": 0.7762, + "step": 2741 + }, + { + "epoch": 0.15091639605922175, + "grad_norm": 1.1359349489212036, + "learning_rate": 9.869419754709166e-06, + "loss": 0.9233, + "step": 2742 + }, + { + "epoch": 0.1509714348615774, + "grad_norm": 0.8095758557319641, + "learning_rate": 9.869321319392597e-06, + "loss": 0.8833, + "step": 2743 + }, + { + "epoch": 0.15102647366393307, + "grad_norm": 0.8364169001579285, + "learning_rate": 9.869222847479514e-06, + "loss": 0.833, + "step": 2744 + }, + { + "epoch": 0.15108151246628873, + "grad_norm": 0.7664803266525269, + "learning_rate": 9.869124338970653e-06, + "loss": 0.8125, + "step": 2745 + }, + { + "epoch": 0.1511365512686444, + "grad_norm": 0.8129634857177734, + "learning_rate": 9.86902579386676e-06, + "loss": 0.8277, + "step": 2746 + }, + { + "epoch": 0.15119159007100005, + "grad_norm": 0.8195592164993286, + "learning_rate": 9.86892721216857e-06, + "loss": 0.8489, + "step": 2747 + }, + { + "epoch": 0.15124662887335572, + "grad_norm": 0.8116651177406311, + "learning_rate": 9.868828593876827e-06, + "loss": 0.7831, + "step": 2748 + }, + { + "epoch": 0.15130166767571138, + "grad_norm": 0.8200114369392395, + "learning_rate": 9.868729938992272e-06, + "loss": 0.8956, + "step": 2749 + }, + { + "epoch": 0.15135670647806704, + "grad_norm": 0.8521816730499268, + "learning_rate": 9.868631247515645e-06, + "loss": 0.804, + "step": 2750 + }, + { + "epoch": 0.1514117452804227, + "grad_norm": 1.0386497974395752, + "learning_rate": 9.868532519447691e-06, + "loss": 0.8563, + "step": 2751 + }, + { + "epoch": 0.15146678408277836, + "grad_norm": 0.8345486521720886, + "learning_rate": 9.868433754789149e-06, + "loss": 0.9838, + "step": 2752 + }, + { + "epoch": 0.15152182288513402, + "grad_norm": 0.7207526564598083, + "learning_rate": 9.868334953540762e-06, + "loss": 0.6711, + "step": 2753 + }, + { + "epoch": 0.15157686168748968, + "grad_norm": 0.8159164786338806, + "learning_rate": 9.86823611570327e-06, + "loss": 0.7591, + "step": 2754 + }, + { + "epoch": 0.15163190048984534, + "grad_norm": 0.9062225818634033, + "learning_rate": 9.868137241277422e-06, + "loss": 0.8009, + "step": 2755 + }, + { + "epoch": 0.151686939292201, + "grad_norm": 0.8136696219444275, + "learning_rate": 9.868038330263957e-06, + "loss": 0.7014, + "step": 2756 + }, + { + "epoch": 0.15174197809455667, + "grad_norm": 0.7237691283226013, + "learning_rate": 9.867939382663618e-06, + "loss": 0.7766, + "step": 2757 + }, + { + "epoch": 0.15179701689691233, + "grad_norm": 0.8913742303848267, + "learning_rate": 9.86784039847715e-06, + "loss": 0.9362, + "step": 2758 + }, + { + "epoch": 0.151852055699268, + "grad_norm": 0.7763763070106506, + "learning_rate": 9.867741377705296e-06, + "loss": 0.7843, + "step": 2759 + }, + { + "epoch": 0.15190709450162365, + "grad_norm": 0.8973854780197144, + "learning_rate": 9.867642320348803e-06, + "loss": 0.911, + "step": 2760 + }, + { + "epoch": 0.1519621333039793, + "grad_norm": 0.7979685664176941, + "learning_rate": 9.86754322640841e-06, + "loss": 0.81, + "step": 2761 + }, + { + "epoch": 0.15201717210633497, + "grad_norm": 0.7740911841392517, + "learning_rate": 9.867444095884867e-06, + "loss": 0.8197, + "step": 2762 + }, + { + "epoch": 0.15207221090869064, + "grad_norm": 0.8400475978851318, + "learning_rate": 9.867344928778916e-06, + "loss": 0.8809, + "step": 2763 + }, + { + "epoch": 0.1521272497110463, + "grad_norm": 0.8995040655136108, + "learning_rate": 9.867245725091305e-06, + "loss": 0.8382, + "step": 2764 + }, + { + "epoch": 0.15218228851340196, + "grad_norm": 0.8162381052970886, + "learning_rate": 9.867146484822779e-06, + "loss": 0.9238, + "step": 2765 + }, + { + "epoch": 0.15223732731575762, + "grad_norm": 0.7668827176094055, + "learning_rate": 9.867047207974079e-06, + "loss": 0.8345, + "step": 2766 + }, + { + "epoch": 0.15229236611811328, + "grad_norm": 0.8719204664230347, + "learning_rate": 9.866947894545957e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.15234740492046894, + "grad_norm": 0.9043570756912231, + "learning_rate": 9.866848544539159e-06, + "loss": 0.8783, + "step": 2768 + }, + { + "epoch": 0.1524024437228246, + "grad_norm": 0.8859694004058838, + "learning_rate": 9.866749157954428e-06, + "loss": 0.862, + "step": 2769 + }, + { + "epoch": 0.15245748252518027, + "grad_norm": 1.022719144821167, + "learning_rate": 9.866649734792514e-06, + "loss": 0.8943, + "step": 2770 + }, + { + "epoch": 0.1525125213275359, + "grad_norm": 0.8710635900497437, + "learning_rate": 9.866550275054163e-06, + "loss": 0.7002, + "step": 2771 + }, + { + "epoch": 0.15256756012989156, + "grad_norm": 0.8482942581176758, + "learning_rate": 9.866450778740122e-06, + "loss": 0.7529, + "step": 2772 + }, + { + "epoch": 0.15262259893224722, + "grad_norm": 0.9637784361839294, + "learning_rate": 9.866351245851142e-06, + "loss": 0.8147, + "step": 2773 + }, + { + "epoch": 0.15267763773460288, + "grad_norm": 1.0472246408462524, + "learning_rate": 9.866251676387967e-06, + "loss": 0.8019, + "step": 2774 + }, + { + "epoch": 0.15273267653695854, + "grad_norm": 0.7916847467422485, + "learning_rate": 9.866152070351347e-06, + "loss": 0.7698, + "step": 2775 + }, + { + "epoch": 0.1527877153393142, + "grad_norm": 0.8421853184700012, + "learning_rate": 9.86605242774203e-06, + "loss": 0.8085, + "step": 2776 + }, + { + "epoch": 0.15284275414166987, + "grad_norm": 0.7990233898162842, + "learning_rate": 9.865952748560768e-06, + "loss": 0.8878, + "step": 2777 + }, + { + "epoch": 0.15289779294402553, + "grad_norm": 0.8017451167106628, + "learning_rate": 9.865853032808305e-06, + "loss": 0.8707, + "step": 2778 + }, + { + "epoch": 0.1529528317463812, + "grad_norm": 0.739850640296936, + "learning_rate": 9.865753280485393e-06, + "loss": 0.7884, + "step": 2779 + }, + { + "epoch": 0.15300787054873685, + "grad_norm": 1.0682430267333984, + "learning_rate": 9.865653491592784e-06, + "loss": 0.8548, + "step": 2780 + }, + { + "epoch": 0.1530629093510925, + "grad_norm": 0.7766296863555908, + "learning_rate": 9.865553666131225e-06, + "loss": 0.7786, + "step": 2781 + }, + { + "epoch": 0.15311794815344817, + "grad_norm": 0.8903290629386902, + "learning_rate": 9.865453804101466e-06, + "loss": 0.8978, + "step": 2782 + }, + { + "epoch": 0.15317298695580384, + "grad_norm": 0.8624514937400818, + "learning_rate": 9.86535390550426e-06, + "loss": 0.8472, + "step": 2783 + }, + { + "epoch": 0.1532280257581595, + "grad_norm": 0.7765294909477234, + "learning_rate": 9.865253970340356e-06, + "loss": 0.7702, + "step": 2784 + }, + { + "epoch": 0.15328306456051516, + "grad_norm": 0.9349095225334167, + "learning_rate": 9.865153998610504e-06, + "loss": 0.9154, + "step": 2785 + }, + { + "epoch": 0.15333810336287082, + "grad_norm": 0.8435478210449219, + "learning_rate": 9.865053990315458e-06, + "loss": 0.8986, + "step": 2786 + }, + { + "epoch": 0.15339314216522648, + "grad_norm": 0.8003486394882202, + "learning_rate": 9.864953945455968e-06, + "loss": 0.767, + "step": 2787 + }, + { + "epoch": 0.15344818096758214, + "grad_norm": 0.8060823678970337, + "learning_rate": 9.86485386403279e-06, + "loss": 0.8332, + "step": 2788 + }, + { + "epoch": 0.1535032197699378, + "grad_norm": 0.7914995551109314, + "learning_rate": 9.864753746046668e-06, + "loss": 0.6706, + "step": 2789 + }, + { + "epoch": 0.15355825857229347, + "grad_norm": 0.7792215943336487, + "learning_rate": 9.86465359149836e-06, + "loss": 0.8721, + "step": 2790 + }, + { + "epoch": 0.15361329737464913, + "grad_norm": 0.8572536110877991, + "learning_rate": 9.864553400388619e-06, + "loss": 0.8378, + "step": 2791 + }, + { + "epoch": 0.1536683361770048, + "grad_norm": 0.7645615339279175, + "learning_rate": 9.864453172718195e-06, + "loss": 0.6909, + "step": 2792 + }, + { + "epoch": 0.15372337497936045, + "grad_norm": 0.7627308964729309, + "learning_rate": 9.864352908487846e-06, + "loss": 0.7918, + "step": 2793 + }, + { + "epoch": 0.1537784137817161, + "grad_norm": 1.0830100774765015, + "learning_rate": 9.86425260769832e-06, + "loss": 0.9007, + "step": 2794 + }, + { + "epoch": 0.15383345258407177, + "grad_norm": 0.7667998671531677, + "learning_rate": 9.864152270350374e-06, + "loss": 0.832, + "step": 2795 + }, + { + "epoch": 0.15388849138642743, + "grad_norm": 0.9967591762542725, + "learning_rate": 9.864051896444764e-06, + "loss": 0.8917, + "step": 2796 + }, + { + "epoch": 0.1539435301887831, + "grad_norm": 0.8948462605476379, + "learning_rate": 9.86395148598224e-06, + "loss": 0.983, + "step": 2797 + }, + { + "epoch": 0.15399856899113876, + "grad_norm": 0.7857423424720764, + "learning_rate": 9.863851038963556e-06, + "loss": 0.7826, + "step": 2798 + }, + { + "epoch": 0.15405360779349442, + "grad_norm": 0.8821337223052979, + "learning_rate": 9.863750555389473e-06, + "loss": 0.8918, + "step": 2799 + }, + { + "epoch": 0.15410864659585008, + "grad_norm": 0.7896875143051147, + "learning_rate": 9.863650035260742e-06, + "loss": 0.8199, + "step": 2800 + }, + { + "epoch": 0.15416368539820574, + "grad_norm": 0.8046941161155701, + "learning_rate": 9.86354947857812e-06, + "loss": 0.8572, + "step": 2801 + }, + { + "epoch": 0.1542187242005614, + "grad_norm": 0.7266830205917358, + "learning_rate": 9.863448885342361e-06, + "loss": 0.8315, + "step": 2802 + }, + { + "epoch": 0.15427376300291706, + "grad_norm": 0.9009475708007812, + "learning_rate": 9.863348255554222e-06, + "loss": 0.7928, + "step": 2803 + }, + { + "epoch": 0.15432880180527273, + "grad_norm": 0.963364839553833, + "learning_rate": 9.863247589214459e-06, + "loss": 0.918, + "step": 2804 + }, + { + "epoch": 0.1543838406076284, + "grad_norm": 0.8278035521507263, + "learning_rate": 9.863146886323829e-06, + "loss": 0.8497, + "step": 2805 + }, + { + "epoch": 0.15443887940998405, + "grad_norm": 0.7360561490058899, + "learning_rate": 9.86304614688309e-06, + "loss": 0.676, + "step": 2806 + }, + { + "epoch": 0.1544939182123397, + "grad_norm": 0.7679837346076965, + "learning_rate": 9.862945370892996e-06, + "loss": 0.8114, + "step": 2807 + }, + { + "epoch": 0.15454895701469537, + "grad_norm": 0.8550567030906677, + "learning_rate": 9.862844558354309e-06, + "loss": 0.8222, + "step": 2808 + }, + { + "epoch": 0.15460399581705103, + "grad_norm": 0.7852397561073303, + "learning_rate": 9.86274370926778e-06, + "loss": 0.7449, + "step": 2809 + }, + { + "epoch": 0.1546590346194067, + "grad_norm": 0.9120833277702332, + "learning_rate": 9.862642823634175e-06, + "loss": 0.8702, + "step": 2810 + }, + { + "epoch": 0.15471407342176235, + "grad_norm": 0.8729703426361084, + "learning_rate": 9.862541901454246e-06, + "loss": 0.8064, + "step": 2811 + }, + { + "epoch": 0.15476911222411802, + "grad_norm": 0.7935470342636108, + "learning_rate": 9.862440942728754e-06, + "loss": 0.8502, + "step": 2812 + }, + { + "epoch": 0.15482415102647368, + "grad_norm": 0.8640689849853516, + "learning_rate": 9.86233994745846e-06, + "loss": 0.8159, + "step": 2813 + }, + { + "epoch": 0.1548791898288293, + "grad_norm": 0.9959222078323364, + "learning_rate": 9.862238915644116e-06, + "loss": 0.7767, + "step": 2814 + }, + { + "epoch": 0.15493422863118497, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.862137847286487e-06, + "loss": 0.8293, + "step": 2815 + }, + { + "epoch": 0.15498926743354063, + "grad_norm": 0.8764606714248657, + "learning_rate": 9.862036742386335e-06, + "loss": 0.856, + "step": 2816 + }, + { + "epoch": 0.1550443062358963, + "grad_norm": 0.743727445602417, + "learning_rate": 9.861935600944413e-06, + "loss": 0.7099, + "step": 2817 + }, + { + "epoch": 0.15509934503825196, + "grad_norm": 0.7866224050521851, + "learning_rate": 9.861834422961485e-06, + "loss": 0.8805, + "step": 2818 + }, + { + "epoch": 0.15515438384060762, + "grad_norm": 0.8333723545074463, + "learning_rate": 9.861733208438311e-06, + "loss": 0.8486, + "step": 2819 + }, + { + "epoch": 0.15520942264296328, + "grad_norm": 0.8261659741401672, + "learning_rate": 9.861631957375652e-06, + "loss": 0.8896, + "step": 2820 + }, + { + "epoch": 0.15526446144531894, + "grad_norm": 0.8381538987159729, + "learning_rate": 9.861530669774268e-06, + "loss": 0.8686, + "step": 2821 + }, + { + "epoch": 0.1553195002476746, + "grad_norm": 0.9184440970420837, + "learning_rate": 9.861429345634923e-06, + "loss": 0.9702, + "step": 2822 + }, + { + "epoch": 0.15537453905003026, + "grad_norm": 0.8170294165611267, + "learning_rate": 9.861327984958374e-06, + "loss": 0.8298, + "step": 2823 + }, + { + "epoch": 0.15542957785238593, + "grad_norm": 0.8361968398094177, + "learning_rate": 9.861226587745385e-06, + "loss": 0.8232, + "step": 2824 + }, + { + "epoch": 0.1554846166547416, + "grad_norm": 0.7437820434570312, + "learning_rate": 9.861125153996718e-06, + "loss": 0.8271, + "step": 2825 + }, + { + "epoch": 0.15553965545709725, + "grad_norm": 0.715887188911438, + "learning_rate": 9.861023683713137e-06, + "loss": 0.7726, + "step": 2826 + }, + { + "epoch": 0.1555946942594529, + "grad_norm": 0.8358462452888489, + "learning_rate": 9.860922176895403e-06, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.15564973306180857, + "grad_norm": 0.8620158433914185, + "learning_rate": 9.860820633544278e-06, + "loss": 0.8804, + "step": 2828 + }, + { + "epoch": 0.15570477186416423, + "grad_norm": 0.9035346508026123, + "learning_rate": 9.860719053660527e-06, + "loss": 0.7973, + "step": 2829 + }, + { + "epoch": 0.1557598106665199, + "grad_norm": 0.8014782071113586, + "learning_rate": 9.860617437244914e-06, + "loss": 0.7914, + "step": 2830 + }, + { + "epoch": 0.15581484946887555, + "grad_norm": 0.7788864970207214, + "learning_rate": 9.8605157842982e-06, + "loss": 0.7377, + "step": 2831 + }, + { + "epoch": 0.15586988827123122, + "grad_norm": 0.7475222945213318, + "learning_rate": 9.860414094821152e-06, + "loss": 0.7173, + "step": 2832 + }, + { + "epoch": 0.15592492707358688, + "grad_norm": 0.8866652846336365, + "learning_rate": 9.86031236881453e-06, + "loss": 0.8231, + "step": 2833 + }, + { + "epoch": 0.15597996587594254, + "grad_norm": 0.8725677728652954, + "learning_rate": 9.860210606279102e-06, + "loss": 0.9025, + "step": 2834 + }, + { + "epoch": 0.1560350046782982, + "grad_norm": 0.7608423233032227, + "learning_rate": 9.860108807215634e-06, + "loss": 0.8385, + "step": 2835 + }, + { + "epoch": 0.15609004348065386, + "grad_norm": 0.8237566351890564, + "learning_rate": 9.860006971624887e-06, + "loss": 0.8635, + "step": 2836 + }, + { + "epoch": 0.15614508228300952, + "grad_norm": 0.8078347444534302, + "learning_rate": 9.859905099507629e-06, + "loss": 0.7916, + "step": 2837 + }, + { + "epoch": 0.15620012108536518, + "grad_norm": 0.8282070755958557, + "learning_rate": 9.859803190864626e-06, + "loss": 0.8141, + "step": 2838 + }, + { + "epoch": 0.15625515988772085, + "grad_norm": 0.7639191150665283, + "learning_rate": 9.859701245696642e-06, + "loss": 0.7457, + "step": 2839 + }, + { + "epoch": 0.1563101986900765, + "grad_norm": 0.8429144620895386, + "learning_rate": 9.859599264004446e-06, + "loss": 0.9176, + "step": 2840 + }, + { + "epoch": 0.15636523749243217, + "grad_norm": 0.7792791724205017, + "learning_rate": 9.859497245788801e-06, + "loss": 0.8738, + "step": 2841 + }, + { + "epoch": 0.15642027629478783, + "grad_norm": 0.9018417596817017, + "learning_rate": 9.859395191050476e-06, + "loss": 0.841, + "step": 2842 + }, + { + "epoch": 0.1564753150971435, + "grad_norm": 0.7113705277442932, + "learning_rate": 9.859293099790239e-06, + "loss": 0.6576, + "step": 2843 + }, + { + "epoch": 0.15653035389949915, + "grad_norm": 0.8376311659812927, + "learning_rate": 9.859190972008853e-06, + "loss": 0.8559, + "step": 2844 + }, + { + "epoch": 0.15658539270185481, + "grad_norm": 0.7689141035079956, + "learning_rate": 9.859088807707092e-06, + "loss": 0.7844, + "step": 2845 + }, + { + "epoch": 0.15664043150421048, + "grad_norm": 0.7559483647346497, + "learning_rate": 9.858986606885717e-06, + "loss": 0.8676, + "step": 2846 + }, + { + "epoch": 0.15669547030656614, + "grad_norm": 0.7743827700614929, + "learning_rate": 9.8588843695455e-06, + "loss": 0.7995, + "step": 2847 + }, + { + "epoch": 0.1567505091089218, + "grad_norm": 0.8631327152252197, + "learning_rate": 9.85878209568721e-06, + "loss": 0.801, + "step": 2848 + }, + { + "epoch": 0.15680554791127746, + "grad_norm": 0.7454009056091309, + "learning_rate": 9.858679785311613e-06, + "loss": 0.8172, + "step": 2849 + }, + { + "epoch": 0.15686058671363312, + "grad_norm": 0.7915313839912415, + "learning_rate": 9.858577438419479e-06, + "loss": 0.833, + "step": 2850 + }, + { + "epoch": 0.15691562551598878, + "grad_norm": 0.8472526669502258, + "learning_rate": 9.858475055011578e-06, + "loss": 0.8249, + "step": 2851 + }, + { + "epoch": 0.15697066431834444, + "grad_norm": 0.7967580556869507, + "learning_rate": 9.85837263508868e-06, + "loss": 0.7533, + "step": 2852 + }, + { + "epoch": 0.1570257031207001, + "grad_norm": 0.7476257085800171, + "learning_rate": 9.858270178651554e-06, + "loss": 0.7918, + "step": 2853 + }, + { + "epoch": 0.15708074192305577, + "grad_norm": 0.8736184239387512, + "learning_rate": 9.858167685700968e-06, + "loss": 0.8254, + "step": 2854 + }, + { + "epoch": 0.15713578072541143, + "grad_norm": 0.8734819889068604, + "learning_rate": 9.858065156237694e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.1571908195277671, + "grad_norm": 1.0344874858856201, + "learning_rate": 9.857962590262506e-06, + "loss": 0.9578, + "step": 2856 + }, + { + "epoch": 0.15724585833012272, + "grad_norm": 0.81183922290802, + "learning_rate": 9.857859987776168e-06, + "loss": 0.8845, + "step": 2857 + }, + { + "epoch": 0.15730089713247838, + "grad_norm": 0.8252540230751038, + "learning_rate": 9.857757348779456e-06, + "loss": 0.7862, + "step": 2858 + }, + { + "epoch": 0.15735593593483405, + "grad_norm": 0.7468119859695435, + "learning_rate": 9.85765467327314e-06, + "loss": 0.7587, + "step": 2859 + }, + { + "epoch": 0.1574109747371897, + "grad_norm": 0.8095998167991638, + "learning_rate": 9.857551961257993e-06, + "loss": 0.7467, + "step": 2860 + }, + { + "epoch": 0.15746601353954537, + "grad_norm": 0.8908564448356628, + "learning_rate": 9.857449212734785e-06, + "loss": 0.8199, + "step": 2861 + }, + { + "epoch": 0.15752105234190103, + "grad_norm": 0.7605593204498291, + "learning_rate": 9.857346427704288e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 0.1575760911442567, + "grad_norm": 0.9250784516334534, + "learning_rate": 9.857243606167276e-06, + "loss": 0.7366, + "step": 2863 + }, + { + "epoch": 0.15763112994661235, + "grad_norm": 0.882796585559845, + "learning_rate": 9.85714074812452e-06, + "loss": 0.8422, + "step": 2864 + }, + { + "epoch": 0.15768616874896801, + "grad_norm": 1.0014574527740479, + "learning_rate": 9.857037853576797e-06, + "loss": 0.8762, + "step": 2865 + }, + { + "epoch": 0.15774120755132368, + "grad_norm": 0.86713045835495, + "learning_rate": 9.856934922524877e-06, + "loss": 0.9282, + "step": 2866 + }, + { + "epoch": 0.15779624635367934, + "grad_norm": 1.1457390785217285, + "learning_rate": 9.856831954969532e-06, + "loss": 0.7947, + "step": 2867 + }, + { + "epoch": 0.157851285156035, + "grad_norm": 0.8902556896209717, + "learning_rate": 9.85672895091154e-06, + "loss": 0.928, + "step": 2868 + }, + { + "epoch": 0.15790632395839066, + "grad_norm": 0.7978467345237732, + "learning_rate": 9.856625910351674e-06, + "loss": 0.7382, + "step": 2869 + }, + { + "epoch": 0.15796136276074632, + "grad_norm": 0.741457462310791, + "learning_rate": 9.856522833290705e-06, + "loss": 0.7736, + "step": 2870 + }, + { + "epoch": 0.15801640156310198, + "grad_norm": 0.8330628871917725, + "learning_rate": 9.856419719729413e-06, + "loss": 0.8396, + "step": 2871 + }, + { + "epoch": 0.15807144036545764, + "grad_norm": 0.8771876692771912, + "learning_rate": 9.85631656966857e-06, + "loss": 0.6669, + "step": 2872 + }, + { + "epoch": 0.1581264791678133, + "grad_norm": 0.8073394298553467, + "learning_rate": 9.85621338310895e-06, + "loss": 0.8206, + "step": 2873 + }, + { + "epoch": 0.15818151797016897, + "grad_norm": 1.1058349609375, + "learning_rate": 9.85611016005133e-06, + "loss": 0.9526, + "step": 2874 + }, + { + "epoch": 0.15823655677252463, + "grad_norm": 0.7734992504119873, + "learning_rate": 9.856006900496488e-06, + "loss": 0.7477, + "step": 2875 + }, + { + "epoch": 0.1582915955748803, + "grad_norm": 0.9053219556808472, + "learning_rate": 9.855903604445196e-06, + "loss": 0.8009, + "step": 2876 + }, + { + "epoch": 0.15834663437723595, + "grad_norm": 0.8774041533470154, + "learning_rate": 9.855800271898233e-06, + "loss": 0.854, + "step": 2877 + }, + { + "epoch": 0.1584016731795916, + "grad_norm": 0.8346550464630127, + "learning_rate": 9.855696902856376e-06, + "loss": 0.7976, + "step": 2878 + }, + { + "epoch": 0.15845671198194727, + "grad_norm": 0.7781139016151428, + "learning_rate": 9.855593497320401e-06, + "loss": 0.7693, + "step": 2879 + }, + { + "epoch": 0.15851175078430293, + "grad_norm": 0.8707864880561829, + "learning_rate": 9.855490055291084e-06, + "loss": 0.882, + "step": 2880 + }, + { + "epoch": 0.1585667895866586, + "grad_norm": 0.7982275485992432, + "learning_rate": 9.855386576769203e-06, + "loss": 0.8457, + "step": 2881 + }, + { + "epoch": 0.15862182838901426, + "grad_norm": 0.7577090263366699, + "learning_rate": 9.855283061755536e-06, + "loss": 0.754, + "step": 2882 + }, + { + "epoch": 0.15867686719136992, + "grad_norm": 0.7422069311141968, + "learning_rate": 9.855179510250863e-06, + "loss": 0.673, + "step": 2883 + }, + { + "epoch": 0.15873190599372558, + "grad_norm": 0.7730041742324829, + "learning_rate": 9.85507592225596e-06, + "loss": 0.7888, + "step": 2884 + }, + { + "epoch": 0.15878694479608124, + "grad_norm": 0.7370560169219971, + "learning_rate": 9.854972297771605e-06, + "loss": 0.7762, + "step": 2885 + }, + { + "epoch": 0.1588419835984369, + "grad_norm": 0.725074291229248, + "learning_rate": 9.854868636798577e-06, + "loss": 0.7951, + "step": 2886 + }, + { + "epoch": 0.15889702240079256, + "grad_norm": 0.8088375926017761, + "learning_rate": 9.854764939337657e-06, + "loss": 0.8557, + "step": 2887 + }, + { + "epoch": 0.15895206120314823, + "grad_norm": 0.8268256187438965, + "learning_rate": 9.854661205389624e-06, + "loss": 0.7641, + "step": 2888 + }, + { + "epoch": 0.1590071000055039, + "grad_norm": 0.8079462051391602, + "learning_rate": 9.854557434955257e-06, + "loss": 0.7947, + "step": 2889 + }, + { + "epoch": 0.15906213880785955, + "grad_norm": 0.8257912993431091, + "learning_rate": 9.854453628035335e-06, + "loss": 0.771, + "step": 2890 + }, + { + "epoch": 0.1591171776102152, + "grad_norm": 0.8901774287223816, + "learning_rate": 9.85434978463064e-06, + "loss": 0.9415, + "step": 2891 + }, + { + "epoch": 0.15917221641257087, + "grad_norm": 0.8283013105392456, + "learning_rate": 9.854245904741948e-06, + "loss": 0.7267, + "step": 2892 + }, + { + "epoch": 0.15922725521492653, + "grad_norm": 0.8665382266044617, + "learning_rate": 9.854141988370045e-06, + "loss": 0.8681, + "step": 2893 + }, + { + "epoch": 0.1592822940172822, + "grad_norm": 0.786494255065918, + "learning_rate": 9.854038035515712e-06, + "loss": 0.7614, + "step": 2894 + }, + { + "epoch": 0.15933733281963786, + "grad_norm": 1.0548759698867798, + "learning_rate": 9.853934046179727e-06, + "loss": 0.861, + "step": 2895 + }, + { + "epoch": 0.15939237162199352, + "grad_norm": 0.8565425276756287, + "learning_rate": 9.853830020362873e-06, + "loss": 0.7858, + "step": 2896 + }, + { + "epoch": 0.15944741042434918, + "grad_norm": 0.7982691526412964, + "learning_rate": 9.853725958065933e-06, + "loss": 0.8797, + "step": 2897 + }, + { + "epoch": 0.15950244922670484, + "grad_norm": 0.8613169193267822, + "learning_rate": 9.853621859289686e-06, + "loss": 0.9217, + "step": 2898 + }, + { + "epoch": 0.1595574880290605, + "grad_norm": 0.950639009475708, + "learning_rate": 9.853517724034918e-06, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.15961252683141613, + "grad_norm": 0.7940176129341125, + "learning_rate": 9.853413552302409e-06, + "loss": 0.7713, + "step": 2900 + }, + { + "epoch": 0.1596675656337718, + "grad_norm": 0.7716153264045715, + "learning_rate": 9.853309344092944e-06, + "loss": 0.7922, + "step": 2901 + }, + { + "epoch": 0.15972260443612746, + "grad_norm": 0.7626190781593323, + "learning_rate": 9.853205099407303e-06, + "loss": 0.7278, + "step": 2902 + }, + { + "epoch": 0.15977764323848312, + "grad_norm": 0.8523116707801819, + "learning_rate": 9.853100818246274e-06, + "loss": 0.8136, + "step": 2903 + }, + { + "epoch": 0.15983268204083878, + "grad_norm": 0.7636643052101135, + "learning_rate": 9.852996500610637e-06, + "loss": 0.6984, + "step": 2904 + }, + { + "epoch": 0.15988772084319444, + "grad_norm": 0.799201250076294, + "learning_rate": 9.852892146501179e-06, + "loss": 0.8319, + "step": 2905 + }, + { + "epoch": 0.1599427596455501, + "grad_norm": 0.7743694186210632, + "learning_rate": 9.85278775591868e-06, + "loss": 0.81, + "step": 2906 + }, + { + "epoch": 0.15999779844790576, + "grad_norm": 0.8964856863021851, + "learning_rate": 9.85268332886393e-06, + "loss": 0.9227, + "step": 2907 + }, + { + "epoch": 0.16005283725026143, + "grad_norm": 0.8809369802474976, + "learning_rate": 9.852578865337708e-06, + "loss": 0.9285, + "step": 2908 + }, + { + "epoch": 0.1601078760526171, + "grad_norm": 0.8960002064704895, + "learning_rate": 9.852474365340806e-06, + "loss": 0.8611, + "step": 2909 + }, + { + "epoch": 0.16016291485497275, + "grad_norm": 0.7539754509925842, + "learning_rate": 9.852369828874002e-06, + "loss": 0.7455, + "step": 2910 + }, + { + "epoch": 0.1602179536573284, + "grad_norm": 0.8189692497253418, + "learning_rate": 9.852265255938088e-06, + "loss": 0.8321, + "step": 2911 + }, + { + "epoch": 0.16027299245968407, + "grad_norm": 0.8708549737930298, + "learning_rate": 9.852160646533844e-06, + "loss": 0.8373, + "step": 2912 + }, + { + "epoch": 0.16032803126203973, + "grad_norm": 0.7701451778411865, + "learning_rate": 9.852056000662063e-06, + "loss": 0.805, + "step": 2913 + }, + { + "epoch": 0.1603830700643954, + "grad_norm": 0.9111948609352112, + "learning_rate": 9.851951318323526e-06, + "loss": 0.8513, + "step": 2914 + }, + { + "epoch": 0.16043810886675106, + "grad_norm": 0.7863909602165222, + "learning_rate": 9.85184659951902e-06, + "loss": 0.7856, + "step": 2915 + }, + { + "epoch": 0.16049314766910672, + "grad_norm": 0.9000817537307739, + "learning_rate": 9.851741844249336e-06, + "loss": 0.9172, + "step": 2916 + }, + { + "epoch": 0.16054818647146238, + "grad_norm": 1.0953118801116943, + "learning_rate": 9.851637052515259e-06, + "loss": 0.8564, + "step": 2917 + }, + { + "epoch": 0.16060322527381804, + "grad_norm": 0.8405389785766602, + "learning_rate": 9.851532224317575e-06, + "loss": 0.8317, + "step": 2918 + }, + { + "epoch": 0.1606582640761737, + "grad_norm": 0.8524565100669861, + "learning_rate": 9.851427359657075e-06, + "loss": 0.8765, + "step": 2919 + }, + { + "epoch": 0.16071330287852936, + "grad_norm": 0.8234089016914368, + "learning_rate": 9.851322458534546e-06, + "loss": 0.7873, + "step": 2920 + }, + { + "epoch": 0.16076834168088502, + "grad_norm": 0.7879638671875, + "learning_rate": 9.851217520950775e-06, + "loss": 0.8394, + "step": 2921 + }, + { + "epoch": 0.16082338048324069, + "grad_norm": 0.8168820738792419, + "learning_rate": 9.851112546906552e-06, + "loss": 0.8223, + "step": 2922 + }, + { + "epoch": 0.16087841928559635, + "grad_norm": 0.9423845410346985, + "learning_rate": 9.851007536402666e-06, + "loss": 0.9256, + "step": 2923 + }, + { + "epoch": 0.160933458087952, + "grad_norm": 0.7875099778175354, + "learning_rate": 9.850902489439906e-06, + "loss": 0.8199, + "step": 2924 + }, + { + "epoch": 0.16098849689030767, + "grad_norm": 0.6934793591499329, + "learning_rate": 9.85079740601906e-06, + "loss": 0.671, + "step": 2925 + }, + { + "epoch": 0.16104353569266333, + "grad_norm": 0.8172206282615662, + "learning_rate": 9.85069228614092e-06, + "loss": 0.7633, + "step": 2926 + }, + { + "epoch": 0.161098574495019, + "grad_norm": 0.72749263048172, + "learning_rate": 9.850587129806274e-06, + "loss": 0.8719, + "step": 2927 + }, + { + "epoch": 0.16115361329737465, + "grad_norm": 0.8416743874549866, + "learning_rate": 9.850481937015917e-06, + "loss": 0.8438, + "step": 2928 + }, + { + "epoch": 0.16120865209973032, + "grad_norm": 0.7415444850921631, + "learning_rate": 9.850376707770633e-06, + "loss": 0.7673, + "step": 2929 + }, + { + "epoch": 0.16126369090208598, + "grad_norm": 0.9364289045333862, + "learning_rate": 9.850271442071217e-06, + "loss": 0.7224, + "step": 2930 + }, + { + "epoch": 0.16131872970444164, + "grad_norm": 0.7314212918281555, + "learning_rate": 9.85016613991846e-06, + "loss": 0.7759, + "step": 2931 + }, + { + "epoch": 0.1613737685067973, + "grad_norm": 0.8940219283103943, + "learning_rate": 9.850060801313151e-06, + "loss": 0.8432, + "step": 2932 + }, + { + "epoch": 0.16142880730915296, + "grad_norm": 0.7499691843986511, + "learning_rate": 9.849955426256084e-06, + "loss": 0.8171, + "step": 2933 + }, + { + "epoch": 0.16148384611150862, + "grad_norm": 0.8123053312301636, + "learning_rate": 9.84985001474805e-06, + "loss": 0.7839, + "step": 2934 + }, + { + "epoch": 0.16153888491386428, + "grad_norm": 0.819618821144104, + "learning_rate": 9.849744566789842e-06, + "loss": 0.9123, + "step": 2935 + }, + { + "epoch": 0.16159392371621994, + "grad_norm": 0.791088342666626, + "learning_rate": 9.849639082382251e-06, + "loss": 0.8347, + "step": 2936 + }, + { + "epoch": 0.1616489625185756, + "grad_norm": 0.8166706562042236, + "learning_rate": 9.849533561526072e-06, + "loss": 0.8309, + "step": 2937 + }, + { + "epoch": 0.16170400132093127, + "grad_norm": 0.7944774031639099, + "learning_rate": 9.849428004222098e-06, + "loss": 0.8387, + "step": 2938 + }, + { + "epoch": 0.16175904012328693, + "grad_norm": 0.7414719462394714, + "learning_rate": 9.849322410471119e-06, + "loss": 0.71, + "step": 2939 + }, + { + "epoch": 0.1618140789256426, + "grad_norm": 0.8983511924743652, + "learning_rate": 9.849216780273931e-06, + "loss": 0.8902, + "step": 2940 + }, + { + "epoch": 0.16186911772799825, + "grad_norm": 0.9058687686920166, + "learning_rate": 9.849111113631329e-06, + "loss": 0.8804, + "step": 2941 + }, + { + "epoch": 0.1619241565303539, + "grad_norm": 0.948871374130249, + "learning_rate": 9.849005410544105e-06, + "loss": 0.9871, + "step": 2942 + }, + { + "epoch": 0.16197919533270955, + "grad_norm": 0.8240115642547607, + "learning_rate": 9.848899671013055e-06, + "loss": 0.8708, + "step": 2943 + }, + { + "epoch": 0.1620342341350652, + "grad_norm": 0.879953145980835, + "learning_rate": 9.848793895038972e-06, + "loss": 0.9279, + "step": 2944 + }, + { + "epoch": 0.16208927293742087, + "grad_norm": 0.8464690446853638, + "learning_rate": 9.848688082622653e-06, + "loss": 0.8418, + "step": 2945 + }, + { + "epoch": 0.16214431173977653, + "grad_norm": 0.8990732431411743, + "learning_rate": 9.848582233764891e-06, + "loss": 0.8622, + "step": 2946 + }, + { + "epoch": 0.1621993505421322, + "grad_norm": 0.8054911494255066, + "learning_rate": 9.848476348466483e-06, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.16225438934448785, + "grad_norm": 0.7904845476150513, + "learning_rate": 9.848370426728226e-06, + "loss": 0.7777, + "step": 2948 + }, + { + "epoch": 0.16230942814684352, + "grad_norm": 1.0143954753875732, + "learning_rate": 9.848264468550915e-06, + "loss": 0.8556, + "step": 2949 + }, + { + "epoch": 0.16236446694919918, + "grad_norm": 0.7201125621795654, + "learning_rate": 9.848158473935344e-06, + "loss": 0.7981, + "step": 2950 + }, + { + "epoch": 0.16241950575155484, + "grad_norm": 0.8322157263755798, + "learning_rate": 9.848052442882312e-06, + "loss": 0.8323, + "step": 2951 + }, + { + "epoch": 0.1624745445539105, + "grad_norm": 0.7740346193313599, + "learning_rate": 9.847946375392617e-06, + "loss": 0.8355, + "step": 2952 + }, + { + "epoch": 0.16252958335626616, + "grad_norm": 0.8955645561218262, + "learning_rate": 9.847840271467053e-06, + "loss": 0.7161, + "step": 2953 + }, + { + "epoch": 0.16258462215862182, + "grad_norm": 0.800364077091217, + "learning_rate": 9.847734131106421e-06, + "loss": 0.8165, + "step": 2954 + }, + { + "epoch": 0.16263966096097748, + "grad_norm": 0.8305484056472778, + "learning_rate": 9.847627954311516e-06, + "loss": 0.7846, + "step": 2955 + }, + { + "epoch": 0.16269469976333314, + "grad_norm": 0.7354590892791748, + "learning_rate": 9.847521741083136e-06, + "loss": 0.7743, + "step": 2956 + }, + { + "epoch": 0.1627497385656888, + "grad_norm": 0.8173812627792358, + "learning_rate": 9.847415491422083e-06, + "loss": 0.8626, + "step": 2957 + }, + { + "epoch": 0.16280477736804447, + "grad_norm": 0.7959356307983398, + "learning_rate": 9.84730920532915e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 0.16285981617040013, + "grad_norm": 0.8256500363349915, + "learning_rate": 9.84720288280514e-06, + "loss": 0.7407, + "step": 2959 + }, + { + "epoch": 0.1629148549727558, + "grad_norm": 0.8522148728370667, + "learning_rate": 9.84709652385085e-06, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.16296989377511145, + "grad_norm": 0.7791039943695068, + "learning_rate": 9.84699012846708e-06, + "loss": 0.7239, + "step": 2961 + }, + { + "epoch": 0.1630249325774671, + "grad_norm": 0.84294193983078, + "learning_rate": 9.84688369665463e-06, + "loss": 0.7498, + "step": 2962 + }, + { + "epoch": 0.16307997137982277, + "grad_norm": 0.7948899865150452, + "learning_rate": 9.846777228414299e-06, + "loss": 0.7713, + "step": 2963 + }, + { + "epoch": 0.16313501018217844, + "grad_norm": 0.6673180460929871, + "learning_rate": 9.846670723746888e-06, + "loss": 0.6759, + "step": 2964 + }, + { + "epoch": 0.1631900489845341, + "grad_norm": 0.8141015768051147, + "learning_rate": 9.846564182653199e-06, + "loss": 0.7928, + "step": 2965 + }, + { + "epoch": 0.16324508778688976, + "grad_norm": 0.967830240726471, + "learning_rate": 9.846457605134028e-06, + "loss": 0.823, + "step": 2966 + }, + { + "epoch": 0.16330012658924542, + "grad_norm": 0.8099361658096313, + "learning_rate": 9.84635099119018e-06, + "loss": 0.8724, + "step": 2967 + }, + { + "epoch": 0.16335516539160108, + "grad_norm": 0.7913978099822998, + "learning_rate": 9.846244340822456e-06, + "loss": 0.7106, + "step": 2968 + }, + { + "epoch": 0.16341020419395674, + "grad_norm": 0.8308563828468323, + "learning_rate": 9.846137654031655e-06, + "loss": 0.7631, + "step": 2969 + }, + { + "epoch": 0.1634652429963124, + "grad_norm": 0.8634191751480103, + "learning_rate": 9.846030930818582e-06, + "loss": 0.7363, + "step": 2970 + }, + { + "epoch": 0.16352028179866807, + "grad_norm": 0.8936432600021362, + "learning_rate": 9.845924171184038e-06, + "loss": 0.8714, + "step": 2971 + }, + { + "epoch": 0.16357532060102373, + "grad_norm": 0.8776300549507141, + "learning_rate": 9.845817375128825e-06, + "loss": 0.914, + "step": 2972 + }, + { + "epoch": 0.1636303594033794, + "grad_norm": 0.8793039321899414, + "learning_rate": 9.845710542653745e-06, + "loss": 0.7999, + "step": 2973 + }, + { + "epoch": 0.16368539820573505, + "grad_norm": 0.8391743302345276, + "learning_rate": 9.845603673759603e-06, + "loss": 0.8124, + "step": 2974 + }, + { + "epoch": 0.1637404370080907, + "grad_norm": 0.8487186431884766, + "learning_rate": 9.845496768447199e-06, + "loss": 0.8551, + "step": 2975 + }, + { + "epoch": 0.16379547581044637, + "grad_norm": 0.7780638933181763, + "learning_rate": 9.845389826717339e-06, + "loss": 0.7281, + "step": 2976 + }, + { + "epoch": 0.16385051461280203, + "grad_norm": 0.7209637761116028, + "learning_rate": 9.845282848570825e-06, + "loss": 0.6737, + "step": 2977 + }, + { + "epoch": 0.1639055534151577, + "grad_norm": 0.8414756059646606, + "learning_rate": 9.845175834008464e-06, + "loss": 0.8003, + "step": 2978 + }, + { + "epoch": 0.16396059221751336, + "grad_norm": 1.2730679512023926, + "learning_rate": 9.845068783031057e-06, + "loss": 0.8243, + "step": 2979 + }, + { + "epoch": 0.16401563101986902, + "grad_norm": 0.8573475480079651, + "learning_rate": 9.844961695639413e-06, + "loss": 0.7844, + "step": 2980 + }, + { + "epoch": 0.16407066982222468, + "grad_norm": 0.8029958605766296, + "learning_rate": 9.84485457183433e-06, + "loss": 0.7722, + "step": 2981 + }, + { + "epoch": 0.16412570862458034, + "grad_norm": 0.7839805483818054, + "learning_rate": 9.844747411616619e-06, + "loss": 0.8146, + "step": 2982 + }, + { + "epoch": 0.164180747426936, + "grad_norm": 0.7563499212265015, + "learning_rate": 9.844640214987082e-06, + "loss": 0.6909, + "step": 2983 + }, + { + "epoch": 0.16423578622929166, + "grad_norm": 0.7199193239212036, + "learning_rate": 9.844532981946527e-06, + "loss": 0.702, + "step": 2984 + }, + { + "epoch": 0.16429082503164733, + "grad_norm": 0.7519383430480957, + "learning_rate": 9.844425712495758e-06, + "loss": 0.6493, + "step": 2985 + }, + { + "epoch": 0.16434586383400296, + "grad_norm": 0.7493193745613098, + "learning_rate": 9.844318406635584e-06, + "loss": 0.8318, + "step": 2986 + }, + { + "epoch": 0.16440090263635862, + "grad_norm": 0.7951106429100037, + "learning_rate": 9.84421106436681e-06, + "loss": 0.923, + "step": 2987 + }, + { + "epoch": 0.16445594143871428, + "grad_norm": 0.8350820541381836, + "learning_rate": 9.844103685690238e-06, + "loss": 0.8091, + "step": 2988 + }, + { + "epoch": 0.16451098024106994, + "grad_norm": 0.773932695388794, + "learning_rate": 9.843996270606683e-06, + "loss": 0.8016, + "step": 2989 + }, + { + "epoch": 0.1645660190434256, + "grad_norm": 0.8208432793617249, + "learning_rate": 9.843888819116947e-06, + "loss": 0.7704, + "step": 2990 + }, + { + "epoch": 0.16462105784578127, + "grad_norm": 0.8552223443984985, + "learning_rate": 9.84378133122184e-06, + "loss": 0.8519, + "step": 2991 + }, + { + "epoch": 0.16467609664813693, + "grad_norm": 0.8015661835670471, + "learning_rate": 9.84367380692217e-06, + "loss": 0.8389, + "step": 2992 + }, + { + "epoch": 0.1647311354504926, + "grad_norm": 0.7828749418258667, + "learning_rate": 9.843566246218743e-06, + "loss": 0.7385, + "step": 2993 + }, + { + "epoch": 0.16478617425284825, + "grad_norm": 0.7761647701263428, + "learning_rate": 9.84345864911237e-06, + "loss": 0.8419, + "step": 2994 + }, + { + "epoch": 0.1648412130552039, + "grad_norm": 0.8839839100837708, + "learning_rate": 9.843351015603857e-06, + "loss": 0.8069, + "step": 2995 + }, + { + "epoch": 0.16489625185755957, + "grad_norm": 0.8611735105514526, + "learning_rate": 9.843243345694014e-06, + "loss": 0.9406, + "step": 2996 + }, + { + "epoch": 0.16495129065991523, + "grad_norm": 0.9042683839797974, + "learning_rate": 9.84313563938365e-06, + "loss": 0.821, + "step": 2997 + }, + { + "epoch": 0.1650063294622709, + "grad_norm": 0.8333690762519836, + "learning_rate": 9.843027896673577e-06, + "loss": 0.781, + "step": 2998 + }, + { + "epoch": 0.16506136826462656, + "grad_norm": 0.819922924041748, + "learning_rate": 9.8429201175646e-06, + "loss": 0.869, + "step": 2999 + }, + { + "epoch": 0.16511640706698222, + "grad_norm": 0.8349948525428772, + "learning_rate": 9.842812302057534e-06, + "loss": 0.9271, + "step": 3000 + }, + { + "epoch": 0.16517144586933788, + "grad_norm": 0.8981684446334839, + "learning_rate": 9.842704450153187e-06, + "loss": 0.7384, + "step": 3001 + }, + { + "epoch": 0.16522648467169354, + "grad_norm": 0.839133083820343, + "learning_rate": 9.842596561852369e-06, + "loss": 0.9016, + "step": 3002 + }, + { + "epoch": 0.1652815234740492, + "grad_norm": 0.8303349614143372, + "learning_rate": 9.842488637155891e-06, + "loss": 0.7488, + "step": 3003 + }, + { + "epoch": 0.16533656227640486, + "grad_norm": 0.8748323917388916, + "learning_rate": 9.842380676064566e-06, + "loss": 0.8163, + "step": 3004 + }, + { + "epoch": 0.16539160107876053, + "grad_norm": 0.782844603061676, + "learning_rate": 9.842272678579203e-06, + "loss": 0.8465, + "step": 3005 + }, + { + "epoch": 0.1654466398811162, + "grad_norm": 0.8068844676017761, + "learning_rate": 9.842164644700615e-06, + "loss": 0.8856, + "step": 3006 + }, + { + "epoch": 0.16550167868347185, + "grad_norm": 0.9174006581306458, + "learning_rate": 9.842056574429615e-06, + "loss": 0.7748, + "step": 3007 + }, + { + "epoch": 0.1655567174858275, + "grad_norm": 0.7453809380531311, + "learning_rate": 9.841948467767012e-06, + "loss": 0.7565, + "step": 3008 + }, + { + "epoch": 0.16561175628818317, + "grad_norm": 0.8408182859420776, + "learning_rate": 9.841840324713622e-06, + "loss": 0.7345, + "step": 3009 + }, + { + "epoch": 0.16566679509053883, + "grad_norm": 0.8599638938903809, + "learning_rate": 9.841732145270254e-06, + "loss": 0.8163, + "step": 3010 + }, + { + "epoch": 0.1657218338928945, + "grad_norm": 0.877616822719574, + "learning_rate": 9.841623929437725e-06, + "loss": 0.8685, + "step": 3011 + }, + { + "epoch": 0.16577687269525015, + "grad_norm": 0.7765643000602722, + "learning_rate": 9.841515677216846e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.16583191149760582, + "grad_norm": 0.7891712784767151, + "learning_rate": 9.841407388608431e-06, + "loss": 0.8618, + "step": 3013 + }, + { + "epoch": 0.16588695029996148, + "grad_norm": 0.9215571284294128, + "learning_rate": 9.841299063613295e-06, + "loss": 0.8709, + "step": 3014 + }, + { + "epoch": 0.16594198910231714, + "grad_norm": 0.8428288698196411, + "learning_rate": 9.841190702232249e-06, + "loss": 0.8227, + "step": 3015 + }, + { + "epoch": 0.1659970279046728, + "grad_norm": 0.9294042587280273, + "learning_rate": 9.841082304466112e-06, + "loss": 0.8203, + "step": 3016 + }, + { + "epoch": 0.16605206670702846, + "grad_norm": 0.7530880570411682, + "learning_rate": 9.840973870315695e-06, + "loss": 0.7681, + "step": 3017 + }, + { + "epoch": 0.16610710550938412, + "grad_norm": 1.0149626731872559, + "learning_rate": 9.840865399781814e-06, + "loss": 0.9212, + "step": 3018 + }, + { + "epoch": 0.16616214431173978, + "grad_norm": 0.7595353722572327, + "learning_rate": 9.840756892865285e-06, + "loss": 0.795, + "step": 3019 + }, + { + "epoch": 0.16621718311409545, + "grad_norm": 0.7893253564834595, + "learning_rate": 9.840648349566924e-06, + "loss": 0.8147, + "step": 3020 + }, + { + "epoch": 0.1662722219164511, + "grad_norm": 0.8190789222717285, + "learning_rate": 9.840539769887543e-06, + "loss": 0.7233, + "step": 3021 + }, + { + "epoch": 0.16632726071880677, + "grad_norm": 0.7771229147911072, + "learning_rate": 9.840431153827963e-06, + "loss": 0.7172, + "step": 3022 + }, + { + "epoch": 0.16638229952116243, + "grad_norm": 0.7379328012466431, + "learning_rate": 9.840322501388998e-06, + "loss": 0.7603, + "step": 3023 + }, + { + "epoch": 0.1664373383235181, + "grad_norm": 0.9488499760627747, + "learning_rate": 9.840213812571464e-06, + "loss": 0.8025, + "step": 3024 + }, + { + "epoch": 0.16649237712587375, + "grad_norm": 0.7135865092277527, + "learning_rate": 9.84010508737618e-06, + "loss": 0.7412, + "step": 3025 + }, + { + "epoch": 0.16654741592822941, + "grad_norm": 1.6780112981796265, + "learning_rate": 9.83999632580396e-06, + "loss": 0.9231, + "step": 3026 + }, + { + "epoch": 0.16660245473058508, + "grad_norm": 0.8815935850143433, + "learning_rate": 9.839887527855623e-06, + "loss": 0.7903, + "step": 3027 + }, + { + "epoch": 0.16665749353294074, + "grad_norm": 0.8942261338233948, + "learning_rate": 9.83977869353199e-06, + "loss": 0.8328, + "step": 3028 + }, + { + "epoch": 0.16671253233529637, + "grad_norm": 0.7866815328598022, + "learning_rate": 9.839669822833873e-06, + "loss": 0.8483, + "step": 3029 + }, + { + "epoch": 0.16676757113765203, + "grad_norm": 0.8133070468902588, + "learning_rate": 9.839560915762094e-06, + "loss": 0.8665, + "step": 3030 + }, + { + "epoch": 0.1668226099400077, + "grad_norm": 0.7442927360534668, + "learning_rate": 9.839451972317469e-06, + "loss": 0.6296, + "step": 3031 + }, + { + "epoch": 0.16687764874236335, + "grad_norm": 0.7505021691322327, + "learning_rate": 9.83934299250082e-06, + "loss": 0.7976, + "step": 3032 + }, + { + "epoch": 0.16693268754471902, + "grad_norm": 0.8310422897338867, + "learning_rate": 9.839233976312964e-06, + "loss": 0.9022, + "step": 3033 + }, + { + "epoch": 0.16698772634707468, + "grad_norm": 0.8175413012504578, + "learning_rate": 9.839124923754721e-06, + "loss": 0.8653, + "step": 3034 + }, + { + "epoch": 0.16704276514943034, + "grad_norm": 0.7963089346885681, + "learning_rate": 9.839015834826912e-06, + "loss": 0.7888, + "step": 3035 + }, + { + "epoch": 0.167097803951786, + "grad_norm": 0.8923391699790955, + "learning_rate": 9.838906709530353e-06, + "loss": 0.9396, + "step": 3036 + }, + { + "epoch": 0.16715284275414166, + "grad_norm": 0.7851678133010864, + "learning_rate": 9.838797547865869e-06, + "loss": 0.8163, + "step": 3037 + }, + { + "epoch": 0.16720788155649732, + "grad_norm": 0.817877471446991, + "learning_rate": 9.838688349834275e-06, + "loss": 0.8928, + "step": 3038 + }, + { + "epoch": 0.16726292035885298, + "grad_norm": 0.7603926062583923, + "learning_rate": 9.838579115436395e-06, + "loss": 0.7418, + "step": 3039 + }, + { + "epoch": 0.16731795916120865, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.83846984467305e-06, + "loss": 0.8017, + "step": 3040 + }, + { + "epoch": 0.1673729979635643, + "grad_norm": 1.4745439291000366, + "learning_rate": 9.838360537545061e-06, + "loss": 0.7964, + "step": 3041 + }, + { + "epoch": 0.16742803676591997, + "grad_norm": 0.778404176235199, + "learning_rate": 9.83825119405325e-06, + "loss": 0.7395, + "step": 3042 + }, + { + "epoch": 0.16748307556827563, + "grad_norm": 0.8245886564254761, + "learning_rate": 9.838141814198439e-06, + "loss": 0.8697, + "step": 3043 + }, + { + "epoch": 0.1675381143706313, + "grad_norm": 0.8395472764968872, + "learning_rate": 9.838032397981448e-06, + "loss": 0.8545, + "step": 3044 + }, + { + "epoch": 0.16759315317298695, + "grad_norm": 0.8973744511604309, + "learning_rate": 9.8379229454031e-06, + "loss": 0.8999, + "step": 3045 + }, + { + "epoch": 0.16764819197534261, + "grad_norm": 1.2034368515014648, + "learning_rate": 9.837813456464219e-06, + "loss": 0.9039, + "step": 3046 + }, + { + "epoch": 0.16770323077769828, + "grad_norm": 0.862167477607727, + "learning_rate": 9.837703931165625e-06, + "loss": 0.889, + "step": 3047 + }, + { + "epoch": 0.16775826958005394, + "grad_norm": 0.7624714970588684, + "learning_rate": 9.837594369508146e-06, + "loss": 0.7072, + "step": 3048 + }, + { + "epoch": 0.1678133083824096, + "grad_norm": 0.7771621346473694, + "learning_rate": 9.8374847714926e-06, + "loss": 0.8769, + "step": 3049 + }, + { + "epoch": 0.16786834718476526, + "grad_norm": 0.7834492921829224, + "learning_rate": 9.837375137119816e-06, + "loss": 0.841, + "step": 3050 + }, + { + "epoch": 0.16792338598712092, + "grad_norm": 0.8175067901611328, + "learning_rate": 9.837265466390612e-06, + "loss": 0.8149, + "step": 3051 + }, + { + "epoch": 0.16797842478947658, + "grad_norm": 0.7474493384361267, + "learning_rate": 9.83715575930582e-06, + "loss": 0.7716, + "step": 3052 + }, + { + "epoch": 0.16803346359183224, + "grad_norm": 1.1263303756713867, + "learning_rate": 9.837046015866257e-06, + "loss": 0.8026, + "step": 3053 + }, + { + "epoch": 0.1680885023941879, + "grad_norm": 0.8741740584373474, + "learning_rate": 9.836936236072752e-06, + "loss": 0.8795, + "step": 3054 + }, + { + "epoch": 0.16814354119654357, + "grad_norm": 0.8108506798744202, + "learning_rate": 9.83682641992613e-06, + "loss": 0.7682, + "step": 3055 + }, + { + "epoch": 0.16819857999889923, + "grad_norm": 0.9380543231964111, + "learning_rate": 9.836716567427213e-06, + "loss": 0.8739, + "step": 3056 + }, + { + "epoch": 0.1682536188012549, + "grad_norm": 0.7755940556526184, + "learning_rate": 9.83660667857683e-06, + "loss": 0.7287, + "step": 3057 + }, + { + "epoch": 0.16830865760361055, + "grad_norm": 0.808907151222229, + "learning_rate": 9.836496753375807e-06, + "loss": 0.7988, + "step": 3058 + }, + { + "epoch": 0.1683636964059662, + "grad_norm": 1.1496524810791016, + "learning_rate": 9.836386791824967e-06, + "loss": 0.8621, + "step": 3059 + }, + { + "epoch": 0.16841873520832187, + "grad_norm": 0.8550384640693665, + "learning_rate": 9.83627679392514e-06, + "loss": 0.913, + "step": 3060 + }, + { + "epoch": 0.16847377401067753, + "grad_norm": 0.761142909526825, + "learning_rate": 9.83616675967715e-06, + "loss": 0.7271, + "step": 3061 + }, + { + "epoch": 0.1685288128130332, + "grad_norm": 0.8496200442314148, + "learning_rate": 9.836056689081828e-06, + "loss": 0.7885, + "step": 3062 + }, + { + "epoch": 0.16858385161538886, + "grad_norm": 0.8310382962226868, + "learning_rate": 9.835946582139996e-06, + "loss": 0.858, + "step": 3063 + }, + { + "epoch": 0.16863889041774452, + "grad_norm": 0.7870821952819824, + "learning_rate": 9.835836438852485e-06, + "loss": 0.7791, + "step": 3064 + }, + { + "epoch": 0.16869392922010018, + "grad_norm": 0.7170534729957581, + "learning_rate": 9.83572625922012e-06, + "loss": 0.6666, + "step": 3065 + }, + { + "epoch": 0.16874896802245584, + "grad_norm": 0.9764187932014465, + "learning_rate": 9.835616043243732e-06, + "loss": 0.8341, + "step": 3066 + }, + { + "epoch": 0.1688040068248115, + "grad_norm": 0.7453315258026123, + "learning_rate": 9.83550579092415e-06, + "loss": 0.7032, + "step": 3067 + }, + { + "epoch": 0.16885904562716716, + "grad_norm": 0.9205759763717651, + "learning_rate": 9.835395502262196e-06, + "loss": 0.813, + "step": 3068 + }, + { + "epoch": 0.16891408442952283, + "grad_norm": 0.8152205944061279, + "learning_rate": 9.835285177258708e-06, + "loss": 0.8275, + "step": 3069 + }, + { + "epoch": 0.1689691232318785, + "grad_norm": 0.8065707087516785, + "learning_rate": 9.83517481591451e-06, + "loss": 0.8648, + "step": 3070 + }, + { + "epoch": 0.16902416203423415, + "grad_norm": 0.7774410247802734, + "learning_rate": 9.835064418230432e-06, + "loss": 0.7818, + "step": 3071 + }, + { + "epoch": 0.16907920083658978, + "grad_norm": 0.8591069579124451, + "learning_rate": 9.834953984207305e-06, + "loss": 0.8055, + "step": 3072 + }, + { + "epoch": 0.16913423963894544, + "grad_norm": 0.7421612739562988, + "learning_rate": 9.834843513845958e-06, + "loss": 0.7543, + "step": 3073 + }, + { + "epoch": 0.1691892784413011, + "grad_norm": 0.7855183482170105, + "learning_rate": 9.83473300714722e-06, + "loss": 0.7011, + "step": 3074 + }, + { + "epoch": 0.16924431724365677, + "grad_norm": 0.8061636686325073, + "learning_rate": 9.834622464111924e-06, + "loss": 0.8096, + "step": 3075 + }, + { + "epoch": 0.16929935604601243, + "grad_norm": 0.8048406839370728, + "learning_rate": 9.834511884740898e-06, + "loss": 0.8166, + "step": 3076 + }, + { + "epoch": 0.1693543948483681, + "grad_norm": 0.8776549696922302, + "learning_rate": 9.834401269034977e-06, + "loss": 0.8169, + "step": 3077 + }, + { + "epoch": 0.16940943365072375, + "grad_norm": 1.0208356380462646, + "learning_rate": 9.83429061699499e-06, + "loss": 0.6976, + "step": 3078 + }, + { + "epoch": 0.1694644724530794, + "grad_norm": 0.7641016840934753, + "learning_rate": 9.834179928621767e-06, + "loss": 0.7109, + "step": 3079 + }, + { + "epoch": 0.16951951125543507, + "grad_norm": 0.7648905515670776, + "learning_rate": 9.834069203916143e-06, + "loss": 0.7927, + "step": 3080 + }, + { + "epoch": 0.16957455005779073, + "grad_norm": 0.7898744344711304, + "learning_rate": 9.833958442878948e-06, + "loss": 0.7911, + "step": 3081 + }, + { + "epoch": 0.1696295888601464, + "grad_norm": 0.8812462687492371, + "learning_rate": 9.833847645511016e-06, + "loss": 0.8381, + "step": 3082 + }, + { + "epoch": 0.16968462766250206, + "grad_norm": 0.8141197562217712, + "learning_rate": 9.833736811813179e-06, + "loss": 0.7422, + "step": 3083 + }, + { + "epoch": 0.16973966646485772, + "grad_norm": 0.7860949635505676, + "learning_rate": 9.83362594178627e-06, + "loss": 0.7568, + "step": 3084 + }, + { + "epoch": 0.16979470526721338, + "grad_norm": 0.6688396334648132, + "learning_rate": 9.833515035431123e-06, + "loss": 0.7143, + "step": 3085 + }, + { + "epoch": 0.16984974406956904, + "grad_norm": 0.7525103092193604, + "learning_rate": 9.833404092748569e-06, + "loss": 0.8026, + "step": 3086 + }, + { + "epoch": 0.1699047828719247, + "grad_norm": 0.8505181670188904, + "learning_rate": 9.833293113739444e-06, + "loss": 0.8894, + "step": 3087 + }, + { + "epoch": 0.16995982167428036, + "grad_norm": 0.8432300090789795, + "learning_rate": 9.833182098404583e-06, + "loss": 0.7801, + "step": 3088 + }, + { + "epoch": 0.17001486047663603, + "grad_norm": 0.7655903100967407, + "learning_rate": 9.833071046744819e-06, + "loss": 0.7838, + "step": 3089 + }, + { + "epoch": 0.1700698992789917, + "grad_norm": 0.8436369895935059, + "learning_rate": 9.832959958760986e-06, + "loss": 0.8636, + "step": 3090 + }, + { + "epoch": 0.17012493808134735, + "grad_norm": 0.7880234122276306, + "learning_rate": 9.83284883445392e-06, + "loss": 0.7701, + "step": 3091 + }, + { + "epoch": 0.170179976883703, + "grad_norm": 0.7713757753372192, + "learning_rate": 9.832737673824455e-06, + "loss": 0.8652, + "step": 3092 + }, + { + "epoch": 0.17023501568605867, + "grad_norm": 0.7905295491218567, + "learning_rate": 9.832626476873428e-06, + "loss": 0.8666, + "step": 3093 + }, + { + "epoch": 0.17029005448841433, + "grad_norm": 0.7589883804321289, + "learning_rate": 9.832515243601675e-06, + "loss": 0.8051, + "step": 3094 + }, + { + "epoch": 0.17034509329077, + "grad_norm": 0.9068838953971863, + "learning_rate": 9.83240397401003e-06, + "loss": 0.9037, + "step": 3095 + }, + { + "epoch": 0.17040013209312566, + "grad_norm": 0.7465278506278992, + "learning_rate": 9.83229266809933e-06, + "loss": 0.7425, + "step": 3096 + }, + { + "epoch": 0.17045517089548132, + "grad_norm": 0.8111177086830139, + "learning_rate": 9.83218132587041e-06, + "loss": 0.8034, + "step": 3097 + }, + { + "epoch": 0.17051020969783698, + "grad_norm": 1.1007672548294067, + "learning_rate": 9.832069947324112e-06, + "loss": 0.9139, + "step": 3098 + }, + { + "epoch": 0.17056524850019264, + "grad_norm": 0.881179690361023, + "learning_rate": 9.831958532461269e-06, + "loss": 0.9062, + "step": 3099 + }, + { + "epoch": 0.1706202873025483, + "grad_norm": 0.8012413382530212, + "learning_rate": 9.831847081282718e-06, + "loss": 0.7956, + "step": 3100 + }, + { + "epoch": 0.17067532610490396, + "grad_norm": 0.741731584072113, + "learning_rate": 9.831735593789298e-06, + "loss": 0.8754, + "step": 3101 + }, + { + "epoch": 0.17073036490725962, + "grad_norm": 0.8945604562759399, + "learning_rate": 9.831624069981848e-06, + "loss": 0.8293, + "step": 3102 + }, + { + "epoch": 0.17078540370961529, + "grad_norm": 0.7865545749664307, + "learning_rate": 9.831512509861203e-06, + "loss": 0.7812, + "step": 3103 + }, + { + "epoch": 0.17084044251197095, + "grad_norm": 0.832847535610199, + "learning_rate": 9.831400913428205e-06, + "loss": 0.8925, + "step": 3104 + }, + { + "epoch": 0.1708954813143266, + "grad_norm": 0.7374216914176941, + "learning_rate": 9.83128928068369e-06, + "loss": 0.8275, + "step": 3105 + }, + { + "epoch": 0.17095052011668227, + "grad_norm": 0.748725414276123, + "learning_rate": 9.831177611628497e-06, + "loss": 0.8364, + "step": 3106 + }, + { + "epoch": 0.17100555891903793, + "grad_norm": 0.810276448726654, + "learning_rate": 9.831065906263468e-06, + "loss": 0.861, + "step": 3107 + }, + { + "epoch": 0.1710605977213936, + "grad_norm": 0.7607758641242981, + "learning_rate": 9.83095416458944e-06, + "loss": 0.7989, + "step": 3108 + }, + { + "epoch": 0.17111563652374925, + "grad_norm": 0.7206127047538757, + "learning_rate": 9.830842386607253e-06, + "loss": 0.7187, + "step": 3109 + }, + { + "epoch": 0.17117067532610492, + "grad_norm": 0.7775895595550537, + "learning_rate": 9.83073057231775e-06, + "loss": 0.8008, + "step": 3110 + }, + { + "epoch": 0.17122571412846058, + "grad_norm": 0.8351094722747803, + "learning_rate": 9.830618721721768e-06, + "loss": 0.8025, + "step": 3111 + }, + { + "epoch": 0.17128075293081624, + "grad_norm": 0.8090646266937256, + "learning_rate": 9.830506834820148e-06, + "loss": 0.8012, + "step": 3112 + }, + { + "epoch": 0.1713357917331719, + "grad_norm": 0.7762801051139832, + "learning_rate": 9.830394911613733e-06, + "loss": 0.8428, + "step": 3113 + }, + { + "epoch": 0.17139083053552756, + "grad_norm": 0.8117541074752808, + "learning_rate": 9.83028295210336e-06, + "loss": 0.8566, + "step": 3114 + }, + { + "epoch": 0.1714458693378832, + "grad_norm": 0.8786184787750244, + "learning_rate": 9.830170956289876e-06, + "loss": 0.8386, + "step": 3115 + }, + { + "epoch": 0.17150090814023886, + "grad_norm": 1.0181046724319458, + "learning_rate": 9.83005892417412e-06, + "loss": 0.8555, + "step": 3116 + }, + { + "epoch": 0.17155594694259452, + "grad_norm": 0.8236173391342163, + "learning_rate": 9.829946855756934e-06, + "loss": 0.7933, + "step": 3117 + }, + { + "epoch": 0.17161098574495018, + "grad_norm": 0.8058149814605713, + "learning_rate": 9.829834751039157e-06, + "loss": 0.842, + "step": 3118 + }, + { + "epoch": 0.17166602454730584, + "grad_norm": 0.7419908046722412, + "learning_rate": 9.82972261002164e-06, + "loss": 0.8397, + "step": 3119 + }, + { + "epoch": 0.1717210633496615, + "grad_norm": 0.7528164982795715, + "learning_rate": 9.829610432705216e-06, + "loss": 0.7931, + "step": 3120 + }, + { + "epoch": 0.17177610215201716, + "grad_norm": 0.7357296943664551, + "learning_rate": 9.829498219090736e-06, + "loss": 0.8089, + "step": 3121 + }, + { + "epoch": 0.17183114095437282, + "grad_norm": 0.7635773420333862, + "learning_rate": 9.829385969179039e-06, + "loss": 0.7442, + "step": 3122 + }, + { + "epoch": 0.17188617975672849, + "grad_norm": 0.8200171589851379, + "learning_rate": 9.82927368297097e-06, + "loss": 0.757, + "step": 3123 + }, + { + "epoch": 0.17194121855908415, + "grad_norm": 0.8367171287536621, + "learning_rate": 9.829161360467374e-06, + "loss": 0.915, + "step": 3124 + }, + { + "epoch": 0.1719962573614398, + "grad_norm": 0.8460778594017029, + "learning_rate": 9.829049001669091e-06, + "loss": 0.8568, + "step": 3125 + }, + { + "epoch": 0.17205129616379547, + "grad_norm": 0.7301799058914185, + "learning_rate": 9.82893660657697e-06, + "loss": 0.8041, + "step": 3126 + }, + { + "epoch": 0.17210633496615113, + "grad_norm": 0.7858132123947144, + "learning_rate": 9.828824175191854e-06, + "loss": 0.8367, + "step": 3127 + }, + { + "epoch": 0.1721613737685068, + "grad_norm": 0.8118360042572021, + "learning_rate": 9.82871170751459e-06, + "loss": 0.85, + "step": 3128 + }, + { + "epoch": 0.17221641257086245, + "grad_norm": 0.9020261764526367, + "learning_rate": 9.828599203546019e-06, + "loss": 0.789, + "step": 3129 + }, + { + "epoch": 0.17227145137321812, + "grad_norm": 0.8194546699523926, + "learning_rate": 9.828486663286989e-06, + "loss": 0.8644, + "step": 3130 + }, + { + "epoch": 0.17232649017557378, + "grad_norm": 0.7764905095100403, + "learning_rate": 9.828374086738345e-06, + "loss": 0.7961, + "step": 3131 + }, + { + "epoch": 0.17238152897792944, + "grad_norm": 0.7712632417678833, + "learning_rate": 9.828261473900935e-06, + "loss": 0.8082, + "step": 3132 + }, + { + "epoch": 0.1724365677802851, + "grad_norm": 0.7100280523300171, + "learning_rate": 9.828148824775604e-06, + "loss": 0.7514, + "step": 3133 + }, + { + "epoch": 0.17249160658264076, + "grad_norm": 0.7812890410423279, + "learning_rate": 9.8280361393632e-06, + "loss": 0.7125, + "step": 3134 + }, + { + "epoch": 0.17254664538499642, + "grad_norm": 0.8772642612457275, + "learning_rate": 9.827923417664568e-06, + "loss": 0.8355, + "step": 3135 + }, + { + "epoch": 0.17260168418735208, + "grad_norm": 0.9161205291748047, + "learning_rate": 9.827810659680555e-06, + "loss": 0.7511, + "step": 3136 + }, + { + "epoch": 0.17265672298970774, + "grad_norm": 0.7628560662269592, + "learning_rate": 9.82769786541201e-06, + "loss": 0.882, + "step": 3137 + }, + { + "epoch": 0.1727117617920634, + "grad_norm": 0.8203405737876892, + "learning_rate": 9.827585034859781e-06, + "loss": 0.8172, + "step": 3138 + }, + { + "epoch": 0.17276680059441907, + "grad_norm": 0.8318095207214355, + "learning_rate": 9.827472168024715e-06, + "loss": 0.7784, + "step": 3139 + }, + { + "epoch": 0.17282183939677473, + "grad_norm": 0.9137747287750244, + "learning_rate": 9.827359264907658e-06, + "loss": 0.8643, + "step": 3140 + }, + { + "epoch": 0.1728768781991304, + "grad_norm": 0.9441068768501282, + "learning_rate": 9.827246325509463e-06, + "loss": 0.7936, + "step": 3141 + }, + { + "epoch": 0.17293191700148605, + "grad_norm": 0.7402390837669373, + "learning_rate": 9.827133349830977e-06, + "loss": 0.7813, + "step": 3142 + }, + { + "epoch": 0.1729869558038417, + "grad_norm": 0.8328836560249329, + "learning_rate": 9.827020337873048e-06, + "loss": 0.7676, + "step": 3143 + }, + { + "epoch": 0.17304199460619737, + "grad_norm": 0.8106881380081177, + "learning_rate": 9.826907289636526e-06, + "loss": 0.9037, + "step": 3144 + }, + { + "epoch": 0.17309703340855304, + "grad_norm": 0.8457425236701965, + "learning_rate": 9.826794205122263e-06, + "loss": 0.78, + "step": 3145 + }, + { + "epoch": 0.1731520722109087, + "grad_norm": 0.9335517883300781, + "learning_rate": 9.826681084331105e-06, + "loss": 0.9197, + "step": 3146 + }, + { + "epoch": 0.17320711101326436, + "grad_norm": 0.9098715782165527, + "learning_rate": 9.826567927263904e-06, + "loss": 0.932, + "step": 3147 + }, + { + "epoch": 0.17326214981562002, + "grad_norm": 0.767234206199646, + "learning_rate": 9.826454733921512e-06, + "loss": 0.8717, + "step": 3148 + }, + { + "epoch": 0.17331718861797568, + "grad_norm": 0.8114444017410278, + "learning_rate": 9.826341504304775e-06, + "loss": 0.8744, + "step": 3149 + }, + { + "epoch": 0.17337222742033134, + "grad_norm": 0.7948976755142212, + "learning_rate": 9.82622823841455e-06, + "loss": 0.7947, + "step": 3150 + }, + { + "epoch": 0.173427266222687, + "grad_norm": 0.7808204889297485, + "learning_rate": 9.826114936251684e-06, + "loss": 0.8151, + "step": 3151 + }, + { + "epoch": 0.17348230502504267, + "grad_norm": 0.733860969543457, + "learning_rate": 9.82600159781703e-06, + "loss": 0.8018, + "step": 3152 + }, + { + "epoch": 0.17353734382739833, + "grad_norm": 0.7630699276924133, + "learning_rate": 9.825888223111442e-06, + "loss": 0.7937, + "step": 3153 + }, + { + "epoch": 0.173592382629754, + "grad_norm": 0.7892931699752808, + "learning_rate": 9.825774812135766e-06, + "loss": 0.782, + "step": 3154 + }, + { + "epoch": 0.17364742143210965, + "grad_norm": 0.6642436385154724, + "learning_rate": 9.825661364890862e-06, + "loss": 0.6611, + "step": 3155 + }, + { + "epoch": 0.1737024602344653, + "grad_norm": 0.7755968570709229, + "learning_rate": 9.825547881377577e-06, + "loss": 0.7835, + "step": 3156 + }, + { + "epoch": 0.17375749903682097, + "grad_norm": 0.8406579494476318, + "learning_rate": 9.825434361596766e-06, + "loss": 0.9178, + "step": 3157 + }, + { + "epoch": 0.1738125378391766, + "grad_norm": 0.8887308835983276, + "learning_rate": 9.825320805549284e-06, + "loss": 0.7951, + "step": 3158 + }, + { + "epoch": 0.17386757664153227, + "grad_norm": 0.85418701171875, + "learning_rate": 9.825207213235978e-06, + "loss": 0.8671, + "step": 3159 + }, + { + "epoch": 0.17392261544388793, + "grad_norm": 0.8831202983856201, + "learning_rate": 9.82509358465771e-06, + "loss": 0.8708, + "step": 3160 + }, + { + "epoch": 0.1739776542462436, + "grad_norm": 0.9041616320610046, + "learning_rate": 9.82497991981533e-06, + "loss": 0.8981, + "step": 3161 + }, + { + "epoch": 0.17403269304859925, + "grad_norm": 0.8169258832931519, + "learning_rate": 9.824866218709692e-06, + "loss": 0.8857, + "step": 3162 + }, + { + "epoch": 0.1740877318509549, + "grad_norm": 0.8714475631713867, + "learning_rate": 9.824752481341651e-06, + "loss": 0.8552, + "step": 3163 + }, + { + "epoch": 0.17414277065331057, + "grad_norm": 0.8261111378669739, + "learning_rate": 9.824638707712061e-06, + "loss": 0.808, + "step": 3164 + }, + { + "epoch": 0.17419780945566624, + "grad_norm": 0.7542527914047241, + "learning_rate": 9.82452489782178e-06, + "loss": 0.8078, + "step": 3165 + }, + { + "epoch": 0.1742528482580219, + "grad_norm": 1.309218168258667, + "learning_rate": 9.824411051671658e-06, + "loss": 0.9325, + "step": 3166 + }, + { + "epoch": 0.17430788706037756, + "grad_norm": 0.8528563380241394, + "learning_rate": 9.824297169262555e-06, + "loss": 0.8493, + "step": 3167 + }, + { + "epoch": 0.17436292586273322, + "grad_norm": 0.7777062058448792, + "learning_rate": 9.824183250595328e-06, + "loss": 0.7002, + "step": 3168 + }, + { + "epoch": 0.17441796466508888, + "grad_norm": 0.7385506629943848, + "learning_rate": 9.824069295670828e-06, + "loss": 0.8396, + "step": 3169 + }, + { + "epoch": 0.17447300346744454, + "grad_norm": 0.8316949605941772, + "learning_rate": 9.823955304489918e-06, + "loss": 0.8769, + "step": 3170 + }, + { + "epoch": 0.1745280422698002, + "grad_norm": 0.8149139285087585, + "learning_rate": 9.823841277053448e-06, + "loss": 0.8009, + "step": 3171 + }, + { + "epoch": 0.17458308107215587, + "grad_norm": 0.8761584162712097, + "learning_rate": 9.82372721336228e-06, + "loss": 0.7366, + "step": 3172 + }, + { + "epoch": 0.17463811987451153, + "grad_norm": 0.7104084491729736, + "learning_rate": 9.82361311341727e-06, + "loss": 0.6704, + "step": 3173 + }, + { + "epoch": 0.1746931586768672, + "grad_norm": 0.791806697845459, + "learning_rate": 9.823498977219273e-06, + "loss": 0.9054, + "step": 3174 + }, + { + "epoch": 0.17474819747922285, + "grad_norm": 0.7675086855888367, + "learning_rate": 9.82338480476915e-06, + "loss": 0.751, + "step": 3175 + }, + { + "epoch": 0.1748032362815785, + "grad_norm": 0.7380725145339966, + "learning_rate": 9.823270596067759e-06, + "loss": 0.7618, + "step": 3176 + }, + { + "epoch": 0.17485827508393417, + "grad_norm": 0.7311519384384155, + "learning_rate": 9.823156351115954e-06, + "loss": 0.7424, + "step": 3177 + }, + { + "epoch": 0.17491331388628983, + "grad_norm": 0.7888365387916565, + "learning_rate": 9.8230420699146e-06, + "loss": 0.7717, + "step": 3178 + }, + { + "epoch": 0.1749683526886455, + "grad_norm": 0.9329265356063843, + "learning_rate": 9.822927752464552e-06, + "loss": 0.8256, + "step": 3179 + }, + { + "epoch": 0.17502339149100116, + "grad_norm": 0.711794912815094, + "learning_rate": 9.822813398766671e-06, + "loss": 0.7373, + "step": 3180 + }, + { + "epoch": 0.17507843029335682, + "grad_norm": 0.8713497519493103, + "learning_rate": 9.822699008821813e-06, + "loss": 0.8135, + "step": 3181 + }, + { + "epoch": 0.17513346909571248, + "grad_norm": 0.6923471689224243, + "learning_rate": 9.822584582630841e-06, + "loss": 0.7589, + "step": 3182 + }, + { + "epoch": 0.17518850789806814, + "grad_norm": 0.8648017048835754, + "learning_rate": 9.822470120194616e-06, + "loss": 0.7828, + "step": 3183 + }, + { + "epoch": 0.1752435467004238, + "grad_norm": 0.8407077789306641, + "learning_rate": 9.822355621513994e-06, + "loss": 0.8537, + "step": 3184 + }, + { + "epoch": 0.17529858550277946, + "grad_norm": 0.8076738119125366, + "learning_rate": 9.822241086589841e-06, + "loss": 0.7827, + "step": 3185 + }, + { + "epoch": 0.17535362430513513, + "grad_norm": 0.8402661085128784, + "learning_rate": 9.822126515423011e-06, + "loss": 0.8247, + "step": 3186 + }, + { + "epoch": 0.1754086631074908, + "grad_norm": 0.8911813497543335, + "learning_rate": 9.822011908014373e-06, + "loss": 0.8996, + "step": 3187 + }, + { + "epoch": 0.17546370190984645, + "grad_norm": 0.8060111999511719, + "learning_rate": 9.821897264364782e-06, + "loss": 0.796, + "step": 3188 + }, + { + "epoch": 0.1755187407122021, + "grad_norm": 0.8476423621177673, + "learning_rate": 9.8217825844751e-06, + "loss": 0.8657, + "step": 3189 + }, + { + "epoch": 0.17557377951455777, + "grad_norm": 0.7614054083824158, + "learning_rate": 9.821667868346194e-06, + "loss": 0.8583, + "step": 3190 + }, + { + "epoch": 0.17562881831691343, + "grad_norm": 0.8312287330627441, + "learning_rate": 9.821553115978923e-06, + "loss": 0.7718, + "step": 3191 + }, + { + "epoch": 0.1756838571192691, + "grad_norm": 0.8199487328529358, + "learning_rate": 9.82143832737415e-06, + "loss": 0.7617, + "step": 3192 + }, + { + "epoch": 0.17573889592162475, + "grad_norm": 0.7529115080833435, + "learning_rate": 9.821323502532733e-06, + "loss": 0.7587, + "step": 3193 + }, + { + "epoch": 0.17579393472398042, + "grad_norm": 0.9205463528633118, + "learning_rate": 9.821208641455542e-06, + "loss": 0.7871, + "step": 3194 + }, + { + "epoch": 0.17584897352633608, + "grad_norm": 0.8055161833763123, + "learning_rate": 9.821093744143437e-06, + "loss": 0.8133, + "step": 3195 + }, + { + "epoch": 0.17590401232869174, + "grad_norm": 0.7322981953620911, + "learning_rate": 9.82097881059728e-06, + "loss": 0.7442, + "step": 3196 + }, + { + "epoch": 0.1759590511310474, + "grad_norm": 1.0465941429138184, + "learning_rate": 9.82086384081794e-06, + "loss": 1.0073, + "step": 3197 + }, + { + "epoch": 0.17601408993340306, + "grad_norm": 0.7607331275939941, + "learning_rate": 9.820748834806278e-06, + "loss": 0.8128, + "step": 3198 + }, + { + "epoch": 0.17606912873575872, + "grad_norm": 0.7901879549026489, + "learning_rate": 9.820633792563156e-06, + "loss": 0.7928, + "step": 3199 + }, + { + "epoch": 0.17612416753811436, + "grad_norm": 0.8010839223861694, + "learning_rate": 9.820518714089442e-06, + "loss": 0.7025, + "step": 3200 + }, + { + "epoch": 0.17617920634047002, + "grad_norm": 0.8511317372322083, + "learning_rate": 9.820403599385999e-06, + "loss": 0.7947, + "step": 3201 + }, + { + "epoch": 0.17623424514282568, + "grad_norm": 0.7978847026824951, + "learning_rate": 9.820288448453693e-06, + "loss": 0.7395, + "step": 3202 + }, + { + "epoch": 0.17628928394518134, + "grad_norm": 0.6991232633590698, + "learning_rate": 9.820173261293388e-06, + "loss": 0.7113, + "step": 3203 + }, + { + "epoch": 0.176344322747537, + "grad_norm": 0.8966444730758667, + "learning_rate": 9.820058037905954e-06, + "loss": 0.7399, + "step": 3204 + }, + { + "epoch": 0.17639936154989266, + "grad_norm": 0.8042632341384888, + "learning_rate": 9.819942778292253e-06, + "loss": 0.8183, + "step": 3205 + }, + { + "epoch": 0.17645440035224833, + "grad_norm": 0.8047537803649902, + "learning_rate": 9.81982748245315e-06, + "loss": 0.852, + "step": 3206 + }, + { + "epoch": 0.176509439154604, + "grad_norm": 0.8277122378349304, + "learning_rate": 9.819712150389517e-06, + "loss": 0.8828, + "step": 3207 + }, + { + "epoch": 0.17656447795695965, + "grad_norm": 0.8677185773849487, + "learning_rate": 9.819596782102216e-06, + "loss": 0.8416, + "step": 3208 + }, + { + "epoch": 0.1766195167593153, + "grad_norm": 0.8750975728034973, + "learning_rate": 9.819481377592115e-06, + "loss": 0.9289, + "step": 3209 + }, + { + "epoch": 0.17667455556167097, + "grad_norm": 0.7665122151374817, + "learning_rate": 9.819365936860084e-06, + "loss": 0.8653, + "step": 3210 + }, + { + "epoch": 0.17672959436402663, + "grad_norm": 0.9341353178024292, + "learning_rate": 9.819250459906989e-06, + "loss": 0.7225, + "step": 3211 + }, + { + "epoch": 0.1767846331663823, + "grad_norm": 0.7007241249084473, + "learning_rate": 9.819134946733696e-06, + "loss": 0.7429, + "step": 3212 + }, + { + "epoch": 0.17683967196873795, + "grad_norm": 0.8001461029052734, + "learning_rate": 9.819019397341074e-06, + "loss": 0.759, + "step": 3213 + }, + { + "epoch": 0.17689471077109362, + "grad_norm": 0.8936446905136108, + "learning_rate": 9.818903811729993e-06, + "loss": 0.8248, + "step": 3214 + }, + { + "epoch": 0.17694974957344928, + "grad_norm": 0.805570125579834, + "learning_rate": 9.818788189901321e-06, + "loss": 0.9214, + "step": 3215 + }, + { + "epoch": 0.17700478837580494, + "grad_norm": 0.7762455940246582, + "learning_rate": 9.818672531855926e-06, + "loss": 0.7848, + "step": 3216 + }, + { + "epoch": 0.1770598271781606, + "grad_norm": 0.8391497731208801, + "learning_rate": 9.81855683759468e-06, + "loss": 0.7543, + "step": 3217 + }, + { + "epoch": 0.17711486598051626, + "grad_norm": 0.8489046692848206, + "learning_rate": 9.818441107118449e-06, + "loss": 0.7908, + "step": 3218 + }, + { + "epoch": 0.17716990478287192, + "grad_norm": 1.0949461460113525, + "learning_rate": 9.818325340428105e-06, + "loss": 0.8255, + "step": 3219 + }, + { + "epoch": 0.17722494358522758, + "grad_norm": 0.8710842132568359, + "learning_rate": 9.81820953752452e-06, + "loss": 0.859, + "step": 3220 + }, + { + "epoch": 0.17727998238758325, + "grad_norm": 0.7936064600944519, + "learning_rate": 9.818093698408558e-06, + "loss": 0.8475, + "step": 3221 + }, + { + "epoch": 0.1773350211899389, + "grad_norm": 0.790341854095459, + "learning_rate": 9.817977823081095e-06, + "loss": 0.8137, + "step": 3222 + }, + { + "epoch": 0.17739005999229457, + "grad_norm": 0.8154531717300415, + "learning_rate": 9.817861911543002e-06, + "loss": 0.8687, + "step": 3223 + }, + { + "epoch": 0.17744509879465023, + "grad_norm": 0.8346067070960999, + "learning_rate": 9.817745963795144e-06, + "loss": 0.8905, + "step": 3224 + }, + { + "epoch": 0.1775001375970059, + "grad_norm": 0.7137764096260071, + "learning_rate": 9.817629979838401e-06, + "loss": 0.7715, + "step": 3225 + }, + { + "epoch": 0.17755517639936155, + "grad_norm": 0.7237628102302551, + "learning_rate": 9.81751395967364e-06, + "loss": 0.7824, + "step": 3226 + }, + { + "epoch": 0.17761021520171721, + "grad_norm": 0.9481163024902344, + "learning_rate": 9.817397903301733e-06, + "loss": 0.7451, + "step": 3227 + }, + { + "epoch": 0.17766525400407288, + "grad_norm": 0.9472424387931824, + "learning_rate": 9.817281810723552e-06, + "loss": 0.8774, + "step": 3228 + }, + { + "epoch": 0.17772029280642854, + "grad_norm": 0.9295538663864136, + "learning_rate": 9.81716568193997e-06, + "loss": 0.8507, + "step": 3229 + }, + { + "epoch": 0.1777753316087842, + "grad_norm": 0.7668172717094421, + "learning_rate": 9.817049516951863e-06, + "loss": 0.8547, + "step": 3230 + }, + { + "epoch": 0.17783037041113986, + "grad_norm": 0.8640413880348206, + "learning_rate": 9.8169333157601e-06, + "loss": 0.8485, + "step": 3231 + }, + { + "epoch": 0.17788540921349552, + "grad_norm": 0.9901431798934937, + "learning_rate": 9.816817078365554e-06, + "loss": 0.9236, + "step": 3232 + }, + { + "epoch": 0.17794044801585118, + "grad_norm": 1.0242371559143066, + "learning_rate": 9.816700804769104e-06, + "loss": 0.8096, + "step": 3233 + }, + { + "epoch": 0.17799548681820684, + "grad_norm": 0.910498857498169, + "learning_rate": 9.816584494971617e-06, + "loss": 0.829, + "step": 3234 + }, + { + "epoch": 0.1780505256205625, + "grad_norm": 0.8254473805427551, + "learning_rate": 9.816468148973972e-06, + "loss": 0.7828, + "step": 3235 + }, + { + "epoch": 0.17810556442291817, + "grad_norm": 0.7971221804618835, + "learning_rate": 9.816351766777039e-06, + "loss": 0.8057, + "step": 3236 + }, + { + "epoch": 0.17816060322527383, + "grad_norm": 0.8151674270629883, + "learning_rate": 9.816235348381697e-06, + "loss": 0.7801, + "step": 3237 + }, + { + "epoch": 0.1782156420276295, + "grad_norm": 0.7587556838989258, + "learning_rate": 9.81611889378882e-06, + "loss": 0.7814, + "step": 3238 + }, + { + "epoch": 0.17827068082998515, + "grad_norm": 0.8843516111373901, + "learning_rate": 9.816002402999283e-06, + "loss": 0.8873, + "step": 3239 + }, + { + "epoch": 0.1783257196323408, + "grad_norm": 0.917859673500061, + "learning_rate": 9.81588587601396e-06, + "loss": 0.8963, + "step": 3240 + }, + { + "epoch": 0.17838075843469647, + "grad_norm": 0.8256439566612244, + "learning_rate": 9.815769312833727e-06, + "loss": 0.9157, + "step": 3241 + }, + { + "epoch": 0.17843579723705214, + "grad_norm": 0.8364603519439697, + "learning_rate": 9.815652713459462e-06, + "loss": 0.8253, + "step": 3242 + }, + { + "epoch": 0.17849083603940777, + "grad_norm": 0.7717131972312927, + "learning_rate": 9.81553607789204e-06, + "loss": 0.7211, + "step": 3243 + }, + { + "epoch": 0.17854587484176343, + "grad_norm": 0.8069111704826355, + "learning_rate": 9.815419406132338e-06, + "loss": 0.8986, + "step": 3244 + }, + { + "epoch": 0.1786009136441191, + "grad_norm": 0.9176943302154541, + "learning_rate": 9.815302698181233e-06, + "loss": 0.8084, + "step": 3245 + }, + { + "epoch": 0.17865595244647475, + "grad_norm": 0.769183874130249, + "learning_rate": 9.815185954039601e-06, + "loss": 0.8084, + "step": 3246 + }, + { + "epoch": 0.17871099124883041, + "grad_norm": 0.8070697784423828, + "learning_rate": 9.815069173708321e-06, + "loss": 0.8371, + "step": 3247 + }, + { + "epoch": 0.17876603005118608, + "grad_norm": 0.7837347388267517, + "learning_rate": 9.81495235718827e-06, + "loss": 0.8015, + "step": 3248 + }, + { + "epoch": 0.17882106885354174, + "grad_norm": 0.9248430728912354, + "learning_rate": 9.814835504480327e-06, + "loss": 0.8396, + "step": 3249 + }, + { + "epoch": 0.1788761076558974, + "grad_norm": 0.7914367914199829, + "learning_rate": 9.814718615585367e-06, + "loss": 0.8068, + "step": 3250 + }, + { + "epoch": 0.17893114645825306, + "grad_norm": 0.8612570762634277, + "learning_rate": 9.814601690504273e-06, + "loss": 0.8227, + "step": 3251 + }, + { + "epoch": 0.17898618526060872, + "grad_norm": 0.7476248741149902, + "learning_rate": 9.81448472923792e-06, + "loss": 0.8609, + "step": 3252 + }, + { + "epoch": 0.17904122406296438, + "grad_norm": 0.7455218434333801, + "learning_rate": 9.81436773178719e-06, + "loss": 0.7992, + "step": 3253 + }, + { + "epoch": 0.17909626286532004, + "grad_norm": 0.7917896509170532, + "learning_rate": 9.814250698152958e-06, + "loss": 0.8383, + "step": 3254 + }, + { + "epoch": 0.1791513016676757, + "grad_norm": 0.6926130652427673, + "learning_rate": 9.81413362833611e-06, + "loss": 0.709, + "step": 3255 + }, + { + "epoch": 0.17920634047003137, + "grad_norm": 0.8219630718231201, + "learning_rate": 9.814016522337519e-06, + "loss": 0.9387, + "step": 3256 + }, + { + "epoch": 0.17926137927238703, + "grad_norm": 0.8588619828224182, + "learning_rate": 9.81389938015807e-06, + "loss": 0.8354, + "step": 3257 + }, + { + "epoch": 0.1793164180747427, + "grad_norm": 0.7868718504905701, + "learning_rate": 9.81378220179864e-06, + "loss": 0.8464, + "step": 3258 + }, + { + "epoch": 0.17937145687709835, + "grad_norm": 0.789479672908783, + "learning_rate": 9.813664987260114e-06, + "loss": 0.8577, + "step": 3259 + }, + { + "epoch": 0.179426495679454, + "grad_norm": 0.8280717730522156, + "learning_rate": 9.81354773654337e-06, + "loss": 0.765, + "step": 3260 + }, + { + "epoch": 0.17948153448180967, + "grad_norm": 0.7660181522369385, + "learning_rate": 9.813430449649289e-06, + "loss": 0.7116, + "step": 3261 + }, + { + "epoch": 0.17953657328416534, + "grad_norm": 0.8043892979621887, + "learning_rate": 9.813313126578754e-06, + "loss": 0.8398, + "step": 3262 + }, + { + "epoch": 0.179591612086521, + "grad_norm": 0.8708420991897583, + "learning_rate": 9.813195767332647e-06, + "loss": 0.8246, + "step": 3263 + }, + { + "epoch": 0.17964665088887666, + "grad_norm": 1.1456964015960693, + "learning_rate": 9.813078371911846e-06, + "loss": 0.8798, + "step": 3264 + }, + { + "epoch": 0.17970168969123232, + "grad_norm": 0.9668154716491699, + "learning_rate": 9.812960940317238e-06, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.17975672849358798, + "grad_norm": 0.862050473690033, + "learning_rate": 9.812843472549705e-06, + "loss": 0.8675, + "step": 3266 + }, + { + "epoch": 0.17981176729594364, + "grad_norm": 0.7776491641998291, + "learning_rate": 9.812725968610126e-06, + "loss": 0.7727, + "step": 3267 + }, + { + "epoch": 0.1798668060982993, + "grad_norm": 0.7197048664093018, + "learning_rate": 9.812608428499389e-06, + "loss": 0.6877, + "step": 3268 + }, + { + "epoch": 0.17992184490065496, + "grad_norm": 0.7995713353157043, + "learning_rate": 9.812490852218375e-06, + "loss": 0.8576, + "step": 3269 + }, + { + "epoch": 0.17997688370301063, + "grad_norm": 0.8300820589065552, + "learning_rate": 9.812373239767967e-06, + "loss": 0.8119, + "step": 3270 + }, + { + "epoch": 0.1800319225053663, + "grad_norm": 0.8625856041908264, + "learning_rate": 9.812255591149052e-06, + "loss": 0.7547, + "step": 3271 + }, + { + "epoch": 0.18008696130772195, + "grad_norm": 1.016419768333435, + "learning_rate": 9.812137906362511e-06, + "loss": 0.8457, + "step": 3272 + }, + { + "epoch": 0.1801420001100776, + "grad_norm": 0.7303110361099243, + "learning_rate": 9.812020185409229e-06, + "loss": 0.7954, + "step": 3273 + }, + { + "epoch": 0.18019703891243327, + "grad_norm": 0.8632498383522034, + "learning_rate": 9.811902428290093e-06, + "loss": 0.8952, + "step": 3274 + }, + { + "epoch": 0.18025207771478893, + "grad_norm": 0.7666932940483093, + "learning_rate": 9.811784635005984e-06, + "loss": 0.746, + "step": 3275 + }, + { + "epoch": 0.1803071165171446, + "grad_norm": 0.8962032198905945, + "learning_rate": 9.811666805557791e-06, + "loss": 0.8654, + "step": 3276 + }, + { + "epoch": 0.18036215531950026, + "grad_norm": 0.9399656057357788, + "learning_rate": 9.811548939946397e-06, + "loss": 0.8062, + "step": 3277 + }, + { + "epoch": 0.18041719412185592, + "grad_norm": 0.7469807863235474, + "learning_rate": 9.811431038172692e-06, + "loss": 0.79, + "step": 3278 + }, + { + "epoch": 0.18047223292421158, + "grad_norm": 0.7661105394363403, + "learning_rate": 9.811313100237556e-06, + "loss": 0.7768, + "step": 3279 + }, + { + "epoch": 0.18052727172656724, + "grad_norm": 0.7567458748817444, + "learning_rate": 9.811195126141881e-06, + "loss": 0.7329, + "step": 3280 + }, + { + "epoch": 0.1805823105289229, + "grad_norm": 0.7187278866767883, + "learning_rate": 9.811077115886552e-06, + "loss": 0.6511, + "step": 3281 + }, + { + "epoch": 0.18063734933127856, + "grad_norm": 0.7641230821609497, + "learning_rate": 9.810959069472452e-06, + "loss": 0.7704, + "step": 3282 + }, + { + "epoch": 0.18069238813363422, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.810840986900474e-06, + "loss": 0.8142, + "step": 3283 + }, + { + "epoch": 0.18074742693598989, + "grad_norm": 0.8102816343307495, + "learning_rate": 9.810722868171502e-06, + "loss": 0.765, + "step": 3284 + }, + { + "epoch": 0.18080246573834555, + "grad_norm": 0.7251957058906555, + "learning_rate": 9.810604713286424e-06, + "loss": 0.7836, + "step": 3285 + }, + { + "epoch": 0.18085750454070118, + "grad_norm": 0.845348060131073, + "learning_rate": 9.81048652224613e-06, + "loss": 0.8386, + "step": 3286 + }, + { + "epoch": 0.18091254334305684, + "grad_norm": 0.8397864103317261, + "learning_rate": 9.810368295051507e-06, + "loss": 0.805, + "step": 3287 + }, + { + "epoch": 0.1809675821454125, + "grad_norm": 1.0739909410476685, + "learning_rate": 9.810250031703444e-06, + "loss": 0.8735, + "step": 3288 + }, + { + "epoch": 0.18102262094776816, + "grad_norm": 0.752091646194458, + "learning_rate": 9.810131732202826e-06, + "loss": 0.7814, + "step": 3289 + }, + { + "epoch": 0.18107765975012383, + "grad_norm": 0.7826841473579407, + "learning_rate": 9.810013396550548e-06, + "loss": 0.7761, + "step": 3290 + }, + { + "epoch": 0.1811326985524795, + "grad_norm": 0.6979131102561951, + "learning_rate": 9.809895024747498e-06, + "loss": 0.672, + "step": 3291 + }, + { + "epoch": 0.18118773735483515, + "grad_norm": 0.8571394085884094, + "learning_rate": 9.809776616794562e-06, + "loss": 0.8795, + "step": 3292 + }, + { + "epoch": 0.1812427761571908, + "grad_norm": 0.8287902474403381, + "learning_rate": 9.809658172692634e-06, + "loss": 0.9032, + "step": 3293 + }, + { + "epoch": 0.18129781495954647, + "grad_norm": 0.7884420156478882, + "learning_rate": 9.809539692442602e-06, + "loss": 0.87, + "step": 3294 + }, + { + "epoch": 0.18135285376190213, + "grad_norm": 0.8955305218696594, + "learning_rate": 9.809421176045358e-06, + "loss": 0.7982, + "step": 3295 + }, + { + "epoch": 0.1814078925642578, + "grad_norm": 0.7893335819244385, + "learning_rate": 9.809302623501791e-06, + "loss": 0.7792, + "step": 3296 + }, + { + "epoch": 0.18146293136661346, + "grad_norm": 0.8077870011329651, + "learning_rate": 9.809184034812794e-06, + "loss": 0.829, + "step": 3297 + }, + { + "epoch": 0.18151797016896912, + "grad_norm": 0.8282631635665894, + "learning_rate": 9.809065409979256e-06, + "loss": 0.8502, + "step": 3298 + }, + { + "epoch": 0.18157300897132478, + "grad_norm": 0.7988418936729431, + "learning_rate": 9.808946749002068e-06, + "loss": 0.7853, + "step": 3299 + }, + { + "epoch": 0.18162804777368044, + "grad_norm": 0.7776056528091431, + "learning_rate": 9.808828051882127e-06, + "loss": 0.7843, + "step": 3300 + }, + { + "epoch": 0.1816830865760361, + "grad_norm": 0.8772258758544922, + "learning_rate": 9.80870931862032e-06, + "loss": 0.896, + "step": 3301 + }, + { + "epoch": 0.18173812537839176, + "grad_norm": 0.8080328702926636, + "learning_rate": 9.80859054921754e-06, + "loss": 0.8464, + "step": 3302 + }, + { + "epoch": 0.18179316418074742, + "grad_norm": 0.862707257270813, + "learning_rate": 9.808471743674682e-06, + "loss": 0.8732, + "step": 3303 + }, + { + "epoch": 0.18184820298310309, + "grad_norm": 1.1964820623397827, + "learning_rate": 9.808352901992637e-06, + "loss": 0.9911, + "step": 3304 + }, + { + "epoch": 0.18190324178545875, + "grad_norm": 0.8597685694694519, + "learning_rate": 9.808234024172298e-06, + "loss": 0.8724, + "step": 3305 + }, + { + "epoch": 0.1819582805878144, + "grad_norm": 0.8068556189537048, + "learning_rate": 9.80811511021456e-06, + "loss": 0.8116, + "step": 3306 + }, + { + "epoch": 0.18201331939017007, + "grad_norm": 1.0014268159866333, + "learning_rate": 9.807996160120317e-06, + "loss": 0.8585, + "step": 3307 + }, + { + "epoch": 0.18206835819252573, + "grad_norm": 0.8541132807731628, + "learning_rate": 9.80787717389046e-06, + "loss": 0.8505, + "step": 3308 + }, + { + "epoch": 0.1821233969948814, + "grad_norm": 0.7973629832267761, + "learning_rate": 9.807758151525886e-06, + "loss": 0.8312, + "step": 3309 + }, + { + "epoch": 0.18217843579723705, + "grad_norm": 0.82973712682724, + "learning_rate": 9.807639093027488e-06, + "loss": 0.894, + "step": 3310 + }, + { + "epoch": 0.18223347459959272, + "grad_norm": 0.7729674577713013, + "learning_rate": 9.807519998396162e-06, + "loss": 0.7459, + "step": 3311 + }, + { + "epoch": 0.18228851340194838, + "grad_norm": 0.8106189370155334, + "learning_rate": 9.807400867632804e-06, + "loss": 0.914, + "step": 3312 + }, + { + "epoch": 0.18234355220430404, + "grad_norm": 0.7672377228736877, + "learning_rate": 9.807281700738305e-06, + "loss": 0.8475, + "step": 3313 + }, + { + "epoch": 0.1823985910066597, + "grad_norm": 0.8776688575744629, + "learning_rate": 9.807162497713566e-06, + "loss": 0.7641, + "step": 3314 + }, + { + "epoch": 0.18245362980901536, + "grad_norm": 0.8781917691230774, + "learning_rate": 9.807043258559479e-06, + "loss": 0.86, + "step": 3315 + }, + { + "epoch": 0.18250866861137102, + "grad_norm": 0.819362998008728, + "learning_rate": 9.806923983276942e-06, + "loss": 0.8829, + "step": 3316 + }, + { + "epoch": 0.18256370741372668, + "grad_norm": 0.8065270185470581, + "learning_rate": 9.80680467186685e-06, + "loss": 0.7335, + "step": 3317 + }, + { + "epoch": 0.18261874621608234, + "grad_norm": 0.8692485690116882, + "learning_rate": 9.806685324330102e-06, + "loss": 0.8582, + "step": 3318 + }, + { + "epoch": 0.182673785018438, + "grad_norm": 0.7910160422325134, + "learning_rate": 9.806565940667594e-06, + "loss": 0.8569, + "step": 3319 + }, + { + "epoch": 0.18272882382079367, + "grad_norm": 0.8282253742218018, + "learning_rate": 9.806446520880225e-06, + "loss": 0.7791, + "step": 3320 + }, + { + "epoch": 0.18278386262314933, + "grad_norm": 0.7513861060142517, + "learning_rate": 9.806327064968887e-06, + "loss": 0.7287, + "step": 3321 + }, + { + "epoch": 0.182838901425505, + "grad_norm": 0.8141188621520996, + "learning_rate": 9.806207572934483e-06, + "loss": 0.7772, + "step": 3322 + }, + { + "epoch": 0.18289394022786065, + "grad_norm": 0.7963125705718994, + "learning_rate": 9.806088044777909e-06, + "loss": 0.7993, + "step": 3323 + }, + { + "epoch": 0.1829489790302163, + "grad_norm": 0.8527218103408813, + "learning_rate": 9.805968480500063e-06, + "loss": 0.822, + "step": 3324 + }, + { + "epoch": 0.18300401783257197, + "grad_norm": 0.822467565536499, + "learning_rate": 9.805848880101845e-06, + "loss": 0.8606, + "step": 3325 + }, + { + "epoch": 0.18305905663492764, + "grad_norm": 0.8197154402732849, + "learning_rate": 9.805729243584154e-06, + "loss": 0.9004, + "step": 3326 + }, + { + "epoch": 0.1831140954372833, + "grad_norm": 0.8379594683647156, + "learning_rate": 9.805609570947887e-06, + "loss": 0.8467, + "step": 3327 + }, + { + "epoch": 0.18316913423963896, + "grad_norm": 0.7787355184555054, + "learning_rate": 9.805489862193947e-06, + "loss": 0.8221, + "step": 3328 + }, + { + "epoch": 0.1832241730419946, + "grad_norm": 0.8464100956916809, + "learning_rate": 9.80537011732323e-06, + "loss": 0.7722, + "step": 3329 + }, + { + "epoch": 0.18327921184435025, + "grad_norm": 0.8351306319236755, + "learning_rate": 9.805250336336637e-06, + "loss": 0.7638, + "step": 3330 + }, + { + "epoch": 0.18333425064670592, + "grad_norm": 0.8098864555358887, + "learning_rate": 9.805130519235068e-06, + "loss": 0.8448, + "step": 3331 + }, + { + "epoch": 0.18338928944906158, + "grad_norm": 0.8290563821792603, + "learning_rate": 9.805010666019427e-06, + "loss": 0.6574, + "step": 3332 + }, + { + "epoch": 0.18344432825141724, + "grad_norm": 0.7748262882232666, + "learning_rate": 9.804890776690611e-06, + "loss": 0.8002, + "step": 3333 + }, + { + "epoch": 0.1834993670537729, + "grad_norm": 0.8422787189483643, + "learning_rate": 9.80477085124952e-06, + "loss": 0.8452, + "step": 3334 + }, + { + "epoch": 0.18355440585612856, + "grad_norm": 0.7776510119438171, + "learning_rate": 9.804650889697061e-06, + "loss": 0.8774, + "step": 3335 + }, + { + "epoch": 0.18360944465848422, + "grad_norm": 0.8449370861053467, + "learning_rate": 9.80453089203413e-06, + "loss": 0.8233, + "step": 3336 + }, + { + "epoch": 0.18366448346083988, + "grad_norm": 0.8254217505455017, + "learning_rate": 9.804410858261632e-06, + "loss": 0.8778, + "step": 3337 + }, + { + "epoch": 0.18371952226319554, + "grad_norm": 0.8673515915870667, + "learning_rate": 9.804290788380466e-06, + "loss": 0.8005, + "step": 3338 + }, + { + "epoch": 0.1837745610655512, + "grad_norm": 0.8106067776679993, + "learning_rate": 9.804170682391538e-06, + "loss": 0.86, + "step": 3339 + }, + { + "epoch": 0.18382959986790687, + "grad_norm": 0.8211669325828552, + "learning_rate": 9.804050540295749e-06, + "loss": 0.8013, + "step": 3340 + }, + { + "epoch": 0.18388463867026253, + "grad_norm": 0.7866180539131165, + "learning_rate": 9.803930362094003e-06, + "loss": 0.8108, + "step": 3341 + }, + { + "epoch": 0.1839396774726182, + "grad_norm": 0.8192055225372314, + "learning_rate": 9.8038101477872e-06, + "loss": 0.7586, + "step": 3342 + }, + { + "epoch": 0.18399471627497385, + "grad_norm": 0.940910279750824, + "learning_rate": 9.803689897376248e-06, + "loss": 0.8174, + "step": 3343 + }, + { + "epoch": 0.1840497550773295, + "grad_norm": 0.7979292869567871, + "learning_rate": 9.803569610862048e-06, + "loss": 0.8341, + "step": 3344 + }, + { + "epoch": 0.18410479387968517, + "grad_norm": 0.7577546238899231, + "learning_rate": 9.803449288245504e-06, + "loss": 0.7775, + "step": 3345 + }, + { + "epoch": 0.18415983268204084, + "grad_norm": 0.7255160212516785, + "learning_rate": 9.80332892952752e-06, + "loss": 0.7648, + "step": 3346 + }, + { + "epoch": 0.1842148714843965, + "grad_norm": 0.8269388675689697, + "learning_rate": 9.803208534709004e-06, + "loss": 0.8902, + "step": 3347 + }, + { + "epoch": 0.18426991028675216, + "grad_norm": 0.783867359161377, + "learning_rate": 9.803088103790857e-06, + "loss": 0.8191, + "step": 3348 + }, + { + "epoch": 0.18432494908910782, + "grad_norm": 0.7658863663673401, + "learning_rate": 9.802967636773986e-06, + "loss": 0.7505, + "step": 3349 + }, + { + "epoch": 0.18437998789146348, + "grad_norm": 0.701225757598877, + "learning_rate": 9.802847133659294e-06, + "loss": 0.7159, + "step": 3350 + }, + { + "epoch": 0.18443502669381914, + "grad_norm": 0.9224311709403992, + "learning_rate": 9.802726594447692e-06, + "loss": 0.7766, + "step": 3351 + }, + { + "epoch": 0.1844900654961748, + "grad_norm": 0.8835979700088501, + "learning_rate": 9.80260601914008e-06, + "loss": 0.9304, + "step": 3352 + }, + { + "epoch": 0.18454510429853047, + "grad_norm": 0.7918481826782227, + "learning_rate": 9.802485407737368e-06, + "loss": 0.7691, + "step": 3353 + }, + { + "epoch": 0.18460014310088613, + "grad_norm": 0.8855286240577698, + "learning_rate": 9.80236476024046e-06, + "loss": 0.9213, + "step": 3354 + }, + { + "epoch": 0.1846551819032418, + "grad_norm": 0.7863314747810364, + "learning_rate": 9.802244076650264e-06, + "loss": 0.7675, + "step": 3355 + }, + { + "epoch": 0.18471022070559745, + "grad_norm": 0.8230198621749878, + "learning_rate": 9.802123356967687e-06, + "loss": 0.7243, + "step": 3356 + }, + { + "epoch": 0.1847652595079531, + "grad_norm": 0.8038737773895264, + "learning_rate": 9.80200260119364e-06, + "loss": 0.8094, + "step": 3357 + }, + { + "epoch": 0.18482029831030877, + "grad_norm": 0.7656993269920349, + "learning_rate": 9.801881809329022e-06, + "loss": 0.7736, + "step": 3358 + }, + { + "epoch": 0.18487533711266443, + "grad_norm": 0.8222082853317261, + "learning_rate": 9.801760981374747e-06, + "loss": 0.844, + "step": 3359 + }, + { + "epoch": 0.1849303759150201, + "grad_norm": 0.7632889747619629, + "learning_rate": 9.801640117331723e-06, + "loss": 0.8354, + "step": 3360 + }, + { + "epoch": 0.18498541471737576, + "grad_norm": 0.8308513760566711, + "learning_rate": 9.801519217200857e-06, + "loss": 0.8277, + "step": 3361 + }, + { + "epoch": 0.18504045351973142, + "grad_norm": 0.7865434885025024, + "learning_rate": 9.801398280983057e-06, + "loss": 0.8614, + "step": 3362 + }, + { + "epoch": 0.18509549232208708, + "grad_norm": 0.7249410152435303, + "learning_rate": 9.801277308679232e-06, + "loss": 0.7259, + "step": 3363 + }, + { + "epoch": 0.18515053112444274, + "grad_norm": 0.7604461908340454, + "learning_rate": 9.801156300290293e-06, + "loss": 0.8507, + "step": 3364 + }, + { + "epoch": 0.1852055699267984, + "grad_norm": 0.8725959062576294, + "learning_rate": 9.801035255817149e-06, + "loss": 0.7688, + "step": 3365 + }, + { + "epoch": 0.18526060872915406, + "grad_norm": 0.7798827290534973, + "learning_rate": 9.800914175260708e-06, + "loss": 0.8788, + "step": 3366 + }, + { + "epoch": 0.18531564753150973, + "grad_norm": 0.7060996890068054, + "learning_rate": 9.800793058621882e-06, + "loss": 0.8183, + "step": 3367 + }, + { + "epoch": 0.1853706863338654, + "grad_norm": 0.7558063268661499, + "learning_rate": 9.80067190590158e-06, + "loss": 0.7834, + "step": 3368 + }, + { + "epoch": 0.18542572513622105, + "grad_norm": 0.7411057353019714, + "learning_rate": 9.800550717100714e-06, + "loss": 0.8298, + "step": 3369 + }, + { + "epoch": 0.1854807639385767, + "grad_norm": 0.8466144800186157, + "learning_rate": 9.800429492220193e-06, + "loss": 0.8297, + "step": 3370 + }, + { + "epoch": 0.18553580274093237, + "grad_norm": 0.7302330136299133, + "learning_rate": 9.800308231260928e-06, + "loss": 0.72, + "step": 3371 + }, + { + "epoch": 0.185590841543288, + "grad_norm": 0.8140530586242676, + "learning_rate": 9.800186934223832e-06, + "loss": 0.9287, + "step": 3372 + }, + { + "epoch": 0.18564588034564367, + "grad_norm": 0.8246129751205444, + "learning_rate": 9.800065601109817e-06, + "loss": 0.7891, + "step": 3373 + }, + { + "epoch": 0.18570091914799933, + "grad_norm": 0.8746623396873474, + "learning_rate": 9.799944231919794e-06, + "loss": 0.8549, + "step": 3374 + }, + { + "epoch": 0.185755957950355, + "grad_norm": 0.9977195858955383, + "learning_rate": 9.799822826654672e-06, + "loss": 0.821, + "step": 3375 + }, + { + "epoch": 0.18581099675271065, + "grad_norm": 0.8937395811080933, + "learning_rate": 9.79970138531537e-06, + "loss": 0.8639, + "step": 3376 + }, + { + "epoch": 0.1858660355550663, + "grad_norm": 1.039695143699646, + "learning_rate": 9.799579907902794e-06, + "loss": 1.0425, + "step": 3377 + }, + { + "epoch": 0.18592107435742197, + "grad_norm": 0.7847749590873718, + "learning_rate": 9.799458394417863e-06, + "loss": 0.8505, + "step": 3378 + }, + { + "epoch": 0.18597611315977763, + "grad_norm": 0.760334312915802, + "learning_rate": 9.799336844861486e-06, + "loss": 0.7418, + "step": 3379 + }, + { + "epoch": 0.1860311519621333, + "grad_norm": 0.7599604725837708, + "learning_rate": 9.799215259234578e-06, + "loss": 0.8305, + "step": 3380 + }, + { + "epoch": 0.18608619076448896, + "grad_norm": 0.846767246723175, + "learning_rate": 9.799093637538054e-06, + "loss": 0.7526, + "step": 3381 + }, + { + "epoch": 0.18614122956684462, + "grad_norm": 0.7840956449508667, + "learning_rate": 9.798971979772825e-06, + "loss": 0.8009, + "step": 3382 + }, + { + "epoch": 0.18619626836920028, + "grad_norm": 0.7826499342918396, + "learning_rate": 9.798850285939809e-06, + "loss": 0.821, + "step": 3383 + }, + { + "epoch": 0.18625130717155594, + "grad_norm": 0.7829813361167908, + "learning_rate": 9.798728556039918e-06, + "loss": 0.8053, + "step": 3384 + }, + { + "epoch": 0.1863063459739116, + "grad_norm": 0.7267470359802246, + "learning_rate": 9.798606790074067e-06, + "loss": 0.6797, + "step": 3385 + }, + { + "epoch": 0.18636138477626726, + "grad_norm": 0.8560196757316589, + "learning_rate": 9.798484988043173e-06, + "loss": 0.8476, + "step": 3386 + }, + { + "epoch": 0.18641642357862293, + "grad_norm": 0.7920921444892883, + "learning_rate": 9.798363149948148e-06, + "loss": 0.8832, + "step": 3387 + }, + { + "epoch": 0.1864714623809786, + "grad_norm": 0.8414384126663208, + "learning_rate": 9.798241275789912e-06, + "loss": 0.8607, + "step": 3388 + }, + { + "epoch": 0.18652650118333425, + "grad_norm": 0.7255431413650513, + "learning_rate": 9.798119365569378e-06, + "loss": 0.6426, + "step": 3389 + }, + { + "epoch": 0.1865815399856899, + "grad_norm": 0.8842852711677551, + "learning_rate": 9.797997419287465e-06, + "loss": 0.9058, + "step": 3390 + }, + { + "epoch": 0.18663657878804557, + "grad_norm": 0.7178265452384949, + "learning_rate": 9.797875436945086e-06, + "loss": 0.8134, + "step": 3391 + }, + { + "epoch": 0.18669161759040123, + "grad_norm": 0.7275096774101257, + "learning_rate": 9.797753418543161e-06, + "loss": 0.6858, + "step": 3392 + }, + { + "epoch": 0.1867466563927569, + "grad_norm": 0.7587800025939941, + "learning_rate": 9.797631364082605e-06, + "loss": 0.7437, + "step": 3393 + }, + { + "epoch": 0.18680169519511255, + "grad_norm": 0.9769744873046875, + "learning_rate": 9.797509273564336e-06, + "loss": 0.8024, + "step": 3394 + }, + { + "epoch": 0.18685673399746822, + "grad_norm": 0.7662433385848999, + "learning_rate": 9.79738714698927e-06, + "loss": 0.8122, + "step": 3395 + }, + { + "epoch": 0.18691177279982388, + "grad_norm": 0.8620306849479675, + "learning_rate": 9.797264984358328e-06, + "loss": 0.7952, + "step": 3396 + }, + { + "epoch": 0.18696681160217954, + "grad_norm": 0.7542591094970703, + "learning_rate": 9.797142785672427e-06, + "loss": 0.8315, + "step": 3397 + }, + { + "epoch": 0.1870218504045352, + "grad_norm": 0.7273713946342468, + "learning_rate": 9.797020550932483e-06, + "loss": 0.7316, + "step": 3398 + }, + { + "epoch": 0.18707688920689086, + "grad_norm": 1.031592845916748, + "learning_rate": 9.796898280139417e-06, + "loss": 0.7478, + "step": 3399 + }, + { + "epoch": 0.18713192800924652, + "grad_norm": 0.791407585144043, + "learning_rate": 9.796775973294147e-06, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 0.18718696681160218, + "grad_norm": 0.8311418294906616, + "learning_rate": 9.796653630397595e-06, + "loss": 0.8182, + "step": 3401 + }, + { + "epoch": 0.18724200561395785, + "grad_norm": 0.7960993051528931, + "learning_rate": 9.796531251450678e-06, + "loss": 0.7606, + "step": 3402 + }, + { + "epoch": 0.1872970444163135, + "grad_norm": 0.8671618103981018, + "learning_rate": 9.796408836454316e-06, + "loss": 0.7136, + "step": 3403 + }, + { + "epoch": 0.18735208321866917, + "grad_norm": 1.1071348190307617, + "learning_rate": 9.796286385409428e-06, + "loss": 0.7729, + "step": 3404 + }, + { + "epoch": 0.18740712202102483, + "grad_norm": 0.738217294216156, + "learning_rate": 9.796163898316935e-06, + "loss": 0.7425, + "step": 3405 + }, + { + "epoch": 0.1874621608233805, + "grad_norm": 0.7567199468612671, + "learning_rate": 9.796041375177758e-06, + "loss": 0.8442, + "step": 3406 + }, + { + "epoch": 0.18751719962573615, + "grad_norm": 0.7942413091659546, + "learning_rate": 9.79591881599282e-06, + "loss": 0.852, + "step": 3407 + }, + { + "epoch": 0.18757223842809181, + "grad_norm": 0.7529355883598328, + "learning_rate": 9.795796220763038e-06, + "loss": 0.8086, + "step": 3408 + }, + { + "epoch": 0.18762727723044748, + "grad_norm": 0.7645192742347717, + "learning_rate": 9.795673589489337e-06, + "loss": 0.831, + "step": 3409 + }, + { + "epoch": 0.18768231603280314, + "grad_norm": 0.694791853427887, + "learning_rate": 9.795550922172635e-06, + "loss": 0.6919, + "step": 3410 + }, + { + "epoch": 0.1877373548351588, + "grad_norm": 0.7041944265365601, + "learning_rate": 9.795428218813858e-06, + "loss": 0.7284, + "step": 3411 + }, + { + "epoch": 0.18779239363751446, + "grad_norm": 0.8972276449203491, + "learning_rate": 9.795305479413924e-06, + "loss": 0.7156, + "step": 3412 + }, + { + "epoch": 0.18784743243987012, + "grad_norm": 0.9730873107910156, + "learning_rate": 9.795182703973758e-06, + "loss": 0.8739, + "step": 3413 + }, + { + "epoch": 0.18790247124222578, + "grad_norm": 0.8137956261634827, + "learning_rate": 9.795059892494283e-06, + "loss": 0.8189, + "step": 3414 + }, + { + "epoch": 0.18795751004458142, + "grad_norm": 0.8171416521072388, + "learning_rate": 9.794937044976422e-06, + "loss": 0.9449, + "step": 3415 + }, + { + "epoch": 0.18801254884693708, + "grad_norm": 0.7929911017417908, + "learning_rate": 9.794814161421098e-06, + "loss": 0.8034, + "step": 3416 + }, + { + "epoch": 0.18806758764929274, + "grad_norm": 1.1045749187469482, + "learning_rate": 9.794691241829233e-06, + "loss": 0.875, + "step": 3417 + }, + { + "epoch": 0.1881226264516484, + "grad_norm": 0.8141040205955505, + "learning_rate": 9.794568286201752e-06, + "loss": 0.787, + "step": 3418 + }, + { + "epoch": 0.18817766525400406, + "grad_norm": 0.7615541815757751, + "learning_rate": 9.79444529453958e-06, + "loss": 0.8491, + "step": 3419 + }, + { + "epoch": 0.18823270405635972, + "grad_norm": 0.848419189453125, + "learning_rate": 9.79432226684364e-06, + "loss": 0.7445, + "step": 3420 + }, + { + "epoch": 0.18828774285871538, + "grad_norm": 0.8075067400932312, + "learning_rate": 9.794199203114858e-06, + "loss": 0.6581, + "step": 3421 + }, + { + "epoch": 0.18834278166107105, + "grad_norm": 0.8473401069641113, + "learning_rate": 9.794076103354158e-06, + "loss": 0.839, + "step": 3422 + }, + { + "epoch": 0.1883978204634267, + "grad_norm": 0.8211609721183777, + "learning_rate": 9.793952967562463e-06, + "loss": 0.7709, + "step": 3423 + }, + { + "epoch": 0.18845285926578237, + "grad_norm": 0.7527804374694824, + "learning_rate": 9.793829795740703e-06, + "loss": 0.7315, + "step": 3424 + }, + { + "epoch": 0.18850789806813803, + "grad_norm": 0.7971188426017761, + "learning_rate": 9.793706587889802e-06, + "loss": 0.7507, + "step": 3425 + }, + { + "epoch": 0.1885629368704937, + "grad_norm": 1.024066686630249, + "learning_rate": 9.793583344010684e-06, + "loss": 0.9043, + "step": 3426 + }, + { + "epoch": 0.18861797567284935, + "grad_norm": 0.7428625226020813, + "learning_rate": 9.793460064104276e-06, + "loss": 0.7435, + "step": 3427 + }, + { + "epoch": 0.18867301447520501, + "grad_norm": 0.8438264727592468, + "learning_rate": 9.793336748171507e-06, + "loss": 0.8618, + "step": 3428 + }, + { + "epoch": 0.18872805327756068, + "grad_norm": 0.7846877574920654, + "learning_rate": 9.793213396213302e-06, + "loss": 0.8064, + "step": 3429 + }, + { + "epoch": 0.18878309207991634, + "grad_norm": 0.7527204751968384, + "learning_rate": 9.793090008230587e-06, + "loss": 0.7596, + "step": 3430 + }, + { + "epoch": 0.188838130882272, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.792966584224292e-06, + "loss": 0.8292, + "step": 3431 + }, + { + "epoch": 0.18889316968462766, + "grad_norm": 0.8128102421760559, + "learning_rate": 9.792843124195343e-06, + "loss": 0.8073, + "step": 3432 + }, + { + "epoch": 0.18894820848698332, + "grad_norm": 0.7668742537498474, + "learning_rate": 9.792719628144667e-06, + "loss": 0.7848, + "step": 3433 + }, + { + "epoch": 0.18900324728933898, + "grad_norm": 1.8663485050201416, + "learning_rate": 9.792596096073193e-06, + "loss": 0.9388, + "step": 3434 + }, + { + "epoch": 0.18905828609169464, + "grad_norm": 0.8066239356994629, + "learning_rate": 9.792472527981852e-06, + "loss": 0.6647, + "step": 3435 + }, + { + "epoch": 0.1891133248940503, + "grad_norm": 0.8268817067146301, + "learning_rate": 9.792348923871567e-06, + "loss": 0.9676, + "step": 3436 + }, + { + "epoch": 0.18916836369640597, + "grad_norm": 0.7165037393569946, + "learning_rate": 9.792225283743272e-06, + "loss": 0.6937, + "step": 3437 + }, + { + "epoch": 0.18922340249876163, + "grad_norm": 0.7850403785705566, + "learning_rate": 9.792101607597895e-06, + "loss": 0.7782, + "step": 3438 + }, + { + "epoch": 0.1892784413011173, + "grad_norm": 0.8839808702468872, + "learning_rate": 9.791977895436365e-06, + "loss": 0.7639, + "step": 3439 + }, + { + "epoch": 0.18933348010347295, + "grad_norm": 0.8260362148284912, + "learning_rate": 9.791854147259611e-06, + "loss": 0.8201, + "step": 3440 + }, + { + "epoch": 0.1893885189058286, + "grad_norm": 0.8792916536331177, + "learning_rate": 9.791730363068564e-06, + "loss": 0.8251, + "step": 3441 + }, + { + "epoch": 0.18944355770818427, + "grad_norm": 0.8192774653434753, + "learning_rate": 9.791606542864154e-06, + "loss": 0.7944, + "step": 3442 + }, + { + "epoch": 0.18949859651053994, + "grad_norm": 0.751470685005188, + "learning_rate": 9.791482686647313e-06, + "loss": 0.7563, + "step": 3443 + }, + { + "epoch": 0.1895536353128956, + "grad_norm": 0.8902072906494141, + "learning_rate": 9.79135879441897e-06, + "loss": 0.7719, + "step": 3444 + }, + { + "epoch": 0.18960867411525126, + "grad_norm": 0.7166435122489929, + "learning_rate": 9.791234866180058e-06, + "loss": 0.7871, + "step": 3445 + }, + { + "epoch": 0.18966371291760692, + "grad_norm": 0.763416588306427, + "learning_rate": 9.791110901931505e-06, + "loss": 0.8226, + "step": 3446 + }, + { + "epoch": 0.18971875171996258, + "grad_norm": 0.806633472442627, + "learning_rate": 9.790986901674246e-06, + "loss": 0.7828, + "step": 3447 + }, + { + "epoch": 0.18977379052231824, + "grad_norm": 0.8139312863349915, + "learning_rate": 9.790862865409213e-06, + "loss": 0.8441, + "step": 3448 + }, + { + "epoch": 0.1898288293246739, + "grad_norm": 0.8362452387809753, + "learning_rate": 9.790738793137335e-06, + "loss": 0.8765, + "step": 3449 + }, + { + "epoch": 0.18988386812702956, + "grad_norm": 0.7736263871192932, + "learning_rate": 9.790614684859549e-06, + "loss": 0.8373, + "step": 3450 + }, + { + "epoch": 0.18993890692938523, + "grad_norm": 0.8742800354957581, + "learning_rate": 9.790490540576784e-06, + "loss": 0.8976, + "step": 3451 + }, + { + "epoch": 0.1899939457317409, + "grad_norm": 0.701505720615387, + "learning_rate": 9.790366360289974e-06, + "loss": 0.7799, + "step": 3452 + }, + { + "epoch": 0.19004898453409655, + "grad_norm": 0.7771356701850891, + "learning_rate": 9.790242144000055e-06, + "loss": 0.7617, + "step": 3453 + }, + { + "epoch": 0.1901040233364522, + "grad_norm": 0.897576093673706, + "learning_rate": 9.790117891707955e-06, + "loss": 0.7817, + "step": 3454 + }, + { + "epoch": 0.19015906213880787, + "grad_norm": 0.7296561002731323, + "learning_rate": 9.789993603414613e-06, + "loss": 0.8344, + "step": 3455 + }, + { + "epoch": 0.19021410094116353, + "grad_norm": 0.8099396228790283, + "learning_rate": 9.789869279120962e-06, + "loss": 0.7369, + "step": 3456 + }, + { + "epoch": 0.1902691397435192, + "grad_norm": 0.7802554368972778, + "learning_rate": 9.789744918827935e-06, + "loss": 0.8383, + "step": 3457 + }, + { + "epoch": 0.19032417854587483, + "grad_norm": 0.7508029341697693, + "learning_rate": 9.789620522536467e-06, + "loss": 0.825, + "step": 3458 + }, + { + "epoch": 0.1903792173482305, + "grad_norm": 0.7782164216041565, + "learning_rate": 9.789496090247494e-06, + "loss": 0.7737, + "step": 3459 + }, + { + "epoch": 0.19043425615058615, + "grad_norm": 0.7711489796638489, + "learning_rate": 9.78937162196195e-06, + "loss": 0.7694, + "step": 3460 + }, + { + "epoch": 0.1904892949529418, + "grad_norm": 0.821579098701477, + "learning_rate": 9.789247117680769e-06, + "loss": 0.7493, + "step": 3461 + }, + { + "epoch": 0.19054433375529747, + "grad_norm": 0.6700833439826965, + "learning_rate": 9.789122577404892e-06, + "loss": 0.7696, + "step": 3462 + }, + { + "epoch": 0.19059937255765314, + "grad_norm": 0.854340136051178, + "learning_rate": 9.78899800113525e-06, + "loss": 0.9503, + "step": 3463 + }, + { + "epoch": 0.1906544113600088, + "grad_norm": 0.8095537424087524, + "learning_rate": 9.78887338887278e-06, + "loss": 0.8435, + "step": 3464 + }, + { + "epoch": 0.19070945016236446, + "grad_norm": 0.8156480193138123, + "learning_rate": 9.78874874061842e-06, + "loss": 0.8561, + "step": 3465 + }, + { + "epoch": 0.19076448896472012, + "grad_norm": 0.8065482378005981, + "learning_rate": 9.788624056373108e-06, + "loss": 0.7793, + "step": 3466 + }, + { + "epoch": 0.19081952776707578, + "grad_norm": 0.789601743221283, + "learning_rate": 9.788499336137778e-06, + "loss": 0.7523, + "step": 3467 + }, + { + "epoch": 0.19087456656943144, + "grad_norm": 0.8322301506996155, + "learning_rate": 9.788374579913369e-06, + "loss": 0.9034, + "step": 3468 + }, + { + "epoch": 0.1909296053717871, + "grad_norm": 0.8194506764411926, + "learning_rate": 9.788249787700818e-06, + "loss": 0.8601, + "step": 3469 + }, + { + "epoch": 0.19098464417414276, + "grad_norm": 0.8419962525367737, + "learning_rate": 9.788124959501065e-06, + "loss": 0.869, + "step": 3470 + }, + { + "epoch": 0.19103968297649843, + "grad_norm": 0.760637104511261, + "learning_rate": 9.788000095315044e-06, + "loss": 0.7293, + "step": 3471 + }, + { + "epoch": 0.1910947217788541, + "grad_norm": 1.3964574337005615, + "learning_rate": 9.787875195143697e-06, + "loss": 0.8032, + "step": 3472 + }, + { + "epoch": 0.19114976058120975, + "grad_norm": 0.8205012679100037, + "learning_rate": 9.787750258987962e-06, + "loss": 0.8868, + "step": 3473 + }, + { + "epoch": 0.1912047993835654, + "grad_norm": 0.8183104991912842, + "learning_rate": 9.78762528684878e-06, + "loss": 0.7531, + "step": 3474 + }, + { + "epoch": 0.19125983818592107, + "grad_norm": 0.7659775018692017, + "learning_rate": 9.787500278727083e-06, + "loss": 0.8081, + "step": 3475 + }, + { + "epoch": 0.19131487698827673, + "grad_norm": 0.8262091279029846, + "learning_rate": 9.787375234623819e-06, + "loss": 0.82, + "step": 3476 + }, + { + "epoch": 0.1913699157906324, + "grad_norm": 0.857761025428772, + "learning_rate": 9.787250154539923e-06, + "loss": 0.9133, + "step": 3477 + }, + { + "epoch": 0.19142495459298806, + "grad_norm": 0.7551915645599365, + "learning_rate": 9.787125038476334e-06, + "loss": 0.7822, + "step": 3478 + }, + { + "epoch": 0.19147999339534372, + "grad_norm": 0.7777357697486877, + "learning_rate": 9.786999886433998e-06, + "loss": 0.7676, + "step": 3479 + }, + { + "epoch": 0.19153503219769938, + "grad_norm": 0.8389080166816711, + "learning_rate": 9.786874698413852e-06, + "loss": 0.7901, + "step": 3480 + }, + { + "epoch": 0.19159007100005504, + "grad_norm": 0.7894837856292725, + "learning_rate": 9.786749474416836e-06, + "loss": 0.8393, + "step": 3481 + }, + { + "epoch": 0.1916451098024107, + "grad_norm": 1.9752860069274902, + "learning_rate": 9.786624214443893e-06, + "loss": 0.7611, + "step": 3482 + }, + { + "epoch": 0.19170014860476636, + "grad_norm": 0.8023802042007446, + "learning_rate": 9.786498918495963e-06, + "loss": 0.8426, + "step": 3483 + }, + { + "epoch": 0.19175518740712202, + "grad_norm": 0.7232086658477783, + "learning_rate": 9.78637358657399e-06, + "loss": 0.6611, + "step": 3484 + }, + { + "epoch": 0.19181022620947769, + "grad_norm": 0.8198665380477905, + "learning_rate": 9.786248218678912e-06, + "loss": 0.8795, + "step": 3485 + }, + { + "epoch": 0.19186526501183335, + "grad_norm": 0.942404568195343, + "learning_rate": 9.786122814811675e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.191920303814189, + "grad_norm": 0.7602691054344177, + "learning_rate": 9.78599737497322e-06, + "loss": 0.7514, + "step": 3487 + }, + { + "epoch": 0.19197534261654467, + "grad_norm": 0.7981933951377869, + "learning_rate": 9.785871899164489e-06, + "loss": 0.7722, + "step": 3488 + }, + { + "epoch": 0.19203038141890033, + "grad_norm": 0.8617631793022156, + "learning_rate": 9.785746387386427e-06, + "loss": 0.8989, + "step": 3489 + }, + { + "epoch": 0.192085420221256, + "grad_norm": 0.7691803574562073, + "learning_rate": 9.785620839639976e-06, + "loss": 0.7929, + "step": 3490 + }, + { + "epoch": 0.19214045902361165, + "grad_norm": 1.3053189516067505, + "learning_rate": 9.785495255926078e-06, + "loss": 0.8478, + "step": 3491 + }, + { + "epoch": 0.19219549782596732, + "grad_norm": 0.807064950466156, + "learning_rate": 9.785369636245681e-06, + "loss": 0.7452, + "step": 3492 + }, + { + "epoch": 0.19225053662832298, + "grad_norm": 0.8182778358459473, + "learning_rate": 9.785243980599726e-06, + "loss": 0.8371, + "step": 3493 + }, + { + "epoch": 0.19230557543067864, + "grad_norm": 0.7654449343681335, + "learning_rate": 9.785118288989157e-06, + "loss": 0.8321, + "step": 3494 + }, + { + "epoch": 0.1923606142330343, + "grad_norm": 0.7192448973655701, + "learning_rate": 9.784992561414922e-06, + "loss": 0.7451, + "step": 3495 + }, + { + "epoch": 0.19241565303538996, + "grad_norm": 0.8639407753944397, + "learning_rate": 9.784866797877964e-06, + "loss": 0.9272, + "step": 3496 + }, + { + "epoch": 0.19247069183774562, + "grad_norm": 0.8329927921295166, + "learning_rate": 9.784740998379225e-06, + "loss": 0.8034, + "step": 3497 + }, + { + "epoch": 0.19252573064010128, + "grad_norm": 0.7975476980209351, + "learning_rate": 9.784615162919656e-06, + "loss": 0.6885, + "step": 3498 + }, + { + "epoch": 0.19258076944245694, + "grad_norm": 0.8077559471130371, + "learning_rate": 9.7844892915002e-06, + "loss": 0.8745, + "step": 3499 + }, + { + "epoch": 0.1926358082448126, + "grad_norm": 0.7957825660705566, + "learning_rate": 9.7843633841218e-06, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.19269084704716824, + "grad_norm": 0.8478250503540039, + "learning_rate": 9.784237440785408e-06, + "loss": 0.8675, + "step": 3501 + }, + { + "epoch": 0.1927458858495239, + "grad_norm": 0.7289726138114929, + "learning_rate": 9.78411146149197e-06, + "loss": 0.7126, + "step": 3502 + }, + { + "epoch": 0.19280092465187956, + "grad_norm": 0.7608509063720703, + "learning_rate": 9.783985446242427e-06, + "loss": 0.7049, + "step": 3503 + }, + { + "epoch": 0.19285596345423522, + "grad_norm": 0.8985201120376587, + "learning_rate": 9.783859395037733e-06, + "loss": 0.8067, + "step": 3504 + }, + { + "epoch": 0.19291100225659089, + "grad_norm": 0.7563273906707764, + "learning_rate": 9.78373330787883e-06, + "loss": 0.7018, + "step": 3505 + }, + { + "epoch": 0.19296604105894655, + "grad_norm": 0.8022900223731995, + "learning_rate": 9.78360718476667e-06, + "loss": 0.8346, + "step": 3506 + }, + { + "epoch": 0.1930210798613022, + "grad_norm": 0.897566020488739, + "learning_rate": 9.783481025702197e-06, + "loss": 0.9465, + "step": 3507 + }, + { + "epoch": 0.19307611866365787, + "grad_norm": 0.9550303220748901, + "learning_rate": 9.783354830686363e-06, + "loss": 0.8904, + "step": 3508 + }, + { + "epoch": 0.19313115746601353, + "grad_norm": 0.8152582049369812, + "learning_rate": 9.783228599720114e-06, + "loss": 0.7776, + "step": 3509 + }, + { + "epoch": 0.1931861962683692, + "grad_norm": 0.7421940565109253, + "learning_rate": 9.783102332804398e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.19324123507072485, + "grad_norm": 0.7414368391036987, + "learning_rate": 9.782976029940167e-06, + "loss": 0.8435, + "step": 3511 + }, + { + "epoch": 0.19329627387308052, + "grad_norm": 0.7845529317855835, + "learning_rate": 9.782849691128366e-06, + "loss": 0.8255, + "step": 3512 + }, + { + "epoch": 0.19335131267543618, + "grad_norm": 0.7779788970947266, + "learning_rate": 9.78272331636995e-06, + "loss": 0.7801, + "step": 3513 + }, + { + "epoch": 0.19340635147779184, + "grad_norm": 0.7537885904312134, + "learning_rate": 9.782596905665865e-06, + "loss": 0.7501, + "step": 3514 + }, + { + "epoch": 0.1934613902801475, + "grad_norm": 0.7585812211036682, + "learning_rate": 9.782470459017059e-06, + "loss": 0.8425, + "step": 3515 + }, + { + "epoch": 0.19351642908250316, + "grad_norm": 0.7923589944839478, + "learning_rate": 9.78234397642449e-06, + "loss": 0.8412, + "step": 3516 + }, + { + "epoch": 0.19357146788485882, + "grad_norm": 0.8710628151893616, + "learning_rate": 9.7822174578891e-06, + "loss": 0.8014, + "step": 3517 + }, + { + "epoch": 0.19362650668721448, + "grad_norm": 0.7646920084953308, + "learning_rate": 9.782090903411845e-06, + "loss": 0.8256, + "step": 3518 + }, + { + "epoch": 0.19368154548957014, + "grad_norm": 0.7560480833053589, + "learning_rate": 9.781964312993675e-06, + "loss": 0.7816, + "step": 3519 + }, + { + "epoch": 0.1937365842919258, + "grad_norm": 0.7438123226165771, + "learning_rate": 9.78183768663554e-06, + "loss": 0.8319, + "step": 3520 + }, + { + "epoch": 0.19379162309428147, + "grad_norm": 0.7239874601364136, + "learning_rate": 9.781711024338394e-06, + "loss": 0.6968, + "step": 3521 + }, + { + "epoch": 0.19384666189663713, + "grad_norm": 0.881197988986969, + "learning_rate": 9.781584326103188e-06, + "loss": 0.9493, + "step": 3522 + }, + { + "epoch": 0.1939017006989928, + "grad_norm": 0.7903854846954346, + "learning_rate": 9.781457591930874e-06, + "loss": 0.8312, + "step": 3523 + }, + { + "epoch": 0.19395673950134845, + "grad_norm": 0.7375456094741821, + "learning_rate": 9.781330821822405e-06, + "loss": 0.7434, + "step": 3524 + }, + { + "epoch": 0.1940117783037041, + "grad_norm": 0.7101724743843079, + "learning_rate": 9.781204015778733e-06, + "loss": 0.75, + "step": 3525 + }, + { + "epoch": 0.19406681710605977, + "grad_norm": 0.8267471194267273, + "learning_rate": 9.781077173800812e-06, + "loss": 0.8807, + "step": 3526 + }, + { + "epoch": 0.19412185590841544, + "grad_norm": 0.9014178514480591, + "learning_rate": 9.780950295889594e-06, + "loss": 0.7836, + "step": 3527 + }, + { + "epoch": 0.1941768947107711, + "grad_norm": 0.7579739689826965, + "learning_rate": 9.780823382046034e-06, + "loss": 0.8331, + "step": 3528 + }, + { + "epoch": 0.19423193351312676, + "grad_norm": 0.8308925032615662, + "learning_rate": 9.780696432271084e-06, + "loss": 0.794, + "step": 3529 + }, + { + "epoch": 0.19428697231548242, + "grad_norm": 0.7461574673652649, + "learning_rate": 9.780569446565701e-06, + "loss": 0.8155, + "step": 3530 + }, + { + "epoch": 0.19434201111783808, + "grad_norm": 0.8658885359764099, + "learning_rate": 9.780442424930836e-06, + "loss": 0.7907, + "step": 3531 + }, + { + "epoch": 0.19439704992019374, + "grad_norm": 0.7243279218673706, + "learning_rate": 9.780315367367449e-06, + "loss": 0.7985, + "step": 3532 + }, + { + "epoch": 0.1944520887225494, + "grad_norm": 0.8482224345207214, + "learning_rate": 9.780188273876486e-06, + "loss": 0.9095, + "step": 3533 + }, + { + "epoch": 0.19450712752490507, + "grad_norm": 0.8675364255905151, + "learning_rate": 9.78006114445891e-06, + "loss": 0.759, + "step": 3534 + }, + { + "epoch": 0.19456216632726073, + "grad_norm": 0.8388474583625793, + "learning_rate": 9.779933979115675e-06, + "loss": 0.8331, + "step": 3535 + }, + { + "epoch": 0.1946172051296164, + "grad_norm": 0.8050872683525085, + "learning_rate": 9.779806777847735e-06, + "loss": 0.861, + "step": 3536 + }, + { + "epoch": 0.19467224393197205, + "grad_norm": 0.8401390910148621, + "learning_rate": 9.779679540656046e-06, + "loss": 0.755, + "step": 3537 + }, + { + "epoch": 0.1947272827343277, + "grad_norm": 0.865160346031189, + "learning_rate": 9.779552267541566e-06, + "loss": 0.7515, + "step": 3538 + }, + { + "epoch": 0.19478232153668337, + "grad_norm": 0.923086941242218, + "learning_rate": 9.77942495850525e-06, + "loss": 0.8032, + "step": 3539 + }, + { + "epoch": 0.19483736033903903, + "grad_norm": 0.8402467966079712, + "learning_rate": 9.779297613548056e-06, + "loss": 0.9198, + "step": 3540 + }, + { + "epoch": 0.1948923991413947, + "grad_norm": 0.7875306606292725, + "learning_rate": 9.779170232670939e-06, + "loss": 0.712, + "step": 3541 + }, + { + "epoch": 0.19494743794375036, + "grad_norm": 0.7996379137039185, + "learning_rate": 9.779042815874858e-06, + "loss": 0.8126, + "step": 3542 + }, + { + "epoch": 0.19500247674610602, + "grad_norm": 0.7644525766372681, + "learning_rate": 9.778915363160773e-06, + "loss": 0.8602, + "step": 3543 + }, + { + "epoch": 0.19505751554846165, + "grad_norm": 0.8068630695343018, + "learning_rate": 9.778787874529635e-06, + "loss": 0.736, + "step": 3544 + }, + { + "epoch": 0.1951125543508173, + "grad_norm": 0.7889519929885864, + "learning_rate": 9.77866034998241e-06, + "loss": 0.755, + "step": 3545 + }, + { + "epoch": 0.19516759315317297, + "grad_norm": 0.7895978689193726, + "learning_rate": 9.778532789520053e-06, + "loss": 0.8213, + "step": 3546 + }, + { + "epoch": 0.19522263195552864, + "grad_norm": 0.8571796417236328, + "learning_rate": 9.77840519314352e-06, + "loss": 0.8193, + "step": 3547 + }, + { + "epoch": 0.1952776707578843, + "grad_norm": 0.6880007982254028, + "learning_rate": 9.778277560853775e-06, + "loss": 0.6354, + "step": 3548 + }, + { + "epoch": 0.19533270956023996, + "grad_norm": 0.8155353665351868, + "learning_rate": 9.778149892651775e-06, + "loss": 0.8518, + "step": 3549 + }, + { + "epoch": 0.19538774836259562, + "grad_norm": 0.851021945476532, + "learning_rate": 9.778022188538479e-06, + "loss": 0.8506, + "step": 3550 + }, + { + "epoch": 0.19544278716495128, + "grad_norm": 0.8910510540008545, + "learning_rate": 9.777894448514847e-06, + "loss": 0.8825, + "step": 3551 + }, + { + "epoch": 0.19549782596730694, + "grad_norm": 0.8156018853187561, + "learning_rate": 9.777766672581838e-06, + "loss": 0.8262, + "step": 3552 + }, + { + "epoch": 0.1955528647696626, + "grad_norm": 0.756340503692627, + "learning_rate": 9.777638860740415e-06, + "loss": 0.7094, + "step": 3553 + }, + { + "epoch": 0.19560790357201827, + "grad_norm": 0.7604243159294128, + "learning_rate": 9.777511012991538e-06, + "loss": 0.8089, + "step": 3554 + }, + { + "epoch": 0.19566294237437393, + "grad_norm": 0.7609277963638306, + "learning_rate": 9.777383129336167e-06, + "loss": 0.7853, + "step": 3555 + }, + { + "epoch": 0.1957179811767296, + "grad_norm": 1.3562177419662476, + "learning_rate": 9.77725520977526e-06, + "loss": 0.7051, + "step": 3556 + }, + { + "epoch": 0.19577301997908525, + "grad_norm": 0.7428582310676575, + "learning_rate": 9.777127254309784e-06, + "loss": 0.734, + "step": 3557 + }, + { + "epoch": 0.1958280587814409, + "grad_norm": 0.6941032409667969, + "learning_rate": 9.776999262940698e-06, + "loss": 0.7862, + "step": 3558 + }, + { + "epoch": 0.19588309758379657, + "grad_norm": 0.8249906301498413, + "learning_rate": 9.776871235668966e-06, + "loss": 0.8324, + "step": 3559 + }, + { + "epoch": 0.19593813638615223, + "grad_norm": 0.6778795719146729, + "learning_rate": 9.776743172495546e-06, + "loss": 0.743, + "step": 3560 + }, + { + "epoch": 0.1959931751885079, + "grad_norm": 0.8454411625862122, + "learning_rate": 9.776615073421405e-06, + "loss": 0.8625, + "step": 3561 + }, + { + "epoch": 0.19604821399086356, + "grad_norm": 0.8303809762001038, + "learning_rate": 9.776486938447503e-06, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.19610325279321922, + "grad_norm": 0.8814080357551575, + "learning_rate": 9.776358767574803e-06, + "loss": 0.9096, + "step": 3563 + }, + { + "epoch": 0.19615829159557488, + "grad_norm": 0.7860022187232971, + "learning_rate": 9.77623056080427e-06, + "loss": 0.8101, + "step": 3564 + }, + { + "epoch": 0.19621333039793054, + "grad_norm": 0.7604898810386658, + "learning_rate": 9.776102318136866e-06, + "loss": 0.8121, + "step": 3565 + }, + { + "epoch": 0.1962683692002862, + "grad_norm": 0.810708224773407, + "learning_rate": 9.775974039573555e-06, + "loss": 0.8334, + "step": 3566 + }, + { + "epoch": 0.19632340800264186, + "grad_norm": 1.0174707174301147, + "learning_rate": 9.775845725115301e-06, + "loss": 0.8147, + "step": 3567 + }, + { + "epoch": 0.19637844680499753, + "grad_norm": 0.825137734413147, + "learning_rate": 9.77571737476307e-06, + "loss": 0.816, + "step": 3568 + }, + { + "epoch": 0.1964334856073532, + "grad_norm": 0.9023691415786743, + "learning_rate": 9.775588988517826e-06, + "loss": 0.9157, + "step": 3569 + }, + { + "epoch": 0.19648852440970885, + "grad_norm": 0.7287655472755432, + "learning_rate": 9.775460566380534e-06, + "loss": 0.7414, + "step": 3570 + }, + { + "epoch": 0.1965435632120645, + "grad_norm": 0.8675361275672913, + "learning_rate": 9.775332108352158e-06, + "loss": 0.7212, + "step": 3571 + }, + { + "epoch": 0.19659860201442017, + "grad_norm": 0.8633139729499817, + "learning_rate": 9.775203614433664e-06, + "loss": 0.7254, + "step": 3572 + }, + { + "epoch": 0.19665364081677583, + "grad_norm": 0.8628275394439697, + "learning_rate": 9.775075084626017e-06, + "loss": 0.7403, + "step": 3573 + }, + { + "epoch": 0.1967086796191315, + "grad_norm": 0.86918044090271, + "learning_rate": 9.774946518930184e-06, + "loss": 0.8208, + "step": 3574 + }, + { + "epoch": 0.19676371842148715, + "grad_norm": 1.3616218566894531, + "learning_rate": 9.774817917347132e-06, + "loss": 0.7432, + "step": 3575 + }, + { + "epoch": 0.19681875722384282, + "grad_norm": 0.929084062576294, + "learning_rate": 9.774689279877827e-06, + "loss": 0.9567, + "step": 3576 + }, + { + "epoch": 0.19687379602619848, + "grad_norm": 0.7732542753219604, + "learning_rate": 9.774560606523234e-06, + "loss": 0.8682, + "step": 3577 + }, + { + "epoch": 0.19692883482855414, + "grad_norm": 0.7933471202850342, + "learning_rate": 9.774431897284323e-06, + "loss": 0.7112, + "step": 3578 + }, + { + "epoch": 0.1969838736309098, + "grad_norm": 0.8229583501815796, + "learning_rate": 9.77430315216206e-06, + "loss": 0.762, + "step": 3579 + }, + { + "epoch": 0.19703891243326546, + "grad_norm": 0.7571341395378113, + "learning_rate": 9.774174371157412e-06, + "loss": 0.7627, + "step": 3580 + }, + { + "epoch": 0.19709395123562112, + "grad_norm": 1.1551839113235474, + "learning_rate": 9.774045554271347e-06, + "loss": 0.8621, + "step": 3581 + }, + { + "epoch": 0.19714899003797678, + "grad_norm": 0.8546237349510193, + "learning_rate": 9.773916701504833e-06, + "loss": 0.8183, + "step": 3582 + }, + { + "epoch": 0.19720402884033245, + "grad_norm": 0.7297555804252625, + "learning_rate": 9.773787812858841e-06, + "loss": 0.8098, + "step": 3583 + }, + { + "epoch": 0.1972590676426881, + "grad_norm": 0.7846053838729858, + "learning_rate": 9.773658888334336e-06, + "loss": 0.7874, + "step": 3584 + }, + { + "epoch": 0.19731410644504377, + "grad_norm": 0.8949562907218933, + "learning_rate": 9.773529927932288e-06, + "loss": 0.8651, + "step": 3585 + }, + { + "epoch": 0.19736914524739943, + "grad_norm": 0.8041829466819763, + "learning_rate": 9.773400931653668e-06, + "loss": 0.7519, + "step": 3586 + }, + { + "epoch": 0.19742418404975506, + "grad_norm": 0.8090983033180237, + "learning_rate": 9.773271899499444e-06, + "loss": 0.8606, + "step": 3587 + }, + { + "epoch": 0.19747922285211073, + "grad_norm": 0.7954100966453552, + "learning_rate": 9.773142831470587e-06, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.1975342616544664, + "grad_norm": 0.6865562796592712, + "learning_rate": 9.773013727568066e-06, + "loss": 0.7323, + "step": 3589 + }, + { + "epoch": 0.19758930045682205, + "grad_norm": 0.9144858717918396, + "learning_rate": 9.772884587792851e-06, + "loss": 0.8178, + "step": 3590 + }, + { + "epoch": 0.1976443392591777, + "grad_norm": 0.8096563220024109, + "learning_rate": 9.772755412145913e-06, + "loss": 0.7749, + "step": 3591 + }, + { + "epoch": 0.19769937806153337, + "grad_norm": 1.4496957063674927, + "learning_rate": 9.772626200628222e-06, + "loss": 0.7981, + "step": 3592 + }, + { + "epoch": 0.19775441686388903, + "grad_norm": 0.7699438333511353, + "learning_rate": 9.77249695324075e-06, + "loss": 0.7683, + "step": 3593 + }, + { + "epoch": 0.1978094556662447, + "grad_norm": 0.7883017063140869, + "learning_rate": 9.77236766998447e-06, + "loss": 0.7668, + "step": 3594 + }, + { + "epoch": 0.19786449446860035, + "grad_norm": 0.7552568912506104, + "learning_rate": 9.772238350860352e-06, + "loss": 0.7914, + "step": 3595 + }, + { + "epoch": 0.19791953327095602, + "grad_norm": 0.8585009574890137, + "learning_rate": 9.772108995869366e-06, + "loss": 0.9888, + "step": 3596 + }, + { + "epoch": 0.19797457207331168, + "grad_norm": 0.9459839463233948, + "learning_rate": 9.77197960501249e-06, + "loss": 0.9923, + "step": 3597 + }, + { + "epoch": 0.19802961087566734, + "grad_norm": 0.844771683216095, + "learning_rate": 9.77185017829069e-06, + "loss": 0.8427, + "step": 3598 + }, + { + "epoch": 0.198084649678023, + "grad_norm": 0.749700665473938, + "learning_rate": 9.77172071570494e-06, + "loss": 0.8111, + "step": 3599 + }, + { + "epoch": 0.19813968848037866, + "grad_norm": 0.7297450304031372, + "learning_rate": 9.771591217256216e-06, + "loss": 0.7783, + "step": 3600 + }, + { + "epoch": 0.19819472728273432, + "grad_norm": 0.7928450703620911, + "learning_rate": 9.77146168294549e-06, + "loss": 0.8755, + "step": 3601 + }, + { + "epoch": 0.19824976608508998, + "grad_norm": 0.7236143946647644, + "learning_rate": 9.771332112773734e-06, + "loss": 0.7159, + "step": 3602 + }, + { + "epoch": 0.19830480488744565, + "grad_norm": 0.8170965313911438, + "learning_rate": 9.771202506741926e-06, + "loss": 0.9093, + "step": 3603 + }, + { + "epoch": 0.1983598436898013, + "grad_norm": 0.8834578990936279, + "learning_rate": 9.771072864851035e-06, + "loss": 0.8961, + "step": 3604 + }, + { + "epoch": 0.19841488249215697, + "grad_norm": 1.3750289678573608, + "learning_rate": 9.770943187102037e-06, + "loss": 0.8175, + "step": 3605 + }, + { + "epoch": 0.19846992129451263, + "grad_norm": 0.7016286253929138, + "learning_rate": 9.770813473495909e-06, + "loss": 0.7171, + "step": 3606 + }, + { + "epoch": 0.1985249600968683, + "grad_norm": 0.7792307734489441, + "learning_rate": 9.770683724033622e-06, + "loss": 0.6892, + "step": 3607 + }, + { + "epoch": 0.19857999889922395, + "grad_norm": 0.789820671081543, + "learning_rate": 9.770553938716153e-06, + "loss": 0.8531, + "step": 3608 + }, + { + "epoch": 0.19863503770157961, + "grad_norm": 0.7585997581481934, + "learning_rate": 9.77042411754448e-06, + "loss": 0.8195, + "step": 3609 + }, + { + "epoch": 0.19869007650393528, + "grad_norm": 0.8989273905754089, + "learning_rate": 9.770294260519573e-06, + "loss": 0.891, + "step": 3610 + }, + { + "epoch": 0.19874511530629094, + "grad_norm": 0.8044012188911438, + "learning_rate": 9.770164367642414e-06, + "loss": 0.8428, + "step": 3611 + }, + { + "epoch": 0.1988001541086466, + "grad_norm": 0.7847021222114563, + "learning_rate": 9.770034438913975e-06, + "loss": 0.8302, + "step": 3612 + }, + { + "epoch": 0.19885519291100226, + "grad_norm": 0.9260531663894653, + "learning_rate": 9.769904474335234e-06, + "loss": 0.8187, + "step": 3613 + }, + { + "epoch": 0.19891023171335792, + "grad_norm": 0.7491805553436279, + "learning_rate": 9.769774473907168e-06, + "loss": 0.8374, + "step": 3614 + }, + { + "epoch": 0.19896527051571358, + "grad_norm": 1.1665992736816406, + "learning_rate": 9.769644437630754e-06, + "loss": 0.8154, + "step": 3615 + }, + { + "epoch": 0.19902030931806924, + "grad_norm": 0.9162279963493347, + "learning_rate": 9.769514365506968e-06, + "loss": 0.8883, + "step": 3616 + }, + { + "epoch": 0.1990753481204249, + "grad_norm": 0.8980437517166138, + "learning_rate": 9.769384257536791e-06, + "loss": 0.8948, + "step": 3617 + }, + { + "epoch": 0.19913038692278057, + "grad_norm": 0.7544137835502625, + "learning_rate": 9.769254113721197e-06, + "loss": 0.7763, + "step": 3618 + }, + { + "epoch": 0.19918542572513623, + "grad_norm": 0.8393334746360779, + "learning_rate": 9.769123934061168e-06, + "loss": 0.8361, + "step": 3619 + }, + { + "epoch": 0.1992404645274919, + "grad_norm": 0.8184031248092651, + "learning_rate": 9.768993718557678e-06, + "loss": 0.8104, + "step": 3620 + }, + { + "epoch": 0.19929550332984755, + "grad_norm": 0.8023706674575806, + "learning_rate": 9.76886346721171e-06, + "loss": 0.7824, + "step": 3621 + }, + { + "epoch": 0.1993505421322032, + "grad_norm": 0.9354264736175537, + "learning_rate": 9.768733180024238e-06, + "loss": 0.7782, + "step": 3622 + }, + { + "epoch": 0.19940558093455887, + "grad_norm": 0.7037177681922913, + "learning_rate": 9.768602856996244e-06, + "loss": 0.8054, + "step": 3623 + }, + { + "epoch": 0.19946061973691454, + "grad_norm": 0.7926928997039795, + "learning_rate": 9.768472498128709e-06, + "loss": 0.8864, + "step": 3624 + }, + { + "epoch": 0.1995156585392702, + "grad_norm": 0.7963769435882568, + "learning_rate": 9.76834210342261e-06, + "loss": 0.8505, + "step": 3625 + }, + { + "epoch": 0.19957069734162586, + "grad_norm": 0.8553926944732666, + "learning_rate": 9.768211672878929e-06, + "loss": 0.8519, + "step": 3626 + }, + { + "epoch": 0.19962573614398152, + "grad_norm": 0.8147156834602356, + "learning_rate": 9.768081206498644e-06, + "loss": 0.8091, + "step": 3627 + }, + { + "epoch": 0.19968077494633718, + "grad_norm": 0.8226443529129028, + "learning_rate": 9.767950704282739e-06, + "loss": 0.8561, + "step": 3628 + }, + { + "epoch": 0.19973581374869284, + "grad_norm": 0.7246909141540527, + "learning_rate": 9.76782016623219e-06, + "loss": 0.7318, + "step": 3629 + }, + { + "epoch": 0.19979085255104848, + "grad_norm": 1.0527293682098389, + "learning_rate": 9.767689592347983e-06, + "loss": 0.7699, + "step": 3630 + }, + { + "epoch": 0.19984589135340414, + "grad_norm": 0.7433847188949585, + "learning_rate": 9.767558982631097e-06, + "loss": 0.8619, + "step": 3631 + }, + { + "epoch": 0.1999009301557598, + "grad_norm": 0.7901468873023987, + "learning_rate": 9.767428337082513e-06, + "loss": 0.8365, + "step": 3632 + }, + { + "epoch": 0.19995596895811546, + "grad_norm": 0.7766845226287842, + "learning_rate": 9.767297655703215e-06, + "loss": 0.7767, + "step": 3633 + }, + { + "epoch": 0.20001100776047112, + "grad_norm": 0.7785109281539917, + "learning_rate": 9.767166938494183e-06, + "loss": 0.7114, + "step": 3634 + }, + { + "epoch": 0.20006604656282678, + "grad_norm": 0.8068187832832336, + "learning_rate": 9.767036185456402e-06, + "loss": 0.8142, + "step": 3635 + }, + { + "epoch": 0.20012108536518244, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.766905396590851e-06, + "loss": 0.8658, + "step": 3636 } ], "logging_steps": 1, @@ -19115,7 +25478,7 @@ "attributes": {} } }, - "total_flos": 8.047544142365983e+18, + "total_flos": 1.0730058856487977e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null