{ "best_global_step": 650, "best_metric": 0.3949255049228668, "best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-600", "epoch": 2.0, "eval_steps": 50, "global_step": 686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029170464904284413, "grad_norm": 1.1577509641647339, "learning_rate": 0.0, "loss": 0.9893555045127869, "step": 1 }, { "epoch": 0.005834092980856883, "grad_norm": 0.9491796493530273, "learning_rate": 2.8985507246376816e-07, "loss": 0.8791205883026123, "step": 2 }, { "epoch": 0.008751139471285323, "grad_norm": 1.1600768566131592, "learning_rate": 5.797101449275363e-07, "loss": 0.9858248233795166, "step": 3 }, { "epoch": 0.011668185961713765, "grad_norm": 1.2298306226730347, "learning_rate": 8.695652173913044e-07, "loss": 1.0516364574432373, "step": 4 }, { "epoch": 0.014585232452142206, "grad_norm": 0.9520533680915833, "learning_rate": 1.1594202898550726e-06, "loss": 0.8392249345779419, "step": 5 }, { "epoch": 0.017502278942570646, "grad_norm": 1.2451188564300537, "learning_rate": 1.4492753623188408e-06, "loss": 1.0955077409744263, "step": 6 }, { "epoch": 0.02041932543299909, "grad_norm": 1.1123991012573242, "learning_rate": 1.7391304347826088e-06, "loss": 0.9201866388320923, "step": 7 }, { "epoch": 0.02333637192342753, "grad_norm": 0.9283139705657959, "learning_rate": 2.028985507246377e-06, "loss": 0.9770950078964233, "step": 8 }, { "epoch": 0.02625341841385597, "grad_norm": 0.9589216113090515, "learning_rate": 2.3188405797101453e-06, "loss": 0.9442565441131592, "step": 9 }, { "epoch": 0.02917046490428441, "grad_norm": 0.8866703510284424, "learning_rate": 2.6086956521739132e-06, "loss": 0.9354464411735535, "step": 10 }, { "epoch": 0.03208751139471285, "grad_norm": 0.7191241383552551, "learning_rate": 2.8985507246376816e-06, "loss": 0.7659736275672913, "step": 11 }, { "epoch": 0.03500455788514129, "grad_norm": 0.9110142588615417, "learning_rate": 3.188405797101449e-06, "loss": 0.9319326877593994, "step": 12 }, { "epoch": 0.03792160437556973, "grad_norm": 0.8754057288169861, "learning_rate": 3.4782608695652175e-06, "loss": 0.9819356203079224, "step": 13 }, { "epoch": 0.04083865086599818, "grad_norm": 0.896181046962738, "learning_rate": 3.768115942028986e-06, "loss": 1.026316523551941, "step": 14 }, { "epoch": 0.04375569735642662, "grad_norm": 0.6104832887649536, "learning_rate": 4.057971014492754e-06, "loss": 0.8427562713623047, "step": 15 }, { "epoch": 0.04667274384685506, "grad_norm": 0.6529208421707153, "learning_rate": 4.347826086956522e-06, "loss": 0.8496565222740173, "step": 16 }, { "epoch": 0.0495897903372835, "grad_norm": 0.6319335699081421, "learning_rate": 4.637681159420291e-06, "loss": 0.9139047861099243, "step": 17 }, { "epoch": 0.05250683682771194, "grad_norm": 0.7458649277687073, "learning_rate": 4.927536231884059e-06, "loss": 0.8867442011833191, "step": 18 }, { "epoch": 0.05542388331814038, "grad_norm": 0.6179773211479187, "learning_rate": 5.2173913043478265e-06, "loss": 0.9579408168792725, "step": 19 }, { "epoch": 0.05834092980856882, "grad_norm": 0.794481635093689, "learning_rate": 5.507246376811595e-06, "loss": 0.8736554980278015, "step": 20 }, { "epoch": 0.06125797629899726, "grad_norm": 0.8356145620346069, "learning_rate": 5.797101449275363e-06, "loss": 0.9358762502670288, "step": 21 }, { "epoch": 0.0641750227894257, "grad_norm": 0.5891932845115662, "learning_rate": 6.086956521739132e-06, "loss": 0.8972038626670837, "step": 22 }, { "epoch": 0.06709206927985414, "grad_norm": 0.6931268572807312, "learning_rate": 6.376811594202898e-06, "loss": 0.9583507776260376, "step": 23 }, { "epoch": 0.07000911577028258, "grad_norm": 0.7298229336738586, "learning_rate": 6.666666666666667e-06, "loss": 0.8119489550590515, "step": 24 }, { "epoch": 0.07292616226071102, "grad_norm": 0.6419956684112549, "learning_rate": 6.956521739130435e-06, "loss": 0.9386100769042969, "step": 25 }, { "epoch": 0.07584320875113947, "grad_norm": 0.7508338689804077, "learning_rate": 7.246376811594203e-06, "loss": 0.9272583723068237, "step": 26 }, { "epoch": 0.0787602552415679, "grad_norm": 0.5848079919815063, "learning_rate": 7.536231884057972e-06, "loss": 0.8967856168746948, "step": 27 }, { "epoch": 0.08167730173199636, "grad_norm": 0.7384837865829468, "learning_rate": 7.82608695652174e-06, "loss": 0.8696568012237549, "step": 28 }, { "epoch": 0.0845943482224248, "grad_norm": 0.5069604516029358, "learning_rate": 8.115942028985508e-06, "loss": 0.9121193885803223, "step": 29 }, { "epoch": 0.08751139471285324, "grad_norm": 0.833165168762207, "learning_rate": 8.405797101449275e-06, "loss": 0.8180589079856873, "step": 30 }, { "epoch": 0.09042844120328168, "grad_norm": 0.6355920433998108, "learning_rate": 8.695652173913044e-06, "loss": 0.8640957474708557, "step": 31 }, { "epoch": 0.09334548769371012, "grad_norm": 1.0429315567016602, "learning_rate": 8.985507246376812e-06, "loss": 0.9517915844917297, "step": 32 }, { "epoch": 0.09626253418413856, "grad_norm": 0.5875154733657837, "learning_rate": 9.275362318840581e-06, "loss": 0.9443603754043579, "step": 33 }, { "epoch": 0.099179580674567, "grad_norm": 1.9913769960403442, "learning_rate": 9.565217391304349e-06, "loss": 0.9510866403579712, "step": 34 }, { "epoch": 0.10209662716499544, "grad_norm": 0.5310097932815552, "learning_rate": 9.855072463768118e-06, "loss": 0.8653419613838196, "step": 35 }, { "epoch": 0.10501367365542388, "grad_norm": 0.624421238899231, "learning_rate": 1.0144927536231885e-05, "loss": 0.7941208481788635, "step": 36 }, { "epoch": 0.10793072014585232, "grad_norm": 0.6314200758934021, "learning_rate": 1.0434782608695653e-05, "loss": 0.8931174278259277, "step": 37 }, { "epoch": 0.11084776663628076, "grad_norm": 0.6272342205047607, "learning_rate": 1.0724637681159422e-05, "loss": 0.8978185057640076, "step": 38 }, { "epoch": 0.1137648131267092, "grad_norm": 0.5711184740066528, "learning_rate": 1.101449275362319e-05, "loss": 0.808263897895813, "step": 39 }, { "epoch": 0.11668185961713765, "grad_norm": 0.7581208944320679, "learning_rate": 1.1304347826086957e-05, "loss": 0.7456756830215454, "step": 40 }, { "epoch": 0.11959890610756609, "grad_norm": 0.4989977180957794, "learning_rate": 1.1594202898550726e-05, "loss": 0.8273333311080933, "step": 41 }, { "epoch": 0.12251595259799453, "grad_norm": 0.8602972626686096, "learning_rate": 1.1884057971014494e-05, "loss": 0.8514784574508667, "step": 42 }, { "epoch": 0.12543299908842298, "grad_norm": 0.6918581128120422, "learning_rate": 1.2173913043478263e-05, "loss": 0.8182265162467957, "step": 43 }, { "epoch": 0.1283500455788514, "grad_norm": 0.653099536895752, "learning_rate": 1.2463768115942029e-05, "loss": 0.8242791891098022, "step": 44 }, { "epoch": 0.13126709206927986, "grad_norm": 0.7485584616661072, "learning_rate": 1.2753623188405797e-05, "loss": 0.8229591250419617, "step": 45 }, { "epoch": 0.1341841385597083, "grad_norm": 0.6724833250045776, "learning_rate": 1.3043478260869566e-05, "loss": 0.8146833181381226, "step": 46 }, { "epoch": 0.13710118505013674, "grad_norm": 0.857208251953125, "learning_rate": 1.3333333333333333e-05, "loss": 0.8154427409172058, "step": 47 }, { "epoch": 0.14001823154056517, "grad_norm": 0.5559669137001038, "learning_rate": 1.3623188405797103e-05, "loss": 0.879005491733551, "step": 48 }, { "epoch": 0.14293527803099362, "grad_norm": 0.5910897850990295, "learning_rate": 1.391304347826087e-05, "loss": 0.8148283362388611, "step": 49 }, { "epoch": 0.14585232452142205, "grad_norm": 0.6478891372680664, "learning_rate": 1.420289855072464e-05, "loss": 0.8293006420135498, "step": 50 }, { "epoch": 0.14585232452142205, "eval_loss": 0.7892261147499084, "eval_runtime": 973.2157, "eval_samples_per_second": 0.649, "eval_steps_per_second": 0.649, "step": 50 }, { "epoch": 0.1487693710118505, "grad_norm": 0.757882833480835, "learning_rate": 1.4492753623188407e-05, "loss": 0.8114852905273438, "step": 51 }, { "epoch": 0.15168641750227893, "grad_norm": 0.8496116995811462, "learning_rate": 1.4782608695652174e-05, "loss": 0.7886185050010681, "step": 52 }, { "epoch": 0.15460346399270739, "grad_norm": 0.6078857183456421, "learning_rate": 1.5072463768115944e-05, "loss": 0.7298170924186707, "step": 53 }, { "epoch": 0.1575205104831358, "grad_norm": 0.5856835246086121, "learning_rate": 1.536231884057971e-05, "loss": 0.7407160997390747, "step": 54 }, { "epoch": 0.16043755697356427, "grad_norm": 1.0533701181411743, "learning_rate": 1.565217391304348e-05, "loss": 0.7057831287384033, "step": 55 }, { "epoch": 0.16335460346399272, "grad_norm": 0.8087610006332397, "learning_rate": 1.5942028985507246e-05, "loss": 0.7409019470214844, "step": 56 }, { "epoch": 0.16627164995442115, "grad_norm": 0.629945695400238, "learning_rate": 1.6231884057971015e-05, "loss": 0.7768293023109436, "step": 57 }, { "epoch": 0.1691886964448496, "grad_norm": 0.5187911987304688, "learning_rate": 1.6521739130434785e-05, "loss": 0.825718104839325, "step": 58 }, { "epoch": 0.17210574293527803, "grad_norm": 0.5866358280181885, "learning_rate": 1.681159420289855e-05, "loss": 0.8575979471206665, "step": 59 }, { "epoch": 0.17502278942570648, "grad_norm": 1.5098934173583984, "learning_rate": 1.710144927536232e-05, "loss": 0.8058848977088928, "step": 60 }, { "epoch": 0.1779398359161349, "grad_norm": 0.6981958150863647, "learning_rate": 1.739130434782609e-05, "loss": 0.7640778422355652, "step": 61 }, { "epoch": 0.18085688240656336, "grad_norm": 0.631349503993988, "learning_rate": 1.7681159420289858e-05, "loss": 0.7896331548690796, "step": 62 }, { "epoch": 0.1837739288969918, "grad_norm": 0.6930747032165527, "learning_rate": 1.7971014492753624e-05, "loss": 0.6762524247169495, "step": 63 }, { "epoch": 0.18669097538742024, "grad_norm": 0.599399209022522, "learning_rate": 1.8260869565217393e-05, "loss": 0.7285035848617554, "step": 64 }, { "epoch": 0.18960802187784867, "grad_norm": 0.6194344758987427, "learning_rate": 1.8550724637681162e-05, "loss": 0.7682523131370544, "step": 65 }, { "epoch": 0.19252506836827712, "grad_norm": 0.5691342949867249, "learning_rate": 1.8840579710144928e-05, "loss": 0.6791993379592896, "step": 66 }, { "epoch": 0.19544211485870555, "grad_norm": 0.6257390379905701, "learning_rate": 1.9130434782608697e-05, "loss": 0.6744828224182129, "step": 67 }, { "epoch": 0.198359161349134, "grad_norm": 0.5871018767356873, "learning_rate": 1.9420289855072467e-05, "loss": 0.7317330837249756, "step": 68 }, { "epoch": 0.20127620783956243, "grad_norm": 1.0744612216949463, "learning_rate": 1.9710144927536236e-05, "loss": 0.6617178916931152, "step": 69 }, { "epoch": 0.2041932543299909, "grad_norm": 0.675946831703186, "learning_rate": 2e-05, "loss": 0.7615712881088257, "step": 70 }, { "epoch": 0.2071103008204193, "grad_norm": 0.7663411498069763, "learning_rate": 1.9999870372100614e-05, "loss": 0.7131291627883911, "step": 71 }, { "epoch": 0.21002734731084777, "grad_norm": 0.6725395321846008, "learning_rate": 1.9999481491763123e-05, "loss": 0.7452989816665649, "step": 72 }, { "epoch": 0.21294439380127622, "grad_norm": 0.6505664587020874, "learning_rate": 1.9998833369069483e-05, "loss": 0.7477136850357056, "step": 73 }, { "epoch": 0.21586144029170465, "grad_norm": 0.7032860517501831, "learning_rate": 1.9997926020822643e-05, "loss": 0.6854275465011597, "step": 74 }, { "epoch": 0.2187784867821331, "grad_norm": 0.645345151424408, "learning_rate": 1.999675947054614e-05, "loss": 0.7552425265312195, "step": 75 }, { "epoch": 0.22169553327256153, "grad_norm": 0.6620492935180664, "learning_rate": 1.9995333748483464e-05, "loss": 0.7262853384017944, "step": 76 }, { "epoch": 0.22461257976298998, "grad_norm": 0.6511455774307251, "learning_rate": 1.9993648891597284e-05, "loss": 0.7591732144355774, "step": 77 }, { "epoch": 0.2275296262534184, "grad_norm": 0.6775254011154175, "learning_rate": 1.9991704943568497e-05, "loss": 0.7498704195022583, "step": 78 }, { "epoch": 0.23044667274384686, "grad_norm": 0.8199896216392517, "learning_rate": 1.9989501954795076e-05, "loss": 0.7238684296607971, "step": 79 }, { "epoch": 0.2333637192342753, "grad_norm": 0.8197569847106934, "learning_rate": 1.998703998239079e-05, "loss": 0.7028778195381165, "step": 80 }, { "epoch": 0.23628076572470375, "grad_norm": 0.6602625250816345, "learning_rate": 1.9984319090183692e-05, "loss": 0.8842703104019165, "step": 81 }, { "epoch": 0.23919781221513217, "grad_norm": 0.9587129354476929, "learning_rate": 1.99813393487145e-05, "loss": 0.732614278793335, "step": 82 }, { "epoch": 0.24211485870556063, "grad_norm": 0.6822189092636108, "learning_rate": 1.997810083523473e-05, "loss": 0.7544928193092346, "step": 83 }, { "epoch": 0.24503190519598905, "grad_norm": 0.8980082869529724, "learning_rate": 1.9974603633704726e-05, "loss": 0.6704054474830627, "step": 84 }, { "epoch": 0.2479489516864175, "grad_norm": 0.7413425445556641, "learning_rate": 1.9970847834791472e-05, "loss": 0.693661093711853, "step": 85 }, { "epoch": 0.25086599817684596, "grad_norm": 0.8314999341964722, "learning_rate": 1.9966833535866223e-05, "loss": 0.667654275894165, "step": 86 }, { "epoch": 0.25378304466727436, "grad_norm": 0.7972444891929626, "learning_rate": 1.9962560841002013e-05, "loss": 0.8403134942054749, "step": 87 }, { "epoch": 0.2567000911577028, "grad_norm": 0.8519951701164246, "learning_rate": 1.995802986097093e-05, "loss": 0.6897370219230652, "step": 88 }, { "epoch": 0.25961713764813127, "grad_norm": 0.8268933892250061, "learning_rate": 1.995324071324126e-05, "loss": 0.6690632700920105, "step": 89 }, { "epoch": 0.2625341841385597, "grad_norm": 0.7133983969688416, "learning_rate": 1.9948193521974436e-05, "loss": 0.6314147114753723, "step": 90 }, { "epoch": 0.2654512306289881, "grad_norm": 0.889302134513855, "learning_rate": 1.9942888418021814e-05, "loss": 0.7389825582504272, "step": 91 }, { "epoch": 0.2683682771194166, "grad_norm": 0.7022432088851929, "learning_rate": 1.99373255389213e-05, "loss": 0.6916261911392212, "step": 92 }, { "epoch": 0.27128532360984503, "grad_norm": 0.696432888507843, "learning_rate": 1.9931505028893748e-05, "loss": 0.6908476948738098, "step": 93 }, { "epoch": 0.2742023701002735, "grad_norm": 0.7667419910430908, "learning_rate": 1.9925427038839267e-05, "loss": 0.6500837206840515, "step": 94 }, { "epoch": 0.27711941659070194, "grad_norm": 0.6974894404411316, "learning_rate": 1.9919091726333265e-05, "loss": 0.7059191465377808, "step": 95 }, { "epoch": 0.28003646308113034, "grad_norm": 0.7047077417373657, "learning_rate": 1.9912499255622397e-05, "loss": 0.6287837624549866, "step": 96 }, { "epoch": 0.2829535095715588, "grad_norm": 0.7729557156562805, "learning_rate": 1.990564979762029e-05, "loss": 0.6738612055778503, "step": 97 }, { "epoch": 0.28587055606198725, "grad_norm": 0.7020529508590698, "learning_rate": 1.989854352990311e-05, "loss": 0.662042498588562, "step": 98 }, { "epoch": 0.2887876025524157, "grad_norm": 0.7369800209999084, "learning_rate": 1.9891180636704975e-05, "loss": 0.6246830821037292, "step": 99 }, { "epoch": 0.2917046490428441, "grad_norm": 0.7412623167037964, "learning_rate": 1.9883561308913154e-05, "loss": 0.6623879075050354, "step": 100 }, { "epoch": 0.2917046490428441, "eval_loss": 0.6552971005439758, "eval_runtime": 966.7072, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.654, "step": 100 }, { "epoch": 0.29462169553327255, "grad_norm": 0.8428792953491211, "learning_rate": 1.987568574406314e-05, "loss": 0.6312171816825867, "step": 101 }, { "epoch": 0.297538742023701, "grad_norm": 0.6948133707046509, "learning_rate": 1.9867554146333517e-05, "loss": 0.6266146898269653, "step": 102 }, { "epoch": 0.30045578851412946, "grad_norm": 1.3897597789764404, "learning_rate": 1.985916672654068e-05, "loss": 0.6669265031814575, "step": 103 }, { "epoch": 0.30337283500455786, "grad_norm": 0.8838400840759277, "learning_rate": 1.985052370213334e-05, "loss": 0.6601086854934692, "step": 104 }, { "epoch": 0.3062898814949863, "grad_norm": 0.8471395373344421, "learning_rate": 1.9841625297186925e-05, "loss": 0.5984431505203247, "step": 105 }, { "epoch": 0.30920692798541477, "grad_norm": 0.8940042853355408, "learning_rate": 1.983247174239774e-05, "loss": 0.7223822474479675, "step": 106 }, { "epoch": 0.3121239744758432, "grad_norm": 0.7833696603775024, "learning_rate": 1.9823063275076998e-05, "loss": 0.6868705749511719, "step": 107 }, { "epoch": 0.3150410209662716, "grad_norm": 0.8794649243354797, "learning_rate": 1.9813400139144673e-05, "loss": 0.6246675848960876, "step": 108 }, { "epoch": 0.3179580674567001, "grad_norm": 0.8126057982444763, "learning_rate": 1.9803482585123165e-05, "loss": 0.5908697247505188, "step": 109 }, { "epoch": 0.32087511394712853, "grad_norm": 0.7947676777839661, "learning_rate": 1.979331087013082e-05, "loss": 0.5751246809959412, "step": 110 }, { "epoch": 0.323792160437557, "grad_norm": 0.713545560836792, "learning_rate": 1.978288525787524e-05, "loss": 0.6081106066703796, "step": 111 }, { "epoch": 0.32670920692798544, "grad_norm": 1.011828064918518, "learning_rate": 1.977220601864647e-05, "loss": 0.7039169669151306, "step": 112 }, { "epoch": 0.32962625341841384, "grad_norm": 0.730570912361145, "learning_rate": 1.9761273429309982e-05, "loss": 0.6140255928039551, "step": 113 }, { "epoch": 0.3325432999088423, "grad_norm": 1.059688687324524, "learning_rate": 1.9750087773299492e-05, "loss": 0.648114025592804, "step": 114 }, { "epoch": 0.33546034639927075, "grad_norm": 0.9336895942687988, "learning_rate": 1.973864934060962e-05, "loss": 0.622555673122406, "step": 115 }, { "epoch": 0.3383773928896992, "grad_norm": 0.7195945978164673, "learning_rate": 1.9726958427788367e-05, "loss": 0.70485520362854, "step": 116 }, { "epoch": 0.3412944393801276, "grad_norm": 0.8101872801780701, "learning_rate": 1.971501533792942e-05, "loss": 0.6958848834037781, "step": 117 }, { "epoch": 0.34421148587055606, "grad_norm": 1.6075212955474854, "learning_rate": 1.970282038066432e-05, "loss": 0.6021550893783569, "step": 118 }, { "epoch": 0.3471285323609845, "grad_norm": 0.7881433963775635, "learning_rate": 1.9690373872154396e-05, "loss": 0.6449777483940125, "step": 119 }, { "epoch": 0.35004557885141296, "grad_norm": 1.014639973640442, "learning_rate": 1.9677676135082606e-05, "loss": 0.5939379930496216, "step": 120 }, { "epoch": 0.35296262534184136, "grad_norm": 0.8198449611663818, "learning_rate": 1.9664727498645144e-05, "loss": 0.6210286617279053, "step": 121 }, { "epoch": 0.3558796718322698, "grad_norm": 1.0194576978683472, "learning_rate": 1.9651528298542918e-05, "loss": 0.624247670173645, "step": 122 }, { "epoch": 0.35879671832269827, "grad_norm": 0.7963470220565796, "learning_rate": 1.9638078876972842e-05, "loss": 0.6479315757751465, "step": 123 }, { "epoch": 0.3617137648131267, "grad_norm": 0.9007541537284851, "learning_rate": 1.9624379582618976e-05, "loss": 0.6131505370140076, "step": 124 }, { "epoch": 0.3646308113035551, "grad_norm": 0.8712120056152344, "learning_rate": 1.9610430770643464e-05, "loss": 0.6249448657035828, "step": 125 }, { "epoch": 0.3675478577939836, "grad_norm": 1.1482540369033813, "learning_rate": 1.9596232802677347e-05, "loss": 0.5844688415527344, "step": 126 }, { "epoch": 0.37046490428441203, "grad_norm": 0.8662379384040833, "learning_rate": 1.9581786046811175e-05, "loss": 0.6573485732078552, "step": 127 }, { "epoch": 0.3733819507748405, "grad_norm": 0.8191388845443726, "learning_rate": 1.9567090877585477e-05, "loss": 0.5896862745285034, "step": 128 }, { "epoch": 0.37629899726526894, "grad_norm": 1.0187078714370728, "learning_rate": 1.955214767598103e-05, "loss": 0.613490879535675, "step": 129 }, { "epoch": 0.37921604375569734, "grad_norm": 0.8444119691848755, "learning_rate": 1.953695682940901e-05, "loss": 0.727687656879425, "step": 130 }, { "epoch": 0.3821330902461258, "grad_norm": 0.74753737449646, "learning_rate": 1.9521518731700913e-05, "loss": 0.6102436780929565, "step": 131 }, { "epoch": 0.38505013673655425, "grad_norm": 1.0166202783584595, "learning_rate": 1.9505833783098378e-05, "loss": 0.6244844198226929, "step": 132 }, { "epoch": 0.3879671832269827, "grad_norm": 0.8175772428512573, "learning_rate": 1.9489902390242793e-05, "loss": 0.5939282178878784, "step": 133 }, { "epoch": 0.3908842297174111, "grad_norm": 1.0177713632583618, "learning_rate": 1.947372496616476e-05, "loss": 0.6418229937553406, "step": 134 }, { "epoch": 0.39380127620783956, "grad_norm": 0.8652453422546387, "learning_rate": 1.9457301930273376e-05, "loss": 0.5870395302772522, "step": 135 }, { "epoch": 0.396718322698268, "grad_norm": 0.8378894925117493, "learning_rate": 1.9440633708345365e-05, "loss": 0.6480278372764587, "step": 136 }, { "epoch": 0.39963536918869647, "grad_norm": 0.8303541541099548, "learning_rate": 1.9423720732514052e-05, "loss": 0.6191359758377075, "step": 137 }, { "epoch": 0.40255241567912486, "grad_norm": 0.8576734662055969, "learning_rate": 1.9406563441258145e-05, "loss": 0.5696198344230652, "step": 138 }, { "epoch": 0.4054694621695533, "grad_norm": 0.9558727145195007, "learning_rate": 1.9389162279390362e-05, "loss": 0.6177623271942139, "step": 139 }, { "epoch": 0.4083865086599818, "grad_norm": 0.7046042084693909, "learning_rate": 1.9371517698045922e-05, "loss": 0.5836521983146667, "step": 140 }, { "epoch": 0.4113035551504102, "grad_norm": 1.0522717237472534, "learning_rate": 1.935363015467082e-05, "loss": 0.5728275775909424, "step": 141 }, { "epoch": 0.4142206016408386, "grad_norm": 0.9554787874221802, "learning_rate": 1.933550011301e-05, "loss": 0.632586658000946, "step": 142 }, { "epoch": 0.4171376481312671, "grad_norm": 0.8874214291572571, "learning_rate": 1.9317128043095293e-05, "loss": 0.5850118398666382, "step": 143 }, { "epoch": 0.42005469462169553, "grad_norm": 1.0708963871002197, "learning_rate": 1.9298514421233276e-05, "loss": 0.6260685324668884, "step": 144 }, { "epoch": 0.422971741112124, "grad_norm": 0.8135736584663391, "learning_rate": 1.9279659729992888e-05, "loss": 0.6031094193458557, "step": 145 }, { "epoch": 0.42588878760255244, "grad_norm": 0.7971774339675903, "learning_rate": 1.9260564458192926e-05, "loss": 0.6101322770118713, "step": 146 }, { "epoch": 0.42880583409298084, "grad_norm": 0.9374974966049194, "learning_rate": 1.9241229100889397e-05, "loss": 0.5836313366889954, "step": 147 }, { "epoch": 0.4317228805834093, "grad_norm": 0.8043425679206848, "learning_rate": 1.9221654159362636e-05, "loss": 0.6181215047836304, "step": 148 }, { "epoch": 0.43463992707383775, "grad_norm": 0.8923380374908447, "learning_rate": 1.920184014110436e-05, "loss": 0.6149677634239197, "step": 149 }, { "epoch": 0.4375569735642662, "grad_norm": 0.8908132314682007, "learning_rate": 1.918178755980449e-05, "loss": 0.5899742841720581, "step": 150 }, { "epoch": 0.4375569735642662, "eval_loss": 0.5903874635696411, "eval_runtime": 1186.9542, "eval_samples_per_second": 0.532, "eval_steps_per_second": 0.532, "step": 150 }, { "epoch": 0.4404740200546946, "grad_norm": 1.060531497001648, "learning_rate": 1.9161496935337808e-05, "loss": 0.5852696895599365, "step": 151 }, { "epoch": 0.44339106654512306, "grad_norm": 0.9723032712936401, "learning_rate": 1.914096879375053e-05, "loss": 0.5822056531906128, "step": 152 }, { "epoch": 0.4463081130355515, "grad_norm": 0.9519931674003601, "learning_rate": 1.912020366724663e-05, "loss": 0.6183493137359619, "step": 153 }, { "epoch": 0.44922515952597997, "grad_norm": 0.8282918334007263, "learning_rate": 1.9099202094174055e-05, "loss": 0.6229860782623291, "step": 154 }, { "epoch": 0.45214220601640837, "grad_norm": 0.9251292943954468, "learning_rate": 1.907796461901076e-05, "loss": 0.6552959680557251, "step": 155 }, { "epoch": 0.4550592525068368, "grad_norm": 1.0349540710449219, "learning_rate": 1.9056491792350606e-05, "loss": 0.6170098781585693, "step": 156 }, { "epoch": 0.4579762989972653, "grad_norm": 0.8720711469650269, "learning_rate": 1.9034784170889076e-05, "loss": 0.5870137810707092, "step": 157 }, { "epoch": 0.46089334548769373, "grad_norm": 1.0785977840423584, "learning_rate": 1.9012842317408843e-05, "loss": 0.5515124201774597, "step": 158 }, { "epoch": 0.4638103919781221, "grad_norm": 1.0634154081344604, "learning_rate": 1.8990666800765187e-05, "loss": 0.6073828339576721, "step": 159 }, { "epoch": 0.4667274384685506, "grad_norm": 0.8770879507064819, "learning_rate": 1.896825819587123e-05, "loss": 0.5960907936096191, "step": 160 }, { "epoch": 0.46964448495897904, "grad_norm": 1.1225898265838623, "learning_rate": 1.894561708368305e-05, "loss": 0.545990526676178, "step": 161 }, { "epoch": 0.4725615314494075, "grad_norm": 0.9373893141746521, "learning_rate": 1.8922744051184613e-05, "loss": 0.5566108822822571, "step": 162 }, { "epoch": 0.4754785779398359, "grad_norm": 1.5016087293624878, "learning_rate": 1.8899639691372545e-05, "loss": 0.558845043182373, "step": 163 }, { "epoch": 0.47839562443026434, "grad_norm": 0.903020977973938, "learning_rate": 1.8876304603240773e-05, "loss": 0.6824233531951904, "step": 164 }, { "epoch": 0.4813126709206928, "grad_norm": 0.8239623308181763, "learning_rate": 1.8852739391764993e-05, "loss": 0.5630610585212708, "step": 165 }, { "epoch": 0.48422971741112125, "grad_norm": 0.926069438457489, "learning_rate": 1.882894466788697e-05, "loss": 0.6211802363395691, "step": 166 }, { "epoch": 0.4871467639015497, "grad_norm": 1.0098828077316284, "learning_rate": 1.8804921048498722e-05, "loss": 0.5513257384300232, "step": 167 }, { "epoch": 0.4900638103919781, "grad_norm": 0.9228141903877258, "learning_rate": 1.8780669156426517e-05, "loss": 0.6197121739387512, "step": 168 }, { "epoch": 0.49298085688240656, "grad_norm": 1.0551754236221313, "learning_rate": 1.8756189620414712e-05, "loss": 0.5221806764602661, "step": 169 }, { "epoch": 0.495897903372835, "grad_norm": 0.9017496109008789, "learning_rate": 1.873148307510948e-05, "loss": 0.5766995549201965, "step": 170 }, { "epoch": 0.49881494986326347, "grad_norm": 0.9704970717430115, "learning_rate": 1.870655016104233e-05, "loss": 0.6514763832092285, "step": 171 }, { "epoch": 0.5017319963536919, "grad_norm": 0.9972712397575378, "learning_rate": 1.8681391524613518e-05, "loss": 0.5273895263671875, "step": 172 }, { "epoch": 0.5046490428441204, "grad_norm": 0.9473339319229126, "learning_rate": 1.8656007818075288e-05, "loss": 0.5548599362373352, "step": 173 }, { "epoch": 0.5075660893345487, "grad_norm": 1.2493574619293213, "learning_rate": 1.8630399699514944e-05, "loss": 0.5593586564064026, "step": 174 }, { "epoch": 0.5104831358249772, "grad_norm": 1.2766696214675903, "learning_rate": 1.860456783283781e-05, "loss": 0.6054630279541016, "step": 175 }, { "epoch": 0.5134001823154056, "grad_norm": 0.9555240869522095, "learning_rate": 1.857851288775002e-05, "loss": 0.508592963218689, "step": 176 }, { "epoch": 0.5163172288058341, "grad_norm": 1.260219931602478, "learning_rate": 1.8552235539741118e-05, "loss": 0.5532065629959106, "step": 177 }, { "epoch": 0.5192342752962625, "grad_norm": 1.1859954595565796, "learning_rate": 1.8525736470066595e-05, "loss": 0.5683344006538391, "step": 178 }, { "epoch": 0.522151321786691, "grad_norm": 1.3044344186782837, "learning_rate": 1.8499016365730203e-05, "loss": 0.5281959772109985, "step": 179 }, { "epoch": 0.5250683682771194, "grad_norm": 1.3049921989440918, "learning_rate": 1.8472075919466137e-05, "loss": 0.49621230363845825, "step": 180 }, { "epoch": 0.5279854147675479, "grad_norm": 1.0488537549972534, "learning_rate": 1.844491582972109e-05, "loss": 0.6194032430648804, "step": 181 }, { "epoch": 0.5309024612579762, "grad_norm": 1.5553455352783203, "learning_rate": 1.8417536800636138e-05, "loss": 0.5645846724510193, "step": 182 }, { "epoch": 0.5338195077484047, "grad_norm": 1.2673912048339844, "learning_rate": 1.8389939542028484e-05, "loss": 0.6267315745353699, "step": 183 }, { "epoch": 0.5367365542388332, "grad_norm": 1.0273847579956055, "learning_rate": 1.8362124769373064e-05, "loss": 0.5256403684616089, "step": 184 }, { "epoch": 0.5396536007292616, "grad_norm": 1.006093978881836, "learning_rate": 1.8334093203783986e-05, "loss": 0.5916382074356079, "step": 185 }, { "epoch": 0.5425706472196901, "grad_norm": 1.2740857601165771, "learning_rate": 1.8305845571995843e-05, "loss": 0.581648588180542, "step": 186 }, { "epoch": 0.5454876937101185, "grad_norm": 1.494248390197754, "learning_rate": 1.8277382606344872e-05, "loss": 0.4824523627758026, "step": 187 }, { "epoch": 0.548404740200547, "grad_norm": 1.1862496137619019, "learning_rate": 1.824870504474996e-05, "loss": 0.5531858205795288, "step": 188 }, { "epoch": 0.5513217866909754, "grad_norm": 3.503049373626709, "learning_rate": 1.8219813630693523e-05, "loss": 0.6308296918869019, "step": 189 }, { "epoch": 0.5542388331814039, "grad_norm": 1.7544710636138916, "learning_rate": 1.819070911320222e-05, "loss": 0.6146273016929626, "step": 190 }, { "epoch": 0.5571558796718322, "grad_norm": 1.3367774486541748, "learning_rate": 1.8161392246827546e-05, "loss": 0.5848966240882874, "step": 191 }, { "epoch": 0.5600729261622607, "grad_norm": 1.696418046951294, "learning_rate": 1.8131863791626263e-05, "loss": 0.6621730327606201, "step": 192 }, { "epoch": 0.5629899726526891, "grad_norm": 1.360052227973938, "learning_rate": 1.8102124513140694e-05, "loss": 0.5972204208374023, "step": 193 }, { "epoch": 0.5659070191431176, "grad_norm": 1.5376263856887817, "learning_rate": 1.807217518237888e-05, "loss": 0.4938785433769226, "step": 194 }, { "epoch": 0.568824065633546, "grad_norm": 1.2249681949615479, "learning_rate": 1.8042016575794585e-05, "loss": 0.5366095304489136, "step": 195 }, { "epoch": 0.5717411121239745, "grad_norm": 1.7868080139160156, "learning_rate": 1.8011649475267178e-05, "loss": 0.5116773843765259, "step": 196 }, { "epoch": 0.574658158614403, "grad_norm": 2.369993209838867, "learning_rate": 1.7981074668081345e-05, "loss": 0.49072742462158203, "step": 197 }, { "epoch": 0.5775752051048314, "grad_norm": 1.0168434381484985, "learning_rate": 1.7950292946906695e-05, "loss": 0.5691611170768738, "step": 198 }, { "epoch": 0.5804922515952597, "grad_norm": 1.2990851402282715, "learning_rate": 1.7919305109777195e-05, "loss": 0.5515039563179016, "step": 199 }, { "epoch": 0.5834092980856882, "grad_norm": 1.4859853982925415, "learning_rate": 1.7888111960070493e-05, "loss": 0.5017011165618896, "step": 200 }, { "epoch": 0.5834092980856882, "eval_loss": 0.5414339303970337, "eval_runtime": 1180.7894, "eval_samples_per_second": 0.535, "eval_steps_per_second": 0.535, "step": 200 }, { "epoch": 0.5863263445761167, "grad_norm": 1.0065829753875732, "learning_rate": 1.7856714306487088e-05, "loss": 0.5677731037139893, "step": 201 }, { "epoch": 0.5892433910665451, "grad_norm": 1.1727538108825684, "learning_rate": 1.7825112963029352e-05, "loss": 0.4525509476661682, "step": 202 }, { "epoch": 0.5921604375569736, "grad_norm": 1.3376752138137817, "learning_rate": 1.7793308748980437e-05, "loss": 0.5208959579467773, "step": 203 }, { "epoch": 0.595077484047402, "grad_norm": 0.9196159839630127, "learning_rate": 1.776130248888304e-05, "loss": 0.6033903360366821, "step": 204 }, { "epoch": 0.5979945305378305, "grad_norm": 1.0750919580459595, "learning_rate": 1.772909501251801e-05, "loss": 0.5449609160423279, "step": 205 }, { "epoch": 0.6009115770282589, "grad_norm": 1.2459467649459839, "learning_rate": 1.769668715488285e-05, "loss": 0.5685338377952576, "step": 206 }, { "epoch": 0.6038286235186874, "grad_norm": 1.1690552234649658, "learning_rate": 1.766407975617006e-05, "loss": 0.5240382552146912, "step": 207 }, { "epoch": 0.6067456700091157, "grad_norm": 1.0816599130630493, "learning_rate": 1.7631273661745362e-05, "loss": 0.6802893877029419, "step": 208 }, { "epoch": 0.6096627164995442, "grad_norm": 1.3662947416305542, "learning_rate": 1.7598269722125775e-05, "loss": 0.48193931579589844, "step": 209 }, { "epoch": 0.6125797629899726, "grad_norm": 0.9364766478538513, "learning_rate": 1.7565068792957576e-05, "loss": 0.5675849914550781, "step": 210 }, { "epoch": 0.6154968094804011, "grad_norm": 1.123828411102295, "learning_rate": 1.75316717349941e-05, "loss": 0.5474762916564941, "step": 211 }, { "epoch": 0.6184138559708295, "grad_norm": 1.1924363374710083, "learning_rate": 1.749807941407345e-05, "loss": 0.4918654263019562, "step": 212 }, { "epoch": 0.621330902461258, "grad_norm": 1.101293921470642, "learning_rate": 1.7464292701096014e-05, "loss": 0.5742691159248352, "step": 213 }, { "epoch": 0.6242479489516864, "grad_norm": 1.7374963760375977, "learning_rate": 1.7430312472001928e-05, "loss": 0.5828965902328491, "step": 214 }, { "epoch": 0.6271649954421149, "grad_norm": 1.3195666074752808, "learning_rate": 1.739613960774833e-05, "loss": 0.5265159010887146, "step": 215 }, { "epoch": 0.6300820419325432, "grad_norm": 1.254686713218689, "learning_rate": 1.7361774994286545e-05, "loss": 0.4929371476173401, "step": 216 }, { "epoch": 0.6329990884229717, "grad_norm": 1.1476380825042725, "learning_rate": 1.7327219522539102e-05, "loss": 0.5060417652130127, "step": 217 }, { "epoch": 0.6359161349134002, "grad_norm": 1.0914150476455688, "learning_rate": 1.7292474088376643e-05, "loss": 0.504043698310852, "step": 218 }, { "epoch": 0.6388331814038286, "grad_norm": 1.1339508295059204, "learning_rate": 1.7257539592594698e-05, "loss": 0.4797310531139374, "step": 219 }, { "epoch": 0.6417502278942571, "grad_norm": 1.0805399417877197, "learning_rate": 1.722241694089033e-05, "loss": 0.5878555178642273, "step": 220 }, { "epoch": 0.6446672743846855, "grad_norm": 1.8615056276321411, "learning_rate": 1.718710704383865e-05, "loss": 0.5005823969841003, "step": 221 }, { "epoch": 0.647584320875114, "grad_norm": 1.1445401906967163, "learning_rate": 1.7151610816869214e-05, "loss": 0.4949319064617157, "step": 222 }, { "epoch": 0.6505013673655424, "grad_norm": 0.9726515412330627, "learning_rate": 1.711592918024229e-05, "loss": 0.5073204040527344, "step": 223 }, { "epoch": 0.6534184138559709, "grad_norm": 1.4491140842437744, "learning_rate": 1.7080063059024998e-05, "loss": 0.47885262966156006, "step": 224 }, { "epoch": 0.6563354603463992, "grad_norm": 1.0070592164993286, "learning_rate": 1.7044013383067327e-05, "loss": 0.5775837898254395, "step": 225 }, { "epoch": 0.6592525068368277, "grad_norm": 0.966221272945404, "learning_rate": 1.7007781086978037e-05, "loss": 0.5050399899482727, "step": 226 }, { "epoch": 0.6621695533272561, "grad_norm": 0.9808815121650696, "learning_rate": 1.6971367110100407e-05, "loss": 0.5737045407295227, "step": 227 }, { "epoch": 0.6650865998176846, "grad_norm": 1.0158127546310425, "learning_rate": 1.6934772396487906e-05, "loss": 0.48077821731567383, "step": 228 }, { "epoch": 0.668003646308113, "grad_norm": 1.32015860080719, "learning_rate": 1.6897997894879706e-05, "loss": 0.5614925026893616, "step": 229 }, { "epoch": 0.6709206927985415, "grad_norm": 1.1055903434753418, "learning_rate": 1.686104455867608e-05, "loss": 0.4970760643482208, "step": 230 }, { "epoch": 0.67383773928897, "grad_norm": 1.0804500579833984, "learning_rate": 1.682391334591371e-05, "loss": 0.5540452003479004, "step": 231 }, { "epoch": 0.6767547857793984, "grad_norm": 1.1906245946884155, "learning_rate": 1.6786605219240807e-05, "loss": 0.5778501033782959, "step": 232 }, { "epoch": 0.6796718322698267, "grad_norm": 0.9758645296096802, "learning_rate": 1.6749121145892192e-05, "loss": 0.49073565006256104, "step": 233 }, { "epoch": 0.6825888787602552, "grad_norm": 1.1678364276885986, "learning_rate": 1.6711462097664207e-05, "loss": 0.4828741252422333, "step": 234 }, { "epoch": 0.6855059252506837, "grad_norm": 1.148301362991333, "learning_rate": 1.6673629050889507e-05, "loss": 0.5143818855285645, "step": 235 }, { "epoch": 0.6884229717411121, "grad_norm": 1.005898356437683, "learning_rate": 1.6635622986411776e-05, "loss": 0.5301160216331482, "step": 236 }, { "epoch": 0.6913400182315406, "grad_norm": 1.2227320671081543, "learning_rate": 1.659744488956027e-05, "loss": 0.4800386130809784, "step": 237 }, { "epoch": 0.694257064721969, "grad_norm": 0.986456573009491, "learning_rate": 1.6559095750124296e-05, "loss": 0.5098081827163696, "step": 238 }, { "epoch": 0.6971741112123975, "grad_norm": 1.1474376916885376, "learning_rate": 1.6520576562327518e-05, "loss": 0.5147273540496826, "step": 239 }, { "epoch": 0.7000911577028259, "grad_norm": 1.10917067527771, "learning_rate": 1.6481888324802223e-05, "loss": 0.5023190379142761, "step": 240 }, { "epoch": 0.7030082041932544, "grad_norm": 1.2339262962341309, "learning_rate": 1.644303204056341e-05, "loss": 0.5282092690467834, "step": 241 }, { "epoch": 0.7059252506836827, "grad_norm": 0.997941255569458, "learning_rate": 1.640400871698277e-05, "loss": 0.5635963082313538, "step": 242 }, { "epoch": 0.7088422971741112, "grad_norm": 1.0345823764801025, "learning_rate": 1.63648193657626e-05, "loss": 0.5577977895736694, "step": 243 }, { "epoch": 0.7117593436645396, "grad_norm": 1.3468303680419922, "learning_rate": 1.6325465002909554e-05, "loss": 0.4365362524986267, "step": 244 }, { "epoch": 0.7146763901549681, "grad_norm": 1.2817128896713257, "learning_rate": 1.628594664870831e-05, "loss": 0.46069926023483276, "step": 245 }, { "epoch": 0.7175934366453965, "grad_norm": 1.043311357498169, "learning_rate": 1.6246265327695117e-05, "loss": 0.5476971864700317, "step": 246 }, { "epoch": 0.720510483135825, "grad_norm": 1.0297389030456543, "learning_rate": 1.620642206863124e-05, "loss": 0.48051249980926514, "step": 247 }, { "epoch": 0.7234275296262535, "grad_norm": 1.4869836568832397, "learning_rate": 1.6166417904476257e-05, "loss": 0.5683314800262451, "step": 248 }, { "epoch": 0.7263445761166819, "grad_norm": 1.0628005266189575, "learning_rate": 1.6126253872361336e-05, "loss": 0.5277887582778931, "step": 249 }, { "epoch": 0.7292616226071102, "grad_norm": 1.2682170867919922, "learning_rate": 1.608593101356229e-05, "loss": 0.5048879384994507, "step": 250 }, { "epoch": 0.7292616226071102, "eval_loss": 0.5038471221923828, "eval_runtime": 1175.0375, "eval_samples_per_second": 0.538, "eval_steps_per_second": 0.538, "step": 250 }, { "epoch": 0.7321786690975387, "grad_norm": 1.7376199960708618, "learning_rate": 1.6045450373472626e-05, "loss": 0.5093721151351929, "step": 251 }, { "epoch": 0.7350957155879672, "grad_norm": 1.6047718524932861, "learning_rate": 1.6004813001576405e-05, "loss": 0.4796055555343628, "step": 252 }, { "epoch": 0.7380127620783956, "grad_norm": 1.3582886457443237, "learning_rate": 1.5964019951421058e-05, "loss": 0.4733014702796936, "step": 253 }, { "epoch": 0.7409298085688241, "grad_norm": 0.9468897581100464, "learning_rate": 1.5923072280590072e-05, "loss": 0.5312032103538513, "step": 254 }, { "epoch": 0.7438468550592525, "grad_norm": 1.3890198469161987, "learning_rate": 1.5881971050675547e-05, "loss": 0.47576645016670227, "step": 255 }, { "epoch": 0.746763901549681, "grad_norm": 1.782992959022522, "learning_rate": 1.584071732725071e-05, "loss": 0.5555092096328735, "step": 256 }, { "epoch": 0.7496809480401094, "grad_norm": 1.1790621280670166, "learning_rate": 1.5799312179842265e-05, "loss": 0.5148727893829346, "step": 257 }, { "epoch": 0.7525979945305379, "grad_norm": 1.446694254875183, "learning_rate": 1.5757756681902664e-05, "loss": 0.49939870834350586, "step": 258 }, { "epoch": 0.7555150410209662, "grad_norm": 1.1786166429519653, "learning_rate": 1.571605191078229e-05, "loss": 0.562156081199646, "step": 259 }, { "epoch": 0.7584320875113947, "grad_norm": 1.16925847530365, "learning_rate": 1.567419894770151e-05, "loss": 0.49580734968185425, "step": 260 }, { "epoch": 0.7613491340018231, "grad_norm": 1.60944664478302, "learning_rate": 1.5632198877722676e-05, "loss": 0.4821680784225464, "step": 261 }, { "epoch": 0.7642661804922516, "grad_norm": 1.3957884311676025, "learning_rate": 1.5590052789721946e-05, "loss": 0.4392276406288147, "step": 262 }, { "epoch": 0.76718322698268, "grad_norm": 1.636195421218872, "learning_rate": 1.5547761776361096e-05, "loss": 0.39603114128112793, "step": 263 }, { "epoch": 0.7701002734731085, "grad_norm": 1.496766448020935, "learning_rate": 1.550532693405917e-05, "loss": 0.4833749234676361, "step": 264 }, { "epoch": 0.773017319963537, "grad_norm": 1.3587844371795654, "learning_rate": 1.5462749362964058e-05, "loss": 0.43738317489624023, "step": 265 }, { "epoch": 0.7759343664539654, "grad_norm": 1.670704960823059, "learning_rate": 1.5420030166923983e-05, "loss": 0.4476737380027771, "step": 266 }, { "epoch": 0.7788514129443938, "grad_norm": 1.2674932479858398, "learning_rate": 1.537717045345888e-05, "loss": 0.42266708612442017, "step": 267 }, { "epoch": 0.7817684594348222, "grad_norm": 2.0639536380767822, "learning_rate": 1.5334171333731666e-05, "loss": 0.5245381593704224, "step": 268 }, { "epoch": 0.7846855059252507, "grad_norm": 1.2091766595840454, "learning_rate": 1.529103392251946e-05, "loss": 0.5166443586349487, "step": 269 }, { "epoch": 0.7876025524156791, "grad_norm": 1.1021631956100464, "learning_rate": 1.5247759338184653e-05, "loss": 0.5674265027046204, "step": 270 }, { "epoch": 0.7905195989061076, "grad_norm": 1.3143829107284546, "learning_rate": 1.520434870264595e-05, "loss": 0.40855613350868225, "step": 271 }, { "epoch": 0.793436645396536, "grad_norm": 1.1784812211990356, "learning_rate": 1.5160803141349244e-05, "loss": 0.4308925271034241, "step": 272 }, { "epoch": 0.7963536918869645, "grad_norm": 2.1635706424713135, "learning_rate": 1.5117123783238458e-05, "loss": 0.45035502314567566, "step": 273 }, { "epoch": 0.7992707383773929, "grad_norm": 1.569203495979309, "learning_rate": 1.5073311760726287e-05, "loss": 0.5095728635787964, "step": 274 }, { "epoch": 0.8021877848678214, "grad_norm": 2.532621383666992, "learning_rate": 1.5029368209664822e-05, "loss": 0.496748685836792, "step": 275 }, { "epoch": 0.8051048313582497, "grad_norm": 1.6312552690505981, "learning_rate": 1.4985294269316098e-05, "loss": 0.4972914159297943, "step": 276 }, { "epoch": 0.8080218778486782, "grad_norm": 1.3996756076812744, "learning_rate": 1.4941091082322579e-05, "loss": 0.5589750409126282, "step": 277 }, { "epoch": 0.8109389243391066, "grad_norm": 1.1288363933563232, "learning_rate": 1.4896759794677526e-05, "loss": 0.5349453687667847, "step": 278 }, { "epoch": 0.8138559708295351, "grad_norm": 1.6913920640945435, "learning_rate": 1.4852301555695268e-05, "loss": 0.46511000394821167, "step": 279 }, { "epoch": 0.8167730173199635, "grad_norm": 1.1913212537765503, "learning_rate": 1.4807717517981439e-05, "loss": 0.4715422987937927, "step": 280 }, { "epoch": 0.819690063810392, "grad_norm": 1.1179691553115845, "learning_rate": 1.476300883740307e-05, "loss": 0.53330397605896, "step": 281 }, { "epoch": 0.8226071103008205, "grad_norm": 1.7473797798156738, "learning_rate": 1.4718176673058624e-05, "loss": 0.47564437985420227, "step": 282 }, { "epoch": 0.8255241567912489, "grad_norm": 1.2653177976608276, "learning_rate": 1.4673222187247963e-05, "loss": 0.46364277601242065, "step": 283 }, { "epoch": 0.8284412032816773, "grad_norm": 1.2567330598831177, "learning_rate": 1.4628146545442202e-05, "loss": 0.4778091013431549, "step": 284 }, { "epoch": 0.8313582497721057, "grad_norm": 1.5848406553268433, "learning_rate": 1.4582950916253488e-05, "loss": 0.4480203688144684, "step": 285 }, { "epoch": 0.8342752962625342, "grad_norm": 1.3278183937072754, "learning_rate": 1.453763647140472e-05, "loss": 0.37945032119750977, "step": 286 }, { "epoch": 0.8371923427529626, "grad_norm": 1.0961651802062988, "learning_rate": 1.4492204385699155e-05, "loss": 0.5306747555732727, "step": 287 }, { "epoch": 0.8401093892433911, "grad_norm": 1.176276683807373, "learning_rate": 1.4446655836989961e-05, "loss": 0.49950045347213745, "step": 288 }, { "epoch": 0.8430264357338195, "grad_norm": 1.2228269577026367, "learning_rate": 1.4400992006149674e-05, "loss": 0.494475394487381, "step": 289 }, { "epoch": 0.845943482224248, "grad_norm": 1.1584209203720093, "learning_rate": 1.4355214077039592e-05, "loss": 0.44170859456062317, "step": 290 }, { "epoch": 0.8488605287146764, "grad_norm": 1.2041938304901123, "learning_rate": 1.4309323236479071e-05, "loss": 0.4359871745109558, "step": 291 }, { "epoch": 0.8517775752051049, "grad_norm": 1.279645562171936, "learning_rate": 1.4263320674214762e-05, "loss": 0.45031386613845825, "step": 292 }, { "epoch": 0.8546946216955332, "grad_norm": 1.3958357572555542, "learning_rate": 1.4217207582889769e-05, "loss": 0.4832204580307007, "step": 293 }, { "epoch": 0.8576116681859617, "grad_norm": 1.2788586616516113, "learning_rate": 1.4170985158012725e-05, "loss": 0.5154346227645874, "step": 294 }, { "epoch": 0.8605287146763901, "grad_norm": 1.3634892702102661, "learning_rate": 1.4124654597926795e-05, "loss": 0.46777206659317017, "step": 295 }, { "epoch": 0.8634457611668186, "grad_norm": 1.2719579935073853, "learning_rate": 1.4078217103778619e-05, "loss": 0.4247053265571594, "step": 296 }, { "epoch": 0.866362807657247, "grad_norm": 2.890467643737793, "learning_rate": 1.4031673879487161e-05, "loss": 0.38349640369415283, "step": 297 }, { "epoch": 0.8692798541476755, "grad_norm": 2.4354801177978516, "learning_rate": 1.3985026131712499e-05, "loss": 0.4134889543056488, "step": 298 }, { "epoch": 0.872196900638104, "grad_norm": 1.0138323307037354, "learning_rate": 1.3938275069824541e-05, "loss": 0.5176680684089661, "step": 299 }, { "epoch": 0.8751139471285324, "grad_norm": 1.2316186428070068, "learning_rate": 1.389142190587168e-05, "loss": 0.4818477928638458, "step": 300 }, { "epoch": 0.8751139471285324, "eval_loss": 0.4752846360206604, "eval_runtime": 1189.1666, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.531, "step": 300 }, { "epoch": 0.8780309936189608, "grad_norm": 1.515487551689148, "learning_rate": 1.384446785454936e-05, "loss": 0.47766175866127014, "step": 301 }, { "epoch": 0.8809480401093892, "grad_norm": 1.4357497692108154, "learning_rate": 1.3797414133168591e-05, "loss": 0.49297061562538147, "step": 302 }, { "epoch": 0.8838650865998177, "grad_norm": 1.2523037195205688, "learning_rate": 1.3750261961624383e-05, "loss": 0.4629015326499939, "step": 303 }, { "epoch": 0.8867821330902461, "grad_norm": 3.5790023803710938, "learning_rate": 1.3703012562364124e-05, "loss": 0.3773120045661926, "step": 304 }, { "epoch": 0.8896991795806746, "grad_norm": 1.9305704832077026, "learning_rate": 1.3655667160355892e-05, "loss": 0.496719628572464, "step": 305 }, { "epoch": 0.892616226071103, "grad_norm": 1.1506154537200928, "learning_rate": 1.3608226983056687e-05, "loss": 0.49487072229385376, "step": 306 }, { "epoch": 0.8955332725615315, "grad_norm": 1.8046090602874756, "learning_rate": 1.3560693260380614e-05, "loss": 0.4910697937011719, "step": 307 }, { "epoch": 0.8984503190519599, "grad_norm": 2.0088653564453125, "learning_rate": 1.3513067224667e-05, "loss": 0.508246660232544, "step": 308 }, { "epoch": 0.9013673655423883, "grad_norm": 1.2966033220291138, "learning_rate": 1.3465350110648437e-05, "loss": 0.5125166177749634, "step": 309 }, { "epoch": 0.9042844120328167, "grad_norm": 1.9976309537887573, "learning_rate": 1.3417543155418775e-05, "loss": 0.43942537903785706, "step": 310 }, { "epoch": 0.9072014585232452, "grad_norm": 1.2663682699203491, "learning_rate": 1.336964759840105e-05, "loss": 0.4839101731777191, "step": 311 }, { "epoch": 0.9101185050136736, "grad_norm": 1.1223328113555908, "learning_rate": 1.3321664681315354e-05, "loss": 0.48008066415786743, "step": 312 }, { "epoch": 0.9130355515041021, "grad_norm": 1.5786972045898438, "learning_rate": 1.3273595648146634e-05, "loss": 0.47250309586524963, "step": 313 }, { "epoch": 0.9159525979945305, "grad_norm": 1.2150241136550903, "learning_rate": 1.322544174511245e-05, "loss": 0.5149738788604736, "step": 314 }, { "epoch": 0.918869644484959, "grad_norm": 1.3676542043685913, "learning_rate": 1.3177204220630662e-05, "loss": 0.4430195093154907, "step": 315 }, { "epoch": 0.9217866909753875, "grad_norm": 1.0703285932540894, "learning_rate": 1.3128884325287064e-05, "loss": 0.4798983037471771, "step": 316 }, { "epoch": 0.9247037374658159, "grad_norm": 1.3131535053253174, "learning_rate": 1.308048331180296e-05, "loss": 0.4241073727607727, "step": 317 }, { "epoch": 0.9276207839562443, "grad_norm": 1.4485348463058472, "learning_rate": 1.3032002435002698e-05, "loss": 0.527199923992157, "step": 318 }, { "epoch": 0.9305378304466727, "grad_norm": 1.370936393737793, "learning_rate": 1.2983442951781114e-05, "loss": 0.47125962376594543, "step": 319 }, { "epoch": 0.9334548769371012, "grad_norm": 1.2369643449783325, "learning_rate": 1.2934806121070973e-05, "loss": 0.4814244210720062, "step": 320 }, { "epoch": 0.9363719234275296, "grad_norm": 1.2632933855056763, "learning_rate": 1.2886093203810314e-05, "loss": 0.4915548264980316, "step": 321 }, { "epoch": 0.9392889699179581, "grad_norm": 1.054569959640503, "learning_rate": 1.2837305462909764e-05, "loss": 0.5325602293014526, "step": 322 }, { "epoch": 0.9422060164083865, "grad_norm": 1.15959632396698, "learning_rate": 1.27884441632198e-05, "loss": 0.43607404828071594, "step": 323 }, { "epoch": 0.945123062898815, "grad_norm": 1.1667979955673218, "learning_rate": 1.2739510571497945e-05, "loss": 0.4631507992744446, "step": 324 }, { "epoch": 0.9480401093892434, "grad_norm": 1.6009081602096558, "learning_rate": 1.2690505956375944e-05, "loss": 0.4935731887817383, "step": 325 }, { "epoch": 0.9509571558796718, "grad_norm": 1.1193996667861938, "learning_rate": 1.2641431588326858e-05, "loss": 0.45883435010910034, "step": 326 }, { "epoch": 0.9538742023701002, "grad_norm": 1.5365067720413208, "learning_rate": 1.2592288739632138e-05, "loss": 0.5206276178359985, "step": 327 }, { "epoch": 0.9567912488605287, "grad_norm": 1.0714622735977173, "learning_rate": 1.2543078684348632e-05, "loss": 0.5242853760719299, "step": 328 }, { "epoch": 0.9597082953509571, "grad_norm": 1.3009248971939087, "learning_rate": 1.2493802698275557e-05, "loss": 0.4794357717037201, "step": 329 }, { "epoch": 0.9626253418413856, "grad_norm": 1.495771050453186, "learning_rate": 1.244446205892143e-05, "loss": 0.5849282145500183, "step": 330 }, { "epoch": 0.965542388331814, "grad_norm": 1.2046003341674805, "learning_rate": 1.2395058045470935e-05, "loss": 0.47758305072784424, "step": 331 }, { "epoch": 0.9684594348222425, "grad_norm": 1.1362569332122803, "learning_rate": 1.2345591938751772e-05, "loss": 0.4490663409233093, "step": 332 }, { "epoch": 0.971376481312671, "grad_norm": 1.2658129930496216, "learning_rate": 1.2296065021201438e-05, "loss": 0.4035309851169586, "step": 333 }, { "epoch": 0.9742935278030994, "grad_norm": 4.370306015014648, "learning_rate": 1.2246478576833993e-05, "loss": 0.495273619890213, "step": 334 }, { "epoch": 0.9772105742935278, "grad_norm": 1.3863654136657715, "learning_rate": 1.219683389120676e-05, "loss": 0.46410733461380005, "step": 335 }, { "epoch": 0.9801276207839562, "grad_norm": 1.4544321298599243, "learning_rate": 1.2147132251387004e-05, "loss": 0.4301709830760956, "step": 336 }, { "epoch": 0.9830446672743847, "grad_norm": 1.0852457284927368, "learning_rate": 1.2097374945918554e-05, "loss": 0.48892468214035034, "step": 337 }, { "epoch": 0.9859617137648131, "grad_norm": 1.5062257051467896, "learning_rate": 1.2047563264788412e-05, "loss": 0.4667983055114746, "step": 338 }, { "epoch": 0.9888787602552416, "grad_norm": 1.2472951412200928, "learning_rate": 1.199769849939329e-05, "loss": 0.4827345013618469, "step": 339 }, { "epoch": 0.99179580674567, "grad_norm": 1.2589871883392334, "learning_rate": 1.1947781942506151e-05, "loss": 0.405245304107666, "step": 340 }, { "epoch": 0.9947128532360985, "grad_norm": 1.25636625289917, "learning_rate": 1.1897814888242679e-05, "loss": 0.37956133484840393, "step": 341 }, { "epoch": 0.9976298997265269, "grad_norm": 2.7064895629882812, "learning_rate": 1.1847798632027726e-05, "loss": 0.489456444978714, "step": 342 }, { "epoch": 1.0, "grad_norm": 1.6156240701675415, "learning_rate": 1.1797734470561744e-05, "loss": 0.46473199129104614, "step": 343 }, { "epoch": 1.0029170464904285, "grad_norm": 1.3046343326568604, "learning_rate": 1.1747623701787143e-05, "loss": 0.3504878282546997, "step": 344 }, { "epoch": 1.005834092980857, "grad_norm": 1.414828896522522, "learning_rate": 1.1697467624854666e-05, "loss": 0.4719260334968567, "step": 345 }, { "epoch": 1.0087511394712854, "grad_norm": 1.1873356103897095, "learning_rate": 1.164726754008969e-05, "loss": 0.45313555002212524, "step": 346 }, { "epoch": 1.0116681859617138, "grad_norm": 1.1382380723953247, "learning_rate": 1.1597024748958526e-05, "loss": 0.4365478456020355, "step": 347 }, { "epoch": 1.0145852324521423, "grad_norm": 1.8141961097717285, "learning_rate": 1.1546740554034661e-05, "loss": 0.3694503605365753, "step": 348 }, { "epoch": 1.0175022789425707, "grad_norm": 1.333388328552246, "learning_rate": 1.1496416258965015e-05, "loss": 0.4755721688270569, "step": 349 }, { "epoch": 1.0204193254329992, "grad_norm": 1.3464443683624268, "learning_rate": 1.1446053168436117e-05, "loss": 0.4227846562862396, "step": 350 }, { "epoch": 1.0204193254329992, "eval_loss": 0.44924086332321167, "eval_runtime": 1214.6648, "eval_samples_per_second": 0.52, "eval_steps_per_second": 0.52, "step": 350 }, { "epoch": 1.0233363719234276, "grad_norm": 1.2682689428329468, "learning_rate": 1.1395652588140292e-05, "loss": 0.44300130009651184, "step": 351 }, { "epoch": 1.0262534184138559, "grad_norm": 1.7737696170806885, "learning_rate": 1.1345215824741814e-05, "loss": 0.5106258988380432, "step": 352 }, { "epoch": 1.0291704649042843, "grad_norm": 1.2601238489151, "learning_rate": 1.1294744185843014e-05, "loss": 0.45930635929107666, "step": 353 }, { "epoch": 1.0320875113947128, "grad_norm": 1.2162678241729736, "learning_rate": 1.1244238979950406e-05, "loss": 0.44163084030151367, "step": 354 }, { "epoch": 1.0350045578851412, "grad_norm": 1.0905817747116089, "learning_rate": 1.1193701516440733e-05, "loss": 0.510662317276001, "step": 355 }, { "epoch": 1.0379216043755697, "grad_norm": 0.9624952673912048, "learning_rate": 1.1143133105527048e-05, "loss": 0.5297917127609253, "step": 356 }, { "epoch": 1.0408386508659981, "grad_norm": 1.2757681608200073, "learning_rate": 1.1092535058224725e-05, "loss": 0.4332093596458435, "step": 357 }, { "epoch": 1.0437556973564266, "grad_norm": 1.6885719299316406, "learning_rate": 1.104190868631748e-05, "loss": 0.4337635040283203, "step": 358 }, { "epoch": 1.046672743846855, "grad_norm": 1.175484538078308, "learning_rate": 1.099125530232336e-05, "loss": 0.45411020517349243, "step": 359 }, { "epoch": 1.0495897903372835, "grad_norm": 1.0964939594268799, "learning_rate": 1.0940576219460723e-05, "loss": 0.5333439707756042, "step": 360 }, { "epoch": 1.052506836827712, "grad_norm": 1.5493136644363403, "learning_rate": 1.0889872751614176e-05, "loss": 0.4400906264781952, "step": 361 }, { "epoch": 1.0554238833181404, "grad_norm": 1.2491416931152344, "learning_rate": 1.0839146213300526e-05, "loss": 0.31049978733062744, "step": 362 }, { "epoch": 1.0583409298085689, "grad_norm": 1.7213693857192993, "learning_rate": 1.0788397919634694e-05, "loss": 0.389009028673172, "step": 363 }, { "epoch": 1.0612579762989973, "grad_norm": 1.5405336618423462, "learning_rate": 1.0737629186295621e-05, "loss": 0.4068562984466553, "step": 364 }, { "epoch": 1.0641750227894258, "grad_norm": 1.225455641746521, "learning_rate": 1.0686841329492159e-05, "loss": 0.47358617186546326, "step": 365 }, { "epoch": 1.0670920692798542, "grad_norm": 1.3436250686645508, "learning_rate": 1.0636035665928945e-05, "loss": 0.47050854563713074, "step": 366 }, { "epoch": 1.0700091157702827, "grad_norm": 1.4952112436294556, "learning_rate": 1.058521351277227e-05, "loss": 0.43496906757354736, "step": 367 }, { "epoch": 1.072926162260711, "grad_norm": 1.549112319946289, "learning_rate": 1.0534376187615924e-05, "loss": 0.45711052417755127, "step": 368 }, { "epoch": 1.0758432087511394, "grad_norm": 1.3851526975631714, "learning_rate": 1.048352500844704e-05, "loss": 0.45045915246009827, "step": 369 }, { "epoch": 1.0787602552415678, "grad_norm": 1.6302049160003662, "learning_rate": 1.0432661293611927e-05, "loss": 0.3736046254634857, "step": 370 }, { "epoch": 1.0816773017319963, "grad_norm": 1.3365869522094727, "learning_rate": 1.0381786361781885e-05, "loss": 0.42242100834846497, "step": 371 }, { "epoch": 1.0845943482224247, "grad_norm": 1.4369138479232788, "learning_rate": 1.0330901531919026e-05, "loss": 0.44570961594581604, "step": 372 }, { "epoch": 1.0875113947128532, "grad_norm": 1.3528283834457397, "learning_rate": 1.0280008123242069e-05, "loss": 0.43440738320350647, "step": 373 }, { "epoch": 1.0904284412032816, "grad_norm": 1.469660997390747, "learning_rate": 1.0229107455192147e-05, "loss": 0.3960394263267517, "step": 374 }, { "epoch": 1.09334548769371, "grad_norm": 1.4542185068130493, "learning_rate": 1.0178200847398595e-05, "loss": 0.47834208607673645, "step": 375 }, { "epoch": 1.0962625341841385, "grad_norm": 1.6470292806625366, "learning_rate": 1.0127289619644737e-05, "loss": 0.42791086435317993, "step": 376 }, { "epoch": 1.099179580674567, "grad_norm": 1.1934021711349487, "learning_rate": 1.0076375091833681e-05, "loss": 0.4401305019855499, "step": 377 }, { "epoch": 1.1020966271649955, "grad_norm": 0.9786668419837952, "learning_rate": 1.0025458583954078e-05, "loss": 0.4816555678844452, "step": 378 }, { "epoch": 1.105013673655424, "grad_norm": 1.1348779201507568, "learning_rate": 9.974541416045924e-06, "loss": 0.41516968607902527, "step": 379 }, { "epoch": 1.1079307201458524, "grad_norm": 1.0188615322113037, "learning_rate": 9.923624908166322e-06, "loss": 0.48087278008461, "step": 380 }, { "epoch": 1.1108477666362808, "grad_norm": 1.0821740627288818, "learning_rate": 9.872710380355263e-06, "loss": 0.41974008083343506, "step": 381 }, { "epoch": 1.1137648131267093, "grad_norm": 1.250951886177063, "learning_rate": 9.82179915260141e-06, "loss": 0.42703643441200256, "step": 382 }, { "epoch": 1.1166818596171377, "grad_norm": 1.4528254270553589, "learning_rate": 9.770892544807856e-06, "loss": 0.43801453709602356, "step": 383 }, { "epoch": 1.1195989061075662, "grad_norm": 1.813859462738037, "learning_rate": 9.719991876757934e-06, "loss": 0.4344240725040436, "step": 384 }, { "epoch": 1.1225159525979946, "grad_norm": 1.6681253910064697, "learning_rate": 9.669098468080976e-06, "loss": 0.4356998801231384, "step": 385 }, { "epoch": 1.125432999088423, "grad_norm": 1.3447953462600708, "learning_rate": 9.618213638218117e-06, "loss": 0.43189188838005066, "step": 386 }, { "epoch": 1.1283500455788513, "grad_norm": 1.9577926397323608, "learning_rate": 9.567338706388074e-06, "loss": 0.34984707832336426, "step": 387 }, { "epoch": 1.1312670920692798, "grad_norm": 1.5225576162338257, "learning_rate": 9.516474991552965e-06, "loss": 0.4243963062763214, "step": 388 }, { "epoch": 1.1341841385597082, "grad_norm": 1.7416809797286987, "learning_rate": 9.46562381238408e-06, "loss": 0.3414606750011444, "step": 389 }, { "epoch": 1.1371011850501367, "grad_norm": 1.8358951807022095, "learning_rate": 9.414786487227732e-06, "loss": 0.387447327375412, "step": 390 }, { "epoch": 1.1400182315405651, "grad_norm": 1.9706153869628906, "learning_rate": 9.363964334071057e-06, "loss": 0.4599088728427887, "step": 391 }, { "epoch": 1.1429352780309936, "grad_norm": 1.0604286193847656, "learning_rate": 9.313158670507843e-06, "loss": 0.4633581042289734, "step": 392 }, { "epoch": 1.145852324521422, "grad_norm": 1.4851202964782715, "learning_rate": 9.262370813704379e-06, "loss": 0.3872259557247162, "step": 393 }, { "epoch": 1.1487693710118505, "grad_norm": 1.7839159965515137, "learning_rate": 9.21160208036531e-06, "loss": 0.5215944647789001, "step": 394 }, { "epoch": 1.151686417502279, "grad_norm": 1.3054656982421875, "learning_rate": 9.160853786699475e-06, "loss": 0.4030425548553467, "step": 395 }, { "epoch": 1.1546034639927074, "grad_norm": 3.8467981815338135, "learning_rate": 9.110127248385827e-06, "loss": 0.4032524824142456, "step": 396 }, { "epoch": 1.1575205104831359, "grad_norm": 1.8513801097869873, "learning_rate": 9.05942378053928e-06, "loss": 0.46577155590057373, "step": 397 }, { "epoch": 1.1604375569735643, "grad_norm": 1.312689185142517, "learning_rate": 9.008744697676642e-06, "loss": 0.39114487171173096, "step": 398 }, { "epoch": 1.1633546034639928, "grad_norm": 1.1996328830718994, "learning_rate": 8.958091313682521e-06, "loss": 0.481199711561203, "step": 399 }, { "epoch": 1.1662716499544212, "grad_norm": 5.172409534454346, "learning_rate": 8.90746494177528e-06, "loss": 0.3803558945655823, "step": 400 }, { "epoch": 1.1662716499544212, "eval_loss": 0.4318464398384094, "eval_runtime": 1206.0306, "eval_samples_per_second": 0.524, "eval_steps_per_second": 0.524, "step": 400 }, { "epoch": 1.1691886964448497, "grad_norm": 1.0115015506744385, "learning_rate": 8.856866894472954e-06, "loss": 0.39636704325675964, "step": 401 }, { "epoch": 1.172105742935278, "grad_norm": 1.1557435989379883, "learning_rate": 8.806298483559268e-06, "loss": 0.4076298475265503, "step": 402 }, { "epoch": 1.1750227894257064, "grad_norm": 1.2802515029907227, "learning_rate": 8.755761020049597e-06, "loss": 0.44352248311042786, "step": 403 }, { "epoch": 1.1779398359161348, "grad_norm": 1.2755069732666016, "learning_rate": 8.705255814156988e-06, "loss": 0.390497624874115, "step": 404 }, { "epoch": 1.1808568824065633, "grad_norm": 1.2799782752990723, "learning_rate": 8.654784175258188e-06, "loss": 0.35810694098472595, "step": 405 }, { "epoch": 1.1837739288969917, "grad_norm": 1.0968674421310425, "learning_rate": 8.604347411859713e-06, "loss": 0.3890265226364136, "step": 406 }, { "epoch": 1.1866909753874202, "grad_norm": 1.3334455490112305, "learning_rate": 8.553946831563886e-06, "loss": 0.3916901648044586, "step": 407 }, { "epoch": 1.1896080218778486, "grad_norm": 1.1888184547424316, "learning_rate": 8.503583741034988e-06, "loss": 0.5231326222419739, "step": 408 }, { "epoch": 1.192525068368277, "grad_norm": 1.1163763999938965, "learning_rate": 8.45325944596534e-06, "loss": 0.4249858558177948, "step": 409 }, { "epoch": 1.1954421148587056, "grad_norm": 1.3470333814620972, "learning_rate": 8.40297525104148e-06, "loss": 0.5201632380485535, "step": 410 }, { "epoch": 1.198359161349134, "grad_norm": 1.5412285327911377, "learning_rate": 8.35273245991031e-06, "loss": 0.39376699924468994, "step": 411 }, { "epoch": 1.2012762078395625, "grad_norm": 1.3408735990524292, "learning_rate": 8.302532375145339e-06, "loss": 0.39554283022880554, "step": 412 }, { "epoch": 1.204193254329991, "grad_norm": 1.990668773651123, "learning_rate": 8.25237629821286e-06, "loss": 0.42424261569976807, "step": 413 }, { "epoch": 1.2071103008204194, "grad_norm": 1.6471989154815674, "learning_rate": 8.202265529438259e-06, "loss": 0.3234582543373108, "step": 414 }, { "epoch": 1.2100273473108478, "grad_norm": 1.1483631134033203, "learning_rate": 8.152201367972275e-06, "loss": 0.39163246750831604, "step": 415 }, { "epoch": 1.2129443938012763, "grad_norm": 1.800149917602539, "learning_rate": 8.102185111757323e-06, "loss": 0.5055042505264282, "step": 416 }, { "epoch": 1.2158614402917047, "grad_norm": 1.4394795894622803, "learning_rate": 8.052218057493849e-06, "loss": 0.4761751592159271, "step": 417 }, { "epoch": 1.2187784867821332, "grad_norm": 1.622689962387085, "learning_rate": 8.002301500606715e-06, "loss": 0.4490141272544861, "step": 418 }, { "epoch": 1.2216955332725616, "grad_norm": 1.2564961910247803, "learning_rate": 7.952436735211593e-06, "loss": 0.3964035212993622, "step": 419 }, { "epoch": 1.22461257976299, "grad_norm": 1.3248411417007446, "learning_rate": 7.902625054081449e-06, "loss": 0.46039122343063354, "step": 420 }, { "epoch": 1.2275296262534183, "grad_norm": 1.568983793258667, "learning_rate": 7.852867748613e-06, "loss": 0.49916595220565796, "step": 421 }, { "epoch": 1.2304466727438468, "grad_norm": 1.4784491062164307, "learning_rate": 7.803166108793243e-06, "loss": 0.4035068154335022, "step": 422 }, { "epoch": 1.2333637192342752, "grad_norm": 1.2940057516098022, "learning_rate": 7.753521423166007e-06, "loss": 0.4154140055179596, "step": 423 }, { "epoch": 1.2362807657247037, "grad_norm": 1.167786717414856, "learning_rate": 7.703934978798565e-06, "loss": 0.39541637897491455, "step": 424 }, { "epoch": 1.2391978122151321, "grad_norm": 1.5126771926879883, "learning_rate": 7.65440806124823e-06, "loss": 0.37744253873825073, "step": 425 }, { "epoch": 1.2421148587055606, "grad_norm": 1.2595263719558716, "learning_rate": 7.604941954529067e-06, "loss": 0.46380615234375, "step": 426 }, { "epoch": 1.245031905195989, "grad_norm": 1.4258298873901367, "learning_rate": 7.555537941078573e-06, "loss": 0.3391319513320923, "step": 427 }, { "epoch": 1.2479489516864175, "grad_norm": 1.5371774435043335, "learning_rate": 7.506197301724446e-06, "loss": 0.39805102348327637, "step": 428 }, { "epoch": 1.250865998176846, "grad_norm": 1.3789173364639282, "learning_rate": 7.456921315651371e-06, "loss": 0.37969034910202026, "step": 429 }, { "epoch": 1.2537830446672744, "grad_norm": 1.32931649684906, "learning_rate": 7.407711260367867e-06, "loss": 0.3841526508331299, "step": 430 }, { "epoch": 1.2567000911577029, "grad_norm": 1.2836817502975464, "learning_rate": 7.358568411673145e-06, "loss": 0.340289443731308, "step": 431 }, { "epoch": 1.2596171376481313, "grad_norm": 1.0418318510055542, "learning_rate": 7.309494043624059e-06, "loss": 0.44747158885002136, "step": 432 }, { "epoch": 1.2625341841385598, "grad_norm": 1.1769362688064575, "learning_rate": 7.260489428502058e-06, "loss": 0.45737382769584656, "step": 433 }, { "epoch": 1.265451230628988, "grad_norm": 2.2730748653411865, "learning_rate": 7.211555836780203e-06, "loss": 0.3827931582927704, "step": 434 }, { "epoch": 1.2683682771194165, "grad_norm": 1.263096809387207, "learning_rate": 7.162694537090235e-06, "loss": 0.3589435815811157, "step": 435 }, { "epoch": 1.271285323609845, "grad_norm": 1.4073514938354492, "learning_rate": 7.113906796189692e-06, "loss": 0.45206642150878906, "step": 436 }, { "epoch": 1.2742023701002734, "grad_norm": 1.064585566520691, "learning_rate": 7.0651938789290306e-06, "loss": 0.5409261584281921, "step": 437 }, { "epoch": 1.2771194165907018, "grad_norm": 1.2346999645233154, "learning_rate": 7.016557048218889e-06, "loss": 0.40680158138275146, "step": 438 }, { "epoch": 1.2800364630811303, "grad_norm": 1.5816547870635986, "learning_rate": 6.967997564997306e-06, "loss": 0.38718655705451965, "step": 439 }, { "epoch": 1.2829535095715587, "grad_norm": 1.085268259048462, "learning_rate": 6.919516688197041e-06, "loss": 0.4863276779651642, "step": 440 }, { "epoch": 1.2858705560619872, "grad_norm": 1.0984629392623901, "learning_rate": 6.871115674712937e-06, "loss": 0.39562875032424927, "step": 441 }, { "epoch": 1.2887876025524156, "grad_norm": 1.3004229068756104, "learning_rate": 6.822795779369339e-06, "loss": 0.44437694549560547, "step": 442 }, { "epoch": 1.291704649042844, "grad_norm": 1.3541183471679688, "learning_rate": 6.774558254887553e-06, "loss": 0.4728967249393463, "step": 443 }, { "epoch": 1.2946216955332726, "grad_norm": 1.2485377788543701, "learning_rate": 6.7264043518533695e-06, "loss": 0.4052809476852417, "step": 444 }, { "epoch": 1.297538742023701, "grad_norm": 1.412827730178833, "learning_rate": 6.67833531868465e-06, "loss": 0.40149861574172974, "step": 445 }, { "epoch": 1.3004557885141295, "grad_norm": 1.5576224327087402, "learning_rate": 6.630352401598953e-06, "loss": 0.44107240438461304, "step": 446 }, { "epoch": 1.303372835004558, "grad_norm": 1.1551047563552856, "learning_rate": 6.582456844581226e-06, "loss": 0.4898405969142914, "step": 447 }, { "epoch": 1.3062898814949864, "grad_norm": 1.9939689636230469, "learning_rate": 6.5346498893515645e-06, "loss": 0.4791329801082611, "step": 448 }, { "epoch": 1.3092069279854148, "grad_norm": 1.4782553911209106, "learning_rate": 6.486932775333002e-06, "loss": 0.472908616065979, "step": 449 }, { "epoch": 1.3121239744758433, "grad_norm": 1.2496148347854614, "learning_rate": 6.439306739619387e-06, "loss": 0.514995276927948, "step": 450 }, { "epoch": 1.3121239744758433, "eval_loss": 0.4178673028945923, "eval_runtime": 1197.5534, "eval_samples_per_second": 0.528, "eval_steps_per_second": 0.528, "step": 450 }, { "epoch": 1.3150410209662717, "grad_norm": 1.3996772766113281, "learning_rate": 6.391773016943316e-06, "loss": 0.4087896943092346, "step": 451 }, { "epoch": 1.3179580674567002, "grad_norm": 1.20390784740448, "learning_rate": 6.344332839644111e-06, "loss": 0.43224579095840454, "step": 452 }, { "epoch": 1.3208751139471286, "grad_norm": 1.2709496021270752, "learning_rate": 6.296987437635876e-06, "loss": 0.44104093313217163, "step": 453 }, { "epoch": 1.323792160437557, "grad_norm": 1.0112334489822388, "learning_rate": 6.249738038375618e-06, "loss": 0.47084498405456543, "step": 454 }, { "epoch": 1.3267092069279856, "grad_norm": 1.0771515369415283, "learning_rate": 6.202585866831411e-06, "loss": 0.4700928032398224, "step": 455 }, { "epoch": 1.3296262534184138, "grad_norm": 1.4937143325805664, "learning_rate": 6.15553214545064e-06, "loss": 0.345747709274292, "step": 456 }, { "epoch": 1.3325432999088422, "grad_norm": 1.1348456144332886, "learning_rate": 6.108578094128321e-06, "loss": 0.33824583888053894, "step": 457 }, { "epoch": 1.3354603463992707, "grad_norm": 1.2502707242965698, "learning_rate": 6.061724930175461e-06, "loss": 0.3528832197189331, "step": 458 }, { "epoch": 1.3383773928896991, "grad_norm": 1.5359619855880737, "learning_rate": 6.014973868287504e-06, "loss": 0.4413869082927704, "step": 459 }, { "epoch": 1.3412944393801276, "grad_norm": 0.9747081398963928, "learning_rate": 5.9683261205128395e-06, "loss": 0.6849499940872192, "step": 460 }, { "epoch": 1.344211485870556, "grad_norm": 1.3150533437728882, "learning_rate": 5.921782896221383e-06, "loss": 0.3901931047439575, "step": 461 }, { "epoch": 1.3471285323609845, "grad_norm": 1.137770652770996, "learning_rate": 5.875345402073207e-06, "loss": 0.37498384714126587, "step": 462 }, { "epoch": 1.350045578851413, "grad_norm": 1.2216367721557617, "learning_rate": 5.829014841987277e-06, "loss": 0.3874579966068268, "step": 463 }, { "epoch": 1.3529626253418414, "grad_norm": 1.135439157485962, "learning_rate": 5.782792417110233e-06, "loss": 0.384797066450119, "step": 464 }, { "epoch": 1.3558796718322699, "grad_norm": 1.2400696277618408, "learning_rate": 5.736679325785239e-06, "loss": 0.46303266286849976, "step": 465 }, { "epoch": 1.3587967183226983, "grad_norm": 1.8848882913589478, "learning_rate": 5.6906767635209304e-06, "loss": 0.5068309903144836, "step": 466 }, { "epoch": 1.3617137648131268, "grad_norm": 1.4707008600234985, "learning_rate": 5.644785922960412e-06, "loss": 0.364332914352417, "step": 467 }, { "epoch": 1.364630811303555, "grad_norm": 2.4436841011047363, "learning_rate": 5.599007993850329e-06, "loss": 0.485107421875, "step": 468 }, { "epoch": 1.3675478577939835, "grad_norm": 1.1924740076065063, "learning_rate": 5.553344163010039e-06, "loss": 0.34547489881515503, "step": 469 }, { "epoch": 1.370464904284412, "grad_norm": 1.1255877017974854, "learning_rate": 5.507795614300846e-06, "loss": 0.39645254611968994, "step": 470 }, { "epoch": 1.3733819507748404, "grad_norm": 1.0937018394470215, "learning_rate": 5.4623635285952815e-06, "loss": 0.4267856478691101, "step": 471 }, { "epoch": 1.3762989972652688, "grad_norm": 1.3355520963668823, "learning_rate": 5.417049083746513e-06, "loss": 0.3669992983341217, "step": 472 }, { "epoch": 1.3792160437556973, "grad_norm": 1.7302504777908325, "learning_rate": 5.3718534545578035e-06, "loss": 0.3873697519302368, "step": 473 }, { "epoch": 1.3821330902461257, "grad_norm": 1.17263662815094, "learning_rate": 5.326777812752041e-06, "loss": 0.4581540524959564, "step": 474 }, { "epoch": 1.3850501367365542, "grad_norm": 1.0998128652572632, "learning_rate": 5.281823326941377e-06, "loss": 0.43062761425971985, "step": 475 }, { "epoch": 1.3879671832269826, "grad_norm": 1.1194556951522827, "learning_rate": 5.236991162596932e-06, "loss": 0.381741464138031, "step": 476 }, { "epoch": 1.390884229717411, "grad_norm": 1.2759051322937012, "learning_rate": 5.19228248201856e-06, "loss": 0.49175748229026794, "step": 477 }, { "epoch": 1.3938012762078396, "grad_norm": 1.2134747505187988, "learning_rate": 5.147698444304732e-06, "loss": 0.4997562766075134, "step": 478 }, { "epoch": 1.396718322698268, "grad_norm": 1.0833078622817993, "learning_rate": 5.1032402053224804e-06, "loss": 0.42580488324165344, "step": 479 }, { "epoch": 1.3996353691886965, "grad_norm": 1.4838510751724243, "learning_rate": 5.058908917677426e-06, "loss": 0.5015593767166138, "step": 480 }, { "epoch": 1.402552415679125, "grad_norm": 1.218610167503357, "learning_rate": 5.014705730683904e-06, "loss": 0.34739193320274353, "step": 481 }, { "epoch": 1.4054694621695534, "grad_norm": 1.1883307695388794, "learning_rate": 4.970631790335181e-06, "loss": 0.41708022356033325, "step": 482 }, { "epoch": 1.4083865086599818, "grad_norm": 1.209291696548462, "learning_rate": 4.926688239273713e-06, "loss": 0.43546172976493835, "step": 483 }, { "epoch": 1.4113035551504103, "grad_norm": 1.0801606178283691, "learning_rate": 4.882876216761543e-06, "loss": 0.44491735100746155, "step": 484 }, { "epoch": 1.4142206016408387, "grad_norm": 1.2746628522872925, "learning_rate": 4.839196858650763e-06, "loss": 0.436122864484787, "step": 485 }, { "epoch": 1.4171376481312672, "grad_norm": 1.4465962648391724, "learning_rate": 4.795651297354056e-06, "loss": 0.3750447630882263, "step": 486 }, { "epoch": 1.4200546946216956, "grad_norm": 1.6736211776733398, "learning_rate": 4.752240661815346e-06, "loss": 0.38286519050598145, "step": 487 }, { "epoch": 1.422971741112124, "grad_norm": 1.1946996450424194, "learning_rate": 4.708966077480544e-06, "loss": 0.4488063156604767, "step": 488 }, { "epoch": 1.4258887876025526, "grad_norm": 1.42599356174469, "learning_rate": 4.665828666268335e-06, "loss": 0.44088613986968994, "step": 489 }, { "epoch": 1.4288058340929808, "grad_norm": 1.2281016111373901, "learning_rate": 4.622829546541121e-06, "loss": 0.4030645489692688, "step": 490 }, { "epoch": 1.4317228805834092, "grad_norm": 1.2875670194625854, "learning_rate": 4.57996983307602e-06, "loss": 0.44702020287513733, "step": 491 }, { "epoch": 1.4346399270738377, "grad_norm": 1.2456860542297363, "learning_rate": 4.537250637035947e-06, "loss": 0.4067370593547821, "step": 492 }, { "epoch": 1.4375569735642661, "grad_norm": 1.2822725772857666, "learning_rate": 4.494673065940833e-06, "loss": 0.4237740635871887, "step": 493 }, { "epoch": 1.4404740200546946, "grad_norm": 1.5517818927764893, "learning_rate": 4.452238223638906e-06, "loss": 0.40579724311828613, "step": 494 }, { "epoch": 1.443391066545123, "grad_norm": 1.275344967842102, "learning_rate": 4.409947210278056e-06, "loss": 0.38880717754364014, "step": 495 }, { "epoch": 1.4463081130355515, "grad_norm": 1.22952139377594, "learning_rate": 4.367801122277327e-06, "loss": 0.4042310416698456, "step": 496 }, { "epoch": 1.44922515952598, "grad_norm": 1.122261643409729, "learning_rate": 4.325801052298493e-06, "loss": 0.5408368110656738, "step": 497 }, { "epoch": 1.4521422060164084, "grad_norm": 1.5885361433029175, "learning_rate": 4.283948089217715e-06, "loss": 0.37697717547416687, "step": 498 }, { "epoch": 1.4550592525068369, "grad_norm": 2.3565149307250977, "learning_rate": 4.242243318097338e-06, "loss": 0.3811529576778412, "step": 499 }, { "epoch": 1.4579762989972653, "grad_norm": 1.1944137811660767, "learning_rate": 4.200687820157735e-06, "loss": 0.414781391620636, "step": 500 }, { "epoch": 1.4579762989972653, "eval_loss": 0.40706494450569153, "eval_runtime": 1189.1593, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.531, "step": 500 }, { "epoch": 1.4608933454876938, "grad_norm": 1.0442464351654053, "learning_rate": 4.159282672749289e-06, "loss": 0.38155990839004517, "step": 501 }, { "epoch": 1.463810391978122, "grad_norm": 1.7274727821350098, "learning_rate": 4.118028949324453e-06, "loss": 0.4830601215362549, "step": 502 }, { "epoch": 1.4667274384685505, "grad_norm": 2.064513921737671, "learning_rate": 4.0769277194099345e-06, "loss": 0.3975123167037964, "step": 503 }, { "epoch": 1.469644484958979, "grad_norm": 1.7695534229278564, "learning_rate": 4.035980048578942e-06, "loss": 0.37033841013908386, "step": 504 }, { "epoch": 1.4725615314494074, "grad_norm": 1.4455046653747559, "learning_rate": 3.995186998423597e-06, "loss": 0.39567673206329346, "step": 505 }, { "epoch": 1.4754785779398358, "grad_norm": 1.1791958808898926, "learning_rate": 3.9545496265273765e-06, "loss": 0.44786664843559265, "step": 506 }, { "epoch": 1.4783956244302643, "grad_norm": 2.0874717235565186, "learning_rate": 3.9140689864377105e-06, "loss": 0.3333263099193573, "step": 507 }, { "epoch": 1.4813126709206927, "grad_norm": 1.5897501707077026, "learning_rate": 3.873746127638668e-06, "loss": 0.5105943083763123, "step": 508 }, { "epoch": 1.4842297174111212, "grad_norm": 1.5059760808944702, "learning_rate": 3.833582095523749e-06, "loss": 0.43922683596611023, "step": 509 }, { "epoch": 1.4871467639015497, "grad_norm": 1.379347562789917, "learning_rate": 3.7935779313687648e-06, "loss": 0.4584790766239166, "step": 510 }, { "epoch": 1.490063810391978, "grad_norm": 1.0984690189361572, "learning_rate": 3.7537346723048816e-06, "loss": 0.5217512249946594, "step": 511 }, { "epoch": 1.4929808568824066, "grad_norm": 1.5944225788116455, "learning_rate": 3.71405335129169e-06, "loss": 0.4180052876472473, "step": 512 }, { "epoch": 1.495897903372835, "grad_norm": 1.2745033502578735, "learning_rate": 3.6745349970904465e-06, "loss": 0.4584833085536957, "step": 513 }, { "epoch": 1.4988149498632635, "grad_norm": 1.2746814489364624, "learning_rate": 3.6351806342374007e-06, "loss": 0.3202287554740906, "step": 514 }, { "epoch": 1.501731996353692, "grad_norm": 1.409638524055481, "learning_rate": 3.5959912830172348e-06, "loss": 0.37963351607322693, "step": 515 }, { "epoch": 1.5046490428441204, "grad_norm": 1.1655553579330444, "learning_rate": 3.556967959436591e-06, "loss": 0.43133026361465454, "step": 516 }, { "epoch": 1.5075660893345488, "grad_norm": 1.0495020151138306, "learning_rate": 3.518111675197776e-06, "loss": 0.3739299178123474, "step": 517 }, { "epoch": 1.5104831358249773, "grad_norm": 1.3055057525634766, "learning_rate": 3.4794234376724835e-06, "loss": 0.4099601209163666, "step": 518 }, { "epoch": 1.5134001823154057, "grad_norm": 1.2252463102340698, "learning_rate": 3.4409042498757084e-06, "loss": 0.380616158246994, "step": 519 }, { "epoch": 1.5163172288058342, "grad_norm": 1.2728638648986816, "learning_rate": 3.4025551104397294e-06, "loss": 0.3510003685951233, "step": 520 }, { "epoch": 1.5192342752962626, "grad_norm": 2.70664644241333, "learning_rate": 3.3643770135882282e-06, "loss": 0.4087940752506256, "step": 521 }, { "epoch": 1.522151321786691, "grad_norm": 1.6197112798690796, "learning_rate": 3.3263709491104933e-06, "loss": 0.45614126324653625, "step": 522 }, { "epoch": 1.5250683682771196, "grad_norm": 1.3596103191375732, "learning_rate": 3.2885379023357956e-06, "loss": 0.3824586272239685, "step": 523 }, { "epoch": 1.527985414767548, "grad_norm": 1.1768635511398315, "learning_rate": 3.2508788541078097e-06, "loss": 0.47717779874801636, "step": 524 }, { "epoch": 1.5309024612579762, "grad_norm": 1.669474482536316, "learning_rate": 3.2133947807591958e-06, "loss": 0.4013281762599945, "step": 525 }, { "epoch": 1.5338195077484047, "grad_norm": 1.600868582725525, "learning_rate": 3.1760866540862932e-06, "loss": 0.367280513048172, "step": 526 }, { "epoch": 1.5367365542388332, "grad_norm": 1.1689515113830566, "learning_rate": 3.138955441323923e-06, "loss": 0.4432409405708313, "step": 527 }, { "epoch": 1.5396536007292616, "grad_norm": 2.361961603164673, "learning_rate": 3.1020021051202973e-06, "loss": 0.4219942092895508, "step": 528 }, { "epoch": 1.54257064721969, "grad_norm": 1.1962230205535889, "learning_rate": 3.0652276035120964e-06, "loss": 0.3672596514225006, "step": 529 }, { "epoch": 1.5454876937101185, "grad_norm": 1.4149441719055176, "learning_rate": 3.0286328898995963e-06, "loss": 0.42919260263442993, "step": 530 }, { "epoch": 1.548404740200547, "grad_norm": 1.2668434381484985, "learning_rate": 2.992218913021966e-06, "loss": 0.4499061107635498, "step": 531 }, { "epoch": 1.5513217866909754, "grad_norm": 1.268114686012268, "learning_rate": 2.9559866169326734e-06, "loss": 0.34660714864730835, "step": 532 }, { "epoch": 1.5542388331814039, "grad_norm": 1.0086419582366943, "learning_rate": 2.919936940975007e-06, "loss": 0.38239023089408875, "step": 533 }, { "epoch": 1.557155879671832, "grad_norm": 1.0700170993804932, "learning_rate": 2.884070819757712e-06, "loss": 0.48240017890930176, "step": 534 }, { "epoch": 1.5600729261622606, "grad_norm": 1.2101227045059204, "learning_rate": 2.8483891831307873e-06, "loss": 0.4098761975765228, "step": 535 }, { "epoch": 1.562989972652689, "grad_norm": 1.2731400728225708, "learning_rate": 2.8128929561613505e-06, "loss": 0.45641395449638367, "step": 536 }, { "epoch": 1.5659070191431175, "grad_norm": 1.1474392414093018, "learning_rate": 2.777583059109671e-06, "loss": 0.42283985018730164, "step": 537 }, { "epoch": 1.568824065633546, "grad_norm": 1.789881944656372, "learning_rate": 2.7424604074053028e-06, "loss": 0.3469158113002777, "step": 538 }, { "epoch": 1.5717411121239744, "grad_norm": 1.3426933288574219, "learning_rate": 2.707525911623362e-06, "loss": 0.35837510228157043, "step": 539 }, { "epoch": 1.5746581586144028, "grad_norm": 1.2343578338623047, "learning_rate": 2.672780477460901e-06, "loss": 0.4736083745956421, "step": 540 }, { "epoch": 1.5775752051048313, "grad_norm": 1.516298770904541, "learning_rate": 2.638225005713457e-06, "loss": 0.34345340728759766, "step": 541 }, { "epoch": 1.5804922515952597, "grad_norm": 1.1488829851150513, "learning_rate": 2.6038603922516705e-06, "loss": 0.4134179949760437, "step": 542 }, { "epoch": 1.5834092980856882, "grad_norm": 1.4486491680145264, "learning_rate": 2.569687527998073e-06, "loss": 0.3297592103481293, "step": 543 }, { "epoch": 1.5863263445761167, "grad_norm": 1.272691011428833, "learning_rate": 2.5357072989039855e-06, "loss": 0.3958476185798645, "step": 544 }, { "epoch": 1.589243391066545, "grad_norm": 1.244240641593933, "learning_rate": 2.501920585926555e-06, "loss": 0.4125611186027527, "step": 545 }, { "epoch": 1.5921604375569736, "grad_norm": 1.5844073295593262, "learning_rate": 2.4683282650058992e-06, "loss": 0.3762253224849701, "step": 546 }, { "epoch": 1.595077484047402, "grad_norm": 1.8209946155548096, "learning_rate": 2.4349312070424258e-06, "loss": 0.37053319811820984, "step": 547 }, { "epoch": 1.5979945305378305, "grad_norm": 1.3752915859222412, "learning_rate": 2.4017302778742247e-06, "loss": 0.5004774332046509, "step": 548 }, { "epoch": 1.600911577028259, "grad_norm": 5.143753528594971, "learning_rate": 2.36872633825464e-06, "loss": 0.39014023542404175, "step": 549 }, { "epoch": 1.6038286235186874, "grad_norm": 1.0730944871902466, "learning_rate": 2.335920243829941e-06, "loss": 0.378440260887146, "step": 550 }, { "epoch": 1.6038286235186874, "eval_loss": 0.40037089586257935, "eval_runtime": 893.7411, "eval_samples_per_second": 0.707, "eval_steps_per_second": 0.707, "step": 550 }, { "epoch": 1.6067456700091158, "grad_norm": 1.5507797002792358, "learning_rate": 2.3033128451171548e-06, "loss": 0.4471960663795471, "step": 551 }, { "epoch": 1.6096627164995443, "grad_norm": 1.9462968111038208, "learning_rate": 2.2709049874819924e-06, "loss": 0.3658301830291748, "step": 552 }, { "epoch": 1.6125797629899727, "grad_norm": 1.2034238576889038, "learning_rate": 2.238697511116962e-06, "loss": 0.3911179304122925, "step": 553 }, { "epoch": 1.6154968094804012, "grad_norm": 1.3574327230453491, "learning_rate": 2.2066912510195636e-06, "loss": 0.3998897671699524, "step": 554 }, { "epoch": 1.6184138559708297, "grad_norm": 1.1973012685775757, "learning_rate": 2.1748870369706507e-06, "loss": 0.38577449321746826, "step": 555 }, { "epoch": 1.621330902461258, "grad_norm": 1.9365874528884888, "learning_rate": 2.1432856935129144e-06, "loss": 0.411307156085968, "step": 556 }, { "epoch": 1.6242479489516866, "grad_norm": 1.3558642864227295, "learning_rate": 2.1118880399295106e-06, "loss": 0.38424253463745117, "step": 557 }, { "epoch": 1.627164995442115, "grad_norm": 1.4368890523910522, "learning_rate": 2.0806948902228075e-06, "loss": 0.39943546056747437, "step": 558 }, { "epoch": 1.6300820419325432, "grad_norm": 1.6266753673553467, "learning_rate": 2.0497070530933084e-06, "loss": 0.36787641048431396, "step": 559 }, { "epoch": 1.6329990884229717, "grad_norm": 1.2600938081741333, "learning_rate": 2.0189253319186576e-06, "loss": 0.3781934380531311, "step": 560 }, { "epoch": 1.6359161349134002, "grad_norm": 1.975071907043457, "learning_rate": 1.9883505247328237e-06, "loss": 0.4132305383682251, "step": 561 }, { "epoch": 1.6388331814038286, "grad_norm": 1.4095909595489502, "learning_rate": 1.9579834242054154e-06, "loss": 0.3727574646472931, "step": 562 }, { "epoch": 1.641750227894257, "grad_norm": 1.4271371364593506, "learning_rate": 1.9278248176211243e-06, "loss": 0.33786773681640625, "step": 563 }, { "epoch": 1.6446672743846855, "grad_norm": 1.5907646417617798, "learning_rate": 1.8978754868593074e-06, "loss": 0.33035099506378174, "step": 564 }, { "epoch": 1.647584320875114, "grad_norm": 1.1315702199935913, "learning_rate": 1.8681362083737387e-06, "loss": 0.41707149147987366, "step": 565 }, { "epoch": 1.6505013673655424, "grad_norm": 1.4737143516540527, "learning_rate": 1.8386077531724556e-06, "loss": 0.43079230189323425, "step": 566 }, { "epoch": 1.6534184138559709, "grad_norm": 1.1006760597229004, "learning_rate": 1.8092908867977822e-06, "loss": 0.3524904251098633, "step": 567 }, { "epoch": 1.6563354603463991, "grad_norm": 1.4066118001937866, "learning_rate": 1.780186369306479e-06, "loss": 0.3695681691169739, "step": 568 }, { "epoch": 1.6592525068368276, "grad_norm": 1.6444640159606934, "learning_rate": 1.7512949552500412e-06, "loss": 0.35596007108688354, "step": 569 }, { "epoch": 1.662169553327256, "grad_norm": 1.159480094909668, "learning_rate": 1.7226173936551282e-06, "loss": 0.4520571827888489, "step": 570 }, { "epoch": 1.6650865998176845, "grad_norm": 1.5874221324920654, "learning_rate": 1.6941544280041567e-06, "loss": 0.4702282249927521, "step": 571 }, { "epoch": 1.668003646308113, "grad_norm": 1.6153535842895508, "learning_rate": 1.6659067962160157e-06, "loss": 0.3803800046443939, "step": 572 }, { "epoch": 1.6709206927985414, "grad_norm": 1.0748940706253052, "learning_rate": 1.6378752306269386e-06, "loss": 0.4368419051170349, "step": 573 }, { "epoch": 1.6738377392889698, "grad_norm": 1.5286788940429688, "learning_rate": 1.6100604579715185e-06, "loss": 0.4195623993873596, "step": 574 }, { "epoch": 1.6767547857793983, "grad_norm": 1.1433510780334473, "learning_rate": 1.5824631993638651e-06, "loss": 0.4366849660873413, "step": 575 }, { "epoch": 1.6796718322698267, "grad_norm": 1.9694907665252686, "learning_rate": 1.5550841702789122e-06, "loss": 0.5555303692817688, "step": 576 }, { "epoch": 1.6825888787602552, "grad_norm": 1.7587188482284546, "learning_rate": 1.5279240805338647e-06, "loss": 0.40394848585128784, "step": 577 }, { "epoch": 1.6855059252506837, "grad_norm": 1.063381314277649, "learning_rate": 1.5009836342697993e-06, "loss": 0.49564215540885925, "step": 578 }, { "epoch": 1.688422971741112, "grad_norm": 1.1742531061172485, "learning_rate": 1.4742635299334063e-06, "loss": 0.3891904950141907, "step": 579 }, { "epoch": 1.6913400182315406, "grad_norm": 1.499934196472168, "learning_rate": 1.4477644602588848e-06, "loss": 0.35497623682022095, "step": 580 }, { "epoch": 1.694257064721969, "grad_norm": 1.5112360715866089, "learning_rate": 1.421487112249984e-06, "loss": 0.4062272012233734, "step": 581 }, { "epoch": 1.6971741112123975, "grad_norm": 1.3583141565322876, "learning_rate": 1.3954321671621885e-06, "loss": 0.3655265271663666, "step": 582 }, { "epoch": 1.700091157702826, "grad_norm": 2.8181653022766113, "learning_rate": 1.3696003004850577e-06, "loss": 0.37418332695961, "step": 583 }, { "epoch": 1.7030082041932544, "grad_norm": 0.967166543006897, "learning_rate": 1.3439921819247138e-06, "loss": 0.4946930408477783, "step": 584 }, { "epoch": 1.7059252506836828, "grad_norm": 1.2773699760437012, "learning_rate": 1.3186084753864813e-06, "loss": 0.5101871490478516, "step": 585 }, { "epoch": 1.7088422971741113, "grad_norm": 1.2814991474151611, "learning_rate": 1.293449838957671e-06, "loss": 0.3688133656978607, "step": 586 }, { "epoch": 1.7117593436645397, "grad_norm": 1.594966173171997, "learning_rate": 1.2685169248905228e-06, "loss": 0.4739398956298828, "step": 587 }, { "epoch": 1.7146763901549682, "grad_norm": 1.1471531391143799, "learning_rate": 1.2438103795852885e-06, "loss": 0.3719588816165924, "step": 588 }, { "epoch": 1.7175934366453967, "grad_norm": 1.1657356023788452, "learning_rate": 1.2193308435734852e-06, "loss": 0.4119298458099365, "step": 589 }, { "epoch": 1.720510483135825, "grad_norm": 1.1239042282104492, "learning_rate": 1.1950789515012783e-06, "loss": 0.38277503848075867, "step": 590 }, { "epoch": 1.7234275296262536, "grad_norm": 1.149478554725647, "learning_rate": 1.1710553321130324e-06, "loss": 0.35080626606941223, "step": 591 }, { "epoch": 1.726344576116682, "grad_norm": 1.2020260095596313, "learning_rate": 1.1472606082350112e-06, "loss": 0.3991318345069885, "step": 592 }, { "epoch": 1.7292616226071102, "grad_norm": 1.101475477218628, "learning_rate": 1.123695396759229e-06, "loss": 0.45791420340538025, "step": 593 }, { "epoch": 1.7321786690975387, "grad_norm": 0.9617101550102234, "learning_rate": 1.1003603086274584e-06, "loss": 0.39805036783218384, "step": 594 }, { "epoch": 1.7350957155879672, "grad_norm": 1.1439731121063232, "learning_rate": 1.07725594881539e-06, "loss": 0.35753339529037476, "step": 595 }, { "epoch": 1.7380127620783956, "grad_norm": 1.0350618362426758, "learning_rate": 1.0543829163169516e-06, "loss": 0.42581748962402344, "step": 596 }, { "epoch": 1.740929808568824, "grad_norm": 1.2865227460861206, "learning_rate": 1.031741804128773e-06, "loss": 0.34685325622558594, "step": 597 }, { "epoch": 1.7438468550592525, "grad_norm": 1.2079373598098755, "learning_rate": 1.0093331992348154e-06, "loss": 0.48401936888694763, "step": 598 }, { "epoch": 1.746763901549681, "grad_norm": 1.1684436798095703, "learning_rate": 9.871576825911577e-07, "loss": 0.387456476688385, "step": 599 }, { "epoch": 1.7496809480401094, "grad_norm": 1.298045039176941, "learning_rate": 9.65215829110927e-07, "loss": 0.40196847915649414, "step": 600 }, { "epoch": 1.7496809480401094, "eval_loss": 0.3965963125228882, "eval_runtime": 912.3102, "eval_samples_per_second": 0.693, "eval_steps_per_second": 0.693, "step": 600 }, { "epoch": 1.7525979945305379, "grad_norm": 1.24501371383667, "learning_rate": 9.435082076493974e-07, "loss": 0.3990224003791809, "step": 601 }, { "epoch": 1.7555150410209661, "grad_norm": 1.0634632110595703, "learning_rate": 9.220353809892435e-07, "loss": 0.44232451915740967, "step": 602 }, { "epoch": 1.7584320875113946, "grad_norm": 1.0276325941085815, "learning_rate": 9.007979058259475e-07, "loss": 0.5336061716079712, "step": 603 }, { "epoch": 1.761349134001823, "grad_norm": 1.1488786935806274, "learning_rate": 8.797963327533698e-07, "loss": 0.35023194551467896, "step": 604 }, { "epoch": 1.7642661804922515, "grad_norm": 1.171109676361084, "learning_rate": 8.590312062494699e-07, "loss": 0.4461829662322998, "step": 605 }, { "epoch": 1.76718322698268, "grad_norm": 1.3948134183883667, "learning_rate": 8.385030646621938e-07, "loss": 0.3448236584663391, "step": 606 }, { "epoch": 1.7701002734731084, "grad_norm": 1.144608497619629, "learning_rate": 8.18212440195515e-07, "loss": 0.39913487434387207, "step": 607 }, { "epoch": 1.7730173199635368, "grad_norm": 1.1941088438034058, "learning_rate": 7.981598588956396e-07, "loss": 0.40005186200141907, "step": 608 }, { "epoch": 1.7759343664539653, "grad_norm": 1.1087690591812134, "learning_rate": 7.783458406373656e-07, "loss": 0.38895174860954285, "step": 609 }, { "epoch": 1.7788514129443938, "grad_norm": 1.1787676811218262, "learning_rate": 7.587708991106069e-07, "loss": 0.36259594559669495, "step": 610 }, { "epoch": 1.7817684594348222, "grad_norm": 1.1265360116958618, "learning_rate": 7.394355418070731e-07, "loss": 0.44475269317626953, "step": 611 }, { "epoch": 1.7846855059252507, "grad_norm": 1.2230898141860962, "learning_rate": 7.203402700071138e-07, "loss": 0.3823542594909668, "step": 612 }, { "epoch": 1.7876025524156791, "grad_norm": 1.0893492698669434, "learning_rate": 7.01485578766724e-07, "loss": 0.43276944756507874, "step": 613 }, { "epoch": 1.7905195989061076, "grad_norm": 1.039494514465332, "learning_rate": 6.828719569047082e-07, "loss": 0.5362570881843567, "step": 614 }, { "epoch": 1.793436645396536, "grad_norm": 1.0307413339614868, "learning_rate": 6.644998869900054e-07, "loss": 0.34828731417655945, "step": 615 }, { "epoch": 1.7963536918869645, "grad_norm": 1.1253540515899658, "learning_rate": 6.463698453291823e-07, "loss": 0.3669811487197876, "step": 616 }, { "epoch": 1.799270738377393, "grad_norm": 1.1103028059005737, "learning_rate": 6.28482301954082e-07, "loss": 0.3868233561515808, "step": 617 }, { "epoch": 1.8021877848678214, "grad_norm": 1.0804798603057861, "learning_rate": 6.108377206096394e-07, "loss": 0.4123673439025879, "step": 618 }, { "epoch": 1.8051048313582498, "grad_norm": 1.1068788766860962, "learning_rate": 5.934365587418567e-07, "loss": 0.44468799233436584, "step": 619 }, { "epoch": 1.8080218778486783, "grad_norm": 1.0318645238876343, "learning_rate": 5.762792674859474e-07, "loss": 0.3586595356464386, "step": 620 }, { "epoch": 1.8109389243391067, "grad_norm": 1.1553035974502563, "learning_rate": 5.593662916546361e-07, "loss": 0.4580552577972412, "step": 621 }, { "epoch": 1.8138559708295352, "grad_norm": 1.3010531663894653, "learning_rate": 5.426980697266271e-07, "loss": 0.42412641644477844, "step": 622 }, { "epoch": 1.8167730173199637, "grad_norm": 1.1858006715774536, "learning_rate": 5.262750338352418e-07, "loss": 0.38257676362991333, "step": 623 }, { "epoch": 1.8196900638103921, "grad_norm": 1.1341536045074463, "learning_rate": 5.100976097572074e-07, "loss": 0.48365846276283264, "step": 624 }, { "epoch": 1.8226071103008206, "grad_norm": 1.112844467163086, "learning_rate": 4.941662169016237e-07, "loss": 0.3893233835697174, "step": 625 }, { "epoch": 1.825524156791249, "grad_norm": 1.1846497058868408, "learning_rate": 4.784812682990903e-07, "loss": 0.38869139552116394, "step": 626 }, { "epoch": 1.8284412032816773, "grad_norm": 1.1383928060531616, "learning_rate": 4.6304317059099326e-07, "loss": 0.36156678199768066, "step": 627 }, { "epoch": 1.8313582497721057, "grad_norm": 1.0891298055648804, "learning_rate": 4.478523240189703e-07, "loss": 0.40910348296165466, "step": 628 }, { "epoch": 1.8342752962625342, "grad_norm": 1.1337662935256958, "learning_rate": 4.3290912241452545e-07, "loss": 0.3360365629196167, "step": 629 }, { "epoch": 1.8371923427529626, "grad_norm": 1.280463695526123, "learning_rate": 4.182139531888263e-07, "loss": 0.44318532943725586, "step": 630 }, { "epoch": 1.840109389243391, "grad_norm": 1.1408170461654663, "learning_rate": 4.0376719732265647e-07, "loss": 0.37003564834594727, "step": 631 }, { "epoch": 1.8430264357338195, "grad_norm": 0.9730168581008911, "learning_rate": 3.8956922935653895e-07, "loss": 0.355985552072525, "step": 632 }, { "epoch": 1.845943482224248, "grad_norm": 1.0643151998519897, "learning_rate": 3.756204173810263e-07, "loss": 0.3911808729171753, "step": 633 }, { "epoch": 1.8488605287146764, "grad_norm": 1.1769851446151733, "learning_rate": 3.61921123027158e-07, "loss": 0.314385324716568, "step": 634 }, { "epoch": 1.8517775752051049, "grad_norm": 0.921336829662323, "learning_rate": 3.484717014570838e-07, "loss": 0.3375144302845001, "step": 635 }, { "epoch": 1.8546946216955331, "grad_norm": 0.9904773235321045, "learning_rate": 3.3527250135485744e-07, "loss": 0.4461369514465332, "step": 636 }, { "epoch": 1.8576116681859616, "grad_norm": 1.0844534635543823, "learning_rate": 3.223238649173954e-07, "loss": 0.398414671421051, "step": 637 }, { "epoch": 1.86052871467639, "grad_norm": 0.9829220771789551, "learning_rate": 3.096261278456048e-07, "loss": 0.35938704013824463, "step": 638 }, { "epoch": 1.8634457611668185, "grad_norm": 1.13048255443573, "learning_rate": 2.971796193356835e-07, "loss": 0.3783624768257141, "step": 639 }, { "epoch": 1.866362807657247, "grad_norm": 1.4307893514633179, "learning_rate": 2.8498466207058095e-07, "loss": 0.3601874113082886, "step": 640 }, { "epoch": 1.8692798541476754, "grad_norm": 1.1835116147994995, "learning_rate": 2.7304157221163753e-07, "loss": 0.43897169828414917, "step": 641 }, { "epoch": 1.8721969006381038, "grad_norm": 1.0730469226837158, "learning_rate": 2.613506593903825e-07, "loss": 0.4407995343208313, "step": 642 }, { "epoch": 1.8751139471285323, "grad_norm": 0.9504678845405579, "learning_rate": 2.499122267005105e-07, "loss": 0.4105035960674286, "step": 643 }, { "epoch": 1.8780309936189608, "grad_norm": 1.2599385976791382, "learning_rate": 2.387265706900199e-07, "loss": 0.41521430015563965, "step": 644 }, { "epoch": 1.8809480401093892, "grad_norm": 1.035783052444458, "learning_rate": 2.2779398135353127e-07, "loss": 0.33491846919059753, "step": 645 }, { "epoch": 1.8838650865998177, "grad_norm": 1.1612690687179565, "learning_rate": 2.1711474212476325e-07, "loss": 0.3367970287799835, "step": 646 }, { "epoch": 1.8867821330902461, "grad_norm": 1.2541207075119019, "learning_rate": 2.066891298691831e-07, "loss": 0.46374717354774475, "step": 647 }, { "epoch": 1.8896991795806746, "grad_norm": 1.1037088632583618, "learning_rate": 1.9651741487683562e-07, "loss": 0.3799871802330017, "step": 648 }, { "epoch": 1.892616226071103, "grad_norm": 1.3611476421356201, "learning_rate": 1.8659986085532988e-07, "loss": 0.40523889660835266, "step": 649 }, { "epoch": 1.8955332725615315, "grad_norm": 1.1628823280334473, "learning_rate": 1.7693672492300473e-07, "loss": 0.38399839401245117, "step": 650 }, { "epoch": 1.8955332725615315, "eval_loss": 0.3949255049228668, "eval_runtime": 903.6455, "eval_samples_per_second": 0.699, "eval_steps_per_second": 0.699, "step": 650 }, { "epoch": 1.89845031905196, "grad_norm": 1.1185522079467773, "learning_rate": 1.675282576022641e-07, "loss": 0.4280855059623718, "step": 651 }, { "epoch": 1.9013673655423884, "grad_norm": 1.1962717771530151, "learning_rate": 1.5837470281307666e-07, "loss": 0.3026162087917328, "step": 652 }, { "epoch": 1.9042844120328168, "grad_norm": 1.1818240880966187, "learning_rate": 1.4947629786666084e-07, "loss": 0.43283963203430176, "step": 653 }, { "epoch": 1.9072014585232453, "grad_norm": 1.161944031715393, "learning_rate": 1.4083327345932208e-07, "loss": 0.435259610414505, "step": 654 }, { "epoch": 1.9101185050136738, "grad_norm": 1.1311709880828857, "learning_rate": 1.32445853666483e-07, "loss": 0.3258042633533478, "step": 655 }, { "epoch": 1.9130355515041022, "grad_norm": 1.0152852535247803, "learning_rate": 1.2431425593686263e-07, "loss": 0.40951770544052124, "step": 656 }, { "epoch": 1.9159525979945307, "grad_norm": 1.2698794603347778, "learning_rate": 1.164386910868498e-07, "loss": 0.3610893785953522, "step": 657 }, { "epoch": 1.9188696444849591, "grad_norm": 1.1092722415924072, "learning_rate": 1.0881936329502851e-07, "loss": 0.31951773166656494, "step": 658 }, { "epoch": 1.9217866909753876, "grad_norm": 1.2378597259521484, "learning_rate": 1.0145647009689008e-07, "loss": 0.3756055235862732, "step": 659 }, { "epoch": 1.924703737465816, "grad_norm": 1.0100237131118774, "learning_rate": 9.43502023797116e-08, "loss": 0.26117536425590515, "step": 660 }, { "epoch": 1.9276207839562443, "grad_norm": 1.2368487119674683, "learning_rate": 8.750074437760325e-08, "loss": 0.3092282712459564, "step": 661 }, { "epoch": 1.9305378304466727, "grad_norm": 1.0328837633132935, "learning_rate": 8.090827366673548e-08, "loss": 0.4076297879219055, "step": 662 }, { "epoch": 1.9334548769371012, "grad_norm": 0.9885771870613098, "learning_rate": 7.457296116073487e-08, "loss": 0.40007251501083374, "step": 663 }, { "epoch": 1.9363719234275296, "grad_norm": 1.19287109375, "learning_rate": 6.849497110625214e-08, "loss": 0.3751019239425659, "step": 664 }, { "epoch": 1.939288969917958, "grad_norm": 1.134682536125183, "learning_rate": 6.267446107870334e-08, "loss": 0.4558236300945282, "step": 665 }, { "epoch": 1.9422060164083865, "grad_norm": 3.414883852005005, "learning_rate": 5.7111581978185336e-08, "loss": 0.5070392489433289, "step": 666 }, { "epoch": 1.945123062898815, "grad_norm": 1.179479956626892, "learning_rate": 5.180647802556671e-08, "loss": 0.389989972114563, "step": 667 }, { "epoch": 1.9480401093892434, "grad_norm": 1.1473273038864136, "learning_rate": 4.675928675874186e-08, "loss": 0.460910826921463, "step": 668 }, { "epoch": 1.9509571558796717, "grad_norm": 0.9269355535507202, "learning_rate": 4.197013902907165e-08, "loss": 0.5488728284835815, "step": 669 }, { "epoch": 1.9538742023701001, "grad_norm": 1.1781370639801025, "learning_rate": 3.7439158997989445e-08, "loss": 0.39483463764190674, "step": 670 }, { "epoch": 1.9567912488605286, "grad_norm": 1.1759430170059204, "learning_rate": 3.316646413377811e-08, "loss": 0.38600990176200867, "step": 671 }, { "epoch": 1.959708295350957, "grad_norm": 1.1981792449951172, "learning_rate": 2.9152165208529147e-08, "loss": 0.4657193422317505, "step": 672 }, { "epoch": 1.9626253418413855, "grad_norm": 1.186043620109558, "learning_rate": 2.5396366295272756e-08, "loss": 0.46212077140808105, "step": 673 }, { "epoch": 1.965542388331814, "grad_norm": 1.115103840827942, "learning_rate": 2.1899164765271096e-08, "loss": 0.4416077733039856, "step": 674 }, { "epoch": 1.9684594348222424, "grad_norm": 1.2150691747665405, "learning_rate": 1.866065128550365e-08, "loss": 0.3557685911655426, "step": 675 }, { "epoch": 1.9713764813126708, "grad_norm": 1.096506953239441, "learning_rate": 1.5680909816309098e-08, "loss": 0.32865390181541443, "step": 676 }, { "epoch": 1.9742935278030993, "grad_norm": 1.0974191427230835, "learning_rate": 1.2960017609213727e-08, "loss": 0.37568721175193787, "step": 677 }, { "epoch": 1.9772105742935278, "grad_norm": 1.1290082931518555, "learning_rate": 1.0498045204924145e-08, "loss": 0.329836905002594, "step": 678 }, { "epoch": 1.9801276207839562, "grad_norm": 1.0609803199768066, "learning_rate": 8.295056431504301e-09, "loss": 0.2694982886314392, "step": 679 }, { "epoch": 1.9830446672743847, "grad_norm": 0.9838472604751587, "learning_rate": 6.3511084027156885e-09, "loss": 0.4270719587802887, "step": 680 }, { "epoch": 1.9859617137648131, "grad_norm": 1.1900098323822021, "learning_rate": 4.666251516536324e-09, "loss": 0.4060650169849396, "step": 681 }, { "epoch": 1.9888787602552416, "grad_norm": 0.9812174439430237, "learning_rate": 3.2405294538606637e-09, "loss": 0.3900409936904907, "step": 682 }, { "epoch": 1.99179580674567, "grad_norm": 1.1988210678100586, "learning_rate": 2.073979177357188e-09, "loss": 0.3999583125114441, "step": 683 }, { "epoch": 1.9947128532360985, "grad_norm": 0.9738736152648926, "learning_rate": 1.1666309305202738e-09, "loss": 0.46780622005462646, "step": 684 }, { "epoch": 1.997629899726527, "grad_norm": 0.9841824173927307, "learning_rate": 5.18508236878601e-10, "loss": 0.4595794975757599, "step": 685 }, { "epoch": 2.0, "grad_norm": 1.0865421295166016, "learning_rate": 1.2962789938897323e-10, "loss": 0.5136060118675232, "step": 686 } ], "logging_steps": 1, "max_steps": 686, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.317102071220797e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }