diff --git "a/all_experiments_details.json" "b/all_experiments_details.json" --- "a/all_experiments_details.json" +++ "b/all_experiments_details.json" @@ -4,7 +4,8 @@ "BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507", "DATASET_ID": [ "u-10bei/structured_data_with_cot_dataset_512", - "u-10bei/structured_data_with_cot_dataset_512_v2u-10bei/structured_data_with_cot_dataset_512_v4", + "u-10bei/structured_data_with_cot_dataset_512_v2", + "u-10bei/structured_data_with_cot_dataset_512_v4", "u-10bei/structured_data_with_cot_dataset_512_v5", "u-10bei/structured_data_with_cot_dataset_v2" ], @@ -12,8 +13,8 @@ "SEED": 3407, "VAL_RATIO": 0.05, "MAX_SEQ_LEN": 512, - "LORA_R": 64, - "LORA_ALPHA": 128, + "LORA_R": 32, + "LORA_ALPHA": 64, "LORA_DROPOUT": 0.0, "LORA_TARGET_MODULES": [ "q_proj", @@ -24,11 +25,11 @@ "up_proj", "down_proj" ], - "EPOCHS": 5, + "EPOCHS": 1, "PER_DEVICE_TRAIN_BS": 2, "PER_DEVICE_EVAL_BS": 2, "GRAD_ACCUM": 8, - "LR": 1e-06, + "LR": 0.0001, "WARMUP_RATIO": 0.1, "WEIGHT_DECAY": 0.05, "MAX_STEPS": -1, @@ -52,2572 +53,945 @@ }, "history": [ { - "loss": 1.8907, - "grad_norm": 3.5059776306152344, - "learning_rate": 3.010033444816053e-08, - "epoch": 0.016760946993505135, + "loss": 1.9109, + "grad_norm": 3.2909586429595947, + "learning_rate": 8.181818181818183e-06, + "epoch": 0.009169054441260744, "step": 10 }, { - "loss": 2.1073, - "grad_norm": 5.140995502471924, - "learning_rate": 6.354515050167224e-08, - "epoch": 0.03352189398701027, + "loss": 1.5949, + "grad_norm": 0.8696144223213196, + "learning_rate": 1.7272727272727274e-05, + "epoch": 0.01833810888252149, "step": 20 }, { - "loss": 2.1196, - "grad_norm": 4.122618198394775, - "learning_rate": 9.698996655518395e-08, - "epoch": 0.0502828409805154, + "loss": 1.4542, + "grad_norm": 0.41898515820503235, + "learning_rate": 2.636363636363636e-05, + "epoch": 0.027507163323782235, "step": 30 }, { - "loss": 1.9858, - "grad_norm": 4.268527984619141, - "learning_rate": 1.3043478260869563e-07, - "epoch": 0.06704378797402054, + "loss": 1.241, + "grad_norm": 0.42209118604660034, + "learning_rate": 3.545454545454546e-05, + "epoch": 0.03667621776504298, "step": 40 }, { - "loss": 2.0532, - "grad_norm": 3.432629108428955, - "learning_rate": 1.6387959866220735e-07, - "epoch": 0.08380473496752566, + "loss": 1.3934, + "grad_norm": 0.7735824584960938, + "learning_rate": 4.454545454545455e-05, + "epoch": 0.045845272206303724, "step": 50 }, { - "eval_loss": 2.1637799739837646, - "eval_runtime": 79.031, - "eval_samples_per_second": 6.301, - "eval_steps_per_second": 3.151, - "epoch": 0.08380473496752566, + "eval_loss": 1.1486613750457764, + "eval_runtime": 157.2295, + "eval_samples_per_second": 5.902, + "eval_steps_per_second": 2.951, + "epoch": 0.045845272206303724, "step": 50 }, { - "loss": 1.9455, - "grad_norm": 3.9416215419769287, - "learning_rate": 1.9732441471571906e-07, - "epoch": 0.1005656819610308, + "loss": 1.1159, + "grad_norm": 0.4702383875846863, + "learning_rate": 5.363636363636364e-05, + "epoch": 0.05501432664756447, "step": 60 }, { - "loss": 2.0286, - "grad_norm": 3.982705593109131, - "learning_rate": 2.3076923076923078e-07, - "epoch": 0.11732662895453592, + "loss": 1.0729, + "grad_norm": 0.4587116539478302, + "learning_rate": 6.272727272727273e-05, + "epoch": 0.06418338108882521, "step": 70 }, { - "loss": 2.02, - "grad_norm": 5.758102893829346, - "learning_rate": 2.6421404682274245e-07, - "epoch": 0.13408757594804108, + "loss": 1.049, + "grad_norm": 0.5562591552734375, + "learning_rate": 7.181818181818182e-05, + "epoch": 0.07335243553008595, "step": 80 }, { - "loss": 1.964, - "grad_norm": 5.359775066375732, - "learning_rate": 2.976588628762542e-07, - "epoch": 0.1508485229415462, + "loss": 0.9021, + "grad_norm": 0.6512398719787598, + "learning_rate": 8.090909090909092e-05, + "epoch": 0.0825214899713467, "step": 90 }, { - "loss": 2.0269, - "grad_norm": 4.892724990844727, - "learning_rate": 3.311036789297659e-07, - "epoch": 0.16760946993505133, + "loss": 0.9578, + "grad_norm": 0.6464130878448486, + "learning_rate": 9e-05, + "epoch": 0.09169054441260745, "step": 100 }, { - "eval_loss": 2.112635612487793, - "eval_runtime": 78.9151, - "eval_samples_per_second": 6.311, - "eval_steps_per_second": 3.155, - "epoch": 0.16760946993505133, + "eval_loss": 0.8903455138206482, + "eval_runtime": 149.8861, + "eval_samples_per_second": 6.191, + "eval_steps_per_second": 3.096, + "epoch": 0.09169054441260745, "step": 100 }, { - "loss": 2.1211, - "grad_norm": 4.311831474304199, - "learning_rate": 3.6454849498327757e-07, - "epoch": 0.18437041692855646, + "loss": 0.9056, + "grad_norm": 0.5816523432731628, + "learning_rate": 9.909090909090911e-05, + "epoch": 0.1008595988538682, "step": 110 }, { - "loss": 1.7914, - "grad_norm": 3.819464683532715, - "learning_rate": 3.979933110367893e-07, - "epoch": 0.2011313639220616, + "loss": 0.8961, + "grad_norm": 0.6272807121276855, + "learning_rate": 9.997923381619256e-05, + "epoch": 0.11002865329512894, "step": 120 }, { - "loss": 1.958, - "grad_norm": 3.3366334438323975, - "learning_rate": 4.3143812709030095e-07, - "epoch": 0.21789231091556674, + "loss": 0.8718, + "grad_norm": 0.3907865881919861, + "learning_rate": 9.990747162241872e-05, + "epoch": 0.11919770773638969, "step": 130 }, { - "loss": 1.9575, - "grad_norm": 3.8945889472961426, - "learning_rate": 4.648829431438127e-07, - "epoch": 0.23465325790907185, + "loss": 0.8814, + "grad_norm": 0.4261883497238159, + "learning_rate": 9.978453061876696e-05, + "epoch": 0.12836676217765042, "step": 140 }, { - "loss": 2.0176, - "grad_norm": 3.6403913497924805, - "learning_rate": 4.983277591973244e-07, - "epoch": 0.251414204902577, + "loss": 0.9294, + "grad_norm": 0.3960123658180237, + "learning_rate": 9.96105368780285e-05, + "epoch": 0.13753581661891118, "step": 150 }, { - "eval_loss": 1.9623165130615234, - "eval_runtime": 78.7041, - "eval_samples_per_second": 6.327, - "eval_steps_per_second": 3.164, - "epoch": 0.251414204902577, + "eval_loss": 0.8402041792869568, + "eval_runtime": 153.7211, + "eval_samples_per_second": 6.037, + "eval_steps_per_second": 3.018, + "epoch": 0.13753581661891118, "step": 150 }, { - "loss": 1.7846, - "grad_norm": 2.4720418453216553, - "learning_rate": 5.317725752508361e-07, - "epoch": 0.26817515189608215, + "loss": 0.8915, + "grad_norm": 0.45985275506973267, + "learning_rate": 9.938566882624436e-05, + "epoch": 0.1467048710601719, "step": 160 }, { - "loss": 1.7023, - "grad_norm": 2.5866756439208984, - "learning_rate": 5.652173913043477e-07, - "epoch": 0.28493609888958726, + "loss": 0.8776, + "grad_norm": 0.4007140100002289, + "learning_rate": 9.9110157059734e-05, + "epoch": 0.15587392550143267, "step": 170 }, { - "loss": 1.6763, - "grad_norm": 2.9348649978637695, - "learning_rate": 5.986622073578596e-07, - "epoch": 0.3016970458830924, + "loss": 0.8804, + "grad_norm": 0.5031015872955322, + "learning_rate": 9.878428410862483e-05, + "epoch": 0.1650429799426934, "step": 180 }, { - "loss": 1.5781, - "grad_norm": 2.411113739013672, - "learning_rate": 6.321070234113712e-07, - "epoch": 0.31845799287659754, + "loss": 0.8733, + "grad_norm": 0.3346236050128937, + "learning_rate": 9.840838414712501e-05, + "epoch": 0.17421203438395416, "step": 190 }, { - "loss": 1.6026, - "grad_norm": 2.2124366760253906, - "learning_rate": 6.655518394648829e-07, - "epoch": 0.33521893987010265, + "loss": 0.7578, + "grad_norm": 0.37552139163017273, + "learning_rate": 9.798284265083642e-05, + "epoch": 0.1833810888252149, "step": 200 }, { - "eval_loss": 1.7386550903320312, - "eval_runtime": 78.883, - "eval_samples_per_second": 6.313, - "eval_steps_per_second": 3.157, - "epoch": 0.33521893987010265, + "eval_loss": 0.8151949048042297, + "eval_runtime": 147.771, + "eval_samples_per_second": 6.28, + "eval_steps_per_second": 3.14, + "epoch": 0.1833810888252149, "step": 200 }, { - "loss": 1.4584, - "grad_norm": 1.65778386592865, - "learning_rate": 6.989966555183946e-07, - "epoch": 0.3519798868636078, + "loss": 0.8844, + "grad_norm": 0.3008505403995514, + "learning_rate": 9.750809600145954e-05, + "epoch": 0.19255014326647565, "step": 210 }, { - "loss": 1.4641, - "grad_norm": 1.6079872846603394, - "learning_rate": 7.324414715719063e-07, - "epoch": 0.3687408338571129, + "loss": 0.806, + "grad_norm": 0.40879538655281067, + "learning_rate": 9.698463103929542e-05, + "epoch": 0.2017191977077364, "step": 220 }, { - "loss": 1.4958, - "grad_norm": 1.3282182216644287, - "learning_rate": 7.658862876254181e-07, - "epoch": 0.38550178085061804, + "loss": 0.757, + "grad_norm": 0.2864895164966583, + "learning_rate": 9.641298456400363e-05, + "epoch": 0.21088825214899715, "step": 230 }, { - "loss": 1.5744, - "grad_norm": 0.7879586815834045, - "learning_rate": 7.993311036789297e-07, - "epoch": 0.4022627278441232, + "loss": 0.8971, + "grad_norm": 0.4410439431667328, + "learning_rate": 9.579374278412819e-05, + "epoch": 0.22005730659025788, "step": 240 }, { - "loss": 1.6045, - "grad_norm": 0.9567638635635376, - "learning_rate": 8.327759197324414e-07, - "epoch": 0.4190236748376283, + "loss": 0.8188, + "grad_norm": 0.3572053611278534, + "learning_rate": 9.512754071595605e-05, + "epoch": 0.22922636103151864, "step": 250 }, { - "eval_loss": 1.5930449962615967, - "eval_runtime": 80.1329, - "eval_samples_per_second": 6.215, - "eval_steps_per_second": 3.107, - "epoch": 0.4190236748376283, + "eval_loss": 0.8022013902664185, + "eval_runtime": 149.5554, + "eval_samples_per_second": 6.205, + "eval_steps_per_second": 3.103, + "epoch": 0.22922636103151864, "step": 250 }, { - "loss": 1.5604, - "grad_norm": 0.7901080250740051, - "learning_rate": 8.662207357859531e-07, - "epoch": 0.4357846218311335, + "loss": 0.8794, + "grad_norm": 0.3161059617996216, + "learning_rate": 9.441506153232443e-05, + "epoch": 0.23839541547277937, "step": 260 }, { - "loss": 1.3883, - "grad_norm": 0.6865659952163696, - "learning_rate": 8.996655518394648e-07, - "epoch": 0.4525455688246386, + "loss": 0.7863, + "grad_norm": 0.3244767487049103, + "learning_rate": 9.365703586204496e-05, + "epoch": 0.2475644699140401, "step": 270 }, { - "loss": 1.4236, - "grad_norm": 0.8144264221191406, - "learning_rate": 9.331103678929766e-07, - "epoch": 0.4693065158181437, + "loss": 0.9084, + "grad_norm": 0.40793028473854065, + "learning_rate": 9.285424104066276e-05, + "epoch": 0.25673352435530084, "step": 280 }, { - "loss": 1.3641, - "grad_norm": 0.6162766218185425, - "learning_rate": 9.665551839464883e-07, - "epoch": 0.48606746281164886, + "loss": 0.8005, + "grad_norm": 0.2878340184688568, + "learning_rate": 9.20075003133193e-05, + "epoch": 0.2659025787965616, "step": 290 }, { - "loss": 1.3853, - "grad_norm": 0.7314286231994629, - "learning_rate": 1e-06, - "epoch": 0.502828409805154, + "loss": 0.8717, + "grad_norm": 0.42775702476501465, + "learning_rate": 9.111768199053588e-05, + "epoch": 0.27507163323782235, "step": 300 }, { - "eval_loss": 1.5064737796783447, - "eval_runtime": 78.6727, - "eval_samples_per_second": 6.33, - "eval_steps_per_second": 3.165, - "epoch": 0.502828409805154, + "eval_loss": 0.7930753231048584, + "eval_runtime": 159.9775, + "eval_samples_per_second": 5.801, + "eval_steps_per_second": 2.9, + "epoch": 0.27507163323782235, "step": 300 }, { - "loss": 1.3806, - "grad_norm": 0.5293618440628052, - "learning_rate": 9.999658002614155e-07, - "epoch": 0.5195893567986591, + "loss": 0.9456, + "grad_norm": 0.44111281633377075, + "learning_rate": 9.018569855778383e-05, + "epoch": 0.2842406876790831, "step": 310 }, { - "loss": 1.6231, - "grad_norm": 0.5351464748382568, - "learning_rate": 9.998632057241507e-07, - "epoch": 0.5363503037921643, + "loss": 0.8296, + "grad_norm": 0.29679298400878906, + "learning_rate": 8.921250573975456e-05, + "epoch": 0.2934097421203438, "step": 320 }, { - "loss": 1.3607, - "grad_norm": 0.5800453424453735, - "learning_rate": 9.99692230423031e-07, - "epoch": 0.5531112507856694, + "loss": 0.7403, + "grad_norm": 0.3140794634819031, + "learning_rate": 8.819910152028872e-05, + "epoch": 0.3025787965616046, "step": 330 }, { - "loss": 1.4255, - "grad_norm": 0.5451828241348267, - "learning_rate": 9.994528977472987e-07, - "epoch": 0.5698721977791745, + "loss": 0.8083, + "grad_norm": 0.2960476279258728, + "learning_rate": 8.714652511896994e-05, + "epoch": 0.31174785100286534, "step": 340 }, { - "loss": 1.458, - "grad_norm": 0.6325808167457581, - "learning_rate": 9.991452404374136e-07, - "epoch": 0.5866331447726797, + "loss": 0.7474, + "grad_norm": 0.5192521214485168, + "learning_rate": 8.605585592543212e-05, + "epoch": 0.3209169054441261, "step": 350 }, { - "eval_loss": 1.4532791376113892, - "eval_runtime": 78.8426, - "eval_samples_per_second": 6.316, - "eval_steps_per_second": 3.158, - "epoch": 0.5866331447726797, + "eval_loss": 0.7890114784240723, + "eval_runtime": 156.9758, + "eval_samples_per_second": 5.912, + "eval_steps_per_second": 2.956, + "epoch": 0.3209169054441261, "step": 350 }, { - "loss": 1.2395, - "grad_norm": 0.582156777381897, - "learning_rate": 9.98769300580574e-07, - "epoch": 0.6033940917661847, + "loss": 0.8881, + "grad_norm": 0.26124686002731323, + "learning_rate": 8.492821239247364e-05, + "epoch": 0.3300859598853868, "step": 360 }, { - "loss": 1.2645, - "grad_norm": 0.6619167923927307, - "learning_rate": 9.983251296049592e-07, - "epoch": 0.6201550387596899, + "loss": 0.9664, + "grad_norm": 1.590624213218689, + "learning_rate": 8.376475088911317e-05, + "epoch": 0.33925501432664756, "step": 370 }, { - "loss": 1.2875, - "grad_norm": 0.5032066106796265, - "learning_rate": 9.978127882726946e-07, - "epoch": 0.6369159857531951, + "loss": 0.7176, + "grad_norm": 0.3093242943286896, + "learning_rate": 8.256666451476337e-05, + "epoch": 0.3484240687679083, "step": 380 }, { - "loss": 1.3396, - "grad_norm": 0.739372968673706, - "learning_rate": 9.97232346671538e-07, - "epoch": 0.6536769327467001, + "loss": 0.9026, + "grad_norm": 0.5042882561683655, + "learning_rate": 8.133518187573862e-05, + "epoch": 0.35759312320916903, "step": 390 }, { - "loss": 1.4678, - "grad_norm": 0.7378211617469788, - "learning_rate": 9.96583884205294e-07, - "epoch": 0.6704378797402053, + "loss": 0.7752, + "grad_norm": 0.3653980791568756, + "learning_rate": 8.007156582535131e-05, + "epoch": 0.3667621776504298, "step": 400 }, { - "eval_loss": 1.4118926525115967, - "eval_runtime": 80.1337, - "eval_samples_per_second": 6.215, + "eval_loss": 0.7844048738479614, + "eval_runtime": 149.3433, + "eval_samples_per_second": 6.214, "eval_steps_per_second": 3.107, - "epoch": 0.6704378797402053, + "epoch": 0.3667621776504298, "step": 400 }, { - "loss": 1.2475, - "grad_norm": 0.5222254991531372, - "learning_rate": 9.958674895829497e-07, - "epoch": 0.6871988267337105, + "loss": 0.8571, + "grad_norm": 0.23480232059955597, + "learning_rate": 7.877711216888867e-05, + "epoch": 0.37593123209169055, "step": 410 }, { - "loss": 1.4585, - "grad_norm": 0.8027982115745544, - "learning_rate": 9.950832608065402e-07, - "epoch": 0.7039597737272156, + "loss": 0.769, + "grad_norm": 0.34751203656196594, + "learning_rate": 7.745314833479833e-05, + "epoch": 0.3851002865329513, "step": 420 }, { - "loss": 1.3031, - "grad_norm": 0.5209378600120544, - "learning_rate": 9.942313051577426e-07, - "epoch": 0.7207207207207207, + "loss": 0.7989, + "grad_norm": 0.2936677932739258, + "learning_rate": 7.6101032013445e-05, + "epoch": 0.394269340974212, "step": 430 }, { - "loss": 1.4187, - "grad_norm": 0.6038073897361755, - "learning_rate": 9.933117391831984e-07, - "epoch": 0.7374816677142259, + "loss": 0.789, + "grad_norm": 0.3774430751800537, + "learning_rate": 7.472214976483452e-05, + "epoch": 0.4034383954154728, "step": 440 }, { - "loss": 1.335, - "grad_norm": 0.7092902660369873, - "learning_rate": 9.923246886785712e-07, - "epoch": 0.754242614707731, + "loss": 0.7272, + "grad_norm": 0.3077329099178314, + "learning_rate": 7.33179155967327e-05, + "epoch": 0.41260744985673353, "step": 450 }, { - "eval_loss": 1.37515389919281, - "eval_runtime": 78.7004, - "eval_samples_per_second": 6.328, - "eval_steps_per_second": 3.164, - "epoch": 0.754242614707731, + "eval_loss": 0.7796526551246643, + "eval_runtime": 152.2118, + "eval_samples_per_second": 6.097, + "eval_steps_per_second": 3.048, + "epoch": 0.41260744985673353, "step": 450 }, { - "loss": 1.2147, - "grad_norm": 0.6610886454582214, - "learning_rate": 9.912702886713383e-07, - "epoch": 0.7710035617012361, + "loss": 0.7975, + "grad_norm": 0.2882915437221527, + "learning_rate": 7.188976951463723e-05, + "epoch": 0.4217765042979943, "step": 460 }, { - "loss": 1.342, - "grad_norm": 0.6342706084251404, - "learning_rate": 9.901486834023181e-07, - "epoch": 0.7877645086947412, + "loss": 0.9331, + "grad_norm": 0.26507413387298584, + "learning_rate": 7.043917604508971e-05, + "epoch": 0.430945558739255, "step": 470 }, { - "loss": 1.309, - "grad_norm": 0.8199939727783203, - "learning_rate": 9.889600263059384e-07, - "epoch": 0.8045254556882464, + "loss": 0.8037, + "grad_norm": 0.34291401505470276, + "learning_rate": 6.896762273384178e-05, + "epoch": 0.44011461318051576, "step": 480 }, { - "loss": 1.3816, - "grad_norm": 0.8439438939094543, - "learning_rate": 9.87704479989247e-07, - "epoch": 0.8212864026817516, + "loss": 0.7612, + "grad_norm": 0.43661966919898987, + "learning_rate": 6.747661862041585e-05, + "epoch": 0.4492836676217765, "step": 490 }, { - "loss": 1.3429, - "grad_norm": 0.7948578596115112, - "learning_rate": 9.863822162096672e-07, - "epoch": 0.8380473496752566, + "loss": 0.9427, + "grad_norm": 0.37725234031677246, + "learning_rate": 6.596769269062444e-05, + "epoch": 0.4584527220630373, "step": 500 }, { - "eval_loss": 1.341338038444519, - "eval_runtime": 78.8544, - "eval_samples_per_second": 6.315, - "eval_steps_per_second": 3.158, - "epoch": 0.8380473496752566, + "eval_loss": 0.774621307849884, + "eval_runtime": 154.4593, + "eval_samples_per_second": 6.008, + "eval_steps_per_second": 3.004, + "epoch": 0.4584527220630373, "step": 500 }, { - "loss": 1.3624, - "grad_norm": 0.6835984587669373, - "learning_rate": 9.849934158515017e-07, - "epoch": 0.8548082966687618, + "loss": 0.8304, + "grad_norm": 0.30780285596847534, + "learning_rate": 6.444239230863504e-05, + "epoch": 0.467621776504298, "step": 510 }, { - "loss": 1.1831, - "grad_norm": 0.6039908528327942, - "learning_rate": 9.835382689011868e-07, - "epoch": 0.871569243662267, + "loss": 0.715, + "grad_norm": 0.27012601494789124, + "learning_rate": 6.290228163018868e-05, + "epoch": 0.47679083094555874, "step": 520 }, { - "loss": 1.243, - "grad_norm": 0.5669766664505005, - "learning_rate": 9.82016974421304e-07, - "epoch": 0.888330190655772, + "loss": 0.7056, + "grad_norm": 0.2883375585079193, + "learning_rate": 6.134893999859887e-05, + "epoch": 0.4859598853868195, "step": 530 }, { - "loss": 1.2414, - "grad_norm": 0.724523663520813, - "learning_rate": 9.804297405233474e-07, - "epoch": 0.9050911376492772, + "loss": 0.8827, + "grad_norm": 0.3120364248752594, + "learning_rate": 5.97839603251764e-05, + "epoch": 0.4951289398280802, "step": 540 }, { - "loss": 1.2665, - "grad_norm": 0.5642393231391907, - "learning_rate": 9.787767843392544e-07, - "epoch": 0.9218520846427823, + "loss": 0.7032, + "grad_norm": 0.2870006859302521, + "learning_rate": 5.820894745574025e-05, + "epoch": 0.504297994269341, "step": 550 }, { - "eval_loss": 1.3112993240356445, - "eval_runtime": 80.5311, - "eval_samples_per_second": 6.184, - "eval_steps_per_second": 3.092, - "epoch": 0.9218520846427823, + "eval_loss": 0.7724801898002625, + "eval_runtime": 154.0554, + "eval_samples_per_second": 6.024, + "eval_steps_per_second": 3.012, + "epoch": 0.504297994269341, "step": 550 }, { - "loss": 1.2605, - "grad_norm": 0.6140392422676086, - "learning_rate": 9.770583319917029e-07, - "epoch": 0.9386130316362874, + "loss": 0.7996, + "grad_norm": 0.2602643072605133, + "learning_rate": 5.662551652489009e-05, + "epoch": 0.5134670487106017, "step": 560 }, { - "loss": 1.2227, - "grad_norm": 0.6274238228797913, - "learning_rate": 9.752746185631766e-07, - "epoch": 0.9553739786297926, + "loss": 0.7694, + "grad_norm": 0.43268686532974243, + "learning_rate": 5.503529129972792e-05, + "epoch": 0.5226361031518625, "step": 570 }, { - "loss": 1.2693, - "grad_norm": 0.5612761974334717, - "learning_rate": 9.734258880638076e-07, - "epoch": 0.9721349256232977, + "loss": 0.7802, + "grad_norm": 0.3927740156650543, + "learning_rate": 5.34399025147273e-05, + "epoch": 0.5318051575931232, "step": 580 }, { - "loss": 1.3072, - "grad_norm": 0.7661623954772949, - "learning_rate": 9.715123933979953e-07, - "epoch": 0.9888958726168029, + "loss": 0.7649, + "grad_norm": 0.3333654999732971, + "learning_rate": 5.1840986199457606e-05, + "epoch": 0.540974212034384, "step": 590 }, { - "loss": 1.2428, - "grad_norm": 0.649665892124176, - "learning_rate": 9.695343963298086e-07, - "epoch": 1.0050282840980516, + "loss": 0.8586, + "grad_norm": 0.29149171710014343, + "learning_rate": 5.024018200087855e-05, + "epoch": 0.5501432664756447, "step": 600 }, { - "eval_loss": 1.2851722240447998, - "eval_runtime": 79.0258, - "eval_samples_per_second": 6.302, - "eval_steps_per_second": 3.151, - "epoch": 1.0050282840980516, + "eval_loss": 0.7711001038551331, + "eval_runtime": 160.1497, + "eval_samples_per_second": 5.795, + "eval_steps_per_second": 2.897, + "epoch": 0.5501432664756447, "step": 600 }, { - "loss": 1.263, - "grad_norm": 0.6536946892738342, - "learning_rate": 9.674921674471785e-07, - "epoch": 1.0217892310915566, + "loss": 0.8023, + "grad_norm": 0.48755213618278503, + "learning_rate": 4.863913150192481e-05, + "epoch": 0.5593123209169054, "step": 610 }, { - "loss": 1.2098, - "grad_norm": 0.5894418358802795, - "learning_rate": 9.653859861248805e-07, - "epoch": 1.038550178085062, + "loss": 0.737, + "grad_norm": 0.290272057056427, + "learning_rate": 4.703947653810575e-05, + "epoch": 0.5684813753581662, "step": 620 }, { - "loss": 1.2002, - "grad_norm": 0.5719261169433594, - "learning_rate": 9.632161404863174e-07, - "epoch": 1.055311125078567, + "loss": 0.8758, + "grad_norm": 0.2664808928966522, + "learning_rate": 4.544285751384584e-05, + "epoch": 0.5776504297994269, "step": 630 }, { - "loss": 1.3116, - "grad_norm": 0.7122625112533569, - "learning_rate": 9.609829273641032e-07, - "epoch": 1.072072072072072, + "loss": 0.8121, + "grad_norm": 0.7378506660461426, + "learning_rate": 4.3850911720292756e-05, + "epoch": 0.5868194842406876, "step": 640 }, { - "loss": 1.0923, - "grad_norm": 0.8119747042655945, - "learning_rate": 9.586866522594584e-07, - "epoch": 1.0888330190655773, + "loss": 0.7965, + "grad_norm": 0.33212271332740784, + "learning_rate": 4.226527165631801e-05, + "epoch": 0.5959885386819485, "step": 650 }, { - "eval_loss": 1.2604608535766602, - "eval_runtime": 79.0547, - "eval_samples_per_second": 6.299, - "eval_steps_per_second": 3.15, - "epoch": 1.0888330190655773, + "eval_loss": 0.7688117027282715, + "eval_runtime": 157.579, + "eval_samples_per_second": 5.889, + "eval_steps_per_second": 2.945, + "epoch": 0.5959885386819485, "step": 650 }, { - "loss": 1.2157, - "grad_norm": 0.5922772288322449, - "learning_rate": 9.563276293004155e-07, - "epoch": 1.1055939660590823, + "loss": 0.8721, + "grad_norm": 0.45228180289268494, + "learning_rate": 4.0687563354431984e-05, + "epoch": 0.6051575931232092, "step": 660 }, { - "loss": 1.2892, - "grad_norm": 0.6105348467826843, - "learning_rate": 9.53906181198849e-07, - "epoch": 1.1223549130525874, + "loss": 0.9747, + "grad_norm": 0.39334630966186523, + "learning_rate": 3.911940471333002e-05, + "epoch": 0.6143266475644699, "step": 670 }, { - "loss": 1.1476, - "grad_norm": 0.7103086113929749, - "learning_rate": 9.51422639206327e-07, - "epoch": 1.1391158600460927, + "loss": 0.8033, + "grad_norm": 0.2843310534954071, + "learning_rate": 3.756240383877947e-05, + "epoch": 0.6234957020057307, "step": 680 }, { - "loss": 1.108, - "grad_norm": 0.716642439365387, - "learning_rate": 9.488773430687973e-07, - "epoch": 1.1558768070395977, + "loss": 0.8623, + "grad_norm": 0.3566271662712097, + "learning_rate": 3.6018157394549284e-05, + "epoch": 0.6326647564469914, "step": 690 }, { - "loss": 1.3284, - "grad_norm": 0.6440051794052124, - "learning_rate": 9.4627064098011e-07, - "epoch": 1.1726377540331028, + "loss": 0.7507, + "grad_norm": 0.2564896047115326, + "learning_rate": 3.448824896507292e-05, + "epoch": 0.6418338108882522, "step": 700 }, { - "eval_loss": 1.2372387647628784, - "eval_runtime": 78.588, - "eval_samples_per_second": 6.337, - "eval_steps_per_second": 3.168, - "epoch": 1.1726377540331028, + "eval_loss": 0.7661372423171997, + "eval_runtime": 154.0479, + "eval_samples_per_second": 6.024, + "eval_steps_per_second": 3.012, + "epoch": 0.6418338108882522, "step": 700 }, { - "loss": 1.0961, - "grad_norm": 0.5385606288909912, - "learning_rate": 9.436028895343848e-07, - "epoch": 1.189398701026608, + "loss": 0.8036, + "grad_norm": 0.279855340719223, + "learning_rate": 3.297424743152382e-05, + "epoch": 0.6510028653295129, "step": 710 }, { - "loss": 1.1597, - "grad_norm": 0.8156595230102539, - "learning_rate": 9.408744536772303e-07, - "epoch": 1.206159648020113, + "loss": 0.7921, + "grad_norm": 0.3235679268836975, + "learning_rate": 3.14777053629687e-05, + "epoch": 0.6601719197707736, "step": 720 }, { - "loss": 1.1478, - "grad_norm": 0.7858404517173767, - "learning_rate": 9.380857066558184e-07, - "epoch": 1.2229205950136182, + "loss": 0.7928, + "grad_norm": 0.37638944387435913, + "learning_rate": 3.0000157424248575e-05, + "epoch": 0.6693409742120344, "step": 730 }, { - "loss": 1.2074, - "grad_norm": 0.7118250727653503, - "learning_rate": 9.352370299678258e-07, - "epoch": 1.2396815420071234, + "loss": 0.9068, + "grad_norm": 0.40034306049346924, + "learning_rate": 2.8543118802219904e-05, + "epoch": 0.6785100286532951, "step": 740 }, { - "loss": 1.1792, - "grad_norm": 0.7732047438621521, - "learning_rate": 9.323288133092445e-07, - "epoch": 1.2564424890006285, + "loss": 0.8321, + "grad_norm": 0.15902051329612732, + "learning_rate": 2.710808365197e-05, + "epoch": 0.6876790830945558, "step": 750 }, { - "eval_loss": 1.2164807319641113, - "eval_runtime": 78.9387, - "eval_samples_per_second": 6.309, - "eval_steps_per_second": 3.154, - "epoch": 1.2564424890006285, + "eval_loss": 0.7649410367012024, + "eval_runtime": 155.9897, + "eval_samples_per_second": 5.949, + "eval_steps_per_second": 2.975, + "epoch": 0.6876790830945558, "step": 750 }, { - "loss": 0.9987, - "grad_norm": 0.7803258895874023, - "learning_rate": 9.293614545210724e-07, - "epoch": 1.2732034359941338, + "loss": 0.8081, + "grad_norm": 0.22720754146575928, + "learning_rate": 2.5696523564600074e-05, + "epoch": 0.6968481375358166, "step": 760 }, { - "loss": 1.2523, - "grad_norm": 0.69403475522995, - "learning_rate": 9.263353595348891e-07, - "epoch": 1.2899643829876388, + "loss": 0.8296, + "grad_norm": 0.3821977972984314, + "learning_rate": 2.4309886058146912e-05, + "epoch": 0.7060171919770774, "step": 770 }, { - "loss": 1.1585, - "grad_norm": 0.7996134161949158, - "learning_rate": 9.23250942317324e-07, - "epoch": 1.3067253299811439, + "loss": 0.7437, + "grad_norm": 0.23446418344974518, + "learning_rate": 2.2949593093190862e-05, + "epoch": 0.7151862464183381, "step": 780 }, { - "loss": 1.1139, - "grad_norm": 0.6396450400352478, - "learning_rate": 9.201086248134276e-07, - "epoch": 1.323486276974649, + "loss": 0.8172, + "grad_norm": 0.39708301424980164, + "learning_rate": 2.161703961467238e-05, + "epoch": 0.7243553008595989, "step": 790 }, { - "loss": 1.1116, - "grad_norm": 0.6725043058395386, - "learning_rate": 9.169088368889484e-07, - "epoch": 1.3402472239681542, + "loss": 0.7895, + "grad_norm": 0.27752557396888733, + "learning_rate": 2.0313592121412466e-05, + "epoch": 0.7335243553008596, "step": 800 }, { - "eval_loss": 1.1973844766616821, - "eval_runtime": 80.6555, - "eval_samples_per_second": 6.174, - "eval_steps_per_second": 3.087, - "epoch": 1.3402472239681542, + "eval_loss": 0.7634205222129822, + "eval_runtime": 155.4241, + "eval_samples_per_second": 5.971, + "eval_steps_per_second": 2.985, + "epoch": 0.7335243553008596, "step": 800 }, { - "loss": 1.0994, - "grad_norm": 0.7814534306526184, - "learning_rate": 9.136520162715286e-07, - "epoch": 1.3570081709616593, + "loss": 0.7796, + "grad_norm": 0.3197689354419708, + "learning_rate": 1.904058726480367e-05, + "epoch": 0.7426934097421204, "step": 810 }, { - "loss": 1.0118, - "grad_norm": 0.7589584589004517, - "learning_rate": 9.103386084908233e-07, - "epoch": 1.3737691179551645, + "loss": 0.6869, + "grad_norm": 0.2617953419685364, + "learning_rate": 1.7799330478109027e-05, + "epoch": 0.7518624641833811, "step": 820 }, { - "loss": 1.1388, - "grad_norm": 0.7946352958679199, - "learning_rate": 9.069690668175519e-07, - "epoch": 1.3905300649486696, + "loss": 0.9052, + "grad_norm": 0.3640119135379791, + "learning_rate": 1.6591094637774303e-05, + "epoch": 0.7610315186246418, "step": 830 }, { - "loss": 1.1872, - "grad_norm": 1.228780746459961, - "learning_rate": 9.035438522014923e-07, - "epoch": 1.4072910119421747, + "loss": 0.7278, + "grad_norm": 0.3398507237434387, + "learning_rate": 1.541711875812641e-05, + "epoch": 0.7702005730659026, "step": 840 }, { - "loss": 1.1591, - "grad_norm": 0.6566089987754822, - "learning_rate": 9.000634332084219e-07, - "epoch": 1.42405195893568, + "loss": 0.7398, + "grad_norm": 0.3581792116165161, + "learning_rate": 1.4278606720796544e-05, + "epoch": 0.7793696275071633, "step": 850 }, { - "eval_loss": 1.1793015003204346, - "eval_runtime": 80.6691, - "eval_samples_per_second": 6.173, - "eval_steps_per_second": 3.087, - "epoch": 1.42405195893568, + "eval_loss": 0.7617470026016235, + "eval_runtime": 159.2523, + "eval_samples_per_second": 5.827, + "eval_steps_per_second": 2.914, + "epoch": 0.7793696275071633, "step": 850 }, { - "loss": 1.1832, - "grad_norm": 0.8144505023956299, - "learning_rate": 8.9652828595602e-07, - "epoch": 1.440812905929185, + "loss": 0.871, + "grad_norm": 0.4080051779747009, + "learning_rate": 1.3176726040171e-05, + "epoch": 0.788538681948424, "step": 860 }, { - "loss": 0.9717, - "grad_norm": 0.659873366355896, - "learning_rate": 8.929388940487338e-07, - "epoch": 1.4575738529226903, + "loss": 0.877, + "grad_norm": 0.4084082543849945, + "learning_rate": 1.2112606666135602e-05, + "epoch": 0.7977077363896848, "step": 870 }, { - "loss": 1.0355, - "grad_norm": 0.703195333480835, - "learning_rate": 8.892957485116233e-07, - "epoch": 1.4743347999161953, + "loss": 0.8205, + "grad_norm": 0.26790153980255127, + "learning_rate": 1.1087339825341592e-05, + "epoch": 0.8068767908309455, "step": 880 }, { - "loss": 0.9693, - "grad_norm": 0.6595771908760071, - "learning_rate": 8.855993477231883e-07, - "epoch": 1.4910957469097004, + "loss": 0.8441, + "grad_norm": 0.3132439851760864, + "learning_rate": 1.0101976902181226e-05, + "epoch": 0.8160458452722062, "step": 890 }, { - "loss": 1.1721, - "grad_norm": 0.9032782912254333, - "learning_rate": 8.818501973471912e-07, - "epoch": 1.5078566939032054, + "loss": 0.7718, + "grad_norm": 0.41226926445961, + "learning_rate": 9.157528360620415e-06, + "epoch": 0.8252148997134671, "step": 900 }, { - "eval_loss": 1.1632665395736694, - "eval_runtime": 78.9179, - "eval_samples_per_second": 6.31, - "eval_steps_per_second": 3.155, - "epoch": 1.5078566939032054, + "eval_loss": 0.7602015733718872, + "eval_runtime": 159.5675, + "eval_samples_per_second": 5.816, + "eval_steps_per_second": 2.908, + "epoch": 0.8252148997134671, "step": 900 }, { - "loss": 1.1077, - "grad_norm": 0.7769476175308228, - "learning_rate": 8.780488102634836e-07, - "epoch": 1.5246176408967107, + "loss": 0.8352, + "grad_norm": 0.30735114216804504, + "learning_rate": 8.254962707994374e-06, + "epoch": 0.8343839541547278, "step": 910 }, { - "loss": 1.1579, - "grad_norm": 0.8173115849494934, - "learning_rate": 8.741957064978433e-07, - "epoch": 1.5413785878902158, + "loss": 0.796, + "grad_norm": 0.38642844557762146, + "learning_rate": 7.395205501828578e-06, + "epoch": 0.8435530085959886, "step": 920 }, { - "loss": 1.0498, - "grad_norm": 0.774395763874054, - "learning_rate": 8.702914131508365e-07, - "epoch": 1.558139534883721, + "loss": 0.8314, + "grad_norm": 0.3047927916049957, + "learning_rate": 6.579138400703716e-06, + "epoch": 0.8527220630372493, "step": 930 }, { - "loss": 1.0696, - "grad_norm": 0.7452784180641174, - "learning_rate": 8.663364643257104e-07, - "epoch": 1.574900481877226, + "loss": 0.7396, + "grad_norm": 0.26591596007347107, + "learning_rate": 5.807598260137759e-06, + "epoch": 0.86189111747851, "step": 940 }, { - "loss": 1.214, - "grad_norm": 0.7853773236274719, - "learning_rate": 8.623314010553288e-07, - "epoch": 1.5916614288707311, + "loss": 0.7736, + "grad_norm": 0.41588667035102844, + "learning_rate": 5.081376274412531e-06, + "epoch": 0.8710601719197708, "step": 950 }, { - "eval_loss": 1.1473941802978516, - "eval_runtime": 80.6501, - "eval_samples_per_second": 6.175, - "eval_steps_per_second": 3.087, - "epoch": 1.5916614288707311, + "eval_loss": 0.7594464421272278, + "eval_runtime": 158.8458, + "eval_samples_per_second": 5.842, + "eval_steps_per_second": 2.921, + "epoch": 0.8710601719197708, "step": 950 }, { - "loss": 1.2111, - "grad_norm": 0.7786531448364258, - "learning_rate": 8.582767712281591e-07, - "epoch": 1.6084223758642362, + "loss": 0.7564, + "grad_norm": 0.28924882411956787, + "learning_rate": 4.4012171652245635e-06, + "epoch": 0.8802292263610315, "step": 960 }, { - "loss": 1.0599, - "grad_norm": 0.78724604845047, - "learning_rate": 8.541731295133219e-07, - "epoch": 1.6251833228577415, + "loss": 0.7387, + "grad_norm": 0.2902253568172455, + "learning_rate": 3.767818417992447e-06, + "epoch": 0.8893982808022922, "step": 970 }, { - "loss": 1.0114, - "grad_norm": 0.9002764821052551, - "learning_rate": 8.500210372847126e-07, - "epoch": 1.6419442698512468, + "loss": 0.7774, + "grad_norm": 0.354568749666214, + "learning_rate": 3.1818295666037724e-06, + "epoch": 0.898567335243553, "step": 980 }, { - "loss": 0.9775, - "grad_norm": 0.8179922103881836, - "learning_rate": 8.458210625442068e-07, - "epoch": 1.6587052168447518, + "loss": 0.7819, + "grad_norm": 0.21420103311538696, + "learning_rate": 2.643851527335006e-06, + "epoch": 0.9077363896848137, "step": 990 }, { - "loss": 1.1444, - "grad_norm": 0.6862105131149292, - "learning_rate": 8.415737798439568e-07, - "epoch": 1.6754661638382569, + "loss": 0.9055, + "grad_norm": 0.4397925138473511, + "learning_rate": 2.154435982627573e-06, + "epoch": 0.9169054441260746, "step": 1000 }, { - "eval_loss": 1.1338735818862915, - "eval_runtime": 78.5686, - "eval_samples_per_second": 6.338, - "eval_steps_per_second": 3.169, - "epoch": 1.6754661638382569, + "eval_loss": 0.7588858604431152, + "eval_runtime": 153.725, + "eval_samples_per_second": 6.037, + "eval_steps_per_second": 3.018, + "epoch": 0.9169054441260746, "step": 1000 }, { - "loss": 1.0762, - "grad_norm": 0.7431554794311523, - "learning_rate": 8.372797702077952e-07, - "epoch": 1.692227110831762, + "loss": 0.7463, + "grad_norm": 0.37509453296661377, + "learning_rate": 1.7140848153519129e-06, + "epoch": 0.9260744985673353, "step": 1010 }, { - "loss": 1.0976, - "grad_norm": 0.7127805948257446, - "learning_rate": 8.329396210517496e-07, - "epoch": 1.708988057825267, + "loss": 0.7624, + "grad_norm": 0.33944201469421387, + "learning_rate": 1.3232495941396639e-06, + "epoch": 0.935243553008596, "step": 1020 }, { - "loss": 1.1504, - "grad_norm": 0.9699788093566895, - "learning_rate": 8.285539261036868e-07, - "epoch": 1.7257490048187722, + "loss": 0.7424, + "grad_norm": 0.3569444715976715, + "learning_rate": 9.82331110311857e-07, + "epoch": 0.9444126074498568, "step": 1030 }, { - "loss": 1.0243, - "grad_norm": 0.6960281729698181, - "learning_rate": 8.241232853220894e-07, - "epoch": 1.7425099518122775, + "loss": 0.8083, + "grad_norm": 0.42634230852127075, + "learning_rate": 6.916789668778123e-07, + "epoch": 0.9535816618911175, "step": 1040 }, { - "loss": 1.1155, - "grad_norm": 0.9629572033882141, - "learning_rate": 8.196483048139834e-07, - "epoch": 1.7592708988057826, + "loss": 0.8018, + "grad_norm": 0.36843785643577576, + "learning_rate": 4.5159122002644274e-07, + "epoch": 0.9627507163323782, "step": 1050 }, { - "eval_loss": 1.122306227684021, - "eval_runtime": 78.8404, - "eval_samples_per_second": 6.317, - "eval_steps_per_second": 3.158, - "epoch": 1.7592708988057826, + "eval_loss": 0.7587710022926331, + "eval_runtime": 158.0207, + "eval_samples_per_second": 5.873, + "eval_steps_per_second": 2.936, + "epoch": 0.9627507163323782, "step": 1050 }, { - "loss": 1.0379, - "grad_norm": 0.9436600208282471, - "learning_rate": 8.151295967520231e-07, - "epoch": 1.7760318457992876, + "loss": 0.831, + "grad_norm": 0.3824850618839264, + "learning_rate": 2.6231407347736546e-07, + "epoch": 0.971919770773639, "step": 1060 }, { - "loss": 0.9271, - "grad_norm": 0.8760473132133484, - "learning_rate": 8.105677792907462e-07, - "epoch": 1.7927927927927927, + "loss": 0.8253, + "grad_norm": 0.34650319814682007, + "learning_rate": 1.2404162600541115e-07, + "epoch": 0.9810888252148997, "step": 1070 }, { - "loss": 1.1017, - "grad_norm": 0.8457250595092773, - "learning_rate": 8.059634764820114e-07, - "epoch": 1.809553739786298, + "loss": 0.7623, + "grad_norm": 0.33678287267684937, + "learning_rate": 3.691567239743621e-08, + "epoch": 0.9902578796561604, "step": 1080 }, { - "loss": 1.0982, - "grad_norm": 0.7704038619995117, - "learning_rate": 8.013173181896282e-07, - "epoch": 1.826314686779803, + "loss": 0.7956, + "grad_norm": 0.3255111575126648, + "learning_rate": 1.0255580454254788e-09, + "epoch": 0.9994269340974212, "step": 1090 }, { - "loss": 0.9232, - "grad_norm": 0.8354272246360779, - "learning_rate": 7.966299400031928e-07, - "epoch": 1.8430756337733083, - "step": 1100 - }, - { - "eval_loss": 1.1116451025009155, - "eval_runtime": 78.8427, - "eval_samples_per_second": 6.316, - "eval_steps_per_second": 3.158, - "epoch": 1.8430756337733083, - "step": 1100 - }, - { - "loss": 0.9978, - "grad_norm": 0.8250349164009094, - "learning_rate": 7.919019831511399e-07, - "epoch": 1.8598365807668134, - "step": 1110 - }, - { - "loss": 1.0313, - "grad_norm": 0.7708654999732971, - "learning_rate": 7.871340944130228e-07, - "epoch": 1.8765975277603184, - "step": 1120 - }, - { - "loss": 1.0518, - "grad_norm": 0.6994651556015015, - "learning_rate": 7.823269260310351e-07, - "epoch": 1.8933584747538235, - "step": 1130 - }, - { - "loss": 1.0081, - "grad_norm": 0.8944517374038696, - "learning_rate": 7.774811356207851e-07, - "epoch": 1.9101194217473287, - "step": 1140 - }, - { - "loss": 1.0257, - "grad_norm": 0.8253750801086426, - "learning_rate": 7.725973860813338e-07, - "epoch": 1.926880368740834, - "step": 1150 - }, - { - "eval_loss": 1.1017853021621704, - "eval_runtime": 78.7751, - "eval_samples_per_second": 6.322, - "eval_steps_per_second": 3.161, - "epoch": 1.926880368740834, - "step": 1150 - }, - { - "loss": 0.9687, - "grad_norm": 0.7468390464782715, - "learning_rate": 7.676763455045113e-07, - "epoch": 1.943641315734339, - "step": 1160 - }, - { - "loss": 0.9275, - "grad_norm": 0.8286957740783691, - "learning_rate": 7.627186870835228e-07, - "epoch": 1.9604022627278441, - "step": 1170 - }, - { - "loss": 0.9793, - "grad_norm": 0.7848206758499146, - "learning_rate": 7.577250890208564e-07, - "epoch": 1.9771632097213492, - "step": 1180 - }, - { - "loss": 0.9569, - "grad_norm": 0.8074430823326111, - "learning_rate": 7.526962344355055e-07, - "epoch": 1.9939241567148542, - "step": 1190 - }, - { - "loss": 0.9004, - "grad_norm": 0.8617448210716248, - "learning_rate": 7.476328112695185e-07, - "epoch": 2.010056568196103, - "step": 1200 - }, - { - "eval_loss": 1.0935511589050293, - "eval_runtime": 79.7429, - "eval_samples_per_second": 6.245, - "eval_steps_per_second": 3.123, - "epoch": 2.010056568196103, - "step": 1200 - }, - { - "loss": 0.929, - "grad_norm": 1.1476644277572632, - "learning_rate": 7.425355121938901e-07, - "epoch": 2.026817515189608, - "step": 1210 - }, - { - "loss": 1.014, - "grad_norm": 0.8628258109092712, - "learning_rate": 7.37405034513804e-07, - "epoch": 2.0435784621831132, - "step": 1220 - }, - { - "loss": 1.0312, - "grad_norm": 0.8636768460273743, - "learning_rate": 7.322420800732418e-07, - "epoch": 2.0603394091766183, - "step": 1230 - }, - { - "loss": 1.0864, - "grad_norm": 0.6343582272529602, - "learning_rate": 7.270473551589723e-07, - "epoch": 2.077100356170124, - "step": 1240 - }, - { - "loss": 1.1058, - "grad_norm": 0.9320090413093567, - "learning_rate": 7.218215704039321e-07, - "epoch": 2.093861303163629, - "step": 1250 - }, - { - "eval_loss": 1.0855053663253784, - "eval_runtime": 86.4087, - "eval_samples_per_second": 5.763, - "eval_steps_per_second": 2.882, - "epoch": 2.093861303163629, - "step": 1250 - }, - { - "loss": 1.0237, - "grad_norm": 0.8435965776443481, - "learning_rate": 7.16565440690011e-07, - "epoch": 2.110622250157134, - "step": 1260 - }, - { - "loss": 1.0289, - "grad_norm": 2.9359328746795654, - "learning_rate": 7.112796850502578e-07, - "epoch": 2.127383197150639, - "step": 1270 - }, - { - "loss": 1.1569, - "grad_norm": 0.8461847901344299, - "learning_rate": 7.05965026570517e-07, - "epoch": 2.144144144144144, - "step": 1280 - }, - { - "loss": 1.0697, - "grad_norm": 0.7913004755973816, - "learning_rate": 7.006221922905111e-07, - "epoch": 2.160905091137649, - "step": 1290 - }, - { - "loss": 1.0053, - "grad_norm": 0.9257745146751404, - "learning_rate": 6.95251913104383e-07, - "epoch": 2.1776660381311546, - "step": 1300 - }, - { - "eval_loss": 1.0785434246063232, - "eval_runtime": 91.5293, - "eval_samples_per_second": 5.441, - "eval_steps_per_second": 2.72, - "epoch": 2.1776660381311546, - "step": 1300 - }, - { - "loss": 0.9318, - "grad_norm": 1.099394679069519, - "learning_rate": 6.898549236607098e-07, - "epoch": 2.1944269851246596, - "step": 1310 - }, - { - "loss": 0.9994, - "grad_norm": 1.126979947090149, - "learning_rate": 6.844319622620039e-07, - "epoch": 2.2111879321181647, - "step": 1320 - }, - { - "loss": 1.0099, - "grad_norm": 0.7986570596694946, - "learning_rate": 6.789837707637142e-07, - "epoch": 2.2279488791116697, - "step": 1330 - }, - { - "loss": 0.9439, - "grad_norm": 0.7648953199386597, - "learning_rate": 6.735110944727404e-07, - "epoch": 2.2447098261051748, - "step": 1340 - }, - { - "loss": 1.034, - "grad_norm": 1.128675103187561, - "learning_rate": 6.68014682045477e-07, - "epoch": 2.2614707730986803, - "step": 1350 - }, - { - "eval_loss": 1.0722501277923584, - "eval_runtime": 87.4702, - "eval_samples_per_second": 5.693, - "eval_steps_per_second": 2.847, - "epoch": 2.2614707730986803, - "step": 1350 - }, - { - "loss": 0.9582, - "grad_norm": 1.0245682001113892, - "learning_rate": 6.624952853853962e-07, - "epoch": 2.2782317200921853, - "step": 1360 - }, - { - "loss": 0.999, - "grad_norm": 0.8576367497444153, - "learning_rate": 6.569536595401899e-07, - "epoch": 2.2949926670856904, - "step": 1370 - }, - { - "loss": 0.8678, - "grad_norm": 0.8585372567176819, - "learning_rate": 6.513905625984792e-07, - "epoch": 2.3117536140791954, - "step": 1380 - }, - { - "loss": 1.0609, - "grad_norm": 0.7375667095184326, - "learning_rate": 6.458067555861082e-07, - "epoch": 2.3285145610727005, - "step": 1390 - }, - { - "loss": 0.9888, - "grad_norm": 0.892026424407959, - "learning_rate": 6.402030023620378e-07, - "epoch": 2.3452755080662055, - "step": 1400 - }, - { - "eval_loss": 1.0668587684631348, - "eval_runtime": 88.1591, - "eval_samples_per_second": 5.649, - "eval_steps_per_second": 2.824, - "epoch": 2.3452755080662055, - "step": 1400 - }, - { - "loss": 0.9441, - "grad_norm": 0.9670091867446899, - "learning_rate": 6.345800695138491e-07, - "epoch": 2.362036455059711, - "step": 1410 - }, - { - "loss": 1.0477, - "grad_norm": 0.7848078608512878, - "learning_rate": 6.289387262528765e-07, - "epoch": 2.378797402053216, - "step": 1420 - }, - { - "loss": 1.0631, - "grad_norm": 0.8168392181396484, - "learning_rate": 6.232797443089786e-07, - "epoch": 2.395558349046721, - "step": 1430 - }, - { - "loss": 1.0002, - "grad_norm": 1.408933162689209, - "learning_rate": 6.176038978249682e-07, - "epoch": 2.412319296040226, - "step": 1440 - }, - { - "loss": 1.0355, - "grad_norm": 0.7367719411849976, - "learning_rate": 6.119119632507095e-07, - "epoch": 2.4290802430337313, - "step": 1450 - }, - { - "eval_loss": 1.0617327690124512, - "eval_runtime": 91.8946, - "eval_samples_per_second": 5.419, - "eval_steps_per_second": 2.71, - "epoch": 2.4290802430337313, - "step": 1450 - }, - { - "loss": 1.0361, - "grad_norm": 1.1367297172546387, - "learning_rate": 6.062047192369002e-07, - "epoch": 2.4458411900272363, - "step": 1460 - }, - { - "loss": 0.887, - "grad_norm": 0.9835084080696106, - "learning_rate": 6.004829465285534e-07, - "epoch": 2.462602137020742, - "step": 1470 - }, - { - "loss": 0.9896, - "grad_norm": 0.7928352355957031, - "learning_rate": 5.947474278581929e-07, - "epoch": 2.479363084014247, - "step": 1480 - }, - { - "loss": 0.9035, - "grad_norm": 1.2313053607940674, - "learning_rate": 5.889989478387752e-07, - "epoch": 2.496124031007752, - "step": 1490 - }, - { - "loss": 0.9664, - "grad_norm": 1.1574859619140625, - "learning_rate": 5.832382928563559e-07, - "epoch": 2.512884978001257, - "step": 1500 - }, - { - "eval_loss": 1.0569761991500854, - "eval_runtime": 89.9236, - "eval_samples_per_second": 5.538, - "eval_steps_per_second": 2.769, - "epoch": 2.512884978001257, - "step": 1500 - }, - { - "loss": 0.9718, - "grad_norm": 0.8173345923423767, - "learning_rate": 5.77466250962513e-07, - "epoch": 2.529645924994762, - "step": 1510 - }, - { - "loss": 0.9839, - "grad_norm": 0.9848146438598633, - "learning_rate": 5.71683611766542e-07, - "epoch": 2.5464068719882675, - "step": 1520 - }, - { - "loss": 0.9926, - "grad_norm": 1.1053595542907715, - "learning_rate": 5.658911663274381e-07, - "epoch": 2.5631678189817726, - "step": 1530 - }, - { - "loss": 1.0227, - "grad_norm": 1.3307827711105347, - "learning_rate": 5.600897070456806e-07, - "epoch": 2.5799287659752776, - "step": 1540 - }, - { - "loss": 1.0063, - "grad_norm": 0.9872063994407654, - "learning_rate": 5.542800275548328e-07, - "epoch": 2.5966897129687827, - "step": 1550 - }, - { - "eval_loss": 1.0520267486572266, - "eval_runtime": 91.1026, - "eval_samples_per_second": 5.466, - "eval_steps_per_second": 2.733, - "epoch": 2.5966897129687827, - "step": 1550 - }, - { - "loss": 1.0276, - "grad_norm": 0.9122388362884521, - "learning_rate": 5.484629226129741e-07, - "epoch": 2.6134506599622878, - "step": 1560 - }, - { - "loss": 1.0624, - "grad_norm": 0.9454676508903503, - "learning_rate": 5.426391879939778e-07, - "epoch": 2.6302116069557933, - "step": 1570 - }, - { - "loss": 0.9732, - "grad_norm": 1.2247437238693237, - "learning_rate": 5.368096203786499e-07, - "epoch": 2.646972553949298, - "step": 1580 - }, - { - "loss": 0.974, - "grad_norm": 0.9937558174133301, - "learning_rate": 5.30975017245745e-07, - "epoch": 2.6637335009428034, - "step": 1590 - }, - { - "loss": 0.8743, - "grad_norm": 1.4427547454833984, - "learning_rate": 5.251361767628701e-07, - "epoch": 2.6804944479363084, - "step": 1600 - }, - { - "eval_loss": 1.047125220298767, - "eval_runtime": 83.9836, - "eval_samples_per_second": 5.93, - "eval_steps_per_second": 2.965, - "epoch": 2.6804944479363084, - "step": 1600 - }, - { - "loss": 0.9842, - "grad_norm": 0.8265151381492615, - "learning_rate": 5.192938976772981e-07, - "epoch": 2.6972553949298135, - "step": 1610 - }, - { - "loss": 0.8733, - "grad_norm": 0.9592719078063965, - "learning_rate": 5.134489792066985e-07, - "epoch": 2.7140163419233185, - "step": 1620 - }, - { - "loss": 1.0066, - "grad_norm": 0.7584534287452698, - "learning_rate": 5.076022209298067e-07, - "epoch": 2.7307772889168236, - "step": 1630 - }, - { - "loss": 0.9494, - "grad_norm": 1.0090166330337524, - "learning_rate": 5.01754422677041e-07, - "epoch": 2.747538235910329, - "step": 1640 - }, - { - "loss": 1.0154, - "grad_norm": 0.8638355135917664, - "learning_rate": 4.959063844210877e-07, - "epoch": 2.764299182903834, - "step": 1650 - }, - { - "eval_loss": 1.043318510055542, - "eval_runtime": 83.5648, - "eval_samples_per_second": 5.959, - "eval_steps_per_second": 2.98, - "epoch": 2.764299182903834, - "step": 1650 - }, - { - "loss": 0.9535, - "grad_norm": 1.0650914907455444, - "learning_rate": 4.900589061674649e-07, - "epoch": 2.781060129897339, - "step": 1660 - }, - { - "loss": 1.0118, - "grad_norm": 0.974525511264801, - "learning_rate": 4.842127878450835e-07, - "epoch": 2.7978210768908442, - "step": 1670 - }, - { - "loss": 0.9973, - "grad_norm": 1.2957974672317505, - "learning_rate": 4.783688291968167e-07, - "epoch": 2.8145820238843493, - "step": 1680 - }, - { - "loss": 0.9681, - "grad_norm": 1.5214028358459473, - "learning_rate": 4.7252782967009695e-07, - "epoch": 2.831342970877855, - "step": 1690 - }, - { - "loss": 0.8866, - "grad_norm": 0.826191782951355, - "learning_rate": 4.666905883075516e-07, - "epoch": 2.84810391787136, - "step": 1700 - }, - { - "eval_loss": 1.0400274991989136, - "eval_runtime": 87.0299, - "eval_samples_per_second": 5.722, - "eval_steps_per_second": 2.861, - "epoch": 2.84810391787136, - "step": 1700 - }, - { - "loss": 0.8643, - "grad_norm": 1.1714636087417603, - "learning_rate": 4.608579036376955e-07, - "epoch": 2.864864864864865, - "step": 1710 - }, - { - "loss": 1.0582, - "grad_norm": 1.2290012836456299, - "learning_rate": 4.5503057356569236e-07, - "epoch": 2.88162581185837, - "step": 1720 - }, - { - "loss": 0.9494, - "grad_norm": 0.9297714829444885, - "learning_rate": 4.492093952642027e-07, - "epoch": 2.898386758851875, - "step": 1730 - }, - { - "loss": 0.9734, - "grad_norm": 0.9026303887367249, - "learning_rate": 4.433951650643307e-07, - "epoch": 2.9151477058453805, - "step": 1740 - }, - { - "loss": 1.0392, - "grad_norm": 0.9304074048995972, - "learning_rate": 4.375886783466887e-07, - "epoch": 2.931908652838885, - "step": 1750 - }, - { - "eval_loss": 1.0371840000152588, - "eval_runtime": 85.5465, - "eval_samples_per_second": 5.821, - "eval_steps_per_second": 2.911, - "epoch": 2.931908652838885, - "step": 1750 - }, - { - "loss": 1.0665, - "grad_norm": 1.2075411081314087, - "learning_rate": 4.3179072943258764e-07, - "epoch": 2.9486695998323906, - "step": 1760 - }, - { - "loss": 0.9014, - "grad_norm": 1.3210104703903198, - "learning_rate": 4.2600211147537634e-07, - "epoch": 2.9654305468258957, - "step": 1770 - }, - { - "loss": 0.8844, - "grad_norm": 0.9938521385192871, - "learning_rate": 4.2022361635193835e-07, - "epoch": 2.9821914938194007, - "step": 1780 - }, - { - "loss": 1.0234, - "grad_norm": 1.0057061910629272, - "learning_rate": 4.1445603455436425e-07, - "epoch": 2.998952440812906, - "step": 1790 - }, - { - "loss": 0.9891, - "grad_norm": 0.8251981735229492, - "learning_rate": 4.0870015508181304e-07, - "epoch": 3.0150848522941547, - "step": 1800 - }, - { - "eval_loss": 1.0345975160598755, - "eval_runtime": 87.5549, - "eval_samples_per_second": 5.688, - "eval_steps_per_second": 2.844, - "epoch": 3.0150848522941547, - "step": 1800 - }, - { - "loss": 0.9519, - "grad_norm": 1.1525200605392456, - "learning_rate": 4.029567653325778e-07, - "epoch": 3.0318457992876597, - "step": 1810 - }, - { - "loss": 1.0024, - "grad_norm": 0.8204362392425537, - "learning_rate": 3.972266509963707e-07, - "epoch": 3.048606746281165, - "step": 1820 - }, - { - "loss": 0.9529, - "grad_norm": 1.0004171133041382, - "learning_rate": 3.9151059594684093e-07, - "epoch": 3.06536769327467, - "step": 1830 - }, - { - "loss": 0.9776, - "grad_norm": 0.9302512407302856, - "learning_rate": 3.858093821343425e-07, - "epoch": 3.0821286402681753, - "step": 1840 - }, - { - "loss": 0.8896, - "grad_norm": 1.0334570407867432, - "learning_rate": 3.801237894789632e-07, - "epoch": 3.0988895872616804, - "step": 1850 - }, - { - "eval_loss": 1.0320886373519897, - "eval_runtime": 80.1434, - "eval_samples_per_second": 6.214, - "eval_steps_per_second": 3.107, - "epoch": 3.0988895872616804, - "step": 1850 - }, - { - "loss": 0.9555, - "grad_norm": 0.9561256766319275, - "learning_rate": 3.744545957638332e-07, - "epoch": 3.1156505342551855, - "step": 1860 - }, - { - "loss": 0.972, - "grad_norm": 1.024395227432251, - "learning_rate": 3.6880257652872447e-07, - "epoch": 3.1324114812486905, - "step": 1870 - }, - { - "loss": 1.0468, - "grad_norm": 1.217880129814148, - "learning_rate": 3.6316850496395855e-07, - "epoch": 3.1491724282421956, - "step": 1880 - }, - { - "loss": 0.817, - "grad_norm": 1.033187985420227, - "learning_rate": 3.57553151804634e-07, - "epoch": 3.1659333752357006, - "step": 1890 - }, - { - "loss": 1.0248, - "grad_norm": 1.149246335029602, - "learning_rate": 3.519572852251914e-07, - "epoch": 3.182694322229206, - "step": 1900 - }, - { - "eval_loss": 1.0297226905822754, - "eval_runtime": 82.8338, - "eval_samples_per_second": 6.012, - "eval_steps_per_second": 3.006, - "epoch": 3.182694322229206, - "step": 1900 - }, - { - "loss": 0.9628, - "grad_norm": 1.001043438911438, - "learning_rate": 3.4638167073432693e-07, - "epoch": 3.199455269222711, - "step": 1910 - }, - { - "loss": 0.9998, - "grad_norm": 1.1501474380493164, - "learning_rate": 3.4082707107027343e-07, - "epoch": 3.2162162162162162, - "step": 1920 - }, - { - "loss": 1.0307, - "grad_norm": 0.9195724725723267, - "learning_rate": 3.352942460964564e-07, - "epoch": 3.2329771632097213, - "step": 1930 - }, - { - "loss": 0.8789, - "grad_norm": 1.1816115379333496, - "learning_rate": 3.297839526975467e-07, - "epoch": 3.2497381102032263, - "step": 1940 - }, - { - "loss": 0.9866, - "grad_norm": 1.1459033489227295, - "learning_rate": 3.242969446759195e-07, - "epoch": 3.2664990571967314, - "step": 1950 - }, - { - "eval_loss": 1.0278631448745728, - "eval_runtime": 82.4435, - "eval_samples_per_second": 6.041, - "eval_steps_per_second": 3.02, - "epoch": 3.2664990571967314, - "step": 1950 - }, - { - "loss": 1.0124, - "grad_norm": 1.1944814920425415, - "learning_rate": 3.188339726485344e-07, - "epoch": 3.283260004190237, - "step": 1960 - }, - { - "loss": 0.964, - "grad_norm": 1.3220574855804443, - "learning_rate": 3.133957839442526e-07, - "epoch": 3.300020951183742, - "step": 1970 - }, - { - "loss": 1.0585, - "grad_norm": 0.9459046721458435, - "learning_rate": 3.079831225016023e-07, - "epoch": 3.316781898177247, - "step": 1980 - }, - { - "loss": 1.0015, - "grad_norm": 1.162500023841858, - "learning_rate": 3.02596728767009e-07, - "epoch": 3.333542845170752, - "step": 1990 - }, - { - "loss": 0.9869, - "grad_norm": 1.0170260667800903, - "learning_rate": 2.9723733959350303e-07, - "epoch": 3.350303792164257, - "step": 2000 - }, - { - "eval_loss": 1.0256966352462769, - "eval_runtime": 80.2113, - "eval_samples_per_second": 6.209, - "eval_steps_per_second": 3.104, - "epoch": 3.350303792164257, - "step": 2000 - }, - { - "loss": 1.0388, - "grad_norm": 0.9814099669456482, - "learning_rate": 2.9190568813991957e-07, - "epoch": 3.3670647391577626, - "step": 2010 - }, - { - "loss": 1.0851, - "grad_norm": 0.835394561290741, - "learning_rate": 2.8660250377060216e-07, - "epoch": 3.3838256861512677, - "step": 2020 - }, - { - "loss": 0.9552, - "grad_norm": 0.8727148771286011, - "learning_rate": 2.8132851195562717e-07, - "epoch": 3.4005866331447727, - "step": 2030 - }, - { - "loss": 0.9742, - "grad_norm": 0.9619453549385071, - "learning_rate": 2.7608443417155997e-07, - "epoch": 3.4173475801382778, - "step": 2040 - }, - { - "loss": 0.8432, - "grad_norm": 0.8250086903572083, - "learning_rate": 2.708709878027584e-07, - "epoch": 3.434108527131783, - "step": 2050 - }, - { - "eval_loss": 1.023971676826477, - "eval_runtime": 80.4758, - "eval_samples_per_second": 6.188, - "eval_steps_per_second": 3.094, - "epoch": 3.434108527131783, - "step": 2050 - }, - { - "loss": 0.9283, - "grad_norm": 1.4034401178359985, - "learning_rate": 2.656888860432337e-07, - "epoch": 3.4508694741252883, - "step": 2060 - }, - { - "loss": 1.0269, - "grad_norm": 1.1039758920669556, - "learning_rate": 2.605388377990879e-07, - "epoch": 3.4676304211187934, - "step": 2070 - }, - { - "loss": 0.9756, - "grad_norm": 1.1378097534179688, - "learning_rate": 2.554215475915358e-07, - "epoch": 3.4843913681122984, - "step": 2080 - }, - { - "loss": 0.9271, - "grad_norm": 1.0743006467819214, - "learning_rate": 2.503377154605264e-07, - "epoch": 3.5011523151058035, - "step": 2090 - }, - { - "loss": 0.9881, - "grad_norm": 1.0452345609664917, - "learning_rate": 2.452880368689798e-07, - "epoch": 3.5179132620993085, - "step": 2100 - }, - { - "eval_loss": 1.0225834846496582, - "eval_runtime": 81.0513, - "eval_samples_per_second": 6.144, - "eval_steps_per_second": 3.072, - "epoch": 3.5179132620993085, - "step": 2100 - }, - { - "loss": 0.9588, - "grad_norm": 0.8060430288314819, - "learning_rate": 2.402732026076468e-07, - "epoch": 3.5346742090928136, - "step": 2110 - }, - { - "loss": 0.9306, - "grad_norm": 1.1182574033737183, - "learning_rate": 2.352938987006106e-07, - "epoch": 3.5514351560863187, - "step": 2120 - }, - { - "loss": 0.7893, - "grad_norm": 1.048654317855835, - "learning_rate": 2.3035080631143893e-07, - "epoch": 3.568196103079824, - "step": 2130 - }, - { - "loss": 0.9771, - "grad_norm": 0.9173722863197327, - "learning_rate": 2.254446016500019e-07, - "epoch": 3.584957050073329, - "step": 2140 - }, - { - "loss": 0.8972, - "grad_norm": 0.6042903661727905, - "learning_rate": 2.205759558799669e-07, - "epoch": 3.6017179970668343, - "step": 2150 - }, - { - "eval_loss": 1.0213065147399902, - "eval_runtime": 78.4141, - "eval_samples_per_second": 6.351, - "eval_steps_per_second": 3.175, - "epoch": 3.6017179970668343, - "step": 2150 - }, - { - "loss": 0.9735, - "grad_norm": 1.3135992288589478, - "learning_rate": 2.1574553502698434e-07, - "epoch": 3.6184789440603393, - "step": 2160 - }, - { - "loss": 0.9689, - "grad_norm": 0.8670032620429993, - "learning_rate": 2.1095399988757572e-07, - "epoch": 3.6352398910538444, - "step": 2170 - }, - { - "loss": 0.9137, - "grad_norm": 0.9747626185417175, - "learning_rate": 2.0620200593873816e-07, - "epoch": 3.65200083804735, - "step": 2180 - }, - { - "loss": 1.0097, - "grad_norm": 1.069161057472229, - "learning_rate": 2.0149020324827487e-07, - "epoch": 3.668761785040855, - "step": 2190 - }, - { - "loss": 0.9414, - "grad_norm": 0.9334145784378052, - "learning_rate": 1.9681923638586657e-07, - "epoch": 3.68552273203436, - "step": 2200 - }, - { - "eval_loss": 1.0197768211364746, - "eval_runtime": 78.9262, - "eval_samples_per_second": 6.31, - "eval_steps_per_second": 3.155, - "epoch": 3.68552273203436, - "step": 2200 - }, - { - "loss": 0.9483, - "grad_norm": 0.9081120491027832, - "learning_rate": 1.921897443348958e-07, - "epoch": 3.702283679027865, - "step": 2210 - }, - { - "loss": 0.7712, - "grad_norm": 0.8620381355285645, - "learning_rate": 1.876023604050347e-07, - "epoch": 3.71904462602137, - "step": 2220 - }, - { - "loss": 0.9694, - "grad_norm": 0.9712274074554443, - "learning_rate": 1.8305771214560773e-07, - "epoch": 3.7358055730148756, - "step": 2230 - }, - { - "loss": 0.9673, - "grad_norm": 1.1614145040512085, - "learning_rate": 1.7855642125974458e-07, - "epoch": 3.75256652000838, - "step": 2240 - }, - { - "loss": 0.9736, - "grad_norm": 0.8890505433082581, - "learning_rate": 1.740991035193317e-07, - "epoch": 3.7693274670018857, - "step": 2250 - }, - { - "eval_loss": 1.019027829170227, - "eval_runtime": 78.5313, - "eval_samples_per_second": 6.341, - "eval_steps_per_second": 3.171, - "epoch": 3.7693274670018857, - "step": 2250 - }, - { - "loss": 0.9323, - "grad_norm": 0.9356293678283691, - "learning_rate": 1.6968636868077514e-07, - "epoch": 3.7860884139953908, - "step": 2260 - }, - { - "loss": 0.8985, - "grad_norm": 1.0536713600158691, - "learning_rate": 1.6531882040158645e-07, - "epoch": 3.802849360988896, - "step": 2270 - }, - { - "loss": 0.9493, - "grad_norm": 0.9968764185905457, - "learning_rate": 1.609970561578034e-07, - "epoch": 3.819610307982401, - "step": 2280 - }, - { - "loss": 0.9251, - "grad_norm": 1.304086685180664, - "learning_rate": 1.5672166716225533e-07, - "epoch": 3.836371254975906, - "step": 2290 - }, - { - "loss": 1.0068, - "grad_norm": 0.8861364722251892, - "learning_rate": 1.524932382836861e-07, - "epoch": 3.8531322019694114, - "step": 2300 - }, - { - "eval_loss": 1.0179270505905151, - "eval_runtime": 78.5978, - "eval_samples_per_second": 6.336, - "eval_steps_per_second": 3.168, - "epoch": 3.8531322019694114, - "step": 2300 - }, - { - "loss": 0.9483, - "grad_norm": 1.5795460939407349, - "learning_rate": 1.4831234796674515e-07, - "epoch": 3.8698931489629165, - "step": 2310 - }, - { - "loss": 0.9287, - "grad_norm": 0.9922671914100647, - "learning_rate": 1.4417956815285576e-07, - "epoch": 3.8866540959564215, - "step": 2320 - }, - { - "loss": 0.9983, - "grad_norm": 0.9773964881896973, - "learning_rate": 1.4009546420197522e-07, - "epoch": 3.9034150429499266, - "step": 2330 - }, - { - "loss": 0.8862, - "grad_norm": 0.863190233707428, - "learning_rate": 1.3606059481525296e-07, - "epoch": 3.9201759899434316, - "step": 2340 - }, - { - "loss": 0.901, - "grad_norm": 1.0745151042938232, - "learning_rate": 1.320755119586024e-07, - "epoch": 3.936936936936937, - "step": 2350 - }, - { - "eval_loss": 1.0171653032302856, - "eval_runtime": 78.7387, - "eval_samples_per_second": 6.325, - "eval_steps_per_second": 3.162, - "epoch": 3.936936936936937, - "step": 2350 - }, - { - "loss": 0.9247, - "grad_norm": 1.1230072975158691, - "learning_rate": 1.2814076078719111e-07, - "epoch": 3.953697883930442, - "step": 2360 - }, - { - "loss": 0.9028, - "grad_norm": 1.0916768312454224, - "learning_rate": 1.24256879570865e-07, - "epoch": 3.9704588309239472, - "step": 2370 - }, - { - "loss": 0.9669, - "grad_norm": 1.2703828811645508, - "learning_rate": 1.2042439962051316e-07, - "epoch": 3.9872197779174523, - "step": 2380 - }, - { - "loss": 0.9533, - "grad_norm": 0.9693753123283386, - "learning_rate": 1.1664384521538529e-07, - "epoch": 4.003352189398701, - "step": 2390 - }, - { - "loss": 0.9692, - "grad_norm": 0.9873965382575989, - "learning_rate": 1.129157335313709e-07, - "epoch": 4.020113136392206, - "step": 2400 - }, - { - "eval_loss": 1.0167144536972046, - "eval_runtime": 78.925, - "eval_samples_per_second": 6.31, - "eval_steps_per_second": 3.155, - "epoch": 4.020113136392206, - "step": 2400 - }, - { - "loss": 0.8831, - "grad_norm": 1.532692313194275, - "learning_rate": 1.0924057457025004e-07, - "epoch": 4.036874083385711, - "step": 2410 - }, - { - "loss": 0.9041, - "grad_norm": 1.508852481842041, - "learning_rate": 1.0561887108992557e-07, - "epoch": 4.053635030379216, - "step": 2420 - }, - { - "loss": 0.8563, - "grad_norm": 0.8709840178489685, - "learning_rate": 1.0205111853564635e-07, - "epoch": 4.070395977372722, - "step": 2430 - }, - { - "loss": 0.9356, - "grad_norm": 0.9158110022544861, - "learning_rate": 9.853780497223141e-08, - "epoch": 4.0871569243662265, - "step": 2440 - }, - { - "loss": 0.9407, - "grad_norm": 1.1595951318740845, - "learning_rate": 9.507941101730243e-08, - "epoch": 4.103917871359732, - "step": 2450 - }, - { - "eval_loss": 1.0161402225494385, - "eval_runtime": 85.8936, - "eval_samples_per_second": 5.798, - "eval_steps_per_second": 2.899, - "epoch": 4.103917871359732, - "step": 2450 - }, - { - "loss": 0.8546, - "grad_norm": 1.0961774587631226, - "learning_rate": 9.16764097755361e-08, - "epoch": 4.120678818353237, - "step": 2460 - }, - { - "loss": 0.8384, - "grad_norm": 1.2403303384780884, - "learning_rate": 8.832926677394387e-08, - "epoch": 4.137439765346742, - "step": 2470 - }, - { - "loss": 0.912, - "grad_norm": 0.8818101286888123, - "learning_rate": 8.503843989818843e-08, - "epoch": 4.154200712340248, - "step": 2480 - }, - { - "loss": 0.8917, - "grad_norm": 0.7812055349349976, - "learning_rate": 8.180437932994521e-08, - "epoch": 4.170961659333752, - "step": 2490 - }, - { - "loss": 0.9279, - "grad_norm": 1.0156097412109375, - "learning_rate": 7.862752748531831e-08, - "epoch": 4.187722606327258, - "step": 2500 - }, - { - "eval_loss": 1.015537977218628, - "eval_runtime": 83.5946, - "eval_samples_per_second": 5.957, - "eval_steps_per_second": 2.979, - "epoch": 4.187722606327258, - "step": 2500 - }, - { - "loss": 0.9586, - "grad_norm": 1.0586673021316528, - "learning_rate": 7.550831895431797e-08, - "epoch": 4.204483553320762, - "step": 2510 - }, - { - "loss": 0.9436, - "grad_norm": 1.20182466506958, - "learning_rate": 7.244718044140985e-08, - "epoch": 4.221244500314268, - "step": 2520 - }, - { - "loss": 0.9139, - "grad_norm": 0.9434934854507446, - "learning_rate": 6.944453070714162e-08, - "epoch": 4.238005447307773, - "step": 2530 - }, - { - "loss": 0.9247, - "grad_norm": 1.0741653442382812, - "learning_rate": 6.650078051085689e-08, - "epoch": 4.254766394301278, - "step": 2540 - }, - { - "loss": 1.0214, - "grad_norm": 1.047150731086731, - "learning_rate": 6.361633255450449e-08, - "epoch": 4.271527341294783, - "step": 2550 - }, - { - "eval_loss": 1.0152950286865234, - "eval_runtime": 87.3007, - "eval_samples_per_second": 5.704, - "eval_steps_per_second": 2.852, - "epoch": 4.271527341294783, - "step": 2550 - }, - { - "loss": 0.9456, - "grad_norm": 0.9247802495956421, - "learning_rate": 6.079158142754853e-08, - "epoch": 4.288288288288288, - "step": 2560 - }, - { - "loss": 0.9563, - "grad_norm": 0.912956714630127, - "learning_rate": 5.802691355298978e-08, - "epoch": 4.3050492352817935, - "step": 2570 - }, - { - "loss": 1.0284, - "grad_norm": 1.1536314487457275, - "learning_rate": 5.5322707134502374e-08, - "epoch": 4.321810182275298, - "step": 2580 - }, - { - "loss": 0.9638, - "grad_norm": 1.1163140535354614, - "learning_rate": 5.267933210469666e-08, - "epoch": 4.338571129268804, - "step": 2590 - }, - { - "loss": 0.9299, - "grad_norm": 0.9342344403266907, - "learning_rate": 5.009715007451265e-08, - "epoch": 4.355332076262309, - "step": 2600 - }, - { - "eval_loss": 1.0149102210998535, - "eval_runtime": 80.4709, - "eval_samples_per_second": 6.189, - "eval_steps_per_second": 3.094, - "epoch": 4.355332076262309, - "step": 2600 - }, - { - "loss": 0.949, - "grad_norm": 0.8422918319702148, - "learning_rate": 4.7576514283752034e-08, - "epoch": 4.372093023255814, - "step": 2610 - }, - { - "loss": 0.9393, - "grad_norm": 0.9308228492736816, - "learning_rate": 4.51177695527552e-08, - "epoch": 4.388853970249319, - "step": 2620 - }, - { - "loss": 0.9508, - "grad_norm": 1.0049588680267334, - "learning_rate": 4.272125223523038e-08, - "epoch": 4.405614917242824, - "step": 2630 - }, - { - "loss": 0.9882, - "grad_norm": 1.437691569328308, - "learning_rate": 4.038729017224052e-08, - "epoch": 4.422375864236329, - "step": 2640 - }, - { - "loss": 0.9925, - "grad_norm": 1.2916972637176514, - "learning_rate": 3.811620264735549e-08, - "epoch": 4.439136811229835, - "step": 2650 - }, - { - "eval_loss": 1.0145703554153442, - "eval_runtime": 80.4565, - "eval_samples_per_second": 6.19, - "eval_steps_per_second": 3.095, - "epoch": 4.439136811229835, - "step": 2650 - }, - { - "loss": 0.9384, - "grad_norm": 1.57413649559021, - "learning_rate": 3.590830034297382e-08, - "epoch": 4.4558977582233394, - "step": 2660 - }, - { - "loss": 0.9637, - "grad_norm": 0.9036434888839722, - "learning_rate": 3.376388529782215e-08, - "epoch": 4.472658705216845, - "step": 2670 - }, - { - "loss": 1.0354, - "grad_norm": 1.0106011629104614, - "learning_rate": 3.1683250865636114e-08, - "epoch": 4.4894196522103496, - "step": 2680 - }, - { - "loss": 1.0022, - "grad_norm": 1.2949162721633911, - "learning_rate": 2.9666681675030448e-08, - "epoch": 4.506180599203855, - "step": 2690 - }, - { - "loss": 0.9365, - "grad_norm": 1.0403434038162231, - "learning_rate": 2.7714453590561848e-08, - "epoch": 4.5229415461973606, - "step": 2700 - }, - { - "eval_loss": 1.0145213603973389, - "eval_runtime": 80.1701, - "eval_samples_per_second": 6.212, - "eval_steps_per_second": 3.106, - "epoch": 4.5229415461973606, - "step": 2700 - }, - { - "loss": 0.9626, - "grad_norm": 0.9351533651351929, - "learning_rate": 2.5826833674990888e-08, - "epoch": 4.539702493190865, - "step": 2710 - }, - { - "loss": 0.9194, - "grad_norm": 1.2137658596038818, - "learning_rate": 2.4004080152748184e-08, - "epoch": 4.556463440184371, - "step": 2720 - }, - { - "loss": 0.9633, - "grad_norm": 0.9525250792503357, - "learning_rate": 2.2246442374609597e-08, - "epoch": 4.573224387177875, - "step": 2730 - }, - { - "loss": 0.9341, - "grad_norm": 0.8369765281677246, - "learning_rate": 2.0554160783585294e-08, - "epoch": 4.589985334171381, - "step": 2740 - }, - { - "loss": 0.9747, - "grad_norm": 1.0690010786056519, - "learning_rate": 1.8927466882027344e-08, - "epoch": 4.606746281164886, - "step": 2750 - }, - { - "eval_loss": 1.0143725872039795, - "eval_runtime": 78.6989, - "eval_samples_per_second": 6.328, - "eval_steps_per_second": 3.164, - "epoch": 4.606746281164886, - "step": 2750 - }, - { - "loss": 1.0532, - "grad_norm": 1.0263952016830444, - "learning_rate": 1.736658319996054e-08, - "epoch": 4.623507228158391, - "step": 2760 - }, - { - "loss": 0.8263, - "grad_norm": 0.918011486530304, - "learning_rate": 1.5871723264640313e-08, - "epoch": 4.640268175151896, - "step": 2770 - }, - { - "loss": 0.8762, - "grad_norm": 0.8909319043159485, - "learning_rate": 1.444309157134288e-08, - "epoch": 4.657029122145401, - "step": 2780 - }, - { - "loss": 0.9734, - "grad_norm": 0.8371102213859558, - "learning_rate": 1.3080883555389944e-08, - "epoch": 4.6737900691389065, - "step": 2790 - }, - { - "loss": 0.8306, - "grad_norm": 1.1059422492980957, - "learning_rate": 1.1785285565413639e-08, - "epoch": 4.690551016132411, - "step": 2800 - }, - { - "eval_loss": 1.0143219232559204, - "eval_runtime": 83.9047, - "eval_samples_per_second": 5.935, - "eval_steps_per_second": 2.968, - "epoch": 4.690551016132411, - "step": 2800 - }, - { - "loss": 0.8965, - "grad_norm": 1.0232831239700317, - "learning_rate": 1.055647483786437e-08, - "epoch": 4.707311963125917, - "step": 2810 - }, - { - "loss": 0.8825, - "grad_norm": 1.0429071187973022, - "learning_rate": 9.394619472764486e-09, - "epoch": 4.724072910119422, - "step": 2820 - }, - { - "loss": 0.9159, - "grad_norm": 0.9357975721359253, - "learning_rate": 8.299878410713224e-09, - "epoch": 4.740833857112927, - "step": 2830 - }, - { - "loss": 1.0004, - "grad_norm": 1.033324956893921, - "learning_rate": 7.272401411143159e-09, - "epoch": 4.757594804106432, - "step": 2840 - }, - { - "loss": 0.9712, - "grad_norm": 1.018965721130371, - "learning_rate": 6.312329031833319e-09, - "epoch": 4.774355751099937, - "step": 2850 - }, - { - "eval_loss": 1.0143295526504517, - "eval_runtime": 86.6954, - "eval_samples_per_second": 5.744, - "eval_steps_per_second": 2.872, - "epoch": 4.774355751099937, - "step": 2850 - }, - { - "loss": 0.8875, - "grad_norm": 1.0147583484649658, - "learning_rate": 5.419792609681284e-09, - "epoch": 4.791116698093442, - "step": 2860 - }, - { - "loss": 0.9097, - "grad_norm": 1.2517958879470825, - "learning_rate": 4.594914242736503e-09, - "epoch": 4.807877645086947, - "step": 2870 - }, - { - "loss": 1.0227, - "grad_norm": 1.0632728338241577, - "learning_rate": 3.837806773496821e-09, - "epoch": 4.824638592080452, - "step": 2880 - }, - { - "loss": 0.9568, - "grad_norm": 0.9697940349578857, - "learning_rate": 3.1485737734724406e-09, - "epoch": 4.841399539073958, - "step": 2890 - }, - { - "loss": 0.9155, - "grad_norm": 0.8009698987007141, - "learning_rate": 2.5273095290169742e-09, - "epoch": 4.8581604860674625, - "step": 2900 - }, - { - "eval_loss": 1.014455795288086, - "eval_runtime": 87.3886, - "eval_samples_per_second": 5.699, - "eval_steps_per_second": 2.849, - "epoch": 4.8581604860674625, - "step": 2900 - }, - { - "loss": 0.9871, - "grad_norm": 1.121019959449768, - "learning_rate": 1.974099028429599e-09, - "epoch": 4.874921433060968, - "step": 2910 - }, - { - "loss": 0.9819, - "grad_norm": 1.091722846031189, - "learning_rate": 1.4890179503281862e-09, - "epoch": 4.891682380054473, - "step": 2920 - }, - { - "loss": 0.9754, - "grad_norm": 0.8344740271568298, - "learning_rate": 1.072132653297031e-09, - "epoch": 4.908443327047978, - "step": 2930 - }, - { - "loss": 0.9712, - "grad_norm": 0.9808083176612854, - "learning_rate": 7.235001668088325e-10, - "epoch": 4.925204274041484, - "step": 2940 - }, - { - "loss": 0.878, - "grad_norm": 0.7794699668884277, - "learning_rate": 4.4316818342321483e-10, - "epoch": 4.941965221034988, - "step": 2950 - }, - { - "eval_loss": 1.0143743753433228, - "eval_runtime": 85.6525, - "eval_samples_per_second": 5.814, - "eval_steps_per_second": 2.907, - "epoch": 4.941965221034988, - "step": 2950 - }, - { - "loss": 1.0162, - "grad_norm": 0.7946218252182007, - "learning_rate": 2.31175052262389e-10, - "epoch": 4.958726168028494, - "step": 2960 - }, - { - "loss": 1.0109, - "grad_norm": 0.9577650427818298, - "learning_rate": 8.754977376496108e-11, - "epoch": 4.975487115021998, - "step": 2970 - }, - { - "loss": 0.9705, - "grad_norm": 1.5243313312530518, - "learning_rate": 1.2311995718883306e-11, - "epoch": 4.992248062015504, - "step": 2980 - }, - { - "train_runtime": 24337.0096, - "train_samples_per_second": 1.961, - "train_steps_per_second": 0.123, - "total_flos": 4.190815953316639e+17, - "train_loss": 1.108071906442818, - "epoch": 5.0, - "step": 2985, - "total_runtime_sec": 24338.520318984985 + "train_runtime": 10397.2106, + "train_samples_per_second": 1.678, + "train_steps_per_second": 0.105, + "total_flos": 1.5403789002928742e+17, + "train_loss": 0.8588679365670321, + "epoch": 1.0, + "step": 1091, + "total_runtime_sec": 10398.61917591095 } ] }