{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 3213, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002178649237472767, "grad_norm": 3.4895544052124023, "learning_rate": 1e-06, "loss": 4.916027545928955, "step": 1 }, { "epoch": 0.004357298474945534, "grad_norm": 4.010141372680664, "learning_rate": 9.997821350762527e-07, "loss": 4.920527458190918, "step": 2 }, { "epoch": 0.006535947712418301, "grad_norm": 3.5525588989257812, "learning_rate": 9.995642701525054e-07, "loss": 5.006848335266113, "step": 3 }, { "epoch": 0.008714596949891068, "grad_norm": 3.3555800914764404, "learning_rate": 9.993464052287582e-07, "loss": 4.70272159576416, "step": 4 }, { "epoch": 0.010893246187363835, "grad_norm": 2.975907564163208, "learning_rate": 9.991285403050107e-07, "loss": 4.784679889678955, "step": 5 }, { "epoch": 0.013071895424836602, "grad_norm": 2.828676223754883, "learning_rate": 9.989106753812637e-07, "loss": 4.812711715698242, "step": 6 }, { "epoch": 0.015250544662309368, "grad_norm": 3.008113384246826, "learning_rate": 9.986928104575162e-07, "loss": 4.691270351409912, "step": 7 }, { "epoch": 0.017429193899782137, "grad_norm": 3.6902308464050293, "learning_rate": 9.98474945533769e-07, "loss": 5.013348579406738, "step": 8 }, { "epoch": 0.0196078431372549, "grad_norm": 2.7047119140625, "learning_rate": 9.982570806100217e-07, "loss": 4.6715989112854, "step": 9 }, { "epoch": 0.02178649237472767, "grad_norm": 2.4177377223968506, "learning_rate": 9.980392156862744e-07, "loss": 4.580904006958008, "step": 10 }, { "epoch": 0.023965141612200435, "grad_norm": 2.6055257320404053, "learning_rate": 9.978213507625272e-07, "loss": 4.5385613441467285, "step": 11 }, { "epoch": 0.026143790849673203, "grad_norm": 2.2186083793640137, "learning_rate": 9.9760348583878e-07, "loss": 4.605264663696289, "step": 12 }, { "epoch": 0.02832244008714597, "grad_norm": 3.1009016036987305, "learning_rate": 9.973856209150327e-07, "loss": 4.743993282318115, "step": 13 }, { "epoch": 0.030501089324618737, "grad_norm": 2.4467430114746094, "learning_rate": 9.971677559912854e-07, "loss": 4.731149673461914, "step": 14 }, { "epoch": 0.032679738562091505, "grad_norm": 3.1184093952178955, "learning_rate": 9.969498910675381e-07, "loss": 4.623490810394287, "step": 15 }, { "epoch": 0.034858387799564274, "grad_norm": 2.540374994277954, "learning_rate": 9.967320261437909e-07, "loss": 4.740448474884033, "step": 16 }, { "epoch": 0.037037037037037035, "grad_norm": 2.5037412643432617, "learning_rate": 9.965141612200434e-07, "loss": 4.6794538497924805, "step": 17 }, { "epoch": 0.0392156862745098, "grad_norm": 2.528285264968872, "learning_rate": 9.962962962962964e-07, "loss": 4.517210483551025, "step": 18 }, { "epoch": 0.04139433551198257, "grad_norm": 2.685351610183716, "learning_rate": 9.96078431372549e-07, "loss": 4.633927822113037, "step": 19 }, { "epoch": 0.04357298474945534, "grad_norm": 2.660351514816284, "learning_rate": 9.958605664488016e-07, "loss": 4.512837886810303, "step": 20 }, { "epoch": 0.0457516339869281, "grad_norm": 2.0510494709014893, "learning_rate": 9.956427015250544e-07, "loss": 4.446115016937256, "step": 21 }, { "epoch": 0.04793028322440087, "grad_norm": 2.7734460830688477, "learning_rate": 9.954248366013071e-07, "loss": 4.655968189239502, "step": 22 }, { "epoch": 0.05010893246187364, "grad_norm": 3.266709089279175, "learning_rate": 9.952069716775599e-07, "loss": 4.729773044586182, "step": 23 }, { "epoch": 0.05228758169934641, "grad_norm": 3.1700313091278076, "learning_rate": 9.949891067538126e-07, "loss": 4.628276348114014, "step": 24 }, { "epoch": 0.054466230936819175, "grad_norm": 2.6991522312164307, "learning_rate": 9.947712418300654e-07, "loss": 4.719573020935059, "step": 25 }, { "epoch": 0.05664488017429194, "grad_norm": 2.118373155593872, "learning_rate": 9.94553376906318e-07, "loss": 4.662531852722168, "step": 26 }, { "epoch": 0.058823529411764705, "grad_norm": 2.0858867168426514, "learning_rate": 9.943355119825706e-07, "loss": 4.563895225524902, "step": 27 }, { "epoch": 0.06100217864923747, "grad_norm": 2.572877883911133, "learning_rate": 9.941176470588236e-07, "loss": 4.559572219848633, "step": 28 }, { "epoch": 0.06318082788671024, "grad_norm": 3.0945630073547363, "learning_rate": 9.938997821350761e-07, "loss": 4.973132610321045, "step": 29 }, { "epoch": 0.06535947712418301, "grad_norm": 2.404824733734131, "learning_rate": 9.936819172113289e-07, "loss": 4.668265342712402, "step": 30 }, { "epoch": 0.06753812636165578, "grad_norm": 2.565981388092041, "learning_rate": 9.934640522875816e-07, "loss": 4.684271335601807, "step": 31 }, { "epoch": 0.06971677559912855, "grad_norm": 2.628347635269165, "learning_rate": 9.932461873638343e-07, "loss": 4.641284942626953, "step": 32 }, { "epoch": 0.0718954248366013, "grad_norm": 2.4909212589263916, "learning_rate": 9.93028322440087e-07, "loss": 4.462052345275879, "step": 33 }, { "epoch": 0.07407407407407407, "grad_norm": 2.5438127517700195, "learning_rate": 9.928104575163398e-07, "loss": 4.660407543182373, "step": 34 }, { "epoch": 0.07625272331154684, "grad_norm": 2.7430713176727295, "learning_rate": 9.925925925925926e-07, "loss": 4.468838214874268, "step": 35 }, { "epoch": 0.0784313725490196, "grad_norm": 1.817378044128418, "learning_rate": 9.923747276688453e-07, "loss": 4.502563953399658, "step": 36 }, { "epoch": 0.08061002178649238, "grad_norm": 2.461793899536133, "learning_rate": 9.92156862745098e-07, "loss": 4.617760181427002, "step": 37 }, { "epoch": 0.08278867102396514, "grad_norm": 2.876997947692871, "learning_rate": 9.919389978213508e-07, "loss": 4.588554382324219, "step": 38 }, { "epoch": 0.08496732026143791, "grad_norm": 2.035167932510376, "learning_rate": 9.917211328976033e-07, "loss": 4.634808540344238, "step": 39 }, { "epoch": 0.08714596949891068, "grad_norm": 2.403083562850952, "learning_rate": 9.915032679738563e-07, "loss": 4.598034381866455, "step": 40 }, { "epoch": 0.08932461873638345, "grad_norm": 2.5495636463165283, "learning_rate": 9.912854030501088e-07, "loss": 4.778774738311768, "step": 41 }, { "epoch": 0.0915032679738562, "grad_norm": 3.2183687686920166, "learning_rate": 9.910675381263615e-07, "loss": 4.950235843658447, "step": 42 }, { "epoch": 0.09368191721132897, "grad_norm": 2.383429765701294, "learning_rate": 9.908496732026143e-07, "loss": 4.586299419403076, "step": 43 }, { "epoch": 0.09586056644880174, "grad_norm": 2.5705180168151855, "learning_rate": 9.90631808278867e-07, "loss": 4.535789489746094, "step": 44 }, { "epoch": 0.09803921568627451, "grad_norm": 2.547631025314331, "learning_rate": 9.904139433551198e-07, "loss": 4.4937214851379395, "step": 45 }, { "epoch": 0.10021786492374728, "grad_norm": 2.2629098892211914, "learning_rate": 9.901960784313725e-07, "loss": 4.597417831420898, "step": 46 }, { "epoch": 0.10239651416122005, "grad_norm": 2.8135907649993896, "learning_rate": 9.899782135076253e-07, "loss": 4.574379920959473, "step": 47 }, { "epoch": 0.10457516339869281, "grad_norm": 3.0317435264587402, "learning_rate": 9.89760348583878e-07, "loss": 4.711512088775635, "step": 48 }, { "epoch": 0.10675381263616558, "grad_norm": 2.2007057666778564, "learning_rate": 9.895424836601307e-07, "loss": 4.393397331237793, "step": 49 }, { "epoch": 0.10893246187363835, "grad_norm": 1.6631900072097778, "learning_rate": 9.893246187363835e-07, "loss": 4.303951263427734, "step": 50 }, { "epoch": 0.1111111111111111, "grad_norm": 2.300197124481201, "learning_rate": 9.89106753812636e-07, "loss": 4.423295497894287, "step": 51 }, { "epoch": 0.11328976034858387, "grad_norm": 3.31419038772583, "learning_rate": 9.88888888888889e-07, "loss": 4.845146179199219, "step": 52 }, { "epoch": 0.11546840958605664, "grad_norm": 4.885872840881348, "learning_rate": 9.886710239651415e-07, "loss": 4.7251386642456055, "step": 53 }, { "epoch": 0.11764705882352941, "grad_norm": 2.013712167739868, "learning_rate": 9.884531590413942e-07, "loss": 4.463345527648926, "step": 54 }, { "epoch": 0.11982570806100218, "grad_norm": 2.244058132171631, "learning_rate": 9.88235294117647e-07, "loss": 4.357031345367432, "step": 55 }, { "epoch": 0.12200435729847495, "grad_norm": 1.6353936195373535, "learning_rate": 9.880174291938997e-07, "loss": 4.378466606140137, "step": 56 }, { "epoch": 0.12418300653594772, "grad_norm": 2.651231288909912, "learning_rate": 9.877995642701525e-07, "loss": 4.720828056335449, "step": 57 }, { "epoch": 0.12636165577342048, "grad_norm": 2.1700127124786377, "learning_rate": 9.875816993464052e-07, "loss": 4.482420921325684, "step": 58 }, { "epoch": 0.12854030501089325, "grad_norm": 2.560936689376831, "learning_rate": 9.87363834422658e-07, "loss": 4.585104942321777, "step": 59 }, { "epoch": 0.13071895424836602, "grad_norm": 1.7797707319259644, "learning_rate": 9.871459694989107e-07, "loss": 4.3326640129089355, "step": 60 }, { "epoch": 0.1328976034858388, "grad_norm": 2.122281312942505, "learning_rate": 9.869281045751634e-07, "loss": 4.474560737609863, "step": 61 }, { "epoch": 0.13507625272331156, "grad_norm": 2.31705904006958, "learning_rate": 9.867102396514162e-07, "loss": 4.515090465545654, "step": 62 }, { "epoch": 0.13725490196078433, "grad_norm": 1.9046130180358887, "learning_rate": 9.864923747276687e-07, "loss": 4.418384075164795, "step": 63 }, { "epoch": 0.1394335511982571, "grad_norm": 2.2110002040863037, "learning_rate": 9.862745098039217e-07, "loss": 4.565557479858398, "step": 64 }, { "epoch": 0.14161220043572983, "grad_norm": 2.6025781631469727, "learning_rate": 9.860566448801742e-07, "loss": 4.6747918128967285, "step": 65 }, { "epoch": 0.1437908496732026, "grad_norm": 2.562697172164917, "learning_rate": 9.85838779956427e-07, "loss": 4.7553181648254395, "step": 66 }, { "epoch": 0.14596949891067537, "grad_norm": 1.8945618867874146, "learning_rate": 9.856209150326797e-07, "loss": 4.398238658905029, "step": 67 }, { "epoch": 0.14814814814814814, "grad_norm": 2.071842670440674, "learning_rate": 9.854030501089324e-07, "loss": 4.499670028686523, "step": 68 }, { "epoch": 0.1503267973856209, "grad_norm": 2.426379919052124, "learning_rate": 9.851851851851852e-07, "loss": 4.735289573669434, "step": 69 }, { "epoch": 0.15250544662309368, "grad_norm": 2.2096705436706543, "learning_rate": 9.84967320261438e-07, "loss": 4.6081366539001465, "step": 70 }, { "epoch": 0.15468409586056645, "grad_norm": 2.9545485973358154, "learning_rate": 9.847494553376907e-07, "loss": 4.699771404266357, "step": 71 }, { "epoch": 0.1568627450980392, "grad_norm": 1.859724998474121, "learning_rate": 9.845315904139432e-07, "loss": 4.393412113189697, "step": 72 }, { "epoch": 0.15904139433551198, "grad_norm": 2.594811201095581, "learning_rate": 9.84313725490196e-07, "loss": 4.750296115875244, "step": 73 }, { "epoch": 0.16122004357298475, "grad_norm": 2.0538148880004883, "learning_rate": 9.840958605664487e-07, "loss": 4.523125171661377, "step": 74 }, { "epoch": 0.16339869281045752, "grad_norm": 2.1957290172576904, "learning_rate": 9.838779956427014e-07, "loss": 4.5541887283325195, "step": 75 }, { "epoch": 0.1655773420479303, "grad_norm": 2.0892882347106934, "learning_rate": 9.836601307189542e-07, "loss": 4.504140853881836, "step": 76 }, { "epoch": 0.16775599128540306, "grad_norm": 2.101895570755005, "learning_rate": 9.83442265795207e-07, "loss": 4.47087287902832, "step": 77 }, { "epoch": 0.16993464052287582, "grad_norm": 3.0677924156188965, "learning_rate": 9.832244008714596e-07, "loss": 4.681562423706055, "step": 78 }, { "epoch": 0.1721132897603486, "grad_norm": 1.7497062683105469, "learning_rate": 9.830065359477124e-07, "loss": 4.426023006439209, "step": 79 }, { "epoch": 0.17429193899782136, "grad_norm": 2.7591934204101562, "learning_rate": 9.827886710239651e-07, "loss": 4.677259922027588, "step": 80 }, { "epoch": 0.17647058823529413, "grad_norm": 1.8994778394699097, "learning_rate": 9.825708061002179e-07, "loss": 4.306804656982422, "step": 81 }, { "epoch": 0.1786492374727669, "grad_norm": 1.7388572692871094, "learning_rate": 9.823529411764704e-07, "loss": 4.367487907409668, "step": 82 }, { "epoch": 0.18082788671023964, "grad_norm": 1.8251928091049194, "learning_rate": 9.821350762527234e-07, "loss": 4.492748737335205, "step": 83 }, { "epoch": 0.1830065359477124, "grad_norm": 2.0507001876831055, "learning_rate": 9.819172113289759e-07, "loss": 4.469226360321045, "step": 84 }, { "epoch": 0.18518518518518517, "grad_norm": 3.837737798690796, "learning_rate": 9.816993464052286e-07, "loss": 4.880284786224365, "step": 85 }, { "epoch": 0.18736383442265794, "grad_norm": 2.32017183303833, "learning_rate": 9.814814814814814e-07, "loss": 4.6307244300842285, "step": 86 }, { "epoch": 0.1895424836601307, "grad_norm": 2.070546865463257, "learning_rate": 9.812636165577341e-07, "loss": 4.403173446655273, "step": 87 }, { "epoch": 0.19172113289760348, "grad_norm": 1.9438649415969849, "learning_rate": 9.810457516339869e-07, "loss": 4.322348117828369, "step": 88 }, { "epoch": 0.19389978213507625, "grad_norm": 2.0908868312835693, "learning_rate": 9.808278867102396e-07, "loss": 4.387789726257324, "step": 89 }, { "epoch": 0.19607843137254902, "grad_norm": 1.9938021898269653, "learning_rate": 9.806100217864923e-07, "loss": 4.4689040184021, "step": 90 }, { "epoch": 0.19825708061002179, "grad_norm": 2.0833871364593506, "learning_rate": 9.80392156862745e-07, "loss": 4.506947040557861, "step": 91 }, { "epoch": 0.20043572984749455, "grad_norm": 1.6076719760894775, "learning_rate": 9.801742919389978e-07, "loss": 4.388120174407959, "step": 92 }, { "epoch": 0.20261437908496732, "grad_norm": 2.3387317657470703, "learning_rate": 9.799564270152506e-07, "loss": 4.3787841796875, "step": 93 }, { "epoch": 0.2047930283224401, "grad_norm": 1.8435430526733398, "learning_rate": 9.79738562091503e-07, "loss": 4.4166035652160645, "step": 94 }, { "epoch": 0.20697167755991286, "grad_norm": 1.8981658220291138, "learning_rate": 9.79520697167756e-07, "loss": 4.423433303833008, "step": 95 }, { "epoch": 0.20915032679738563, "grad_norm": 2.6306850910186768, "learning_rate": 9.793028322440086e-07, "loss": 4.500404357910156, "step": 96 }, { "epoch": 0.2113289760348584, "grad_norm": 1.6204158067703247, "learning_rate": 9.790849673202613e-07, "loss": 4.223199367523193, "step": 97 }, { "epoch": 0.21350762527233116, "grad_norm": 1.995589017868042, "learning_rate": 9.78867102396514e-07, "loss": 4.439243793487549, "step": 98 }, { "epoch": 0.21568627450980393, "grad_norm": 2.1368906497955322, "learning_rate": 9.786492374727668e-07, "loss": 4.440408229827881, "step": 99 }, { "epoch": 0.2178649237472767, "grad_norm": 1.9011905193328857, "learning_rate": 9.784313725490196e-07, "loss": 4.256593704223633, "step": 100 }, { "epoch": 0.22004357298474944, "grad_norm": 2.1839473247528076, "learning_rate": 9.782135076252723e-07, "loss": 4.516264915466309, "step": 101 }, { "epoch": 0.2222222222222222, "grad_norm": 2.4889614582061768, "learning_rate": 9.77995642701525e-07, "loss": 4.4012908935546875, "step": 102 }, { "epoch": 0.22440087145969498, "grad_norm": 2.1062400341033936, "learning_rate": 9.777777777777778e-07, "loss": 4.377523899078369, "step": 103 }, { "epoch": 0.22657952069716775, "grad_norm": 2.010125160217285, "learning_rate": 9.775599128540305e-07, "loss": 4.375840187072754, "step": 104 }, { "epoch": 0.22875816993464052, "grad_norm": 2.392651319503784, "learning_rate": 9.773420479302833e-07, "loss": 4.444744110107422, "step": 105 }, { "epoch": 0.23093681917211328, "grad_norm": 1.9789717197418213, "learning_rate": 9.771241830065358e-07, "loss": 4.410606384277344, "step": 106 }, { "epoch": 0.23311546840958605, "grad_norm": 1.8157169818878174, "learning_rate": 9.769063180827888e-07, "loss": 4.317819595336914, "step": 107 }, { "epoch": 0.23529411764705882, "grad_norm": 2.4926273822784424, "learning_rate": 9.766884531590413e-07, "loss": 4.318043231964111, "step": 108 }, { "epoch": 0.2374727668845316, "grad_norm": 1.8115081787109375, "learning_rate": 9.76470588235294e-07, "loss": 4.343961238861084, "step": 109 }, { "epoch": 0.23965141612200436, "grad_norm": 2.1982710361480713, "learning_rate": 9.762527233115468e-07, "loss": 4.384943962097168, "step": 110 }, { "epoch": 0.24183006535947713, "grad_norm": 1.9262521266937256, "learning_rate": 9.760348583877995e-07, "loss": 4.344392776489258, "step": 111 }, { "epoch": 0.2440087145969499, "grad_norm": 2.537219285964966, "learning_rate": 9.758169934640523e-07, "loss": 4.52830171585083, "step": 112 }, { "epoch": 0.24618736383442266, "grad_norm": 2.884486675262451, "learning_rate": 9.75599128540305e-07, "loss": 4.582674980163574, "step": 113 }, { "epoch": 0.24836601307189543, "grad_norm": 2.7616825103759766, "learning_rate": 9.753812636165577e-07, "loss": 4.294858455657959, "step": 114 }, { "epoch": 0.25054466230936817, "grad_norm": 1.4845702648162842, "learning_rate": 9.751633986928105e-07, "loss": 4.354666233062744, "step": 115 }, { "epoch": 0.25272331154684097, "grad_norm": 1.797873854637146, "learning_rate": 9.749455337690632e-07, "loss": 4.310470104217529, "step": 116 }, { "epoch": 0.2549019607843137, "grad_norm": 2.25718355178833, "learning_rate": 9.74727668845316e-07, "loss": 4.4831223487854, "step": 117 }, { "epoch": 0.2570806100217865, "grad_norm": 2.2784883975982666, "learning_rate": 9.745098039215685e-07, "loss": 4.50039005279541, "step": 118 }, { "epoch": 0.25925925925925924, "grad_norm": 1.6839503049850464, "learning_rate": 9.742919389978214e-07, "loss": 4.3794145584106445, "step": 119 }, { "epoch": 0.26143790849673204, "grad_norm": 1.8006739616394043, "learning_rate": 9.74074074074074e-07, "loss": 4.287017822265625, "step": 120 }, { "epoch": 0.2636165577342048, "grad_norm": 2.229154586791992, "learning_rate": 9.738562091503267e-07, "loss": 4.192268371582031, "step": 121 }, { "epoch": 0.2657952069716776, "grad_norm": 2.1306228637695312, "learning_rate": 9.736383442265795e-07, "loss": 4.369370460510254, "step": 122 }, { "epoch": 0.2679738562091503, "grad_norm": 2.991955518722534, "learning_rate": 9.734204793028322e-07, "loss": 4.580495834350586, "step": 123 }, { "epoch": 0.2701525054466231, "grad_norm": 2.7565677165985107, "learning_rate": 9.73202614379085e-07, "loss": 4.399185657501221, "step": 124 }, { "epoch": 0.27233115468409586, "grad_norm": 2.9067633152008057, "learning_rate": 9.729847494553377e-07, "loss": 4.397314548492432, "step": 125 }, { "epoch": 0.27450980392156865, "grad_norm": 2.146064043045044, "learning_rate": 9.727668845315904e-07, "loss": 4.473128318786621, "step": 126 }, { "epoch": 0.2766884531590414, "grad_norm": 3.156425952911377, "learning_rate": 9.725490196078432e-07, "loss": 4.540253639221191, "step": 127 }, { "epoch": 0.2788671023965142, "grad_norm": 2.0272207260131836, "learning_rate": 9.723311546840957e-07, "loss": 4.391735553741455, "step": 128 }, { "epoch": 0.28104575163398693, "grad_norm": 2.0880813598632812, "learning_rate": 9.721132897603487e-07, "loss": 4.582479476928711, "step": 129 }, { "epoch": 0.28322440087145967, "grad_norm": 2.07149076461792, "learning_rate": 9.718954248366012e-07, "loss": 4.342536449432373, "step": 130 }, { "epoch": 0.28540305010893247, "grad_norm": 2.4895875453948975, "learning_rate": 9.71677559912854e-07, "loss": 4.407384395599365, "step": 131 }, { "epoch": 0.2875816993464052, "grad_norm": 1.7429146766662598, "learning_rate": 9.714596949891067e-07, "loss": 4.2959675788879395, "step": 132 }, { "epoch": 0.289760348583878, "grad_norm": 2.0402631759643555, "learning_rate": 9.712418300653594e-07, "loss": 4.344051837921143, "step": 133 }, { "epoch": 0.29193899782135074, "grad_norm": 2.063497304916382, "learning_rate": 9.710239651416122e-07, "loss": 4.3666534423828125, "step": 134 }, { "epoch": 0.29411764705882354, "grad_norm": 2.166395425796509, "learning_rate": 9.70806100217865e-07, "loss": 4.453709125518799, "step": 135 }, { "epoch": 0.2962962962962963, "grad_norm": 2.547016143798828, "learning_rate": 9.705882352941176e-07, "loss": 4.574546813964844, "step": 136 }, { "epoch": 0.2984749455337691, "grad_norm": 2.0315990447998047, "learning_rate": 9.703703703703704e-07, "loss": 4.206840515136719, "step": 137 }, { "epoch": 0.3006535947712418, "grad_norm": 2.6766695976257324, "learning_rate": 9.701525054466231e-07, "loss": 4.50142240524292, "step": 138 }, { "epoch": 0.3028322440087146, "grad_norm": 1.7705003023147583, "learning_rate": 9.699346405228759e-07, "loss": 4.3514580726623535, "step": 139 }, { "epoch": 0.30501089324618735, "grad_norm": 2.578533411026001, "learning_rate": 9.697167755991284e-07, "loss": 4.358044147491455, "step": 140 }, { "epoch": 0.30718954248366015, "grad_norm": 1.894898772239685, "learning_rate": 9.694989106753814e-07, "loss": 4.336588382720947, "step": 141 }, { "epoch": 0.3093681917211329, "grad_norm": 2.2183732986450195, "learning_rate": 9.692810457516339e-07, "loss": 4.514852523803711, "step": 142 }, { "epoch": 0.3115468409586057, "grad_norm": 3.2815968990325928, "learning_rate": 9.690631808278866e-07, "loss": 4.497104644775391, "step": 143 }, { "epoch": 0.3137254901960784, "grad_norm": 2.4144210815429688, "learning_rate": 9.688453159041394e-07, "loss": 4.424169540405273, "step": 144 }, { "epoch": 0.3159041394335512, "grad_norm": 2.2354543209075928, "learning_rate": 9.686274509803921e-07, "loss": 4.575407981872559, "step": 145 }, { "epoch": 0.31808278867102396, "grad_norm": 1.9561657905578613, "learning_rate": 9.684095860566449e-07, "loss": 4.317360877990723, "step": 146 }, { "epoch": 0.3202614379084967, "grad_norm": 2.022301435470581, "learning_rate": 9.681917211328976e-07, "loss": 4.194309234619141, "step": 147 }, { "epoch": 0.3224400871459695, "grad_norm": 2.0781233310699463, "learning_rate": 9.679738562091503e-07, "loss": 4.375706195831299, "step": 148 }, { "epoch": 0.32461873638344224, "grad_norm": 2.190183639526367, "learning_rate": 9.677559912854029e-07, "loss": 4.407813549041748, "step": 149 }, { "epoch": 0.32679738562091504, "grad_norm": 2.3785204887390137, "learning_rate": 9.675381263616558e-07, "loss": 4.511362075805664, "step": 150 }, { "epoch": 0.3289760348583878, "grad_norm": 1.4752343893051147, "learning_rate": 9.673202614379084e-07, "loss": 4.210807800292969, "step": 151 }, { "epoch": 0.3311546840958606, "grad_norm": 2.164095401763916, "learning_rate": 9.67102396514161e-07, "loss": 4.268368244171143, "step": 152 }, { "epoch": 0.3333333333333333, "grad_norm": 2.8139126300811768, "learning_rate": 9.668845315904138e-07, "loss": 4.446538925170898, "step": 153 }, { "epoch": 0.3355119825708061, "grad_norm": 2.291938543319702, "learning_rate": 9.666666666666666e-07, "loss": 4.46967077255249, "step": 154 }, { "epoch": 0.33769063180827885, "grad_norm": 1.9000574350357056, "learning_rate": 9.664488017429193e-07, "loss": 4.3823442459106445, "step": 155 }, { "epoch": 0.33986928104575165, "grad_norm": 2.3056159019470215, "learning_rate": 9.66230936819172e-07, "loss": 4.384174823760986, "step": 156 }, { "epoch": 0.3420479302832244, "grad_norm": 1.7677370309829712, "learning_rate": 9.660130718954248e-07, "loss": 4.266576766967773, "step": 157 }, { "epoch": 0.3442265795206972, "grad_norm": 2.136509895324707, "learning_rate": 9.657952069716776e-07, "loss": 4.410580158233643, "step": 158 }, { "epoch": 0.3464052287581699, "grad_norm": 2.492251396179199, "learning_rate": 9.655773420479303e-07, "loss": 4.469940662384033, "step": 159 }, { "epoch": 0.3485838779956427, "grad_norm": 2.383185386657715, "learning_rate": 9.65359477124183e-07, "loss": 4.482204437255859, "step": 160 }, { "epoch": 0.35076252723311546, "grad_norm": 2.1385204792022705, "learning_rate": 9.651416122004356e-07, "loss": 4.388243675231934, "step": 161 }, { "epoch": 0.35294117647058826, "grad_norm": 2.415370464324951, "learning_rate": 9.649237472766885e-07, "loss": 4.40315055847168, "step": 162 }, { "epoch": 0.355119825708061, "grad_norm": 2.164105176925659, "learning_rate": 9.64705882352941e-07, "loss": 4.432247161865234, "step": 163 }, { "epoch": 0.3572984749455338, "grad_norm": 2.2456839084625244, "learning_rate": 9.644880174291938e-07, "loss": 4.266363143920898, "step": 164 }, { "epoch": 0.35947712418300654, "grad_norm": 2.361248016357422, "learning_rate": 9.642701525054465e-07, "loss": 4.5409650802612305, "step": 165 }, { "epoch": 0.3616557734204793, "grad_norm": 3.0007307529449463, "learning_rate": 9.640522875816993e-07, "loss": 4.580550670623779, "step": 166 }, { "epoch": 0.3638344226579521, "grad_norm": 1.6576095819473267, "learning_rate": 9.63834422657952e-07, "loss": 4.340091228485107, "step": 167 }, { "epoch": 0.3660130718954248, "grad_norm": 1.9235918521881104, "learning_rate": 9.636165577342048e-07, "loss": 4.221652030944824, "step": 168 }, { "epoch": 0.3681917211328976, "grad_norm": 2.0118086338043213, "learning_rate": 9.633986928104575e-07, "loss": 4.3488874435424805, "step": 169 }, { "epoch": 0.37037037037037035, "grad_norm": 2.1841843128204346, "learning_rate": 9.631808278867103e-07, "loss": 4.367924213409424, "step": 170 }, { "epoch": 0.37254901960784315, "grad_norm": 2.247666835784912, "learning_rate": 9.629629629629628e-07, "loss": 4.505679607391357, "step": 171 }, { "epoch": 0.3747276688453159, "grad_norm": 1.8381444215774536, "learning_rate": 9.627450980392157e-07, "loss": 4.235549449920654, "step": 172 }, { "epoch": 0.3769063180827887, "grad_norm": 2.7297332286834717, "learning_rate": 9.625272331154683e-07, "loss": 4.471020221710205, "step": 173 }, { "epoch": 0.3790849673202614, "grad_norm": 1.9857988357543945, "learning_rate": 9.62309368191721e-07, "loss": 4.335488319396973, "step": 174 }, { "epoch": 0.3812636165577342, "grad_norm": 1.8306801319122314, "learning_rate": 9.620915032679738e-07, "loss": 4.086997985839844, "step": 175 }, { "epoch": 0.38344226579520696, "grad_norm": 1.687875747680664, "learning_rate": 9.618736383442265e-07, "loss": 4.265539169311523, "step": 176 }, { "epoch": 0.38562091503267976, "grad_norm": 1.860582947731018, "learning_rate": 9.616557734204792e-07, "loss": 4.2574849128723145, "step": 177 }, { "epoch": 0.3877995642701525, "grad_norm": 2.502021312713623, "learning_rate": 9.61437908496732e-07, "loss": 4.495733737945557, "step": 178 }, { "epoch": 0.3899782135076253, "grad_norm": 1.581700325012207, "learning_rate": 9.612200435729847e-07, "loss": 4.240194320678711, "step": 179 }, { "epoch": 0.39215686274509803, "grad_norm": 2.7113497257232666, "learning_rate": 9.610021786492375e-07, "loss": 4.6046671867370605, "step": 180 }, { "epoch": 0.39433551198257083, "grad_norm": 2.4433610439300537, "learning_rate": 9.607843137254902e-07, "loss": 4.441323280334473, "step": 181 }, { "epoch": 0.39651416122004357, "grad_norm": 1.9853641986846924, "learning_rate": 9.60566448801743e-07, "loss": 4.359451770782471, "step": 182 }, { "epoch": 0.39869281045751637, "grad_norm": 2.133108377456665, "learning_rate": 9.603485838779955e-07, "loss": 4.289493083953857, "step": 183 }, { "epoch": 0.4008714596949891, "grad_norm": 1.7538727521896362, "learning_rate": 9.601307189542484e-07, "loss": 4.28318452835083, "step": 184 }, { "epoch": 0.40305010893246185, "grad_norm": 3.3512535095214844, "learning_rate": 9.59912854030501e-07, "loss": 4.550060272216797, "step": 185 }, { "epoch": 0.40522875816993464, "grad_norm": 1.5646427869796753, "learning_rate": 9.596949891067537e-07, "loss": 4.34261417388916, "step": 186 }, { "epoch": 0.4074074074074074, "grad_norm": 1.7791746854782104, "learning_rate": 9.594771241830065e-07, "loss": 4.240348815917969, "step": 187 }, { "epoch": 0.4095860566448802, "grad_norm": 2.326260805130005, "learning_rate": 9.592592592592592e-07, "loss": 4.4669952392578125, "step": 188 }, { "epoch": 0.4117647058823529, "grad_norm": 1.837967038154602, "learning_rate": 9.59041394335512e-07, "loss": 4.39886474609375, "step": 189 }, { "epoch": 0.4139433551198257, "grad_norm": 1.744829535484314, "learning_rate": 9.588235294117647e-07, "loss": 4.2220540046691895, "step": 190 }, { "epoch": 0.41612200435729846, "grad_norm": 1.8719234466552734, "learning_rate": 9.586056644880174e-07, "loss": 4.2246575355529785, "step": 191 }, { "epoch": 0.41830065359477125, "grad_norm": 2.3283026218414307, "learning_rate": 9.583877995642702e-07, "loss": 4.3507843017578125, "step": 192 }, { "epoch": 0.420479302832244, "grad_norm": 1.8943367004394531, "learning_rate": 9.58169934640523e-07, "loss": 4.162900924682617, "step": 193 }, { "epoch": 0.4226579520697168, "grad_norm": 2.661996364593506, "learning_rate": 9.579520697167757e-07, "loss": 4.450239658355713, "step": 194 }, { "epoch": 0.42483660130718953, "grad_norm": 2.0999133586883545, "learning_rate": 9.577342047930282e-07, "loss": 4.250425338745117, "step": 195 }, { "epoch": 0.42701525054466233, "grad_norm": 1.9673007726669312, "learning_rate": 9.575163398692811e-07, "loss": 4.474855422973633, "step": 196 }, { "epoch": 0.42919389978213507, "grad_norm": 1.8445521593093872, "learning_rate": 9.572984749455337e-07, "loss": 4.2301716804504395, "step": 197 }, { "epoch": 0.43137254901960786, "grad_norm": 2.857212781906128, "learning_rate": 9.570806100217864e-07, "loss": 4.531691551208496, "step": 198 }, { "epoch": 0.4335511982570806, "grad_norm": 2.1888811588287354, "learning_rate": 9.568627450980392e-07, "loss": 4.4800615310668945, "step": 199 }, { "epoch": 0.4357298474945534, "grad_norm": 1.7973730564117432, "learning_rate": 9.566448801742919e-07, "loss": 4.28166389465332, "step": 200 }, { "epoch": 0.43790849673202614, "grad_norm": 3.003615140914917, "learning_rate": 9.564270152505446e-07, "loss": 4.628840446472168, "step": 201 }, { "epoch": 0.4400871459694989, "grad_norm": 1.9880698919296265, "learning_rate": 9.562091503267974e-07, "loss": 4.241949558258057, "step": 202 }, { "epoch": 0.4422657952069717, "grad_norm": 1.7790374755859375, "learning_rate": 9.559912854030501e-07, "loss": 4.304915904998779, "step": 203 }, { "epoch": 0.4444444444444444, "grad_norm": 1.8285244703292847, "learning_rate": 9.557734204793029e-07, "loss": 4.2064104080200195, "step": 204 }, { "epoch": 0.4466230936819172, "grad_norm": 2.522207260131836, "learning_rate": 9.555555555555556e-07, "loss": 4.3665032386779785, "step": 205 }, { "epoch": 0.44880174291938996, "grad_norm": 2.157550573348999, "learning_rate": 9.553376906318083e-07, "loss": 4.449486255645752, "step": 206 }, { "epoch": 0.45098039215686275, "grad_norm": 2.3344943523406982, "learning_rate": 9.551198257080609e-07, "loss": 4.397902488708496, "step": 207 }, { "epoch": 0.4531590413943355, "grad_norm": 1.9231369495391846, "learning_rate": 9.549019607843138e-07, "loss": 4.240512847900391, "step": 208 }, { "epoch": 0.4553376906318083, "grad_norm": 2.3051517009735107, "learning_rate": 9.546840958605664e-07, "loss": 4.396009922027588, "step": 209 }, { "epoch": 0.45751633986928103, "grad_norm": 1.9424047470092773, "learning_rate": 9.544662309368191e-07, "loss": 4.256721019744873, "step": 210 }, { "epoch": 0.4596949891067538, "grad_norm": 1.8167564868927002, "learning_rate": 9.542483660130718e-07, "loss": 4.232820510864258, "step": 211 }, { "epoch": 0.46187363834422657, "grad_norm": 2.1137053966522217, "learning_rate": 9.540305010893246e-07, "loss": 4.301129341125488, "step": 212 }, { "epoch": 0.46405228758169936, "grad_norm": 2.390566349029541, "learning_rate": 9.538126361655773e-07, "loss": 4.2755656242370605, "step": 213 }, { "epoch": 0.4662309368191721, "grad_norm": 2.054520845413208, "learning_rate": 9.535947712418301e-07, "loss": 4.281137943267822, "step": 214 }, { "epoch": 0.4684095860566449, "grad_norm": 1.689180612564087, "learning_rate": 9.533769063180827e-07, "loss": 4.184370040893555, "step": 215 }, { "epoch": 0.47058823529411764, "grad_norm": 1.9673049449920654, "learning_rate": 9.531590413943355e-07, "loss": 4.297755241394043, "step": 216 }, { "epoch": 0.47276688453159044, "grad_norm": 2.2872281074523926, "learning_rate": 9.529411764705881e-07, "loss": 4.3808817863464355, "step": 217 }, { "epoch": 0.4749455337690632, "grad_norm": 2.2688164710998535, "learning_rate": 9.527233115468409e-07, "loss": 4.442854404449463, "step": 218 }, { "epoch": 0.477124183006536, "grad_norm": 2.626431703567505, "learning_rate": 9.525054466230936e-07, "loss": 4.416866302490234, "step": 219 }, { "epoch": 0.4793028322440087, "grad_norm": 2.3252599239349365, "learning_rate": 9.522875816993463e-07, "loss": 4.290740013122559, "step": 220 }, { "epoch": 0.48148148148148145, "grad_norm": 2.6100525856018066, "learning_rate": 9.520697167755991e-07, "loss": 4.216197490692139, "step": 221 }, { "epoch": 0.48366013071895425, "grad_norm": 2.0119993686676025, "learning_rate": 9.518518518518518e-07, "loss": 4.252105236053467, "step": 222 }, { "epoch": 0.485838779956427, "grad_norm": 2.0626165866851807, "learning_rate": 9.516339869281044e-07, "loss": 4.326197624206543, "step": 223 }, { "epoch": 0.4880174291938998, "grad_norm": 1.8896783590316772, "learning_rate": 9.514161220043573e-07, "loss": 4.099881649017334, "step": 224 }, { "epoch": 0.49019607843137253, "grad_norm": 1.7077407836914062, "learning_rate": 9.511982570806099e-07, "loss": 4.1440558433532715, "step": 225 }, { "epoch": 0.4923747276688453, "grad_norm": 1.8319593667984009, "learning_rate": 9.509803921568627e-07, "loss": 4.277885913848877, "step": 226 }, { "epoch": 0.49455337690631807, "grad_norm": 2.157092332839966, "learning_rate": 9.507625272331154e-07, "loss": 4.496051788330078, "step": 227 }, { "epoch": 0.49673202614379086, "grad_norm": 1.6478551626205444, "learning_rate": 9.505446623093682e-07, "loss": 4.266190528869629, "step": 228 }, { "epoch": 0.4989106753812636, "grad_norm": 1.7191162109375, "learning_rate": 9.503267973856208e-07, "loss": 4.236968994140625, "step": 229 }, { "epoch": 0.5010893246187363, "grad_norm": 1.7083362340927124, "learning_rate": 9.501089324618736e-07, "loss": 4.369157791137695, "step": 230 }, { "epoch": 0.5032679738562091, "grad_norm": 2.716724157333374, "learning_rate": 9.498910675381263e-07, "loss": 4.3496317863464355, "step": 231 }, { "epoch": 0.5054466230936819, "grad_norm": 1.9012598991394043, "learning_rate": 9.49673202614379e-07, "loss": 4.437310695648193, "step": 232 }, { "epoch": 0.5076252723311547, "grad_norm": 2.0860955715179443, "learning_rate": 9.494553376906318e-07, "loss": 4.3534345626831055, "step": 233 }, { "epoch": 0.5098039215686274, "grad_norm": 1.9643157720565796, "learning_rate": 9.492374727668845e-07, "loss": 4.349437236785889, "step": 234 }, { "epoch": 0.5119825708061002, "grad_norm": 2.047245502471924, "learning_rate": 9.490196078431371e-07, "loss": 4.356471061706543, "step": 235 }, { "epoch": 0.514161220043573, "grad_norm": 1.9418092966079712, "learning_rate": 9.4880174291939e-07, "loss": 4.271924018859863, "step": 236 }, { "epoch": 0.5163398692810458, "grad_norm": 1.984614372253418, "learning_rate": 9.485838779956426e-07, "loss": 4.043069839477539, "step": 237 }, { "epoch": 0.5185185185185185, "grad_norm": 2.2965073585510254, "learning_rate": 9.483660130718954e-07, "loss": 4.426994323730469, "step": 238 }, { "epoch": 0.5206971677559913, "grad_norm": 1.6690049171447754, "learning_rate": 9.481481481481481e-07, "loss": 4.222958087921143, "step": 239 }, { "epoch": 0.5228758169934641, "grad_norm": 2.3808162212371826, "learning_rate": 9.479302832244009e-07, "loss": 4.478995323181152, "step": 240 }, { "epoch": 0.5250544662309368, "grad_norm": 1.7484283447265625, "learning_rate": 9.477124183006535e-07, "loss": 4.249248027801514, "step": 241 }, { "epoch": 0.5272331154684096, "grad_norm": 2.5079264640808105, "learning_rate": 9.474945533769063e-07, "loss": 4.2816548347473145, "step": 242 }, { "epoch": 0.5294117647058824, "grad_norm": 2.8622679710388184, "learning_rate": 9.47276688453159e-07, "loss": 4.538039207458496, "step": 243 }, { "epoch": 0.5315904139433552, "grad_norm": 2.358126163482666, "learning_rate": 9.470588235294117e-07, "loss": 4.52814245223999, "step": 244 }, { "epoch": 0.5337690631808278, "grad_norm": 2.1673479080200195, "learning_rate": 9.468409586056645e-07, "loss": 4.197198867797852, "step": 245 }, { "epoch": 0.5359477124183006, "grad_norm": 2.0004494190216064, "learning_rate": 9.466230936819172e-07, "loss": 4.114048480987549, "step": 246 }, { "epoch": 0.5381263616557734, "grad_norm": 1.7733162641525269, "learning_rate": 9.464052287581698e-07, "loss": 4.3549065589904785, "step": 247 }, { "epoch": 0.5403050108932462, "grad_norm": 2.3709092140197754, "learning_rate": 9.461873638344227e-07, "loss": 4.270395278930664, "step": 248 }, { "epoch": 0.5424836601307189, "grad_norm": 1.7482978105545044, "learning_rate": 9.459694989106753e-07, "loss": 4.320254325866699, "step": 249 }, { "epoch": 0.5446623093681917, "grad_norm": 1.8241703510284424, "learning_rate": 9.457516339869281e-07, "loss": 4.232961654663086, "step": 250 }, { "epoch": 0.5468409586056645, "grad_norm": 2.852616786956787, "learning_rate": 9.455337690631808e-07, "loss": 4.582944393157959, "step": 251 }, { "epoch": 0.5490196078431373, "grad_norm": 2.2779054641723633, "learning_rate": 9.453159041394335e-07, "loss": 4.473123550415039, "step": 252 }, { "epoch": 0.55119825708061, "grad_norm": 2.315554618835449, "learning_rate": 9.450980392156862e-07, "loss": 4.4862141609191895, "step": 253 }, { "epoch": 0.5533769063180828, "grad_norm": 1.513036847114563, "learning_rate": 9.44880174291939e-07, "loss": 4.0746355056762695, "step": 254 }, { "epoch": 0.5555555555555556, "grad_norm": 2.1710667610168457, "learning_rate": 9.446623093681917e-07, "loss": 4.378049850463867, "step": 255 }, { "epoch": 0.5577342047930284, "grad_norm": 2.427969217300415, "learning_rate": 9.444444444444444e-07, "loss": 4.507735252380371, "step": 256 }, { "epoch": 0.5599128540305011, "grad_norm": 2.7602696418762207, "learning_rate": 9.442265795206972e-07, "loss": 4.3347368240356445, "step": 257 }, { "epoch": 0.5620915032679739, "grad_norm": 2.0364744663238525, "learning_rate": 9.440087145969499e-07, "loss": 4.31652307510376, "step": 258 }, { "epoch": 0.5642701525054467, "grad_norm": 2.303499698638916, "learning_rate": 9.437908496732025e-07, "loss": 4.130202770233154, "step": 259 }, { "epoch": 0.5664488017429193, "grad_norm": 1.4112861156463623, "learning_rate": 9.435729847494554e-07, "loss": 4.151785850524902, "step": 260 }, { "epoch": 0.5686274509803921, "grad_norm": 1.8270570039749146, "learning_rate": 9.43355119825708e-07, "loss": 4.369143486022949, "step": 261 }, { "epoch": 0.5708061002178649, "grad_norm": 2.8040060997009277, "learning_rate": 9.431372549019608e-07, "loss": 4.293192386627197, "step": 262 }, { "epoch": 0.5729847494553377, "grad_norm": 2.1216909885406494, "learning_rate": 9.429193899782135e-07, "loss": 4.178761005401611, "step": 263 }, { "epoch": 0.5751633986928104, "grad_norm": 3.012073040008545, "learning_rate": 9.427015250544662e-07, "loss": 4.526463985443115, "step": 264 }, { "epoch": 0.5773420479302832, "grad_norm": 2.417126417160034, "learning_rate": 9.424836601307189e-07, "loss": 4.480845928192139, "step": 265 }, { "epoch": 0.579520697167756, "grad_norm": 1.8935476541519165, "learning_rate": 9.422657952069716e-07, "loss": 4.2331318855285645, "step": 266 }, { "epoch": 0.5816993464052288, "grad_norm": 2.131685733795166, "learning_rate": 9.420479302832244e-07, "loss": 4.164639949798584, "step": 267 }, { "epoch": 0.5838779956427015, "grad_norm": 1.919486165046692, "learning_rate": 9.418300653594771e-07, "loss": 4.158851146697998, "step": 268 }, { "epoch": 0.5860566448801743, "grad_norm": 1.8236701488494873, "learning_rate": 9.416122004357297e-07, "loss": 4.261025905609131, "step": 269 }, { "epoch": 0.5882352941176471, "grad_norm": 1.9933416843414307, "learning_rate": 9.413943355119826e-07, "loss": 4.264568328857422, "step": 270 }, { "epoch": 0.5904139433551199, "grad_norm": 2.719871759414673, "learning_rate": 9.411764705882352e-07, "loss": 4.393710613250732, "step": 271 }, { "epoch": 0.5925925925925926, "grad_norm": 2.377943992614746, "learning_rate": 9.40958605664488e-07, "loss": 4.4187188148498535, "step": 272 }, { "epoch": 0.5947712418300654, "grad_norm": 1.8118345737457275, "learning_rate": 9.407407407407407e-07, "loss": 4.238672733306885, "step": 273 }, { "epoch": 0.5969498910675382, "grad_norm": 3.0528037548065186, "learning_rate": 9.405228758169935e-07, "loss": 4.501391887664795, "step": 274 }, { "epoch": 0.599128540305011, "grad_norm": 1.970699667930603, "learning_rate": 9.403050108932461e-07, "loss": 4.277827739715576, "step": 275 }, { "epoch": 0.6013071895424836, "grad_norm": 2.3799169063568115, "learning_rate": 9.400871459694989e-07, "loss": 4.331878662109375, "step": 276 }, { "epoch": 0.6034858387799564, "grad_norm": 1.6306818723678589, "learning_rate": 9.398692810457516e-07, "loss": 4.269891738891602, "step": 277 }, { "epoch": 0.6056644880174292, "grad_norm": 2.062476873397827, "learning_rate": 9.396514161220042e-07, "loss": 4.327604293823242, "step": 278 }, { "epoch": 0.6078431372549019, "grad_norm": 2.5334386825561523, "learning_rate": 9.394335511982571e-07, "loss": 4.333818435668945, "step": 279 }, { "epoch": 0.6100217864923747, "grad_norm": 2.0433833599090576, "learning_rate": 9.392156862745097e-07, "loss": 4.203022003173828, "step": 280 }, { "epoch": 0.6122004357298475, "grad_norm": 2.2614269256591797, "learning_rate": 9.389978213507624e-07, "loss": 4.318673133850098, "step": 281 }, { "epoch": 0.6143790849673203, "grad_norm": 1.9716330766677856, "learning_rate": 9.387799564270152e-07, "loss": 4.232147693634033, "step": 282 }, { "epoch": 0.616557734204793, "grad_norm": 1.837843418121338, "learning_rate": 9.385620915032679e-07, "loss": 4.245857238769531, "step": 283 }, { "epoch": 0.6187363834422658, "grad_norm": 2.3850011825561523, "learning_rate": 9.383442265795206e-07, "loss": 4.42717981338501, "step": 284 }, { "epoch": 0.6209150326797386, "grad_norm": 2.4741547107696533, "learning_rate": 9.381263616557734e-07, "loss": 4.136049747467041, "step": 285 }, { "epoch": 0.6230936819172114, "grad_norm": 1.8070532083511353, "learning_rate": 9.37908496732026e-07, "loss": 4.264199733734131, "step": 286 }, { "epoch": 0.6252723311546841, "grad_norm": 2.5413472652435303, "learning_rate": 9.376906318082788e-07, "loss": 4.469062328338623, "step": 287 }, { "epoch": 0.6274509803921569, "grad_norm": 1.9982818365097046, "learning_rate": 9.374727668845315e-07, "loss": 4.051740646362305, "step": 288 }, { "epoch": 0.6296296296296297, "grad_norm": 2.351032018661499, "learning_rate": 9.372549019607843e-07, "loss": 4.301977634429932, "step": 289 }, { "epoch": 0.6318082788671024, "grad_norm": 1.9607293605804443, "learning_rate": 9.370370370370369e-07, "loss": 4.261730194091797, "step": 290 }, { "epoch": 0.6339869281045751, "grad_norm": 1.9776970148086548, "learning_rate": 9.368191721132898e-07, "loss": 4.373515605926514, "step": 291 }, { "epoch": 0.6361655773420479, "grad_norm": 2.6956138610839844, "learning_rate": 9.366013071895424e-07, "loss": 4.401850700378418, "step": 292 }, { "epoch": 0.6383442265795207, "grad_norm": 2.6304495334625244, "learning_rate": 9.363834422657951e-07, "loss": 4.5218892097473145, "step": 293 }, { "epoch": 0.6405228758169934, "grad_norm": 2.0205626487731934, "learning_rate": 9.361655773420479e-07, "loss": 4.327292442321777, "step": 294 }, { "epoch": 0.6427015250544662, "grad_norm": 2.179348945617676, "learning_rate": 9.359477124183006e-07, "loss": 4.2162299156188965, "step": 295 }, { "epoch": 0.644880174291939, "grad_norm": 2.004915952682495, "learning_rate": 9.357298474945533e-07, "loss": 4.26617956161499, "step": 296 }, { "epoch": 0.6470588235294118, "grad_norm": 2.487010955810547, "learning_rate": 9.355119825708061e-07, "loss": 4.518709182739258, "step": 297 }, { "epoch": 0.6492374727668845, "grad_norm": 2.1379663944244385, "learning_rate": 9.352941176470588e-07, "loss": 4.301877975463867, "step": 298 }, { "epoch": 0.6514161220043573, "grad_norm": 2.1979153156280518, "learning_rate": 9.350762527233115e-07, "loss": 4.34274959564209, "step": 299 }, { "epoch": 0.6535947712418301, "grad_norm": 1.719846487045288, "learning_rate": 9.348583877995642e-07, "loss": 4.1293253898620605, "step": 300 }, { "epoch": 0.6557734204793029, "grad_norm": 1.7805875539779663, "learning_rate": 9.34640522875817e-07, "loss": 4.257225513458252, "step": 301 }, { "epoch": 0.6579520697167756, "grad_norm": 1.6562926769256592, "learning_rate": 9.344226579520696e-07, "loss": 4.225170135498047, "step": 302 }, { "epoch": 0.6601307189542484, "grad_norm": 1.9313156604766846, "learning_rate": 9.342047930283225e-07, "loss": 4.125218868255615, "step": 303 }, { "epoch": 0.6623093681917211, "grad_norm": 3.258147716522217, "learning_rate": 9.339869281045751e-07, "loss": 4.621779441833496, "step": 304 }, { "epoch": 0.664488017429194, "grad_norm": 2.9603540897369385, "learning_rate": 9.337690631808278e-07, "loss": 4.509817600250244, "step": 305 }, { "epoch": 0.6666666666666666, "grad_norm": 2.6384479999542236, "learning_rate": 9.335511982570806e-07, "loss": 4.355105876922607, "step": 306 }, { "epoch": 0.6688453159041394, "grad_norm": 2.2285590171813965, "learning_rate": 9.333333333333333e-07, "loss": 4.502842903137207, "step": 307 }, { "epoch": 0.6710239651416122, "grad_norm": 2.0646419525146484, "learning_rate": 9.33115468409586e-07, "loss": 4.261293888092041, "step": 308 }, { "epoch": 0.673202614379085, "grad_norm": 2.292961359024048, "learning_rate": 9.328976034858388e-07, "loss": 4.331558704376221, "step": 309 }, { "epoch": 0.6753812636165577, "grad_norm": 3.0116658210754395, "learning_rate": 9.326797385620914e-07, "loss": 4.40883731842041, "step": 310 }, { "epoch": 0.6775599128540305, "grad_norm": 2.169278144836426, "learning_rate": 9.324618736383442e-07, "loss": 4.30955696105957, "step": 311 }, { "epoch": 0.6797385620915033, "grad_norm": 2.6467745304107666, "learning_rate": 9.322440087145968e-07, "loss": 4.36649751663208, "step": 312 }, { "epoch": 0.681917211328976, "grad_norm": 2.6238508224487305, "learning_rate": 9.320261437908497e-07, "loss": 4.338852882385254, "step": 313 }, { "epoch": 0.6840958605664488, "grad_norm": 2.0027642250061035, "learning_rate": 9.318082788671023e-07, "loss": 4.211042881011963, "step": 314 }, { "epoch": 0.6862745098039216, "grad_norm": 1.822209358215332, "learning_rate": 9.315904139433551e-07, "loss": 4.39182186126709, "step": 315 }, { "epoch": 0.6884531590413944, "grad_norm": 2.1167984008789062, "learning_rate": 9.313725490196078e-07, "loss": 4.396923065185547, "step": 316 }, { "epoch": 0.690631808278867, "grad_norm": 1.6318403482437134, "learning_rate": 9.311546840958605e-07, "loss": 4.246094226837158, "step": 317 }, { "epoch": 0.6928104575163399, "grad_norm": 1.8685541152954102, "learning_rate": 9.309368191721132e-07, "loss": 4.165795803070068, "step": 318 }, { "epoch": 0.6949891067538126, "grad_norm": 1.9460184574127197, "learning_rate": 9.30718954248366e-07, "loss": 4.289465427398682, "step": 319 }, { "epoch": 0.6971677559912854, "grad_norm": 2.909140110015869, "learning_rate": 9.305010893246187e-07, "loss": 4.520230293273926, "step": 320 }, { "epoch": 0.6993464052287581, "grad_norm": 1.8325444459915161, "learning_rate": 9.302832244008714e-07, "loss": 4.202396869659424, "step": 321 }, { "epoch": 0.7015250544662309, "grad_norm": 3.0213937759399414, "learning_rate": 9.300653594771241e-07, "loss": 4.599020481109619, "step": 322 }, { "epoch": 0.7037037037037037, "grad_norm": 2.5810821056365967, "learning_rate": 9.298474945533769e-07, "loss": 4.495166778564453, "step": 323 }, { "epoch": 0.7058823529411765, "grad_norm": 1.9636465311050415, "learning_rate": 9.296296296296295e-07, "loss": 4.338723659515381, "step": 324 }, { "epoch": 0.7080610021786492, "grad_norm": 2.700573682785034, "learning_rate": 9.294117647058824e-07, "loss": 4.1468682289123535, "step": 325 }, { "epoch": 0.710239651416122, "grad_norm": 2.126833438873291, "learning_rate": 9.29193899782135e-07, "loss": 4.13951301574707, "step": 326 }, { "epoch": 0.7124183006535948, "grad_norm": 2.4611458778381348, "learning_rate": 9.289760348583878e-07, "loss": 4.541684150695801, "step": 327 }, { "epoch": 0.7145969498910676, "grad_norm": 1.6345309019088745, "learning_rate": 9.287581699346405e-07, "loss": 4.199406147003174, "step": 328 }, { "epoch": 0.7167755991285403, "grad_norm": 2.4470903873443604, "learning_rate": 9.285403050108932e-07, "loss": 4.442519664764404, "step": 329 }, { "epoch": 0.7189542483660131, "grad_norm": 2.8630640506744385, "learning_rate": 9.283224400871459e-07, "loss": 4.423019886016846, "step": 330 }, { "epoch": 0.7211328976034859, "grad_norm": 2.1433722972869873, "learning_rate": 9.281045751633987e-07, "loss": 4.492217540740967, "step": 331 }, { "epoch": 0.7233115468409586, "grad_norm": 2.396273136138916, "learning_rate": 9.278867102396514e-07, "loss": 4.223300457000732, "step": 332 }, { "epoch": 0.7254901960784313, "grad_norm": 2.381741523742676, "learning_rate": 9.276688453159041e-07, "loss": 4.33833122253418, "step": 333 }, { "epoch": 0.7276688453159041, "grad_norm": 1.8426613807678223, "learning_rate": 9.274509803921568e-07, "loss": 4.187051296234131, "step": 334 }, { "epoch": 0.7298474945533769, "grad_norm": 2.631394624710083, "learning_rate": 9.272331154684096e-07, "loss": 4.438161373138428, "step": 335 }, { "epoch": 0.7320261437908496, "grad_norm": 2.438077926635742, "learning_rate": 9.270152505446622e-07, "loss": 4.312593460083008, "step": 336 }, { "epoch": 0.7342047930283224, "grad_norm": 3.4880318641662598, "learning_rate": 9.267973856209151e-07, "loss": 4.550225257873535, "step": 337 }, { "epoch": 0.7363834422657952, "grad_norm": 1.809789776802063, "learning_rate": 9.265795206971677e-07, "loss": 4.273606300354004, "step": 338 }, { "epoch": 0.738562091503268, "grad_norm": 1.7368791103363037, "learning_rate": 9.263616557734205e-07, "loss": 4.258157253265381, "step": 339 }, { "epoch": 0.7407407407407407, "grad_norm": 3.2504823207855225, "learning_rate": 9.261437908496732e-07, "loss": 4.630444526672363, "step": 340 }, { "epoch": 0.7429193899782135, "grad_norm": 2.2657594680786133, "learning_rate": 9.259259259259259e-07, "loss": 4.2488203048706055, "step": 341 }, { "epoch": 0.7450980392156863, "grad_norm": 1.7225124835968018, "learning_rate": 9.257080610021786e-07, "loss": 4.161405563354492, "step": 342 }, { "epoch": 0.7472766884531591, "grad_norm": 2.4991836547851562, "learning_rate": 9.254901960784314e-07, "loss": 4.285762310028076, "step": 343 }, { "epoch": 0.7494553376906318, "grad_norm": 2.18890643119812, "learning_rate": 9.252723311546841e-07, "loss": 4.175844192504883, "step": 344 }, { "epoch": 0.7516339869281046, "grad_norm": 2.609976053237915, "learning_rate": 9.250544662309368e-07, "loss": 4.452181816101074, "step": 345 }, { "epoch": 0.7538126361655774, "grad_norm": 3.713862419128418, "learning_rate": 9.248366013071895e-07, "loss": 4.6298112869262695, "step": 346 }, { "epoch": 0.7559912854030502, "grad_norm": 2.2569708824157715, "learning_rate": 9.246187363834423e-07, "loss": 4.350952625274658, "step": 347 }, { "epoch": 0.7581699346405228, "grad_norm": 1.7120873928070068, "learning_rate": 9.244008714596949e-07, "loss": 4.095013618469238, "step": 348 }, { "epoch": 0.7603485838779956, "grad_norm": 1.9212177991867065, "learning_rate": 9.241830065359478e-07, "loss": 4.035152435302734, "step": 349 }, { "epoch": 0.7625272331154684, "grad_norm": 2.2976410388946533, "learning_rate": 9.239651416122004e-07, "loss": 4.412503719329834, "step": 350 }, { "epoch": 0.7647058823529411, "grad_norm": 1.9677550792694092, "learning_rate": 9.23747276688453e-07, "loss": 4.223837375640869, "step": 351 }, { "epoch": 0.7668845315904139, "grad_norm": 2.0862555503845215, "learning_rate": 9.235294117647059e-07, "loss": 4.148982524871826, "step": 352 }, { "epoch": 0.7690631808278867, "grad_norm": 2.399984836578369, "learning_rate": 9.233115468409585e-07, "loss": 4.187849998474121, "step": 353 }, { "epoch": 0.7712418300653595, "grad_norm": 2.9156839847564697, "learning_rate": 9.230936819172113e-07, "loss": 4.528846263885498, "step": 354 }, { "epoch": 0.7734204793028322, "grad_norm": 1.7015364170074463, "learning_rate": 9.22875816993464e-07, "loss": 4.217906951904297, "step": 355 }, { "epoch": 0.775599128540305, "grad_norm": 1.6197303533554077, "learning_rate": 9.226579520697168e-07, "loss": 4.077565670013428, "step": 356 }, { "epoch": 0.7777777777777778, "grad_norm": 1.6793276071548462, "learning_rate": 9.224400871459694e-07, "loss": 4.178196430206299, "step": 357 }, { "epoch": 0.7799564270152506, "grad_norm": 2.427659511566162, "learning_rate": 9.222222222222222e-07, "loss": 4.2796711921691895, "step": 358 }, { "epoch": 0.7821350762527233, "grad_norm": 2.461893081665039, "learning_rate": 9.220043572984749e-07, "loss": 4.355025291442871, "step": 359 }, { "epoch": 0.7843137254901961, "grad_norm": 2.247121810913086, "learning_rate": 9.217864923747276e-07, "loss": 4.3149285316467285, "step": 360 }, { "epoch": 0.7864923747276689, "grad_norm": 1.7950170040130615, "learning_rate": 9.215686274509803e-07, "loss": 4.239467144012451, "step": 361 }, { "epoch": 0.7886710239651417, "grad_norm": 1.9137388467788696, "learning_rate": 9.213507625272331e-07, "loss": 4.354663848876953, "step": 362 }, { "epoch": 0.7908496732026143, "grad_norm": 2.1859757900238037, "learning_rate": 9.211328976034857e-07, "loss": 4.221486568450928, "step": 363 }, { "epoch": 0.7930283224400871, "grad_norm": 2.097460985183716, "learning_rate": 9.209150326797385e-07, "loss": 4.222695827484131, "step": 364 }, { "epoch": 0.7952069716775599, "grad_norm": 2.1484479904174805, "learning_rate": 9.206971677559912e-07, "loss": 4.170539379119873, "step": 365 }, { "epoch": 0.7973856209150327, "grad_norm": 1.9138914346694946, "learning_rate": 9.20479302832244e-07, "loss": 4.183845043182373, "step": 366 }, { "epoch": 0.7995642701525054, "grad_norm": 2.649709939956665, "learning_rate": 9.202614379084966e-07, "loss": 4.5764617919921875, "step": 367 }, { "epoch": 0.8017429193899782, "grad_norm": 2.4475293159484863, "learning_rate": 9.200435729847495e-07, "loss": 4.32996129989624, "step": 368 }, { "epoch": 0.803921568627451, "grad_norm": 2.343046188354492, "learning_rate": 9.198257080610021e-07, "loss": 4.385317325592041, "step": 369 }, { "epoch": 0.8061002178649237, "grad_norm": 1.8386296033859253, "learning_rate": 9.196078431372548e-07, "loss": 4.227548599243164, "step": 370 }, { "epoch": 0.8082788671023965, "grad_norm": 1.614132046699524, "learning_rate": 9.193899782135076e-07, "loss": 4.114497184753418, "step": 371 }, { "epoch": 0.8104575163398693, "grad_norm": 1.7535715103149414, "learning_rate": 9.191721132897603e-07, "loss": 4.214768409729004, "step": 372 }, { "epoch": 0.8126361655773421, "grad_norm": 2.0214993953704834, "learning_rate": 9.18954248366013e-07, "loss": 4.2975263595581055, "step": 373 }, { "epoch": 0.8148148148148148, "grad_norm": 2.0663769245147705, "learning_rate": 9.187363834422658e-07, "loss": 4.2605438232421875, "step": 374 }, { "epoch": 0.8169934640522876, "grad_norm": 1.6967873573303223, "learning_rate": 9.185185185185184e-07, "loss": 4.128928184509277, "step": 375 }, { "epoch": 0.8191721132897604, "grad_norm": 1.8438379764556885, "learning_rate": 9.183006535947712e-07, "loss": 4.261013984680176, "step": 376 }, { "epoch": 0.8213507625272332, "grad_norm": 2.049132823944092, "learning_rate": 9.180827886710239e-07, "loss": 4.306873798370361, "step": 377 }, { "epoch": 0.8235294117647058, "grad_norm": 1.8627986907958984, "learning_rate": 9.178649237472767e-07, "loss": 4.302986145019531, "step": 378 }, { "epoch": 0.8257080610021786, "grad_norm": 1.6413536071777344, "learning_rate": 9.176470588235293e-07, "loss": 4.1943793296813965, "step": 379 }, { "epoch": 0.8278867102396514, "grad_norm": 1.7843068838119507, "learning_rate": 9.174291938997822e-07, "loss": 4.1645684242248535, "step": 380 }, { "epoch": 0.8300653594771242, "grad_norm": 2.0969436168670654, "learning_rate": 9.172113289760348e-07, "loss": 4.049801826477051, "step": 381 }, { "epoch": 0.8322440087145969, "grad_norm": 2.387089967727661, "learning_rate": 9.169934640522875e-07, "loss": 4.419530391693115, "step": 382 }, { "epoch": 0.8344226579520697, "grad_norm": 1.9397732019424438, "learning_rate": 9.167755991285403e-07, "loss": 4.22908353805542, "step": 383 }, { "epoch": 0.8366013071895425, "grad_norm": 1.7454991340637207, "learning_rate": 9.16557734204793e-07, "loss": 4.344266891479492, "step": 384 }, { "epoch": 0.8387799564270153, "grad_norm": 1.9364548921585083, "learning_rate": 9.163398692810457e-07, "loss": 4.270198345184326, "step": 385 }, { "epoch": 0.840958605664488, "grad_norm": 2.5902137756347656, "learning_rate": 9.161220043572985e-07, "loss": 4.300908088684082, "step": 386 }, { "epoch": 0.8431372549019608, "grad_norm": 3.546748399734497, "learning_rate": 9.159041394335511e-07, "loss": 4.561097145080566, "step": 387 }, { "epoch": 0.8453159041394336, "grad_norm": 1.779766321182251, "learning_rate": 9.156862745098039e-07, "loss": 4.041597843170166, "step": 388 }, { "epoch": 0.8474945533769063, "grad_norm": 1.8111950159072876, "learning_rate": 9.154684095860566e-07, "loss": 4.299168586730957, "step": 389 }, { "epoch": 0.8496732026143791, "grad_norm": 2.395869016647339, "learning_rate": 9.152505446623094e-07, "loss": 4.353262424468994, "step": 390 }, { "epoch": 0.8518518518518519, "grad_norm": 2.060559034347534, "learning_rate": 9.15032679738562e-07, "loss": 4.2379302978515625, "step": 391 }, { "epoch": 0.8540305010893247, "grad_norm": 2.1842563152313232, "learning_rate": 9.148148148148148e-07, "loss": 4.31560754776001, "step": 392 }, { "epoch": 0.8562091503267973, "grad_norm": 1.7702100276947021, "learning_rate": 9.145969498910675e-07, "loss": 4.2212018966674805, "step": 393 }, { "epoch": 0.8583877995642701, "grad_norm": 3.1450130939483643, "learning_rate": 9.143790849673202e-07, "loss": 4.469844341278076, "step": 394 }, { "epoch": 0.8605664488017429, "grad_norm": 2.660785436630249, "learning_rate": 9.14161220043573e-07, "loss": 4.3779296875, "step": 395 }, { "epoch": 0.8627450980392157, "grad_norm": 1.6954896450042725, "learning_rate": 9.139433551198257e-07, "loss": 4.254485607147217, "step": 396 }, { "epoch": 0.8649237472766884, "grad_norm": 2.3023900985717773, "learning_rate": 9.137254901960783e-07, "loss": 4.345731258392334, "step": 397 }, { "epoch": 0.8671023965141612, "grad_norm": 2.0327610969543457, "learning_rate": 9.135076252723312e-07, "loss": 4.395279407501221, "step": 398 }, { "epoch": 0.869281045751634, "grad_norm": 2.6276464462280273, "learning_rate": 9.132897603485838e-07, "loss": 4.379213333129883, "step": 399 }, { "epoch": 0.8714596949891068, "grad_norm": 2.8123104572296143, "learning_rate": 9.130718954248366e-07, "loss": 4.4300079345703125, "step": 400 }, { "epoch": 0.8736383442265795, "grad_norm": 1.9245363473892212, "learning_rate": 9.128540305010893e-07, "loss": 4.219707012176514, "step": 401 }, { "epoch": 0.8758169934640523, "grad_norm": 1.8081426620483398, "learning_rate": 9.126361655773421e-07, "loss": 4.338159561157227, "step": 402 }, { "epoch": 0.8779956427015251, "grad_norm": 2.749330520629883, "learning_rate": 9.124183006535947e-07, "loss": 4.506350517272949, "step": 403 }, { "epoch": 0.8801742919389978, "grad_norm": 1.4844481945037842, "learning_rate": 9.122004357298475e-07, "loss": 4.188736438751221, "step": 404 }, { "epoch": 0.8823529411764706, "grad_norm": 2.186659336090088, "learning_rate": 9.119825708061002e-07, "loss": 4.305603981018066, "step": 405 }, { "epoch": 0.8845315904139434, "grad_norm": 2.066389322280884, "learning_rate": 9.117647058823529e-07, "loss": 4.277411937713623, "step": 406 }, { "epoch": 0.8867102396514162, "grad_norm": 1.9937480688095093, "learning_rate": 9.115468409586057e-07, "loss": 4.248752593994141, "step": 407 }, { "epoch": 0.8888888888888888, "grad_norm": 2.5236587524414062, "learning_rate": 9.113289760348584e-07, "loss": 4.383094310760498, "step": 408 }, { "epoch": 0.8910675381263616, "grad_norm": 2.0782392024993896, "learning_rate": 9.11111111111111e-07, "loss": 4.37091588973999, "step": 409 }, { "epoch": 0.8932461873638344, "grad_norm": 2.1633827686309814, "learning_rate": 9.108932461873638e-07, "loss": 4.201651096343994, "step": 410 }, { "epoch": 0.8954248366013072, "grad_norm": 2.063110589981079, "learning_rate": 9.106753812636165e-07, "loss": 4.328912734985352, "step": 411 }, { "epoch": 0.8976034858387799, "grad_norm": 2.8345420360565186, "learning_rate": 9.104575163398693e-07, "loss": 4.280642032623291, "step": 412 }, { "epoch": 0.8997821350762527, "grad_norm": 2.138573169708252, "learning_rate": 9.102396514161219e-07, "loss": 4.277536392211914, "step": 413 }, { "epoch": 0.9019607843137255, "grad_norm": 2.5910751819610596, "learning_rate": 9.100217864923748e-07, "loss": 4.363705158233643, "step": 414 }, { "epoch": 0.9041394335511983, "grad_norm": 2.3000571727752686, "learning_rate": 9.098039215686274e-07, "loss": 4.293865203857422, "step": 415 }, { "epoch": 0.906318082788671, "grad_norm": 1.9352879524230957, "learning_rate": 9.0958605664488e-07, "loss": 4.306790351867676, "step": 416 }, { "epoch": 0.9084967320261438, "grad_norm": 2.054032564163208, "learning_rate": 9.093681917211329e-07, "loss": 4.107227325439453, "step": 417 }, { "epoch": 0.9106753812636166, "grad_norm": 2.259209632873535, "learning_rate": 9.091503267973855e-07, "loss": 4.391446113586426, "step": 418 }, { "epoch": 0.9128540305010894, "grad_norm": 2.0108606815338135, "learning_rate": 9.089324618736383e-07, "loss": 4.246882915496826, "step": 419 }, { "epoch": 0.9150326797385621, "grad_norm": 2.10844349861145, "learning_rate": 9.08714596949891e-07, "loss": 4.255201816558838, "step": 420 }, { "epoch": 0.9172113289760349, "grad_norm": 2.282139539718628, "learning_rate": 9.084967320261437e-07, "loss": 4.2623724937438965, "step": 421 }, { "epoch": 0.9193899782135077, "grad_norm": 3.288189172744751, "learning_rate": 9.082788671023964e-07, "loss": 4.55091667175293, "step": 422 }, { "epoch": 0.9215686274509803, "grad_norm": 2.574491500854492, "learning_rate": 9.080610021786492e-07, "loss": 4.347263813018799, "step": 423 }, { "epoch": 0.9237472766884531, "grad_norm": 2.54724383354187, "learning_rate": 9.078431372549019e-07, "loss": 4.444369316101074, "step": 424 }, { "epoch": 0.9259259259259259, "grad_norm": 2.2615246772766113, "learning_rate": 9.076252723311546e-07, "loss": 4.327207565307617, "step": 425 }, { "epoch": 0.9281045751633987, "grad_norm": 2.0451500415802, "learning_rate": 9.074074074074074e-07, "loss": 4.299535751342773, "step": 426 }, { "epoch": 0.9302832244008714, "grad_norm": 1.9952106475830078, "learning_rate": 9.071895424836601e-07, "loss": 4.185388565063477, "step": 427 }, { "epoch": 0.9324618736383442, "grad_norm": 1.6416575908660889, "learning_rate": 9.069716775599127e-07, "loss": 4.195568084716797, "step": 428 }, { "epoch": 0.934640522875817, "grad_norm": 2.031949281692505, "learning_rate": 9.067538126361656e-07, "loss": 4.262988567352295, "step": 429 }, { "epoch": 0.9368191721132898, "grad_norm": 2.343195676803589, "learning_rate": 9.065359477124182e-07, "loss": 4.361613750457764, "step": 430 }, { "epoch": 0.9389978213507625, "grad_norm": 2.6240570545196533, "learning_rate": 9.06318082788671e-07, "loss": 4.398077964782715, "step": 431 }, { "epoch": 0.9411764705882353, "grad_norm": 2.5013344287872314, "learning_rate": 9.061002178649237e-07, "loss": 4.313061237335205, "step": 432 }, { "epoch": 0.9433551198257081, "grad_norm": 1.6849699020385742, "learning_rate": 9.058823529411764e-07, "loss": 3.9826457500457764, "step": 433 }, { "epoch": 0.9455337690631809, "grad_norm": 2.4513988494873047, "learning_rate": 9.056644880174291e-07, "loss": 4.215115547180176, "step": 434 }, { "epoch": 0.9477124183006536, "grad_norm": 1.684166669845581, "learning_rate": 9.054466230936819e-07, "loss": 4.183655261993408, "step": 435 }, { "epoch": 0.9498910675381264, "grad_norm": 1.5734747648239136, "learning_rate": 9.052287581699346e-07, "loss": 4.135869979858398, "step": 436 }, { "epoch": 0.9520697167755992, "grad_norm": 3.9758388996124268, "learning_rate": 9.050108932461873e-07, "loss": 4.34391450881958, "step": 437 }, { "epoch": 0.954248366013072, "grad_norm": 1.9097188711166382, "learning_rate": 9.0479302832244e-07, "loss": 4.253366947174072, "step": 438 }, { "epoch": 0.9564270152505446, "grad_norm": 1.6577128171920776, "learning_rate": 9.045751633986928e-07, "loss": 4.1301469802856445, "step": 439 }, { "epoch": 0.9586056644880174, "grad_norm": 2.240204334259033, "learning_rate": 9.043572984749454e-07, "loss": 4.202823162078857, "step": 440 }, { "epoch": 0.9607843137254902, "grad_norm": 2.3804500102996826, "learning_rate": 9.041394335511983e-07, "loss": 4.424624919891357, "step": 441 }, { "epoch": 0.9629629629629629, "grad_norm": 2.0912375450134277, "learning_rate": 9.039215686274509e-07, "loss": 4.134670257568359, "step": 442 }, { "epoch": 0.9651416122004357, "grad_norm": 1.5136829614639282, "learning_rate": 9.037037037037037e-07, "loss": 4.028146743774414, "step": 443 }, { "epoch": 0.9673202614379085, "grad_norm": 1.9006588459014893, "learning_rate": 9.034858387799564e-07, "loss": 4.248746871948242, "step": 444 }, { "epoch": 0.9694989106753813, "grad_norm": 1.9264826774597168, "learning_rate": 9.032679738562091e-07, "loss": 4.300281047821045, "step": 445 }, { "epoch": 0.971677559912854, "grad_norm": 2.325807571411133, "learning_rate": 9.030501089324618e-07, "loss": 4.411880016326904, "step": 446 }, { "epoch": 0.9738562091503268, "grad_norm": 2.2446606159210205, "learning_rate": 9.028322440087146e-07, "loss": 4.300230979919434, "step": 447 }, { "epoch": 0.9760348583877996, "grad_norm": 1.8679618835449219, "learning_rate": 9.026143790849673e-07, "loss": 4.216586589813232, "step": 448 }, { "epoch": 0.9782135076252724, "grad_norm": 2.71272349357605, "learning_rate": 9.0239651416122e-07, "loss": 4.361291885375977, "step": 449 }, { "epoch": 0.9803921568627451, "grad_norm": 1.8621163368225098, "learning_rate": 9.021786492374727e-07, "loss": 4.023743629455566, "step": 450 }, { "epoch": 0.9825708061002179, "grad_norm": 2.2548575401306152, "learning_rate": 9.019607843137255e-07, "loss": 4.341646671295166, "step": 451 }, { "epoch": 0.9847494553376906, "grad_norm": 1.8140944242477417, "learning_rate": 9.017429193899781e-07, "loss": 4.2807183265686035, "step": 452 }, { "epoch": 0.9869281045751634, "grad_norm": 2.296813726425171, "learning_rate": 9.01525054466231e-07, "loss": 4.189460277557373, "step": 453 }, { "epoch": 0.9891067538126361, "grad_norm": 1.9771134853363037, "learning_rate": 9.013071895424836e-07, "loss": 4.273813247680664, "step": 454 }, { "epoch": 0.9912854030501089, "grad_norm": 2.457392454147339, "learning_rate": 9.010893246187364e-07, "loss": 4.328354835510254, "step": 455 }, { "epoch": 0.9934640522875817, "grad_norm": 2.8040881156921387, "learning_rate": 9.00871459694989e-07, "loss": 4.24334716796875, "step": 456 }, { "epoch": 0.9956427015250545, "grad_norm": 2.514680862426758, "learning_rate": 9.006535947712418e-07, "loss": 4.360405921936035, "step": 457 }, { "epoch": 0.9978213507625272, "grad_norm": 2.3096747398376465, "learning_rate": 9.004357298474945e-07, "loss": 4.383677005767822, "step": 458 }, { "epoch": 1.0, "grad_norm": 1.7039768695831299, "learning_rate": 9.002178649237472e-07, "loss": 4.212360858917236, "step": 459 }, { "epoch": 1.0021786492374727, "grad_norm": 1.8019158840179443, "learning_rate": 9e-07, "loss": 4.091123104095459, "step": 460 }, { "epoch": 1.0043572984749456, "grad_norm": 1.8267475366592407, "learning_rate": 8.997821350762527e-07, "loss": 4.119062423706055, "step": 461 }, { "epoch": 1.0065359477124183, "grad_norm": 1.6241511106491089, "learning_rate": 8.995642701525053e-07, "loss": 3.9380340576171875, "step": 462 }, { "epoch": 1.008714596949891, "grad_norm": 1.6269277334213257, "learning_rate": 8.993464052287582e-07, "loss": 4.272838115692139, "step": 463 }, { "epoch": 1.0108932461873639, "grad_norm": 2.0016684532165527, "learning_rate": 8.991285403050108e-07, "loss": 4.311455249786377, "step": 464 }, { "epoch": 1.0130718954248366, "grad_norm": 1.878769874572754, "learning_rate": 8.989106753812636e-07, "loss": 4.199127197265625, "step": 465 }, { "epoch": 1.0152505446623095, "grad_norm": 2.174274206161499, "learning_rate": 8.986928104575163e-07, "loss": 4.203718662261963, "step": 466 }, { "epoch": 1.0174291938997821, "grad_norm": 3.320707321166992, "learning_rate": 8.98474945533769e-07, "loss": 4.5451202392578125, "step": 467 }, { "epoch": 1.0196078431372548, "grad_norm": 1.579047679901123, "learning_rate": 8.982570806100217e-07, "loss": 4.139333724975586, "step": 468 }, { "epoch": 1.0217864923747277, "grad_norm": 1.5734368562698364, "learning_rate": 8.980392156862745e-07, "loss": 4.19420051574707, "step": 469 }, { "epoch": 1.0239651416122004, "grad_norm": 1.9081121683120728, "learning_rate": 8.978213507625272e-07, "loss": 4.194459915161133, "step": 470 }, { "epoch": 1.026143790849673, "grad_norm": 1.9206856489181519, "learning_rate": 8.976034858387799e-07, "loss": 4.1511945724487305, "step": 471 }, { "epoch": 1.028322440087146, "grad_norm": 2.417661428451538, "learning_rate": 8.973856209150327e-07, "loss": 4.293420314788818, "step": 472 }, { "epoch": 1.0305010893246187, "grad_norm": 1.9362252950668335, "learning_rate": 8.971677559912854e-07, "loss": 4.23864221572876, "step": 473 }, { "epoch": 1.0326797385620916, "grad_norm": 2.0182526111602783, "learning_rate": 8.96949891067538e-07, "loss": 4.281663417816162, "step": 474 }, { "epoch": 1.0348583877995643, "grad_norm": 2.02213191986084, "learning_rate": 8.967320261437909e-07, "loss": 4.237555027008057, "step": 475 }, { "epoch": 1.037037037037037, "grad_norm": 1.7088505029678345, "learning_rate": 8.965141612200435e-07, "loss": 4.105838298797607, "step": 476 }, { "epoch": 1.0392156862745099, "grad_norm": 2.564328193664551, "learning_rate": 8.962962962962963e-07, "loss": 4.326336860656738, "step": 477 }, { "epoch": 1.0413943355119826, "grad_norm": 1.9669668674468994, "learning_rate": 8.96078431372549e-07, "loss": 4.221822738647461, "step": 478 }, { "epoch": 1.0435729847494553, "grad_norm": 2.3631763458251953, "learning_rate": 8.958605664488017e-07, "loss": 4.326210021972656, "step": 479 }, { "epoch": 1.0457516339869282, "grad_norm": 2.196126937866211, "learning_rate": 8.956427015250544e-07, "loss": 4.256839275360107, "step": 480 }, { "epoch": 1.0479302832244008, "grad_norm": 1.6833542585372925, "learning_rate": 8.954248366013072e-07, "loss": 4.236635684967041, "step": 481 }, { "epoch": 1.0501089324618735, "grad_norm": 2.1643688678741455, "learning_rate": 8.952069716775599e-07, "loss": 4.271406173706055, "step": 482 }, { "epoch": 1.0522875816993464, "grad_norm": 1.8688637018203735, "learning_rate": 8.949891067538126e-07, "loss": 4.174862861633301, "step": 483 }, { "epoch": 1.0544662309368191, "grad_norm": 1.6554330587387085, "learning_rate": 8.947712418300654e-07, "loss": 3.991610288619995, "step": 484 }, { "epoch": 1.056644880174292, "grad_norm": 2.1946685314178467, "learning_rate": 8.945533769063181e-07, "loss": 4.320149898529053, "step": 485 }, { "epoch": 1.0588235294117647, "grad_norm": 1.9151839017868042, "learning_rate": 8.943355119825707e-07, "loss": 4.221381187438965, "step": 486 }, { "epoch": 1.0610021786492374, "grad_norm": 2.062462091445923, "learning_rate": 8.941176470588236e-07, "loss": 4.060985565185547, "step": 487 }, { "epoch": 1.0631808278867103, "grad_norm": 3.206397533416748, "learning_rate": 8.938997821350762e-07, "loss": 4.444162845611572, "step": 488 }, { "epoch": 1.065359477124183, "grad_norm": 1.980419397354126, "learning_rate": 8.936819172113289e-07, "loss": 4.0883307456970215, "step": 489 }, { "epoch": 1.0675381263616557, "grad_norm": 2.5282535552978516, "learning_rate": 8.934640522875817e-07, "loss": 4.286747455596924, "step": 490 }, { "epoch": 1.0697167755991286, "grad_norm": 2.1868057250976562, "learning_rate": 8.932461873638343e-07, "loss": 4.197092056274414, "step": 491 }, { "epoch": 1.0718954248366013, "grad_norm": 1.874988079071045, "learning_rate": 8.930283224400871e-07, "loss": 4.091634750366211, "step": 492 }, { "epoch": 1.074074074074074, "grad_norm": 2.005953073501587, "learning_rate": 8.928104575163398e-07, "loss": 4.168447017669678, "step": 493 }, { "epoch": 1.0762527233115469, "grad_norm": 2.5611610412597656, "learning_rate": 8.925925925925926e-07, "loss": 4.390806674957275, "step": 494 }, { "epoch": 1.0784313725490196, "grad_norm": 2.8791651725769043, "learning_rate": 8.923747276688452e-07, "loss": 4.313453197479248, "step": 495 }, { "epoch": 1.0806100217864925, "grad_norm": 1.9287148714065552, "learning_rate": 8.92156862745098e-07, "loss": 4.122086048126221, "step": 496 }, { "epoch": 1.0827886710239651, "grad_norm": 2.4522085189819336, "learning_rate": 8.919389978213507e-07, "loss": 4.393043041229248, "step": 497 }, { "epoch": 1.0849673202614378, "grad_norm": 2.7219996452331543, "learning_rate": 8.917211328976034e-07, "loss": 3.987981081008911, "step": 498 }, { "epoch": 1.0871459694989107, "grad_norm": 2.597627639770508, "learning_rate": 8.915032679738562e-07, "loss": 4.416964530944824, "step": 499 }, { "epoch": 1.0893246187363834, "grad_norm": 2.4803967475891113, "learning_rate": 8.912854030501089e-07, "loss": 4.2766923904418945, "step": 500 }, { "epoch": 1.091503267973856, "grad_norm": 1.8213063478469849, "learning_rate": 8.910675381263616e-07, "loss": 4.1734137535095215, "step": 501 }, { "epoch": 1.093681917211329, "grad_norm": 3.17716383934021, "learning_rate": 8.908496732026144e-07, "loss": 4.528834342956543, "step": 502 }, { "epoch": 1.0958605664488017, "grad_norm": 1.9751923084259033, "learning_rate": 8.90631808278867e-07, "loss": 4.230244159698486, "step": 503 }, { "epoch": 1.0980392156862746, "grad_norm": 1.6025060415267944, "learning_rate": 8.904139433551198e-07, "loss": 4.070448398590088, "step": 504 }, { "epoch": 1.1002178649237473, "grad_norm": 1.8477643728256226, "learning_rate": 8.901960784313724e-07, "loss": 4.175595283508301, "step": 505 }, { "epoch": 1.10239651416122, "grad_norm": 2.1726014614105225, "learning_rate": 8.899782135076253e-07, "loss": 4.131043910980225, "step": 506 }, { "epoch": 1.1045751633986929, "grad_norm": 1.9106134176254272, "learning_rate": 8.897603485838779e-07, "loss": 4.129506587982178, "step": 507 }, { "epoch": 1.1067538126361656, "grad_norm": 1.8755360841751099, "learning_rate": 8.895424836601306e-07, "loss": 4.244847774505615, "step": 508 }, { "epoch": 1.1089324618736383, "grad_norm": 1.8631155490875244, "learning_rate": 8.893246187363834e-07, "loss": 4.1684346199035645, "step": 509 }, { "epoch": 1.1111111111111112, "grad_norm": 1.7448307275772095, "learning_rate": 8.891067538126361e-07, "loss": 4.011495113372803, "step": 510 }, { "epoch": 1.1132897603485838, "grad_norm": 2.4126994609832764, "learning_rate": 8.888888888888888e-07, "loss": 4.409214973449707, "step": 511 }, { "epoch": 1.1154684095860565, "grad_norm": 1.5890309810638428, "learning_rate": 8.886710239651416e-07, "loss": 4.0809645652771, "step": 512 }, { "epoch": 1.1176470588235294, "grad_norm": 1.8470577001571655, "learning_rate": 8.884531590413943e-07, "loss": 4.115182399749756, "step": 513 }, { "epoch": 1.1198257080610021, "grad_norm": 2.04971981048584, "learning_rate": 8.88235294117647e-07, "loss": 4.100502014160156, "step": 514 }, { "epoch": 1.122004357298475, "grad_norm": 2.7368593215942383, "learning_rate": 8.880174291938997e-07, "loss": 4.213084697723389, "step": 515 }, { "epoch": 1.1241830065359477, "grad_norm": 2.732602119445801, "learning_rate": 8.877995642701525e-07, "loss": 4.485930919647217, "step": 516 }, { "epoch": 1.1263616557734204, "grad_norm": 2.471958637237549, "learning_rate": 8.875816993464051e-07, "loss": 4.229752063751221, "step": 517 }, { "epoch": 1.1285403050108933, "grad_norm": 2.313220739364624, "learning_rate": 8.87363834422658e-07, "loss": 4.1530280113220215, "step": 518 }, { "epoch": 1.130718954248366, "grad_norm": 2.004079818725586, "learning_rate": 8.871459694989106e-07, "loss": 4.203451156616211, "step": 519 }, { "epoch": 1.132897603485839, "grad_norm": 1.9289637804031372, "learning_rate": 8.869281045751633e-07, "loss": 4.1828131675720215, "step": 520 }, { "epoch": 1.1350762527233116, "grad_norm": 1.90094792842865, "learning_rate": 8.867102396514161e-07, "loss": 4.277235984802246, "step": 521 }, { "epoch": 1.1372549019607843, "grad_norm": 1.9971296787261963, "learning_rate": 8.864923747276688e-07, "loss": 4.030582904815674, "step": 522 }, { "epoch": 1.1394335511982572, "grad_norm": 1.776956558227539, "learning_rate": 8.862745098039215e-07, "loss": 3.9445831775665283, "step": 523 }, { "epoch": 1.1416122004357299, "grad_norm": 2.343385696411133, "learning_rate": 8.860566448801743e-07, "loss": 4.167720317840576, "step": 524 }, { "epoch": 1.1437908496732025, "grad_norm": 1.4792426824569702, "learning_rate": 8.85838779956427e-07, "loss": 4.088743686676025, "step": 525 }, { "epoch": 1.1459694989106755, "grad_norm": 2.0691165924072266, "learning_rate": 8.856209150326797e-07, "loss": 4.190865993499756, "step": 526 }, { "epoch": 1.1481481481481481, "grad_norm": 2.091317892074585, "learning_rate": 8.854030501089324e-07, "loss": 4.185585021972656, "step": 527 }, { "epoch": 1.1503267973856208, "grad_norm": 1.547757625579834, "learning_rate": 8.851851851851852e-07, "loss": 4.005777359008789, "step": 528 }, { "epoch": 1.1525054466230937, "grad_norm": 2.2769548892974854, "learning_rate": 8.849673202614378e-07, "loss": 4.181492805480957, "step": 529 }, { "epoch": 1.1546840958605664, "grad_norm": 1.7844574451446533, "learning_rate": 8.847494553376907e-07, "loss": 4.11847448348999, "step": 530 }, { "epoch": 1.156862745098039, "grad_norm": 1.9298754930496216, "learning_rate": 8.845315904139433e-07, "loss": 4.18254280090332, "step": 531 }, { "epoch": 1.159041394335512, "grad_norm": 1.971434473991394, "learning_rate": 8.84313725490196e-07, "loss": 4.230887413024902, "step": 532 }, { "epoch": 1.1612200435729847, "grad_norm": 2.5972037315368652, "learning_rate": 8.840958605664488e-07, "loss": 4.173167705535889, "step": 533 }, { "epoch": 1.1633986928104576, "grad_norm": 2.239980936050415, "learning_rate": 8.838779956427015e-07, "loss": 4.283732891082764, "step": 534 }, { "epoch": 1.1655773420479303, "grad_norm": 2.714787244796753, "learning_rate": 8.836601307189542e-07, "loss": 4.40385627746582, "step": 535 }, { "epoch": 1.167755991285403, "grad_norm": 2.0600857734680176, "learning_rate": 8.83442265795207e-07, "loss": 4.272720813751221, "step": 536 }, { "epoch": 1.1699346405228759, "grad_norm": 2.1626570224761963, "learning_rate": 8.832244008714596e-07, "loss": 4.19901180267334, "step": 537 }, { "epoch": 1.1721132897603486, "grad_norm": 1.9571884870529175, "learning_rate": 8.830065359477124e-07, "loss": 4.026454448699951, "step": 538 }, { "epoch": 1.1742919389978215, "grad_norm": 2.067525863647461, "learning_rate": 8.827886710239651e-07, "loss": 4.051795482635498, "step": 539 }, { "epoch": 1.1764705882352942, "grad_norm": 1.7833117246627808, "learning_rate": 8.825708061002179e-07, "loss": 4.164949417114258, "step": 540 }, { "epoch": 1.1786492374727668, "grad_norm": 2.0552051067352295, "learning_rate": 8.823529411764705e-07, "loss": 4.134295463562012, "step": 541 }, { "epoch": 1.1808278867102397, "grad_norm": 1.839187741279602, "learning_rate": 8.821350762527234e-07, "loss": 4.210404872894287, "step": 542 }, { "epoch": 1.1830065359477124, "grad_norm": 2.6453094482421875, "learning_rate": 8.81917211328976e-07, "loss": 4.272027015686035, "step": 543 }, { "epoch": 1.1851851851851851, "grad_norm": 2.3471319675445557, "learning_rate": 8.816993464052287e-07, "loss": 4.13799524307251, "step": 544 }, { "epoch": 1.187363834422658, "grad_norm": 2.825316905975342, "learning_rate": 8.814814814814815e-07, "loss": 4.24373197555542, "step": 545 }, { "epoch": 1.1895424836601307, "grad_norm": 2.952958583831787, "learning_rate": 8.812636165577342e-07, "loss": 4.44833517074585, "step": 546 }, { "epoch": 1.1917211328976034, "grad_norm": 1.8417699337005615, "learning_rate": 8.810457516339869e-07, "loss": 4.099013328552246, "step": 547 }, { "epoch": 1.1938997821350763, "grad_norm": 1.6055537462234497, "learning_rate": 8.808278867102397e-07, "loss": 4.119231700897217, "step": 548 }, { "epoch": 1.196078431372549, "grad_norm": 2.205374240875244, "learning_rate": 8.806100217864923e-07, "loss": 4.224857807159424, "step": 549 }, { "epoch": 1.1982570806100217, "grad_norm": 1.8354541063308716, "learning_rate": 8.803921568627451e-07, "loss": 4.1977219581604, "step": 550 }, { "epoch": 1.2004357298474946, "grad_norm": 3.420647382736206, "learning_rate": 8.801742919389977e-07, "loss": 4.254315376281738, "step": 551 }, { "epoch": 1.2026143790849673, "grad_norm": 2.714442014694214, "learning_rate": 8.799564270152506e-07, "loss": 4.35409688949585, "step": 552 }, { "epoch": 1.2047930283224402, "grad_norm": 2.5161349773406982, "learning_rate": 8.797385620915032e-07, "loss": 4.179507732391357, "step": 553 }, { "epoch": 1.2069716775599129, "grad_norm": 1.7604862451553345, "learning_rate": 8.79520697167756e-07, "loss": 4.025084018707275, "step": 554 }, { "epoch": 1.2091503267973855, "grad_norm": 1.9522942304611206, "learning_rate": 8.793028322440087e-07, "loss": 4.150627613067627, "step": 555 }, { "epoch": 1.2113289760348585, "grad_norm": 1.4442564249038696, "learning_rate": 8.790849673202614e-07, "loss": 3.9935989379882812, "step": 556 }, { "epoch": 1.2135076252723311, "grad_norm": 1.842314600944519, "learning_rate": 8.788671023965141e-07, "loss": 4.112300395965576, "step": 557 }, { "epoch": 1.215686274509804, "grad_norm": 2.0112502574920654, "learning_rate": 8.786492374727669e-07, "loss": 4.174977779388428, "step": 558 }, { "epoch": 1.2178649237472767, "grad_norm": 2.875356912612915, "learning_rate": 8.784313725490196e-07, "loss": 4.330785751342773, "step": 559 }, { "epoch": 1.2200435729847494, "grad_norm": 2.6117005348205566, "learning_rate": 8.782135076252722e-07, "loss": 4.278181552886963, "step": 560 }, { "epoch": 1.2222222222222223, "grad_norm": 1.900143027305603, "learning_rate": 8.77995642701525e-07, "loss": 4.112969875335693, "step": 561 }, { "epoch": 1.224400871459695, "grad_norm": 2.344101905822754, "learning_rate": 8.777777777777777e-07, "loss": 4.2813591957092285, "step": 562 }, { "epoch": 1.2265795206971677, "grad_norm": 3.5927395820617676, "learning_rate": 8.775599128540304e-07, "loss": 4.283821105957031, "step": 563 }, { "epoch": 1.2287581699346406, "grad_norm": 1.8207329511642456, "learning_rate": 8.773420479302832e-07, "loss": 4.066518783569336, "step": 564 }, { "epoch": 1.2309368191721133, "grad_norm": 2.783954381942749, "learning_rate": 8.771241830065359e-07, "loss": 4.4028825759887695, "step": 565 }, { "epoch": 1.233115468409586, "grad_norm": 1.7942478656768799, "learning_rate": 8.769063180827885e-07, "loss": 4.108409404754639, "step": 566 }, { "epoch": 1.2352941176470589, "grad_norm": 1.718116283416748, "learning_rate": 8.766884531590414e-07, "loss": 4.059362411499023, "step": 567 }, { "epoch": 1.2374727668845316, "grad_norm": 1.9296554327011108, "learning_rate": 8.76470588235294e-07, "loss": 4.107104778289795, "step": 568 }, { "epoch": 1.2396514161220042, "grad_norm": 2.297661781311035, "learning_rate": 8.762527233115468e-07, "loss": 4.21260929107666, "step": 569 }, { "epoch": 1.2418300653594772, "grad_norm": 2.2548274993896484, "learning_rate": 8.760348583877995e-07, "loss": 4.274600982666016, "step": 570 }, { "epoch": 1.2440087145969498, "grad_norm": 1.8183406591415405, "learning_rate": 8.758169934640523e-07, "loss": 4.0709099769592285, "step": 571 }, { "epoch": 1.2461873638344227, "grad_norm": 2.6578145027160645, "learning_rate": 8.755991285403049e-07, "loss": 4.393871784210205, "step": 572 }, { "epoch": 1.2483660130718954, "grad_norm": 1.9764410257339478, "learning_rate": 8.753812636165577e-07, "loss": 4.13034200668335, "step": 573 }, { "epoch": 1.2505446623093681, "grad_norm": 1.810540795326233, "learning_rate": 8.751633986928104e-07, "loss": 4.121158599853516, "step": 574 }, { "epoch": 1.252723311546841, "grad_norm": 2.0350353717803955, "learning_rate": 8.749455337690631e-07, "loss": 4.358162879943848, "step": 575 }, { "epoch": 1.2549019607843137, "grad_norm": 1.644514799118042, "learning_rate": 8.747276688453159e-07, "loss": 4.1536664962768555, "step": 576 }, { "epoch": 1.2570806100217866, "grad_norm": 1.8875399827957153, "learning_rate": 8.745098039215686e-07, "loss": 4.085392951965332, "step": 577 }, { "epoch": 1.2592592592592593, "grad_norm": 3.08160400390625, "learning_rate": 8.742919389978212e-07, "loss": 4.294520378112793, "step": 578 }, { "epoch": 1.261437908496732, "grad_norm": 1.9114454984664917, "learning_rate": 8.740740740740741e-07, "loss": 4.14904260635376, "step": 579 }, { "epoch": 1.263616557734205, "grad_norm": 1.8417164087295532, "learning_rate": 8.738562091503267e-07, "loss": 4.083341121673584, "step": 580 }, { "epoch": 1.2657952069716776, "grad_norm": 2.2407267093658447, "learning_rate": 8.736383442265795e-07, "loss": 4.179035663604736, "step": 581 }, { "epoch": 1.2679738562091503, "grad_norm": 2.0293619632720947, "learning_rate": 8.734204793028322e-07, "loss": 4.019626617431641, "step": 582 }, { "epoch": 1.2701525054466232, "grad_norm": 2.203669309616089, "learning_rate": 8.73202614379085e-07, "loss": 4.264548301696777, "step": 583 }, { "epoch": 1.2723311546840959, "grad_norm": 1.843723177909851, "learning_rate": 8.729847494553376e-07, "loss": 3.9418485164642334, "step": 584 }, { "epoch": 1.2745098039215685, "grad_norm": 2.299044370651245, "learning_rate": 8.727668845315904e-07, "loss": 4.272276878356934, "step": 585 }, { "epoch": 1.2766884531590414, "grad_norm": 2.1515400409698486, "learning_rate": 8.725490196078431e-07, "loss": 4.007951259613037, "step": 586 }, { "epoch": 1.2788671023965141, "grad_norm": 2.155714988708496, "learning_rate": 8.723311546840958e-07, "loss": 4.171961784362793, "step": 587 }, { "epoch": 1.2810457516339868, "grad_norm": 1.9070075750350952, "learning_rate": 8.721132897603486e-07, "loss": 4.227767467498779, "step": 588 }, { "epoch": 1.2832244008714597, "grad_norm": 1.6582120656967163, "learning_rate": 8.718954248366013e-07, "loss": 4.170677661895752, "step": 589 }, { "epoch": 1.2854030501089324, "grad_norm": 2.051173686981201, "learning_rate": 8.716775599128539e-07, "loss": 4.14252233505249, "step": 590 }, { "epoch": 1.287581699346405, "grad_norm": 2.2936742305755615, "learning_rate": 8.714596949891068e-07, "loss": 4.2736005783081055, "step": 591 }, { "epoch": 1.289760348583878, "grad_norm": 1.9001537561416626, "learning_rate": 8.712418300653594e-07, "loss": 4.101021766662598, "step": 592 }, { "epoch": 1.2919389978213507, "grad_norm": 2.480698823928833, "learning_rate": 8.710239651416122e-07, "loss": 4.056864261627197, "step": 593 }, { "epoch": 1.2941176470588236, "grad_norm": 1.8155734539031982, "learning_rate": 8.708061002178649e-07, "loss": 4.0633864402771, "step": 594 }, { "epoch": 1.2962962962962963, "grad_norm": 1.714326024055481, "learning_rate": 8.705882352941177e-07, "loss": 4.063111782073975, "step": 595 }, { "epoch": 1.2984749455337692, "grad_norm": 1.7356120347976685, "learning_rate": 8.703703703703703e-07, "loss": 4.117801666259766, "step": 596 }, { "epoch": 1.3006535947712419, "grad_norm": 1.5628883838653564, "learning_rate": 8.701525054466231e-07, "loss": 4.079702854156494, "step": 597 }, { "epoch": 1.3028322440087146, "grad_norm": 1.7579734325408936, "learning_rate": 8.699346405228758e-07, "loss": 4.0935821533203125, "step": 598 }, { "epoch": 1.3050108932461875, "grad_norm": 1.7545808553695679, "learning_rate": 8.697167755991285e-07, "loss": 3.990138292312622, "step": 599 }, { "epoch": 1.3071895424836601, "grad_norm": 2.3353233337402344, "learning_rate": 8.694989106753812e-07, "loss": 4.172370910644531, "step": 600 }, { "epoch": 1.3093681917211328, "grad_norm": 2.1901204586029053, "learning_rate": 8.69281045751634e-07, "loss": 4.246954917907715, "step": 601 }, { "epoch": 1.3115468409586057, "grad_norm": 1.9137846231460571, "learning_rate": 8.690631808278866e-07, "loss": 4.196938991546631, "step": 602 }, { "epoch": 1.3137254901960784, "grad_norm": 2.2935667037963867, "learning_rate": 8.688453159041394e-07, "loss": 4.263093948364258, "step": 603 }, { "epoch": 1.3159041394335511, "grad_norm": 2.0060575008392334, "learning_rate": 8.686274509803921e-07, "loss": 4.077230930328369, "step": 604 }, { "epoch": 1.318082788671024, "grad_norm": 2.045654058456421, "learning_rate": 8.684095860566449e-07, "loss": 4.155270576477051, "step": 605 }, { "epoch": 1.3202614379084967, "grad_norm": 1.791175127029419, "learning_rate": 8.681917211328975e-07, "loss": 3.989689350128174, "step": 606 }, { "epoch": 1.3224400871459694, "grad_norm": 2.2741477489471436, "learning_rate": 8.679738562091503e-07, "loss": 4.343669414520264, "step": 607 }, { "epoch": 1.3246187363834423, "grad_norm": 2.6767468452453613, "learning_rate": 8.67755991285403e-07, "loss": 4.3884053230285645, "step": 608 }, { "epoch": 1.326797385620915, "grad_norm": 1.8251911401748657, "learning_rate": 8.675381263616557e-07, "loss": 3.866062641143799, "step": 609 }, { "epoch": 1.3289760348583877, "grad_norm": 2.0717525482177734, "learning_rate": 8.673202614379085e-07, "loss": 4.048012733459473, "step": 610 }, { "epoch": 1.3311546840958606, "grad_norm": 2.947192668914795, "learning_rate": 8.671023965141612e-07, "loss": 4.248272895812988, "step": 611 }, { "epoch": 1.3333333333333333, "grad_norm": 1.7812186479568481, "learning_rate": 8.668845315904138e-07, "loss": 4.139060974121094, "step": 612 }, { "epoch": 1.3355119825708062, "grad_norm": 1.6348546743392944, "learning_rate": 8.666666666666667e-07, "loss": 4.0591607093811035, "step": 613 }, { "epoch": 1.3376906318082789, "grad_norm": 1.8021186590194702, "learning_rate": 8.664488017429193e-07, "loss": 4.05558967590332, "step": 614 }, { "epoch": 1.3398692810457518, "grad_norm": 2.153714656829834, "learning_rate": 8.662309368191721e-07, "loss": 4.266423225402832, "step": 615 }, { "epoch": 1.3420479302832244, "grad_norm": 2.1807448863983154, "learning_rate": 8.660130718954248e-07, "loss": 4.325270652770996, "step": 616 }, { "epoch": 1.3442265795206971, "grad_norm": 2.289285659790039, "learning_rate": 8.657952069716776e-07, "loss": 4.212075233459473, "step": 617 }, { "epoch": 1.34640522875817, "grad_norm": 2.276578187942505, "learning_rate": 8.655773420479302e-07, "loss": 4.1278204917907715, "step": 618 }, { "epoch": 1.3485838779956427, "grad_norm": 1.9747040271759033, "learning_rate": 8.65359477124183e-07, "loss": 4.086096286773682, "step": 619 }, { "epoch": 1.3507625272331154, "grad_norm": 1.6472103595733643, "learning_rate": 8.651416122004357e-07, "loss": 4.108689785003662, "step": 620 }, { "epoch": 1.3529411764705883, "grad_norm": 1.5590437650680542, "learning_rate": 8.649237472766884e-07, "loss": 4.066740989685059, "step": 621 }, { "epoch": 1.355119825708061, "grad_norm": 2.112551689147949, "learning_rate": 8.647058823529412e-07, "loss": 4.318425178527832, "step": 622 }, { "epoch": 1.3572984749455337, "grad_norm": 3.0403637886047363, "learning_rate": 8.644880174291939e-07, "loss": 4.5624213218688965, "step": 623 }, { "epoch": 1.3594771241830066, "grad_norm": 1.8333574533462524, "learning_rate": 8.642701525054465e-07, "loss": 4.240340232849121, "step": 624 }, { "epoch": 1.3616557734204793, "grad_norm": 1.9124542474746704, "learning_rate": 8.640522875816994e-07, "loss": 3.9487993717193604, "step": 625 }, { "epoch": 1.363834422657952, "grad_norm": 2.546870708465576, "learning_rate": 8.63834422657952e-07, "loss": 4.243712425231934, "step": 626 }, { "epoch": 1.3660130718954249, "grad_norm": 1.8149443864822388, "learning_rate": 8.636165577342047e-07, "loss": 4.141324520111084, "step": 627 }, { "epoch": 1.3681917211328976, "grad_norm": 1.976372480392456, "learning_rate": 8.633986928104575e-07, "loss": 4.129775047302246, "step": 628 }, { "epoch": 1.3703703703703702, "grad_norm": 1.708207607269287, "learning_rate": 8.631808278867102e-07, "loss": 4.219161033630371, "step": 629 }, { "epoch": 1.3725490196078431, "grad_norm": 2.0918285846710205, "learning_rate": 8.629629629629629e-07, "loss": 4.162289142608643, "step": 630 }, { "epoch": 1.3747276688453158, "grad_norm": 2.2789528369903564, "learning_rate": 8.627450980392156e-07, "loss": 4.12668514251709, "step": 631 }, { "epoch": 1.3769063180827887, "grad_norm": 2.140989303588867, "learning_rate": 8.625272331154684e-07, "loss": 4.185495376586914, "step": 632 }, { "epoch": 1.3790849673202614, "grad_norm": 1.6602414846420288, "learning_rate": 8.62309368191721e-07, "loss": 4.035266399383545, "step": 633 }, { "epoch": 1.3812636165577343, "grad_norm": 2.071666955947876, "learning_rate": 8.620915032679739e-07, "loss": 4.279394149780273, "step": 634 }, { "epoch": 1.383442265795207, "grad_norm": 2.084772825241089, "learning_rate": 8.618736383442265e-07, "loss": 4.255674839019775, "step": 635 }, { "epoch": 1.3856209150326797, "grad_norm": 2.2395293712615967, "learning_rate": 8.616557734204792e-07, "loss": 4.059002876281738, "step": 636 }, { "epoch": 1.3877995642701526, "grad_norm": 1.7776122093200684, "learning_rate": 8.61437908496732e-07, "loss": 4.005791664123535, "step": 637 }, { "epoch": 1.3899782135076253, "grad_norm": 2.2446353435516357, "learning_rate": 8.612200435729847e-07, "loss": 4.245075702667236, "step": 638 }, { "epoch": 1.392156862745098, "grad_norm": 1.9267030954360962, "learning_rate": 8.610021786492374e-07, "loss": 4.217100143432617, "step": 639 }, { "epoch": 1.3943355119825709, "grad_norm": 2.371600389480591, "learning_rate": 8.607843137254902e-07, "loss": 4.454224586486816, "step": 640 }, { "epoch": 1.3965141612200436, "grad_norm": 1.731301188468933, "learning_rate": 8.605664488017429e-07, "loss": 4.082315921783447, "step": 641 }, { "epoch": 1.3986928104575163, "grad_norm": 1.8527662754058838, "learning_rate": 8.603485838779956e-07, "loss": 4.1427717208862305, "step": 642 }, { "epoch": 1.4008714596949892, "grad_norm": 2.3532307147979736, "learning_rate": 8.601307189542483e-07, "loss": 4.30314302444458, "step": 643 }, { "epoch": 1.4030501089324618, "grad_norm": 2.0085225105285645, "learning_rate": 8.599128540305011e-07, "loss": 4.08289098739624, "step": 644 }, { "epoch": 1.4052287581699345, "grad_norm": 2.219562292098999, "learning_rate": 8.596949891067537e-07, "loss": 4.21745491027832, "step": 645 }, { "epoch": 1.4074074074074074, "grad_norm": 1.594504952430725, "learning_rate": 8.594771241830066e-07, "loss": 4.054660797119141, "step": 646 }, { "epoch": 1.4095860566448801, "grad_norm": 2.6267402172088623, "learning_rate": 8.592592592592592e-07, "loss": 4.326120853424072, "step": 647 }, { "epoch": 1.4117647058823528, "grad_norm": 1.694117546081543, "learning_rate": 8.590413943355119e-07, "loss": 4.100274562835693, "step": 648 }, { "epoch": 1.4139433551198257, "grad_norm": 2.701446533203125, "learning_rate": 8.588235294117646e-07, "loss": 4.274817943572998, "step": 649 }, { "epoch": 1.4161220043572984, "grad_norm": 2.4802911281585693, "learning_rate": 8.586056644880174e-07, "loss": 4.324031352996826, "step": 650 }, { "epoch": 1.4183006535947713, "grad_norm": 2.1803507804870605, "learning_rate": 8.583877995642701e-07, "loss": 4.156269073486328, "step": 651 }, { "epoch": 1.420479302832244, "grad_norm": 2.0425868034362793, "learning_rate": 8.581699346405228e-07, "loss": 4.301708698272705, "step": 652 }, { "epoch": 1.422657952069717, "grad_norm": 1.7117418050765991, "learning_rate": 8.579520697167755e-07, "loss": 4.2754364013671875, "step": 653 }, { "epoch": 1.4248366013071896, "grad_norm": 2.270146369934082, "learning_rate": 8.577342047930283e-07, "loss": 4.212061405181885, "step": 654 }, { "epoch": 1.4270152505446623, "grad_norm": 1.9313299655914307, "learning_rate": 8.575163398692809e-07, "loss": 4.107442855834961, "step": 655 }, { "epoch": 1.4291938997821352, "grad_norm": 2.3042805194854736, "learning_rate": 8.572984749455338e-07, "loss": 4.111403465270996, "step": 656 }, { "epoch": 1.4313725490196079, "grad_norm": 2.694254159927368, "learning_rate": 8.570806100217864e-07, "loss": 4.4755635261535645, "step": 657 }, { "epoch": 1.4335511982570806, "grad_norm": 2.446035385131836, "learning_rate": 8.568627450980392e-07, "loss": 4.076883316040039, "step": 658 }, { "epoch": 1.4357298474945535, "grad_norm": 1.999457836151123, "learning_rate": 8.566448801742919e-07, "loss": 4.225375175476074, "step": 659 }, { "epoch": 1.4379084967320261, "grad_norm": 1.8990052938461304, "learning_rate": 8.564270152505446e-07, "loss": 4.106850624084473, "step": 660 }, { "epoch": 1.4400871459694988, "grad_norm": 1.8520864248275757, "learning_rate": 8.562091503267973e-07, "loss": 4.094986438751221, "step": 661 }, { "epoch": 1.4422657952069717, "grad_norm": 2.049786329269409, "learning_rate": 8.559912854030501e-07, "loss": 4.111976146697998, "step": 662 }, { "epoch": 1.4444444444444444, "grad_norm": 1.691466212272644, "learning_rate": 8.557734204793028e-07, "loss": 4.183073997497559, "step": 663 }, { "epoch": 1.446623093681917, "grad_norm": 2.075439214706421, "learning_rate": 8.555555555555555e-07, "loss": 4.08475923538208, "step": 664 }, { "epoch": 1.44880174291939, "grad_norm": 2.1009483337402344, "learning_rate": 8.553376906318082e-07, "loss": 4.212969779968262, "step": 665 }, { "epoch": 1.4509803921568627, "grad_norm": 2.1857805252075195, "learning_rate": 8.55119825708061e-07, "loss": 4.26734733581543, "step": 666 }, { "epoch": 1.4531590413943354, "grad_norm": 2.0799925327301025, "learning_rate": 8.549019607843136e-07, "loss": 4.240498065948486, "step": 667 }, { "epoch": 1.4553376906318083, "grad_norm": 1.8748165369033813, "learning_rate": 8.546840958605665e-07, "loss": 4.060379505157471, "step": 668 }, { "epoch": 1.457516339869281, "grad_norm": 2.179856061935425, "learning_rate": 8.544662309368191e-07, "loss": 4.0631279945373535, "step": 669 }, { "epoch": 1.4596949891067539, "grad_norm": 1.7492318153381348, "learning_rate": 8.542483660130719e-07, "loss": 4.136343002319336, "step": 670 }, { "epoch": 1.4618736383442266, "grad_norm": 1.8408169746398926, "learning_rate": 8.540305010893246e-07, "loss": 4.27632999420166, "step": 671 }, { "epoch": 1.4640522875816995, "grad_norm": 2.1574692726135254, "learning_rate": 8.538126361655773e-07, "loss": 4.055308818817139, "step": 672 }, { "epoch": 1.4662309368191722, "grad_norm": 2.2931578159332275, "learning_rate": 8.5359477124183e-07, "loss": 4.093464374542236, "step": 673 }, { "epoch": 1.4684095860566448, "grad_norm": 2.194950819015503, "learning_rate": 8.533769063180828e-07, "loss": 4.073080539703369, "step": 674 }, { "epoch": 1.4705882352941178, "grad_norm": 2.5183699131011963, "learning_rate": 8.531590413943355e-07, "loss": 4.375260829925537, "step": 675 }, { "epoch": 1.4727668845315904, "grad_norm": 2.575937509536743, "learning_rate": 8.529411764705882e-07, "loss": 4.187258720397949, "step": 676 }, { "epoch": 1.4749455337690631, "grad_norm": 2.9016573429107666, "learning_rate": 8.527233115468409e-07, "loss": 4.223710536956787, "step": 677 }, { "epoch": 1.477124183006536, "grad_norm": 2.3146796226501465, "learning_rate": 8.525054466230937e-07, "loss": 4.331415176391602, "step": 678 }, { "epoch": 1.4793028322440087, "grad_norm": 2.0989973545074463, "learning_rate": 8.522875816993463e-07, "loss": 4.1337103843688965, "step": 679 }, { "epoch": 1.4814814814814814, "grad_norm": 2.021897554397583, "learning_rate": 8.520697167755992e-07, "loss": 4.094692230224609, "step": 680 }, { "epoch": 1.4836601307189543, "grad_norm": 2.1566081047058105, "learning_rate": 8.518518518518518e-07, "loss": 4.125713348388672, "step": 681 }, { "epoch": 1.485838779956427, "grad_norm": 2.340773820877075, "learning_rate": 8.516339869281046e-07, "loss": 4.3238139152526855, "step": 682 }, { "epoch": 1.4880174291938997, "grad_norm": 1.5676244497299194, "learning_rate": 8.514161220043573e-07, "loss": 4.025534152984619, "step": 683 }, { "epoch": 1.4901960784313726, "grad_norm": 1.9404288530349731, "learning_rate": 8.5119825708061e-07, "loss": 4.189544200897217, "step": 684 }, { "epoch": 1.4923747276688453, "grad_norm": 1.8314532041549683, "learning_rate": 8.509803921568627e-07, "loss": 4.079095840454102, "step": 685 }, { "epoch": 1.494553376906318, "grad_norm": 1.8013782501220703, "learning_rate": 8.507625272331155e-07, "loss": 4.041965007781982, "step": 686 }, { "epoch": 1.4967320261437909, "grad_norm": 2.0033180713653564, "learning_rate": 8.505446623093682e-07, "loss": 4.268764972686768, "step": 687 }, { "epoch": 1.4989106753812635, "grad_norm": 1.851022481918335, "learning_rate": 8.503267973856209e-07, "loss": 4.118299961090088, "step": 688 }, { "epoch": 1.5010893246187362, "grad_norm": 1.7786550521850586, "learning_rate": 8.501089324618736e-07, "loss": 4.1545491218566895, "step": 689 }, { "epoch": 1.5032679738562091, "grad_norm": 2.040213108062744, "learning_rate": 8.498910675381264e-07, "loss": 4.190618515014648, "step": 690 }, { "epoch": 1.505446623093682, "grad_norm": 2.7984845638275146, "learning_rate": 8.49673202614379e-07, "loss": 4.385209560394287, "step": 691 }, { "epoch": 1.5076252723311547, "grad_norm": 2.5531365871429443, "learning_rate": 8.494553376906319e-07, "loss": 4.12838077545166, "step": 692 }, { "epoch": 1.5098039215686274, "grad_norm": 2.1750972270965576, "learning_rate": 8.492374727668845e-07, "loss": 4.345357894897461, "step": 693 }, { "epoch": 1.5119825708061003, "grad_norm": 2.0459821224212646, "learning_rate": 8.490196078431372e-07, "loss": 4.184515953063965, "step": 694 }, { "epoch": 1.514161220043573, "grad_norm": 1.7549774646759033, "learning_rate": 8.488017429193899e-07, "loss": 3.9388818740844727, "step": 695 }, { "epoch": 1.5163398692810457, "grad_norm": 1.8730785846710205, "learning_rate": 8.485838779956427e-07, "loss": 4.150574684143066, "step": 696 }, { "epoch": 1.5185185185185186, "grad_norm": 2.4714531898498535, "learning_rate": 8.483660130718954e-07, "loss": 4.330325603485107, "step": 697 }, { "epoch": 1.5206971677559913, "grad_norm": 2.2690491676330566, "learning_rate": 8.48148148148148e-07, "loss": 4.175599575042725, "step": 698 }, { "epoch": 1.522875816993464, "grad_norm": 2.6730496883392334, "learning_rate": 8.479302832244009e-07, "loss": 4.245357990264893, "step": 699 }, { "epoch": 1.5250544662309369, "grad_norm": 1.9761924743652344, "learning_rate": 8.477124183006535e-07, "loss": 4.170820236206055, "step": 700 }, { "epoch": 1.5272331154684096, "grad_norm": 2.378988742828369, "learning_rate": 8.474945533769062e-07, "loss": 4.461413383483887, "step": 701 }, { "epoch": 1.5294117647058822, "grad_norm": 2.0880000591278076, "learning_rate": 8.47276688453159e-07, "loss": 4.0684332847595215, "step": 702 }, { "epoch": 1.5315904139433552, "grad_norm": 2.232128381729126, "learning_rate": 8.470588235294117e-07, "loss": 4.241603851318359, "step": 703 }, { "epoch": 1.5337690631808278, "grad_norm": 1.497976303100586, "learning_rate": 8.468409586056644e-07, "loss": 4.133387565612793, "step": 704 }, { "epoch": 1.5359477124183005, "grad_norm": 2.3840863704681396, "learning_rate": 8.466230936819172e-07, "loss": 4.2359843254089355, "step": 705 }, { "epoch": 1.5381263616557734, "grad_norm": 2.4907233715057373, "learning_rate": 8.464052287581698e-07, "loss": 4.252405166625977, "step": 706 }, { "epoch": 1.5403050108932463, "grad_norm": 1.614211082458496, "learning_rate": 8.461873638344226e-07, "loss": 3.991302967071533, "step": 707 }, { "epoch": 1.5424836601307188, "grad_norm": 2.479708194732666, "learning_rate": 8.459694989106753e-07, "loss": 4.274848461151123, "step": 708 }, { "epoch": 1.5446623093681917, "grad_norm": 2.5522878170013428, "learning_rate": 8.457516339869281e-07, "loss": 3.9473726749420166, "step": 709 }, { "epoch": 1.5468409586056646, "grad_norm": 1.760374903678894, "learning_rate": 8.455337690631807e-07, "loss": 4.017210006713867, "step": 710 }, { "epoch": 1.5490196078431373, "grad_norm": 2.186105251312256, "learning_rate": 8.453159041394336e-07, "loss": 4.234235763549805, "step": 711 }, { "epoch": 1.55119825708061, "grad_norm": 2.194594621658325, "learning_rate": 8.450980392156862e-07, "loss": 4.195955753326416, "step": 712 }, { "epoch": 1.553376906318083, "grad_norm": 2.408224105834961, "learning_rate": 8.448801742919389e-07, "loss": 4.3126373291015625, "step": 713 }, { "epoch": 1.5555555555555556, "grad_norm": 2.0503766536712646, "learning_rate": 8.446623093681917e-07, "loss": 4.171034812927246, "step": 714 }, { "epoch": 1.5577342047930283, "grad_norm": 1.6654257774353027, "learning_rate": 8.444444444444444e-07, "loss": 4.137068748474121, "step": 715 }, { "epoch": 1.5599128540305012, "grad_norm": 1.556382417678833, "learning_rate": 8.442265795206971e-07, "loss": 4.143765449523926, "step": 716 }, { "epoch": 1.5620915032679739, "grad_norm": 2.2294673919677734, "learning_rate": 8.440087145969499e-07, "loss": 4.1836652755737305, "step": 717 }, { "epoch": 1.5642701525054465, "grad_norm": 1.9865788221359253, "learning_rate": 8.437908496732025e-07, "loss": 4.180343151092529, "step": 718 }, { "epoch": 1.5664488017429194, "grad_norm": 2.3708291053771973, "learning_rate": 8.435729847494553e-07, "loss": 4.239871025085449, "step": 719 }, { "epoch": 1.5686274509803921, "grad_norm": 2.380955696105957, "learning_rate": 8.43355119825708e-07, "loss": 4.237709045410156, "step": 720 }, { "epoch": 1.5708061002178648, "grad_norm": 2.0145769119262695, "learning_rate": 8.431372549019608e-07, "loss": 4.175321102142334, "step": 721 }, { "epoch": 1.5729847494553377, "grad_norm": 2.2467691898345947, "learning_rate": 8.429193899782134e-07, "loss": 4.149237632751465, "step": 722 }, { "epoch": 1.5751633986928104, "grad_norm": 2.143671751022339, "learning_rate": 8.427015250544663e-07, "loss": 3.9390757083892822, "step": 723 }, { "epoch": 1.577342047930283, "grad_norm": 1.530213475227356, "learning_rate": 8.424836601307189e-07, "loss": 4.089017391204834, "step": 724 }, { "epoch": 1.579520697167756, "grad_norm": 1.558199167251587, "learning_rate": 8.422657952069716e-07, "loss": 4.002987384796143, "step": 725 }, { "epoch": 1.581699346405229, "grad_norm": 2.310760259628296, "learning_rate": 8.420479302832244e-07, "loss": 4.219573020935059, "step": 726 }, { "epoch": 1.5838779956427014, "grad_norm": 2.2556145191192627, "learning_rate": 8.418300653594771e-07, "loss": 4.190657138824463, "step": 727 }, { "epoch": 1.5860566448801743, "grad_norm": 2.477578639984131, "learning_rate": 8.416122004357298e-07, "loss": 4.351266384124756, "step": 728 }, { "epoch": 1.5882352941176472, "grad_norm": 2.657353162765503, "learning_rate": 8.413943355119826e-07, "loss": 4.08535623550415, "step": 729 }, { "epoch": 1.5904139433551199, "grad_norm": 1.9748584032058716, "learning_rate": 8.411764705882352e-07, "loss": 4.138681411743164, "step": 730 }, { "epoch": 1.5925925925925926, "grad_norm": 1.679963231086731, "learning_rate": 8.40958605664488e-07, "loss": 4.044337272644043, "step": 731 }, { "epoch": 1.5947712418300655, "grad_norm": 1.506542682647705, "learning_rate": 8.407407407407407e-07, "loss": 4.0625786781311035, "step": 732 }, { "epoch": 1.5969498910675382, "grad_norm": 2.638867139816284, "learning_rate": 8.405228758169935e-07, "loss": 4.3120293617248535, "step": 733 }, { "epoch": 1.5991285403050108, "grad_norm": 2.296577215194702, "learning_rate": 8.403050108932461e-07, "loss": 4.209537029266357, "step": 734 }, { "epoch": 1.6013071895424837, "grad_norm": 1.6808831691741943, "learning_rate": 8.40087145969499e-07, "loss": 4.153660297393799, "step": 735 }, { "epoch": 1.6034858387799564, "grad_norm": 2.1877191066741943, "learning_rate": 8.398692810457516e-07, "loss": 4.160507678985596, "step": 736 }, { "epoch": 1.6056644880174291, "grad_norm": 1.8676464557647705, "learning_rate": 8.396514161220043e-07, "loss": 4.152212619781494, "step": 737 }, { "epoch": 1.607843137254902, "grad_norm": 2.745182514190674, "learning_rate": 8.394335511982571e-07, "loss": 4.211499214172363, "step": 738 }, { "epoch": 1.6100217864923747, "grad_norm": 3.043419599533081, "learning_rate": 8.392156862745098e-07, "loss": 4.244939804077148, "step": 739 }, { "epoch": 1.6122004357298474, "grad_norm": 1.875920057296753, "learning_rate": 8.389978213507625e-07, "loss": 4.152059555053711, "step": 740 }, { "epoch": 1.6143790849673203, "grad_norm": 1.872031569480896, "learning_rate": 8.387799564270153e-07, "loss": 4.129920959472656, "step": 741 }, { "epoch": 1.616557734204793, "grad_norm": 2.147571086883545, "learning_rate": 8.385620915032679e-07, "loss": 4.100576400756836, "step": 742 }, { "epoch": 1.6187363834422657, "grad_norm": 2.6817004680633545, "learning_rate": 8.383442265795207e-07, "loss": 4.372639179229736, "step": 743 }, { "epoch": 1.6209150326797386, "grad_norm": 1.7509095668792725, "learning_rate": 8.381263616557733e-07, "loss": 4.160117149353027, "step": 744 }, { "epoch": 1.6230936819172115, "grad_norm": 1.7169424295425415, "learning_rate": 8.379084967320262e-07, "loss": 4.160579681396484, "step": 745 }, { "epoch": 1.625272331154684, "grad_norm": 2.0976953506469727, "learning_rate": 8.376906318082788e-07, "loss": 4.128151893615723, "step": 746 }, { "epoch": 1.6274509803921569, "grad_norm": 1.8374370336532593, "learning_rate": 8.374727668845315e-07, "loss": 4.156114101409912, "step": 747 }, { "epoch": 1.6296296296296298, "grad_norm": 1.7638161182403564, "learning_rate": 8.372549019607843e-07, "loss": 4.080852031707764, "step": 748 }, { "epoch": 1.6318082788671024, "grad_norm": 1.6407208442687988, "learning_rate": 8.37037037037037e-07, "loss": 4.051687240600586, "step": 749 }, { "epoch": 1.6339869281045751, "grad_norm": 2.6351497173309326, "learning_rate": 8.368191721132897e-07, "loss": 4.1568427085876465, "step": 750 }, { "epoch": 1.636165577342048, "grad_norm": 2.361440896987915, "learning_rate": 8.366013071895425e-07, "loss": 4.19932746887207, "step": 751 }, { "epoch": 1.6383442265795207, "grad_norm": 2.2749528884887695, "learning_rate": 8.363834422657951e-07, "loss": 4.016824722290039, "step": 752 }, { "epoch": 1.6405228758169934, "grad_norm": 2.8418686389923096, "learning_rate": 8.361655773420479e-07, "loss": 4.288532733917236, "step": 753 }, { "epoch": 1.6427015250544663, "grad_norm": 1.7989444732666016, "learning_rate": 8.359477124183006e-07, "loss": 4.1045145988464355, "step": 754 }, { "epoch": 1.644880174291939, "grad_norm": 2.264406442642212, "learning_rate": 8.357298474945534e-07, "loss": 4.169315814971924, "step": 755 }, { "epoch": 1.6470588235294117, "grad_norm": 2.8426315784454346, "learning_rate": 8.35511982570806e-07, "loss": 4.2573981285095215, "step": 756 }, { "epoch": 1.6492374727668846, "grad_norm": 1.6943989992141724, "learning_rate": 8.352941176470589e-07, "loss": 4.0508809089660645, "step": 757 }, { "epoch": 1.6514161220043573, "grad_norm": 1.7788587808609009, "learning_rate": 8.350762527233115e-07, "loss": 4.100910186767578, "step": 758 }, { "epoch": 1.65359477124183, "grad_norm": 2.466076135635376, "learning_rate": 8.348583877995642e-07, "loss": 4.1073479652404785, "step": 759 }, { "epoch": 1.6557734204793029, "grad_norm": 1.523006558418274, "learning_rate": 8.34640522875817e-07, "loss": 3.8831026554107666, "step": 760 }, { "epoch": 1.6579520697167756, "grad_norm": 2.3753342628479004, "learning_rate": 8.344226579520697e-07, "loss": 4.336416721343994, "step": 761 }, { "epoch": 1.6601307189542482, "grad_norm": 2.4241554737091064, "learning_rate": 8.342047930283224e-07, "loss": 4.172718524932861, "step": 762 }, { "epoch": 1.6623093681917211, "grad_norm": 2.0291216373443604, "learning_rate": 8.339869281045752e-07, "loss": 4.307475566864014, "step": 763 }, { "epoch": 1.664488017429194, "grad_norm": 2.0748989582061768, "learning_rate": 8.337690631808278e-07, "loss": 4.1607537269592285, "step": 764 }, { "epoch": 1.6666666666666665, "grad_norm": 2.1463654041290283, "learning_rate": 8.335511982570805e-07, "loss": 4.235579013824463, "step": 765 }, { "epoch": 1.6688453159041394, "grad_norm": 2.1689393520355225, "learning_rate": 8.333333333333333e-07, "loss": 4.119271755218506, "step": 766 }, { "epoch": 1.6710239651416123, "grad_norm": 2.717575788497925, "learning_rate": 8.33115468409586e-07, "loss": 4.078428268432617, "step": 767 }, { "epoch": 1.673202614379085, "grad_norm": 2.0350773334503174, "learning_rate": 8.328976034858387e-07, "loss": 4.062885761260986, "step": 768 }, { "epoch": 1.6753812636165577, "grad_norm": 1.9195847511291504, "learning_rate": 8.326797385620915e-07, "loss": 4.073990821838379, "step": 769 }, { "epoch": 1.6775599128540306, "grad_norm": 2.949504852294922, "learning_rate": 8.324618736383442e-07, "loss": 4.335659027099609, "step": 770 }, { "epoch": 1.6797385620915033, "grad_norm": 1.9603420495986938, "learning_rate": 8.322440087145968e-07, "loss": 4.064672470092773, "step": 771 }, { "epoch": 1.681917211328976, "grad_norm": 2.2606916427612305, "learning_rate": 8.320261437908497e-07, "loss": 4.184751033782959, "step": 772 }, { "epoch": 1.6840958605664489, "grad_norm": 2.24015212059021, "learning_rate": 8.318082788671023e-07, "loss": 4.266111373901367, "step": 773 }, { "epoch": 1.6862745098039216, "grad_norm": 1.6783232688903809, "learning_rate": 8.315904139433551e-07, "loss": 4.032918453216553, "step": 774 }, { "epoch": 1.6884531590413943, "grad_norm": 1.5461317300796509, "learning_rate": 8.313725490196078e-07, "loss": 3.9564428329467773, "step": 775 }, { "epoch": 1.6906318082788672, "grad_norm": 1.6963696479797363, "learning_rate": 8.311546840958605e-07, "loss": 4.096341133117676, "step": 776 }, { "epoch": 1.6928104575163399, "grad_norm": 2.407982349395752, "learning_rate": 8.309368191721132e-07, "loss": 4.316627502441406, "step": 777 }, { "epoch": 1.6949891067538125, "grad_norm": 1.936503291130066, "learning_rate": 8.30718954248366e-07, "loss": 4.110929012298584, "step": 778 }, { "epoch": 1.6971677559912854, "grad_norm": 2.0853638648986816, "learning_rate": 8.305010893246187e-07, "loss": 4.044859886169434, "step": 779 }, { "epoch": 1.6993464052287581, "grad_norm": 2.164699077606201, "learning_rate": 8.302832244008714e-07, "loss": 4.141659736633301, "step": 780 }, { "epoch": 1.7015250544662308, "grad_norm": 1.9006580114364624, "learning_rate": 8.300653594771242e-07, "loss": 4.062077045440674, "step": 781 }, { "epoch": 1.7037037037037037, "grad_norm": 1.861655831336975, "learning_rate": 8.298474945533769e-07, "loss": 4.208712100982666, "step": 782 }, { "epoch": 1.7058823529411766, "grad_norm": 1.6372374296188354, "learning_rate": 8.296296296296295e-07, "loss": 3.95259165763855, "step": 783 }, { "epoch": 1.708061002178649, "grad_norm": 2.2819674015045166, "learning_rate": 8.294117647058824e-07, "loss": 4.135552883148193, "step": 784 }, { "epoch": 1.710239651416122, "grad_norm": 2.443122386932373, "learning_rate": 8.29193899782135e-07, "loss": 4.259946346282959, "step": 785 }, { "epoch": 1.712418300653595, "grad_norm": 2.012627601623535, "learning_rate": 8.289760348583878e-07, "loss": 4.121253490447998, "step": 786 }, { "epoch": 1.7145969498910676, "grad_norm": 1.3586057424545288, "learning_rate": 8.287581699346405e-07, "loss": 4.039555549621582, "step": 787 }, { "epoch": 1.7167755991285403, "grad_norm": 2.1511480808258057, "learning_rate": 8.285403050108932e-07, "loss": 4.173498630523682, "step": 788 }, { "epoch": 1.7189542483660132, "grad_norm": 2.0098958015441895, "learning_rate": 8.283224400871459e-07, "loss": 4.161426067352295, "step": 789 }, { "epoch": 1.7211328976034859, "grad_norm": 2.373271942138672, "learning_rate": 8.281045751633986e-07, "loss": 4.128650188446045, "step": 790 }, { "epoch": 1.7233115468409586, "grad_norm": 2.205798625946045, "learning_rate": 8.278867102396514e-07, "loss": 4.0204691886901855, "step": 791 }, { "epoch": 1.7254901960784315, "grad_norm": 1.9594868421554565, "learning_rate": 8.276688453159041e-07, "loss": 4.067202568054199, "step": 792 }, { "epoch": 1.7276688453159041, "grad_norm": 1.6024609804153442, "learning_rate": 8.274509803921567e-07, "loss": 4.223491191864014, "step": 793 }, { "epoch": 1.7298474945533768, "grad_norm": 1.7322437763214111, "learning_rate": 8.272331154684096e-07, "loss": 4.083629608154297, "step": 794 }, { "epoch": 1.7320261437908497, "grad_norm": 1.869983196258545, "learning_rate": 8.270152505446622e-07, "loss": 4.153186798095703, "step": 795 }, { "epoch": 1.7342047930283224, "grad_norm": 2.2324352264404297, "learning_rate": 8.26797385620915e-07, "loss": 4.168092250823975, "step": 796 }, { "epoch": 1.736383442265795, "grad_norm": 2.010200023651123, "learning_rate": 8.265795206971677e-07, "loss": 4.020692348480225, "step": 797 }, { "epoch": 1.738562091503268, "grad_norm": 2.0050137042999268, "learning_rate": 8.263616557734205e-07, "loss": 4.09138822555542, "step": 798 }, { "epoch": 1.7407407407407407, "grad_norm": 1.8392466306686401, "learning_rate": 8.261437908496731e-07, "loss": 4.189724445343018, "step": 799 }, { "epoch": 1.7429193899782134, "grad_norm": 1.8870618343353271, "learning_rate": 8.259259259259259e-07, "loss": 4.2867560386657715, "step": 800 }, { "epoch": 1.7450980392156863, "grad_norm": 1.8800386190414429, "learning_rate": 8.257080610021786e-07, "loss": 4.1035566329956055, "step": 801 }, { "epoch": 1.7472766884531592, "grad_norm": 2.35392165184021, "learning_rate": 8.254901960784313e-07, "loss": 4.240886211395264, "step": 802 }, { "epoch": 1.7494553376906317, "grad_norm": 2.8022584915161133, "learning_rate": 8.252723311546841e-07, "loss": 4.21259880065918, "step": 803 }, { "epoch": 1.7516339869281046, "grad_norm": 2.359691619873047, "learning_rate": 8.250544662309368e-07, "loss": 4.255124092102051, "step": 804 }, { "epoch": 1.7538126361655775, "grad_norm": 2.651008129119873, "learning_rate": 8.248366013071894e-07, "loss": 4.270796298980713, "step": 805 }, { "epoch": 1.7559912854030502, "grad_norm": 2.625868082046509, "learning_rate": 8.246187363834423e-07, "loss": 4.097858905792236, "step": 806 }, { "epoch": 1.7581699346405228, "grad_norm": 2.2341153621673584, "learning_rate": 8.244008714596949e-07, "loss": 4.22205924987793, "step": 807 }, { "epoch": 1.7603485838779958, "grad_norm": 1.8799047470092773, "learning_rate": 8.241830065359477e-07, "loss": 4.17194938659668, "step": 808 }, { "epoch": 1.7625272331154684, "grad_norm": 2.784691095352173, "learning_rate": 8.239651416122004e-07, "loss": 4.25189208984375, "step": 809 }, { "epoch": 1.7647058823529411, "grad_norm": 2.134303331375122, "learning_rate": 8.237472766884532e-07, "loss": 4.250917911529541, "step": 810 }, { "epoch": 1.766884531590414, "grad_norm": 2.457502603530884, "learning_rate": 8.235294117647058e-07, "loss": 4.16502046585083, "step": 811 }, { "epoch": 1.7690631808278867, "grad_norm": 2.587965250015259, "learning_rate": 8.233115468409586e-07, "loss": 4.173135757446289, "step": 812 }, { "epoch": 1.7712418300653594, "grad_norm": 2.3227202892303467, "learning_rate": 8.230936819172113e-07, "loss": 4.194047927856445, "step": 813 }, { "epoch": 1.7734204793028323, "grad_norm": 1.8524086475372314, "learning_rate": 8.22875816993464e-07, "loss": 4.043094635009766, "step": 814 }, { "epoch": 1.775599128540305, "grad_norm": 2.0924582481384277, "learning_rate": 8.226579520697168e-07, "loss": 4.184132099151611, "step": 815 }, { "epoch": 1.7777777777777777, "grad_norm": 1.8107885122299194, "learning_rate": 8.224400871459695e-07, "loss": 4.095870018005371, "step": 816 }, { "epoch": 1.7799564270152506, "grad_norm": 1.818077564239502, "learning_rate": 8.222222222222221e-07, "loss": 4.120379447937012, "step": 817 }, { "epoch": 1.7821350762527233, "grad_norm": 2.0244295597076416, "learning_rate": 8.22004357298475e-07, "loss": 4.082331657409668, "step": 818 }, { "epoch": 1.784313725490196, "grad_norm": 1.8549842834472656, "learning_rate": 8.217864923747276e-07, "loss": 4.0240797996521, "step": 819 }, { "epoch": 1.7864923747276689, "grad_norm": 1.7515335083007812, "learning_rate": 8.215686274509804e-07, "loss": 3.988206624984741, "step": 820 }, { "epoch": 1.7886710239651418, "grad_norm": 2.1797804832458496, "learning_rate": 8.213507625272331e-07, "loss": 4.180747032165527, "step": 821 }, { "epoch": 1.7908496732026142, "grad_norm": 1.6823264360427856, "learning_rate": 8.211328976034859e-07, "loss": 4.057288646697998, "step": 822 }, { "epoch": 1.7930283224400871, "grad_norm": 3.192840337753296, "learning_rate": 8.209150326797385e-07, "loss": 4.259395599365234, "step": 823 }, { "epoch": 1.79520697167756, "grad_norm": 2.389817953109741, "learning_rate": 8.206971677559913e-07, "loss": 4.227148056030273, "step": 824 }, { "epoch": 1.7973856209150327, "grad_norm": 2.1692278385162354, "learning_rate": 8.20479302832244e-07, "loss": 4.241482734680176, "step": 825 }, { "epoch": 1.7995642701525054, "grad_norm": 1.6416308879852295, "learning_rate": 8.202614379084967e-07, "loss": 4.1019062995910645, "step": 826 }, { "epoch": 1.8017429193899783, "grad_norm": 1.9569809436798096, "learning_rate": 8.200435729847495e-07, "loss": 4.153873920440674, "step": 827 }, { "epoch": 1.803921568627451, "grad_norm": 2.9770915508270264, "learning_rate": 8.198257080610022e-07, "loss": 4.370567798614502, "step": 828 }, { "epoch": 1.8061002178649237, "grad_norm": 1.706908941268921, "learning_rate": 8.196078431372548e-07, "loss": 3.9940435886383057, "step": 829 }, { "epoch": 1.8082788671023966, "grad_norm": 1.7746074199676514, "learning_rate": 8.193899782135077e-07, "loss": 3.9646553993225098, "step": 830 }, { "epoch": 1.8104575163398693, "grad_norm": 1.8066703081130981, "learning_rate": 8.191721132897603e-07, "loss": 4.044873237609863, "step": 831 }, { "epoch": 1.812636165577342, "grad_norm": 2.9295389652252197, "learning_rate": 8.189542483660131e-07, "loss": 4.238265037536621, "step": 832 }, { "epoch": 1.8148148148148149, "grad_norm": 1.475029468536377, "learning_rate": 8.187363834422658e-07, "loss": 4.009505748748779, "step": 833 }, { "epoch": 1.8169934640522876, "grad_norm": 1.9991767406463623, "learning_rate": 8.185185185185185e-07, "loss": 4.0429558753967285, "step": 834 }, { "epoch": 1.8191721132897603, "grad_norm": 4.568184852600098, "learning_rate": 8.183006535947712e-07, "loss": 4.427497863769531, "step": 835 }, { "epoch": 1.8213507625272332, "grad_norm": 2.8527064323425293, "learning_rate": 8.18082788671024e-07, "loss": 4.286968231201172, "step": 836 }, { "epoch": 1.8235294117647058, "grad_norm": 2.4208083152770996, "learning_rate": 8.178649237472767e-07, "loss": 4.215007781982422, "step": 837 }, { "epoch": 1.8257080610021785, "grad_norm": 2.705932140350342, "learning_rate": 8.176470588235293e-07, "loss": 4.467679500579834, "step": 838 }, { "epoch": 1.8278867102396514, "grad_norm": 2.1849515438079834, "learning_rate": 8.17429193899782e-07, "loss": 4.122862815856934, "step": 839 }, { "epoch": 1.8300653594771243, "grad_norm": 3.1176061630249023, "learning_rate": 8.172113289760348e-07, "loss": 4.375067234039307, "step": 840 }, { "epoch": 1.8322440087145968, "grad_norm": 1.7952834367752075, "learning_rate": 8.169934640522875e-07, "loss": 4.01256799697876, "step": 841 }, { "epoch": 1.8344226579520697, "grad_norm": 1.9415255784988403, "learning_rate": 8.167755991285402e-07, "loss": 4.252836227416992, "step": 842 }, { "epoch": 1.8366013071895426, "grad_norm": 1.674583911895752, "learning_rate": 8.16557734204793e-07, "loss": 4.167291164398193, "step": 843 }, { "epoch": 1.8387799564270153, "grad_norm": 1.6454881429672241, "learning_rate": 8.163398692810457e-07, "loss": 4.043299198150635, "step": 844 }, { "epoch": 1.840958605664488, "grad_norm": 1.7338085174560547, "learning_rate": 8.161220043572984e-07, "loss": 4.12736701965332, "step": 845 }, { "epoch": 1.843137254901961, "grad_norm": 2.2820301055908203, "learning_rate": 8.159041394335511e-07, "loss": 4.034628868103027, "step": 846 }, { "epoch": 1.8453159041394336, "grad_norm": 2.290955066680908, "learning_rate": 8.156862745098039e-07, "loss": 4.174453258514404, "step": 847 }, { "epoch": 1.8474945533769063, "grad_norm": 1.8796809911727905, "learning_rate": 8.154684095860565e-07, "loss": 4.108852863311768, "step": 848 }, { "epoch": 1.8496732026143792, "grad_norm": 2.643812894821167, "learning_rate": 8.152505446623094e-07, "loss": 4.296456336975098, "step": 849 }, { "epoch": 1.8518518518518519, "grad_norm": 2.319742441177368, "learning_rate": 8.15032679738562e-07, "loss": 4.1574506759643555, "step": 850 }, { "epoch": 1.8540305010893245, "grad_norm": 1.8312441110610962, "learning_rate": 8.148148148148147e-07, "loss": 4.146476745605469, "step": 851 }, { "epoch": 1.8562091503267975, "grad_norm": 1.5667997598648071, "learning_rate": 8.145969498910675e-07, "loss": 4.086141109466553, "step": 852 }, { "epoch": 1.8583877995642701, "grad_norm": 2.748506784439087, "learning_rate": 8.143790849673202e-07, "loss": 4.155338764190674, "step": 853 }, { "epoch": 1.8605664488017428, "grad_norm": 2.73716402053833, "learning_rate": 8.141612200435729e-07, "loss": 4.2161664962768555, "step": 854 }, { "epoch": 1.8627450980392157, "grad_norm": 1.7289466857910156, "learning_rate": 8.139433551198257e-07, "loss": 4.068624496459961, "step": 855 }, { "epoch": 1.8649237472766884, "grad_norm": 2.7699716091156006, "learning_rate": 8.137254901960784e-07, "loss": 4.3124237060546875, "step": 856 }, { "epoch": 1.867102396514161, "grad_norm": 2.5239439010620117, "learning_rate": 8.135076252723311e-07, "loss": 4.291884422302246, "step": 857 }, { "epoch": 1.869281045751634, "grad_norm": 2.590318202972412, "learning_rate": 8.132897603485838e-07, "loss": 4.298960208892822, "step": 858 }, { "epoch": 1.871459694989107, "grad_norm": 1.8926911354064941, "learning_rate": 8.130718954248366e-07, "loss": 4.203323841094971, "step": 859 }, { "epoch": 1.8736383442265794, "grad_norm": 2.1644389629364014, "learning_rate": 8.128540305010892e-07, "loss": 4.15595006942749, "step": 860 }, { "epoch": 1.8758169934640523, "grad_norm": 1.974097490310669, "learning_rate": 8.126361655773421e-07, "loss": 4.184225559234619, "step": 861 }, { "epoch": 1.8779956427015252, "grad_norm": 2.475646495819092, "learning_rate": 8.124183006535947e-07, "loss": 4.102884769439697, "step": 862 }, { "epoch": 1.8801742919389977, "grad_norm": 1.8593982458114624, "learning_rate": 8.122004357298474e-07, "loss": 4.142711162567139, "step": 863 }, { "epoch": 1.8823529411764706, "grad_norm": 1.8744142055511475, "learning_rate": 8.119825708061002e-07, "loss": 4.222711086273193, "step": 864 }, { "epoch": 1.8845315904139435, "grad_norm": 2.206489324569702, "learning_rate": 8.117647058823529e-07, "loss": 4.0585808753967285, "step": 865 }, { "epoch": 1.8867102396514162, "grad_norm": 1.7787939310073853, "learning_rate": 8.115468409586056e-07, "loss": 4.019634246826172, "step": 866 }, { "epoch": 1.8888888888888888, "grad_norm": 2.62564754486084, "learning_rate": 8.113289760348584e-07, "loss": 4.372350692749023, "step": 867 }, { "epoch": 1.8910675381263617, "grad_norm": 2.6030287742614746, "learning_rate": 8.11111111111111e-07, "loss": 4.2521138191223145, "step": 868 }, { "epoch": 1.8932461873638344, "grad_norm": 2.157879114151001, "learning_rate": 8.108932461873638e-07, "loss": 4.311684608459473, "step": 869 }, { "epoch": 1.8954248366013071, "grad_norm": 2.0609498023986816, "learning_rate": 8.106753812636165e-07, "loss": 4.147398948669434, "step": 870 }, { "epoch": 1.89760348583878, "grad_norm": 1.9126404523849487, "learning_rate": 8.104575163398693e-07, "loss": 4.073269844055176, "step": 871 }, { "epoch": 1.8997821350762527, "grad_norm": 2.366358995437622, "learning_rate": 8.102396514161219e-07, "loss": 4.171934604644775, "step": 872 }, { "epoch": 1.9019607843137254, "grad_norm": 1.909555196762085, "learning_rate": 8.100217864923748e-07, "loss": 4.017719745635986, "step": 873 }, { "epoch": 1.9041394335511983, "grad_norm": 1.932206392288208, "learning_rate": 8.098039215686274e-07, "loss": 4.0444254875183105, "step": 874 }, { "epoch": 1.906318082788671, "grad_norm": 2.2612082958221436, "learning_rate": 8.095860566448801e-07, "loss": 4.309908390045166, "step": 875 }, { "epoch": 1.9084967320261437, "grad_norm": 1.8239173889160156, "learning_rate": 8.093681917211329e-07, "loss": 4.119836330413818, "step": 876 }, { "epoch": 1.9106753812636166, "grad_norm": 2.0788235664367676, "learning_rate": 8.091503267973856e-07, "loss": 4.119347095489502, "step": 877 }, { "epoch": 1.9128540305010895, "grad_norm": 1.7048089504241943, "learning_rate": 8.089324618736383e-07, "loss": 4.058908462524414, "step": 878 }, { "epoch": 1.915032679738562, "grad_norm": 2.7200546264648438, "learning_rate": 8.087145969498911e-07, "loss": 4.247430324554443, "step": 879 }, { "epoch": 1.9172113289760349, "grad_norm": 2.107969045639038, "learning_rate": 8.084967320261437e-07, "loss": 4.083975315093994, "step": 880 }, { "epoch": 1.9193899782135078, "grad_norm": 2.0064475536346436, "learning_rate": 8.082788671023965e-07, "loss": 4.0684590339660645, "step": 881 }, { "epoch": 1.9215686274509802, "grad_norm": 2.1059844493865967, "learning_rate": 8.080610021786492e-07, "loss": 4.197170734405518, "step": 882 }, { "epoch": 1.9237472766884531, "grad_norm": 1.6788114309310913, "learning_rate": 8.07843137254902e-07, "loss": 4.0640177726745605, "step": 883 }, { "epoch": 1.925925925925926, "grad_norm": 1.9067230224609375, "learning_rate": 8.076252723311546e-07, "loss": 4.139192581176758, "step": 884 }, { "epoch": 1.9281045751633987, "grad_norm": 2.1744518280029297, "learning_rate": 8.074074074074075e-07, "loss": 4.210973739624023, "step": 885 }, { "epoch": 1.9302832244008714, "grad_norm": 3.132628917694092, "learning_rate": 8.071895424836601e-07, "loss": 4.3147358894348145, "step": 886 }, { "epoch": 1.9324618736383443, "grad_norm": 2.5264108180999756, "learning_rate": 8.069716775599128e-07, "loss": 4.206925392150879, "step": 887 }, { "epoch": 1.934640522875817, "grad_norm": 2.3532259464263916, "learning_rate": 8.067538126361655e-07, "loss": 4.180276870727539, "step": 888 }, { "epoch": 1.9368191721132897, "grad_norm": 2.0208284854888916, "learning_rate": 8.065359477124183e-07, "loss": 4.208629131317139, "step": 889 }, { "epoch": 1.9389978213507626, "grad_norm": 2.2953221797943115, "learning_rate": 8.06318082788671e-07, "loss": 4.210643768310547, "step": 890 }, { "epoch": 1.9411764705882353, "grad_norm": 1.93471360206604, "learning_rate": 8.061002178649237e-07, "loss": 4.065423011779785, "step": 891 }, { "epoch": 1.943355119825708, "grad_norm": 2.1485824584960938, "learning_rate": 8.058823529411764e-07, "loss": 4.177984714508057, "step": 892 }, { "epoch": 1.9455337690631809, "grad_norm": 1.8944772481918335, "learning_rate": 8.056644880174292e-07, "loss": 4.030359745025635, "step": 893 }, { "epoch": 1.9477124183006536, "grad_norm": 1.681435227394104, "learning_rate": 8.054466230936818e-07, "loss": 3.975057601928711, "step": 894 }, { "epoch": 1.9498910675381262, "grad_norm": 1.9575204849243164, "learning_rate": 8.052287581699347e-07, "loss": 4.2327351570129395, "step": 895 }, { "epoch": 1.9520697167755992, "grad_norm": 2.419712781906128, "learning_rate": 8.050108932461873e-07, "loss": 4.090088844299316, "step": 896 }, { "epoch": 1.954248366013072, "grad_norm": 1.8530755043029785, "learning_rate": 8.047930283224401e-07, "loss": 3.9878690242767334, "step": 897 }, { "epoch": 1.9564270152505445, "grad_norm": 2.2176408767700195, "learning_rate": 8.045751633986928e-07, "loss": 4.14769172668457, "step": 898 }, { "epoch": 1.9586056644880174, "grad_norm": 2.023941993713379, "learning_rate": 8.043572984749455e-07, "loss": 4.162088394165039, "step": 899 }, { "epoch": 1.9607843137254903, "grad_norm": 2.128354549407959, "learning_rate": 8.041394335511982e-07, "loss": 4.057473182678223, "step": 900 }, { "epoch": 1.9629629629629628, "grad_norm": 1.648612380027771, "learning_rate": 8.03921568627451e-07, "loss": 3.9806032180786133, "step": 901 }, { "epoch": 1.9651416122004357, "grad_norm": 2.2412312030792236, "learning_rate": 8.037037037037037e-07, "loss": 4.1701154708862305, "step": 902 }, { "epoch": 1.9673202614379086, "grad_norm": 2.0673563480377197, "learning_rate": 8.034858387799563e-07, "loss": 4.005623817443848, "step": 903 }, { "epoch": 1.9694989106753813, "grad_norm": 2.30501127243042, "learning_rate": 8.032679738562091e-07, "loss": 4.190729141235352, "step": 904 }, { "epoch": 1.971677559912854, "grad_norm": 1.9649978876113892, "learning_rate": 8.030501089324618e-07, "loss": 4.078850746154785, "step": 905 }, { "epoch": 1.973856209150327, "grad_norm": 2.7797210216522217, "learning_rate": 8.028322440087145e-07, "loss": 4.280642986297607, "step": 906 }, { "epoch": 1.9760348583877996, "grad_norm": 2.4379940032958984, "learning_rate": 8.026143790849674e-07, "loss": 4.129292011260986, "step": 907 }, { "epoch": 1.9782135076252723, "grad_norm": 2.885363817214966, "learning_rate": 8.0239651416122e-07, "loss": 4.4366607666015625, "step": 908 }, { "epoch": 1.9803921568627452, "grad_norm": 1.9498522281646729, "learning_rate": 8.021786492374726e-07, "loss": 3.9430274963378906, "step": 909 }, { "epoch": 1.9825708061002179, "grad_norm": 2.1333444118499756, "learning_rate": 8.019607843137255e-07, "loss": 4.335849761962891, "step": 910 }, { "epoch": 1.9847494553376905, "grad_norm": 1.8870283365249634, "learning_rate": 8.017429193899781e-07, "loss": 4.100715637207031, "step": 911 }, { "epoch": 1.9869281045751634, "grad_norm": 2.1186022758483887, "learning_rate": 8.015250544662309e-07, "loss": 4.182877540588379, "step": 912 }, { "epoch": 1.9891067538126361, "grad_norm": 2.29975962638855, "learning_rate": 8.013071895424836e-07, "loss": 4.1606974601745605, "step": 913 }, { "epoch": 1.9912854030501088, "grad_norm": 1.7296197414398193, "learning_rate": 8.010893246187364e-07, "loss": 4.1066765785217285, "step": 914 }, { "epoch": 1.9934640522875817, "grad_norm": 1.547702431678772, "learning_rate": 8.00871459694989e-07, "loss": 4.114107608795166, "step": 915 }, { "epoch": 1.9956427015250546, "grad_norm": 1.7530205249786377, "learning_rate": 8.006535947712418e-07, "loss": 4.111988544464111, "step": 916 }, { "epoch": 1.997821350762527, "grad_norm": 2.4457037448883057, "learning_rate": 8.004357298474945e-07, "loss": 4.1643524169921875, "step": 917 }, { "epoch": 2.0, "grad_norm": 1.922106146812439, "learning_rate": 8.002178649237472e-07, "loss": 4.082487106323242, "step": 918 }, { "epoch": 2.002178649237473, "grad_norm": 1.9809645414352417, "learning_rate": 8e-07, "loss": 4.04416036605835, "step": 919 }, { "epoch": 2.0043572984749454, "grad_norm": 2.190713405609131, "learning_rate": 7.997821350762527e-07, "loss": 4.115407466888428, "step": 920 }, { "epoch": 2.0065359477124183, "grad_norm": 1.71844482421875, "learning_rate": 7.995642701525053e-07, "loss": 3.956617593765259, "step": 921 }, { "epoch": 2.008714596949891, "grad_norm": 2.4385111331939697, "learning_rate": 7.993464052287582e-07, "loss": 4.193373203277588, "step": 922 }, { "epoch": 2.0108932461873636, "grad_norm": 1.7167409658432007, "learning_rate": 7.991285403050108e-07, "loss": 4.186039924621582, "step": 923 }, { "epoch": 2.0130718954248366, "grad_norm": 1.8537359237670898, "learning_rate": 7.989106753812636e-07, "loss": 4.054421424865723, "step": 924 }, { "epoch": 2.0152505446623095, "grad_norm": 1.5552583932876587, "learning_rate": 7.986928104575163e-07, "loss": 4.026552677154541, "step": 925 }, { "epoch": 2.017429193899782, "grad_norm": 1.904251217842102, "learning_rate": 7.984749455337691e-07, "loss": 4.01067590713501, "step": 926 }, { "epoch": 2.019607843137255, "grad_norm": 2.1596591472625732, "learning_rate": 7.982570806100217e-07, "loss": 4.07068395614624, "step": 927 }, { "epoch": 2.0217864923747277, "grad_norm": 1.7939938306808472, "learning_rate": 7.980392156862745e-07, "loss": 4.0983052253723145, "step": 928 }, { "epoch": 2.0239651416122006, "grad_norm": 2.171736001968384, "learning_rate": 7.978213507625272e-07, "loss": 4.171865940093994, "step": 929 }, { "epoch": 2.026143790849673, "grad_norm": 1.8935316801071167, "learning_rate": 7.976034858387799e-07, "loss": 4.150607109069824, "step": 930 }, { "epoch": 2.028322440087146, "grad_norm": 1.7581865787506104, "learning_rate": 7.973856209150327e-07, "loss": 4.055481910705566, "step": 931 }, { "epoch": 2.030501089324619, "grad_norm": 1.7308727502822876, "learning_rate": 7.971677559912854e-07, "loss": 3.9752423763275146, "step": 932 }, { "epoch": 2.0326797385620914, "grad_norm": 2.761657238006592, "learning_rate": 7.96949891067538e-07, "loss": 4.23914098739624, "step": 933 }, { "epoch": 2.0348583877995643, "grad_norm": 1.8745554685592651, "learning_rate": 7.967320261437908e-07, "loss": 4.08441686630249, "step": 934 }, { "epoch": 2.037037037037037, "grad_norm": 2.138507604598999, "learning_rate": 7.965141612200435e-07, "loss": 4.197592735290527, "step": 935 }, { "epoch": 2.0392156862745097, "grad_norm": 2.6836624145507812, "learning_rate": 7.962962962962963e-07, "loss": 4.111847877502441, "step": 936 }, { "epoch": 2.0413943355119826, "grad_norm": 2.5718531608581543, "learning_rate": 7.960784313725489e-07, "loss": 4.231671333312988, "step": 937 }, { "epoch": 2.0435729847494555, "grad_norm": 1.9438505172729492, "learning_rate": 7.958605664488018e-07, "loss": 4.091409206390381, "step": 938 }, { "epoch": 2.045751633986928, "grad_norm": 2.095153331756592, "learning_rate": 7.956427015250544e-07, "loss": 4.036088466644287, "step": 939 }, { "epoch": 2.047930283224401, "grad_norm": 2.1541659832000732, "learning_rate": 7.954248366013071e-07, "loss": 4.171873569488525, "step": 940 }, { "epoch": 2.0501089324618738, "grad_norm": 1.869860291481018, "learning_rate": 7.952069716775599e-07, "loss": 4.069897174835205, "step": 941 }, { "epoch": 2.052287581699346, "grad_norm": 2.0201714038848877, "learning_rate": 7.949891067538126e-07, "loss": 3.9777843952178955, "step": 942 }, { "epoch": 2.054466230936819, "grad_norm": 1.7647689580917358, "learning_rate": 7.947712418300653e-07, "loss": 4.108667373657227, "step": 943 }, { "epoch": 2.056644880174292, "grad_norm": 2.3349502086639404, "learning_rate": 7.945533769063181e-07, "loss": 4.174474239349365, "step": 944 }, { "epoch": 2.0588235294117645, "grad_norm": 2.9246246814727783, "learning_rate": 7.943355119825707e-07, "loss": 4.24221134185791, "step": 945 }, { "epoch": 2.0610021786492374, "grad_norm": 2.6591107845306396, "learning_rate": 7.941176470588235e-07, "loss": 4.140476226806641, "step": 946 }, { "epoch": 2.0631808278867103, "grad_norm": 2.3809421062469482, "learning_rate": 7.938997821350762e-07, "loss": 4.166418075561523, "step": 947 }, { "epoch": 2.065359477124183, "grad_norm": 1.6895897388458252, "learning_rate": 7.93681917211329e-07, "loss": 4.062405586242676, "step": 948 }, { "epoch": 2.0675381263616557, "grad_norm": 2.5451431274414062, "learning_rate": 7.934640522875816e-07, "loss": 4.108200550079346, "step": 949 }, { "epoch": 2.0697167755991286, "grad_norm": 2.07063627243042, "learning_rate": 7.932461873638345e-07, "loss": 4.042630195617676, "step": 950 }, { "epoch": 2.0718954248366015, "grad_norm": 1.6727862358093262, "learning_rate": 7.930283224400871e-07, "loss": 4.185661315917969, "step": 951 }, { "epoch": 2.074074074074074, "grad_norm": 2.1590938568115234, "learning_rate": 7.928104575163398e-07, "loss": 4.213985919952393, "step": 952 }, { "epoch": 2.076252723311547, "grad_norm": 3.1048004627227783, "learning_rate": 7.925925925925926e-07, "loss": 4.23642635345459, "step": 953 }, { "epoch": 2.0784313725490198, "grad_norm": 1.815842866897583, "learning_rate": 7.923747276688453e-07, "loss": 4.160055160522461, "step": 954 }, { "epoch": 2.0806100217864922, "grad_norm": 2.0997283458709717, "learning_rate": 7.92156862745098e-07, "loss": 4.003493309020996, "step": 955 }, { "epoch": 2.082788671023965, "grad_norm": 2.3799962997436523, "learning_rate": 7.919389978213508e-07, "loss": 4.0239691734313965, "step": 956 }, { "epoch": 2.084967320261438, "grad_norm": 1.9730435609817505, "learning_rate": 7.917211328976034e-07, "loss": 4.087383270263672, "step": 957 }, { "epoch": 2.0871459694989105, "grad_norm": 1.7735321521759033, "learning_rate": 7.915032679738562e-07, "loss": 4.105068206787109, "step": 958 }, { "epoch": 2.0893246187363834, "grad_norm": 2.7880523204803467, "learning_rate": 7.912854030501089e-07, "loss": 4.202101230621338, "step": 959 }, { "epoch": 2.0915032679738563, "grad_norm": 2.6837782859802246, "learning_rate": 7.910675381263617e-07, "loss": 4.172263145446777, "step": 960 }, { "epoch": 2.093681917211329, "grad_norm": 1.9854693412780762, "learning_rate": 7.908496732026143e-07, "loss": 4.039182662963867, "step": 961 }, { "epoch": 2.0958605664488017, "grad_norm": 1.8235325813293457, "learning_rate": 7.906318082788671e-07, "loss": 3.8521697521209717, "step": 962 }, { "epoch": 2.0980392156862746, "grad_norm": 1.9031305313110352, "learning_rate": 7.904139433551198e-07, "loss": 4.085139274597168, "step": 963 }, { "epoch": 2.100217864923747, "grad_norm": 2.6681697368621826, "learning_rate": 7.901960784313725e-07, "loss": 4.259571552276611, "step": 964 }, { "epoch": 2.10239651416122, "grad_norm": 2.0557339191436768, "learning_rate": 7.899782135076253e-07, "loss": 4.035128593444824, "step": 965 }, { "epoch": 2.104575163398693, "grad_norm": 1.978162407875061, "learning_rate": 7.89760348583878e-07, "loss": 3.96429705619812, "step": 966 }, { "epoch": 2.106753812636166, "grad_norm": 2.1063034534454346, "learning_rate": 7.895424836601306e-07, "loss": 4.046944618225098, "step": 967 }, { "epoch": 2.1089324618736383, "grad_norm": 1.9610927104949951, "learning_rate": 7.893246187363835e-07, "loss": 4.036257266998291, "step": 968 }, { "epoch": 2.111111111111111, "grad_norm": 1.9539198875427246, "learning_rate": 7.891067538126361e-07, "loss": 4.1784772872924805, "step": 969 }, { "epoch": 2.113289760348584, "grad_norm": 3.0677266120910645, "learning_rate": 7.888888888888889e-07, "loss": 4.375663757324219, "step": 970 }, { "epoch": 2.1154684095860565, "grad_norm": 1.674418330192566, "learning_rate": 7.886710239651416e-07, "loss": 4.017217636108398, "step": 971 }, { "epoch": 2.1176470588235294, "grad_norm": 2.112638473510742, "learning_rate": 7.884531590413944e-07, "loss": 4.2400712966918945, "step": 972 }, { "epoch": 2.1198257080610023, "grad_norm": 1.8848779201507568, "learning_rate": 7.88235294117647e-07, "loss": 4.109051704406738, "step": 973 }, { "epoch": 2.122004357298475, "grad_norm": 1.9091440439224243, "learning_rate": 7.880174291938998e-07, "loss": 4.103385925292969, "step": 974 }, { "epoch": 2.1241830065359477, "grad_norm": 2.4109978675842285, "learning_rate": 7.877995642701525e-07, "loss": 4.137935638427734, "step": 975 }, { "epoch": 2.1263616557734206, "grad_norm": 2.0930750370025635, "learning_rate": 7.875816993464051e-07, "loss": 3.9625084400177, "step": 976 }, { "epoch": 2.128540305010893, "grad_norm": 2.2295689582824707, "learning_rate": 7.87363834422658e-07, "loss": 4.082664489746094, "step": 977 }, { "epoch": 2.130718954248366, "grad_norm": 2.075425624847412, "learning_rate": 7.871459694989106e-07, "loss": 4.10584020614624, "step": 978 }, { "epoch": 2.132897603485839, "grad_norm": 2.0698938369750977, "learning_rate": 7.869281045751633e-07, "loss": 4.209216594696045, "step": 979 }, { "epoch": 2.1350762527233114, "grad_norm": 1.7843433618545532, "learning_rate": 7.867102396514161e-07, "loss": 4.010354518890381, "step": 980 }, { "epoch": 2.1372549019607843, "grad_norm": 2.0814990997314453, "learning_rate": 7.864923747276688e-07, "loss": 4.166459560394287, "step": 981 }, { "epoch": 2.139433551198257, "grad_norm": 3.037248134613037, "learning_rate": 7.862745098039215e-07, "loss": 4.249602317810059, "step": 982 }, { "epoch": 2.1416122004357296, "grad_norm": 1.7575691938400269, "learning_rate": 7.860566448801742e-07, "loss": 4.157761096954346, "step": 983 }, { "epoch": 2.1437908496732025, "grad_norm": 2.208876371383667, "learning_rate": 7.85838779956427e-07, "loss": 4.1271491050720215, "step": 984 }, { "epoch": 2.1459694989106755, "grad_norm": 2.226686716079712, "learning_rate": 7.856209150326797e-07, "loss": 4.138065814971924, "step": 985 }, { "epoch": 2.148148148148148, "grad_norm": 2.080857992172241, "learning_rate": 7.854030501089323e-07, "loss": 4.124225616455078, "step": 986 }, { "epoch": 2.150326797385621, "grad_norm": 3.2912957668304443, "learning_rate": 7.851851851851852e-07, "loss": 4.372919082641602, "step": 987 }, { "epoch": 2.1525054466230937, "grad_norm": 1.783996343612671, "learning_rate": 7.849673202614378e-07, "loss": 4.220183372497559, "step": 988 }, { "epoch": 2.1546840958605666, "grad_norm": 1.699868083000183, "learning_rate": 7.847494553376906e-07, "loss": 4.026693820953369, "step": 989 }, { "epoch": 2.156862745098039, "grad_norm": 2.708341360092163, "learning_rate": 7.845315904139433e-07, "loss": 4.100743293762207, "step": 990 }, { "epoch": 2.159041394335512, "grad_norm": 1.717261791229248, "learning_rate": 7.84313725490196e-07, "loss": 3.874121904373169, "step": 991 }, { "epoch": 2.161220043572985, "grad_norm": 2.2663161754608154, "learning_rate": 7.840958605664487e-07, "loss": 4.126644611358643, "step": 992 }, { "epoch": 2.1633986928104574, "grad_norm": 2.2513108253479004, "learning_rate": 7.838779956427015e-07, "loss": 4.046871662139893, "step": 993 }, { "epoch": 2.1655773420479303, "grad_norm": 1.6728419065475464, "learning_rate": 7.836601307189542e-07, "loss": 3.9428036212921143, "step": 994 }, { "epoch": 2.167755991285403, "grad_norm": 2.486781358718872, "learning_rate": 7.834422657952069e-07, "loss": 4.231690883636475, "step": 995 }, { "epoch": 2.1699346405228757, "grad_norm": 1.8654186725616455, "learning_rate": 7.832244008714597e-07, "loss": 4.195611476898193, "step": 996 }, { "epoch": 2.1721132897603486, "grad_norm": 2.992135524749756, "learning_rate": 7.830065359477124e-07, "loss": 4.13925313949585, "step": 997 }, { "epoch": 2.1742919389978215, "grad_norm": 1.3797930479049683, "learning_rate": 7.82788671023965e-07, "loss": 3.876946449279785, "step": 998 }, { "epoch": 2.176470588235294, "grad_norm": 2.4482626914978027, "learning_rate": 7.825708061002179e-07, "loss": 4.174283027648926, "step": 999 }, { "epoch": 2.178649237472767, "grad_norm": 1.5868717432022095, "learning_rate": 7.823529411764705e-07, "loss": 3.955674886703491, "step": 1000 }, { "epoch": 2.1808278867102397, "grad_norm": 3.826517343521118, "learning_rate": 7.821350762527233e-07, "loss": 4.409421920776367, "step": 1001 }, { "epoch": 2.183006535947712, "grad_norm": 1.7969684600830078, "learning_rate": 7.81917211328976e-07, "loss": 3.955528974533081, "step": 1002 }, { "epoch": 2.185185185185185, "grad_norm": 1.781435251235962, "learning_rate": 7.816993464052287e-07, "loss": 3.9549343585968018, "step": 1003 }, { "epoch": 2.187363834422658, "grad_norm": 2.5089151859283447, "learning_rate": 7.814814814814814e-07, "loss": 4.233953475952148, "step": 1004 }, { "epoch": 2.189542483660131, "grad_norm": 2.5341856479644775, "learning_rate": 7.812636165577342e-07, "loss": 4.16447639465332, "step": 1005 }, { "epoch": 2.1917211328976034, "grad_norm": 2.519514799118042, "learning_rate": 7.810457516339869e-07, "loss": 4.039306163787842, "step": 1006 }, { "epoch": 2.1938997821350763, "grad_norm": 2.6813533306121826, "learning_rate": 7.808278867102396e-07, "loss": 4.266514301300049, "step": 1007 }, { "epoch": 2.196078431372549, "grad_norm": 2.148061513900757, "learning_rate": 7.806100217864923e-07, "loss": 4.24880838394165, "step": 1008 }, { "epoch": 2.1982570806100217, "grad_norm": 2.3313589096069336, "learning_rate": 7.803921568627451e-07, "loss": 4.107240200042725, "step": 1009 }, { "epoch": 2.2004357298474946, "grad_norm": 2.3435449600219727, "learning_rate": 7.801742919389977e-07, "loss": 4.142623424530029, "step": 1010 }, { "epoch": 2.2026143790849675, "grad_norm": 2.232758045196533, "learning_rate": 7.799564270152506e-07, "loss": 3.982508420944214, "step": 1011 }, { "epoch": 2.20479302832244, "grad_norm": 2.7213692665100098, "learning_rate": 7.797385620915032e-07, "loss": 4.31868839263916, "step": 1012 }, { "epoch": 2.206971677559913, "grad_norm": 1.9918797016143799, "learning_rate": 7.79520697167756e-07, "loss": 4.021210193634033, "step": 1013 }, { "epoch": 2.2091503267973858, "grad_norm": 3.1012773513793945, "learning_rate": 7.793028322440087e-07, "loss": 4.101766109466553, "step": 1014 }, { "epoch": 2.2113289760348582, "grad_norm": 1.9030157327651978, "learning_rate": 7.790849673202614e-07, "loss": 4.060393333435059, "step": 1015 }, { "epoch": 2.213507625272331, "grad_norm": 2.100125312805176, "learning_rate": 7.788671023965141e-07, "loss": 4.12300443649292, "step": 1016 }, { "epoch": 2.215686274509804, "grad_norm": 1.6978819370269775, "learning_rate": 7.786492374727669e-07, "loss": 4.022552490234375, "step": 1017 }, { "epoch": 2.2178649237472765, "grad_norm": 1.5401928424835205, "learning_rate": 7.784313725490196e-07, "loss": 4.028479099273682, "step": 1018 }, { "epoch": 2.2200435729847494, "grad_norm": 2.3669793605804443, "learning_rate": 7.782135076252723e-07, "loss": 4.307399272918701, "step": 1019 }, { "epoch": 2.2222222222222223, "grad_norm": 2.366002082824707, "learning_rate": 7.77995642701525e-07, "loss": 4.0427961349487305, "step": 1020 }, { "epoch": 2.224400871459695, "grad_norm": 2.2637102603912354, "learning_rate": 7.777777777777778e-07, "loss": 4.165700912475586, "step": 1021 }, { "epoch": 2.2265795206971677, "grad_norm": 2.4778847694396973, "learning_rate": 7.775599128540304e-07, "loss": 4.052174091339111, "step": 1022 }, { "epoch": 2.2287581699346406, "grad_norm": 1.7212820053100586, "learning_rate": 7.773420479302833e-07, "loss": 3.9928884506225586, "step": 1023 }, { "epoch": 2.230936819172113, "grad_norm": 2.4934499263763428, "learning_rate": 7.771241830065359e-07, "loss": 4.031148433685303, "step": 1024 }, { "epoch": 2.233115468409586, "grad_norm": 1.4554413557052612, "learning_rate": 7.769063180827887e-07, "loss": 3.949693441390991, "step": 1025 }, { "epoch": 2.235294117647059, "grad_norm": 1.6713886260986328, "learning_rate": 7.766884531590414e-07, "loss": 3.930182933807373, "step": 1026 }, { "epoch": 2.237472766884532, "grad_norm": 2.785137176513672, "learning_rate": 7.764705882352941e-07, "loss": 4.092245578765869, "step": 1027 }, { "epoch": 2.2396514161220042, "grad_norm": 1.9379091262817383, "learning_rate": 7.762527233115468e-07, "loss": 4.03016471862793, "step": 1028 }, { "epoch": 2.241830065359477, "grad_norm": 1.9291555881500244, "learning_rate": 7.760348583877995e-07, "loss": 4.133252143859863, "step": 1029 }, { "epoch": 2.24400871459695, "grad_norm": 2.6825361251831055, "learning_rate": 7.758169934640523e-07, "loss": 3.8898584842681885, "step": 1030 }, { "epoch": 2.2461873638344225, "grad_norm": 1.5352925062179565, "learning_rate": 7.75599128540305e-07, "loss": 3.9538631439208984, "step": 1031 }, { "epoch": 2.2483660130718954, "grad_norm": 1.9697437286376953, "learning_rate": 7.753812636165576e-07, "loss": 4.0294508934021, "step": 1032 }, { "epoch": 2.2505446623093683, "grad_norm": 2.068450927734375, "learning_rate": 7.751633986928105e-07, "loss": 3.986194133758545, "step": 1033 }, { "epoch": 2.252723311546841, "grad_norm": 1.6835108995437622, "learning_rate": 7.749455337690631e-07, "loss": 4.0386738777160645, "step": 1034 }, { "epoch": 2.2549019607843137, "grad_norm": 2.124763011932373, "learning_rate": 7.747276688453159e-07, "loss": 4.148105621337891, "step": 1035 }, { "epoch": 2.2570806100217866, "grad_norm": 2.0839574337005615, "learning_rate": 7.745098039215686e-07, "loss": 4.108643054962158, "step": 1036 }, { "epoch": 2.259259259259259, "grad_norm": 1.7134324312210083, "learning_rate": 7.742919389978214e-07, "loss": 4.067982196807861, "step": 1037 }, { "epoch": 2.261437908496732, "grad_norm": 2.303831100463867, "learning_rate": 7.74074074074074e-07, "loss": 4.046720504760742, "step": 1038 }, { "epoch": 2.263616557734205, "grad_norm": 1.8157504796981812, "learning_rate": 7.738562091503268e-07, "loss": 4.077605247497559, "step": 1039 }, { "epoch": 2.265795206971678, "grad_norm": 2.402491807937622, "learning_rate": 7.736383442265795e-07, "loss": 4.210230827331543, "step": 1040 }, { "epoch": 2.2679738562091503, "grad_norm": 2.5801522731781006, "learning_rate": 7.734204793028322e-07, "loss": 4.184082984924316, "step": 1041 }, { "epoch": 2.270152505446623, "grad_norm": 1.9734416007995605, "learning_rate": 7.73202614379085e-07, "loss": 4.171882629394531, "step": 1042 }, { "epoch": 2.272331154684096, "grad_norm": 2.230268955230713, "learning_rate": 7.729847494553377e-07, "loss": 4.06036901473999, "step": 1043 }, { "epoch": 2.2745098039215685, "grad_norm": 1.9332524538040161, "learning_rate": 7.727668845315903e-07, "loss": 4.1247076988220215, "step": 1044 }, { "epoch": 2.2766884531590414, "grad_norm": 2.211947202682495, "learning_rate": 7.725490196078432e-07, "loss": 4.146850109100342, "step": 1045 }, { "epoch": 2.2788671023965144, "grad_norm": 1.9953583478927612, "learning_rate": 7.723311546840958e-07, "loss": 4.0555596351623535, "step": 1046 }, { "epoch": 2.281045751633987, "grad_norm": 1.9548943042755127, "learning_rate": 7.721132897603485e-07, "loss": 4.144200325012207, "step": 1047 }, { "epoch": 2.2832244008714597, "grad_norm": 2.7984726428985596, "learning_rate": 7.718954248366013e-07, "loss": 4.334352493286133, "step": 1048 }, { "epoch": 2.2854030501089326, "grad_norm": 2.1481170654296875, "learning_rate": 7.716775599128539e-07, "loss": 4.059964179992676, "step": 1049 }, { "epoch": 2.287581699346405, "grad_norm": 2.2029738426208496, "learning_rate": 7.714596949891067e-07, "loss": 3.9889395236968994, "step": 1050 }, { "epoch": 2.289760348583878, "grad_norm": 2.242189884185791, "learning_rate": 7.712418300653594e-07, "loss": 4.012440204620361, "step": 1051 }, { "epoch": 2.291938997821351, "grad_norm": 2.877549171447754, "learning_rate": 7.710239651416122e-07, "loss": 4.214752674102783, "step": 1052 }, { "epoch": 2.2941176470588234, "grad_norm": 1.9597904682159424, "learning_rate": 7.708061002178648e-07, "loss": 3.8630194664001465, "step": 1053 }, { "epoch": 2.2962962962962963, "grad_norm": 2.0277161598205566, "learning_rate": 7.705882352941177e-07, "loss": 4.149177074432373, "step": 1054 }, { "epoch": 2.298474945533769, "grad_norm": 2.77191162109375, "learning_rate": 7.703703703703703e-07, "loss": 4.183628559112549, "step": 1055 }, { "epoch": 2.3006535947712417, "grad_norm": 1.7363804578781128, "learning_rate": 7.70152505446623e-07, "loss": 4.004621505737305, "step": 1056 }, { "epoch": 2.3028322440087146, "grad_norm": 2.4279212951660156, "learning_rate": 7.699346405228758e-07, "loss": 3.960480213165283, "step": 1057 }, { "epoch": 2.3050108932461875, "grad_norm": 1.7218619585037231, "learning_rate": 7.697167755991285e-07, "loss": 4.0729827880859375, "step": 1058 }, { "epoch": 2.30718954248366, "grad_norm": 2.467456579208374, "learning_rate": 7.694989106753812e-07, "loss": 3.985095739364624, "step": 1059 }, { "epoch": 2.309368191721133, "grad_norm": 2.5542612075805664, "learning_rate": 7.69281045751634e-07, "loss": 4.381570339202881, "step": 1060 }, { "epoch": 2.3115468409586057, "grad_norm": 2.6501786708831787, "learning_rate": 7.690631808278866e-07, "loss": 4.124879360198975, "step": 1061 }, { "epoch": 2.313725490196078, "grad_norm": 2.2705535888671875, "learning_rate": 7.688453159041394e-07, "loss": 4.079562187194824, "step": 1062 }, { "epoch": 2.315904139433551, "grad_norm": 2.1979851722717285, "learning_rate": 7.686274509803921e-07, "loss": 4.295355796813965, "step": 1063 }, { "epoch": 2.318082788671024, "grad_norm": 2.7606277465820312, "learning_rate": 7.684095860566449e-07, "loss": 4.135920524597168, "step": 1064 }, { "epoch": 2.3202614379084965, "grad_norm": 2.195889711380005, "learning_rate": 7.681917211328975e-07, "loss": 4.124853610992432, "step": 1065 }, { "epoch": 2.3224400871459694, "grad_norm": 2.1189308166503906, "learning_rate": 7.679738562091504e-07, "loss": 4.16725492477417, "step": 1066 }, { "epoch": 2.3246187363834423, "grad_norm": 1.895348310470581, "learning_rate": 7.67755991285403e-07, "loss": 4.099375247955322, "step": 1067 }, { "epoch": 2.326797385620915, "grad_norm": 2.170478105545044, "learning_rate": 7.675381263616557e-07, "loss": 3.9279630184173584, "step": 1068 }, { "epoch": 2.3289760348583877, "grad_norm": 2.5550923347473145, "learning_rate": 7.673202614379085e-07, "loss": 4.1205549240112305, "step": 1069 }, { "epoch": 2.3311546840958606, "grad_norm": 1.715939998626709, "learning_rate": 7.671023965141612e-07, "loss": 3.925992727279663, "step": 1070 }, { "epoch": 2.3333333333333335, "grad_norm": 2.166231393814087, "learning_rate": 7.668845315904139e-07, "loss": 4.168858051300049, "step": 1071 }, { "epoch": 2.335511982570806, "grad_norm": 2.043109178543091, "learning_rate": 7.666666666666667e-07, "loss": 4.142437934875488, "step": 1072 }, { "epoch": 2.337690631808279, "grad_norm": 1.9515937566757202, "learning_rate": 7.664488017429193e-07, "loss": 4.054185390472412, "step": 1073 }, { "epoch": 2.3398692810457518, "grad_norm": 1.9453387260437012, "learning_rate": 7.662309368191721e-07, "loss": 3.9963419437408447, "step": 1074 }, { "epoch": 2.342047930283224, "grad_norm": 2.0230700969696045, "learning_rate": 7.660130718954248e-07, "loss": 4.102096080780029, "step": 1075 }, { "epoch": 2.344226579520697, "grad_norm": 2.199162721633911, "learning_rate": 7.657952069716776e-07, "loss": 4.034978866577148, "step": 1076 }, { "epoch": 2.34640522875817, "grad_norm": 2.19111967086792, "learning_rate": 7.655773420479302e-07, "loss": 4.056146621704102, "step": 1077 }, { "epoch": 2.348583877995643, "grad_norm": 1.9047857522964478, "learning_rate": 7.653594771241829e-07, "loss": 3.8937389850616455, "step": 1078 }, { "epoch": 2.3507625272331154, "grad_norm": 2.7760274410247803, "learning_rate": 7.651416122004357e-07, "loss": 4.261303424835205, "step": 1079 }, { "epoch": 2.3529411764705883, "grad_norm": 1.8977633714675903, "learning_rate": 7.649237472766884e-07, "loss": 4.000192642211914, "step": 1080 }, { "epoch": 2.355119825708061, "grad_norm": 2.5454342365264893, "learning_rate": 7.647058823529411e-07, "loss": 4.205329418182373, "step": 1081 }, { "epoch": 2.3572984749455337, "grad_norm": 2.199505090713501, "learning_rate": 7.644880174291939e-07, "loss": 4.0512566566467285, "step": 1082 }, { "epoch": 2.3594771241830066, "grad_norm": 2.1628358364105225, "learning_rate": 7.642701525054466e-07, "loss": 4.102871417999268, "step": 1083 }, { "epoch": 2.3616557734204795, "grad_norm": 2.2754695415496826, "learning_rate": 7.640522875816993e-07, "loss": 4.066337585449219, "step": 1084 }, { "epoch": 2.363834422657952, "grad_norm": 2.088714361190796, "learning_rate": 7.63834422657952e-07, "loss": 4.223723888397217, "step": 1085 }, { "epoch": 2.366013071895425, "grad_norm": 1.6105167865753174, "learning_rate": 7.636165577342048e-07, "loss": 3.925288438796997, "step": 1086 }, { "epoch": 2.3681917211328978, "grad_norm": 1.9630597829818726, "learning_rate": 7.633986928104574e-07, "loss": 3.9998910427093506, "step": 1087 }, { "epoch": 2.3703703703703702, "grad_norm": 2.178084135055542, "learning_rate": 7.631808278867103e-07, "loss": 4.118580341339111, "step": 1088 }, { "epoch": 2.372549019607843, "grad_norm": 2.2270848751068115, "learning_rate": 7.629629629629629e-07, "loss": 4.15642786026001, "step": 1089 }, { "epoch": 2.374727668845316, "grad_norm": 1.9717636108398438, "learning_rate": 7.627450980392156e-07, "loss": 4.061000823974609, "step": 1090 }, { "epoch": 2.3769063180827885, "grad_norm": 2.5779964923858643, "learning_rate": 7.625272331154684e-07, "loss": 4.158853530883789, "step": 1091 }, { "epoch": 2.3790849673202614, "grad_norm": 2.0886847972869873, "learning_rate": 7.623093681917211e-07, "loss": 4.042017459869385, "step": 1092 }, { "epoch": 2.3812636165577343, "grad_norm": 1.8842631578445435, "learning_rate": 7.620915032679738e-07, "loss": 4.072216510772705, "step": 1093 }, { "epoch": 2.383442265795207, "grad_norm": 1.9865723848342896, "learning_rate": 7.618736383442266e-07, "loss": 4.050338268280029, "step": 1094 }, { "epoch": 2.3856209150326797, "grad_norm": 2.123199462890625, "learning_rate": 7.616557734204792e-07, "loss": 3.9922521114349365, "step": 1095 }, { "epoch": 2.3877995642701526, "grad_norm": 2.214766263961792, "learning_rate": 7.61437908496732e-07, "loss": 4.223117828369141, "step": 1096 }, { "epoch": 2.389978213507625, "grad_norm": 2.3022191524505615, "learning_rate": 7.612200435729847e-07, "loss": 4.236771583557129, "step": 1097 }, { "epoch": 2.392156862745098, "grad_norm": 2.868095636367798, "learning_rate": 7.610021786492375e-07, "loss": 4.235693454742432, "step": 1098 }, { "epoch": 2.394335511982571, "grad_norm": 2.178117275238037, "learning_rate": 7.607843137254901e-07, "loss": 4.0461955070495605, "step": 1099 }, { "epoch": 2.3965141612200433, "grad_norm": 2.0200552940368652, "learning_rate": 7.60566448801743e-07, "loss": 4.00168514251709, "step": 1100 }, { "epoch": 2.3986928104575163, "grad_norm": 1.8530594110488892, "learning_rate": 7.603485838779956e-07, "loss": 4.213029384613037, "step": 1101 }, { "epoch": 2.400871459694989, "grad_norm": 1.8352652788162231, "learning_rate": 7.601307189542483e-07, "loss": 4.003929615020752, "step": 1102 }, { "epoch": 2.4030501089324616, "grad_norm": 2.3743045330047607, "learning_rate": 7.599128540305011e-07, "loss": 4.080774307250977, "step": 1103 }, { "epoch": 2.4052287581699345, "grad_norm": 1.8969862461090088, "learning_rate": 7.596949891067538e-07, "loss": 3.973989963531494, "step": 1104 }, { "epoch": 2.4074074074074074, "grad_norm": 2.0373218059539795, "learning_rate": 7.594771241830065e-07, "loss": 4.064883708953857, "step": 1105 }, { "epoch": 2.4095860566448803, "grad_norm": 2.1628737449645996, "learning_rate": 7.592592592592593e-07, "loss": 4.152292728424072, "step": 1106 }, { "epoch": 2.411764705882353, "grad_norm": 2.008713483810425, "learning_rate": 7.59041394335512e-07, "loss": 4.142504692077637, "step": 1107 }, { "epoch": 2.4139433551198257, "grad_norm": 3.168912410736084, "learning_rate": 7.588235294117647e-07, "loss": 4.14745569229126, "step": 1108 }, { "epoch": 2.4161220043572986, "grad_norm": 2.7517385482788086, "learning_rate": 7.586056644880174e-07, "loss": 4.202332496643066, "step": 1109 }, { "epoch": 2.418300653594771, "grad_norm": 1.893372893333435, "learning_rate": 7.583877995642702e-07, "loss": 4.002898216247559, "step": 1110 }, { "epoch": 2.420479302832244, "grad_norm": 1.516433596611023, "learning_rate": 7.581699346405228e-07, "loss": 4.058389186859131, "step": 1111 }, { "epoch": 2.422657952069717, "grad_norm": 1.7535470724105835, "learning_rate": 7.579520697167757e-07, "loss": 3.9967947006225586, "step": 1112 }, { "epoch": 2.4248366013071894, "grad_norm": 1.8241873979568481, "learning_rate": 7.577342047930283e-07, "loss": 3.9968738555908203, "step": 1113 }, { "epoch": 2.4270152505446623, "grad_norm": 1.770133376121521, "learning_rate": 7.575163398692809e-07, "loss": 3.971287727355957, "step": 1114 }, { "epoch": 2.429193899782135, "grad_norm": 1.866702914237976, "learning_rate": 7.572984749455338e-07, "loss": 4.093426704406738, "step": 1115 }, { "epoch": 2.431372549019608, "grad_norm": 1.8667665719985962, "learning_rate": 7.570806100217864e-07, "loss": 4.005439758300781, "step": 1116 }, { "epoch": 2.4335511982570806, "grad_norm": 2.8368396759033203, "learning_rate": 7.568627450980392e-07, "loss": 4.0411224365234375, "step": 1117 }, { "epoch": 2.4357298474945535, "grad_norm": 2.5705630779266357, "learning_rate": 7.566448801742919e-07, "loss": 4.241152763366699, "step": 1118 }, { "epoch": 2.4379084967320264, "grad_norm": 2.3278298377990723, "learning_rate": 7.564270152505446e-07, "loss": 4.055289268493652, "step": 1119 }, { "epoch": 2.440087145969499, "grad_norm": 2.3351783752441406, "learning_rate": 7.562091503267973e-07, "loss": 4.054620265960693, "step": 1120 }, { "epoch": 2.4422657952069717, "grad_norm": 2.389031171798706, "learning_rate": 7.559912854030501e-07, "loss": 4.182103633880615, "step": 1121 }, { "epoch": 2.4444444444444446, "grad_norm": 1.8051159381866455, "learning_rate": 7.557734204793028e-07, "loss": 4.035372734069824, "step": 1122 }, { "epoch": 2.446623093681917, "grad_norm": 1.7726399898529053, "learning_rate": 7.555555555555555e-07, "loss": 3.996882915496826, "step": 1123 }, { "epoch": 2.44880174291939, "grad_norm": 1.655211091041565, "learning_rate": 7.553376906318083e-07, "loss": 3.9541494846343994, "step": 1124 }, { "epoch": 2.450980392156863, "grad_norm": 1.7298704385757446, "learning_rate": 7.55119825708061e-07, "loss": 4.058326721191406, "step": 1125 }, { "epoch": 2.4531590413943354, "grad_norm": 2.0881330966949463, "learning_rate": 7.549019607843136e-07, "loss": 3.979346752166748, "step": 1126 }, { "epoch": 2.4553376906318083, "grad_norm": 2.56744384765625, "learning_rate": 7.546840958605664e-07, "loss": 4.0270819664001465, "step": 1127 }, { "epoch": 2.457516339869281, "grad_norm": 2.6657371520996094, "learning_rate": 7.544662309368191e-07, "loss": 4.1608405113220215, "step": 1128 }, { "epoch": 2.4596949891067537, "grad_norm": 2.7847893238067627, "learning_rate": 7.542483660130719e-07, "loss": 4.1925787925720215, "step": 1129 }, { "epoch": 2.4618736383442266, "grad_norm": 2.6322240829467773, "learning_rate": 7.540305010893245e-07, "loss": 4.243997573852539, "step": 1130 }, { "epoch": 2.4640522875816995, "grad_norm": 2.3505096435546875, "learning_rate": 7.538126361655773e-07, "loss": 4.0618896484375, "step": 1131 }, { "epoch": 2.466230936819172, "grad_norm": 2.641031265258789, "learning_rate": 7.5359477124183e-07, "loss": 4.0240559577941895, "step": 1132 }, { "epoch": 2.468409586056645, "grad_norm": 2.568450689315796, "learning_rate": 7.533769063180827e-07, "loss": 4.269472599029541, "step": 1133 }, { "epoch": 2.4705882352941178, "grad_norm": 2.4542112350463867, "learning_rate": 7.531590413943355e-07, "loss": 4.212238311767578, "step": 1134 }, { "epoch": 2.47276688453159, "grad_norm": 2.2966153621673584, "learning_rate": 7.529411764705882e-07, "loss": 4.277198314666748, "step": 1135 }, { "epoch": 2.474945533769063, "grad_norm": 2.5182044506073, "learning_rate": 7.527233115468408e-07, "loss": 4.033989906311035, "step": 1136 }, { "epoch": 2.477124183006536, "grad_norm": 1.9726289510726929, "learning_rate": 7.525054466230937e-07, "loss": 4.0817060470581055, "step": 1137 }, { "epoch": 2.4793028322440085, "grad_norm": 1.7920219898223877, "learning_rate": 7.522875816993463e-07, "loss": 3.9426379203796387, "step": 1138 }, { "epoch": 2.4814814814814814, "grad_norm": 2.3702869415283203, "learning_rate": 7.520697167755991e-07, "loss": 4.248757362365723, "step": 1139 }, { "epoch": 2.4836601307189543, "grad_norm": 2.6982192993164062, "learning_rate": 7.518518518518518e-07, "loss": 4.038442134857178, "step": 1140 }, { "epoch": 2.4858387799564268, "grad_norm": 1.733790636062622, "learning_rate": 7.516339869281046e-07, "loss": 3.9496192932128906, "step": 1141 }, { "epoch": 2.4880174291938997, "grad_norm": 1.634392499923706, "learning_rate": 7.514161220043572e-07, "loss": 4.041971683502197, "step": 1142 }, { "epoch": 2.4901960784313726, "grad_norm": 1.541372299194336, "learning_rate": 7.5119825708061e-07, "loss": 3.9966933727264404, "step": 1143 }, { "epoch": 2.4923747276688455, "grad_norm": 3.0458199977874756, "learning_rate": 7.509803921568627e-07, "loss": 4.233633518218994, "step": 1144 }, { "epoch": 2.494553376906318, "grad_norm": 2.0334129333496094, "learning_rate": 7.507625272331154e-07, "loss": 4.065428256988525, "step": 1145 }, { "epoch": 2.496732026143791, "grad_norm": 1.8928402662277222, "learning_rate": 7.505446623093682e-07, "loss": 4.102261543273926, "step": 1146 }, { "epoch": 2.4989106753812638, "grad_norm": 2.3177990913391113, "learning_rate": 7.503267973856209e-07, "loss": 4.160831451416016, "step": 1147 }, { "epoch": 2.5010893246187362, "grad_norm": 2.635880470275879, "learning_rate": 7.501089324618735e-07, "loss": 4.119940757751465, "step": 1148 }, { "epoch": 2.503267973856209, "grad_norm": 2.2068982124328613, "learning_rate": 7.498910675381264e-07, "loss": 4.008247375488281, "step": 1149 }, { "epoch": 2.505446623093682, "grad_norm": 2.197460412979126, "learning_rate": 7.49673202614379e-07, "loss": 4.079433917999268, "step": 1150 }, { "epoch": 2.507625272331155, "grad_norm": 1.5680408477783203, "learning_rate": 7.494553376906318e-07, "loss": 4.00532341003418, "step": 1151 }, { "epoch": 2.5098039215686274, "grad_norm": 1.5743340253829956, "learning_rate": 7.492374727668845e-07, "loss": 3.936984062194824, "step": 1152 }, { "epoch": 2.5119825708061003, "grad_norm": 2.0102932453155518, "learning_rate": 7.490196078431373e-07, "loss": 3.894803047180176, "step": 1153 }, { "epoch": 2.5141612200435732, "grad_norm": 2.352349042892456, "learning_rate": 7.488017429193899e-07, "loss": 4.079133987426758, "step": 1154 }, { "epoch": 2.5163398692810457, "grad_norm": 2.4313547611236572, "learning_rate": 7.485838779956427e-07, "loss": 4.128035068511963, "step": 1155 }, { "epoch": 2.5185185185185186, "grad_norm": 3.2733092308044434, "learning_rate": 7.483660130718954e-07, "loss": 4.317625999450684, "step": 1156 }, { "epoch": 2.5206971677559915, "grad_norm": 2.7658896446228027, "learning_rate": 7.481481481481481e-07, "loss": 3.9206411838531494, "step": 1157 }, { "epoch": 2.522875816993464, "grad_norm": 1.7387783527374268, "learning_rate": 7.479302832244009e-07, "loss": 4.015710830688477, "step": 1158 }, { "epoch": 2.525054466230937, "grad_norm": 3.007667064666748, "learning_rate": 7.477124183006536e-07, "loss": 4.181552886962891, "step": 1159 }, { "epoch": 2.52723311546841, "grad_norm": 1.8218690156936646, "learning_rate": 7.474945533769062e-07, "loss": 3.948695421218872, "step": 1160 }, { "epoch": 2.5294117647058822, "grad_norm": 2.0363476276397705, "learning_rate": 7.472766884531591e-07, "loss": 4.210475921630859, "step": 1161 }, { "epoch": 2.531590413943355, "grad_norm": 3.011033535003662, "learning_rate": 7.470588235294117e-07, "loss": 4.384279251098633, "step": 1162 }, { "epoch": 2.533769063180828, "grad_norm": 2.033252716064453, "learning_rate": 7.468409586056645e-07, "loss": 4.115696430206299, "step": 1163 }, { "epoch": 2.5359477124183005, "grad_norm": 2.756652593612671, "learning_rate": 7.466230936819172e-07, "loss": 4.182315826416016, "step": 1164 }, { "epoch": 2.5381263616557734, "grad_norm": 1.8879038095474243, "learning_rate": 7.4640522875817e-07, "loss": 3.9314353466033936, "step": 1165 }, { "epoch": 2.5403050108932463, "grad_norm": 1.7477552890777588, "learning_rate": 7.461873638344226e-07, "loss": 3.9081571102142334, "step": 1166 }, { "epoch": 2.542483660130719, "grad_norm": 2.0587830543518066, "learning_rate": 7.459694989106754e-07, "loss": 3.780139446258545, "step": 1167 }, { "epoch": 2.5446623093681917, "grad_norm": 2.340733528137207, "learning_rate": 7.457516339869281e-07, "loss": 4.027875900268555, "step": 1168 }, { "epoch": 2.5468409586056646, "grad_norm": 2.1383118629455566, "learning_rate": 7.455337690631808e-07, "loss": 4.1321821212768555, "step": 1169 }, { "epoch": 2.549019607843137, "grad_norm": 2.024571180343628, "learning_rate": 7.453159041394336e-07, "loss": 4.072457313537598, "step": 1170 }, { "epoch": 2.55119825708061, "grad_norm": 1.9865013360977173, "learning_rate": 7.450980392156863e-07, "loss": 4.1561384201049805, "step": 1171 }, { "epoch": 2.553376906318083, "grad_norm": 1.732537031173706, "learning_rate": 7.448801742919389e-07, "loss": 4.013232231140137, "step": 1172 }, { "epoch": 2.5555555555555554, "grad_norm": 2.174915075302124, "learning_rate": 7.446623093681917e-07, "loss": 4.050281047821045, "step": 1173 }, { "epoch": 2.5577342047930283, "grad_norm": 1.9418503046035767, "learning_rate": 7.444444444444444e-07, "loss": 4.093884468078613, "step": 1174 }, { "epoch": 2.559912854030501, "grad_norm": 1.484223484992981, "learning_rate": 7.442265795206972e-07, "loss": 3.9759645462036133, "step": 1175 }, { "epoch": 2.5620915032679736, "grad_norm": 1.8449352979660034, "learning_rate": 7.440087145969498e-07, "loss": 3.98661208152771, "step": 1176 }, { "epoch": 2.5642701525054465, "grad_norm": 1.8723292350769043, "learning_rate": 7.437908496732026e-07, "loss": 3.9509332180023193, "step": 1177 }, { "epoch": 2.5664488017429194, "grad_norm": 3.197317600250244, "learning_rate": 7.435729847494553e-07, "loss": 4.3281779289245605, "step": 1178 }, { "epoch": 2.568627450980392, "grad_norm": 1.7558633089065552, "learning_rate": 7.43355119825708e-07, "loss": 3.9255435466766357, "step": 1179 }, { "epoch": 2.570806100217865, "grad_norm": 2.129359006881714, "learning_rate": 7.431372549019608e-07, "loss": 3.9946537017822266, "step": 1180 }, { "epoch": 2.5729847494553377, "grad_norm": 2.119171380996704, "learning_rate": 7.429193899782135e-07, "loss": 4.2344865798950195, "step": 1181 }, { "epoch": 2.57516339869281, "grad_norm": 2.32218599319458, "learning_rate": 7.427015250544662e-07, "loss": 4.1057538986206055, "step": 1182 }, { "epoch": 2.577342047930283, "grad_norm": 2.822746753692627, "learning_rate": 7.42483660130719e-07, "loss": 4.102893829345703, "step": 1183 }, { "epoch": 2.579520697167756, "grad_norm": 1.7590959072113037, "learning_rate": 7.422657952069716e-07, "loss": 4.035136699676514, "step": 1184 }, { "epoch": 2.581699346405229, "grad_norm": 1.8279132843017578, "learning_rate": 7.420479302832243e-07, "loss": 3.943058967590332, "step": 1185 }, { "epoch": 2.5838779956427014, "grad_norm": 2.1571853160858154, "learning_rate": 7.418300653594771e-07, "loss": 4.208186149597168, "step": 1186 }, { "epoch": 2.5860566448801743, "grad_norm": 2.2204298973083496, "learning_rate": 7.416122004357298e-07, "loss": 4.098830223083496, "step": 1187 }, { "epoch": 2.588235294117647, "grad_norm": 2.358713388442993, "learning_rate": 7.413943355119825e-07, "loss": 4.179903507232666, "step": 1188 }, { "epoch": 2.59041394335512, "grad_norm": 2.276357650756836, "learning_rate": 7.411764705882352e-07, "loss": 4.1405158042907715, "step": 1189 }, { "epoch": 2.5925925925925926, "grad_norm": 2.221909761428833, "learning_rate": 7.40958605664488e-07, "loss": 4.108950138092041, "step": 1190 }, { "epoch": 2.5947712418300655, "grad_norm": 1.6492558717727661, "learning_rate": 7.407407407407406e-07, "loss": 4.0711541175842285, "step": 1191 }, { "epoch": 2.5969498910675384, "grad_norm": 2.0734970569610596, "learning_rate": 7.405228758169935e-07, "loss": 4.08068323135376, "step": 1192 }, { "epoch": 2.599128540305011, "grad_norm": 2.1566500663757324, "learning_rate": 7.403050108932461e-07, "loss": 4.096595287322998, "step": 1193 }, { "epoch": 2.6013071895424837, "grad_norm": 2.1739373207092285, "learning_rate": 7.400871459694988e-07, "loss": 3.9846582412719727, "step": 1194 }, { "epoch": 2.6034858387799567, "grad_norm": 1.965381145477295, "learning_rate": 7.398692810457516e-07, "loss": 4.002685070037842, "step": 1195 }, { "epoch": 2.605664488017429, "grad_norm": 1.9366487264633179, "learning_rate": 7.396514161220043e-07, "loss": 3.9636669158935547, "step": 1196 }, { "epoch": 2.607843137254902, "grad_norm": 2.498394727706909, "learning_rate": 7.39433551198257e-07, "loss": 4.058833122253418, "step": 1197 }, { "epoch": 2.610021786492375, "grad_norm": 2.562105894088745, "learning_rate": 7.392156862745098e-07, "loss": 4.131340980529785, "step": 1198 }, { "epoch": 2.6122004357298474, "grad_norm": 2.308809995651245, "learning_rate": 7.389978213507625e-07, "loss": 4.015336990356445, "step": 1199 }, { "epoch": 2.6143790849673203, "grad_norm": 2.1290252208709717, "learning_rate": 7.387799564270152e-07, "loss": 4.16436243057251, "step": 1200 }, { "epoch": 2.616557734204793, "grad_norm": 2.141458511352539, "learning_rate": 7.385620915032679e-07, "loss": 4.141194820404053, "step": 1201 }, { "epoch": 2.6187363834422657, "grad_norm": 2.1496827602386475, "learning_rate": 7.383442265795207e-07, "loss": 4.025135040283203, "step": 1202 }, { "epoch": 2.6209150326797386, "grad_norm": 1.948879599571228, "learning_rate": 7.381263616557733e-07, "loss": 4.031362533569336, "step": 1203 }, { "epoch": 2.6230936819172115, "grad_norm": 1.583351969718933, "learning_rate": 7.379084967320262e-07, "loss": 4.006800651550293, "step": 1204 }, { "epoch": 2.625272331154684, "grad_norm": 2.5793938636779785, "learning_rate": 7.376906318082788e-07, "loss": 4.362700462341309, "step": 1205 }, { "epoch": 2.627450980392157, "grad_norm": 1.7457002401351929, "learning_rate": 7.374727668845315e-07, "loss": 3.928466558456421, "step": 1206 }, { "epoch": 2.6296296296296298, "grad_norm": 1.6791082620620728, "learning_rate": 7.372549019607843e-07, "loss": 4.002142429351807, "step": 1207 }, { "epoch": 2.6318082788671022, "grad_norm": 2.104890823364258, "learning_rate": 7.37037037037037e-07, "loss": 4.059352397918701, "step": 1208 }, { "epoch": 2.633986928104575, "grad_norm": 2.069044351577759, "learning_rate": 7.368191721132897e-07, "loss": 4.143891334533691, "step": 1209 }, { "epoch": 2.636165577342048, "grad_norm": 1.9619617462158203, "learning_rate": 7.366013071895425e-07, "loss": 4.072021007537842, "step": 1210 }, { "epoch": 2.6383442265795205, "grad_norm": 1.6514337062835693, "learning_rate": 7.363834422657952e-07, "loss": 4.086104869842529, "step": 1211 }, { "epoch": 2.6405228758169934, "grad_norm": 2.469071626663208, "learning_rate": 7.361655773420479e-07, "loss": 4.228544235229492, "step": 1212 }, { "epoch": 2.6427015250544663, "grad_norm": 1.8714196681976318, "learning_rate": 7.359477124183006e-07, "loss": 4.041227340698242, "step": 1213 }, { "epoch": 2.644880174291939, "grad_norm": 2.2342441082000732, "learning_rate": 7.357298474945534e-07, "loss": 4.085497856140137, "step": 1214 }, { "epoch": 2.6470588235294117, "grad_norm": 2.4366915225982666, "learning_rate": 7.35511982570806e-07, "loss": 4.052178382873535, "step": 1215 }, { "epoch": 2.6492374727668846, "grad_norm": 2.090285301208496, "learning_rate": 7.352941176470589e-07, "loss": 4.0369391441345215, "step": 1216 }, { "epoch": 2.651416122004357, "grad_norm": 2.8174142837524414, "learning_rate": 7.350762527233115e-07, "loss": 4.037648677825928, "step": 1217 }, { "epoch": 2.65359477124183, "grad_norm": 2.20036244392395, "learning_rate": 7.348583877995642e-07, "loss": 4.0350189208984375, "step": 1218 }, { "epoch": 2.655773420479303, "grad_norm": 1.8572531938552856, "learning_rate": 7.34640522875817e-07, "loss": 4.120612621307373, "step": 1219 }, { "epoch": 2.6579520697167753, "grad_norm": 1.936388373374939, "learning_rate": 7.344226579520697e-07, "loss": 4.044631481170654, "step": 1220 }, { "epoch": 2.6601307189542482, "grad_norm": 2.2143683433532715, "learning_rate": 7.342047930283224e-07, "loss": 4.044402599334717, "step": 1221 }, { "epoch": 2.662309368191721, "grad_norm": 2.7869443893432617, "learning_rate": 7.339869281045751e-07, "loss": 4.108054161071777, "step": 1222 }, { "epoch": 2.664488017429194, "grad_norm": 2.411879539489746, "learning_rate": 7.337690631808279e-07, "loss": 4.1267805099487305, "step": 1223 }, { "epoch": 2.6666666666666665, "grad_norm": 2.4717910289764404, "learning_rate": 7.335511982570806e-07, "loss": 4.126617908477783, "step": 1224 }, { "epoch": 2.6688453159041394, "grad_norm": 1.81528902053833, "learning_rate": 7.333333333333332e-07, "loss": 3.9830334186553955, "step": 1225 }, { "epoch": 2.6710239651416123, "grad_norm": 2.294996500015259, "learning_rate": 7.331154684095861e-07, "loss": 4.136925220489502, "step": 1226 }, { "epoch": 2.6732026143790852, "grad_norm": 2.6548118591308594, "learning_rate": 7.328976034858387e-07, "loss": 4.300189018249512, "step": 1227 }, { "epoch": 2.6753812636165577, "grad_norm": 2.297008514404297, "learning_rate": 7.326797385620915e-07, "loss": 4.039825916290283, "step": 1228 }, { "epoch": 2.6775599128540306, "grad_norm": 2.4849953651428223, "learning_rate": 7.324618736383442e-07, "loss": 4.055141925811768, "step": 1229 }, { "epoch": 2.6797385620915035, "grad_norm": 2.077615737915039, "learning_rate": 7.322440087145969e-07, "loss": 4.234510898590088, "step": 1230 }, { "epoch": 2.681917211328976, "grad_norm": 2.173097848892212, "learning_rate": 7.320261437908496e-07, "loss": 4.117733001708984, "step": 1231 }, { "epoch": 2.684095860566449, "grad_norm": 2.8079543113708496, "learning_rate": 7.318082788671024e-07, "loss": 4.145205020904541, "step": 1232 }, { "epoch": 2.686274509803922, "grad_norm": 1.8126652240753174, "learning_rate": 7.315904139433551e-07, "loss": 4.149326324462891, "step": 1233 }, { "epoch": 2.6884531590413943, "grad_norm": 2.360736608505249, "learning_rate": 7.313725490196078e-07, "loss": 3.9278430938720703, "step": 1234 }, { "epoch": 2.690631808278867, "grad_norm": 1.9758261442184448, "learning_rate": 7.311546840958605e-07, "loss": 3.9275219440460205, "step": 1235 }, { "epoch": 2.69281045751634, "grad_norm": 1.8728855848312378, "learning_rate": 7.309368191721133e-07, "loss": 4.061554431915283, "step": 1236 }, { "epoch": 2.6949891067538125, "grad_norm": 2.038187026977539, "learning_rate": 7.307189542483659e-07, "loss": 3.8362436294555664, "step": 1237 }, { "epoch": 2.6971677559912854, "grad_norm": 2.6671066284179688, "learning_rate": 7.305010893246188e-07, "loss": 4.324222564697266, "step": 1238 }, { "epoch": 2.6993464052287583, "grad_norm": 2.309474468231201, "learning_rate": 7.302832244008714e-07, "loss": 4.243329048156738, "step": 1239 }, { "epoch": 2.701525054466231, "grad_norm": 2.4377169609069824, "learning_rate": 7.300653594771242e-07, "loss": 4.040592193603516, "step": 1240 }, { "epoch": 2.7037037037037037, "grad_norm": 2.26750111579895, "learning_rate": 7.298474945533769e-07, "loss": 4.113349437713623, "step": 1241 }, { "epoch": 2.7058823529411766, "grad_norm": 1.8802227973937988, "learning_rate": 7.296296296296296e-07, "loss": 3.9507367610931396, "step": 1242 }, { "epoch": 2.708061002178649, "grad_norm": 1.81936514377594, "learning_rate": 7.294117647058823e-07, "loss": 3.98329496383667, "step": 1243 }, { "epoch": 2.710239651416122, "grad_norm": 2.0246989727020264, "learning_rate": 7.291938997821351e-07, "loss": 4.072848320007324, "step": 1244 }, { "epoch": 2.712418300653595, "grad_norm": 2.3280224800109863, "learning_rate": 7.289760348583878e-07, "loss": 4.2408294677734375, "step": 1245 }, { "epoch": 2.7145969498910674, "grad_norm": 2.7856106758117676, "learning_rate": 7.287581699346405e-07, "loss": 4.128313064575195, "step": 1246 }, { "epoch": 2.7167755991285403, "grad_norm": 1.9694745540618896, "learning_rate": 7.285403050108932e-07, "loss": 4.18126106262207, "step": 1247 }, { "epoch": 2.718954248366013, "grad_norm": 1.9791138172149658, "learning_rate": 7.28322440087146e-07, "loss": 4.0716471672058105, "step": 1248 }, { "epoch": 2.7211328976034856, "grad_norm": 3.4779231548309326, "learning_rate": 7.281045751633986e-07, "loss": 4.228255748748779, "step": 1249 }, { "epoch": 2.7233115468409586, "grad_norm": 2.655686855316162, "learning_rate": 7.278867102396515e-07, "loss": 4.009222030639648, "step": 1250 }, { "epoch": 2.7254901960784315, "grad_norm": 2.551389694213867, "learning_rate": 7.276688453159041e-07, "loss": 4.137635231018066, "step": 1251 }, { "epoch": 2.727668845315904, "grad_norm": 2.050765037536621, "learning_rate": 7.274509803921567e-07, "loss": 3.941962480545044, "step": 1252 }, { "epoch": 2.729847494553377, "grad_norm": 2.982351303100586, "learning_rate": 7.272331154684096e-07, "loss": 4.287186622619629, "step": 1253 }, { "epoch": 2.7320261437908497, "grad_norm": 1.8161977529525757, "learning_rate": 7.270152505446622e-07, "loss": 3.920441150665283, "step": 1254 }, { "epoch": 2.734204793028322, "grad_norm": 1.8969671726226807, "learning_rate": 7.26797385620915e-07, "loss": 4.079735279083252, "step": 1255 }, { "epoch": 2.736383442265795, "grad_norm": 2.0750317573547363, "learning_rate": 7.265795206971677e-07, "loss": 4.113439559936523, "step": 1256 }, { "epoch": 2.738562091503268, "grad_norm": 1.8159763813018799, "learning_rate": 7.263616557734205e-07, "loss": 3.7927513122558594, "step": 1257 }, { "epoch": 2.7407407407407405, "grad_norm": 2.3881771564483643, "learning_rate": 7.261437908496731e-07, "loss": 4.0512895584106445, "step": 1258 }, { "epoch": 2.7429193899782134, "grad_norm": 1.8788857460021973, "learning_rate": 7.259259259259259e-07, "loss": 3.983058214187622, "step": 1259 }, { "epoch": 2.7450980392156863, "grad_norm": 2.2401680946350098, "learning_rate": 7.257080610021786e-07, "loss": 4.096259593963623, "step": 1260 }, { "epoch": 2.747276688453159, "grad_norm": 1.7303411960601807, "learning_rate": 7.254901960784313e-07, "loss": 3.9601075649261475, "step": 1261 }, { "epoch": 2.7494553376906317, "grad_norm": 1.6023896932601929, "learning_rate": 7.252723311546841e-07, "loss": 4.0621819496154785, "step": 1262 }, { "epoch": 2.7516339869281046, "grad_norm": 2.1377785205841064, "learning_rate": 7.250544662309368e-07, "loss": 4.249439716339111, "step": 1263 }, { "epoch": 2.7538126361655775, "grad_norm": 2.2939624786376953, "learning_rate": 7.248366013071894e-07, "loss": 4.082367897033691, "step": 1264 }, { "epoch": 2.7559912854030504, "grad_norm": 1.9332444667816162, "learning_rate": 7.246187363834423e-07, "loss": 4.069717884063721, "step": 1265 }, { "epoch": 2.758169934640523, "grad_norm": 2.436227798461914, "learning_rate": 7.244008714596949e-07, "loss": 4.202919960021973, "step": 1266 }, { "epoch": 2.7603485838779958, "grad_norm": 2.0189402103424072, "learning_rate": 7.241830065359477e-07, "loss": 3.9128737449645996, "step": 1267 }, { "epoch": 2.7625272331154687, "grad_norm": 2.101651906967163, "learning_rate": 7.239651416122004e-07, "loss": 3.9384663105010986, "step": 1268 }, { "epoch": 2.764705882352941, "grad_norm": 1.9226696491241455, "learning_rate": 7.237472766884532e-07, "loss": 3.954453945159912, "step": 1269 }, { "epoch": 2.766884531590414, "grad_norm": 1.9292939901351929, "learning_rate": 7.235294117647058e-07, "loss": 4.077046871185303, "step": 1270 }, { "epoch": 2.769063180827887, "grad_norm": 1.7969335317611694, "learning_rate": 7.233115468409585e-07, "loss": 4.0459513664245605, "step": 1271 }, { "epoch": 2.7712418300653594, "grad_norm": 1.706181526184082, "learning_rate": 7.230936819172113e-07, "loss": 3.9555344581604004, "step": 1272 }, { "epoch": 2.7734204793028323, "grad_norm": 2.5202817916870117, "learning_rate": 7.22875816993464e-07, "loss": 4.224204063415527, "step": 1273 }, { "epoch": 2.775599128540305, "grad_norm": 1.6407660245895386, "learning_rate": 7.226579520697167e-07, "loss": 3.967120409011841, "step": 1274 }, { "epoch": 2.7777777777777777, "grad_norm": 1.9387744665145874, "learning_rate": 7.224400871459695e-07, "loss": 3.9744906425476074, "step": 1275 }, { "epoch": 2.7799564270152506, "grad_norm": 1.6966843605041504, "learning_rate": 7.222222222222221e-07, "loss": 3.9272916316986084, "step": 1276 }, { "epoch": 2.7821350762527235, "grad_norm": 1.7865827083587646, "learning_rate": 7.220043572984749e-07, "loss": 4.138093948364258, "step": 1277 }, { "epoch": 2.784313725490196, "grad_norm": 3.4461591243743896, "learning_rate": 7.217864923747276e-07, "loss": 4.225474834442139, "step": 1278 }, { "epoch": 2.786492374727669, "grad_norm": 1.8746073246002197, "learning_rate": 7.215686274509804e-07, "loss": 4.057031631469727, "step": 1279 }, { "epoch": 2.7886710239651418, "grad_norm": 2.189147710800171, "learning_rate": 7.21350762527233e-07, "loss": 4.116001605987549, "step": 1280 }, { "epoch": 2.7908496732026142, "grad_norm": 2.070871353149414, "learning_rate": 7.211328976034859e-07, "loss": 4.182185173034668, "step": 1281 }, { "epoch": 2.793028322440087, "grad_norm": 2.063220739364624, "learning_rate": 7.209150326797385e-07, "loss": 4.255448818206787, "step": 1282 }, { "epoch": 2.79520697167756, "grad_norm": 1.7882076501846313, "learning_rate": 7.206971677559912e-07, "loss": 4.008045673370361, "step": 1283 }, { "epoch": 2.7973856209150325, "grad_norm": 2.296272039413452, "learning_rate": 7.20479302832244e-07, "loss": 4.0058112144470215, "step": 1284 }, { "epoch": 2.7995642701525054, "grad_norm": 2.4862987995147705, "learning_rate": 7.202614379084967e-07, "loss": 4.185858249664307, "step": 1285 }, { "epoch": 2.8017429193899783, "grad_norm": 1.8162435293197632, "learning_rate": 7.200435729847494e-07, "loss": 4.045594215393066, "step": 1286 }, { "epoch": 2.803921568627451, "grad_norm": 1.5840879678726196, "learning_rate": 7.198257080610022e-07, "loss": 4.000432968139648, "step": 1287 }, { "epoch": 2.8061002178649237, "grad_norm": 2.001845121383667, "learning_rate": 7.196078431372548e-07, "loss": 4.032077789306641, "step": 1288 }, { "epoch": 2.8082788671023966, "grad_norm": 1.8919546604156494, "learning_rate": 7.193899782135076e-07, "loss": 4.092050075531006, "step": 1289 }, { "epoch": 2.810457516339869, "grad_norm": 2.533766508102417, "learning_rate": 7.191721132897603e-07, "loss": 4.236661911010742, "step": 1290 }, { "epoch": 2.812636165577342, "grad_norm": 1.9859346151351929, "learning_rate": 7.189542483660131e-07, "loss": 4.032649040222168, "step": 1291 }, { "epoch": 2.814814814814815, "grad_norm": 2.626096487045288, "learning_rate": 7.187363834422657e-07, "loss": 4.135249137878418, "step": 1292 }, { "epoch": 2.8169934640522873, "grad_norm": 1.535330057144165, "learning_rate": 7.185185185185186e-07, "loss": 4.018188953399658, "step": 1293 }, { "epoch": 2.8191721132897603, "grad_norm": 2.141206741333008, "learning_rate": 7.183006535947712e-07, "loss": 4.127739429473877, "step": 1294 }, { "epoch": 2.821350762527233, "grad_norm": 2.061596632003784, "learning_rate": 7.180827886710239e-07, "loss": 3.965701103210449, "step": 1295 }, { "epoch": 2.8235294117647056, "grad_norm": 2.0047507286071777, "learning_rate": 7.178649237472767e-07, "loss": 4.109291076660156, "step": 1296 }, { "epoch": 2.8257080610021785, "grad_norm": 1.8402358293533325, "learning_rate": 7.176470588235294e-07, "loss": 4.114861965179443, "step": 1297 }, { "epoch": 2.8278867102396514, "grad_norm": 2.7175326347351074, "learning_rate": 7.174291938997821e-07, "loss": 4.331141948699951, "step": 1298 }, { "epoch": 2.8300653594771243, "grad_norm": 2.14866042137146, "learning_rate": 7.172113289760349e-07, "loss": 4.101419448852539, "step": 1299 }, { "epoch": 2.832244008714597, "grad_norm": 2.2630059719085693, "learning_rate": 7.169934640522875e-07, "loss": 4.183446884155273, "step": 1300 }, { "epoch": 2.8344226579520697, "grad_norm": 2.022808313369751, "learning_rate": 7.167755991285403e-07, "loss": 4.017683029174805, "step": 1301 }, { "epoch": 2.8366013071895426, "grad_norm": 1.5495890378952026, "learning_rate": 7.16557734204793e-07, "loss": 3.995725154876709, "step": 1302 }, { "epoch": 2.8387799564270155, "grad_norm": 1.8888450860977173, "learning_rate": 7.163398692810458e-07, "loss": 3.987718105316162, "step": 1303 }, { "epoch": 2.840958605664488, "grad_norm": 2.582493543624878, "learning_rate": 7.161220043572984e-07, "loss": 4.165774822235107, "step": 1304 }, { "epoch": 2.843137254901961, "grad_norm": 2.5158631801605225, "learning_rate": 7.159041394335513e-07, "loss": 4.01421594619751, "step": 1305 }, { "epoch": 2.845315904139434, "grad_norm": 2.3404486179351807, "learning_rate": 7.156862745098039e-07, "loss": 4.174933910369873, "step": 1306 }, { "epoch": 2.8474945533769063, "grad_norm": 2.007209062576294, "learning_rate": 7.154684095860566e-07, "loss": 3.9912877082824707, "step": 1307 }, { "epoch": 2.849673202614379, "grad_norm": 2.440896987915039, "learning_rate": 7.152505446623094e-07, "loss": 4.093720436096191, "step": 1308 }, { "epoch": 2.851851851851852, "grad_norm": 1.6974689960479736, "learning_rate": 7.150326797385621e-07, "loss": 3.960319757461548, "step": 1309 }, { "epoch": 2.8540305010893245, "grad_norm": 1.9464792013168335, "learning_rate": 7.148148148148148e-07, "loss": 4.000375747680664, "step": 1310 }, { "epoch": 2.8562091503267975, "grad_norm": 2.4507927894592285, "learning_rate": 7.145969498910676e-07, "loss": 4.183685302734375, "step": 1311 }, { "epoch": 2.8583877995642704, "grad_norm": 2.0516350269317627, "learning_rate": 7.143790849673202e-07, "loss": 4.095175743103027, "step": 1312 }, { "epoch": 2.860566448801743, "grad_norm": 2.400831699371338, "learning_rate": 7.14161220043573e-07, "loss": 4.138949394226074, "step": 1313 }, { "epoch": 2.8627450980392157, "grad_norm": 2.603877544403076, "learning_rate": 7.139433551198257e-07, "loss": 4.147704601287842, "step": 1314 }, { "epoch": 2.8649237472766886, "grad_norm": 2.026293992996216, "learning_rate": 7.137254901960785e-07, "loss": 4.003324508666992, "step": 1315 }, { "epoch": 2.867102396514161, "grad_norm": 2.4032931327819824, "learning_rate": 7.135076252723311e-07, "loss": 4.194486141204834, "step": 1316 }, { "epoch": 2.869281045751634, "grad_norm": 1.811413049697876, "learning_rate": 7.132897603485838e-07, "loss": 4.065234184265137, "step": 1317 }, { "epoch": 2.871459694989107, "grad_norm": 2.485569477081299, "learning_rate": 7.130718954248366e-07, "loss": 4.17244815826416, "step": 1318 }, { "epoch": 2.8736383442265794, "grad_norm": 1.7770358324050903, "learning_rate": 7.128540305010893e-07, "loss": 4.011878490447998, "step": 1319 }, { "epoch": 2.8758169934640523, "grad_norm": 2.1820082664489746, "learning_rate": 7.12636165577342e-07, "loss": 4.1406989097595215, "step": 1320 }, { "epoch": 2.877995642701525, "grad_norm": 2.2298965454101562, "learning_rate": 7.124183006535948e-07, "loss": 4.082643032073975, "step": 1321 }, { "epoch": 2.8801742919389977, "grad_norm": 1.6698293685913086, "learning_rate": 7.122004357298474e-07, "loss": 3.998666286468506, "step": 1322 }, { "epoch": 2.8823529411764706, "grad_norm": 2.3940682411193848, "learning_rate": 7.119825708061001e-07, "loss": 4.116477012634277, "step": 1323 }, { "epoch": 2.8845315904139435, "grad_norm": 2.1930761337280273, "learning_rate": 7.117647058823529e-07, "loss": 4.035118579864502, "step": 1324 }, { "epoch": 2.886710239651416, "grad_norm": 2.8842079639434814, "learning_rate": 7.115468409586056e-07, "loss": 4.20718240737915, "step": 1325 }, { "epoch": 2.888888888888889, "grad_norm": 3.226893424987793, "learning_rate": 7.113289760348583e-07, "loss": 4.3169684410095215, "step": 1326 }, { "epoch": 2.8910675381263617, "grad_norm": 2.317155122756958, "learning_rate": 7.111111111111111e-07, "loss": 4.070048809051514, "step": 1327 }, { "epoch": 2.893246187363834, "grad_norm": 1.9637353420257568, "learning_rate": 7.108932461873638e-07, "loss": 3.9112229347229004, "step": 1328 }, { "epoch": 2.895424836601307, "grad_norm": 2.1201820373535156, "learning_rate": 7.106753812636164e-07, "loss": 4.039405345916748, "step": 1329 }, { "epoch": 2.89760348583878, "grad_norm": 2.6084582805633545, "learning_rate": 7.104575163398693e-07, "loss": 4.193794250488281, "step": 1330 }, { "epoch": 2.8997821350762525, "grad_norm": 1.86362624168396, "learning_rate": 7.102396514161219e-07, "loss": 4.132012367248535, "step": 1331 }, { "epoch": 2.9019607843137254, "grad_norm": 1.6391324996948242, "learning_rate": 7.100217864923747e-07, "loss": 4.071481227874756, "step": 1332 }, { "epoch": 2.9041394335511983, "grad_norm": 1.8953934907913208, "learning_rate": 7.098039215686274e-07, "loss": 4.066107273101807, "step": 1333 }, { "epoch": 2.9063180827886708, "grad_norm": 1.9203068017959595, "learning_rate": 7.095860566448801e-07, "loss": 3.9545531272888184, "step": 1334 }, { "epoch": 2.9084967320261437, "grad_norm": 3.054277181625366, "learning_rate": 7.093681917211328e-07, "loss": 4.202263832092285, "step": 1335 }, { "epoch": 2.9106753812636166, "grad_norm": 1.9048999547958374, "learning_rate": 7.091503267973856e-07, "loss": 4.0587358474731445, "step": 1336 }, { "epoch": 2.9128540305010895, "grad_norm": 3.2379674911499023, "learning_rate": 7.089324618736383e-07, "loss": 4.130218982696533, "step": 1337 }, { "epoch": 2.915032679738562, "grad_norm": 2.0444176197052, "learning_rate": 7.08714596949891e-07, "loss": 3.97406268119812, "step": 1338 }, { "epoch": 2.917211328976035, "grad_norm": 2.59114146232605, "learning_rate": 7.084967320261438e-07, "loss": 4.170068740844727, "step": 1339 }, { "epoch": 2.9193899782135078, "grad_norm": 1.7566479444503784, "learning_rate": 7.082788671023965e-07, "loss": 4.0306925773620605, "step": 1340 }, { "epoch": 2.9215686274509802, "grad_norm": 3.0181102752685547, "learning_rate": 7.080610021786491e-07, "loss": 4.234060764312744, "step": 1341 }, { "epoch": 2.923747276688453, "grad_norm": 1.5292731523513794, "learning_rate": 7.07843137254902e-07, "loss": 3.893479347229004, "step": 1342 }, { "epoch": 2.925925925925926, "grad_norm": 2.910338878631592, "learning_rate": 7.076252723311546e-07, "loss": 4.231020450592041, "step": 1343 }, { "epoch": 2.928104575163399, "grad_norm": 2.610241651535034, "learning_rate": 7.074074074074074e-07, "loss": 4.038875579833984, "step": 1344 }, { "epoch": 2.9302832244008714, "grad_norm": 1.8566453456878662, "learning_rate": 7.071895424836601e-07, "loss": 3.989964246749878, "step": 1345 }, { "epoch": 2.9324618736383443, "grad_norm": 2.447510004043579, "learning_rate": 7.069716775599128e-07, "loss": 4.1165008544921875, "step": 1346 }, { "epoch": 2.9346405228758172, "grad_norm": 1.9309680461883545, "learning_rate": 7.067538126361655e-07, "loss": 4.091263294219971, "step": 1347 }, { "epoch": 2.9368191721132897, "grad_norm": 1.9691225290298462, "learning_rate": 7.065359477124183e-07, "loss": 3.9398040771484375, "step": 1348 }, { "epoch": 2.9389978213507626, "grad_norm": 2.2187540531158447, "learning_rate": 7.06318082788671e-07, "loss": 4.011201858520508, "step": 1349 }, { "epoch": 2.9411764705882355, "grad_norm": 2.1626088619232178, "learning_rate": 7.061002178649237e-07, "loss": 4.07414436340332, "step": 1350 }, { "epoch": 2.943355119825708, "grad_norm": 2.2275617122650146, "learning_rate": 7.058823529411765e-07, "loss": 4.046625137329102, "step": 1351 }, { "epoch": 2.945533769063181, "grad_norm": 1.7444688081741333, "learning_rate": 7.056644880174292e-07, "loss": 3.939298629760742, "step": 1352 }, { "epoch": 2.947712418300654, "grad_norm": 1.7920843362808228, "learning_rate": 7.054466230936818e-07, "loss": 4.132450103759766, "step": 1353 }, { "epoch": 2.9498910675381262, "grad_norm": 2.755037307739258, "learning_rate": 7.052287581699347e-07, "loss": 4.206420421600342, "step": 1354 }, { "epoch": 2.952069716775599, "grad_norm": 1.8267017602920532, "learning_rate": 7.050108932461873e-07, "loss": 3.933645725250244, "step": 1355 }, { "epoch": 2.954248366013072, "grad_norm": 1.7842082977294922, "learning_rate": 7.047930283224401e-07, "loss": 3.913052797317505, "step": 1356 }, { "epoch": 2.9564270152505445, "grad_norm": 2.61879825592041, "learning_rate": 7.045751633986928e-07, "loss": 4.2315239906311035, "step": 1357 }, { "epoch": 2.9586056644880174, "grad_norm": 3.755659818649292, "learning_rate": 7.043572984749455e-07, "loss": 4.200189113616943, "step": 1358 }, { "epoch": 2.9607843137254903, "grad_norm": 2.679037570953369, "learning_rate": 7.041394335511982e-07, "loss": 3.977792978286743, "step": 1359 }, { "epoch": 2.962962962962963, "grad_norm": 2.043102502822876, "learning_rate": 7.03921568627451e-07, "loss": 3.8886184692382812, "step": 1360 }, { "epoch": 2.9651416122004357, "grad_norm": 1.8222060203552246, "learning_rate": 7.037037037037037e-07, "loss": 3.917654037475586, "step": 1361 }, { "epoch": 2.9673202614379086, "grad_norm": 1.8615660667419434, "learning_rate": 7.034858387799564e-07, "loss": 3.9844086170196533, "step": 1362 }, { "epoch": 2.969498910675381, "grad_norm": 1.9505032300949097, "learning_rate": 7.032679738562091e-07, "loss": 3.819603204727173, "step": 1363 }, { "epoch": 2.971677559912854, "grad_norm": 1.6511313915252686, "learning_rate": 7.030501089324619e-07, "loss": 3.7985005378723145, "step": 1364 }, { "epoch": 2.973856209150327, "grad_norm": 1.7570555210113525, "learning_rate": 7.028322440087145e-07, "loss": 3.9723243713378906, "step": 1365 }, { "epoch": 2.9760348583877994, "grad_norm": 1.7688910961151123, "learning_rate": 7.026143790849673e-07, "loss": 3.964071273803711, "step": 1366 }, { "epoch": 2.9782135076252723, "grad_norm": 1.9701095819473267, "learning_rate": 7.0239651416122e-07, "loss": 3.9684109687805176, "step": 1367 }, { "epoch": 2.980392156862745, "grad_norm": 2.6153292655944824, "learning_rate": 7.021786492374728e-07, "loss": 4.253694534301758, "step": 1368 }, { "epoch": 2.9825708061002176, "grad_norm": 1.7457857131958008, "learning_rate": 7.019607843137254e-07, "loss": 3.894623279571533, "step": 1369 }, { "epoch": 2.9847494553376905, "grad_norm": 1.426893711090088, "learning_rate": 7.017429193899782e-07, "loss": 3.8718185424804688, "step": 1370 }, { "epoch": 2.9869281045751634, "grad_norm": 1.6016830205917358, "learning_rate": 7.015250544662309e-07, "loss": 3.8824973106384277, "step": 1371 }, { "epoch": 2.989106753812636, "grad_norm": 2.4757676124572754, "learning_rate": 7.013071895424836e-07, "loss": 3.9500131607055664, "step": 1372 }, { "epoch": 2.991285403050109, "grad_norm": 2.4047698974609375, "learning_rate": 7.010893246187364e-07, "loss": 4.061188220977783, "step": 1373 }, { "epoch": 2.9934640522875817, "grad_norm": 2.367016315460205, "learning_rate": 7.008714596949891e-07, "loss": 4.278520107269287, "step": 1374 }, { "epoch": 2.9956427015250546, "grad_norm": 1.9628345966339111, "learning_rate": 7.006535947712417e-07, "loss": 4.107932090759277, "step": 1375 }, { "epoch": 2.997821350762527, "grad_norm": 1.929387092590332, "learning_rate": 7.004357298474946e-07, "loss": 4.109278678894043, "step": 1376 }, { "epoch": 3.0, "grad_norm": 2.5878841876983643, "learning_rate": 7.002178649237472e-07, "loss": 4.148967266082764, "step": 1377 }, { "epoch": 3.002178649237473, "grad_norm": 1.99074387550354, "learning_rate": 7e-07, "loss": 3.885469913482666, "step": 1378 }, { "epoch": 3.0043572984749454, "grad_norm": 1.7841109037399292, "learning_rate": 6.997821350762527e-07, "loss": 4.01677131652832, "step": 1379 }, { "epoch": 3.0065359477124183, "grad_norm": 1.6619118452072144, "learning_rate": 6.995642701525055e-07, "loss": 3.959888219833374, "step": 1380 }, { "epoch": 3.008714596949891, "grad_norm": 1.9008047580718994, "learning_rate": 6.993464052287581e-07, "loss": 3.860025405883789, "step": 1381 }, { "epoch": 3.0108932461873636, "grad_norm": 2.534668207168579, "learning_rate": 6.991285403050109e-07, "loss": 4.2086181640625, "step": 1382 }, { "epoch": 3.0130718954248366, "grad_norm": 1.6604502201080322, "learning_rate": 6.989106753812636e-07, "loss": 4.00107479095459, "step": 1383 }, { "epoch": 3.0152505446623095, "grad_norm": 2.3147225379943848, "learning_rate": 6.986928104575163e-07, "loss": 4.10529088973999, "step": 1384 }, { "epoch": 3.017429193899782, "grad_norm": 1.906332015991211, "learning_rate": 6.984749455337691e-07, "loss": 4.026036262512207, "step": 1385 }, { "epoch": 3.019607843137255, "grad_norm": 2.1388070583343506, "learning_rate": 6.982570806100218e-07, "loss": 4.0032758712768555, "step": 1386 }, { "epoch": 3.0217864923747277, "grad_norm": 2.066720724105835, "learning_rate": 6.980392156862744e-07, "loss": 3.8206231594085693, "step": 1387 }, { "epoch": 3.0239651416122006, "grad_norm": 2.023543357849121, "learning_rate": 6.978213507625273e-07, "loss": 3.9002137184143066, "step": 1388 }, { "epoch": 3.026143790849673, "grad_norm": 2.716881275177002, "learning_rate": 6.976034858387799e-07, "loss": 4.027283191680908, "step": 1389 }, { "epoch": 3.028322440087146, "grad_norm": 1.7647732496261597, "learning_rate": 6.973856209150326e-07, "loss": 3.936671257019043, "step": 1390 }, { "epoch": 3.030501089324619, "grad_norm": 2.0423004627227783, "learning_rate": 6.971677559912854e-07, "loss": 4.04099702835083, "step": 1391 }, { "epoch": 3.0326797385620914, "grad_norm": 1.8968185186386108, "learning_rate": 6.969498910675382e-07, "loss": 4.039318561553955, "step": 1392 }, { "epoch": 3.0348583877995643, "grad_norm": 2.1601150035858154, "learning_rate": 6.967320261437908e-07, "loss": 4.059505462646484, "step": 1393 }, { "epoch": 3.037037037037037, "grad_norm": 2.1776959896087646, "learning_rate": 6.965141612200436e-07, "loss": 4.018625736236572, "step": 1394 }, { "epoch": 3.0392156862745097, "grad_norm": 2.645939826965332, "learning_rate": 6.962962962962963e-07, "loss": 4.1773223876953125, "step": 1395 }, { "epoch": 3.0413943355119826, "grad_norm": 2.0843727588653564, "learning_rate": 6.960784313725489e-07, "loss": 4.000868797302246, "step": 1396 }, { "epoch": 3.0435729847494555, "grad_norm": 1.9108518362045288, "learning_rate": 6.958605664488018e-07, "loss": 3.938875198364258, "step": 1397 }, { "epoch": 3.045751633986928, "grad_norm": 2.086155414581299, "learning_rate": 6.956427015250544e-07, "loss": 4.083115100860596, "step": 1398 }, { "epoch": 3.047930283224401, "grad_norm": 1.7537988424301147, "learning_rate": 6.954248366013071e-07, "loss": 3.8460171222686768, "step": 1399 }, { "epoch": 3.0501089324618738, "grad_norm": 2.052316665649414, "learning_rate": 6.952069716775599e-07, "loss": 4.090228080749512, "step": 1400 }, { "epoch": 3.052287581699346, "grad_norm": 1.9320030212402344, "learning_rate": 6.949891067538126e-07, "loss": 4.030124664306641, "step": 1401 }, { "epoch": 3.054466230936819, "grad_norm": 1.8705251216888428, "learning_rate": 6.947712418300653e-07, "loss": 3.9172561168670654, "step": 1402 }, { "epoch": 3.056644880174292, "grad_norm": 1.6915946006774902, "learning_rate": 6.945533769063181e-07, "loss": 4.026086330413818, "step": 1403 }, { "epoch": 3.0588235294117645, "grad_norm": 2.193324327468872, "learning_rate": 6.943355119825707e-07, "loss": 4.0866851806640625, "step": 1404 }, { "epoch": 3.0610021786492374, "grad_norm": 2.0877082347869873, "learning_rate": 6.941176470588235e-07, "loss": 4.0135016441345215, "step": 1405 }, { "epoch": 3.0631808278867103, "grad_norm": 3.1841933727264404, "learning_rate": 6.938997821350762e-07, "loss": 4.167190074920654, "step": 1406 }, { "epoch": 3.065359477124183, "grad_norm": 2.2970774173736572, "learning_rate": 6.93681917211329e-07, "loss": 4.064237117767334, "step": 1407 }, { "epoch": 3.0675381263616557, "grad_norm": 2.0280227661132812, "learning_rate": 6.934640522875816e-07, "loss": 4.065685272216797, "step": 1408 }, { "epoch": 3.0697167755991286, "grad_norm": 2.20867919921875, "learning_rate": 6.932461873638345e-07, "loss": 3.9557673931121826, "step": 1409 }, { "epoch": 3.0718954248366015, "grad_norm": 1.9739899635314941, "learning_rate": 6.930283224400871e-07, "loss": 3.9804179668426514, "step": 1410 }, { "epoch": 3.074074074074074, "grad_norm": 1.8980971574783325, "learning_rate": 6.928104575163398e-07, "loss": 4.008516788482666, "step": 1411 }, { "epoch": 3.076252723311547, "grad_norm": 2.786292791366577, "learning_rate": 6.925925925925925e-07, "loss": 4.2271528244018555, "step": 1412 }, { "epoch": 3.0784313725490198, "grad_norm": 1.9264106750488281, "learning_rate": 6.923747276688453e-07, "loss": 3.94272780418396, "step": 1413 }, { "epoch": 3.0806100217864922, "grad_norm": 2.499330520629883, "learning_rate": 6.92156862745098e-07, "loss": 4.097041130065918, "step": 1414 }, { "epoch": 3.082788671023965, "grad_norm": 1.8922934532165527, "learning_rate": 6.919389978213507e-07, "loss": 3.9729349613189697, "step": 1415 }, { "epoch": 3.084967320261438, "grad_norm": 2.1955277919769287, "learning_rate": 6.917211328976034e-07, "loss": 3.9401471614837646, "step": 1416 }, { "epoch": 3.0871459694989105, "grad_norm": 2.001605987548828, "learning_rate": 6.915032679738562e-07, "loss": 3.9274933338165283, "step": 1417 }, { "epoch": 3.0893246187363834, "grad_norm": 1.8342690467834473, "learning_rate": 6.912854030501088e-07, "loss": 3.9414124488830566, "step": 1418 }, { "epoch": 3.0915032679738563, "grad_norm": 2.4003210067749023, "learning_rate": 6.910675381263617e-07, "loss": 4.142580032348633, "step": 1419 }, { "epoch": 3.093681917211329, "grad_norm": 2.2883265018463135, "learning_rate": 6.908496732026143e-07, "loss": 4.042757987976074, "step": 1420 }, { "epoch": 3.0958605664488017, "grad_norm": 2.5580878257751465, "learning_rate": 6.90631808278867e-07, "loss": 4.029067516326904, "step": 1421 }, { "epoch": 3.0980392156862746, "grad_norm": 1.6991617679595947, "learning_rate": 6.904139433551198e-07, "loss": 4.007190704345703, "step": 1422 }, { "epoch": 3.100217864923747, "grad_norm": 2.04525089263916, "learning_rate": 6.901960784313725e-07, "loss": 3.9716062545776367, "step": 1423 }, { "epoch": 3.10239651416122, "grad_norm": 2.4602041244506836, "learning_rate": 6.899782135076252e-07, "loss": 4.052603721618652, "step": 1424 }, { "epoch": 3.104575163398693, "grad_norm": 2.2746026515960693, "learning_rate": 6.89760348583878e-07, "loss": 4.023218154907227, "step": 1425 }, { "epoch": 3.106753812636166, "grad_norm": 2.0962417125701904, "learning_rate": 6.895424836601307e-07, "loss": 4.021659851074219, "step": 1426 }, { "epoch": 3.1089324618736383, "grad_norm": 1.6195249557495117, "learning_rate": 6.893246187363834e-07, "loss": 3.9284095764160156, "step": 1427 }, { "epoch": 3.111111111111111, "grad_norm": 2.1618423461914062, "learning_rate": 6.891067538126361e-07, "loss": 4.034542083740234, "step": 1428 }, { "epoch": 3.113289760348584, "grad_norm": 2.2566585540771484, "learning_rate": 6.888888888888889e-07, "loss": 4.108163833618164, "step": 1429 }, { "epoch": 3.1154684095860565, "grad_norm": 1.830074667930603, "learning_rate": 6.886710239651415e-07, "loss": 3.950562000274658, "step": 1430 }, { "epoch": 3.1176470588235294, "grad_norm": 2.0250563621520996, "learning_rate": 6.884531590413944e-07, "loss": 3.8979663848876953, "step": 1431 }, { "epoch": 3.1198257080610023, "grad_norm": 2.0735223293304443, "learning_rate": 6.88235294117647e-07, "loss": 3.943770170211792, "step": 1432 }, { "epoch": 3.122004357298475, "grad_norm": 2.2416398525238037, "learning_rate": 6.880174291938997e-07, "loss": 3.872501850128174, "step": 1433 }, { "epoch": 3.1241830065359477, "grad_norm": 2.2735300064086914, "learning_rate": 6.877995642701525e-07, "loss": 4.21958589553833, "step": 1434 }, { "epoch": 3.1263616557734206, "grad_norm": 2.322718620300293, "learning_rate": 6.875816993464052e-07, "loss": 4.0644989013671875, "step": 1435 }, { "epoch": 3.128540305010893, "grad_norm": 2.864656448364258, "learning_rate": 6.873638344226579e-07, "loss": 4.106259822845459, "step": 1436 }, { "epoch": 3.130718954248366, "grad_norm": 2.2956597805023193, "learning_rate": 6.871459694989107e-07, "loss": 4.097358226776123, "step": 1437 }, { "epoch": 3.132897603485839, "grad_norm": 2.5181641578674316, "learning_rate": 6.869281045751634e-07, "loss": 4.0132341384887695, "step": 1438 }, { "epoch": 3.1350762527233114, "grad_norm": 2.1961445808410645, "learning_rate": 6.867102396514161e-07, "loss": 3.9344918727874756, "step": 1439 }, { "epoch": 3.1372549019607843, "grad_norm": 1.9562888145446777, "learning_rate": 6.864923747276688e-07, "loss": 3.9464187622070312, "step": 1440 }, { "epoch": 3.139433551198257, "grad_norm": 1.838987946510315, "learning_rate": 6.862745098039216e-07, "loss": 4.0822529792785645, "step": 1441 }, { "epoch": 3.1416122004357296, "grad_norm": 1.9166709184646606, "learning_rate": 6.860566448801742e-07, "loss": 3.9962897300720215, "step": 1442 }, { "epoch": 3.1437908496732025, "grad_norm": 2.61472749710083, "learning_rate": 6.858387799564271e-07, "loss": 4.170584678649902, "step": 1443 }, { "epoch": 3.1459694989106755, "grad_norm": 2.1313629150390625, "learning_rate": 6.856209150326797e-07, "loss": 4.014618873596191, "step": 1444 }, { "epoch": 3.148148148148148, "grad_norm": 2.1800546646118164, "learning_rate": 6.854030501089324e-07, "loss": 4.018746852874756, "step": 1445 }, { "epoch": 3.150326797385621, "grad_norm": 3.4233222007751465, "learning_rate": 6.851851851851852e-07, "loss": 4.045071601867676, "step": 1446 }, { "epoch": 3.1525054466230937, "grad_norm": 2.179487705230713, "learning_rate": 6.849673202614379e-07, "loss": 3.883610963821411, "step": 1447 }, { "epoch": 3.1546840958605666, "grad_norm": 2.244222640991211, "learning_rate": 6.847494553376906e-07, "loss": 4.000574111938477, "step": 1448 }, { "epoch": 3.156862745098039, "grad_norm": 2.074002504348755, "learning_rate": 6.845315904139434e-07, "loss": 4.085293769836426, "step": 1449 }, { "epoch": 3.159041394335512, "grad_norm": 2.06676983833313, "learning_rate": 6.84313725490196e-07, "loss": 3.9798166751861572, "step": 1450 }, { "epoch": 3.161220043572985, "grad_norm": 2.68137264251709, "learning_rate": 6.840958605664488e-07, "loss": 4.144998550415039, "step": 1451 }, { "epoch": 3.1633986928104574, "grad_norm": 2.1482813358306885, "learning_rate": 6.838779956427015e-07, "loss": 3.981295585632324, "step": 1452 }, { "epoch": 3.1655773420479303, "grad_norm": 1.544592022895813, "learning_rate": 6.836601307189543e-07, "loss": 3.887171506881714, "step": 1453 }, { "epoch": 3.167755991285403, "grad_norm": 3.5624892711639404, "learning_rate": 6.834422657952069e-07, "loss": 4.200488090515137, "step": 1454 }, { "epoch": 3.1699346405228757, "grad_norm": 2.115323066711426, "learning_rate": 6.832244008714598e-07, "loss": 3.959160327911377, "step": 1455 }, { "epoch": 3.1721132897603486, "grad_norm": 1.8137776851654053, "learning_rate": 6.830065359477124e-07, "loss": 4.2361602783203125, "step": 1456 }, { "epoch": 3.1742919389978215, "grad_norm": 2.7994914054870605, "learning_rate": 6.827886710239651e-07, "loss": 4.105461120605469, "step": 1457 }, { "epoch": 3.176470588235294, "grad_norm": 2.805366039276123, "learning_rate": 6.825708061002179e-07, "loss": 4.10658073425293, "step": 1458 }, { "epoch": 3.178649237472767, "grad_norm": 2.469013214111328, "learning_rate": 6.823529411764706e-07, "loss": 4.142499923706055, "step": 1459 }, { "epoch": 3.1808278867102397, "grad_norm": 1.803782343864441, "learning_rate": 6.821350762527233e-07, "loss": 3.9874911308288574, "step": 1460 }, { "epoch": 3.183006535947712, "grad_norm": 2.7465014457702637, "learning_rate": 6.819172113289759e-07, "loss": 4.2708001136779785, "step": 1461 }, { "epoch": 3.185185185185185, "grad_norm": 3.5362226963043213, "learning_rate": 6.816993464052287e-07, "loss": 4.277872085571289, "step": 1462 }, { "epoch": 3.187363834422658, "grad_norm": 2.1162943840026855, "learning_rate": 6.814814814814814e-07, "loss": 4.073018550872803, "step": 1463 }, { "epoch": 3.189542483660131, "grad_norm": 3.329760789871216, "learning_rate": 6.812636165577341e-07, "loss": 4.015101909637451, "step": 1464 }, { "epoch": 3.1917211328976034, "grad_norm": 1.8979910612106323, "learning_rate": 6.810457516339869e-07, "loss": 3.830190420150757, "step": 1465 }, { "epoch": 3.1938997821350763, "grad_norm": 1.8327301740646362, "learning_rate": 6.808278867102396e-07, "loss": 3.849546432495117, "step": 1466 }, { "epoch": 3.196078431372549, "grad_norm": 1.9226691722869873, "learning_rate": 6.806100217864922e-07, "loss": 3.97139835357666, "step": 1467 }, { "epoch": 3.1982570806100217, "grad_norm": 2.278614044189453, "learning_rate": 6.803921568627451e-07, "loss": 4.026494979858398, "step": 1468 }, { "epoch": 3.2004357298474946, "grad_norm": 1.7531808614730835, "learning_rate": 6.801742919389977e-07, "loss": 3.950976610183716, "step": 1469 }, { "epoch": 3.2026143790849675, "grad_norm": 1.9538357257843018, "learning_rate": 6.799564270152505e-07, "loss": 4.109023094177246, "step": 1470 }, { "epoch": 3.20479302832244, "grad_norm": 2.5554213523864746, "learning_rate": 6.797385620915032e-07, "loss": 4.0433855056762695, "step": 1471 }, { "epoch": 3.206971677559913, "grad_norm": 2.057340621948242, "learning_rate": 6.79520697167756e-07, "loss": 3.973937511444092, "step": 1472 }, { "epoch": 3.2091503267973858, "grad_norm": 2.3471949100494385, "learning_rate": 6.793028322440086e-07, "loss": 3.9948055744171143, "step": 1473 }, { "epoch": 3.2113289760348582, "grad_norm": 2.107889413833618, "learning_rate": 6.790849673202614e-07, "loss": 4.030394554138184, "step": 1474 }, { "epoch": 3.213507625272331, "grad_norm": 1.6140936613082886, "learning_rate": 6.788671023965141e-07, "loss": 3.9013614654541016, "step": 1475 }, { "epoch": 3.215686274509804, "grad_norm": 2.1479227542877197, "learning_rate": 6.786492374727668e-07, "loss": 3.9884190559387207, "step": 1476 }, { "epoch": 3.2178649237472765, "grad_norm": 2.5144078731536865, "learning_rate": 6.784313725490196e-07, "loss": 4.130520343780518, "step": 1477 }, { "epoch": 3.2200435729847494, "grad_norm": 1.7546236515045166, "learning_rate": 6.782135076252723e-07, "loss": 4.030064582824707, "step": 1478 }, { "epoch": 3.2222222222222223, "grad_norm": 2.0958869457244873, "learning_rate": 6.779956427015249e-07, "loss": 4.175377368927002, "step": 1479 }, { "epoch": 3.224400871459695, "grad_norm": 1.9308971166610718, "learning_rate": 6.777777777777778e-07, "loss": 3.9496281147003174, "step": 1480 }, { "epoch": 3.2265795206971677, "grad_norm": 2.518685817718506, "learning_rate": 6.775599128540304e-07, "loss": 4.093127250671387, "step": 1481 }, { "epoch": 3.2287581699346406, "grad_norm": 3.1964430809020996, "learning_rate": 6.773420479302832e-07, "loss": 4.238636016845703, "step": 1482 }, { "epoch": 3.230936819172113, "grad_norm": 2.0844388008117676, "learning_rate": 6.771241830065359e-07, "loss": 3.9222424030303955, "step": 1483 }, { "epoch": 3.233115468409586, "grad_norm": 2.324004650115967, "learning_rate": 6.769063180827887e-07, "loss": 3.9055259227752686, "step": 1484 }, { "epoch": 3.235294117647059, "grad_norm": 2.4430906772613525, "learning_rate": 6.766884531590413e-07, "loss": 3.9254887104034424, "step": 1485 }, { "epoch": 3.237472766884532, "grad_norm": 2.2300074100494385, "learning_rate": 6.764705882352941e-07, "loss": 3.959890365600586, "step": 1486 }, { "epoch": 3.2396514161220042, "grad_norm": 1.9558796882629395, "learning_rate": 6.762527233115468e-07, "loss": 3.8937530517578125, "step": 1487 }, { "epoch": 3.241830065359477, "grad_norm": 2.2767794132232666, "learning_rate": 6.760348583877995e-07, "loss": 3.9783568382263184, "step": 1488 }, { "epoch": 3.24400871459695, "grad_norm": 2.624711036682129, "learning_rate": 6.758169934640523e-07, "loss": 4.084935665130615, "step": 1489 }, { "epoch": 3.2461873638344225, "grad_norm": 1.7028882503509521, "learning_rate": 6.75599128540305e-07, "loss": 3.835364818572998, "step": 1490 }, { "epoch": 3.2483660130718954, "grad_norm": 2.1316075325012207, "learning_rate": 6.753812636165576e-07, "loss": 3.9242265224456787, "step": 1491 }, { "epoch": 3.2505446623093683, "grad_norm": 1.8656738996505737, "learning_rate": 6.751633986928105e-07, "loss": 4.021694183349609, "step": 1492 }, { "epoch": 3.252723311546841, "grad_norm": 1.9259978532791138, "learning_rate": 6.749455337690631e-07, "loss": 4.043118000030518, "step": 1493 }, { "epoch": 3.2549019607843137, "grad_norm": 3.5886590480804443, "learning_rate": 6.747276688453159e-07, "loss": 3.9827680587768555, "step": 1494 }, { "epoch": 3.2570806100217866, "grad_norm": 2.063985586166382, "learning_rate": 6.745098039215686e-07, "loss": 3.9971508979797363, "step": 1495 }, { "epoch": 3.259259259259259, "grad_norm": 1.7345422506332397, "learning_rate": 6.742919389978214e-07, "loss": 4.035468101501465, "step": 1496 }, { "epoch": 3.261437908496732, "grad_norm": 2.570397138595581, "learning_rate": 6.74074074074074e-07, "loss": 4.049781799316406, "step": 1497 }, { "epoch": 3.263616557734205, "grad_norm": 2.1837961673736572, "learning_rate": 6.738562091503268e-07, "loss": 4.043085098266602, "step": 1498 }, { "epoch": 3.265795206971678, "grad_norm": 1.7857271432876587, "learning_rate": 6.736383442265795e-07, "loss": 3.888166904449463, "step": 1499 }, { "epoch": 3.2679738562091503, "grad_norm": 2.3214173316955566, "learning_rate": 6.734204793028322e-07, "loss": 3.952064275741577, "step": 1500 }, { "epoch": 3.270152505446623, "grad_norm": 1.8853458166122437, "learning_rate": 6.73202614379085e-07, "loss": 3.9791715145111084, "step": 1501 }, { "epoch": 3.272331154684096, "grad_norm": 1.7566629648208618, "learning_rate": 6.729847494553377e-07, "loss": 4.032092094421387, "step": 1502 }, { "epoch": 3.2745098039215685, "grad_norm": 1.8536627292633057, "learning_rate": 6.727668845315903e-07, "loss": 3.840273141860962, "step": 1503 }, { "epoch": 3.2766884531590414, "grad_norm": 2.4068148136138916, "learning_rate": 6.725490196078432e-07, "loss": 4.044948101043701, "step": 1504 }, { "epoch": 3.2788671023965144, "grad_norm": 2.162353038787842, "learning_rate": 6.723311546840958e-07, "loss": 3.9844062328338623, "step": 1505 }, { "epoch": 3.281045751633987, "grad_norm": 2.0968120098114014, "learning_rate": 6.721132897603486e-07, "loss": 4.043510437011719, "step": 1506 }, { "epoch": 3.2832244008714597, "grad_norm": 2.137989044189453, "learning_rate": 6.718954248366013e-07, "loss": 3.9521114826202393, "step": 1507 }, { "epoch": 3.2854030501089326, "grad_norm": 1.7624846696853638, "learning_rate": 6.716775599128541e-07, "loss": 3.870739698410034, "step": 1508 }, { "epoch": 3.287581699346405, "grad_norm": 1.9973706007003784, "learning_rate": 6.714596949891067e-07, "loss": 3.925558567047119, "step": 1509 }, { "epoch": 3.289760348583878, "grad_norm": 2.206732749938965, "learning_rate": 6.712418300653594e-07, "loss": 4.265423774719238, "step": 1510 }, { "epoch": 3.291938997821351, "grad_norm": 2.2106332778930664, "learning_rate": 6.710239651416122e-07, "loss": 4.0103607177734375, "step": 1511 }, { "epoch": 3.2941176470588234, "grad_norm": 2.2201740741729736, "learning_rate": 6.708061002178649e-07, "loss": 4.171445369720459, "step": 1512 }, { "epoch": 3.2962962962962963, "grad_norm": 2.3164618015289307, "learning_rate": 6.705882352941176e-07, "loss": 4.015255928039551, "step": 1513 }, { "epoch": 3.298474945533769, "grad_norm": 2.2501156330108643, "learning_rate": 6.703703703703704e-07, "loss": 3.875021457672119, "step": 1514 }, { "epoch": 3.3006535947712417, "grad_norm": 3.003483772277832, "learning_rate": 6.70152505446623e-07, "loss": 4.233852386474609, "step": 1515 }, { "epoch": 3.3028322440087146, "grad_norm": 2.2097790241241455, "learning_rate": 6.699346405228758e-07, "loss": 4.11963415145874, "step": 1516 }, { "epoch": 3.3050108932461875, "grad_norm": 2.0084714889526367, "learning_rate": 6.697167755991285e-07, "loss": 3.93745756149292, "step": 1517 }, { "epoch": 3.30718954248366, "grad_norm": 2.571908950805664, "learning_rate": 6.694989106753813e-07, "loss": 4.130604267120361, "step": 1518 }, { "epoch": 3.309368191721133, "grad_norm": 2.184589385986328, "learning_rate": 6.692810457516339e-07, "loss": 4.098086357116699, "step": 1519 }, { "epoch": 3.3115468409586057, "grad_norm": 2.054344654083252, "learning_rate": 6.690631808278868e-07, "loss": 3.9580912590026855, "step": 1520 }, { "epoch": 3.313725490196078, "grad_norm": 1.6485657691955566, "learning_rate": 6.688453159041394e-07, "loss": 3.8768773078918457, "step": 1521 }, { "epoch": 3.315904139433551, "grad_norm": 2.3110055923461914, "learning_rate": 6.686274509803921e-07, "loss": 4.221701145172119, "step": 1522 }, { "epoch": 3.318082788671024, "grad_norm": 2.9195892810821533, "learning_rate": 6.684095860566449e-07, "loss": 4.104700088500977, "step": 1523 }, { "epoch": 3.3202614379084965, "grad_norm": 1.6320812702178955, "learning_rate": 6.681917211328976e-07, "loss": 3.9556353092193604, "step": 1524 }, { "epoch": 3.3224400871459694, "grad_norm": 2.41438627243042, "learning_rate": 6.679738562091503e-07, "loss": 4.127208232879639, "step": 1525 }, { "epoch": 3.3246187363834423, "grad_norm": 2.023083209991455, "learning_rate": 6.677559912854031e-07, "loss": 4.0807414054870605, "step": 1526 }, { "epoch": 3.326797385620915, "grad_norm": 1.97068452835083, "learning_rate": 6.675381263616557e-07, "loss": 4.041322708129883, "step": 1527 }, { "epoch": 3.3289760348583877, "grad_norm": 2.0367727279663086, "learning_rate": 6.673202614379085e-07, "loss": 4.1124701499938965, "step": 1528 }, { "epoch": 3.3311546840958606, "grad_norm": 1.8039915561676025, "learning_rate": 6.671023965141612e-07, "loss": 4.039503574371338, "step": 1529 }, { "epoch": 3.3333333333333335, "grad_norm": 1.888081669807434, "learning_rate": 6.66884531590414e-07, "loss": 4.012820720672607, "step": 1530 }, { "epoch": 3.335511982570806, "grad_norm": 2.505558490753174, "learning_rate": 6.666666666666666e-07, "loss": 3.927823781967163, "step": 1531 }, { "epoch": 3.337690631808279, "grad_norm": 2.0372414588928223, "learning_rate": 6.664488017429194e-07, "loss": 4.023717403411865, "step": 1532 }, { "epoch": 3.3398692810457518, "grad_norm": 2.661311149597168, "learning_rate": 6.662309368191721e-07, "loss": 4.06866979598999, "step": 1533 }, { "epoch": 3.342047930283224, "grad_norm": 1.5344665050506592, "learning_rate": 6.660130718954247e-07, "loss": 3.9649288654327393, "step": 1534 }, { "epoch": 3.344226579520697, "grad_norm": 1.6241490840911865, "learning_rate": 6.657952069716776e-07, "loss": 3.928068161010742, "step": 1535 }, { "epoch": 3.34640522875817, "grad_norm": 2.3528263568878174, "learning_rate": 6.655773420479302e-07, "loss": 3.9143824577331543, "step": 1536 }, { "epoch": 3.348583877995643, "grad_norm": 1.9632515907287598, "learning_rate": 6.65359477124183e-07, "loss": 3.8156890869140625, "step": 1537 }, { "epoch": 3.3507625272331154, "grad_norm": 3.2820322513580322, "learning_rate": 6.651416122004357e-07, "loss": 4.249518871307373, "step": 1538 }, { "epoch": 3.3529411764705883, "grad_norm": 2.094752550125122, "learning_rate": 6.649237472766884e-07, "loss": 4.02429723739624, "step": 1539 }, { "epoch": 3.355119825708061, "grad_norm": 2.140718698501587, "learning_rate": 6.647058823529411e-07, "loss": 4.066493034362793, "step": 1540 }, { "epoch": 3.3572984749455337, "grad_norm": 2.306347131729126, "learning_rate": 6.644880174291939e-07, "loss": 4.193305015563965, "step": 1541 }, { "epoch": 3.3594771241830066, "grad_norm": 1.5505253076553345, "learning_rate": 6.642701525054466e-07, "loss": 3.8491575717926025, "step": 1542 }, { "epoch": 3.3616557734204795, "grad_norm": 2.7861762046813965, "learning_rate": 6.640522875816993e-07, "loss": 4.005647659301758, "step": 1543 }, { "epoch": 3.363834422657952, "grad_norm": 2.063473701477051, "learning_rate": 6.63834422657952e-07, "loss": 3.9723851680755615, "step": 1544 }, { "epoch": 3.366013071895425, "grad_norm": 2.2926836013793945, "learning_rate": 6.636165577342048e-07, "loss": 4.001410961151123, "step": 1545 }, { "epoch": 3.3681917211328978, "grad_norm": 2.3156063556671143, "learning_rate": 6.633986928104574e-07, "loss": 4.135824203491211, "step": 1546 }, { "epoch": 3.3703703703703702, "grad_norm": 2.199681282043457, "learning_rate": 6.631808278867103e-07, "loss": 4.076396942138672, "step": 1547 }, { "epoch": 3.372549019607843, "grad_norm": 2.3338046073913574, "learning_rate": 6.629629629629629e-07, "loss": 3.8646137714385986, "step": 1548 }, { "epoch": 3.374727668845316, "grad_norm": 2.6371309757232666, "learning_rate": 6.627450980392156e-07, "loss": 4.1135149002075195, "step": 1549 }, { "epoch": 3.3769063180827885, "grad_norm": 2.1767961978912354, "learning_rate": 6.625272331154684e-07, "loss": 3.9783413410186768, "step": 1550 }, { "epoch": 3.3790849673202614, "grad_norm": 1.67704176902771, "learning_rate": 6.623093681917211e-07, "loss": 3.9778172969818115, "step": 1551 }, { "epoch": 3.3812636165577343, "grad_norm": 1.868902325630188, "learning_rate": 6.620915032679738e-07, "loss": 3.8784031867980957, "step": 1552 }, { "epoch": 3.383442265795207, "grad_norm": 2.7169857025146484, "learning_rate": 6.618736383442266e-07, "loss": 4.1389570236206055, "step": 1553 }, { "epoch": 3.3856209150326797, "grad_norm": 2.425119400024414, "learning_rate": 6.616557734204793e-07, "loss": 4.022069931030273, "step": 1554 }, { "epoch": 3.3877995642701526, "grad_norm": 2.4268035888671875, "learning_rate": 6.61437908496732e-07, "loss": 4.085735321044922, "step": 1555 }, { "epoch": 3.389978213507625, "grad_norm": 3.2321271896362305, "learning_rate": 6.612200435729846e-07, "loss": 3.9282948970794678, "step": 1556 }, { "epoch": 3.392156862745098, "grad_norm": 2.680157423019409, "learning_rate": 6.610021786492375e-07, "loss": 4.200001239776611, "step": 1557 }, { "epoch": 3.394335511982571, "grad_norm": 2.182263135910034, "learning_rate": 6.607843137254901e-07, "loss": 4.136861324310303, "step": 1558 }, { "epoch": 3.3965141612200433, "grad_norm": 2.8656883239746094, "learning_rate": 6.605664488017429e-07, "loss": 4.139788627624512, "step": 1559 }, { "epoch": 3.3986928104575163, "grad_norm": 2.0742883682250977, "learning_rate": 6.603485838779956e-07, "loss": 4.075631141662598, "step": 1560 }, { "epoch": 3.400871459694989, "grad_norm": 2.4853737354278564, "learning_rate": 6.601307189542483e-07, "loss": 4.023271560668945, "step": 1561 }, { "epoch": 3.4030501089324616, "grad_norm": 1.9311538934707642, "learning_rate": 6.59912854030501e-07, "loss": 4.020327091217041, "step": 1562 }, { "epoch": 3.4052287581699345, "grad_norm": 1.7614953517913818, "learning_rate": 6.596949891067538e-07, "loss": 3.829974889755249, "step": 1563 }, { "epoch": 3.4074074074074074, "grad_norm": 1.8581902980804443, "learning_rate": 6.594771241830065e-07, "loss": 3.9073550701141357, "step": 1564 }, { "epoch": 3.4095860566448803, "grad_norm": 2.0828447341918945, "learning_rate": 6.592592592592592e-07, "loss": 3.9741294384002686, "step": 1565 }, { "epoch": 3.411764705882353, "grad_norm": 1.8962204456329346, "learning_rate": 6.59041394335512e-07, "loss": 4.074599742889404, "step": 1566 }, { "epoch": 3.4139433551198257, "grad_norm": 2.078425645828247, "learning_rate": 6.588235294117647e-07, "loss": 3.9482553005218506, "step": 1567 }, { "epoch": 3.4161220043572986, "grad_norm": 2.237056255340576, "learning_rate": 6.586056644880173e-07, "loss": 4.039628505706787, "step": 1568 }, { "epoch": 3.418300653594771, "grad_norm": 2.082669496536255, "learning_rate": 6.583877995642702e-07, "loss": 4.009921073913574, "step": 1569 }, { "epoch": 3.420479302832244, "grad_norm": 2.168520450592041, "learning_rate": 6.581699346405228e-07, "loss": 3.943776845932007, "step": 1570 }, { "epoch": 3.422657952069717, "grad_norm": 1.4159547090530396, "learning_rate": 6.579520697167756e-07, "loss": 3.8544585704803467, "step": 1571 }, { "epoch": 3.4248366013071894, "grad_norm": 2.00474214553833, "learning_rate": 6.577342047930283e-07, "loss": 3.9580631256103516, "step": 1572 }, { "epoch": 3.4270152505446623, "grad_norm": 2.26857590675354, "learning_rate": 6.57516339869281e-07, "loss": 4.070056438446045, "step": 1573 }, { "epoch": 3.429193899782135, "grad_norm": 2.4878299236297607, "learning_rate": 6.572984749455337e-07, "loss": 4.007462501525879, "step": 1574 }, { "epoch": 3.431372549019608, "grad_norm": 1.8639774322509766, "learning_rate": 6.570806100217865e-07, "loss": 4.008357524871826, "step": 1575 }, { "epoch": 3.4335511982570806, "grad_norm": 1.9396090507507324, "learning_rate": 6.568627450980392e-07, "loss": 4.016916275024414, "step": 1576 }, { "epoch": 3.4357298474945535, "grad_norm": 1.5934680700302124, "learning_rate": 6.566448801742919e-07, "loss": 3.928342819213867, "step": 1577 }, { "epoch": 3.4379084967320264, "grad_norm": 2.1414589881896973, "learning_rate": 6.564270152505447e-07, "loss": 3.968453884124756, "step": 1578 }, { "epoch": 3.440087145969499, "grad_norm": 2.2438576221466064, "learning_rate": 6.562091503267974e-07, "loss": 4.06458854675293, "step": 1579 }, { "epoch": 3.4422657952069717, "grad_norm": 1.9901607036590576, "learning_rate": 6.5599128540305e-07, "loss": 4.171742916107178, "step": 1580 }, { "epoch": 3.4444444444444446, "grad_norm": 1.8331940174102783, "learning_rate": 6.557734204793029e-07, "loss": 3.8754470348358154, "step": 1581 }, { "epoch": 3.446623093681917, "grad_norm": 2.0914177894592285, "learning_rate": 6.555555555555555e-07, "loss": 4.004255771636963, "step": 1582 }, { "epoch": 3.44880174291939, "grad_norm": 2.876354455947876, "learning_rate": 6.553376906318083e-07, "loss": 4.181742191314697, "step": 1583 }, { "epoch": 3.450980392156863, "grad_norm": 2.023615837097168, "learning_rate": 6.55119825708061e-07, "loss": 4.088826656341553, "step": 1584 }, { "epoch": 3.4531590413943354, "grad_norm": 2.431533098220825, "learning_rate": 6.549019607843137e-07, "loss": 4.167555332183838, "step": 1585 }, { "epoch": 3.4553376906318083, "grad_norm": 1.8348562717437744, "learning_rate": 6.546840958605664e-07, "loss": 4.009975433349609, "step": 1586 }, { "epoch": 3.457516339869281, "grad_norm": 1.830723524093628, "learning_rate": 6.544662309368192e-07, "loss": 4.041357040405273, "step": 1587 }, { "epoch": 3.4596949891067537, "grad_norm": 1.6335645914077759, "learning_rate": 6.542483660130719e-07, "loss": 3.937516450881958, "step": 1588 }, { "epoch": 3.4618736383442266, "grad_norm": 2.445364475250244, "learning_rate": 6.540305010893246e-07, "loss": 4.240857124328613, "step": 1589 }, { "epoch": 3.4640522875816995, "grad_norm": 2.011526584625244, "learning_rate": 6.538126361655773e-07, "loss": 3.9540042877197266, "step": 1590 }, { "epoch": 3.466230936819172, "grad_norm": 2.1766555309295654, "learning_rate": 6.535947712418301e-07, "loss": 3.942925453186035, "step": 1591 }, { "epoch": 3.468409586056645, "grad_norm": 1.6843113899230957, "learning_rate": 6.533769063180827e-07, "loss": 3.895901918411255, "step": 1592 }, { "epoch": 3.4705882352941178, "grad_norm": 2.2822437286376953, "learning_rate": 6.531590413943356e-07, "loss": 4.026219367980957, "step": 1593 }, { "epoch": 3.47276688453159, "grad_norm": 2.383024215698242, "learning_rate": 6.529411764705882e-07, "loss": 4.012460231781006, "step": 1594 }, { "epoch": 3.474945533769063, "grad_norm": 2.2795052528381348, "learning_rate": 6.52723311546841e-07, "loss": 4.051187515258789, "step": 1595 }, { "epoch": 3.477124183006536, "grad_norm": 1.9153189659118652, "learning_rate": 6.525054466230937e-07, "loss": 3.959873676300049, "step": 1596 }, { "epoch": 3.4793028322440085, "grad_norm": 2.128236770629883, "learning_rate": 6.522875816993464e-07, "loss": 4.075303554534912, "step": 1597 }, { "epoch": 3.4814814814814814, "grad_norm": 2.3419225215911865, "learning_rate": 6.520697167755991e-07, "loss": 4.051676273345947, "step": 1598 }, { "epoch": 3.4836601307189543, "grad_norm": 2.332866907119751, "learning_rate": 6.518518518518519e-07, "loss": 3.992755651473999, "step": 1599 }, { "epoch": 3.4858387799564268, "grad_norm": 1.965418815612793, "learning_rate": 6.516339869281046e-07, "loss": 3.9951491355895996, "step": 1600 }, { "epoch": 3.4880174291938997, "grad_norm": 2.243504524230957, "learning_rate": 6.514161220043572e-07, "loss": 4.092234134674072, "step": 1601 }, { "epoch": 3.4901960784313726, "grad_norm": 2.0637664794921875, "learning_rate": 6.5119825708061e-07, "loss": 4.103988170623779, "step": 1602 }, { "epoch": 3.4923747276688455, "grad_norm": 1.7377527952194214, "learning_rate": 6.509803921568627e-07, "loss": 3.857974052429199, "step": 1603 }, { "epoch": 3.494553376906318, "grad_norm": 1.6800857782363892, "learning_rate": 6.507625272331154e-07, "loss": 3.7909767627716064, "step": 1604 }, { "epoch": 3.496732026143791, "grad_norm": 1.8651247024536133, "learning_rate": 6.505446623093681e-07, "loss": 4.04780387878418, "step": 1605 }, { "epoch": 3.4989106753812638, "grad_norm": 2.1319353580474854, "learning_rate": 6.503267973856209e-07, "loss": 4.1095499992370605, "step": 1606 }, { "epoch": 3.5010893246187362, "grad_norm": 2.1939101219177246, "learning_rate": 6.501089324618735e-07, "loss": 4.110766410827637, "step": 1607 }, { "epoch": 3.503267973856209, "grad_norm": 1.6418050527572632, "learning_rate": 6.498910675381263e-07, "loss": 3.950441360473633, "step": 1608 }, { "epoch": 3.505446623093682, "grad_norm": 2.8158135414123535, "learning_rate": 6.49673202614379e-07, "loss": 4.126222610473633, "step": 1609 }, { "epoch": 3.507625272331155, "grad_norm": 2.3030331134796143, "learning_rate": 6.494553376906318e-07, "loss": 4.015524387359619, "step": 1610 }, { "epoch": 3.5098039215686274, "grad_norm": 1.7790753841400146, "learning_rate": 6.492374727668844e-07, "loss": 3.8940985202789307, "step": 1611 }, { "epoch": 3.5119825708061003, "grad_norm": 2.3494789600372314, "learning_rate": 6.490196078431373e-07, "loss": 4.128986358642578, "step": 1612 }, { "epoch": 3.5141612200435732, "grad_norm": 1.8010234832763672, "learning_rate": 6.488017429193899e-07, "loss": 3.9849460124969482, "step": 1613 }, { "epoch": 3.5163398692810457, "grad_norm": 2.452320098876953, "learning_rate": 6.485838779956426e-07, "loss": 4.011632919311523, "step": 1614 }, { "epoch": 3.5185185185185186, "grad_norm": 2.476057529449463, "learning_rate": 6.483660130718954e-07, "loss": 4.125460147857666, "step": 1615 }, { "epoch": 3.5206971677559915, "grad_norm": 2.1332316398620605, "learning_rate": 6.481481481481481e-07, "loss": 4.025632858276367, "step": 1616 }, { "epoch": 3.522875816993464, "grad_norm": 2.13411545753479, "learning_rate": 6.479302832244008e-07, "loss": 3.8503386974334717, "step": 1617 }, { "epoch": 3.525054466230937, "grad_norm": 2.262455463409424, "learning_rate": 6.477124183006536e-07, "loss": 3.9996774196624756, "step": 1618 }, { "epoch": 3.52723311546841, "grad_norm": 1.709065318107605, "learning_rate": 6.474945533769062e-07, "loss": 3.831688404083252, "step": 1619 }, { "epoch": 3.5294117647058822, "grad_norm": 2.84405255317688, "learning_rate": 6.47276688453159e-07, "loss": 4.018346309661865, "step": 1620 }, { "epoch": 3.531590413943355, "grad_norm": 2.19694185256958, "learning_rate": 6.470588235294117e-07, "loss": 3.977410316467285, "step": 1621 }, { "epoch": 3.533769063180828, "grad_norm": 2.1647722721099854, "learning_rate": 6.468409586056645e-07, "loss": 3.9577980041503906, "step": 1622 }, { "epoch": 3.5359477124183005, "grad_norm": 2.78039813041687, "learning_rate": 6.466230936819171e-07, "loss": 4.227453231811523, "step": 1623 }, { "epoch": 3.5381263616557734, "grad_norm": 2.5184903144836426, "learning_rate": 6.4640522875817e-07, "loss": 3.9822120666503906, "step": 1624 }, { "epoch": 3.5403050108932463, "grad_norm": 1.8542176485061646, "learning_rate": 6.461873638344226e-07, "loss": 4.102168560028076, "step": 1625 }, { "epoch": 3.542483660130719, "grad_norm": 2.1520121097564697, "learning_rate": 6.459694989106753e-07, "loss": 4.01803731918335, "step": 1626 }, { "epoch": 3.5446623093681917, "grad_norm": 1.8203444480895996, "learning_rate": 6.457516339869281e-07, "loss": 3.9714584350585938, "step": 1627 }, { "epoch": 3.5468409586056646, "grad_norm": 2.730771780014038, "learning_rate": 6.455337690631808e-07, "loss": 4.162755966186523, "step": 1628 }, { "epoch": 3.549019607843137, "grad_norm": 2.9564573764801025, "learning_rate": 6.453159041394335e-07, "loss": 4.139929294586182, "step": 1629 }, { "epoch": 3.55119825708061, "grad_norm": 2.138350009918213, "learning_rate": 6.450980392156863e-07, "loss": 3.9129207134246826, "step": 1630 }, { "epoch": 3.553376906318083, "grad_norm": 2.051333427429199, "learning_rate": 6.448801742919389e-07, "loss": 4.118600368499756, "step": 1631 }, { "epoch": 3.5555555555555554, "grad_norm": 2.745764970779419, "learning_rate": 6.446623093681917e-07, "loss": 4.038797378540039, "step": 1632 }, { "epoch": 3.5577342047930283, "grad_norm": 3.3742687702178955, "learning_rate": 6.444444444444444e-07, "loss": 4.156570911407471, "step": 1633 }, { "epoch": 3.559912854030501, "grad_norm": 2.086090087890625, "learning_rate": 6.442265795206972e-07, "loss": 4.026094436645508, "step": 1634 }, { "epoch": 3.5620915032679736, "grad_norm": 2.4045121669769287, "learning_rate": 6.440087145969498e-07, "loss": 4.118899822235107, "step": 1635 }, { "epoch": 3.5642701525054465, "grad_norm": 2.235346794128418, "learning_rate": 6.437908496732027e-07, "loss": 4.209275245666504, "step": 1636 }, { "epoch": 3.5664488017429194, "grad_norm": 2.040891408920288, "learning_rate": 6.435729847494553e-07, "loss": 4.064647197723389, "step": 1637 }, { "epoch": 3.568627450980392, "grad_norm": 2.6607587337493896, "learning_rate": 6.43355119825708e-07, "loss": 4.131723880767822, "step": 1638 }, { "epoch": 3.570806100217865, "grad_norm": 2.5646963119506836, "learning_rate": 6.431372549019608e-07, "loss": 4.078649997711182, "step": 1639 }, { "epoch": 3.5729847494553377, "grad_norm": 2.4059865474700928, "learning_rate": 6.429193899782135e-07, "loss": 4.298575401306152, "step": 1640 }, { "epoch": 3.57516339869281, "grad_norm": 1.781819224357605, "learning_rate": 6.427015250544662e-07, "loss": 3.9318175315856934, "step": 1641 }, { "epoch": 3.577342047930283, "grad_norm": 2.1094720363616943, "learning_rate": 6.42483660130719e-07, "loss": 4.031168460845947, "step": 1642 }, { "epoch": 3.579520697167756, "grad_norm": 1.7770715951919556, "learning_rate": 6.422657952069716e-07, "loss": 4.077695369720459, "step": 1643 }, { "epoch": 3.581699346405229, "grad_norm": 1.9985582828521729, "learning_rate": 6.420479302832244e-07, "loss": 4.011483669281006, "step": 1644 }, { "epoch": 3.5838779956427014, "grad_norm": 2.859800338745117, "learning_rate": 6.418300653594771e-07, "loss": 4.141329288482666, "step": 1645 }, { "epoch": 3.5860566448801743, "grad_norm": 1.848609447479248, "learning_rate": 6.416122004357299e-07, "loss": 3.975315809249878, "step": 1646 }, { "epoch": 3.588235294117647, "grad_norm": 2.7798187732696533, "learning_rate": 6.413943355119825e-07, "loss": 4.100983619689941, "step": 1647 }, { "epoch": 3.59041394335512, "grad_norm": 2.1976139545440674, "learning_rate": 6.411764705882354e-07, "loss": 4.019747734069824, "step": 1648 }, { "epoch": 3.5925925925925926, "grad_norm": 2.018017292022705, "learning_rate": 6.40958605664488e-07, "loss": 3.9968926906585693, "step": 1649 }, { "epoch": 3.5947712418300655, "grad_norm": 2.4165799617767334, "learning_rate": 6.407407407407407e-07, "loss": 4.136794567108154, "step": 1650 }, { "epoch": 3.5969498910675384, "grad_norm": 2.8577213287353516, "learning_rate": 6.405228758169934e-07, "loss": 4.261354446411133, "step": 1651 }, { "epoch": 3.599128540305011, "grad_norm": 2.382066488265991, "learning_rate": 6.403050108932462e-07, "loss": 4.108253479003906, "step": 1652 }, { "epoch": 3.6013071895424837, "grad_norm": 2.375784158706665, "learning_rate": 6.400871459694989e-07, "loss": 3.891619920730591, "step": 1653 }, { "epoch": 3.6034858387799567, "grad_norm": 2.0734424591064453, "learning_rate": 6.398692810457516e-07, "loss": 3.95424222946167, "step": 1654 }, { "epoch": 3.605664488017429, "grad_norm": 2.579802989959717, "learning_rate": 6.396514161220043e-07, "loss": 4.118511199951172, "step": 1655 }, { "epoch": 3.607843137254902, "grad_norm": 1.714768409729004, "learning_rate": 6.394335511982571e-07, "loss": 4.067865371704102, "step": 1656 }, { "epoch": 3.610021786492375, "grad_norm": 1.9601842164993286, "learning_rate": 6.392156862745097e-07, "loss": 4.043153762817383, "step": 1657 }, { "epoch": 3.6122004357298474, "grad_norm": 2.1463890075683594, "learning_rate": 6.389978213507626e-07, "loss": 3.9242019653320312, "step": 1658 }, { "epoch": 3.6143790849673203, "grad_norm": 1.6684367656707764, "learning_rate": 6.387799564270152e-07, "loss": 3.933436155319214, "step": 1659 }, { "epoch": 3.616557734204793, "grad_norm": 2.6636037826538086, "learning_rate": 6.385620915032679e-07, "loss": 4.119812488555908, "step": 1660 }, { "epoch": 3.6187363834422657, "grad_norm": 1.6350538730621338, "learning_rate": 6.383442265795207e-07, "loss": 3.9082536697387695, "step": 1661 }, { "epoch": 3.6209150326797386, "grad_norm": 1.5400785207748413, "learning_rate": 6.381263616557734e-07, "loss": 3.9048826694488525, "step": 1662 }, { "epoch": 3.6230936819172115, "grad_norm": 2.743635892868042, "learning_rate": 6.379084967320261e-07, "loss": 4.2281012535095215, "step": 1663 }, { "epoch": 3.625272331154684, "grad_norm": 2.2308762073516846, "learning_rate": 6.376906318082789e-07, "loss": 4.013498306274414, "step": 1664 }, { "epoch": 3.627450980392157, "grad_norm": 1.9672714471817017, "learning_rate": 6.374727668845316e-07, "loss": 3.9958102703094482, "step": 1665 }, { "epoch": 3.6296296296296298, "grad_norm": 2.1728603839874268, "learning_rate": 6.372549019607843e-07, "loss": 3.8950469493865967, "step": 1666 }, { "epoch": 3.6318082788671022, "grad_norm": 1.8587826490402222, "learning_rate": 6.37037037037037e-07, "loss": 3.9745945930480957, "step": 1667 }, { "epoch": 3.633986928104575, "grad_norm": 1.9376271963119507, "learning_rate": 6.368191721132898e-07, "loss": 3.9442098140716553, "step": 1668 }, { "epoch": 3.636165577342048, "grad_norm": 2.0401036739349365, "learning_rate": 6.366013071895424e-07, "loss": 4.072229862213135, "step": 1669 }, { "epoch": 3.6383442265795205, "grad_norm": 1.7772295475006104, "learning_rate": 6.363834422657953e-07, "loss": 4.0096893310546875, "step": 1670 }, { "epoch": 3.6405228758169934, "grad_norm": 2.543602466583252, "learning_rate": 6.361655773420479e-07, "loss": 4.045433521270752, "step": 1671 }, { "epoch": 3.6427015250544663, "grad_norm": 1.73964524269104, "learning_rate": 6.359477124183005e-07, "loss": 4.098751068115234, "step": 1672 }, { "epoch": 3.644880174291939, "grad_norm": 1.8447500467300415, "learning_rate": 6.357298474945534e-07, "loss": 3.8695967197418213, "step": 1673 }, { "epoch": 3.6470588235294117, "grad_norm": 2.006380081176758, "learning_rate": 6.35511982570806e-07, "loss": 3.9636082649230957, "step": 1674 }, { "epoch": 3.6492374727668846, "grad_norm": 4.539744853973389, "learning_rate": 6.352941176470588e-07, "loss": 4.383431911468506, "step": 1675 }, { "epoch": 3.651416122004357, "grad_norm": 2.9229650497436523, "learning_rate": 6.350762527233115e-07, "loss": 4.183969497680664, "step": 1676 }, { "epoch": 3.65359477124183, "grad_norm": 2.1762044429779053, "learning_rate": 6.348583877995642e-07, "loss": 3.8164174556732178, "step": 1677 }, { "epoch": 3.655773420479303, "grad_norm": 2.534754514694214, "learning_rate": 6.346405228758169e-07, "loss": 4.118279457092285, "step": 1678 }, { "epoch": 3.6579520697167753, "grad_norm": 3.1890275478363037, "learning_rate": 6.344226579520697e-07, "loss": 4.120993614196777, "step": 1679 }, { "epoch": 3.6601307189542482, "grad_norm": 2.6981887817382812, "learning_rate": 6.342047930283224e-07, "loss": 3.988602876663208, "step": 1680 }, { "epoch": 3.662309368191721, "grad_norm": 2.122682809829712, "learning_rate": 6.339869281045751e-07, "loss": 3.944340467453003, "step": 1681 }, { "epoch": 3.664488017429194, "grad_norm": 2.3690590858459473, "learning_rate": 6.337690631808279e-07, "loss": 4.13071870803833, "step": 1682 }, { "epoch": 3.6666666666666665, "grad_norm": 3.523449420928955, "learning_rate": 6.335511982570806e-07, "loss": 4.224156856536865, "step": 1683 }, { "epoch": 3.6688453159041394, "grad_norm": 2.3078808784484863, "learning_rate": 6.333333333333332e-07, "loss": 4.104690074920654, "step": 1684 }, { "epoch": 3.6710239651416123, "grad_norm": 2.1735410690307617, "learning_rate": 6.331154684095861e-07, "loss": 3.94840145111084, "step": 1685 }, { "epoch": 3.6732026143790852, "grad_norm": 2.2892837524414062, "learning_rate": 6.328976034858387e-07, "loss": 3.975210189819336, "step": 1686 }, { "epoch": 3.6753812636165577, "grad_norm": 2.0813169479370117, "learning_rate": 6.326797385620915e-07, "loss": 3.8892881870269775, "step": 1687 }, { "epoch": 3.6775599128540306, "grad_norm": 2.035581111907959, "learning_rate": 6.324618736383442e-07, "loss": 3.843644380569458, "step": 1688 }, { "epoch": 3.6797385620915035, "grad_norm": 2.1441924571990967, "learning_rate": 6.322440087145969e-07, "loss": 4.049313545227051, "step": 1689 }, { "epoch": 3.681917211328976, "grad_norm": 2.145218849182129, "learning_rate": 6.320261437908496e-07, "loss": 3.9136717319488525, "step": 1690 }, { "epoch": 3.684095860566449, "grad_norm": 2.6168575286865234, "learning_rate": 6.318082788671024e-07, "loss": 3.874211311340332, "step": 1691 }, { "epoch": 3.686274509803922, "grad_norm": 2.573988914489746, "learning_rate": 6.315904139433551e-07, "loss": 4.208591938018799, "step": 1692 }, { "epoch": 3.6884531590413943, "grad_norm": 2.4346320629119873, "learning_rate": 6.313725490196078e-07, "loss": 4.053732395172119, "step": 1693 }, { "epoch": 3.690631808278867, "grad_norm": 1.7669482231140137, "learning_rate": 6.311546840958606e-07, "loss": 3.9012508392333984, "step": 1694 }, { "epoch": 3.69281045751634, "grad_norm": 2.095820665359497, "learning_rate": 6.309368191721133e-07, "loss": 4.0758256912231445, "step": 1695 }, { "epoch": 3.6949891067538125, "grad_norm": 2.8387928009033203, "learning_rate": 6.307189542483659e-07, "loss": 4.140333652496338, "step": 1696 }, { "epoch": 3.6971677559912854, "grad_norm": 1.919303297996521, "learning_rate": 6.305010893246188e-07, "loss": 3.978070020675659, "step": 1697 }, { "epoch": 3.6993464052287583, "grad_norm": 2.0941810607910156, "learning_rate": 6.302832244008714e-07, "loss": 3.955519914627075, "step": 1698 }, { "epoch": 3.701525054466231, "grad_norm": 2.208418130874634, "learning_rate": 6.300653594771242e-07, "loss": 4.099648952484131, "step": 1699 }, { "epoch": 3.7037037037037037, "grad_norm": 1.8822627067565918, "learning_rate": 6.298474945533768e-07, "loss": 4.024240016937256, "step": 1700 }, { "epoch": 3.7058823529411766, "grad_norm": 2.0224697589874268, "learning_rate": 6.296296296296296e-07, "loss": 4.023405075073242, "step": 1701 }, { "epoch": 3.708061002178649, "grad_norm": 2.393078088760376, "learning_rate": 6.294117647058823e-07, "loss": 4.095856189727783, "step": 1702 }, { "epoch": 3.710239651416122, "grad_norm": 1.875871181488037, "learning_rate": 6.29193899782135e-07, "loss": 3.928438663482666, "step": 1703 }, { "epoch": 3.712418300653595, "grad_norm": 2.4513442516326904, "learning_rate": 6.289760348583878e-07, "loss": 4.145753860473633, "step": 1704 }, { "epoch": 3.7145969498910674, "grad_norm": 2.11932373046875, "learning_rate": 6.287581699346405e-07, "loss": 3.8494873046875, "step": 1705 }, { "epoch": 3.7167755991285403, "grad_norm": 1.6206536293029785, "learning_rate": 6.285403050108931e-07, "loss": 3.8494253158569336, "step": 1706 }, { "epoch": 3.718954248366013, "grad_norm": 2.2260022163391113, "learning_rate": 6.28322440087146e-07, "loss": 4.088474273681641, "step": 1707 }, { "epoch": 3.7211328976034856, "grad_norm": 1.7367931604385376, "learning_rate": 6.281045751633986e-07, "loss": 3.8819549083709717, "step": 1708 }, { "epoch": 3.7233115468409586, "grad_norm": 1.8777233362197876, "learning_rate": 6.278867102396514e-07, "loss": 3.881998062133789, "step": 1709 }, { "epoch": 3.7254901960784315, "grad_norm": 1.9312036037445068, "learning_rate": 6.276688453159041e-07, "loss": 4.017609119415283, "step": 1710 }, { "epoch": 3.727668845315904, "grad_norm": 1.7141090631484985, "learning_rate": 6.274509803921569e-07, "loss": 3.923773765563965, "step": 1711 }, { "epoch": 3.729847494553377, "grad_norm": 2.4343507289886475, "learning_rate": 6.272331154684095e-07, "loss": 4.0891923904418945, "step": 1712 }, { "epoch": 3.7320261437908497, "grad_norm": 1.8762233257293701, "learning_rate": 6.270152505446623e-07, "loss": 3.9274184703826904, "step": 1713 }, { "epoch": 3.734204793028322, "grad_norm": 1.9290213584899902, "learning_rate": 6.26797385620915e-07, "loss": 3.9421050548553467, "step": 1714 }, { "epoch": 3.736383442265795, "grad_norm": 2.2159423828125, "learning_rate": 6.265795206971677e-07, "loss": 3.959439277648926, "step": 1715 }, { "epoch": 3.738562091503268, "grad_norm": 1.7754744291305542, "learning_rate": 6.263616557734205e-07, "loss": 4.004345893859863, "step": 1716 }, { "epoch": 3.7407407407407405, "grad_norm": 2.727639675140381, "learning_rate": 6.261437908496732e-07, "loss": 4.254756927490234, "step": 1717 }, { "epoch": 3.7429193899782134, "grad_norm": 1.659976840019226, "learning_rate": 6.259259259259258e-07, "loss": 3.775618314743042, "step": 1718 }, { "epoch": 3.7450980392156863, "grad_norm": 1.6151018142700195, "learning_rate": 6.257080610021787e-07, "loss": 3.799846887588501, "step": 1719 }, { "epoch": 3.747276688453159, "grad_norm": 3.7324047088623047, "learning_rate": 6.254901960784313e-07, "loss": 4.279714584350586, "step": 1720 }, { "epoch": 3.7494553376906317, "grad_norm": 3.356847047805786, "learning_rate": 6.252723311546841e-07, "loss": 4.156373977661133, "step": 1721 }, { "epoch": 3.7516339869281046, "grad_norm": 2.603149652481079, "learning_rate": 6.250544662309368e-07, "loss": 4.145864963531494, "step": 1722 }, { "epoch": 3.7538126361655775, "grad_norm": 2.234229326248169, "learning_rate": 6.248366013071896e-07, "loss": 4.139506816864014, "step": 1723 }, { "epoch": 3.7559912854030504, "grad_norm": 2.773479461669922, "learning_rate": 6.246187363834422e-07, "loss": 4.172159671783447, "step": 1724 }, { "epoch": 3.758169934640523, "grad_norm": 1.907021403312683, "learning_rate": 6.24400871459695e-07, "loss": 4.069182872772217, "step": 1725 }, { "epoch": 3.7603485838779958, "grad_norm": 2.052272319793701, "learning_rate": 6.241830065359477e-07, "loss": 3.950011730194092, "step": 1726 }, { "epoch": 3.7625272331154687, "grad_norm": 2.472555160522461, "learning_rate": 6.239651416122004e-07, "loss": 3.855431318283081, "step": 1727 }, { "epoch": 3.764705882352941, "grad_norm": 3.147519588470459, "learning_rate": 6.237472766884532e-07, "loss": 4.226633548736572, "step": 1728 }, { "epoch": 3.766884531590414, "grad_norm": 2.5070993900299072, "learning_rate": 6.235294117647059e-07, "loss": 4.183052062988281, "step": 1729 }, { "epoch": 3.769063180827887, "grad_norm": 2.2064385414123535, "learning_rate": 6.233115468409585e-07, "loss": 4.082653045654297, "step": 1730 }, { "epoch": 3.7712418300653594, "grad_norm": 1.5235601663589478, "learning_rate": 6.230936819172114e-07, "loss": 4.049082279205322, "step": 1731 }, { "epoch": 3.7734204793028323, "grad_norm": 2.2480363845825195, "learning_rate": 6.22875816993464e-07, "loss": 4.047230243682861, "step": 1732 }, { "epoch": 3.775599128540305, "grad_norm": 2.148350954055786, "learning_rate": 6.226579520697168e-07, "loss": 4.110304832458496, "step": 1733 }, { "epoch": 3.7777777777777777, "grad_norm": 2.467670440673828, "learning_rate": 6.224400871459695e-07, "loss": 3.990419864654541, "step": 1734 }, { "epoch": 3.7799564270152506, "grad_norm": 1.7682873010635376, "learning_rate": 6.222222222222223e-07, "loss": 3.916529417037964, "step": 1735 }, { "epoch": 3.7821350762527235, "grad_norm": 1.9989807605743408, "learning_rate": 6.220043572984749e-07, "loss": 4.025254726409912, "step": 1736 }, { "epoch": 3.784313725490196, "grad_norm": 2.335503578186035, "learning_rate": 6.217864923747277e-07, "loss": 4.217721939086914, "step": 1737 }, { "epoch": 3.786492374727669, "grad_norm": 2.2717456817626953, "learning_rate": 6.215686274509804e-07, "loss": 4.05740213394165, "step": 1738 }, { "epoch": 3.7886710239651418, "grad_norm": 1.4826730489730835, "learning_rate": 6.21350762527233e-07, "loss": 3.891862154006958, "step": 1739 }, { "epoch": 3.7908496732026142, "grad_norm": 1.690864086151123, "learning_rate": 6.211328976034859e-07, "loss": 3.852081537246704, "step": 1740 }, { "epoch": 3.793028322440087, "grad_norm": 2.0737998485565186, "learning_rate": 6.209150326797385e-07, "loss": 3.9603476524353027, "step": 1741 }, { "epoch": 3.79520697167756, "grad_norm": 2.350660562515259, "learning_rate": 6.206971677559912e-07, "loss": 3.988795280456543, "step": 1742 }, { "epoch": 3.7973856209150325, "grad_norm": 2.00583815574646, "learning_rate": 6.20479302832244e-07, "loss": 4.043881893157959, "step": 1743 }, { "epoch": 3.7995642701525054, "grad_norm": 1.736445665359497, "learning_rate": 6.202614379084967e-07, "loss": 3.979602575302124, "step": 1744 }, { "epoch": 3.8017429193899783, "grad_norm": 1.9559158086776733, "learning_rate": 6.200435729847494e-07, "loss": 3.872722625732422, "step": 1745 }, { "epoch": 3.803921568627451, "grad_norm": 2.1777803897857666, "learning_rate": 6.198257080610022e-07, "loss": 4.221410751342773, "step": 1746 }, { "epoch": 3.8061002178649237, "grad_norm": 2.152864933013916, "learning_rate": 6.196078431372548e-07, "loss": 3.958966016769409, "step": 1747 }, { "epoch": 3.8082788671023966, "grad_norm": 2.082481861114502, "learning_rate": 6.193899782135076e-07, "loss": 4.119745254516602, "step": 1748 }, { "epoch": 3.810457516339869, "grad_norm": 2.2072396278381348, "learning_rate": 6.191721132897602e-07, "loss": 4.008877754211426, "step": 1749 }, { "epoch": 3.812636165577342, "grad_norm": 2.1637251377105713, "learning_rate": 6.189542483660131e-07, "loss": 3.994089365005493, "step": 1750 }, { "epoch": 3.814814814814815, "grad_norm": 2.302915573120117, "learning_rate": 6.187363834422657e-07, "loss": 3.97430682182312, "step": 1751 }, { "epoch": 3.8169934640522873, "grad_norm": 1.9125581979751587, "learning_rate": 6.185185185185185e-07, "loss": 4.036306381225586, "step": 1752 }, { "epoch": 3.8191721132897603, "grad_norm": 1.9290575981140137, "learning_rate": 6.183006535947712e-07, "loss": 3.9534454345703125, "step": 1753 }, { "epoch": 3.821350762527233, "grad_norm": 2.8575313091278076, "learning_rate": 6.180827886710239e-07, "loss": 4.1254072189331055, "step": 1754 }, { "epoch": 3.8235294117647056, "grad_norm": 2.5597846508026123, "learning_rate": 6.178649237472766e-07, "loss": 4.1224164962768555, "step": 1755 }, { "epoch": 3.8257080610021785, "grad_norm": 1.7716273069381714, "learning_rate": 6.176470588235294e-07, "loss": 3.915372133255005, "step": 1756 }, { "epoch": 3.8278867102396514, "grad_norm": 3.0065836906433105, "learning_rate": 6.174291938997821e-07, "loss": 4.145368576049805, "step": 1757 }, { "epoch": 3.8300653594771243, "grad_norm": 1.5936673879623413, "learning_rate": 6.172113289760348e-07, "loss": 3.8635075092315674, "step": 1758 }, { "epoch": 3.832244008714597, "grad_norm": 1.9724961519241333, "learning_rate": 6.169934640522875e-07, "loss": 4.043174743652344, "step": 1759 }, { "epoch": 3.8344226579520697, "grad_norm": 2.055016040802002, "learning_rate": 6.167755991285403e-07, "loss": 4.008939743041992, "step": 1760 }, { "epoch": 3.8366013071895426, "grad_norm": 2.0414562225341797, "learning_rate": 6.165577342047929e-07, "loss": 4.069127082824707, "step": 1761 }, { "epoch": 3.8387799564270155, "grad_norm": 2.112516164779663, "learning_rate": 6.163398692810458e-07, "loss": 4.0863189697265625, "step": 1762 }, { "epoch": 3.840958605664488, "grad_norm": 1.7942603826522827, "learning_rate": 6.161220043572984e-07, "loss": 3.93813419342041, "step": 1763 }, { "epoch": 3.843137254901961, "grad_norm": 2.0174758434295654, "learning_rate": 6.159041394335511e-07, "loss": 3.9381344318389893, "step": 1764 }, { "epoch": 3.845315904139434, "grad_norm": 2.214595317840576, "learning_rate": 6.156862745098039e-07, "loss": 4.014326572418213, "step": 1765 }, { "epoch": 3.8474945533769063, "grad_norm": 2.1071479320526123, "learning_rate": 6.154684095860566e-07, "loss": 4.021290302276611, "step": 1766 }, { "epoch": 3.849673202614379, "grad_norm": 2.4200565814971924, "learning_rate": 6.152505446623093e-07, "loss": 4.159715175628662, "step": 1767 }, { "epoch": 3.851851851851852, "grad_norm": 2.7638680934906006, "learning_rate": 6.150326797385621e-07, "loss": 4.278736114501953, "step": 1768 }, { "epoch": 3.8540305010893245, "grad_norm": 2.3783159255981445, "learning_rate": 6.148148148148148e-07, "loss": 3.9321460723876953, "step": 1769 }, { "epoch": 3.8562091503267975, "grad_norm": 2.1348562240600586, "learning_rate": 6.145969498910675e-07, "loss": 3.9610700607299805, "step": 1770 }, { "epoch": 3.8583877995642704, "grad_norm": 2.5196473598480225, "learning_rate": 6.143790849673202e-07, "loss": 4.076618671417236, "step": 1771 }, { "epoch": 3.860566448801743, "grad_norm": 2.013888359069824, "learning_rate": 6.14161220043573e-07, "loss": 3.9635169506073, "step": 1772 }, { "epoch": 3.8627450980392157, "grad_norm": 2.16361403465271, "learning_rate": 6.139433551198256e-07, "loss": 3.9959752559661865, "step": 1773 }, { "epoch": 3.8649237472766886, "grad_norm": 2.394536018371582, "learning_rate": 6.137254901960785e-07, "loss": 4.135343551635742, "step": 1774 }, { "epoch": 3.867102396514161, "grad_norm": 2.1735332012176514, "learning_rate": 6.135076252723311e-07, "loss": 4.00130558013916, "step": 1775 }, { "epoch": 3.869281045751634, "grad_norm": 2.3322250843048096, "learning_rate": 6.132897603485838e-07, "loss": 4.021347999572754, "step": 1776 }, { "epoch": 3.871459694989107, "grad_norm": 1.8369220495224, "learning_rate": 6.130718954248366e-07, "loss": 3.9298174381256104, "step": 1777 }, { "epoch": 3.8736383442265794, "grad_norm": 2.1270298957824707, "learning_rate": 6.128540305010893e-07, "loss": 4.038597106933594, "step": 1778 }, { "epoch": 3.8758169934640523, "grad_norm": 2.308173656463623, "learning_rate": 6.12636165577342e-07, "loss": 4.085627555847168, "step": 1779 }, { "epoch": 3.877995642701525, "grad_norm": 2.36142897605896, "learning_rate": 6.124183006535948e-07, "loss": 3.9148573875427246, "step": 1780 }, { "epoch": 3.8801742919389977, "grad_norm": 2.231189012527466, "learning_rate": 6.122004357298475e-07, "loss": 4.12549352645874, "step": 1781 }, { "epoch": 3.8823529411764706, "grad_norm": 2.2756946086883545, "learning_rate": 6.119825708061002e-07, "loss": 3.794858932495117, "step": 1782 }, { "epoch": 3.8845315904139435, "grad_norm": 2.440094232559204, "learning_rate": 6.117647058823529e-07, "loss": 3.9510741233825684, "step": 1783 }, { "epoch": 3.886710239651416, "grad_norm": 2.3200080394744873, "learning_rate": 6.115468409586057e-07, "loss": 3.970952272415161, "step": 1784 }, { "epoch": 3.888888888888889, "grad_norm": 2.431164503097534, "learning_rate": 6.113289760348583e-07, "loss": 3.936002016067505, "step": 1785 }, { "epoch": 3.8910675381263617, "grad_norm": 3.404175281524658, "learning_rate": 6.111111111111112e-07, "loss": 4.2217020988464355, "step": 1786 }, { "epoch": 3.893246187363834, "grad_norm": 2.4005918502807617, "learning_rate": 6.108932461873638e-07, "loss": 4.099236488342285, "step": 1787 }, { "epoch": 3.895424836601307, "grad_norm": 2.274719476699829, "learning_rate": 6.106753812636165e-07, "loss": 4.065813064575195, "step": 1788 }, { "epoch": 3.89760348583878, "grad_norm": 2.091095447540283, "learning_rate": 6.104575163398693e-07, "loss": 4.0143303871154785, "step": 1789 }, { "epoch": 3.8997821350762525, "grad_norm": 2.1476387977600098, "learning_rate": 6.10239651416122e-07, "loss": 3.810451030731201, "step": 1790 }, { "epoch": 3.9019607843137254, "grad_norm": 2.1897993087768555, "learning_rate": 6.100217864923747e-07, "loss": 4.1004252433776855, "step": 1791 }, { "epoch": 3.9041394335511983, "grad_norm": 2.321040391921997, "learning_rate": 6.098039215686275e-07, "loss": 4.040799617767334, "step": 1792 }, { "epoch": 3.9063180827886708, "grad_norm": 2.7416083812713623, "learning_rate": 6.095860566448802e-07, "loss": 4.285754680633545, "step": 1793 }, { "epoch": 3.9084967320261437, "grad_norm": 1.9719347953796387, "learning_rate": 6.093681917211329e-07, "loss": 4.110438346862793, "step": 1794 }, { "epoch": 3.9106753812636166, "grad_norm": 1.7636198997497559, "learning_rate": 6.091503267973855e-07, "loss": 3.9557507038116455, "step": 1795 }, { "epoch": 3.9128540305010895, "grad_norm": 2.5199568271636963, "learning_rate": 6.089324618736384e-07, "loss": 4.138791561126709, "step": 1796 }, { "epoch": 3.915032679738562, "grad_norm": 2.176243543624878, "learning_rate": 6.08714596949891e-07, "loss": 3.962369680404663, "step": 1797 }, { "epoch": 3.917211328976035, "grad_norm": 2.005007028579712, "learning_rate": 6.084967320261438e-07, "loss": 3.89569354057312, "step": 1798 }, { "epoch": 3.9193899782135078, "grad_norm": 2.1093266010284424, "learning_rate": 6.082788671023965e-07, "loss": 3.748824119567871, "step": 1799 }, { "epoch": 3.9215686274509802, "grad_norm": 2.2011890411376953, "learning_rate": 6.080610021786492e-07, "loss": 4.017143726348877, "step": 1800 }, { "epoch": 3.923747276688453, "grad_norm": 2.2300422191619873, "learning_rate": 6.078431372549019e-07, "loss": 3.925917625427246, "step": 1801 }, { "epoch": 3.925925925925926, "grad_norm": 2.0129852294921875, "learning_rate": 6.076252723311547e-07, "loss": 3.8492233753204346, "step": 1802 }, { "epoch": 3.928104575163399, "grad_norm": 2.725393772125244, "learning_rate": 6.074074074074074e-07, "loss": 4.022107124328613, "step": 1803 }, { "epoch": 3.9302832244008714, "grad_norm": 2.240007162094116, "learning_rate": 6.071895424836601e-07, "loss": 4.069840431213379, "step": 1804 }, { "epoch": 3.9324618736383443, "grad_norm": 2.916633367538452, "learning_rate": 6.069716775599128e-07, "loss": 4.184571266174316, "step": 1805 }, { "epoch": 3.9346405228758172, "grad_norm": 2.869912624359131, "learning_rate": 6.067538126361656e-07, "loss": 4.064949035644531, "step": 1806 }, { "epoch": 3.9368191721132897, "grad_norm": 2.2792270183563232, "learning_rate": 6.065359477124182e-07, "loss": 3.921118974685669, "step": 1807 }, { "epoch": 3.9389978213507626, "grad_norm": 1.9473665952682495, "learning_rate": 6.063180827886711e-07, "loss": 3.9016194343566895, "step": 1808 }, { "epoch": 3.9411764705882355, "grad_norm": 1.7135004997253418, "learning_rate": 6.061002178649237e-07, "loss": 3.9680402278900146, "step": 1809 }, { "epoch": 3.943355119825708, "grad_norm": 1.9967173337936401, "learning_rate": 6.058823529411763e-07, "loss": 3.9364984035491943, "step": 1810 }, { "epoch": 3.945533769063181, "grad_norm": 1.6552560329437256, "learning_rate": 6.056644880174292e-07, "loss": 3.9183502197265625, "step": 1811 }, { "epoch": 3.947712418300654, "grad_norm": 2.13411545753479, "learning_rate": 6.054466230936818e-07, "loss": 4.048283576965332, "step": 1812 }, { "epoch": 3.9498910675381262, "grad_norm": 2.067193031311035, "learning_rate": 6.052287581699346e-07, "loss": 3.876558303833008, "step": 1813 }, { "epoch": 3.952069716775599, "grad_norm": 2.1471149921417236, "learning_rate": 6.050108932461873e-07, "loss": 3.962038278579712, "step": 1814 }, { "epoch": 3.954248366013072, "grad_norm": 2.019684314727783, "learning_rate": 6.047930283224401e-07, "loss": 4.16124153137207, "step": 1815 }, { "epoch": 3.9564270152505445, "grad_norm": 2.7269413471221924, "learning_rate": 6.045751633986927e-07, "loss": 4.245613098144531, "step": 1816 }, { "epoch": 3.9586056644880174, "grad_norm": 1.6143351793289185, "learning_rate": 6.043572984749455e-07, "loss": 3.9370615482330322, "step": 1817 }, { "epoch": 3.9607843137254903, "grad_norm": 2.1390585899353027, "learning_rate": 6.041394335511982e-07, "loss": 4.042698383331299, "step": 1818 }, { "epoch": 3.962962962962963, "grad_norm": 2.133162260055542, "learning_rate": 6.039215686274509e-07, "loss": 4.249001502990723, "step": 1819 }, { "epoch": 3.9651416122004357, "grad_norm": 2.6259710788726807, "learning_rate": 6.037037037037037e-07, "loss": 3.960115432739258, "step": 1820 }, { "epoch": 3.9673202614379086, "grad_norm": 2.0155632495880127, "learning_rate": 6.034858387799564e-07, "loss": 4.070213794708252, "step": 1821 }, { "epoch": 3.969498910675381, "grad_norm": 2.1826133728027344, "learning_rate": 6.03267973856209e-07, "loss": 4.134437561035156, "step": 1822 }, { "epoch": 3.971677559912854, "grad_norm": 2.137392997741699, "learning_rate": 6.030501089324619e-07, "loss": 3.965823173522949, "step": 1823 }, { "epoch": 3.973856209150327, "grad_norm": 2.3141119480133057, "learning_rate": 6.028322440087145e-07, "loss": 4.15632438659668, "step": 1824 }, { "epoch": 3.9760348583877994, "grad_norm": 1.882748007774353, "learning_rate": 6.026143790849673e-07, "loss": 4.03546142578125, "step": 1825 }, { "epoch": 3.9782135076252723, "grad_norm": 1.829885721206665, "learning_rate": 6.0239651416122e-07, "loss": 3.9411044120788574, "step": 1826 }, { "epoch": 3.980392156862745, "grad_norm": 2.226675033569336, "learning_rate": 6.021786492374728e-07, "loss": 4.039214611053467, "step": 1827 }, { "epoch": 3.9825708061002176, "grad_norm": 2.116541624069214, "learning_rate": 6.019607843137254e-07, "loss": 4.0517683029174805, "step": 1828 }, { "epoch": 3.9847494553376905, "grad_norm": 1.8461607694625854, "learning_rate": 6.017429193899782e-07, "loss": 3.909243583679199, "step": 1829 }, { "epoch": 3.9869281045751634, "grad_norm": 2.374688148498535, "learning_rate": 6.015250544662309e-07, "loss": 3.9544267654418945, "step": 1830 }, { "epoch": 3.989106753812636, "grad_norm": 1.9449944496154785, "learning_rate": 6.013071895424836e-07, "loss": 4.120140552520752, "step": 1831 }, { "epoch": 3.991285403050109, "grad_norm": 1.8293285369873047, "learning_rate": 6.010893246187364e-07, "loss": 4.100728511810303, "step": 1832 }, { "epoch": 3.9934640522875817, "grad_norm": 2.641796827316284, "learning_rate": 6.008714596949891e-07, "loss": 4.14451265335083, "step": 1833 }, { "epoch": 3.9956427015250546, "grad_norm": 2.5373268127441406, "learning_rate": 6.006535947712417e-07, "loss": 4.261931896209717, "step": 1834 }, { "epoch": 3.997821350762527, "grad_norm": 1.664333462715149, "learning_rate": 6.004357298474946e-07, "loss": 4.015068531036377, "step": 1835 }, { "epoch": 4.0, "grad_norm": 2.6240415573120117, "learning_rate": 6.002178649237472e-07, "loss": 3.982598304748535, "step": 1836 }, { "epoch": 4.0021786492374725, "grad_norm": 1.8433212041854858, "learning_rate": 6e-07, "loss": 4.016632556915283, "step": 1837 }, { "epoch": 4.004357298474946, "grad_norm": 2.224266767501831, "learning_rate": 5.997821350762527e-07, "loss": 3.9937636852264404, "step": 1838 }, { "epoch": 4.006535947712418, "grad_norm": 2.1857635974884033, "learning_rate": 5.995642701525055e-07, "loss": 4.070898056030273, "step": 1839 }, { "epoch": 4.008714596949891, "grad_norm": 2.0183205604553223, "learning_rate": 5.993464052287581e-07, "loss": 3.8474795818328857, "step": 1840 }, { "epoch": 4.010893246187364, "grad_norm": 1.9923603534698486, "learning_rate": 5.991285403050109e-07, "loss": 3.9661505222320557, "step": 1841 }, { "epoch": 4.0130718954248366, "grad_norm": 2.079989433288574, "learning_rate": 5.989106753812636e-07, "loss": 3.920581579208374, "step": 1842 }, { "epoch": 4.015250544662309, "grad_norm": 2.0751302242279053, "learning_rate": 5.986928104575163e-07, "loss": 4.069026947021484, "step": 1843 }, { "epoch": 4.017429193899782, "grad_norm": 1.80231511592865, "learning_rate": 5.98474945533769e-07, "loss": 3.890643835067749, "step": 1844 }, { "epoch": 4.019607843137255, "grad_norm": 2.1681160926818848, "learning_rate": 5.982570806100218e-07, "loss": 3.9741952419281006, "step": 1845 }, { "epoch": 4.021786492374727, "grad_norm": 2.1533560752868652, "learning_rate": 5.980392156862744e-07, "loss": 3.9955291748046875, "step": 1846 }, { "epoch": 4.023965141612201, "grad_norm": 2.8179562091827393, "learning_rate": 5.978213507625272e-07, "loss": 4.056132793426514, "step": 1847 }, { "epoch": 4.026143790849673, "grad_norm": 2.505323886871338, "learning_rate": 5.976034858387799e-07, "loss": 4.114338397979736, "step": 1848 }, { "epoch": 4.028322440087146, "grad_norm": 2.4437873363494873, "learning_rate": 5.973856209150327e-07, "loss": 4.120617389678955, "step": 1849 }, { "epoch": 4.030501089324619, "grad_norm": 1.9178996086120605, "learning_rate": 5.971677559912853e-07, "loss": 3.8032851219177246, "step": 1850 }, { "epoch": 4.032679738562091, "grad_norm": 1.7706966400146484, "learning_rate": 5.969498910675382e-07, "loss": 3.9342782497406006, "step": 1851 }, { "epoch": 4.034858387799564, "grad_norm": 2.7578330039978027, "learning_rate": 5.967320261437908e-07, "loss": 4.007630348205566, "step": 1852 }, { "epoch": 4.037037037037037, "grad_norm": 2.049274444580078, "learning_rate": 5.965141612200435e-07, "loss": 3.830537796020508, "step": 1853 }, { "epoch": 4.03921568627451, "grad_norm": 1.764740228652954, "learning_rate": 5.962962962962963e-07, "loss": 3.9933385848999023, "step": 1854 }, { "epoch": 4.041394335511983, "grad_norm": 2.8942337036132812, "learning_rate": 5.96078431372549e-07, "loss": 4.076817035675049, "step": 1855 }, { "epoch": 4.0435729847494555, "grad_norm": 1.9205644130706787, "learning_rate": 5.958605664488017e-07, "loss": 4.065840721130371, "step": 1856 }, { "epoch": 4.045751633986928, "grad_norm": 2.2004637718200684, "learning_rate": 5.956427015250545e-07, "loss": 3.8682448863983154, "step": 1857 }, { "epoch": 4.047930283224401, "grad_norm": 2.295980930328369, "learning_rate": 5.954248366013071e-07, "loss": 4.124575138092041, "step": 1858 }, { "epoch": 4.050108932461874, "grad_norm": 2.0185184478759766, "learning_rate": 5.952069716775599e-07, "loss": 3.8904166221618652, "step": 1859 }, { "epoch": 4.052287581699346, "grad_norm": 2.768627882003784, "learning_rate": 5.949891067538126e-07, "loss": 4.0539350509643555, "step": 1860 }, { "epoch": 4.05446623093682, "grad_norm": 2.899007797241211, "learning_rate": 5.947712418300654e-07, "loss": 4.1358323097229, "step": 1861 }, { "epoch": 4.056644880174292, "grad_norm": 1.9721002578735352, "learning_rate": 5.94553376906318e-07, "loss": 3.9690940380096436, "step": 1862 }, { "epoch": 4.0588235294117645, "grad_norm": 2.827955484390259, "learning_rate": 5.943355119825709e-07, "loss": 4.032541751861572, "step": 1863 }, { "epoch": 4.061002178649238, "grad_norm": 1.8216395378112793, "learning_rate": 5.941176470588235e-07, "loss": 4.025237083435059, "step": 1864 }, { "epoch": 4.06318082788671, "grad_norm": 1.5369893312454224, "learning_rate": 5.938997821350762e-07, "loss": 3.8082008361816406, "step": 1865 }, { "epoch": 4.065359477124183, "grad_norm": 1.9487756490707397, "learning_rate": 5.93681917211329e-07, "loss": 3.9466545581817627, "step": 1866 }, { "epoch": 4.067538126361656, "grad_norm": 2.019160032272339, "learning_rate": 5.934640522875817e-07, "loss": 4.113095283508301, "step": 1867 }, { "epoch": 4.069716775599129, "grad_norm": 3.363628387451172, "learning_rate": 5.932461873638344e-07, "loss": 4.013089656829834, "step": 1868 }, { "epoch": 4.071895424836601, "grad_norm": 2.5867652893066406, "learning_rate": 5.930283224400872e-07, "loss": 3.9745733737945557, "step": 1869 }, { "epoch": 4.074074074074074, "grad_norm": 2.3151323795318604, "learning_rate": 5.928104575163398e-07, "loss": 4.082669258117676, "step": 1870 }, { "epoch": 4.076252723311547, "grad_norm": 2.764443874359131, "learning_rate": 5.925925925925926e-07, "loss": 4.150216579437256, "step": 1871 }, { "epoch": 4.078431372549019, "grad_norm": 1.8825839757919312, "learning_rate": 5.923747276688453e-07, "loss": 3.9924800395965576, "step": 1872 }, { "epoch": 4.080610021786493, "grad_norm": 2.084299325942993, "learning_rate": 5.921568627450981e-07, "loss": 4.025864601135254, "step": 1873 }, { "epoch": 4.082788671023965, "grad_norm": 1.9222335815429688, "learning_rate": 5.919389978213507e-07, "loss": 4.0313005447387695, "step": 1874 }, { "epoch": 4.084967320261438, "grad_norm": 2.360382080078125, "learning_rate": 5.917211328976036e-07, "loss": 3.9400620460510254, "step": 1875 }, { "epoch": 4.087145969498911, "grad_norm": 2.379910707473755, "learning_rate": 5.915032679738562e-07, "loss": 3.970655918121338, "step": 1876 }, { "epoch": 4.089324618736383, "grad_norm": 1.7775471210479736, "learning_rate": 5.912854030501089e-07, "loss": 3.829343557357788, "step": 1877 }, { "epoch": 4.091503267973856, "grad_norm": 1.7356908321380615, "learning_rate": 5.910675381263617e-07, "loss": 3.851548671722412, "step": 1878 }, { "epoch": 4.093681917211329, "grad_norm": 1.7896106243133545, "learning_rate": 5.908496732026144e-07, "loss": 3.8877172470092773, "step": 1879 }, { "epoch": 4.095860566448802, "grad_norm": 2.969717025756836, "learning_rate": 5.90631808278867e-07, "loss": 4.387091159820557, "step": 1880 }, { "epoch": 4.098039215686274, "grad_norm": 1.846022605895996, "learning_rate": 5.904139433551199e-07, "loss": 3.9519588947296143, "step": 1881 }, { "epoch": 4.1002178649237475, "grad_norm": 2.2990846633911133, "learning_rate": 5.901960784313725e-07, "loss": 3.968109607696533, "step": 1882 }, { "epoch": 4.10239651416122, "grad_norm": 2.3675107955932617, "learning_rate": 5.899782135076252e-07, "loss": 4.0012664794921875, "step": 1883 }, { "epoch": 4.104575163398692, "grad_norm": 2.0487253665924072, "learning_rate": 5.89760348583878e-07, "loss": 4.129114151000977, "step": 1884 }, { "epoch": 4.106753812636166, "grad_norm": 2.2828757762908936, "learning_rate": 5.895424836601307e-07, "loss": 4.015626430511475, "step": 1885 }, { "epoch": 4.108932461873638, "grad_norm": 1.8878647089004517, "learning_rate": 5.893246187363834e-07, "loss": 3.9229094982147217, "step": 1886 }, { "epoch": 4.111111111111111, "grad_norm": 2.7275328636169434, "learning_rate": 5.891067538126361e-07, "loss": 4.089496612548828, "step": 1887 }, { "epoch": 4.113289760348584, "grad_norm": 2.154064178466797, "learning_rate": 5.888888888888889e-07, "loss": 4.031960487365723, "step": 1888 }, { "epoch": 4.1154684095860565, "grad_norm": 2.2672476768493652, "learning_rate": 5.886710239651415e-07, "loss": 3.9855151176452637, "step": 1889 }, { "epoch": 4.117647058823529, "grad_norm": 2.652647018432617, "learning_rate": 5.884531590413943e-07, "loss": 4.077037811279297, "step": 1890 }, { "epoch": 4.119825708061002, "grad_norm": 2.4852097034454346, "learning_rate": 5.88235294117647e-07, "loss": 4.051116943359375, "step": 1891 }, { "epoch": 4.122004357298475, "grad_norm": 2.3045897483825684, "learning_rate": 5.880174291938997e-07, "loss": 3.98571515083313, "step": 1892 }, { "epoch": 4.124183006535947, "grad_norm": 2.302323818206787, "learning_rate": 5.877995642701524e-07, "loss": 4.054691791534424, "step": 1893 }, { "epoch": 4.126361655773421, "grad_norm": 2.52131724357605, "learning_rate": 5.875816993464052e-07, "loss": 3.988799810409546, "step": 1894 }, { "epoch": 4.128540305010893, "grad_norm": 2.125218629837036, "learning_rate": 5.873638344226579e-07, "loss": 3.868726968765259, "step": 1895 }, { "epoch": 4.130718954248366, "grad_norm": 2.2681353092193604, "learning_rate": 5.871459694989106e-07, "loss": 4.170510292053223, "step": 1896 }, { "epoch": 4.132897603485839, "grad_norm": 2.189981460571289, "learning_rate": 5.869281045751634e-07, "loss": 3.8084707260131836, "step": 1897 }, { "epoch": 4.135076252723311, "grad_norm": 1.8507801294326782, "learning_rate": 5.867102396514161e-07, "loss": 4.030261993408203, "step": 1898 }, { "epoch": 4.137254901960785, "grad_norm": 1.6832940578460693, "learning_rate": 5.864923747276687e-07, "loss": 3.7887253761291504, "step": 1899 }, { "epoch": 4.139433551198257, "grad_norm": 2.713118076324463, "learning_rate": 5.862745098039216e-07, "loss": 3.981278657913208, "step": 1900 }, { "epoch": 4.14161220043573, "grad_norm": 2.4072394371032715, "learning_rate": 5.860566448801742e-07, "loss": 4.097848415374756, "step": 1901 }, { "epoch": 4.143790849673203, "grad_norm": 2.277208089828491, "learning_rate": 5.85838779956427e-07, "loss": 3.980844497680664, "step": 1902 }, { "epoch": 4.1459694989106755, "grad_norm": 2.348909854888916, "learning_rate": 5.856209150326797e-07, "loss": 4.104169845581055, "step": 1903 }, { "epoch": 4.148148148148148, "grad_norm": 2.282435417175293, "learning_rate": 5.854030501089324e-07, "loss": 4.128921031951904, "step": 1904 }, { "epoch": 4.150326797385621, "grad_norm": 1.9297035932540894, "learning_rate": 5.851851851851851e-07, "loss": 3.877847909927368, "step": 1905 }, { "epoch": 4.152505446623094, "grad_norm": 2.145216941833496, "learning_rate": 5.849673202614379e-07, "loss": 3.99857497215271, "step": 1906 }, { "epoch": 4.154684095860566, "grad_norm": 2.8292934894561768, "learning_rate": 5.847494553376906e-07, "loss": 4.118540287017822, "step": 1907 }, { "epoch": 4.1568627450980395, "grad_norm": 1.6904830932617188, "learning_rate": 5.845315904139433e-07, "loss": 3.9832193851470947, "step": 1908 }, { "epoch": 4.159041394335512, "grad_norm": 2.0759239196777344, "learning_rate": 5.843137254901961e-07, "loss": 3.942476987838745, "step": 1909 }, { "epoch": 4.1612200435729845, "grad_norm": 1.9308255910873413, "learning_rate": 5.840958605664488e-07, "loss": 4.0861358642578125, "step": 1910 }, { "epoch": 4.163398692810458, "grad_norm": 2.0829689502716064, "learning_rate": 5.838779956427014e-07, "loss": 4.026965618133545, "step": 1911 }, { "epoch": 4.16557734204793, "grad_norm": 2.0844359397888184, "learning_rate": 5.836601307189543e-07, "loss": 3.970393657684326, "step": 1912 }, { "epoch": 4.167755991285403, "grad_norm": 2.119354486465454, "learning_rate": 5.834422657952069e-07, "loss": 3.7895348072052, "step": 1913 }, { "epoch": 4.169934640522876, "grad_norm": 2.5785772800445557, "learning_rate": 5.832244008714597e-07, "loss": 4.000765323638916, "step": 1914 }, { "epoch": 4.172113289760349, "grad_norm": 1.9319825172424316, "learning_rate": 5.830065359477124e-07, "loss": 3.8581814765930176, "step": 1915 }, { "epoch": 4.174291938997821, "grad_norm": 1.910841703414917, "learning_rate": 5.827886710239651e-07, "loss": 3.8386404514312744, "step": 1916 }, { "epoch": 4.176470588235294, "grad_norm": 1.9723472595214844, "learning_rate": 5.825708061002178e-07, "loss": 4.003001689910889, "step": 1917 }, { "epoch": 4.178649237472767, "grad_norm": 1.7131481170654297, "learning_rate": 5.823529411764706e-07, "loss": 4.021419525146484, "step": 1918 }, { "epoch": 4.180827886710239, "grad_norm": 2.3385820388793945, "learning_rate": 5.821350762527233e-07, "loss": 4.0849385261535645, "step": 1919 }, { "epoch": 4.183006535947713, "grad_norm": 1.5295448303222656, "learning_rate": 5.81917211328976e-07, "loss": 3.764514923095703, "step": 1920 }, { "epoch": 4.185185185185185, "grad_norm": 2.0911004543304443, "learning_rate": 5.816993464052288e-07, "loss": 3.9662833213806152, "step": 1921 }, { "epoch": 4.187363834422658, "grad_norm": 2.088515281677246, "learning_rate": 5.814814814814815e-07, "loss": 3.9946036338806152, "step": 1922 }, { "epoch": 4.189542483660131, "grad_norm": 2.482851266860962, "learning_rate": 5.812636165577341e-07, "loss": 4.10518217086792, "step": 1923 }, { "epoch": 4.191721132897603, "grad_norm": 2.449373483657837, "learning_rate": 5.81045751633987e-07, "loss": 4.141642093658447, "step": 1924 }, { "epoch": 4.193899782135076, "grad_norm": 2.278756856918335, "learning_rate": 5.808278867102396e-07, "loss": 3.955918312072754, "step": 1925 }, { "epoch": 4.196078431372549, "grad_norm": 2.012216806411743, "learning_rate": 5.806100217864924e-07, "loss": 4.054948329925537, "step": 1926 }, { "epoch": 4.198257080610022, "grad_norm": 3.747513771057129, "learning_rate": 5.803921568627451e-07, "loss": 3.8651883602142334, "step": 1927 }, { "epoch": 4.200435729847494, "grad_norm": 2.888826370239258, "learning_rate": 5.801742919389978e-07, "loss": 3.9443564414978027, "step": 1928 }, { "epoch": 4.2026143790849675, "grad_norm": 2.002685785293579, "learning_rate": 5.799564270152505e-07, "loss": 3.9623191356658936, "step": 1929 }, { "epoch": 4.20479302832244, "grad_norm": 2.3890628814697266, "learning_rate": 5.797385620915033e-07, "loss": 3.9307239055633545, "step": 1930 }, { "epoch": 4.206971677559913, "grad_norm": 2.6411237716674805, "learning_rate": 5.79520697167756e-07, "loss": 4.12636661529541, "step": 1931 }, { "epoch": 4.209150326797386, "grad_norm": 2.165923833847046, "learning_rate": 5.793028322440087e-07, "loss": 4.0500264167785645, "step": 1932 }, { "epoch": 4.211328976034858, "grad_norm": 2.015401840209961, "learning_rate": 5.790849673202614e-07, "loss": 3.930546998977661, "step": 1933 }, { "epoch": 4.213507625272332, "grad_norm": 1.5617214441299438, "learning_rate": 5.788671023965142e-07, "loss": 3.973935842514038, "step": 1934 }, { "epoch": 4.215686274509804, "grad_norm": 2.1521759033203125, "learning_rate": 5.786492374727668e-07, "loss": 3.8701698780059814, "step": 1935 }, { "epoch": 4.2178649237472765, "grad_norm": 2.161684513092041, "learning_rate": 5.784313725490197e-07, "loss": 4.244114875793457, "step": 1936 }, { "epoch": 4.22004357298475, "grad_norm": 2.6336164474487305, "learning_rate": 5.782135076252723e-07, "loss": 3.907111883163452, "step": 1937 }, { "epoch": 4.222222222222222, "grad_norm": 2.1531503200531006, "learning_rate": 5.779956427015251e-07, "loss": 3.978759288787842, "step": 1938 }, { "epoch": 4.224400871459695, "grad_norm": 2.5304439067840576, "learning_rate": 5.777777777777777e-07, "loss": 3.951061248779297, "step": 1939 }, { "epoch": 4.226579520697168, "grad_norm": 2.3334195613861084, "learning_rate": 5.775599128540305e-07, "loss": 4.008544921875, "step": 1940 }, { "epoch": 4.228758169934641, "grad_norm": 2.1881768703460693, "learning_rate": 5.773420479302832e-07, "loss": 3.9474034309387207, "step": 1941 }, { "epoch": 4.230936819172113, "grad_norm": 2.2058069705963135, "learning_rate": 5.771241830065359e-07, "loss": 4.088275909423828, "step": 1942 }, { "epoch": 4.233115468409586, "grad_norm": 2.4022061824798584, "learning_rate": 5.769063180827887e-07, "loss": 3.843453884124756, "step": 1943 }, { "epoch": 4.235294117647059, "grad_norm": 2.53328800201416, "learning_rate": 5.766884531590414e-07, "loss": 4.113787651062012, "step": 1944 }, { "epoch": 4.237472766884531, "grad_norm": 2.6466219425201416, "learning_rate": 5.76470588235294e-07, "loss": 4.0008015632629395, "step": 1945 }, { "epoch": 4.239651416122005, "grad_norm": 2.7760889530181885, "learning_rate": 5.762527233115469e-07, "loss": 4.290622711181641, "step": 1946 }, { "epoch": 4.241830065359477, "grad_norm": 2.58681583404541, "learning_rate": 5.760348583877995e-07, "loss": 3.985222816467285, "step": 1947 }, { "epoch": 4.24400871459695, "grad_norm": 2.4293267726898193, "learning_rate": 5.758169934640522e-07, "loss": 3.9285213947296143, "step": 1948 }, { "epoch": 4.246187363834423, "grad_norm": 2.260406494140625, "learning_rate": 5.75599128540305e-07, "loss": 3.9199821949005127, "step": 1949 }, { "epoch": 4.248366013071895, "grad_norm": 1.8555725812911987, "learning_rate": 5.753812636165576e-07, "loss": 3.8519022464752197, "step": 1950 }, { "epoch": 4.250544662309368, "grad_norm": 1.9048469066619873, "learning_rate": 5.751633986928104e-07, "loss": 3.9101743698120117, "step": 1951 }, { "epoch": 4.252723311546841, "grad_norm": 2.1210808753967285, "learning_rate": 5.749455337690631e-07, "loss": 3.9760043621063232, "step": 1952 }, { "epoch": 4.254901960784314, "grad_norm": 2.1048710346221924, "learning_rate": 5.747276688453159e-07, "loss": 3.9306042194366455, "step": 1953 }, { "epoch": 4.257080610021786, "grad_norm": 2.209927797317505, "learning_rate": 5.745098039215685e-07, "loss": 3.914919376373291, "step": 1954 }, { "epoch": 4.2592592592592595, "grad_norm": 2.5841426849365234, "learning_rate": 5.742919389978214e-07, "loss": 4.197303295135498, "step": 1955 }, { "epoch": 4.261437908496732, "grad_norm": 2.3146142959594727, "learning_rate": 5.74074074074074e-07, "loss": 4.101532459259033, "step": 1956 }, { "epoch": 4.2636165577342044, "grad_norm": 3.0586097240448, "learning_rate": 5.738562091503267e-07, "loss": 4.06885290145874, "step": 1957 }, { "epoch": 4.265795206971678, "grad_norm": 1.9904552698135376, "learning_rate": 5.736383442265795e-07, "loss": 3.9511663913726807, "step": 1958 }, { "epoch": 4.26797385620915, "grad_norm": 2.4603748321533203, "learning_rate": 5.734204793028322e-07, "loss": 3.9690239429473877, "step": 1959 }, { "epoch": 4.270152505446623, "grad_norm": 2.144392251968384, "learning_rate": 5.732026143790849e-07, "loss": 3.879911184310913, "step": 1960 }, { "epoch": 4.272331154684096, "grad_norm": 2.2161247730255127, "learning_rate": 5.729847494553377e-07, "loss": 3.8592424392700195, "step": 1961 }, { "epoch": 4.2745098039215685, "grad_norm": 2.4097812175750732, "learning_rate": 5.727668845315903e-07, "loss": 3.987563371658325, "step": 1962 }, { "epoch": 4.276688453159041, "grad_norm": 1.9524035453796387, "learning_rate": 5.725490196078431e-07, "loss": 4.052401542663574, "step": 1963 }, { "epoch": 4.278867102396514, "grad_norm": 1.8114335536956787, "learning_rate": 5.723311546840958e-07, "loss": 3.8154892921447754, "step": 1964 }, { "epoch": 4.281045751633987, "grad_norm": 1.6790965795516968, "learning_rate": 5.721132897603486e-07, "loss": 3.9205970764160156, "step": 1965 }, { "epoch": 4.283224400871459, "grad_norm": 2.6177895069122314, "learning_rate": 5.718954248366012e-07, "loss": 3.9032325744628906, "step": 1966 }, { "epoch": 4.285403050108933, "grad_norm": 2.238166093826294, "learning_rate": 5.716775599128541e-07, "loss": 3.806116819381714, "step": 1967 }, { "epoch": 4.287581699346405, "grad_norm": 2.0535879135131836, "learning_rate": 5.714596949891067e-07, "loss": 3.9881768226623535, "step": 1968 }, { "epoch": 4.289760348583878, "grad_norm": 2.003664255142212, "learning_rate": 5.712418300653594e-07, "loss": 3.886110544204712, "step": 1969 }, { "epoch": 4.291938997821351, "grad_norm": 3.484581470489502, "learning_rate": 5.710239651416122e-07, "loss": 4.133213520050049, "step": 1970 }, { "epoch": 4.294117647058823, "grad_norm": 2.278512954711914, "learning_rate": 5.708061002178649e-07, "loss": 3.846045732498169, "step": 1971 }, { "epoch": 4.296296296296296, "grad_norm": 1.6446748971939087, "learning_rate": 5.705882352941176e-07, "loss": 3.904033660888672, "step": 1972 }, { "epoch": 4.298474945533769, "grad_norm": 2.6222848892211914, "learning_rate": 5.703703703703704e-07, "loss": 4.115198135375977, "step": 1973 }, { "epoch": 4.300653594771242, "grad_norm": 2.6092164516448975, "learning_rate": 5.70152505446623e-07, "loss": 4.1033244132995605, "step": 1974 }, { "epoch": 4.302832244008715, "grad_norm": 2.7374749183654785, "learning_rate": 5.699346405228758e-07, "loss": 3.897958993911743, "step": 1975 }, { "epoch": 4.3050108932461875, "grad_norm": 1.9809801578521729, "learning_rate": 5.697167755991285e-07, "loss": 3.9343788623809814, "step": 1976 }, { "epoch": 4.30718954248366, "grad_norm": 2.3806281089782715, "learning_rate": 5.694989106753813e-07, "loss": 3.952409267425537, "step": 1977 }, { "epoch": 4.309368191721133, "grad_norm": 2.3362655639648438, "learning_rate": 5.692810457516339e-07, "loss": 4.0490641593933105, "step": 1978 }, { "epoch": 4.311546840958606, "grad_norm": 2.560149669647217, "learning_rate": 5.690631808278868e-07, "loss": 3.9867913722991943, "step": 1979 }, { "epoch": 4.313725490196078, "grad_norm": 2.693664789199829, "learning_rate": 5.688453159041394e-07, "loss": 3.6772117614746094, "step": 1980 }, { "epoch": 4.315904139433552, "grad_norm": 2.121671199798584, "learning_rate": 5.686274509803921e-07, "loss": 4.043824672698975, "step": 1981 }, { "epoch": 4.318082788671024, "grad_norm": 2.146592617034912, "learning_rate": 5.684095860566449e-07, "loss": 3.933519124984741, "step": 1982 }, { "epoch": 4.3202614379084965, "grad_norm": 3.069981813430786, "learning_rate": 5.681917211328976e-07, "loss": 4.033236980438232, "step": 1983 }, { "epoch": 4.32244008714597, "grad_norm": 1.7917377948760986, "learning_rate": 5.679738562091503e-07, "loss": 3.8431732654571533, "step": 1984 }, { "epoch": 4.324618736383442, "grad_norm": 2.9762730598449707, "learning_rate": 5.677559912854031e-07, "loss": 4.034677028656006, "step": 1985 }, { "epoch": 4.326797385620915, "grad_norm": 2.182874917984009, "learning_rate": 5.675381263616557e-07, "loss": 3.8821003437042236, "step": 1986 }, { "epoch": 4.328976034858388, "grad_norm": 2.2028043270111084, "learning_rate": 5.673202614379085e-07, "loss": 4.016928672790527, "step": 1987 }, { "epoch": 4.331154684095861, "grad_norm": 1.9808611869812012, "learning_rate": 5.671023965141611e-07, "loss": 3.8086256980895996, "step": 1988 }, { "epoch": 4.333333333333333, "grad_norm": 2.0979678630828857, "learning_rate": 5.66884531590414e-07, "loss": 3.9220221042633057, "step": 1989 }, { "epoch": 4.335511982570806, "grad_norm": 2.318960428237915, "learning_rate": 5.666666666666666e-07, "loss": 3.860163927078247, "step": 1990 }, { "epoch": 4.337690631808279, "grad_norm": 2.173682451248169, "learning_rate": 5.664488017429193e-07, "loss": 3.919438362121582, "step": 1991 }, { "epoch": 4.339869281045751, "grad_norm": 2.0935144424438477, "learning_rate": 5.662309368191721e-07, "loss": 4.104275226593018, "step": 1992 }, { "epoch": 4.342047930283225, "grad_norm": 1.6438676118850708, "learning_rate": 5.660130718954248e-07, "loss": 3.834162712097168, "step": 1993 }, { "epoch": 4.344226579520697, "grad_norm": 2.708216667175293, "learning_rate": 5.657952069716775e-07, "loss": 4.193716526031494, "step": 1994 }, { "epoch": 4.34640522875817, "grad_norm": 2.1170170307159424, "learning_rate": 5.655773420479303e-07, "loss": 4.046345233917236, "step": 1995 }, { "epoch": 4.348583877995643, "grad_norm": 1.6900111436843872, "learning_rate": 5.65359477124183e-07, "loss": 3.9626054763793945, "step": 1996 }, { "epoch": 4.350762527233115, "grad_norm": 2.319507598876953, "learning_rate": 5.651416122004357e-07, "loss": 3.9088804721832275, "step": 1997 }, { "epoch": 4.352941176470588, "grad_norm": 2.3184568881988525, "learning_rate": 5.649237472766884e-07, "loss": 4.01774787902832, "step": 1998 }, { "epoch": 4.355119825708061, "grad_norm": 2.5491926670074463, "learning_rate": 5.647058823529412e-07, "loss": 4.0354156494140625, "step": 1999 }, { "epoch": 4.357298474945534, "grad_norm": 3.222482204437256, "learning_rate": 5.644880174291938e-07, "loss": 4.190352916717529, "step": 2000 }, { "epoch": 4.359477124183006, "grad_norm": 2.3533341884613037, "learning_rate": 5.642701525054467e-07, "loss": 3.9689807891845703, "step": 2001 }, { "epoch": 4.3616557734204795, "grad_norm": 1.9307218790054321, "learning_rate": 5.640522875816993e-07, "loss": 3.792652130126953, "step": 2002 }, { "epoch": 4.363834422657952, "grad_norm": 1.8470615148544312, "learning_rate": 5.63834422657952e-07, "loss": 3.969179153442383, "step": 2003 }, { "epoch": 4.366013071895424, "grad_norm": 2.147575616836548, "learning_rate": 5.636165577342048e-07, "loss": 3.9619686603546143, "step": 2004 }, { "epoch": 4.368191721132898, "grad_norm": 2.20467472076416, "learning_rate": 5.633986928104575e-07, "loss": 3.9386017322540283, "step": 2005 }, { "epoch": 4.37037037037037, "grad_norm": 2.130195140838623, "learning_rate": 5.631808278867102e-07, "loss": 3.8879778385162354, "step": 2006 }, { "epoch": 4.372549019607844, "grad_norm": 2.364340305328369, "learning_rate": 5.62962962962963e-07, "loss": 4.03952169418335, "step": 2007 }, { "epoch": 4.374727668845316, "grad_norm": 2.1971781253814697, "learning_rate": 5.627450980392157e-07, "loss": 3.976306438446045, "step": 2008 }, { "epoch": 4.3769063180827885, "grad_norm": 2.595695734024048, "learning_rate": 5.625272331154684e-07, "loss": 4.197473526000977, "step": 2009 }, { "epoch": 4.379084967320262, "grad_norm": 1.8222745656967163, "learning_rate": 5.623093681917211e-07, "loss": 3.8925719261169434, "step": 2010 }, { "epoch": 4.381263616557734, "grad_norm": 4.251540184020996, "learning_rate": 5.620915032679739e-07, "loss": 4.4020586013793945, "step": 2011 }, { "epoch": 4.383442265795207, "grad_norm": 2.3255889415740967, "learning_rate": 5.618736383442265e-07, "loss": 4.121799468994141, "step": 2012 }, { "epoch": 4.38562091503268, "grad_norm": 2.3374874591827393, "learning_rate": 5.616557734204794e-07, "loss": 3.935875654220581, "step": 2013 }, { "epoch": 4.387799564270153, "grad_norm": 2.123485803604126, "learning_rate": 5.61437908496732e-07, "loss": 4.086036682128906, "step": 2014 }, { "epoch": 4.389978213507625, "grad_norm": 1.6603772640228271, "learning_rate": 5.612200435729847e-07, "loss": 3.8445322513580322, "step": 2015 }, { "epoch": 4.392156862745098, "grad_norm": 2.031344175338745, "learning_rate": 5.610021786492375e-07, "loss": 4.001389026641846, "step": 2016 }, { "epoch": 4.394335511982571, "grad_norm": 2.146866798400879, "learning_rate": 5.607843137254902e-07, "loss": 3.9132134914398193, "step": 2017 }, { "epoch": 4.396514161220043, "grad_norm": 1.9432625770568848, "learning_rate": 5.605664488017429e-07, "loss": 3.970904588699341, "step": 2018 }, { "epoch": 4.398692810457517, "grad_norm": 2.304267644882202, "learning_rate": 5.603485838779957e-07, "loss": 3.9954910278320312, "step": 2019 }, { "epoch": 4.400871459694989, "grad_norm": 1.7531206607818604, "learning_rate": 5.601307189542484e-07, "loss": 3.984403371810913, "step": 2020 }, { "epoch": 4.403050108932462, "grad_norm": 2.7252981662750244, "learning_rate": 5.59912854030501e-07, "loss": 3.9143965244293213, "step": 2021 }, { "epoch": 4.405228758169935, "grad_norm": 2.8421473503112793, "learning_rate": 5.596949891067538e-07, "loss": 3.989109516143799, "step": 2022 }, { "epoch": 4.407407407407407, "grad_norm": 1.9191994667053223, "learning_rate": 5.594771241830065e-07, "loss": 3.930515766143799, "step": 2023 }, { "epoch": 4.40958605664488, "grad_norm": 1.961835503578186, "learning_rate": 5.592592592592592e-07, "loss": 3.9802639484405518, "step": 2024 }, { "epoch": 4.411764705882353, "grad_norm": 3.110952854156494, "learning_rate": 5.59041394335512e-07, "loss": 4.121373176574707, "step": 2025 }, { "epoch": 4.413943355119826, "grad_norm": 1.9341986179351807, "learning_rate": 5.588235294117647e-07, "loss": 4.037127494812012, "step": 2026 }, { "epoch": 4.416122004357298, "grad_norm": 2.9076457023620605, "learning_rate": 5.586056644880173e-07, "loss": 4.171000957489014, "step": 2027 }, { "epoch": 4.4183006535947715, "grad_norm": 2.0018222332000732, "learning_rate": 5.583877995642702e-07, "loss": 3.8729519844055176, "step": 2028 }, { "epoch": 4.420479302832244, "grad_norm": 1.6569304466247559, "learning_rate": 5.581699346405228e-07, "loss": 3.8704280853271484, "step": 2029 }, { "epoch": 4.4226579520697165, "grad_norm": 2.126499891281128, "learning_rate": 5.579520697167756e-07, "loss": 3.9223692417144775, "step": 2030 }, { "epoch": 4.42483660130719, "grad_norm": 1.9594639539718628, "learning_rate": 5.577342047930283e-07, "loss": 3.9888510704040527, "step": 2031 }, { "epoch": 4.427015250544662, "grad_norm": 2.0554957389831543, "learning_rate": 5.57516339869281e-07, "loss": 3.8875646591186523, "step": 2032 }, { "epoch": 4.429193899782135, "grad_norm": 2.1245691776275635, "learning_rate": 5.572984749455337e-07, "loss": 3.7980544567108154, "step": 2033 }, { "epoch": 4.431372549019608, "grad_norm": 2.1154372692108154, "learning_rate": 5.570806100217864e-07, "loss": 3.937669277191162, "step": 2034 }, { "epoch": 4.4335511982570806, "grad_norm": 2.0990939140319824, "learning_rate": 5.568627450980392e-07, "loss": 3.965672016143799, "step": 2035 }, { "epoch": 4.435729847494553, "grad_norm": 1.7949049472808838, "learning_rate": 5.566448801742919e-07, "loss": 3.9541585445404053, "step": 2036 }, { "epoch": 4.437908496732026, "grad_norm": 2.480093002319336, "learning_rate": 5.564270152505445e-07, "loss": 4.078892230987549, "step": 2037 }, { "epoch": 4.440087145969499, "grad_norm": 2.7102701663970947, "learning_rate": 5.562091503267974e-07, "loss": 4.048115253448486, "step": 2038 }, { "epoch": 4.442265795206971, "grad_norm": 2.157085418701172, "learning_rate": 5.5599128540305e-07, "loss": 4.0252556800842285, "step": 2039 }, { "epoch": 4.444444444444445, "grad_norm": 2.052421808242798, "learning_rate": 5.557734204793028e-07, "loss": 4.050649166107178, "step": 2040 }, { "epoch": 4.446623093681917, "grad_norm": 2.206092119216919, "learning_rate": 5.555555555555555e-07, "loss": 3.9752323627471924, "step": 2041 }, { "epoch": 4.44880174291939, "grad_norm": 2.5930681228637695, "learning_rate": 5.553376906318083e-07, "loss": 4.316627025604248, "step": 2042 }, { "epoch": 4.450980392156863, "grad_norm": 2.297529935836792, "learning_rate": 5.551198257080609e-07, "loss": 3.90909743309021, "step": 2043 }, { "epoch": 4.453159041394335, "grad_norm": 2.3751771450042725, "learning_rate": 5.549019607843137e-07, "loss": 4.124406337738037, "step": 2044 }, { "epoch": 4.455337690631808, "grad_norm": 1.9660383462905884, "learning_rate": 5.546840958605664e-07, "loss": 3.9943325519561768, "step": 2045 }, { "epoch": 4.457516339869281, "grad_norm": 1.8789093494415283, "learning_rate": 5.544662309368191e-07, "loss": 4.0108466148376465, "step": 2046 }, { "epoch": 4.459694989106754, "grad_norm": 1.5837063789367676, "learning_rate": 5.542483660130719e-07, "loss": 4.006556034088135, "step": 2047 }, { "epoch": 4.461873638344226, "grad_norm": 2.490687131881714, "learning_rate": 5.540305010893246e-07, "loss": 4.054236888885498, "step": 2048 }, { "epoch": 4.4640522875816995, "grad_norm": 2.134671449661255, "learning_rate": 5.538126361655772e-07, "loss": 4.01542854309082, "step": 2049 }, { "epoch": 4.466230936819172, "grad_norm": 1.6091417074203491, "learning_rate": 5.535947712418301e-07, "loss": 3.942314624786377, "step": 2050 }, { "epoch": 4.468409586056645, "grad_norm": 1.7473686933517456, "learning_rate": 5.533769063180827e-07, "loss": 3.9995803833007812, "step": 2051 }, { "epoch": 4.470588235294118, "grad_norm": 1.6239798069000244, "learning_rate": 5.531590413943355e-07, "loss": 3.8380870819091797, "step": 2052 }, { "epoch": 4.47276688453159, "grad_norm": 2.4022624492645264, "learning_rate": 5.529411764705882e-07, "loss": 4.13386869430542, "step": 2053 }, { "epoch": 4.474945533769064, "grad_norm": 2.111111879348755, "learning_rate": 5.52723311546841e-07, "loss": 4.017481803894043, "step": 2054 }, { "epoch": 4.477124183006536, "grad_norm": 2.5234689712524414, "learning_rate": 5.525054466230936e-07, "loss": 4.048630237579346, "step": 2055 }, { "epoch": 4.4793028322440085, "grad_norm": 2.1191225051879883, "learning_rate": 5.522875816993464e-07, "loss": 3.980756998062134, "step": 2056 }, { "epoch": 4.481481481481482, "grad_norm": 1.9922866821289062, "learning_rate": 5.520697167755991e-07, "loss": 3.954474925994873, "step": 2057 }, { "epoch": 4.483660130718954, "grad_norm": 2.4431328773498535, "learning_rate": 5.518518518518518e-07, "loss": 3.8490920066833496, "step": 2058 }, { "epoch": 4.485838779956427, "grad_norm": 2.0441172122955322, "learning_rate": 5.516339869281046e-07, "loss": 4.031130790710449, "step": 2059 }, { "epoch": 4.4880174291939, "grad_norm": 2.1703813076019287, "learning_rate": 5.514161220043573e-07, "loss": 3.93973970413208, "step": 2060 }, { "epoch": 4.490196078431373, "grad_norm": 1.9673213958740234, "learning_rate": 5.511982570806099e-07, "loss": 3.924880266189575, "step": 2061 }, { "epoch": 4.492374727668845, "grad_norm": 2.0356051921844482, "learning_rate": 5.509803921568628e-07, "loss": 3.9692015647888184, "step": 2062 }, { "epoch": 4.494553376906318, "grad_norm": 2.8181040287017822, "learning_rate": 5.507625272331154e-07, "loss": 3.973630428314209, "step": 2063 }, { "epoch": 4.496732026143791, "grad_norm": 2.5173583030700684, "learning_rate": 5.505446623093682e-07, "loss": 3.8992345333099365, "step": 2064 }, { "epoch": 4.498910675381263, "grad_norm": 2.8097803592681885, "learning_rate": 5.503267973856209e-07, "loss": 4.078397750854492, "step": 2065 }, { "epoch": 4.501089324618737, "grad_norm": 2.3424994945526123, "learning_rate": 5.501089324618737e-07, "loss": 3.9008402824401855, "step": 2066 }, { "epoch": 4.503267973856209, "grad_norm": 2.7894175052642822, "learning_rate": 5.498910675381263e-07, "loss": 4.140563011169434, "step": 2067 }, { "epoch": 4.505446623093682, "grad_norm": 2.032322645187378, "learning_rate": 5.496732026143791e-07, "loss": 3.931955575942993, "step": 2068 }, { "epoch": 4.507625272331155, "grad_norm": 2.1209676265716553, "learning_rate": 5.494553376906318e-07, "loss": 3.9801530838012695, "step": 2069 }, { "epoch": 4.509803921568627, "grad_norm": 2.0561015605926514, "learning_rate": 5.492374727668845e-07, "loss": 4.014986515045166, "step": 2070 }, { "epoch": 4.5119825708061, "grad_norm": 2.530646800994873, "learning_rate": 5.490196078431373e-07, "loss": 4.084571838378906, "step": 2071 }, { "epoch": 4.514161220043573, "grad_norm": 1.7882734537124634, "learning_rate": 5.4880174291939e-07, "loss": 3.878854751586914, "step": 2072 }, { "epoch": 4.516339869281046, "grad_norm": 2.3200531005859375, "learning_rate": 5.485838779956426e-07, "loss": 4.159471035003662, "step": 2073 }, { "epoch": 4.518518518518518, "grad_norm": 2.0865914821624756, "learning_rate": 5.483660130718955e-07, "loss": 3.8261046409606934, "step": 2074 }, { "epoch": 4.5206971677559915, "grad_norm": 2.0502188205718994, "learning_rate": 5.481481481481481e-07, "loss": 4.084341049194336, "step": 2075 }, { "epoch": 4.522875816993464, "grad_norm": 2.9494521617889404, "learning_rate": 5.479302832244009e-07, "loss": 3.972355365753174, "step": 2076 }, { "epoch": 4.525054466230936, "grad_norm": 2.061483383178711, "learning_rate": 5.477124183006536e-07, "loss": 3.905923366546631, "step": 2077 }, { "epoch": 4.52723311546841, "grad_norm": 2.2093703746795654, "learning_rate": 5.474945533769064e-07, "loss": 3.951500654220581, "step": 2078 }, { "epoch": 4.529411764705882, "grad_norm": 2.5047378540039062, "learning_rate": 5.47276688453159e-07, "loss": 4.092803955078125, "step": 2079 }, { "epoch": 4.531590413943356, "grad_norm": 2.0348215103149414, "learning_rate": 5.470588235294118e-07, "loss": 4.071959018707275, "step": 2080 }, { "epoch": 4.533769063180828, "grad_norm": 1.8799012899398804, "learning_rate": 5.468409586056645e-07, "loss": 3.9014737606048584, "step": 2081 }, { "epoch": 4.5359477124183005, "grad_norm": 2.608292579650879, "learning_rate": 5.466230936819172e-07, "loss": 3.9859933853149414, "step": 2082 }, { "epoch": 4.538126361655774, "grad_norm": 2.159672498703003, "learning_rate": 5.464052287581699e-07, "loss": 3.8420708179473877, "step": 2083 }, { "epoch": 4.540305010893246, "grad_norm": 1.9260433912277222, "learning_rate": 5.461873638344227e-07, "loss": 3.912128210067749, "step": 2084 }, { "epoch": 4.542483660130719, "grad_norm": 2.2899537086486816, "learning_rate": 5.459694989106753e-07, "loss": 3.949881076812744, "step": 2085 }, { "epoch": 4.544662309368192, "grad_norm": 1.7342382669448853, "learning_rate": 5.45751633986928e-07, "loss": 3.789456367492676, "step": 2086 }, { "epoch": 4.546840958605665, "grad_norm": 2.428940773010254, "learning_rate": 5.455337690631808e-07, "loss": 3.9477667808532715, "step": 2087 }, { "epoch": 4.549019607843137, "grad_norm": 2.159994125366211, "learning_rate": 5.453159041394335e-07, "loss": 3.972402811050415, "step": 2088 }, { "epoch": 4.55119825708061, "grad_norm": 2.3378331661224365, "learning_rate": 5.450980392156862e-07, "loss": 3.9161946773529053, "step": 2089 }, { "epoch": 4.553376906318083, "grad_norm": 2.041429042816162, "learning_rate": 5.448801742919389e-07, "loss": 3.9552011489868164, "step": 2090 }, { "epoch": 4.555555555555555, "grad_norm": 2.83227801322937, "learning_rate": 5.446623093681917e-07, "loss": 4.0975141525268555, "step": 2091 }, { "epoch": 4.557734204793029, "grad_norm": 2.1185712814331055, "learning_rate": 5.444444444444443e-07, "loss": 3.969053268432617, "step": 2092 }, { "epoch": 4.559912854030501, "grad_norm": 2.4258639812469482, "learning_rate": 5.442265795206972e-07, "loss": 4.025950908660889, "step": 2093 }, { "epoch": 4.562091503267974, "grad_norm": 1.8243402242660522, "learning_rate": 5.440087145969498e-07, "loss": 3.8476688861846924, "step": 2094 }, { "epoch": 4.564270152505447, "grad_norm": 1.8838999271392822, "learning_rate": 5.437908496732026e-07, "loss": 3.8914482593536377, "step": 2095 }, { "epoch": 4.5664488017429194, "grad_norm": 1.525075912475586, "learning_rate": 5.435729847494553e-07, "loss": 3.9081015586853027, "step": 2096 }, { "epoch": 4.568627450980392, "grad_norm": 2.064873456954956, "learning_rate": 5.43355119825708e-07, "loss": 3.890920639038086, "step": 2097 }, { "epoch": 4.570806100217865, "grad_norm": 2.490494728088379, "learning_rate": 5.431372549019607e-07, "loss": 4.063034534454346, "step": 2098 }, { "epoch": 4.572984749455338, "grad_norm": 1.7653751373291016, "learning_rate": 5.429193899782135e-07, "loss": 3.9016804695129395, "step": 2099 }, { "epoch": 4.57516339869281, "grad_norm": 2.485373020172119, "learning_rate": 5.427015250544662e-07, "loss": 4.078092575073242, "step": 2100 }, { "epoch": 4.5773420479302835, "grad_norm": 1.6884737014770508, "learning_rate": 5.424836601307189e-07, "loss": 3.8840041160583496, "step": 2101 }, { "epoch": 4.579520697167756, "grad_norm": 2.349407434463501, "learning_rate": 5.422657952069716e-07, "loss": 4.012859344482422, "step": 2102 }, { "epoch": 4.5816993464052285, "grad_norm": 2.5180716514587402, "learning_rate": 5.420479302832244e-07, "loss": 3.9702987670898438, "step": 2103 }, { "epoch": 4.583877995642702, "grad_norm": 2.562161922454834, "learning_rate": 5.41830065359477e-07, "loss": 4.1108880043029785, "step": 2104 }, { "epoch": 4.586056644880174, "grad_norm": 2.014618158340454, "learning_rate": 5.416122004357299e-07, "loss": 3.889551877975464, "step": 2105 }, { "epoch": 4.588235294117647, "grad_norm": 1.826614260673523, "learning_rate": 5.413943355119825e-07, "loss": 3.703611373901367, "step": 2106 }, { "epoch": 4.59041394335512, "grad_norm": 2.074070930480957, "learning_rate": 5.411764705882353e-07, "loss": 3.8792269229888916, "step": 2107 }, { "epoch": 4.592592592592593, "grad_norm": 2.766526937484741, "learning_rate": 5.40958605664488e-07, "loss": 4.033573627471924, "step": 2108 }, { "epoch": 4.594771241830065, "grad_norm": 2.59171462059021, "learning_rate": 5.407407407407407e-07, "loss": 4.057830333709717, "step": 2109 }, { "epoch": 4.596949891067538, "grad_norm": 2.164198160171509, "learning_rate": 5.405228758169934e-07, "loss": 3.9591665267944336, "step": 2110 }, { "epoch": 4.599128540305011, "grad_norm": 2.711167335510254, "learning_rate": 5.403050108932462e-07, "loss": 3.930108070373535, "step": 2111 }, { "epoch": 4.601307189542483, "grad_norm": 2.4858052730560303, "learning_rate": 5.400871459694989e-07, "loss": 3.909036874771118, "step": 2112 }, { "epoch": 4.603485838779957, "grad_norm": 3.106459617614746, "learning_rate": 5.398692810457516e-07, "loss": 4.2414326667785645, "step": 2113 }, { "epoch": 4.605664488017429, "grad_norm": 3.095247268676758, "learning_rate": 5.396514161220043e-07, "loss": 4.178194522857666, "step": 2114 }, { "epoch": 4.607843137254902, "grad_norm": 2.3876163959503174, "learning_rate": 5.394335511982571e-07, "loss": 4.032224655151367, "step": 2115 }, { "epoch": 4.610021786492375, "grad_norm": 2.089022636413574, "learning_rate": 5.392156862745097e-07, "loss": 3.8489670753479004, "step": 2116 }, { "epoch": 4.612200435729847, "grad_norm": 2.615384340286255, "learning_rate": 5.389978213507626e-07, "loss": 4.1334919929504395, "step": 2117 }, { "epoch": 4.61437908496732, "grad_norm": 2.3309578895568848, "learning_rate": 5.387799564270152e-07, "loss": 3.843961238861084, "step": 2118 }, { "epoch": 4.616557734204793, "grad_norm": 1.7548717260360718, "learning_rate": 5.38562091503268e-07, "loss": 3.930011510848999, "step": 2119 }, { "epoch": 4.618736383442266, "grad_norm": 1.7198600769042969, "learning_rate": 5.383442265795207e-07, "loss": 3.8586320877075195, "step": 2120 }, { "epoch": 4.620915032679738, "grad_norm": 2.3532145023345947, "learning_rate": 5.381263616557734e-07, "loss": 3.959071397781372, "step": 2121 }, { "epoch": 4.6230936819172115, "grad_norm": 1.6288414001464844, "learning_rate": 5.379084967320261e-07, "loss": 4.017592430114746, "step": 2122 }, { "epoch": 4.625272331154684, "grad_norm": 2.5089468955993652, "learning_rate": 5.376906318082789e-07, "loss": 3.892212152481079, "step": 2123 }, { "epoch": 4.627450980392156, "grad_norm": 2.723372220993042, "learning_rate": 5.374727668845316e-07, "loss": 4.0665435791015625, "step": 2124 }, { "epoch": 4.62962962962963, "grad_norm": 1.9607703685760498, "learning_rate": 5.372549019607843e-07, "loss": 3.8994808197021484, "step": 2125 }, { "epoch": 4.631808278867102, "grad_norm": 2.271247148513794, "learning_rate": 5.37037037037037e-07, "loss": 3.913398027420044, "step": 2126 }, { "epoch": 4.633986928104575, "grad_norm": 1.6798640489578247, "learning_rate": 5.368191721132898e-07, "loss": 3.847532033920288, "step": 2127 }, { "epoch": 4.636165577342048, "grad_norm": 2.70210599899292, "learning_rate": 5.366013071895424e-07, "loss": 3.928398609161377, "step": 2128 }, { "epoch": 4.6383442265795205, "grad_norm": 2.2566771507263184, "learning_rate": 5.363834422657952e-07, "loss": 4.0304975509643555, "step": 2129 }, { "epoch": 4.640522875816993, "grad_norm": 2.0814361572265625, "learning_rate": 5.361655773420479e-07, "loss": 4.000179767608643, "step": 2130 }, { "epoch": 4.642701525054466, "grad_norm": 2.457941770553589, "learning_rate": 5.359477124183006e-07, "loss": 4.03395938873291, "step": 2131 }, { "epoch": 4.644880174291939, "grad_norm": 2.0136923789978027, "learning_rate": 5.357298474945533e-07, "loss": 3.7748968601226807, "step": 2132 }, { "epoch": 4.647058823529412, "grad_norm": 1.8314669132232666, "learning_rate": 5.355119825708061e-07, "loss": 3.994091749191284, "step": 2133 }, { "epoch": 4.649237472766885, "grad_norm": 2.315887451171875, "learning_rate": 5.352941176470588e-07, "loss": 4.063084125518799, "step": 2134 }, { "epoch": 4.651416122004357, "grad_norm": 2.961186408996582, "learning_rate": 5.350762527233115e-07, "loss": 4.085721969604492, "step": 2135 }, { "epoch": 4.65359477124183, "grad_norm": 2.0640103816986084, "learning_rate": 5.348583877995643e-07, "loss": 3.9864070415496826, "step": 2136 }, { "epoch": 4.655773420479303, "grad_norm": 2.1938436031341553, "learning_rate": 5.34640522875817e-07, "loss": 4.000490188598633, "step": 2137 }, { "epoch": 4.657952069716775, "grad_norm": 3.5884897708892822, "learning_rate": 5.344226579520696e-07, "loss": 4.120272636413574, "step": 2138 }, { "epoch": 4.660130718954249, "grad_norm": 2.476365804672241, "learning_rate": 5.342047930283225e-07, "loss": 3.9939112663269043, "step": 2139 }, { "epoch": 4.662309368191721, "grad_norm": 2.175795555114746, "learning_rate": 5.339869281045751e-07, "loss": 3.9101030826568604, "step": 2140 }, { "epoch": 4.664488017429194, "grad_norm": 2.4076435565948486, "learning_rate": 5.337690631808279e-07, "loss": 3.983128547668457, "step": 2141 }, { "epoch": 4.666666666666667, "grad_norm": 1.9637694358825684, "learning_rate": 5.335511982570806e-07, "loss": 3.800849676132202, "step": 2142 }, { "epoch": 4.668845315904139, "grad_norm": 2.540755033493042, "learning_rate": 5.333333333333333e-07, "loss": 4.003988742828369, "step": 2143 }, { "epoch": 4.671023965141612, "grad_norm": 2.632617950439453, "learning_rate": 5.33115468409586e-07, "loss": 3.9694104194641113, "step": 2144 }, { "epoch": 4.673202614379085, "grad_norm": 2.2909977436065674, "learning_rate": 5.328976034858388e-07, "loss": 3.889561176300049, "step": 2145 }, { "epoch": 4.675381263616558, "grad_norm": 1.9505780935287476, "learning_rate": 5.326797385620915e-07, "loss": 4.047893047332764, "step": 2146 }, { "epoch": 4.67755991285403, "grad_norm": 2.1541428565979004, "learning_rate": 5.324618736383442e-07, "loss": 3.864476203918457, "step": 2147 }, { "epoch": 4.6797385620915035, "grad_norm": 1.7612714767456055, "learning_rate": 5.32244008714597e-07, "loss": 3.97965669631958, "step": 2148 }, { "epoch": 4.681917211328976, "grad_norm": 2.12245774269104, "learning_rate": 5.320261437908497e-07, "loss": 3.9374852180480957, "step": 2149 }, { "epoch": 4.684095860566448, "grad_norm": 2.376817226409912, "learning_rate": 5.318082788671023e-07, "loss": 3.873616933822632, "step": 2150 }, { "epoch": 4.686274509803922, "grad_norm": 2.6871206760406494, "learning_rate": 5.315904139433552e-07, "loss": 3.958369493484497, "step": 2151 }, { "epoch": 4.688453159041394, "grad_norm": 2.5996835231781006, "learning_rate": 5.313725490196078e-07, "loss": 4.202718257904053, "step": 2152 }, { "epoch": 4.690631808278867, "grad_norm": 2.2923851013183594, "learning_rate": 5.311546840958606e-07, "loss": 4.0816969871521, "step": 2153 }, { "epoch": 4.69281045751634, "grad_norm": 2.153865098953247, "learning_rate": 5.309368191721133e-07, "loss": 3.8530945777893066, "step": 2154 }, { "epoch": 4.6949891067538125, "grad_norm": 2.450063467025757, "learning_rate": 5.30718954248366e-07, "loss": 3.9239509105682373, "step": 2155 }, { "epoch": 4.697167755991286, "grad_norm": 1.9572272300720215, "learning_rate": 5.305010893246187e-07, "loss": 3.9247608184814453, "step": 2156 }, { "epoch": 4.699346405228758, "grad_norm": 2.3382740020751953, "learning_rate": 5.302832244008715e-07, "loss": 3.994748592376709, "step": 2157 }, { "epoch": 4.701525054466231, "grad_norm": 1.9592163562774658, "learning_rate": 5.300653594771242e-07, "loss": 3.6909468173980713, "step": 2158 }, { "epoch": 4.703703703703704, "grad_norm": 1.8269413709640503, "learning_rate": 5.298474945533768e-07, "loss": 3.954634189605713, "step": 2159 }, { "epoch": 4.705882352941177, "grad_norm": 2.8407979011535645, "learning_rate": 5.296296296296296e-07, "loss": 4.087118148803711, "step": 2160 }, { "epoch": 4.708061002178649, "grad_norm": 3.1121270656585693, "learning_rate": 5.294117647058823e-07, "loss": 4.188414573669434, "step": 2161 }, { "epoch": 4.710239651416122, "grad_norm": 2.6400368213653564, "learning_rate": 5.29193899782135e-07, "loss": 3.8665771484375, "step": 2162 }, { "epoch": 4.712418300653595, "grad_norm": 1.5918583869934082, "learning_rate": 5.289760348583878e-07, "loss": 3.9461987018585205, "step": 2163 }, { "epoch": 4.714596949891067, "grad_norm": 2.624528646469116, "learning_rate": 5.287581699346405e-07, "loss": 3.999235153198242, "step": 2164 }, { "epoch": 4.716775599128541, "grad_norm": 1.8700175285339355, "learning_rate": 5.285403050108931e-07, "loss": 3.9965691566467285, "step": 2165 }, { "epoch": 4.718954248366013, "grad_norm": 1.6883490085601807, "learning_rate": 5.28322440087146e-07, "loss": 3.8835833072662354, "step": 2166 }, { "epoch": 4.721132897603486, "grad_norm": 2.301767587661743, "learning_rate": 5.281045751633986e-07, "loss": 3.9328601360321045, "step": 2167 }, { "epoch": 4.723311546840959, "grad_norm": 2.2753498554229736, "learning_rate": 5.278867102396514e-07, "loss": 3.9365549087524414, "step": 2168 }, { "epoch": 4.7254901960784315, "grad_norm": 1.8338505029678345, "learning_rate": 5.276688453159041e-07, "loss": 3.913837194442749, "step": 2169 }, { "epoch": 4.727668845315904, "grad_norm": 1.8896178007125854, "learning_rate": 5.274509803921569e-07, "loss": 4.024864196777344, "step": 2170 }, { "epoch": 4.729847494553377, "grad_norm": 1.8769129514694214, "learning_rate": 5.272331154684095e-07, "loss": 3.920771837234497, "step": 2171 }, { "epoch": 4.73202614379085, "grad_norm": 2.0436315536499023, "learning_rate": 5.270152505446623e-07, "loss": 3.9269261360168457, "step": 2172 }, { "epoch": 4.734204793028322, "grad_norm": 1.948180913925171, "learning_rate": 5.26797385620915e-07, "loss": 3.869349718093872, "step": 2173 }, { "epoch": 4.7363834422657956, "grad_norm": 2.0264651775360107, "learning_rate": 5.265795206971677e-07, "loss": 3.8995184898376465, "step": 2174 }, { "epoch": 4.738562091503268, "grad_norm": 2.021568775177002, "learning_rate": 5.263616557734205e-07, "loss": 3.967611312866211, "step": 2175 }, { "epoch": 4.7407407407407405, "grad_norm": 2.011512517929077, "learning_rate": 5.261437908496732e-07, "loss": 3.984649419784546, "step": 2176 }, { "epoch": 4.742919389978214, "grad_norm": 2.2747642993927, "learning_rate": 5.259259259259258e-07, "loss": 4.100115776062012, "step": 2177 }, { "epoch": 4.745098039215686, "grad_norm": 2.1688554286956787, "learning_rate": 5.257080610021786e-07, "loss": 3.989992380142212, "step": 2178 }, { "epoch": 4.747276688453159, "grad_norm": 2.782025098800659, "learning_rate": 5.254901960784313e-07, "loss": 4.183507442474365, "step": 2179 }, { "epoch": 4.749455337690632, "grad_norm": 2.280728340148926, "learning_rate": 5.252723311546841e-07, "loss": 3.9378345012664795, "step": 2180 }, { "epoch": 4.751633986928105, "grad_norm": 2.451010227203369, "learning_rate": 5.250544662309367e-07, "loss": 4.091984748840332, "step": 2181 }, { "epoch": 4.753812636165577, "grad_norm": 2.0073137283325195, "learning_rate": 5.248366013071896e-07, "loss": 3.834050416946411, "step": 2182 }, { "epoch": 4.75599128540305, "grad_norm": 2.096696615219116, "learning_rate": 5.246187363834422e-07, "loss": 3.9146523475646973, "step": 2183 }, { "epoch": 4.758169934640523, "grad_norm": 1.8785223960876465, "learning_rate": 5.244008714596949e-07, "loss": 3.8380417823791504, "step": 2184 }, { "epoch": 4.760348583877995, "grad_norm": 2.353606700897217, "learning_rate": 5.241830065359477e-07, "loss": 4.002041816711426, "step": 2185 }, { "epoch": 4.762527233115469, "grad_norm": 1.8288589715957642, "learning_rate": 5.239651416122004e-07, "loss": 3.888721466064453, "step": 2186 }, { "epoch": 4.764705882352941, "grad_norm": 2.4094719886779785, "learning_rate": 5.237472766884531e-07, "loss": 4.0467939376831055, "step": 2187 }, { "epoch": 4.766884531590414, "grad_norm": 1.6682060956954956, "learning_rate": 5.235294117647059e-07, "loss": 3.796552896499634, "step": 2188 }, { "epoch": 4.769063180827887, "grad_norm": 2.6372475624084473, "learning_rate": 5.233115468409585e-07, "loss": 4.3329901695251465, "step": 2189 }, { "epoch": 4.771241830065359, "grad_norm": 1.8671406507492065, "learning_rate": 5.230936819172113e-07, "loss": 3.9116477966308594, "step": 2190 }, { "epoch": 4.773420479302832, "grad_norm": 1.5986055135726929, "learning_rate": 5.22875816993464e-07, "loss": 3.826262950897217, "step": 2191 }, { "epoch": 4.775599128540305, "grad_norm": 1.7989411354064941, "learning_rate": 5.226579520697168e-07, "loss": 3.938342332839966, "step": 2192 }, { "epoch": 4.777777777777778, "grad_norm": 2.430493116378784, "learning_rate": 5.224400871459694e-07, "loss": 3.9573771953582764, "step": 2193 }, { "epoch": 4.77995642701525, "grad_norm": 2.244939088821411, "learning_rate": 5.222222222222223e-07, "loss": 4.156424522399902, "step": 2194 }, { "epoch": 4.7821350762527235, "grad_norm": 1.7013659477233887, "learning_rate": 5.220043572984749e-07, "loss": 3.9130287170410156, "step": 2195 }, { "epoch": 4.784313725490196, "grad_norm": 2.9745895862579346, "learning_rate": 5.217864923747276e-07, "loss": 4.127662181854248, "step": 2196 }, { "epoch": 4.786492374727668, "grad_norm": 2.2594375610351562, "learning_rate": 5.215686274509804e-07, "loss": 3.9132022857666016, "step": 2197 }, { "epoch": 4.788671023965142, "grad_norm": 1.6201003789901733, "learning_rate": 5.213507625272331e-07, "loss": 3.8900020122528076, "step": 2198 }, { "epoch": 4.790849673202614, "grad_norm": 2.6863675117492676, "learning_rate": 5.211328976034858e-07, "loss": 4.023871421813965, "step": 2199 }, { "epoch": 4.793028322440087, "grad_norm": 2.29023814201355, "learning_rate": 5.209150326797386e-07, "loss": 3.8958775997161865, "step": 2200 }, { "epoch": 4.79520697167756, "grad_norm": 1.7762328386306763, "learning_rate": 5.206971677559912e-07, "loss": 3.9157400131225586, "step": 2201 }, { "epoch": 4.7973856209150325, "grad_norm": 1.789904236793518, "learning_rate": 5.20479302832244e-07, "loss": 3.8410162925720215, "step": 2202 }, { "epoch": 4.799564270152505, "grad_norm": 2.6801400184631348, "learning_rate": 5.202614379084967e-07, "loss": 4.09963846206665, "step": 2203 }, { "epoch": 4.801742919389978, "grad_norm": 2.4764461517333984, "learning_rate": 5.200435729847495e-07, "loss": 4.060131072998047, "step": 2204 }, { "epoch": 4.803921568627451, "grad_norm": 3.2093541622161865, "learning_rate": 5.198257080610021e-07, "loss": 4.065670013427734, "step": 2205 }, { "epoch": 4.806100217864923, "grad_norm": 2.7993669509887695, "learning_rate": 5.19607843137255e-07, "loss": 3.8701937198638916, "step": 2206 }, { "epoch": 4.808278867102397, "grad_norm": 2.2306466102600098, "learning_rate": 5.193899782135076e-07, "loss": 3.851048707962036, "step": 2207 }, { "epoch": 4.810457516339869, "grad_norm": 2.5088729858398438, "learning_rate": 5.191721132897603e-07, "loss": 4.02762508392334, "step": 2208 }, { "epoch": 4.812636165577342, "grad_norm": 1.726855754852295, "learning_rate": 5.189542483660131e-07, "loss": 3.8717987537384033, "step": 2209 }, { "epoch": 4.814814814814815, "grad_norm": 3.3268637657165527, "learning_rate": 5.187363834422658e-07, "loss": 4.072587490081787, "step": 2210 }, { "epoch": 4.816993464052287, "grad_norm": 1.7582250833511353, "learning_rate": 5.185185185185185e-07, "loss": 3.944211006164551, "step": 2211 }, { "epoch": 4.819172113289761, "grad_norm": 1.9972150325775146, "learning_rate": 5.183006535947713e-07, "loss": 3.972930908203125, "step": 2212 }, { "epoch": 4.821350762527233, "grad_norm": 2.224585771560669, "learning_rate": 5.180827886710239e-07, "loss": 3.967594623565674, "step": 2213 }, { "epoch": 4.823529411764706, "grad_norm": 1.771910309791565, "learning_rate": 5.178649237472767e-07, "loss": 3.961142063140869, "step": 2214 }, { "epoch": 4.825708061002179, "grad_norm": 2.2520458698272705, "learning_rate": 5.176470588235294e-07, "loss": 3.907545566558838, "step": 2215 }, { "epoch": 4.827886710239651, "grad_norm": 1.9613556861877441, "learning_rate": 5.174291938997822e-07, "loss": 3.8015570640563965, "step": 2216 }, { "epoch": 4.830065359477124, "grad_norm": 2.21315598487854, "learning_rate": 5.172113289760348e-07, "loss": 4.056735515594482, "step": 2217 }, { "epoch": 4.832244008714597, "grad_norm": 2.3269448280334473, "learning_rate": 5.169934640522877e-07, "loss": 3.88852858543396, "step": 2218 }, { "epoch": 4.83442265795207, "grad_norm": 2.3433544635772705, "learning_rate": 5.167755991285403e-07, "loss": 3.9755382537841797, "step": 2219 }, { "epoch": 4.836601307189542, "grad_norm": 1.994820475578308, "learning_rate": 5.16557734204793e-07, "loss": 3.998561143875122, "step": 2220 }, { "epoch": 4.8387799564270155, "grad_norm": 2.2278783321380615, "learning_rate": 5.163398692810458e-07, "loss": 4.12270975112915, "step": 2221 }, { "epoch": 4.840958605664488, "grad_norm": 1.8964664936065674, "learning_rate": 5.161220043572985e-07, "loss": 3.914201498031616, "step": 2222 }, { "epoch": 4.8431372549019605, "grad_norm": 2.155696153640747, "learning_rate": 5.159041394335512e-07, "loss": 3.9693925380706787, "step": 2223 }, { "epoch": 4.845315904139434, "grad_norm": 2.286709785461426, "learning_rate": 5.15686274509804e-07, "loss": 3.789750099182129, "step": 2224 }, { "epoch": 4.847494553376906, "grad_norm": 1.8653053045272827, "learning_rate": 5.154684095860566e-07, "loss": 3.870958089828491, "step": 2225 }, { "epoch": 4.849673202614379, "grad_norm": 1.9381932020187378, "learning_rate": 5.152505446623093e-07, "loss": 3.9062867164611816, "step": 2226 }, { "epoch": 4.851851851851852, "grad_norm": 2.136869192123413, "learning_rate": 5.15032679738562e-07, "loss": 3.9674057960510254, "step": 2227 }, { "epoch": 4.8540305010893245, "grad_norm": 2.2988593578338623, "learning_rate": 5.148148148148148e-07, "loss": 3.974482536315918, "step": 2228 }, { "epoch": 4.856209150326797, "grad_norm": 1.7715452909469604, "learning_rate": 5.145969498910675e-07, "loss": 3.9663734436035156, "step": 2229 }, { "epoch": 4.85838779956427, "grad_norm": 2.0895512104034424, "learning_rate": 5.143790849673201e-07, "loss": 3.915449619293213, "step": 2230 }, { "epoch": 4.860566448801743, "grad_norm": 2.4509024620056152, "learning_rate": 5.14161220043573e-07, "loss": 4.010859966278076, "step": 2231 }, { "epoch": 4.862745098039216, "grad_norm": 2.400008201599121, "learning_rate": 5.139433551198256e-07, "loss": 4.015111923217773, "step": 2232 }, { "epoch": 4.864923747276689, "grad_norm": 1.9863617420196533, "learning_rate": 5.137254901960784e-07, "loss": 3.8965554237365723, "step": 2233 }, { "epoch": 4.867102396514161, "grad_norm": 2.344766139984131, "learning_rate": 5.135076252723311e-07, "loss": 4.079988479614258, "step": 2234 }, { "epoch": 4.8692810457516345, "grad_norm": 1.8015066385269165, "learning_rate": 5.132897603485839e-07, "loss": 4.000588893890381, "step": 2235 }, { "epoch": 4.871459694989107, "grad_norm": 1.9901202917099, "learning_rate": 5.130718954248365e-07, "loss": 3.8814308643341064, "step": 2236 }, { "epoch": 4.873638344226579, "grad_norm": 1.4880259037017822, "learning_rate": 5.128540305010893e-07, "loss": 3.784320831298828, "step": 2237 }, { "epoch": 4.875816993464053, "grad_norm": 1.786797046661377, "learning_rate": 5.12636165577342e-07, "loss": 3.9652597904205322, "step": 2238 }, { "epoch": 4.877995642701525, "grad_norm": 2.2796404361724854, "learning_rate": 5.124183006535947e-07, "loss": 3.641780138015747, "step": 2239 }, { "epoch": 4.880174291938998, "grad_norm": 3.041426420211792, "learning_rate": 5.122004357298475e-07, "loss": 4.171051979064941, "step": 2240 }, { "epoch": 4.882352941176471, "grad_norm": 2.571495294570923, "learning_rate": 5.119825708061002e-07, "loss": 3.8197271823883057, "step": 2241 }, { "epoch": 4.8845315904139435, "grad_norm": 1.9779208898544312, "learning_rate": 5.117647058823528e-07, "loss": 3.963747978210449, "step": 2242 }, { "epoch": 4.886710239651416, "grad_norm": 2.2306408882141113, "learning_rate": 5.115468409586057e-07, "loss": 3.923487424850464, "step": 2243 }, { "epoch": 4.888888888888889, "grad_norm": 1.672719955444336, "learning_rate": 5.113289760348583e-07, "loss": 3.817233085632324, "step": 2244 }, { "epoch": 4.891067538126362, "grad_norm": 3.1602370738983154, "learning_rate": 5.111111111111111e-07, "loss": 4.113470077514648, "step": 2245 }, { "epoch": 4.893246187363834, "grad_norm": 1.9398164749145508, "learning_rate": 5.108932461873638e-07, "loss": 4.002882957458496, "step": 2246 }, { "epoch": 4.895424836601308, "grad_norm": 2.122755765914917, "learning_rate": 5.106753812636165e-07, "loss": 3.9538631439208984, "step": 2247 }, { "epoch": 4.89760348583878, "grad_norm": 2.832982063293457, "learning_rate": 5.104575163398692e-07, "loss": 4.044690132141113, "step": 2248 }, { "epoch": 4.8997821350762525, "grad_norm": 2.871750831604004, "learning_rate": 5.10239651416122e-07, "loss": 4.053842544555664, "step": 2249 }, { "epoch": 4.901960784313726, "grad_norm": 1.868222713470459, "learning_rate": 5.100217864923747e-07, "loss": 3.865553617477417, "step": 2250 }, { "epoch": 4.904139433551198, "grad_norm": 2.3855204582214355, "learning_rate": 5.098039215686274e-07, "loss": 3.838442325592041, "step": 2251 }, { "epoch": 4.906318082788671, "grad_norm": 2.3344788551330566, "learning_rate": 5.095860566448802e-07, "loss": 4.0279541015625, "step": 2252 }, { "epoch": 4.908496732026144, "grad_norm": 1.6994167566299438, "learning_rate": 5.093681917211329e-07, "loss": 3.8964104652404785, "step": 2253 }, { "epoch": 4.910675381263617, "grad_norm": 2.8505561351776123, "learning_rate": 5.091503267973855e-07, "loss": 4.12226676940918, "step": 2254 }, { "epoch": 4.912854030501089, "grad_norm": 1.9245264530181885, "learning_rate": 5.089324618736384e-07, "loss": 3.980527877807617, "step": 2255 }, { "epoch": 4.915032679738562, "grad_norm": 2.0368313789367676, "learning_rate": 5.08714596949891e-07, "loss": 3.8993148803710938, "step": 2256 }, { "epoch": 4.917211328976035, "grad_norm": 2.3526065349578857, "learning_rate": 5.084967320261438e-07, "loss": 3.939013719558716, "step": 2257 }, { "epoch": 4.919389978213507, "grad_norm": 2.2821571826934814, "learning_rate": 5.082788671023965e-07, "loss": 3.944556713104248, "step": 2258 }, { "epoch": 4.921568627450981, "grad_norm": 2.049231767654419, "learning_rate": 5.080610021786492e-07, "loss": 3.9394750595092773, "step": 2259 }, { "epoch": 4.923747276688453, "grad_norm": 2.615406036376953, "learning_rate": 5.078431372549019e-07, "loss": 3.973417282104492, "step": 2260 }, { "epoch": 4.925925925925926, "grad_norm": 3.0004823207855225, "learning_rate": 5.076252723311547e-07, "loss": 4.106883525848389, "step": 2261 }, { "epoch": 4.928104575163399, "grad_norm": 2.713779926300049, "learning_rate": 5.074074074074074e-07, "loss": 3.971985101699829, "step": 2262 }, { "epoch": 4.930283224400871, "grad_norm": 2.711402416229248, "learning_rate": 5.071895424836601e-07, "loss": 4.040958881378174, "step": 2263 }, { "epoch": 4.932461873638344, "grad_norm": 2.1009786128997803, "learning_rate": 5.069716775599129e-07, "loss": 3.838245153427124, "step": 2264 }, { "epoch": 4.934640522875817, "grad_norm": 2.2454657554626465, "learning_rate": 5.067538126361656e-07, "loss": 3.974653482437134, "step": 2265 }, { "epoch": 4.93681917211329, "grad_norm": 2.012840986251831, "learning_rate": 5.065359477124182e-07, "loss": 3.9041709899902344, "step": 2266 }, { "epoch": 4.938997821350762, "grad_norm": 2.1841979026794434, "learning_rate": 5.063180827886711e-07, "loss": 3.9864742755889893, "step": 2267 }, { "epoch": 4.9411764705882355, "grad_norm": 2.2137842178344727, "learning_rate": 5.061002178649237e-07, "loss": 3.988637924194336, "step": 2268 }, { "epoch": 4.943355119825708, "grad_norm": 2.9165918827056885, "learning_rate": 5.058823529411765e-07, "loss": 4.132877826690674, "step": 2269 }, { "epoch": 4.94553376906318, "grad_norm": 2.442948341369629, "learning_rate": 5.056644880174292e-07, "loss": 4.055883407592773, "step": 2270 }, { "epoch": 4.947712418300654, "grad_norm": 1.9010896682739258, "learning_rate": 5.054466230936819e-07, "loss": 3.8015527725219727, "step": 2271 }, { "epoch": 4.949891067538126, "grad_norm": 2.136239767074585, "learning_rate": 5.052287581699346e-07, "loss": 4.020484447479248, "step": 2272 }, { "epoch": 4.952069716775599, "grad_norm": 1.7918301820755005, "learning_rate": 5.050108932461873e-07, "loss": 3.863694429397583, "step": 2273 }, { "epoch": 4.954248366013072, "grad_norm": 2.152824640274048, "learning_rate": 5.047930283224401e-07, "loss": 4.029211521148682, "step": 2274 }, { "epoch": 4.9564270152505445, "grad_norm": 2.282872200012207, "learning_rate": 5.045751633986928e-07, "loss": 3.8456711769104004, "step": 2275 }, { "epoch": 4.958605664488017, "grad_norm": 2.507498264312744, "learning_rate": 5.043572984749454e-07, "loss": 3.8849070072174072, "step": 2276 }, { "epoch": 4.96078431372549, "grad_norm": 2.3902013301849365, "learning_rate": 5.041394335511983e-07, "loss": 4.021683692932129, "step": 2277 }, { "epoch": 4.962962962962963, "grad_norm": 2.1862683296203613, "learning_rate": 5.039215686274509e-07, "loss": 3.9404714107513428, "step": 2278 }, { "epoch": 4.965141612200435, "grad_norm": 2.3644559383392334, "learning_rate": 5.037037037037037e-07, "loss": 3.8481836318969727, "step": 2279 }, { "epoch": 4.967320261437909, "grad_norm": 2.165055990219116, "learning_rate": 5.034858387799564e-07, "loss": 4.019814968109131, "step": 2280 }, { "epoch": 4.969498910675381, "grad_norm": 2.3817899227142334, "learning_rate": 5.032679738562092e-07, "loss": 4.022697448730469, "step": 2281 }, { "epoch": 4.9716775599128535, "grad_norm": 1.8875834941864014, "learning_rate": 5.030501089324618e-07, "loss": 3.886652708053589, "step": 2282 }, { "epoch": 4.973856209150327, "grad_norm": 2.2983829975128174, "learning_rate": 5.028322440087146e-07, "loss": 3.9699504375457764, "step": 2283 }, { "epoch": 4.976034858387799, "grad_norm": 2.1770689487457275, "learning_rate": 5.026143790849673e-07, "loss": 3.953711986541748, "step": 2284 }, { "epoch": 4.978213507625273, "grad_norm": 2.0662243366241455, "learning_rate": 5.0239651416122e-07, "loss": 3.9703381061553955, "step": 2285 }, { "epoch": 4.980392156862745, "grad_norm": 2.4233412742614746, "learning_rate": 5.021786492374728e-07, "loss": 3.901291608810425, "step": 2286 }, { "epoch": 4.982570806100218, "grad_norm": 3.6833972930908203, "learning_rate": 5.019607843137255e-07, "loss": 4.2417473793029785, "step": 2287 }, { "epoch": 4.984749455337691, "grad_norm": 1.9776519536972046, "learning_rate": 5.017429193899781e-07, "loss": 4.026832580566406, "step": 2288 }, { "epoch": 4.9869281045751634, "grad_norm": 2.066070556640625, "learning_rate": 5.01525054466231e-07, "loss": 3.9790122509002686, "step": 2289 }, { "epoch": 4.989106753812636, "grad_norm": 2.324146032333374, "learning_rate": 5.013071895424836e-07, "loss": 4.042368412017822, "step": 2290 }, { "epoch": 4.991285403050109, "grad_norm": 2.4735517501831055, "learning_rate": 5.010893246187364e-07, "loss": 4.038393497467041, "step": 2291 }, { "epoch": 4.993464052287582, "grad_norm": 4.0087056159973145, "learning_rate": 5.008714596949891e-07, "loss": 4.464468479156494, "step": 2292 }, { "epoch": 4.995642701525054, "grad_norm": 1.7427257299423218, "learning_rate": 5.006535947712419e-07, "loss": 3.9285364151000977, "step": 2293 }, { "epoch": 4.9978213507625275, "grad_norm": 1.9140081405639648, "learning_rate": 5.004357298474945e-07, "loss": 4.093921661376953, "step": 2294 }, { "epoch": 5.0, "grad_norm": 2.1467864513397217, "learning_rate": 5.002178649237473e-07, "loss": 4.002460956573486, "step": 2295 }, { "epoch": 5.0021786492374725, "grad_norm": 2.0769295692443848, "learning_rate": 5e-07, "loss": 3.980248212814331, "step": 2296 }, { "epoch": 5.004357298474946, "grad_norm": 3.4399819374084473, "learning_rate": 4.997821350762527e-07, "loss": 4.111696243286133, "step": 2297 }, { "epoch": 5.006535947712418, "grad_norm": 2.556821823120117, "learning_rate": 4.995642701525054e-07, "loss": 4.088751792907715, "step": 2298 }, { "epoch": 5.008714596949891, "grad_norm": 2.8711681365966797, "learning_rate": 4.993464052287581e-07, "loss": 4.153500080108643, "step": 2299 }, { "epoch": 5.010893246187364, "grad_norm": 1.9966511726379395, "learning_rate": 4.991285403050108e-07, "loss": 3.930098056793213, "step": 2300 }, { "epoch": 5.0130718954248366, "grad_norm": 1.9359312057495117, "learning_rate": 4.989106753812636e-07, "loss": 3.916348695755005, "step": 2301 }, { "epoch": 5.015250544662309, "grad_norm": 1.5690878629684448, "learning_rate": 4.986928104575163e-07, "loss": 3.8394322395324707, "step": 2302 }, { "epoch": 5.017429193899782, "grad_norm": 2.3207757472991943, "learning_rate": 4.984749455337691e-07, "loss": 4.024141311645508, "step": 2303 }, { "epoch": 5.019607843137255, "grad_norm": 2.8214917182922363, "learning_rate": 4.982570806100217e-07, "loss": 4.053504943847656, "step": 2304 }, { "epoch": 5.021786492374727, "grad_norm": 2.3574771881103516, "learning_rate": 4.980392156862744e-07, "loss": 3.9401421546936035, "step": 2305 }, { "epoch": 5.023965141612201, "grad_norm": 2.4941906929016113, "learning_rate": 4.978213507625272e-07, "loss": 3.869664430618286, "step": 2306 }, { "epoch": 5.026143790849673, "grad_norm": 2.381814479827881, "learning_rate": 4.976034858387799e-07, "loss": 3.9263782501220703, "step": 2307 }, { "epoch": 5.028322440087146, "grad_norm": 2.670118570327759, "learning_rate": 4.973856209150327e-07, "loss": 4.176826477050781, "step": 2308 }, { "epoch": 5.030501089324619, "grad_norm": 2.173797369003296, "learning_rate": 4.971677559912853e-07, "loss": 3.879490852355957, "step": 2309 }, { "epoch": 5.032679738562091, "grad_norm": 1.6277283430099487, "learning_rate": 4.969498910675381e-07, "loss": 3.894775867462158, "step": 2310 }, { "epoch": 5.034858387799564, "grad_norm": 2.1290152072906494, "learning_rate": 4.967320261437908e-07, "loss": 3.963571548461914, "step": 2311 }, { "epoch": 5.037037037037037, "grad_norm": 2.236130714416504, "learning_rate": 4.965141612200435e-07, "loss": 3.8576841354370117, "step": 2312 }, { "epoch": 5.03921568627451, "grad_norm": 2.07883882522583, "learning_rate": 4.962962962962963e-07, "loss": 3.7692859172821045, "step": 2313 }, { "epoch": 5.041394335511983, "grad_norm": 2.280726432800293, "learning_rate": 4.96078431372549e-07, "loss": 3.9030230045318604, "step": 2314 }, { "epoch": 5.0435729847494555, "grad_norm": 2.217580795288086, "learning_rate": 4.958605664488017e-07, "loss": 3.888173818588257, "step": 2315 }, { "epoch": 5.045751633986928, "grad_norm": 2.4706170558929443, "learning_rate": 4.956427015250544e-07, "loss": 4.189489364624023, "step": 2316 }, { "epoch": 5.047930283224401, "grad_norm": 1.6941262483596802, "learning_rate": 4.954248366013071e-07, "loss": 3.7595252990722656, "step": 2317 }, { "epoch": 5.050108932461874, "grad_norm": 1.8976666927337646, "learning_rate": 4.952069716775599e-07, "loss": 3.9480810165405273, "step": 2318 }, { "epoch": 5.052287581699346, "grad_norm": 1.9700231552124023, "learning_rate": 4.949891067538126e-07, "loss": 3.9852473735809326, "step": 2319 }, { "epoch": 5.05446623093682, "grad_norm": 1.9452837705612183, "learning_rate": 4.947712418300654e-07, "loss": 3.8493895530700684, "step": 2320 }, { "epoch": 5.056644880174292, "grad_norm": 2.1107265949249268, "learning_rate": 4.94553376906318e-07, "loss": 3.761704444885254, "step": 2321 }, { "epoch": 5.0588235294117645, "grad_norm": 2.682438373565674, "learning_rate": 4.943355119825708e-07, "loss": 4.118619441986084, "step": 2322 }, { "epoch": 5.061002178649238, "grad_norm": 2.0839436054229736, "learning_rate": 4.941176470588235e-07, "loss": 3.878387451171875, "step": 2323 }, { "epoch": 5.06318082788671, "grad_norm": 2.047210216522217, "learning_rate": 4.938997821350762e-07, "loss": 3.8477039337158203, "step": 2324 }, { "epoch": 5.065359477124183, "grad_norm": 1.6127349138259888, "learning_rate": 4.93681917211329e-07, "loss": 3.893745183944702, "step": 2325 }, { "epoch": 5.067538126361656, "grad_norm": 1.7671970129013062, "learning_rate": 4.934640522875817e-07, "loss": 3.783796548843384, "step": 2326 }, { "epoch": 5.069716775599129, "grad_norm": 2.241070508956909, "learning_rate": 4.932461873638344e-07, "loss": 3.9195172786712646, "step": 2327 }, { "epoch": 5.071895424836601, "grad_norm": 2.649526596069336, "learning_rate": 4.930283224400871e-07, "loss": 4.026880741119385, "step": 2328 }, { "epoch": 5.074074074074074, "grad_norm": 1.723897933959961, "learning_rate": 4.928104575163398e-07, "loss": 3.827521324157715, "step": 2329 }, { "epoch": 5.076252723311547, "grad_norm": 2.079275369644165, "learning_rate": 4.925925925925926e-07, "loss": 4.033343315124512, "step": 2330 }, { "epoch": 5.078431372549019, "grad_norm": 2.304816484451294, "learning_rate": 4.923747276688453e-07, "loss": 3.91083025932312, "step": 2331 }, { "epoch": 5.080610021786493, "grad_norm": 2.5115511417388916, "learning_rate": 4.92156862745098e-07, "loss": 4.015562534332275, "step": 2332 }, { "epoch": 5.082788671023965, "grad_norm": 1.9942381381988525, "learning_rate": 4.919389978213507e-07, "loss": 3.905773401260376, "step": 2333 }, { "epoch": 5.084967320261438, "grad_norm": 2.608992099761963, "learning_rate": 4.917211328976034e-07, "loss": 3.966984748840332, "step": 2334 }, { "epoch": 5.087145969498911, "grad_norm": 2.4699113368988037, "learning_rate": 4.915032679738562e-07, "loss": 3.873391628265381, "step": 2335 }, { "epoch": 5.089324618736383, "grad_norm": 2.4117233753204346, "learning_rate": 4.912854030501089e-07, "loss": 4.010549545288086, "step": 2336 }, { "epoch": 5.091503267973856, "grad_norm": 2.2558798789978027, "learning_rate": 4.910675381263617e-07, "loss": 4.040482521057129, "step": 2337 }, { "epoch": 5.093681917211329, "grad_norm": 2.0947885513305664, "learning_rate": 4.908496732026143e-07, "loss": 3.9388058185577393, "step": 2338 }, { "epoch": 5.095860566448802, "grad_norm": 2.180865526199341, "learning_rate": 4.906318082788671e-07, "loss": 3.7664215564727783, "step": 2339 }, { "epoch": 5.098039215686274, "grad_norm": 3.114321231842041, "learning_rate": 4.904139433551198e-07, "loss": 4.093318939208984, "step": 2340 }, { "epoch": 5.1002178649237475, "grad_norm": 2.2297472953796387, "learning_rate": 4.901960784313725e-07, "loss": 3.827765464782715, "step": 2341 }, { "epoch": 5.10239651416122, "grad_norm": 2.115260124206543, "learning_rate": 4.899782135076253e-07, "loss": 4.063700199127197, "step": 2342 }, { "epoch": 5.104575163398692, "grad_norm": 3.015540599822998, "learning_rate": 4.89760348583878e-07, "loss": 4.191771507263184, "step": 2343 }, { "epoch": 5.106753812636166, "grad_norm": 2.097306728363037, "learning_rate": 4.895424836601307e-07, "loss": 3.9506032466888428, "step": 2344 }, { "epoch": 5.108932461873638, "grad_norm": 2.634305715560913, "learning_rate": 4.893246187363834e-07, "loss": 3.9359335899353027, "step": 2345 }, { "epoch": 5.111111111111111, "grad_norm": 2.07562518119812, "learning_rate": 4.891067538126361e-07, "loss": 3.9322080612182617, "step": 2346 }, { "epoch": 5.113289760348584, "grad_norm": 2.252413511276245, "learning_rate": 4.888888888888889e-07, "loss": 3.8525390625, "step": 2347 }, { "epoch": 5.1154684095860565, "grad_norm": 2.5415096282958984, "learning_rate": 4.886710239651416e-07, "loss": 3.986069679260254, "step": 2348 }, { "epoch": 5.117647058823529, "grad_norm": 2.0740127563476562, "learning_rate": 4.884531590413944e-07, "loss": 3.8755810260772705, "step": 2349 }, { "epoch": 5.119825708061002, "grad_norm": 2.2147586345672607, "learning_rate": 4.88235294117647e-07, "loss": 3.9549360275268555, "step": 2350 }, { "epoch": 5.122004357298475, "grad_norm": 2.177706718444824, "learning_rate": 4.880174291938998e-07, "loss": 3.8547329902648926, "step": 2351 }, { "epoch": 5.124183006535947, "grad_norm": 1.725949764251709, "learning_rate": 4.877995642701525e-07, "loss": 3.8841159343719482, "step": 2352 }, { "epoch": 5.126361655773421, "grad_norm": 2.1526031494140625, "learning_rate": 4.875816993464052e-07, "loss": 3.774921417236328, "step": 2353 }, { "epoch": 5.128540305010893, "grad_norm": 1.6586329936981201, "learning_rate": 4.87363834422658e-07, "loss": 3.798032760620117, "step": 2354 }, { "epoch": 5.130718954248366, "grad_norm": 1.834896445274353, "learning_rate": 4.871459694989107e-07, "loss": 3.784261465072632, "step": 2355 }, { "epoch": 5.132897603485839, "grad_norm": 2.1311023235321045, "learning_rate": 4.869281045751634e-07, "loss": 4.060948371887207, "step": 2356 }, { "epoch": 5.135076252723311, "grad_norm": 1.9448117017745972, "learning_rate": 4.867102396514161e-07, "loss": 3.9887118339538574, "step": 2357 }, { "epoch": 5.137254901960785, "grad_norm": 1.9374611377716064, "learning_rate": 4.864923747276688e-07, "loss": 3.7639803886413574, "step": 2358 }, { "epoch": 5.139433551198257, "grad_norm": 2.1011838912963867, "learning_rate": 4.862745098039216e-07, "loss": 3.952131509780884, "step": 2359 }, { "epoch": 5.14161220043573, "grad_norm": 2.850036144256592, "learning_rate": 4.860566448801743e-07, "loss": 4.110136985778809, "step": 2360 }, { "epoch": 5.143790849673203, "grad_norm": 1.9732683897018433, "learning_rate": 4.85838779956427e-07, "loss": 3.828014612197876, "step": 2361 }, { "epoch": 5.1459694989106755, "grad_norm": 2.245994806289673, "learning_rate": 4.856209150326797e-07, "loss": 3.899472951889038, "step": 2362 }, { "epoch": 5.148148148148148, "grad_norm": 1.864344596862793, "learning_rate": 4.854030501089325e-07, "loss": 3.992853879928589, "step": 2363 }, { "epoch": 5.150326797385621, "grad_norm": 2.334409236907959, "learning_rate": 4.851851851851852e-07, "loss": 3.981642961502075, "step": 2364 }, { "epoch": 5.152505446623094, "grad_norm": 2.3748230934143066, "learning_rate": 4.849673202614379e-07, "loss": 4.044300079345703, "step": 2365 }, { "epoch": 5.154684095860566, "grad_norm": 2.656719207763672, "learning_rate": 4.847494553376907e-07, "loss": 4.1518049240112305, "step": 2366 }, { "epoch": 5.1568627450980395, "grad_norm": 1.728630781173706, "learning_rate": 4.845315904139433e-07, "loss": 3.9062023162841797, "step": 2367 }, { "epoch": 5.159041394335512, "grad_norm": 1.884379506111145, "learning_rate": 4.843137254901961e-07, "loss": 3.8802452087402344, "step": 2368 }, { "epoch": 5.1612200435729845, "grad_norm": 1.89178466796875, "learning_rate": 4.840958605664488e-07, "loss": 3.9224658012390137, "step": 2369 }, { "epoch": 5.163398692810458, "grad_norm": 2.7705209255218506, "learning_rate": 4.838779956427014e-07, "loss": 3.9100828170776367, "step": 2370 }, { "epoch": 5.16557734204793, "grad_norm": 3.166144847869873, "learning_rate": 4.836601307189542e-07, "loss": 3.8246421813964844, "step": 2371 }, { "epoch": 5.167755991285403, "grad_norm": 2.314420700073242, "learning_rate": 4.834422657952069e-07, "loss": 3.973367214202881, "step": 2372 }, { "epoch": 5.169934640522876, "grad_norm": 2.2258713245391846, "learning_rate": 4.832244008714597e-07, "loss": 4.081875324249268, "step": 2373 }, { "epoch": 5.172113289760349, "grad_norm": 2.525428056716919, "learning_rate": 4.830065359477124e-07, "loss": 4.02772331237793, "step": 2374 }, { "epoch": 5.174291938997821, "grad_norm": 2.6258299350738525, "learning_rate": 4.827886710239651e-07, "loss": 4.000945091247559, "step": 2375 }, { "epoch": 5.176470588235294, "grad_norm": 1.9537543058395386, "learning_rate": 4.825708061002178e-07, "loss": 3.8978803157806396, "step": 2376 }, { "epoch": 5.178649237472767, "grad_norm": 1.9463226795196533, "learning_rate": 4.823529411764705e-07, "loss": 3.915982723236084, "step": 2377 }, { "epoch": 5.180827886710239, "grad_norm": 2.036850929260254, "learning_rate": 4.821350762527233e-07, "loss": 3.9575304985046387, "step": 2378 }, { "epoch": 5.183006535947713, "grad_norm": 2.14572811126709, "learning_rate": 4.81917211328976e-07, "loss": 3.98181414604187, "step": 2379 }, { "epoch": 5.185185185185185, "grad_norm": 2.2011964321136475, "learning_rate": 4.816993464052288e-07, "loss": 3.9643828868865967, "step": 2380 }, { "epoch": 5.187363834422658, "grad_norm": 2.095386266708374, "learning_rate": 4.814814814814814e-07, "loss": 3.7842111587524414, "step": 2381 }, { "epoch": 5.189542483660131, "grad_norm": 2.2015764713287354, "learning_rate": 4.812636165577341e-07, "loss": 3.9063446521759033, "step": 2382 }, { "epoch": 5.191721132897603, "grad_norm": 2.601858139038086, "learning_rate": 4.810457516339869e-07, "loss": 4.061300754547119, "step": 2383 }, { "epoch": 5.193899782135076, "grad_norm": 2.0003390312194824, "learning_rate": 4.808278867102396e-07, "loss": 3.8676564693450928, "step": 2384 }, { "epoch": 5.196078431372549, "grad_norm": 3.0686593055725098, "learning_rate": 4.806100217864924e-07, "loss": 4.1023640632629395, "step": 2385 }, { "epoch": 5.198257080610022, "grad_norm": 2.6862008571624756, "learning_rate": 4.803921568627451e-07, "loss": 4.004210948944092, "step": 2386 }, { "epoch": 5.200435729847494, "grad_norm": 2.3542697429656982, "learning_rate": 4.801742919389977e-07, "loss": 3.856398820877075, "step": 2387 }, { "epoch": 5.2026143790849675, "grad_norm": 2.189741611480713, "learning_rate": 4.799564270152505e-07, "loss": 3.9689502716064453, "step": 2388 }, { "epoch": 5.20479302832244, "grad_norm": 1.946071982383728, "learning_rate": 4.797385620915032e-07, "loss": 3.9323902130126953, "step": 2389 }, { "epoch": 5.206971677559913, "grad_norm": 1.936318039894104, "learning_rate": 4.79520697167756e-07, "loss": 3.7514476776123047, "step": 2390 }, { "epoch": 5.209150326797386, "grad_norm": 2.3906867504119873, "learning_rate": 4.793028322440087e-07, "loss": 3.890101194381714, "step": 2391 }, { "epoch": 5.211328976034858, "grad_norm": 1.8700975179672241, "learning_rate": 4.790849673202615e-07, "loss": 3.854606866836548, "step": 2392 }, { "epoch": 5.213507625272332, "grad_norm": 2.0464529991149902, "learning_rate": 4.788671023965141e-07, "loss": 3.9141108989715576, "step": 2393 }, { "epoch": 5.215686274509804, "grad_norm": 1.8650436401367188, "learning_rate": 4.786492374727668e-07, "loss": 3.991891384124756, "step": 2394 }, { "epoch": 5.2178649237472765, "grad_norm": 1.7712714672088623, "learning_rate": 4.784313725490196e-07, "loss": 3.912626266479492, "step": 2395 }, { "epoch": 5.22004357298475, "grad_norm": 2.601485252380371, "learning_rate": 4.782135076252723e-07, "loss": 4.000618934631348, "step": 2396 }, { "epoch": 5.222222222222222, "grad_norm": 1.9651075601577759, "learning_rate": 4.779956427015251e-07, "loss": 3.851883888244629, "step": 2397 }, { "epoch": 5.224400871459695, "grad_norm": 2.1064107418060303, "learning_rate": 4.777777777777778e-07, "loss": 4.1514811515808105, "step": 2398 }, { "epoch": 5.226579520697168, "grad_norm": 2.581458330154419, "learning_rate": 4.775599128540304e-07, "loss": 3.9684572219848633, "step": 2399 }, { "epoch": 5.228758169934641, "grad_norm": 2.417166233062744, "learning_rate": 4.773420479302832e-07, "loss": 3.9344799518585205, "step": 2400 }, { "epoch": 5.230936819172113, "grad_norm": 2.495058059692383, "learning_rate": 4.771241830065359e-07, "loss": 3.953261613845825, "step": 2401 }, { "epoch": 5.233115468409586, "grad_norm": 2.5622100830078125, "learning_rate": 4.769063180827887e-07, "loss": 3.9742703437805176, "step": 2402 }, { "epoch": 5.235294117647059, "grad_norm": 2.810973644256592, "learning_rate": 4.7668845315904136e-07, "loss": 4.064958572387695, "step": 2403 }, { "epoch": 5.237472766884531, "grad_norm": 2.4255316257476807, "learning_rate": 4.7647058823529405e-07, "loss": 4.084212779998779, "step": 2404 }, { "epoch": 5.239651416122005, "grad_norm": 1.8577384948730469, "learning_rate": 4.762527233115468e-07, "loss": 3.806368350982666, "step": 2405 }, { "epoch": 5.241830065359477, "grad_norm": 2.611760377883911, "learning_rate": 4.7603485838779953e-07, "loss": 3.9213972091674805, "step": 2406 }, { "epoch": 5.24400871459695, "grad_norm": 2.0986385345458984, "learning_rate": 4.758169934640522e-07, "loss": 4.016166687011719, "step": 2407 }, { "epoch": 5.246187363834423, "grad_norm": 2.2517600059509277, "learning_rate": 4.7559912854030496e-07, "loss": 3.8493921756744385, "step": 2408 }, { "epoch": 5.248366013071895, "grad_norm": 2.178561210632324, "learning_rate": 4.753812636165577e-07, "loss": 4.042523384094238, "step": 2409 }, { "epoch": 5.250544662309368, "grad_norm": 2.2715916633605957, "learning_rate": 4.751633986928104e-07, "loss": 3.8192691802978516, "step": 2410 }, { "epoch": 5.252723311546841, "grad_norm": 2.5293288230895996, "learning_rate": 4.7494553376906314e-07, "loss": 3.9288721084594727, "step": 2411 }, { "epoch": 5.254901960784314, "grad_norm": 2.1591267585754395, "learning_rate": 4.747276688453159e-07, "loss": 3.9597666263580322, "step": 2412 }, { "epoch": 5.257080610021786, "grad_norm": 2.567697525024414, "learning_rate": 4.7450980392156857e-07, "loss": 4.0053391456604, "step": 2413 }, { "epoch": 5.2592592592592595, "grad_norm": 1.7528166770935059, "learning_rate": 4.742919389978213e-07, "loss": 4.065176963806152, "step": 2414 }, { "epoch": 5.261437908496732, "grad_norm": 2.218909740447998, "learning_rate": 4.7407407407407405e-07, "loss": 3.8978190422058105, "step": 2415 }, { "epoch": 5.2636165577342044, "grad_norm": 2.1562907695770264, "learning_rate": 4.7385620915032674e-07, "loss": 3.892906427383423, "step": 2416 }, { "epoch": 5.265795206971678, "grad_norm": 1.8732043504714966, "learning_rate": 4.736383442265795e-07, "loss": 3.803743362426758, "step": 2417 }, { "epoch": 5.26797385620915, "grad_norm": 2.331484317779541, "learning_rate": 4.7342047930283223e-07, "loss": 3.9885964393615723, "step": 2418 }, { "epoch": 5.270152505446623, "grad_norm": 3.091379165649414, "learning_rate": 4.732026143790849e-07, "loss": 4.153085231781006, "step": 2419 }, { "epoch": 5.272331154684096, "grad_norm": 2.4549505710601807, "learning_rate": 4.7298474945533766e-07, "loss": 3.8778462409973145, "step": 2420 }, { "epoch": 5.2745098039215685, "grad_norm": 1.9165215492248535, "learning_rate": 4.727668845315904e-07, "loss": 4.000586032867432, "step": 2421 }, { "epoch": 5.276688453159041, "grad_norm": 2.101205348968506, "learning_rate": 4.725490196078431e-07, "loss": 3.841461658477783, "step": 2422 }, { "epoch": 5.278867102396514, "grad_norm": 1.5109742879867554, "learning_rate": 4.7233115468409584e-07, "loss": 3.9196317195892334, "step": 2423 }, { "epoch": 5.281045751633987, "grad_norm": 1.663589596748352, "learning_rate": 4.721132897603486e-07, "loss": 3.7496421337127686, "step": 2424 }, { "epoch": 5.283224400871459, "grad_norm": 2.220424175262451, "learning_rate": 4.7189542483660127e-07, "loss": 3.9335176944732666, "step": 2425 }, { "epoch": 5.285403050108933, "grad_norm": 2.0578386783599854, "learning_rate": 4.71677559912854e-07, "loss": 3.957624912261963, "step": 2426 }, { "epoch": 5.287581699346405, "grad_norm": 2.1653976440429688, "learning_rate": 4.7145969498910675e-07, "loss": 3.9926490783691406, "step": 2427 }, { "epoch": 5.289760348583878, "grad_norm": 1.8635482788085938, "learning_rate": 4.7124183006535944e-07, "loss": 3.8598217964172363, "step": 2428 }, { "epoch": 5.291938997821351, "grad_norm": 1.7865204811096191, "learning_rate": 4.710239651416122e-07, "loss": 3.846215009689331, "step": 2429 }, { "epoch": 5.294117647058823, "grad_norm": 2.3136632442474365, "learning_rate": 4.708061002178649e-07, "loss": 3.9363582134246826, "step": 2430 }, { "epoch": 5.296296296296296, "grad_norm": 2.1257917881011963, "learning_rate": 4.705882352941176e-07, "loss": 3.946455478668213, "step": 2431 }, { "epoch": 5.298474945533769, "grad_norm": 2.5667333602905273, "learning_rate": 4.7037037037037036e-07, "loss": 4.099484443664551, "step": 2432 }, { "epoch": 5.300653594771242, "grad_norm": 2.9974117279052734, "learning_rate": 4.7015250544662305e-07, "loss": 3.9803435802459717, "step": 2433 }, { "epoch": 5.302832244008715, "grad_norm": 3.0975303649902344, "learning_rate": 4.699346405228758e-07, "loss": 4.098940372467041, "step": 2434 }, { "epoch": 5.3050108932461875, "grad_norm": 2.1802942752838135, "learning_rate": 4.6971677559912853e-07, "loss": 3.9478683471679688, "step": 2435 }, { "epoch": 5.30718954248366, "grad_norm": 3.0469155311584473, "learning_rate": 4.694989106753812e-07, "loss": 4.097731113433838, "step": 2436 }, { "epoch": 5.309368191721133, "grad_norm": 2.504721164703369, "learning_rate": 4.6928104575163397e-07, "loss": 4.018413543701172, "step": 2437 }, { "epoch": 5.311546840958606, "grad_norm": 2.264599084854126, "learning_rate": 4.690631808278867e-07, "loss": 3.91333270072937, "step": 2438 }, { "epoch": 5.313725490196078, "grad_norm": 2.7780098915100098, "learning_rate": 4.688453159041394e-07, "loss": 4.063242435455322, "step": 2439 }, { "epoch": 5.315904139433552, "grad_norm": 3.8366637229919434, "learning_rate": 4.6862745098039214e-07, "loss": 4.379274368286133, "step": 2440 }, { "epoch": 5.318082788671024, "grad_norm": 2.004459857940674, "learning_rate": 4.684095860566449e-07, "loss": 3.948051929473877, "step": 2441 }, { "epoch": 5.3202614379084965, "grad_norm": 1.8589330911636353, "learning_rate": 4.6819172113289757e-07, "loss": 3.7770817279815674, "step": 2442 }, { "epoch": 5.32244008714597, "grad_norm": 2.3490312099456787, "learning_rate": 4.679738562091503e-07, "loss": 3.9436497688293457, "step": 2443 }, { "epoch": 5.324618736383442, "grad_norm": 2.707724094390869, "learning_rate": 4.6775599128540306e-07, "loss": 4.069700241088867, "step": 2444 }, { "epoch": 5.326797385620915, "grad_norm": 2.473013162612915, "learning_rate": 4.6753812636165575e-07, "loss": 4.06207275390625, "step": 2445 }, { "epoch": 5.328976034858388, "grad_norm": 2.101341962814331, "learning_rate": 4.673202614379085e-07, "loss": 3.999563455581665, "step": 2446 }, { "epoch": 5.331154684095861, "grad_norm": 2.1854166984558105, "learning_rate": 4.6710239651416123e-07, "loss": 4.1113409996032715, "step": 2447 }, { "epoch": 5.333333333333333, "grad_norm": 2.1924216747283936, "learning_rate": 4.668845315904139e-07, "loss": 3.9084463119506836, "step": 2448 }, { "epoch": 5.335511982570806, "grad_norm": 2.2967429161071777, "learning_rate": 4.6666666666666666e-07, "loss": 3.9576754570007324, "step": 2449 }, { "epoch": 5.337690631808279, "grad_norm": 2.084730386734009, "learning_rate": 4.664488017429194e-07, "loss": 4.036622047424316, "step": 2450 }, { "epoch": 5.339869281045751, "grad_norm": 1.9819799661636353, "learning_rate": 4.662309368191721e-07, "loss": 3.8663933277130127, "step": 2451 }, { "epoch": 5.342047930283225, "grad_norm": 1.9312454462051392, "learning_rate": 4.6601307189542484e-07, "loss": 3.9290995597839355, "step": 2452 }, { "epoch": 5.344226579520697, "grad_norm": 3.1120805740356445, "learning_rate": 4.6579520697167753e-07, "loss": 4.126702785491943, "step": 2453 }, { "epoch": 5.34640522875817, "grad_norm": 3.1190249919891357, "learning_rate": 4.6557734204793027e-07, "loss": 4.039900779724121, "step": 2454 }, { "epoch": 5.348583877995643, "grad_norm": 2.190804958343506, "learning_rate": 4.65359477124183e-07, "loss": 4.053610324859619, "step": 2455 }, { "epoch": 5.350762527233115, "grad_norm": 2.2910890579223633, "learning_rate": 4.651416122004357e-07, "loss": 4.070886135101318, "step": 2456 }, { "epoch": 5.352941176470588, "grad_norm": 1.7483876943588257, "learning_rate": 4.6492374727668844e-07, "loss": 3.8147990703582764, "step": 2457 }, { "epoch": 5.355119825708061, "grad_norm": 1.8002982139587402, "learning_rate": 4.647058823529412e-07, "loss": 3.8705127239227295, "step": 2458 }, { "epoch": 5.357298474945534, "grad_norm": 1.9350395202636719, "learning_rate": 4.644880174291939e-07, "loss": 3.8381216526031494, "step": 2459 }, { "epoch": 5.359477124183006, "grad_norm": 2.4765419960021973, "learning_rate": 4.642701525054466e-07, "loss": 3.8964948654174805, "step": 2460 }, { "epoch": 5.3616557734204795, "grad_norm": 2.8779125213623047, "learning_rate": 4.6405228758169936e-07, "loss": 4.192872047424316, "step": 2461 }, { "epoch": 5.363834422657952, "grad_norm": 2.3419430255889893, "learning_rate": 4.6383442265795205e-07, "loss": 4.132833003997803, "step": 2462 }, { "epoch": 5.366013071895424, "grad_norm": 2.1371819972991943, "learning_rate": 4.636165577342048e-07, "loss": 3.678009033203125, "step": 2463 }, { "epoch": 5.368191721132898, "grad_norm": 2.179095983505249, "learning_rate": 4.6339869281045754e-07, "loss": 4.061136722564697, "step": 2464 }, { "epoch": 5.37037037037037, "grad_norm": 2.6282296180725098, "learning_rate": 4.631808278867102e-07, "loss": 3.788999319076538, "step": 2465 }, { "epoch": 5.372549019607844, "grad_norm": 2.3086435794830322, "learning_rate": 4.6296296296296297e-07, "loss": 3.649672269821167, "step": 2466 }, { "epoch": 5.374727668845316, "grad_norm": 1.9422701597213745, "learning_rate": 4.627450980392157e-07, "loss": 3.8450353145599365, "step": 2467 }, { "epoch": 5.3769063180827885, "grad_norm": 2.594503164291382, "learning_rate": 4.625272331154684e-07, "loss": 3.79838228225708, "step": 2468 }, { "epoch": 5.379084967320262, "grad_norm": 2.021343946456909, "learning_rate": 4.6230936819172114e-07, "loss": 3.7903645038604736, "step": 2469 }, { "epoch": 5.381263616557734, "grad_norm": 2.3342370986938477, "learning_rate": 4.620915032679739e-07, "loss": 3.853668689727783, "step": 2470 }, { "epoch": 5.383442265795207, "grad_norm": 2.1967742443084717, "learning_rate": 4.618736383442265e-07, "loss": 3.9578707218170166, "step": 2471 }, { "epoch": 5.38562091503268, "grad_norm": 2.3123044967651367, "learning_rate": 4.6165577342047926e-07, "loss": 3.772067070007324, "step": 2472 }, { "epoch": 5.387799564270153, "grad_norm": 1.9820752143859863, "learning_rate": 4.61437908496732e-07, "loss": 3.93729829788208, "step": 2473 }, { "epoch": 5.389978213507625, "grad_norm": 2.273306369781494, "learning_rate": 4.612200435729847e-07, "loss": 3.843672037124634, "step": 2474 }, { "epoch": 5.392156862745098, "grad_norm": 2.305940628051758, "learning_rate": 4.6100217864923744e-07, "loss": 3.9318478107452393, "step": 2475 }, { "epoch": 5.394335511982571, "grad_norm": 1.997401475906372, "learning_rate": 4.6078431372549013e-07, "loss": 3.816626787185669, "step": 2476 }, { "epoch": 5.396514161220043, "grad_norm": 3.446178674697876, "learning_rate": 4.6056644880174287e-07, "loss": 4.108546257019043, "step": 2477 }, { "epoch": 5.398692810457517, "grad_norm": 2.0687096118927, "learning_rate": 4.603485838779956e-07, "loss": 3.8672733306884766, "step": 2478 }, { "epoch": 5.400871459694989, "grad_norm": 2.4340758323669434, "learning_rate": 4.601307189542483e-07, "loss": 3.957709550857544, "step": 2479 }, { "epoch": 5.403050108932462, "grad_norm": 2.554154634475708, "learning_rate": 4.5991285403050104e-07, "loss": 3.8899636268615723, "step": 2480 }, { "epoch": 5.405228758169935, "grad_norm": 2.621213912963867, "learning_rate": 4.596949891067538e-07, "loss": 3.922717332839966, "step": 2481 }, { "epoch": 5.407407407407407, "grad_norm": 2.1316442489624023, "learning_rate": 4.594771241830065e-07, "loss": 3.853131055831909, "step": 2482 }, { "epoch": 5.40958605664488, "grad_norm": 3.7525978088378906, "learning_rate": 4.592592592592592e-07, "loss": 4.162806987762451, "step": 2483 }, { "epoch": 5.411764705882353, "grad_norm": 2.061415910720825, "learning_rate": 4.5904139433551196e-07, "loss": 3.882089853286743, "step": 2484 }, { "epoch": 5.413943355119826, "grad_norm": 2.231280565261841, "learning_rate": 4.5882352941176465e-07, "loss": 4.026830673217773, "step": 2485 }, { "epoch": 5.416122004357298, "grad_norm": 2.1792173385620117, "learning_rate": 4.586056644880174e-07, "loss": 3.928882360458374, "step": 2486 }, { "epoch": 5.4183006535947715, "grad_norm": 1.8871893882751465, "learning_rate": 4.5838779956427014e-07, "loss": 3.8956761360168457, "step": 2487 }, { "epoch": 5.420479302832244, "grad_norm": 2.860650062561035, "learning_rate": 4.581699346405228e-07, "loss": 4.072261810302734, "step": 2488 }, { "epoch": 5.4226579520697165, "grad_norm": 1.926411509513855, "learning_rate": 4.5795206971677557e-07, "loss": 3.8375442028045654, "step": 2489 }, { "epoch": 5.42483660130719, "grad_norm": 2.331683874130249, "learning_rate": 4.577342047930283e-07, "loss": 4.0043182373046875, "step": 2490 }, { "epoch": 5.427015250544662, "grad_norm": 2.3865344524383545, "learning_rate": 4.57516339869281e-07, "loss": 4.046067237854004, "step": 2491 }, { "epoch": 5.429193899782135, "grad_norm": 2.148359775543213, "learning_rate": 4.5729847494553374e-07, "loss": 3.8591275215148926, "step": 2492 }, { "epoch": 5.431372549019608, "grad_norm": 1.9904264211654663, "learning_rate": 4.570806100217865e-07, "loss": 4.033174514770508, "step": 2493 }, { "epoch": 5.4335511982570806, "grad_norm": 2.2317848205566406, "learning_rate": 4.568627450980392e-07, "loss": 4.020777702331543, "step": 2494 }, { "epoch": 5.435729847494553, "grad_norm": 2.992077589035034, "learning_rate": 4.566448801742919e-07, "loss": 3.974294900894165, "step": 2495 }, { "epoch": 5.437908496732026, "grad_norm": 2.0023510456085205, "learning_rate": 4.5642701525054466e-07, "loss": 3.9681310653686523, "step": 2496 }, { "epoch": 5.440087145969499, "grad_norm": 2.313621997833252, "learning_rate": 4.5620915032679735e-07, "loss": 4.03427791595459, "step": 2497 }, { "epoch": 5.442265795206971, "grad_norm": 1.8024457693099976, "learning_rate": 4.559912854030501e-07, "loss": 3.953126907348633, "step": 2498 }, { "epoch": 5.444444444444445, "grad_norm": 2.203613758087158, "learning_rate": 4.5577342047930283e-07, "loss": 3.9872448444366455, "step": 2499 }, { "epoch": 5.446623093681917, "grad_norm": 3.4676554203033447, "learning_rate": 4.555555555555555e-07, "loss": 4.067174434661865, "step": 2500 }, { "epoch": 5.44880174291939, "grad_norm": 3.094250202178955, "learning_rate": 4.5533769063180827e-07, "loss": 4.1653900146484375, "step": 2501 }, { "epoch": 5.450980392156863, "grad_norm": 2.3093550205230713, "learning_rate": 4.5511982570806096e-07, "loss": 3.884216070175171, "step": 2502 }, { "epoch": 5.453159041394335, "grad_norm": 2.4796500205993652, "learning_rate": 4.549019607843137e-07, "loss": 4.061441898345947, "step": 2503 }, { "epoch": 5.455337690631808, "grad_norm": 2.1497154235839844, "learning_rate": 4.5468409586056644e-07, "loss": 3.821767807006836, "step": 2504 }, { "epoch": 5.457516339869281, "grad_norm": 1.6950663328170776, "learning_rate": 4.5446623093681913e-07, "loss": 3.8434295654296875, "step": 2505 }, { "epoch": 5.459694989106754, "grad_norm": 2.7823219299316406, "learning_rate": 4.5424836601307187e-07, "loss": 3.8627641201019287, "step": 2506 }, { "epoch": 5.461873638344226, "grad_norm": 2.8229947090148926, "learning_rate": 4.540305010893246e-07, "loss": 3.941807985305786, "step": 2507 }, { "epoch": 5.4640522875816995, "grad_norm": 1.7840098142623901, "learning_rate": 4.538126361655773e-07, "loss": 3.863316535949707, "step": 2508 }, { "epoch": 5.466230936819172, "grad_norm": 2.063682794570923, "learning_rate": 4.5359477124183005e-07, "loss": 3.8734934329986572, "step": 2509 }, { "epoch": 5.468409586056645, "grad_norm": 3.268489122390747, "learning_rate": 4.533769063180828e-07, "loss": 4.135243892669678, "step": 2510 }, { "epoch": 5.470588235294118, "grad_norm": 1.8094425201416016, "learning_rate": 4.531590413943355e-07, "loss": 3.9390926361083984, "step": 2511 }, { "epoch": 5.47276688453159, "grad_norm": 2.2907915115356445, "learning_rate": 4.529411764705882e-07, "loss": 3.8286190032958984, "step": 2512 }, { "epoch": 5.474945533769064, "grad_norm": 1.8624000549316406, "learning_rate": 4.5272331154684096e-07, "loss": 3.9127447605133057, "step": 2513 }, { "epoch": 5.477124183006536, "grad_norm": 2.4374709129333496, "learning_rate": 4.5250544662309365e-07, "loss": 4.008270740509033, "step": 2514 }, { "epoch": 5.4793028322440085, "grad_norm": 1.9677951335906982, "learning_rate": 4.522875816993464e-07, "loss": 3.8605897426605225, "step": 2515 }, { "epoch": 5.481481481481482, "grad_norm": 2.570993185043335, "learning_rate": 4.5206971677559914e-07, "loss": 3.9601223468780518, "step": 2516 }, { "epoch": 5.483660130718954, "grad_norm": 2.251038074493408, "learning_rate": 4.5185185185185183e-07, "loss": 3.9222683906555176, "step": 2517 }, { "epoch": 5.485838779956427, "grad_norm": 1.9417698383331299, "learning_rate": 4.5163398692810457e-07, "loss": 3.9413604736328125, "step": 2518 }, { "epoch": 5.4880174291939, "grad_norm": 4.287627220153809, "learning_rate": 4.514161220043573e-07, "loss": 4.0772929191589355, "step": 2519 }, { "epoch": 5.490196078431373, "grad_norm": 2.260634660720825, "learning_rate": 4.5119825708061e-07, "loss": 3.947706699371338, "step": 2520 }, { "epoch": 5.492374727668845, "grad_norm": 2.156792163848877, "learning_rate": 4.5098039215686274e-07, "loss": 3.994680166244507, "step": 2521 }, { "epoch": 5.494553376906318, "grad_norm": 2.5263938903808594, "learning_rate": 4.507625272331155e-07, "loss": 4.013535022735596, "step": 2522 }, { "epoch": 5.496732026143791, "grad_norm": 2.0405795574188232, "learning_rate": 4.505446623093682e-07, "loss": 3.9179396629333496, "step": 2523 }, { "epoch": 5.498910675381263, "grad_norm": 2.3446130752563477, "learning_rate": 4.503267973856209e-07, "loss": 3.831585168838501, "step": 2524 }, { "epoch": 5.501089324618737, "grad_norm": 2.767990827560425, "learning_rate": 4.501089324618736e-07, "loss": 4.091350078582764, "step": 2525 }, { "epoch": 5.503267973856209, "grad_norm": 2.4516661167144775, "learning_rate": 4.4989106753812635e-07, "loss": 4.05528450012207, "step": 2526 }, { "epoch": 5.505446623093682, "grad_norm": 2.1108052730560303, "learning_rate": 4.496732026143791e-07, "loss": 3.851036310195923, "step": 2527 }, { "epoch": 5.507625272331155, "grad_norm": 2.1539859771728516, "learning_rate": 4.494553376906318e-07, "loss": 3.936465263366699, "step": 2528 }, { "epoch": 5.509803921568627, "grad_norm": 2.1283352375030518, "learning_rate": 4.492374727668845e-07, "loss": 3.9901907444000244, "step": 2529 }, { "epoch": 5.5119825708061, "grad_norm": 1.8537161350250244, "learning_rate": 4.4901960784313727e-07, "loss": 3.8655552864074707, "step": 2530 }, { "epoch": 5.514161220043573, "grad_norm": 2.0296688079833984, "learning_rate": 4.4880174291938996e-07, "loss": 3.858367681503296, "step": 2531 }, { "epoch": 5.516339869281046, "grad_norm": 1.8372836112976074, "learning_rate": 4.485838779956427e-07, "loss": 3.808724880218506, "step": 2532 }, { "epoch": 5.518518518518518, "grad_norm": 1.663228988647461, "learning_rate": 4.4836601307189544e-07, "loss": 3.928398847579956, "step": 2533 }, { "epoch": 5.5206971677559915, "grad_norm": 1.8787132501602173, "learning_rate": 4.4814814814814813e-07, "loss": 3.9674715995788574, "step": 2534 }, { "epoch": 5.522875816993464, "grad_norm": 2.4739603996276855, "learning_rate": 4.479302832244009e-07, "loss": 3.925565719604492, "step": 2535 }, { "epoch": 5.525054466230936, "grad_norm": 2.796689748764038, "learning_rate": 4.477124183006536e-07, "loss": 3.958970069885254, "step": 2536 }, { "epoch": 5.52723311546841, "grad_norm": 2.400846481323242, "learning_rate": 4.474945533769063e-07, "loss": 3.905412435531616, "step": 2537 }, { "epoch": 5.529411764705882, "grad_norm": 3.153301954269409, "learning_rate": 4.4727668845315905e-07, "loss": 4.081331729888916, "step": 2538 }, { "epoch": 5.531590413943356, "grad_norm": 1.788343071937561, "learning_rate": 4.470588235294118e-07, "loss": 3.8238697052001953, "step": 2539 }, { "epoch": 5.533769063180828, "grad_norm": 2.061161756515503, "learning_rate": 4.4684095860566443e-07, "loss": 3.864631175994873, "step": 2540 }, { "epoch": 5.5359477124183005, "grad_norm": 2.3504536151885986, "learning_rate": 4.4662309368191717e-07, "loss": 3.8631200790405273, "step": 2541 }, { "epoch": 5.538126361655774, "grad_norm": 1.788260817527771, "learning_rate": 4.464052287581699e-07, "loss": 3.9058423042297363, "step": 2542 }, { "epoch": 5.540305010893246, "grad_norm": 2.017916679382324, "learning_rate": 4.461873638344226e-07, "loss": 3.958949565887451, "step": 2543 }, { "epoch": 5.542483660130719, "grad_norm": 2.7751238346099854, "learning_rate": 4.4596949891067534e-07, "loss": 3.8816072940826416, "step": 2544 }, { "epoch": 5.544662309368192, "grad_norm": 2.6087846755981445, "learning_rate": 4.457516339869281e-07, "loss": 3.974163293838501, "step": 2545 }, { "epoch": 5.546840958605665, "grad_norm": 2.4522829055786133, "learning_rate": 4.455337690631808e-07, "loss": 4.010571002960205, "step": 2546 }, { "epoch": 5.549019607843137, "grad_norm": 2.5334928035736084, "learning_rate": 4.453159041394335e-07, "loss": 4.052031517028809, "step": 2547 }, { "epoch": 5.55119825708061, "grad_norm": 2.043684720993042, "learning_rate": 4.450980392156862e-07, "loss": 3.9085757732391357, "step": 2548 }, { "epoch": 5.553376906318083, "grad_norm": 2.1377625465393066, "learning_rate": 4.4488017429193895e-07, "loss": 3.912374973297119, "step": 2549 }, { "epoch": 5.555555555555555, "grad_norm": 2.4885663986206055, "learning_rate": 4.446623093681917e-07, "loss": 3.9292349815368652, "step": 2550 }, { "epoch": 5.557734204793029, "grad_norm": 3.108199119567871, "learning_rate": 4.444444444444444e-07, "loss": 4.070436954498291, "step": 2551 }, { "epoch": 5.559912854030501, "grad_norm": 1.8879905939102173, "learning_rate": 4.442265795206971e-07, "loss": 3.8762552738189697, "step": 2552 }, { "epoch": 5.562091503267974, "grad_norm": 2.1787776947021484, "learning_rate": 4.4400871459694987e-07, "loss": 3.834547281265259, "step": 2553 }, { "epoch": 5.564270152505447, "grad_norm": 2.4146459102630615, "learning_rate": 4.4379084967320256e-07, "loss": 3.857961654663086, "step": 2554 }, { "epoch": 5.5664488017429194, "grad_norm": 3.0016376972198486, "learning_rate": 4.435729847494553e-07, "loss": 4.104483604431152, "step": 2555 }, { "epoch": 5.568627450980392, "grad_norm": 2.542121410369873, "learning_rate": 4.4335511982570804e-07, "loss": 3.888007879257202, "step": 2556 }, { "epoch": 5.570806100217865, "grad_norm": 2.2323555946350098, "learning_rate": 4.4313725490196073e-07, "loss": 4.14126443862915, "step": 2557 }, { "epoch": 5.572984749455338, "grad_norm": 2.0520131587982178, "learning_rate": 4.429193899782135e-07, "loss": 3.975024938583374, "step": 2558 }, { "epoch": 5.57516339869281, "grad_norm": 2.3716938495635986, "learning_rate": 4.427015250544662e-07, "loss": 3.9431204795837402, "step": 2559 }, { "epoch": 5.5773420479302835, "grad_norm": 2.4466121196746826, "learning_rate": 4.424836601307189e-07, "loss": 4.02531623840332, "step": 2560 }, { "epoch": 5.579520697167756, "grad_norm": 1.8248095512390137, "learning_rate": 4.4226579520697165e-07, "loss": 3.8594741821289062, "step": 2561 }, { "epoch": 5.5816993464052285, "grad_norm": 2.5598723888397217, "learning_rate": 4.420479302832244e-07, "loss": 3.9429802894592285, "step": 2562 }, { "epoch": 5.583877995642702, "grad_norm": 1.922767162322998, "learning_rate": 4.418300653594771e-07, "loss": 4.033390045166016, "step": 2563 }, { "epoch": 5.586056644880174, "grad_norm": 2.465663433074951, "learning_rate": 4.416122004357298e-07, "loss": 3.894766092300415, "step": 2564 }, { "epoch": 5.588235294117647, "grad_norm": 2.0405783653259277, "learning_rate": 4.4139433551198257e-07, "loss": 3.980952262878418, "step": 2565 }, { "epoch": 5.59041394335512, "grad_norm": 1.7539825439453125, "learning_rate": 4.4117647058823526e-07, "loss": 3.9248275756835938, "step": 2566 }, { "epoch": 5.592592592592593, "grad_norm": 1.6993571519851685, "learning_rate": 4.40958605664488e-07, "loss": 3.8596982955932617, "step": 2567 }, { "epoch": 5.594771241830065, "grad_norm": 2.289288282394409, "learning_rate": 4.4074074074074074e-07, "loss": 3.904547929763794, "step": 2568 }, { "epoch": 5.596949891067538, "grad_norm": 2.2740111351013184, "learning_rate": 4.4052287581699343e-07, "loss": 4.003801345825195, "step": 2569 }, { "epoch": 5.599128540305011, "grad_norm": 2.110121250152588, "learning_rate": 4.4030501089324617e-07, "loss": 3.9160680770874023, "step": 2570 }, { "epoch": 5.601307189542483, "grad_norm": 2.042811870574951, "learning_rate": 4.4008714596949886e-07, "loss": 3.8290023803710938, "step": 2571 }, { "epoch": 5.603485838779957, "grad_norm": 2.579333782196045, "learning_rate": 4.398692810457516e-07, "loss": 4.060407638549805, "step": 2572 }, { "epoch": 5.605664488017429, "grad_norm": 2.1584692001342773, "learning_rate": 4.3965141612200435e-07, "loss": 4.011291027069092, "step": 2573 }, { "epoch": 5.607843137254902, "grad_norm": 2.593604803085327, "learning_rate": 4.3943355119825704e-07, "loss": 3.814126491546631, "step": 2574 }, { "epoch": 5.610021786492375, "grad_norm": 2.3412721157073975, "learning_rate": 4.392156862745098e-07, "loss": 3.892456293106079, "step": 2575 }, { "epoch": 5.612200435729847, "grad_norm": 1.8879287242889404, "learning_rate": 4.389978213507625e-07, "loss": 3.8599982261657715, "step": 2576 }, { "epoch": 5.61437908496732, "grad_norm": 1.9327646493911743, "learning_rate": 4.387799564270152e-07, "loss": 3.8103511333465576, "step": 2577 }, { "epoch": 5.616557734204793, "grad_norm": 2.9916281700134277, "learning_rate": 4.3856209150326795e-07, "loss": 4.107762336730957, "step": 2578 }, { "epoch": 5.618736383442266, "grad_norm": 2.1788036823272705, "learning_rate": 4.383442265795207e-07, "loss": 3.866100788116455, "step": 2579 }, { "epoch": 5.620915032679738, "grad_norm": 3.3233306407928467, "learning_rate": 4.381263616557734e-07, "loss": 4.178204536437988, "step": 2580 }, { "epoch": 5.6230936819172115, "grad_norm": 2.037794589996338, "learning_rate": 4.3790849673202613e-07, "loss": 3.9620773792266846, "step": 2581 }, { "epoch": 5.625272331154684, "grad_norm": 2.4480597972869873, "learning_rate": 4.3769063180827887e-07, "loss": 4.010164737701416, "step": 2582 }, { "epoch": 5.627450980392156, "grad_norm": 2.2447574138641357, "learning_rate": 4.3747276688453156e-07, "loss": 3.9432878494262695, "step": 2583 }, { "epoch": 5.62962962962963, "grad_norm": 2.583470582962036, "learning_rate": 4.372549019607843e-07, "loss": 4.075560569763184, "step": 2584 }, { "epoch": 5.631808278867102, "grad_norm": 2.2131080627441406, "learning_rate": 4.3703703703703704e-07, "loss": 3.838273525238037, "step": 2585 }, { "epoch": 5.633986928104575, "grad_norm": 1.937259554862976, "learning_rate": 4.3681917211328973e-07, "loss": 3.765058755874634, "step": 2586 }, { "epoch": 5.636165577342048, "grad_norm": 1.8460949659347534, "learning_rate": 4.366013071895425e-07, "loss": 3.89005970954895, "step": 2587 }, { "epoch": 5.6383442265795205, "grad_norm": 2.1242222785949707, "learning_rate": 4.363834422657952e-07, "loss": 3.9160714149475098, "step": 2588 }, { "epoch": 5.640522875816993, "grad_norm": 1.7445588111877441, "learning_rate": 4.361655773420479e-07, "loss": 3.8209238052368164, "step": 2589 }, { "epoch": 5.642701525054466, "grad_norm": 3.3978216648101807, "learning_rate": 4.3594771241830065e-07, "loss": 4.001988887786865, "step": 2590 }, { "epoch": 5.644880174291939, "grad_norm": 1.930330753326416, "learning_rate": 4.357298474945534e-07, "loss": 3.8635802268981934, "step": 2591 }, { "epoch": 5.647058823529412, "grad_norm": 2.0543301105499268, "learning_rate": 4.355119825708061e-07, "loss": 3.8993570804595947, "step": 2592 }, { "epoch": 5.649237472766885, "grad_norm": 2.126906633377075, "learning_rate": 4.352941176470588e-07, "loss": 4.075771331787109, "step": 2593 }, { "epoch": 5.651416122004357, "grad_norm": 2.607572078704834, "learning_rate": 4.3507625272331157e-07, "loss": 4.021599769592285, "step": 2594 }, { "epoch": 5.65359477124183, "grad_norm": 2.4904773235321045, "learning_rate": 4.3485838779956426e-07, "loss": 3.938810348510742, "step": 2595 }, { "epoch": 5.655773420479303, "grad_norm": 2.0044853687286377, "learning_rate": 4.34640522875817e-07, "loss": 4.042023181915283, "step": 2596 }, { "epoch": 5.657952069716775, "grad_norm": 2.686253070831299, "learning_rate": 4.344226579520697e-07, "loss": 3.925539016723633, "step": 2597 }, { "epoch": 5.660130718954249, "grad_norm": 2.3651793003082275, "learning_rate": 4.3420479302832243e-07, "loss": 4.054675102233887, "step": 2598 }, { "epoch": 5.662309368191721, "grad_norm": 2.117581605911255, "learning_rate": 4.339869281045752e-07, "loss": 3.936934232711792, "step": 2599 }, { "epoch": 5.664488017429194, "grad_norm": 1.9870800971984863, "learning_rate": 4.3376906318082786e-07, "loss": 3.8733575344085693, "step": 2600 }, { "epoch": 5.666666666666667, "grad_norm": 2.2337589263916016, "learning_rate": 4.335511982570806e-07, "loss": 3.841040849685669, "step": 2601 }, { "epoch": 5.668845315904139, "grad_norm": 3.3782994747161865, "learning_rate": 4.3333333333333335e-07, "loss": 4.089313507080078, "step": 2602 }, { "epoch": 5.671023965141612, "grad_norm": 1.711124300956726, "learning_rate": 4.3311546840958604e-07, "loss": 3.844815731048584, "step": 2603 }, { "epoch": 5.673202614379085, "grad_norm": 1.6081990003585815, "learning_rate": 4.328976034858388e-07, "loss": 3.8780786991119385, "step": 2604 }, { "epoch": 5.675381263616558, "grad_norm": 2.0879838466644287, "learning_rate": 4.326797385620915e-07, "loss": 3.930532217025757, "step": 2605 }, { "epoch": 5.67755991285403, "grad_norm": 1.7950036525726318, "learning_rate": 4.324618736383442e-07, "loss": 3.776839017868042, "step": 2606 }, { "epoch": 5.6797385620915035, "grad_norm": 1.7798924446105957, "learning_rate": 4.3224400871459696e-07, "loss": 3.9315292835235596, "step": 2607 }, { "epoch": 5.681917211328976, "grad_norm": 2.0321929454803467, "learning_rate": 4.320261437908497e-07, "loss": 3.8917505741119385, "step": 2608 }, { "epoch": 5.684095860566448, "grad_norm": 2.8880672454833984, "learning_rate": 4.3180827886710233e-07, "loss": 4.068355083465576, "step": 2609 }, { "epoch": 5.686274509803922, "grad_norm": 2.6790153980255127, "learning_rate": 4.315904139433551e-07, "loss": 4.102491855621338, "step": 2610 }, { "epoch": 5.688453159041394, "grad_norm": 2.7597410678863525, "learning_rate": 4.313725490196078e-07, "loss": 4.175038814544678, "step": 2611 }, { "epoch": 5.690631808278867, "grad_norm": 2.3258578777313232, "learning_rate": 4.311546840958605e-07, "loss": 3.928623914718628, "step": 2612 }, { "epoch": 5.69281045751634, "grad_norm": 2.31689715385437, "learning_rate": 4.3093681917211325e-07, "loss": 3.848518133163452, "step": 2613 }, { "epoch": 5.6949891067538125, "grad_norm": 2.2761902809143066, "learning_rate": 4.30718954248366e-07, "loss": 3.9511983394622803, "step": 2614 }, { "epoch": 5.697167755991286, "grad_norm": 2.5403802394866943, "learning_rate": 4.305010893246187e-07, "loss": 4.121998310089111, "step": 2615 }, { "epoch": 5.699346405228758, "grad_norm": 2.6775107383728027, "learning_rate": 4.302832244008714e-07, "loss": 3.9874160289764404, "step": 2616 }, { "epoch": 5.701525054466231, "grad_norm": 2.0099003314971924, "learning_rate": 4.3006535947712417e-07, "loss": 3.9975833892822266, "step": 2617 }, { "epoch": 5.703703703703704, "grad_norm": 2.98152494430542, "learning_rate": 4.2984749455337686e-07, "loss": 4.111030578613281, "step": 2618 }, { "epoch": 5.705882352941177, "grad_norm": 2.4610984325408936, "learning_rate": 4.296296296296296e-07, "loss": 3.9533884525299072, "step": 2619 }, { "epoch": 5.708061002178649, "grad_norm": 2.2086970806121826, "learning_rate": 4.294117647058823e-07, "loss": 4.00372314453125, "step": 2620 }, { "epoch": 5.710239651416122, "grad_norm": 2.1680028438568115, "learning_rate": 4.2919389978213503e-07, "loss": 3.7136833667755127, "step": 2621 }, { "epoch": 5.712418300653595, "grad_norm": 2.354274034500122, "learning_rate": 4.289760348583878e-07, "loss": 4.098442077636719, "step": 2622 }, { "epoch": 5.714596949891067, "grad_norm": 1.6636000871658325, "learning_rate": 4.2875816993464046e-07, "loss": 3.991579532623291, "step": 2623 }, { "epoch": 5.716775599128541, "grad_norm": 1.9768298864364624, "learning_rate": 4.285403050108932e-07, "loss": 3.907506227493286, "step": 2624 }, { "epoch": 5.718954248366013, "grad_norm": 3.1966705322265625, "learning_rate": 4.2832244008714595e-07, "loss": 3.94140887260437, "step": 2625 }, { "epoch": 5.721132897603486, "grad_norm": 2.8067634105682373, "learning_rate": 4.2810457516339864e-07, "loss": 4.022510051727295, "step": 2626 }, { "epoch": 5.723311546840959, "grad_norm": 2.5204896926879883, "learning_rate": 4.278867102396514e-07, "loss": 3.9813497066497803, "step": 2627 }, { "epoch": 5.7254901960784315, "grad_norm": 2.991058588027954, "learning_rate": 4.276688453159041e-07, "loss": 4.058080673217773, "step": 2628 }, { "epoch": 5.727668845315904, "grad_norm": 2.4314026832580566, "learning_rate": 4.274509803921568e-07, "loss": 3.74995756149292, "step": 2629 }, { "epoch": 5.729847494553377, "grad_norm": 2.7981791496276855, "learning_rate": 4.2723311546840956e-07, "loss": 4.166955947875977, "step": 2630 }, { "epoch": 5.73202614379085, "grad_norm": 2.6349613666534424, "learning_rate": 4.270152505446623e-07, "loss": 4.099729537963867, "step": 2631 }, { "epoch": 5.734204793028322, "grad_norm": 2.4832732677459717, "learning_rate": 4.26797385620915e-07, "loss": 3.935551166534424, "step": 2632 }, { "epoch": 5.7363834422657956, "grad_norm": 2.0631015300750732, "learning_rate": 4.2657952069716773e-07, "loss": 3.8461554050445557, "step": 2633 }, { "epoch": 5.738562091503268, "grad_norm": 2.1605610847473145, "learning_rate": 4.2636165577342047e-07, "loss": 3.9164671897888184, "step": 2634 }, { "epoch": 5.7407407407407405, "grad_norm": 1.8026599884033203, "learning_rate": 4.2614379084967316e-07, "loss": 3.8615000247955322, "step": 2635 }, { "epoch": 5.742919389978214, "grad_norm": 2.2030038833618164, "learning_rate": 4.259259259259259e-07, "loss": 3.9182827472686768, "step": 2636 }, { "epoch": 5.745098039215686, "grad_norm": 2.3906962871551514, "learning_rate": 4.2570806100217865e-07, "loss": 4.133564472198486, "step": 2637 }, { "epoch": 5.747276688453159, "grad_norm": 2.149376630783081, "learning_rate": 4.2549019607843134e-07, "loss": 4.0050249099731445, "step": 2638 }, { "epoch": 5.749455337690632, "grad_norm": 2.0678155422210693, "learning_rate": 4.252723311546841e-07, "loss": 3.905987501144409, "step": 2639 }, { "epoch": 5.751633986928105, "grad_norm": 3.1490156650543213, "learning_rate": 4.250544662309368e-07, "loss": 4.0253005027771, "step": 2640 }, { "epoch": 5.753812636165577, "grad_norm": 1.7353774309158325, "learning_rate": 4.248366013071895e-07, "loss": 3.784801721572876, "step": 2641 }, { "epoch": 5.75599128540305, "grad_norm": 1.9060710668563843, "learning_rate": 4.2461873638344225e-07, "loss": 3.947833776473999, "step": 2642 }, { "epoch": 5.758169934640523, "grad_norm": 2.8345539569854736, "learning_rate": 4.2440087145969494e-07, "loss": 4.043293476104736, "step": 2643 }, { "epoch": 5.760348583877995, "grad_norm": 1.933709740638733, "learning_rate": 4.241830065359477e-07, "loss": 3.7808868885040283, "step": 2644 }, { "epoch": 5.762527233115469, "grad_norm": 1.9169070720672607, "learning_rate": 4.2396514161220043e-07, "loss": 3.9190356731414795, "step": 2645 }, { "epoch": 5.764705882352941, "grad_norm": 2.048957347869873, "learning_rate": 4.237472766884531e-07, "loss": 3.9359164237976074, "step": 2646 }, { "epoch": 5.766884531590414, "grad_norm": 2.0303955078125, "learning_rate": 4.2352941176470586e-07, "loss": 3.8808584213256836, "step": 2647 }, { "epoch": 5.769063180827887, "grad_norm": 1.9880160093307495, "learning_rate": 4.233115468409586e-07, "loss": 4.079999923706055, "step": 2648 }, { "epoch": 5.771241830065359, "grad_norm": 1.9752031564712524, "learning_rate": 4.230936819172113e-07, "loss": 3.7692012786865234, "step": 2649 }, { "epoch": 5.773420479302832, "grad_norm": 2.3841552734375, "learning_rate": 4.2287581699346403e-07, "loss": 3.867274284362793, "step": 2650 }, { "epoch": 5.775599128540305, "grad_norm": 3.2119529247283936, "learning_rate": 4.226579520697168e-07, "loss": 4.11391019821167, "step": 2651 }, { "epoch": 5.777777777777778, "grad_norm": 1.7157676219940186, "learning_rate": 4.2244008714596947e-07, "loss": 3.819920539855957, "step": 2652 }, { "epoch": 5.77995642701525, "grad_norm": 2.3560190200805664, "learning_rate": 4.222222222222222e-07, "loss": 4.034688472747803, "step": 2653 }, { "epoch": 5.7821350762527235, "grad_norm": 2.36694073677063, "learning_rate": 4.2200435729847495e-07, "loss": 4.009551048278809, "step": 2654 }, { "epoch": 5.784313725490196, "grad_norm": 1.924582600593567, "learning_rate": 4.2178649237472764e-07, "loss": 3.9132280349731445, "step": 2655 }, { "epoch": 5.786492374727668, "grad_norm": 2.388503074645996, "learning_rate": 4.215686274509804e-07, "loss": 3.9395604133605957, "step": 2656 }, { "epoch": 5.788671023965142, "grad_norm": 2.977668046951294, "learning_rate": 4.213507625272331e-07, "loss": 3.772836685180664, "step": 2657 }, { "epoch": 5.790849673202614, "grad_norm": 1.9053466320037842, "learning_rate": 4.211328976034858e-07, "loss": 3.9671216011047363, "step": 2658 }, { "epoch": 5.793028322440087, "grad_norm": 1.8380017280578613, "learning_rate": 4.2091503267973856e-07, "loss": 3.8931331634521484, "step": 2659 }, { "epoch": 5.79520697167756, "grad_norm": 2.8163392543792725, "learning_rate": 4.206971677559913e-07, "loss": 3.9400601387023926, "step": 2660 }, { "epoch": 5.7973856209150325, "grad_norm": 2.8206732273101807, "learning_rate": 4.20479302832244e-07, "loss": 4.029776096343994, "step": 2661 }, { "epoch": 5.799564270152505, "grad_norm": 2.05082106590271, "learning_rate": 4.2026143790849673e-07, "loss": 3.887138605117798, "step": 2662 }, { "epoch": 5.801742919389978, "grad_norm": 3.4134163856506348, "learning_rate": 4.200435729847495e-07, "loss": 4.031362056732178, "step": 2663 }, { "epoch": 5.803921568627451, "grad_norm": 2.383113145828247, "learning_rate": 4.1982570806100216e-07, "loss": 3.9988291263580322, "step": 2664 }, { "epoch": 5.806100217864923, "grad_norm": 2.208292007446289, "learning_rate": 4.196078431372549e-07, "loss": 3.873027801513672, "step": 2665 }, { "epoch": 5.808278867102397, "grad_norm": 2.2932066917419434, "learning_rate": 4.1938997821350765e-07, "loss": 4.146522045135498, "step": 2666 }, { "epoch": 5.810457516339869, "grad_norm": 2.0931713581085205, "learning_rate": 4.1917211328976034e-07, "loss": 3.909311056137085, "step": 2667 }, { "epoch": 5.812636165577342, "grad_norm": 2.0259337425231934, "learning_rate": 4.189542483660131e-07, "loss": 3.9741177558898926, "step": 2668 }, { "epoch": 5.814814814814815, "grad_norm": 2.3635504245758057, "learning_rate": 4.1873638344226577e-07, "loss": 3.8708181381225586, "step": 2669 }, { "epoch": 5.816993464052287, "grad_norm": 2.559295177459717, "learning_rate": 4.185185185185185e-07, "loss": 4.137092113494873, "step": 2670 }, { "epoch": 5.819172113289761, "grad_norm": 2.3032820224761963, "learning_rate": 4.1830065359477126e-07, "loss": 3.747190475463867, "step": 2671 }, { "epoch": 5.821350762527233, "grad_norm": 1.7889752388000488, "learning_rate": 4.1808278867102395e-07, "loss": 3.8897314071655273, "step": 2672 }, { "epoch": 5.823529411764706, "grad_norm": 4.177854061126709, "learning_rate": 4.178649237472767e-07, "loss": 3.99473237991333, "step": 2673 }, { "epoch": 5.825708061002179, "grad_norm": 2.3562891483306885, "learning_rate": 4.1764705882352943e-07, "loss": 3.8458049297332764, "step": 2674 }, { "epoch": 5.827886710239651, "grad_norm": 2.45624041557312, "learning_rate": 4.174291938997821e-07, "loss": 3.936573028564453, "step": 2675 }, { "epoch": 5.830065359477124, "grad_norm": 2.2671000957489014, "learning_rate": 4.1721132897603486e-07, "loss": 3.902677297592163, "step": 2676 }, { "epoch": 5.832244008714597, "grad_norm": 2.4369876384735107, "learning_rate": 4.169934640522876e-07, "loss": 3.960963487625122, "step": 2677 }, { "epoch": 5.83442265795207, "grad_norm": 2.013831853866577, "learning_rate": 4.1677559912854024e-07, "loss": 3.824352502822876, "step": 2678 }, { "epoch": 5.836601307189542, "grad_norm": 2.2829396724700928, "learning_rate": 4.16557734204793e-07, "loss": 3.8760313987731934, "step": 2679 }, { "epoch": 5.8387799564270155, "grad_norm": 2.0119521617889404, "learning_rate": 4.163398692810457e-07, "loss": 3.746777057647705, "step": 2680 }, { "epoch": 5.840958605664488, "grad_norm": 2.681353807449341, "learning_rate": 4.161220043572984e-07, "loss": 3.981935739517212, "step": 2681 }, { "epoch": 5.8431372549019605, "grad_norm": 2.326321601867676, "learning_rate": 4.1590413943355116e-07, "loss": 3.926668167114258, "step": 2682 }, { "epoch": 5.845315904139434, "grad_norm": 1.929669737815857, "learning_rate": 4.156862745098039e-07, "loss": 3.893970489501953, "step": 2683 }, { "epoch": 5.847494553376906, "grad_norm": 2.8343505859375, "learning_rate": 4.154684095860566e-07, "loss": 3.864583730697632, "step": 2684 }, { "epoch": 5.849673202614379, "grad_norm": 2.2575604915618896, "learning_rate": 4.1525054466230933e-07, "loss": 3.9042534828186035, "step": 2685 }, { "epoch": 5.851851851851852, "grad_norm": 2.1779448986053467, "learning_rate": 4.150326797385621e-07, "loss": 3.966813325881958, "step": 2686 }, { "epoch": 5.8540305010893245, "grad_norm": 2.007439374923706, "learning_rate": 4.1481481481481476e-07, "loss": 3.9220268726348877, "step": 2687 }, { "epoch": 5.856209150326797, "grad_norm": 2.8841729164123535, "learning_rate": 4.145969498910675e-07, "loss": 3.967092514038086, "step": 2688 }, { "epoch": 5.85838779956427, "grad_norm": 1.5715011358261108, "learning_rate": 4.1437908496732025e-07, "loss": 3.7554492950439453, "step": 2689 }, { "epoch": 5.860566448801743, "grad_norm": 2.165093183517456, "learning_rate": 4.1416122004357294e-07, "loss": 4.025774955749512, "step": 2690 }, { "epoch": 5.862745098039216, "grad_norm": 2.2693517208099365, "learning_rate": 4.139433551198257e-07, "loss": 3.9839367866516113, "step": 2691 }, { "epoch": 5.864923747276689, "grad_norm": 2.341226100921631, "learning_rate": 4.1372549019607837e-07, "loss": 4.0950798988342285, "step": 2692 }, { "epoch": 5.867102396514161, "grad_norm": 1.794605016708374, "learning_rate": 4.135076252723311e-07, "loss": 3.8538036346435547, "step": 2693 }, { "epoch": 5.8692810457516345, "grad_norm": 2.7539117336273193, "learning_rate": 4.1328976034858386e-07, "loss": 4.157118797302246, "step": 2694 }, { "epoch": 5.871459694989107, "grad_norm": 1.9992283582687378, "learning_rate": 4.1307189542483655e-07, "loss": 3.868530511856079, "step": 2695 }, { "epoch": 5.873638344226579, "grad_norm": 2.0789175033569336, "learning_rate": 4.128540305010893e-07, "loss": 3.9281997680664062, "step": 2696 }, { "epoch": 5.875816993464053, "grad_norm": 2.3684585094451904, "learning_rate": 4.1263616557734203e-07, "loss": 3.8534438610076904, "step": 2697 }, { "epoch": 5.877995642701525, "grad_norm": 3.6080727577209473, "learning_rate": 4.124183006535947e-07, "loss": 4.048846244812012, "step": 2698 }, { "epoch": 5.880174291938998, "grad_norm": 3.400923490524292, "learning_rate": 4.1220043572984746e-07, "loss": 4.227108955383301, "step": 2699 }, { "epoch": 5.882352941176471, "grad_norm": 2.2453441619873047, "learning_rate": 4.119825708061002e-07, "loss": 3.8142800331115723, "step": 2700 }, { "epoch": 5.8845315904139435, "grad_norm": 2.226677656173706, "learning_rate": 4.117647058823529e-07, "loss": 4.037888050079346, "step": 2701 }, { "epoch": 5.886710239651416, "grad_norm": 1.9434239864349365, "learning_rate": 4.1154684095860564e-07, "loss": 3.9082789421081543, "step": 2702 }, { "epoch": 5.888888888888889, "grad_norm": 1.9357587099075317, "learning_rate": 4.113289760348584e-07, "loss": 3.7905285358428955, "step": 2703 }, { "epoch": 5.891067538126362, "grad_norm": 2.271667003631592, "learning_rate": 4.1111111111111107e-07, "loss": 3.8400723934173584, "step": 2704 }, { "epoch": 5.893246187363834, "grad_norm": 2.3442177772521973, "learning_rate": 4.108932461873638e-07, "loss": 3.7387630939483643, "step": 2705 }, { "epoch": 5.895424836601308, "grad_norm": 2.3975791931152344, "learning_rate": 4.1067538126361655e-07, "loss": 3.911018133163452, "step": 2706 }, { "epoch": 5.89760348583878, "grad_norm": 4.048264026641846, "learning_rate": 4.1045751633986924e-07, "loss": 3.978177309036255, "step": 2707 }, { "epoch": 5.8997821350762525, "grad_norm": 1.8532121181488037, "learning_rate": 4.10239651416122e-07, "loss": 3.9556100368499756, "step": 2708 }, { "epoch": 5.901960784313726, "grad_norm": 2.081418514251709, "learning_rate": 4.1002178649237473e-07, "loss": 3.8124945163726807, "step": 2709 }, { "epoch": 5.904139433551198, "grad_norm": 2.2554779052734375, "learning_rate": 4.098039215686274e-07, "loss": 3.8083786964416504, "step": 2710 }, { "epoch": 5.906318082788671, "grad_norm": 2.4524054527282715, "learning_rate": 4.0958605664488016e-07, "loss": 3.874293088912964, "step": 2711 }, { "epoch": 5.908496732026144, "grad_norm": 2.1061952114105225, "learning_rate": 4.093681917211329e-07, "loss": 3.9136977195739746, "step": 2712 }, { "epoch": 5.910675381263617, "grad_norm": 3.443657159805298, "learning_rate": 4.091503267973856e-07, "loss": 4.014627456665039, "step": 2713 }, { "epoch": 5.912854030501089, "grad_norm": 1.9535818099975586, "learning_rate": 4.0893246187363833e-07, "loss": 3.9516966342926025, "step": 2714 }, { "epoch": 5.915032679738562, "grad_norm": 2.4827592372894287, "learning_rate": 4.08714596949891e-07, "loss": 3.875913381576538, "step": 2715 }, { "epoch": 5.917211328976035, "grad_norm": 1.5961930751800537, "learning_rate": 4.0849673202614377e-07, "loss": 3.8387022018432617, "step": 2716 }, { "epoch": 5.919389978213507, "grad_norm": 1.9023970365524292, "learning_rate": 4.082788671023965e-07, "loss": 3.7691142559051514, "step": 2717 }, { "epoch": 5.921568627450981, "grad_norm": 1.9774694442749023, "learning_rate": 4.080610021786492e-07, "loss": 3.8270163536071777, "step": 2718 }, { "epoch": 5.923747276688453, "grad_norm": 1.454064965248108, "learning_rate": 4.0784313725490194e-07, "loss": 3.851374626159668, "step": 2719 }, { "epoch": 5.925925925925926, "grad_norm": 2.9618053436279297, "learning_rate": 4.076252723311547e-07, "loss": 4.031748294830322, "step": 2720 }, { "epoch": 5.928104575163399, "grad_norm": 1.8813691139221191, "learning_rate": 4.0740740740740737e-07, "loss": 3.907438039779663, "step": 2721 }, { "epoch": 5.930283224400871, "grad_norm": 1.829480528831482, "learning_rate": 4.071895424836601e-07, "loss": 3.864711046218872, "step": 2722 }, { "epoch": 5.932461873638344, "grad_norm": 2.615191698074341, "learning_rate": 4.0697167755991286e-07, "loss": 3.9648897647857666, "step": 2723 }, { "epoch": 5.934640522875817, "grad_norm": 2.8833649158477783, "learning_rate": 4.0675381263616555e-07, "loss": 4.06254768371582, "step": 2724 }, { "epoch": 5.93681917211329, "grad_norm": 2.249730110168457, "learning_rate": 4.065359477124183e-07, "loss": 3.721928834915161, "step": 2725 }, { "epoch": 5.938997821350762, "grad_norm": 2.5788497924804688, "learning_rate": 4.0631808278867103e-07, "loss": 3.986614227294922, "step": 2726 }, { "epoch": 5.9411764705882355, "grad_norm": 2.506016731262207, "learning_rate": 4.061002178649237e-07, "loss": 4.191418170928955, "step": 2727 }, { "epoch": 5.943355119825708, "grad_norm": 1.9831163883209229, "learning_rate": 4.0588235294117646e-07, "loss": 4.001962661743164, "step": 2728 }, { "epoch": 5.94553376906318, "grad_norm": 2.2673614025115967, "learning_rate": 4.056644880174292e-07, "loss": 3.7318854331970215, "step": 2729 }, { "epoch": 5.947712418300654, "grad_norm": 2.8008623123168945, "learning_rate": 4.054466230936819e-07, "loss": 3.9830594062805176, "step": 2730 }, { "epoch": 5.949891067538126, "grad_norm": 2.0961508750915527, "learning_rate": 4.0522875816993464e-07, "loss": 3.932244062423706, "step": 2731 }, { "epoch": 5.952069716775599, "grad_norm": 1.6131685972213745, "learning_rate": 4.050108932461874e-07, "loss": 3.6689155101776123, "step": 2732 }, { "epoch": 5.954248366013072, "grad_norm": 2.001952648162842, "learning_rate": 4.0479302832244007e-07, "loss": 3.8900327682495117, "step": 2733 }, { "epoch": 5.9564270152505445, "grad_norm": 2.260925769805908, "learning_rate": 4.045751633986928e-07, "loss": 3.963017225265503, "step": 2734 }, { "epoch": 5.958605664488017, "grad_norm": 2.6928467750549316, "learning_rate": 4.0435729847494556e-07, "loss": 3.9896328449249268, "step": 2735 }, { "epoch": 5.96078431372549, "grad_norm": 3.362938404083252, "learning_rate": 4.0413943355119825e-07, "loss": 4.037815570831299, "step": 2736 }, { "epoch": 5.962962962962963, "grad_norm": 1.9294017553329468, "learning_rate": 4.03921568627451e-07, "loss": 3.9337856769561768, "step": 2737 }, { "epoch": 5.965141612200435, "grad_norm": 2.2000279426574707, "learning_rate": 4.0370370370370373e-07, "loss": 3.8770627975463867, "step": 2738 }, { "epoch": 5.967320261437909, "grad_norm": 2.2775895595550537, "learning_rate": 4.034858387799564e-07, "loss": 3.9038801193237305, "step": 2739 }, { "epoch": 5.969498910675381, "grad_norm": 1.9716219902038574, "learning_rate": 4.0326797385620916e-07, "loss": 3.943206787109375, "step": 2740 }, { "epoch": 5.9716775599128535, "grad_norm": 2.3644039630889893, "learning_rate": 4.0305010893246185e-07, "loss": 4.02962589263916, "step": 2741 }, { "epoch": 5.973856209150327, "grad_norm": 2.432537317276001, "learning_rate": 4.028322440087146e-07, "loss": 3.9139413833618164, "step": 2742 }, { "epoch": 5.976034858387799, "grad_norm": 2.7582411766052246, "learning_rate": 4.0261437908496734e-07, "loss": 4.0280609130859375, "step": 2743 }, { "epoch": 5.978213507625273, "grad_norm": 2.176626205444336, "learning_rate": 4.0239651416122003e-07, "loss": 3.9130055904388428, "step": 2744 }, { "epoch": 5.980392156862745, "grad_norm": 3.0084619522094727, "learning_rate": 4.0217864923747277e-07, "loss": 4.086002349853516, "step": 2745 }, { "epoch": 5.982570806100218, "grad_norm": 2.0140576362609863, "learning_rate": 4.019607843137255e-07, "loss": 3.8608486652374268, "step": 2746 }, { "epoch": 5.984749455337691, "grad_norm": 2.886404037475586, "learning_rate": 4.0174291938997815e-07, "loss": 4.0301995277404785, "step": 2747 }, { "epoch": 5.9869281045751634, "grad_norm": 2.2819876670837402, "learning_rate": 4.015250544662309e-07, "loss": 3.770372152328491, "step": 2748 }, { "epoch": 5.989106753812636, "grad_norm": 2.083848237991333, "learning_rate": 4.013071895424837e-07, "loss": 3.910144090652466, "step": 2749 }, { "epoch": 5.991285403050109, "grad_norm": 2.424997568130493, "learning_rate": 4.010893246187363e-07, "loss": 3.872319221496582, "step": 2750 }, { "epoch": 5.993464052287582, "grad_norm": 2.2087953090667725, "learning_rate": 4.0087145969498906e-07, "loss": 3.8855156898498535, "step": 2751 }, { "epoch": 5.995642701525054, "grad_norm": 2.290135622024536, "learning_rate": 4.006535947712418e-07, "loss": 3.9135353565216064, "step": 2752 }, { "epoch": 5.9978213507625275, "grad_norm": 2.197629690170288, "learning_rate": 4.004357298474945e-07, "loss": 3.913001298904419, "step": 2753 }, { "epoch": 6.0, "grad_norm": 1.9540055990219116, "learning_rate": 4.0021786492374724e-07, "loss": 3.9935522079467773, "step": 2754 }, { "epoch": 6.0021786492374725, "grad_norm": 2.1581790447235107, "learning_rate": 4e-07, "loss": 3.890756130218506, "step": 2755 }, { "epoch": 6.004357298474946, "grad_norm": 2.2821288108825684, "learning_rate": 3.9978213507625267e-07, "loss": 4.1803388595581055, "step": 2756 }, { "epoch": 6.006535947712418, "grad_norm": 2.0515947341918945, "learning_rate": 3.995642701525054e-07, "loss": 3.8874146938323975, "step": 2757 }, { "epoch": 6.008714596949891, "grad_norm": 2.3850741386413574, "learning_rate": 3.9934640522875816e-07, "loss": 3.898667335510254, "step": 2758 }, { "epoch": 6.010893246187364, "grad_norm": 2.2222816944122314, "learning_rate": 3.9912854030501085e-07, "loss": 3.9659416675567627, "step": 2759 }, { "epoch": 6.0130718954248366, "grad_norm": 2.3454220294952393, "learning_rate": 3.989106753812636e-07, "loss": 3.898486375808716, "step": 2760 }, { "epoch": 6.015250544662309, "grad_norm": 2.3402011394500732, "learning_rate": 3.9869281045751633e-07, "loss": 4.139708042144775, "step": 2761 }, { "epoch": 6.017429193899782, "grad_norm": 2.90443754196167, "learning_rate": 3.98474945533769e-07, "loss": 4.033339500427246, "step": 2762 }, { "epoch": 6.019607843137255, "grad_norm": 2.055462121963501, "learning_rate": 3.9825708061002176e-07, "loss": 3.87896728515625, "step": 2763 }, { "epoch": 6.021786492374727, "grad_norm": 2.348242998123169, "learning_rate": 3.9803921568627445e-07, "loss": 3.8206472396850586, "step": 2764 }, { "epoch": 6.023965141612201, "grad_norm": 2.72310733795166, "learning_rate": 3.978213507625272e-07, "loss": 3.851579427719116, "step": 2765 }, { "epoch": 6.026143790849673, "grad_norm": 3.355721950531006, "learning_rate": 3.9760348583877994e-07, "loss": 3.8435349464416504, "step": 2766 }, { "epoch": 6.028322440087146, "grad_norm": 2.7945358753204346, "learning_rate": 3.9738562091503263e-07, "loss": 4.063350200653076, "step": 2767 }, { "epoch": 6.030501089324619, "grad_norm": 1.9531781673431396, "learning_rate": 3.9716775599128537e-07, "loss": 3.930798292160034, "step": 2768 }, { "epoch": 6.032679738562091, "grad_norm": 2.2352359294891357, "learning_rate": 3.969498910675381e-07, "loss": 4.10651969909668, "step": 2769 }, { "epoch": 6.034858387799564, "grad_norm": 1.9945502281188965, "learning_rate": 3.967320261437908e-07, "loss": 3.9504239559173584, "step": 2770 }, { "epoch": 6.037037037037037, "grad_norm": 1.9642823934555054, "learning_rate": 3.9651416122004354e-07, "loss": 3.875643253326416, "step": 2771 }, { "epoch": 6.03921568627451, "grad_norm": 2.1001696586608887, "learning_rate": 3.962962962962963e-07, "loss": 3.8202455043792725, "step": 2772 }, { "epoch": 6.041394335511983, "grad_norm": 1.907766342163086, "learning_rate": 3.96078431372549e-07, "loss": 3.923729181289673, "step": 2773 }, { "epoch": 6.0435729847494555, "grad_norm": 2.828378677368164, "learning_rate": 3.958605664488017e-07, "loss": 3.864640235900879, "step": 2774 }, { "epoch": 6.045751633986928, "grad_norm": 2.457577705383301, "learning_rate": 3.9564270152505446e-07, "loss": 3.9808526039123535, "step": 2775 }, { "epoch": 6.047930283224401, "grad_norm": 1.9492075443267822, "learning_rate": 3.9542483660130715e-07, "loss": 3.895724296569824, "step": 2776 }, { "epoch": 6.050108932461874, "grad_norm": 2.855867624282837, "learning_rate": 3.952069716775599e-07, "loss": 4.026609897613525, "step": 2777 }, { "epoch": 6.052287581699346, "grad_norm": 2.401685953140259, "learning_rate": 3.9498910675381264e-07, "loss": 4.034386157989502, "step": 2778 }, { "epoch": 6.05446623093682, "grad_norm": 2.456636667251587, "learning_rate": 3.947712418300653e-07, "loss": 4.122689723968506, "step": 2779 }, { "epoch": 6.056644880174292, "grad_norm": 2.5077767372131348, "learning_rate": 3.9455337690631807e-07, "loss": 3.953502655029297, "step": 2780 }, { "epoch": 6.0588235294117645, "grad_norm": 2.493466854095459, "learning_rate": 3.943355119825708e-07, "loss": 3.8456008434295654, "step": 2781 }, { "epoch": 6.061002178649238, "grad_norm": 1.8083295822143555, "learning_rate": 3.941176470588235e-07, "loss": 3.8707079887390137, "step": 2782 }, { "epoch": 6.06318082788671, "grad_norm": 1.8042011260986328, "learning_rate": 3.9389978213507624e-07, "loss": 3.779026508331299, "step": 2783 }, { "epoch": 6.065359477124183, "grad_norm": 2.151664972305298, "learning_rate": 3.93681917211329e-07, "loss": 3.805323362350464, "step": 2784 }, { "epoch": 6.067538126361656, "grad_norm": 2.355818271636963, "learning_rate": 3.934640522875817e-07, "loss": 3.9169600009918213, "step": 2785 }, { "epoch": 6.069716775599129, "grad_norm": 2.4176559448242188, "learning_rate": 3.932461873638344e-07, "loss": 3.8970963954925537, "step": 2786 }, { "epoch": 6.071895424836601, "grad_norm": 2.540111780166626, "learning_rate": 3.930283224400871e-07, "loss": 3.9654178619384766, "step": 2787 }, { "epoch": 6.074074074074074, "grad_norm": 1.626636266708374, "learning_rate": 3.9281045751633985e-07, "loss": 3.7756693363189697, "step": 2788 }, { "epoch": 6.076252723311547, "grad_norm": 2.652946710586548, "learning_rate": 3.925925925925926e-07, "loss": 4.121876239776611, "step": 2789 }, { "epoch": 6.078431372549019, "grad_norm": 2.4139249324798584, "learning_rate": 3.923747276688453e-07, "loss": 4.131500720977783, "step": 2790 }, { "epoch": 6.080610021786493, "grad_norm": 1.7715076208114624, "learning_rate": 3.92156862745098e-07, "loss": 3.904203414916992, "step": 2791 }, { "epoch": 6.082788671023965, "grad_norm": 2.5377955436706543, "learning_rate": 3.9193899782135076e-07, "loss": 3.7966535091400146, "step": 2792 }, { "epoch": 6.084967320261438, "grad_norm": 2.0885801315307617, "learning_rate": 3.9172113289760345e-07, "loss": 3.891512393951416, "step": 2793 }, { "epoch": 6.087145969498911, "grad_norm": 2.156531572341919, "learning_rate": 3.915032679738562e-07, "loss": 3.9374799728393555, "step": 2794 }, { "epoch": 6.089324618736383, "grad_norm": 2.5689892768859863, "learning_rate": 3.9128540305010894e-07, "loss": 3.8487050533294678, "step": 2795 }, { "epoch": 6.091503267973856, "grad_norm": 2.2013235092163086, "learning_rate": 3.9106753812636163e-07, "loss": 3.898531675338745, "step": 2796 }, { "epoch": 6.093681917211329, "grad_norm": 2.5004374980926514, "learning_rate": 3.9084967320261437e-07, "loss": 3.9614272117614746, "step": 2797 }, { "epoch": 6.095860566448802, "grad_norm": 2.1822357177734375, "learning_rate": 3.906318082788671e-07, "loss": 3.849158763885498, "step": 2798 }, { "epoch": 6.098039215686274, "grad_norm": 2.3240208625793457, "learning_rate": 3.904139433551198e-07, "loss": 4.001067161560059, "step": 2799 }, { "epoch": 6.1002178649237475, "grad_norm": 3.5263564586639404, "learning_rate": 3.9019607843137255e-07, "loss": 4.136421203613281, "step": 2800 }, { "epoch": 6.10239651416122, "grad_norm": 1.7210748195648193, "learning_rate": 3.899782135076253e-07, "loss": 3.7857601642608643, "step": 2801 }, { "epoch": 6.104575163398692, "grad_norm": 2.7816381454467773, "learning_rate": 3.89760348583878e-07, "loss": 3.973454236984253, "step": 2802 }, { "epoch": 6.106753812636166, "grad_norm": 2.197359323501587, "learning_rate": 3.895424836601307e-07, "loss": 3.700894594192505, "step": 2803 }, { "epoch": 6.108932461873638, "grad_norm": 2.135517120361328, "learning_rate": 3.8932461873638346e-07, "loss": 3.8495235443115234, "step": 2804 }, { "epoch": 6.111111111111111, "grad_norm": 2.0006730556488037, "learning_rate": 3.8910675381263615e-07, "loss": 3.828949451446533, "step": 2805 }, { "epoch": 6.113289760348584, "grad_norm": 2.449094772338867, "learning_rate": 3.888888888888889e-07, "loss": 4.043616771697998, "step": 2806 }, { "epoch": 6.1154684095860565, "grad_norm": 2.0878069400787354, "learning_rate": 3.8867102396514164e-07, "loss": 3.904740810394287, "step": 2807 }, { "epoch": 6.117647058823529, "grad_norm": 2.5179474353790283, "learning_rate": 3.8845315904139433e-07, "loss": 3.916508913040161, "step": 2808 }, { "epoch": 6.119825708061002, "grad_norm": 2.484107732772827, "learning_rate": 3.8823529411764707e-07, "loss": 3.9362330436706543, "step": 2809 }, { "epoch": 6.122004357298475, "grad_norm": 3.005915403366089, "learning_rate": 3.8801742919389976e-07, "loss": 3.882631778717041, "step": 2810 }, { "epoch": 6.124183006535947, "grad_norm": 2.0040862560272217, "learning_rate": 3.877995642701525e-07, "loss": 3.85166597366333, "step": 2811 }, { "epoch": 6.126361655773421, "grad_norm": 1.9007065296173096, "learning_rate": 3.8758169934640524e-07, "loss": 3.782200813293457, "step": 2812 }, { "epoch": 6.128540305010893, "grad_norm": 1.82049560546875, "learning_rate": 3.8736383442265793e-07, "loss": 3.640810012817383, "step": 2813 }, { "epoch": 6.130718954248366, "grad_norm": 1.830133318901062, "learning_rate": 3.871459694989107e-07, "loss": 3.6519994735717773, "step": 2814 }, { "epoch": 6.132897603485839, "grad_norm": 2.6344339847564697, "learning_rate": 3.869281045751634e-07, "loss": 3.8660342693328857, "step": 2815 }, { "epoch": 6.135076252723311, "grad_norm": 2.2150161266326904, "learning_rate": 3.867102396514161e-07, "loss": 3.929478645324707, "step": 2816 }, { "epoch": 6.137254901960785, "grad_norm": 2.2873616218566895, "learning_rate": 3.8649237472766885e-07, "loss": 3.8789100646972656, "step": 2817 }, { "epoch": 6.139433551198257, "grad_norm": 2.1427316665649414, "learning_rate": 3.862745098039216e-07, "loss": 3.8884177207946777, "step": 2818 }, { "epoch": 6.14161220043573, "grad_norm": 2.5103025436401367, "learning_rate": 3.8605664488017423e-07, "loss": 4.077581405639648, "step": 2819 }, { "epoch": 6.143790849673203, "grad_norm": 2.552987575531006, "learning_rate": 3.8583877995642697e-07, "loss": 3.770615816116333, "step": 2820 }, { "epoch": 6.1459694989106755, "grad_norm": 1.9601927995681763, "learning_rate": 3.856209150326797e-07, "loss": 3.8407773971557617, "step": 2821 }, { "epoch": 6.148148148148148, "grad_norm": 2.4924614429473877, "learning_rate": 3.854030501089324e-07, "loss": 3.7955875396728516, "step": 2822 }, { "epoch": 6.150326797385621, "grad_norm": 2.0825183391571045, "learning_rate": 3.8518518518518515e-07, "loss": 3.913135051727295, "step": 2823 }, { "epoch": 6.152505446623094, "grad_norm": 3.2007412910461426, "learning_rate": 3.849673202614379e-07, "loss": 4.018160820007324, "step": 2824 }, { "epoch": 6.154684095860566, "grad_norm": 2.4269449710845947, "learning_rate": 3.847494553376906e-07, "loss": 3.951422929763794, "step": 2825 }, { "epoch": 6.1568627450980395, "grad_norm": 2.541095495223999, "learning_rate": 3.845315904139433e-07, "loss": 3.954603433609009, "step": 2826 }, { "epoch": 6.159041394335512, "grad_norm": 2.008593797683716, "learning_rate": 3.8431372549019606e-07, "loss": 3.943460464477539, "step": 2827 }, { "epoch": 6.1612200435729845, "grad_norm": 2.360502243041992, "learning_rate": 3.8409586056644875e-07, "loss": 4.0154595375061035, "step": 2828 }, { "epoch": 6.163398692810458, "grad_norm": 2.024301052093506, "learning_rate": 3.838779956427015e-07, "loss": 3.8056697845458984, "step": 2829 }, { "epoch": 6.16557734204793, "grad_norm": 2.2938482761383057, "learning_rate": 3.8366013071895424e-07, "loss": 3.8531887531280518, "step": 2830 }, { "epoch": 6.167755991285403, "grad_norm": 1.8626070022583008, "learning_rate": 3.8344226579520693e-07, "loss": 3.8344318866729736, "step": 2831 }, { "epoch": 6.169934640522876, "grad_norm": 2.0398828983306885, "learning_rate": 3.8322440087145967e-07, "loss": 3.8874497413635254, "step": 2832 }, { "epoch": 6.172113289760349, "grad_norm": 2.1307551860809326, "learning_rate": 3.830065359477124e-07, "loss": 3.964582681655884, "step": 2833 }, { "epoch": 6.174291938997821, "grad_norm": 2.773837089538574, "learning_rate": 3.827886710239651e-07, "loss": 3.913590431213379, "step": 2834 }, { "epoch": 6.176470588235294, "grad_norm": 2.6413803100585938, "learning_rate": 3.8257080610021784e-07, "loss": 3.968822479248047, "step": 2835 }, { "epoch": 6.178649237472767, "grad_norm": 2.449716329574585, "learning_rate": 3.8235294117647053e-07, "loss": 3.8939993381500244, "step": 2836 }, { "epoch": 6.180827886710239, "grad_norm": 2.269639730453491, "learning_rate": 3.821350762527233e-07, "loss": 3.824985980987549, "step": 2837 }, { "epoch": 6.183006535947713, "grad_norm": 2.967430353164673, "learning_rate": 3.81917211328976e-07, "loss": 4.055837154388428, "step": 2838 }, { "epoch": 6.185185185185185, "grad_norm": 1.5622947216033936, "learning_rate": 3.816993464052287e-07, "loss": 3.7798609733581543, "step": 2839 }, { "epoch": 6.187363834422658, "grad_norm": 1.8218069076538086, "learning_rate": 3.8148148148148145e-07, "loss": 3.918125629425049, "step": 2840 }, { "epoch": 6.189542483660131, "grad_norm": 2.5626583099365234, "learning_rate": 3.812636165577342e-07, "loss": 4.074217796325684, "step": 2841 }, { "epoch": 6.191721132897603, "grad_norm": 2.006070375442505, "learning_rate": 3.810457516339869e-07, "loss": 3.8673083782196045, "step": 2842 }, { "epoch": 6.193899782135076, "grad_norm": 2.9105818271636963, "learning_rate": 3.808278867102396e-07, "loss": 3.96002459526062, "step": 2843 }, { "epoch": 6.196078431372549, "grad_norm": 2.417985439300537, "learning_rate": 3.8061002178649237e-07, "loss": 3.8983066082000732, "step": 2844 }, { "epoch": 6.198257080610022, "grad_norm": 2.2709453105926514, "learning_rate": 3.8039215686274506e-07, "loss": 3.9501757621765137, "step": 2845 }, { "epoch": 6.200435729847494, "grad_norm": 1.7640527486801147, "learning_rate": 3.801742919389978e-07, "loss": 3.948831558227539, "step": 2846 }, { "epoch": 6.2026143790849675, "grad_norm": 2.7877869606018066, "learning_rate": 3.7995642701525054e-07, "loss": 4.0967512130737305, "step": 2847 }, { "epoch": 6.20479302832244, "grad_norm": 1.9877640008926392, "learning_rate": 3.7973856209150323e-07, "loss": 3.917044162750244, "step": 2848 }, { "epoch": 6.206971677559913, "grad_norm": 2.033499002456665, "learning_rate": 3.79520697167756e-07, "loss": 3.909771203994751, "step": 2849 }, { "epoch": 6.209150326797386, "grad_norm": 2.3336668014526367, "learning_rate": 3.793028322440087e-07, "loss": 3.9167516231536865, "step": 2850 }, { "epoch": 6.211328976034858, "grad_norm": 2.2667622566223145, "learning_rate": 3.790849673202614e-07, "loss": 3.889993906021118, "step": 2851 }, { "epoch": 6.213507625272332, "grad_norm": 4.0845465660095215, "learning_rate": 3.7886710239651415e-07, "loss": 4.039486408233643, "step": 2852 }, { "epoch": 6.215686274509804, "grad_norm": 1.81318199634552, "learning_rate": 3.786492374727669e-07, "loss": 3.8058764934539795, "step": 2853 }, { "epoch": 6.2178649237472765, "grad_norm": 3.310227870941162, "learning_rate": 3.784313725490196e-07, "loss": 4.204593658447266, "step": 2854 }, { "epoch": 6.22004357298475, "grad_norm": 1.8823285102844238, "learning_rate": 3.782135076252723e-07, "loss": 3.9917752742767334, "step": 2855 }, { "epoch": 6.222222222222222, "grad_norm": 2.587829351425171, "learning_rate": 3.7799564270152507e-07, "loss": 3.945617914199829, "step": 2856 }, { "epoch": 6.224400871459695, "grad_norm": 1.9478639364242554, "learning_rate": 3.7777777777777775e-07, "loss": 3.906728982925415, "step": 2857 }, { "epoch": 6.226579520697168, "grad_norm": 2.5859899520874023, "learning_rate": 3.775599128540305e-07, "loss": 3.896052360534668, "step": 2858 }, { "epoch": 6.228758169934641, "grad_norm": 1.9385472536087036, "learning_rate": 3.773420479302832e-07, "loss": 3.9157543182373047, "step": 2859 }, { "epoch": 6.230936819172113, "grad_norm": 2.1808862686157227, "learning_rate": 3.7712418300653593e-07, "loss": 3.82599139213562, "step": 2860 }, { "epoch": 6.233115468409586, "grad_norm": 2.4330925941467285, "learning_rate": 3.7690631808278867e-07, "loss": 3.869316577911377, "step": 2861 }, { "epoch": 6.235294117647059, "grad_norm": 2.5812525749206543, "learning_rate": 3.7668845315904136e-07, "loss": 3.9279696941375732, "step": 2862 }, { "epoch": 6.237472766884531, "grad_norm": 2.217205286026001, "learning_rate": 3.764705882352941e-07, "loss": 3.9154551029205322, "step": 2863 }, { "epoch": 6.239651416122005, "grad_norm": 2.9726850986480713, "learning_rate": 3.7625272331154685e-07, "loss": 4.003962516784668, "step": 2864 }, { "epoch": 6.241830065359477, "grad_norm": 1.9228473901748657, "learning_rate": 3.7603485838779954e-07, "loss": 3.9218032360076904, "step": 2865 }, { "epoch": 6.24400871459695, "grad_norm": 2.7837131023406982, "learning_rate": 3.758169934640523e-07, "loss": 3.877596139907837, "step": 2866 }, { "epoch": 6.246187363834423, "grad_norm": 2.2761573791503906, "learning_rate": 3.75599128540305e-07, "loss": 3.958130359649658, "step": 2867 }, { "epoch": 6.248366013071895, "grad_norm": 2.854459047317505, "learning_rate": 3.753812636165577e-07, "loss": 3.982140064239502, "step": 2868 }, { "epoch": 6.250544662309368, "grad_norm": 2.5942885875701904, "learning_rate": 3.7516339869281045e-07, "loss": 4.057247638702393, "step": 2869 }, { "epoch": 6.252723311546841, "grad_norm": 1.9385038614273071, "learning_rate": 3.749455337690632e-07, "loss": 3.9003939628601074, "step": 2870 }, { "epoch": 6.254901960784314, "grad_norm": 2.65320086479187, "learning_rate": 3.747276688453159e-07, "loss": 4.008857727050781, "step": 2871 }, { "epoch": 6.257080610021786, "grad_norm": 1.8623385429382324, "learning_rate": 3.7450980392156863e-07, "loss": 3.7533652782440186, "step": 2872 }, { "epoch": 6.2592592592592595, "grad_norm": 2.5272018909454346, "learning_rate": 3.7429193899782137e-07, "loss": 3.910216808319092, "step": 2873 }, { "epoch": 6.261437908496732, "grad_norm": 1.88286292552948, "learning_rate": 3.7407407407407406e-07, "loss": 3.802557945251465, "step": 2874 }, { "epoch": 6.2636165577342044, "grad_norm": 2.163102626800537, "learning_rate": 3.738562091503268e-07, "loss": 3.8898754119873047, "step": 2875 }, { "epoch": 6.265795206971678, "grad_norm": 2.735653877258301, "learning_rate": 3.7363834422657954e-07, "loss": 4.037626266479492, "step": 2876 }, { "epoch": 6.26797385620915, "grad_norm": 2.2453653812408447, "learning_rate": 3.7342047930283223e-07, "loss": 3.918471097946167, "step": 2877 }, { "epoch": 6.270152505446623, "grad_norm": 2.288511037826538, "learning_rate": 3.73202614379085e-07, "loss": 3.874009847640991, "step": 2878 }, { "epoch": 6.272331154684096, "grad_norm": 2.3177273273468018, "learning_rate": 3.729847494553377e-07, "loss": 3.9546852111816406, "step": 2879 }, { "epoch": 6.2745098039215685, "grad_norm": 2.894563913345337, "learning_rate": 3.727668845315904e-07, "loss": 4.044673919677734, "step": 2880 }, { "epoch": 6.276688453159041, "grad_norm": 2.1827526092529297, "learning_rate": 3.7254901960784315e-07, "loss": 3.896505355834961, "step": 2881 }, { "epoch": 6.278867102396514, "grad_norm": 1.640568494796753, "learning_rate": 3.7233115468409584e-07, "loss": 3.7612998485565186, "step": 2882 }, { "epoch": 6.281045751633987, "grad_norm": 2.2295453548431396, "learning_rate": 3.721132897603486e-07, "loss": 3.852679491043091, "step": 2883 }, { "epoch": 6.283224400871459, "grad_norm": 2.122971296310425, "learning_rate": 3.718954248366013e-07, "loss": 3.9805266857147217, "step": 2884 }, { "epoch": 6.285403050108933, "grad_norm": 1.9563064575195312, "learning_rate": 3.71677559912854e-07, "loss": 3.8753774166107178, "step": 2885 }, { "epoch": 6.287581699346405, "grad_norm": 2.005053758621216, "learning_rate": 3.7145969498910676e-07, "loss": 3.7840960025787354, "step": 2886 }, { "epoch": 6.289760348583878, "grad_norm": 2.0297915935516357, "learning_rate": 3.712418300653595e-07, "loss": 3.912670612335205, "step": 2887 }, { "epoch": 6.291938997821351, "grad_norm": 2.1387417316436768, "learning_rate": 3.7102396514161214e-07, "loss": 3.984457492828369, "step": 2888 }, { "epoch": 6.294117647058823, "grad_norm": 1.7951983213424683, "learning_rate": 3.708061002178649e-07, "loss": 3.8873531818389893, "step": 2889 }, { "epoch": 6.296296296296296, "grad_norm": 2.503214120864868, "learning_rate": 3.705882352941176e-07, "loss": 3.9130465984344482, "step": 2890 }, { "epoch": 6.298474945533769, "grad_norm": 2.230795383453369, "learning_rate": 3.703703703703703e-07, "loss": 3.7207350730895996, "step": 2891 }, { "epoch": 6.300653594771242, "grad_norm": 2.015151262283325, "learning_rate": 3.7015250544662305e-07, "loss": 3.8634963035583496, "step": 2892 }, { "epoch": 6.302832244008715, "grad_norm": 2.0331153869628906, "learning_rate": 3.699346405228758e-07, "loss": 3.983307123184204, "step": 2893 }, { "epoch": 6.3050108932461875, "grad_norm": 1.966975212097168, "learning_rate": 3.697167755991285e-07, "loss": 3.9104998111724854, "step": 2894 }, { "epoch": 6.30718954248366, "grad_norm": 2.804633855819702, "learning_rate": 3.6949891067538123e-07, "loss": 4.132657527923584, "step": 2895 }, { "epoch": 6.309368191721133, "grad_norm": 2.994953155517578, "learning_rate": 3.6928104575163397e-07, "loss": 4.042706489562988, "step": 2896 }, { "epoch": 6.311546840958606, "grad_norm": 1.9726364612579346, "learning_rate": 3.6906318082788666e-07, "loss": 3.913964033126831, "step": 2897 }, { "epoch": 6.313725490196078, "grad_norm": 2.9082083702087402, "learning_rate": 3.688453159041394e-07, "loss": 4.077859401702881, "step": 2898 }, { "epoch": 6.315904139433552, "grad_norm": 3.207425117492676, "learning_rate": 3.6862745098039214e-07, "loss": 4.061360836029053, "step": 2899 }, { "epoch": 6.318082788671024, "grad_norm": 2.67976450920105, "learning_rate": 3.6840958605664483e-07, "loss": 3.8711700439453125, "step": 2900 }, { "epoch": 6.3202614379084965, "grad_norm": 2.2568328380584717, "learning_rate": 3.681917211328976e-07, "loss": 3.8837368488311768, "step": 2901 }, { "epoch": 6.32244008714597, "grad_norm": 2.018939733505249, "learning_rate": 3.679738562091503e-07, "loss": 3.9284090995788574, "step": 2902 }, { "epoch": 6.324618736383442, "grad_norm": 3.1715540885925293, "learning_rate": 3.67755991285403e-07, "loss": 3.9041826725006104, "step": 2903 }, { "epoch": 6.326797385620915, "grad_norm": 2.2153737545013428, "learning_rate": 3.6753812636165575e-07, "loss": 3.8855648040771484, "step": 2904 }, { "epoch": 6.328976034858388, "grad_norm": 3.0897469520568848, "learning_rate": 3.673202614379085e-07, "loss": 3.892916202545166, "step": 2905 }, { "epoch": 6.331154684095861, "grad_norm": 2.5365817546844482, "learning_rate": 3.671023965141612e-07, "loss": 3.914942741394043, "step": 2906 }, { "epoch": 6.333333333333333, "grad_norm": 2.2578659057617188, "learning_rate": 3.668845315904139e-07, "loss": 3.9496216773986816, "step": 2907 }, { "epoch": 6.335511982570806, "grad_norm": 2.6075918674468994, "learning_rate": 3.666666666666666e-07, "loss": 3.9652979373931885, "step": 2908 }, { "epoch": 6.337690631808279, "grad_norm": 2.951972484588623, "learning_rate": 3.6644880174291936e-07, "loss": 3.8927671909332275, "step": 2909 }, { "epoch": 6.339869281045751, "grad_norm": 2.3737587928771973, "learning_rate": 3.662309368191721e-07, "loss": 3.897341012954712, "step": 2910 }, { "epoch": 6.342047930283225, "grad_norm": 2.1586451530456543, "learning_rate": 3.660130718954248e-07, "loss": 3.8299310207366943, "step": 2911 }, { "epoch": 6.344226579520697, "grad_norm": 1.900429129600525, "learning_rate": 3.6579520697167753e-07, "loss": 3.7779126167297363, "step": 2912 }, { "epoch": 6.34640522875817, "grad_norm": 1.7859916687011719, "learning_rate": 3.655773420479303e-07, "loss": 3.920595645904541, "step": 2913 }, { "epoch": 6.348583877995643, "grad_norm": 2.1546592712402344, "learning_rate": 3.6535947712418296e-07, "loss": 3.9284286499023438, "step": 2914 }, { "epoch": 6.350762527233115, "grad_norm": 1.9318739175796509, "learning_rate": 3.651416122004357e-07, "loss": 3.8978562355041504, "step": 2915 }, { "epoch": 6.352941176470588, "grad_norm": 3.121138095855713, "learning_rate": 3.6492374727668845e-07, "loss": 3.9574220180511475, "step": 2916 }, { "epoch": 6.355119825708061, "grad_norm": 3.6481125354766846, "learning_rate": 3.6470588235294114e-07, "loss": 4.046625137329102, "step": 2917 }, { "epoch": 6.357298474945534, "grad_norm": 1.8479105234146118, "learning_rate": 3.644880174291939e-07, "loss": 3.9784021377563477, "step": 2918 }, { "epoch": 6.359477124183006, "grad_norm": 2.0669946670532227, "learning_rate": 3.642701525054466e-07, "loss": 3.8222403526306152, "step": 2919 }, { "epoch": 6.3616557734204795, "grad_norm": 2.2020041942596436, "learning_rate": 3.640522875816993e-07, "loss": 3.8195385932922363, "step": 2920 }, { "epoch": 6.363834422657952, "grad_norm": 2.1006252765655518, "learning_rate": 3.6383442265795206e-07, "loss": 3.9472856521606445, "step": 2921 }, { "epoch": 6.366013071895424, "grad_norm": 1.7385783195495605, "learning_rate": 3.636165577342048e-07, "loss": 3.9088361263275146, "step": 2922 }, { "epoch": 6.368191721132898, "grad_norm": 2.2009735107421875, "learning_rate": 3.633986928104575e-07, "loss": 3.9042751789093018, "step": 2923 }, { "epoch": 6.37037037037037, "grad_norm": 2.2148616313934326, "learning_rate": 3.6318082788671023e-07, "loss": 4.027496337890625, "step": 2924 }, { "epoch": 6.372549019607844, "grad_norm": 1.7882962226867676, "learning_rate": 3.6296296296296297e-07, "loss": 3.750417709350586, "step": 2925 }, { "epoch": 6.374727668845316, "grad_norm": 1.757178544998169, "learning_rate": 3.6274509803921566e-07, "loss": 3.7279624938964844, "step": 2926 }, { "epoch": 6.3769063180827885, "grad_norm": 2.9875667095184326, "learning_rate": 3.625272331154684e-07, "loss": 4.140126705169678, "step": 2927 }, { "epoch": 6.379084967320262, "grad_norm": 2.653048515319824, "learning_rate": 3.6230936819172115e-07, "loss": 3.9882304668426514, "step": 2928 }, { "epoch": 6.381263616557734, "grad_norm": 2.6247081756591797, "learning_rate": 3.6209150326797384e-07, "loss": 3.9253458976745605, "step": 2929 }, { "epoch": 6.383442265795207, "grad_norm": 2.051320791244507, "learning_rate": 3.618736383442266e-07, "loss": 3.8979475498199463, "step": 2930 }, { "epoch": 6.38562091503268, "grad_norm": 2.3620944023132324, "learning_rate": 3.6165577342047927e-07, "loss": 3.943554401397705, "step": 2931 }, { "epoch": 6.387799564270153, "grad_norm": 2.3867650032043457, "learning_rate": 3.61437908496732e-07, "loss": 4.063164710998535, "step": 2932 }, { "epoch": 6.389978213507625, "grad_norm": 2.506464958190918, "learning_rate": 3.6122004357298475e-07, "loss": 3.9457550048828125, "step": 2933 }, { "epoch": 6.392156862745098, "grad_norm": 2.332274913787842, "learning_rate": 3.6100217864923744e-07, "loss": 3.6512362957000732, "step": 2934 }, { "epoch": 6.394335511982571, "grad_norm": 2.0359084606170654, "learning_rate": 3.607843137254902e-07, "loss": 3.9739272594451904, "step": 2935 }, { "epoch": 6.396514161220043, "grad_norm": 1.937720537185669, "learning_rate": 3.6056644880174293e-07, "loss": 3.9492201805114746, "step": 2936 }, { "epoch": 6.398692810457517, "grad_norm": 2.2559115886688232, "learning_rate": 3.603485838779956e-07, "loss": 3.882575035095215, "step": 2937 }, { "epoch": 6.400871459694989, "grad_norm": 1.9189167022705078, "learning_rate": 3.6013071895424836e-07, "loss": 3.6928093433380127, "step": 2938 }, { "epoch": 6.403050108932462, "grad_norm": 3.1577606201171875, "learning_rate": 3.599128540305011e-07, "loss": 3.8995063304901123, "step": 2939 }, { "epoch": 6.405228758169935, "grad_norm": 3.072448253631592, "learning_rate": 3.596949891067538e-07, "loss": 3.871556520462036, "step": 2940 }, { "epoch": 6.407407407407407, "grad_norm": 2.0533440113067627, "learning_rate": 3.5947712418300653e-07, "loss": 3.9532320499420166, "step": 2941 }, { "epoch": 6.40958605664488, "grad_norm": 1.960924506187439, "learning_rate": 3.592592592592593e-07, "loss": 3.770024299621582, "step": 2942 }, { "epoch": 6.411764705882353, "grad_norm": 2.768232583999634, "learning_rate": 3.5904139433551197e-07, "loss": 3.976017713546753, "step": 2943 }, { "epoch": 6.413943355119826, "grad_norm": 1.6107642650604248, "learning_rate": 3.588235294117647e-07, "loss": 3.83986496925354, "step": 2944 }, { "epoch": 6.416122004357298, "grad_norm": 2.173977851867676, "learning_rate": 3.5860566448801745e-07, "loss": 3.98513126373291, "step": 2945 }, { "epoch": 6.4183006535947715, "grad_norm": 2.4649524688720703, "learning_rate": 3.5838779956427014e-07, "loss": 3.944411277770996, "step": 2946 }, { "epoch": 6.420479302832244, "grad_norm": 2.4279651641845703, "learning_rate": 3.581699346405229e-07, "loss": 3.8748202323913574, "step": 2947 }, { "epoch": 6.4226579520697165, "grad_norm": 1.897396445274353, "learning_rate": 3.579520697167756e-07, "loss": 4.000637531280518, "step": 2948 }, { "epoch": 6.42483660130719, "grad_norm": 2.9975717067718506, "learning_rate": 3.577342047930283e-07, "loss": 3.921278476715088, "step": 2949 }, { "epoch": 6.427015250544662, "grad_norm": 2.27323055267334, "learning_rate": 3.5751633986928106e-07, "loss": 3.842027425765991, "step": 2950 }, { "epoch": 6.429193899782135, "grad_norm": 2.671358346939087, "learning_rate": 3.572984749455338e-07, "loss": 3.9840023517608643, "step": 2951 }, { "epoch": 6.431372549019608, "grad_norm": 2.725776195526123, "learning_rate": 3.570806100217865e-07, "loss": 4.012476921081543, "step": 2952 }, { "epoch": 6.4335511982570806, "grad_norm": 2.3042917251586914, "learning_rate": 3.5686274509803923e-07, "loss": 3.872568130493164, "step": 2953 }, { "epoch": 6.435729847494553, "grad_norm": 2.0946738719940186, "learning_rate": 3.566448801742919e-07, "loss": 3.8584043979644775, "step": 2954 }, { "epoch": 6.437908496732026, "grad_norm": 1.9776564836502075, "learning_rate": 3.5642701525054466e-07, "loss": 3.935131072998047, "step": 2955 }, { "epoch": 6.440087145969499, "grad_norm": 2.482260227203369, "learning_rate": 3.562091503267974e-07, "loss": 3.8584799766540527, "step": 2956 }, { "epoch": 6.442265795206971, "grad_norm": 3.046379566192627, "learning_rate": 3.5599128540305004e-07, "loss": 4.037785530090332, "step": 2957 }, { "epoch": 6.444444444444445, "grad_norm": 2.0044007301330566, "learning_rate": 3.557734204793028e-07, "loss": 3.9252264499664307, "step": 2958 }, { "epoch": 6.446623093681917, "grad_norm": 3.582510471343994, "learning_rate": 3.5555555555555553e-07, "loss": 3.984574794769287, "step": 2959 }, { "epoch": 6.44880174291939, "grad_norm": 2.440497636795044, "learning_rate": 3.553376906318082e-07, "loss": 3.9592173099517822, "step": 2960 }, { "epoch": 6.450980392156863, "grad_norm": 2.111196279525757, "learning_rate": 3.5511982570806096e-07, "loss": 3.9511470794677734, "step": 2961 }, { "epoch": 6.453159041394335, "grad_norm": 2.2958030700683594, "learning_rate": 3.549019607843137e-07, "loss": 3.882079839706421, "step": 2962 }, { "epoch": 6.455337690631808, "grad_norm": 2.0358786582946777, "learning_rate": 3.546840958605664e-07, "loss": 3.7771518230438232, "step": 2963 }, { "epoch": 6.457516339869281, "grad_norm": 2.149200439453125, "learning_rate": 3.5446623093681913e-07, "loss": 3.89250111579895, "step": 2964 }, { "epoch": 6.459694989106754, "grad_norm": 2.068413496017456, "learning_rate": 3.542483660130719e-07, "loss": 3.741276264190674, "step": 2965 }, { "epoch": 6.461873638344226, "grad_norm": 2.8882975578308105, "learning_rate": 3.5403050108932457e-07, "loss": 4.020902156829834, "step": 2966 }, { "epoch": 6.4640522875816995, "grad_norm": 2.3048551082611084, "learning_rate": 3.538126361655773e-07, "loss": 3.7213385105133057, "step": 2967 }, { "epoch": 6.466230936819172, "grad_norm": 2.81545352935791, "learning_rate": 3.5359477124183005e-07, "loss": 4.132477283477783, "step": 2968 }, { "epoch": 6.468409586056645, "grad_norm": 2.1324141025543213, "learning_rate": 3.5337690631808274e-07, "loss": 3.883312463760376, "step": 2969 }, { "epoch": 6.470588235294118, "grad_norm": 2.5512821674346924, "learning_rate": 3.531590413943355e-07, "loss": 3.8964755535125732, "step": 2970 }, { "epoch": 6.47276688453159, "grad_norm": 2.2697551250457764, "learning_rate": 3.529411764705882e-07, "loss": 3.8748598098754883, "step": 2971 }, { "epoch": 6.474945533769064, "grad_norm": 2.857429265975952, "learning_rate": 3.527233115468409e-07, "loss": 3.9744925498962402, "step": 2972 }, { "epoch": 6.477124183006536, "grad_norm": 1.563535213470459, "learning_rate": 3.5250544662309366e-07, "loss": 3.9701507091522217, "step": 2973 }, { "epoch": 6.4793028322440085, "grad_norm": 1.8370935916900635, "learning_rate": 3.522875816993464e-07, "loss": 3.8983235359191895, "step": 2974 }, { "epoch": 6.481481481481482, "grad_norm": 2.755561113357544, "learning_rate": 3.520697167755991e-07, "loss": 3.853394031524658, "step": 2975 }, { "epoch": 6.483660130718954, "grad_norm": 2.5147335529327393, "learning_rate": 3.5185185185185183e-07, "loss": 3.9784810543060303, "step": 2976 }, { "epoch": 6.485838779956427, "grad_norm": 2.2019877433776855, "learning_rate": 3.516339869281046e-07, "loss": 3.985142469406128, "step": 2977 }, { "epoch": 6.4880174291939, "grad_norm": 1.8553874492645264, "learning_rate": 3.5141612200435726e-07, "loss": 3.843658685684204, "step": 2978 }, { "epoch": 6.490196078431373, "grad_norm": 2.636251926422119, "learning_rate": 3.5119825708061e-07, "loss": 4.011788368225098, "step": 2979 }, { "epoch": 6.492374727668845, "grad_norm": 2.267942190170288, "learning_rate": 3.509803921568627e-07, "loss": 4.02308988571167, "step": 2980 }, { "epoch": 6.494553376906318, "grad_norm": 2.49973464012146, "learning_rate": 3.5076252723311544e-07, "loss": 3.8782384395599365, "step": 2981 }, { "epoch": 6.496732026143791, "grad_norm": 2.0294601917266846, "learning_rate": 3.505446623093682e-07, "loss": 3.8605432510375977, "step": 2982 }, { "epoch": 6.498910675381263, "grad_norm": 2.4433374404907227, "learning_rate": 3.5032679738562087e-07, "loss": 3.8513569831848145, "step": 2983 }, { "epoch": 6.501089324618737, "grad_norm": 2.6026201248168945, "learning_rate": 3.501089324618736e-07, "loss": 3.982654333114624, "step": 2984 }, { "epoch": 6.503267973856209, "grad_norm": 2.0982296466827393, "learning_rate": 3.4989106753812636e-07, "loss": 3.9492197036743164, "step": 2985 }, { "epoch": 6.505446623093682, "grad_norm": 1.9462487697601318, "learning_rate": 3.4967320261437904e-07, "loss": 3.826007604598999, "step": 2986 }, { "epoch": 6.507625272331155, "grad_norm": 1.8200119733810425, "learning_rate": 3.494553376906318e-07, "loss": 3.8337481021881104, "step": 2987 }, { "epoch": 6.509803921568627, "grad_norm": 3.03653621673584, "learning_rate": 3.4923747276688453e-07, "loss": 3.99133038520813, "step": 2988 }, { "epoch": 6.5119825708061, "grad_norm": 2.0483529567718506, "learning_rate": 3.490196078431372e-07, "loss": 3.9532501697540283, "step": 2989 }, { "epoch": 6.514161220043573, "grad_norm": 2.680312156677246, "learning_rate": 3.4880174291938996e-07, "loss": 3.8817358016967773, "step": 2990 }, { "epoch": 6.516339869281046, "grad_norm": 2.096580743789673, "learning_rate": 3.485838779956427e-07, "loss": 3.9716339111328125, "step": 2991 }, { "epoch": 6.518518518518518, "grad_norm": 2.1546998023986816, "learning_rate": 3.483660130718954e-07, "loss": 3.9912548065185547, "step": 2992 }, { "epoch": 6.5206971677559915, "grad_norm": 1.9747551679611206, "learning_rate": 3.4814814814814814e-07, "loss": 3.847069263458252, "step": 2993 }, { "epoch": 6.522875816993464, "grad_norm": 2.2029623985290527, "learning_rate": 3.479302832244009e-07, "loss": 3.8972973823547363, "step": 2994 }, { "epoch": 6.525054466230936, "grad_norm": 2.0458528995513916, "learning_rate": 3.4771241830065357e-07, "loss": 3.8033528327941895, "step": 2995 }, { "epoch": 6.52723311546841, "grad_norm": 2.5909583568573, "learning_rate": 3.474945533769063e-07, "loss": 3.9549102783203125, "step": 2996 }, { "epoch": 6.529411764705882, "grad_norm": 2.011653184890747, "learning_rate": 3.4727668845315905e-07, "loss": 3.846651554107666, "step": 2997 }, { "epoch": 6.531590413943356, "grad_norm": 2.2680180072784424, "learning_rate": 3.4705882352941174e-07, "loss": 3.8880059719085693, "step": 2998 }, { "epoch": 6.533769063180828, "grad_norm": 1.8188364505767822, "learning_rate": 3.468409586056645e-07, "loss": 3.9019479751586914, "step": 2999 }, { "epoch": 6.5359477124183005, "grad_norm": 2.8535215854644775, "learning_rate": 3.4662309368191723e-07, "loss": 4.105434417724609, "step": 3000 }, { "epoch": 6.538126361655774, "grad_norm": 2.394313097000122, "learning_rate": 3.464052287581699e-07, "loss": 4.0971479415893555, "step": 3001 }, { "epoch": 6.540305010893246, "grad_norm": 1.986720085144043, "learning_rate": 3.4618736383442266e-07, "loss": 3.820096492767334, "step": 3002 }, { "epoch": 6.542483660130719, "grad_norm": 1.8284218311309814, "learning_rate": 3.4596949891067535e-07, "loss": 3.823145627975464, "step": 3003 }, { "epoch": 6.544662309368192, "grad_norm": 2.500702381134033, "learning_rate": 3.457516339869281e-07, "loss": 3.8936562538146973, "step": 3004 }, { "epoch": 6.546840958605665, "grad_norm": 2.0503928661346436, "learning_rate": 3.4553376906318083e-07, "loss": 3.6996803283691406, "step": 3005 }, { "epoch": 6.549019607843137, "grad_norm": 2.5338430404663086, "learning_rate": 3.453159041394335e-07, "loss": 3.8191261291503906, "step": 3006 }, { "epoch": 6.55119825708061, "grad_norm": 1.7927347421646118, "learning_rate": 3.4509803921568627e-07, "loss": 3.6997828483581543, "step": 3007 }, { "epoch": 6.553376906318083, "grad_norm": 2.2854204177856445, "learning_rate": 3.44880174291939e-07, "loss": 3.9650588035583496, "step": 3008 }, { "epoch": 6.555555555555555, "grad_norm": 2.41652512550354, "learning_rate": 3.446623093681917e-07, "loss": 4.036604404449463, "step": 3009 }, { "epoch": 6.557734204793029, "grad_norm": 2.2954659461975098, "learning_rate": 3.4444444444444444e-07, "loss": 4.124475479125977, "step": 3010 }, { "epoch": 6.559912854030501, "grad_norm": 2.1567347049713135, "learning_rate": 3.442265795206972e-07, "loss": 3.8426215648651123, "step": 3011 }, { "epoch": 6.562091503267974, "grad_norm": 1.7745656967163086, "learning_rate": 3.4400871459694987e-07, "loss": 3.8356471061706543, "step": 3012 }, { "epoch": 6.564270152505447, "grad_norm": 1.7558118104934692, "learning_rate": 3.437908496732026e-07, "loss": 3.7647130489349365, "step": 3013 }, { "epoch": 6.5664488017429194, "grad_norm": 2.557704448699951, "learning_rate": 3.4357298474945536e-07, "loss": 3.8834216594696045, "step": 3014 }, { "epoch": 6.568627450980392, "grad_norm": 1.6821253299713135, "learning_rate": 3.4335511982570805e-07, "loss": 3.7506043910980225, "step": 3015 }, { "epoch": 6.570806100217865, "grad_norm": 2.0712523460388184, "learning_rate": 3.431372549019608e-07, "loss": 3.7994205951690674, "step": 3016 }, { "epoch": 6.572984749455338, "grad_norm": 2.796710252761841, "learning_rate": 3.4291938997821353e-07, "loss": 3.9927890300750732, "step": 3017 }, { "epoch": 6.57516339869281, "grad_norm": 2.669827461242676, "learning_rate": 3.427015250544662e-07, "loss": 3.970262289047241, "step": 3018 }, { "epoch": 6.5773420479302835, "grad_norm": 1.8571062088012695, "learning_rate": 3.4248366013071896e-07, "loss": 3.8717565536499023, "step": 3019 }, { "epoch": 6.579520697167756, "grad_norm": 2.1626546382904053, "learning_rate": 3.422657952069717e-07, "loss": 3.914860725402832, "step": 3020 }, { "epoch": 6.5816993464052285, "grad_norm": 2.824429988861084, "learning_rate": 3.420479302832244e-07, "loss": 3.8955113887786865, "step": 3021 }, { "epoch": 6.583877995642702, "grad_norm": 2.551236867904663, "learning_rate": 3.4183006535947714e-07, "loss": 3.909958839416504, "step": 3022 }, { "epoch": 6.586056644880174, "grad_norm": 3.4885129928588867, "learning_rate": 3.416122004357299e-07, "loss": 4.070733547210693, "step": 3023 }, { "epoch": 6.588235294117647, "grad_norm": 1.980860710144043, "learning_rate": 3.4139433551198257e-07, "loss": 3.8110055923461914, "step": 3024 }, { "epoch": 6.59041394335512, "grad_norm": 2.01179575920105, "learning_rate": 3.411764705882353e-07, "loss": 3.7341105937957764, "step": 3025 }, { "epoch": 6.592592592592593, "grad_norm": 2.025097370147705, "learning_rate": 3.4095860566448795e-07, "loss": 3.812980890274048, "step": 3026 }, { "epoch": 6.594771241830065, "grad_norm": 2.5649447441101074, "learning_rate": 3.407407407407407e-07, "loss": 3.8157222270965576, "step": 3027 }, { "epoch": 6.596949891067538, "grad_norm": 1.7688277959823608, "learning_rate": 3.4052287581699343e-07, "loss": 3.8471620082855225, "step": 3028 }, { "epoch": 6.599128540305011, "grad_norm": 2.309217691421509, "learning_rate": 3.403050108932461e-07, "loss": 4.030097007751465, "step": 3029 }, { "epoch": 6.601307189542483, "grad_norm": 2.16914701461792, "learning_rate": 3.4008714596949887e-07, "loss": 3.8934149742126465, "step": 3030 }, { "epoch": 6.603485838779957, "grad_norm": 2.1111106872558594, "learning_rate": 3.398692810457516e-07, "loss": 3.947129964828491, "step": 3031 }, { "epoch": 6.605664488017429, "grad_norm": 2.576458692550659, "learning_rate": 3.396514161220043e-07, "loss": 4.114477157592773, "step": 3032 }, { "epoch": 6.607843137254902, "grad_norm": 1.8736047744750977, "learning_rate": 3.3943355119825704e-07, "loss": 3.7360317707061768, "step": 3033 }, { "epoch": 6.610021786492375, "grad_norm": 2.38100266456604, "learning_rate": 3.392156862745098e-07, "loss": 3.7571065425872803, "step": 3034 }, { "epoch": 6.612200435729847, "grad_norm": 2.227349042892456, "learning_rate": 3.3899782135076247e-07, "loss": 4.14121675491333, "step": 3035 }, { "epoch": 6.61437908496732, "grad_norm": 1.9641168117523193, "learning_rate": 3.387799564270152e-07, "loss": 3.657348871231079, "step": 3036 }, { "epoch": 6.616557734204793, "grad_norm": 2.4615845680236816, "learning_rate": 3.3856209150326796e-07, "loss": 3.958583116531372, "step": 3037 }, { "epoch": 6.618736383442266, "grad_norm": 2.8684513568878174, "learning_rate": 3.3834422657952065e-07, "loss": 3.98235821723938, "step": 3038 }, { "epoch": 6.620915032679738, "grad_norm": 2.596769332885742, "learning_rate": 3.381263616557734e-07, "loss": 4.038553714752197, "step": 3039 }, { "epoch": 6.6230936819172115, "grad_norm": 2.5881268978118896, "learning_rate": 3.3790849673202613e-07, "loss": 3.7813849449157715, "step": 3040 }, { "epoch": 6.625272331154684, "grad_norm": 2.54952073097229, "learning_rate": 3.376906318082788e-07, "loss": 3.870988368988037, "step": 3041 }, { "epoch": 6.627450980392156, "grad_norm": 2.253216028213501, "learning_rate": 3.3747276688453156e-07, "loss": 3.7576398849487305, "step": 3042 }, { "epoch": 6.62962962962963, "grad_norm": 2.44197940826416, "learning_rate": 3.372549019607843e-07, "loss": 3.8272740840911865, "step": 3043 }, { "epoch": 6.631808278867102, "grad_norm": 3.498400926589966, "learning_rate": 3.37037037037037e-07, "loss": 4.093881130218506, "step": 3044 }, { "epoch": 6.633986928104575, "grad_norm": 2.9784979820251465, "learning_rate": 3.3681917211328974e-07, "loss": 4.047277450561523, "step": 3045 }, { "epoch": 6.636165577342048, "grad_norm": 2.319669008255005, "learning_rate": 3.366013071895425e-07, "loss": 3.854238271713257, "step": 3046 }, { "epoch": 6.6383442265795205, "grad_norm": 2.3635778427124023, "learning_rate": 3.3638344226579517e-07, "loss": 4.043557167053223, "step": 3047 }, { "epoch": 6.640522875816993, "grad_norm": 1.9815770387649536, "learning_rate": 3.361655773420479e-07, "loss": 3.8355398178100586, "step": 3048 }, { "epoch": 6.642701525054466, "grad_norm": 2.4714620113372803, "learning_rate": 3.3594771241830066e-07, "loss": 3.9825944900512695, "step": 3049 }, { "epoch": 6.644880174291939, "grad_norm": 1.9924376010894775, "learning_rate": 3.3572984749455335e-07, "loss": 3.8839516639709473, "step": 3050 }, { "epoch": 6.647058823529412, "grad_norm": 1.8417582511901855, "learning_rate": 3.355119825708061e-07, "loss": 3.872788667678833, "step": 3051 }, { "epoch": 6.649237472766885, "grad_norm": 1.9055577516555786, "learning_rate": 3.352941176470588e-07, "loss": 3.8667044639587402, "step": 3052 }, { "epoch": 6.651416122004357, "grad_norm": 2.4090116024017334, "learning_rate": 3.350762527233115e-07, "loss": 4.01497220993042, "step": 3053 }, { "epoch": 6.65359477124183, "grad_norm": 1.988858699798584, "learning_rate": 3.3485838779956426e-07, "loss": 3.7461600303649902, "step": 3054 }, { "epoch": 6.655773420479303, "grad_norm": 2.4156157970428467, "learning_rate": 3.3464052287581695e-07, "loss": 3.96933913230896, "step": 3055 }, { "epoch": 6.657952069716775, "grad_norm": 3.4449892044067383, "learning_rate": 3.344226579520697e-07, "loss": 4.141861438751221, "step": 3056 }, { "epoch": 6.660130718954249, "grad_norm": 2.1071739196777344, "learning_rate": 3.3420479302832244e-07, "loss": 3.6607961654663086, "step": 3057 }, { "epoch": 6.662309368191721, "grad_norm": 2.038261651992798, "learning_rate": 3.339869281045751e-07, "loss": 3.971569299697876, "step": 3058 }, { "epoch": 6.664488017429194, "grad_norm": 1.6774191856384277, "learning_rate": 3.3376906318082787e-07, "loss": 3.952996015548706, "step": 3059 }, { "epoch": 6.666666666666667, "grad_norm": 2.16687273979187, "learning_rate": 3.335511982570806e-07, "loss": 4.0004096031188965, "step": 3060 }, { "epoch": 6.668845315904139, "grad_norm": 1.9109755754470825, "learning_rate": 3.333333333333333e-07, "loss": 3.95473313331604, "step": 3061 }, { "epoch": 6.671023965141612, "grad_norm": 2.6327903270721436, "learning_rate": 3.3311546840958604e-07, "loss": 3.9876134395599365, "step": 3062 }, { "epoch": 6.673202614379085, "grad_norm": 2.090608596801758, "learning_rate": 3.328976034858388e-07, "loss": 3.8288733959198, "step": 3063 }, { "epoch": 6.675381263616558, "grad_norm": 3.291715383529663, "learning_rate": 3.326797385620915e-07, "loss": 4.122216701507568, "step": 3064 }, { "epoch": 6.67755991285403, "grad_norm": 2.173452377319336, "learning_rate": 3.324618736383442e-07, "loss": 3.8550822734832764, "step": 3065 }, { "epoch": 6.6797385620915035, "grad_norm": 2.3533987998962402, "learning_rate": 3.3224400871459696e-07, "loss": 3.9089348316192627, "step": 3066 }, { "epoch": 6.681917211328976, "grad_norm": 1.9868192672729492, "learning_rate": 3.3202614379084965e-07, "loss": 3.971923351287842, "step": 3067 }, { "epoch": 6.684095860566448, "grad_norm": 3.229611396789551, "learning_rate": 3.318082788671024e-07, "loss": 4.001605987548828, "step": 3068 }, { "epoch": 6.686274509803922, "grad_norm": 3.2783043384552, "learning_rate": 3.3159041394335513e-07, "loss": 3.9901797771453857, "step": 3069 }, { "epoch": 6.688453159041394, "grad_norm": 1.9532424211502075, "learning_rate": 3.313725490196078e-07, "loss": 3.8135673999786377, "step": 3070 }, { "epoch": 6.690631808278867, "grad_norm": 2.8018767833709717, "learning_rate": 3.3115468409586057e-07, "loss": 4.040621280670166, "step": 3071 }, { "epoch": 6.69281045751634, "grad_norm": 1.9358892440795898, "learning_rate": 3.309368191721133e-07, "loss": 3.8542792797088623, "step": 3072 }, { "epoch": 6.6949891067538125, "grad_norm": 2.1833884716033936, "learning_rate": 3.30718954248366e-07, "loss": 3.8169033527374268, "step": 3073 }, { "epoch": 6.697167755991286, "grad_norm": 2.9736666679382324, "learning_rate": 3.3050108932461874e-07, "loss": 4.010695934295654, "step": 3074 }, { "epoch": 6.699346405228758, "grad_norm": 2.893764019012451, "learning_rate": 3.3028322440087143e-07, "loss": 3.7495510578155518, "step": 3075 }, { "epoch": 6.701525054466231, "grad_norm": 2.2404775619506836, "learning_rate": 3.3006535947712417e-07, "loss": 3.914999485015869, "step": 3076 }, { "epoch": 6.703703703703704, "grad_norm": 2.059018611907959, "learning_rate": 3.298474945533769e-07, "loss": 3.9482953548431396, "step": 3077 }, { "epoch": 6.705882352941177, "grad_norm": 2.2411723136901855, "learning_rate": 3.296296296296296e-07, "loss": 3.9191455841064453, "step": 3078 }, { "epoch": 6.708061002178649, "grad_norm": 2.3673908710479736, "learning_rate": 3.2941176470588235e-07, "loss": 3.8075947761535645, "step": 3079 }, { "epoch": 6.710239651416122, "grad_norm": 1.8214458227157593, "learning_rate": 3.291938997821351e-07, "loss": 3.8000237941741943, "step": 3080 }, { "epoch": 6.712418300653595, "grad_norm": 2.4919795989990234, "learning_rate": 3.289760348583878e-07, "loss": 3.9326677322387695, "step": 3081 }, { "epoch": 6.714596949891067, "grad_norm": 2.0192863941192627, "learning_rate": 3.287581699346405e-07, "loss": 3.9046390056610107, "step": 3082 }, { "epoch": 6.716775599128541, "grad_norm": 1.9258607625961304, "learning_rate": 3.2854030501089326e-07, "loss": 3.9221882820129395, "step": 3083 }, { "epoch": 6.718954248366013, "grad_norm": 2.2846291065216064, "learning_rate": 3.2832244008714595e-07, "loss": 3.9059770107269287, "step": 3084 }, { "epoch": 6.721132897603486, "grad_norm": 2.5328574180603027, "learning_rate": 3.281045751633987e-07, "loss": 3.9716906547546387, "step": 3085 }, { "epoch": 6.723311546840959, "grad_norm": 1.7921470403671265, "learning_rate": 3.2788671023965144e-07, "loss": 3.8803882598876953, "step": 3086 }, { "epoch": 6.7254901960784315, "grad_norm": 2.2666919231414795, "learning_rate": 3.2766884531590413e-07, "loss": 3.8503811359405518, "step": 3087 }, { "epoch": 6.727668845315904, "grad_norm": 3.1635968685150146, "learning_rate": 3.2745098039215687e-07, "loss": 4.199276924133301, "step": 3088 }, { "epoch": 6.729847494553377, "grad_norm": 2.2606823444366455, "learning_rate": 3.272331154684096e-07, "loss": 3.902557373046875, "step": 3089 }, { "epoch": 6.73202614379085, "grad_norm": 2.091878652572632, "learning_rate": 3.270152505446623e-07, "loss": 3.90509295463562, "step": 3090 }, { "epoch": 6.734204793028322, "grad_norm": 2.260502338409424, "learning_rate": 3.2679738562091505e-07, "loss": 3.9124884605407715, "step": 3091 }, { "epoch": 6.7363834422657956, "grad_norm": 2.3148181438446045, "learning_rate": 3.265795206971678e-07, "loss": 3.8648431301116943, "step": 3092 }, { "epoch": 6.738562091503268, "grad_norm": 2.346400022506714, "learning_rate": 3.263616557734205e-07, "loss": 3.830939769744873, "step": 3093 }, { "epoch": 6.7407407407407405, "grad_norm": 2.7182984352111816, "learning_rate": 3.261437908496732e-07, "loss": 3.917409658432007, "step": 3094 }, { "epoch": 6.742919389978214, "grad_norm": 2.416100025177002, "learning_rate": 3.2592592592592596e-07, "loss": 3.5540740489959717, "step": 3095 }, { "epoch": 6.745098039215686, "grad_norm": 2.5460000038146973, "learning_rate": 3.257080610021786e-07, "loss": 3.9600000381469727, "step": 3096 }, { "epoch": 6.747276688453159, "grad_norm": 2.493220090866089, "learning_rate": 3.2549019607843134e-07, "loss": 4.009997844696045, "step": 3097 }, { "epoch": 6.749455337690632, "grad_norm": 2.0188937187194824, "learning_rate": 3.2527233115468403e-07, "loss": 3.8307504653930664, "step": 3098 }, { "epoch": 6.751633986928105, "grad_norm": 1.8663259744644165, "learning_rate": 3.2505446623093677e-07, "loss": 3.8094310760498047, "step": 3099 }, { "epoch": 6.753812636165577, "grad_norm": 2.719430446624756, "learning_rate": 3.248366013071895e-07, "loss": 3.73860502243042, "step": 3100 }, { "epoch": 6.75599128540305, "grad_norm": 2.0785038471221924, "learning_rate": 3.246187363834422e-07, "loss": 3.954622745513916, "step": 3101 }, { "epoch": 6.758169934640523, "grad_norm": 3.270721912384033, "learning_rate": 3.2440087145969495e-07, "loss": 3.9101741313934326, "step": 3102 }, { "epoch": 6.760348583877995, "grad_norm": 1.5544368028640747, "learning_rate": 3.241830065359477e-07, "loss": 3.8378684520721436, "step": 3103 }, { "epoch": 6.762527233115469, "grad_norm": 2.263551712036133, "learning_rate": 3.239651416122004e-07, "loss": 3.956531286239624, "step": 3104 }, { "epoch": 6.764705882352941, "grad_norm": 2.6050102710723877, "learning_rate": 3.237472766884531e-07, "loss": 4.075041770935059, "step": 3105 }, { "epoch": 6.766884531590414, "grad_norm": 2.01542592048645, "learning_rate": 3.2352941176470586e-07, "loss": 3.981266736984253, "step": 3106 }, { "epoch": 6.769063180827887, "grad_norm": 2.091552734375, "learning_rate": 3.2331154684095855e-07, "loss": 3.7910218238830566, "step": 3107 }, { "epoch": 6.771241830065359, "grad_norm": 2.8562300205230713, "learning_rate": 3.230936819172113e-07, "loss": 3.9885663986206055, "step": 3108 }, { "epoch": 6.773420479302832, "grad_norm": 1.9347617626190186, "learning_rate": 3.2287581699346404e-07, "loss": 3.842790365219116, "step": 3109 }, { "epoch": 6.775599128540305, "grad_norm": 2.3279168605804443, "learning_rate": 3.2265795206971673e-07, "loss": 3.8013176918029785, "step": 3110 }, { "epoch": 6.777777777777778, "grad_norm": 2.058438777923584, "learning_rate": 3.2244008714596947e-07, "loss": 3.930229663848877, "step": 3111 }, { "epoch": 6.77995642701525, "grad_norm": 2.7766125202178955, "learning_rate": 3.222222222222222e-07, "loss": 3.8978397846221924, "step": 3112 }, { "epoch": 6.7821350762527235, "grad_norm": 2.3577938079833984, "learning_rate": 3.220043572984749e-07, "loss": 3.9158897399902344, "step": 3113 }, { "epoch": 6.784313725490196, "grad_norm": 2.555028200149536, "learning_rate": 3.2178649237472765e-07, "loss": 3.9204163551330566, "step": 3114 }, { "epoch": 6.786492374727668, "grad_norm": 2.4627084732055664, "learning_rate": 3.215686274509804e-07, "loss": 3.969925880432129, "step": 3115 }, { "epoch": 6.788671023965142, "grad_norm": 2.4759762287139893, "learning_rate": 3.213507625272331e-07, "loss": 3.9532699584960938, "step": 3116 }, { "epoch": 6.790849673202614, "grad_norm": 2.421523332595825, "learning_rate": 3.211328976034858e-07, "loss": 4.028683662414551, "step": 3117 }, { "epoch": 6.793028322440087, "grad_norm": 2.410331964492798, "learning_rate": 3.2091503267973856e-07, "loss": 3.8498966693878174, "step": 3118 }, { "epoch": 6.79520697167756, "grad_norm": 2.071242570877075, "learning_rate": 3.2069716775599125e-07, "loss": 3.9812533855438232, "step": 3119 }, { "epoch": 6.7973856209150325, "grad_norm": 3.0793051719665527, "learning_rate": 3.20479302832244e-07, "loss": 4.096553325653076, "step": 3120 }, { "epoch": 6.799564270152505, "grad_norm": 2.4275755882263184, "learning_rate": 3.202614379084967e-07, "loss": 4.007339954376221, "step": 3121 }, { "epoch": 6.801742919389978, "grad_norm": 1.91127347946167, "learning_rate": 3.200435729847494e-07, "loss": 3.9467780590057373, "step": 3122 }, { "epoch": 6.803921568627451, "grad_norm": 1.87616765499115, "learning_rate": 3.1982570806100217e-07, "loss": 3.8175625801086426, "step": 3123 }, { "epoch": 6.806100217864923, "grad_norm": 1.791858434677124, "learning_rate": 3.1960784313725486e-07, "loss": 3.8546056747436523, "step": 3124 }, { "epoch": 6.808278867102397, "grad_norm": 2.5284907817840576, "learning_rate": 3.193899782135076e-07, "loss": 3.803478240966797, "step": 3125 }, { "epoch": 6.810457516339869, "grad_norm": 1.777126669883728, "learning_rate": 3.1917211328976034e-07, "loss": 3.7725439071655273, "step": 3126 }, { "epoch": 6.812636165577342, "grad_norm": 2.282637596130371, "learning_rate": 3.1895424836601303e-07, "loss": 3.76617169380188, "step": 3127 }, { "epoch": 6.814814814814815, "grad_norm": 3.273115873336792, "learning_rate": 3.187363834422658e-07, "loss": 4.218883037567139, "step": 3128 }, { "epoch": 6.816993464052287, "grad_norm": 2.6316819190979004, "learning_rate": 3.185185185185185e-07, "loss": 3.9513306617736816, "step": 3129 }, { "epoch": 6.819172113289761, "grad_norm": 2.813586950302124, "learning_rate": 3.183006535947712e-07, "loss": 4.073014736175537, "step": 3130 }, { "epoch": 6.821350762527233, "grad_norm": 2.1704940795898438, "learning_rate": 3.1808278867102395e-07, "loss": 4.015061855316162, "step": 3131 }, { "epoch": 6.823529411764706, "grad_norm": 1.7773287296295166, "learning_rate": 3.178649237472767e-07, "loss": 3.8769474029541016, "step": 3132 }, { "epoch": 6.825708061002179, "grad_norm": 2.0521132946014404, "learning_rate": 3.176470588235294e-07, "loss": 3.7308881282806396, "step": 3133 }, { "epoch": 6.827886710239651, "grad_norm": 2.610443592071533, "learning_rate": 3.174291938997821e-07, "loss": 3.792793035507202, "step": 3134 }, { "epoch": 6.830065359477124, "grad_norm": 3.0169806480407715, "learning_rate": 3.1721132897603487e-07, "loss": 3.9411442279815674, "step": 3135 }, { "epoch": 6.832244008714597, "grad_norm": 1.9717183113098145, "learning_rate": 3.1699346405228756e-07, "loss": 3.770158529281616, "step": 3136 }, { "epoch": 6.83442265795207, "grad_norm": 2.0835132598876953, "learning_rate": 3.167755991285403e-07, "loss": 3.8789424896240234, "step": 3137 }, { "epoch": 6.836601307189542, "grad_norm": 2.019814968109131, "learning_rate": 3.1655773420479304e-07, "loss": 3.9005815982818604, "step": 3138 }, { "epoch": 6.8387799564270155, "grad_norm": 2.758751392364502, "learning_rate": 3.1633986928104573e-07, "loss": 3.907430410385132, "step": 3139 }, { "epoch": 6.840958605664488, "grad_norm": 2.3574249744415283, "learning_rate": 3.1612200435729847e-07, "loss": 3.8488707542419434, "step": 3140 }, { "epoch": 6.8431372549019605, "grad_norm": 2.035641670227051, "learning_rate": 3.159041394335512e-07, "loss": 3.8677055835723877, "step": 3141 }, { "epoch": 6.845315904139434, "grad_norm": 2.2942090034484863, "learning_rate": 3.156862745098039e-07, "loss": 3.9505503177642822, "step": 3142 }, { "epoch": 6.847494553376906, "grad_norm": 1.680872917175293, "learning_rate": 3.1546840958605665e-07, "loss": 3.8733456134796143, "step": 3143 }, { "epoch": 6.849673202614379, "grad_norm": 2.708906412124634, "learning_rate": 3.152505446623094e-07, "loss": 3.929856061935425, "step": 3144 }, { "epoch": 6.851851851851852, "grad_norm": 2.2724320888519287, "learning_rate": 3.150326797385621e-07, "loss": 3.7919809818267822, "step": 3145 }, { "epoch": 6.8540305010893245, "grad_norm": 2.503544569015503, "learning_rate": 3.148148148148148e-07, "loss": 3.892822504043579, "step": 3146 }, { "epoch": 6.856209150326797, "grad_norm": 2.200284481048584, "learning_rate": 3.145969498910675e-07, "loss": 3.8618648052215576, "step": 3147 }, { "epoch": 6.85838779956427, "grad_norm": 3.2515292167663574, "learning_rate": 3.1437908496732025e-07, "loss": 4.024832725524902, "step": 3148 }, { "epoch": 6.860566448801743, "grad_norm": 2.5147266387939453, "learning_rate": 3.14161220043573e-07, "loss": 3.845046281814575, "step": 3149 }, { "epoch": 6.862745098039216, "grad_norm": 2.0478343963623047, "learning_rate": 3.139433551198257e-07, "loss": 3.867912530899048, "step": 3150 }, { "epoch": 6.864923747276689, "grad_norm": 2.910604238510132, "learning_rate": 3.1372549019607843e-07, "loss": 4.113101005554199, "step": 3151 }, { "epoch": 6.867102396514161, "grad_norm": 2.250399589538574, "learning_rate": 3.1350762527233117e-07, "loss": 3.7902004718780518, "step": 3152 }, { "epoch": 6.8692810457516345, "grad_norm": 2.4684929847717285, "learning_rate": 3.1328976034858386e-07, "loss": 3.914895534515381, "step": 3153 }, { "epoch": 6.871459694989107, "grad_norm": 1.6199991703033447, "learning_rate": 3.130718954248366e-07, "loss": 3.7150778770446777, "step": 3154 }, { "epoch": 6.873638344226579, "grad_norm": 2.027313232421875, "learning_rate": 3.1285403050108935e-07, "loss": 3.859861373901367, "step": 3155 }, { "epoch": 6.875816993464053, "grad_norm": 2.446025848388672, "learning_rate": 3.1263616557734204e-07, "loss": 3.792994499206543, "step": 3156 }, { "epoch": 6.877995642701525, "grad_norm": 2.2345998287200928, "learning_rate": 3.124183006535948e-07, "loss": 3.8510711193084717, "step": 3157 }, { "epoch": 6.880174291938998, "grad_norm": 2.318537473678589, "learning_rate": 3.122004357298475e-07, "loss": 3.7754669189453125, "step": 3158 }, { "epoch": 6.882352941176471, "grad_norm": 2.118309736251831, "learning_rate": 3.119825708061002e-07, "loss": 4.017240047454834, "step": 3159 }, { "epoch": 6.8845315904139435, "grad_norm": 2.5423591136932373, "learning_rate": 3.1176470588235295e-07, "loss": 4.039466857910156, "step": 3160 }, { "epoch": 6.886710239651416, "grad_norm": 2.791879892349243, "learning_rate": 3.115468409586057e-07, "loss": 4.059891223907471, "step": 3161 }, { "epoch": 6.888888888888889, "grad_norm": 2.2661116123199463, "learning_rate": 3.113289760348584e-07, "loss": 4.001110553741455, "step": 3162 }, { "epoch": 6.891067538126362, "grad_norm": 2.1319894790649414, "learning_rate": 3.111111111111111e-07, "loss": 3.866701602935791, "step": 3163 }, { "epoch": 6.893246187363834, "grad_norm": 1.8760854005813599, "learning_rate": 3.1089324618736387e-07, "loss": 3.8698570728302, "step": 3164 }, { "epoch": 6.895424836601308, "grad_norm": 1.945290207862854, "learning_rate": 3.106753812636165e-07, "loss": 3.9260427951812744, "step": 3165 }, { "epoch": 6.89760348583878, "grad_norm": 2.8366236686706543, "learning_rate": 3.1045751633986925e-07, "loss": 4.132519245147705, "step": 3166 }, { "epoch": 6.8997821350762525, "grad_norm": 2.872657537460327, "learning_rate": 3.10239651416122e-07, "loss": 4.007517337799072, "step": 3167 }, { "epoch": 6.901960784313726, "grad_norm": 2.769158363342285, "learning_rate": 3.100217864923747e-07, "loss": 4.104474067687988, "step": 3168 }, { "epoch": 6.904139433551198, "grad_norm": 2.153853178024292, "learning_rate": 3.098039215686274e-07, "loss": 3.8090574741363525, "step": 3169 }, { "epoch": 6.906318082788671, "grad_norm": 2.401947021484375, "learning_rate": 3.095860566448801e-07, "loss": 3.805752754211426, "step": 3170 }, { "epoch": 6.908496732026144, "grad_norm": 2.61177921295166, "learning_rate": 3.0936819172113285e-07, "loss": 3.8128387928009033, "step": 3171 }, { "epoch": 6.910675381263617, "grad_norm": 1.8685001134872437, "learning_rate": 3.091503267973856e-07, "loss": 3.8804759979248047, "step": 3172 }, { "epoch": 6.912854030501089, "grad_norm": 2.165362596511841, "learning_rate": 3.089324618736383e-07, "loss": 3.98067045211792, "step": 3173 }, { "epoch": 6.915032679738562, "grad_norm": 2.6422407627105713, "learning_rate": 3.0871459694989103e-07, "loss": 3.989976406097412, "step": 3174 }, { "epoch": 6.917211328976035, "grad_norm": 2.759948253631592, "learning_rate": 3.0849673202614377e-07, "loss": 3.778425931930542, "step": 3175 }, { "epoch": 6.919389978213507, "grad_norm": 2.689173460006714, "learning_rate": 3.0827886710239646e-07, "loss": 4.007768630981445, "step": 3176 }, { "epoch": 6.921568627450981, "grad_norm": 2.2359461784362793, "learning_rate": 3.080610021786492e-07, "loss": 3.876488208770752, "step": 3177 }, { "epoch": 6.923747276688453, "grad_norm": 2.0838751792907715, "learning_rate": 3.0784313725490195e-07, "loss": 3.9188637733459473, "step": 3178 }, { "epoch": 6.925925925925926, "grad_norm": 2.1782166957855225, "learning_rate": 3.0762527233115464e-07, "loss": 3.94630765914917, "step": 3179 }, { "epoch": 6.928104575163399, "grad_norm": 2.974594831466675, "learning_rate": 3.074074074074074e-07, "loss": 3.9461498260498047, "step": 3180 }, { "epoch": 6.930283224400871, "grad_norm": 2.648416757583618, "learning_rate": 3.071895424836601e-07, "loss": 4.144524097442627, "step": 3181 }, { "epoch": 6.932461873638344, "grad_norm": 1.8273569345474243, "learning_rate": 3.069716775599128e-07, "loss": 3.8538119792938232, "step": 3182 }, { "epoch": 6.934640522875817, "grad_norm": 2.3280649185180664, "learning_rate": 3.0675381263616555e-07, "loss": 3.9915690422058105, "step": 3183 }, { "epoch": 6.93681917211329, "grad_norm": 2.697645425796509, "learning_rate": 3.065359477124183e-07, "loss": 3.990304946899414, "step": 3184 }, { "epoch": 6.938997821350762, "grad_norm": 2.8186025619506836, "learning_rate": 3.06318082788671e-07, "loss": 3.9444191455841064, "step": 3185 }, { "epoch": 6.9411764705882355, "grad_norm": 2.663881301879883, "learning_rate": 3.0610021786492373e-07, "loss": 3.945077419281006, "step": 3186 }, { "epoch": 6.943355119825708, "grad_norm": 2.202643632888794, "learning_rate": 3.0588235294117647e-07, "loss": 3.8969366550445557, "step": 3187 }, { "epoch": 6.94553376906318, "grad_norm": 1.9453494548797607, "learning_rate": 3.0566448801742916e-07, "loss": 3.944124937057495, "step": 3188 }, { "epoch": 6.947712418300654, "grad_norm": 3.5688421726226807, "learning_rate": 3.054466230936819e-07, "loss": 4.15187931060791, "step": 3189 }, { "epoch": 6.949891067538126, "grad_norm": 2.028718948364258, "learning_rate": 3.0522875816993464e-07, "loss": 3.9157426357269287, "step": 3190 }, { "epoch": 6.952069716775599, "grad_norm": 1.7898690700531006, "learning_rate": 3.0501089324618733e-07, "loss": 3.800278425216675, "step": 3191 }, { "epoch": 6.954248366013072, "grad_norm": 1.9844012260437012, "learning_rate": 3.047930283224401e-07, "loss": 3.9170081615448, "step": 3192 }, { "epoch": 6.9564270152505445, "grad_norm": 2.3556458950042725, "learning_rate": 3.0457516339869277e-07, "loss": 3.9257309436798096, "step": 3193 }, { "epoch": 6.958605664488017, "grad_norm": 2.054551601409912, "learning_rate": 3.043572984749455e-07, "loss": 3.9930155277252197, "step": 3194 }, { "epoch": 6.96078431372549, "grad_norm": 2.2637317180633545, "learning_rate": 3.0413943355119825e-07, "loss": 3.9354236125946045, "step": 3195 }, { "epoch": 6.962962962962963, "grad_norm": 2.2369306087493896, "learning_rate": 3.0392156862745094e-07, "loss": 3.894728660583496, "step": 3196 }, { "epoch": 6.965141612200435, "grad_norm": 1.8306646347045898, "learning_rate": 3.037037037037037e-07, "loss": 3.8673174381256104, "step": 3197 }, { "epoch": 6.967320261437909, "grad_norm": 2.3980770111083984, "learning_rate": 3.034858387799564e-07, "loss": 3.795444965362549, "step": 3198 }, { "epoch": 6.969498910675381, "grad_norm": 2.1867315769195557, "learning_rate": 3.032679738562091e-07, "loss": 3.916268825531006, "step": 3199 }, { "epoch": 6.9716775599128535, "grad_norm": 2.882007360458374, "learning_rate": 3.0305010893246186e-07, "loss": 3.936495304107666, "step": 3200 }, { "epoch": 6.973856209150327, "grad_norm": 3.208742141723633, "learning_rate": 3.028322440087146e-07, "loss": 3.9611387252807617, "step": 3201 }, { "epoch": 6.976034858387799, "grad_norm": 2.0781800746917725, "learning_rate": 3.026143790849673e-07, "loss": 3.7012722492218018, "step": 3202 }, { "epoch": 6.978213507625273, "grad_norm": 1.6946752071380615, "learning_rate": 3.0239651416122003e-07, "loss": 3.7690110206604004, "step": 3203 }, { "epoch": 6.980392156862745, "grad_norm": 2.731096029281616, "learning_rate": 3.0217864923747277e-07, "loss": 3.854031562805176, "step": 3204 }, { "epoch": 6.982570806100218, "grad_norm": 2.961817979812622, "learning_rate": 3.0196078431372546e-07, "loss": 3.940237522125244, "step": 3205 }, { "epoch": 6.984749455337691, "grad_norm": 2.4024903774261475, "learning_rate": 3.017429193899782e-07, "loss": 3.8866384029388428, "step": 3206 }, { "epoch": 6.9869281045751634, "grad_norm": 2.311256170272827, "learning_rate": 3.0152505446623095e-07, "loss": 3.921316146850586, "step": 3207 }, { "epoch": 6.989106753812636, "grad_norm": 2.022911548614502, "learning_rate": 3.0130718954248364e-07, "loss": 3.8550713062286377, "step": 3208 }, { "epoch": 6.991285403050109, "grad_norm": 3.432126045227051, "learning_rate": 3.010893246187364e-07, "loss": 4.057179927825928, "step": 3209 }, { "epoch": 6.993464052287582, "grad_norm": 2.758265733718872, "learning_rate": 3.008714596949891e-07, "loss": 3.8182497024536133, "step": 3210 }, { "epoch": 6.995642701525054, "grad_norm": 1.9081852436065674, "learning_rate": 3.006535947712418e-07, "loss": 3.8967511653900146, "step": 3211 }, { "epoch": 6.9978213507625275, "grad_norm": 3.0453765392303467, "learning_rate": 3.0043572984749455e-07, "loss": 3.9538159370422363, "step": 3212 }, { "epoch": 7.0, "grad_norm": 2.1990392208099365, "learning_rate": 3.002178649237473e-07, "loss": 3.815922260284424, "step": 3213 } ], "logging_steps": 1, "max_steps": 4590, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000000000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2472662932372275e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }