diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9973915878708834, + "eval_steps": 500, + "global_step": 2452, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016302575806977503, + "grad_norm": 0.7529258131980896, + "learning_rate": 0.0, + "loss": 0.4678, + "step": 1 + }, + { + "epoch": 0.0032605151613955006, + "grad_norm": 0.2298814058303833, + "learning_rate": 2.1533827903669654e-05, + "loss": 0.417, + "step": 2 + }, + { + "epoch": 0.00489077274209325, + "grad_norm": 0.20245032012462616, + "learning_rate": 3.413030972429927e-05, + "loss": 0.3776, + "step": 3 + }, + { + "epoch": 0.006521030322791001, + "grad_norm": 0.1889941245317459, + "learning_rate": 4.306765580733931e-05, + "loss": 0.3834, + "step": 4 + }, + { + "epoch": 0.008151287903488751, + "grad_norm": 0.16519580781459808, + "learning_rate": 5e-05, + "loss": 0.4055, + "step": 5 + }, + { + "epoch": 0.0097815454841865, + "grad_norm": 0.14732758700847626, + "learning_rate": 5.5664137627968925e-05, + "loss": 0.3698, + "step": 6 + }, + { + "epoch": 0.011411803064884252, + "grad_norm": 0.1339130848646164, + "learning_rate": 6.0453097756108376e-05, + "loss": 0.348, + "step": 7 + }, + { + "epoch": 0.013042060645582002, + "grad_norm": 0.1276892125606537, + "learning_rate": 6.460148371100896e-05, + "loss": 0.3469, + "step": 8 + }, + { + "epoch": 0.014672318226279752, + "grad_norm": 0.13217830657958984, + "learning_rate": 6.826061944859854e-05, + "loss": 0.351, + "step": 9 + }, + { + "epoch": 0.016302575806977502, + "grad_norm": 0.13900792598724365, + "learning_rate": 7.153382790366967e-05, + "loss": 0.33, + "step": 10 + }, + { + "epoch": 0.01793283338767525, + "grad_norm": 0.1303739994764328, + "learning_rate": 7.449480512024892e-05, + "loss": 0.3493, + "step": 11 + }, + { + "epoch": 0.019563090968373, + "grad_norm": 0.14218388497829437, + "learning_rate": 7.719796553163858e-05, + "loss": 0.3731, + "step": 12 + }, + { + "epoch": 0.021193348549070755, + "grad_norm": 0.11100596189498901, + "learning_rate": 7.968463205835412e-05, + "loss": 0.3385, + "step": 13 + }, + { + "epoch": 0.022823606129768505, + "grad_norm": 0.11745980381965637, + "learning_rate": 8.198692565977803e-05, + "loss": 0.3337, + "step": 14 + }, + { + "epoch": 0.024453863710466255, + "grad_norm": 0.11711365729570389, + "learning_rate": 8.413030972429928e-05, + "loss": 0.3117, + "step": 15 + }, + { + "epoch": 0.026084121291164004, + "grad_norm": 0.15356451272964478, + "learning_rate": 8.613531161467861e-05, + "loss": 0.3327, + "step": 16 + }, + { + "epoch": 0.027714378871861754, + "grad_norm": 0.14109951257705688, + "learning_rate": 8.80187213861294e-05, + "loss": 0.2942, + "step": 17 + }, + { + "epoch": 0.029344636452559504, + "grad_norm": 0.25882235169410706, + "learning_rate": 8.979444735226819e-05, + "loss": 0.316, + "step": 18 + }, + { + "epoch": 0.030974894033257254, + "grad_norm": 0.157696932554245, + "learning_rate": 9.147414002175752e-05, + "loss": 0.3284, + "step": 19 + }, + { + "epoch": 0.032605151613955004, + "grad_norm": 0.13975587487220764, + "learning_rate": 9.306765580733931e-05, + "loss": 0.3245, + "step": 20 + }, + { + "epoch": 0.034235409194652754, + "grad_norm": 0.8783023953437805, + "learning_rate": 9.458340748040766e-05, + "loss": 0.3181, + "step": 21 + }, + { + "epoch": 0.0358656667753505, + "grad_norm": 0.11985931545495987, + "learning_rate": 9.602863302391859e-05, + "loss": 0.3098, + "step": 22 + }, + { + "epoch": 0.03749592435604825, + "grad_norm": 0.14401742815971375, + "learning_rate": 9.740960467331899e-05, + "loss": 0.3132, + "step": 23 + }, + { + "epoch": 0.039126181936746, + "grad_norm": 0.17440500855445862, + "learning_rate": 9.873179343530825e-05, + "loss": 0.3074, + "step": 24 + }, + { + "epoch": 0.04075643951744375, + "grad_norm": 0.12916715443134308, + "learning_rate": 0.0001, + "loss": 0.2955, + "step": 25 + }, + { + "epoch": 0.04238669709814151, + "grad_norm": 0.5290112495422363, + "learning_rate": 0.0001, + "loss": 0.3143, + "step": 26 + }, + { + "epoch": 0.04401695467883926, + "grad_norm": 0.1266779601573944, + "learning_rate": 0.0001, + "loss": 0.2958, + "step": 27 + }, + { + "epoch": 0.04564721225953701, + "grad_norm": 0.13447235524654388, + "learning_rate": 0.0001, + "loss": 0.3211, + "step": 28 + }, + { + "epoch": 0.04727746984023476, + "grad_norm": 0.12066753953695297, + "learning_rate": 0.0001, + "loss": 0.2894, + "step": 29 + }, + { + "epoch": 0.04890772742093251, + "grad_norm": 0.11640455573797226, + "learning_rate": 0.0001, + "loss": 0.2891, + "step": 30 + }, + { + "epoch": 0.05053798500163026, + "grad_norm": 0.1315324753522873, + "learning_rate": 0.0001, + "loss": 0.292, + "step": 31 + }, + { + "epoch": 0.05216824258232801, + "grad_norm": 0.09173787385225296, + "learning_rate": 0.0001, + "loss": 0.2786, + "step": 32 + }, + { + "epoch": 0.05379850016302576, + "grad_norm": 0.14253424108028412, + "learning_rate": 0.0001, + "loss": 0.2924, + "step": 33 + }, + { + "epoch": 0.05542875774372351, + "grad_norm": 0.1317639797925949, + "learning_rate": 0.0001, + "loss": 0.29, + "step": 34 + }, + { + "epoch": 0.05705901532442126, + "grad_norm": 0.11155499517917633, + "learning_rate": 0.0001, + "loss": 0.2575, + "step": 35 + }, + { + "epoch": 0.05868927290511901, + "grad_norm": 0.10907690972089767, + "learning_rate": 0.0001, + "loss": 0.2745, + "step": 36 + }, + { + "epoch": 0.06031953048581676, + "grad_norm": 0.11807228624820709, + "learning_rate": 0.0001, + "loss": 0.2713, + "step": 37 + }, + { + "epoch": 0.06194978806651451, + "grad_norm": 0.1130286455154419, + "learning_rate": 0.0001, + "loss": 0.2869, + "step": 38 + }, + { + "epoch": 0.06358004564721226, + "grad_norm": 0.1267092525959015, + "learning_rate": 0.0001, + "loss": 0.287, + "step": 39 + }, + { + "epoch": 0.06521030322791001, + "grad_norm": 0.13711895048618317, + "learning_rate": 0.0001, + "loss": 0.2662, + "step": 40 + }, + { + "epoch": 0.06684056080860776, + "grad_norm": 0.1149134635925293, + "learning_rate": 0.0001, + "loss": 0.2743, + "step": 41 + }, + { + "epoch": 0.06847081838930551, + "grad_norm": 0.1379522979259491, + "learning_rate": 0.0001, + "loss": 0.2915, + "step": 42 + }, + { + "epoch": 0.07010107597000326, + "grad_norm": 0.126117542386055, + "learning_rate": 0.0001, + "loss": 0.2527, + "step": 43 + }, + { + "epoch": 0.071731333550701, + "grad_norm": 0.13249348104000092, + "learning_rate": 0.0001, + "loss": 0.2951, + "step": 44 + }, + { + "epoch": 0.07336159113139876, + "grad_norm": 0.13240788877010345, + "learning_rate": 0.0001, + "loss": 0.2828, + "step": 45 + }, + { + "epoch": 0.0749918487120965, + "grad_norm": 0.1182844415307045, + "learning_rate": 0.0001, + "loss": 0.2564, + "step": 46 + }, + { + "epoch": 0.07662210629279426, + "grad_norm": 0.13393035531044006, + "learning_rate": 0.0001, + "loss": 0.2742, + "step": 47 + }, + { + "epoch": 0.078252363873492, + "grad_norm": 0.11272592842578888, + "learning_rate": 0.0001, + "loss": 0.279, + "step": 48 + }, + { + "epoch": 0.07988262145418976, + "grad_norm": 0.12920530140399933, + "learning_rate": 0.0001, + "loss": 0.2733, + "step": 49 + }, + { + "epoch": 0.0815128790348875, + "grad_norm": 0.13335202634334564, + "learning_rate": 0.0001, + "loss": 0.2647, + "step": 50 + }, + { + "epoch": 0.08314313661558526, + "grad_norm": 0.13447648286819458, + "learning_rate": 0.0001, + "loss": 0.2674, + "step": 51 + }, + { + "epoch": 0.08477339419628302, + "grad_norm": 0.13474304974079132, + "learning_rate": 0.0001, + "loss": 0.274, + "step": 52 + }, + { + "epoch": 0.08640365177698077, + "grad_norm": 0.15840347111225128, + "learning_rate": 0.0001, + "loss": 0.2871, + "step": 53 + }, + { + "epoch": 0.08803390935767852, + "grad_norm": 0.1323927342891693, + "learning_rate": 0.0001, + "loss": 0.2797, + "step": 54 + }, + { + "epoch": 0.08966416693837627, + "grad_norm": 0.15678195655345917, + "learning_rate": 0.0001, + "loss": 0.2742, + "step": 55 + }, + { + "epoch": 0.09129442451907402, + "grad_norm": 0.1214122325181961, + "learning_rate": 0.0001, + "loss": 0.258, + "step": 56 + }, + { + "epoch": 0.09292468209977177, + "grad_norm": 0.15963681042194366, + "learning_rate": 0.0001, + "loss": 0.2767, + "step": 57 + }, + { + "epoch": 0.09455493968046952, + "grad_norm": 0.14005902409553528, + "learning_rate": 0.0001, + "loss": 0.2786, + "step": 58 + }, + { + "epoch": 0.09618519726116727, + "grad_norm": 0.14053602516651154, + "learning_rate": 0.0001, + "loss": 0.2673, + "step": 59 + }, + { + "epoch": 0.09781545484186502, + "grad_norm": 0.1037927120923996, + "learning_rate": 0.0001, + "loss": 0.2532, + "step": 60 + }, + { + "epoch": 0.09944571242256277, + "grad_norm": 0.16954319179058075, + "learning_rate": 0.0001, + "loss": 0.257, + "step": 61 + }, + { + "epoch": 0.10107597000326052, + "grad_norm": 0.13848423957824707, + "learning_rate": 0.0001, + "loss": 0.2633, + "step": 62 + }, + { + "epoch": 0.10270622758395827, + "grad_norm": 0.1805400550365448, + "learning_rate": 0.0001, + "loss": 0.263, + "step": 63 + }, + { + "epoch": 0.10433648516465602, + "grad_norm": 0.17179027199745178, + "learning_rate": 0.0001, + "loss": 0.2681, + "step": 64 + }, + { + "epoch": 0.10596674274535377, + "grad_norm": 0.1394517719745636, + "learning_rate": 0.0001, + "loss": 0.27, + "step": 65 + }, + { + "epoch": 0.10759700032605152, + "grad_norm": 0.1399444192647934, + "learning_rate": 0.0001, + "loss": 0.262, + "step": 66 + }, + { + "epoch": 0.10922725790674927, + "grad_norm": 0.1266821324825287, + "learning_rate": 0.0001, + "loss": 0.2599, + "step": 67 + }, + { + "epoch": 0.11085751548744702, + "grad_norm": 0.13088282942771912, + "learning_rate": 0.0001, + "loss": 0.2636, + "step": 68 + }, + { + "epoch": 0.11248777306814477, + "grad_norm": 0.1161791980266571, + "learning_rate": 0.0001, + "loss": 0.2739, + "step": 69 + }, + { + "epoch": 0.11411803064884252, + "grad_norm": 0.1133728101849556, + "learning_rate": 0.0001, + "loss": 0.2369, + "step": 70 + }, + { + "epoch": 0.11574828822954027, + "grad_norm": 0.140262633562088, + "learning_rate": 0.0001, + "loss": 0.2752, + "step": 71 + }, + { + "epoch": 0.11737854581023802, + "grad_norm": 0.12292502820491791, + "learning_rate": 0.0001, + "loss": 0.2541, + "step": 72 + }, + { + "epoch": 0.11900880339093577, + "grad_norm": 0.14118324220180511, + "learning_rate": 0.0001, + "loss": 0.2611, + "step": 73 + }, + { + "epoch": 0.12063906097163352, + "grad_norm": 0.12177952378988266, + "learning_rate": 0.0001, + "loss": 0.2605, + "step": 74 + }, + { + "epoch": 0.12226931855233127, + "grad_norm": 0.13002616167068481, + "learning_rate": 0.0001, + "loss": 0.2609, + "step": 75 + }, + { + "epoch": 0.12389957613302902, + "grad_norm": 0.11555729806423187, + "learning_rate": 0.0001, + "loss": 0.2404, + "step": 76 + }, + { + "epoch": 0.12552983371372678, + "grad_norm": 0.1303972601890564, + "learning_rate": 0.0001, + "loss": 0.251, + "step": 77 + }, + { + "epoch": 0.12716009129442452, + "grad_norm": 0.1312289834022522, + "learning_rate": 0.0001, + "loss": 0.2635, + "step": 78 + }, + { + "epoch": 0.12879034887512228, + "grad_norm": 0.12450554966926575, + "learning_rate": 0.0001, + "loss": 0.2545, + "step": 79 + }, + { + "epoch": 0.13042060645582002, + "grad_norm": 0.1366516351699829, + "learning_rate": 0.0001, + "loss": 0.2566, + "step": 80 + }, + { + "epoch": 0.13205086403651778, + "grad_norm": 0.13325075805187225, + "learning_rate": 0.0001, + "loss": 0.2557, + "step": 81 + }, + { + "epoch": 0.13368112161721551, + "grad_norm": 0.15881724655628204, + "learning_rate": 0.0001, + "loss": 0.2467, + "step": 82 + }, + { + "epoch": 0.13531137919791328, + "grad_norm": 0.12714703381061554, + "learning_rate": 0.0001, + "loss": 0.259, + "step": 83 + }, + { + "epoch": 0.13694163677861101, + "grad_norm": 0.13727723062038422, + "learning_rate": 0.0001, + "loss": 0.2737, + "step": 84 + }, + { + "epoch": 0.13857189435930878, + "grad_norm": 0.12891346216201782, + "learning_rate": 0.0001, + "loss": 0.2747, + "step": 85 + }, + { + "epoch": 0.14020215194000651, + "grad_norm": 0.10840713977813721, + "learning_rate": 0.0001, + "loss": 0.2481, + "step": 86 + }, + { + "epoch": 0.14183240952070428, + "grad_norm": 0.12393542379140854, + "learning_rate": 0.0001, + "loss": 0.2547, + "step": 87 + }, + { + "epoch": 0.143462667101402, + "grad_norm": 0.16842670738697052, + "learning_rate": 0.0001, + "loss": 0.28, + "step": 88 + }, + { + "epoch": 0.14509292468209978, + "grad_norm": 0.13346168398857117, + "learning_rate": 0.0001, + "loss": 0.2359, + "step": 89 + }, + { + "epoch": 0.1467231822627975, + "grad_norm": 0.14012813568115234, + "learning_rate": 0.0001, + "loss": 0.242, + "step": 90 + }, + { + "epoch": 0.14835343984349528, + "grad_norm": 0.1721285581588745, + "learning_rate": 0.0001, + "loss": 0.2644, + "step": 91 + }, + { + "epoch": 0.149983697424193, + "grad_norm": 0.14937177300453186, + "learning_rate": 0.0001, + "loss": 0.2541, + "step": 92 + }, + { + "epoch": 0.15161395500489078, + "grad_norm": 0.13868822157382965, + "learning_rate": 0.0001, + "loss": 0.2609, + "step": 93 + }, + { + "epoch": 0.1532442125855885, + "grad_norm": 0.14626921713352203, + "learning_rate": 0.0001, + "loss": 0.2442, + "step": 94 + }, + { + "epoch": 0.15487447016628628, + "grad_norm": 0.13605758547782898, + "learning_rate": 0.0001, + "loss": 0.2524, + "step": 95 + }, + { + "epoch": 0.156504727746984, + "grad_norm": 0.1354619562625885, + "learning_rate": 0.0001, + "loss": 0.2504, + "step": 96 + }, + { + "epoch": 0.15813498532768178, + "grad_norm": 0.1597517877817154, + "learning_rate": 0.0001, + "loss": 0.2477, + "step": 97 + }, + { + "epoch": 0.1597652429083795, + "grad_norm": 0.13010838627815247, + "learning_rate": 0.0001, + "loss": 0.271, + "step": 98 + }, + { + "epoch": 0.16139550048907728, + "grad_norm": 0.13291318714618683, + "learning_rate": 0.0001, + "loss": 0.2447, + "step": 99 + }, + { + "epoch": 0.163025758069775, + "grad_norm": 0.16925360262393951, + "learning_rate": 0.0001, + "loss": 0.2617, + "step": 100 + }, + { + "epoch": 0.16465601565047278, + "grad_norm": 0.13861025869846344, + "learning_rate": 0.0001, + "loss": 0.2503, + "step": 101 + }, + { + "epoch": 0.1662862732311705, + "grad_norm": 0.16513262689113617, + "learning_rate": 0.0001, + "loss": 0.2513, + "step": 102 + }, + { + "epoch": 0.16791653081186828, + "grad_norm": 0.14727436006069183, + "learning_rate": 0.0001, + "loss": 0.2769, + "step": 103 + }, + { + "epoch": 0.16954678839256604, + "grad_norm": 0.14519990980625153, + "learning_rate": 0.0001, + "loss": 0.2506, + "step": 104 + }, + { + "epoch": 0.17117704597326378, + "grad_norm": 0.14038777351379395, + "learning_rate": 0.0001, + "loss": 0.2619, + "step": 105 + }, + { + "epoch": 0.17280730355396154, + "grad_norm": 0.13004878163337708, + "learning_rate": 0.0001, + "loss": 0.2466, + "step": 106 + }, + { + "epoch": 0.17443756113465927, + "grad_norm": 0.11581247299909592, + "learning_rate": 0.0001, + "loss": 0.2365, + "step": 107 + }, + { + "epoch": 0.17606781871535704, + "grad_norm": 0.14593157172203064, + "learning_rate": 0.0001, + "loss": 0.2446, + "step": 108 + }, + { + "epoch": 0.17769807629605477, + "grad_norm": 0.15348902344703674, + "learning_rate": 0.0001, + "loss": 0.2613, + "step": 109 + }, + { + "epoch": 0.17932833387675254, + "grad_norm": 0.11216390132904053, + "learning_rate": 0.0001, + "loss": 0.2328, + "step": 110 + }, + { + "epoch": 0.18095859145745027, + "grad_norm": 0.17392517626285553, + "learning_rate": 0.0001, + "loss": 0.2449, + "step": 111 + }, + { + "epoch": 0.18258884903814804, + "grad_norm": 0.13363705575466156, + "learning_rate": 0.0001, + "loss": 0.2577, + "step": 112 + }, + { + "epoch": 0.18421910661884577, + "grad_norm": 0.12435399740934372, + "learning_rate": 0.0001, + "loss": 0.2478, + "step": 113 + }, + { + "epoch": 0.18584936419954354, + "grad_norm": 0.15440337359905243, + "learning_rate": 0.0001, + "loss": 0.2655, + "step": 114 + }, + { + "epoch": 0.18747962178024127, + "grad_norm": 0.12815740704536438, + "learning_rate": 0.0001, + "loss": 0.2642, + "step": 115 + }, + { + "epoch": 0.18910987936093904, + "grad_norm": 0.11212265491485596, + "learning_rate": 0.0001, + "loss": 0.2496, + "step": 116 + }, + { + "epoch": 0.19074013694163677, + "grad_norm": 0.13762663304805756, + "learning_rate": 0.0001, + "loss": 0.2382, + "step": 117 + }, + { + "epoch": 0.19237039452233454, + "grad_norm": 0.11899245530366898, + "learning_rate": 0.0001, + "loss": 0.2473, + "step": 118 + }, + { + "epoch": 0.19400065210303227, + "grad_norm": 0.12855815887451172, + "learning_rate": 0.0001, + "loss": 0.2478, + "step": 119 + }, + { + "epoch": 0.19563090968373004, + "grad_norm": 0.116991326212883, + "learning_rate": 0.0001, + "loss": 0.268, + "step": 120 + }, + { + "epoch": 0.19726116726442777, + "grad_norm": 0.11379344016313553, + "learning_rate": 0.0001, + "loss": 0.2391, + "step": 121 + }, + { + "epoch": 0.19889142484512554, + "grad_norm": 0.12145145982503891, + "learning_rate": 0.0001, + "loss": 0.2433, + "step": 122 + }, + { + "epoch": 0.20052168242582327, + "grad_norm": 0.12102972716093063, + "learning_rate": 0.0001, + "loss": 0.2384, + "step": 123 + }, + { + "epoch": 0.20215194000652104, + "grad_norm": 0.13323700428009033, + "learning_rate": 0.0001, + "loss": 0.2393, + "step": 124 + }, + { + "epoch": 0.20378219758721877, + "grad_norm": 0.13560782372951508, + "learning_rate": 0.0001, + "loss": 0.2434, + "step": 125 + }, + { + "epoch": 0.20541245516791654, + "grad_norm": 0.14215120673179626, + "learning_rate": 0.0001, + "loss": 0.2478, + "step": 126 + }, + { + "epoch": 0.20704271274861427, + "grad_norm": 0.11859458684921265, + "learning_rate": 0.0001, + "loss": 0.2274, + "step": 127 + }, + { + "epoch": 0.20867297032931204, + "grad_norm": 0.12831608951091766, + "learning_rate": 0.0001, + "loss": 0.2625, + "step": 128 + }, + { + "epoch": 0.21030322791000977, + "grad_norm": 0.15069334208965302, + "learning_rate": 0.0001, + "loss": 0.2542, + "step": 129 + }, + { + "epoch": 0.21193348549070753, + "grad_norm": 0.13063862919807434, + "learning_rate": 0.0001, + "loss": 0.2428, + "step": 130 + }, + { + "epoch": 0.21356374307140527, + "grad_norm": 0.12685725092887878, + "learning_rate": 0.0001, + "loss": 0.2312, + "step": 131 + }, + { + "epoch": 0.21519400065210303, + "grad_norm": 0.13156022131443024, + "learning_rate": 0.0001, + "loss": 0.2548, + "step": 132 + }, + { + "epoch": 0.21682425823280077, + "grad_norm": 0.12375540286302567, + "learning_rate": 0.0001, + "loss": 0.241, + "step": 133 + }, + { + "epoch": 0.21845451581349853, + "grad_norm": 0.12112291157245636, + "learning_rate": 0.0001, + "loss": 0.2483, + "step": 134 + }, + { + "epoch": 0.22008477339419627, + "grad_norm": 0.13035452365875244, + "learning_rate": 0.0001, + "loss": 0.2418, + "step": 135 + }, + { + "epoch": 0.22171503097489403, + "grad_norm": 0.14445112645626068, + "learning_rate": 0.0001, + "loss": 0.2613, + "step": 136 + }, + { + "epoch": 0.22334528855559177, + "grad_norm": 0.11515045166015625, + "learning_rate": 0.0001, + "loss": 0.2423, + "step": 137 + }, + { + "epoch": 0.22497554613628953, + "grad_norm": 0.1339021772146225, + "learning_rate": 0.0001, + "loss": 0.2462, + "step": 138 + }, + { + "epoch": 0.2266058037169873, + "grad_norm": 0.14942044019699097, + "learning_rate": 0.0001, + "loss": 0.2289, + "step": 139 + }, + { + "epoch": 0.22823606129768503, + "grad_norm": 0.15505343675613403, + "learning_rate": 0.0001, + "loss": 0.2507, + "step": 140 + }, + { + "epoch": 0.2298663188783828, + "grad_norm": 0.15206627547740936, + "learning_rate": 0.0001, + "loss": 0.2466, + "step": 141 + }, + { + "epoch": 0.23149657645908053, + "grad_norm": 0.21628950536251068, + "learning_rate": 0.0001, + "loss": 0.2506, + "step": 142 + }, + { + "epoch": 0.2331268340397783, + "grad_norm": 0.146853506565094, + "learning_rate": 0.0001, + "loss": 0.2378, + "step": 143 + }, + { + "epoch": 0.23475709162047603, + "grad_norm": 0.13807877898216248, + "learning_rate": 0.0001, + "loss": 0.2304, + "step": 144 + }, + { + "epoch": 0.2363873492011738, + "grad_norm": 0.1247410997748375, + "learning_rate": 0.0001, + "loss": 0.2154, + "step": 145 + }, + { + "epoch": 0.23801760678187153, + "grad_norm": 0.11460768431425095, + "learning_rate": 0.0001, + "loss": 0.2539, + "step": 146 + }, + { + "epoch": 0.2396478643625693, + "grad_norm": 0.13893291354179382, + "learning_rate": 0.0001, + "loss": 0.2265, + "step": 147 + }, + { + "epoch": 0.24127812194326703, + "grad_norm": 0.13536906242370605, + "learning_rate": 0.0001, + "loss": 0.2267, + "step": 148 + }, + { + "epoch": 0.2429083795239648, + "grad_norm": 0.14529214799404144, + "learning_rate": 0.0001, + "loss": 0.2559, + "step": 149 + }, + { + "epoch": 0.24453863710466253, + "grad_norm": 0.13840830326080322, + "learning_rate": 0.0001, + "loss": 0.2353, + "step": 150 + }, + { + "epoch": 0.2461688946853603, + "grad_norm": 0.15911462903022766, + "learning_rate": 0.0001, + "loss": 0.2527, + "step": 151 + }, + { + "epoch": 0.24779915226605803, + "grad_norm": 0.11530350893735886, + "learning_rate": 0.0001, + "loss": 0.2568, + "step": 152 + }, + { + "epoch": 0.2494294098467558, + "grad_norm": 0.14135918021202087, + "learning_rate": 0.0001, + "loss": 0.2266, + "step": 153 + }, + { + "epoch": 0.25105966742745356, + "grad_norm": 0.18591678142547607, + "learning_rate": 0.0001, + "loss": 0.2593, + "step": 154 + }, + { + "epoch": 0.2526899250081513, + "grad_norm": 0.13683144748210907, + "learning_rate": 0.0001, + "loss": 0.2469, + "step": 155 + }, + { + "epoch": 0.25432018258884903, + "grad_norm": 0.17030082643032074, + "learning_rate": 0.0001, + "loss": 0.2413, + "step": 156 + }, + { + "epoch": 0.25595044016954677, + "grad_norm": 0.1563073843717575, + "learning_rate": 0.0001, + "loss": 0.2435, + "step": 157 + }, + { + "epoch": 0.25758069775024456, + "grad_norm": 0.129858136177063, + "learning_rate": 0.0001, + "loss": 0.2515, + "step": 158 + }, + { + "epoch": 0.2592109553309423, + "grad_norm": 0.13964857161045074, + "learning_rate": 0.0001, + "loss": 0.2271, + "step": 159 + }, + { + "epoch": 0.26084121291164003, + "grad_norm": 0.12167726457118988, + "learning_rate": 0.0001, + "loss": 0.2092, + "step": 160 + }, + { + "epoch": 0.26247147049233777, + "grad_norm": 0.12050545960664749, + "learning_rate": 0.0001, + "loss": 0.2321, + "step": 161 + }, + { + "epoch": 0.26410172807303556, + "grad_norm": 0.14696434140205383, + "learning_rate": 0.0001, + "loss": 0.2545, + "step": 162 + }, + { + "epoch": 0.2657319856537333, + "grad_norm": 0.10441072285175323, + "learning_rate": 0.0001, + "loss": 0.2347, + "step": 163 + }, + { + "epoch": 0.26736224323443103, + "grad_norm": 0.11823071539402008, + "learning_rate": 0.0001, + "loss": 0.2242, + "step": 164 + }, + { + "epoch": 0.26899250081512877, + "grad_norm": 0.1426367461681366, + "learning_rate": 0.0001, + "loss": 0.2614, + "step": 165 + }, + { + "epoch": 0.27062275839582656, + "grad_norm": 0.11903452128171921, + "learning_rate": 0.0001, + "loss": 0.2469, + "step": 166 + }, + { + "epoch": 0.2722530159765243, + "grad_norm": 0.12051168829202652, + "learning_rate": 0.0001, + "loss": 0.246, + "step": 167 + }, + { + "epoch": 0.27388327355722203, + "grad_norm": 0.1146332174539566, + "learning_rate": 0.0001, + "loss": 0.2485, + "step": 168 + }, + { + "epoch": 0.27551353113791976, + "grad_norm": 0.1366330087184906, + "learning_rate": 0.0001, + "loss": 0.2364, + "step": 169 + }, + { + "epoch": 0.27714378871861756, + "grad_norm": 0.19316819310188293, + "learning_rate": 0.0001, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 0.2787740462993153, + "grad_norm": 0.13915376365184784, + "learning_rate": 0.0001, + "loss": 0.2354, + "step": 171 + }, + { + "epoch": 0.28040430388001303, + "grad_norm": 0.16246309876441956, + "learning_rate": 0.0001, + "loss": 0.2452, + "step": 172 + }, + { + "epoch": 0.2820345614607108, + "grad_norm": 0.11810585111379623, + "learning_rate": 0.0001, + "loss": 0.2312, + "step": 173 + }, + { + "epoch": 0.28366481904140856, + "grad_norm": 0.15839870274066925, + "learning_rate": 0.0001, + "loss": 0.2464, + "step": 174 + }, + { + "epoch": 0.2852950766221063, + "grad_norm": 0.1045420840382576, + "learning_rate": 0.0001, + "loss": 0.2257, + "step": 175 + }, + { + "epoch": 0.286925334202804, + "grad_norm": 0.14196939766407013, + "learning_rate": 0.0001, + "loss": 0.2523, + "step": 176 + }, + { + "epoch": 0.2885555917835018, + "grad_norm": 0.14301368594169617, + "learning_rate": 0.0001, + "loss": 0.2387, + "step": 177 + }, + { + "epoch": 0.29018584936419956, + "grad_norm": 0.13265396654605865, + "learning_rate": 0.0001, + "loss": 0.2428, + "step": 178 + }, + { + "epoch": 0.2918161069448973, + "grad_norm": 0.126626119017601, + "learning_rate": 0.0001, + "loss": 0.2494, + "step": 179 + }, + { + "epoch": 0.293446364525595, + "grad_norm": 0.13440294563770294, + "learning_rate": 0.0001, + "loss": 0.2457, + "step": 180 + }, + { + "epoch": 0.2950766221062928, + "grad_norm": 0.1397274285554886, + "learning_rate": 0.0001, + "loss": 0.2274, + "step": 181 + }, + { + "epoch": 0.29670687968699055, + "grad_norm": 0.09933532774448395, + "learning_rate": 0.0001, + "loss": 0.225, + "step": 182 + }, + { + "epoch": 0.2983371372676883, + "grad_norm": 0.16503044962882996, + "learning_rate": 0.0001, + "loss": 0.2293, + "step": 183 + }, + { + "epoch": 0.299967394848386, + "grad_norm": 0.12690463662147522, + "learning_rate": 0.0001, + "loss": 0.2447, + "step": 184 + }, + { + "epoch": 0.3015976524290838, + "grad_norm": 0.15822118520736694, + "learning_rate": 0.0001, + "loss": 0.2492, + "step": 185 + }, + { + "epoch": 0.30322791000978155, + "grad_norm": 0.10334572941064835, + "learning_rate": 0.0001, + "loss": 0.2334, + "step": 186 + }, + { + "epoch": 0.3048581675904793, + "grad_norm": 0.12315747141838074, + "learning_rate": 0.0001, + "loss": 0.2335, + "step": 187 + }, + { + "epoch": 0.306488425171177, + "grad_norm": 0.13603512942790985, + "learning_rate": 0.0001, + "loss": 0.228, + "step": 188 + }, + { + "epoch": 0.3081186827518748, + "grad_norm": 0.1011793464422226, + "learning_rate": 0.0001, + "loss": 0.2224, + "step": 189 + }, + { + "epoch": 0.30974894033257255, + "grad_norm": 0.10281293839216232, + "learning_rate": 0.0001, + "loss": 0.2277, + "step": 190 + }, + { + "epoch": 0.3113791979132703, + "grad_norm": 0.1417594999074936, + "learning_rate": 0.0001, + "loss": 0.2516, + "step": 191 + }, + { + "epoch": 0.313009455493968, + "grad_norm": 0.11304276436567307, + "learning_rate": 0.0001, + "loss": 0.2323, + "step": 192 + }, + { + "epoch": 0.3146397130746658, + "grad_norm": 0.15146027505397797, + "learning_rate": 0.0001, + "loss": 0.2446, + "step": 193 + }, + { + "epoch": 0.31626997065536355, + "grad_norm": 0.11567169427871704, + "learning_rate": 0.0001, + "loss": 0.2018, + "step": 194 + }, + { + "epoch": 0.3179002282360613, + "grad_norm": 0.12554962933063507, + "learning_rate": 0.0001, + "loss": 0.2317, + "step": 195 + }, + { + "epoch": 0.319530485816759, + "grad_norm": 0.15784506499767303, + "learning_rate": 0.0001, + "loss": 0.2372, + "step": 196 + }, + { + "epoch": 0.3211607433974568, + "grad_norm": 0.13723263144493103, + "learning_rate": 0.0001, + "loss": 0.2395, + "step": 197 + }, + { + "epoch": 0.32279100097815455, + "grad_norm": 0.16114649176597595, + "learning_rate": 0.0001, + "loss": 0.244, + "step": 198 + }, + { + "epoch": 0.3244212585588523, + "grad_norm": 0.11904025822877884, + "learning_rate": 0.0001, + "loss": 0.2366, + "step": 199 + }, + { + "epoch": 0.32605151613955, + "grad_norm": 0.1232055053114891, + "learning_rate": 0.0001, + "loss": 0.2266, + "step": 200 + }, + { + "epoch": 0.3276817737202478, + "grad_norm": 0.1447722166776657, + "learning_rate": 0.0001, + "loss": 0.2451, + "step": 201 + }, + { + "epoch": 0.32931203130094555, + "grad_norm": 0.10923551023006439, + "learning_rate": 0.0001, + "loss": 0.2242, + "step": 202 + }, + { + "epoch": 0.3309422888816433, + "grad_norm": 0.13391102850437164, + "learning_rate": 0.0001, + "loss": 0.2342, + "step": 203 + }, + { + "epoch": 0.332572546462341, + "grad_norm": 0.14067484438419342, + "learning_rate": 0.0001, + "loss": 0.2334, + "step": 204 + }, + { + "epoch": 0.3342028040430388, + "grad_norm": 0.13190071284770966, + "learning_rate": 0.0001, + "loss": 0.2398, + "step": 205 + }, + { + "epoch": 0.33583306162373655, + "grad_norm": 0.12358011305332184, + "learning_rate": 0.0001, + "loss": 0.2218, + "step": 206 + }, + { + "epoch": 0.3374633192044343, + "grad_norm": 0.11492815613746643, + "learning_rate": 0.0001, + "loss": 0.2284, + "step": 207 + }, + { + "epoch": 0.3390935767851321, + "grad_norm": 0.11512556672096252, + "learning_rate": 0.0001, + "loss": 0.2265, + "step": 208 + }, + { + "epoch": 0.3407238343658298, + "grad_norm": 0.11229848861694336, + "learning_rate": 0.0001, + "loss": 0.2253, + "step": 209 + }, + { + "epoch": 0.34235409194652755, + "grad_norm": 0.11124417930841446, + "learning_rate": 0.0001, + "loss": 0.2449, + "step": 210 + }, + { + "epoch": 0.3439843495272253, + "grad_norm": 0.11999543011188507, + "learning_rate": 0.0001, + "loss": 0.2255, + "step": 211 + }, + { + "epoch": 0.3456146071079231, + "grad_norm": 0.10648955404758453, + "learning_rate": 0.0001, + "loss": 0.2104, + "step": 212 + }, + { + "epoch": 0.3472448646886208, + "grad_norm": 0.12528495490550995, + "learning_rate": 0.0001, + "loss": 0.2459, + "step": 213 + }, + { + "epoch": 0.34887512226931855, + "grad_norm": 0.12371502816677094, + "learning_rate": 0.0001, + "loss": 0.2483, + "step": 214 + }, + { + "epoch": 0.3505053798500163, + "grad_norm": 0.15203551948070526, + "learning_rate": 0.0001, + "loss": 0.23, + "step": 215 + }, + { + "epoch": 0.3521356374307141, + "grad_norm": 0.12010787427425385, + "learning_rate": 0.0001, + "loss": 0.2221, + "step": 216 + }, + { + "epoch": 0.3537658950114118, + "grad_norm": 0.1346224844455719, + "learning_rate": 0.0001, + "loss": 0.2268, + "step": 217 + }, + { + "epoch": 0.35539615259210955, + "grad_norm": 0.15142722427845, + "learning_rate": 0.0001, + "loss": 0.2338, + "step": 218 + }, + { + "epoch": 0.3570264101728073, + "grad_norm": 0.129653200507164, + "learning_rate": 0.0001, + "loss": 0.2325, + "step": 219 + }, + { + "epoch": 0.3586566677535051, + "grad_norm": 0.13703054189682007, + "learning_rate": 0.0001, + "loss": 0.2278, + "step": 220 + }, + { + "epoch": 0.3602869253342028, + "grad_norm": 0.1323288083076477, + "learning_rate": 0.0001, + "loss": 0.2331, + "step": 221 + }, + { + "epoch": 0.36191718291490055, + "grad_norm": 0.14150388538837433, + "learning_rate": 0.0001, + "loss": 0.2408, + "step": 222 + }, + { + "epoch": 0.3635474404955983, + "grad_norm": 0.16232647001743317, + "learning_rate": 0.0001, + "loss": 0.2228, + "step": 223 + }, + { + "epoch": 0.3651776980762961, + "grad_norm": 0.13750429451465607, + "learning_rate": 0.0001, + "loss": 0.2578, + "step": 224 + }, + { + "epoch": 0.3668079556569938, + "grad_norm": 0.15641643106937408, + "learning_rate": 0.0001, + "loss": 0.2202, + "step": 225 + }, + { + "epoch": 0.36843821323769155, + "grad_norm": 0.14466796815395355, + "learning_rate": 0.0001, + "loss": 0.2148, + "step": 226 + }, + { + "epoch": 0.3700684708183893, + "grad_norm": 0.11243683099746704, + "learning_rate": 0.0001, + "loss": 0.2353, + "step": 227 + }, + { + "epoch": 0.3716987283990871, + "grad_norm": 0.12383094429969788, + "learning_rate": 0.0001, + "loss": 0.2254, + "step": 228 + }, + { + "epoch": 0.3733289859797848, + "grad_norm": 0.11095986515283585, + "learning_rate": 0.0001, + "loss": 0.225, + "step": 229 + }, + { + "epoch": 0.37495924356048255, + "grad_norm": 0.12428440898656845, + "learning_rate": 0.0001, + "loss": 0.233, + "step": 230 + }, + { + "epoch": 0.3765895011411803, + "grad_norm": 0.14329205453395844, + "learning_rate": 0.0001, + "loss": 0.2307, + "step": 231 + }, + { + "epoch": 0.3782197587218781, + "grad_norm": 0.14026683568954468, + "learning_rate": 0.0001, + "loss": 0.2139, + "step": 232 + }, + { + "epoch": 0.3798500163025758, + "grad_norm": 0.11123739928007126, + "learning_rate": 0.0001, + "loss": 0.2191, + "step": 233 + }, + { + "epoch": 0.38148027388327355, + "grad_norm": 0.1463911086320877, + "learning_rate": 0.0001, + "loss": 0.2388, + "step": 234 + }, + { + "epoch": 0.3831105314639713, + "grad_norm": 0.13069649040699005, + "learning_rate": 0.0001, + "loss": 0.2185, + "step": 235 + }, + { + "epoch": 0.3847407890446691, + "grad_norm": 0.14931800961494446, + "learning_rate": 0.0001, + "loss": 0.2416, + "step": 236 + }, + { + "epoch": 0.3863710466253668, + "grad_norm": 0.16097313165664673, + "learning_rate": 0.0001, + "loss": 0.2347, + "step": 237 + }, + { + "epoch": 0.38800130420606455, + "grad_norm": 0.15226350724697113, + "learning_rate": 0.0001, + "loss": 0.2258, + "step": 238 + }, + { + "epoch": 0.3896315617867623, + "grad_norm": 0.13384918868541718, + "learning_rate": 0.0001, + "loss": 0.2265, + "step": 239 + }, + { + "epoch": 0.3912618193674601, + "grad_norm": 0.11753430217504501, + "learning_rate": 0.0001, + "loss": 0.2106, + "step": 240 + }, + { + "epoch": 0.3928920769481578, + "grad_norm": 0.1108139306306839, + "learning_rate": 0.0001, + "loss": 0.2293, + "step": 241 + }, + { + "epoch": 0.39452233452885554, + "grad_norm": 0.10754958540201187, + "learning_rate": 0.0001, + "loss": 0.2251, + "step": 242 + }, + { + "epoch": 0.39615259210955334, + "grad_norm": 0.11824717372655869, + "learning_rate": 0.0001, + "loss": 0.2507, + "step": 243 + }, + { + "epoch": 0.3977828496902511, + "grad_norm": 0.14322006702423096, + "learning_rate": 0.0001, + "loss": 0.2348, + "step": 244 + }, + { + "epoch": 0.3994131072709488, + "grad_norm": 0.10835988074541092, + "learning_rate": 0.0001, + "loss": 0.2141, + "step": 245 + }, + { + "epoch": 0.40104336485164654, + "grad_norm": 0.11391481757164001, + "learning_rate": 0.0001, + "loss": 0.2139, + "step": 246 + }, + { + "epoch": 0.40267362243234434, + "grad_norm": 0.11263515055179596, + "learning_rate": 0.0001, + "loss": 0.2354, + "step": 247 + }, + { + "epoch": 0.40430388001304207, + "grad_norm": 0.12588317692279816, + "learning_rate": 0.0001, + "loss": 0.2354, + "step": 248 + }, + { + "epoch": 0.4059341375937398, + "grad_norm": 0.12664109468460083, + "learning_rate": 0.0001, + "loss": 0.2371, + "step": 249 + }, + { + "epoch": 0.40756439517443754, + "grad_norm": 0.12330986559391022, + "learning_rate": 0.0001, + "loss": 0.2338, + "step": 250 + }, + { + "epoch": 0.40919465275513534, + "grad_norm": 0.11349525302648544, + "learning_rate": 0.0001, + "loss": 0.2315, + "step": 251 + }, + { + "epoch": 0.41082491033583307, + "grad_norm": 0.11793797463178635, + "learning_rate": 0.0001, + "loss": 0.2309, + "step": 252 + }, + { + "epoch": 0.4124551679165308, + "grad_norm": 0.14245276153087616, + "learning_rate": 0.0001, + "loss": 0.2469, + "step": 253 + }, + { + "epoch": 0.41408542549722854, + "grad_norm": 0.11353014409542084, + "learning_rate": 0.0001, + "loss": 0.2238, + "step": 254 + }, + { + "epoch": 0.41571568307792633, + "grad_norm": 0.13224811851978302, + "learning_rate": 0.0001, + "loss": 0.2419, + "step": 255 + }, + { + "epoch": 0.41734594065862407, + "grad_norm": 0.10972931236028671, + "learning_rate": 0.0001, + "loss": 0.2139, + "step": 256 + }, + { + "epoch": 0.4189761982393218, + "grad_norm": 0.17038947343826294, + "learning_rate": 0.0001, + "loss": 0.2233, + "step": 257 + }, + { + "epoch": 0.42060645582001954, + "grad_norm": 0.1390901654958725, + "learning_rate": 0.0001, + "loss": 0.2347, + "step": 258 + }, + { + "epoch": 0.42223671340071733, + "grad_norm": 0.10498136281967163, + "learning_rate": 0.0001, + "loss": 0.2185, + "step": 259 + }, + { + "epoch": 0.42386697098141507, + "grad_norm": 0.11392710357904434, + "learning_rate": 0.0001, + "loss": 0.2317, + "step": 260 + }, + { + "epoch": 0.4254972285621128, + "grad_norm": 0.12423884123563766, + "learning_rate": 0.0001, + "loss": 0.238, + "step": 261 + }, + { + "epoch": 0.42712748614281054, + "grad_norm": 0.12265395373106003, + "learning_rate": 0.0001, + "loss": 0.2151, + "step": 262 + }, + { + "epoch": 0.42875774372350833, + "grad_norm": 0.1066151112318039, + "learning_rate": 0.0001, + "loss": 0.2188, + "step": 263 + }, + { + "epoch": 0.43038800130420607, + "grad_norm": 0.10593073070049286, + "learning_rate": 0.0001, + "loss": 0.2179, + "step": 264 + }, + { + "epoch": 0.4320182588849038, + "grad_norm": 0.12529350817203522, + "learning_rate": 0.0001, + "loss": 0.2366, + "step": 265 + }, + { + "epoch": 0.43364851646560154, + "grad_norm": 0.13577479124069214, + "learning_rate": 0.0001, + "loss": 0.2304, + "step": 266 + }, + { + "epoch": 0.43527877404629933, + "grad_norm": 0.15030768513679504, + "learning_rate": 0.0001, + "loss": 0.2421, + "step": 267 + }, + { + "epoch": 0.43690903162699707, + "grad_norm": 0.12566924095153809, + "learning_rate": 0.0001, + "loss": 0.2366, + "step": 268 + }, + { + "epoch": 0.4385392892076948, + "grad_norm": 0.1277414709329605, + "learning_rate": 0.0001, + "loss": 0.2413, + "step": 269 + }, + { + "epoch": 0.44016954678839254, + "grad_norm": 0.12140754610300064, + "learning_rate": 0.0001, + "loss": 0.2072, + "step": 270 + }, + { + "epoch": 0.44179980436909033, + "grad_norm": 0.16660185158252716, + "learning_rate": 0.0001, + "loss": 0.2513, + "step": 271 + }, + { + "epoch": 0.44343006194978807, + "grad_norm": 0.11619671434164047, + "learning_rate": 0.0001, + "loss": 0.2158, + "step": 272 + }, + { + "epoch": 0.4450603195304858, + "grad_norm": 0.16923661530017853, + "learning_rate": 0.0001, + "loss": 0.2479, + "step": 273 + }, + { + "epoch": 0.44669057711118354, + "grad_norm": 0.14053957164287567, + "learning_rate": 0.0001, + "loss": 0.2383, + "step": 274 + }, + { + "epoch": 0.44832083469188133, + "grad_norm": 0.1332424134016037, + "learning_rate": 0.0001, + "loss": 0.2202, + "step": 275 + }, + { + "epoch": 0.44995109227257907, + "grad_norm": 0.10877622663974762, + "learning_rate": 0.0001, + "loss": 0.234, + "step": 276 + }, + { + "epoch": 0.4515813498532768, + "grad_norm": 0.11536753922700882, + "learning_rate": 0.0001, + "loss": 0.2199, + "step": 277 + }, + { + "epoch": 0.4532116074339746, + "grad_norm": 0.12691466510295868, + "learning_rate": 0.0001, + "loss": 0.2391, + "step": 278 + }, + { + "epoch": 0.45484186501467233, + "grad_norm": 0.12029401957988739, + "learning_rate": 0.0001, + "loss": 0.2258, + "step": 279 + }, + { + "epoch": 0.45647212259537007, + "grad_norm": 0.14181189239025116, + "learning_rate": 0.0001, + "loss": 0.2202, + "step": 280 + }, + { + "epoch": 0.4581023801760678, + "grad_norm": 0.1265992522239685, + "learning_rate": 0.0001, + "loss": 0.2297, + "step": 281 + }, + { + "epoch": 0.4597326377567656, + "grad_norm": 0.1276220828294754, + "learning_rate": 0.0001, + "loss": 0.2282, + "step": 282 + }, + { + "epoch": 0.46136289533746333, + "grad_norm": 0.15334482491016388, + "learning_rate": 0.0001, + "loss": 0.2213, + "step": 283 + }, + { + "epoch": 0.46299315291816107, + "grad_norm": 0.14251714944839478, + "learning_rate": 0.0001, + "loss": 0.2244, + "step": 284 + }, + { + "epoch": 0.4646234104988588, + "grad_norm": 0.1373746544122696, + "learning_rate": 0.0001, + "loss": 0.2258, + "step": 285 + }, + { + "epoch": 0.4662536680795566, + "grad_norm": 0.14110830426216125, + "learning_rate": 0.0001, + "loss": 0.2192, + "step": 286 + }, + { + "epoch": 0.46788392566025433, + "grad_norm": 0.12478266656398773, + "learning_rate": 0.0001, + "loss": 0.2256, + "step": 287 + }, + { + "epoch": 0.46951418324095207, + "grad_norm": 0.13609923422336578, + "learning_rate": 0.0001, + "loss": 0.2273, + "step": 288 + }, + { + "epoch": 0.4711444408216498, + "grad_norm": 0.15859074890613556, + "learning_rate": 0.0001, + "loss": 0.2314, + "step": 289 + }, + { + "epoch": 0.4727746984023476, + "grad_norm": 0.1267993301153183, + "learning_rate": 0.0001, + "loss": 0.2431, + "step": 290 + }, + { + "epoch": 0.47440495598304533, + "grad_norm": 0.1309819370508194, + "learning_rate": 0.0001, + "loss": 0.2036, + "step": 291 + }, + { + "epoch": 0.47603521356374306, + "grad_norm": 0.14452795684337616, + "learning_rate": 0.0001, + "loss": 0.2113, + "step": 292 + }, + { + "epoch": 0.4776654711444408, + "grad_norm": 0.10817385464906693, + "learning_rate": 0.0001, + "loss": 0.2198, + "step": 293 + }, + { + "epoch": 0.4792957287251386, + "grad_norm": 0.10400962829589844, + "learning_rate": 0.0001, + "loss": 0.2107, + "step": 294 + }, + { + "epoch": 0.48092598630583633, + "grad_norm": 0.14228041470050812, + "learning_rate": 0.0001, + "loss": 0.2355, + "step": 295 + }, + { + "epoch": 0.48255624388653406, + "grad_norm": 0.144633486866951, + "learning_rate": 0.0001, + "loss": 0.237, + "step": 296 + }, + { + "epoch": 0.4841865014672318, + "grad_norm": 0.14238397777080536, + "learning_rate": 0.0001, + "loss": 0.2299, + "step": 297 + }, + { + "epoch": 0.4858167590479296, + "grad_norm": 0.12710346281528473, + "learning_rate": 0.0001, + "loss": 0.2267, + "step": 298 + }, + { + "epoch": 0.4874470166286273, + "grad_norm": 0.12529858946800232, + "learning_rate": 0.0001, + "loss": 0.2301, + "step": 299 + }, + { + "epoch": 0.48907727420932506, + "grad_norm": 0.12333487719297409, + "learning_rate": 0.0001, + "loss": 0.2457, + "step": 300 + }, + { + "epoch": 0.4907075317900228, + "grad_norm": 0.10910045355558395, + "learning_rate": 0.0001, + "loss": 0.2179, + "step": 301 + }, + { + "epoch": 0.4923377893707206, + "grad_norm": 0.13991764187812805, + "learning_rate": 0.0001, + "loss": 0.2361, + "step": 302 + }, + { + "epoch": 0.4939680469514183, + "grad_norm": 0.1286964863538742, + "learning_rate": 0.0001, + "loss": 0.2172, + "step": 303 + }, + { + "epoch": 0.49559830453211606, + "grad_norm": 0.13012994825839996, + "learning_rate": 0.0001, + "loss": 0.199, + "step": 304 + }, + { + "epoch": 0.4972285621128138, + "grad_norm": 0.12035807967185974, + "learning_rate": 0.0001, + "loss": 0.232, + "step": 305 + }, + { + "epoch": 0.4988588196935116, + "grad_norm": 0.1246248111128807, + "learning_rate": 0.0001, + "loss": 0.2271, + "step": 306 + }, + { + "epoch": 0.5004890772742093, + "grad_norm": 0.13514195382595062, + "learning_rate": 0.0001, + "loss": 0.2089, + "step": 307 + }, + { + "epoch": 0.5021193348549071, + "grad_norm": 0.1276530623435974, + "learning_rate": 0.0001, + "loss": 0.2282, + "step": 308 + }, + { + "epoch": 0.5037495924356048, + "grad_norm": 0.12651173770427704, + "learning_rate": 0.0001, + "loss": 0.2235, + "step": 309 + }, + { + "epoch": 0.5053798500163026, + "grad_norm": 0.12165319919586182, + "learning_rate": 0.0001, + "loss": 0.2283, + "step": 310 + }, + { + "epoch": 0.5070101075970004, + "grad_norm": 0.10687411576509476, + "learning_rate": 0.0001, + "loss": 0.2285, + "step": 311 + }, + { + "epoch": 0.5086403651776981, + "grad_norm": 0.10170154273509979, + "learning_rate": 0.0001, + "loss": 0.2165, + "step": 312 + }, + { + "epoch": 0.5102706227583959, + "grad_norm": 0.130455881357193, + "learning_rate": 0.0001, + "loss": 0.2236, + "step": 313 + }, + { + "epoch": 0.5119008803390935, + "grad_norm": 0.12910182774066925, + "learning_rate": 0.0001, + "loss": 0.2339, + "step": 314 + }, + { + "epoch": 0.5135311379197913, + "grad_norm": 0.12536931037902832, + "learning_rate": 0.0001, + "loss": 0.225, + "step": 315 + }, + { + "epoch": 0.5151613955004891, + "grad_norm": 0.11117815226316452, + "learning_rate": 0.0001, + "loss": 0.2167, + "step": 316 + }, + { + "epoch": 0.5167916530811868, + "grad_norm": 0.10664685070514679, + "learning_rate": 0.0001, + "loss": 0.2132, + "step": 317 + }, + { + "epoch": 0.5184219106618846, + "grad_norm": 0.11488650739192963, + "learning_rate": 0.0001, + "loss": 0.2032, + "step": 318 + }, + { + "epoch": 0.5200521682425824, + "grad_norm": 0.1376049518585205, + "learning_rate": 0.0001, + "loss": 0.2198, + "step": 319 + }, + { + "epoch": 0.5216824258232801, + "grad_norm": 0.11661184579133987, + "learning_rate": 0.0001, + "loss": 0.2225, + "step": 320 + }, + { + "epoch": 0.5233126834039779, + "grad_norm": 0.15519613027572632, + "learning_rate": 0.0001, + "loss": 0.2195, + "step": 321 + }, + { + "epoch": 0.5249429409846755, + "grad_norm": 0.13077083230018616, + "learning_rate": 0.0001, + "loss": 0.2413, + "step": 322 + }, + { + "epoch": 0.5265731985653733, + "grad_norm": 0.13975407183170319, + "learning_rate": 0.0001, + "loss": 0.2307, + "step": 323 + }, + { + "epoch": 0.5282034561460711, + "grad_norm": 0.12319333851337433, + "learning_rate": 0.0001, + "loss": 0.2122, + "step": 324 + }, + { + "epoch": 0.5298337137267688, + "grad_norm": 0.12614178657531738, + "learning_rate": 0.0001, + "loss": 0.2126, + "step": 325 + }, + { + "epoch": 0.5314639713074666, + "grad_norm": 0.12005390971899033, + "learning_rate": 0.0001, + "loss": 0.2261, + "step": 326 + }, + { + "epoch": 0.5330942288881644, + "grad_norm": 0.14331591129302979, + "learning_rate": 0.0001, + "loss": 0.2204, + "step": 327 + }, + { + "epoch": 0.5347244864688621, + "grad_norm": 0.1153935119509697, + "learning_rate": 0.0001, + "loss": 0.2221, + "step": 328 + }, + { + "epoch": 0.5363547440495599, + "grad_norm": 0.14517177641391754, + "learning_rate": 0.0001, + "loss": 0.2324, + "step": 329 + }, + { + "epoch": 0.5379850016302575, + "grad_norm": 0.1347828060388565, + "learning_rate": 0.0001, + "loss": 0.2157, + "step": 330 + }, + { + "epoch": 0.5396152592109553, + "grad_norm": 0.12938876450061798, + "learning_rate": 0.0001, + "loss": 0.221, + "step": 331 + }, + { + "epoch": 0.5412455167916531, + "grad_norm": 0.12898674607276917, + "learning_rate": 0.0001, + "loss": 0.2243, + "step": 332 + }, + { + "epoch": 0.5428757743723508, + "grad_norm": 0.1460563838481903, + "learning_rate": 0.0001, + "loss": 0.2091, + "step": 333 + }, + { + "epoch": 0.5445060319530486, + "grad_norm": 0.12404835224151611, + "learning_rate": 0.0001, + "loss": 0.2299, + "step": 334 + }, + { + "epoch": 0.5461362895337464, + "grad_norm": 0.10986137390136719, + "learning_rate": 0.0001, + "loss": 0.2119, + "step": 335 + }, + { + "epoch": 0.5477665471144441, + "grad_norm": 0.10479142516851425, + "learning_rate": 0.0001, + "loss": 0.2132, + "step": 336 + }, + { + "epoch": 0.5493968046951418, + "grad_norm": 0.14422518014907837, + "learning_rate": 0.0001, + "loss": 0.2379, + "step": 337 + }, + { + "epoch": 0.5510270622758395, + "grad_norm": 0.13756605982780457, + "learning_rate": 0.0001, + "loss": 0.2205, + "step": 338 + }, + { + "epoch": 0.5526573198565373, + "grad_norm": 0.15542298555374146, + "learning_rate": 0.0001, + "loss": 0.227, + "step": 339 + }, + { + "epoch": 0.5542875774372351, + "grad_norm": 0.11967030167579651, + "learning_rate": 0.0001, + "loss": 0.2336, + "step": 340 + }, + { + "epoch": 0.5559178350179328, + "grad_norm": 0.11930400133132935, + "learning_rate": 0.0001, + "loss": 0.2273, + "step": 341 + }, + { + "epoch": 0.5575480925986306, + "grad_norm": 0.13011619448661804, + "learning_rate": 0.0001, + "loss": 0.2386, + "step": 342 + }, + { + "epoch": 0.5591783501793284, + "grad_norm": 0.1742897927761078, + "learning_rate": 0.0001, + "loss": 0.2248, + "step": 343 + }, + { + "epoch": 0.5608086077600261, + "grad_norm": 0.09916642308235168, + "learning_rate": 0.0001, + "loss": 0.2169, + "step": 344 + }, + { + "epoch": 0.5624388653407238, + "grad_norm": 0.11124306917190552, + "learning_rate": 0.0001, + "loss": 0.2173, + "step": 345 + }, + { + "epoch": 0.5640691229214216, + "grad_norm": 0.13426317274570465, + "learning_rate": 0.0001, + "loss": 0.2235, + "step": 346 + }, + { + "epoch": 0.5656993805021193, + "grad_norm": 0.12277089059352875, + "learning_rate": 0.0001, + "loss": 0.2234, + "step": 347 + }, + { + "epoch": 0.5673296380828171, + "grad_norm": 0.1190582811832428, + "learning_rate": 0.0001, + "loss": 0.2173, + "step": 348 + }, + { + "epoch": 0.5689598956635148, + "grad_norm": 0.10108431428670883, + "learning_rate": 0.0001, + "loss": 0.2133, + "step": 349 + }, + { + "epoch": 0.5705901532442126, + "grad_norm": 0.10684805363416672, + "learning_rate": 0.0001, + "loss": 0.2311, + "step": 350 + }, + { + "epoch": 0.5722204108249104, + "grad_norm": 0.10276418924331665, + "learning_rate": 0.0001, + "loss": 0.2137, + "step": 351 + }, + { + "epoch": 0.573850668405608, + "grad_norm": 0.10777677595615387, + "learning_rate": 0.0001, + "loss": 0.2083, + "step": 352 + }, + { + "epoch": 0.5754809259863058, + "grad_norm": 0.12655171751976013, + "learning_rate": 0.0001, + "loss": 0.2269, + "step": 353 + }, + { + "epoch": 0.5771111835670036, + "grad_norm": 0.11766703426837921, + "learning_rate": 0.0001, + "loss": 0.2185, + "step": 354 + }, + { + "epoch": 0.5787414411477013, + "grad_norm": 0.10404328256845474, + "learning_rate": 0.0001, + "loss": 0.2027, + "step": 355 + }, + { + "epoch": 0.5803716987283991, + "grad_norm": 0.10919814556837082, + "learning_rate": 0.0001, + "loss": 0.2025, + "step": 356 + }, + { + "epoch": 0.5820019563090968, + "grad_norm": 0.12742376327514648, + "learning_rate": 0.0001, + "loss": 0.2129, + "step": 357 + }, + { + "epoch": 0.5836322138897946, + "grad_norm": 0.133440762758255, + "learning_rate": 0.0001, + "loss": 0.2142, + "step": 358 + }, + { + "epoch": 0.5852624714704924, + "grad_norm": 0.11150208115577698, + "learning_rate": 0.0001, + "loss": 0.216, + "step": 359 + }, + { + "epoch": 0.58689272905119, + "grad_norm": 0.12375590205192566, + "learning_rate": 0.0001, + "loss": 0.2148, + "step": 360 + }, + { + "epoch": 0.5885229866318878, + "grad_norm": 0.13547080755233765, + "learning_rate": 0.0001, + "loss": 0.2105, + "step": 361 + }, + { + "epoch": 0.5901532442125856, + "grad_norm": 0.1400783509016037, + "learning_rate": 0.0001, + "loss": 0.2314, + "step": 362 + }, + { + "epoch": 0.5917835017932833, + "grad_norm": 0.11818625032901764, + "learning_rate": 0.0001, + "loss": 0.2098, + "step": 363 + }, + { + "epoch": 0.5934137593739811, + "grad_norm": 0.11656329035758972, + "learning_rate": 0.0001, + "loss": 0.2124, + "step": 364 + }, + { + "epoch": 0.5950440169546788, + "grad_norm": 0.17707152664661407, + "learning_rate": 0.0001, + "loss": 0.2004, + "step": 365 + }, + { + "epoch": 0.5966742745353766, + "grad_norm": 0.15075908601284027, + "learning_rate": 0.0001, + "loss": 0.2043, + "step": 366 + }, + { + "epoch": 0.5983045321160744, + "grad_norm": 0.14985023438930511, + "learning_rate": 0.0001, + "loss": 0.2274, + "step": 367 + }, + { + "epoch": 0.599934789696772, + "grad_norm": 0.13255703449249268, + "learning_rate": 0.0001, + "loss": 0.2174, + "step": 368 + }, + { + "epoch": 0.6015650472774698, + "grad_norm": 0.11377043277025223, + "learning_rate": 0.0001, + "loss": 0.2285, + "step": 369 + }, + { + "epoch": 0.6031953048581676, + "grad_norm": 0.10583726316690445, + "learning_rate": 0.0001, + "loss": 0.2111, + "step": 370 + }, + { + "epoch": 0.6048255624388653, + "grad_norm": 0.10989916324615479, + "learning_rate": 0.0001, + "loss": 0.2277, + "step": 371 + }, + { + "epoch": 0.6064558200195631, + "grad_norm": 0.13273194432258606, + "learning_rate": 0.0001, + "loss": 0.2269, + "step": 372 + }, + { + "epoch": 0.6080860776002608, + "grad_norm": 0.12810608744621277, + "learning_rate": 0.0001, + "loss": 0.2369, + "step": 373 + }, + { + "epoch": 0.6097163351809586, + "grad_norm": 0.11391692608594894, + "learning_rate": 0.0001, + "loss": 0.2178, + "step": 374 + }, + { + "epoch": 0.6113465927616564, + "grad_norm": 0.18068620562553406, + "learning_rate": 0.0001, + "loss": 0.2257, + "step": 375 + }, + { + "epoch": 0.612976850342354, + "grad_norm": 0.11202447861433029, + "learning_rate": 0.0001, + "loss": 0.1979, + "step": 376 + }, + { + "epoch": 0.6146071079230518, + "grad_norm": 0.13879020512104034, + "learning_rate": 0.0001, + "loss": 0.2235, + "step": 377 + }, + { + "epoch": 0.6162373655037496, + "grad_norm": 0.11466659605503082, + "learning_rate": 0.0001, + "loss": 0.2146, + "step": 378 + }, + { + "epoch": 0.6178676230844473, + "grad_norm": 0.13301534950733185, + "learning_rate": 0.0001, + "loss": 0.2374, + "step": 379 + }, + { + "epoch": 0.6194978806651451, + "grad_norm": 0.10728432238101959, + "learning_rate": 0.0001, + "loss": 0.2002, + "step": 380 + }, + { + "epoch": 0.6211281382458429, + "grad_norm": 0.11212770640850067, + "learning_rate": 0.0001, + "loss": 0.2232, + "step": 381 + }, + { + "epoch": 0.6227583958265406, + "grad_norm": 0.12755908071994781, + "learning_rate": 0.0001, + "loss": 0.2284, + "step": 382 + }, + { + "epoch": 0.6243886534072384, + "grad_norm": 0.11284497380256653, + "learning_rate": 0.0001, + "loss": 0.2235, + "step": 383 + }, + { + "epoch": 0.626018910987936, + "grad_norm": 0.13458143174648285, + "learning_rate": 0.0001, + "loss": 0.2183, + "step": 384 + }, + { + "epoch": 0.6276491685686338, + "grad_norm": 0.13338471949100494, + "learning_rate": 0.0001, + "loss": 0.2222, + "step": 385 + }, + { + "epoch": 0.6292794261493316, + "grad_norm": 0.11197499930858612, + "learning_rate": 0.0001, + "loss": 0.2184, + "step": 386 + }, + { + "epoch": 0.6309096837300293, + "grad_norm": 0.11694587767124176, + "learning_rate": 0.0001, + "loss": 0.2231, + "step": 387 + }, + { + "epoch": 0.6325399413107271, + "grad_norm": 0.12493859976530075, + "learning_rate": 0.0001, + "loss": 0.2138, + "step": 388 + }, + { + "epoch": 0.6341701988914249, + "grad_norm": 0.12846441566944122, + "learning_rate": 0.0001, + "loss": 0.2244, + "step": 389 + }, + { + "epoch": 0.6358004564721226, + "grad_norm": 0.11621380597352982, + "learning_rate": 0.0001, + "loss": 0.2232, + "step": 390 + }, + { + "epoch": 0.6374307140528204, + "grad_norm": 0.12881864607334137, + "learning_rate": 0.0001, + "loss": 0.2076, + "step": 391 + }, + { + "epoch": 0.639060971633518, + "grad_norm": 0.14580029249191284, + "learning_rate": 0.0001, + "loss": 0.222, + "step": 392 + }, + { + "epoch": 0.6406912292142158, + "grad_norm": 0.11677811294794083, + "learning_rate": 0.0001, + "loss": 0.2239, + "step": 393 + }, + { + "epoch": 0.6423214867949136, + "grad_norm": 0.13098081946372986, + "learning_rate": 0.0001, + "loss": 0.2217, + "step": 394 + }, + { + "epoch": 0.6439517443756113, + "grad_norm": 0.13351522386074066, + "learning_rate": 0.0001, + "loss": 0.2129, + "step": 395 + }, + { + "epoch": 0.6455820019563091, + "grad_norm": 0.10511767864227295, + "learning_rate": 0.0001, + "loss": 0.1971, + "step": 396 + }, + { + "epoch": 0.6472122595370069, + "grad_norm": 0.14180706441402435, + "learning_rate": 0.0001, + "loss": 0.2195, + "step": 397 + }, + { + "epoch": 0.6488425171177046, + "grad_norm": 0.1285708099603653, + "learning_rate": 0.0001, + "loss": 0.2371, + "step": 398 + }, + { + "epoch": 0.6504727746984024, + "grad_norm": 0.11256030201911926, + "learning_rate": 0.0001, + "loss": 0.2181, + "step": 399 + }, + { + "epoch": 0.6521030322791, + "grad_norm": 0.13401979207992554, + "learning_rate": 0.0001, + "loss": 0.2226, + "step": 400 + }, + { + "epoch": 0.6537332898597978, + "grad_norm": 0.1150553822517395, + "learning_rate": 0.0001, + "loss": 0.2307, + "step": 401 + }, + { + "epoch": 0.6553635474404956, + "grad_norm": 0.12015953660011292, + "learning_rate": 0.0001, + "loss": 0.2107, + "step": 402 + }, + { + "epoch": 0.6569938050211933, + "grad_norm": 0.10118319094181061, + "learning_rate": 0.0001, + "loss": 0.2045, + "step": 403 + }, + { + "epoch": 0.6586240626018911, + "grad_norm": 0.12295553833246231, + "learning_rate": 0.0001, + "loss": 0.2206, + "step": 404 + }, + { + "epoch": 0.6602543201825889, + "grad_norm": 0.11807361245155334, + "learning_rate": 0.0001, + "loss": 0.2197, + "step": 405 + }, + { + "epoch": 0.6618845777632866, + "grad_norm": 0.12740248441696167, + "learning_rate": 0.0001, + "loss": 0.2172, + "step": 406 + }, + { + "epoch": 0.6635148353439844, + "grad_norm": 0.12042457610368729, + "learning_rate": 0.0001, + "loss": 0.2225, + "step": 407 + }, + { + "epoch": 0.665145092924682, + "grad_norm": 0.17038275301456451, + "learning_rate": 0.0001, + "loss": 0.233, + "step": 408 + }, + { + "epoch": 0.6667753505053798, + "grad_norm": 0.11151735484600067, + "learning_rate": 0.0001, + "loss": 0.2013, + "step": 409 + }, + { + "epoch": 0.6684056080860776, + "grad_norm": 0.12992088496685028, + "learning_rate": 0.0001, + "loss": 0.2128, + "step": 410 + }, + { + "epoch": 0.6700358656667753, + "grad_norm": 0.10836778581142426, + "learning_rate": 0.0001, + "loss": 0.2187, + "step": 411 + }, + { + "epoch": 0.6716661232474731, + "grad_norm": 0.128456249833107, + "learning_rate": 0.0001, + "loss": 0.2078, + "step": 412 + }, + { + "epoch": 0.6732963808281709, + "grad_norm": 0.12796828150749207, + "learning_rate": 0.0001, + "loss": 0.2222, + "step": 413 + }, + { + "epoch": 0.6749266384088686, + "grad_norm": 0.11841002106666565, + "learning_rate": 0.0001, + "loss": 0.2127, + "step": 414 + }, + { + "epoch": 0.6765568959895664, + "grad_norm": 0.11400352418422699, + "learning_rate": 0.0001, + "loss": 0.2004, + "step": 415 + }, + { + "epoch": 0.6781871535702642, + "grad_norm": 0.1325376331806183, + "learning_rate": 0.0001, + "loss": 0.218, + "step": 416 + }, + { + "epoch": 0.6798174111509618, + "grad_norm": 0.128141388297081, + "learning_rate": 0.0001, + "loss": 0.2095, + "step": 417 + }, + { + "epoch": 0.6814476687316596, + "grad_norm": 0.1291760355234146, + "learning_rate": 0.0001, + "loss": 0.2231, + "step": 418 + }, + { + "epoch": 0.6830779263123573, + "grad_norm": 0.12524424493312836, + "learning_rate": 0.0001, + "loss": 0.2143, + "step": 419 + }, + { + "epoch": 0.6847081838930551, + "grad_norm": 0.128330260515213, + "learning_rate": 0.0001, + "loss": 0.199, + "step": 420 + }, + { + "epoch": 0.6863384414737529, + "grad_norm": 0.12733139097690582, + "learning_rate": 0.0001, + "loss": 0.2145, + "step": 421 + }, + { + "epoch": 0.6879686990544506, + "grad_norm": 0.10170670598745346, + "learning_rate": 0.0001, + "loss": 0.2053, + "step": 422 + }, + { + "epoch": 0.6895989566351484, + "grad_norm": 0.11686375737190247, + "learning_rate": 0.0001, + "loss": 0.2193, + "step": 423 + }, + { + "epoch": 0.6912292142158462, + "grad_norm": 0.1116994172334671, + "learning_rate": 0.0001, + "loss": 0.22, + "step": 424 + }, + { + "epoch": 0.6928594717965438, + "grad_norm": 0.10491323471069336, + "learning_rate": 0.0001, + "loss": 0.2127, + "step": 425 + }, + { + "epoch": 0.6944897293772416, + "grad_norm": 0.10770369321107864, + "learning_rate": 0.0001, + "loss": 0.2046, + "step": 426 + }, + { + "epoch": 0.6961199869579393, + "grad_norm": 0.12084191292524338, + "learning_rate": 0.0001, + "loss": 0.2174, + "step": 427 + }, + { + "epoch": 0.6977502445386371, + "grad_norm": 0.12580984830856323, + "learning_rate": 0.0001, + "loss": 0.2173, + "step": 428 + }, + { + "epoch": 0.6993805021193349, + "grad_norm": 0.1020190566778183, + "learning_rate": 0.0001, + "loss": 0.213, + "step": 429 + }, + { + "epoch": 0.7010107597000326, + "grad_norm": 0.10437814146280289, + "learning_rate": 0.0001, + "loss": 0.2187, + "step": 430 + }, + { + "epoch": 0.7026410172807304, + "grad_norm": 0.13456971943378448, + "learning_rate": 0.0001, + "loss": 0.2153, + "step": 431 + }, + { + "epoch": 0.7042712748614282, + "grad_norm": 0.12021128088235855, + "learning_rate": 0.0001, + "loss": 0.2146, + "step": 432 + }, + { + "epoch": 0.7059015324421258, + "grad_norm": 0.15099307894706726, + "learning_rate": 0.0001, + "loss": 0.2154, + "step": 433 + }, + { + "epoch": 0.7075317900228236, + "grad_norm": 0.11958514899015427, + "learning_rate": 0.0001, + "loss": 0.216, + "step": 434 + }, + { + "epoch": 0.7091620476035213, + "grad_norm": 0.11153008043766022, + "learning_rate": 0.0001, + "loss": 0.2034, + "step": 435 + }, + { + "epoch": 0.7107923051842191, + "grad_norm": 0.14314942061901093, + "learning_rate": 0.0001, + "loss": 0.2005, + "step": 436 + }, + { + "epoch": 0.7124225627649169, + "grad_norm": 0.10860710591077805, + "learning_rate": 0.0001, + "loss": 0.21, + "step": 437 + }, + { + "epoch": 0.7140528203456146, + "grad_norm": 0.1300295740365982, + "learning_rate": 0.0001, + "loss": 0.2148, + "step": 438 + }, + { + "epoch": 0.7156830779263124, + "grad_norm": 0.11765281856060028, + "learning_rate": 0.0001, + "loss": 0.2144, + "step": 439 + }, + { + "epoch": 0.7173133355070102, + "grad_norm": 0.1349002867937088, + "learning_rate": 0.0001, + "loss": 0.2079, + "step": 440 + }, + { + "epoch": 0.7189435930877078, + "grad_norm": 0.15499438345432281, + "learning_rate": 0.0001, + "loss": 0.2223, + "step": 441 + }, + { + "epoch": 0.7205738506684056, + "grad_norm": 0.15518735349178314, + "learning_rate": 0.0001, + "loss": 0.2317, + "step": 442 + }, + { + "epoch": 0.7222041082491033, + "grad_norm": 0.16259780526161194, + "learning_rate": 0.0001, + "loss": 0.2194, + "step": 443 + }, + { + "epoch": 0.7238343658298011, + "grad_norm": 0.13394394516944885, + "learning_rate": 0.0001, + "loss": 0.2175, + "step": 444 + }, + { + "epoch": 0.7254646234104989, + "grad_norm": 0.1212131530046463, + "learning_rate": 0.0001, + "loss": 0.2172, + "step": 445 + }, + { + "epoch": 0.7270948809911966, + "grad_norm": 0.1333845555782318, + "learning_rate": 0.0001, + "loss": 0.2125, + "step": 446 + }, + { + "epoch": 0.7287251385718944, + "grad_norm": 0.1470658779144287, + "learning_rate": 0.0001, + "loss": 0.2263, + "step": 447 + }, + { + "epoch": 0.7303553961525922, + "grad_norm": 0.11653272062540054, + "learning_rate": 0.0001, + "loss": 0.2203, + "step": 448 + }, + { + "epoch": 0.7319856537332898, + "grad_norm": 0.11255240440368652, + "learning_rate": 0.0001, + "loss": 0.2063, + "step": 449 + }, + { + "epoch": 0.7336159113139876, + "grad_norm": 0.1200360506772995, + "learning_rate": 0.0001, + "loss": 0.217, + "step": 450 + }, + { + "epoch": 0.7352461688946854, + "grad_norm": 0.12475258111953735, + "learning_rate": 0.0001, + "loss": 0.2232, + "step": 451 + }, + { + "epoch": 0.7368764264753831, + "grad_norm": 0.11277184635400772, + "learning_rate": 0.0001, + "loss": 0.2314, + "step": 452 + }, + { + "epoch": 0.7385066840560809, + "grad_norm": 0.11270298063755035, + "learning_rate": 0.0001, + "loss": 0.2112, + "step": 453 + }, + { + "epoch": 0.7401369416367786, + "grad_norm": 0.12822334468364716, + "learning_rate": 0.0001, + "loss": 0.2233, + "step": 454 + }, + { + "epoch": 0.7417671992174764, + "grad_norm": 0.11773821711540222, + "learning_rate": 0.0001, + "loss": 0.2213, + "step": 455 + }, + { + "epoch": 0.7433974567981741, + "grad_norm": 0.12635211646556854, + "learning_rate": 0.0001, + "loss": 0.2001, + "step": 456 + }, + { + "epoch": 0.7450277143788718, + "grad_norm": 0.15887145698070526, + "learning_rate": 0.0001, + "loss": 0.2276, + "step": 457 + }, + { + "epoch": 0.7466579719595696, + "grad_norm": 0.12271532416343689, + "learning_rate": 0.0001, + "loss": 0.219, + "step": 458 + }, + { + "epoch": 0.7482882295402674, + "grad_norm": 0.14966937899589539, + "learning_rate": 0.0001, + "loss": 0.2119, + "step": 459 + }, + { + "epoch": 0.7499184871209651, + "grad_norm": 0.15153633058071136, + "learning_rate": 0.0001, + "loss": 0.2316, + "step": 460 + }, + { + "epoch": 0.7515487447016629, + "grad_norm": 0.14428603649139404, + "learning_rate": 0.0001, + "loss": 0.2197, + "step": 461 + }, + { + "epoch": 0.7531790022823606, + "grad_norm": 0.12772101163864136, + "learning_rate": 0.0001, + "loss": 0.2159, + "step": 462 + }, + { + "epoch": 0.7548092598630584, + "grad_norm": 0.12549546360969543, + "learning_rate": 0.0001, + "loss": 0.2176, + "step": 463 + }, + { + "epoch": 0.7564395174437561, + "grad_norm": 0.13936737179756165, + "learning_rate": 0.0001, + "loss": 0.2187, + "step": 464 + }, + { + "epoch": 0.7580697750244538, + "grad_norm": 0.11733359098434448, + "learning_rate": 0.0001, + "loss": 0.2013, + "step": 465 + }, + { + "epoch": 0.7597000326051516, + "grad_norm": 0.14219063520431519, + "learning_rate": 0.0001, + "loss": 0.2263, + "step": 466 + }, + { + "epoch": 0.7613302901858494, + "grad_norm": 0.11690539866685867, + "learning_rate": 0.0001, + "loss": 0.2108, + "step": 467 + }, + { + "epoch": 0.7629605477665471, + "grad_norm": 0.13221612572669983, + "learning_rate": 0.0001, + "loss": 0.2051, + "step": 468 + }, + { + "epoch": 0.7645908053472449, + "grad_norm": 0.14684675633907318, + "learning_rate": 0.0001, + "loss": 0.2168, + "step": 469 + }, + { + "epoch": 0.7662210629279426, + "grad_norm": 0.1189989224076271, + "learning_rate": 0.0001, + "loss": 0.2204, + "step": 470 + }, + { + "epoch": 0.7678513205086404, + "grad_norm": 0.12807010114192963, + "learning_rate": 0.0001, + "loss": 0.2285, + "step": 471 + }, + { + "epoch": 0.7694815780893381, + "grad_norm": 0.13275974988937378, + "learning_rate": 0.0001, + "loss": 0.219, + "step": 472 + }, + { + "epoch": 0.7711118356700358, + "grad_norm": 0.1249246746301651, + "learning_rate": 0.0001, + "loss": 0.2218, + "step": 473 + }, + { + "epoch": 0.7727420932507336, + "grad_norm": 0.13339447975158691, + "learning_rate": 0.0001, + "loss": 0.2068, + "step": 474 + }, + { + "epoch": 0.7743723508314314, + "grad_norm": 0.12550900876522064, + "learning_rate": 0.0001, + "loss": 0.2103, + "step": 475 + }, + { + "epoch": 0.7760026084121291, + "grad_norm": 0.11490708589553833, + "learning_rate": 0.0001, + "loss": 0.2033, + "step": 476 + }, + { + "epoch": 0.7776328659928269, + "grad_norm": 0.12597325444221497, + "learning_rate": 0.0001, + "loss": 0.2135, + "step": 477 + }, + { + "epoch": 0.7792631235735246, + "grad_norm": 0.15443576872348785, + "learning_rate": 0.0001, + "loss": 0.2133, + "step": 478 + }, + { + "epoch": 0.7808933811542224, + "grad_norm": 0.19461177289485931, + "learning_rate": 0.0001, + "loss": 0.1955, + "step": 479 + }, + { + "epoch": 0.7825236387349201, + "grad_norm": 0.1330460160970688, + "learning_rate": 0.0001, + "loss": 0.1979, + "step": 480 + }, + { + "epoch": 0.7841538963156178, + "grad_norm": 0.11665869504213333, + "learning_rate": 0.0001, + "loss": 0.2149, + "step": 481 + }, + { + "epoch": 0.7857841538963156, + "grad_norm": 0.09942878782749176, + "learning_rate": 0.0001, + "loss": 0.2009, + "step": 482 + }, + { + "epoch": 0.7874144114770134, + "grad_norm": 0.11635838449001312, + "learning_rate": 0.0001, + "loss": 0.2046, + "step": 483 + }, + { + "epoch": 0.7890446690577111, + "grad_norm": 0.11658889055252075, + "learning_rate": 0.0001, + "loss": 0.2154, + "step": 484 + }, + { + "epoch": 0.7906749266384089, + "grad_norm": 0.122380830347538, + "learning_rate": 0.0001, + "loss": 0.2003, + "step": 485 + }, + { + "epoch": 0.7923051842191067, + "grad_norm": 0.11796704679727554, + "learning_rate": 0.0001, + "loss": 0.1966, + "step": 486 + }, + { + "epoch": 0.7939354417998044, + "grad_norm": 0.10663303732872009, + "learning_rate": 0.0001, + "loss": 0.2166, + "step": 487 + }, + { + "epoch": 0.7955656993805021, + "grad_norm": 0.11099519580602646, + "learning_rate": 0.0001, + "loss": 0.209, + "step": 488 + }, + { + "epoch": 0.7971959569611998, + "grad_norm": 0.14217236638069153, + "learning_rate": 0.0001, + "loss": 0.1955, + "step": 489 + }, + { + "epoch": 0.7988262145418976, + "grad_norm": 0.10185975581407547, + "learning_rate": 0.0001, + "loss": 0.194, + "step": 490 + }, + { + "epoch": 0.8004564721225954, + "grad_norm": 0.11912322789430618, + "learning_rate": 0.0001, + "loss": 0.1987, + "step": 491 + }, + { + "epoch": 0.8020867297032931, + "grad_norm": 0.11036073416471481, + "learning_rate": 0.0001, + "loss": 0.199, + "step": 492 + }, + { + "epoch": 0.8037169872839909, + "grad_norm": 0.13446538150310516, + "learning_rate": 0.0001, + "loss": 0.214, + "step": 493 + }, + { + "epoch": 0.8053472448646887, + "grad_norm": 0.12443588674068451, + "learning_rate": 0.0001, + "loss": 0.2067, + "step": 494 + }, + { + "epoch": 0.8069775024453864, + "grad_norm": 0.12884607911109924, + "learning_rate": 0.0001, + "loss": 0.2093, + "step": 495 + }, + { + "epoch": 0.8086077600260841, + "grad_norm": 0.1282474249601364, + "learning_rate": 0.0001, + "loss": 0.1896, + "step": 496 + }, + { + "epoch": 0.8102380176067818, + "grad_norm": 0.15667201578617096, + "learning_rate": 0.0001, + "loss": 0.2275, + "step": 497 + }, + { + "epoch": 0.8118682751874796, + "grad_norm": 0.12083553522825241, + "learning_rate": 0.0001, + "loss": 0.2061, + "step": 498 + }, + { + "epoch": 0.8134985327681774, + "grad_norm": 0.1412494033575058, + "learning_rate": 0.0001, + "loss": 0.2288, + "step": 499 + }, + { + "epoch": 0.8151287903488751, + "grad_norm": 0.11393098533153534, + "learning_rate": 0.0001, + "loss": 0.1828, + "step": 500 + }, + { + "epoch": 0.8151287903488751, + "eval_loss": 0.21177859604358673, + "eval_runtime": 2800.3476, + "eval_samples_per_second": 0.674, + "eval_steps_per_second": 0.169, + "step": 500 + }, + { + "epoch": 0.8167590479295729, + "grad_norm": 0.14163638651371002, + "learning_rate": 0.0001, + "loss": 0.2251, + "step": 501 + }, + { + "epoch": 0.8183893055102707, + "grad_norm": 0.14434993267059326, + "learning_rate": 0.0001, + "loss": 0.2147, + "step": 502 + }, + { + "epoch": 0.8200195630909684, + "grad_norm": 0.11927150934934616, + "learning_rate": 0.0001, + "loss": 0.1887, + "step": 503 + }, + { + "epoch": 0.8216498206716661, + "grad_norm": 0.12366633117198944, + "learning_rate": 0.0001, + "loss": 0.1949, + "step": 504 + }, + { + "epoch": 0.8232800782523638, + "grad_norm": 0.1395760029554367, + "learning_rate": 0.0001, + "loss": 0.1998, + "step": 505 + }, + { + "epoch": 0.8249103358330616, + "grad_norm": 0.1172211617231369, + "learning_rate": 0.0001, + "loss": 0.196, + "step": 506 + }, + { + "epoch": 0.8265405934137594, + "grad_norm": 0.14511209726333618, + "learning_rate": 0.0001, + "loss": 0.2301, + "step": 507 + }, + { + "epoch": 0.8281708509944571, + "grad_norm": 0.17221054434776306, + "learning_rate": 0.0001, + "loss": 0.2243, + "step": 508 + }, + { + "epoch": 0.8298011085751549, + "grad_norm": 0.1269037127494812, + "learning_rate": 0.0001, + "loss": 0.2086, + "step": 509 + }, + { + "epoch": 0.8314313661558527, + "grad_norm": 0.13336747884750366, + "learning_rate": 0.0001, + "loss": 0.2085, + "step": 510 + }, + { + "epoch": 0.8330616237365503, + "grad_norm": 0.13246330618858337, + "learning_rate": 0.0001, + "loss": 0.2264, + "step": 511 + }, + { + "epoch": 0.8346918813172481, + "grad_norm": 0.11336122453212738, + "learning_rate": 0.0001, + "loss": 0.204, + "step": 512 + }, + { + "epoch": 0.8363221388979458, + "grad_norm": 0.12755842506885529, + "learning_rate": 0.0001, + "loss": 0.2088, + "step": 513 + }, + { + "epoch": 0.8379523964786436, + "grad_norm": 0.11611583828926086, + "learning_rate": 0.0001, + "loss": 0.2037, + "step": 514 + }, + { + "epoch": 0.8395826540593414, + "grad_norm": 0.09402882307767868, + "learning_rate": 0.0001, + "loss": 0.1939, + "step": 515 + }, + { + "epoch": 0.8412129116400391, + "grad_norm": 0.12100663781166077, + "learning_rate": 0.0001, + "loss": 0.2104, + "step": 516 + }, + { + "epoch": 0.8428431692207369, + "grad_norm": 0.14289285242557526, + "learning_rate": 0.0001, + "loss": 0.2216, + "step": 517 + }, + { + "epoch": 0.8444734268014347, + "grad_norm": 0.10300295054912567, + "learning_rate": 0.0001, + "loss": 0.1796, + "step": 518 + }, + { + "epoch": 0.8461036843821323, + "grad_norm": 0.12143992632627487, + "learning_rate": 0.0001, + "loss": 0.2177, + "step": 519 + }, + { + "epoch": 0.8477339419628301, + "grad_norm": 0.12010245770215988, + "learning_rate": 0.0001, + "loss": 0.2016, + "step": 520 + }, + { + "epoch": 0.8493641995435279, + "grad_norm": 0.1474478542804718, + "learning_rate": 0.0001, + "loss": 0.2062, + "step": 521 + }, + { + "epoch": 0.8509944571242256, + "grad_norm": 0.12784548103809357, + "learning_rate": 0.0001, + "loss": 0.2317, + "step": 522 + }, + { + "epoch": 0.8526247147049234, + "grad_norm": 0.11985955387353897, + "learning_rate": 0.0001, + "loss": 0.2196, + "step": 523 + }, + { + "epoch": 0.8542549722856211, + "grad_norm": 0.11471915990114212, + "learning_rate": 0.0001, + "loss": 0.2082, + "step": 524 + }, + { + "epoch": 0.8558852298663189, + "grad_norm": 0.135053813457489, + "learning_rate": 0.0001, + "loss": 0.215, + "step": 525 + }, + { + "epoch": 0.8575154874470167, + "grad_norm": 0.10875121504068375, + "learning_rate": 0.0001, + "loss": 0.2143, + "step": 526 + }, + { + "epoch": 0.8591457450277143, + "grad_norm": 0.11824619024991989, + "learning_rate": 0.0001, + "loss": 0.2276, + "step": 527 + }, + { + "epoch": 0.8607760026084121, + "grad_norm": 0.10967472940683365, + "learning_rate": 0.0001, + "loss": 0.1933, + "step": 528 + }, + { + "epoch": 0.8624062601891099, + "grad_norm": 0.09989364445209503, + "learning_rate": 0.0001, + "loss": 0.2196, + "step": 529 + }, + { + "epoch": 0.8640365177698076, + "grad_norm": 0.10137049108743668, + "learning_rate": 0.0001, + "loss": 0.1993, + "step": 530 + }, + { + "epoch": 0.8656667753505054, + "grad_norm": 0.13142383098602295, + "learning_rate": 0.0001, + "loss": 0.2221, + "step": 531 + }, + { + "epoch": 0.8672970329312031, + "grad_norm": 0.1282247006893158, + "learning_rate": 0.0001, + "loss": 0.2167, + "step": 532 + }, + { + "epoch": 0.8689272905119009, + "grad_norm": 0.10532195121049881, + "learning_rate": 0.0001, + "loss": 0.2143, + "step": 533 + }, + { + "epoch": 0.8705575480925987, + "grad_norm": 0.10713239759206772, + "learning_rate": 0.0001, + "loss": 0.2045, + "step": 534 + }, + { + "epoch": 0.8721878056732963, + "grad_norm": 0.12712952494621277, + "learning_rate": 0.0001, + "loss": 0.2264, + "step": 535 + }, + { + "epoch": 0.8738180632539941, + "grad_norm": 0.12850171327590942, + "learning_rate": 0.0001, + "loss": 0.2284, + "step": 536 + }, + { + "epoch": 0.8754483208346919, + "grad_norm": 0.12360873818397522, + "learning_rate": 0.0001, + "loss": 0.197, + "step": 537 + }, + { + "epoch": 0.8770785784153896, + "grad_norm": 0.12052212655544281, + "learning_rate": 0.0001, + "loss": 0.1906, + "step": 538 + }, + { + "epoch": 0.8787088359960874, + "grad_norm": 0.14144307374954224, + "learning_rate": 0.0001, + "loss": 0.2131, + "step": 539 + }, + { + "epoch": 0.8803390935767851, + "grad_norm": 0.11279332637786865, + "learning_rate": 0.0001, + "loss": 0.2145, + "step": 540 + }, + { + "epoch": 0.8819693511574829, + "grad_norm": 0.161105215549469, + "learning_rate": 0.0001, + "loss": 0.2212, + "step": 541 + }, + { + "epoch": 0.8835996087381807, + "grad_norm": 0.11746472120285034, + "learning_rate": 0.0001, + "loss": 0.213, + "step": 542 + }, + { + "epoch": 0.8852298663188783, + "grad_norm": 0.1244741752743721, + "learning_rate": 0.0001, + "loss": 0.2065, + "step": 543 + }, + { + "epoch": 0.8868601238995761, + "grad_norm": 0.1004570797085762, + "learning_rate": 0.0001, + "loss": 0.2086, + "step": 544 + }, + { + "epoch": 0.8884903814802739, + "grad_norm": 0.10560411214828491, + "learning_rate": 0.0001, + "loss": 0.204, + "step": 545 + }, + { + "epoch": 0.8901206390609716, + "grad_norm": 0.11466600000858307, + "learning_rate": 0.0001, + "loss": 0.2012, + "step": 546 + }, + { + "epoch": 0.8917508966416694, + "grad_norm": 0.11946078389883041, + "learning_rate": 0.0001, + "loss": 0.21, + "step": 547 + }, + { + "epoch": 0.8933811542223671, + "grad_norm": 0.13734155893325806, + "learning_rate": 0.0001, + "loss": 0.2166, + "step": 548 + }, + { + "epoch": 0.8950114118030649, + "grad_norm": 0.11657248437404633, + "learning_rate": 0.0001, + "loss": 0.2106, + "step": 549 + }, + { + "epoch": 0.8966416693837627, + "grad_norm": 0.1141151636838913, + "learning_rate": 0.0001, + "loss": 0.1827, + "step": 550 + }, + { + "epoch": 0.8982719269644603, + "grad_norm": 0.1335282176733017, + "learning_rate": 0.0001, + "loss": 0.2069, + "step": 551 + }, + { + "epoch": 0.8999021845451581, + "grad_norm": 0.12621258199214935, + "learning_rate": 0.0001, + "loss": 0.1985, + "step": 552 + }, + { + "epoch": 0.9015324421258559, + "grad_norm": 0.13729962706565857, + "learning_rate": 0.0001, + "loss": 0.2068, + "step": 553 + }, + { + "epoch": 0.9031626997065536, + "grad_norm": 0.11198980361223221, + "learning_rate": 0.0001, + "loss": 0.2078, + "step": 554 + }, + { + "epoch": 0.9047929572872514, + "grad_norm": 0.12503336369991302, + "learning_rate": 0.0001, + "loss": 0.2077, + "step": 555 + }, + { + "epoch": 0.9064232148679492, + "grad_norm": 0.14169982075691223, + "learning_rate": 0.0001, + "loss": 0.2052, + "step": 556 + }, + { + "epoch": 0.9080534724486469, + "grad_norm": 0.10698267072439194, + "learning_rate": 0.0001, + "loss": 0.1958, + "step": 557 + }, + { + "epoch": 0.9096837300293447, + "grad_norm": 0.1422925740480423, + "learning_rate": 0.0001, + "loss": 0.1995, + "step": 558 + }, + { + "epoch": 0.9113139876100423, + "grad_norm": 0.11455567926168442, + "learning_rate": 0.0001, + "loss": 0.1998, + "step": 559 + }, + { + "epoch": 0.9129442451907401, + "grad_norm": 0.1599031537771225, + "learning_rate": 0.0001, + "loss": 0.2113, + "step": 560 + }, + { + "epoch": 0.9145745027714379, + "grad_norm": 0.13150307536125183, + "learning_rate": 0.0001, + "loss": 0.217, + "step": 561 + }, + { + "epoch": 0.9162047603521356, + "grad_norm": 0.10883089154958725, + "learning_rate": 0.0001, + "loss": 0.1914, + "step": 562 + }, + { + "epoch": 0.9178350179328334, + "grad_norm": 0.12056294828653336, + "learning_rate": 0.0001, + "loss": 0.2008, + "step": 563 + }, + { + "epoch": 0.9194652755135312, + "grad_norm": 0.13235759735107422, + "learning_rate": 0.0001, + "loss": 0.2019, + "step": 564 + }, + { + "epoch": 0.9210955330942289, + "grad_norm": 0.11871761828660965, + "learning_rate": 0.0001, + "loss": 0.2075, + "step": 565 + }, + { + "epoch": 0.9227257906749267, + "grad_norm": 0.13773424923419952, + "learning_rate": 0.0001, + "loss": 0.2157, + "step": 566 + }, + { + "epoch": 0.9243560482556243, + "grad_norm": 0.12058678269386292, + "learning_rate": 0.0001, + "loss": 0.1872, + "step": 567 + }, + { + "epoch": 0.9259863058363221, + "grad_norm": 0.12389074265956879, + "learning_rate": 0.0001, + "loss": 0.2092, + "step": 568 + }, + { + "epoch": 0.9276165634170199, + "grad_norm": 0.13207301497459412, + "learning_rate": 0.0001, + "loss": 0.2111, + "step": 569 + }, + { + "epoch": 0.9292468209977176, + "grad_norm": 0.12088489532470703, + "learning_rate": 0.0001, + "loss": 0.1886, + "step": 570 + }, + { + "epoch": 0.9308770785784154, + "grad_norm": 0.1485392153263092, + "learning_rate": 0.0001, + "loss": 0.2148, + "step": 571 + }, + { + "epoch": 0.9325073361591132, + "grad_norm": 0.15602104365825653, + "learning_rate": 0.0001, + "loss": 0.2115, + "step": 572 + }, + { + "epoch": 0.9341375937398109, + "grad_norm": 0.1451883614063263, + "learning_rate": 0.0001, + "loss": 0.2147, + "step": 573 + }, + { + "epoch": 0.9357678513205087, + "grad_norm": 0.15947164595127106, + "learning_rate": 0.0001, + "loss": 0.2026, + "step": 574 + }, + { + "epoch": 0.9373981089012063, + "grad_norm": 0.15256251394748688, + "learning_rate": 0.0001, + "loss": 0.209, + "step": 575 + }, + { + "epoch": 0.9390283664819041, + "grad_norm": 0.1322648674249649, + "learning_rate": 0.0001, + "loss": 0.198, + "step": 576 + }, + { + "epoch": 0.9406586240626019, + "grad_norm": 0.11925152689218521, + "learning_rate": 0.0001, + "loss": 0.2055, + "step": 577 + }, + { + "epoch": 0.9422888816432996, + "grad_norm": 0.12666535377502441, + "learning_rate": 0.0001, + "loss": 0.2063, + "step": 578 + }, + { + "epoch": 0.9439191392239974, + "grad_norm": 0.10876161605119705, + "learning_rate": 0.0001, + "loss": 0.2113, + "step": 579 + }, + { + "epoch": 0.9455493968046952, + "grad_norm": 0.12513647973537445, + "learning_rate": 0.0001, + "loss": 0.2093, + "step": 580 + }, + { + "epoch": 0.9471796543853929, + "grad_norm": 0.13541190326213837, + "learning_rate": 0.0001, + "loss": 0.2048, + "step": 581 + }, + { + "epoch": 0.9488099119660907, + "grad_norm": 0.12051568925380707, + "learning_rate": 0.0001, + "loss": 0.2205, + "step": 582 + }, + { + "epoch": 0.9504401695467883, + "grad_norm": 0.11748453974723816, + "learning_rate": 0.0001, + "loss": 0.211, + "step": 583 + }, + { + "epoch": 0.9520704271274861, + "grad_norm": 0.11520763486623764, + "learning_rate": 0.0001, + "loss": 0.2033, + "step": 584 + }, + { + "epoch": 0.9537006847081839, + "grad_norm": 0.11155470460653305, + "learning_rate": 0.0001, + "loss": 0.2004, + "step": 585 + }, + { + "epoch": 0.9553309422888816, + "grad_norm": 0.12134213000535965, + "learning_rate": 0.0001, + "loss": 0.2137, + "step": 586 + }, + { + "epoch": 0.9569611998695794, + "grad_norm": 0.1297822743654251, + "learning_rate": 0.0001, + "loss": 0.2052, + "step": 587 + }, + { + "epoch": 0.9585914574502772, + "grad_norm": 0.12923404574394226, + "learning_rate": 0.0001, + "loss": 0.209, + "step": 588 + }, + { + "epoch": 0.9602217150309749, + "grad_norm": 0.20898625254631042, + "learning_rate": 0.0001, + "loss": 0.2047, + "step": 589 + }, + { + "epoch": 0.9618519726116727, + "grad_norm": 0.12227951735258102, + "learning_rate": 0.0001, + "loss": 0.2107, + "step": 590 + }, + { + "epoch": 0.9634822301923704, + "grad_norm": 0.12081098556518555, + "learning_rate": 0.0001, + "loss": 0.2093, + "step": 591 + }, + { + "epoch": 0.9651124877730681, + "grad_norm": 0.13986457884311676, + "learning_rate": 0.0001, + "loss": 0.2161, + "step": 592 + }, + { + "epoch": 0.9667427453537659, + "grad_norm": 0.12605050206184387, + "learning_rate": 0.0001, + "loss": 0.2153, + "step": 593 + }, + { + "epoch": 0.9683730029344636, + "grad_norm": 0.11643191426992416, + "learning_rate": 0.0001, + "loss": 0.2062, + "step": 594 + }, + { + "epoch": 0.9700032605151614, + "grad_norm": 0.12521113455295563, + "learning_rate": 0.0001, + "loss": 0.2121, + "step": 595 + }, + { + "epoch": 0.9716335180958592, + "grad_norm": 0.11723621189594269, + "learning_rate": 0.0001, + "loss": 0.2053, + "step": 596 + }, + { + "epoch": 0.9732637756765569, + "grad_norm": 0.12405609339475632, + "learning_rate": 0.0001, + "loss": 0.2104, + "step": 597 + }, + { + "epoch": 0.9748940332572547, + "grad_norm": 0.10823733359575272, + "learning_rate": 0.0001, + "loss": 0.2174, + "step": 598 + }, + { + "epoch": 0.9765242908379524, + "grad_norm": 0.11131487786769867, + "learning_rate": 0.0001, + "loss": 0.213, + "step": 599 + }, + { + "epoch": 0.9781545484186501, + "grad_norm": 0.13913866877555847, + "learning_rate": 0.0001, + "loss": 0.2086, + "step": 600 + }, + { + "epoch": 0.9797848059993479, + "grad_norm": 0.11523352563381195, + "learning_rate": 0.0001, + "loss": 0.2103, + "step": 601 + }, + { + "epoch": 0.9814150635800456, + "grad_norm": 0.1258571743965149, + "learning_rate": 0.0001, + "loss": 0.2082, + "step": 602 + }, + { + "epoch": 0.9830453211607434, + "grad_norm": 0.14156940579414368, + "learning_rate": 0.0001, + "loss": 0.2005, + "step": 603 + }, + { + "epoch": 0.9846755787414412, + "grad_norm": 0.1144745722413063, + "learning_rate": 0.0001, + "loss": 0.218, + "step": 604 + }, + { + "epoch": 0.9863058363221389, + "grad_norm": 0.12323161959648132, + "learning_rate": 0.0001, + "loss": 0.2007, + "step": 605 + }, + { + "epoch": 0.9879360939028367, + "grad_norm": 0.1369849145412445, + "learning_rate": 0.0001, + "loss": 0.2187, + "step": 606 + }, + { + "epoch": 0.9895663514835344, + "grad_norm": 0.13043704628944397, + "learning_rate": 0.0001, + "loss": 0.2111, + "step": 607 + }, + { + "epoch": 0.9911966090642321, + "grad_norm": 0.11909907311201096, + "learning_rate": 0.0001, + "loss": 0.2054, + "step": 608 + }, + { + "epoch": 0.9928268666449299, + "grad_norm": 0.12159440666437149, + "learning_rate": 0.0001, + "loss": 0.2168, + "step": 609 + }, + { + "epoch": 0.9944571242256276, + "grad_norm": 0.11479892581701279, + "learning_rate": 0.0001, + "loss": 0.2116, + "step": 610 + }, + { + "epoch": 0.9960873818063254, + "grad_norm": 0.12101448327302933, + "learning_rate": 0.0001, + "loss": 0.2054, + "step": 611 + }, + { + "epoch": 0.9977176393870232, + "grad_norm": 0.1083630621433258, + "learning_rate": 0.0001, + "loss": 0.1981, + "step": 612 + }, + { + "epoch": 0.9993478969677209, + "grad_norm": 0.09980759769678116, + "learning_rate": 0.0001, + "loss": 0.2045, + "step": 613 + }, + { + "epoch": 1.0009781545484187, + "grad_norm": 0.1311112940311432, + "learning_rate": 0.0001, + "loss": 0.2099, + "step": 614 + }, + { + "epoch": 1.0026084121291163, + "grad_norm": 0.12701699137687683, + "learning_rate": 0.0001, + "loss": 0.1989, + "step": 615 + }, + { + "epoch": 1.0042386697098142, + "grad_norm": 0.10910508036613464, + "learning_rate": 0.0001, + "loss": 0.2063, + "step": 616 + }, + { + "epoch": 1.005868927290512, + "grad_norm": 0.11964251101016998, + "learning_rate": 0.0001, + "loss": 0.2046, + "step": 617 + }, + { + "epoch": 1.0074991848712096, + "grad_norm": 0.11345476657152176, + "learning_rate": 0.0001, + "loss": 0.1933, + "step": 618 + }, + { + "epoch": 1.0091294424519075, + "grad_norm": 0.11260963976383209, + "learning_rate": 0.0001, + "loss": 0.2019, + "step": 619 + }, + { + "epoch": 1.0107597000326052, + "grad_norm": 0.13378599286079407, + "learning_rate": 0.0001, + "loss": 0.2074, + "step": 620 + }, + { + "epoch": 1.0123899576133029, + "grad_norm": 0.10661476850509644, + "learning_rate": 0.0001, + "loss": 0.1972, + "step": 621 + }, + { + "epoch": 1.0140202151940008, + "grad_norm": 0.12620778381824493, + "learning_rate": 0.0001, + "loss": 0.2111, + "step": 622 + }, + { + "epoch": 1.0156504727746984, + "grad_norm": 0.12133750319480896, + "learning_rate": 0.0001, + "loss": 0.2022, + "step": 623 + }, + { + "epoch": 1.0172807303553961, + "grad_norm": 0.10500383377075195, + "learning_rate": 0.0001, + "loss": 0.1778, + "step": 624 + }, + { + "epoch": 1.0189109879360938, + "grad_norm": 0.12143319845199585, + "learning_rate": 0.0001, + "loss": 0.211, + "step": 625 + }, + { + "epoch": 1.0205412455167917, + "grad_norm": 0.11858654022216797, + "learning_rate": 0.0001, + "loss": 0.1871, + "step": 626 + }, + { + "epoch": 1.0221715030974894, + "grad_norm": 0.12005385756492615, + "learning_rate": 0.0001, + "loss": 0.2077, + "step": 627 + }, + { + "epoch": 1.023801760678187, + "grad_norm": 0.13842667639255524, + "learning_rate": 0.0001, + "loss": 0.2263, + "step": 628 + }, + { + "epoch": 1.025432018258885, + "grad_norm": 0.14517144858837128, + "learning_rate": 0.0001, + "loss": 0.2117, + "step": 629 + }, + { + "epoch": 1.0270622758395827, + "grad_norm": 0.12258317321538925, + "learning_rate": 0.0001, + "loss": 0.1921, + "step": 630 + }, + { + "epoch": 1.0286925334202803, + "grad_norm": 0.10833742469549179, + "learning_rate": 0.0001, + "loss": 0.1868, + "step": 631 + }, + { + "epoch": 1.0303227910009782, + "grad_norm": 0.15066489577293396, + "learning_rate": 0.0001, + "loss": 0.206, + "step": 632 + }, + { + "epoch": 1.031953048581676, + "grad_norm": 0.11644082516431808, + "learning_rate": 0.0001, + "loss": 0.2128, + "step": 633 + }, + { + "epoch": 1.0335833061623736, + "grad_norm": 0.1423221379518509, + "learning_rate": 0.0001, + "loss": 0.1998, + "step": 634 + }, + { + "epoch": 1.0352135637430715, + "grad_norm": 0.12416679412126541, + "learning_rate": 0.0001, + "loss": 0.1915, + "step": 635 + }, + { + "epoch": 1.0368438213237692, + "grad_norm": 0.11678506433963776, + "learning_rate": 0.0001, + "loss": 0.189, + "step": 636 + }, + { + "epoch": 1.0384740789044669, + "grad_norm": 0.12458233535289764, + "learning_rate": 0.0001, + "loss": 0.1911, + "step": 637 + }, + { + "epoch": 1.0401043364851648, + "grad_norm": 0.12417783588171005, + "learning_rate": 0.0001, + "loss": 0.1773, + "step": 638 + }, + { + "epoch": 1.0417345940658624, + "grad_norm": 0.11989506334066391, + "learning_rate": 0.0001, + "loss": 0.1932, + "step": 639 + }, + { + "epoch": 1.0433648516465601, + "grad_norm": 0.1083277091383934, + "learning_rate": 0.0001, + "loss": 0.1952, + "step": 640 + }, + { + "epoch": 1.0449951092272578, + "grad_norm": 0.17112882435321808, + "learning_rate": 0.0001, + "loss": 0.2021, + "step": 641 + }, + { + "epoch": 1.0466253668079557, + "grad_norm": 0.14799998700618744, + "learning_rate": 0.0001, + "loss": 0.2038, + "step": 642 + }, + { + "epoch": 1.0482556243886534, + "grad_norm": 0.14156191051006317, + "learning_rate": 0.0001, + "loss": 0.1908, + "step": 643 + }, + { + "epoch": 1.049885881969351, + "grad_norm": 0.1254645138978958, + "learning_rate": 0.0001, + "loss": 0.2045, + "step": 644 + }, + { + "epoch": 1.051516139550049, + "grad_norm": 0.11081767827272415, + "learning_rate": 0.0001, + "loss": 0.195, + "step": 645 + }, + { + "epoch": 1.0531463971307466, + "grad_norm": 0.11810845881700516, + "learning_rate": 0.0001, + "loss": 0.1939, + "step": 646 + }, + { + "epoch": 1.0547766547114443, + "grad_norm": 0.12629976868629456, + "learning_rate": 0.0001, + "loss": 0.2117, + "step": 647 + }, + { + "epoch": 1.0564069122921422, + "grad_norm": 0.11882951855659485, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 648 + }, + { + "epoch": 1.05803716987284, + "grad_norm": 0.11729996651411057, + "learning_rate": 0.0001, + "loss": 0.1916, + "step": 649 + }, + { + "epoch": 1.0596674274535376, + "grad_norm": 0.11437559127807617, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 650 + }, + { + "epoch": 1.0612976850342355, + "grad_norm": 0.13408319652080536, + "learning_rate": 0.0001, + "loss": 0.2138, + "step": 651 + }, + { + "epoch": 1.0629279426149332, + "grad_norm": 0.14032766222953796, + "learning_rate": 0.0001, + "loss": 0.205, + "step": 652 + }, + { + "epoch": 1.0645582001956309, + "grad_norm": 0.1438027173280716, + "learning_rate": 0.0001, + "loss": 0.1934, + "step": 653 + }, + { + "epoch": 1.0661884577763288, + "grad_norm": 0.11743790656328201, + "learning_rate": 0.0001, + "loss": 0.1944, + "step": 654 + }, + { + "epoch": 1.0678187153570264, + "grad_norm": 0.1314900517463684, + "learning_rate": 0.0001, + "loss": 0.2108, + "step": 655 + }, + { + "epoch": 1.0694489729377241, + "grad_norm": 0.11884745210409164, + "learning_rate": 0.0001, + "loss": 0.1858, + "step": 656 + }, + { + "epoch": 1.0710792305184218, + "grad_norm": 0.10163510590791702, + "learning_rate": 0.0001, + "loss": 0.1984, + "step": 657 + }, + { + "epoch": 1.0727094880991197, + "grad_norm": 0.11893923580646515, + "learning_rate": 0.0001, + "loss": 0.1999, + "step": 658 + }, + { + "epoch": 1.0743397456798174, + "grad_norm": 0.12390507012605667, + "learning_rate": 0.0001, + "loss": 0.2142, + "step": 659 + }, + { + "epoch": 1.075970003260515, + "grad_norm": 0.11425681412220001, + "learning_rate": 0.0001, + "loss": 0.1883, + "step": 660 + }, + { + "epoch": 1.077600260841213, + "grad_norm": 0.11586383730173111, + "learning_rate": 0.0001, + "loss": 0.1894, + "step": 661 + }, + { + "epoch": 1.0792305184219106, + "grad_norm": 0.1234535500407219, + "learning_rate": 0.0001, + "loss": 0.2, + "step": 662 + }, + { + "epoch": 1.0808607760026083, + "grad_norm": 0.107200987637043, + "learning_rate": 0.0001, + "loss": 0.1875, + "step": 663 + }, + { + "epoch": 1.0824910335833062, + "grad_norm": 0.12569493055343628, + "learning_rate": 0.0001, + "loss": 0.1936, + "step": 664 + }, + { + "epoch": 1.084121291164004, + "grad_norm": 0.12856988608837128, + "learning_rate": 0.0001, + "loss": 0.1905, + "step": 665 + }, + { + "epoch": 1.0857515487447016, + "grad_norm": 0.16755834221839905, + "learning_rate": 0.0001, + "loss": 0.2023, + "step": 666 + }, + { + "epoch": 1.0873818063253995, + "grad_norm": 0.13660454750061035, + "learning_rate": 0.0001, + "loss": 0.2032, + "step": 667 + }, + { + "epoch": 1.0890120639060972, + "grad_norm": 0.10664553940296173, + "learning_rate": 0.0001, + "loss": 0.1725, + "step": 668 + }, + { + "epoch": 1.0906423214867949, + "grad_norm": 0.1499480903148651, + "learning_rate": 0.0001, + "loss": 0.2057, + "step": 669 + }, + { + "epoch": 1.0922725790674928, + "grad_norm": 0.13117225468158722, + "learning_rate": 0.0001, + "loss": 0.2064, + "step": 670 + }, + { + "epoch": 1.0939028366481904, + "grad_norm": 0.12112154811620712, + "learning_rate": 0.0001, + "loss": 0.1952, + "step": 671 + }, + { + "epoch": 1.0955330942288881, + "grad_norm": 0.12471124529838562, + "learning_rate": 0.0001, + "loss": 0.2005, + "step": 672 + }, + { + "epoch": 1.097163351809586, + "grad_norm": 0.14131243526935577, + "learning_rate": 0.0001, + "loss": 0.2099, + "step": 673 + }, + { + "epoch": 1.0987936093902837, + "grad_norm": 0.1315171718597412, + "learning_rate": 0.0001, + "loss": 0.1955, + "step": 674 + }, + { + "epoch": 1.1004238669709814, + "grad_norm": 0.12525959312915802, + "learning_rate": 0.0001, + "loss": 0.1937, + "step": 675 + }, + { + "epoch": 1.102054124551679, + "grad_norm": 0.12638898193836212, + "learning_rate": 0.0001, + "loss": 0.2045, + "step": 676 + }, + { + "epoch": 1.103684382132377, + "grad_norm": 0.12964749336242676, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 677 + }, + { + "epoch": 1.1053146397130746, + "grad_norm": 0.14780597388744354, + "learning_rate": 0.0001, + "loss": 0.1987, + "step": 678 + }, + { + "epoch": 1.1069448972937723, + "grad_norm": 0.14882171154022217, + "learning_rate": 0.0001, + "loss": 0.2038, + "step": 679 + }, + { + "epoch": 1.1085751548744702, + "grad_norm": 0.131666362285614, + "learning_rate": 0.0001, + "loss": 0.1915, + "step": 680 + }, + { + "epoch": 1.110205412455168, + "grad_norm": 0.12266353517770767, + "learning_rate": 0.0001, + "loss": 0.19, + "step": 681 + }, + { + "epoch": 1.1118356700358656, + "grad_norm": 0.13120290637016296, + "learning_rate": 0.0001, + "loss": 0.2028, + "step": 682 + }, + { + "epoch": 1.1134659276165635, + "grad_norm": 0.12845005095005035, + "learning_rate": 0.0001, + "loss": 0.202, + "step": 683 + }, + { + "epoch": 1.1150961851972612, + "grad_norm": 0.12827853858470917, + "learning_rate": 0.0001, + "loss": 0.2101, + "step": 684 + }, + { + "epoch": 1.1167264427779588, + "grad_norm": 0.10449423640966415, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 685 + }, + { + "epoch": 1.1183567003586568, + "grad_norm": 0.12488257884979248, + "learning_rate": 0.0001, + "loss": 0.2008, + "step": 686 + }, + { + "epoch": 1.1199869579393544, + "grad_norm": 0.1401628851890564, + "learning_rate": 0.0001, + "loss": 0.2046, + "step": 687 + }, + { + "epoch": 1.1216172155200521, + "grad_norm": 0.11075824499130249, + "learning_rate": 0.0001, + "loss": 0.1908, + "step": 688 + }, + { + "epoch": 1.12324747310075, + "grad_norm": 0.14995628595352173, + "learning_rate": 0.0001, + "loss": 0.1993, + "step": 689 + }, + { + "epoch": 1.1248777306814477, + "grad_norm": 0.13028737902641296, + "learning_rate": 0.0001, + "loss": 0.2035, + "step": 690 + }, + { + "epoch": 1.1265079882621454, + "grad_norm": 0.12029106914997101, + "learning_rate": 0.0001, + "loss": 0.166, + "step": 691 + }, + { + "epoch": 1.1281382458428433, + "grad_norm": 0.11109007149934769, + "learning_rate": 0.0001, + "loss": 0.1921, + "step": 692 + }, + { + "epoch": 1.129768503423541, + "grad_norm": 0.112598717212677, + "learning_rate": 0.0001, + "loss": 0.1849, + "step": 693 + }, + { + "epoch": 1.1313987610042386, + "grad_norm": 0.11764469742774963, + "learning_rate": 0.0001, + "loss": 0.1979, + "step": 694 + }, + { + "epoch": 1.1330290185849363, + "grad_norm": 0.13900335133075714, + "learning_rate": 0.0001, + "loss": 0.1949, + "step": 695 + }, + { + "epoch": 1.1346592761656342, + "grad_norm": 0.11250978708267212, + "learning_rate": 0.0001, + "loss": 0.193, + "step": 696 + }, + { + "epoch": 1.136289533746332, + "grad_norm": 0.13568443059921265, + "learning_rate": 0.0001, + "loss": 0.1858, + "step": 697 + }, + { + "epoch": 1.1379197913270296, + "grad_norm": 0.15527978539466858, + "learning_rate": 0.0001, + "loss": 0.1957, + "step": 698 + }, + { + "epoch": 1.1395500489077275, + "grad_norm": 0.12831154465675354, + "learning_rate": 0.0001, + "loss": 0.19, + "step": 699 + }, + { + "epoch": 1.1411803064884252, + "grad_norm": 0.1345253884792328, + "learning_rate": 0.0001, + "loss": 0.1916, + "step": 700 + }, + { + "epoch": 1.1428105640691228, + "grad_norm": 0.13202865421772003, + "learning_rate": 0.0001, + "loss": 0.187, + "step": 701 + }, + { + "epoch": 1.1444408216498207, + "grad_norm": 0.12138167768716812, + "learning_rate": 0.0001, + "loss": 0.1923, + "step": 702 + }, + { + "epoch": 1.1460710792305184, + "grad_norm": 0.13008633255958557, + "learning_rate": 0.0001, + "loss": 0.2001, + "step": 703 + }, + { + "epoch": 1.147701336811216, + "grad_norm": 0.13472393155097961, + "learning_rate": 0.0001, + "loss": 0.1965, + "step": 704 + }, + { + "epoch": 1.149331594391914, + "grad_norm": 0.13745389878749847, + "learning_rate": 0.0001, + "loss": 0.1929, + "step": 705 + }, + { + "epoch": 1.1509618519726117, + "grad_norm": 0.13292807340621948, + "learning_rate": 0.0001, + "loss": 0.2109, + "step": 706 + }, + { + "epoch": 1.1525921095533094, + "grad_norm": 0.1446523666381836, + "learning_rate": 0.0001, + "loss": 0.1948, + "step": 707 + }, + { + "epoch": 1.154222367134007, + "grad_norm": 0.1305355578660965, + "learning_rate": 0.0001, + "loss": 0.187, + "step": 708 + }, + { + "epoch": 1.155852624714705, + "grad_norm": 0.11248882114887238, + "learning_rate": 0.0001, + "loss": 0.1878, + "step": 709 + }, + { + "epoch": 1.1574828822954026, + "grad_norm": 0.11229637265205383, + "learning_rate": 0.0001, + "loss": 0.1864, + "step": 710 + }, + { + "epoch": 1.1591131398761005, + "grad_norm": 0.13594146072864532, + "learning_rate": 0.0001, + "loss": 0.2021, + "step": 711 + }, + { + "epoch": 1.1607433974567982, + "grad_norm": 0.10707145929336548, + "learning_rate": 0.0001, + "loss": 0.1841, + "step": 712 + }, + { + "epoch": 1.162373655037496, + "grad_norm": 0.13550283014774323, + "learning_rate": 0.0001, + "loss": 0.1909, + "step": 713 + }, + { + "epoch": 1.1640039126181936, + "grad_norm": 0.16137608885765076, + "learning_rate": 0.0001, + "loss": 0.1904, + "step": 714 + }, + { + "epoch": 1.1656341701988915, + "grad_norm": 0.1217959076166153, + "learning_rate": 0.0001, + "loss": 0.1652, + "step": 715 + }, + { + "epoch": 1.1672644277795892, + "grad_norm": 0.12333795428276062, + "learning_rate": 0.0001, + "loss": 0.1774, + "step": 716 + }, + { + "epoch": 1.1688946853602868, + "grad_norm": 0.13329114019870758, + "learning_rate": 0.0001, + "loss": 0.1945, + "step": 717 + }, + { + "epoch": 1.1705249429409847, + "grad_norm": 0.13846194744110107, + "learning_rate": 0.0001, + "loss": 0.2064, + "step": 718 + }, + { + "epoch": 1.1721552005216824, + "grad_norm": 0.12300273030996323, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 719 + }, + { + "epoch": 1.17378545810238, + "grad_norm": 0.13478830456733704, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 720 + }, + { + "epoch": 1.175415715683078, + "grad_norm": 0.12126126885414124, + "learning_rate": 0.0001, + "loss": 0.1976, + "step": 721 + }, + { + "epoch": 1.1770459732637757, + "grad_norm": 0.12510600686073303, + "learning_rate": 0.0001, + "loss": 0.2058, + "step": 722 + }, + { + "epoch": 1.1786762308444734, + "grad_norm": 0.1227242723107338, + "learning_rate": 0.0001, + "loss": 0.167, + "step": 723 + }, + { + "epoch": 1.1803064884251713, + "grad_norm": 0.15479400753974915, + "learning_rate": 0.0001, + "loss": 0.2009, + "step": 724 + }, + { + "epoch": 1.181936746005869, + "grad_norm": 0.1210135892033577, + "learning_rate": 0.0001, + "loss": 0.2009, + "step": 725 + }, + { + "epoch": 1.1835670035865666, + "grad_norm": 0.15612360835075378, + "learning_rate": 0.0001, + "loss": 0.191, + "step": 726 + }, + { + "epoch": 1.1851972611672643, + "grad_norm": 0.20443665981292725, + "learning_rate": 0.0001, + "loss": 0.1861, + "step": 727 + }, + { + "epoch": 1.1868275187479622, + "grad_norm": 0.13557809591293335, + "learning_rate": 0.0001, + "loss": 0.2098, + "step": 728 + }, + { + "epoch": 1.18845777632866, + "grad_norm": 0.14073556661605835, + "learning_rate": 0.0001, + "loss": 0.2046, + "step": 729 + }, + { + "epoch": 1.1900880339093578, + "grad_norm": 0.11344654113054276, + "learning_rate": 0.0001, + "loss": 0.1878, + "step": 730 + }, + { + "epoch": 1.1917182914900555, + "grad_norm": 0.12960317730903625, + "learning_rate": 0.0001, + "loss": 0.2068, + "step": 731 + }, + { + "epoch": 1.1933485490707532, + "grad_norm": 0.11689286679029465, + "learning_rate": 0.0001, + "loss": 0.1806, + "step": 732 + }, + { + "epoch": 1.1949788066514508, + "grad_norm": 0.12490563094615936, + "learning_rate": 0.0001, + "loss": 0.2013, + "step": 733 + }, + { + "epoch": 1.1966090642321487, + "grad_norm": 0.13222745060920715, + "learning_rate": 0.0001, + "loss": 0.1923, + "step": 734 + }, + { + "epoch": 1.1982393218128464, + "grad_norm": 0.14103910326957703, + "learning_rate": 0.0001, + "loss": 0.1973, + "step": 735 + }, + { + "epoch": 1.199869579393544, + "grad_norm": 0.1409367471933365, + "learning_rate": 0.0001, + "loss": 0.1971, + "step": 736 + }, + { + "epoch": 1.201499836974242, + "grad_norm": 0.13670028746128082, + "learning_rate": 0.0001, + "loss": 0.19, + "step": 737 + }, + { + "epoch": 1.2031300945549397, + "grad_norm": 0.11378346383571625, + "learning_rate": 0.0001, + "loss": 0.1846, + "step": 738 + }, + { + "epoch": 1.2047603521356374, + "grad_norm": 0.12771202623844147, + "learning_rate": 0.0001, + "loss": 0.1952, + "step": 739 + }, + { + "epoch": 1.2063906097163353, + "grad_norm": 0.1480347216129303, + "learning_rate": 0.0001, + "loss": 0.2065, + "step": 740 + }, + { + "epoch": 1.208020867297033, + "grad_norm": 0.14294308423995972, + "learning_rate": 0.0001, + "loss": 0.199, + "step": 741 + }, + { + "epoch": 1.2096511248777306, + "grad_norm": 0.11501043289899826, + "learning_rate": 0.0001, + "loss": 0.1888, + "step": 742 + }, + { + "epoch": 1.2112813824584285, + "grad_norm": 0.12315449863672256, + "learning_rate": 0.0001, + "loss": 0.1999, + "step": 743 + }, + { + "epoch": 1.2129116400391262, + "grad_norm": 0.11507046967744827, + "learning_rate": 0.0001, + "loss": 0.2075, + "step": 744 + }, + { + "epoch": 1.214541897619824, + "grad_norm": 0.11354836076498032, + "learning_rate": 0.0001, + "loss": 0.1861, + "step": 745 + }, + { + "epoch": 1.2161721552005216, + "grad_norm": 0.13437440991401672, + "learning_rate": 0.0001, + "loss": 0.1985, + "step": 746 + }, + { + "epoch": 1.2178024127812195, + "grad_norm": 0.11698954552412033, + "learning_rate": 0.0001, + "loss": 0.1959, + "step": 747 + }, + { + "epoch": 1.2194326703619172, + "grad_norm": 0.12441843748092651, + "learning_rate": 0.0001, + "loss": 0.1955, + "step": 748 + }, + { + "epoch": 1.2210629279426148, + "grad_norm": 1.1128816604614258, + "learning_rate": 0.0001, + "loss": 0.2114, + "step": 749 + }, + { + "epoch": 1.2226931855233127, + "grad_norm": 0.1364130824804306, + "learning_rate": 0.0001, + "loss": 0.1847, + "step": 750 + }, + { + "epoch": 1.2243234431040104, + "grad_norm": 0.14699682593345642, + "learning_rate": 0.0001, + "loss": 0.1955, + "step": 751 + }, + { + "epoch": 1.225953700684708, + "grad_norm": 0.16500285267829895, + "learning_rate": 0.0001, + "loss": 0.2026, + "step": 752 + }, + { + "epoch": 1.227583958265406, + "grad_norm": 0.12731170654296875, + "learning_rate": 0.0001, + "loss": 0.1935, + "step": 753 + }, + { + "epoch": 1.2292142158461037, + "grad_norm": 0.12876448035240173, + "learning_rate": 0.0001, + "loss": 0.2066, + "step": 754 + }, + { + "epoch": 1.2308444734268014, + "grad_norm": 0.14302317798137665, + "learning_rate": 0.0001, + "loss": 0.1946, + "step": 755 + }, + { + "epoch": 1.2324747310074993, + "grad_norm": 0.14912651479244232, + "learning_rate": 0.0001, + "loss": 0.1836, + "step": 756 + }, + { + "epoch": 1.234104988588197, + "grad_norm": 0.1160784587264061, + "learning_rate": 0.0001, + "loss": 0.1853, + "step": 757 + }, + { + "epoch": 1.2357352461688946, + "grad_norm": 0.11871147900819778, + "learning_rate": 0.0001, + "loss": 0.1845, + "step": 758 + }, + { + "epoch": 1.2373655037495925, + "grad_norm": 0.14770323038101196, + "learning_rate": 0.0001, + "loss": 0.2033, + "step": 759 + }, + { + "epoch": 1.2389957613302902, + "grad_norm": 0.2331717312335968, + "learning_rate": 0.0001, + "loss": 0.188, + "step": 760 + }, + { + "epoch": 1.240626018910988, + "grad_norm": 0.12382549792528152, + "learning_rate": 0.0001, + "loss": 0.1991, + "step": 761 + }, + { + "epoch": 1.2422562764916858, + "grad_norm": 0.16363592445850372, + "learning_rate": 0.0001, + "loss": 0.1922, + "step": 762 + }, + { + "epoch": 1.2438865340723835, + "grad_norm": 0.10923486948013306, + "learning_rate": 0.0001, + "loss": 0.1951, + "step": 763 + }, + { + "epoch": 1.2455167916530812, + "grad_norm": 0.15710005164146423, + "learning_rate": 0.0001, + "loss": 0.2105, + "step": 764 + }, + { + "epoch": 1.2471470492337788, + "grad_norm": 0.1298326998949051, + "learning_rate": 0.0001, + "loss": 0.1957, + "step": 765 + }, + { + "epoch": 1.2487773068144767, + "grad_norm": 0.12656745314598083, + "learning_rate": 0.0001, + "loss": 0.1908, + "step": 766 + }, + { + "epoch": 1.2504075643951744, + "grad_norm": 0.23991864919662476, + "learning_rate": 0.0001, + "loss": 0.2039, + "step": 767 + }, + { + "epoch": 1.2520378219758723, + "grad_norm": 0.12446881830692291, + "learning_rate": 0.0001, + "loss": 0.2051, + "step": 768 + }, + { + "epoch": 1.25366807955657, + "grad_norm": 0.13822287321090698, + "learning_rate": 0.0001, + "loss": 0.2118, + "step": 769 + }, + { + "epoch": 1.2552983371372677, + "grad_norm": 0.1322907656431198, + "learning_rate": 0.0001, + "loss": 0.2043, + "step": 770 + }, + { + "epoch": 1.2569285947179654, + "grad_norm": 0.13346537947654724, + "learning_rate": 0.0001, + "loss": 0.1907, + "step": 771 + }, + { + "epoch": 1.2585588522986633, + "grad_norm": 0.133008673787117, + "learning_rate": 0.0001, + "loss": 0.1957, + "step": 772 + }, + { + "epoch": 1.260189109879361, + "grad_norm": 0.13915961980819702, + "learning_rate": 0.0001, + "loss": 0.1996, + "step": 773 + }, + { + "epoch": 1.2618193674600586, + "grad_norm": 0.12022515386343002, + "learning_rate": 0.0001, + "loss": 0.2048, + "step": 774 + }, + { + "epoch": 1.2634496250407565, + "grad_norm": 0.12311067432165146, + "learning_rate": 0.0001, + "loss": 0.2076, + "step": 775 + }, + { + "epoch": 1.2650798826214542, + "grad_norm": 0.11143454164266586, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 776 + }, + { + "epoch": 1.266710140202152, + "grad_norm": 0.11484785377979279, + "learning_rate": 0.0001, + "loss": 0.1797, + "step": 777 + }, + { + "epoch": 1.2683403977828496, + "grad_norm": 0.12231987714767456, + "learning_rate": 0.0001, + "loss": 0.2006, + "step": 778 + }, + { + "epoch": 1.2699706553635475, + "grad_norm": 0.138115793466568, + "learning_rate": 0.0001, + "loss": 0.1953, + "step": 779 + }, + { + "epoch": 1.2716009129442452, + "grad_norm": 0.1262485831975937, + "learning_rate": 0.0001, + "loss": 0.2104, + "step": 780 + }, + { + "epoch": 1.273231170524943, + "grad_norm": 0.10364367067813873, + "learning_rate": 0.0001, + "loss": 0.1913, + "step": 781 + }, + { + "epoch": 1.2748614281056407, + "grad_norm": 0.11666619032621384, + "learning_rate": 0.0001, + "loss": 0.2092, + "step": 782 + }, + { + "epoch": 1.2764916856863384, + "grad_norm": 0.10511959344148636, + "learning_rate": 0.0001, + "loss": 0.1821, + "step": 783 + }, + { + "epoch": 1.278121943267036, + "grad_norm": 0.12640027701854706, + "learning_rate": 0.0001, + "loss": 0.1969, + "step": 784 + }, + { + "epoch": 1.279752200847734, + "grad_norm": 0.13358190655708313, + "learning_rate": 0.0001, + "loss": 0.1874, + "step": 785 + }, + { + "epoch": 1.2813824584284317, + "grad_norm": 0.13797491788864136, + "learning_rate": 0.0001, + "loss": 0.2039, + "step": 786 + }, + { + "epoch": 1.2830127160091294, + "grad_norm": 0.16730040311813354, + "learning_rate": 0.0001, + "loss": 0.1812, + "step": 787 + }, + { + "epoch": 1.2846429735898273, + "grad_norm": 0.13052508234977722, + "learning_rate": 0.0001, + "loss": 0.2029, + "step": 788 + }, + { + "epoch": 1.286273231170525, + "grad_norm": 0.11100683361291885, + "learning_rate": 0.0001, + "loss": 0.1756, + "step": 789 + }, + { + "epoch": 1.2879034887512226, + "grad_norm": 0.12207093834877014, + "learning_rate": 0.0001, + "loss": 0.1951, + "step": 790 + }, + { + "epoch": 1.2895337463319203, + "grad_norm": 0.10500862449407578, + "learning_rate": 0.0001, + "loss": 0.187, + "step": 791 + }, + { + "epoch": 1.2911640039126182, + "grad_norm": 0.12395953387022018, + "learning_rate": 0.0001, + "loss": 0.1789, + "step": 792 + }, + { + "epoch": 1.2927942614933159, + "grad_norm": 0.12728352844715118, + "learning_rate": 0.0001, + "loss": 0.2055, + "step": 793 + }, + { + "epoch": 1.2944245190740138, + "grad_norm": 0.12210983037948608, + "learning_rate": 0.0001, + "loss": 0.1848, + "step": 794 + }, + { + "epoch": 1.2960547766547115, + "grad_norm": 0.1309320330619812, + "learning_rate": 0.0001, + "loss": 0.2019, + "step": 795 + }, + { + "epoch": 1.2976850342354092, + "grad_norm": 0.14810331165790558, + "learning_rate": 0.0001, + "loss": 0.1983, + "step": 796 + }, + { + "epoch": 1.2993152918161068, + "grad_norm": 0.11592376232147217, + "learning_rate": 0.0001, + "loss": 0.1839, + "step": 797 + }, + { + "epoch": 1.3009455493968047, + "grad_norm": 0.12396306544542313, + "learning_rate": 0.0001, + "loss": 0.1964, + "step": 798 + }, + { + "epoch": 1.3025758069775024, + "grad_norm": 0.13260740041732788, + "learning_rate": 0.0001, + "loss": 0.2012, + "step": 799 + }, + { + "epoch": 1.3042060645582003, + "grad_norm": 0.13183878362178802, + "learning_rate": 0.0001, + "loss": 0.2066, + "step": 800 + }, + { + "epoch": 1.305836322138898, + "grad_norm": 0.12593010067939758, + "learning_rate": 0.0001, + "loss": 0.1916, + "step": 801 + }, + { + "epoch": 1.3074665797195957, + "grad_norm": 0.11641526967287064, + "learning_rate": 0.0001, + "loss": 0.1847, + "step": 802 + }, + { + "epoch": 1.3090968373002934, + "grad_norm": 0.1082523912191391, + "learning_rate": 0.0001, + "loss": 0.1816, + "step": 803 + }, + { + "epoch": 1.3107270948809913, + "grad_norm": 0.11721451580524445, + "learning_rate": 0.0001, + "loss": 0.1913, + "step": 804 + }, + { + "epoch": 1.312357352461689, + "grad_norm": 0.14624540507793427, + "learning_rate": 0.0001, + "loss": 0.2057, + "step": 805 + }, + { + "epoch": 1.3139876100423866, + "grad_norm": 0.11817949265241623, + "learning_rate": 0.0001, + "loss": 0.1876, + "step": 806 + }, + { + "epoch": 1.3156178676230845, + "grad_norm": 0.12821592390537262, + "learning_rate": 0.0001, + "loss": 0.2099, + "step": 807 + }, + { + "epoch": 1.3172481252037822, + "grad_norm": 0.13968628644943237, + "learning_rate": 0.0001, + "loss": 0.2101, + "step": 808 + }, + { + "epoch": 1.3188783827844799, + "grad_norm": 0.1454535573720932, + "learning_rate": 0.0001, + "loss": 0.2062, + "step": 809 + }, + { + "epoch": 1.3205086403651776, + "grad_norm": 0.1232244223356247, + "learning_rate": 0.0001, + "loss": 0.1775, + "step": 810 + }, + { + "epoch": 1.3221388979458755, + "grad_norm": 0.12356823682785034, + "learning_rate": 0.0001, + "loss": 0.1945, + "step": 811 + }, + { + "epoch": 1.3237691555265731, + "grad_norm": 0.1321604698896408, + "learning_rate": 0.0001, + "loss": 0.1956, + "step": 812 + }, + { + "epoch": 1.325399413107271, + "grad_norm": 0.13686218857765198, + "learning_rate": 0.0001, + "loss": 0.1997, + "step": 813 + }, + { + "epoch": 1.3270296706879687, + "grad_norm": 0.1265021115541458, + "learning_rate": 0.0001, + "loss": 0.2003, + "step": 814 + }, + { + "epoch": 1.3286599282686664, + "grad_norm": 0.13822129368782043, + "learning_rate": 0.0001, + "loss": 0.2037, + "step": 815 + }, + { + "epoch": 1.330290185849364, + "grad_norm": 0.1474665403366089, + "learning_rate": 0.0001, + "loss": 0.2014, + "step": 816 + }, + { + "epoch": 1.331920443430062, + "grad_norm": 0.1312236338853836, + "learning_rate": 0.0001, + "loss": 0.2047, + "step": 817 + }, + { + "epoch": 1.3335507010107597, + "grad_norm": 0.10650108009576797, + "learning_rate": 0.0001, + "loss": 0.1774, + "step": 818 + }, + { + "epoch": 1.3351809585914576, + "grad_norm": 0.12446287274360657, + "learning_rate": 0.0001, + "loss": 0.1993, + "step": 819 + }, + { + "epoch": 1.3368112161721553, + "grad_norm": 0.1500684767961502, + "learning_rate": 0.0001, + "loss": 0.2065, + "step": 820 + }, + { + "epoch": 1.338441473752853, + "grad_norm": 0.11526080965995789, + "learning_rate": 0.0001, + "loss": 0.1929, + "step": 821 + }, + { + "epoch": 1.3400717313335506, + "grad_norm": 0.14538566768169403, + "learning_rate": 0.0001, + "loss": 0.1951, + "step": 822 + }, + { + "epoch": 1.3417019889142485, + "grad_norm": 0.11737110465765, + "learning_rate": 0.0001, + "loss": 0.2022, + "step": 823 + }, + { + "epoch": 1.3433322464949462, + "grad_norm": 0.1339058130979538, + "learning_rate": 0.0001, + "loss": 0.2056, + "step": 824 + }, + { + "epoch": 1.3449625040756439, + "grad_norm": 0.1357828974723816, + "learning_rate": 0.0001, + "loss": 0.1997, + "step": 825 + }, + { + "epoch": 1.3465927616563418, + "grad_norm": 0.13894008100032806, + "learning_rate": 0.0001, + "loss": 0.1976, + "step": 826 + }, + { + "epoch": 1.3482230192370395, + "grad_norm": 0.11640580743551254, + "learning_rate": 0.0001, + "loss": 0.1931, + "step": 827 + }, + { + "epoch": 1.3498532768177371, + "grad_norm": 0.11283834278583527, + "learning_rate": 0.0001, + "loss": 0.1968, + "step": 828 + }, + { + "epoch": 1.3514835343984348, + "grad_norm": 0.13692787289619446, + "learning_rate": 0.0001, + "loss": 0.2087, + "step": 829 + }, + { + "epoch": 1.3531137919791327, + "grad_norm": 0.0954209491610527, + "learning_rate": 0.0001, + "loss": 0.1828, + "step": 830 + }, + { + "epoch": 1.3547440495598304, + "grad_norm": 0.1314362734556198, + "learning_rate": 0.0001, + "loss": 0.2074, + "step": 831 + }, + { + "epoch": 1.3563743071405283, + "grad_norm": 0.12032657861709595, + "learning_rate": 0.0001, + "loss": 0.1797, + "step": 832 + }, + { + "epoch": 1.358004564721226, + "grad_norm": 0.12045875936746597, + "learning_rate": 0.0001, + "loss": 0.1948, + "step": 833 + }, + { + "epoch": 1.3596348223019237, + "grad_norm": 0.12139002233743668, + "learning_rate": 0.0001, + "loss": 0.2074, + "step": 834 + }, + { + "epoch": 1.3612650798826214, + "grad_norm": 0.125333771109581, + "learning_rate": 0.0001, + "loss": 0.189, + "step": 835 + }, + { + "epoch": 1.3628953374633193, + "grad_norm": 0.12336897104978561, + "learning_rate": 0.0001, + "loss": 0.2015, + "step": 836 + }, + { + "epoch": 1.364525595044017, + "grad_norm": 0.1447877436876297, + "learning_rate": 0.0001, + "loss": 0.1979, + "step": 837 + }, + { + "epoch": 1.3661558526247148, + "grad_norm": 0.12751318514347076, + "learning_rate": 0.0001, + "loss": 0.1924, + "step": 838 + }, + { + "epoch": 1.3677861102054125, + "grad_norm": 0.14476466178894043, + "learning_rate": 0.0001, + "loss": 0.2007, + "step": 839 + }, + { + "epoch": 1.3694163677861102, + "grad_norm": 0.12900257110595703, + "learning_rate": 0.0001, + "loss": 0.1685, + "step": 840 + }, + { + "epoch": 1.3710466253668079, + "grad_norm": 0.14788837730884552, + "learning_rate": 0.0001, + "loss": 0.188, + "step": 841 + }, + { + "epoch": 1.3726768829475058, + "grad_norm": 0.1143098995089531, + "learning_rate": 0.0001, + "loss": 0.1925, + "step": 842 + }, + { + "epoch": 1.3743071405282035, + "grad_norm": 0.16243141889572144, + "learning_rate": 0.0001, + "loss": 0.1904, + "step": 843 + }, + { + "epoch": 1.3759373981089011, + "grad_norm": 0.1285100132226944, + "learning_rate": 0.0001, + "loss": 0.2101, + "step": 844 + }, + { + "epoch": 1.377567655689599, + "grad_norm": 0.12278752028942108, + "learning_rate": 0.0001, + "loss": 0.1676, + "step": 845 + }, + { + "epoch": 1.3791979132702967, + "grad_norm": 0.11762263625860214, + "learning_rate": 0.0001, + "loss": 0.1857, + "step": 846 + }, + { + "epoch": 1.3808281708509944, + "grad_norm": 0.14315438270568848, + "learning_rate": 0.0001, + "loss": 0.1981, + "step": 847 + }, + { + "epoch": 1.382458428431692, + "grad_norm": 0.13514509797096252, + "learning_rate": 0.0001, + "loss": 0.1874, + "step": 848 + }, + { + "epoch": 1.38408868601239, + "grad_norm": 0.14663557708263397, + "learning_rate": 0.0001, + "loss": 0.1922, + "step": 849 + }, + { + "epoch": 1.3857189435930877, + "grad_norm": 0.11271989345550537, + "learning_rate": 0.0001, + "loss": 0.1741, + "step": 850 + }, + { + "epoch": 1.3873492011737856, + "grad_norm": 0.1267681121826172, + "learning_rate": 0.0001, + "loss": 0.2004, + "step": 851 + }, + { + "epoch": 1.3889794587544833, + "grad_norm": 0.13862234354019165, + "learning_rate": 0.0001, + "loss": 0.1833, + "step": 852 + }, + { + "epoch": 1.390609716335181, + "grad_norm": 0.14076252281665802, + "learning_rate": 0.0001, + "loss": 0.1946, + "step": 853 + }, + { + "epoch": 1.3922399739158786, + "grad_norm": 0.11336036026477814, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 854 + }, + { + "epoch": 1.3938702314965765, + "grad_norm": 0.12610171735286713, + "learning_rate": 0.0001, + "loss": 0.2061, + "step": 855 + }, + { + "epoch": 1.3955004890772742, + "grad_norm": 0.12771809101104736, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 856 + }, + { + "epoch": 1.3971307466579719, + "grad_norm": 0.13943758606910706, + "learning_rate": 0.0001, + "loss": 0.2154, + "step": 857 + }, + { + "epoch": 1.3987610042386698, + "grad_norm": 0.1181144267320633, + "learning_rate": 0.0001, + "loss": 0.2047, + "step": 858 + }, + { + "epoch": 1.4003912618193675, + "grad_norm": 0.12739057838916779, + "learning_rate": 0.0001, + "loss": 0.1924, + "step": 859 + }, + { + "epoch": 1.4020215194000651, + "grad_norm": 0.14955663681030273, + "learning_rate": 0.0001, + "loss": 0.204, + "step": 860 + }, + { + "epoch": 1.4036517769807628, + "grad_norm": 0.12984280288219452, + "learning_rate": 0.0001, + "loss": 0.1833, + "step": 861 + }, + { + "epoch": 1.4052820345614607, + "grad_norm": 0.1367812603712082, + "learning_rate": 0.0001, + "loss": 0.2041, + "step": 862 + }, + { + "epoch": 1.4069122921421584, + "grad_norm": 0.11790581792593002, + "learning_rate": 0.0001, + "loss": 0.1927, + "step": 863 + }, + { + "epoch": 1.4085425497228563, + "grad_norm": 0.13845130801200867, + "learning_rate": 0.0001, + "loss": 0.198, + "step": 864 + }, + { + "epoch": 1.410172807303554, + "grad_norm": 0.13803738355636597, + "learning_rate": 0.0001, + "loss": 0.1942, + "step": 865 + }, + { + "epoch": 1.4118030648842517, + "grad_norm": 0.11733663082122803, + "learning_rate": 0.0001, + "loss": 0.1811, + "step": 866 + }, + { + "epoch": 1.4134333224649493, + "grad_norm": 0.14048326015472412, + "learning_rate": 0.0001, + "loss": 0.2063, + "step": 867 + }, + { + "epoch": 1.4150635800456473, + "grad_norm": 0.11841481178998947, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 868 + }, + { + "epoch": 1.416693837626345, + "grad_norm": 0.13812898099422455, + "learning_rate": 0.0001, + "loss": 0.1945, + "step": 869 + }, + { + "epoch": 1.4183240952070428, + "grad_norm": 0.12648576498031616, + "learning_rate": 0.0001, + "loss": 0.2121, + "step": 870 + }, + { + "epoch": 1.4199543527877405, + "grad_norm": 0.12122874706983566, + "learning_rate": 0.0001, + "loss": 0.198, + "step": 871 + }, + { + "epoch": 1.4215846103684382, + "grad_norm": 0.12100232392549515, + "learning_rate": 0.0001, + "loss": 0.1896, + "step": 872 + }, + { + "epoch": 1.4232148679491359, + "grad_norm": 0.11330271512269974, + "learning_rate": 0.0001, + "loss": 0.1883, + "step": 873 + }, + { + "epoch": 1.4248451255298338, + "grad_norm": 0.12889273464679718, + "learning_rate": 0.0001, + "loss": 0.1913, + "step": 874 + }, + { + "epoch": 1.4264753831105315, + "grad_norm": 0.12529848515987396, + "learning_rate": 0.0001, + "loss": 0.1947, + "step": 875 + }, + { + "epoch": 1.4281056406912291, + "grad_norm": 0.11464565992355347, + "learning_rate": 0.0001, + "loss": 0.1928, + "step": 876 + }, + { + "epoch": 1.429735898271927, + "grad_norm": 0.09968235343694687, + "learning_rate": 0.0001, + "loss": 0.18, + "step": 877 + }, + { + "epoch": 1.4313661558526247, + "grad_norm": 0.12522117793560028, + "learning_rate": 0.0001, + "loss": 0.2025, + "step": 878 + }, + { + "epoch": 1.4329964134333224, + "grad_norm": 0.13101626932621002, + "learning_rate": 0.0001, + "loss": 0.1883, + "step": 879 + }, + { + "epoch": 1.43462667101402, + "grad_norm": 0.18192481994628906, + "learning_rate": 0.0001, + "loss": 0.1906, + "step": 880 + }, + { + "epoch": 1.436256928594718, + "grad_norm": 0.1182754710316658, + "learning_rate": 0.0001, + "loss": 0.1737, + "step": 881 + }, + { + "epoch": 1.4378871861754157, + "grad_norm": 0.1254081428050995, + "learning_rate": 0.0001, + "loss": 0.1919, + "step": 882 + }, + { + "epoch": 1.4395174437561136, + "grad_norm": 0.12255612015724182, + "learning_rate": 0.0001, + "loss": 0.1998, + "step": 883 + }, + { + "epoch": 1.4411477013368112, + "grad_norm": 0.12978112697601318, + "learning_rate": 0.0001, + "loss": 0.189, + "step": 884 + }, + { + "epoch": 1.442777958917509, + "grad_norm": 0.1313999891281128, + "learning_rate": 0.0001, + "loss": 0.1847, + "step": 885 + }, + { + "epoch": 1.4444082164982066, + "grad_norm": 0.1327790915966034, + "learning_rate": 0.0001, + "loss": 0.2005, + "step": 886 + }, + { + "epoch": 1.4460384740789045, + "grad_norm": 0.13885673880577087, + "learning_rate": 0.0001, + "loss": 0.1899, + "step": 887 + }, + { + "epoch": 1.4476687316596022, + "grad_norm": 0.13544884324073792, + "learning_rate": 0.0001, + "loss": 0.1944, + "step": 888 + }, + { + "epoch": 1.4492989892403, + "grad_norm": 0.12309068441390991, + "learning_rate": 0.0001, + "loss": 0.2011, + "step": 889 + }, + { + "epoch": 1.4509292468209978, + "grad_norm": 0.11212994158267975, + "learning_rate": 0.0001, + "loss": 0.1709, + "step": 890 + }, + { + "epoch": 1.4525595044016955, + "grad_norm": 0.1351044625043869, + "learning_rate": 0.0001, + "loss": 0.1765, + "step": 891 + }, + { + "epoch": 1.4541897619823931, + "grad_norm": 0.1273559033870697, + "learning_rate": 0.0001, + "loss": 0.1856, + "step": 892 + }, + { + "epoch": 1.455820019563091, + "grad_norm": 0.12091701477766037, + "learning_rate": 0.0001, + "loss": 0.2017, + "step": 893 + }, + { + "epoch": 1.4574502771437887, + "grad_norm": 0.12357049435377121, + "learning_rate": 0.0001, + "loss": 0.1997, + "step": 894 + }, + { + "epoch": 1.4590805347244864, + "grad_norm": 0.13915151357650757, + "learning_rate": 0.0001, + "loss": 0.2016, + "step": 895 + }, + { + "epoch": 1.4607107923051843, + "grad_norm": 0.15922588109970093, + "learning_rate": 0.0001, + "loss": 0.1946, + "step": 896 + }, + { + "epoch": 1.462341049885882, + "grad_norm": 0.1327250450849533, + "learning_rate": 0.0001, + "loss": 0.2009, + "step": 897 + }, + { + "epoch": 1.4639713074665797, + "grad_norm": 0.1281212717294693, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 898 + }, + { + "epoch": 1.4656015650472773, + "grad_norm": 0.12630882859230042, + "learning_rate": 0.0001, + "loss": 0.1758, + "step": 899 + }, + { + "epoch": 1.4672318226279752, + "grad_norm": 0.17888978123664856, + "learning_rate": 0.0001, + "loss": 0.1969, + "step": 900 + }, + { + "epoch": 1.468862080208673, + "grad_norm": 0.11581540107727051, + "learning_rate": 0.0001, + "loss": 0.1811, + "step": 901 + }, + { + "epoch": 1.4704923377893708, + "grad_norm": 0.13671360909938812, + "learning_rate": 0.0001, + "loss": 0.1921, + "step": 902 + }, + { + "epoch": 1.4721225953700685, + "grad_norm": 0.12927068769931793, + "learning_rate": 0.0001, + "loss": 0.1907, + "step": 903 + }, + { + "epoch": 1.4737528529507662, + "grad_norm": 0.1307104527950287, + "learning_rate": 0.0001, + "loss": 0.1774, + "step": 904 + }, + { + "epoch": 1.4753831105314639, + "grad_norm": 0.12712356448173523, + "learning_rate": 0.0001, + "loss": 0.1945, + "step": 905 + }, + { + "epoch": 1.4770133681121618, + "grad_norm": 0.11870065331459045, + "learning_rate": 0.0001, + "loss": 0.1764, + "step": 906 + }, + { + "epoch": 1.4786436256928595, + "grad_norm": 0.13073444366455078, + "learning_rate": 0.0001, + "loss": 0.1941, + "step": 907 + }, + { + "epoch": 1.4802738832735574, + "grad_norm": 0.12752722203731537, + "learning_rate": 0.0001, + "loss": 0.1781, + "step": 908 + }, + { + "epoch": 1.481904140854255, + "grad_norm": 0.18502075970172882, + "learning_rate": 0.0001, + "loss": 0.1963, + "step": 909 + }, + { + "epoch": 1.4835343984349527, + "grad_norm": 0.14644372463226318, + "learning_rate": 0.0001, + "loss": 0.2031, + "step": 910 + }, + { + "epoch": 1.4851646560156504, + "grad_norm": 0.1375977247953415, + "learning_rate": 0.0001, + "loss": 0.1872, + "step": 911 + }, + { + "epoch": 1.4867949135963483, + "grad_norm": 0.1250106245279312, + "learning_rate": 0.0001, + "loss": 0.1939, + "step": 912 + }, + { + "epoch": 1.488425171177046, + "grad_norm": 0.1255251169204712, + "learning_rate": 0.0001, + "loss": 0.1918, + "step": 913 + }, + { + "epoch": 1.4900554287577437, + "grad_norm": 0.11189655214548111, + "learning_rate": 0.0001, + "loss": 0.1897, + "step": 914 + }, + { + "epoch": 1.4916856863384416, + "grad_norm": 0.12730088829994202, + "learning_rate": 0.0001, + "loss": 0.1828, + "step": 915 + }, + { + "epoch": 1.4933159439191392, + "grad_norm": 0.10540137439966202, + "learning_rate": 0.0001, + "loss": 0.191, + "step": 916 + }, + { + "epoch": 1.494946201499837, + "grad_norm": 0.10388045758008957, + "learning_rate": 0.0001, + "loss": 0.1904, + "step": 917 + }, + { + "epoch": 1.4965764590805346, + "grad_norm": 0.1330886036157608, + "learning_rate": 0.0001, + "loss": 0.1946, + "step": 918 + }, + { + "epoch": 1.4982067166612325, + "grad_norm": 0.12678848206996918, + "learning_rate": 0.0001, + "loss": 0.2083, + "step": 919 + }, + { + "epoch": 1.4998369742419302, + "grad_norm": 0.12516173720359802, + "learning_rate": 0.0001, + "loss": 0.202, + "step": 920 + }, + { + "epoch": 1.501467231822628, + "grad_norm": 0.1177758052945137, + "learning_rate": 0.0001, + "loss": 0.1943, + "step": 921 + }, + { + "epoch": 1.5030974894033258, + "grad_norm": 0.10760951787233353, + "learning_rate": 0.0001, + "loss": 0.1738, + "step": 922 + }, + { + "epoch": 1.5047277469840235, + "grad_norm": 0.1439739167690277, + "learning_rate": 0.0001, + "loss": 0.2107, + "step": 923 + }, + { + "epoch": 1.5063580045647211, + "grad_norm": 0.127821683883667, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 924 + }, + { + "epoch": 1.5079882621454188, + "grad_norm": 0.11863593012094498, + "learning_rate": 0.0001, + "loss": 0.1928, + "step": 925 + }, + { + "epoch": 1.5096185197261167, + "grad_norm": 0.1257062703371048, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 926 + }, + { + "epoch": 1.5112487773068146, + "grad_norm": 0.14376260340213776, + "learning_rate": 0.0001, + "loss": 0.1876, + "step": 927 + }, + { + "epoch": 1.5128790348875123, + "grad_norm": 0.12747377157211304, + "learning_rate": 0.0001, + "loss": 0.2062, + "step": 928 + }, + { + "epoch": 1.51450929246821, + "grad_norm": 0.10602065175771713, + "learning_rate": 0.0001, + "loss": 0.1834, + "step": 929 + }, + { + "epoch": 1.5161395500489077, + "grad_norm": 0.11980943381786346, + "learning_rate": 0.0001, + "loss": 0.1932, + "step": 930 + }, + { + "epoch": 1.5177698076296053, + "grad_norm": 0.12690620124340057, + "learning_rate": 0.0001, + "loss": 0.2067, + "step": 931 + }, + { + "epoch": 1.5194000652103032, + "grad_norm": 0.10429069399833679, + "learning_rate": 0.0001, + "loss": 0.1845, + "step": 932 + }, + { + "epoch": 1.5210303227910011, + "grad_norm": 0.11686432361602783, + "learning_rate": 0.0001, + "loss": 0.2038, + "step": 933 + }, + { + "epoch": 1.5226605803716988, + "grad_norm": 0.13629446923732758, + "learning_rate": 0.0001, + "loss": 0.2071, + "step": 934 + }, + { + "epoch": 1.5242908379523965, + "grad_norm": 0.11884860694408417, + "learning_rate": 0.0001, + "loss": 0.1866, + "step": 935 + }, + { + "epoch": 1.5259210955330942, + "grad_norm": 0.11080071330070496, + "learning_rate": 0.0001, + "loss": 0.1925, + "step": 936 + }, + { + "epoch": 1.5275513531137919, + "grad_norm": 0.13977265357971191, + "learning_rate": 0.0001, + "loss": 0.2035, + "step": 937 + }, + { + "epoch": 1.5291816106944898, + "grad_norm": 0.1306087076663971, + "learning_rate": 0.0001, + "loss": 0.1906, + "step": 938 + }, + { + "epoch": 1.5308118682751874, + "grad_norm": 0.1292889267206192, + "learning_rate": 0.0001, + "loss": 0.2064, + "step": 939 + }, + { + "epoch": 1.5324421258558854, + "grad_norm": 0.1151227205991745, + "learning_rate": 0.0001, + "loss": 0.1889, + "step": 940 + }, + { + "epoch": 1.534072383436583, + "grad_norm": 0.23857301473617554, + "learning_rate": 0.0001, + "loss": 0.1936, + "step": 941 + }, + { + "epoch": 1.5357026410172807, + "grad_norm": 0.12873469293117523, + "learning_rate": 0.0001, + "loss": 0.1864, + "step": 942 + }, + { + "epoch": 1.5373328985979784, + "grad_norm": 0.1140642762184143, + "learning_rate": 0.0001, + "loss": 0.1844, + "step": 943 + }, + { + "epoch": 1.538963156178676, + "grad_norm": 0.12932021915912628, + "learning_rate": 0.0001, + "loss": 0.1717, + "step": 944 + }, + { + "epoch": 1.540593413759374, + "grad_norm": 0.11678753048181534, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 945 + }, + { + "epoch": 1.5422236713400719, + "grad_norm": 0.11084496229887009, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 946 + }, + { + "epoch": 1.5438539289207696, + "grad_norm": 0.11934110522270203, + "learning_rate": 0.0001, + "loss": 0.1897, + "step": 947 + }, + { + "epoch": 1.5454841865014672, + "grad_norm": 0.13100844621658325, + "learning_rate": 0.0001, + "loss": 0.1852, + "step": 948 + }, + { + "epoch": 1.547114444082165, + "grad_norm": 0.12914130091667175, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 949 + }, + { + "epoch": 1.5487447016628626, + "grad_norm": 0.1111481636762619, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 950 + }, + { + "epoch": 1.5503749592435605, + "grad_norm": 0.1460474282503128, + "learning_rate": 0.0001, + "loss": 0.1864, + "step": 951 + }, + { + "epoch": 1.5520052168242582, + "grad_norm": 0.13828031718730927, + "learning_rate": 0.0001, + "loss": 0.1893, + "step": 952 + }, + { + "epoch": 1.553635474404956, + "grad_norm": 0.1216074749827385, + "learning_rate": 0.0001, + "loss": 0.1866, + "step": 953 + }, + { + "epoch": 1.5552657319856538, + "grad_norm": 0.11165913194417953, + "learning_rate": 0.0001, + "loss": 0.1858, + "step": 954 + }, + { + "epoch": 1.5568959895663514, + "grad_norm": 0.12925301492214203, + "learning_rate": 0.0001, + "loss": 0.198, + "step": 955 + }, + { + "epoch": 1.5585262471470491, + "grad_norm": 0.10411135852336884, + "learning_rate": 0.0001, + "loss": 0.1836, + "step": 956 + }, + { + "epoch": 1.560156504727747, + "grad_norm": 0.1324637234210968, + "learning_rate": 0.0001, + "loss": 0.1752, + "step": 957 + }, + { + "epoch": 1.5617867623084447, + "grad_norm": 0.1328878253698349, + "learning_rate": 0.0001, + "loss": 0.1861, + "step": 958 + }, + { + "epoch": 1.5634170198891426, + "grad_norm": 0.1299583464860916, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 959 + }, + { + "epoch": 1.5650472774698403, + "grad_norm": 0.14747850596904755, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 960 + }, + { + "epoch": 1.566677535050538, + "grad_norm": 0.1322248876094818, + "learning_rate": 0.0001, + "loss": 0.1924, + "step": 961 + }, + { + "epoch": 1.5683077926312357, + "grad_norm": 0.14771921932697296, + "learning_rate": 0.0001, + "loss": 0.1908, + "step": 962 + }, + { + "epoch": 1.5699380502119333, + "grad_norm": 0.12810666859149933, + "learning_rate": 0.0001, + "loss": 0.1933, + "step": 963 + }, + { + "epoch": 1.5715683077926312, + "grad_norm": 0.12468911707401276, + "learning_rate": 0.0001, + "loss": 0.1969, + "step": 964 + }, + { + "epoch": 1.5731985653733291, + "grad_norm": 0.13675253093242645, + "learning_rate": 0.0001, + "loss": 0.1838, + "step": 965 + }, + { + "epoch": 1.5748288229540268, + "grad_norm": 0.14610256254673004, + "learning_rate": 0.0001, + "loss": 0.1871, + "step": 966 + }, + { + "epoch": 1.5764590805347245, + "grad_norm": 0.16016973555088043, + "learning_rate": 0.0001, + "loss": 0.2068, + "step": 967 + }, + { + "epoch": 1.5780893381154222, + "grad_norm": 0.11924766004085541, + "learning_rate": 0.0001, + "loss": 0.1963, + "step": 968 + }, + { + "epoch": 1.5797195956961199, + "grad_norm": 0.14039623737335205, + "learning_rate": 0.0001, + "loss": 0.193, + "step": 969 + }, + { + "epoch": 1.5813498532768178, + "grad_norm": 0.13962414860725403, + "learning_rate": 0.0001, + "loss": 0.2006, + "step": 970 + }, + { + "epoch": 1.5829801108575154, + "grad_norm": 0.1010158583521843, + "learning_rate": 0.0001, + "loss": 0.1793, + "step": 971 + }, + { + "epoch": 1.5846103684382133, + "grad_norm": 0.13020698726177216, + "learning_rate": 0.0001, + "loss": 0.1936, + "step": 972 + }, + { + "epoch": 1.586240626018911, + "grad_norm": 0.13899926841259003, + "learning_rate": 0.0001, + "loss": 0.1855, + "step": 973 + }, + { + "epoch": 1.5878708835996087, + "grad_norm": 0.13506311178207397, + "learning_rate": 0.0001, + "loss": 0.2038, + "step": 974 + }, + { + "epoch": 1.5895011411803064, + "grad_norm": 0.10924082249403, + "learning_rate": 0.0001, + "loss": 0.189, + "step": 975 + }, + { + "epoch": 1.5911313987610043, + "grad_norm": 0.14923515915870667, + "learning_rate": 0.0001, + "loss": 0.1987, + "step": 976 + }, + { + "epoch": 1.592761656341702, + "grad_norm": 0.11555776745080948, + "learning_rate": 0.0001, + "loss": 0.1819, + "step": 977 + }, + { + "epoch": 1.5943919139223999, + "grad_norm": 0.11343539506196976, + "learning_rate": 0.0001, + "loss": 0.1926, + "step": 978 + }, + { + "epoch": 1.5960221715030976, + "grad_norm": 0.12200161069631577, + "learning_rate": 0.0001, + "loss": 0.1883, + "step": 979 + }, + { + "epoch": 1.5976524290837952, + "grad_norm": 0.14577162265777588, + "learning_rate": 0.0001, + "loss": 0.1957, + "step": 980 + }, + { + "epoch": 1.599282686664493, + "grad_norm": 0.12125492095947266, + "learning_rate": 0.0001, + "loss": 0.1873, + "step": 981 + }, + { + "epoch": 1.6009129442451906, + "grad_norm": 0.15497727692127228, + "learning_rate": 0.0001, + "loss": 0.2019, + "step": 982 + }, + { + "epoch": 1.6025432018258885, + "grad_norm": 0.13305650651454926, + "learning_rate": 0.0001, + "loss": 0.1951, + "step": 983 + }, + { + "epoch": 1.6041734594065864, + "grad_norm": 0.13118699193000793, + "learning_rate": 0.0001, + "loss": 0.1704, + "step": 984 + }, + { + "epoch": 1.605803716987284, + "grad_norm": 0.12898042798042297, + "learning_rate": 0.0001, + "loss": 0.1937, + "step": 985 + }, + { + "epoch": 1.6074339745679818, + "grad_norm": 0.13094180822372437, + "learning_rate": 0.0001, + "loss": 0.1938, + "step": 986 + }, + { + "epoch": 1.6090642321486794, + "grad_norm": 0.13454465568065643, + "learning_rate": 0.0001, + "loss": 0.1833, + "step": 987 + }, + { + "epoch": 1.6106944897293771, + "grad_norm": 0.10670379549264908, + "learning_rate": 0.0001, + "loss": 0.1951, + "step": 988 + }, + { + "epoch": 1.612324747310075, + "grad_norm": 0.1214357241988182, + "learning_rate": 0.0001, + "loss": 0.1837, + "step": 989 + }, + { + "epoch": 1.6139550048907727, + "grad_norm": 0.12821504473686218, + "learning_rate": 0.0001, + "loss": 0.1937, + "step": 990 + }, + { + "epoch": 1.6155852624714706, + "grad_norm": 0.1238052025437355, + "learning_rate": 0.0001, + "loss": 0.2057, + "step": 991 + }, + { + "epoch": 1.6172155200521683, + "grad_norm": 0.15875551104545593, + "learning_rate": 0.0001, + "loss": 0.2037, + "step": 992 + }, + { + "epoch": 1.618845777632866, + "grad_norm": 0.12168974429368973, + "learning_rate": 0.0001, + "loss": 0.1965, + "step": 993 + }, + { + "epoch": 1.6204760352135636, + "grad_norm": 0.20948785543441772, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 994 + }, + { + "epoch": 1.6221062927942613, + "grad_norm": 0.12937723100185394, + "learning_rate": 0.0001, + "loss": 0.1893, + "step": 995 + }, + { + "epoch": 1.6237365503749592, + "grad_norm": 0.11235075443983078, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 996 + }, + { + "epoch": 1.6253668079556571, + "grad_norm": 0.1234828382730484, + "learning_rate": 0.0001, + "loss": 0.1938, + "step": 997 + }, + { + "epoch": 1.6269970655363548, + "grad_norm": 0.11725595593452454, + "learning_rate": 0.0001, + "loss": 0.1844, + "step": 998 + }, + { + "epoch": 1.6286273231170525, + "grad_norm": 0.1212792843580246, + "learning_rate": 0.0001, + "loss": 0.2011, + "step": 999 + }, + { + "epoch": 1.6302575806977502, + "grad_norm": 0.12212159484624863, + "learning_rate": 0.0001, + "loss": 0.1965, + "step": 1000 + }, + { + "epoch": 1.6302575806977502, + "eval_loss": 0.20038433372974396, + "eval_runtime": 2056.6687, + "eval_samples_per_second": 0.918, + "eval_steps_per_second": 0.229, + "step": 1000 + }, + { + "epoch": 1.6318878382784479, + "grad_norm": 0.102206751704216, + "learning_rate": 0.0001, + "loss": 0.1835, + "step": 1001 + }, + { + "epoch": 1.6335180958591458, + "grad_norm": 0.12285874038934708, + "learning_rate": 0.0001, + "loss": 0.1868, + "step": 1002 + }, + { + "epoch": 1.6351483534398437, + "grad_norm": 0.12729693949222565, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 1003 + }, + { + "epoch": 1.6367786110205413, + "grad_norm": 0.12659911811351776, + "learning_rate": 0.0001, + "loss": 0.1847, + "step": 1004 + }, + { + "epoch": 1.638408868601239, + "grad_norm": 0.13969117403030396, + "learning_rate": 0.0001, + "loss": 0.1903, + "step": 1005 + }, + { + "epoch": 1.6400391261819367, + "grad_norm": 0.14476704597473145, + "learning_rate": 0.0001, + "loss": 0.1901, + "step": 1006 + }, + { + "epoch": 1.6416693837626344, + "grad_norm": 0.15515267848968506, + "learning_rate": 0.0001, + "loss": 0.206, + "step": 1007 + }, + { + "epoch": 1.6432996413433323, + "grad_norm": 0.140212744474411, + "learning_rate": 0.0001, + "loss": 0.1761, + "step": 1008 + }, + { + "epoch": 1.64492989892403, + "grad_norm": 0.149306982755661, + "learning_rate": 0.0001, + "loss": 0.1917, + "step": 1009 + }, + { + "epoch": 1.6465601565047279, + "grad_norm": 0.12449630349874496, + "learning_rate": 0.0001, + "loss": 0.1913, + "step": 1010 + }, + { + "epoch": 1.6481904140854255, + "grad_norm": 0.11689987778663635, + "learning_rate": 0.0001, + "loss": 0.1709, + "step": 1011 + }, + { + "epoch": 1.6498206716661232, + "grad_norm": 0.14068616926670074, + "learning_rate": 0.0001, + "loss": 0.1958, + "step": 1012 + }, + { + "epoch": 1.651450929246821, + "grad_norm": 0.11247096955776215, + "learning_rate": 0.0001, + "loss": 0.206, + "step": 1013 + }, + { + "epoch": 1.6530811868275186, + "grad_norm": 0.1336074322462082, + "learning_rate": 0.0001, + "loss": 0.1828, + "step": 1014 + }, + { + "epoch": 1.6547114444082165, + "grad_norm": 0.1070534735918045, + "learning_rate": 0.0001, + "loss": 0.188, + "step": 1015 + }, + { + "epoch": 1.6563417019889144, + "grad_norm": 0.1153770238161087, + "learning_rate": 0.0001, + "loss": 0.1818, + "step": 1016 + }, + { + "epoch": 1.657971959569612, + "grad_norm": 0.12715262174606323, + "learning_rate": 0.0001, + "loss": 0.1945, + "step": 1017 + }, + { + "epoch": 1.6596022171503098, + "grad_norm": 0.13281576335430145, + "learning_rate": 0.0001, + "loss": 0.1821, + "step": 1018 + }, + { + "epoch": 1.6612324747310074, + "grad_norm": 0.12460318207740784, + "learning_rate": 0.0001, + "loss": 0.1999, + "step": 1019 + }, + { + "epoch": 1.6628627323117051, + "grad_norm": 0.12163477391004562, + "learning_rate": 0.0001, + "loss": 0.1883, + "step": 1020 + }, + { + "epoch": 1.664492989892403, + "grad_norm": 0.11481994390487671, + "learning_rate": 0.0001, + "loss": 0.1898, + "step": 1021 + }, + { + "epoch": 1.6661232474731007, + "grad_norm": 0.11116690933704376, + "learning_rate": 0.0001, + "loss": 0.1947, + "step": 1022 + }, + { + "epoch": 1.6677535050537986, + "grad_norm": 0.12174486368894577, + "learning_rate": 0.0001, + "loss": 0.1876, + "step": 1023 + }, + { + "epoch": 1.6693837626344963, + "grad_norm": 0.13974875211715698, + "learning_rate": 0.0001, + "loss": 0.1769, + "step": 1024 + }, + { + "epoch": 1.671014020215194, + "grad_norm": 0.10470610111951828, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 1025 + }, + { + "epoch": 1.6726442777958916, + "grad_norm": 0.11099997162818909, + "learning_rate": 0.0001, + "loss": 0.1944, + "step": 1026 + }, + { + "epoch": 1.6742745353765895, + "grad_norm": 0.11604826152324677, + "learning_rate": 0.0001, + "loss": 0.1937, + "step": 1027 + }, + { + "epoch": 1.6759047929572872, + "grad_norm": 0.10957235097885132, + "learning_rate": 0.0001, + "loss": 0.1756, + "step": 1028 + }, + { + "epoch": 1.6775350505379851, + "grad_norm": 0.13742849230766296, + "learning_rate": 0.0001, + "loss": 0.182, + "step": 1029 + }, + { + "epoch": 1.6791653081186828, + "grad_norm": 0.12097762525081635, + "learning_rate": 0.0001, + "loss": 0.1996, + "step": 1030 + }, + { + "epoch": 1.6807955656993805, + "grad_norm": 0.12486281245946884, + "learning_rate": 0.0001, + "loss": 0.2007, + "step": 1031 + }, + { + "epoch": 1.6824258232800782, + "grad_norm": 0.112356036901474, + "learning_rate": 0.0001, + "loss": 0.1873, + "step": 1032 + }, + { + "epoch": 1.6840560808607759, + "grad_norm": 0.11297633498907089, + "learning_rate": 0.0001, + "loss": 0.1872, + "step": 1033 + }, + { + "epoch": 1.6856863384414738, + "grad_norm": 0.13877838850021362, + "learning_rate": 0.0001, + "loss": 0.2053, + "step": 1034 + }, + { + "epoch": 1.6873165960221717, + "grad_norm": 0.144087016582489, + "learning_rate": 0.0001, + "loss": 0.1827, + "step": 1035 + }, + { + "epoch": 1.6889468536028693, + "grad_norm": 0.1187191903591156, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 1036 + }, + { + "epoch": 1.690577111183567, + "grad_norm": 0.15161648392677307, + "learning_rate": 0.0001, + "loss": 0.1974, + "step": 1037 + }, + { + "epoch": 1.6922073687642647, + "grad_norm": 0.1534930318593979, + "learning_rate": 0.0001, + "loss": 0.1882, + "step": 1038 + }, + { + "epoch": 1.6938376263449624, + "grad_norm": 0.09909740835428238, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1039 + }, + { + "epoch": 1.6954678839256603, + "grad_norm": 0.13875959813594818, + "learning_rate": 0.0001, + "loss": 0.1884, + "step": 1040 + }, + { + "epoch": 1.697098141506358, + "grad_norm": 0.1244412511587143, + "learning_rate": 0.0001, + "loss": 0.1943, + "step": 1041 + }, + { + "epoch": 1.6987283990870559, + "grad_norm": 0.12313511967658997, + "learning_rate": 0.0001, + "loss": 0.1918, + "step": 1042 + }, + { + "epoch": 1.7003586566677535, + "grad_norm": 0.12056350708007812, + "learning_rate": 0.0001, + "loss": 0.1751, + "step": 1043 + }, + { + "epoch": 1.7019889142484512, + "grad_norm": 0.14450062811374664, + "learning_rate": 0.0001, + "loss": 0.1998, + "step": 1044 + }, + { + "epoch": 1.703619171829149, + "grad_norm": 0.13067997992038727, + "learning_rate": 0.0001, + "loss": 0.1956, + "step": 1045 + }, + { + "epoch": 1.7052494294098468, + "grad_norm": 0.1026124432682991, + "learning_rate": 0.0001, + "loss": 0.1906, + "step": 1046 + }, + { + "epoch": 1.7068796869905445, + "grad_norm": 0.12087104469537735, + "learning_rate": 0.0001, + "loss": 0.1803, + "step": 1047 + }, + { + "epoch": 1.7085099445712424, + "grad_norm": 0.12124813348054886, + "learning_rate": 0.0001, + "loss": 0.1974, + "step": 1048 + }, + { + "epoch": 1.71014020215194, + "grad_norm": 0.12620921432971954, + "learning_rate": 0.0001, + "loss": 0.1952, + "step": 1049 + }, + { + "epoch": 1.7117704597326378, + "grad_norm": 0.12863439321517944, + "learning_rate": 0.0001, + "loss": 0.1943, + "step": 1050 + }, + { + "epoch": 1.7134007173133354, + "grad_norm": 0.11415056884288788, + "learning_rate": 0.0001, + "loss": 0.1928, + "step": 1051 + }, + { + "epoch": 1.7150309748940331, + "grad_norm": 0.11588121205568314, + "learning_rate": 0.0001, + "loss": 0.1833, + "step": 1052 + }, + { + "epoch": 1.716661232474731, + "grad_norm": 0.11711575090885162, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 1053 + }, + { + "epoch": 1.718291490055429, + "grad_norm": 0.11321627348661423, + "learning_rate": 0.0001, + "loss": 0.1908, + "step": 1054 + }, + { + "epoch": 1.7199217476361266, + "grad_norm": 0.11224471032619476, + "learning_rate": 0.0001, + "loss": 0.1812, + "step": 1055 + }, + { + "epoch": 1.7215520052168243, + "grad_norm": 0.09902875125408173, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 1056 + }, + { + "epoch": 1.723182262797522, + "grad_norm": 0.12637241184711456, + "learning_rate": 0.0001, + "loss": 0.2065, + "step": 1057 + }, + { + "epoch": 1.7248125203782196, + "grad_norm": 0.10967186093330383, + "learning_rate": 0.0001, + "loss": 0.1789, + "step": 1058 + }, + { + "epoch": 1.7264427779589175, + "grad_norm": 0.11306209117174149, + "learning_rate": 0.0001, + "loss": 0.1848, + "step": 1059 + }, + { + "epoch": 1.7280730355396152, + "grad_norm": 0.11642675846815109, + "learning_rate": 0.0001, + "loss": 0.19, + "step": 1060 + }, + { + "epoch": 1.7297032931203131, + "grad_norm": 0.14287948608398438, + "learning_rate": 0.0001, + "loss": 0.198, + "step": 1061 + }, + { + "epoch": 1.7313335507010108, + "grad_norm": 0.14489899575710297, + "learning_rate": 0.0001, + "loss": 0.1898, + "step": 1062 + }, + { + "epoch": 1.7329638082817085, + "grad_norm": 0.12616468966007233, + "learning_rate": 0.0001, + "loss": 0.181, + "step": 1063 + }, + { + "epoch": 1.7345940658624062, + "grad_norm": 0.1419239342212677, + "learning_rate": 0.0001, + "loss": 0.1981, + "step": 1064 + }, + { + "epoch": 1.7362243234431038, + "grad_norm": 0.12401966750621796, + "learning_rate": 0.0001, + "loss": 0.1822, + "step": 1065 + }, + { + "epoch": 1.7378545810238017, + "grad_norm": 0.15133000910282135, + "learning_rate": 0.0001, + "loss": 0.1974, + "step": 1066 + }, + { + "epoch": 1.7394848386044997, + "grad_norm": 0.1293191760778427, + "learning_rate": 0.0001, + "loss": 0.1904, + "step": 1067 + }, + { + "epoch": 1.7411150961851973, + "grad_norm": 0.12822459638118744, + "learning_rate": 0.0001, + "loss": 0.1893, + "step": 1068 + }, + { + "epoch": 1.742745353765895, + "grad_norm": 0.10721483081579208, + "learning_rate": 0.0001, + "loss": 0.1752, + "step": 1069 + }, + { + "epoch": 1.7443756113465927, + "grad_norm": 0.13893002271652222, + "learning_rate": 0.0001, + "loss": 0.1981, + "step": 1070 + }, + { + "epoch": 1.7460058689272904, + "grad_norm": 0.1082753837108612, + "learning_rate": 0.0001, + "loss": 0.1925, + "step": 1071 + }, + { + "epoch": 1.7476361265079883, + "grad_norm": 0.12488586455583572, + "learning_rate": 0.0001, + "loss": 0.1759, + "step": 1072 + }, + { + "epoch": 1.7492663840886862, + "grad_norm": 0.10307318717241287, + "learning_rate": 0.0001, + "loss": 0.1685, + "step": 1073 + }, + { + "epoch": 1.7508966416693839, + "grad_norm": 0.12732993066310883, + "learning_rate": 0.0001, + "loss": 0.1843, + "step": 1074 + }, + { + "epoch": 1.7525268992500815, + "grad_norm": 0.12339639663696289, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 1075 + }, + { + "epoch": 1.7541571568307792, + "grad_norm": 0.10467349737882614, + "learning_rate": 0.0001, + "loss": 0.1787, + "step": 1076 + }, + { + "epoch": 1.755787414411477, + "grad_norm": 0.1297389268875122, + "learning_rate": 0.0001, + "loss": 0.1865, + "step": 1077 + }, + { + "epoch": 1.7574176719921748, + "grad_norm": 0.13278993964195251, + "learning_rate": 0.0001, + "loss": 0.1872, + "step": 1078 + }, + { + "epoch": 1.7590479295728725, + "grad_norm": 0.125654399394989, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1079 + }, + { + "epoch": 1.7606781871535704, + "grad_norm": 0.12131417542695999, + "learning_rate": 0.0001, + "loss": 0.1717, + "step": 1080 + }, + { + "epoch": 1.762308444734268, + "grad_norm": 0.12412076443433762, + "learning_rate": 0.0001, + "loss": 0.1819, + "step": 1081 + }, + { + "epoch": 1.7639387023149657, + "grad_norm": 0.12504039704799652, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 1082 + }, + { + "epoch": 1.7655689598956634, + "grad_norm": 0.12615200877189636, + "learning_rate": 0.0001, + "loss": 0.1982, + "step": 1083 + }, + { + "epoch": 1.767199217476361, + "grad_norm": 0.13149379193782806, + "learning_rate": 0.0001, + "loss": 0.1983, + "step": 1084 + }, + { + "epoch": 1.768829475057059, + "grad_norm": 0.13638049364089966, + "learning_rate": 0.0001, + "loss": 0.1938, + "step": 1085 + }, + { + "epoch": 1.770459732637757, + "grad_norm": 0.10553500801324844, + "learning_rate": 0.0001, + "loss": 0.1867, + "step": 1086 + }, + { + "epoch": 1.7720899902184546, + "grad_norm": 0.13788992166519165, + "learning_rate": 0.0001, + "loss": 0.1825, + "step": 1087 + }, + { + "epoch": 1.7737202477991523, + "grad_norm": 0.11936747282743454, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 1088 + }, + { + "epoch": 1.77535050537985, + "grad_norm": 0.133977010846138, + "learning_rate": 0.0001, + "loss": 0.1954, + "step": 1089 + }, + { + "epoch": 1.7769807629605476, + "grad_norm": 0.12340915203094482, + "learning_rate": 0.0001, + "loss": 0.1825, + "step": 1090 + }, + { + "epoch": 1.7786110205412455, + "grad_norm": 0.11585668474435806, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 1091 + }, + { + "epoch": 1.7802412781219432, + "grad_norm": 0.1759333461523056, + "learning_rate": 0.0001, + "loss": 0.1947, + "step": 1092 + }, + { + "epoch": 1.7818715357026411, + "grad_norm": 0.12591791152954102, + "learning_rate": 0.0001, + "loss": 0.1897, + "step": 1093 + }, + { + "epoch": 1.7835017932833388, + "grad_norm": 0.11980108171701431, + "learning_rate": 0.0001, + "loss": 0.1962, + "step": 1094 + }, + { + "epoch": 1.7851320508640365, + "grad_norm": 0.12715961039066315, + "learning_rate": 0.0001, + "loss": 0.1961, + "step": 1095 + }, + { + "epoch": 1.7867623084447342, + "grad_norm": 0.1274608075618744, + "learning_rate": 0.0001, + "loss": 0.1855, + "step": 1096 + }, + { + "epoch": 1.788392566025432, + "grad_norm": 0.15268570184707642, + "learning_rate": 0.0001, + "loss": 0.1918, + "step": 1097 + }, + { + "epoch": 1.7900228236061297, + "grad_norm": 0.1284448206424713, + "learning_rate": 0.0001, + "loss": 0.1912, + "step": 1098 + }, + { + "epoch": 1.7916530811868276, + "grad_norm": 0.1219286322593689, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1099 + }, + { + "epoch": 1.7932833387675253, + "grad_norm": 0.11822406947612762, + "learning_rate": 0.0001, + "loss": 0.1838, + "step": 1100 + }, + { + "epoch": 1.794913596348223, + "grad_norm": 0.12364920228719711, + "learning_rate": 0.0001, + "loss": 0.1897, + "step": 1101 + }, + { + "epoch": 1.7965438539289207, + "grad_norm": 0.11299461871385574, + "learning_rate": 0.0001, + "loss": 0.1955, + "step": 1102 + }, + { + "epoch": 1.7981741115096184, + "grad_norm": 0.13961385190486908, + "learning_rate": 0.0001, + "loss": 0.1899, + "step": 1103 + }, + { + "epoch": 1.7998043690903163, + "grad_norm": 0.1028355211019516, + "learning_rate": 0.0001, + "loss": 0.1776, + "step": 1104 + }, + { + "epoch": 1.8014346266710142, + "grad_norm": 0.107155442237854, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1105 + }, + { + "epoch": 1.8030648842517119, + "grad_norm": 0.11978352069854736, + "learning_rate": 0.0001, + "loss": 0.1929, + "step": 1106 + }, + { + "epoch": 1.8046951418324095, + "grad_norm": 0.12182936072349548, + "learning_rate": 0.0001, + "loss": 0.1874, + "step": 1107 + }, + { + "epoch": 1.8063253994131072, + "grad_norm": 0.1270841509103775, + "learning_rate": 0.0001, + "loss": 0.1832, + "step": 1108 + }, + { + "epoch": 1.807955656993805, + "grad_norm": 0.10812801122665405, + "learning_rate": 0.0001, + "loss": 0.1704, + "step": 1109 + }, + { + "epoch": 1.8095859145745028, + "grad_norm": 0.11686091870069504, + "learning_rate": 0.0001, + "loss": 0.1765, + "step": 1110 + }, + { + "epoch": 1.8112161721552005, + "grad_norm": 0.1108395904302597, + "learning_rate": 0.0001, + "loss": 0.1832, + "step": 1111 + }, + { + "epoch": 1.8128464297358984, + "grad_norm": 0.13820597529411316, + "learning_rate": 0.0001, + "loss": 0.186, + "step": 1112 + }, + { + "epoch": 1.814476687316596, + "grad_norm": 0.1270742267370224, + "learning_rate": 0.0001, + "loss": 0.191, + "step": 1113 + }, + { + "epoch": 1.8161069448972937, + "grad_norm": 0.160540372133255, + "learning_rate": 0.0001, + "loss": 0.1821, + "step": 1114 + }, + { + "epoch": 1.8177372024779914, + "grad_norm": 0.11482474952936172, + "learning_rate": 0.0001, + "loss": 0.1914, + "step": 1115 + }, + { + "epoch": 1.8193674600586893, + "grad_norm": 0.13658450543880463, + "learning_rate": 0.0001, + "loss": 0.173, + "step": 1116 + }, + { + "epoch": 1.820997717639387, + "grad_norm": 0.10865119844675064, + "learning_rate": 0.0001, + "loss": 0.1821, + "step": 1117 + }, + { + "epoch": 1.822627975220085, + "grad_norm": 0.10327646881341934, + "learning_rate": 0.0001, + "loss": 0.1744, + "step": 1118 + }, + { + "epoch": 1.8242582328007826, + "grad_norm": 0.10378043353557587, + "learning_rate": 0.0001, + "loss": 0.1717, + "step": 1119 + }, + { + "epoch": 1.8258884903814803, + "grad_norm": 0.11684548109769821, + "learning_rate": 0.0001, + "loss": 0.1893, + "step": 1120 + }, + { + "epoch": 1.827518747962178, + "grad_norm": 0.1185649037361145, + "learning_rate": 0.0001, + "loss": 0.1865, + "step": 1121 + }, + { + "epoch": 1.8291490055428756, + "grad_norm": 0.11936715245246887, + "learning_rate": 0.0001, + "loss": 0.1844, + "step": 1122 + }, + { + "epoch": 1.8307792631235735, + "grad_norm": 0.10836116224527359, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 1123 + }, + { + "epoch": 1.8324095207042714, + "grad_norm": 0.10425475239753723, + "learning_rate": 0.0001, + "loss": 0.1873, + "step": 1124 + }, + { + "epoch": 1.8340397782849691, + "grad_norm": 0.12443797290325165, + "learning_rate": 0.0001, + "loss": 0.1775, + "step": 1125 + }, + { + "epoch": 1.8356700358656668, + "grad_norm": 0.1314772516489029, + "learning_rate": 0.0001, + "loss": 0.1825, + "step": 1126 + }, + { + "epoch": 1.8373002934463645, + "grad_norm": 0.1321101188659668, + "learning_rate": 0.0001, + "loss": 0.1775, + "step": 1127 + }, + { + "epoch": 1.8389305510270622, + "grad_norm": 0.11217895895242691, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 1128 + }, + { + "epoch": 1.84056080860776, + "grad_norm": 0.11161702871322632, + "learning_rate": 0.0001, + "loss": 0.1856, + "step": 1129 + }, + { + "epoch": 1.8421910661884577, + "grad_norm": 0.11947924643754959, + "learning_rate": 0.0001, + "loss": 0.1818, + "step": 1130 + }, + { + "epoch": 1.8438213237691556, + "grad_norm": 0.15216386318206787, + "learning_rate": 0.0001, + "loss": 0.18, + "step": 1131 + }, + { + "epoch": 1.8454515813498533, + "grad_norm": 0.12306888401508331, + "learning_rate": 0.0001, + "loss": 0.1918, + "step": 1132 + }, + { + "epoch": 1.847081838930551, + "grad_norm": 0.1321091651916504, + "learning_rate": 0.0001, + "loss": 0.1945, + "step": 1133 + }, + { + "epoch": 1.8487120965112487, + "grad_norm": 0.12340245395898819, + "learning_rate": 0.0001, + "loss": 0.1853, + "step": 1134 + }, + { + "epoch": 1.8503423540919464, + "grad_norm": 0.1099599152803421, + "learning_rate": 0.0001, + "loss": 0.1979, + "step": 1135 + }, + { + "epoch": 1.8519726116726443, + "grad_norm": 0.13142207264900208, + "learning_rate": 0.0001, + "loss": 0.1811, + "step": 1136 + }, + { + "epoch": 1.8536028692533422, + "grad_norm": 0.11548743396997452, + "learning_rate": 0.0001, + "loss": 0.1958, + "step": 1137 + }, + { + "epoch": 1.8552331268340398, + "grad_norm": 0.1179068312048912, + "learning_rate": 0.0001, + "loss": 0.1836, + "step": 1138 + }, + { + "epoch": 1.8568633844147375, + "grad_norm": 0.1258961409330368, + "learning_rate": 0.0001, + "loss": 0.1883, + "step": 1139 + }, + { + "epoch": 1.8584936419954352, + "grad_norm": 0.11142059415578842, + "learning_rate": 0.0001, + "loss": 0.1921, + "step": 1140 + }, + { + "epoch": 1.8601238995761329, + "grad_norm": 0.11219684034585953, + "learning_rate": 0.0001, + "loss": 0.1818, + "step": 1141 + }, + { + "epoch": 1.8617541571568308, + "grad_norm": 0.11732007563114166, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 1142 + }, + { + "epoch": 1.8633844147375287, + "grad_norm": 0.10912680625915527, + "learning_rate": 0.0001, + "loss": 0.1823, + "step": 1143 + }, + { + "epoch": 1.8650146723182264, + "grad_norm": 0.13096576929092407, + "learning_rate": 0.0001, + "loss": 0.1958, + "step": 1144 + }, + { + "epoch": 1.866644929898924, + "grad_norm": 0.1521504819393158, + "learning_rate": 0.0001, + "loss": 0.1884, + "step": 1145 + }, + { + "epoch": 1.8682751874796217, + "grad_norm": 0.12232112884521484, + "learning_rate": 0.0001, + "loss": 0.2035, + "step": 1146 + }, + { + "epoch": 1.8699054450603194, + "grad_norm": 0.10666659474372864, + "learning_rate": 0.0001, + "loss": 0.1678, + "step": 1147 + }, + { + "epoch": 1.8715357026410173, + "grad_norm": 0.12789608538150787, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1148 + }, + { + "epoch": 1.873165960221715, + "grad_norm": 0.1496013104915619, + "learning_rate": 0.0001, + "loss": 0.2019, + "step": 1149 + }, + { + "epoch": 1.874796217802413, + "grad_norm": 0.1169729083776474, + "learning_rate": 0.0001, + "loss": 0.1706, + "step": 1150 + }, + { + "epoch": 1.8764264753831106, + "grad_norm": 0.14244718849658966, + "learning_rate": 0.0001, + "loss": 0.1868, + "step": 1151 + }, + { + "epoch": 1.8780567329638083, + "grad_norm": 0.12071932852268219, + "learning_rate": 0.0001, + "loss": 0.186, + "step": 1152 + }, + { + "epoch": 1.879686990544506, + "grad_norm": 0.12490394711494446, + "learning_rate": 0.0001, + "loss": 0.1912, + "step": 1153 + }, + { + "epoch": 1.8813172481252036, + "grad_norm": 0.13060052692890167, + "learning_rate": 0.0001, + "loss": 0.2214, + "step": 1154 + }, + { + "epoch": 1.8829475057059015, + "grad_norm": 0.12722712755203247, + "learning_rate": 0.0001, + "loss": 0.1813, + "step": 1155 + }, + { + "epoch": 1.8845777632865994, + "grad_norm": 0.12535281479358673, + "learning_rate": 0.0001, + "loss": 0.1961, + "step": 1156 + }, + { + "epoch": 1.886208020867297, + "grad_norm": 0.11369742453098297, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 1157 + }, + { + "epoch": 1.8878382784479948, + "grad_norm": 0.1047392338514328, + "learning_rate": 0.0001, + "loss": 0.1765, + "step": 1158 + }, + { + "epoch": 1.8894685360286925, + "grad_norm": 0.13275443017482758, + "learning_rate": 0.0001, + "loss": 0.1865, + "step": 1159 + }, + { + "epoch": 1.8910987936093901, + "grad_norm": 0.11752204596996307, + "learning_rate": 0.0001, + "loss": 0.1886, + "step": 1160 + }, + { + "epoch": 1.892729051190088, + "grad_norm": 0.11460895091295242, + "learning_rate": 0.0001, + "loss": 0.1898, + "step": 1161 + }, + { + "epoch": 1.8943593087707857, + "grad_norm": 0.12889966368675232, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 1162 + }, + { + "epoch": 1.8959895663514836, + "grad_norm": 0.11824796348810196, + "learning_rate": 0.0001, + "loss": 0.1775, + "step": 1163 + }, + { + "epoch": 1.8976198239321813, + "grad_norm": 0.10617636889219284, + "learning_rate": 0.0001, + "loss": 0.1718, + "step": 1164 + }, + { + "epoch": 1.899250081512879, + "grad_norm": 0.12249031662940979, + "learning_rate": 0.0001, + "loss": 0.1732, + "step": 1165 + }, + { + "epoch": 1.9008803390935767, + "grad_norm": 0.1171279028058052, + "learning_rate": 0.0001, + "loss": 0.1837, + "step": 1166 + }, + { + "epoch": 1.9025105966742746, + "grad_norm": 0.12532579898834229, + "learning_rate": 0.0001, + "loss": 0.1846, + "step": 1167 + }, + { + "epoch": 1.9041408542549723, + "grad_norm": 0.13807226717472076, + "learning_rate": 0.0001, + "loss": 0.18, + "step": 1168 + }, + { + "epoch": 1.9057711118356702, + "grad_norm": 0.1306944191455841, + "learning_rate": 0.0001, + "loss": 0.1849, + "step": 1169 + }, + { + "epoch": 1.9074013694163678, + "grad_norm": 0.3306502401828766, + "learning_rate": 0.0001, + "loss": 0.2122, + "step": 1170 + }, + { + "epoch": 1.9090316269970655, + "grad_norm": 0.1267193704843521, + "learning_rate": 0.0001, + "loss": 0.1893, + "step": 1171 + }, + { + "epoch": 1.9106618845777632, + "grad_norm": 0.1224604994058609, + "learning_rate": 0.0001, + "loss": 0.1989, + "step": 1172 + }, + { + "epoch": 1.9122921421584609, + "grad_norm": 0.11894525587558746, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1173 + }, + { + "epoch": 1.9139223997391588, + "grad_norm": 0.11913865804672241, + "learning_rate": 0.0001, + "loss": 0.1845, + "step": 1174 + }, + { + "epoch": 1.9155526573198567, + "grad_norm": 0.11289380490779877, + "learning_rate": 0.0001, + "loss": 0.182, + "step": 1175 + }, + { + "epoch": 1.9171829149005544, + "grad_norm": 0.13549114763736725, + "learning_rate": 0.0001, + "loss": 0.206, + "step": 1176 + }, + { + "epoch": 1.918813172481252, + "grad_norm": 0.1213872879743576, + "learning_rate": 0.0001, + "loss": 0.1779, + "step": 1177 + }, + { + "epoch": 1.9204434300619497, + "grad_norm": 0.13372060656547546, + "learning_rate": 0.0001, + "loss": 0.1761, + "step": 1178 + }, + { + "epoch": 1.9220736876426474, + "grad_norm": 0.09771383553743362, + "learning_rate": 0.0001, + "loss": 0.1687, + "step": 1179 + }, + { + "epoch": 1.9237039452233453, + "grad_norm": 0.12244701385498047, + "learning_rate": 0.0001, + "loss": 0.1827, + "step": 1180 + }, + { + "epoch": 1.925334202804043, + "grad_norm": 0.1301904022693634, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 1181 + }, + { + "epoch": 1.926964460384741, + "grad_norm": 0.14816220104694366, + "learning_rate": 0.0001, + "loss": 0.2002, + "step": 1182 + }, + { + "epoch": 1.9285947179654386, + "grad_norm": 0.11727407574653625, + "learning_rate": 0.0001, + "loss": 0.1835, + "step": 1183 + }, + { + "epoch": 1.9302249755461363, + "grad_norm": 0.13211950659751892, + "learning_rate": 0.0001, + "loss": 0.1859, + "step": 1184 + }, + { + "epoch": 1.931855233126834, + "grad_norm": 0.12375782430171967, + "learning_rate": 0.0001, + "loss": 0.1837, + "step": 1185 + }, + { + "epoch": 1.9334854907075318, + "grad_norm": 0.14515690505504608, + "learning_rate": 0.0001, + "loss": 0.1757, + "step": 1186 + }, + { + "epoch": 1.9351157482882295, + "grad_norm": 0.11579223722219467, + "learning_rate": 0.0001, + "loss": 0.1767, + "step": 1187 + }, + { + "epoch": 1.9367460058689274, + "grad_norm": 0.11580117791891098, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 1188 + }, + { + "epoch": 1.938376263449625, + "grad_norm": 0.13860304653644562, + "learning_rate": 0.0001, + "loss": 0.1899, + "step": 1189 + }, + { + "epoch": 1.9400065210303228, + "grad_norm": 0.10386091470718384, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 1190 + }, + { + "epoch": 1.9416367786110205, + "grad_norm": 0.12089010328054428, + "learning_rate": 0.0001, + "loss": 0.1886, + "step": 1191 + }, + { + "epoch": 1.9432670361917181, + "grad_norm": 0.13745586574077606, + "learning_rate": 0.0001, + "loss": 0.1865, + "step": 1192 + }, + { + "epoch": 1.944897293772416, + "grad_norm": 0.12096656113862991, + "learning_rate": 0.0001, + "loss": 0.1929, + "step": 1193 + }, + { + "epoch": 1.946527551353114, + "grad_norm": 0.24482771754264832, + "learning_rate": 0.0001, + "loss": 0.1911, + "step": 1194 + }, + { + "epoch": 1.9481578089338116, + "grad_norm": 0.13980430364608765, + "learning_rate": 0.0001, + "loss": 0.1951, + "step": 1195 + }, + { + "epoch": 1.9497880665145093, + "grad_norm": 0.1281464695930481, + "learning_rate": 0.0001, + "loss": 0.1797, + "step": 1196 + }, + { + "epoch": 1.951418324095207, + "grad_norm": 0.11813049763441086, + "learning_rate": 0.0001, + "loss": 0.1836, + "step": 1197 + }, + { + "epoch": 1.9530485816759047, + "grad_norm": 0.13674259185791016, + "learning_rate": 0.0001, + "loss": 0.1921, + "step": 1198 + }, + { + "epoch": 1.9546788392566026, + "grad_norm": 0.12205852568149567, + "learning_rate": 0.0001, + "loss": 0.1726, + "step": 1199 + }, + { + "epoch": 1.9563090968373003, + "grad_norm": 0.12858718633651733, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 1200 + }, + { + "epoch": 1.9579393544179982, + "grad_norm": 0.1414395272731781, + "learning_rate": 0.0001, + "loss": 0.1791, + "step": 1201 + }, + { + "epoch": 1.9595696119986958, + "grad_norm": 0.10766829550266266, + "learning_rate": 0.0001, + "loss": 0.1905, + "step": 1202 + }, + { + "epoch": 1.9611998695793935, + "grad_norm": 0.12443807721138, + "learning_rate": 0.0001, + "loss": 0.1847, + "step": 1203 + }, + { + "epoch": 1.9628301271600912, + "grad_norm": 0.1413409411907196, + "learning_rate": 0.0001, + "loss": 0.1943, + "step": 1204 + }, + { + "epoch": 1.9644603847407889, + "grad_norm": 0.12135512381792068, + "learning_rate": 0.0001, + "loss": 0.1957, + "step": 1205 + }, + { + "epoch": 1.9660906423214868, + "grad_norm": 0.15406377613544464, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 1206 + }, + { + "epoch": 1.9677208999021847, + "grad_norm": 0.10838975757360458, + "learning_rate": 0.0001, + "loss": 0.1849, + "step": 1207 + }, + { + "epoch": 1.9693511574828824, + "grad_norm": 0.12363716959953308, + "learning_rate": 0.0001, + "loss": 0.2111, + "step": 1208 + }, + { + "epoch": 1.97098141506358, + "grad_norm": 0.10429894924163818, + "learning_rate": 0.0001, + "loss": 0.1778, + "step": 1209 + }, + { + "epoch": 1.9726116726442777, + "grad_norm": 0.11979079246520996, + "learning_rate": 0.0001, + "loss": 0.197, + "step": 1210 + }, + { + "epoch": 1.9742419302249754, + "grad_norm": 0.10798018425703049, + "learning_rate": 0.0001, + "loss": 0.1697, + "step": 1211 + }, + { + "epoch": 1.9758721878056733, + "grad_norm": 0.11077526211738586, + "learning_rate": 0.0001, + "loss": 0.1881, + "step": 1212 + }, + { + "epoch": 1.9775024453863712, + "grad_norm": 0.13390393555164337, + "learning_rate": 0.0001, + "loss": 0.1909, + "step": 1213 + }, + { + "epoch": 1.979132702967069, + "grad_norm": 0.11331623047590256, + "learning_rate": 0.0001, + "loss": 0.1869, + "step": 1214 + }, + { + "epoch": 1.9807629605477666, + "grad_norm": 0.12054945528507233, + "learning_rate": 0.0001, + "loss": 0.2016, + "step": 1215 + }, + { + "epoch": 1.9823932181284643, + "grad_norm": 0.11420212686061859, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 1216 + }, + { + "epoch": 1.984023475709162, + "grad_norm": 0.1471630334854126, + "learning_rate": 0.0001, + "loss": 0.1991, + "step": 1217 + }, + { + "epoch": 1.9856537332898598, + "grad_norm": 0.11184452474117279, + "learning_rate": 0.0001, + "loss": 0.1728, + "step": 1218 + }, + { + "epoch": 1.9872839908705575, + "grad_norm": 0.12244154512882233, + "learning_rate": 0.0001, + "loss": 0.1725, + "step": 1219 + }, + { + "epoch": 1.9889142484512554, + "grad_norm": 0.11920984834432602, + "learning_rate": 0.0001, + "loss": 0.1746, + "step": 1220 + }, + { + "epoch": 1.990544506031953, + "grad_norm": 0.14389824867248535, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 1221 + }, + { + "epoch": 1.9921747636126508, + "grad_norm": 0.13001768290996552, + "learning_rate": 0.0001, + "loss": 0.1988, + "step": 1222 + }, + { + "epoch": 1.9938050211933485, + "grad_norm": 0.2044341266155243, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 1223 + }, + { + "epoch": 1.9954352787740461, + "grad_norm": 0.11807113885879517, + "learning_rate": 0.0001, + "loss": 0.1748, + "step": 1224 + }, + { + "epoch": 1.997065536354744, + "grad_norm": 0.12389995157718658, + "learning_rate": 0.0001, + "loss": 0.1968, + "step": 1225 + }, + { + "epoch": 1.998695793935442, + "grad_norm": 0.11298985034227371, + "learning_rate": 0.0001, + "loss": 0.1781, + "step": 1226 + }, + { + "epoch": 2.0003260515161396, + "grad_norm": 0.15368470549583435, + "learning_rate": 0.0001, + "loss": 0.1889, + "step": 1227 + }, + { + "epoch": 2.0019563090968373, + "grad_norm": 0.13915176689624786, + "learning_rate": 0.0001, + "loss": 0.1791, + "step": 1228 + }, + { + "epoch": 2.003586566677535, + "grad_norm": 0.13117247819900513, + "learning_rate": 0.0001, + "loss": 0.1753, + "step": 1229 + }, + { + "epoch": 2.0052168242582327, + "grad_norm": 0.1464276909828186, + "learning_rate": 0.0001, + "loss": 0.1773, + "step": 1230 + }, + { + "epoch": 2.0068470818389303, + "grad_norm": 0.1339617669582367, + "learning_rate": 0.0001, + "loss": 0.1857, + "step": 1231 + }, + { + "epoch": 2.0084773394196285, + "grad_norm": 0.12631379067897797, + "learning_rate": 0.0001, + "loss": 0.1711, + "step": 1232 + }, + { + "epoch": 2.010107597000326, + "grad_norm": 0.1561838537454605, + "learning_rate": 0.0001, + "loss": 0.173, + "step": 1233 + }, + { + "epoch": 2.011737854581024, + "grad_norm": 0.13479989767074585, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 1234 + }, + { + "epoch": 2.0133681121617215, + "grad_norm": 0.14471793174743652, + "learning_rate": 0.0001, + "loss": 0.1712, + "step": 1235 + }, + { + "epoch": 2.014998369742419, + "grad_norm": 0.13273389637470245, + "learning_rate": 0.0001, + "loss": 0.1689, + "step": 1236 + }, + { + "epoch": 2.016628627323117, + "grad_norm": 0.15367534756660461, + "learning_rate": 0.0001, + "loss": 0.1917, + "step": 1237 + }, + { + "epoch": 2.018258884903815, + "grad_norm": 0.14998719096183777, + "learning_rate": 0.0001, + "loss": 0.1683, + "step": 1238 + }, + { + "epoch": 2.0198891424845127, + "grad_norm": 0.12296893447637558, + "learning_rate": 0.0001, + "loss": 0.1604, + "step": 1239 + }, + { + "epoch": 2.0215194000652104, + "grad_norm": 0.1817280501127243, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 1240 + }, + { + "epoch": 2.023149657645908, + "grad_norm": 0.13128723204135895, + "learning_rate": 0.0001, + "loss": 0.1664, + "step": 1241 + }, + { + "epoch": 2.0247799152266057, + "grad_norm": 0.10108043253421783, + "learning_rate": 0.0001, + "loss": 0.1612, + "step": 1242 + }, + { + "epoch": 2.0264101728073034, + "grad_norm": 0.12584680318832397, + "learning_rate": 0.0001, + "loss": 0.1769, + "step": 1243 + }, + { + "epoch": 2.0280404303880015, + "grad_norm": 0.11849281191825867, + "learning_rate": 0.0001, + "loss": 0.1684, + "step": 1244 + }, + { + "epoch": 2.029670687968699, + "grad_norm": 0.15767361223697662, + "learning_rate": 0.0001, + "loss": 0.172, + "step": 1245 + }, + { + "epoch": 2.031300945549397, + "grad_norm": 0.13649272918701172, + "learning_rate": 0.0001, + "loss": 0.1777, + "step": 1246 + }, + { + "epoch": 2.0329312031300946, + "grad_norm": 0.12619361281394958, + "learning_rate": 0.0001, + "loss": 0.1843, + "step": 1247 + }, + { + "epoch": 2.0345614607107922, + "grad_norm": 0.12181363254785538, + "learning_rate": 0.0001, + "loss": 0.1651, + "step": 1248 + }, + { + "epoch": 2.03619171829149, + "grad_norm": 0.14900757372379303, + "learning_rate": 0.0001, + "loss": 0.1777, + "step": 1249 + }, + { + "epoch": 2.0378219758721876, + "grad_norm": 0.13565577566623688, + "learning_rate": 0.0001, + "loss": 0.1797, + "step": 1250 + }, + { + "epoch": 2.0394522334528857, + "grad_norm": 0.11237785965204239, + "learning_rate": 0.0001, + "loss": 0.1577, + "step": 1251 + }, + { + "epoch": 2.0410824910335834, + "grad_norm": 0.14608171582221985, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1252 + }, + { + "epoch": 2.042712748614281, + "grad_norm": 0.14351628720760345, + "learning_rate": 0.0001, + "loss": 0.1848, + "step": 1253 + }, + { + "epoch": 2.0443430061949788, + "grad_norm": 0.13642996549606323, + "learning_rate": 0.0001, + "loss": 0.1919, + "step": 1254 + }, + { + "epoch": 2.0459732637756765, + "grad_norm": 0.13869710266590118, + "learning_rate": 0.0001, + "loss": 0.1739, + "step": 1255 + }, + { + "epoch": 2.047603521356374, + "grad_norm": 0.12874935567378998, + "learning_rate": 0.0001, + "loss": 0.1785, + "step": 1256 + }, + { + "epoch": 2.0492337789370723, + "grad_norm": 0.12782157957553864, + "learning_rate": 0.0001, + "loss": 0.1802, + "step": 1257 + }, + { + "epoch": 2.05086403651777, + "grad_norm": 0.12892591953277588, + "learning_rate": 0.0001, + "loss": 0.1704, + "step": 1258 + }, + { + "epoch": 2.0524942940984676, + "grad_norm": 0.1089789867401123, + "learning_rate": 0.0001, + "loss": 0.1578, + "step": 1259 + }, + { + "epoch": 2.0541245516791653, + "grad_norm": 0.1354093849658966, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1260 + }, + { + "epoch": 2.055754809259863, + "grad_norm": 0.13709183037281036, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 1261 + }, + { + "epoch": 2.0573850668405607, + "grad_norm": 0.14396348595619202, + "learning_rate": 0.0001, + "loss": 0.1815, + "step": 1262 + }, + { + "epoch": 2.0590153244212583, + "grad_norm": 0.1237788200378418, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 1263 + }, + { + "epoch": 2.0606455820019565, + "grad_norm": 0.13664290308952332, + "learning_rate": 0.0001, + "loss": 0.1786, + "step": 1264 + }, + { + "epoch": 2.062275839582654, + "grad_norm": 0.16328440606594086, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 1265 + }, + { + "epoch": 2.063906097163352, + "grad_norm": 0.1353476196527481, + "learning_rate": 0.0001, + "loss": 0.1659, + "step": 1266 + }, + { + "epoch": 2.0655363547440495, + "grad_norm": 0.16309651732444763, + "learning_rate": 0.0001, + "loss": 0.1999, + "step": 1267 + }, + { + "epoch": 2.067166612324747, + "grad_norm": 0.13841314613819122, + "learning_rate": 0.0001, + "loss": 0.1974, + "step": 1268 + }, + { + "epoch": 2.068796869905445, + "grad_norm": 0.12356504052877426, + "learning_rate": 0.0001, + "loss": 0.1695, + "step": 1269 + }, + { + "epoch": 2.070427127486143, + "grad_norm": 0.13916341960430145, + "learning_rate": 0.0001, + "loss": 0.165, + "step": 1270 + }, + { + "epoch": 2.0720573850668407, + "grad_norm": 0.13832920789718628, + "learning_rate": 0.0001, + "loss": 0.1619, + "step": 1271 + }, + { + "epoch": 2.0736876426475384, + "grad_norm": 0.14106802642345428, + "learning_rate": 0.0001, + "loss": 0.1749, + "step": 1272 + }, + { + "epoch": 2.075317900228236, + "grad_norm": 0.12422151118516922, + "learning_rate": 0.0001, + "loss": 0.1718, + "step": 1273 + }, + { + "epoch": 2.0769481578089337, + "grad_norm": 0.14482441544532776, + "learning_rate": 0.0001, + "loss": 0.1807, + "step": 1274 + }, + { + "epoch": 2.0785784153896314, + "grad_norm": 0.11608091741800308, + "learning_rate": 0.0001, + "loss": 0.1759, + "step": 1275 + }, + { + "epoch": 2.0802086729703295, + "grad_norm": 0.12500539422035217, + "learning_rate": 0.0001, + "loss": 0.1542, + "step": 1276 + }, + { + "epoch": 2.081838930551027, + "grad_norm": 0.13771778345108032, + "learning_rate": 0.0001, + "loss": 0.1899, + "step": 1277 + }, + { + "epoch": 2.083469188131725, + "grad_norm": 0.12829305231571198, + "learning_rate": 0.0001, + "loss": 0.1617, + "step": 1278 + }, + { + "epoch": 2.0850994457124226, + "grad_norm": 0.12664735317230225, + "learning_rate": 0.0001, + "loss": 0.1756, + "step": 1279 + }, + { + "epoch": 2.0867297032931202, + "grad_norm": 0.14798226952552795, + "learning_rate": 0.0001, + "loss": 0.175, + "step": 1280 + }, + { + "epoch": 2.088359960873818, + "grad_norm": 0.13553886115550995, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 1281 + }, + { + "epoch": 2.0899902184545156, + "grad_norm": 0.14772772789001465, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1282 + }, + { + "epoch": 2.0916204760352137, + "grad_norm": 0.14489415287971497, + "learning_rate": 0.0001, + "loss": 0.1595, + "step": 1283 + }, + { + "epoch": 2.0932507336159114, + "grad_norm": 0.14262838661670685, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 1284 + }, + { + "epoch": 2.094880991196609, + "grad_norm": 0.13312998414039612, + "learning_rate": 0.0001, + "loss": 0.172, + "step": 1285 + }, + { + "epoch": 2.0965112487773068, + "grad_norm": 0.1361507624387741, + "learning_rate": 0.0001, + "loss": 0.1679, + "step": 1286 + }, + { + "epoch": 2.0981415063580044, + "grad_norm": 0.13537771999835968, + "learning_rate": 0.0001, + "loss": 0.1656, + "step": 1287 + }, + { + "epoch": 2.099771763938702, + "grad_norm": 0.16186851263046265, + "learning_rate": 0.0001, + "loss": 0.1786, + "step": 1288 + }, + { + "epoch": 2.1014020215194003, + "grad_norm": 0.1386481523513794, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 1289 + }, + { + "epoch": 2.103032279100098, + "grad_norm": 0.12809383869171143, + "learning_rate": 0.0001, + "loss": 0.1676, + "step": 1290 + }, + { + "epoch": 2.1046625366807956, + "grad_norm": 0.12493950873613358, + "learning_rate": 0.0001, + "loss": 0.1634, + "step": 1291 + }, + { + "epoch": 2.1062927942614933, + "grad_norm": 0.12215922027826309, + "learning_rate": 0.0001, + "loss": 0.1633, + "step": 1292 + }, + { + "epoch": 2.107923051842191, + "grad_norm": 0.14522279798984528, + "learning_rate": 0.0001, + "loss": 0.1785, + "step": 1293 + }, + { + "epoch": 2.1095533094228887, + "grad_norm": 0.16697941720485687, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 1294 + }, + { + "epoch": 2.111183567003587, + "grad_norm": 0.12275785952806473, + "learning_rate": 0.0001, + "loss": 0.174, + "step": 1295 + }, + { + "epoch": 2.1128138245842845, + "grad_norm": 0.12233259528875351, + "learning_rate": 0.0001, + "loss": 0.1639, + "step": 1296 + }, + { + "epoch": 2.114444082164982, + "grad_norm": 0.145602285861969, + "learning_rate": 0.0001, + "loss": 0.1725, + "step": 1297 + }, + { + "epoch": 2.11607433974568, + "grad_norm": 0.18430952727794647, + "learning_rate": 0.0001, + "loss": 0.1617, + "step": 1298 + }, + { + "epoch": 2.1177045973263775, + "grad_norm": 0.12870638072490692, + "learning_rate": 0.0001, + "loss": 0.1741, + "step": 1299 + }, + { + "epoch": 2.119334854907075, + "grad_norm": 0.1242339164018631, + "learning_rate": 0.0001, + "loss": 0.1614, + "step": 1300 + }, + { + "epoch": 2.120965112487773, + "grad_norm": 0.15219736099243164, + "learning_rate": 0.0001, + "loss": 0.1759, + "step": 1301 + }, + { + "epoch": 2.122595370068471, + "grad_norm": 0.14123129844665527, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 1302 + }, + { + "epoch": 2.1242256276491687, + "grad_norm": 0.11224009841680527, + "learning_rate": 0.0001, + "loss": 0.1586, + "step": 1303 + }, + { + "epoch": 2.1258558852298663, + "grad_norm": 0.1388261467218399, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 1304 + }, + { + "epoch": 2.127486142810564, + "grad_norm": 0.13148367404937744, + "learning_rate": 0.0001, + "loss": 0.1894, + "step": 1305 + }, + { + "epoch": 2.1291164003912617, + "grad_norm": 0.1266559213399887, + "learning_rate": 0.0001, + "loss": 0.1582, + "step": 1306 + }, + { + "epoch": 2.1307466579719594, + "grad_norm": 0.15286579728126526, + "learning_rate": 0.0001, + "loss": 0.1852, + "step": 1307 + }, + { + "epoch": 2.1323769155526575, + "grad_norm": 0.1486169546842575, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 1308 + }, + { + "epoch": 2.134007173133355, + "grad_norm": 0.1324825882911682, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 1309 + }, + { + "epoch": 2.135637430714053, + "grad_norm": 0.12974712252616882, + "learning_rate": 0.0001, + "loss": 0.1786, + "step": 1310 + }, + { + "epoch": 2.1372676882947506, + "grad_norm": 0.17382071912288666, + "learning_rate": 0.0001, + "loss": 0.1659, + "step": 1311 + }, + { + "epoch": 2.1388979458754482, + "grad_norm": 0.14870017766952515, + "learning_rate": 0.0001, + "loss": 0.1827, + "step": 1312 + }, + { + "epoch": 2.140528203456146, + "grad_norm": 0.1424129158258438, + "learning_rate": 0.0001, + "loss": 0.1813, + "step": 1313 + }, + { + "epoch": 2.1421584610368436, + "grad_norm": 0.15458005666732788, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1314 + }, + { + "epoch": 2.1437887186175417, + "grad_norm": 0.1291740983724594, + "learning_rate": 0.0001, + "loss": 0.1563, + "step": 1315 + }, + { + "epoch": 2.1454189761982394, + "grad_norm": 0.14298751950263977, + "learning_rate": 0.0001, + "loss": 0.176, + "step": 1316 + }, + { + "epoch": 2.147049233778937, + "grad_norm": 0.1237107664346695, + "learning_rate": 0.0001, + "loss": 0.1636, + "step": 1317 + }, + { + "epoch": 2.1486794913596348, + "grad_norm": 0.11446285992860794, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 1318 + }, + { + "epoch": 2.1503097489403324, + "grad_norm": 0.11239829659461975, + "learning_rate": 0.0001, + "loss": 0.1606, + "step": 1319 + }, + { + "epoch": 2.15194000652103, + "grad_norm": 0.12736567854881287, + "learning_rate": 0.0001, + "loss": 0.1778, + "step": 1320 + }, + { + "epoch": 2.1535702641017282, + "grad_norm": 0.13921810686588287, + "learning_rate": 0.0001, + "loss": 0.1882, + "step": 1321 + }, + { + "epoch": 2.155200521682426, + "grad_norm": 0.12388879060745239, + "learning_rate": 0.0001, + "loss": 0.1824, + "step": 1322 + }, + { + "epoch": 2.1568307792631236, + "grad_norm": 0.1522189825773239, + "learning_rate": 0.0001, + "loss": 0.1556, + "step": 1323 + }, + { + "epoch": 2.1584610368438213, + "grad_norm": 0.13830043375492096, + "learning_rate": 0.0001, + "loss": 0.1706, + "step": 1324 + }, + { + "epoch": 2.160091294424519, + "grad_norm": 0.14144088327884674, + "learning_rate": 0.0001, + "loss": 0.1699, + "step": 1325 + }, + { + "epoch": 2.1617215520052167, + "grad_norm": 0.14037571847438812, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 1326 + }, + { + "epoch": 2.1633518095859148, + "grad_norm": 0.16178935766220093, + "learning_rate": 0.0001, + "loss": 0.1881, + "step": 1327 + }, + { + "epoch": 2.1649820671666125, + "grad_norm": 0.12526622414588928, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1328 + }, + { + "epoch": 2.16661232474731, + "grad_norm": 0.1448407918214798, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1329 + }, + { + "epoch": 2.168242582328008, + "grad_norm": 0.15621818602085114, + "learning_rate": 0.0001, + "loss": 0.1764, + "step": 1330 + }, + { + "epoch": 2.1698728399087055, + "grad_norm": 0.14016954600811005, + "learning_rate": 0.0001, + "loss": 0.1752, + "step": 1331 + }, + { + "epoch": 2.171503097489403, + "grad_norm": 0.15519127249717712, + "learning_rate": 0.0001, + "loss": 0.1701, + "step": 1332 + }, + { + "epoch": 2.1731333550701013, + "grad_norm": 0.12022172659635544, + "learning_rate": 0.0001, + "loss": 0.1639, + "step": 1333 + }, + { + "epoch": 2.174763612650799, + "grad_norm": 0.1351272314786911, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 1334 + }, + { + "epoch": 2.1763938702314967, + "grad_norm": 0.13195279240608215, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 1335 + }, + { + "epoch": 2.1780241278121943, + "grad_norm": 0.14435507357120514, + "learning_rate": 0.0001, + "loss": 0.1801, + "step": 1336 + }, + { + "epoch": 2.179654385392892, + "grad_norm": 0.13603608310222626, + "learning_rate": 0.0001, + "loss": 0.1643, + "step": 1337 + }, + { + "epoch": 2.1812846429735897, + "grad_norm": 0.14588388800621033, + "learning_rate": 0.0001, + "loss": 0.1711, + "step": 1338 + }, + { + "epoch": 2.1829149005542874, + "grad_norm": 0.13602250814437866, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 1339 + }, + { + "epoch": 2.1845451581349855, + "grad_norm": 0.13646137714385986, + "learning_rate": 0.0001, + "loss": 0.1881, + "step": 1340 + }, + { + "epoch": 2.186175415715683, + "grad_norm": 0.13280268013477325, + "learning_rate": 0.0001, + "loss": 0.1708, + "step": 1341 + }, + { + "epoch": 2.187805673296381, + "grad_norm": 0.13370732963085175, + "learning_rate": 0.0001, + "loss": 0.1667, + "step": 1342 + }, + { + "epoch": 2.1894359308770786, + "grad_norm": 0.14326944947242737, + "learning_rate": 0.0001, + "loss": 0.1686, + "step": 1343 + }, + { + "epoch": 2.1910661884577762, + "grad_norm": 0.12916752696037292, + "learning_rate": 0.0001, + "loss": 0.1779, + "step": 1344 + }, + { + "epoch": 2.192696446038474, + "grad_norm": 0.14404352009296417, + "learning_rate": 0.0001, + "loss": 0.171, + "step": 1345 + }, + { + "epoch": 2.194326703619172, + "grad_norm": 0.12311873584985733, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 1346 + }, + { + "epoch": 2.1959569611998697, + "grad_norm": 0.15599961578845978, + "learning_rate": 0.0001, + "loss": 0.1748, + "step": 1347 + }, + { + "epoch": 2.1975872187805674, + "grad_norm": 0.12893660366535187, + "learning_rate": 0.0001, + "loss": 0.1829, + "step": 1348 + }, + { + "epoch": 2.199217476361265, + "grad_norm": 0.12859125435352325, + "learning_rate": 0.0001, + "loss": 0.1797, + "step": 1349 + }, + { + "epoch": 2.2008477339419628, + "grad_norm": 0.1159624233841896, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 1350 + }, + { + "epoch": 2.2024779915226604, + "grad_norm": 0.13532379269599915, + "learning_rate": 0.0001, + "loss": 0.1973, + "step": 1351 + }, + { + "epoch": 2.204108249103358, + "grad_norm": 0.142998605966568, + "learning_rate": 0.0001, + "loss": 0.1763, + "step": 1352 + }, + { + "epoch": 2.2057385066840562, + "grad_norm": 0.15357817709445953, + "learning_rate": 0.0001, + "loss": 0.1733, + "step": 1353 + }, + { + "epoch": 2.207368764264754, + "grad_norm": 0.133555606007576, + "learning_rate": 0.0001, + "loss": 0.177, + "step": 1354 + }, + { + "epoch": 2.2089990218454516, + "grad_norm": 0.12514221668243408, + "learning_rate": 0.0001, + "loss": 0.1706, + "step": 1355 + }, + { + "epoch": 2.2106292794261493, + "grad_norm": 0.12143199145793915, + "learning_rate": 0.0001, + "loss": 0.1728, + "step": 1356 + }, + { + "epoch": 2.212259537006847, + "grad_norm": 0.12493051588535309, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 1357 + }, + { + "epoch": 2.2138897945875446, + "grad_norm": 0.12735562026500702, + "learning_rate": 0.0001, + "loss": 0.1692, + "step": 1358 + }, + { + "epoch": 2.2155200521682428, + "grad_norm": 0.15920554101467133, + "learning_rate": 0.0001, + "loss": 0.1946, + "step": 1359 + }, + { + "epoch": 2.2171503097489405, + "grad_norm": 0.12582500278949738, + "learning_rate": 0.0001, + "loss": 0.1585, + "step": 1360 + }, + { + "epoch": 2.218780567329638, + "grad_norm": 0.16344155371189117, + "learning_rate": 0.0001, + "loss": 0.1666, + "step": 1361 + }, + { + "epoch": 2.220410824910336, + "grad_norm": 0.12184584140777588, + "learning_rate": 0.0001, + "loss": 0.1804, + "step": 1362 + }, + { + "epoch": 2.2220410824910335, + "grad_norm": 0.16029442846775055, + "learning_rate": 0.0001, + "loss": 0.1911, + "step": 1363 + }, + { + "epoch": 2.223671340071731, + "grad_norm": 0.12164244800806046, + "learning_rate": 0.0001, + "loss": 0.1907, + "step": 1364 + }, + { + "epoch": 2.225301597652429, + "grad_norm": 0.15586431324481964, + "learning_rate": 0.0001, + "loss": 0.1762, + "step": 1365 + }, + { + "epoch": 2.226931855233127, + "grad_norm": 0.1489764153957367, + "learning_rate": 0.0001, + "loss": 0.1857, + "step": 1366 + }, + { + "epoch": 2.2285621128138247, + "grad_norm": 0.153330996632576, + "learning_rate": 0.0001, + "loss": 0.1756, + "step": 1367 + }, + { + "epoch": 2.2301923703945223, + "grad_norm": 0.11781416088342667, + "learning_rate": 0.0001, + "loss": 0.1556, + "step": 1368 + }, + { + "epoch": 2.23182262797522, + "grad_norm": 0.1183483824133873, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 1369 + }, + { + "epoch": 2.2334528855559177, + "grad_norm": 0.12061773240566254, + "learning_rate": 0.0001, + "loss": 0.1717, + "step": 1370 + }, + { + "epoch": 2.2350831431366154, + "grad_norm": 0.151438370347023, + "learning_rate": 0.0001, + "loss": 0.1794, + "step": 1371 + }, + { + "epoch": 2.2367134007173135, + "grad_norm": 0.12716467678546906, + "learning_rate": 0.0001, + "loss": 0.1778, + "step": 1372 + }, + { + "epoch": 2.238343658298011, + "grad_norm": 0.13353648781776428, + "learning_rate": 0.0001, + "loss": 0.181, + "step": 1373 + }, + { + "epoch": 2.239973915878709, + "grad_norm": 0.12627844512462616, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 1374 + }, + { + "epoch": 2.2416041734594065, + "grad_norm": 0.14614202082157135, + "learning_rate": 0.0001, + "loss": 0.1751, + "step": 1375 + }, + { + "epoch": 2.2432344310401042, + "grad_norm": 0.12322308868169785, + "learning_rate": 0.0001, + "loss": 0.1688, + "step": 1376 + }, + { + "epoch": 2.244864688620802, + "grad_norm": 0.1395500749349594, + "learning_rate": 0.0001, + "loss": 0.1777, + "step": 1377 + }, + { + "epoch": 2.2464949462015, + "grad_norm": 0.12420187145471573, + "learning_rate": 0.0001, + "loss": 0.1617, + "step": 1378 + }, + { + "epoch": 2.2481252037821977, + "grad_norm": 0.1447030007839203, + "learning_rate": 0.0001, + "loss": 0.1794, + "step": 1379 + }, + { + "epoch": 2.2497554613628954, + "grad_norm": 0.1566443145275116, + "learning_rate": 0.0001, + "loss": 0.1863, + "step": 1380 + }, + { + "epoch": 2.251385718943593, + "grad_norm": 0.1401456892490387, + "learning_rate": 0.0001, + "loss": 0.1822, + "step": 1381 + }, + { + "epoch": 2.2530159765242908, + "grad_norm": 0.13101978600025177, + "learning_rate": 0.0001, + "loss": 0.1672, + "step": 1382 + }, + { + "epoch": 2.2546462341049884, + "grad_norm": 0.12039732187986374, + "learning_rate": 0.0001, + "loss": 0.1679, + "step": 1383 + }, + { + "epoch": 2.2562764916856866, + "grad_norm": 0.12393520027399063, + "learning_rate": 0.0001, + "loss": 0.1728, + "step": 1384 + }, + { + "epoch": 2.2579067492663842, + "grad_norm": 0.16083435714244843, + "learning_rate": 0.0001, + "loss": 0.1709, + "step": 1385 + }, + { + "epoch": 2.259537006847082, + "grad_norm": 0.13336148858070374, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1386 + }, + { + "epoch": 2.2611672644277796, + "grad_norm": 0.14768260717391968, + "learning_rate": 0.0001, + "loss": 0.1747, + "step": 1387 + }, + { + "epoch": 2.2627975220084773, + "grad_norm": 0.13280175626277924, + "learning_rate": 0.0001, + "loss": 0.1616, + "step": 1388 + }, + { + "epoch": 2.264427779589175, + "grad_norm": 0.12707430124282837, + "learning_rate": 0.0001, + "loss": 0.1687, + "step": 1389 + }, + { + "epoch": 2.2660580371698726, + "grad_norm": 0.14237570762634277, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 1390 + }, + { + "epoch": 2.2676882947505708, + "grad_norm": 0.1464899331331253, + "learning_rate": 0.0001, + "loss": 0.1795, + "step": 1391 + }, + { + "epoch": 2.2693185523312684, + "grad_norm": 0.15607094764709473, + "learning_rate": 0.0001, + "loss": 0.1775, + "step": 1392 + }, + { + "epoch": 2.270948809911966, + "grad_norm": 0.1322450190782547, + "learning_rate": 0.0001, + "loss": 0.1729, + "step": 1393 + }, + { + "epoch": 2.272579067492664, + "grad_norm": 0.17985309660434723, + "learning_rate": 0.0001, + "loss": 0.1649, + "step": 1394 + }, + { + "epoch": 2.2742093250733615, + "grad_norm": 0.1275913417339325, + "learning_rate": 0.0001, + "loss": 0.184, + "step": 1395 + }, + { + "epoch": 2.275839582654059, + "grad_norm": 0.1359567791223526, + "learning_rate": 0.0001, + "loss": 0.1807, + "step": 1396 + }, + { + "epoch": 2.2774698402347573, + "grad_norm": 0.13110549747943878, + "learning_rate": 0.0001, + "loss": 0.1736, + "step": 1397 + }, + { + "epoch": 2.279100097815455, + "grad_norm": 0.13743306696414948, + "learning_rate": 0.0001, + "loss": 0.1657, + "step": 1398 + }, + { + "epoch": 2.2807303553961527, + "grad_norm": 0.15267515182495117, + "learning_rate": 0.0001, + "loss": 0.1674, + "step": 1399 + }, + { + "epoch": 2.2823606129768503, + "grad_norm": 0.12119441479444504, + "learning_rate": 0.0001, + "loss": 0.1627, + "step": 1400 + }, + { + "epoch": 2.283990870557548, + "grad_norm": 0.1421733945608139, + "learning_rate": 0.0001, + "loss": 0.161, + "step": 1401 + }, + { + "epoch": 2.2856211281382457, + "grad_norm": 0.1504882127046585, + "learning_rate": 0.0001, + "loss": 0.1869, + "step": 1402 + }, + { + "epoch": 2.2872513857189434, + "grad_norm": 0.15724492073059082, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 1403 + }, + { + "epoch": 2.2888816432996415, + "grad_norm": 0.1295897364616394, + "learning_rate": 0.0001, + "loss": 0.1697, + "step": 1404 + }, + { + "epoch": 2.290511900880339, + "grad_norm": 0.13183419406414032, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1405 + }, + { + "epoch": 2.292142158461037, + "grad_norm": 0.14537960290908813, + "learning_rate": 0.0001, + "loss": 0.1912, + "step": 1406 + }, + { + "epoch": 2.2937724160417345, + "grad_norm": 0.1461685299873352, + "learning_rate": 0.0001, + "loss": 0.1815, + "step": 1407 + }, + { + "epoch": 2.295402673622432, + "grad_norm": 0.13184034824371338, + "learning_rate": 0.0001, + "loss": 0.1764, + "step": 1408 + }, + { + "epoch": 2.29703293120313, + "grad_norm": 0.15598301589488983, + "learning_rate": 0.0001, + "loss": 0.1688, + "step": 1409 + }, + { + "epoch": 2.298663188783828, + "grad_norm": 0.17276427149772644, + "learning_rate": 0.0001, + "loss": 0.1794, + "step": 1410 + }, + { + "epoch": 2.3002934463645257, + "grad_norm": 0.12554508447647095, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 1411 + }, + { + "epoch": 2.3019237039452234, + "grad_norm": 0.12981827557086945, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 1412 + }, + { + "epoch": 2.303553961525921, + "grad_norm": 0.10513079911470413, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 1413 + }, + { + "epoch": 2.3051842191066187, + "grad_norm": 0.12070135772228241, + "learning_rate": 0.0001, + "loss": 0.1652, + "step": 1414 + }, + { + "epoch": 2.3068144766873164, + "grad_norm": 0.11308653652667999, + "learning_rate": 0.0001, + "loss": 0.1609, + "step": 1415 + }, + { + "epoch": 2.308444734268014, + "grad_norm": 0.1290631741285324, + "learning_rate": 0.0001, + "loss": 0.179, + "step": 1416 + }, + { + "epoch": 2.3100749918487122, + "grad_norm": 0.15130403637886047, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 1417 + }, + { + "epoch": 2.31170524942941, + "grad_norm": 0.12521226704120636, + "learning_rate": 0.0001, + "loss": 0.1743, + "step": 1418 + }, + { + "epoch": 2.3133355070101076, + "grad_norm": 0.15238836407661438, + "learning_rate": 0.0001, + "loss": 0.1806, + "step": 1419 + }, + { + "epoch": 2.3149657645908053, + "grad_norm": 0.15979474782943726, + "learning_rate": 0.0001, + "loss": 0.1987, + "step": 1420 + }, + { + "epoch": 2.316596022171503, + "grad_norm": 0.1352485716342926, + "learning_rate": 0.0001, + "loss": 0.1868, + "step": 1421 + }, + { + "epoch": 2.318226279752201, + "grad_norm": 0.12670114636421204, + "learning_rate": 0.0001, + "loss": 0.1807, + "step": 1422 + }, + { + "epoch": 2.3198565373328988, + "grad_norm": 0.13426585495471954, + "learning_rate": 0.0001, + "loss": 0.1739, + "step": 1423 + }, + { + "epoch": 2.3214867949135964, + "grad_norm": 0.1362699568271637, + "learning_rate": 0.0001, + "loss": 0.1839, + "step": 1424 + }, + { + "epoch": 2.323117052494294, + "grad_norm": 0.12018754333257675, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 1425 + }, + { + "epoch": 2.324747310074992, + "grad_norm": 0.13709640502929688, + "learning_rate": 0.0001, + "loss": 0.1784, + "step": 1426 + }, + { + "epoch": 2.3263775676556895, + "grad_norm": 0.12608668208122253, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1427 + }, + { + "epoch": 2.328007825236387, + "grad_norm": 0.13371500372886658, + "learning_rate": 0.0001, + "loss": 0.1669, + "step": 1428 + }, + { + "epoch": 2.3296380828170853, + "grad_norm": 0.13639819622039795, + "learning_rate": 0.0001, + "loss": 0.1755, + "step": 1429 + }, + { + "epoch": 2.331268340397783, + "grad_norm": 0.13079668581485748, + "learning_rate": 0.0001, + "loss": 0.1843, + "step": 1430 + }, + { + "epoch": 2.3328985979784806, + "grad_norm": 0.11506593972444534, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 1431 + }, + { + "epoch": 2.3345288555591783, + "grad_norm": 0.15789979696273804, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 1432 + }, + { + "epoch": 2.336159113139876, + "grad_norm": 0.1199864000082016, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 1433 + }, + { + "epoch": 2.3377893707205737, + "grad_norm": 0.14094781875610352, + "learning_rate": 0.0001, + "loss": 0.184, + "step": 1434 + }, + { + "epoch": 2.339419628301272, + "grad_norm": 0.13015292584896088, + "learning_rate": 0.0001, + "loss": 0.1643, + "step": 1435 + }, + { + "epoch": 2.3410498858819695, + "grad_norm": 0.13332228362560272, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 1436 + }, + { + "epoch": 2.342680143462667, + "grad_norm": 0.11775743216276169, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 1437 + }, + { + "epoch": 2.344310401043365, + "grad_norm": 0.15774385631084442, + "learning_rate": 0.0001, + "loss": 0.188, + "step": 1438 + }, + { + "epoch": 2.3459406586240625, + "grad_norm": 0.12837542593479156, + "learning_rate": 0.0001, + "loss": 0.1666, + "step": 1439 + }, + { + "epoch": 2.34757091620476, + "grad_norm": 0.14266343414783478, + "learning_rate": 0.0001, + "loss": 0.173, + "step": 1440 + }, + { + "epoch": 2.349201173785458, + "grad_norm": 0.14660033583641052, + "learning_rate": 0.0001, + "loss": 0.1724, + "step": 1441 + }, + { + "epoch": 2.350831431366156, + "grad_norm": 0.12945188581943512, + "learning_rate": 0.0001, + "loss": 0.1567, + "step": 1442 + }, + { + "epoch": 2.3524616889468537, + "grad_norm": 0.13205265998840332, + "learning_rate": 0.0001, + "loss": 0.1634, + "step": 1443 + }, + { + "epoch": 2.3540919465275514, + "grad_norm": 0.11146970093250275, + "learning_rate": 0.0001, + "loss": 0.1698, + "step": 1444 + }, + { + "epoch": 2.355722204108249, + "grad_norm": 0.1483961045742035, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1445 + }, + { + "epoch": 2.3573524616889467, + "grad_norm": 0.1443917602300644, + "learning_rate": 0.0001, + "loss": 0.1881, + "step": 1446 + }, + { + "epoch": 2.3589827192696444, + "grad_norm": 0.11366801708936691, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 1447 + }, + { + "epoch": 2.3606129768503425, + "grad_norm": 0.12853001058101654, + "learning_rate": 0.0001, + "loss": 0.1682, + "step": 1448 + }, + { + "epoch": 2.3622432344310402, + "grad_norm": 0.11675182729959488, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 1449 + }, + { + "epoch": 2.363873492011738, + "grad_norm": 0.13047528266906738, + "learning_rate": 0.0001, + "loss": 0.1585, + "step": 1450 + }, + { + "epoch": 2.3655037495924356, + "grad_norm": 0.13428814709186554, + "learning_rate": 0.0001, + "loss": 0.1742, + "step": 1451 + }, + { + "epoch": 2.3671340071731333, + "grad_norm": 0.14489330351352692, + "learning_rate": 0.0001, + "loss": 0.1624, + "step": 1452 + }, + { + "epoch": 2.368764264753831, + "grad_norm": 0.126490980386734, + "learning_rate": 0.0001, + "loss": 0.1825, + "step": 1453 + }, + { + "epoch": 2.3703945223345286, + "grad_norm": 0.13337522745132446, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 1454 + }, + { + "epoch": 2.3720247799152268, + "grad_norm": 0.13690900802612305, + "learning_rate": 0.0001, + "loss": 0.1849, + "step": 1455 + }, + { + "epoch": 2.3736550374959244, + "grad_norm": 0.12736913561820984, + "learning_rate": 0.0001, + "loss": 0.175, + "step": 1456 + }, + { + "epoch": 2.375285295076622, + "grad_norm": 0.14143235981464386, + "learning_rate": 0.0001, + "loss": 0.181, + "step": 1457 + }, + { + "epoch": 2.37691555265732, + "grad_norm": 0.12882289290428162, + "learning_rate": 0.0001, + "loss": 0.1672, + "step": 1458 + }, + { + "epoch": 2.3785458102380175, + "grad_norm": 0.12184431403875351, + "learning_rate": 0.0001, + "loss": 0.1691, + "step": 1459 + }, + { + "epoch": 2.3801760678187156, + "grad_norm": 0.12170469015836716, + "learning_rate": 0.0001, + "loss": 0.1721, + "step": 1460 + }, + { + "epoch": 2.3818063253994133, + "grad_norm": 0.13725103437900543, + "learning_rate": 0.0001, + "loss": 0.1846, + "step": 1461 + }, + { + "epoch": 2.383436582980111, + "grad_norm": 0.137665793299675, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1462 + }, + { + "epoch": 2.3850668405608086, + "grad_norm": 0.11704001575708389, + "learning_rate": 0.0001, + "loss": 0.1695, + "step": 1463 + }, + { + "epoch": 2.3866970981415063, + "grad_norm": 0.11113730818033218, + "learning_rate": 0.0001, + "loss": 0.1621, + "step": 1464 + }, + { + "epoch": 2.388327355722204, + "grad_norm": 0.1406833976507187, + "learning_rate": 0.0001, + "loss": 0.179, + "step": 1465 + }, + { + "epoch": 2.3899576133029017, + "grad_norm": 0.12707440555095673, + "learning_rate": 0.0001, + "loss": 0.1697, + "step": 1466 + }, + { + "epoch": 2.3915878708835994, + "grad_norm": 0.14010672271251678, + "learning_rate": 0.0001, + "loss": 0.1871, + "step": 1467 + }, + { + "epoch": 2.3932181284642975, + "grad_norm": 0.12332647293806076, + "learning_rate": 0.0001, + "loss": 0.1621, + "step": 1468 + }, + { + "epoch": 2.394848386044995, + "grad_norm": 0.1202240139245987, + "learning_rate": 0.0001, + "loss": 0.1718, + "step": 1469 + }, + { + "epoch": 2.396478643625693, + "grad_norm": 0.11759907752275467, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 1470 + }, + { + "epoch": 2.3981089012063905, + "grad_norm": 0.12136384099721909, + "learning_rate": 0.0001, + "loss": 0.1681, + "step": 1471 + }, + { + "epoch": 2.399739158787088, + "grad_norm": 0.1522362232208252, + "learning_rate": 0.0001, + "loss": 0.1706, + "step": 1472 + }, + { + "epoch": 2.4013694163677863, + "grad_norm": 0.13118572533130646, + "learning_rate": 0.0001, + "loss": 0.1694, + "step": 1473 + }, + { + "epoch": 2.402999673948484, + "grad_norm": 0.11596754938364029, + "learning_rate": 0.0001, + "loss": 0.1693, + "step": 1474 + }, + { + "epoch": 2.4046299315291817, + "grad_norm": 0.13307926058769226, + "learning_rate": 0.0001, + "loss": 0.1749, + "step": 1475 + }, + { + "epoch": 2.4062601891098794, + "grad_norm": 0.13675716519355774, + "learning_rate": 0.0001, + "loss": 0.1717, + "step": 1476 + }, + { + "epoch": 2.407890446690577, + "grad_norm": 0.17052991688251495, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 1477 + }, + { + "epoch": 2.4095207042712747, + "grad_norm": 0.13308259844779968, + "learning_rate": 0.0001, + "loss": 0.165, + "step": 1478 + }, + { + "epoch": 2.4111509618519724, + "grad_norm": 0.13663795590400696, + "learning_rate": 0.0001, + "loss": 0.177, + "step": 1479 + }, + { + "epoch": 2.4127812194326705, + "grad_norm": 0.1282254457473755, + "learning_rate": 0.0001, + "loss": 0.1809, + "step": 1480 + }, + { + "epoch": 2.4144114770133682, + "grad_norm": 0.14260698854923248, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 1481 + }, + { + "epoch": 2.416041734594066, + "grad_norm": 0.12570828199386597, + "learning_rate": 0.0001, + "loss": 0.1719, + "step": 1482 + }, + { + "epoch": 2.4176719921747636, + "grad_norm": 0.13560189306735992, + "learning_rate": 0.0001, + "loss": 0.1699, + "step": 1483 + }, + { + "epoch": 2.4193022497554613, + "grad_norm": 0.15842407941818237, + "learning_rate": 0.0001, + "loss": 0.1759, + "step": 1484 + }, + { + "epoch": 2.420932507336159, + "grad_norm": 0.13507573306560516, + "learning_rate": 0.0001, + "loss": 0.1751, + "step": 1485 + }, + { + "epoch": 2.422562764916857, + "grad_norm": 0.13392452895641327, + "learning_rate": 0.0001, + "loss": 0.1855, + "step": 1486 + }, + { + "epoch": 2.4241930224975548, + "grad_norm": 0.14085431396961212, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 1487 + }, + { + "epoch": 2.4258232800782524, + "grad_norm": 0.11966408044099808, + "learning_rate": 0.0001, + "loss": 0.1696, + "step": 1488 + }, + { + "epoch": 2.42745353765895, + "grad_norm": 0.14620652794837952, + "learning_rate": 0.0001, + "loss": 0.179, + "step": 1489 + }, + { + "epoch": 2.429083795239648, + "grad_norm": 0.15509895980358124, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 1490 + }, + { + "epoch": 2.4307140528203455, + "grad_norm": 0.1289438009262085, + "learning_rate": 0.0001, + "loss": 0.1754, + "step": 1491 + }, + { + "epoch": 2.432344310401043, + "grad_norm": 0.1589927077293396, + "learning_rate": 0.0001, + "loss": 0.1778, + "step": 1492 + }, + { + "epoch": 2.4339745679817413, + "grad_norm": 0.13043618202209473, + "learning_rate": 0.0001, + "loss": 0.1866, + "step": 1493 + }, + { + "epoch": 2.435604825562439, + "grad_norm": 0.12837867438793182, + "learning_rate": 0.0001, + "loss": 0.1644, + "step": 1494 + }, + { + "epoch": 2.4372350831431366, + "grad_norm": 0.13602042198181152, + "learning_rate": 0.0001, + "loss": 0.18, + "step": 1495 + }, + { + "epoch": 2.4388653407238343, + "grad_norm": 0.12518282234668732, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1496 + }, + { + "epoch": 2.440495598304532, + "grad_norm": 0.1458752602338791, + "learning_rate": 0.0001, + "loss": 0.174, + "step": 1497 + }, + { + "epoch": 2.4421258558852297, + "grad_norm": 0.16408641636371613, + "learning_rate": 0.0001, + "loss": 0.1638, + "step": 1498 + }, + { + "epoch": 2.443756113465928, + "grad_norm": 0.1250162422657013, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 1499 + }, + { + "epoch": 2.4453863710466255, + "grad_norm": 0.14285162091255188, + "learning_rate": 0.0001, + "loss": 0.183, + "step": 1500 + }, + { + "epoch": 2.4453863710466255, + "eval_loss": 0.19733810424804688, + "eval_runtime": 2885.0678, + "eval_samples_per_second": 0.654, + "eval_steps_per_second": 0.164, + "step": 1500 + }, + { + "epoch": 2.447016628627323, + "grad_norm": 0.1123531311750412, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 1501 + }, + { + "epoch": 2.448646886208021, + "grad_norm": 0.1342422366142273, + "learning_rate": 0.0001, + "loss": 0.1652, + "step": 1502 + }, + { + "epoch": 2.4502771437887185, + "grad_norm": 0.13005055487155914, + "learning_rate": 0.0001, + "loss": 0.1513, + "step": 1503 + }, + { + "epoch": 2.451907401369416, + "grad_norm": 0.1291249841451645, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 1504 + }, + { + "epoch": 2.453537658950114, + "grad_norm": 0.15009792149066925, + "learning_rate": 0.0001, + "loss": 0.1582, + "step": 1505 + }, + { + "epoch": 2.455167916530812, + "grad_norm": 0.13734357059001923, + "learning_rate": 0.0001, + "loss": 0.1729, + "step": 1506 + }, + { + "epoch": 2.4567981741115097, + "grad_norm": 0.14626893401145935, + "learning_rate": 0.0001, + "loss": 0.1753, + "step": 1507 + }, + { + "epoch": 2.4584284316922074, + "grad_norm": 0.13441750407218933, + "learning_rate": 0.0001, + "loss": 0.168, + "step": 1508 + }, + { + "epoch": 2.460058689272905, + "grad_norm": 0.17883312702178955, + "learning_rate": 0.0001, + "loss": 0.1724, + "step": 1509 + }, + { + "epoch": 2.4616889468536027, + "grad_norm": 0.13286323845386505, + "learning_rate": 0.0001, + "loss": 0.1707, + "step": 1510 + }, + { + "epoch": 2.463319204434301, + "grad_norm": 0.11390995234251022, + "learning_rate": 0.0001, + "loss": 0.152, + "step": 1511 + }, + { + "epoch": 2.4649494620149985, + "grad_norm": 0.13794218003749847, + "learning_rate": 0.0001, + "loss": 0.1693, + "step": 1512 + }, + { + "epoch": 2.466579719595696, + "grad_norm": 0.13029181957244873, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 1513 + }, + { + "epoch": 2.468209977176394, + "grad_norm": 0.1590529978275299, + "learning_rate": 0.0001, + "loss": 0.1656, + "step": 1514 + }, + { + "epoch": 2.4698402347570916, + "grad_norm": 0.129343181848526, + "learning_rate": 0.0001, + "loss": 0.1645, + "step": 1515 + }, + { + "epoch": 2.4714704923377893, + "grad_norm": 0.13485710322856903, + "learning_rate": 0.0001, + "loss": 0.178, + "step": 1516 + }, + { + "epoch": 2.473100749918487, + "grad_norm": 0.13252151012420654, + "learning_rate": 0.0001, + "loss": 0.1807, + "step": 1517 + }, + { + "epoch": 2.474731007499185, + "grad_norm": 0.14138251543045044, + "learning_rate": 0.0001, + "loss": 0.1807, + "step": 1518 + }, + { + "epoch": 2.4763612650798827, + "grad_norm": 0.12066026031970978, + "learning_rate": 0.0001, + "loss": 0.1579, + "step": 1519 + }, + { + "epoch": 2.4779915226605804, + "grad_norm": 0.128167986869812, + "learning_rate": 0.0001, + "loss": 0.1765, + "step": 1520 + }, + { + "epoch": 2.479621780241278, + "grad_norm": 0.11725836247205734, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 1521 + }, + { + "epoch": 2.481252037821976, + "grad_norm": 0.12235096096992493, + "learning_rate": 0.0001, + "loss": 0.1752, + "step": 1522 + }, + { + "epoch": 2.4828822954026735, + "grad_norm": 0.13664376735687256, + "learning_rate": 0.0001, + "loss": 0.1722, + "step": 1523 + }, + { + "epoch": 2.4845125529833716, + "grad_norm": 0.12066584080457687, + "learning_rate": 0.0001, + "loss": 0.1805, + "step": 1524 + }, + { + "epoch": 2.4861428105640693, + "grad_norm": 0.15553556382656097, + "learning_rate": 0.0001, + "loss": 0.1763, + "step": 1525 + }, + { + "epoch": 2.487773068144767, + "grad_norm": 0.11850383877754211, + "learning_rate": 0.0001, + "loss": 0.1758, + "step": 1526 + }, + { + "epoch": 2.4894033257254646, + "grad_norm": 0.1156458929181099, + "learning_rate": 0.0001, + "loss": 0.1723, + "step": 1527 + }, + { + "epoch": 2.4910335833061623, + "grad_norm": 0.14494523406028748, + "learning_rate": 0.0001, + "loss": 0.1721, + "step": 1528 + }, + { + "epoch": 2.49266384088686, + "grad_norm": 0.19197629392147064, + "learning_rate": 0.0001, + "loss": 0.1844, + "step": 1529 + }, + { + "epoch": 2.4942940984675577, + "grad_norm": 0.13929398357868195, + "learning_rate": 0.0001, + "loss": 0.1605, + "step": 1530 + }, + { + "epoch": 2.495924356048256, + "grad_norm": 0.14377978444099426, + "learning_rate": 0.0001, + "loss": 0.1757, + "step": 1531 + }, + { + "epoch": 2.4975546136289535, + "grad_norm": 0.1260727494955063, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 1532 + }, + { + "epoch": 2.499184871209651, + "grad_norm": 0.13934855163097382, + "learning_rate": 0.0001, + "loss": 0.1906, + "step": 1533 + }, + { + "epoch": 2.500815128790349, + "grad_norm": 0.1342552751302719, + "learning_rate": 0.0001, + "loss": 0.1642, + "step": 1534 + }, + { + "epoch": 2.5024453863710465, + "grad_norm": 0.14849533140659332, + "learning_rate": 0.0001, + "loss": 0.1774, + "step": 1535 + }, + { + "epoch": 2.5040756439517446, + "grad_norm": 0.12595170736312866, + "learning_rate": 0.0001, + "loss": 0.1757, + "step": 1536 + }, + { + "epoch": 2.5057059015324423, + "grad_norm": 0.15015050768852234, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 1537 + }, + { + "epoch": 2.50733615911314, + "grad_norm": 0.1296335756778717, + "learning_rate": 0.0001, + "loss": 0.1725, + "step": 1538 + }, + { + "epoch": 2.5089664166938377, + "grad_norm": 0.12165997922420502, + "learning_rate": 0.0001, + "loss": 0.1647, + "step": 1539 + }, + { + "epoch": 2.5105966742745354, + "grad_norm": 0.14188778400421143, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1540 + }, + { + "epoch": 2.512226931855233, + "grad_norm": 0.12897147238254547, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 1541 + }, + { + "epoch": 2.5138571894359307, + "grad_norm": 0.10982251167297363, + "learning_rate": 0.0001, + "loss": 0.1734, + "step": 1542 + }, + { + "epoch": 2.5154874470166284, + "grad_norm": 0.11350741237401962, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 1543 + }, + { + "epoch": 2.5171177045973265, + "grad_norm": 0.13737499713897705, + "learning_rate": 0.0001, + "loss": 0.1791, + "step": 1544 + }, + { + "epoch": 2.518747962178024, + "grad_norm": 0.17301884293556213, + "learning_rate": 0.0001, + "loss": 0.17, + "step": 1545 + }, + { + "epoch": 2.520378219758722, + "grad_norm": 0.12698844075202942, + "learning_rate": 0.0001, + "loss": 0.1803, + "step": 1546 + }, + { + "epoch": 2.5220084773394196, + "grad_norm": 0.1362551897764206, + "learning_rate": 0.0001, + "loss": 0.1666, + "step": 1547 + }, + { + "epoch": 2.5236387349201173, + "grad_norm": 0.13497772812843323, + "learning_rate": 0.0001, + "loss": 0.1757, + "step": 1548 + }, + { + "epoch": 2.5252689925008154, + "grad_norm": 0.13246221840381622, + "learning_rate": 0.0001, + "loss": 0.1626, + "step": 1549 + }, + { + "epoch": 2.526899250081513, + "grad_norm": 0.11976587027311325, + "learning_rate": 0.0001, + "loss": 0.1704, + "step": 1550 + }, + { + "epoch": 2.5285295076622107, + "grad_norm": 0.17471329867839813, + "learning_rate": 0.0001, + "loss": 0.1743, + "step": 1551 + }, + { + "epoch": 2.5301597652429084, + "grad_norm": 0.1331670731306076, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 1552 + }, + { + "epoch": 2.531790022823606, + "grad_norm": 0.12313182651996613, + "learning_rate": 0.0001, + "loss": 0.1508, + "step": 1553 + }, + { + "epoch": 2.533420280404304, + "grad_norm": 0.14300191402435303, + "learning_rate": 0.0001, + "loss": 0.1825, + "step": 1554 + }, + { + "epoch": 2.5350505379850015, + "grad_norm": 0.14361177384853363, + "learning_rate": 0.0001, + "loss": 0.1751, + "step": 1555 + }, + { + "epoch": 2.536680795565699, + "grad_norm": 0.1338524967432022, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 1556 + }, + { + "epoch": 2.5383110531463973, + "grad_norm": 0.12020072340965271, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 1557 + }, + { + "epoch": 2.539941310727095, + "grad_norm": 0.1332116425037384, + "learning_rate": 0.0001, + "loss": 0.1644, + "step": 1558 + }, + { + "epoch": 2.5415715683077926, + "grad_norm": 0.1357092410326004, + "learning_rate": 0.0001, + "loss": 0.1667, + "step": 1559 + }, + { + "epoch": 2.5432018258884903, + "grad_norm": 0.12332719564437866, + "learning_rate": 0.0001, + "loss": 0.178, + "step": 1560 + }, + { + "epoch": 2.544832083469188, + "grad_norm": 0.11925667524337769, + "learning_rate": 0.0001, + "loss": 0.1736, + "step": 1561 + }, + { + "epoch": 2.546462341049886, + "grad_norm": 0.14202888309955597, + "learning_rate": 0.0001, + "loss": 0.1795, + "step": 1562 + }, + { + "epoch": 2.548092598630584, + "grad_norm": 0.18294650316238403, + "learning_rate": 0.0001, + "loss": 0.1859, + "step": 1563 + }, + { + "epoch": 2.5497228562112815, + "grad_norm": 0.14184100925922394, + "learning_rate": 0.0001, + "loss": 0.1628, + "step": 1564 + }, + { + "epoch": 2.551353113791979, + "grad_norm": 0.1371891051530838, + "learning_rate": 0.0001, + "loss": 0.18, + "step": 1565 + }, + { + "epoch": 2.552983371372677, + "grad_norm": 0.20214851200580597, + "learning_rate": 0.0001, + "loss": 0.1813, + "step": 1566 + }, + { + "epoch": 2.5546136289533745, + "grad_norm": 0.12881435453891754, + "learning_rate": 0.0001, + "loss": 0.1657, + "step": 1567 + }, + { + "epoch": 2.556243886534072, + "grad_norm": 0.1161297932267189, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 1568 + }, + { + "epoch": 2.55787414411477, + "grad_norm": 0.13545578718185425, + "learning_rate": 0.0001, + "loss": 0.17, + "step": 1569 + }, + { + "epoch": 2.559504401695468, + "grad_norm": 0.13674761354923248, + "learning_rate": 0.0001, + "loss": 0.175, + "step": 1570 + }, + { + "epoch": 2.5611346592761657, + "grad_norm": 0.12067017704248428, + "learning_rate": 0.0001, + "loss": 0.1545, + "step": 1571 + }, + { + "epoch": 2.5627649168568634, + "grad_norm": 0.13512328267097473, + "learning_rate": 0.0001, + "loss": 0.1706, + "step": 1572 + }, + { + "epoch": 2.564395174437561, + "grad_norm": 0.14814037084579468, + "learning_rate": 0.0001, + "loss": 0.1824, + "step": 1573 + }, + { + "epoch": 2.5660254320182587, + "grad_norm": 0.13570277392864227, + "learning_rate": 0.0001, + "loss": 0.18, + "step": 1574 + }, + { + "epoch": 2.567655689598957, + "grad_norm": 0.14428043365478516, + "learning_rate": 0.0001, + "loss": 0.176, + "step": 1575 + }, + { + "epoch": 2.5692859471796545, + "grad_norm": 0.11816335469484329, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 1576 + }, + { + "epoch": 2.570916204760352, + "grad_norm": 0.12405506521463394, + "learning_rate": 0.0001, + "loss": 0.1704, + "step": 1577 + }, + { + "epoch": 2.57254646234105, + "grad_norm": 0.12209967523813248, + "learning_rate": 0.0001, + "loss": 0.1683, + "step": 1578 + }, + { + "epoch": 2.5741767199217476, + "grad_norm": 0.12708422541618347, + "learning_rate": 0.0001, + "loss": 0.1846, + "step": 1579 + }, + { + "epoch": 2.5758069775024452, + "grad_norm": 0.13518854975700378, + "learning_rate": 0.0001, + "loss": 0.1692, + "step": 1580 + }, + { + "epoch": 2.577437235083143, + "grad_norm": 0.1394702047109604, + "learning_rate": 0.0001, + "loss": 0.1793, + "step": 1581 + }, + { + "epoch": 2.5790674926638406, + "grad_norm": 0.1345827430486679, + "learning_rate": 0.0001, + "loss": 0.1578, + "step": 1582 + }, + { + "epoch": 2.5806977502445387, + "grad_norm": 0.1303935945034027, + "learning_rate": 0.0001, + "loss": 0.1754, + "step": 1583 + }, + { + "epoch": 2.5823280078252364, + "grad_norm": 0.1360052078962326, + "learning_rate": 0.0001, + "loss": 0.1825, + "step": 1584 + }, + { + "epoch": 2.583958265405934, + "grad_norm": 0.1259981095790863, + "learning_rate": 0.0001, + "loss": 0.1651, + "step": 1585 + }, + { + "epoch": 2.5855885229866318, + "grad_norm": 0.13449037075042725, + "learning_rate": 0.0001, + "loss": 0.1901, + "step": 1586 + }, + { + "epoch": 2.58721878056733, + "grad_norm": 0.14254209399223328, + "learning_rate": 0.0001, + "loss": 0.187, + "step": 1587 + }, + { + "epoch": 2.5888490381480276, + "grad_norm": 0.11680305004119873, + "learning_rate": 0.0001, + "loss": 0.1669, + "step": 1588 + }, + { + "epoch": 2.5904792957287253, + "grad_norm": 0.12670142948627472, + "learning_rate": 0.0001, + "loss": 0.1659, + "step": 1589 + }, + { + "epoch": 2.592109553309423, + "grad_norm": 0.11500969529151917, + "learning_rate": 0.0001, + "loss": 0.1523, + "step": 1590 + }, + { + "epoch": 2.5937398108901206, + "grad_norm": 0.15863120555877686, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1591 + }, + { + "epoch": 2.5953700684708183, + "grad_norm": 0.12249530106782913, + "learning_rate": 0.0001, + "loss": 0.1722, + "step": 1592 + }, + { + "epoch": 2.597000326051516, + "grad_norm": 0.1352062076330185, + "learning_rate": 0.0001, + "loss": 0.1787, + "step": 1593 + }, + { + "epoch": 2.5986305836322137, + "grad_norm": 0.14207401871681213, + "learning_rate": 0.0001, + "loss": 0.1954, + "step": 1594 + }, + { + "epoch": 2.600260841212912, + "grad_norm": 0.12589991092681885, + "learning_rate": 0.0001, + "loss": 0.161, + "step": 1595 + }, + { + "epoch": 2.6018910987936095, + "grad_norm": 0.15374112129211426, + "learning_rate": 0.0001, + "loss": 0.1631, + "step": 1596 + }, + { + "epoch": 2.603521356374307, + "grad_norm": 0.17806057631969452, + "learning_rate": 0.0001, + "loss": 0.1749, + "step": 1597 + }, + { + "epoch": 2.605151613955005, + "grad_norm": 0.13044816255569458, + "learning_rate": 0.0001, + "loss": 0.1602, + "step": 1598 + }, + { + "epoch": 2.6067818715357025, + "grad_norm": 0.13261814415454865, + "learning_rate": 0.0001, + "loss": 0.175, + "step": 1599 + }, + { + "epoch": 2.6084121291164006, + "grad_norm": 0.16431686282157898, + "learning_rate": 0.0001, + "loss": 0.1711, + "step": 1600 + }, + { + "epoch": 2.6100423866970983, + "grad_norm": 0.1591222733259201, + "learning_rate": 0.0001, + "loss": 0.1782, + "step": 1601 + }, + { + "epoch": 2.611672644277796, + "grad_norm": 0.15045471489429474, + "learning_rate": 0.0001, + "loss": 0.1576, + "step": 1602 + }, + { + "epoch": 2.6133029018584937, + "grad_norm": 0.14002764225006104, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1603 + }, + { + "epoch": 2.6149331594391914, + "grad_norm": 0.13808737695217133, + "learning_rate": 0.0001, + "loss": 0.1754, + "step": 1604 + }, + { + "epoch": 2.616563417019889, + "grad_norm": 0.14910684525966644, + "learning_rate": 0.0001, + "loss": 0.1834, + "step": 1605 + }, + { + "epoch": 2.6181936746005867, + "grad_norm": 0.1458890289068222, + "learning_rate": 0.0001, + "loss": 0.173, + "step": 1606 + }, + { + "epoch": 2.6198239321812844, + "grad_norm": 0.13817395269870758, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 1607 + }, + { + "epoch": 2.6214541897619825, + "grad_norm": 0.22925616800785065, + "learning_rate": 0.0001, + "loss": 0.1693, + "step": 1608 + }, + { + "epoch": 2.62308444734268, + "grad_norm": 0.13849849998950958, + "learning_rate": 0.0001, + "loss": 0.1734, + "step": 1609 + }, + { + "epoch": 2.624714704923378, + "grad_norm": 0.12482542544603348, + "learning_rate": 0.0001, + "loss": 0.1864, + "step": 1610 + }, + { + "epoch": 2.6263449625040756, + "grad_norm": 0.12124403566122055, + "learning_rate": 0.0001, + "loss": 0.1803, + "step": 1611 + }, + { + "epoch": 2.6279752200847732, + "grad_norm": 0.12327743321657181, + "learning_rate": 0.0001, + "loss": 0.1581, + "step": 1612 + }, + { + "epoch": 2.6296054776654714, + "grad_norm": 0.17278893291950226, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 1613 + }, + { + "epoch": 2.631235735246169, + "grad_norm": 0.14170318841934204, + "learning_rate": 0.0001, + "loss": 0.1793, + "step": 1614 + }, + { + "epoch": 2.6328659928268667, + "grad_norm": 0.1279682219028473, + "learning_rate": 0.0001, + "loss": 0.1657, + "step": 1615 + }, + { + "epoch": 2.6344962504075644, + "grad_norm": 0.1350080519914627, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 1616 + }, + { + "epoch": 2.636126507988262, + "grad_norm": 0.1315418928861618, + "learning_rate": 0.0001, + "loss": 0.1732, + "step": 1617 + }, + { + "epoch": 2.6377567655689598, + "grad_norm": 0.13384385406970978, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1618 + }, + { + "epoch": 2.6393870231496575, + "grad_norm": 0.4788396954536438, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1619 + }, + { + "epoch": 2.641017280730355, + "grad_norm": 0.15386568009853363, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 1620 + }, + { + "epoch": 2.6426475383110533, + "grad_norm": 0.12126284837722778, + "learning_rate": 0.0001, + "loss": 0.1645, + "step": 1621 + }, + { + "epoch": 2.644277795891751, + "grad_norm": 0.11877623945474625, + "learning_rate": 0.0001, + "loss": 0.1656, + "step": 1622 + }, + { + "epoch": 2.6459080534724486, + "grad_norm": 0.13270951807498932, + "learning_rate": 0.0001, + "loss": 0.1861, + "step": 1623 + }, + { + "epoch": 2.6475383110531463, + "grad_norm": 0.13268467783927917, + "learning_rate": 0.0001, + "loss": 0.1803, + "step": 1624 + }, + { + "epoch": 2.6491685686338444, + "grad_norm": 0.11460261046886444, + "learning_rate": 0.0001, + "loss": 0.1849, + "step": 1625 + }, + { + "epoch": 2.650798826214542, + "grad_norm": 0.1381942331790924, + "learning_rate": 0.0001, + "loss": 0.1726, + "step": 1626 + }, + { + "epoch": 2.65242908379524, + "grad_norm": 0.11194758862257004, + "learning_rate": 0.0001, + "loss": 0.174, + "step": 1627 + }, + { + "epoch": 2.6540593413759375, + "grad_norm": 0.13582143187522888, + "learning_rate": 0.0001, + "loss": 0.188, + "step": 1628 + }, + { + "epoch": 2.655689598956635, + "grad_norm": 0.13221515715122223, + "learning_rate": 0.0001, + "loss": 0.1784, + "step": 1629 + }, + { + "epoch": 2.657319856537333, + "grad_norm": 0.13104896247386932, + "learning_rate": 0.0001, + "loss": 0.1644, + "step": 1630 + }, + { + "epoch": 2.6589501141180305, + "grad_norm": 0.15535131096839905, + "learning_rate": 0.0001, + "loss": 0.1723, + "step": 1631 + }, + { + "epoch": 2.660580371698728, + "grad_norm": 0.1257065385580063, + "learning_rate": 0.0001, + "loss": 0.1437, + "step": 1632 + }, + { + "epoch": 2.6622106292794263, + "grad_norm": 0.12850813567638397, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 1633 + }, + { + "epoch": 2.663840886860124, + "grad_norm": 0.12646383047103882, + "learning_rate": 0.0001, + "loss": 0.1771, + "step": 1634 + }, + { + "epoch": 2.6654711444408217, + "grad_norm": 0.2852911651134491, + "learning_rate": 0.0001, + "loss": 0.1851, + "step": 1635 + }, + { + "epoch": 2.6671014020215194, + "grad_norm": 0.11714810878038406, + "learning_rate": 0.0001, + "loss": 0.1549, + "step": 1636 + }, + { + "epoch": 2.668731659602217, + "grad_norm": 0.14019465446472168, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 1637 + }, + { + "epoch": 2.670361917182915, + "grad_norm": 0.12677006423473358, + "learning_rate": 0.0001, + "loss": 0.1657, + "step": 1638 + }, + { + "epoch": 2.671992174763613, + "grad_norm": 0.12617303431034088, + "learning_rate": 0.0001, + "loss": 0.1729, + "step": 1639 + }, + { + "epoch": 2.6736224323443105, + "grad_norm": 0.1309848427772522, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 1640 + }, + { + "epoch": 2.675252689925008, + "grad_norm": 0.1385383903980255, + "learning_rate": 0.0001, + "loss": 0.1794, + "step": 1641 + }, + { + "epoch": 2.676882947505706, + "grad_norm": 0.1279730498790741, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 1642 + }, + { + "epoch": 2.6785132050864036, + "grad_norm": 0.12799672782421112, + "learning_rate": 0.0001, + "loss": 0.1628, + "step": 1643 + }, + { + "epoch": 2.6801434626671012, + "grad_norm": 0.16915291547775269, + "learning_rate": 0.0001, + "loss": 0.1707, + "step": 1644 + }, + { + "epoch": 2.681773720247799, + "grad_norm": 0.12088014930486679, + "learning_rate": 0.0001, + "loss": 0.1671, + "step": 1645 + }, + { + "epoch": 2.683403977828497, + "grad_norm": 0.13347816467285156, + "learning_rate": 0.0001, + "loss": 0.1715, + "step": 1646 + }, + { + "epoch": 2.6850342354091947, + "grad_norm": 0.12426477670669556, + "learning_rate": 0.0001, + "loss": 0.1917, + "step": 1647 + }, + { + "epoch": 2.6866644929898924, + "grad_norm": 0.16994018852710724, + "learning_rate": 0.0001, + "loss": 0.1929, + "step": 1648 + }, + { + "epoch": 2.68829475057059, + "grad_norm": 0.1453407108783722, + "learning_rate": 0.0001, + "loss": 0.1711, + "step": 1649 + }, + { + "epoch": 2.6899250081512878, + "grad_norm": 0.15362900495529175, + "learning_rate": 0.0001, + "loss": 0.1829, + "step": 1650 + }, + { + "epoch": 2.691555265731986, + "grad_norm": 0.15922309458255768, + "learning_rate": 0.0001, + "loss": 0.1655, + "step": 1651 + }, + { + "epoch": 2.6931855233126836, + "grad_norm": 0.153437077999115, + "learning_rate": 0.0001, + "loss": 0.1776, + "step": 1652 + }, + { + "epoch": 2.6948157808933813, + "grad_norm": 0.15213799476623535, + "learning_rate": 0.0001, + "loss": 0.1835, + "step": 1653 + }, + { + "epoch": 2.696446038474079, + "grad_norm": 0.10505233705043793, + "learning_rate": 0.0001, + "loss": 0.1501, + "step": 1654 + }, + { + "epoch": 2.6980762960547766, + "grad_norm": 0.13109369575977325, + "learning_rate": 0.0001, + "loss": 0.1601, + "step": 1655 + }, + { + "epoch": 2.6997065536354743, + "grad_norm": 0.1327008157968521, + "learning_rate": 0.0001, + "loss": 0.1781, + "step": 1656 + }, + { + "epoch": 2.701336811216172, + "grad_norm": 0.12708407640457153, + "learning_rate": 0.0001, + "loss": 0.1599, + "step": 1657 + }, + { + "epoch": 2.7029670687968697, + "grad_norm": 0.12450224906206131, + "learning_rate": 0.0001, + "loss": 0.1711, + "step": 1658 + }, + { + "epoch": 2.704597326377568, + "grad_norm": 0.13624098896980286, + "learning_rate": 0.0001, + "loss": 0.1636, + "step": 1659 + }, + { + "epoch": 2.7062275839582655, + "grad_norm": 0.1352638155221939, + "learning_rate": 0.0001, + "loss": 0.1732, + "step": 1660 + }, + { + "epoch": 2.707857841538963, + "grad_norm": 0.1240679994225502, + "learning_rate": 0.0001, + "loss": 0.1715, + "step": 1661 + }, + { + "epoch": 2.709488099119661, + "grad_norm": 0.131019726395607, + "learning_rate": 0.0001, + "loss": 0.1743, + "step": 1662 + }, + { + "epoch": 2.7111183567003585, + "grad_norm": 0.11371717602014542, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 1663 + }, + { + "epoch": 2.7127486142810566, + "grad_norm": 0.1314827799797058, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 1664 + }, + { + "epoch": 2.7143788718617543, + "grad_norm": 0.11662380397319794, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 1665 + }, + { + "epoch": 2.716009129442452, + "grad_norm": 0.18013271689414978, + "learning_rate": 0.0001, + "loss": 0.1539, + "step": 1666 + }, + { + "epoch": 2.7176393870231497, + "grad_norm": 0.1361352503299713, + "learning_rate": 0.0001, + "loss": 0.1683, + "step": 1667 + }, + { + "epoch": 2.7192696446038473, + "grad_norm": 0.12468544393777847, + "learning_rate": 0.0001, + "loss": 0.1651, + "step": 1668 + }, + { + "epoch": 2.720899902184545, + "grad_norm": 0.14304213225841522, + "learning_rate": 0.0001, + "loss": 0.1766, + "step": 1669 + }, + { + "epoch": 2.7225301597652427, + "grad_norm": 0.13735635578632355, + "learning_rate": 0.0001, + "loss": 0.1828, + "step": 1670 + }, + { + "epoch": 2.7241604173459404, + "grad_norm": 0.1301272064447403, + "learning_rate": 0.0001, + "loss": 0.1694, + "step": 1671 + }, + { + "epoch": 2.7257906749266385, + "grad_norm": 0.13478688895702362, + "learning_rate": 0.0001, + "loss": 0.1809, + "step": 1672 + }, + { + "epoch": 2.727420932507336, + "grad_norm": 0.1462695151567459, + "learning_rate": 0.0001, + "loss": 0.1605, + "step": 1673 + }, + { + "epoch": 2.729051190088034, + "grad_norm": 0.13804949820041656, + "learning_rate": 0.0001, + "loss": 0.1672, + "step": 1674 + }, + { + "epoch": 2.7306814476687316, + "grad_norm": 0.12185340374708176, + "learning_rate": 0.0001, + "loss": 0.1665, + "step": 1675 + }, + { + "epoch": 2.7323117052494297, + "grad_norm": 0.13339075446128845, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 1676 + }, + { + "epoch": 2.7339419628301274, + "grad_norm": 0.12658098340034485, + "learning_rate": 0.0001, + "loss": 0.1549, + "step": 1677 + }, + { + "epoch": 2.735572220410825, + "grad_norm": 0.12869013845920563, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 1678 + }, + { + "epoch": 2.7372024779915227, + "grad_norm": 0.12980249524116516, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 1679 + }, + { + "epoch": 2.7388327355722204, + "grad_norm": 0.14288075268268585, + "learning_rate": 0.0001, + "loss": 0.1708, + "step": 1680 + }, + { + "epoch": 2.740462993152918, + "grad_norm": 0.15478843450546265, + "learning_rate": 0.0001, + "loss": 0.1803, + "step": 1681 + }, + { + "epoch": 2.7420932507336158, + "grad_norm": 0.13585714995861053, + "learning_rate": 0.0001, + "loss": 0.185, + "step": 1682 + }, + { + "epoch": 2.7437235083143134, + "grad_norm": 0.1448957920074463, + "learning_rate": 0.0001, + "loss": 0.1716, + "step": 1683 + }, + { + "epoch": 2.7453537658950116, + "grad_norm": 0.14890708029270172, + "learning_rate": 0.0001, + "loss": 0.181, + "step": 1684 + }, + { + "epoch": 2.7469840234757092, + "grad_norm": 0.13350339233875275, + "learning_rate": 0.0001, + "loss": 0.1757, + "step": 1685 + }, + { + "epoch": 2.748614281056407, + "grad_norm": 0.1049543023109436, + "learning_rate": 0.0001, + "loss": 0.1668, + "step": 1686 + }, + { + "epoch": 2.7502445386371046, + "grad_norm": 3.6135470867156982, + "learning_rate": 0.0001, + "loss": 0.2121, + "step": 1687 + }, + { + "epoch": 2.7518747962178023, + "grad_norm": 0.1557306945323944, + "learning_rate": 0.0001, + "loss": 0.1742, + "step": 1688 + }, + { + "epoch": 2.7535050537985004, + "grad_norm": 0.1619507372379303, + "learning_rate": 0.0001, + "loss": 0.1586, + "step": 1689 + }, + { + "epoch": 2.755135311379198, + "grad_norm": 0.12189088016748428, + "learning_rate": 0.0001, + "loss": 0.1789, + "step": 1690 + }, + { + "epoch": 2.7567655689598958, + "grad_norm": 0.15941926836967468, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1691 + }, + { + "epoch": 2.7583958265405935, + "grad_norm": 0.11842834204435349, + "learning_rate": 0.0001, + "loss": 0.1592, + "step": 1692 + }, + { + "epoch": 2.760026084121291, + "grad_norm": 0.1222245842218399, + "learning_rate": 0.0001, + "loss": 0.1593, + "step": 1693 + }, + { + "epoch": 2.761656341701989, + "grad_norm": 0.1369628608226776, + "learning_rate": 0.0001, + "loss": 0.155, + "step": 1694 + }, + { + "epoch": 2.7632865992826865, + "grad_norm": 0.14212529361248016, + "learning_rate": 0.0001, + "loss": 0.1681, + "step": 1695 + }, + { + "epoch": 2.764916856863384, + "grad_norm": 0.14328838884830475, + "learning_rate": 0.0001, + "loss": 0.1844, + "step": 1696 + }, + { + "epoch": 2.7665471144440823, + "grad_norm": 0.14077410101890564, + "learning_rate": 0.0001, + "loss": 0.1856, + "step": 1697 + }, + { + "epoch": 2.76817737202478, + "grad_norm": 0.13928896188735962, + "learning_rate": 0.0001, + "loss": 0.1718, + "step": 1698 + }, + { + "epoch": 2.7698076296054777, + "grad_norm": 0.13314402103424072, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1699 + }, + { + "epoch": 2.7714378871861753, + "grad_norm": 0.1279967725276947, + "learning_rate": 0.0001, + "loss": 0.1758, + "step": 1700 + }, + { + "epoch": 2.773068144766873, + "grad_norm": 0.11493542045354843, + "learning_rate": 0.0001, + "loss": 0.1539, + "step": 1701 + }, + { + "epoch": 2.774698402347571, + "grad_norm": 0.12252707779407501, + "learning_rate": 0.0001, + "loss": 0.1752, + "step": 1702 + }, + { + "epoch": 2.776328659928269, + "grad_norm": 0.19633330404758453, + "learning_rate": 0.0001, + "loss": 0.1582, + "step": 1703 + }, + { + "epoch": 2.7779589175089665, + "grad_norm": 0.1423460990190506, + "learning_rate": 0.0001, + "loss": 0.1668, + "step": 1704 + }, + { + "epoch": 2.779589175089664, + "grad_norm": 0.13523077964782715, + "learning_rate": 0.0001, + "loss": 0.1843, + "step": 1705 + }, + { + "epoch": 2.781219432670362, + "grad_norm": 0.13212746381759644, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 1706 + }, + { + "epoch": 2.7828496902510595, + "grad_norm": 0.1263880431652069, + "learning_rate": 0.0001, + "loss": 0.1579, + "step": 1707 + }, + { + "epoch": 2.7844799478317572, + "grad_norm": 0.12073154002428055, + "learning_rate": 0.0001, + "loss": 0.1569, + "step": 1708 + }, + { + "epoch": 2.786110205412455, + "grad_norm": 0.14019356667995453, + "learning_rate": 0.0001, + "loss": 0.1809, + "step": 1709 + }, + { + "epoch": 2.787740462993153, + "grad_norm": 0.11406267434358597, + "learning_rate": 0.0001, + "loss": 0.1814, + "step": 1710 + }, + { + "epoch": 2.7893707205738507, + "grad_norm": 0.11895663291215897, + "learning_rate": 0.0001, + "loss": 0.1783, + "step": 1711 + }, + { + "epoch": 2.7910009781545484, + "grad_norm": 0.14709694683551788, + "learning_rate": 0.0001, + "loss": 0.1686, + "step": 1712 + }, + { + "epoch": 2.792631235735246, + "grad_norm": 0.13388898968696594, + "learning_rate": 0.0001, + "loss": 0.1654, + "step": 1713 + }, + { + "epoch": 2.7942614933159438, + "grad_norm": 0.1226249486207962, + "learning_rate": 0.0001, + "loss": 0.1739, + "step": 1714 + }, + { + "epoch": 2.795891750896642, + "grad_norm": 0.12996350228786469, + "learning_rate": 0.0001, + "loss": 0.1699, + "step": 1715 + }, + { + "epoch": 2.7975220084773396, + "grad_norm": 0.13227185606956482, + "learning_rate": 0.0001, + "loss": 0.1774, + "step": 1716 + }, + { + "epoch": 2.7991522660580372, + "grad_norm": 0.12332453578710556, + "learning_rate": 0.0001, + "loss": 0.1602, + "step": 1717 + }, + { + "epoch": 2.800782523638735, + "grad_norm": 0.1187395229935646, + "learning_rate": 0.0001, + "loss": 0.1578, + "step": 1718 + }, + { + "epoch": 2.8024127812194326, + "grad_norm": 0.13751424849033356, + "learning_rate": 0.0001, + "loss": 0.1734, + "step": 1719 + }, + { + "epoch": 2.8040430388001303, + "grad_norm": 0.13629640638828278, + "learning_rate": 0.0001, + "loss": 0.173, + "step": 1720 + }, + { + "epoch": 2.805673296380828, + "grad_norm": 0.11772475391626358, + "learning_rate": 0.0001, + "loss": 0.1677, + "step": 1721 + }, + { + "epoch": 2.8073035539615256, + "grad_norm": 0.13224674761295319, + "learning_rate": 0.0001, + "loss": 0.1757, + "step": 1722 + }, + { + "epoch": 2.8089338115422238, + "grad_norm": 0.1200365424156189, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1723 + }, + { + "epoch": 2.8105640691229214, + "grad_norm": 0.14300167560577393, + "learning_rate": 0.0001, + "loss": 0.1748, + "step": 1724 + }, + { + "epoch": 2.812194326703619, + "grad_norm": 0.10958956182003021, + "learning_rate": 0.0001, + "loss": 0.1633, + "step": 1725 + }, + { + "epoch": 2.813824584284317, + "grad_norm": 0.13673417270183563, + "learning_rate": 0.0001, + "loss": 0.1618, + "step": 1726 + }, + { + "epoch": 2.815454841865015, + "grad_norm": 0.1306380033493042, + "learning_rate": 0.0001, + "loss": 0.183, + "step": 1727 + }, + { + "epoch": 2.8170850994457126, + "grad_norm": 0.12033358216285706, + "learning_rate": 0.0001, + "loss": 0.1616, + "step": 1728 + }, + { + "epoch": 2.8187153570264103, + "grad_norm": 0.14117878675460815, + "learning_rate": 0.0001, + "loss": 0.1715, + "step": 1729 + }, + { + "epoch": 2.820345614607108, + "grad_norm": 0.13309049606323242, + "learning_rate": 0.0001, + "loss": 0.1773, + "step": 1730 + }, + { + "epoch": 2.8219758721878057, + "grad_norm": 0.12040352821350098, + "learning_rate": 0.0001, + "loss": 0.1656, + "step": 1731 + }, + { + "epoch": 2.8236061297685033, + "grad_norm": 0.12827961146831512, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 1732 + }, + { + "epoch": 2.825236387349201, + "grad_norm": 0.1280909925699234, + "learning_rate": 0.0001, + "loss": 0.1806, + "step": 1733 + }, + { + "epoch": 2.8268666449298987, + "grad_norm": 0.12497354298830032, + "learning_rate": 0.0001, + "loss": 0.1788, + "step": 1734 + }, + { + "epoch": 2.828496902510597, + "grad_norm": 0.1294706016778946, + "learning_rate": 0.0001, + "loss": 0.1743, + "step": 1735 + }, + { + "epoch": 2.8301271600912945, + "grad_norm": 0.14673174917697906, + "learning_rate": 0.0001, + "loss": 0.168, + "step": 1736 + }, + { + "epoch": 2.831757417671992, + "grad_norm": 0.14964553713798523, + "learning_rate": 0.0001, + "loss": 0.1751, + "step": 1737 + }, + { + "epoch": 2.83338767525269, + "grad_norm": 0.17080165445804596, + "learning_rate": 0.0001, + "loss": 0.1651, + "step": 1738 + }, + { + "epoch": 2.8350179328333875, + "grad_norm": 0.12748171389102936, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 1739 + }, + { + "epoch": 2.8366481904140857, + "grad_norm": 0.1372041255235672, + "learning_rate": 0.0001, + "loss": 0.1816, + "step": 1740 + }, + { + "epoch": 2.8382784479947833, + "grad_norm": 0.13018721342086792, + "learning_rate": 0.0001, + "loss": 0.1758, + "step": 1741 + }, + { + "epoch": 2.839908705575481, + "grad_norm": 0.13896982371807098, + "learning_rate": 0.0001, + "loss": 0.1713, + "step": 1742 + }, + { + "epoch": 2.8415389631561787, + "grad_norm": 0.13519124686717987, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 1743 + }, + { + "epoch": 2.8431692207368764, + "grad_norm": 0.12529662251472473, + "learning_rate": 0.0001, + "loss": 0.1657, + "step": 1744 + }, + { + "epoch": 2.844799478317574, + "grad_norm": 0.1385057270526886, + "learning_rate": 0.0001, + "loss": 0.1742, + "step": 1745 + }, + { + "epoch": 2.8464297358982718, + "grad_norm": 0.13610967993736267, + "learning_rate": 0.0001, + "loss": 0.1729, + "step": 1746 + }, + { + "epoch": 2.8480599934789694, + "grad_norm": 0.1311800330877304, + "learning_rate": 0.0001, + "loss": 0.1829, + "step": 1747 + }, + { + "epoch": 2.8496902510596676, + "grad_norm": 0.13023614883422852, + "learning_rate": 0.0001, + "loss": 0.1679, + "step": 1748 + }, + { + "epoch": 2.8513205086403652, + "grad_norm": 0.12245821207761765, + "learning_rate": 0.0001, + "loss": 0.1699, + "step": 1749 + }, + { + "epoch": 2.852950766221063, + "grad_norm": 0.1297629326581955, + "learning_rate": 0.0001, + "loss": 0.172, + "step": 1750 + }, + { + "epoch": 2.8545810238017606, + "grad_norm": 0.12192010134458542, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1751 + }, + { + "epoch": 2.8562112813824583, + "grad_norm": 0.12483784556388855, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 1752 + }, + { + "epoch": 2.8578415389631564, + "grad_norm": 0.1716589629650116, + "learning_rate": 0.0001, + "loss": 0.1795, + "step": 1753 + }, + { + "epoch": 2.859471796543854, + "grad_norm": 0.1266416311264038, + "learning_rate": 0.0001, + "loss": 0.1572, + "step": 1754 + }, + { + "epoch": 2.8611020541245518, + "grad_norm": 0.15566131472587585, + "learning_rate": 0.0001, + "loss": 0.1812, + "step": 1755 + }, + { + "epoch": 2.8627323117052494, + "grad_norm": 0.1435212343931198, + "learning_rate": 0.0001, + "loss": 0.1682, + "step": 1756 + }, + { + "epoch": 2.864362569285947, + "grad_norm": 0.13166531920433044, + "learning_rate": 0.0001, + "loss": 0.17, + "step": 1757 + }, + { + "epoch": 2.865992826866645, + "grad_norm": 0.1274840533733368, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 1758 + }, + { + "epoch": 2.8676230844473425, + "grad_norm": 0.1092241182923317, + "learning_rate": 0.0001, + "loss": 0.1655, + "step": 1759 + }, + { + "epoch": 2.86925334202804, + "grad_norm": 0.13460928201675415, + "learning_rate": 0.0001, + "loss": 0.1627, + "step": 1760 + }, + { + "epoch": 2.8708835996087383, + "grad_norm": 0.12913478910923004, + "learning_rate": 0.0001, + "loss": 0.1618, + "step": 1761 + }, + { + "epoch": 2.872513857189436, + "grad_norm": 0.11661716550588608, + "learning_rate": 0.0001, + "loss": 0.169, + "step": 1762 + }, + { + "epoch": 2.8741441147701337, + "grad_norm": 0.11179400235414505, + "learning_rate": 0.0001, + "loss": 0.1725, + "step": 1763 + }, + { + "epoch": 2.8757743723508313, + "grad_norm": 0.1252732276916504, + "learning_rate": 0.0001, + "loss": 0.1827, + "step": 1764 + }, + { + "epoch": 2.8774046299315295, + "grad_norm": 0.3473842144012451, + "learning_rate": 0.0001, + "loss": 0.1693, + "step": 1765 + }, + { + "epoch": 2.879034887512227, + "grad_norm": 0.1262052208185196, + "learning_rate": 0.0001, + "loss": 0.1728, + "step": 1766 + }, + { + "epoch": 2.880665145092925, + "grad_norm": 0.1326300948858261, + "learning_rate": 0.0001, + "loss": 0.1744, + "step": 1767 + }, + { + "epoch": 2.8822954026736225, + "grad_norm": 0.1303003877401352, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 1768 + }, + { + "epoch": 2.88392566025432, + "grad_norm": 0.12167539447546005, + "learning_rate": 0.0001, + "loss": 0.1677, + "step": 1769 + }, + { + "epoch": 2.885555917835018, + "grad_norm": 0.13617132604122162, + "learning_rate": 0.0001, + "loss": 0.1646, + "step": 1770 + }, + { + "epoch": 2.8871861754157155, + "grad_norm": 0.13883374631404877, + "learning_rate": 0.0001, + "loss": 0.173, + "step": 1771 + }, + { + "epoch": 2.888816432996413, + "grad_norm": 0.12531040608882904, + "learning_rate": 0.0001, + "loss": 0.1678, + "step": 1772 + }, + { + "epoch": 2.8904466905771113, + "grad_norm": 0.13892248272895813, + "learning_rate": 0.0001, + "loss": 0.1677, + "step": 1773 + }, + { + "epoch": 2.892076948157809, + "grad_norm": 0.14446908235549927, + "learning_rate": 0.0001, + "loss": 0.1666, + "step": 1774 + }, + { + "epoch": 2.8937072057385067, + "grad_norm": 0.14126956462860107, + "learning_rate": 0.0001, + "loss": 0.1626, + "step": 1775 + }, + { + "epoch": 2.8953374633192044, + "grad_norm": 0.1542399823665619, + "learning_rate": 0.0001, + "loss": 0.1741, + "step": 1776 + }, + { + "epoch": 2.896967720899902, + "grad_norm": 0.14131003618240356, + "learning_rate": 0.0001, + "loss": 0.174, + "step": 1777 + }, + { + "epoch": 2.8985979784806, + "grad_norm": 0.15592537820339203, + "learning_rate": 0.0001, + "loss": 0.1847, + "step": 1778 + }, + { + "epoch": 2.900228236061298, + "grad_norm": 0.14722147583961487, + "learning_rate": 0.0001, + "loss": 0.1804, + "step": 1779 + }, + { + "epoch": 2.9018584936419956, + "grad_norm": 0.13757076859474182, + "learning_rate": 0.0001, + "loss": 0.1856, + "step": 1780 + }, + { + "epoch": 2.9034887512226932, + "grad_norm": 0.2536503076553345, + "learning_rate": 0.0001, + "loss": 0.1513, + "step": 1781 + }, + { + "epoch": 2.905119008803391, + "grad_norm": 0.11684080958366394, + "learning_rate": 0.0001, + "loss": 0.1723, + "step": 1782 + }, + { + "epoch": 2.9067492663840886, + "grad_norm": 0.1301044374704361, + "learning_rate": 0.0001, + "loss": 0.1756, + "step": 1783 + }, + { + "epoch": 2.9083795239647863, + "grad_norm": 0.11158990114927292, + "learning_rate": 0.0001, + "loss": 0.1627, + "step": 1784 + }, + { + "epoch": 2.910009781545484, + "grad_norm": 0.14631851017475128, + "learning_rate": 0.0001, + "loss": 0.1763, + "step": 1785 + }, + { + "epoch": 2.911640039126182, + "grad_norm": 0.12340549379587173, + "learning_rate": 0.0001, + "loss": 0.1826, + "step": 1786 + }, + { + "epoch": 2.9132702967068798, + "grad_norm": 0.15119358897209167, + "learning_rate": 0.0001, + "loss": 0.1517, + "step": 1787 + }, + { + "epoch": 2.9149005542875774, + "grad_norm": 0.10862945020198822, + "learning_rate": 0.0001, + "loss": 0.1636, + "step": 1788 + }, + { + "epoch": 2.916530811868275, + "grad_norm": 0.12316179275512695, + "learning_rate": 0.0001, + "loss": 0.1726, + "step": 1789 + }, + { + "epoch": 2.918161069448973, + "grad_norm": 0.14677046239376068, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1790 + }, + { + "epoch": 2.919791327029671, + "grad_norm": 0.1339065581560135, + "learning_rate": 0.0001, + "loss": 0.1771, + "step": 1791 + }, + { + "epoch": 2.9214215846103686, + "grad_norm": 0.1291988492012024, + "learning_rate": 0.0001, + "loss": 0.1741, + "step": 1792 + }, + { + "epoch": 2.9230518421910663, + "grad_norm": 0.13565215468406677, + "learning_rate": 0.0001, + "loss": 0.1692, + "step": 1793 + }, + { + "epoch": 2.924682099771764, + "grad_norm": 0.13081243634223938, + "learning_rate": 0.0001, + "loss": 0.1622, + "step": 1794 + }, + { + "epoch": 2.9263123573524616, + "grad_norm": 0.12821637094020844, + "learning_rate": 0.0001, + "loss": 0.1592, + "step": 1795 + }, + { + "epoch": 2.9279426149331593, + "grad_norm": 0.17648915946483612, + "learning_rate": 0.0001, + "loss": 0.1875, + "step": 1796 + }, + { + "epoch": 2.929572872513857, + "grad_norm": 0.11781003326177597, + "learning_rate": 0.0001, + "loss": 0.1591, + "step": 1797 + }, + { + "epoch": 2.9312031300945547, + "grad_norm": 0.13282893598079681, + "learning_rate": 0.0001, + "loss": 0.1734, + "step": 1798 + }, + { + "epoch": 2.932833387675253, + "grad_norm": 0.14549997448921204, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1799 + }, + { + "epoch": 2.9344636452559505, + "grad_norm": 0.12919341027736664, + "learning_rate": 0.0001, + "loss": 0.1682, + "step": 1800 + }, + { + "epoch": 2.936093902836648, + "grad_norm": 0.12719695270061493, + "learning_rate": 0.0001, + "loss": 0.1824, + "step": 1801 + }, + { + "epoch": 2.937724160417346, + "grad_norm": 0.14096450805664062, + "learning_rate": 0.0001, + "loss": 0.1609, + "step": 1802 + }, + { + "epoch": 2.9393544179980435, + "grad_norm": 0.11929788440465927, + "learning_rate": 0.0001, + "loss": 0.1602, + "step": 1803 + }, + { + "epoch": 2.9409846755787417, + "grad_norm": 0.16668562591075897, + "learning_rate": 0.0001, + "loss": 0.1767, + "step": 1804 + }, + { + "epoch": 2.9426149331594393, + "grad_norm": 0.14096197485923767, + "learning_rate": 0.0001, + "loss": 0.1696, + "step": 1805 + }, + { + "epoch": 2.944245190740137, + "grad_norm": 0.1540384292602539, + "learning_rate": 0.0001, + "loss": 0.1849, + "step": 1806 + }, + { + "epoch": 2.9458754483208347, + "grad_norm": 0.11548353731632233, + "learning_rate": 0.0001, + "loss": 0.1676, + "step": 1807 + }, + { + "epoch": 2.9475057059015324, + "grad_norm": 0.12311840057373047, + "learning_rate": 0.0001, + "loss": 0.1664, + "step": 1808 + }, + { + "epoch": 2.94913596348223, + "grad_norm": 0.1358504742383957, + "learning_rate": 0.0001, + "loss": 0.1752, + "step": 1809 + }, + { + "epoch": 2.9507662210629277, + "grad_norm": 0.12379997223615646, + "learning_rate": 0.0001, + "loss": 0.1656, + "step": 1810 + }, + { + "epoch": 2.9523964786436254, + "grad_norm": 0.13933683931827545, + "learning_rate": 0.0001, + "loss": 0.1797, + "step": 1811 + }, + { + "epoch": 2.9540267362243235, + "grad_norm": 0.1396905779838562, + "learning_rate": 0.0001, + "loss": 0.1761, + "step": 1812 + }, + { + "epoch": 2.9556569938050212, + "grad_norm": 0.12376896291971207, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 1813 + }, + { + "epoch": 2.957287251385719, + "grad_norm": 0.12583288550376892, + "learning_rate": 0.0001, + "loss": 0.1684, + "step": 1814 + }, + { + "epoch": 2.9589175089664166, + "grad_norm": 0.13755297660827637, + "learning_rate": 0.0001, + "loss": 0.144, + "step": 1815 + }, + { + "epoch": 2.9605477665471147, + "grad_norm": 0.24326035380363464, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 1816 + }, + { + "epoch": 2.9621780241278124, + "grad_norm": 0.14443641901016235, + "learning_rate": 0.0001, + "loss": 0.1885, + "step": 1817 + }, + { + "epoch": 2.96380828170851, + "grad_norm": 0.12700864672660828, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 1818 + }, + { + "epoch": 2.9654385392892078, + "grad_norm": 0.148148775100708, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 1819 + }, + { + "epoch": 2.9670687968699054, + "grad_norm": 0.14224189519882202, + "learning_rate": 0.0001, + "loss": 0.1692, + "step": 1820 + }, + { + "epoch": 2.968699054450603, + "grad_norm": 0.15919244289398193, + "learning_rate": 0.0001, + "loss": 0.174, + "step": 1821 + }, + { + "epoch": 2.970329312031301, + "grad_norm": 0.2009061574935913, + "learning_rate": 0.0001, + "loss": 0.1737, + "step": 1822 + }, + { + "epoch": 2.9719595696119985, + "grad_norm": 0.12979143857955933, + "learning_rate": 0.0001, + "loss": 0.1668, + "step": 1823 + }, + { + "epoch": 2.9735898271926966, + "grad_norm": 0.12879891693592072, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 1824 + }, + { + "epoch": 2.9752200847733943, + "grad_norm": 0.11508017033338547, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 1825 + }, + { + "epoch": 2.976850342354092, + "grad_norm": 0.12390004843473434, + "learning_rate": 0.0001, + "loss": 0.1697, + "step": 1826 + }, + { + "epoch": 2.9784805999347896, + "grad_norm": 0.13983500003814697, + "learning_rate": 0.0001, + "loss": 0.1744, + "step": 1827 + }, + { + "epoch": 2.9801108575154873, + "grad_norm": 0.12721027433872223, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 1828 + }, + { + "epoch": 2.9817411150961854, + "grad_norm": 0.12273556739091873, + "learning_rate": 0.0001, + "loss": 0.1664, + "step": 1829 + }, + { + "epoch": 2.983371372676883, + "grad_norm": 0.12730666995048523, + "learning_rate": 0.0001, + "loss": 0.1716, + "step": 1830 + }, + { + "epoch": 2.985001630257581, + "grad_norm": 0.12331203371286392, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 1831 + }, + { + "epoch": 2.9866318878382785, + "grad_norm": 0.1258230060338974, + "learning_rate": 0.0001, + "loss": 0.1778, + "step": 1832 + }, + { + "epoch": 2.988262145418976, + "grad_norm": 0.14592202007770538, + "learning_rate": 0.0001, + "loss": 0.1888, + "step": 1833 + }, + { + "epoch": 2.989892402999674, + "grad_norm": 0.12880602478981018, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 1834 + }, + { + "epoch": 2.9915226605803715, + "grad_norm": 0.14615651965141296, + "learning_rate": 0.0001, + "loss": 0.1853, + "step": 1835 + }, + { + "epoch": 2.993152918161069, + "grad_norm": 0.12081672251224518, + "learning_rate": 0.0001, + "loss": 0.1837, + "step": 1836 + }, + { + "epoch": 2.9947831757417673, + "grad_norm": 0.11534418165683746, + "learning_rate": 0.0001, + "loss": 0.1773, + "step": 1837 + }, + { + "epoch": 2.996413433322465, + "grad_norm": 0.12042343616485596, + "learning_rate": 0.0001, + "loss": 0.1594, + "step": 1838 + }, + { + "epoch": 2.9980436909031627, + "grad_norm": 0.13987913727760315, + "learning_rate": 0.0001, + "loss": 0.1785, + "step": 1839 + }, + { + "epoch": 2.9996739484838604, + "grad_norm": 0.12604045867919922, + "learning_rate": 0.0001, + "loss": 0.1708, + "step": 1840 + }, + { + "epoch": 3.001304206064558, + "grad_norm": 0.13217991590499878, + "learning_rate": 0.0001, + "loss": 0.1621, + "step": 1841 + }, + { + "epoch": 3.0029344636452557, + "grad_norm": 0.1318402737379074, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 1842 + }, + { + "epoch": 3.004564721225954, + "grad_norm": 0.12177060544490814, + "learning_rate": 0.0001, + "loss": 0.162, + "step": 1843 + }, + { + "epoch": 3.0061949788066515, + "grad_norm": 0.11984521895647049, + "learning_rate": 0.0001, + "loss": 0.1441, + "step": 1844 + }, + { + "epoch": 3.007825236387349, + "grad_norm": 0.13043449819087982, + "learning_rate": 0.0001, + "loss": 0.1419, + "step": 1845 + }, + { + "epoch": 3.009455493968047, + "grad_norm": 0.12092608213424683, + "learning_rate": 0.0001, + "loss": 0.1503, + "step": 1846 + }, + { + "epoch": 3.0110857515487446, + "grad_norm": 0.15196233987808228, + "learning_rate": 0.0001, + "loss": 0.161, + "step": 1847 + }, + { + "epoch": 3.0127160091294423, + "grad_norm": 0.14589709043502808, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 1848 + }, + { + "epoch": 3.0143462667101404, + "grad_norm": 0.1587577909231186, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 1849 + }, + { + "epoch": 3.015976524290838, + "grad_norm": 0.14549551904201508, + "learning_rate": 0.0001, + "loss": 0.1573, + "step": 1850 + }, + { + "epoch": 3.0176067818715357, + "grad_norm": 0.1457890272140503, + "learning_rate": 0.0001, + "loss": 0.1552, + "step": 1851 + }, + { + "epoch": 3.0192370394522334, + "grad_norm": 0.1330796182155609, + "learning_rate": 0.0001, + "loss": 0.1522, + "step": 1852 + }, + { + "epoch": 3.020867297032931, + "grad_norm": 0.16735856235027313, + "learning_rate": 0.0001, + "loss": 0.1618, + "step": 1853 + }, + { + "epoch": 3.022497554613629, + "grad_norm": 0.13368208706378937, + "learning_rate": 0.0001, + "loss": 0.1522, + "step": 1854 + }, + { + "epoch": 3.024127812194327, + "grad_norm": 0.14208734035491943, + "learning_rate": 0.0001, + "loss": 0.1537, + "step": 1855 + }, + { + "epoch": 3.0257580697750246, + "grad_norm": 0.17149995267391205, + "learning_rate": 0.0001, + "loss": 0.1688, + "step": 1856 + }, + { + "epoch": 3.0273883273557223, + "grad_norm": 0.16402693092823029, + "learning_rate": 0.0001, + "loss": 0.153, + "step": 1857 + }, + { + "epoch": 3.02901858493642, + "grad_norm": 0.14791880548000336, + "learning_rate": 0.0001, + "loss": 0.1415, + "step": 1858 + }, + { + "epoch": 3.0306488425171176, + "grad_norm": 0.13278983533382416, + "learning_rate": 0.0001, + "loss": 0.1624, + "step": 1859 + }, + { + "epoch": 3.0322791000978153, + "grad_norm": 0.1486644297838211, + "learning_rate": 0.0001, + "loss": 0.1489, + "step": 1860 + }, + { + "epoch": 3.033909357678513, + "grad_norm": 0.15381312370300293, + "learning_rate": 0.0001, + "loss": 0.1546, + "step": 1861 + }, + { + "epoch": 3.035539615259211, + "grad_norm": 0.1434403508901596, + "learning_rate": 0.0001, + "loss": 0.1595, + "step": 1862 + }, + { + "epoch": 3.037169872839909, + "grad_norm": 0.14210708439350128, + "learning_rate": 0.0001, + "loss": 0.1533, + "step": 1863 + }, + { + "epoch": 3.0388001304206065, + "grad_norm": 0.1316472291946411, + "learning_rate": 0.0001, + "loss": 0.1406, + "step": 1864 + }, + { + "epoch": 3.040430388001304, + "grad_norm": 0.17375947535037994, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1865 + }, + { + "epoch": 3.042060645582002, + "grad_norm": 0.13698440790176392, + "learning_rate": 0.0001, + "loss": 0.1458, + "step": 1866 + }, + { + "epoch": 3.0436909031626995, + "grad_norm": 0.14209020137786865, + "learning_rate": 0.0001, + "loss": 0.1467, + "step": 1867 + }, + { + "epoch": 3.0453211607433976, + "grad_norm": 0.14422006905078888, + "learning_rate": 0.0001, + "loss": 0.1502, + "step": 1868 + }, + { + "epoch": 3.0469514183240953, + "grad_norm": 0.13612805306911469, + "learning_rate": 0.0001, + "loss": 0.1519, + "step": 1869 + }, + { + "epoch": 3.048581675904793, + "grad_norm": 0.16070745885372162, + "learning_rate": 0.0001, + "loss": 0.1606, + "step": 1870 + }, + { + "epoch": 3.0502119334854907, + "grad_norm": 0.1366911679506302, + "learning_rate": 0.0001, + "loss": 0.1532, + "step": 1871 + }, + { + "epoch": 3.0518421910661884, + "grad_norm": 0.1536417007446289, + "learning_rate": 0.0001, + "loss": 0.1624, + "step": 1872 + }, + { + "epoch": 3.053472448646886, + "grad_norm": 0.14232809841632843, + "learning_rate": 0.0001, + "loss": 0.161, + "step": 1873 + }, + { + "epoch": 3.055102706227584, + "grad_norm": 0.13991610705852509, + "learning_rate": 0.0001, + "loss": 0.1451, + "step": 1874 + }, + { + "epoch": 3.056732963808282, + "grad_norm": 0.13531944155693054, + "learning_rate": 0.0001, + "loss": 0.1551, + "step": 1875 + }, + { + "epoch": 3.0583632213889795, + "grad_norm": 0.1320919394493103, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 1876 + }, + { + "epoch": 3.059993478969677, + "grad_norm": 0.1661989539861679, + "learning_rate": 0.0001, + "loss": 0.168, + "step": 1877 + }, + { + "epoch": 3.061623736550375, + "grad_norm": 0.14109432697296143, + "learning_rate": 0.0001, + "loss": 0.1594, + "step": 1878 + }, + { + "epoch": 3.0632539941310726, + "grad_norm": 0.12099778652191162, + "learning_rate": 0.0001, + "loss": 0.1361, + "step": 1879 + }, + { + "epoch": 3.0648842517117703, + "grad_norm": 0.15160542726516724, + "learning_rate": 0.0001, + "loss": 0.1442, + "step": 1880 + }, + { + "epoch": 3.0665145092924684, + "grad_norm": 0.15511579811573029, + "learning_rate": 0.0001, + "loss": 0.1517, + "step": 1881 + }, + { + "epoch": 3.068144766873166, + "grad_norm": 0.15297721326351166, + "learning_rate": 0.0001, + "loss": 0.1484, + "step": 1882 + }, + { + "epoch": 3.0697750244538637, + "grad_norm": 0.15919755399227142, + "learning_rate": 0.0001, + "loss": 0.1538, + "step": 1883 + }, + { + "epoch": 3.0714052820345614, + "grad_norm": 0.16629868745803833, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 1884 + }, + { + "epoch": 3.073035539615259, + "grad_norm": 0.15279151499271393, + "learning_rate": 0.0001, + "loss": 0.167, + "step": 1885 + }, + { + "epoch": 3.074665797195957, + "grad_norm": 0.13714101910591125, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 1886 + }, + { + "epoch": 3.076296054776655, + "grad_norm": 0.13693886995315552, + "learning_rate": 0.0001, + "loss": 0.1528, + "step": 1887 + }, + { + "epoch": 3.0779263123573526, + "grad_norm": 0.1455615609884262, + "learning_rate": 0.0001, + "loss": 0.1606, + "step": 1888 + }, + { + "epoch": 3.0795565699380503, + "grad_norm": 0.12297336757183075, + "learning_rate": 0.0001, + "loss": 0.1513, + "step": 1889 + }, + { + "epoch": 3.081186827518748, + "grad_norm": 0.13729313015937805, + "learning_rate": 0.0001, + "loss": 0.1594, + "step": 1890 + }, + { + "epoch": 3.0828170850994456, + "grad_norm": 0.15476392209529877, + "learning_rate": 0.0001, + "loss": 0.1646, + "step": 1891 + }, + { + "epoch": 3.0844473426801433, + "grad_norm": 0.13134579360485077, + "learning_rate": 0.0001, + "loss": 0.145, + "step": 1892 + }, + { + "epoch": 3.0860776002608414, + "grad_norm": 0.12954765558242798, + "learning_rate": 0.0001, + "loss": 0.1463, + "step": 1893 + }, + { + "epoch": 3.087707857841539, + "grad_norm": 0.13961279392242432, + "learning_rate": 0.0001, + "loss": 0.1502, + "step": 1894 + }, + { + "epoch": 3.089338115422237, + "grad_norm": 0.1431739330291748, + "learning_rate": 0.0001, + "loss": 0.1542, + "step": 1895 + }, + { + "epoch": 3.0909683730029345, + "grad_norm": 0.13088122010231018, + "learning_rate": 0.0001, + "loss": 0.1418, + "step": 1896 + }, + { + "epoch": 3.092598630583632, + "grad_norm": 0.14733296632766724, + "learning_rate": 0.0001, + "loss": 0.1506, + "step": 1897 + }, + { + "epoch": 3.09422888816433, + "grad_norm": 0.14778561890125275, + "learning_rate": 0.0001, + "loss": 0.1492, + "step": 1898 + }, + { + "epoch": 3.0958591457450275, + "grad_norm": 0.17354664206504822, + "learning_rate": 0.0001, + "loss": 0.1725, + "step": 1899 + }, + { + "epoch": 3.0974894033257256, + "grad_norm": 0.17014777660369873, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 1900 + }, + { + "epoch": 3.0991196609064233, + "grad_norm": 0.1588321030139923, + "learning_rate": 0.0001, + "loss": 0.1618, + "step": 1901 + }, + { + "epoch": 3.100749918487121, + "grad_norm": 0.1591597944498062, + "learning_rate": 0.0001, + "loss": 0.1574, + "step": 1902 + }, + { + "epoch": 3.1023801760678187, + "grad_norm": 0.1335812509059906, + "learning_rate": 0.0001, + "loss": 0.1508, + "step": 1903 + }, + { + "epoch": 3.1040104336485164, + "grad_norm": 0.13163426518440247, + "learning_rate": 0.0001, + "loss": 0.134, + "step": 1904 + }, + { + "epoch": 3.105640691229214, + "grad_norm": 0.17262452840805054, + "learning_rate": 0.0001, + "loss": 0.1662, + "step": 1905 + }, + { + "epoch": 3.107270948809912, + "grad_norm": 0.1346106082201004, + "learning_rate": 0.0001, + "loss": 0.1519, + "step": 1906 + }, + { + "epoch": 3.10890120639061, + "grad_norm": 0.15132804214954376, + "learning_rate": 0.0001, + "loss": 0.1567, + "step": 1907 + }, + { + "epoch": 3.1105314639713075, + "grad_norm": 0.14656689763069153, + "learning_rate": 0.0001, + "loss": 0.174, + "step": 1908 + }, + { + "epoch": 3.112161721552005, + "grad_norm": 0.12101754546165466, + "learning_rate": 0.0001, + "loss": 0.1313, + "step": 1909 + }, + { + "epoch": 3.113791979132703, + "grad_norm": 0.13873204588890076, + "learning_rate": 0.0001, + "loss": 0.1685, + "step": 1910 + }, + { + "epoch": 3.1154222367134006, + "grad_norm": 0.14690999686717987, + "learning_rate": 0.0001, + "loss": 0.1522, + "step": 1911 + }, + { + "epoch": 3.1170524942940983, + "grad_norm": 0.14875850081443787, + "learning_rate": 0.0001, + "loss": 0.1669, + "step": 1912 + }, + { + "epoch": 3.1186827518747964, + "grad_norm": 0.12188220769166946, + "learning_rate": 0.0001, + "loss": 0.1423, + "step": 1913 + }, + { + "epoch": 3.120313009455494, + "grad_norm": 0.14481259882450104, + "learning_rate": 0.0001, + "loss": 0.1567, + "step": 1914 + }, + { + "epoch": 3.1219432670361917, + "grad_norm": 0.1382434368133545, + "learning_rate": 0.0001, + "loss": 0.1495, + "step": 1915 + }, + { + "epoch": 3.1235735246168894, + "grad_norm": 0.14566384255886078, + "learning_rate": 0.0001, + "loss": 0.1476, + "step": 1916 + }, + { + "epoch": 3.125203782197587, + "grad_norm": 0.14900517463684082, + "learning_rate": 0.0001, + "loss": 0.1642, + "step": 1917 + }, + { + "epoch": 3.126834039778285, + "grad_norm": 0.1237114742398262, + "learning_rate": 0.0001, + "loss": 0.1387, + "step": 1918 + }, + { + "epoch": 3.128464297358983, + "grad_norm": 0.13117025792598724, + "learning_rate": 0.0001, + "loss": 0.1515, + "step": 1919 + }, + { + "epoch": 3.1300945549396806, + "grad_norm": 0.13853514194488525, + "learning_rate": 0.0001, + "loss": 0.1496, + "step": 1920 + }, + { + "epoch": 3.1317248125203783, + "grad_norm": 0.17825017869472504, + "learning_rate": 0.0001, + "loss": 0.1578, + "step": 1921 + }, + { + "epoch": 3.133355070101076, + "grad_norm": 0.1505771279335022, + "learning_rate": 0.0001, + "loss": 0.1535, + "step": 1922 + }, + { + "epoch": 3.1349853276817736, + "grad_norm": 0.1433846801519394, + "learning_rate": 0.0001, + "loss": 0.1572, + "step": 1923 + }, + { + "epoch": 3.1366155852624713, + "grad_norm": 0.1596456617116928, + "learning_rate": 0.0001, + "loss": 0.1604, + "step": 1924 + }, + { + "epoch": 3.138245842843169, + "grad_norm": 0.14370723068714142, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 1925 + }, + { + "epoch": 3.139876100423867, + "grad_norm": 0.16248880326747894, + "learning_rate": 0.0001, + "loss": 0.171, + "step": 1926 + }, + { + "epoch": 3.141506358004565, + "grad_norm": 0.1393871307373047, + "learning_rate": 0.0001, + "loss": 0.1605, + "step": 1927 + }, + { + "epoch": 3.1431366155852625, + "grad_norm": 0.12134724855422974, + "learning_rate": 0.0001, + "loss": 0.1594, + "step": 1928 + }, + { + "epoch": 3.14476687316596, + "grad_norm": 0.16671213507652283, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 1929 + }, + { + "epoch": 3.146397130746658, + "grad_norm": 0.7389642596244812, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 1930 + }, + { + "epoch": 3.148027388327356, + "grad_norm": 0.1277475357055664, + "learning_rate": 0.0001, + "loss": 0.1467, + "step": 1931 + }, + { + "epoch": 3.1496576459080536, + "grad_norm": 0.14921234548091888, + "learning_rate": 0.0001, + "loss": 0.1521, + "step": 1932 + }, + { + "epoch": 3.1512879034887513, + "grad_norm": 0.14741483330726624, + "learning_rate": 0.0001, + "loss": 0.158, + "step": 1933 + }, + { + "epoch": 3.152918161069449, + "grad_norm": 0.14746753871440887, + "learning_rate": 0.0001, + "loss": 0.1479, + "step": 1934 + }, + { + "epoch": 3.1545484186501467, + "grad_norm": 0.13620176911354065, + "learning_rate": 0.0001, + "loss": 0.1499, + "step": 1935 + }, + { + "epoch": 3.1561786762308444, + "grad_norm": 1.9854735136032104, + "learning_rate": 0.0001, + "loss": 0.1762, + "step": 1936 + }, + { + "epoch": 3.157808933811542, + "grad_norm": 0.1485620141029358, + "learning_rate": 0.0001, + "loss": 0.1523, + "step": 1937 + }, + { + "epoch": 3.15943919139224, + "grad_norm": 0.1689678281545639, + "learning_rate": 0.0001, + "loss": 0.1563, + "step": 1938 + }, + { + "epoch": 3.161069448972938, + "grad_norm": 0.14907783269882202, + "learning_rate": 0.0001, + "loss": 0.1649, + "step": 1939 + }, + { + "epoch": 3.1626997065536355, + "grad_norm": 0.15304021537303925, + "learning_rate": 0.0001, + "loss": 0.1689, + "step": 1940 + }, + { + "epoch": 3.164329964134333, + "grad_norm": 0.16715513169765472, + "learning_rate": 0.0001, + "loss": 0.1601, + "step": 1941 + }, + { + "epoch": 3.165960221715031, + "grad_norm": 0.15324808657169342, + "learning_rate": 0.0001, + "loss": 0.1612, + "step": 1942 + }, + { + "epoch": 3.1675904792957286, + "grad_norm": 0.14634795486927032, + "learning_rate": 0.0001, + "loss": 0.1568, + "step": 1943 + }, + { + "epoch": 3.1692207368764267, + "grad_norm": 0.15744540095329285, + "learning_rate": 0.0001, + "loss": 0.1552, + "step": 1944 + }, + { + "epoch": 3.1708509944571244, + "grad_norm": 0.15544916689395905, + "learning_rate": 0.0001, + "loss": 0.16, + "step": 1945 + }, + { + "epoch": 3.172481252037822, + "grad_norm": 0.14175128936767578, + "learning_rate": 0.0001, + "loss": 0.1639, + "step": 1946 + }, + { + "epoch": 3.1741115096185197, + "grad_norm": 0.14051491022109985, + "learning_rate": 0.0001, + "loss": 0.1379, + "step": 1947 + }, + { + "epoch": 3.1757417671992174, + "grad_norm": 0.14642100036144257, + "learning_rate": 0.0001, + "loss": 0.1535, + "step": 1948 + }, + { + "epoch": 3.177372024779915, + "grad_norm": 0.1421380490064621, + "learning_rate": 0.0001, + "loss": 0.1599, + "step": 1949 + }, + { + "epoch": 3.1790022823606128, + "grad_norm": 0.15691342949867249, + "learning_rate": 0.0001, + "loss": 0.1589, + "step": 1950 + }, + { + "epoch": 3.180632539941311, + "grad_norm": 0.14704185724258423, + "learning_rate": 0.0001, + "loss": 0.1508, + "step": 1951 + }, + { + "epoch": 3.1822627975220086, + "grad_norm": 0.15139469504356384, + "learning_rate": 0.0001, + "loss": 0.151, + "step": 1952 + }, + { + "epoch": 3.1838930551027063, + "grad_norm": 0.15881513059139252, + "learning_rate": 0.0001, + "loss": 0.1604, + "step": 1953 + }, + { + "epoch": 3.185523312683404, + "grad_norm": 0.16441747546195984, + "learning_rate": 0.0001, + "loss": 0.1677, + "step": 1954 + }, + { + "epoch": 3.1871535702641016, + "grad_norm": 0.17274102568626404, + "learning_rate": 0.0001, + "loss": 0.1674, + "step": 1955 + }, + { + "epoch": 3.1887838278447993, + "grad_norm": 0.13821010291576385, + "learning_rate": 0.0001, + "loss": 0.1478, + "step": 1956 + }, + { + "epoch": 3.1904140854254974, + "grad_norm": 0.14471560716629028, + "learning_rate": 0.0001, + "loss": 0.1524, + "step": 1957 + }, + { + "epoch": 3.192044343006195, + "grad_norm": 0.14315104484558105, + "learning_rate": 0.0001, + "loss": 0.1528, + "step": 1958 + }, + { + "epoch": 3.193674600586893, + "grad_norm": 0.15579240024089813, + "learning_rate": 0.0001, + "loss": 0.1575, + "step": 1959 + }, + { + "epoch": 3.1953048581675905, + "grad_norm": 0.16074621677398682, + "learning_rate": 0.0001, + "loss": 0.1498, + "step": 1960 + }, + { + "epoch": 3.196935115748288, + "grad_norm": 0.14559932053089142, + "learning_rate": 0.0001, + "loss": 0.1525, + "step": 1961 + }, + { + "epoch": 3.198565373328986, + "grad_norm": 0.146773099899292, + "learning_rate": 0.0001, + "loss": 0.1568, + "step": 1962 + }, + { + "epoch": 3.2001956309096835, + "grad_norm": 0.1405738741159439, + "learning_rate": 0.0001, + "loss": 0.1553, + "step": 1963 + }, + { + "epoch": 3.2018258884903816, + "grad_norm": 0.1669120341539383, + "learning_rate": 0.0001, + "loss": 0.1652, + "step": 1964 + }, + { + "epoch": 3.2034561460710793, + "grad_norm": 0.14730647206306458, + "learning_rate": 0.0001, + "loss": 0.1532, + "step": 1965 + }, + { + "epoch": 3.205086403651777, + "grad_norm": 0.15380671620368958, + "learning_rate": 0.0001, + "loss": 0.1585, + "step": 1966 + }, + { + "epoch": 3.2067166612324747, + "grad_norm": 0.1475609540939331, + "learning_rate": 0.0001, + "loss": 0.1533, + "step": 1967 + }, + { + "epoch": 3.2083469188131724, + "grad_norm": 0.15261127054691315, + "learning_rate": 0.0001, + "loss": 0.1528, + "step": 1968 + }, + { + "epoch": 3.20997717639387, + "grad_norm": 0.13426612317562103, + "learning_rate": 0.0001, + "loss": 0.1459, + "step": 1969 + }, + { + "epoch": 3.211607433974568, + "grad_norm": 0.16383317112922668, + "learning_rate": 0.0001, + "loss": 0.1457, + "step": 1970 + }, + { + "epoch": 3.213237691555266, + "grad_norm": 0.12917453050613403, + "learning_rate": 0.0001, + "loss": 0.1447, + "step": 1971 + }, + { + "epoch": 3.2148679491359635, + "grad_norm": 0.14621089398860931, + "learning_rate": 0.0001, + "loss": 0.1425, + "step": 1972 + }, + { + "epoch": 3.216498206716661, + "grad_norm": 0.14736516773700714, + "learning_rate": 0.0001, + "loss": 0.1542, + "step": 1973 + }, + { + "epoch": 3.218128464297359, + "grad_norm": 0.18349121510982513, + "learning_rate": 0.0001, + "loss": 0.1581, + "step": 1974 + }, + { + "epoch": 3.2197587218780566, + "grad_norm": 0.13845403492450714, + "learning_rate": 0.0001, + "loss": 0.1531, + "step": 1975 + }, + { + "epoch": 3.2213889794587547, + "grad_norm": 0.17356093227863312, + "learning_rate": 0.0001, + "loss": 0.1499, + "step": 1976 + }, + { + "epoch": 3.2230192370394524, + "grad_norm": 0.13243253529071808, + "learning_rate": 0.0001, + "loss": 0.1551, + "step": 1977 + }, + { + "epoch": 3.22464949462015, + "grad_norm": 0.16005612909793854, + "learning_rate": 0.0001, + "loss": 0.1485, + "step": 1978 + }, + { + "epoch": 3.2262797522008477, + "grad_norm": 0.150725856423378, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 1979 + }, + { + "epoch": 3.2279100097815454, + "grad_norm": 0.1280834674835205, + "learning_rate": 0.0001, + "loss": 0.1508, + "step": 1980 + }, + { + "epoch": 3.229540267362243, + "grad_norm": 0.1567755490541458, + "learning_rate": 0.0001, + "loss": 0.1561, + "step": 1981 + }, + { + "epoch": 3.231170524942941, + "grad_norm": 0.13241910934448242, + "learning_rate": 0.0001, + "loss": 0.1483, + "step": 1982 + }, + { + "epoch": 3.232800782523639, + "grad_norm": 0.1435912549495697, + "learning_rate": 0.0001, + "loss": 0.1466, + "step": 1983 + }, + { + "epoch": 3.2344310401043366, + "grad_norm": 0.1577858179807663, + "learning_rate": 0.0001, + "loss": 0.1617, + "step": 1984 + }, + { + "epoch": 3.2360612976850343, + "grad_norm": 0.14068260788917542, + "learning_rate": 0.0001, + "loss": 0.1511, + "step": 1985 + }, + { + "epoch": 3.237691555265732, + "grad_norm": 0.1652795672416687, + "learning_rate": 0.0001, + "loss": 0.1744, + "step": 1986 + }, + { + "epoch": 3.2393218128464296, + "grad_norm": 0.14117799699306488, + "learning_rate": 0.0001, + "loss": 0.1525, + "step": 1987 + }, + { + "epoch": 3.2409520704271273, + "grad_norm": 0.18951740860939026, + "learning_rate": 0.0001, + "loss": 0.1491, + "step": 1988 + }, + { + "epoch": 3.2425823280078254, + "grad_norm": 0.17044013738632202, + "learning_rate": 0.0001, + "loss": 0.1416, + "step": 1989 + }, + { + "epoch": 3.244212585588523, + "grad_norm": 0.16022269427776337, + "learning_rate": 0.0001, + "loss": 0.1707, + "step": 1990 + }, + { + "epoch": 3.245842843169221, + "grad_norm": 0.17762519419193268, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 1991 + }, + { + "epoch": 3.2474731007499185, + "grad_norm": 0.15027785301208496, + "learning_rate": 0.0001, + "loss": 0.1665, + "step": 1992 + }, + { + "epoch": 3.249103358330616, + "grad_norm": 0.12501652538776398, + "learning_rate": 0.0001, + "loss": 0.1388, + "step": 1993 + }, + { + "epoch": 3.250733615911314, + "grad_norm": 0.14507539570331573, + "learning_rate": 0.0001, + "loss": 0.1667, + "step": 1994 + }, + { + "epoch": 3.252363873492012, + "grad_norm": 0.12410466372966766, + "learning_rate": 0.0001, + "loss": 0.152, + "step": 1995 + }, + { + "epoch": 3.2539941310727096, + "grad_norm": 0.1520231068134308, + "learning_rate": 0.0001, + "loss": 0.165, + "step": 1996 + }, + { + "epoch": 3.2556243886534073, + "grad_norm": 0.15757714211940765, + "learning_rate": 0.0001, + "loss": 0.1722, + "step": 1997 + }, + { + "epoch": 3.257254646234105, + "grad_norm": 0.19692480564117432, + "learning_rate": 0.0001, + "loss": 0.1586, + "step": 1998 + }, + { + "epoch": 3.2588849038148027, + "grad_norm": 0.1453189253807068, + "learning_rate": 0.0001, + "loss": 0.1524, + "step": 1999 + }, + { + "epoch": 3.2605151613955003, + "grad_norm": 0.14371079206466675, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 2000 + }, + { + "epoch": 3.2605151613955003, + "eval_loss": 0.19923283159732819, + "eval_runtime": 2814.5584, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.168, + "step": 2000 + }, + { + "epoch": 3.262145418976198, + "grad_norm": 0.15101583302021027, + "learning_rate": 0.0001, + "loss": 0.1628, + "step": 2001 + }, + { + "epoch": 3.263775676556896, + "grad_norm": 0.14068691432476044, + "learning_rate": 0.0001, + "loss": 0.1411, + "step": 2002 + }, + { + "epoch": 3.265405934137594, + "grad_norm": 0.1584228277206421, + "learning_rate": 0.0001, + "loss": 0.1623, + "step": 2003 + }, + { + "epoch": 3.2670361917182915, + "grad_norm": 0.14234709739685059, + "learning_rate": 0.0001, + "loss": 0.1547, + "step": 2004 + }, + { + "epoch": 3.268666449298989, + "grad_norm": 0.13787250220775604, + "learning_rate": 0.0001, + "loss": 0.1578, + "step": 2005 + }, + { + "epoch": 3.270296706879687, + "grad_norm": 0.13636542856693268, + "learning_rate": 0.0001, + "loss": 0.1501, + "step": 2006 + }, + { + "epoch": 3.2719269644603846, + "grad_norm": 0.1498252898454666, + "learning_rate": 0.0001, + "loss": 0.1628, + "step": 2007 + }, + { + "epoch": 3.2735572220410827, + "grad_norm": 0.15761011838912964, + "learning_rate": 0.0001, + "loss": 0.1533, + "step": 2008 + }, + { + "epoch": 3.2751874796217804, + "grad_norm": 0.13434697687625885, + "learning_rate": 0.0001, + "loss": 0.1681, + "step": 2009 + }, + { + "epoch": 3.276817737202478, + "grad_norm": 0.1439303308725357, + "learning_rate": 0.0001, + "loss": 0.1577, + "step": 2010 + }, + { + "epoch": 3.2784479947831757, + "grad_norm": 0.1279498040676117, + "learning_rate": 0.0001, + "loss": 0.15, + "step": 2011 + }, + { + "epoch": 3.2800782523638734, + "grad_norm": 0.17215445637702942, + "learning_rate": 0.0001, + "loss": 0.1632, + "step": 2012 + }, + { + "epoch": 3.281708509944571, + "grad_norm": 0.17374494671821594, + "learning_rate": 0.0001, + "loss": 0.1626, + "step": 2013 + }, + { + "epoch": 3.2833387675252688, + "grad_norm": 0.14224795997142792, + "learning_rate": 0.0001, + "loss": 0.1463, + "step": 2014 + }, + { + "epoch": 3.284969025105967, + "grad_norm": 0.13347376883029938, + "learning_rate": 0.0001, + "loss": 0.1594, + "step": 2015 + }, + { + "epoch": 3.2865992826866646, + "grad_norm": 0.12881265580654144, + "learning_rate": 0.0001, + "loss": 0.1569, + "step": 2016 + }, + { + "epoch": 3.2882295402673622, + "grad_norm": 0.14080087840557098, + "learning_rate": 0.0001, + "loss": 0.1522, + "step": 2017 + }, + { + "epoch": 3.28985979784806, + "grad_norm": 0.16095654666423798, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 2018 + }, + { + "epoch": 3.2914900554287576, + "grad_norm": 0.11454547941684723, + "learning_rate": 0.0001, + "loss": 0.1356, + "step": 2019 + }, + { + "epoch": 3.2931203130094557, + "grad_norm": 0.13725461065769196, + "learning_rate": 0.0001, + "loss": 0.1502, + "step": 2020 + }, + { + "epoch": 3.2947505705901534, + "grad_norm": 0.22002211213111877, + "learning_rate": 0.0001, + "loss": 0.1513, + "step": 2021 + }, + { + "epoch": 3.296380828170851, + "grad_norm": 0.15117506682872772, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 2022 + }, + { + "epoch": 3.2980110857515488, + "grad_norm": 0.14475668966770172, + "learning_rate": 0.0001, + "loss": 0.157, + "step": 2023 + }, + { + "epoch": 3.2996413433322465, + "grad_norm": 0.13530389964580536, + "learning_rate": 0.0001, + "loss": 0.1447, + "step": 2024 + }, + { + "epoch": 3.301271600912944, + "grad_norm": 0.14106786251068115, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 2025 + }, + { + "epoch": 3.302901858493642, + "grad_norm": 0.16226093471050262, + "learning_rate": 0.0001, + "loss": 0.1623, + "step": 2026 + }, + { + "epoch": 3.3045321160743395, + "grad_norm": 0.1750541478395462, + "learning_rate": 0.0001, + "loss": 0.1672, + "step": 2027 + }, + { + "epoch": 3.3061623736550376, + "grad_norm": 0.15967412292957306, + "learning_rate": 0.0001, + "loss": 0.1487, + "step": 2028 + }, + { + "epoch": 3.3077926312357353, + "grad_norm": 0.18519139289855957, + "learning_rate": 0.0001, + "loss": 0.1666, + "step": 2029 + }, + { + "epoch": 3.309422888816433, + "grad_norm": 0.14711232483386993, + "learning_rate": 0.0001, + "loss": 0.1617, + "step": 2030 + }, + { + "epoch": 3.3110531463971307, + "grad_norm": 0.14961479604244232, + "learning_rate": 0.0001, + "loss": 0.149, + "step": 2031 + }, + { + "epoch": 3.3126834039778283, + "grad_norm": 0.16532696783542633, + "learning_rate": 0.0001, + "loss": 0.157, + "step": 2032 + }, + { + "epoch": 3.3143136615585265, + "grad_norm": 0.1479751318693161, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2033 + }, + { + "epoch": 3.315943919139224, + "grad_norm": 0.16249758005142212, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 2034 + }, + { + "epoch": 3.317574176719922, + "grad_norm": 0.15464435517787933, + "learning_rate": 0.0001, + "loss": 0.1514, + "step": 2035 + }, + { + "epoch": 3.3192044343006195, + "grad_norm": 0.12793324887752533, + "learning_rate": 0.0001, + "loss": 0.1558, + "step": 2036 + }, + { + "epoch": 3.320834691881317, + "grad_norm": 0.1387512981891632, + "learning_rate": 0.0001, + "loss": 0.1568, + "step": 2037 + }, + { + "epoch": 3.322464949462015, + "grad_norm": 0.14236465096473694, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 2038 + }, + { + "epoch": 3.3240952070427126, + "grad_norm": 0.12923693656921387, + "learning_rate": 0.0001, + "loss": 0.1368, + "step": 2039 + }, + { + "epoch": 3.3257254646234107, + "grad_norm": 0.14453841745853424, + "learning_rate": 0.0001, + "loss": 0.1476, + "step": 2040 + }, + { + "epoch": 3.3273557222041084, + "grad_norm": 0.13822996616363525, + "learning_rate": 0.0001, + "loss": 0.1458, + "step": 2041 + }, + { + "epoch": 3.328985979784806, + "grad_norm": 0.1528143286705017, + "learning_rate": 0.0001, + "loss": 0.1694, + "step": 2042 + }, + { + "epoch": 3.3306162373655037, + "grad_norm": 0.14497984945774078, + "learning_rate": 0.0001, + "loss": 0.1446, + "step": 2043 + }, + { + "epoch": 3.3322464949462014, + "grad_norm": 0.16992256045341492, + "learning_rate": 0.0001, + "loss": 0.1561, + "step": 2044 + }, + { + "epoch": 3.333876752526899, + "grad_norm": 0.16492198407649994, + "learning_rate": 0.0001, + "loss": 0.1653, + "step": 2045 + }, + { + "epoch": 3.335507010107597, + "grad_norm": 0.14697159826755524, + "learning_rate": 0.0001, + "loss": 0.145, + "step": 2046 + }, + { + "epoch": 3.337137267688295, + "grad_norm": 0.1527179777622223, + "learning_rate": 0.0001, + "loss": 0.154, + "step": 2047 + }, + { + "epoch": 3.3387675252689926, + "grad_norm": 0.14120477437973022, + "learning_rate": 0.0001, + "loss": 0.1583, + "step": 2048 + }, + { + "epoch": 3.3403977828496902, + "grad_norm": 0.13726529479026794, + "learning_rate": 0.0001, + "loss": 0.1491, + "step": 2049 + }, + { + "epoch": 3.342028040430388, + "grad_norm": 0.13249808549880981, + "learning_rate": 0.0001, + "loss": 0.1615, + "step": 2050 + }, + { + "epoch": 3.3436582980110856, + "grad_norm": 0.14583012461662292, + "learning_rate": 0.0001, + "loss": 0.1515, + "step": 2051 + }, + { + "epoch": 3.3452885555917833, + "grad_norm": 0.12669849395751953, + "learning_rate": 0.0001, + "loss": 0.136, + "step": 2052 + }, + { + "epoch": 3.3469188131724814, + "grad_norm": 0.15459409356117249, + "learning_rate": 0.0001, + "loss": 0.1408, + "step": 2053 + }, + { + "epoch": 3.348549070753179, + "grad_norm": 0.15946894884109497, + "learning_rate": 0.0001, + "loss": 0.1532, + "step": 2054 + }, + { + "epoch": 3.3501793283338768, + "grad_norm": 0.15201634168624878, + "learning_rate": 0.0001, + "loss": 0.1593, + "step": 2055 + }, + { + "epoch": 3.3518095859145745, + "grad_norm": 0.15227296948432922, + "learning_rate": 0.0001, + "loss": 0.1444, + "step": 2056 + }, + { + "epoch": 3.353439843495272, + "grad_norm": 0.15703243017196655, + "learning_rate": 0.0001, + "loss": 0.1547, + "step": 2057 + }, + { + "epoch": 3.35507010107597, + "grad_norm": 0.14738823473453522, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 2058 + }, + { + "epoch": 3.356700358656668, + "grad_norm": 0.1499371975660324, + "learning_rate": 0.0001, + "loss": 0.1602, + "step": 2059 + }, + { + "epoch": 3.3583306162373656, + "grad_norm": 0.15711835026741028, + "learning_rate": 0.0001, + "loss": 0.1648, + "step": 2060 + }, + { + "epoch": 3.3599608738180633, + "grad_norm": 0.13238079845905304, + "learning_rate": 0.0001, + "loss": 0.1369, + "step": 2061 + }, + { + "epoch": 3.361591131398761, + "grad_norm": 0.13998901844024658, + "learning_rate": 0.0001, + "loss": 0.1578, + "step": 2062 + }, + { + "epoch": 3.3632213889794587, + "grad_norm": 0.13403183221817017, + "learning_rate": 0.0001, + "loss": 0.1491, + "step": 2063 + }, + { + "epoch": 3.3648516465601563, + "grad_norm": 0.12701204419136047, + "learning_rate": 0.0001, + "loss": 0.1428, + "step": 2064 + }, + { + "epoch": 3.366481904140854, + "grad_norm": 0.13955272734165192, + "learning_rate": 0.0001, + "loss": 0.1618, + "step": 2065 + }, + { + "epoch": 3.368112161721552, + "grad_norm": 0.1396985650062561, + "learning_rate": 0.0001, + "loss": 0.1558, + "step": 2066 + }, + { + "epoch": 3.36974241930225, + "grad_norm": 0.1466887891292572, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 2067 + }, + { + "epoch": 3.3713726768829475, + "grad_norm": 0.148866206407547, + "learning_rate": 0.0001, + "loss": 0.1525, + "step": 2068 + }, + { + "epoch": 3.373002934463645, + "grad_norm": 0.16650554537773132, + "learning_rate": 0.0001, + "loss": 0.1728, + "step": 2069 + }, + { + "epoch": 3.374633192044343, + "grad_norm": 0.14089016616344452, + "learning_rate": 0.0001, + "loss": 0.1527, + "step": 2070 + }, + { + "epoch": 3.376263449625041, + "grad_norm": 0.15425576269626617, + "learning_rate": 0.0001, + "loss": 0.1612, + "step": 2071 + }, + { + "epoch": 3.3778937072057387, + "grad_norm": 0.2497289627790451, + "learning_rate": 0.0001, + "loss": 0.1661, + "step": 2072 + }, + { + "epoch": 3.3795239647864364, + "grad_norm": 0.15727926790714264, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 2073 + }, + { + "epoch": 3.381154222367134, + "grad_norm": 0.13486291468143463, + "learning_rate": 0.0001, + "loss": 0.1435, + "step": 2074 + }, + { + "epoch": 3.3827844799478317, + "grad_norm": 0.13273045420646667, + "learning_rate": 0.0001, + "loss": 0.1599, + "step": 2075 + }, + { + "epoch": 3.3844147375285294, + "grad_norm": 0.14401297271251678, + "learning_rate": 0.0001, + "loss": 0.162, + "step": 2076 + }, + { + "epoch": 3.386044995109227, + "grad_norm": 0.1639464646577835, + "learning_rate": 0.0001, + "loss": 0.1749, + "step": 2077 + }, + { + "epoch": 3.387675252689925, + "grad_norm": 0.14354901015758514, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 2078 + }, + { + "epoch": 3.389305510270623, + "grad_norm": 0.1772812008857727, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2079 + }, + { + "epoch": 3.3909357678513206, + "grad_norm": 0.1357707381248474, + "learning_rate": 0.0001, + "loss": 0.1572, + "step": 2080 + }, + { + "epoch": 3.3925660254320182, + "grad_norm": 0.15610982477664948, + "learning_rate": 0.0001, + "loss": 0.1718, + "step": 2081 + }, + { + "epoch": 3.394196283012716, + "grad_norm": 0.1462925523519516, + "learning_rate": 0.0001, + "loss": 0.1458, + "step": 2082 + }, + { + "epoch": 3.3958265405934136, + "grad_norm": 0.1445331573486328, + "learning_rate": 0.0001, + "loss": 0.1614, + "step": 2083 + }, + { + "epoch": 3.3974567981741117, + "grad_norm": 0.13011384010314941, + "learning_rate": 0.0001, + "loss": 0.1629, + "step": 2084 + }, + { + "epoch": 3.3990870557548094, + "grad_norm": 0.13652506470680237, + "learning_rate": 0.0001, + "loss": 0.145, + "step": 2085 + }, + { + "epoch": 3.400717313335507, + "grad_norm": 0.15374945104122162, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 2086 + }, + { + "epoch": 3.4023475709162048, + "grad_norm": 0.17759419977664948, + "learning_rate": 0.0001, + "loss": 0.1831, + "step": 2087 + }, + { + "epoch": 3.4039778284969024, + "grad_norm": 0.13576067984104156, + "learning_rate": 0.0001, + "loss": 0.1609, + "step": 2088 + }, + { + "epoch": 3.4056080860776, + "grad_norm": 0.14758118987083435, + "learning_rate": 0.0001, + "loss": 0.1648, + "step": 2089 + }, + { + "epoch": 3.407238343658298, + "grad_norm": 0.14185144007205963, + "learning_rate": 0.0001, + "loss": 0.1553, + "step": 2090 + }, + { + "epoch": 3.408868601238996, + "grad_norm": 0.15567590296268463, + "learning_rate": 0.0001, + "loss": 0.162, + "step": 2091 + }, + { + "epoch": 3.4104988588196936, + "grad_norm": 0.14208033680915833, + "learning_rate": 0.0001, + "loss": 0.1648, + "step": 2092 + }, + { + "epoch": 3.4121291164003913, + "grad_norm": 0.150962695479393, + "learning_rate": 0.0001, + "loss": 0.1659, + "step": 2093 + }, + { + "epoch": 3.413759373981089, + "grad_norm": 0.18408554792404175, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 2094 + }, + { + "epoch": 3.4153896315617867, + "grad_norm": 0.1329030841588974, + "learning_rate": 0.0001, + "loss": 0.1434, + "step": 2095 + }, + { + "epoch": 3.4170198891424843, + "grad_norm": 0.19760195910930634, + "learning_rate": 0.0001, + "loss": 0.1616, + "step": 2096 + }, + { + "epoch": 3.4186501467231825, + "grad_norm": 0.13045673072338104, + "learning_rate": 0.0001, + "loss": 0.1482, + "step": 2097 + }, + { + "epoch": 3.42028040430388, + "grad_norm": 0.13792401552200317, + "learning_rate": 0.0001, + "loss": 0.1491, + "step": 2098 + }, + { + "epoch": 3.421910661884578, + "grad_norm": 0.15530133247375488, + "learning_rate": 0.0001, + "loss": 0.1569, + "step": 2099 + }, + { + "epoch": 3.4235409194652755, + "grad_norm": 0.16320976614952087, + "learning_rate": 0.0001, + "loss": 0.1504, + "step": 2100 + }, + { + "epoch": 3.425171177045973, + "grad_norm": 0.16057944297790527, + "learning_rate": 0.0001, + "loss": 0.1575, + "step": 2101 + }, + { + "epoch": 3.426801434626671, + "grad_norm": 0.16377048194408417, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 2102 + }, + { + "epoch": 3.4284316922073685, + "grad_norm": 0.14860613644123077, + "learning_rate": 0.0001, + "loss": 0.1666, + "step": 2103 + }, + { + "epoch": 3.4300619497880667, + "grad_norm": 0.1717861294746399, + "learning_rate": 0.0001, + "loss": 0.1727, + "step": 2104 + }, + { + "epoch": 3.4316922073687643, + "grad_norm": 0.18878856301307678, + "learning_rate": 0.0001, + "loss": 0.1744, + "step": 2105 + }, + { + "epoch": 3.433322464949462, + "grad_norm": 0.22134870290756226, + "learning_rate": 0.0001, + "loss": 0.1502, + "step": 2106 + }, + { + "epoch": 3.4349527225301597, + "grad_norm": 0.12796226143836975, + "learning_rate": 0.0001, + "loss": 0.1472, + "step": 2107 + }, + { + "epoch": 3.4365829801108574, + "grad_norm": 0.17766544222831726, + "learning_rate": 0.0001, + "loss": 0.1631, + "step": 2108 + }, + { + "epoch": 3.4382132376915555, + "grad_norm": 0.16302259266376495, + "learning_rate": 0.0001, + "loss": 0.154, + "step": 2109 + }, + { + "epoch": 3.439843495272253, + "grad_norm": 0.12639376521110535, + "learning_rate": 0.0001, + "loss": 0.1595, + "step": 2110 + }, + { + "epoch": 3.441473752852951, + "grad_norm": 0.13385580480098724, + "learning_rate": 0.0001, + "loss": 0.148, + "step": 2111 + }, + { + "epoch": 3.4431040104336486, + "grad_norm": 0.16364267468452454, + "learning_rate": 0.0001, + "loss": 0.1521, + "step": 2112 + }, + { + "epoch": 3.4447342680143462, + "grad_norm": 0.1307305246591568, + "learning_rate": 0.0001, + "loss": 0.1623, + "step": 2113 + }, + { + "epoch": 3.446364525595044, + "grad_norm": 0.1305004358291626, + "learning_rate": 0.0001, + "loss": 0.1428, + "step": 2114 + }, + { + "epoch": 3.4479947831757416, + "grad_norm": 0.1550890952348709, + "learning_rate": 0.0001, + "loss": 0.1521, + "step": 2115 + }, + { + "epoch": 3.4496250407564393, + "grad_norm": 0.14810912311077118, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 2116 + }, + { + "epoch": 3.4512552983371374, + "grad_norm": 0.15610523521900177, + "learning_rate": 0.0001, + "loss": 0.1619, + "step": 2117 + }, + { + "epoch": 3.452885555917835, + "grad_norm": 0.1532841920852661, + "learning_rate": 0.0001, + "loss": 0.1609, + "step": 2118 + }, + { + "epoch": 3.4545158134985328, + "grad_norm": 0.14741961658000946, + "learning_rate": 0.0001, + "loss": 0.167, + "step": 2119 + }, + { + "epoch": 3.4561460710792304, + "grad_norm": 0.13078215718269348, + "learning_rate": 0.0001, + "loss": 0.1428, + "step": 2120 + }, + { + "epoch": 3.457776328659928, + "grad_norm": 0.18994919955730438, + "learning_rate": 0.0001, + "loss": 0.1548, + "step": 2121 + }, + { + "epoch": 3.4594065862406262, + "grad_norm": 0.12827709317207336, + "learning_rate": 0.0001, + "loss": 0.1586, + "step": 2122 + }, + { + "epoch": 3.461036843821324, + "grad_norm": 0.15168218314647675, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 2123 + }, + { + "epoch": 3.4626671014020216, + "grad_norm": 0.13048698008060455, + "learning_rate": 0.0001, + "loss": 0.1566, + "step": 2124 + }, + { + "epoch": 3.4642973589827193, + "grad_norm": 0.13012981414794922, + "learning_rate": 0.0001, + "loss": 0.1523, + "step": 2125 + }, + { + "epoch": 3.465927616563417, + "grad_norm": 0.14694590866565704, + "learning_rate": 0.0001, + "loss": 0.1473, + "step": 2126 + }, + { + "epoch": 3.4675578741441146, + "grad_norm": 0.13174067437648773, + "learning_rate": 0.0001, + "loss": 0.1531, + "step": 2127 + }, + { + "epoch": 3.4691881317248123, + "grad_norm": 0.18024425208568573, + "learning_rate": 0.0001, + "loss": 0.1624, + "step": 2128 + }, + { + "epoch": 3.4708183893055105, + "grad_norm": 0.14632080495357513, + "learning_rate": 0.0001, + "loss": 0.1709, + "step": 2129 + }, + { + "epoch": 3.472448646886208, + "grad_norm": 0.13840989768505096, + "learning_rate": 0.0001, + "loss": 0.1576, + "step": 2130 + }, + { + "epoch": 3.474078904466906, + "grad_norm": 0.1256123036146164, + "learning_rate": 0.0001, + "loss": 0.1486, + "step": 2131 + }, + { + "epoch": 3.4757091620476035, + "grad_norm": 0.19333600997924805, + "learning_rate": 0.0001, + "loss": 0.1526, + "step": 2132 + }, + { + "epoch": 3.477339419628301, + "grad_norm": 0.1740863025188446, + "learning_rate": 0.0001, + "loss": 0.162, + "step": 2133 + }, + { + "epoch": 3.478969677208999, + "grad_norm": 0.17544031143188477, + "learning_rate": 0.0001, + "loss": 0.1743, + "step": 2134 + }, + { + "epoch": 3.480599934789697, + "grad_norm": 0.13726355135440826, + "learning_rate": 0.0001, + "loss": 0.146, + "step": 2135 + }, + { + "epoch": 3.4822301923703947, + "grad_norm": 0.13387274742126465, + "learning_rate": 0.0001, + "loss": 0.1444, + "step": 2136 + }, + { + "epoch": 3.4838604499510923, + "grad_norm": 0.1480950564146042, + "learning_rate": 0.0001, + "loss": 0.1656, + "step": 2137 + }, + { + "epoch": 3.48549070753179, + "grad_norm": 0.14710885286331177, + "learning_rate": 0.0001, + "loss": 0.1501, + "step": 2138 + }, + { + "epoch": 3.4871209651124877, + "grad_norm": 0.14011640846729279, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 2139 + }, + { + "epoch": 3.4887512226931854, + "grad_norm": 0.15596067905426025, + "learning_rate": 0.0001, + "loss": 0.1686, + "step": 2140 + }, + { + "epoch": 3.490381480273883, + "grad_norm": 0.14129222929477692, + "learning_rate": 0.0001, + "loss": 0.1597, + "step": 2141 + }, + { + "epoch": 3.492011737854581, + "grad_norm": 0.1319301277399063, + "learning_rate": 0.0001, + "loss": 0.1619, + "step": 2142 + }, + { + "epoch": 3.493641995435279, + "grad_norm": 0.16085407137870789, + "learning_rate": 0.0001, + "loss": 0.1542, + "step": 2143 + }, + { + "epoch": 3.4952722530159765, + "grad_norm": 0.1328543722629547, + "learning_rate": 0.0001, + "loss": 0.1501, + "step": 2144 + }, + { + "epoch": 3.4969025105966742, + "grad_norm": 0.14981688559055328, + "learning_rate": 0.0001, + "loss": 0.1601, + "step": 2145 + }, + { + "epoch": 3.498532768177372, + "grad_norm": 0.14664553105831146, + "learning_rate": 0.0001, + "loss": 0.1583, + "step": 2146 + }, + { + "epoch": 3.50016302575807, + "grad_norm": 0.13246874511241913, + "learning_rate": 0.0001, + "loss": 0.1491, + "step": 2147 + }, + { + "epoch": 3.5017932833387677, + "grad_norm": 0.15695396065711975, + "learning_rate": 0.0001, + "loss": 0.1793, + "step": 2148 + }, + { + "epoch": 3.5034235409194654, + "grad_norm": 0.12087570875883102, + "learning_rate": 0.0001, + "loss": 0.1396, + "step": 2149 + }, + { + "epoch": 3.505053798500163, + "grad_norm": 0.13022390007972717, + "learning_rate": 0.0001, + "loss": 0.1577, + "step": 2150 + }, + { + "epoch": 3.5066840560808608, + "grad_norm": 0.15572933852672577, + "learning_rate": 0.0001, + "loss": 0.1532, + "step": 2151 + }, + { + "epoch": 3.5083143136615584, + "grad_norm": 0.17626436054706573, + "learning_rate": 0.0001, + "loss": 0.1604, + "step": 2152 + }, + { + "epoch": 3.509944571242256, + "grad_norm": 0.14553359150886536, + "learning_rate": 0.0001, + "loss": 0.163, + "step": 2153 + }, + { + "epoch": 3.511574828822954, + "grad_norm": 0.17382773756980896, + "learning_rate": 0.0001, + "loss": 0.1641, + "step": 2154 + }, + { + "epoch": 3.513205086403652, + "grad_norm": 0.12575078010559082, + "learning_rate": 0.0001, + "loss": 0.1493, + "step": 2155 + }, + { + "epoch": 3.5148353439843496, + "grad_norm": 0.1463012993335724, + "learning_rate": 0.0001, + "loss": 0.159, + "step": 2156 + }, + { + "epoch": 3.5164656015650473, + "grad_norm": 0.16834314167499542, + "learning_rate": 0.0001, + "loss": 0.1659, + "step": 2157 + }, + { + "epoch": 3.518095859145745, + "grad_norm": 0.15499548614025116, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 2158 + }, + { + "epoch": 3.5197261167264426, + "grad_norm": 0.13314367830753326, + "learning_rate": 0.0001, + "loss": 0.1515, + "step": 2159 + }, + { + "epoch": 3.5213563743071408, + "grad_norm": 0.1413407176733017, + "learning_rate": 0.0001, + "loss": 0.1614, + "step": 2160 + }, + { + "epoch": 3.5229866318878384, + "grad_norm": 0.14445054531097412, + "learning_rate": 0.0001, + "loss": 0.1595, + "step": 2161 + }, + { + "epoch": 3.524616889468536, + "grad_norm": 0.14501655101776123, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 2162 + }, + { + "epoch": 3.526247147049234, + "grad_norm": 0.14677855372428894, + "learning_rate": 0.0001, + "loss": 0.1588, + "step": 2163 + }, + { + "epoch": 3.5278774046299315, + "grad_norm": 0.13915497064590454, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2164 + }, + { + "epoch": 3.529507662210629, + "grad_norm": 0.12632952630519867, + "learning_rate": 0.0001, + "loss": 0.1442, + "step": 2165 + }, + { + "epoch": 3.531137919791327, + "grad_norm": 0.13267867267131805, + "learning_rate": 0.0001, + "loss": 0.159, + "step": 2166 + }, + { + "epoch": 3.5327681773720245, + "grad_norm": 0.12538422644138336, + "learning_rate": 0.0001, + "loss": 0.1409, + "step": 2167 + }, + { + "epoch": 3.5343984349527227, + "grad_norm": 0.13974052667617798, + "learning_rate": 0.0001, + "loss": 0.1461, + "step": 2168 + }, + { + "epoch": 3.5360286925334203, + "grad_norm": 0.1575179398059845, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 2169 + }, + { + "epoch": 3.537658950114118, + "grad_norm": 0.20956085622310638, + "learning_rate": 0.0001, + "loss": 0.1583, + "step": 2170 + }, + { + "epoch": 3.5392892076948157, + "grad_norm": 0.13932162523269653, + "learning_rate": 0.0001, + "loss": 0.1493, + "step": 2171 + }, + { + "epoch": 3.5409194652755134, + "grad_norm": 0.16261732578277588, + "learning_rate": 0.0001, + "loss": 0.1603, + "step": 2172 + }, + { + "epoch": 3.5425497228562115, + "grad_norm": 0.15178941190242767, + "learning_rate": 0.0001, + "loss": 0.1601, + "step": 2173 + }, + { + "epoch": 3.544179980436909, + "grad_norm": 0.17579567432403564, + "learning_rate": 0.0001, + "loss": 0.1531, + "step": 2174 + }, + { + "epoch": 3.545810238017607, + "grad_norm": 0.1470760852098465, + "learning_rate": 0.0001, + "loss": 0.1498, + "step": 2175 + }, + { + "epoch": 3.5474404955983045, + "grad_norm": 0.1457168012857437, + "learning_rate": 0.0001, + "loss": 0.1667, + "step": 2176 + }, + { + "epoch": 3.5490707531790022, + "grad_norm": 0.1363757997751236, + "learning_rate": 0.0001, + "loss": 0.1538, + "step": 2177 + }, + { + "epoch": 3.5507010107597, + "grad_norm": 0.15826475620269775, + "learning_rate": 0.0001, + "loss": 0.1643, + "step": 2178 + }, + { + "epoch": 3.5523312683403976, + "grad_norm": 0.13432374596595764, + "learning_rate": 0.0001, + "loss": 0.1487, + "step": 2179 + }, + { + "epoch": 3.5539615259210953, + "grad_norm": 0.1316751092672348, + "learning_rate": 0.0001, + "loss": 0.1515, + "step": 2180 + }, + { + "epoch": 3.5555917835017934, + "grad_norm": 0.14341945946216583, + "learning_rate": 0.0001, + "loss": 0.1507, + "step": 2181 + }, + { + "epoch": 3.557222041082491, + "grad_norm": 0.15055926144123077, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2182 + }, + { + "epoch": 3.5588522986631888, + "grad_norm": 0.18501447141170502, + "learning_rate": 0.0001, + "loss": 0.1526, + "step": 2183 + }, + { + "epoch": 3.5604825562438864, + "grad_norm": 0.13240905106067657, + "learning_rate": 0.0001, + "loss": 0.1487, + "step": 2184 + }, + { + "epoch": 3.5621128138245846, + "grad_norm": 0.15811198949813843, + "learning_rate": 0.0001, + "loss": 0.1659, + "step": 2185 + }, + { + "epoch": 3.5637430714052822, + "grad_norm": 0.17472024261951447, + "learning_rate": 0.0001, + "loss": 0.1605, + "step": 2186 + }, + { + "epoch": 3.56537332898598, + "grad_norm": 0.1416904181241989, + "learning_rate": 0.0001, + "loss": 0.1563, + "step": 2187 + }, + { + "epoch": 3.5670035865666776, + "grad_norm": 0.18020367622375488, + "learning_rate": 0.0001, + "loss": 0.1733, + "step": 2188 + }, + { + "epoch": 3.5686338441473753, + "grad_norm": 0.1482088267803192, + "learning_rate": 0.0001, + "loss": 0.1619, + "step": 2189 + }, + { + "epoch": 3.570264101728073, + "grad_norm": 0.2286827713251114, + "learning_rate": 0.0001, + "loss": 0.1745, + "step": 2190 + }, + { + "epoch": 3.5718943593087706, + "grad_norm": 0.15454155206680298, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 2191 + }, + { + "epoch": 3.5735246168894683, + "grad_norm": 0.15183959901332855, + "learning_rate": 0.0001, + "loss": 0.1604, + "step": 2192 + }, + { + "epoch": 3.5751548744701664, + "grad_norm": 0.1492152214050293, + "learning_rate": 0.0001, + "loss": 0.1735, + "step": 2193 + }, + { + "epoch": 3.576785132050864, + "grad_norm": 0.15264061093330383, + "learning_rate": 0.0001, + "loss": 0.1541, + "step": 2194 + }, + { + "epoch": 3.578415389631562, + "grad_norm": 0.13549228012561798, + "learning_rate": 0.0001, + "loss": 0.1521, + "step": 2195 + }, + { + "epoch": 3.5800456472122595, + "grad_norm": 0.1467984914779663, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 2196 + }, + { + "epoch": 3.581675904792957, + "grad_norm": 0.13193894922733307, + "learning_rate": 0.0001, + "loss": 0.158, + "step": 2197 + }, + { + "epoch": 3.5833061623736553, + "grad_norm": 0.1380847692489624, + "learning_rate": 0.0001, + "loss": 0.1567, + "step": 2198 + }, + { + "epoch": 3.584936419954353, + "grad_norm": 0.12376043945550919, + "learning_rate": 0.0001, + "loss": 0.1642, + "step": 2199 + }, + { + "epoch": 3.5865666775350507, + "grad_norm": 0.1520168036222458, + "learning_rate": 0.0001, + "loss": 0.1668, + "step": 2200 + }, + { + "epoch": 3.5881969351157483, + "grad_norm": 0.15130695700645447, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 2201 + }, + { + "epoch": 3.589827192696446, + "grad_norm": 0.21414104104042053, + "learning_rate": 0.0001, + "loss": 0.1748, + "step": 2202 + }, + { + "epoch": 3.5914574502771437, + "grad_norm": 0.13339370489120483, + "learning_rate": 0.0001, + "loss": 0.1569, + "step": 2203 + }, + { + "epoch": 3.5930877078578414, + "grad_norm": 0.14717626571655273, + "learning_rate": 0.0001, + "loss": 0.1485, + "step": 2204 + }, + { + "epoch": 3.594717965438539, + "grad_norm": 0.14454171061515808, + "learning_rate": 0.0001, + "loss": 0.1561, + "step": 2205 + }, + { + "epoch": 3.596348223019237, + "grad_norm": 0.1337466984987259, + "learning_rate": 0.0001, + "loss": 0.1548, + "step": 2206 + }, + { + "epoch": 3.597978480599935, + "grad_norm": 0.13074032962322235, + "learning_rate": 0.0001, + "loss": 0.1602, + "step": 2207 + }, + { + "epoch": 3.5996087381806325, + "grad_norm": 0.1627005934715271, + "learning_rate": 0.0001, + "loss": 0.1582, + "step": 2208 + }, + { + "epoch": 3.60123899576133, + "grad_norm": 0.16543006896972656, + "learning_rate": 0.0001, + "loss": 0.1699, + "step": 2209 + }, + { + "epoch": 3.602869253342028, + "grad_norm": 0.15218278765678406, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2210 + }, + { + "epoch": 3.604499510922726, + "grad_norm": 0.15008680522441864, + "learning_rate": 0.0001, + "loss": 0.1567, + "step": 2211 + }, + { + "epoch": 3.6061297685034237, + "grad_norm": 0.1402311772108078, + "learning_rate": 0.0001, + "loss": 0.149, + "step": 2212 + }, + { + "epoch": 3.6077600260841214, + "grad_norm": 0.12671758234500885, + "learning_rate": 0.0001, + "loss": 0.1452, + "step": 2213 + }, + { + "epoch": 3.609390283664819, + "grad_norm": 0.13454151153564453, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 2214 + }, + { + "epoch": 3.6110205412455167, + "grad_norm": 0.13921399414539337, + "learning_rate": 0.0001, + "loss": 0.1686, + "step": 2215 + }, + { + "epoch": 3.6126507988262144, + "grad_norm": 0.15764585137367249, + "learning_rate": 0.0001, + "loss": 0.1696, + "step": 2216 + }, + { + "epoch": 3.614281056406912, + "grad_norm": 0.13245287537574768, + "learning_rate": 0.0001, + "loss": 0.1497, + "step": 2217 + }, + { + "epoch": 3.61591131398761, + "grad_norm": 0.15295889973640442, + "learning_rate": 0.0001, + "loss": 0.1548, + "step": 2218 + }, + { + "epoch": 3.617541571568308, + "grad_norm": 0.1387680172920227, + "learning_rate": 0.0001, + "loss": 0.1566, + "step": 2219 + }, + { + "epoch": 3.6191718291490056, + "grad_norm": 0.16183353960514069, + "learning_rate": 0.0001, + "loss": 0.1713, + "step": 2220 + }, + { + "epoch": 3.6208020867297033, + "grad_norm": 0.15998132526874542, + "learning_rate": 0.0001, + "loss": 0.1716, + "step": 2221 + }, + { + "epoch": 3.622432344310401, + "grad_norm": 0.14948931336402893, + "learning_rate": 0.0001, + "loss": 0.1558, + "step": 2222 + }, + { + "epoch": 3.6240626018910986, + "grad_norm": 0.13298356533050537, + "learning_rate": 0.0001, + "loss": 0.1497, + "step": 2223 + }, + { + "epoch": 3.6256928594717968, + "grad_norm": 0.1536460816860199, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 2224 + }, + { + "epoch": 3.6273231170524944, + "grad_norm": 0.13881844282150269, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 2225 + }, + { + "epoch": 3.628953374633192, + "grad_norm": 0.14646394550800323, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 2226 + }, + { + "epoch": 3.63058363221389, + "grad_norm": 0.16166602075099945, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 2227 + }, + { + "epoch": 3.6322138897945875, + "grad_norm": 0.1409720927476883, + "learning_rate": 0.0001, + "loss": 0.1553, + "step": 2228 + }, + { + "epoch": 3.633844147375285, + "grad_norm": 0.14227084815502167, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2229 + }, + { + "epoch": 3.635474404955983, + "grad_norm": 0.13686221837997437, + "learning_rate": 0.0001, + "loss": 0.1513, + "step": 2230 + }, + { + "epoch": 3.6371046625366805, + "grad_norm": 0.1413184404373169, + "learning_rate": 0.0001, + "loss": 0.1614, + "step": 2231 + }, + { + "epoch": 3.6387349201173786, + "grad_norm": 0.15510554611682892, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 2232 + }, + { + "epoch": 3.6403651776980763, + "grad_norm": 0.17040590941905975, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 2233 + }, + { + "epoch": 3.641995435278774, + "grad_norm": 0.14824065566062927, + "learning_rate": 0.0001, + "loss": 0.1626, + "step": 2234 + }, + { + "epoch": 3.6436256928594717, + "grad_norm": 0.13972316682338715, + "learning_rate": 0.0001, + "loss": 0.1615, + "step": 2235 + }, + { + "epoch": 3.64525595044017, + "grad_norm": 0.13162623345851898, + "learning_rate": 0.0001, + "loss": 0.1767, + "step": 2236 + }, + { + "epoch": 3.6468862080208675, + "grad_norm": 0.1307874321937561, + "learning_rate": 0.0001, + "loss": 0.1464, + "step": 2237 + }, + { + "epoch": 3.648516465601565, + "grad_norm": 0.13576972484588623, + "learning_rate": 0.0001, + "loss": 0.1629, + "step": 2238 + }, + { + "epoch": 3.650146723182263, + "grad_norm": 0.17965367436408997, + "learning_rate": 0.0001, + "loss": 0.1658, + "step": 2239 + }, + { + "epoch": 3.6517769807629605, + "grad_norm": 0.1429440826177597, + "learning_rate": 0.0001, + "loss": 0.1532, + "step": 2240 + }, + { + "epoch": 3.653407238343658, + "grad_norm": 0.1440856158733368, + "learning_rate": 0.0001, + "loss": 0.1483, + "step": 2241 + }, + { + "epoch": 3.655037495924356, + "grad_norm": 0.15915343165397644, + "learning_rate": 0.0001, + "loss": 0.1551, + "step": 2242 + }, + { + "epoch": 3.6566677535050536, + "grad_norm": 0.16080951690673828, + "learning_rate": 0.0001, + "loss": 0.1573, + "step": 2243 + }, + { + "epoch": 3.6582980110857517, + "grad_norm": 0.166608989238739, + "learning_rate": 0.0001, + "loss": 0.1501, + "step": 2244 + }, + { + "epoch": 3.6599282686664494, + "grad_norm": 0.16346345841884613, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 2245 + }, + { + "epoch": 3.661558526247147, + "grad_norm": 0.1675470620393753, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 2246 + }, + { + "epoch": 3.6631887838278447, + "grad_norm": 0.15890179574489594, + "learning_rate": 0.0001, + "loss": 0.1633, + "step": 2247 + }, + { + "epoch": 3.6648190414085424, + "grad_norm": 0.1385503113269806, + "learning_rate": 0.0001, + "loss": 0.1517, + "step": 2248 + }, + { + "epoch": 3.6664492989892405, + "grad_norm": 0.14586155116558075, + "learning_rate": 0.0001, + "loss": 0.1647, + "step": 2249 + }, + { + "epoch": 3.6680795565699382, + "grad_norm": 0.15654848515987396, + "learning_rate": 0.0001, + "loss": 0.1782, + "step": 2250 + }, + { + "epoch": 3.669709814150636, + "grad_norm": 0.14317664504051208, + "learning_rate": 0.0001, + "loss": 0.1527, + "step": 2251 + }, + { + "epoch": 3.6713400717313336, + "grad_norm": 0.1403859406709671, + "learning_rate": 0.0001, + "loss": 0.1611, + "step": 2252 + }, + { + "epoch": 3.6729703293120313, + "grad_norm": 0.16431018710136414, + "learning_rate": 0.0001, + "loss": 0.1633, + "step": 2253 + }, + { + "epoch": 3.674600586892729, + "grad_norm": 0.14151932299137115, + "learning_rate": 0.0001, + "loss": 0.1465, + "step": 2254 + }, + { + "epoch": 3.6762308444734266, + "grad_norm": 0.1487053781747818, + "learning_rate": 0.0001, + "loss": 0.1595, + "step": 2255 + }, + { + "epoch": 3.6778611020541243, + "grad_norm": 0.15442033112049103, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 2256 + }, + { + "epoch": 3.6794913596348224, + "grad_norm": 0.13494384288787842, + "learning_rate": 0.0001, + "loss": 0.1544, + "step": 2257 + }, + { + "epoch": 3.68112161721552, + "grad_norm": 0.1306290328502655, + "learning_rate": 0.0001, + "loss": 0.1472, + "step": 2258 + }, + { + "epoch": 3.682751874796218, + "grad_norm": 0.15780360996723175, + "learning_rate": 0.0001, + "loss": 0.1579, + "step": 2259 + }, + { + "epoch": 3.6843821323769155, + "grad_norm": 0.14508086442947388, + "learning_rate": 0.0001, + "loss": 0.1634, + "step": 2260 + }, + { + "epoch": 3.686012389957613, + "grad_norm": 0.21266472339630127, + "learning_rate": 0.0001, + "loss": 0.1639, + "step": 2261 + }, + { + "epoch": 3.6876426475383113, + "grad_norm": 0.14544187486171722, + "learning_rate": 0.0001, + "loss": 0.1665, + "step": 2262 + }, + { + "epoch": 3.689272905119009, + "grad_norm": 0.1321001499891281, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 2263 + }, + { + "epoch": 3.6909031626997066, + "grad_norm": 0.15363967418670654, + "learning_rate": 0.0001, + "loss": 0.1452, + "step": 2264 + }, + { + "epoch": 3.6925334202804043, + "grad_norm": 0.149633526802063, + "learning_rate": 0.0001, + "loss": 0.1603, + "step": 2265 + }, + { + "epoch": 3.694163677861102, + "grad_norm": 0.15210863947868347, + "learning_rate": 0.0001, + "loss": 0.1528, + "step": 2266 + }, + { + "epoch": 3.6957939354417997, + "grad_norm": 0.1460522711277008, + "learning_rate": 0.0001, + "loss": 0.1634, + "step": 2267 + }, + { + "epoch": 3.6974241930224974, + "grad_norm": 0.15007948875427246, + "learning_rate": 0.0001, + "loss": 0.1468, + "step": 2268 + }, + { + "epoch": 3.699054450603195, + "grad_norm": 0.15326817333698273, + "learning_rate": 0.0001, + "loss": 0.1586, + "step": 2269 + }, + { + "epoch": 3.700684708183893, + "grad_norm": 0.14278486371040344, + "learning_rate": 0.0001, + "loss": 0.1694, + "step": 2270 + }, + { + "epoch": 3.702314965764591, + "grad_norm": 0.14537227153778076, + "learning_rate": 0.0001, + "loss": 0.1559, + "step": 2271 + }, + { + "epoch": 3.7039452233452885, + "grad_norm": 0.1516200751066208, + "learning_rate": 0.0001, + "loss": 0.1577, + "step": 2272 + }, + { + "epoch": 3.705575480925986, + "grad_norm": 0.14554426074028015, + "learning_rate": 0.0001, + "loss": 0.1645, + "step": 2273 + }, + { + "epoch": 3.707205738506684, + "grad_norm": 0.1325369030237198, + "learning_rate": 0.0001, + "loss": 0.1614, + "step": 2274 + }, + { + "epoch": 3.708835996087382, + "grad_norm": 0.1257917732000351, + "learning_rate": 0.0001, + "loss": 0.1445, + "step": 2275 + }, + { + "epoch": 3.7104662536680797, + "grad_norm": 0.1275888830423355, + "learning_rate": 0.0001, + "loss": 0.16, + "step": 2276 + }, + { + "epoch": 3.7120965112487774, + "grad_norm": 0.14174294471740723, + "learning_rate": 0.0001, + "loss": 0.1551, + "step": 2277 + }, + { + "epoch": 3.713726768829475, + "grad_norm": 0.16611164808273315, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 2278 + }, + { + "epoch": 3.7153570264101727, + "grad_norm": 0.1329105794429779, + "learning_rate": 0.0001, + "loss": 0.153, + "step": 2279 + }, + { + "epoch": 3.7169872839908704, + "grad_norm": 0.13968057930469513, + "learning_rate": 0.0001, + "loss": 0.158, + "step": 2280 + }, + { + "epoch": 3.718617541571568, + "grad_norm": 0.1441374272108078, + "learning_rate": 0.0001, + "loss": 0.1452, + "step": 2281 + }, + { + "epoch": 3.720247799152266, + "grad_norm": 0.14315244555473328, + "learning_rate": 0.0001, + "loss": 0.1561, + "step": 2282 + }, + { + "epoch": 3.721878056732964, + "grad_norm": 0.12441976368427277, + "learning_rate": 0.0001, + "loss": 0.1544, + "step": 2283 + }, + { + "epoch": 3.7235083143136616, + "grad_norm": 0.14892461895942688, + "learning_rate": 0.0001, + "loss": 0.1622, + "step": 2284 + }, + { + "epoch": 3.7251385718943593, + "grad_norm": 0.11621859669685364, + "learning_rate": 0.0001, + "loss": 0.1441, + "step": 2285 + }, + { + "epoch": 3.726768829475057, + "grad_norm": 0.1611883044242859, + "learning_rate": 0.0001, + "loss": 0.1531, + "step": 2286 + }, + { + "epoch": 3.728399087055755, + "grad_norm": 0.15095670521259308, + "learning_rate": 0.0001, + "loss": 0.1671, + "step": 2287 + }, + { + "epoch": 3.7300293446364527, + "grad_norm": 0.1477982997894287, + "learning_rate": 0.0001, + "loss": 0.1529, + "step": 2288 + }, + { + "epoch": 3.7316596022171504, + "grad_norm": 0.17801976203918457, + "learning_rate": 0.0001, + "loss": 0.1698, + "step": 2289 + }, + { + "epoch": 3.733289859797848, + "grad_norm": 0.12449807673692703, + "learning_rate": 0.0001, + "loss": 0.1411, + "step": 2290 + }, + { + "epoch": 3.734920117378546, + "grad_norm": 0.14423514902591705, + "learning_rate": 0.0001, + "loss": 0.1507, + "step": 2291 + }, + { + "epoch": 3.7365503749592435, + "grad_norm": 0.15219520032405853, + "learning_rate": 0.0001, + "loss": 0.1746, + "step": 2292 + }, + { + "epoch": 3.738180632539941, + "grad_norm": 0.13563816249370575, + "learning_rate": 0.0001, + "loss": 0.1446, + "step": 2293 + }, + { + "epoch": 3.739810890120639, + "grad_norm": 0.13731209933757782, + "learning_rate": 0.0001, + "loss": 0.1467, + "step": 2294 + }, + { + "epoch": 3.741441147701337, + "grad_norm": 0.14237910509109497, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 2295 + }, + { + "epoch": 3.7430714052820346, + "grad_norm": 0.13505598902702332, + "learning_rate": 0.0001, + "loss": 0.148, + "step": 2296 + }, + { + "epoch": 3.7447016628627323, + "grad_norm": 0.17628848552703857, + "learning_rate": 0.0001, + "loss": 0.1539, + "step": 2297 + }, + { + "epoch": 3.74633192044343, + "grad_norm": 0.14530231058597565, + "learning_rate": 0.0001, + "loss": 0.1572, + "step": 2298 + }, + { + "epoch": 3.7479621780241277, + "grad_norm": 0.14856889843940735, + "learning_rate": 0.0001, + "loss": 0.1623, + "step": 2299 + }, + { + "epoch": 3.749592435604826, + "grad_norm": 0.170349583029747, + "learning_rate": 0.0001, + "loss": 0.1598, + "step": 2300 + }, + { + "epoch": 3.7512226931855235, + "grad_norm": 0.14849968254566193, + "learning_rate": 0.0001, + "loss": 0.1751, + "step": 2301 + }, + { + "epoch": 3.752852950766221, + "grad_norm": 0.1442185491323471, + "learning_rate": 0.0001, + "loss": 0.1454, + "step": 2302 + }, + { + "epoch": 3.754483208346919, + "grad_norm": 0.14942088723182678, + "learning_rate": 0.0001, + "loss": 0.151, + "step": 2303 + }, + { + "epoch": 3.7561134659276165, + "grad_norm": 0.15279610455036163, + "learning_rate": 0.0001, + "loss": 0.1574, + "step": 2304 + }, + { + "epoch": 3.757743723508314, + "grad_norm": 0.1706325262784958, + "learning_rate": 0.0001, + "loss": 0.1537, + "step": 2305 + }, + { + "epoch": 3.759373981089012, + "grad_norm": 0.15288381278514862, + "learning_rate": 0.0001, + "loss": 0.1618, + "step": 2306 + }, + { + "epoch": 3.7610042386697096, + "grad_norm": 0.1183413565158844, + "learning_rate": 0.0001, + "loss": 0.1478, + "step": 2307 + }, + { + "epoch": 3.7626344962504077, + "grad_norm": 0.13402055203914642, + "learning_rate": 0.0001, + "loss": 0.1464, + "step": 2308 + }, + { + "epoch": 3.7642647538311054, + "grad_norm": 0.14990192651748657, + "learning_rate": 0.0001, + "loss": 0.1568, + "step": 2309 + }, + { + "epoch": 3.765895011411803, + "grad_norm": 0.15893998742103577, + "learning_rate": 0.0001, + "loss": 0.1541, + "step": 2310 + }, + { + "epoch": 3.7675252689925007, + "grad_norm": 0.15546587109565735, + "learning_rate": 0.0001, + "loss": 0.1541, + "step": 2311 + }, + { + "epoch": 3.7691555265731984, + "grad_norm": 0.16592715680599213, + "learning_rate": 0.0001, + "loss": 0.1632, + "step": 2312 + }, + { + "epoch": 3.7707857841538965, + "grad_norm": 0.15784431993961334, + "learning_rate": 0.0001, + "loss": 0.1627, + "step": 2313 + }, + { + "epoch": 3.772416041734594, + "grad_norm": 0.16422522068023682, + "learning_rate": 0.0001, + "loss": 0.1589, + "step": 2314 + }, + { + "epoch": 3.774046299315292, + "grad_norm": 0.14868198335170746, + "learning_rate": 0.0001, + "loss": 0.168, + "step": 2315 + }, + { + "epoch": 3.7756765568959896, + "grad_norm": 0.1566770076751709, + "learning_rate": 0.0001, + "loss": 0.1657, + "step": 2316 + }, + { + "epoch": 3.7773068144766873, + "grad_norm": 0.16978543996810913, + "learning_rate": 0.0001, + "loss": 0.1442, + "step": 2317 + }, + { + "epoch": 3.778937072057385, + "grad_norm": 0.15802276134490967, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 2318 + }, + { + "epoch": 3.7805673296380826, + "grad_norm": 0.15518346428871155, + "learning_rate": 0.0001, + "loss": 0.156, + "step": 2319 + }, + { + "epoch": 3.7821975872187803, + "grad_norm": 0.1578715294599533, + "learning_rate": 0.0001, + "loss": 0.1687, + "step": 2320 + }, + { + "epoch": 3.7838278447994784, + "grad_norm": 0.14465771615505219, + "learning_rate": 0.0001, + "loss": 0.1474, + "step": 2321 + }, + { + "epoch": 3.785458102380176, + "grad_norm": 0.1761205941438675, + "learning_rate": 0.0001, + "loss": 0.1566, + "step": 2322 + }, + { + "epoch": 3.787088359960874, + "grad_norm": 0.1283109188079834, + "learning_rate": 0.0001, + "loss": 0.1473, + "step": 2323 + }, + { + "epoch": 3.7887186175415715, + "grad_norm": 0.14071565866470337, + "learning_rate": 0.0001, + "loss": 0.151, + "step": 2324 + }, + { + "epoch": 3.7903488751222696, + "grad_norm": 0.17781133949756622, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 2325 + }, + { + "epoch": 3.7919791327029673, + "grad_norm": 0.1420576274394989, + "learning_rate": 0.0001, + "loss": 0.1668, + "step": 2326 + }, + { + "epoch": 3.793609390283665, + "grad_norm": 0.1406291425228119, + "learning_rate": 0.0001, + "loss": 0.1454, + "step": 2327 + }, + { + "epoch": 3.7952396478643626, + "grad_norm": 0.14093580842018127, + "learning_rate": 0.0001, + "loss": 0.1495, + "step": 2328 + }, + { + "epoch": 3.7968699054450603, + "grad_norm": 0.12997625768184662, + "learning_rate": 0.0001, + "loss": 0.1512, + "step": 2329 + }, + { + "epoch": 3.798500163025758, + "grad_norm": 0.1375696063041687, + "learning_rate": 0.0001, + "loss": 0.1515, + "step": 2330 + }, + { + "epoch": 3.8001304206064557, + "grad_norm": 0.15136592090129852, + "learning_rate": 0.0001, + "loss": 0.1583, + "step": 2331 + }, + { + "epoch": 3.8017606781871534, + "grad_norm": 0.1388722062110901, + "learning_rate": 0.0001, + "loss": 0.1516, + "step": 2332 + }, + { + "epoch": 3.8033909357678515, + "grad_norm": 0.15683171153068542, + "learning_rate": 0.0001, + "loss": 0.1726, + "step": 2333 + }, + { + "epoch": 3.805021193348549, + "grad_norm": 0.13569128513336182, + "learning_rate": 0.0001, + "loss": 0.1585, + "step": 2334 + }, + { + "epoch": 3.806651450929247, + "grad_norm": 0.14862379431724548, + "learning_rate": 0.0001, + "loss": 0.1553, + "step": 2335 + }, + { + "epoch": 3.8082817085099445, + "grad_norm": 0.14512869715690613, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 2336 + }, + { + "epoch": 3.809911966090642, + "grad_norm": 0.13408806920051575, + "learning_rate": 0.0001, + "loss": 0.1732, + "step": 2337 + }, + { + "epoch": 3.8115422236713403, + "grad_norm": 0.1384328156709671, + "learning_rate": 0.0001, + "loss": 0.1631, + "step": 2338 + }, + { + "epoch": 3.813172481252038, + "grad_norm": 0.1299862563610077, + "learning_rate": 0.0001, + "loss": 0.16, + "step": 2339 + }, + { + "epoch": 3.8148027388327357, + "grad_norm": 0.13506759703159332, + "learning_rate": 0.0001, + "loss": 0.1719, + "step": 2340 + }, + { + "epoch": 3.8164329964134334, + "grad_norm": 0.1495598405599594, + "learning_rate": 0.0001, + "loss": 0.1501, + "step": 2341 + }, + { + "epoch": 3.818063253994131, + "grad_norm": 0.1325817108154297, + "learning_rate": 0.0001, + "loss": 0.1525, + "step": 2342 + }, + { + "epoch": 3.8196935115748287, + "grad_norm": 0.13198593258857727, + "learning_rate": 0.0001, + "loss": 0.1731, + "step": 2343 + }, + { + "epoch": 3.8213237691555264, + "grad_norm": 0.1505766659975052, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 2344 + }, + { + "epoch": 3.822954026736224, + "grad_norm": 0.13334894180297852, + "learning_rate": 0.0001, + "loss": 0.1497, + "step": 2345 + }, + { + "epoch": 3.824584284316922, + "grad_norm": 0.15418575704097748, + "learning_rate": 0.0001, + "loss": 0.1588, + "step": 2346 + }, + { + "epoch": 3.82621454189762, + "grad_norm": 0.1457456350326538, + "learning_rate": 0.0001, + "loss": 0.1482, + "step": 2347 + }, + { + "epoch": 3.8278447994783176, + "grad_norm": 0.20289213955402374, + "learning_rate": 0.0001, + "loss": 0.1491, + "step": 2348 + }, + { + "epoch": 3.8294750570590153, + "grad_norm": 0.15711665153503418, + "learning_rate": 0.0001, + "loss": 0.16, + "step": 2349 + }, + { + "epoch": 3.831105314639713, + "grad_norm": 0.17610590159893036, + "learning_rate": 0.0001, + "loss": 0.1741, + "step": 2350 + }, + { + "epoch": 3.832735572220411, + "grad_norm": 0.15204782783985138, + "learning_rate": 0.0001, + "loss": 0.1443, + "step": 2351 + }, + { + "epoch": 3.8343658298011087, + "grad_norm": 0.18176928162574768, + "learning_rate": 0.0001, + "loss": 0.1685, + "step": 2352 + }, + { + "epoch": 3.8359960873818064, + "grad_norm": 0.16083329916000366, + "learning_rate": 0.0001, + "loss": 0.1564, + "step": 2353 + }, + { + "epoch": 3.837626344962504, + "grad_norm": 0.1513577103614807, + "learning_rate": 0.0001, + "loss": 0.1556, + "step": 2354 + }, + { + "epoch": 3.839256602543202, + "grad_norm": 0.15226048231124878, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 2355 + }, + { + "epoch": 3.8408868601238995, + "grad_norm": 0.16025280952453613, + "learning_rate": 0.0001, + "loss": 0.1512, + "step": 2356 + }, + { + "epoch": 3.842517117704597, + "grad_norm": 0.16616910696029663, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 2357 + }, + { + "epoch": 3.844147375285295, + "grad_norm": 0.14858278632164001, + "learning_rate": 0.0001, + "loss": 0.1332, + "step": 2358 + }, + { + "epoch": 3.845777632865993, + "grad_norm": 0.15278182923793793, + "learning_rate": 0.0001, + "loss": 0.154, + "step": 2359 + }, + { + "epoch": 3.8474078904466906, + "grad_norm": 0.15966159105300903, + "learning_rate": 0.0001, + "loss": 0.1541, + "step": 2360 + }, + { + "epoch": 3.8490381480273883, + "grad_norm": 0.12534458935260773, + "learning_rate": 0.0001, + "loss": 0.1547, + "step": 2361 + }, + { + "epoch": 3.850668405608086, + "grad_norm": 0.14975754916667938, + "learning_rate": 0.0001, + "loss": 0.179, + "step": 2362 + }, + { + "epoch": 3.8522986631887837, + "grad_norm": 0.130054771900177, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 2363 + }, + { + "epoch": 3.853928920769482, + "grad_norm": 0.1336638629436493, + "learning_rate": 0.0001, + "loss": 0.1566, + "step": 2364 + }, + { + "epoch": 3.8555591783501795, + "grad_norm": 0.14911198616027832, + "learning_rate": 0.0001, + "loss": 0.1617, + "step": 2365 + }, + { + "epoch": 3.857189435930877, + "grad_norm": 0.1424766629934311, + "learning_rate": 0.0001, + "loss": 0.164, + "step": 2366 + }, + { + "epoch": 3.858819693511575, + "grad_norm": 0.13621950149536133, + "learning_rate": 0.0001, + "loss": 0.1584, + "step": 2367 + }, + { + "epoch": 3.8604499510922725, + "grad_norm": 0.13983507454395294, + "learning_rate": 0.0001, + "loss": 0.1441, + "step": 2368 + }, + { + "epoch": 3.86208020867297, + "grad_norm": 0.14439599215984344, + "learning_rate": 0.0001, + "loss": 0.1568, + "step": 2369 + }, + { + "epoch": 3.863710466253668, + "grad_norm": 0.14129005372524261, + "learning_rate": 0.0001, + "loss": 0.1675, + "step": 2370 + }, + { + "epoch": 3.8653407238343656, + "grad_norm": 0.14672988653182983, + "learning_rate": 0.0001, + "loss": 0.1706, + "step": 2371 + }, + { + "epoch": 3.8669709814150637, + "grad_norm": 0.14792685210704803, + "learning_rate": 0.0001, + "loss": 0.1692, + "step": 2372 + }, + { + "epoch": 3.8686012389957614, + "grad_norm": 0.1503150761127472, + "learning_rate": 0.0001, + "loss": 0.1619, + "step": 2373 + }, + { + "epoch": 3.870231496576459, + "grad_norm": 0.15507511794567108, + "learning_rate": 0.0001, + "loss": 0.1681, + "step": 2374 + }, + { + "epoch": 3.8718617541571567, + "grad_norm": 0.1679387390613556, + "learning_rate": 0.0001, + "loss": 0.1451, + "step": 2375 + }, + { + "epoch": 3.873492011737855, + "grad_norm": 0.15510492026805878, + "learning_rate": 0.0001, + "loss": 0.1719, + "step": 2376 + }, + { + "epoch": 3.8751222693185525, + "grad_norm": 0.13814286887645721, + "learning_rate": 0.0001, + "loss": 0.1529, + "step": 2377 + }, + { + "epoch": 3.87675252689925, + "grad_norm": 0.12971526384353638, + "learning_rate": 0.0001, + "loss": 0.1413, + "step": 2378 + }, + { + "epoch": 3.878382784479948, + "grad_norm": 0.13721339404582977, + "learning_rate": 0.0001, + "loss": 0.1527, + "step": 2379 + }, + { + "epoch": 3.8800130420606456, + "grad_norm": 0.1391274333000183, + "learning_rate": 0.0001, + "loss": 0.1544, + "step": 2380 + }, + { + "epoch": 3.8816432996413432, + "grad_norm": 0.12956668436527252, + "learning_rate": 0.0001, + "loss": 0.1391, + "step": 2381 + }, + { + "epoch": 3.883273557222041, + "grad_norm": 0.12296263873577118, + "learning_rate": 0.0001, + "loss": 0.1668, + "step": 2382 + }, + { + "epoch": 3.8849038148027386, + "grad_norm": 0.14773519337177277, + "learning_rate": 0.0001, + "loss": 0.1704, + "step": 2383 + }, + { + "epoch": 3.8865340723834367, + "grad_norm": 0.14175285398960114, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 2384 + }, + { + "epoch": 3.8881643299641344, + "grad_norm": 0.1267097443342209, + "learning_rate": 0.0001, + "loss": 0.1431, + "step": 2385 + }, + { + "epoch": 3.889794587544832, + "grad_norm": 0.29268741607666016, + "learning_rate": 0.0001, + "loss": 0.155, + "step": 2386 + }, + { + "epoch": 3.8914248451255298, + "grad_norm": 0.13753896951675415, + "learning_rate": 0.0001, + "loss": 0.1561, + "step": 2387 + }, + { + "epoch": 3.8930551027062275, + "grad_norm": 0.1538880318403244, + "learning_rate": 0.0001, + "loss": 0.1545, + "step": 2388 + }, + { + "epoch": 3.8946853602869256, + "grad_norm": 0.15996767580509186, + "learning_rate": 0.0001, + "loss": 0.1551, + "step": 2389 + }, + { + "epoch": 3.8963156178676233, + "grad_norm": 0.1439359188079834, + "learning_rate": 0.0001, + "loss": 0.1504, + "step": 2390 + }, + { + "epoch": 3.897945875448321, + "grad_norm": 0.23938068747520447, + "learning_rate": 0.0001, + "loss": 0.1591, + "step": 2391 + }, + { + "epoch": 3.8995761330290186, + "grad_norm": 0.1339603215456009, + "learning_rate": 0.0001, + "loss": 0.1541, + "step": 2392 + }, + { + "epoch": 3.9012063906097163, + "grad_norm": 0.146190345287323, + "learning_rate": 0.0001, + "loss": 0.1639, + "step": 2393 + }, + { + "epoch": 3.902836648190414, + "grad_norm": 0.14381268620491028, + "learning_rate": 0.0001, + "loss": 0.1553, + "step": 2394 + }, + { + "epoch": 3.9044669057711117, + "grad_norm": 0.1367824673652649, + "learning_rate": 0.0001, + "loss": 0.1551, + "step": 2395 + }, + { + "epoch": 3.9060971633518093, + "grad_norm": 0.15872205793857574, + "learning_rate": 0.0001, + "loss": 0.1591, + "step": 2396 + }, + { + "epoch": 3.9077274209325075, + "grad_norm": 0.12379555404186249, + "learning_rate": 0.0001, + "loss": 0.1388, + "step": 2397 + }, + { + "epoch": 3.909357678513205, + "grad_norm": 0.14201846718788147, + "learning_rate": 0.0001, + "loss": 0.1549, + "step": 2398 + }, + { + "epoch": 3.910987936093903, + "grad_norm": 0.1701783388853073, + "learning_rate": 0.0001, + "loss": 0.1743, + "step": 2399 + }, + { + "epoch": 3.9126181936746005, + "grad_norm": 0.13417696952819824, + "learning_rate": 0.0001, + "loss": 0.1526, + "step": 2400 + }, + { + "epoch": 3.914248451255298, + "grad_norm": 0.1509416103363037, + "learning_rate": 0.0001, + "loss": 0.1607, + "step": 2401 + }, + { + "epoch": 3.9158787088359963, + "grad_norm": 0.1474318504333496, + "learning_rate": 0.0001, + "loss": 0.1509, + "step": 2402 + }, + { + "epoch": 3.917508966416694, + "grad_norm": 0.18277962505817413, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 2403 + }, + { + "epoch": 3.9191392239973917, + "grad_norm": 0.14905720949172974, + "learning_rate": 0.0001, + "loss": 0.1513, + "step": 2404 + }, + { + "epoch": 3.9207694815780894, + "grad_norm": 0.13981392979621887, + "learning_rate": 0.0001, + "loss": 0.1615, + "step": 2405 + }, + { + "epoch": 3.922399739158787, + "grad_norm": 0.13442060351371765, + "learning_rate": 0.0001, + "loss": 0.1514, + "step": 2406 + }, + { + "epoch": 3.9240299967394847, + "grad_norm": 0.13714581727981567, + "learning_rate": 0.0001, + "loss": 0.1563, + "step": 2407 + }, + { + "epoch": 3.9256602543201824, + "grad_norm": 0.15870222449302673, + "learning_rate": 0.0001, + "loss": 0.166, + "step": 2408 + }, + { + "epoch": 3.92729051190088, + "grad_norm": 0.14761915802955627, + "learning_rate": 0.0001, + "loss": 0.1587, + "step": 2409 + }, + { + "epoch": 3.928920769481578, + "grad_norm": 0.13917234539985657, + "learning_rate": 0.0001, + "loss": 0.1609, + "step": 2410 + }, + { + "epoch": 3.930551027062276, + "grad_norm": 0.13690310716629028, + "learning_rate": 0.0001, + "loss": 0.1478, + "step": 2411 + }, + { + "epoch": 3.9321812846429736, + "grad_norm": 0.1417568325996399, + "learning_rate": 0.0001, + "loss": 0.1648, + "step": 2412 + }, + { + "epoch": 3.9338115422236712, + "grad_norm": 0.13829518854618073, + "learning_rate": 0.0001, + "loss": 0.1589, + "step": 2413 + }, + { + "epoch": 3.9354417998043694, + "grad_norm": 0.11972015351057053, + "learning_rate": 0.0001, + "loss": 0.125, + "step": 2414 + }, + { + "epoch": 3.937072057385067, + "grad_norm": 0.13998998701572418, + "learning_rate": 0.0001, + "loss": 0.1721, + "step": 2415 + }, + { + "epoch": 3.9387023149657647, + "grad_norm": 0.13850805163383484, + "learning_rate": 0.0001, + "loss": 0.1451, + "step": 2416 + }, + { + "epoch": 3.9403325725464624, + "grad_norm": 0.13838806748390198, + "learning_rate": 0.0001, + "loss": 0.1509, + "step": 2417 + }, + { + "epoch": 3.94196283012716, + "grad_norm": 0.13861410319805145, + "learning_rate": 0.0001, + "loss": 0.1512, + "step": 2418 + }, + { + "epoch": 3.9435930877078578, + "grad_norm": 0.14857284724712372, + "learning_rate": 0.0001, + "loss": 0.145, + "step": 2419 + }, + { + "epoch": 3.9452233452885554, + "grad_norm": 0.13676993548870087, + "learning_rate": 0.0001, + "loss": 0.1464, + "step": 2420 + }, + { + "epoch": 3.946853602869253, + "grad_norm": 0.1672215759754181, + "learning_rate": 0.0001, + "loss": 0.1506, + "step": 2421 + }, + { + "epoch": 3.9484838604499513, + "grad_norm": 0.1676868051290512, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 2422 + }, + { + "epoch": 3.950114118030649, + "grad_norm": 0.15942254662513733, + "learning_rate": 0.0001, + "loss": 0.1543, + "step": 2423 + }, + { + "epoch": 3.9517443756113466, + "grad_norm": 0.1640825718641281, + "learning_rate": 0.0001, + "loss": 0.1555, + "step": 2424 + }, + { + "epoch": 3.9533746331920443, + "grad_norm": 0.13515083491802216, + "learning_rate": 0.0001, + "loss": 0.1591, + "step": 2425 + }, + { + "epoch": 3.955004890772742, + "grad_norm": 0.13103432953357697, + "learning_rate": 0.0001, + "loss": 0.1465, + "step": 2426 + }, + { + "epoch": 3.95663514835344, + "grad_norm": 0.13040922582149506, + "learning_rate": 0.0001, + "loss": 0.1504, + "step": 2427 + }, + { + "epoch": 3.958265405934138, + "grad_norm": 0.14233194291591644, + "learning_rate": 0.0001, + "loss": 0.1623, + "step": 2428 + }, + { + "epoch": 3.9598956635148355, + "grad_norm": 0.14986193180084229, + "learning_rate": 0.0001, + "loss": 0.154, + "step": 2429 + }, + { + "epoch": 3.961525921095533, + "grad_norm": 0.14771242439746857, + "learning_rate": 0.0001, + "loss": 0.1595, + "step": 2430 + }, + { + "epoch": 3.963156178676231, + "grad_norm": 0.12477671355009079, + "learning_rate": 0.0001, + "loss": 0.1552, + "step": 2431 + }, + { + "epoch": 3.9647864362569285, + "grad_norm": 0.12812167406082153, + "learning_rate": 0.0001, + "loss": 0.1576, + "step": 2432 + }, + { + "epoch": 3.966416693837626, + "grad_norm": 0.12061511725187302, + "learning_rate": 0.0001, + "loss": 0.1474, + "step": 2433 + }, + { + "epoch": 3.968046951418324, + "grad_norm": 0.15570174157619476, + "learning_rate": 0.0001, + "loss": 0.1528, + "step": 2434 + }, + { + "epoch": 3.969677208999022, + "grad_norm": 0.12087506800889969, + "learning_rate": 0.0001, + "loss": 0.1376, + "step": 2435 + }, + { + "epoch": 3.9713074665797197, + "grad_norm": 0.16076244413852692, + "learning_rate": 0.0001, + "loss": 0.159, + "step": 2436 + }, + { + "epoch": 3.9729377241604173, + "grad_norm": 0.17178180813789368, + "learning_rate": 0.0001, + "loss": 0.1613, + "step": 2437 + }, + { + "epoch": 3.974567981741115, + "grad_norm": 0.15928059816360474, + "learning_rate": 0.0001, + "loss": 0.1714, + "step": 2438 + }, + { + "epoch": 3.9761982393218127, + "grad_norm": 0.14910367131233215, + "learning_rate": 0.0001, + "loss": 0.1522, + "step": 2439 + }, + { + "epoch": 3.977828496902511, + "grad_norm": 0.158916637301445, + "learning_rate": 0.0001, + "loss": 0.1715, + "step": 2440 + }, + { + "epoch": 3.9794587544832085, + "grad_norm": 0.13345277309417725, + "learning_rate": 0.0001, + "loss": 0.1463, + "step": 2441 + }, + { + "epoch": 3.981089012063906, + "grad_norm": 0.1387554258108139, + "learning_rate": 0.0001, + "loss": 0.1705, + "step": 2442 + }, + { + "epoch": 3.982719269644604, + "grad_norm": 0.1317049264907837, + "learning_rate": 0.0001, + "loss": 0.1444, + "step": 2443 + }, + { + "epoch": 3.9843495272253016, + "grad_norm": 0.1379990130662918, + "learning_rate": 0.0001, + "loss": 0.1636, + "step": 2444 + }, + { + "epoch": 3.9859797848059992, + "grad_norm": 0.14041666686534882, + "learning_rate": 0.0001, + "loss": 0.1543, + "step": 2445 + }, + { + "epoch": 3.987610042386697, + "grad_norm": 0.1239699274301529, + "learning_rate": 0.0001, + "loss": 0.1477, + "step": 2446 + }, + { + "epoch": 3.9892402999673946, + "grad_norm": 0.16149182617664337, + "learning_rate": 0.0001, + "loss": 0.1662, + "step": 2447 + }, + { + "epoch": 3.9908705575480927, + "grad_norm": 0.13410444557666779, + "learning_rate": 0.0001, + "loss": 0.1662, + "step": 2448 + }, + { + "epoch": 3.9925008151287904, + "grad_norm": 0.13568072021007538, + "learning_rate": 0.0001, + "loss": 0.1534, + "step": 2449 + }, + { + "epoch": 3.994131072709488, + "grad_norm": 0.15375611186027527, + "learning_rate": 0.0001, + "loss": 0.1758, + "step": 2450 + }, + { + "epoch": 3.9957613302901858, + "grad_norm": 0.13414491713047028, + "learning_rate": 0.0001, + "loss": 0.1674, + "step": 2451 + }, + { + "epoch": 3.9973915878708834, + "grad_norm": 0.15805280208587646, + "learning_rate": 0.0001, + "loss": 0.1665, + "step": 2452 + }, + { + "epoch": 3.9973915878708834, + "step": 2452, + "total_flos": 7.122222383569568e+18, + "train_loss": 0.188311614755535, + "train_runtime": 156391.4769, + "train_samples_per_second": 0.471, + "train_steps_per_second": 0.016 + } + ], + "logging_steps": 1.0, + "max_steps": 2452, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "total_flos": 7.122222383569568e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}